├── .gitignore ├── README.md ├── init.sh └── sessions ├── 001 ├── README.md └── install.yaml ├── 002 ├── README.md └── install │ ├── 010-metrics-server.yaml │ ├── 020-prometheus-operator.yaml │ ├── 021-prometheus-server.yaml │ ├── 022-prometheus-strimzi.yaml │ ├── 030-grafana-operator.yaml │ ├── 031-grafana-server.yaml │ ├── 032-grafana-strimzi.yaml │ ├── 040-kube-state-metrics.yaml │ └── 050-node-exporter.yaml ├── 003 └── README.md ├── 004 ├── README.md └── install.yaml ├── 005 ├── README.md └── install │ ├── apicurio.yaml │ ├── application.yaml │ ├── greeting.avsc │ └── registry.yaml ├── 006 ├── README.md └── install │ ├── connect.yaml │ └── mysql.yaml ├── 007 ├── README.md └── install │ ├── mm2.yaml │ └── target.yaml ├── 008 └── README.md ├── 009 └── README.md └── 010 ├── README.md └── install.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | .settings/ 2 | .idea/ 3 | .vscode/ 4 | target/ 5 | .project 6 | .classpath 7 | .factorypath 8 | .DS_Store 9 | *.iml 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Strimzi debugging 2 | 3 | This project contains a series of debugging sessions for Strimzi. 4 | You can use Minikube to run most of the examples. 5 | 6 | 1. [Deploy a Kafka cluster](/sessions/001) 7 | 2. [Monitor Kafka metrics](/sessions/002) 8 | 3. [Get diagnostic data](/sessions/003) 9 | 4. [Configure TLS authentication](/sessions/004) 10 | 5. [Use Kafka with Apicurio Registry](/sessions/005) 11 | 6. [Use Kafka Connect with Debezium](/sessions/006) 12 | 7. [Use Mirror Maker 2 for disaster recovery](/sessions/007) 13 | 8. [Recover broker volumes](/sessions/008) 14 | 9. [Rebalance with Cruise Control](/sessions/009) 15 | 10. [Run transactional applications](/sessions/010) 16 | -------------------------------------------------------------------------------- /init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAMESPACE="test" && export NAMESPACE 4 | STRIMZI_VERSION="0.46.0" && export STRIMZI_VERSION 5 | 6 | [[ "${BASH_SOURCE[0]}" -ef "$0" ]] && echo "Usage: source init.sh" && exit 1 7 | 8 | kafka-cp() { 9 | local id="${1-}" part="${2-50}" 10 | echo 'public void run(String id, int part) { System.out.println(abs(id.hashCode()) % part); } 11 | private int abs(int n) { return (n == Integer.MIN_VALUE) ? 
0 : Math.abs(n); } 12 | run("'"$id"'", '"$part"');' | jshell - 13 | } 14 | 15 | kubectl-kafka() { 16 | kubectl get po kafka-tools &>/dev/null || kubectl run kafka-tools -q --restart="Never" \ 17 | --image="apache/kafka:latest" -- sh -c "trap : TERM INT; sleep infinity & wait" 18 | kubectl wait --for=condition=ready po kafka-tools &>/dev/null 19 | kubectl exec kafka-tools -itq -- sh -c "/opt/kafka/$*" 20 | } 21 | 22 | echo "Deploying Strimzi" 23 | 24 | # create test namespace 25 | kubectl config set-context --current --namespace="$NAMESPACE" &>/dev/null 26 | 27 | # delete any topic first to clean finalizers 28 | kubectl get kt -o yaml 2>/dev/null | yq 'del(.items[].metadata.finalizers[])' \ 29 | | kubectl apply -f - &>/dev/null; kubectl delete kt --all --force &>/dev/null 30 | 31 | kubectl delete ns "$NAMESPACE" --ignore-not-found --force --wait=false &>/dev/null 32 | kubectl wait --for=delete ns/"$NAMESPACE" --timeout=120s &>/dev/null && kubectl create ns "$NAMESPACE" 33 | 34 | # set privileged SecurityStandard label for this namespace 35 | kubectl label ns "$NAMESPACE" pod-security.kubernetes.io/enforce=privileged --overwrite &>/dev/null 36 | 37 | # clean PersistentVolumes 38 | # shellcheck disable=SC2046 39 | kubectl delete pv $(kubectl get pv 2>/dev/null | grep "my-cluster" | awk '{print $1}') --force &>/dev/null 40 | 41 | # clean monitoring stack 42 | kubectl delete ns grafana prometheus --force --wait=false &>/dev/null 43 | kubectl delete crd $(kubectl get crd 2>/dev/null | grep integreatly.org | awk '{print $1}') &>/dev/null 44 | kubectl delete crd $(kubectl get crd 2>/dev/null | grep monitoring.coreos.com | awk '{print $1}') &>/dev/null 45 | 46 | # deploy Strimzi 47 | STRIMZI_FILE="/tmp/strimzi-$STRIMZI_VERSION.yaml" 48 | if [[ ! -f "$STRIMZI_FILE" ]]; then 49 | echo "Downloading Strimzi to $STRIMZI_FILE" 50 | curl -sLk "https://github.com/strimzi/strimzi-kafka-operator/releases/download/$STRIMZI_VERSION/strimzi-cluster-operator-$STRIMZI_VERSION.yaml" -o "$STRIMZI_FILE" 51 | fi 52 | sed -E "s/namespace: .*/namespace: $NAMESPACE/g ; s/memory: .*/memory: 500Mi/g" "$STRIMZI_FILE" \ 53 | | kubectl create -f - --dry-run=client -o yaml | kubectl replace --force -f - &>/dev/null 54 | kubectl set env deploy/strimzi-cluster-operator STRIMZI_FULL_RECONCILIATION_INTERVAL_MS="30000" &>/dev/null 55 | 56 | kubectl wait --for=condition=Available deploy strimzi-cluster-operator --timeout=300s 57 | echo "Done" 58 | -------------------------------------------------------------------------------- /sessions/001/README.md: -------------------------------------------------------------------------------- 1 | ## Deploy a Kafka cluster 2 | 3 | In this example, we deploy a Kafka cluster to a Kubernetes cluster using the operator. 4 | Use the `init.sh` script to easily initialize or reset the test environment. 5 | 6 | > [!IMPORTANT] 7 | > Login first if your Kubernetes cluster requires authentication. 8 | 9 | ```sh 10 | $ source init.sh 11 | Deploying Strimzi 12 | namespace/test created 13 | Done 14 | ``` 15 | 16 | Then, we create a new Kafka cluster and test topic. 17 | In the YAML files, we can see how the desired cluster state is declared. 18 | 19 | In addition to Kafka pods, the Entity Operator (EO) pod is also deployed, which includes two namespaced operators: the Topic Operator (TO), and the User Operator (UO). 20 | These operators only support a single namespace and a single Kafka cluster. 
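Once the cluster below is deployed, you can confirm that the Entity Operator pod runs both of them as separate containers (the `topic-operator` and `user-operator` container names are the Strimzi defaults and are assumed here):

```sh
# List the containers declared in the Entity Operator deployment.
$ kubectl get deploy my-cluster-entity-operator \
    -o jsonpath='{.spec.template.spec.containers[*].name}{"\n"}'
```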
21 | 
22 | ```sh
23 | $ kubectl create -f sessions/001/install.yaml
24 | kafkanodepool.kafka.strimzi.io/controller created
25 | kafkanodepool.kafka.strimzi.io/broker created
26 | kafka.kafka.strimzi.io/my-cluster created
27 | kafkatopic.kafka.strimzi.io/my-topic created
28 | 
29 | $ kubectl get sps,knp,k,kt,po
30 | NAME PODS READY PODS CURRENT PODS AGE
31 | strimzipodset.core.strimzi.io/my-cluster-broker 3 3 3 65s
32 | strimzipodset.core.strimzi.io/my-cluster-controller 3 3 3 65s
33 | 
34 | NAME DESIRED REPLICAS ROLES NODEIDS
35 | kafkanodepool.kafka.strimzi.io/broker 3 ["broker"] [10,11,12]
36 | kafkanodepool.kafka.strimzi.io/controller 3 ["controller"] [0,1,2]
37 | 
38 | NAME DESIRED KAFKA REPLICAS DESIRED ZK REPLICAS READY METADATA STATE WARNINGS
39 | kafka.kafka.strimzi.io/my-cluster
40 | 
41 | NAME CLUSTER PARTITIONS REPLICATION FACTOR READY
42 | kafkatopic.kafka.strimzi.io/my-topic my-cluster 3 3 True
43 | 
44 | NAME READY STATUS RESTARTS AGE
45 | pod/my-cluster-broker-10 1/1 Running 0 64s
46 | pod/my-cluster-broker-11 1/1 Running 0 64s
47 | pod/my-cluster-broker-12 1/1 Running 0 64s
48 | pod/my-cluster-controller-0 1/1 Running 0 63s
49 | pod/my-cluster-controller-1 1/1 Running 0 63s
50 | pod/my-cluster-controller-2 1/1 Running 0 63s
51 | pod/my-cluster-entity-operator-bb7c65dd4-9zdmk 2/2 Running 0 31s
52 | pod/strimzi-cluster-operator-6596f469c9-smsw2 1/1 Running 0 2m5s
53 | ```
54 | 
55 | When the Kafka cluster is ready, we send and receive some messages.
56 | When consuming messages, you can print additional data such as the partition number.
57 | Every consumer with the same `group.id` is part of the same consumer group.
58 | 
59 | ```sh
60 | $ kubectl-kafka bin/kafka-console-producer.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --topic my-topic \
61 |   --property parse.key=true --property key.separator="#"
62 | >32947#hello
63 | >24910#kafka
64 | >45237#world
65 | >^C
66 | 
67 | $ kubectl-kafka bin/kafka-console-consumer.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --topic my-topic \
68 |   --group my-group --from-beginning --max-messages 3 --property print.partition=true --property print.key=true
69 | Partition:0 24910 kafka
70 | Partition:2 32947 hello
71 | Partition:2 45237 world
72 | Processed a total of 3 messages
73 | ```
74 | 
75 | It works, but where are our messages being stored?
76 | The broker property `log.dirs` configures where our topic partitions are stored.
77 | We have 3 partitions, which correspond to exactly 3 folders on disk.
78 | 
79 | ```sh
80 | $ kubectl exec my-cluster-broker-10 -- cat /tmp/strimzi.properties | grep log.dirs
81 | log.dirs=/var/lib/kafka/data/kafka-log10
82 | 
83 | $ kubectl exec my-cluster-broker-10 -- ls -lh /var/lib/kafka/data/kafka-log10 | grep my-topic
84 | drwxr-xr-x. 2 kafka root 167 Mar 23 13:18 my-topic-0
85 | drwxr-xr-x. 2 kafka root 167 Mar 23 13:15 my-topic-1
86 | drwxr-xr-x. 2 kafka root 167 Mar 23 13:18 my-topic-2
87 | ```
88 | 
89 | The consumer output shows that messages were sent to partitions 0 and 2.
90 | Looking inside partition 0, we have a `.log` file containing our records (each segment is named after the initial offset), an `.index` file mapping the record offset to its position in the log, and a `.timeindex` file mapping the record timestamp to its position in the log.
91 | The other two files contain additional metadata.
92 | 
93 | ```sh
94 | $ kubectl exec my-cluster-broker-10 -- ls -lh /var/lib/kafka/data/kafka-log10/my-topic-0
95 | total 12K
96 | -rw-r--r--. 1 kafka root 10M Mar 23 13:15 00000000000000000000.index
97 | -rw-r--r--. 1 kafka root 78 Mar 23 13:18 00000000000000000000.log
98 | -rw-r--r--. 1 kafka root 10M Mar 23 13:15 00000000000000000000.timeindex
99 | -rw-r--r--. 1 kafka root 8 Mar 23 13:18 leader-epoch-checkpoint
100 | -rw-r--r--. 1 kafka root 43 Mar 23 13:15 partition.metadata
101 | ```
102 | 
103 | Partition log files are in binary format, but Kafka includes a dump tool for decoding them.
104 | On this partition, we have one batch (`baseOffset`), containing only one record (`| offset`) with key "24910" and payload "kafka".
105 | 
106 | ```sh
107 | $ kubectl exec my-cluster-broker-10 -- bin/kafka-dump-log.sh --deep-iteration --print-data-log \
108 |   --files /var/lib/kafka/data/kafka-log10/my-topic-0/00000000000000000000.log
109 | Dumping /var/lib/kafka/data/kafka-log10/my-topic-0/00000000000000000000.log
110 | Log starting offset: 0
111 | baseOffset: 0 lastOffset: 0 count: 1 baseSequence: 0 lastSequence: 0 producerId: 0 producerEpoch: 0 partitionLeaderEpoch: 0 isTransactional: false isControl: false deleteHorizonMs: OptionalLong.empty position: 0 CreateTime: 1742735936663 size: 78 magic: 2 compresscodec: none crc: 825983240 isvalid: true
112 | | offset: 0 CreateTime: 1742735936663 keySize: 5 valueSize: 5 sequence: 0 headerKeys: [] key: 24910 payload: kafka
113 | ```
114 | 
115 | Our consumer group should have committed the offsets to the `__consumer_offsets` internal topic.
116 | The problem is that this topic has 50 partitions by default, so how do we know which partition was used?
117 | We can use the same algorithm that Kafka uses to map a `group.id` to its offset-coordinating partition.
118 | The `kafka-cp` function is defined inside the `init.sh` script.
119 | 
120 | ```sh
121 | $ kafka-cp my-group
122 | 12
123 | ```
124 | 
125 | We know that the consumer group commit record was sent to `__consumer_offsets-12`, so let's dump this partition too.
126 | Here values are encoded for performance reasons, so we have to pass the `--offsets-decoder` option.
127 | 
128 | This partition contains other metadata, but we are specifically interested in the `offset_commit` key.
129 | We have a batch from our consumer group, which includes 3 records, one for each input topic partition.
130 | As expected, the consumer group committed offset1@partition0, offset2@partition2, and offset0@partition1 (this partition didn't receive any message).
131 | 
132 | ```sh
133 | $ kubectl exec my-cluster-broker-10 -- bin/kafka-dump-log.sh --deep-iteration --print-data-log --offsets-decoder \
134 |   --files /var/lib/kafka/data/kafka-log10/__consumer_offsets-12/00000000000000000000.log
135 | Dumping /var/lib/kafka/data/kafka-log10/__consumer_offsets-12/00000000000000000000.log
136 | Log starting offset: 0
137 | ...
138 | baseOffset: 1 lastOffset: 3 count: 3 baseSequence: 0 lastSequence: 2 producerId: -1 producerEpoch: -1 partitionLeaderEpoch: 0 isTransactional: false isControl: false deleteHorizonMs: OptionalLong.empty position: 344 CreateTime: 1742735956644 size: 232 magic: 2 compresscodec: none crc: 4034662502 isvalid: true 139 | | offset: 1 CreateTime: 1742735956644 keySize: 26 valueSize: 24 sequence: 0 headerKeys: [] key: {"type":"1","data":{"group":"my-group","topic":"my-topic","partition":0}} payload: {"version":"3","data":{"offset":1,"leaderEpoch":0,"metadata":"","commitTimestamp":1742735956641}} 140 | | offset: 2 CreateTime: 1742735956644 keySize: 26 valueSize: 24 sequence: 1 headerKeys: [] key: {"type":"1","data":{"group":"my-group","topic":"my-topic","partition":1}} payload: {"version":"3","data":{"offset":0,"leaderEpoch":-1,"metadata":"","commitTimestamp":1742735956641}} 141 | | offset: 3 CreateTime: 1742735956644 keySize: 26 valueSize: 24 sequence: 2 headerKeys: [] key: {"type":"1","data":{"group":"my-group","topic":"my-topic","partition":2}} payload: {"version":"3","data":{"offset":2,"leaderEpoch":0,"metadata":"","commitTimestamp":1742735956641}} 142 | ... 143 | ``` 144 | -------------------------------------------------------------------------------- /sessions/001/install.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kafka.strimzi.io/v1beta2 2 | kind: KafkaNodePool 3 | metadata: 4 | name: controller 5 | labels: 6 | # must match the cluster name 7 | strimzi.io/cluster: my-cluster 8 | annotations: 9 | strimzi.io/next-node-ids: "[0-9]" 10 | spec: 11 | replicas: 3 12 | roles: 13 | - controller 14 | resources: 15 | # set requests==limits to have Guaranteed QoS 16 | limits: 17 | cpu: 1000m 18 | memory: 1Gi 19 | requests: 20 | cpu: 500m 21 | memory: 1Gi 22 | storage: 23 | size: 5Gi 24 | type: persistent-claim 25 | deleteClaim: false 26 | --- 27 | apiVersion: kafka.strimzi.io/v1beta2 28 | kind: KafkaNodePool 29 | metadata: 30 | name: broker 31 | labels: 32 | # must match the cluster name 33 | strimzi.io/cluster: my-cluster 34 | annotations: 35 | strimzi.io/next-node-ids: "[10-100]" 36 | spec: 37 | replicas: 3 38 | roles: 39 | - broker 40 | resources: 41 | # set requests==limits to have Guaranteed QoS 42 | limits: 43 | cpu: 1000m 44 | memory: 2Gi 45 | requests: 46 | cpu: 500m 47 | memory: 2Gi 48 | storage: 49 | size: 10Gi 50 | type: persistent-claim 51 | deleteClaim: false 52 | --- 53 | apiVersion: kafka.strimzi.io/v1beta2 54 | kind: Kafka 55 | metadata: 56 | name: my-cluster 57 | annotations: 58 | strimzi.io/node-pools: enabled 59 | strimzi.io/kraft: enabled 60 | spec: 61 | kafka: 62 | #version: x.y.z 63 | #metadataVersion: x.y-IVx 64 | config: 65 | num.partitions: 3 66 | default.replication.factor: 3 67 | min.insync.replicas: 2 68 | offsets.topic.replication.factor: 3 69 | transaction.state.log.replication.factor: 3 70 | transaction.state.log.min.isr: 2 71 | listeners: 72 | - name: plain 73 | port: 9092 74 | type: internal 75 | tls: false 76 | - name: tls 77 | port: 9093 78 | type: internal 79 | tls: true 80 | logging: 81 | type: inline 82 | loggers: 83 | rootLogger.level: INFO 84 | logger.kafka.request.logger.level: INFO 85 | logger.kafkatc.name: kafka.coordinator.transaction 86 | logger.kafkatc.level: INFO 87 | logger.kafkalcm.name: kafka.log.LogCleanerManager 88 | logger.kafkalcm.level: INFO 89 | logger.strimzi.name: io.strimzi 90 | logger.strimzi.level: INFO 91 | metricsConfig: 92 | type: jmxPrometheusExporter 93 | valueFrom: 94 | 
configMapKeyRef: 95 | name: kafka-metrics 96 | key: kafka-metrics-config.yml 97 | entityOperator: 98 | topicOperator: 99 | logging: 100 | type: inline 101 | loggers: 102 | rootLogger.level: INFO 103 | logger.top.name: io.strimzi.operator.topic 104 | logger.top.level: INFO 105 | resources: 106 | limits: 107 | cpu: 500m 108 | memory: 512Mi 109 | requests: 110 | cpu: 500m 111 | memory: 256Mi 112 | userOperator: 113 | logging: 114 | type: inline 115 | loggers: 116 | rootLogger.level: INFO 117 | logger.uop.name: io.strimzi.operator.user 118 | logger.uop.level: INFO 119 | resources: 120 | limits: 121 | cpu: 500m 122 | memory: 512Mi 123 | requests: 124 | cpu: 500m 125 | memory: 256Mi 126 | --- 127 | apiVersion: kafka.strimzi.io/v1beta2 128 | kind: KafkaTopic 129 | metadata: 130 | name: my-topic 131 | labels: 132 | # must match the cluster name 133 | strimzi.io/cluster: my-cluster 134 | spec: 135 | partitions: 3 136 | replicas: 3 137 | config: 138 | min.insync.replicas: 2 139 | # 1 GiB to avoid running out of space with load tests 140 | retention.bytes: 1073741824 141 | --- 142 | kind: ConfigMap 143 | apiVersion: v1 144 | metadata: 145 | name: kafka-metrics 146 | labels: 147 | app: strimzi 148 | data: 149 | kafka-metrics-config.yml: | 150 | # See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics 151 | lowercaseOutputName: true 152 | rules: 153 | # Special cases and very specific rules 154 | - pattern: kafka.server<>Value 155 | name: kafka_server_$1_$2 156 | type: GAUGE 157 | labels: 158 | clientId: "$3" 159 | topic: "$4" 160 | partition: "$5" 161 | - pattern: kafka.server<>Value 162 | name: kafka_server_$1_$2 163 | type: GAUGE 164 | labels: 165 | clientId: "$3" 166 | broker: "$4:$5" 167 | - pattern: kafka.server<>connections 168 | name: kafka_server_$1_connections_tls_info 169 | type: GAUGE 170 | labels: 171 | cipher: "$2" 172 | protocol: "$3" 173 | listener: "$4" 174 | networkProcessor: "$5" 175 | - pattern: kafka.server<>connections 176 | name: kafka_server_$1_connections_software 177 | type: GAUGE 178 | labels: 179 | clientSoftwareName: "$2" 180 | clientSoftwareVersion: "$3" 181 | listener: "$4" 182 | networkProcessor: "$5" 183 | - pattern: "kafka.server<>(.+-total):" 184 | name: kafka_server_$1_$4 185 | type: COUNTER 186 | labels: 187 | listener: "$2" 188 | networkProcessor: "$3" 189 | - pattern: "kafka.server<>(.+):" 190 | name: kafka_server_$1_$4 191 | type: GAUGE 192 | labels: 193 | listener: "$2" 194 | networkProcessor: "$3" 195 | - pattern: kafka.server<>(.+-total) 196 | name: kafka_server_$1_$4 197 | type: COUNTER 198 | labels: 199 | listener: "$2" 200 | networkProcessor: "$3" 201 | - pattern: kafka.server<>(.+) 202 | name: kafka_server_$1_$4 203 | type: GAUGE 204 | labels: 205 | listener: "$2" 206 | networkProcessor: "$3" 207 | # Some percent metrics use MeanRate attribute 208 | # Ex) kafka.server<>MeanRate 209 | - pattern: kafka.(\w+)<>MeanRate 210 | name: kafka_$1_$2_$3_percent 211 | type: GAUGE 212 | # Generic gauges for percents 213 | - pattern: kafka.(\w+)<>Value 214 | name: kafka_$1_$2_$3_percent 215 | type: GAUGE 216 | - pattern: kafka.(\w+)<>Value 217 | name: kafka_$1_$2_$3_percent 218 | type: GAUGE 219 | labels: 220 | "$4": "$5" 221 | # Generic per-second counters with 0-2 key/value pairs 222 | - pattern: kafka.(\w+)<>Count 223 | name: kafka_$1_$2_$3_total 224 | type: COUNTER 225 | labels: 226 | "$4": "$5" 227 | "$6": "$7" 228 | - pattern: kafka.(\w+)<>Count 229 | name: kafka_$1_$2_$3_total 230 | type: COUNTER 231 | labels: 232 | 
"$4": "$5" 233 | - pattern: kafka.(\w+)<>Count 234 | name: kafka_$1_$2_$3_total 235 | type: COUNTER 236 | # Generic gauges with 0-2 key/value pairs 237 | - pattern: kafka.(\w+)<>Value 238 | name: kafka_$1_$2_$3 239 | type: GAUGE 240 | labels: 241 | "$4": "$5" 242 | "$6": "$7" 243 | - pattern: kafka.(\w+)<>Value 244 | name: kafka_$1_$2_$3 245 | type: GAUGE 246 | labels: 247 | "$4": "$5" 248 | - pattern: kafka.(\w+)<>Value 249 | name: kafka_$1_$2_$3 250 | type: GAUGE 251 | # Emulate Prometheus 'Summary' metrics for the exported 'Histogram's. 252 | # Note that these are missing the '_sum' metric! 253 | - pattern: kafka.(\w+)<>Count 254 | name: kafka_$1_$2_$3_count 255 | type: COUNTER 256 | labels: 257 | "$4": "$5" 258 | "$6": "$7" 259 | - pattern: kafka.(\w+)<>(\d+)thPercentile 260 | name: kafka_$1_$2_$3 261 | type: GAUGE 262 | labels: 263 | "$4": "$5" 264 | "$6": "$7" 265 | quantile: "0.$8" 266 | - pattern: kafka.(\w+)<>Count 267 | name: kafka_$1_$2_$3_count 268 | type: COUNTER 269 | labels: 270 | "$4": "$5" 271 | - pattern: kafka.(\w+)<>(\d+)thPercentile 272 | name: kafka_$1_$2_$3 273 | type: GAUGE 274 | labels: 275 | "$4": "$5" 276 | quantile: "0.$6" 277 | - pattern: kafka.(\w+)<>Count 278 | name: kafka_$1_$2_$3_count 279 | type: COUNTER 280 | - pattern: kafka.(\w+)<>(\d+)thPercentile 281 | name: kafka_$1_$2_$3 282 | type: GAUGE 283 | labels: 284 | quantile: "0.$4" 285 | # KRaft overall related metrics 286 | # distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics 287 | - pattern: "kafka.server<>(.+-total|.+-max):" 288 | name: kafka_server_raftmetrics_$1 289 | type: COUNTER 290 | - pattern: "kafka.server<>(current-state): (.+)" 291 | name: kafka_server_raftmetrics_$1 292 | value: 1 293 | type: UNTYPED 294 | labels: 295 | $1: "$2" 296 | - pattern: "kafka.server<>(.+):" 297 | name: kafka_server_raftmetrics_$1 298 | type: GAUGE 299 | # KRaft "low level" channels related metrics 300 | # distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics 301 | - pattern: "kafka.server<>(.+-total|.+-max):" 302 | name: kafka_server_raftchannelmetrics_$1 303 | type: COUNTER 304 | - pattern: "kafka.server<>(.+):" 305 | name: kafka_server_raftchannelmetrics_$1 306 | type: GAUGE 307 | # Broker metrics related to fetching metadata topic records in KRaft mode 308 | - pattern: "kafka.server<>(.+):" 309 | name: kafka_server_brokermetadatametrics_$1 310 | type: GAUGE 311 | -------------------------------------------------------------------------------- /sessions/002/README.md: -------------------------------------------------------------------------------- 1 | ## Monitor Kafka metrics 2 | 3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes. 4 | 5 | When the cluster is ready, install Prometheus, Grafana and Strimzi dashboards. 6 | Only the Cluster Operator and Kafka dashboards are included, but you can easily add the other components. 
7 | 8 | ```sh 9 | 10 | $ for f in sessions/002/install/*.yaml; do 11 | echo ">>> Installing $f" 12 | envsubst < "$f" | kubectl apply -f - 13 | sleep 5 14 | done 15 | >>> Installing sessions/002/install/010-metrics-server.yaml 16 | serviceaccount/metrics-server unchanged 17 | clusterrole.rbac.authorization.k8s.io/system:aggregated-metrics-reader unchanged 18 | rolebinding.rbac.authorization.k8s.io/metrics-server-auth-reader unchanged 19 | clusterrolebinding.rbac.authorization.k8s.io/metrics-server:system:auth-delegator unchanged 20 | clusterrole.rbac.authorization.k8s.io/system:metrics-server unchanged 21 | clusterrolebinding.rbac.authorization.k8s.io/system:metrics-server unchanged 22 | service/metrics-server unchanged 23 | deployment.apps/metrics-server configured 24 | apiservice.apiregistration.k8s.io/v1beta1.metrics.k8s.io unchanged 25 | >>> Installing sessions/002/install/020-prometheus-operator.yaml 26 | namespace/prometheus created 27 | customresourcedefinition.apiextensions.k8s.io/alertmanagers.monitoring.coreos.com created 28 | customresourcedefinition.apiextensions.k8s.io/podmonitors.monitoring.coreos.com created 29 | customresourcedefinition.apiextensions.k8s.io/prometheuses.monitoring.coreos.com created 30 | customresourcedefinition.apiextensions.k8s.io/prometheusrules.monitoring.coreos.com created 31 | customresourcedefinition.apiextensions.k8s.io/servicemonitors.monitoring.coreos.com created 32 | customresourcedefinition.apiextensions.k8s.io/thanosrulers.monitoring.coreos.com created 33 | serviceaccount/prometheus-operator created 34 | clusterrole.rbac.authorization.k8s.io/prometheus-operator unchanged 35 | clusterrolebinding.rbac.authorization.k8s.io/prometheus-operator unchanged 36 | service/prometheus-operator created 37 | deployment.apps/prometheus-operator created 38 | >>> Installing sessions/002/install/021-prometheus-server.yaml 39 | namespace/prometheus unchanged 40 | serviceaccount/prometheus created 41 | clusterrole.rbac.authorization.k8s.io/prometheus unchanged 42 | clusterrolebinding.rbac.authorization.k8s.io/prometheus unchanged 43 | service/prometheus created 44 | ingress.networking.k8s.io/prometheus created 45 | prometheus.monitoring.coreos.com/prometheus created 46 | secret/additional-scrape-configs created 47 | alertmanager.monitoring.coreos.com/alertmanager created 48 | service/alertmanager created 49 | ingress.networking.k8s.io/alertmanager created 50 | secret/alertmanager-alertmanager created 51 | >>> Installing sessions/002/install/022-prometheus-strimzi.yaml 52 | podmonitor.monitoring.coreos.com/strimzi-cluster-operator-metrics-test created 53 | podmonitor.monitoring.coreos.com/strimzi-entity-operator-metrics-test created 54 | podmonitor.monitoring.coreos.com/strimzi-bridge-metrics-test created 55 | podmonitor.monitoring.coreos.com/strimzi-kafka-and-cruise-control-metrics-test created 56 | >>> Installing sessions/002/install/030-grafana-operator.yaml 57 | namespace/grafana created 58 | customresourcedefinition.apiextensions.k8s.io/grafanadashboards.integreatly.org created 59 | customresourcedefinition.apiextensions.k8s.io/grafanadatasources.integreatly.org created 60 | customresourcedefinition.apiextensions.k8s.io/grafananotificationchannels.integreatly.org created 61 | customresourcedefinition.apiextensions.k8s.io/grafanas.integreatly.org created 62 | serviceaccount/controller-manager created 63 | role.rbac.authorization.k8s.io/leader-election-role created 64 | clusterrole.rbac.authorization.k8s.io/manager-role configured 65 | 
clusterrole.rbac.authorization.k8s.io/metrics-reader unchanged 66 | clusterrole.rbac.authorization.k8s.io/proxy-role unchanged 67 | rolebinding.rbac.authorization.k8s.io/leader-election-rolebinding created 68 | clusterrolebinding.rbac.authorization.k8s.io/manager-rolebinding unchanged 69 | clusterrolebinding.rbac.authorization.k8s.io/proxy-rolebinding unchanged 70 | service/controller-manager-metrics-service created 71 | configmap/manager-config created 72 | deployment.apps/controller-manager created 73 | >>> Installing sessions/002/install/031-grafana-server.yaml 74 | namespace/grafana unchanged 75 | grafana.integreatly.org/grafana created 76 | service/grafana created 77 | ingress.networking.k8s.io/grafana created 78 | grafanadatasource.integreatly.org/prometheus created 79 | >>> Installing sessions/002/install/032-grafana-strimzi.yaml 80 | grafanadashboard.integreatly.org/strimzi-operators created 81 | grafanadashboard.integreatly.org/strimzi-kafka created 82 | >>> Installing sessions/002/install/040-kube-state-metrics.yaml 83 | serviceaccount/kube-state-metrics unchanged 84 | clusterrole.rbac.authorization.k8s.io/kube-state-metrics unchanged 85 | clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics unchanged 86 | deployment.apps/kube-state-metrics unchanged 87 | service/kube-state-metrics unchanged 88 | podmonitor.monitoring.coreos.com/kube-state-metrics created 89 | grafanadashboard.integreatly.org/kube-state-metrics created 90 | >>> Installing sessions/002/install/050-node-exporter.yaml 91 | service/node-exporter created 92 | daemonset.apps/node-exporter created 93 | servicemonitor.monitoring.coreos.com/node-exporter created 94 | grafanadashboard.integreatly.org/node-exporter created 95 | ``` 96 | 97 | When all Grafana is ready, you can access the dashboards from [http://grafana.f12i.io](http://grafana.f12i.io). 98 | 99 | > [!IMPORTANT] 100 | > Make sure to add ingress mappings to `/etc/hosts`. 101 | > Example: `192.168.49.2 prometheus.f12i.io grafana.f12i.io` 102 | 103 | It is also possible to create alerting rules to provide notifications about specific conditions observed in metrics. 104 | This is managed by Prometheus Alertmanager, but it is not described here. 
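As a sketch of what such a rule could look like, the following `PrometheusRule` fires when any partition has no active leader. The metric name comes from the JMX Prometheus Exporter rules configured in the first session, and the Prometheus resource above would also need a `ruleSelector` matching these labels to actually load the rule.

```sh
$ kubectl apply -f - <<'EOF'
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kafka-alert-rules
  labels:
    app: strimzi
    prometheus: prometheus
  namespace: prometheus
spec:
  groups:
    - name: kafka
      rules:
        - alert: OfflinePartitions
          # offline partitions have no active leader and are neither readable nor writable
          expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Kafka has offline partitions"
            description: "There are {{ $value }} offline partitions in the cluster."
EOF
```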
105 | -------------------------------------------------------------------------------- /sessions/002/install/010-metrics-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | k8s-app: metrics-server 6 | name: metrics-server 7 | namespace: kube-system 8 | --- 9 | apiVersion: rbac.authorization.k8s.io/v1 10 | kind: ClusterRole 11 | metadata: 12 | name: system:aggregated-metrics-reader 13 | labels: 14 | k8s-app: metrics-server 15 | rbac.authorization.k8s.io/aggregate-to-view: "true" 16 | rbac.authorization.k8s.io/aggregate-to-edit: "true" 17 | rbac.authorization.k8s.io/aggregate-to-admin: "true" 18 | rules: 19 | - apiGroups: ["metrics.k8s.io"] 20 | resources: ["pods", "nodes"] 21 | verbs: ["get", "list", "watch"] 22 | --- 23 | apiVersion: rbac.authorization.k8s.io/v1 24 | kind: RoleBinding 25 | metadata: 26 | labels: 27 | k8s-app: metrics-server 28 | name: metrics-server-auth-reader 29 | namespace: kube-system 30 | roleRef: 31 | apiGroup: rbac.authorization.k8s.io 32 | kind: Role 33 | name: extension-apiserver-authentication-reader 34 | subjects: 35 | - kind: ServiceAccount 36 | name: metrics-server 37 | namespace: kube-system 38 | --- 39 | apiVersion: rbac.authorization.k8s.io/v1 40 | kind: ClusterRoleBinding 41 | metadata: 42 | labels: 43 | k8s-app: metrics-server 44 | name: metrics-server:system:auth-delegator 45 | roleRef: 46 | apiGroup: rbac.authorization.k8s.io 47 | kind: ClusterRole 48 | name: system:auth-delegator 49 | subjects: 50 | - kind: ServiceAccount 51 | name: metrics-server 52 | namespace: kube-system 53 | --- 54 | apiVersion: rbac.authorization.k8s.io/v1 55 | kind: ClusterRole 56 | metadata: 57 | labels: 58 | k8s-app: metrics-server 59 | name: system:metrics-server 60 | rules: 61 | - apiGroups: [""] 62 | resources: 63 | - nodes/metrics 64 | verbs: 65 | - get 66 | - apiGroups: [""] 67 | resources: 68 | - pods 69 | - nodes 70 | verbs: 71 | - get 72 | - list 73 | - watch 74 | --- 75 | apiVersion: rbac.authorization.k8s.io/v1 76 | kind: ClusterRoleBinding 77 | metadata: 78 | labels: 79 | k8s-app: metrics-server 80 | name: system:metrics-server 81 | roleRef: 82 | apiGroup: rbac.authorization.k8s.io 83 | kind: ClusterRole 84 | name: system:metrics-server 85 | subjects: 86 | - kind: ServiceAccount 87 | name: metrics-server 88 | namespace: kube-system 89 | --- 90 | apiVersion: v1 91 | kind: Service 92 | metadata: 93 | labels: 94 | k8s-app: metrics-server 95 | name: metrics-server 96 | namespace: kube-system 97 | spec: 98 | ports: 99 | - name: https 100 | port: 443 101 | protocol: TCP 102 | targetPort: https 103 | selector: 104 | k8s-app: metrics-server 105 | --- 106 | apiVersion: apps/v1 107 | kind: Deployment 108 | metadata: 109 | labels: 110 | k8s-app: metrics-server 111 | name: metrics-server 112 | namespace: kube-system 113 | spec: 114 | selector: 115 | matchLabels: 116 | k8s-app: metrics-server 117 | strategy: 118 | rollingUpdate: 119 | maxUnavailable: 0 120 | template: 121 | metadata: 122 | labels: 123 | k8s-app: metrics-server 124 | spec: 125 | containers: 126 | - args: 127 | - --cert-dir=/tmp 128 | - --secure-port=10250 129 | - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname 130 | - --kubelet-use-node-status-port 131 | - --metric-resolution=15s 132 | - --kubelet-insecure-tls 133 | image: registry.k8s.io/metrics-server/metrics-server:v0.6.4 134 | imagePullPolicy: IfNotPresent 135 | livenessProbe: 136 | failureThreshold: 3 137 | httpGet: 138 | path: 
/livez 139 | port: https 140 | scheme: HTTPS 141 | periodSeconds: 10 142 | name: metrics-server 143 | resources: 144 | requests: 145 | cpu: 100m 146 | memory: 200Mi 147 | ports: 148 | - containerPort: 10250 149 | name: https 150 | protocol: TCP 151 | readinessProbe: 152 | failureThreshold: 3 153 | httpGet: 154 | path: /readyz 155 | port: https 156 | scheme: HTTPS 157 | periodSeconds: 10 158 | initialDelaySeconds: 20 159 | securityContext: 160 | readOnlyRootFilesystem: true 161 | runAsNonRoot: true 162 | runAsUser: 1000 163 | allowPrivilegeEscalation: false 164 | volumeMounts: 165 | - mountPath: /tmp 166 | name: tmp-dir 167 | nodeSelector: 168 | kubernetes.io/os: linux 169 | priorityClassName: system-cluster-critical 170 | serviceAccountName: metrics-server 171 | volumes: 172 | # mount in tmp so we can safely use from-scratch images and/or read-only containers 173 | - emptyDir: {} 174 | name: tmp-dir 175 | --- 176 | apiVersion: apiregistration.k8s.io/v1 177 | kind: APIService 178 | metadata: 179 | labels: 180 | k8s-app: metrics-server 181 | name: v1beta1.metrics.k8s.io 182 | spec: 183 | group: metrics.k8s.io 184 | groupPriorityMinimum: 100 185 | insecureSkipTLSVerify: true 186 | service: 187 | name: metrics-server 188 | namespace: kube-system 189 | version: v1beta1 190 | versionPriority: 100 191 | -------------------------------------------------------------------------------- /sessions/002/install/021-prometheus-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: prometheus 5 | labels: 6 | app: prometheus 7 | --- 8 | apiVersion: v1 9 | kind: ServiceAccount 10 | metadata: 11 | name: prometheus 12 | namespace: prometheus 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: ClusterRole 16 | metadata: 17 | name: prometheus 18 | rules: 19 | - apiGroups: [""] 20 | resources: 21 | - nodes 22 | - nodes/metrics 23 | - nodes/proxy 24 | - services 25 | - endpoints 26 | - pods 27 | verbs: ["get", "list", "watch"] 28 | - apiGroups: [""] 29 | resources: 30 | - configmaps 31 | verbs: ["get"] 32 | - nonResourceURLs: ["/metrics"] 33 | verbs: ["get"] 34 | --- 35 | apiVersion: rbac.authorization.k8s.io/v1 36 | kind: ClusterRoleBinding 37 | metadata: 38 | name: prometheus 39 | roleRef: 40 | apiGroup: rbac.authorization.k8s.io 41 | kind: ClusterRole 42 | name: prometheus 43 | subjects: 44 | - kind: ServiceAccount 45 | name: prometheus 46 | namespace: prometheus 47 | --- 48 | kind: Service 49 | apiVersion: v1 50 | metadata: 51 | name: prometheus 52 | labels: 53 | app: prometheus 54 | namespace: prometheus 55 | spec: 56 | type: ClusterIP 57 | ports: 58 | - port: 80 59 | name: http 60 | protocol: TCP 61 | targetPort: web 62 | selector: 63 | prometheus: prometheus 64 | --- 65 | apiVersion: networking.k8s.io/v1 66 | kind: Ingress 67 | metadata: 68 | name: prometheus 69 | namespace: prometheus 70 | spec: 71 | ingressClassName: nginx 72 | rules: 73 | - host: prometheus.f12i.io 74 | http: 75 | paths: 76 | - backend: 77 | service: 78 | name: prometheus 79 | port: 80 | number: 80 81 | path: / 82 | pathType: Prefix 83 | --- 84 | apiVersion: monitoring.coreos.com/v1 85 | kind: Prometheus 86 | metadata: 87 | name: prometheus 88 | labels: 89 | app: prometheus 90 | namespace: prometheus 91 | spec: 92 | replicas: 1 93 | serviceAccountName: prometheus 94 | enableAdminAPI: true 95 | storage: 96 | volumeClaimTemplate: 97 | spec: 98 | resources: 99 | requests: 100 | storage: 10Gi 101 | serviceMonitorSelector: 102 
| matchLabels: 103 | prometheus: prometheus 104 | podMonitorSelector: 105 | matchLabels: 106 | # monitors must have 'prometheus: prometheus' label 107 | prometheus: prometheus 108 | additionalScrapeConfigs: 109 | name: additional-scrape-configs 110 | key: prometheus-additional.yaml 111 | alerting: 112 | alertmanagers: 113 | - namespace: prometheus 114 | name: alertmanager 115 | port: http 116 | --- 117 | apiVersion: v1 118 | kind: Secret 119 | metadata: 120 | name: additional-scrape-configs 121 | labels: 122 | app: prometheus 123 | namespace: prometheus 124 | type: Opaque 125 | stringData: 126 | prometheus-additional.yaml: | 127 | - job_name: kubernetes-cadvisor 128 | honor_labels: true 129 | scrape_interval: 10s 130 | scrape_timeout: 10s 131 | metrics_path: /metrics/cadvisor 132 | scheme: https 133 | kubernetes_sd_configs: 134 | - role: node 135 | namespaces: 136 | names: [] 137 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 138 | tls_config: 139 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 140 | insecure_skip_verify: true 141 | relabel_configs: 142 | - separator: ; 143 | regex: __meta_kubernetes_node_label_(.+) 144 | replacement: $1 145 | action: labelmap 146 | - separator: ; 147 | regex: (.*) 148 | target_label: __address__ 149 | replacement: kubernetes.default.svc:443 150 | action: replace 151 | - source_labels: [__meta_kubernetes_node_name] 152 | separator: ; 153 | regex: (.+) 154 | target_label: __metrics_path__ 155 | replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor 156 | action: replace 157 | - source_labels: [__meta_kubernetes_node_name] 158 | separator: ; 159 | regex: (.*) 160 | target_label: node_name 161 | replacement: $1 162 | action: replace 163 | - source_labels: [__meta_kubernetes_node_address_InternalIP] 164 | separator: ; 165 | regex: (.*) 166 | target_label: node_ip 167 | replacement: $1 168 | action: replace 169 | metric_relabel_configs: 170 | - source_labels: [container, __name__] 171 | separator: ; 172 | regex: POD;container_(network).* 173 | target_label: container 174 | replacement: $1 175 | action: replace 176 | - source_labels: [container] 177 | separator: ; 178 | regex: POD 179 | replacement: $1 180 | action: drop 181 | - source_labels: [container] 182 | separator: ; 183 | regex: ^$ 184 | replacement: $1 185 | action: drop 186 | - source_labels: [__name__] 187 | separator: ; 188 | regex: container_(network_tcp_usage_total|tasks_state|memory_failures_total|network_udp_usage_total) 189 | replacement: $1 190 | action: drop 191 | - job_name: kubernetes-nodes-kubelet 192 | scrape_interval: 10s 193 | scrape_timeout: 10s 194 | scheme: https 195 | kubernetes_sd_configs: 196 | - role: node 197 | namespaces: 198 | names: [] 199 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 200 | tls_config: 201 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 202 | insecure_skip_verify: true 203 | relabel_configs: 204 | - action: labelmap 205 | regex: __meta_kubernetes_node_label_(.+) 206 | - target_label: __address__ 207 | replacement: kubernetes.default.svc:443 208 | - source_labels: [__meta_kubernetes_node_name] 209 | regex: (.+) 210 | target_label: __metrics_path__ 211 | replacement: /api/v1/nodes/${1}/proxy/metrics 212 | --- 213 | apiVersion: monitoring.coreos.com/v1 214 | kind: Alertmanager 215 | metadata: 216 | name: alertmanager 217 | labels: 218 | app: prometheus 219 | namespace: prometheus 220 | spec: 221 | replicas: 1 222 | --- 223 | kind: Service 224 | apiVersion: v1 225 | metadata: 226 
| labels: 227 | app: prometheus 228 | name: alertmanager 229 | namespace: prometheus 230 | spec: 231 | type: ClusterIP 232 | ports: 233 | - port: 80 234 | name: http 235 | protocol: TCP 236 | targetPort: web 237 | selector: 238 | alertmanager: alertmanager 239 | --- 240 | apiVersion: networking.k8s.io/v1 241 | kind: Ingress 242 | metadata: 243 | name: alertmanager 244 | namespace: prometheus 245 | spec: 246 | ingressClassName: nginx 247 | rules: 248 | - host: alertmanager.f12i.io 249 | http: 250 | paths: 251 | - backend: 252 | service: 253 | name: alertmanager 254 | port: 255 | number: 80 256 | path: / 257 | pathType: Prefix 258 | --- 259 | kind: Secret 260 | apiVersion: v1 261 | metadata: 262 | name: alertmanager-alertmanager 263 | labels: 264 | app: prometheus 265 | namespace: prometheus 266 | type: Opaque 267 | stringData: 268 | alertmanager.yaml: | 269 | global: 270 | slack_api_url: https://hooks.slack.com/services/change/me/please 271 | route: 272 | receiver: slack 273 | receivers: 274 | - name: slack 275 | slack_configs: 276 | - channel: "#strimzi-alerts" 277 | title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" 278 | text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}" 279 | send_resolved: true 280 | -------------------------------------------------------------------------------- /sessions/002/install/022-prometheus-strimzi.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PodMonitor 3 | metadata: 4 | name: strimzi-cluster-operator-metrics-${NAMESPACE} 5 | labels: 6 | app: strimzi 7 | prometheus: prometheus 8 | namespace: prometheus 9 | spec: 10 | selector: 11 | matchLabels: 12 | strimzi.io/kind: cluster-operator 13 | namespaceSelector: 14 | matchNames: 15 | - ${NAMESPACE} 16 | podMetricsEndpoints: 17 | - path: /metrics 18 | port: http 19 | --- 20 | apiVersion: monitoring.coreos.com/v1 21 | kind: PodMonitor 22 | metadata: 23 | name: strimzi-entity-operator-metrics-${NAMESPACE} 24 | labels: 25 | app: strimzi 26 | prometheus: prometheus 27 | namespace: prometheus 28 | spec: 29 | selector: 30 | matchLabels: 31 | app.kubernetes.io/name: entity-operator 32 | namespaceSelector: 33 | matchNames: 34 | - ${NAMESPACE} 35 | podMetricsEndpoints: 36 | - path: /metrics 37 | port: healthcheck 38 | --- 39 | apiVersion: monitoring.coreos.com/v1 40 | kind: PodMonitor 41 | metadata: 42 | name: strimzi-bridge-metrics-${NAMESPACE} 43 | labels: 44 | app: strimzi 45 | prometheus: prometheus 46 | namespace: prometheus 47 | spec: 48 | selector: 49 | matchLabels: 50 | strimzi.io/kind: KafkaBridge 51 | namespaceSelector: 52 | matchNames: 53 | - ${NAMESPACE} 54 | podMetricsEndpoints: 55 | - path: /metrics 56 | port: rest-api 57 | --- 58 | apiVersion: monitoring.coreos.com/v1 59 | kind: PodMonitor 60 | metadata: 61 | name: strimzi-kafka-and-cruise-control-metrics-${NAMESPACE} 62 | labels: 63 | app: strimzi 64 | prometheus: prometheus 65 | namespace: prometheus 66 | spec: 67 | selector: 68 | matchExpressions: 69 | - key: "strimzi.io/kind" 70 | operator: In 71 | values: ["Kafka", "KafkaConnect", "KafkaConnectS2I", "KafkaMirrorMaker", "KafkaMirrorMaker2"] 72 | namespaceSelector: 73 | matchNames: 74 | - ${NAMESPACE} 75 | podMetricsEndpoints: 76 | - path: /metrics 77 | port: tcp-prometheus 78 | relabelings: 79 | - separator: ; 80 | regex: __meta_kubernetes_pod_label_(strimzi_io_.+) 81 | replacement: $1 82 | action: labelmap 83 | - sourceLabels: [__meta_kubernetes_namespace] 84 | separator: ; 85 | 
regex: (.*) 86 | targetLabel: namespace 87 | replacement: $1 88 | action: replace 89 | - sourceLabels: [__meta_kubernetes_pod_name] 90 | separator: ; 91 | regex: (.*) 92 | targetLabel: kubernetes_pod_name 93 | replacement: $1 94 | action: replace 95 | - sourceLabels: [__meta_kubernetes_pod_node_name] 96 | separator: ; 97 | regex: (.*) 98 | targetLabel: node_name 99 | replacement: $1 100 | action: replace 101 | - sourceLabels: [__meta_kubernetes_pod_host_ip] 102 | separator: ; 103 | regex: (.*) 104 | targetLabel: node_ip 105 | replacement: $1 106 | action: replace 107 | -------------------------------------------------------------------------------- /sessions/002/install/031-grafana-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: grafana 5 | labels: 6 | app: grafana 7 | --- 8 | apiVersion: integreatly.org/v1alpha1 9 | kind: Grafana 10 | metadata: 11 | name: grafana 12 | labels: 13 | app: grafana 14 | namespace: grafana 15 | spec: 16 | service: {} 17 | config: 18 | log: 19 | mode: "console" 20 | level: "warn" 21 | security: 22 | admin_user: "admin" 23 | admin_password: "admin" 24 | auth: 25 | disable_login_form: False 26 | disable_signout_menu: True 27 | auth.anonymous: 28 | enabled: True 29 | dashboardLabelSelector: 30 | - matchExpressions: 31 | - {key: grafana, operator: In, values: [dashabord]} 32 | --- 33 | kind: Service 34 | apiVersion: v1 35 | metadata: 36 | name: grafana 37 | labels: 38 | app: grafana 39 | namespace: grafana 40 | spec: 41 | type: ClusterIP 42 | ports: 43 | - name: grafana 44 | protocol: TCP 45 | port: 80 46 | targetPort: grafana-http 47 | selector: 48 | app: grafana 49 | --- 50 | apiVersion: networking.k8s.io/v1 51 | kind: Ingress 52 | metadata: 53 | name: grafana 54 | namespace: grafana 55 | spec: 56 | ingressClassName: nginx 57 | rules: 58 | - host: grafana.f12i.io 59 | http: 60 | paths: 61 | - backend: 62 | service: 63 | name: grafana 64 | port: 65 | number: 80 66 | path: / 67 | pathType: Prefix 68 | --- 69 | apiVersion: integreatly.org/v1alpha1 70 | kind: GrafanaDataSource 71 | metadata: 72 | name: prometheus 73 | labels: 74 | app: grafana 75 | namespace: grafana 76 | spec: 77 | name: prometheus.yaml 78 | datasources: 79 | - name: Prometheus 80 | type: prometheus 81 | access: proxy 82 | url: http://prometheus.prometheus.svc:80 83 | isDefault: true 84 | version: 1 85 | editable: true 86 | jsonData: 87 | tlsSkipVerify: true 88 | timeInterval: "5s" 89 | -------------------------------------------------------------------------------- /sessions/002/install/032-grafana-strimzi.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: integreatly.org/v1alpha1 2 | kind: GrafanaDashboard 3 | metadata: 4 | name: strimzi-operators 5 | labels: 6 | app: grafana 7 | grafana: dashabord 8 | namespace: grafana 9 | spec: 10 | json: "" 11 | url: https://raw.githubusercontent.com/strimzi/strimzi-kafka-operator/${STRIMZI_VERSION}/examples/metrics/grafana-dashboards/strimzi-operators.json 12 | datasources: 13 | - inputName: "DS_PROMETHEUS" 14 | datasourceName: "Prometheus" 15 | --- 16 | apiVersion: integreatly.org/v1alpha1 17 | kind: GrafanaDashboard 18 | metadata: 19 | name: strimzi-kafka 20 | labels: 21 | app: grafana 22 | grafana: dashabord 23 | namespace: grafana 24 | spec: 25 | json: "" 26 | url: 
https://raw.githubusercontent.com/strimzi/strimzi-kafka-operator/${STRIMZI_VERSION}/packaging/examples/metrics/grafana-dashboards/strimzi-kafka.json 27 | datasources: 28 | - inputName: "DS_PROMETHEUS" 29 | datasourceName: "Prometheus" 30 | -------------------------------------------------------------------------------- /sessions/002/install/040-kube-state-metrics.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: kube-state-metrics 6 | name: kube-state-metrics 7 | namespace: kube-system 8 | --- 9 | apiVersion: rbac.authorization.k8s.io/v1 10 | kind: ClusterRole 11 | metadata: 12 | labels: 13 | app.kubernetes.io/name: kube-state-metrics 14 | name: kube-state-metrics 15 | rules: 16 | - apiGroups: 17 | - "" 18 | resources: 19 | - configmaps 20 | - secrets 21 | - nodes 22 | - pods 23 | - services 24 | - serviceaccounts 25 | - resourcequotas 26 | - replicationcontrollers 27 | - limitranges 28 | - persistentvolumeclaims 29 | - persistentvolumes 30 | - namespaces 31 | - endpoints 32 | verbs: 33 | - list 34 | - watch 35 | - apiGroups: 36 | - apps 37 | resources: 38 | - statefulsets 39 | - daemonsets 40 | - deployments 41 | - replicasets 42 | verbs: 43 | - list 44 | - watch 45 | - apiGroups: 46 | - batch 47 | resources: 48 | - cronjobs 49 | - jobs 50 | verbs: 51 | - list 52 | - watch 53 | - apiGroups: 54 | - autoscaling 55 | resources: 56 | - horizontalpodautoscalers 57 | verbs: 58 | - list 59 | - watch 60 | - apiGroups: 61 | - authentication.k8s.io 62 | resources: 63 | - tokenreviews 64 | verbs: 65 | - create 66 | - apiGroups: 67 | - authorization.k8s.io 68 | resources: 69 | - subjectaccessreviews 70 | verbs: 71 | - create 72 | - apiGroups: 73 | - policy 74 | resources: 75 | - poddisruptionbudgets 76 | verbs: 77 | - list 78 | - watch 79 | - apiGroups: 80 | - certificates.k8s.io 81 | resources: 82 | - certificatesigningrequests 83 | verbs: 84 | - list 85 | - watch 86 | - apiGroups: 87 | - discovery.k8s.io 88 | resources: 89 | - endpointslices 90 | verbs: 91 | - list 92 | - watch 93 | - apiGroups: 94 | - storage.k8s.io 95 | resources: 96 | - storageclasses 97 | - volumeattachments 98 | verbs: 99 | - list 100 | - watch 101 | - apiGroups: 102 | - admissionregistration.k8s.io 103 | resources: 104 | - mutatingwebhookconfigurations 105 | - validatingwebhookconfigurations 106 | verbs: 107 | - list 108 | - watch 109 | - apiGroups: 110 | - networking.k8s.io 111 | resources: 112 | - networkpolicies 113 | - ingressclasses 114 | - ingresses 115 | verbs: 116 | - list 117 | - watch 118 | - apiGroups: 119 | - coordination.k8s.io 120 | resources: 121 | - leases 122 | verbs: 123 | - list 124 | - watch 125 | - apiGroups: 126 | - rbac.authorization.k8s.io 127 | resources: 128 | - clusterrolebindings 129 | - clusterroles 130 | - rolebindings 131 | - roles 132 | verbs: 133 | - list 134 | - watch 135 | --- 136 | apiVersion: rbac.authorization.k8s.io/v1 137 | kind: ClusterRoleBinding 138 | metadata: 139 | labels: 140 | app.kubernetes.io/name: kube-state-metrics 141 | name: kube-state-metrics 142 | roleRef: 143 | apiGroup: rbac.authorization.k8s.io 144 | kind: ClusterRole 145 | name: kube-state-metrics 146 | subjects: 147 | - kind: ServiceAccount 148 | name: kube-state-metrics 149 | namespace: kube-system 150 | --- 151 | apiVersion: apps/v1 152 | kind: Deployment 153 | metadata: 154 | labels: 155 | app.kubernetes.io/name: kube-state-metrics 156 | name: kube-state-metrics 157 | namespace: 
kube-system 158 | spec: 159 | replicas: 1 160 | selector: 161 | matchLabels: 162 | app.kubernetes.io/name: kube-state-metrics 163 | template: 164 | metadata: 165 | labels: 166 | app.kubernetes.io/name: kube-state-metrics 167 | spec: 168 | automountServiceAccountToken: true 169 | containers: 170 | - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.8.2 171 | livenessProbe: 172 | httpGet: 173 | path: /healthz 174 | port: 8080 175 | initialDelaySeconds: 5 176 | timeoutSeconds: 5 177 | name: kube-state-metrics 178 | ports: 179 | - containerPort: 8080 180 | name: http-metrics 181 | - containerPort: 8081 182 | name: telemetry 183 | readinessProbe: 184 | httpGet: 185 | path: / 186 | port: 8081 187 | initialDelaySeconds: 5 188 | timeoutSeconds: 5 189 | securityContext: 190 | allowPrivilegeEscalation: false 191 | capabilities: 192 | drop: 193 | - ALL 194 | readOnlyRootFilesystem: true 195 | runAsNonRoot: true 196 | runAsUser: 65534 197 | seccompProfile: 198 | type: RuntimeDefault 199 | nodeSelector: 200 | kubernetes.io/os: linux 201 | serviceAccountName: kube-state-metrics 202 | --- 203 | apiVersion: v1 204 | kind: Service 205 | metadata: 206 | labels: 207 | app.kubernetes.io/name: kube-state-metrics 208 | name: kube-state-metrics 209 | namespace: kube-system 210 | spec: 211 | clusterIP: None 212 | ports: 213 | - name: http-metrics 214 | port: 8080 215 | targetPort: http-metrics 216 | - name: telemetry 217 | port: 8081 218 | targetPort: telemetry 219 | selector: 220 | app.kubernetes.io/name: kube-state-metrics 221 | --- 222 | apiVersion: monitoring.coreos.com/v1 223 | kind: PodMonitor 224 | metadata: 225 | name: kube-state-metrics 226 | labels: 227 | app.kubernetes.io/name: kube-state-metrics 228 | prometheus: prometheus 229 | namespace: prometheus 230 | spec: 231 | selector: 232 | matchLabels: 233 | app.kubernetes.io/name: kube-state-metrics 234 | namespaceSelector: 235 | matchNames: 236 | - kube-system 237 | podMetricsEndpoints: 238 | - path: /metrics 239 | port: http-metrics 240 | relabelings: 241 | - sourceLabels: [exported_pod] 242 | targetLabel: pod 243 | replacement: "$1" 244 | - sourceLabels: [exported_namespace] 245 | targetLabel: namespace 246 | replacement: "$1" 247 | - sourceLabels: [__address__] 248 | # Add the cluster label 249 | targetLabel: cluster 250 | replacement: "$CLUSTER_NAME" 251 | --- 252 | apiVersion: integreatly.org/v1alpha1 253 | kind: GrafanaDashboard 254 | metadata: 255 | name: kube-state-metrics 256 | labels: 257 | app: grafana 258 | grafana: dashabord 259 | namespace: grafana 260 | spec: 261 | json: "" 262 | url: https://grafana.com/api/dashboards/13332/revisions/12/download 263 | datasources: 264 | - inputName: "VAR_DATASOURCE" 265 | datasourceName: "Prometheus" 266 | -------------------------------------------------------------------------------- /sessions/002/install/050-node-exporter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: node-exporter 5 | labels: 6 | app: node-exporter 7 | namespace: prometheus 8 | spec: 9 | clusterIP: None 10 | ports: 11 | - name: scrape 12 | port: 9100 13 | protocol: TCP 14 | selector: 15 | app: node-exporter 16 | type: ClusterIP 17 | --- 18 | apiVersion: apps/v1 19 | kind: DaemonSet 20 | metadata: 21 | name: node-exporter 22 | labels: 23 | app: prometheus 24 | namespace: prometheus 25 | spec: 26 | selector: 27 | matchLabels: 28 | app: node-exporter 29 | template: 30 | metadata: 31 | labels: 32 | app: node-exporter 33 | 
name: node-exporter 34 | spec: 35 | containers: 36 | - image: prom/node-exporter 37 | name: node-exporter 38 | ports: 39 | - containerPort: 9100 40 | hostPort: 9100 41 | name: scrape 42 | hostNetwork: true 43 | hostPID: true 44 | --- 45 | apiVersion: monitoring.coreos.com/v1 46 | kind: ServiceMonitor 47 | metadata: 48 | labels: 49 | app: node-exporter 50 | prometheus: prometheus 51 | name: node-exporter 52 | namespace: prometheus 53 | spec: 54 | endpoints: 55 | - honorLabels: true 56 | port: scrape 57 | selector: 58 | matchLabels: 59 | app: node-exporter 60 | --- 61 | apiVersion: integreatly.org/v1alpha1 62 | kind: GrafanaDashboard 63 | metadata: 64 | name: node-exporter 65 | labels: 66 | app: grafana 67 | grafana: dashabord 68 | namespace: grafana 69 | spec: 70 | json: "" 71 | url: https://grafana.com/api/dashboards/1860/revisions/18/download 72 | -------------------------------------------------------------------------------- /sessions/003/README.md: -------------------------------------------------------------------------------- 1 | ## Get diagnostic data 2 | 3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes. 4 | 5 | When debugging issues, you usually need to retrieve various artifacts from the environment, which can be a lot of effort. 6 | Fortunately, Strimzi provides a must-gather script that can be used to download all relevant artifacts and logs from a specific Kafka cluster. 7 | 8 | > [!NOTE] 9 | > You can add the `--secrets=all` option to also get secret values. 10 | 11 | ```sh 12 | $ curl -s https://raw.githubusercontent.com/strimzi/strimzi-kafka-operator/main/tools/report.sh \ 13 | | bash -s -- --namespace=test --cluster=my-cluster --out-dir=~/Downloads 14 | deployments 15 | deployment.apps/my-cluster-entity-operator 16 | statefulsets 17 | replicasets 18 | replicaset.apps/my-cluster-entity-operator-bb7c65dd4 19 | configmaps 20 | configmap/my-cluster-broker-10 21 | configmap/my-cluster-broker-11 22 | configmap/my-cluster-broker-12 23 | configmap/my-cluster-controller-0 24 | configmap/my-cluster-controller-1 25 | configmap/my-cluster-controller-2 26 | configmap/my-cluster-entity-topic-operator-config 27 | configmap/my-cluster-entity-user-operator-config 28 | secrets 29 | secret/my-cluster-clients-ca 30 | secret/my-cluster-clients-ca-cert 31 | secret/my-cluster-cluster-ca 32 | secret/my-cluster-cluster-ca-cert 33 | secret/my-cluster-cluster-operator-certs 34 | secret/my-cluster-entity-topic-operator-certs 35 | secret/my-cluster-entity-user-operator-certs 36 | secret/my-cluster-kafka-brokers 37 | services 38 | service/my-cluster-kafka-bootstrap 39 | service/my-cluster-kafka-brokers 40 | poddisruptionbudgets 41 | poddisruptionbudget.policy/my-cluster-kafka 42 | roles 43 | role.rbac.authorization.k8s.io/my-cluster-entity-operator 44 | rolebindings 45 | rolebinding.rbac.authorization.k8s.io/my-cluster-entity-topic-operator-role 46 | rolebinding.rbac.authorization.k8s.io/my-cluster-entity-user-operator-role 47 | networkpolicies 48 | networkpolicy.networking.k8s.io/my-cluster-entity-operator 49 | networkpolicy.networking.k8s.io/my-cluster-network-policy-kafka 50 | pods 51 | pod/my-cluster-broker-10 52 | pod/my-cluster-broker-11 53 | pod/my-cluster-broker-12 54 | pod/my-cluster-controller-0 55 | pod/my-cluster-controller-1 56 | pod/my-cluster-controller-2 57 | pod/my-cluster-entity-operator-bb7c65dd4-9zdmk 58 | persistentvolumeclaims 59 | persistentvolumeclaim/data-my-cluster-broker-10 60 | persistentvolumeclaim/data-my-cluster-broker-11 61 | 
persistentvolumeclaim/data-my-cluster-broker-12 62 | persistentvolumeclaim/data-my-cluster-controller-0 63 | persistentvolumeclaim/data-my-cluster-controller-1 64 | persistentvolumeclaim/data-my-cluster-controller-2 65 | ingresses 66 | routes 67 | clusterroles 68 | clusterrole.rbac.authorization.k8s.io/strimzi-cluster-operator-global 69 | clusterrole.rbac.authorization.k8s.io/strimzi-cluster-operator-leader-election 70 | clusterrole.rbac.authorization.k8s.io/strimzi-cluster-operator-namespaced 71 | clusterrole.rbac.authorization.k8s.io/strimzi-cluster-operator-watched 72 | clusterrole.rbac.authorization.k8s.io/strimzi-entity-operator 73 | clusterrole.rbac.authorization.k8s.io/strimzi-kafka-broker 74 | clusterrole.rbac.authorization.k8s.io/strimzi-kafka-client 75 | clusterrolebindings 76 | clusterrolebinding.rbac.authorization.k8s.io/strimzi-cluster-operator 77 | clusterrolebinding.rbac.authorization.k8s.io/strimzi-cluster-operator-kafka-broker-delegation 78 | clusterrolebinding.rbac.authorization.k8s.io/strimzi-cluster-operator-kafka-client-delegation 79 | clusteroperator 80 | deployment.apps/strimzi-cluster-operator 81 | replicaset.apps/strimzi-cluster-operator-6596f469c9 82 | pod/strimzi-cluster-operator-6596f469c9-smsw2 83 | configmap/strimzi-cluster-operator 84 | draincleaner 85 | customresources 86 | kafkanodepools.kafka.strimzi.io 87 | broker 88 | controller 89 | kafkas.kafka.strimzi.io 90 | my-cluster 91 | kafkatopics.kafka.strimzi.io 92 | my-topic 93 | strimzipodsets.core.strimzi.io 94 | my-cluster-broker 95 | my-cluster-controller 96 | events 97 | logs 98 | my-cluster-broker-10 99 | my-cluster-broker-11 100 | my-cluster-broker-12 101 | my-cluster-controller-0 102 | my-cluster-controller-1 103 | my-cluster-controller-2 104 | my-cluster-entity-operator-bb7c65dd4-9zdmk 105 | Report file report-17-03-2025_12-26-05.zip created 106 | ``` 107 | 108 | ## Get heap dumps 109 | 110 | It is also possible to collect broker JVM heap dumps and other advanced diagnostic data (thread dumps, flame graphs, etc). 111 | 112 | > [!WARNING] 113 | > Taking a heap dump is a heavy operation that can cause the Java application to hang. 114 | > It is not recommended in production, unless it is not possible to reproduce the memory issue in a test environment. 115 | 116 | Debugging locally can often be easier and faster. 117 | However, some issues only manifest in Kubernetes due to factors like networking, resource limits, or interactions with other components. 118 | Even if you try to match your local setup to the Kubernetes configuration, subtle differences (e.g. service discovery, security settings, or operator-managed logic) might lead to different behavior. 119 | 120 | Create an additional volume of the desired size using a PVC. 121 | 122 | ```sh 123 | $ kubectl create -f - < [!WARNING] 143 | > Adding a custom volume triggers pod restarts, which can make it difficult to capture an issue that has already occurred. 144 | > If the issue cannot be easily reproduced in a test environment, configuring the volume in advance could help avoid the pod restarts when you need them most. 
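For reference, this is a minimal sketch of a PVC matching the `my-pvc` claim name used by the patch below; the 1Gi size and the default storage class are assumptions, so size it according to the expected dump size.

```sh
$ kubectl create -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: my-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
EOF
```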
145 | 146 | ```sh 147 | $ kubectl patch k my-cluster --type merge -p ' 148 | spec: 149 | kafka: 150 | template: 151 | pod: 152 | volumes: 153 | - name: my-volume 154 | persistentVolumeClaim: 155 | claimName: my-pvc 156 | kafkaContainer: 157 | volumeMounts: 158 | - name: my-volume 159 | mountPath: "/mnt/data"' 160 | kafka.kafka.strimzi.io/my-cluster patched 161 | ``` 162 | 163 | When the rolling update completes, create a broker heap dump and copy the output file to localhost. 164 | 165 | ```sh 166 | $ PID="$(kubectl exec my-cluster-broker-10 -- jcmd | grep "kafka.Kafka" | awk '{print $1}')" 167 | 168 | $ kubectl exec my-cluster-broker-10 -- jcmd "$PID" VM.flags 169 | 724: 170 | -XX:CICompilerCount=4 -XX:ConcGCThreads=3 -XX:G1ConcRefinementThreads=10 -XX:G1EagerReclaimRemSetThreshold=32 -XX:G1HeapRegionSize=4194304 171 | -XX:GCDrainStackTargetSize=64 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/data/oome.hprof -XX:InitialHeapSize=5368709120 172 | -XX:+ManagementServer -XX:MarkStackSize=4194304 -XX:MaxHeapSize=5368709120 -XX:MaxNewSize=3221225472 -XX:MinHeapDeltaBytes=4194304 173 | -XX:MinHeapSize=5368709120 -XX:NonNMethodCodeHeapSize=5839372 -XX:NonProfiledCodeHeapSize=122909434 -XX:ProfiledCodeHeapSize=122909434 174 | -XX:ReservedCodeCacheSize=251658240 -XX:+SegmentedCodeCache -XX:SoftMaxHeapSize=5368709120 -XX:-THPStackMitigation 175 | -XX:+UseCompressedClassPointers -XX:+UseCompressedOops -XX:+UseFastUnorderedTimeStamps -XX:+UseG1GC 176 | 177 | $ kubectl exec my-cluster-broker-10 -- jcmd "$PID" GC.heap_dump /mnt/data/heap.hprof 178 | 724: 179 | Dumping heap to /mnt/data/heap.hprof ... 180 | Heap dump file created [179236580 bytes in 0.664 secs] 181 | 182 | $ kubectl cp my-cluster-broker-10:/mnt/data/heap.hprof "$HOME"/Downloads/heap.hprof 183 | tar: Removing leading `/' from member names 184 | ``` 185 | 186 | If the pod is crash looping, the dump can still be recovered by spinning up a temporary pod and mounting the volume. 187 | 188 | ```sh 189 | $ kubectl run my-pod --restart "Never" --image "foo" --overrides "{ 190 | \"spec\": { 191 | \"containers\": [ 192 | { 193 | \"name\": \"busybox\", 194 | \"image\": \"busybox\", 195 | \"imagePullPolicy\": \"IfNotPresent\", 196 | \"command\": [\"/bin/sh\", \"-c\", \"trap : TERM INT; sleep infinity & wait\"], 197 | \"volumeMounts\": [ 198 | {\"name\": \"data\", \"mountPath\": \"/mnt/data\"} 199 | ] 200 | } 201 | ], 202 | \"volumes\": [ 203 | {\"name\": \"data\", \"persistentVolumeClaim\": {\"claimName\": \"my-pvc\"}} 204 | ] 205 | } 206 | }" 207 | 208 | $ kubectl exec my-pod -- ls -lh /mnt/data 209 | total 171M 210 | -rw------- 1 1001 root 170.9M Mar 17 14:38 heap.hprof 211 | ``` 212 | 213 | For the heap dump analysis you can use a tool like Eclipse Memory Analyzer. 214 | -------------------------------------------------------------------------------- /sessions/004/README.md: -------------------------------------------------------------------------------- 1 | ## Configure TLS authentication 2 | 3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes. 4 | 5 | We also add an external listener of type ingress with TLS authentication. 6 | Then, wait for the Cluster Operator to restart all pods one by one (rolling update). 7 | 8 | > [!IMPORTANT] 9 | > You need to enable the Nginx ingress controller with `--enable-ssl-passthrough` flag, and add ingress mappings to `/etc/hosts`. 
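On Minikube, a minimal sketch of both steps could look like the following (it assumes the ingress addon, whose controller runs as the `ingress-nginx-controller` deployment in the `ingress-nginx` namespace; adjust the names for other Nginx ingress installations):

```sh
# enable the ingress addon and append the SSL passthrough flag to the controller arguments
$ minikube addons enable ingress
$ kubectl patch deploy ingress-nginx-controller -n ingress-nginx --type json \
  -p '[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--enable-ssl-passthrough"}]'

# map the bootstrap and broker ingress hosts to the Minikube IP
$ echo "$(minikube ip) bootstrap.my-cluster.f12i.io broker-10.my-cluster.f12i.io broker-11.my-cluster.f12i.io broker-12.my-cluster.f12i.io" \
  | sudo tee -a /etc/hosts
```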
10 | 
11 | ```sh
12 | $ kubectl create -f sessions/004/install.yaml \
13 | && kubectl patch k my-cluster --type merge -p '
14 | spec:
15 | kafka:
16 | listeners:
17 | - name: external
18 | port: 9094
19 | type: ingress
20 | tls: true
21 | authentication:
22 | type: tls
23 | configuration:
24 | class: nginx
25 | hostTemplate: broker-{nodeId}.my-cluster.f12i.io
26 | bootstrap:
27 | host: bootstrap.my-cluster.f12i.io'
28 | kafkauser.kafka.strimzi.io/my-user created
29 | kafka.kafka.strimzi.io/my-cluster patched
30 | ```
31 | 
32 | The previous command adds a new authentication element to the external listener, which is the endpoint used by clients connecting from outside using TLS.
33 | It also creates a Kafka user resource with a matching configuration.
34 | 
35 | ```sh
36 | $ kubectl get ingress
37 | NAME CLASS HOSTS ADDRESS PORTS AGE
38 | my-cluster-broker-10 nginx broker-10.my-cluster.f12i.io 192.168.49.2 80, 443 104s
39 | my-cluster-broker-11 nginx broker-11.my-cluster.f12i.io 192.168.49.2 80, 443 104s
40 | my-cluster-broker-12 nginx broker-12.my-cluster.f12i.io 192.168.49.2 80, 443 104s
41 | my-cluster-kafka-bootstrap nginx bootstrap.my-cluster.f12i.io 192.168.49.2 80, 443 104s
42 | 
43 | $ kubectl get ku my-user -o yaml | yq .spec
44 | authentication:
45 | type: tls
46 | ```
47 | 
48 | When the rolling update is completed, you should be able to see the broker certificate by running the following command.
49 | 
50 | ```sh
51 | $ openssl s_client -connect bootstrap.my-cluster.f12i.io:443 -servername bootstrap.my-cluster.f12i.io -showcerts
52 | ...
53 | Server certificate
54 | subject=O=io.strimzi, CN=my-cluster-kafka
55 | issuer=O=io.strimzi, CN=cluster-ca v0
56 | ...
57 | ```
58 | 
59 | Then, we can try to send some messages using an external Kafka client.
60 | Here we are using the console producer tool included in every Kafka distribution.
61 | 
62 | ```sh
63 | $ export BOOTSTRAP_SERVERS=$(kubectl get k my-cluster -o yaml | yq '.status.listeners.[] | select(.name == "external").bootstrapServers'); \
64 | kubectl get k my-cluster -o yaml | yq '.status.listeners.[] | select(.name == "external").certificates[0]' > /tmp/cluster-ca.crt ; \
65 | kubectl get secret my-user -o jsonpath="{.data['user\.crt']}" | base64 -d > /tmp/user.crt ; \
66 | kubectl get secret my-user -o jsonpath="{.data['user\.key']}" | base64 -d > /tmp/user.key
67 | 
68 | $ CLUSTER_CA_CRT=$(</tmp/cluster-ca.crt) \
69 |   USER_CRT=$(</tmp/user.crt) \
70 |   USER_KEY=$(</tmp/user.key)
71 | 
72 | $ cat <<EOF >/tmp/client.properties
73 | security.protocol=SSL
74 | ssl.truststore.type=PEM
75 | ssl.truststore.certificates=$CLUSTER_CA_CRT
76 | ssl.keystore.type=PEM
77 | ssl.keystore.certificate.chain=$USER_CRT
78 | ssl.keystore.key=$USER_KEY
79 | EOF
80 | 
81 | $ $KAFKA_HOME/bin/kafka-console-producer.sh --bootstrap-server $BOOTSTRAP_SERVERS --topic my-topic --producer.config /tmp/client.properties
82 | >hello
83 | >world
84 | >^C
85 | 
86 | $ $KAFKA_HOME/bin/kafka-console-consumer.sh --bootstrap-server $BOOTSTRAP_SERVERS --topic my-topic --from-beginning --max-messages 2 --consumer.config /tmp/client.properties
87 | hello
88 | world
89 | Processed a total of 2 messages
90 | ```
91 | 
92 | When dealing with TLS issues, it is useful to look inside the certificate to verify its configuration and expiration.
93 | For example, let's get the cluster CA certificate which is used to sign all server certificates.
94 | We can use `kubectl` to do so, but let's suppose we have a must-gather script output.
95 | Use the command from [the previous session](/sessions/003) to generate a new report from the current cluster.
96 | 97 | ```sh 98 | $ unzip -p ~/Downloads/report-12-10-2024_11-31-59.zip reports/secrets/my-cluster-cluster-ca-cert.yaml \ 99 | | yq '.data."ca.crt"' | base64 -d | openssl x509 -inform pem -noout -text 100 | Certificate: 101 | Data: 102 | Version: 3 (0x2) 103 | Serial Number: 104 | 26:9e:a1:7d:4d:34:cb:6b:ec:98:03:46:fb:7a:82:ad:68:80:bd:8e 105 | Signature Algorithm: sha512WithRSAEncryption 106 | Issuer: O=io.strimzi, CN=cluster-ca v0 107 | Validity 108 | Not Before: Sep 8 16:28:42 2022 GMT 109 | Not After : Sep 8 16:28:42 2023 GMT 110 | Subject: O=io.strimzi, CN=cluster-ca v0 111 | Subject Public Key Info: 112 | Public Key Algorithm: rsaEncryption 113 | Public-Key: (4096 bit) 114 | Modulus: 115 | ... 116 | Exponent: 65537 (0x10001) 117 | X509v3 extensions: 118 | X509v3 Subject Key Identifier: 119 | 2D:1D:63:F6:20:57:33:7D:59:73:DF:15:74:A2:A8:3D:E1:5B:3E:38 120 | X509v3 Basic Constraints: critical 121 | CA:TRUE, pathlen:0 122 | X509v3 Key Usage: critical 123 | Certificate Sign, CRL Sign 124 | Signature Algorithm: sha512WithRSAEncryption 125 | Signature Value: 126 | ... 127 | ``` 128 | 129 | If this is not enough to spot the issue, we can add the `-Djavax.net.debug=ssl:handshake` Java option to the client in order to get more details. 130 | As an additional exercise, try to get the clients CA and user certificates to verify if the first signs the second. 131 | 132 | ## Use custom TLS certificates 133 | 134 | Often, security policies don't allow you to run a Kafka cluster with self-signed certificates in production. 135 | Configure the listeners to use a custom certificate signed by an external or well-known CA. 136 | 137 | Custom certificates are not managed by the operator, so you will be in charge of the renewal process, which requires an update to the listener secret. 138 | A rolling update will start automatically in order to make the new certificate available. 139 | This example only shows TLS encryption, but you can add a custom client certificate for TLS authentication by setting `type: tls-external` in the `KafkaUser` custom resource and creating the user secret (subject can only contain `CN=$USER_NAME`). 140 | 141 | Typically, the security team will provide a certificate bundle which includes the whole trust chain (i.e. root CA + intermediate CA + listener certificate) and a private key. 142 | If that's not the case, you can easily create the bundle from individual certificates in PEM format, because you need to trust the whole chain, if any. 143 | 144 | ```sh 145 | $ cat /tmp/listener.crt /tmp/intermca.crt /tmp/rootca.crt >/tmp/bundle.crt 146 | ``` 147 | 148 | Here we generate our own certificate bundle with only one self-signed certificate, pretending it was handed over by the security team. 149 | We also use a wildcard certificate so that we don't need to specify all broker SANs. 150 | 151 | > [!IMPORTANT] 152 | > The custom server certificate for a listener must not be a CA and it must include a SAN for each broker address, plus one for the bootstrap address. 153 | > Alternatively, you can use a wildcard certificate to include all addresses with one SAN entry. 
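Before configuring the listener, it is worth checking that the bundle you were given actually satisfies these constraints, for example by looking at its subject, expiration, and SAN entries (here we use `/tmp/bundle.crt`, the file created in the next step):

```sh
$ openssl x509 -in /tmp/bundle.crt -noout -subject -enddate
$ openssl x509 -in /tmp/bundle.crt -noout -text | grep -A1 "Subject Alternative Name"
```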
154 | 
155 | ```sh
156 | $ CONFIG="
157 | [req]
158 | prompt=no
159 | distinguished_name=dn
160 | x509_extensions=ext
161 | [dn]
162 | countryName=IT
163 | stateOrProvinceName=Rome
164 | organizationName=Fede
165 | commonName=my-cluster
166 | [ext]
167 | subjectAltName=@san
168 | [san]
169 | DNS.1=*.my-cluster.f12i.io
170 | " && openssl genrsa -out /tmp/listener.key 2048 \
171 | && openssl req -new -x509 -days 3650 -key /tmp/listener.key -out /tmp/bundle.crt -config <(echo "$CONFIG")
172 | ```
173 | 
174 | Now we [deploy the Strimzi Cluster Operator and Kafka cluster](/sessions/001), and set the external listener.
175 | Then, we deploy the secret containing the custom certificate and update the Kafka cluster configuration by adding a reference to that secret.
176 | 
177 | ```sh
178 | $ kubectl create secret generic ext-listener-crt \
179 | --from-file=/tmp/bundle.crt --from-file=/tmp/listener.key
180 | secret/ext-listener-crt created
181 | 
182 | $ kubectl patch k my-cluster --type merge -p '
183 | spec:
184 | kafka:
185 | listeners:
186 | - name: external
187 | port: 9094
188 | type: ingress
189 | tls: true
190 | configuration:
191 | class: nginx
192 | hostTemplate: broker-{nodeId}.my-cluster.f12i.io
193 | bootstrap:
194 | host: bootstrap.my-cluster.f12i.io
195 | brokerCertChainAndKey:
196 | secretName: ext-listener-crt
197 | certificate: bundle.crt
198 | key: listener.key'
199 | kafka.kafka.strimzi.io/my-cluster patched
200 | ```
201 | 
202 | When the rolling update is completed, clients just need to trust the external CA and they will be able to connect.
203 | In our case, we don't have a CA, so we just need to trust the self-signed certificate.
204 | 
205 | ```sh
206 | $ PUBLIC_CRT=$(</tmp/bundle.crt)
207 | 
208 | $ cat <<EOF >/tmp/client.properties
209 | security.protocol=SSL
210 | ssl.truststore.type=PEM
211 | ssl.truststore.certificates=$PUBLIC_CRT
212 | EOF
213 | 
214 | $ $KAFKA_HOME/bin/kafka-console-producer.sh --bootstrap-server $BOOTSTRAP_SERVERS --topic my-topic --producer.config /tmp/client.properties
215 | >hello
216 | >world
217 | >^C
218 | ```
219 | 
--------------------------------------------------------------------------------
/sessions/004/install.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kafka.strimzi.io/v1beta2
2 | kind: KafkaUser
3 | metadata:
4 | name: my-user
5 | labels:
6 | strimzi.io/cluster: my-cluster
7 | spec:
8 | authentication:
9 | type: tls
10 | 
--------------------------------------------------------------------------------
/sessions/005/README.md:
--------------------------------------------------------------------------------
1 | ## Use Kafka with Apicurio Registry
2 | 
3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes.
4 | 
5 | When the Kafka cluster is ready, we deploy the Apicurio Registry operator.
6 | 7 | ```sh 8 | $ envsubst < sessions/005/install/apicurio.yaml | kubectl create -f - 9 | customresourcedefinition.apiextensions.k8s.io/apicurioregistries.registry.apicur.io created 10 | serviceaccount/apicurio-registry-operator created 11 | role.rbac.authorization.k8s.io/apicurio-registry-operator-leader-election-role created 12 | clusterrole.rbac.authorization.k8s.io/apicurio-registry-operator-role created 13 | rolebinding.rbac.authorization.k8s.io/apicurio-registry-operator-leader-election-rolebinding created 14 | clusterrolebinding.rbac.authorization.k8s.io/apicurio-registry-operator-rolebinding created 15 | deployment.apps/apicurio-registry-operator created 16 | ``` 17 | 18 | After that we deploy our registry instance with in-memory storage system and check the result. 19 | 20 | ```sh 21 | $ kubectl create -f registry.yaml 22 | apicurioregistry.registry.apicur.io/my-schema-registry created 23 | 24 | $ kubectl get po 25 | NAME READY STATUS RESTARTS AGE 26 | apicurio-registry-operator-9448ffc74-b6whl 1/1 Running 0 69s 27 | my-cluster-broker-10 1/1 Running 0 4m54s 28 | my-cluster-broker-11 1/1 Running 0 4m27s 29 | my-cluster-broker-12 1/1 Running 0 5m19s 30 | my-cluster-controller-0 1/1 Running 0 7m32s 31 | my-cluster-controller-1 1/1 Running 0 7m32s 32 | my-cluster-controller-2 1/1 Running 0 7m32s 33 | my-cluster-entity-operator-67b8cc5c87-74qlb 2/2 Running 0 6m59s 34 | my-schema-registry-deployment-858c7dc76b-gjkcs 1/1 Running 0 66s 35 | strimzi-cluster-operator-d78fd875b-dcjxw 1/1 Running 0 8m36s 36 | ``` 37 | 38 | Now, we export some connection parameters and register the test Avro message schema. 39 | 40 | > [!NOTE] 41 | > In addition to the REST API, the registry also provides a web interface for handling schemas and set rules. 42 | > This is accessible using the auto-generated ingress address. 43 | 44 | The artifact `id` convention for the mapping is to combine the topic name with the key or value, depending on whether the serializer is used for the message key or value. 45 | The generated `globalId` is then stored in the message headers and used to lookup the schema when consuming messages. 46 | Different schema `version`s use the same artifact `id`, but have different `globalId`s. 47 | 48 | ```sh 49 | $ export BOOTSTRAP_SERVERS=$(kubectl get k my-cluster -o yaml | yq '.status.listeners.[] | select(.name == "plain").bootstrapServers') \ 50 | REGISTRY_URL=http://$(kubectl get apicurioregistries my-schema-registry -o jsonpath="{.status.info.host}")/apis/registry/v2 \ 51 | ARTIFACT_GROUP="default" \ 52 | TOPIC_NAME="my-topic" 53 | 54 | $ curl -s -X POST -H "Content-Type: application/json" \ 55 | -H "X-Registry-ArtifactId: my-topic-value" -H "X-Registry-ArtifactType: AVRO" \ 56 | -d @sessions/005/install/greeting.avsc \ 57 | "$REGISTRY_URL/groups/default/artifacts?ifExists=RETURN_OR_UPDATE" | yq -o json 58 | { 59 | "name": "Greeting", 60 | "createdBy": "", 61 | "createdOn": "2025-03-24T07:26:33+0000", 62 | "modifiedBy": "", 63 | "modifiedOn": "2025-03-24T07:26:33+0000", 64 | "id": "my-topic-value", 65 | "version": "1", 66 | "type": "AVRO", 67 | "globalId": 1, 68 | "state": "ENABLED", 69 | "contentId": 1, 70 | "references": [] 71 | } 72 | ``` 73 | 74 | At this point, we can start the application and observe its output. 
75 | 76 | ```sh 77 | $ envsubst < sessions/005/install/application.yaml | kubectl create -f - 78 | deployment.apps/kafka-avro created 79 | 80 | $ kubectl logs -f $(kubectl get po -l app=kafka-avro -o name) 81 | Producing records 82 | Records produced 83 | Consuming all records 84 | Record: Hello-1742801335037 85 | Record: Hello-1742801335160 86 | Record: Hello-1742801335160 87 | Record: Hello-1742801335161 88 | Record: Hello-1742801335161 89 | ``` 90 | 91 | If we now look at one of the messages, we see that the `globalId` is stored in the message headers and used for the schema lookup when consuming messages. 92 | 93 | ```sh 94 | $ kubectl exec my-cluster-broker-10 -- bin/kafka-dump-log.sh --deep-iteration --print-data-log \ 95 | --files /var/lib/kafka/data/kafka-log10/my-topic-0/00000000000000000000.log | tail -n2 96 | | offset: 15 CreateTime: 1742802014915 keySize: -1 valueSize: 12 sequence: 4 headerKeys: [apicurio.value.globalId,apicurio.value.encoding] payload: 97 | Hello????e 98 | ``` 99 | 100 | Finally, we can use the REST API to look at the schema content and metadata, which may be useful for debugging. 101 | 102 | ```sh 103 | $ curl -s "$REGISTRY_URL/search/artifacts" | yq -o json 104 | { 105 | "artifacts": [ 106 | { 107 | "id": "my-topic-value", 108 | "name": "Greeting", 109 | "createdOn": "2025-03-24T07:26:33+0000", 110 | "createdBy": "", 111 | "type": "AVRO", 112 | "state": "ENABLED", 113 | "modifiedOn": "2025-03-24T07:26:33+0000", 114 | "modifiedBy": "" 115 | } 116 | ], 117 | "count": 1 118 | } 119 | 120 | $ curl -s "$REGISTRY_URL/groups/default/artifacts/my-topic-value" | yq -o json 121 | { 122 | "type": "record", 123 | "name": "Greeting", 124 | "fields": [ 125 | { 126 | "name": "Message", 127 | "type": "string" 128 | }, 129 | { 130 | "name": "Time", 131 | "type": "long" 132 | } 133 | ] 134 | } 135 | 136 | $ curl -s "$REGISTRY_URL/groups/default/artifacts/my-topic-value/meta" | yq -o json 137 | { 138 | "name": "Greeting", 139 | "createdBy": "", 140 | "createdOn": "2025-03-24T07:26:33+0000", 141 | "modifiedBy": "", 142 | "modifiedOn": "2025-03-24T07:26:33+0000", 143 | "id": "my-topic-value", 144 | "version": "1", 145 | "type": "AVRO", 146 | "globalId": 1, 147 | "state": "ENABLED", 148 | "contentId": 1, 149 | "references": [] 150 | } 151 | ``` 152 | -------------------------------------------------------------------------------- /sessions/005/install/application.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: kafka-avro 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: kafka-avro 10 | template: 11 | metadata: 12 | labels: 13 | app: kafka-avro 14 | spec: 15 | containers: 16 | - name: kafka-avro 17 | image: ghcr.io/fvaleri/kafka-avro:latest 18 | imagePullPolicy: Always 19 | securityContext: 20 | allowPrivilegeEscalation: false 21 | capabilities: 22 | drop: 23 | - ALL 24 | runAsNonRoot: true 25 | seccompProfile: 26 | type: RuntimeDefault 27 | env: 28 | - name: BOOTSTRAP_SERVERS 29 | value: "${BOOTSTRAP_SERVERS}" 30 | - name: REGISTRY_URL 31 | value: "${REGISTRY_URL}" 32 | - name: ARTIFACT_GROUP 33 | value: "${ARTIFACT_GROUP}" 34 | - name: TOPIC_NAME 35 | value: "${TOPIC_NAME}" 36 | -------------------------------------------------------------------------------- /sessions/005/install/greeting.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Greeting", 4 | "fields": [{ 5 | "name": 
"Message", 6 | "type": "string" 7 | }, { 8 | "name": "Time", 9 | "type": "long" 10 | }] 11 | } 12 | -------------------------------------------------------------------------------- /sessions/005/install/registry.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: registry.apicur.io/v1 2 | kind: ApicurioRegistry 3 | metadata: 4 | name: my-schema-registry 5 | spec: 6 | configuration: 7 | persistence: mem 8 | deployment: 9 | host: my-schema-registry.f12i.io 10 | -------------------------------------------------------------------------------- /sessions/006/README.md: -------------------------------------------------------------------------------- 1 | ## Use Kafka Connect with Debezium 2 | 3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes. 4 | When the cluster is ready, we deploy a MySQL instance (the external system), and Kafka Connect cluster. 5 | 6 | > [!IMPORTANT] 7 | > The Kafka Connect image uses Kaniko to build a custom image containing the configured MySQL connector. 8 | > In production, this is not recommended, so you should use your own Connect image built from the Strimzi one. 9 | 10 | ```sh 11 | $ kubectl create -f sessions/006/install/mysql.yaml \ 12 | && kubectl wait --for=condition=Ready pod -l app=my-mysql --timeout=300s \ 13 | && kubectl exec my-mysql-0 -- sh -c 'mysql -u root < /tmp/sql/initdb.sql' 14 | persistentvolumeclaim/my-mysql-data created 15 | configmap/my-mysql-cfg created 16 | configmap/my-mysql-env created 17 | configmap/my-mysql-init created 18 | statefulset.apps/my-mysql created 19 | service/my-mysql-svc created 20 | pod/my-mysql-0 condition met 21 | 22 | $ kubectl create -f sessions/006/install/connect.yaml 23 | kafkaconnect.kafka.strimzi.io/my-connect-cluster created 24 | kafkaconnector.kafka.strimzi.io/mysql-source-connector created 25 | 26 | $ kubectl get po,kt,kctr 27 | NAME READY STATUS RESTARTS AGE 28 | pod/my-cluster-broker-10 1/1 Running 0 6m1s 29 | pod/my-cluster-broker-11 1/1 Running 0 6m1s 30 | pod/my-cluster-broker-12 1/1 Running 0 6m1s 31 | pod/my-cluster-controller-0 1/1 Running 0 6m1s 32 | pod/my-cluster-controller-1 1/1 Running 0 6m1s 33 | pod/my-cluster-controller-2 1/1 Running 0 6m1s 34 | pod/my-cluster-entity-operator-7bc799c449-8jxmb 2/2 Running 0 5m27s 35 | pod/my-connect-cluster-connect-0 1/1 Running 0 2m46s 36 | pod/my-mysql-0 1/1 Running 0 4m19s 37 | pod/strimzi-cluster-operator-d78fd875b-q9sds 1/1 Running 0 6m30s 38 | 39 | NAME CLUSTER PARTITIONS REPLICATION FACTOR READY 40 | kafkatopic.kafka.strimzi.io/my-topic my-cluster 3 3 True 41 | 42 | NAME CLUSTER CONNECTOR CLASS MAX TASKS READY 43 | kafkaconnector.kafka.strimzi.io/mysql-source-connector my-connect-cluster io.debezium.connector.mysql.MySqlConnector 1 True 44 | ``` 45 | 46 | As you may have guessed at this point, we are going to emit MySQL row changes and import them into Kafka, so that other applications can pick them up and process them. 47 | Let's check if the connector and its tasks are running fine by using the `KafkaConnector` resource, which is easier than interacting via REST requests. 
48 | 49 | ```sh 50 | $ kubectl get kctr mysql-source-connector -o yaml | yq .status 51 | conditions: 52 | - lastTransitionTime: "2024-10-28T10:53:20.123553787Z" 53 | status: "True" 54 | type: Ready 55 | connectorStatus: 56 | connector: 57 | state: RUNNING 58 | worker_id: my-connect-cluster-connect-0.my-connect-cluster-connect.test.svc:8083 59 | name: mysql-source-connector 60 | tasks: 61 | - id: 0 62 | state: RUNNING 63 | worker_id: my-connect-cluster-connect-0.my-connect-cluster-connect.test.svc:8083 64 | type: source 65 | observedGeneration: 1 66 | tasksMax: 1 67 | topics: 68 | - __debezium-heartbeat.my-mysql 69 | - my-mysq 70 | ``` 71 | 72 | Debezium configuration is specific to each connector and it is documented in detail. 73 | The value of `server_id` must be unique for each server and replication client in the MySQL cluster. 74 | In this case, the MySQL user must have appropriate permissions on all databases for which the connector captures changes. 75 | 76 | ```sh 77 | $ kubectl get cm my-mysql-cfg -o yaml | yq .data 78 | my.cnf: | 79 | !include /etc/my.cnf 80 | [mysqld] 81 | server_id = 111111 82 | log_bin = mysql-bin 83 | binlog_format = ROW 84 | binlog_row_image = FULL 85 | binlog_rows_query_log_events = ON 86 | expire_logs_days = 10 87 | gtid_mode = ON 88 | enforce_gtid_consistency = ON 89 | 90 | $ kubectl get cm my-mysql-init -o yaml | yq .data 91 | initdb.sql: | 92 | use testdb; 93 | CREATE TABLE IF NOT EXISTS customers ( 94 | id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, 95 | first_name VARCHAR(255) NOT NULL, 96 | last_name VARCHAR(255) NOT NULL, 97 | email VARCHAR(255) NOT NULL UNIQUE 98 | ); 99 | 100 | CREATE USER IF NOT EXISTS 'debezium'@'%' IDENTIFIED WITH caching_sha2_password BY 'changeit'; 101 | GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'debezium'@'%'; 102 | FLUSH PRIVILEGES; 103 | ``` 104 | 105 | Enough with describing the configuration, now let's create some changes using good old SQL. 106 | 107 | ```sh 108 | $ kubectl exec my-mysql-0 -- sh -c 'MYSQL_PWD="changeit" mysql -u admin testdb -e " 109 | INSERT INTO customers (first_name, last_name, email) VALUES (\"John\", \"Doe\", \"jdoe@example.com\"); 110 | UPDATE customers SET first_name = \"Jane\" WHERE id = 1; 111 | INSERT INTO customers (first_name, last_name, email) VALUES (\"Dylan\", \"Dog\", \"ddog@example.com\"); 112 | SELECT * FROM customers;"' 113 | id first_name last_name email 114 | 1 Jane Doe jdoe@example.com 115 | 2 Dylan Dog ddog@example.com 116 | ``` 117 | 118 | The MySQL connector writes change events that occur in a table to a Kafka topic named like `serverName.databaseName.tableName`. 119 | We created 3 changes (insert-update-insert), so we have 3 records in that topic. 120 | It's interesting to look at some record properties: `op` is the change type (c=create, r=read for snapshot only, u=update, d=delete), `gtid` is the global transaction identifier that is unique in a MySQL cluster, `payload.source.ts_ms` is the timestamp when the change was applied, `payload.ts_ms` is the timestamp when Debezium processed that event. The notification lag is the difference with the source timestamp. 
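For example, in the first record printed below, the notification lag is `1730112871209 - 1730112871000 = 209 ms`.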
121 | 122 | ```sh 123 | $ kubectl-kafka bin/kafka-console-consumer.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 \ 124 | --topic my-mysql.testdb.customers --from-beginning --max-messages 3 125 | Struct{after=Struct{id=2,first_name=Dylan,last_name=Dog,email=ddog@example.com},source=Struct{version=2.3.7.Final,connector=mysql,name=my-mysql,ts_ms=1730112871000,db=testdb,table=customers,server_id=111111,gtid=500bc4b7-951a-11ef-aae4-9e82de0bd73c:16,file=mysql-bin.000002,pos=2602,row=0,thread=61},op=c,ts_ms=1730112871209} 126 | Struct{after=Struct{id=1,first_name=John,last_name=Doe,email=jdoe@example.com},source=Struct{version=2.3.7.Final,connector=mysql,name=my-mysql,ts_ms=1730112871000,db=testdb,table=customers,server_id=111111,gtid=500bc4b7-951a-11ef-aae4-9e82de0bd73c:14,file=mysql-bin.000002,pos=1707,row=0,thread=61},op=c,ts_ms=1730112871199} 127 | Struct{before=Struct{id=1,first_name=John,last_name=Doe,email=jdoe@example.com},after=Struct{id=1,first_name=Jane,last_name=Doe,email=jdoe@example.com},source=Struct{version=2.3.7.Final,connector=mysql,name=my-mysql,ts_ms=1730112871000,db=testdb,table=customers,server_id=111111,gtid=500bc4b7-951a-11ef-aae4-9e82de0bd73c:15,file=mysql-bin.000002,pos=2120,row=0,thread=61},op=u,ts_ms=1730112871207} 128 | Processed a total of 3 messages 129 | ``` 130 | 131 | As an additional exercise, you can extend this data pipeline by configuring a sink connector and exporting these changes to an external system like Artemis Broker. 132 | -------------------------------------------------------------------------------- /sessions/006/install/connect.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kafka.strimzi.io/v1beta2 2 | kind: KafkaConnect 3 | metadata: 4 | name: my-connect-cluster 5 | annotations: 6 | strimzi.io/use-connector-resources: "true" 7 | spec: 8 | replicas: 1 9 | #version: x.y.z 10 | bootstrapServers: my-cluster-kafka-bootstrap:9093 11 | tls: 12 | trustedCertificates: 13 | - secretName: my-cluster-cluster-ca-cert 14 | certificate: ca.crt 15 | config: 16 | group.id: my-connect-cluster 17 | offset.storage.topic: connect-cluster-offsets 18 | config.storage.topic: connect-cluster-configs 19 | status.storage.topic: connect-cluster-status 20 | key.converter: org.apache.kafka.connect.storage.StringConverter 21 | value.converter: org.apache.kafka.connect.storage.StringConverter 22 | # -1 means use default broker RF 23 | config.storage.replication.factor: -1 24 | offset.storage.replication.factor: -1 25 | status.storage.replication.factor: -1 26 | logging: 27 | type: inline 28 | loggers: 29 | rootLogger.level: INFO 30 | logger.sourcetask.name: org.apache.kafka.connect.runtime.WorkerSourceTask 31 | logger.sourcetask.level: INFO 32 | logger.sinktask.name: org.apache.kafka.connect.runtime.WorkerSinkTask 33 | logger.sinktask.level: INFO 34 | resources: 35 | limits: 36 | cpu: 3000m 37 | memory: 3Gi 38 | requests: 39 | cpu: 1000m 40 | memory: 3Gi 41 | build: 42 | output: 43 | type: docker 44 | image: ttl.sh/fvaleri/kafka-connect:24h 45 | plugins: 46 | - name: debezium-mysql 47 | artifacts: 48 | - type: tgz 49 | url: https://repo1.maven.org/maven2/io/debezium/debezium-connector-mysql/2.3.7.Final/debezium-connector-mysql-2.3.7.Final-plugin.tar.gz 50 | --- 51 | apiVersion: kafka.strimzi.io/v1beta2 52 | kind: KafkaConnector 53 | metadata: 54 | name: mysql-source-connector 55 | labels: 56 | # must match the connect name 57 | strimzi.io/cluster: my-connect-cluster 58 | spec: 59 | tasksMax: 1 60 | class: 
io.debezium.connector.mysql.MySqlConnector 61 | config: 62 | database.hostname: "my-mysql-svc" 63 | database.port: 3306 64 | database.user: "debezium" 65 | database.password: "changeit" 66 | database.dbname: "testdb" 67 | # never change topic.prefix after connector startup 68 | topic.prefix: "my-mysql" 69 | # the server.id must be unique for each server or replication client 70 | database.server.id: "222222" 71 | database.include.list: "testdb" 72 | table.include.list: "testdb.customers" 73 | schema.history.internal.kafka.bootstrap.servers: "my-cluster-kafka-bootstrap:9092" 74 | schema.history.internal.kafka.topic: "testdb.history" 75 | include.schema.changes: "true" 76 | # commit progress even when there are no changes 77 | heartbeat.interval.ms: 10000 78 | snapshot.mode: "when_needed" 79 | -------------------------------------------------------------------------------- /sessions/006/install/mysql.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: my-mysql-data 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: 10Gi 11 | --- 12 | apiVersion: v1 13 | kind: ConfigMap 14 | metadata: 15 | name: my-mysql-cfg 16 | data: 17 | # the server_id must be unique for each server or replication client 18 | my.cnf: | 19 | !include /etc/my.cnf 20 | [mysqld] 21 | server_id = 111111 22 | log_bin = mysql-bin 23 | binlog_format = ROW 24 | binlog_row_image = FULL 25 | binlog_rows_query_log_events = ON 26 | expire_logs_days = 10 27 | gtid_mode = ON 28 | enforce_gtid_consistency = ON 29 | --- 30 | apiVersion: v1 31 | kind: Secret 32 | metadata: 33 | name: my-mysql-env 34 | type: Opaque 35 | stringData: 36 | MYSQL_DEFAULTS_FILE: /config/configdb.d/my.cnf 37 | MYSQL_DATABASE: testdb 38 | MYSQL_USER: admin 39 | MYSQL_PASSWORD: changeit 40 | MYSQL_ALLOW_EMPTY_PASSWORD: "true" 41 | --- 42 | apiVersion: v1 43 | kind: ConfigMap 44 | metadata: 45 | name: my-mysql-init 46 | data: 47 | initdb.sql: | 48 | use testdb; 49 | CREATE TABLE IF NOT EXISTS customers ( 50 | id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, 51 | first_name VARCHAR(255) NOT NULL, 52 | last_name VARCHAR(255) NOT NULL, 53 | email VARCHAR(255) NOT NULL UNIQUE 54 | ); 55 | 56 | CREATE USER IF NOT EXISTS 'debezium'@'%' IDENTIFIED WITH caching_sha2_password BY 'changeit'; 57 | GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'debezium'@'%'; 58 | FLUSH PRIVILEGES; 59 | --- 60 | apiVersion: apps/v1 61 | kind: StatefulSet 62 | metadata: 63 | name: my-mysql 64 | spec: 65 | replicas: 1 66 | serviceName: my-mysql 67 | selector: 68 | matchLabels: 69 | app: my-mysql 70 | template: 71 | metadata: 72 | labels: 73 | app: my-mysql 74 | spec: 75 | containers: 76 | - name: mysql 77 | image: quay.io/centos7/mysql-80-centos7:20230712 78 | resources: 79 | limits: 80 | cpu: 1000m 81 | memory: 1Gi 82 | requests: 83 | cpu: 500m 84 | memory: 1Gi 85 | envFrom: 86 | - secretRef: 87 | name: my-mysql-env 88 | ports: 89 | - containerPort: 3306 90 | protocol: TCP 91 | volumeMounts: 92 | - name: my-mysql-data 93 | mountPath: /var/lib/mysql 94 | - name: my-mysql-cfg 95 | mountPath: /config/configdb.d 96 | - name: my-mysql-init 97 | mountPath: /tmp/sql 98 | readinessProbe: 99 | exec: 100 | command: [ "mysqladmin", "-uroot", "ping" ] 101 | initialDelaySeconds: 60 102 | timeoutSeconds: 10 103 | livenessProbe: 104 | exec: 105 | command: [ "mysqladmin", "-uroot", "ping" ] 106 | initialDelaySeconds: 60 107 | 
timeoutSeconds: 10
108 | volumes:
109 | - name: my-mysql-data
110 | persistentVolumeClaim:
111 | claimName: my-mysql-data
112 | - name: my-mysql-cfg
113 | configMap:
114 | name: my-mysql-cfg
115 | - name: my-mysql-init
116 | configMap:
117 | name: my-mysql-init
118 | ---
119 | apiVersion: v1
120 | kind: Service
121 | metadata:
122 | name: my-mysql-svc
123 | spec:
124 | ports:
125 | - name: mysql
126 | port: 3306
127 | protocol: TCP
128 | targetPort: 3306
129 | selector:
130 | app: my-mysql
131 | 
--------------------------------------------------------------------------------
/sessions/007/README.md:
--------------------------------------------------------------------------------
1 | ## Use Mirror Maker 2 for disaster recovery
2 | 
3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes.
4 | 
5 | At this point, we can deploy the target cluster.
6 | 
7 | ```sh
8 | $ kubectl create -f sessions/007/install/target.yaml
9 | kafkanodepool.kafka.strimzi.io/combined created
10 | kafka.kafka.strimzi.io/my-cluster-tgt created
11 | ```
12 | 
13 | When the target cluster is ready, we can deploy Mirror Maker 2 (MM2).
14 | The recommended way of deploying MM2 is near the target Kafka cluster (same subnet or zone), because the producer overhead is greater than the consumer overhead.
15 | 
16 | > [!IMPORTANT]
17 | > When source and target clusters run on different namespaces or Kubernetes clusters, you have to copy the source `cluster-ca-cert` into the target namespace where MM2 is running.
18 | 
19 | ```sh
20 | $ export SOURCE_NS="$NAMESPACE" TARGET_NS="$NAMESPACE"; envsubst < sessions/007/install/mm2.yaml | kubectl create -f -
21 | kafkamirrormaker2.kafka.strimzi.io/my-mm2-cluster created
22 | configmap/mirror-maker-2-metrics created
23 | ```
24 | 
25 | MM2 runs on top of Kafka Connect with a set of configurable built-in connectors.
26 | The `MirrorSourceConnector` replicates remote topics, ACLs, and configurations of a single source cluster and emits offset syncs.
27 | The `MirrorCheckpointConnector` emits consumer group offset checkpoints to enable failover points.
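Later on, once a consumer group exists on the source cluster, you can verify the effect of the checkpointing by comparing the committed offsets on both sides, for example:

```sh
$ kubectl-kafka bin/kafka-consumer-groups.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --describe --all-groups
$ kubectl-kafka bin/kafka-consumer-groups.sh --bootstrap-server my-cluster-tgt-kafka-bootstrap:9092 --describe --all-groups
```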
28 | 29 | ```sh 30 | $ kubectl get po 31 | NAME READY STATUS RESTARTS AGE 32 | my-cluster-broker-10 1/1 Running 0 11m 33 | my-cluster-broker-11 1/1 Running 0 11m 34 | my-cluster-broker-12 1/1 Running 0 11m 35 | my-cluster-controller-0 1/1 Running 0 11m 36 | my-cluster-controller-1 1/1 Running 0 11m 37 | my-cluster-controller-2 1/1 Running 0 11m 38 | my-cluster-entity-operator-657b477d4f-sv77v 2/2 Running 0 10m 39 | my-cluster-tgt-combined-0 1/1 Running 0 6m18s 40 | my-cluster-tgt-combined-1 1/1 Running 0 6m18s 41 | my-cluster-tgt-combined-2 1/1 Running 0 6m18s 42 | my-mm2-cluster-mirrormaker2-0 1/1 Running 0 2m5s 43 | strimzi-cluster-operator-d78fd875b-ljmpl 1/1 Running 0 11m 44 | 45 | $ kubectl get kmm2 my-mm2-cluster -o yaml | yq .status 46 | conditions: 47 | - lastTransitionTime: "2024-10-12T10:14:20.521458310Z" 48 | status: "True" 49 | type: Ready 50 | connectors: 51 | - connector: 52 | state: RUNNING 53 | worker_id: my-mm2-cluster-mirrormaker2-0.my-mm2-cluster-mirrormaker2.test.svc:8083 54 | name: my-cluster->my-cluster-tgt.MirrorCheckpointConnector 55 | tasks: [] 56 | type: source 57 | - connector: 58 | state: RUNNING 59 | worker_id: my-mm2-cluster-mirrormaker2-0.my-mm2-cluster-mirrormaker2.test.svc:8083 60 | name: my-cluster->my-cluster-tgt.MirrorSourceConnector 61 | tasks: 62 | - id: 0 63 | state: RUNNING 64 | worker_id: my-mm2-cluster-mirrormaker2-0.my-mm2-cluster-mirrormaker2.test.svc:8083 65 | - id: 1 66 | state: RUNNING 67 | worker_id: my-mm2-cluster-mirrormaker2-0.my-mm2-cluster-mirrormaker2.test.svc:8083 68 | - id: 2 69 | state: RUNNING 70 | worker_id: my-mm2-cluster-mirrormaker2-0.my-mm2-cluster-mirrormaker2.test.svc:8083 71 | type: source 72 | labelSelector: strimzi.io/cluster=my-mm2-cluster,strimzi.io/name=my-mm2-cluster-mirrormaker2,strimzi.io/kind=KafkaMirrorMaker2 73 | observedGeneration: 2 74 | replicas: 1 75 | url: http://my-mm2-cluster-mirrormaker2-api.test.svc:8083 76 | ``` 77 | 78 | In order to test message replication, we can send 1 million messages to the test topic in the source Kafka cluster. 79 | 80 | > [!WARNING] 81 | > Message replication is asynchronous, so there is always a delta of messaging that is at risk in case of disaster. 82 | 83 | After some time, the log end offsets should match on both clusters. 84 | In real world scenarios, the actual offsets tend to naturally diverge with time, because each Kafka cluster operates independently. 85 | 86 | ```sh 87 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 100 --num-records 1000000 \ 88 | --throughput -1 --producer-props acks=1 bootstrap.servers=my-cluster-kafka-bootstrap:9092 89 | 837463 records sent, 167492.6 records/sec (15.97 MB/sec), 1207.8 ms avg latency, 2358.0 ms max latency. 90 | 1000000 records sent, 174733.531365 records/sec (16.66 MB/sec), 1202.91 ms avg latency, 2358.00 ms max latency, 1298 ms 50th, 2138 ms 95th, 2266 ms 99th, 2332 ms 99.9th. 91 | 92 | $ kubectl-kafka bin/kafka-get-offsets.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --topic my-topic --time -1 93 | my-topic:0:353737 94 | my-topic:1:358846 95 | my-topic:2:287417 96 | 97 | $ kubectl-kafka bin/kafka-get-offsets.sh --bootstrap-server my-cluster-tgt-kafka-bootstrap:9092 --topic my-topic --time -1 98 | my-topic:0:353737 99 | my-topic:1:358846 100 | my-topic:2:287417 101 | ``` 102 | 103 | ## Tuning MM2 for throughput 104 | 105 | High-volume message generation, as seen in web activity tracking, can result in a large number of messages. 
106 | Additionally, even a source cluster with moderate throughput can create a significant volume of messages when mirroring large amounts of existing data.
107 | In this case, MM2 replication is slow even if you have a fast network, because the default producer configuration is not optimized for throughput.
108 | 
109 | Let's run a load test and see how fast we can replicate data with default settings.
110 | By looking at `MirrorSourceConnector` task metrics, we see that we are saturating the producer batch size (`batch.size`, default: 16384 bytes), which is a bottleneck.
111 | 
112 | ```sh
113 | $ kubectl scale kmm2 my-mm2-cluster --replicas 0
114 | kafkamirrormaker2.kafka.strimzi.io/my-mm2-cluster scaled
115 | 
116 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 100 --num-records 30000000 \
117 | --throughput -1 --producer-props acks=1 bootstrap.servers=my-cluster-kafka-bootstrap:9092
118 | 1040165 records sent, 207825.2 records/sec (19.82 MB/sec), 752.2 ms avg latency, 1588.0 ms max latency.
119 | ...
120 | 30000000 records sent, 642659.754504 records/sec (61.29 MB/sec), 137.34 ms avg latency, 2517.00 ms max latency, 39 ms 50th, 614 ms 95th, 1474 ms 99th, 2408 ms 99.9th.
121 | ```
122 | 
123 | On my machine, it takes about 10 minutes to get back `NaN` from the following metrics, which means that replication has completed.
124 | 
125 | ```sh
126 | $ kubectl scale kmm2 my-mm2-cluster --replicas 1
127 | kafkamirrormaker2.kafka.strimzi.io/my-mm2-cluster scaled
128 | 
129 | $ kubectl exec $(kubectl get po | grep my-mm2-cluster | awk '{print $1}') -- curl -s http://localhost:9404/metrics \
130 | | grep -e 'kafka_producer_batch_size_avg{clientid="\\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector' \
131 | -e 'kafka_producer_request_latency_avg{clientid="\\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector'
132 | kafka_producer_batch_size_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-0\""} 16277.085847267712
133 | kafka_producer_batch_size_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-1\""} 16278.264065335754
134 | kafka_producer_batch_size_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-2\""} 16277.15397200509
135 | kafka_producer_request_latency_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-0\""} 10.944482877896922
136 | kafka_producer_request_latency_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-1\""} 14.26193724420191
137 | kafka_producer_request_latency_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-2\""} 11.238677867056245
138 | ```
139 | 
140 | We now increase the producer batch size to 20x the default by overriding its configuration.
141 | Every batch will include more data, so the same test should complete in about half of the time or even less.
142 | The request latency increases, but it is still within reasonable bounds.
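The value applied below is `16384 * 20 = 327680` bytes.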
143 | 144 | ```sh 145 | $ kubectl get kmm2 my-mm2-cluster -o yaml | yq '.spec.mirrors[0].sourceConnector.config |= ({"producer.override.batch.size": 327680} + .)' | kubectl apply -f - 146 | kafkamirrormaker2.kafka.strimzi.io/my-mm2-cluster configured 147 | 148 | $ kubectl scale kmm2 my-mm2-cluster --replicas 0 149 | kafkamirrormaker2.kafka.strimzi.io/my-mm2-cluster scaled 150 | 151 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 100 --num-records 30000000 \ 152 | --throughput -1 --producer-props acks=1 bootstrap.servers=my-cluster-kafka-bootstrap:9092 153 | 3402475 records sent, 680495.0 records/sec (64.90 MB/sec), 32.4 ms avg latency, 342.0 ms max latency. 154 | ... 155 | 30000000 records sent, 923105.326318 records/sec (88.03 MB/sec), 21.94 ms avg latency, 1495.00 ms max latency, 3 ms 50th, 66 ms 95th, 201 ms 99th, 1329 ms 99.9th. 156 | ``` 157 | 158 | On my machine, it now takes about 5 minutes. 159 | 160 | ```sh 161 | $ kubectl scale kmm2 my-mm2-cluster --replicas 1 162 | kafkamirrormaker2.kafka.strimzi.io/my-mm2-cluster scaled 163 | 164 | $ kubectl exec $(kubectl get po | grep my-mm2-cluster | awk '{print $1}') -- curl -s http://localhost:9404/metrics \ 165 | | grep -e 'kafka_producer_batch_size_avg{clientid="\\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector' \ 166 | -e 'kafka_producer_request_latency_avg{clientid="\\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector' 167 | kafka_producer_batch_size_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-0\""} 140310.91324200912 168 | kafka_producer_batch_size_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-1\""} 143986.90502793295 169 | kafka_producer_batch_size_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-2\""} 122895.43076923076 170 | kafka_producer_batch_size_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-3\""} 33464.164893617024 171 | kafka_producer_request_latency_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-0\""} 59.678899082568805 172 | kafka_producer_request_latency_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-1\""} 71.0561797752809 173 | kafka_producer_request_latency_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-2\""} 52.08247422680412 174 | kafka_producer_request_latency_avg{clientid="\"connector-producer-my-cluster->my-cluster-tgt.MirrorSourceConnector-3\""} 41.670212765957444 175 | ``` 176 | -------------------------------------------------------------------------------- /sessions/007/install/mm2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kafka.strimzi.io/v1beta2 2 | kind: KafkaMirrorMaker2 3 | metadata: 4 | name: my-mm2-cluster 5 | spec: 6 | replicas: 0 7 | #version: x.y.z 8 | connectCluster: my-cluster-tgt 9 | clusters: 10 | - alias: my-cluster 11 | bootstrapServers: my-cluster-kafka-bootstrap.${SOURCE_NS}.svc:9093 12 | tls: 13 | trustedCertificates: 14 | - certificate: ca.crt 15 | secretName: my-cluster-cluster-ca-cert 16 | - alias: my-cluster-tgt 17 | bootstrapServers: my-cluster-tgt-kafka-bootstrap.${TARGET_NS}.svc:9093 18 | tls: 19 | trustedCertificates: 20 | - certificate: ca.crt 21 | secretName: my-cluster-tgt-cluster-ca-cert 22 | config: 23 | # -1 means use default broker RF 24 | config.storage.replication.factor: -1 25 | 
offset.storage.replication.factor: -1 26 | status.storage.replication.factor: -1 27 | ssl.cipher.suites: TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 28 | ssl.enabled.protocols: TLSv1.2 29 | ssl.protocol: TLSv1.2 30 | mirrors: 31 | - sourceCluster: my-cluster 32 | targetCluster: my-cluster-tgt 33 | topicsPattern: ".*" 34 | groupsPattern: ".*" 35 | sourceConnector: 36 | tasksMax: 10 37 | config: 38 | replication.factor: -1 39 | offset-syncs.topic.replication.factor: -1 40 | offset-syncs.topic.location: "target" 41 | key.converter: "org.apache.kafka.connect.converters.ByteArrayConverter" 42 | value.converter: "org.apache.kafka.connect.converters.ByteArrayConverter" 43 | # disable source cluster name prefix on target topic 44 | replication.policy.class: "org.apache.kafka.connect.mirror.IdentityReplicationPolicy" 45 | refresh.topics.interval.seconds: 20 46 | sync.topic.configs.enabled: true 47 | sync.topic.acls.enabled: false 48 | checkpointConnector: 49 | tasksMax: 10 50 | config: 51 | checkpoints.topic.replication.factor: -1 52 | key.converter: "org.apache.kafka.connect.converters.ByteArrayConverter" 53 | value.converter: "org.apache.kafka.connect.converters.ByteArrayConverter" 54 | # disable source cluster name prefix on target topic 55 | replication.policy.class: "org.apache.kafka.connect.mirror.IdentityReplicationPolicy" 56 | sync.group.offsets.enabled: true 57 | sync.group.offsets.interval.seconds: 20 58 | emit.checkpoints.enabled: true 59 | emit.checkpoints.interval.seconds: 20 60 | refresh.groups.interval.seconds: 20 61 | logging: 62 | type: inline 63 | loggers: 64 | rootLogger.level: INFO 65 | logger.sourcetask.name: org.apache.kafka.connect.runtime.WorkerSourceTask 66 | logger.sourcetask.level: INFO 67 | logger.sinktask.name: org.apache.kafka.connect.runtime.WorkerSinkTask 68 | logger.sinktask.level: INFO 69 | resources: 70 | limits: 71 | cpu: 3000m 72 | memory: 3Gi 73 | requests: 74 | cpu: 1000m 75 | memory: 3Gi 76 | # expose JMX metrics in Prometheus format on port 9404 77 | metricsConfig: 78 | type: jmxPrometheusExporter 79 | valueFrom: 80 | configMapKeyRef: 81 | name: mirror-maker-2-metrics 82 | key: metrics-config.yml 83 | --- 84 | kind: ConfigMap 85 | apiVersion: v1 86 | metadata: 87 | name: mirror-maker-2-metrics 88 | labels: 89 | app: strimzi 90 | data: 91 | metrics-config.yml: | 92 | # See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics 93 | lowercaseOutputName: true 94 | lowercaseOutputLabelNames: true 95 | rules: 96 | #kafka.connect:type=app-info,client-id="{clientid}" 97 | #kafka.consumer:type=app-info,client-id="{clientid}" 98 | #kafka.producer:type=app-info,client-id="{clientid}" 99 | - pattern: 'kafka.(.+)<>start-time-ms' 100 | name: kafka_$1_start_time_seconds 101 | labels: 102 | clientId: "$2" 103 | help: "Kafka $1 JMX metric start time seconds" 104 | type: GAUGE 105 | valueFactor: 0.001 106 | - pattern: 'kafka.(.+)<>(commit-id|version): (.+)' 107 | name: kafka_$1_$3_info 108 | value: 1 109 | labels: 110 | clientId: "$2" 111 | $3: "$4" 112 | help: "Kafka $1 JMX metric info version and commit-id" 113 | type: UNTYPED 114 | 115 | #kafka.producer:type=producer-topic-metrics,client-id="{clientid}",topic="{topic}"", partition="{partition}" 116 | #kafka.consumer:type=consumer-fetch-manager-metrics,client-id="{clientid}",topic="{topic}"", partition="{partition}" 117 | - pattern: kafka.(.+)<>(.+-total) 118 | name: kafka_$2_$6 119 | labels: 120 | clientId: "$3" 121 | topic: "$4" 122 | partition: "$5" 123 | help: "Kafka $1 JMX metric type 
$2" 124 | type: COUNTER 125 | - pattern: kafka.(.+)<>(compression-rate|.+-avg|.+-replica|.+-lag|.+-lead) 126 | name: kafka_$2_$6 127 | labels: 128 | clientId: "$3" 129 | topic: "$4" 130 | partition: "$5" 131 | help: "Kafka $1 JMX metric type $2" 132 | type: GAUGE 133 | 134 | #kafka.producer:type=producer-topic-metrics,client-id="{clientid}",topic="{topic}" 135 | #kafka.consumer:type=consumer-fetch-manager-metrics,client-id="{clientid}",topic="{topic}"", partition="{partition}" 136 | - pattern: kafka.(.+)<>(.+-total) 137 | name: kafka_$2_$5 138 | labels: 139 | clientId: "$3" 140 | topic: "$4" 141 | help: "Kafka $1 JMX metric type $2" 142 | type: COUNTER 143 | - pattern: kafka.(.+)<>(compression-rate|.+-avg) 144 | name: kafka_$2_$5 145 | labels: 146 | clientId: "$3" 147 | topic: "$4" 148 | help: "Kafka $1 JMX metric type $2" 149 | type: GAUGE 150 | 151 | #kafka.connect:type=connect-node-metrics,client-id="{clientid}",node-id="{nodeid}" 152 | #kafka.consumer:type=consumer-node-metrics,client-id=consumer-1,node-id="{nodeid}" 153 | - pattern: kafka.(.+)<>(.+-total) 154 | name: kafka_$2_$5 155 | labels: 156 | clientId: "$3" 157 | nodeId: "$4" 158 | help: "Kafka $1 JMX metric type $2" 159 | type: COUNTER 160 | - pattern: kafka.(.+)<>(.+-avg) 161 | name: kafka_$2_$5 162 | labels: 163 | clientId: "$3" 164 | nodeId: "$4" 165 | help: "Kafka $1 JMX metric type $2" 166 | type: GAUGE 167 | 168 | #kafka.connect:type=kafka-metrics-count,client-id="{clientid}" 169 | #kafka.consumer:type=consumer-fetch-manager-metrics,client-id="{clientid}" 170 | #kafka.consumer:type=consumer-coordinator-metrics,client-id="{clientid}" 171 | #kafka.consumer:type=consumer-metrics,client-id="{clientid}" 172 | - pattern: kafka.(.+)<>(.+-total) 173 | name: kafka_$2_$4 174 | labels: 175 | clientId: "$3" 176 | help: "Kafka $1 JMX metric type $2" 177 | type: COUNTER 178 | - pattern: kafka.(.+)<>(.+-avg|.+-bytes|.+-count|.+-ratio|.+-age|.+-flight|.+-threads|.+-connectors|.+-tasks|.+-ago) 179 | name: kafka_$2_$4 180 | labels: 181 | clientId: "$3" 182 | help: "Kafka $1 JMX metric type $2" 183 | type: GAUGE 184 | 185 | #kafka.connect:type=connector-task-metrics,connector="{connector}",task="{task}<> status" 186 | - pattern: 'kafka.connect<>status: ([a-z-]+)' 187 | name: kafka_connect_connector_status 188 | value: 1 189 | labels: 190 | connector: "$1" 191 | task: "$2" 192 | status: "$3" 193 | help: "Kafka Connect JMX Connector status" 194 | type: GAUGE 195 | 196 | #kafka.connect:type=task-error-metrics,connector="{connector}",task="{task}" 197 | #kafka.connect:type=source-task-metrics,connector="{connector}",task="{task}" 198 | #kafka.connect:type=sink-task-metrics,connector="{connector}",task="{task}" 199 | #kafka.connect:type=connector-task-metrics,connector="{connector}",task="{task}" 200 | - pattern: kafka.connect<>(.+-total) 201 | name: kafka_connect_$1_$4 202 | labels: 203 | connector: "$2" 204 | task: "$3" 205 | help: "Kafka Connect JMX metric type $1" 206 | type: COUNTER 207 | - pattern: kafka.connect<>(.+-count|.+-ms|.+-ratio|.+-avg|.+-failures|.+-requests|.+-timestamp|.+-logged|.+-errors|.+-retries|.+-skipped) 208 | name: kafka_connect_$1_$4 209 | labels: 210 | connector: "$2" 211 | task: "$3" 212 | help: "Kafka Connect JMX metric type $1" 213 | type: GAUGE 214 | 215 | #kafka.connect:type=connector-metrics,connector="{connector}" 216 | #kafka.connect:type=connect-worker-metrics,connector="{connector}" 217 | - pattern: kafka.connect<>([a-z-]+) 218 | name: kafka_connect_worker_$2 219 | labels: 220 | connector: "$1" 221 | help: 
"Kafka Connect JMX metric $1" 222 | type: GAUGE 223 | 224 | #kafka.connect:type=connect-worker-metrics 225 | - pattern: kafka.connect<>([a-z-]+-total) 226 | name: kafka_connect_worker_$1 227 | help: "Kafka Connect JMX metric worker" 228 | type: COUNTER 229 | - pattern: kafka.connect<>([a-z-]+) 230 | name: kafka_connect_worker_$1 231 | help: "Kafka Connect JMX metric worker" 232 | type: GAUGE 233 | 234 | #kafka.connect:type=connect-worker-rebalance-metrics 235 | - pattern: kafka.connect<>([a-z-]+-total) 236 | name: kafka_connect_worker_rebalance_$1 237 | help: "Kafka Connect JMX metric rebalance information" 238 | type: COUNTER 239 | - pattern: kafka.connect<>([a-z-]+) 240 | name: kafka_connect_worker_rebalance_$1 241 | help: "Kafka Connect JMX metric rebalance information" 242 | type: GAUGE 243 | 244 | #kafka.connect:type=MirrorSourceConnector 245 | - pattern: kafka.connect.mirror<>([a-z-_]+) 246 | name: kafka_connect_mirror_mirrorsourceconnector_$4 247 | labels: 248 | target: "$1" 249 | topic: "$2" 250 | partition: "$3" 251 | help: "Kafka Mirror Maker 2 Source Connector metrics" 252 | type: GAUGE 253 | 254 | #kafka.connect:type=MirrorCheckpointConnector 255 | - pattern: kafka.connect.mirror<>([a-z-_]+) 256 | name: kafka_connect_mirror_mirrorcheckpointconnector_$6 257 | labels: 258 | source: "$1" 259 | target: "$2" 260 | group: "$3" 261 | topic: "$4" 262 | partition: "$5" 263 | help: "Kafka Mirror Maker 2 Checkpoint Connector metrics" 264 | type: GAUGE 265 | -------------------------------------------------------------------------------- /sessions/007/install/target.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kafka.strimzi.io/v1beta2 2 | kind: KafkaNodePool 3 | metadata: 4 | name: combined 5 | labels: 6 | # must match the cluster name 7 | strimzi.io/cluster: my-cluster-tgt 8 | spec: 9 | replicas: 3 10 | roles: 11 | - controller 12 | - broker 13 | resources: 14 | # set requests==limits to have Guaranteed QoS 15 | limits: 16 | cpu: 500m 17 | memory: 1Gi 18 | requests: 19 | cpu: 250m 20 | memory: 1Gi 21 | storage: 22 | size: 5Gi 23 | type: persistent-claim 24 | deleteClaim: false 25 | --- 26 | apiVersion: kafka.strimzi.io/v1beta2 27 | kind: Kafka 28 | metadata: 29 | name: my-cluster-tgt 30 | annotations: 31 | strimzi.io/node-pools: enabled 32 | strimzi.io/kraft: enabled 33 | spec: 34 | kafka: 35 | config: 36 | num.partitions: 3 37 | default.replication.factor: 3 38 | min.insync.replicas: 2 39 | offsets.topic.replication.factor: 3 40 | transaction.state.log.replication.factor: 3 41 | transaction.state.log.min.isr: 2 42 | listeners: 43 | - name: plain 44 | port: 9092 45 | type: internal 46 | tls: false 47 | - name: tls 48 | port: 9093 49 | type: internal 50 | tls: true 51 | -------------------------------------------------------------------------------- /sessions/008/README.md: -------------------------------------------------------------------------------- 1 | ## Avoid running out of disk space with the Strimzi quota plugin 2 | 3 | > [!WARNING] 4 | > Don't use Minikube, as it uses hostpath volumes that do not enforce storage capacity. 5 | 6 | For the sake of this example, we deploy the Kafka cluster reducing the volume size. 
7 | 8 | ```sh 9 | $ sed -E 's/size: .*/size: "1Gi"/g' sessions/001/install.yaml | kubectl create -f - 10 | kafkanodepool.kafka.strimzi.io/broker created 11 | kafkanodepool.kafka.strimzi.io/controller created 12 | kafka.kafka.strimzi.io/my-cluster created 13 | kafkatopic.kafka.strimzi.io/my-topic created 14 | 15 | $ kubectl get pv | grep my-cluster-broker 16 | pvc-2609aaa7-3a13-4bc3-9d0a-cc19c4ccef50 1Gi RWO Delete Bound test/data-my-cluster-broker-10 gp3-csi 4m15s 17 | pvc-55f69017-6ef9-4701-94ef-ffb90433cebd 1Gi RWO Delete Bound test/data-my-cluster-broker-11 gp3-csi 4m15s 18 | pvc-741a3a77-9f5a-4656-af71-d41619e12bfc 1Gi RWO Delete Bound test/data-my-cluster-broker-12 gp3-csi 4m15s 19 | ``` 20 | 21 | Only network bandwidth and request rate quotas are supported by the default Kafka quota plugin. 22 | Instead, the [Strimzi quota plugin](https://github.com/strimzi/kafka-quotas-plugin) allows to set storage limits independent of the number of clients. 23 | 24 | The Strimzi Kafka images already contains this plugin. 25 | With the following configuration, all clients will be throttled to 0 when any volume in the cluster has less than 30% available space. 26 | The check interval is set to 5 seconds. 27 | 28 | ```sh 29 | $ kubectl patch k my-cluster --type=json \ 30 | -p='[{"op": "add", "path": "/spec/kafka/config/client.quota.callback.static.storage.check-interval", "value": "5"}]' \ 31 | && kubectl patch k my-cluster --type merge -p ' 32 | spec: 33 | kafka: 34 | quotas: 35 | type: strimzi 36 | minAvailableRatioPerVolume: 0.3' 37 | kafka.kafka.strimzi.io/my-cluster patched 38 | kafka.kafka.strimzi.io/my-cluster patched 39 | ``` 40 | 41 | After that, the cluster operator will roll all brokers to enable the quota plugin. 42 | When the cluster is ready, we try to break it by sending 3.3 GiB of data to a topic, which exceeds the cluster capacity. 43 | 44 | ```sh 45 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 1000 --num-records 3300000 \ 46 | --throughput -1 --producer-props acks=all bootstrap.servers=my-cluster-kafka-bootstrap:9092 47 | 21873 records sent, 4373.7 records/sec (4.17 MB/sec), 2509.9 ms avg latency, 4285.0 ms max latency. 48 | 41344 records sent, 8268.8 records/sec (7.89 MB/sec), 4536.6 ms avg latency, 5997.0 ms max latency. 49 | 49104 records sent, 9820.8 records/sec (9.37 MB/sec), 3575.3 ms avg latency, 4295.0 ms max latency. 50 | ... 51 | org.apache.kafka.clients.producer.BufferExhaustedException: Failed to allocate 16384 bytes within the configured max blocking time 60000 ms. Total memory: 33554432 bytes. Available memory: 0 bytes. Poolable size: 16384 bytes 52 | org.apache.kafka.common.errors.TimeoutException: Expiring 16 record(s) for my-topic-0:120018 ms has passed since batch creation 53 | org.apache.kafka.common.errors.TimeoutException: Expiring 16 record(s) for my-topic-0:120018 ms has passed since batch creation 54 | org.apache.kafka.common.errors.TimeoutException: Expiring 16 record(s) for my-topic-0:120018 ms has passed since batch creation 55 | ... 56 | ^C 57 | ``` 58 | 59 | At some point, the perf client can't send data anymore, but the cluster is still healthy. 
60 | 61 | ```sh 62 | $ kubectl get po | grep my-cluster-broker 63 | my-cluster-broker-10 0/1 CrashLoopBackOff 8 (70s ago) 27m 64 | my-cluster-broker-11 0/1 CrashLoopBackOff 8 (84s ago) 25m 65 | my-cluster-broker-12 0/1 CrashLoopBackOff 8 (87s ago) 26m 66 | 67 | $ kubectl exec my-cluster-broker-10 -- df -h /var/lib/kafka/data \ 68 | && kubectl exec my-cluster-broker-11-- df -h /var/lib/kafka/data \ 69 | && kubectl exec my-cluster-broker-12-- df -h /var/lib/kafka/data 70 | Filesystem Size Used Avail Use% Mounted on 71 | /dev/nvme1n1 974M 735M 223M 77% /var/lib/kafka/data 72 | Filesystem Size Used Avail Use% Mounted on 73 | /dev/nvme1n1 974M 735M 223M 77% /var/lib/kafka/data 74 | Filesystem Size Used Avail Use% Mounted on 75 | /dev/nvme1n1 974M 735M 223M 77% /var/lib/kafka/data 76 | ``` 77 | 78 | ## Online Kafka volume recovery with expansion support 79 | 80 | > [!WARNING] 81 | > Don't use Minikube, as it uses hostpath volumes that do not enforce storage capacity. 82 | 83 | For the sake of this example, we deploy the Kafka cluster reducing the volume size. 84 | 85 | ```sh 86 | $ sed -E 's/size: .*/size: "1Gi"/g' sessions/001/install.yaml | kubectl create -f - 87 | kafkanodepool.kafka.strimzi.io/broker created 88 | kafkanodepool.kafka.strimzi.io/controller created 89 | kafka.kafka.strimzi.io/my-cluster created 90 | kafkatopic.kafka.strimzi.io/my-topic created 91 | 92 | $ kubectl get pv | grep my-cluster-broker 93 | pvc-568b390e-d8a3-4efa-a528-dbd0934e18e8 1Gi RWO Delete Bound test/data-my-cluster-broker-11 gp3-csi 4m57s 94 | pvc-875bbcc9-5f86-442e-9e05-f2b8852c83ce 1Gi RWO Delete Bound test/data-my-cluster-broker-10 gp3-csi 4m57s 95 | pvc-c328aab2-8948-4791-88df-a488e9fd9faa 1Gi RWO Delete Bound test/data-my-cluster-broker-12 gp3-csi 4m57s 96 | ``` 97 | 98 | When the cluster is ready, we break it by sending 3.3 GiB of data to a topic, which exceeds the cluster capacity. 99 | 100 | ```sh 101 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 1000 --num-records 3300000 \ 102 | --throughput -1 --producer-props acks=all bootstrap.servers=my-cluster-kafka-bootstrap:9092 103 | 22513 records sent, 4486.4 records/sec (4.28 MB/sec), 2544.8 ms avg latency, 4258.0 ms max latency. 104 | 39104 records sent, 7820.8 records/sec (7.46 MB/sec), 4756.2 ms avg latency, 6197.0 ms max latency. 105 | 52928 records sent, 10585.6 records/sec (10.10 MB/sec), 3318.4 ms avg latency, 4669.0 ms max latency. 106 | ... 107 | [2024-10-12 12:04:09,916] WARN [Producer clientId=perf-producer-client] Connection to node 5 (my-cluster-broker-10.my-cluster-kafka-brokers.test.svc/10.130.0.31:9092) could not be established. Node may not be available. (org.apache.kafka.clients.NetworkClient) 108 | [2024-10-12 12:04:09,920] WARN [Producer clientId=perf-producer-client] Connection to node 7 (my-cluster-broker-12.my-cluster-kafka-brokers.test.svc/10.129.0.28:9092) could not be established. Node may not be available. (org.apache.kafka.clients.NetworkClient) 109 | [2024-10-12 12:04:09,931] WARN [Producer clientId=perf-producer-client] Connection to node 6 (my-cluster-broker-11.my-cluster-kafka-brokers.test.svc/10.131.0.18:9092) could not be established. Node may not be available. 
(org.apache.kafka.clients.NetworkClient) 110 | ^C 111 | 112 | $ kubectl get po | grep my-cluster-broker 113 | my-cluster-broker-10 0/1 CrashLoopBackOff 8 (70s ago) 27m 114 | my-cluster-broker-11 0/1 CrashLoopBackOff 8 (84s ago) 25m 115 | my-cluster-broker-12 0/1 CrashLoopBackOff 8 (87s ago) 26m 116 | 117 | $ kubectl logs my-cluster-broker-10| grep "No space left on device" | tail -n1 118 | Caused by: java.io.IOException: No space left on device 119 | ``` 120 | 121 | Even if not all pods failed, we still need to increase the volume size of all brokers because the storage configuration is shared. 122 | If volume expansion is supported on the storage class, you can simply increase the storage size in the Kafka resource, and the operator will take care of it. 123 | This operation may take some time to complete, depending on the size of the volume and the available resources in the cluster. 124 | 125 | > [!WARNING] 126 | > The expansion is not always feasible in cloud deployments, for example with a standard block size of 4KB an AWS EBS volume can support only up to 16TB. 127 | 128 | ```sh 129 | [[ $(kubectl get sc $(kubectl get pv | grep data-my-cluster-broker-10| awk '{print $7}') -o yaml | yq .allowVolumeExpansion) == "true" ]] \ 130 | && kubectl patch knp broker --type merge -p ' 131 | spec: 132 | storage: 133 | size: 10Gi' 134 | kafkanodepool.kafka.strimzi.io/broker patched 135 | 136 | $ kubectl logs $(kubectl get po | grep cluster-operator | awk '{print $1}') | grep "Resizing" 137 | 2024-10-12 12:10:08 INFO PvcReconciler:137 - Reconciliation #1(watch) Kafka(test/my-cluster): Resizing PVC data-my-cluster-broker-10 from 1 to 10Gi. 138 | 2024-10-12 12:10:08 INFO PvcReconciler:137 - Reconciliation #1(watch) Kafka(test/my-cluster): Resizing PVC data-my-cluster-broker-11 from 1 to 10Gi. 139 | 2024-10-12 12:10:08 INFO PvcReconciler:137 - Reconciliation #1(watch) Kafka(test/my-cluster): Resizing PVC data-my-cluster-broker-12 from 1 to 10Gi. 140 | 141 | $ kubectl get po | grep my-cluster-broker 142 | my-cluster-broker-10 1/1 Running 0 13m 143 | my-cluster-broker-11 1/1 Running 0 13m 144 | my-cluster-broker-12 1/1 Running 0 13m 145 | 146 | $ kubectl get pv | grep my-cluster-broker 147 | pvc-568b390e-d8a3-4efa-a528-dbd0934e18e8 10Gi RWO Delete Bound test/data-my-cluster-broker-11 gp3-csi 14m 148 | pvc-875bbcc9-5f86-442e-9e05-f2b8852c83ce 10Gi RWO Delete Bound test/data-my-cluster-broker-10 gp3-csi 14m 149 | pvc-c328aab2-8948-4791-88df-a488e9fd9faa 10Gi RWO Delete Bound test/data-my-cluster-broker-12 gp3-csi 14m 150 | ``` 151 | 152 | ## Offline Kafka volume recovery with no expansion support (expert level) 153 | 154 | > [!WARNING] 155 | > Don't use Minikube, as it uses hostpath volumes that do not enforce storage capacity. 156 | 157 | For the sake of this example, we deploy the Kafka cluster reducing the volume size. 
158 | 
159 | ```sh
160 | $ sed -E 's/size: .*/size: "1Gi"/g' sessions/001/install.yaml | kubectl create -f -
161 | kafkanodepool.kafka.strimzi.io/broker created
162 | kafkanodepool.kafka.strimzi.io/controller created
163 | kafka.kafka.strimzi.io/my-cluster created
164 | kafkatopic.kafka.strimzi.io/my-topic created
165 | 
166 | $ kubectl wait --timeout=120s --for=condition=ready k my-cluster; \
167 |   KAFKA_PODS="$(kubectl get po | grep my-cluster-broker | awk '{print $1}')" \
168 |   VOLUME_CLASS="$(kubectl get pv | grep my-cluster-broker | head -n1 | awk '{print $7}')" \
169 |   CLUSTER_ID="$(kubectl get k my-cluster -o yaml | yq .status.clusterId)"
170 | NEW_VOLUME_SIZE="10Gi"
171 | 
172 | $ kubectl get pv | grep my-cluster-broker
173 | pvc-6efa4986-a8f8-42d3-ae80-0229d262cf81 1Gi RWO Delete Bound test/data-my-cluster-broker-12 gp3-csi 66s
174 | pvc-d76d68c6-52e9-4a9f-a20f-3b052ea49c55 1Gi RWO Delete Bound test/data-my-cluster-broker-11 gp3-csi 66s
175 | pvc-fe5ccdb3-b550-467e-b6e0-f4d3ece79ed0 1Gi RWO Delete Bound test/data-my-cluster-broker-10 gp3-csi 66s
176 | ```
177 | 
178 | When the cluster is ready, we break it by sending 3.3 GiB of data to a topic, which exceeds the cluster capacity.
179 | 
180 | ```sh
181 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 1000 --num-records 3300000 \
182 |   --throughput -1 --producer-props acks=all bootstrap.servers=my-cluster-kafka-bootstrap:9092
183 | 15521 records sent, 3104.2 records/sec (2.96 MB/sec), 2627.4 ms avg latency, 4363.0 ms max latency.
184 | 36192 records sent, 7222.5 records/sec (6.89 MB/sec), 5360.9 ms avg latency, 6964.0 ms max latency.
185 | 43728 records sent, 8745.6 records/sec (8.34 MB/sec), 4132.9 ms avg latency, 5104.0 ms max latency.
186 | ...
187 | [2024-10-16 16:06:47,718] WARN [Producer clientId=perf-producer-client] Connection to node 5 (my-cluster-broker-10.my-cluster-kafka-brokers.test.svc/10.130.0.17:9092) could not be established. Node may not be available. (org.apache.kafka.clients.NetworkClient)
188 | [2024-10-16 16:06:47,718] WARN [Producer clientId=perf-producer-client] Connection to node 7 (my-cluster-broker-12.my-cluster-kafka-brokers.test.svc/10.131.0.24:9092) could not be established. Node may not be available. (org.apache.kafka.clients.NetworkClient)
189 | [2024-10-16 16:06:47,718] WARN [Producer clientId=perf-producer-client] Connection to node 6 (my-cluster-broker-11.my-cluster-kafka-brokers.test.svc/10.129.0.14:9092) could not be established. Node may not be available. (org.apache.kafka.clients.NetworkClient)
190 | ^C
191 | 
192 | $ kubectl get po | grep my-cluster-broker
193 | my-cluster-broker-10 0/1 CrashLoopBackOff 2 (12s ago) 3m41s
194 | my-cluster-broker-11 0/1 CrashLoopBackOff 2 (11s ago) 3m41s
195 | my-cluster-broker-12 0/1 CrashLoopBackOff 2 (14s ago) 3m41s
196 | 
197 | $ kubectl logs $(kubectl get po | grep my-cluster-broker | head -n1 | awk '{print $1}') | grep "No space left on device" | tail -n1
198 | Caused by: java.io.IOException: No space left on device
199 | ```
200 | 
201 | Even if not all pods have failed, we still need to increase the volume size of all brokers because the storage configuration is shared.
202 | This procedure works offline because copying data while it is being modified can cause tricky problems, especially if transactions are enabled.
203 | 
204 | > [!WARNING]
205 | > Before deleting the Kafka cluster, make sure that the `deleteClaim` storage configuration is set to `false` in the KafkaNodePool resource.
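If `deleteClaim` turns out to be `true`, it can be switched off before touching the cluster. A minimal sketch, assuming the broker pool is the only one with data you care about; the operator applies the change on the next reconciliation:

```sh
# Optional pre-step: disable claim deletion so the PVCs survive the cluster deletion.
$ kubectl patch knp broker --type merge -p '
spec:
  storage:
    deleteClaim: false'
```

The check and the actual cluster deletion follow.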
206 | 207 | ```sh 208 | $ [[ $(kubectl get knp broker -o yaml | yq .spec.storage.deleteClaim) == "false" ]] \ 209 | && kubectl delete knp broker controller && kubectl delete k my-cluster 210 | kafkanodepool.kafka.strimzi.io "controller" deleted 211 | kafkanodepool.kafka.strimzi.io "broker" deleted 212 | kafka.kafka.strimzi.io "my-cluster" deleted 213 | ``` 214 | 215 | Create new and bigger volumes for our brokers. 216 | In this case, volumes are created automatically, but you may need to create them manually. 217 | They will be bound only when the first consumer (pod) will be created. 218 | 219 | ```sh 220 | $ for pod in $KAFKA_PODS; do 221 | echo "apiVersion: v1 222 | kind: PersistentVolumeClaim 223 | metadata: 224 | name: data-$pod-new 225 | labels: 226 | strimzi.io/name: my-cluster-kafka 227 | strimzi.io/pool-name: broker 228 | spec: 229 | accessModes: 230 | - ReadWriteOnce 231 | storageClassName: $VOLUME_CLASS 232 | resources: 233 | requests: 234 | storage: $NEW_VOLUME_SIZE" | kubectl create -f- 235 | done 236 | persistentvolumeclaim/data-my-cluster-broker-10-new created 237 | persistentvolumeclaim/data-my-cluster-broker-11-new created 238 | persistentvolumeclaim/data-my-cluster-broker-12-new created 239 | 240 | $ kubectl get pvc | grep my-cluster-broker 241 | data-my-cluster-broker-10 Bound pvc-fe5ccdb3-b550-467e-b6e0-f4d3ece79ed0 1Gi RWO gp3-csi 8m25s 242 | data-my-cluster-broker-10-new Pending gp3-csi 15s 243 | data-my-cluster-broker-11 Bound pvc-d76d68c6-52e9-4a9f-a20f-3b052ea49c55 1Gi RWO gp3-csi 8m25s 244 | data-my-cluster-broker-11-new Pending gp3-csi 15s 245 | data-my-cluster-broker-12 Bound pvc-6efa4986-a8f8-42d3-ae80-0229d262cf81 1Gi RWO gp3-csi 8m25s 246 | data-my-cluster-broker-12-new Pending gp3-csi 14s 247 | ``` 248 | 249 | Using a maintenance pod, copy all broker data from the old volumes to the new volumes. 250 | 251 | > [!NOTE] 252 | > The following command may take some time, depending on the amount of data to copy. 
253 | 254 | ```sh 255 | $ for pod in $KAFKA_PODS; do 256 | kubectl run kubectl-patch-$pod -itq --rm --restart "Never" --image "foo" --overrides "{ 257 | \"spec\": { 258 | \"containers\": [ 259 | { 260 | \"name\": \"busybox\", 261 | \"image\": \"busybox\", 262 | \"imagePullPolicy\": \"IfNotPresent\", 263 | \"command\": [\"/bin/sh\", \"-c\", \"cp -auvR /old/* /new\"], 264 | \"volumeMounts\": [ 265 | {\"name\": \"old\", \"mountPath\": \"/old\"}, 266 | {\"name\": \"new\", \"mountPath\": \"/new\"} 267 | ] 268 | } 269 | ], 270 | \"volumes\": [ 271 | {\"name\": \"old\", \"persistentVolumeClaim\": {\"claimName\": \"data-$pod\"}}, 272 | {\"name\": \"new\", \"persistentVolumeClaim\": {\"claimName\": \"data-$pod-new\"}} 273 | ] 274 | } 275 | }" 276 | done 277 | '/old/kafka-log10/.lock' -> '/new/kafka-log10/.lock' 278 | '/old/kafka-log10/bootstrap.checkpoint' -> '/new/kafka-log10/bootstrap.checkpoint' 279 | '/old/kafka-log10/recovery-point-offset-checkpoint' -> '/new/kafka-log10/recovery-point-offset-checkpoint' 280 | '/old/kafka-log10/meta.properties' -> '/new/kafka-log10/meta.properties' 281 | '/old/kafka-log10/__cluster_metadata-0/00000000000000000256.snapshot' -> '/new/kafka-log10/__cluster_metadata-0/00000000000000000256.snapshot' 282 | '/old/kafka-log10/__cluster_metadata-0/partition.metadata' -> '/new/kafka-log10/__cluster_metadata-0/partition.metadata' 283 | '/old/kafka-log10/__cluster_metadata-0/00000000000000000000.log' -> '/new/kafka-log10/__cluster_metadata-0/00000000000000000000.log' 284 | '/old/kafka-log10/__cluster_metadata-0/00000000000000000000.index' -> '/new/kafka-log10/__cluster_metadata-0/00000000000000000000.index' 285 | '/old/kafka-log10/__cluster_metadata-0/00000000000000000000.timeindex' -> '/new/kafka-log10/__cluster_metadata-0/00000000000000000000.timeindex' 286 | '/old/kafka-log10/__cluster_metadata-0/leader-epoch-checkpoint.tmp' -> '/new/kafka-log10/__cluster_metadata-0/leader-epoch-checkpoint.tmp' 287 | '/old/kafka-log10/__cluster_metadata-0/leader-epoch-checkpoint' -> '/new/kafka-log10/__cluster_metadata-0/leader-epoch-checkpoint' 288 | ... 289 | 290 | $ kubectl get pv | grep my-cluster-broker 291 | pvc-327097ee-094b-4725-afb9-1077b42f8504 10Gi RWO Delete Bound test/data-my-cluster-broker-11-new gp3-csi 106s 292 | pvc-5f306a61-0d84-4cbb-b1b4-8e05728f0397 10Gi RWO Delete Bound test/data-my-cluster-broker-10-new gp3-csi 2m6s 293 | pvc-6efa4986-a8f8-42d3-ae80-0229d262cf81 1Gi RWO Delete Bound test/data-my-cluster-broker-12 gp3-csi 24m 294 | pvc-777daab8-91e0-4560-8c12-e22318ffd9df 10Gi RWO Delete Bound test/data-my-cluster-broker-12-new gp3-csi 84s 295 | pvc-d76d68c6-52e9-4a9f-a20f-3b052ea49c55 1Gi RWO Delete Bound test/data-my-cluster-broker-11 gp3-csi 24m 296 | pvc-fe5ccdb3-b550-467e-b6e0-f4d3ece79ed0 1Gi RWO Delete Bound test/data-my-cluster-broker-10 gp3-csi 24m 297 | ``` 298 | 299 | > [!WARNING] 300 | > Set the persistent volume reclaim policy as Retain to avoid losing data when deleting broker PVCs. 
301 | 302 | ```sh 303 | $ for pv in $(kubectl get pv | grep my-cluster-broker | awk '{print $1}'); do 304 | kubectl patch pv $pv --type merge -p ' 305 | spec: 306 | persistentVolumeReclaimPolicy: Retain' 307 | done 308 | persistentvolume/pvc-6efa4986-a8f8-42d3-ae80-0229d262cf81 patched 309 | persistentvolume/pvc-d76d68c6-52e9-4a9f-a20f-3b052ea49c55 patched 310 | persistentvolume/pvc-fe5ccdb3-b550-467e-b6e0-f4d3ece79ed0 patched 311 | persistentvolume/pvc-777daab8-91e0-4560-8c12-e22318ffd9df patched 312 | persistentvolume/pvc-327097ee-094b-4725-afb9-1077b42f8504 patched 313 | persistentvolume/pvc-5f306a61-0d84-4cbb-b1b4-8e05728f0397 patched 314 | 315 | $ kubectl get pv | grep my-cluster-broker 316 | pvc-13e660ba-6a21-4bad-876b-cabab93ce38b 1Gi RWO Retain Bound test/data-my-cluster-broker-11 gp3-csi 14m 317 | pvc-2522a5ad-5275-4459-83f0-149d8cd007f3 10Gi RWO Retain Bound test/data-my-cluster-broker-11-new gp3-csi 79s 318 | pvc-26590b0f-c1ba-4069-9c24-f731287a7ed3 10Gi RWO Retain Bound test/data-my-cluster-broker-10-new gp3-csi 100s 319 | pvc-35fed9c0-f12f-4012-899a-759add4cef4e 10Gi RWO Retain Bound test/data-my-cluster-broker-12-new gp3-csi 57s 320 | pvc-aed21c6a-3b78-4a18-8e44-596285652b9d 1Gi RWO Retain Bound test/data-my-cluster-broker-10 gp3-csi 14m 321 | pvc-d7b08cd6-8199-4cbf-9193-98f0f6a3a29d 1Gi RWO Retain Bound test/data-my-cluster-broker-12 gp3-csi 14m 322 | ``` 323 | 324 | Now, delete all Kafka PVCs and PV claim references, just before creating the new PVCs with the new storage size. 325 | We have to use the same resource name that the operator expects, so that the new volumes will be bound on cluster startup. 326 | 327 | ```sh 328 | $ for pod in $KAFKA_PODS; do 329 | PVC_NAMES="$(kubectl get pvc | grep data-$pod | awk '{print $1}')" 330 | PV_NAMES="$(kubectl get pv | grep data-$pod | awk '{print $1}')" 331 | NEW_PV_NAME="$(kubectl get pv | grep data-$pod-new | awk '{print $1}')" 332 | kubectl delete pvc $PVC_NAMES 333 | kubectl patch pv $PV_NAMES --type json -p '[{"op":"remove","path":"/spec/claimRef"}]' 334 | echo "apiVersion: v1 335 | kind: PersistentVolumeClaim 336 | metadata: 337 | name: data-$pod 338 | labels: 339 | strimzi.io/name: my-cluster-kafka 340 | strimzi.io/pool-name: broker 341 | spec: 342 | accessModes: 343 | - ReadWriteOnce 344 | storageClassName: $VOLUME_CLASS 345 | volumeName: $NEW_PV_NAME 346 | resources: 347 | requests: 348 | storage: $NEW_VOLUME_SIZE" | kubectl create -f - 349 | done 350 | persistentvolumeclaim "data-my-cluster-broker-10" deleted 351 | persistentvolumeclaim "data-my-cluster-broker-10-new" deleted 352 | persistentvolume/pvc-26590b0f-c1ba-4069-9c24-f731287a7ed3 patched 353 | persistentvolume/pvc-aed21c6a-3b78-4a18-8e44-596285652b9d patched 354 | persistentvolumeclaim/data-my-cluster-broker-10created 355 | persistentvolumeclaim "data-my-cluster-broker-11" deleted 356 | persistentvolumeclaim "data-my-cluster-broker-11-new" deleted 357 | persistentvolume/pvc-13e660ba-6a21-4bad-876b-cabab93ce38b patched 358 | persistentvolume/pvc-2522a5ad-5275-4459-83f0-149d8cd007f3 patched 359 | persistentvolumeclaim/data-my-cluster-broker-11created 360 | persistentvolumeclaim "data-my-cluster-broker-12" deleted 361 | persistentvolumeclaim "data-my-cluster-broker-12-new" deleted 362 | persistentvolume/pvc-35fed9c0-f12f-4012-899a-759add4cef4e patched 363 | persistentvolume/pvc-d7b08cd6-8199-4cbf-9193-98f0f6a3a29d patched 364 | persistentvolumeclaim/data-my-cluster-broker-12created 365 | 366 | $ kubectl get pvc | grep my-cluster-broker 367 | 
data-my-cluster-broker-10 Bound pvc-26590b0f-c1ba-4069-9c24-f731287a7ed3 10Gi RWO gp3-csi 25s 368 | data-my-cluster-broker-11 Bound pvc-2522a5ad-5275-4459-83f0-149d8cd007f3 10Gi RWO gp3-csi 21s 369 | data-my-cluster-broker-12 Bound pvc-35fed9c0-f12f-4012-899a-759add4cef4e 10Gi RWO gp3-csi 17 370 | ``` 371 | 372 | Deploy the Kafka cluster with our brand new volumes, wait for the cluster to be ready, and try to consume some data. 373 | 374 | > [!IMPORTANT] 375 | > We adjust the storage size in Kafka custom resource, and set the previous `clusterId` in the Kafka CR status. 376 | > To speed up log recovery and partition synchronization, we can also tune recovery threads and replica fetchers. 377 | 378 | ```sh 379 | $ cat sessions/001/install/001-broker-pool.yaml \ 380 | | yq ".spec.storage.size = \"10Gi\"" | kubectl create -f - \ 381 | && cat sessions/001/install/002-my-cluster.yaml \ 382 | | yq ".metadata.annotations.\"strimzi.io/pause-reconciliation\" = \"true\"" \ 383 | | yq ".spec.kafka.config.\"num.recovery.threads.per.data.dir\" = 5" \ 384 | | yq ".spec.kafka.config.\"num.replica.fetchers\" = 5" | kubectl create -f - \ 385 | && kubectl create -f sessions/001/install 2>/dev/null 386 | kafkanodepool.kafka.strimzi.io/broker created 387 | kafka.kafka.strimzi.io/my-cluster created 388 | kafkanodepool.kafka.strimzi.io/controller created 389 | 390 | $ kubectl patch k my-cluster --subresource status --type merge -p " 391 | status: 392 | clusterId: \"$CLUSTER_ID\"" 393 | kafka.kafka.strimzi.io/my-cluster patched 394 | 395 | $ kubectl annotate k my-cluster strimzi.io/pause-reconciliation=false --overwrite 396 | kafka.kafka.strimzi.io/my-cluster annotated 397 | 398 | $ kubectl get po | grep my-cluster-broker 399 | my-cluster-broker-10 1/1 Running 0 4m34s 400 | my-cluster-broker-11 1/1 Running 0 4m34s 401 | my-cluster-broker-12 1/1 Running 0 4m33s 402 | 403 | $ kubectl-kafka bin/kafka-console-consumer.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 \ 404 | --topic my-topic --from-beginning --max-messages 3 405 | XVFTWDJKAIYRBIKZRFOEZNWURGQHGPDMOZYAEBTFLNCXMVOJOCPCXZLUZJKPTIFQVRHWKHBMTMHFHJGAIXNWURPJOKMXRAWLHMUNNWVYSNPIMZXJDKSLVMLJYZFJCQOIQXNFLYYYTEFK... 406 | FVABXPFDUNYNYMNVYWZDVZLGZASDYATOWNFMRODUPWCUVVIZFRLZNDOSQWZVNGMGEYHDVAWZDQLXBAIZGFDUOKGGHDBTLOJLMLPXTPXXZZQXFIVTAZOHHGWJBUSMPKIPCMOAJVSLUYGJ... 407 | OAPJJFCTIWBLZMWUVMWRSGJQMXVLATYRECKCHDEIHYOMLCLKAULDWNSRIXKVWSNHLJUADUZNUMCJQYASBCSJWHIKXLATGMGNENPSSVIUAWSRRABUBXFZZRKOGOFGTBVIWTWFUWHEEMGF... 408 | Processed a total of 3 messages 409 | ``` 410 | 411 | Finally, we delete the old volumes to reclaim some space. 
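Because the reclaim policy is now `Retain`, deleting a `PersistentVolume` object only removes it from the Kubernetes API; depending on the provisioner, the backing disk may still need to be cleaned up in the storage backend. Before the cleanup below, a quick sketch to double-check which volumes are actually unbound:

```sh
# The old 1Gi volumes should be Available with no claim, the new 10Gi ones Bound to the broker PVCs.
$ kubectl get pv -o custom-columns=NAME:.metadata.name,CAPACITY:.spec.capacity.storage,STATUS:.status.phase,RECLAIM:.spec.persistentVolumeReclaimPolicy,CLAIM:.spec.claimRef.name
```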
412 | 413 | ```sh 414 | $ kubectl delete pv $(kubectl get pv | grep Available | awk '{print $1}') 415 | persistentvolume "pvc-2522a5ad-5275-4459-83f0-149d8cd007f3" deleted 416 | persistentvolume "pvc-26590b0f-c1ba-4069-9c24-f731287a7ed3" deleted 417 | persistentvolume "pvc-35fed9c0-f12f-4012-899a-759add4cef4e" deleted 418 | 419 | $ kubectl get pv | grep my-cluster-broker 420 | pvc-2522a5ad-5275-4459-83f0-149d8cd007f3 10Gi RWO Retain Bound test/data-my-cluster-broker-11-new gp3-csi 79s 421 | pvc-26590b0f-c1ba-4069-9c24-f731287a7ed3 10Gi RWO Retain Bound test/data-my-cluster-broker-10-new gp3-csi 100s 422 | pvc-35fed9c0-f12f-4012-899a-759add4cef4e 10Gi RWO Retain Bound test/data-my-cluster-broker-12-new gp3-csi 57s 423 | ``` 424 | -------------------------------------------------------------------------------- /sessions/009/README.md: -------------------------------------------------------------------------------- 1 | ## Scaling up the cluster with the reassign tool 2 | 3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes. 4 | 5 | Then, we send some data. 6 | 7 | ```sh 8 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 100 --num-records 1000000 \ 9 | --throughput -1 --producer-props acks=1 bootstrap.servers=my-cluster-kafka-bootstrap:9092 10 | 1000000 records sent, 233699.462491 records/sec (22.29 MB/sec), 866.05 ms avg latency, 1652.00 ms max latency, 827 ms 50th, 1500 ms 95th, 1595 ms 99th, 1614 ms 99.9th. 11 | ``` 12 | 13 | When the cluster is ready, we want to scale it up and put some load on the new broker, which otherwise will sit idle waiting for new topic creation. 14 | Thanks to the Cluster Operator, we can scale the cluster up by simply raising the number of broker replicas in the Kafka custom resource (CR). 15 | 16 | ```sh 17 | $ kubectl patch knp broker --type merge -p ' 18 | spec: 19 | replicas: 4' 20 | kafkanodepool.kafka.strimzi.io/broker patched 21 | 22 | $ kubectl get po -l app.kubernetes.io/name=broker 23 | NAME READY STATUS RESTARTS AGE 24 | my-cluster-broker-10 1/1 Running 0 2m8s 25 | my-cluster-broker-11 1/1 Running 0 2m8s 26 | my-cluster-broker-12 1/1 Running 0 2m8s 27 | my-cluster-broker-13 1/1 Running 0 30s 28 | ``` 29 | 30 | One option is to use the `kafka-reassign-partitions.sh` tool to move existing data. 31 | We only have one topic here, but you may have hundreds of them, where some of them are busier than others. 32 | You would need a custom procedure to figure out which replica changes can be done in order to improve the balance, also considering available disk space and preferred replicas. 33 | The result of this procedure would be a `reassign.json` file describing the desired partition state for each topic that we can pass to the tool. 
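If you only need a starting point, the tool can also propose an assignment with its `--generate` mode. It spreads the listed topics across the given brokers without considering load, so the proposal usually still needs manual editing, but it saves some typing. A sketch, run from a broker or the kafka-tools pod (file paths are arbitrary):

```sh
# Describe which topics to move, then ask the tool for a candidate assignment over brokers 10-13.
$ echo '{"version": 1, "topics": [{"topic": "my-topic"}]}' > /tmp/topics.json
$ /opt/kafka/bin/kafka-reassign-partitions.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 \
  --topics-to-move-json-file /tmp/topics.json --broker-list "10,11,12,13" --generate
```

In this session we instead write the target assignment by hand, as shown below.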
34 | 
35 | ```sh
36 | $ kubectl exec -it my-cluster-broker-10 -- bash
37 | [kafka@my-cluster-broker-10 kafka]$ /opt/kafka/bin/kafka-topics.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --topic my-topic --describe
38 | Topic: my-topic TopicId: XbszKNVQSSKTPB3sGvRaGg PartitionCount: 3 ReplicationFactor: 3 Configs: min.insync.replicas=2,message.format.version=3.0-IV1
39 | Topic: my-topic Partition: 0 Leader: 10 Replicas: 10,12,11 Isr: 10,12,11
40 | Topic: my-topic Partition: 1 Leader: 12 Replicas: 12,11,10 Isr: 12,11,10
41 | Topic: my-topic Partition: 2 Leader: 11 Replicas: 11,10,12 Isr: 11,10,12
42 | 
43 | [kafka@my-cluster-broker-10 kafka]$ cat <<EOF >/tmp/reassign.json
44 | {
45 |   "version": 1,
46 |   "partitions": [
47 |     {"topic": "my-topic", "partition": 0, "replicas": [13, 12, 11]},
48 |     {"topic": "my-topic", "partition": 1, "replicas": [12, 11, 13]},
49 |     {"topic": "my-topic", "partition": 2, "replicas": [11, 13, 12]}
50 |   ]
51 | }
52 | EOF
53 | 
54 | [kafka@my-cluster-broker-10 kafka]$ /opt/kafka/bin/kafka-reassign-partitions.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 \
55 |   --reassignment-json-file /tmp/reassign.json --throttle 10000000 --execute
56 | Current partition replica assignment
57 | 
58 | {"version":1,"partitions":[{"topic":"my-topic","partition":0,"replicas":[10,12,11],"log_dirs":["any","any","any"]},{"topic":"my-topic","partition":1,"replicas":[12,11,10],"log_dirs":["any","any","any"]},{"topic":"my-topic","partition":2,"replicas":[11,10,12],"log_dirs":["any","any","any"]}]}
59 | 
60 | Save this to use as the --reassignment-json-file option during rollback
61 | Warning: You must run --verify periodically, until the reassignment completes, to ensure the throttle is removed.
62 | The inter-broker throttle limit was set to 10000000 B/s
63 | Successfully started partition reassignments for my-topic-0,my-topic-1,my-topic-2
64 | ```
65 | 
66 | To limit the impact on the cluster while moving partitions between brokers, we use the `--throttle` option with a limit of 10 MB/s.
67 | 
68 | > [!IMPORTANT]
69 | > The `--throttle` option also applies throttling to the normal replication traffic between brokers.
70 | > We need to find the right balance to ensure that we can move data in a reasonable amount of time without slowing down replication too much.
71 | > Don't forget to call `--verify` at the end to disable replication throttling, which otherwise will continue to affect the cluster.
72 | 
73 | We can start from a safe throttle value and then use the `kafka.server:type=FetcherLagMetrics,name=ConsumerLag,clientId=([-.\w]+),topic=([-.\w]+),partition=([0-9]+)` metric to observe how far the followers are lagging behind the leader for a given partition.
74 | If this lag is growing or the reassignment is taking too much time, we can run the command again with the `--additional` option to increase the throttle value.
75 | 
76 | After the reassignment is started, we use the `--verify` option to check the status of the reassignment process and disable the replication throttling.
77 | When the process is done, we can check that the new replica assignment has been applied.
78 | 
79 | ```sh
80 | [kafka@my-cluster-broker-10 kafka]$ /opt/kafka/bin/kafka-reassign-partitions.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 \
81 |   --reassignment-json-file /tmp/reassign.json --verify
82 | Status of partition reassignment:
83 | Reassignment of partition my-topic-0 is completed.
84 | Reassignment of partition my-topic-1 is completed.
85 | Reassignment of partition my-topic-2 is completed.
86 | 87 | Clearing broker-level throttles on brokers 10,11,12,13 88 | Clearing topic-level throttles on topic my-topic 89 | 90 | [kafka@my-cluster-broker-10 kafka]$ /opt/kafka/bin/kafka-topics.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --topic my-topic --describe 91 | Topic: my-topic TopicId: XbszKNVQSSKTPB3sGvRaGg PartitionCount: 3 ReplicationFactor: 3 Configs: min.insync.replicas=2,message.format.version=3.0-IV1 92 | Topic: my-topic Partition: 0 Leader: 13 Replicas: 13,12,11 Isr: 12,11,13 93 | Topic: my-topic Partition: 1 Leader: 12 Replicas: 12,11,13 Isr: 12,11,13 94 | Topic: my-topic Partition: 2 Leader: 11 Replicas: 11,13,12 Isr: 11,12,13 95 | 96 | [kafka@my-cluster-broker-10 kafka]$ exit 97 | exit 98 | ``` 99 | 100 | ## Scaling up the cluster with Cruise Control 101 | 102 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes. 103 | 104 | When the cluster is ready, we send some data and check how partitions are distributed between the brokers. 105 | 106 | ```sh 107 | $ kubectl-kafka bin/kafka-producer-perf-test.sh --topic my-topic --record-size 100 --num-records 10000000 \ 108 | --throughput -1 --producer-props acks=1 bootstrap.servers=my-cluster-kafka-bootstrap:9092 109 | ... 110 | 10000000 records sent, 435085.3 records/sec (41.49 MB/sec), 521.45 ms avg latency, 9808.00 ms max latency, 258 ms 50th, 1399 ms 95th, 9636 ms 99th, 9781 ms 99.9th. 111 | 112 | $ kubectl-kafka bin/kafka-topics.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --describe --topic my-topic 113 | Topic: my-topic TopicId: w7uEJVDXSm22zscX2-9AYA PartitionCount: 3 ReplicationFactor: 3 Configs: min.insync.replicas=2,retention.bytes=1073741824 114 | Topic: my-topic Partition: 0 Leader: 11 Replicas: 11,12,10 Isr: 11,12,10 Elr: LastKnownElr: 115 | Topic: my-topic Partition: 1 Leader: 12 Replicas: 12,10,11 Isr: 12,11,10 Elr: LastKnownElr: 116 | Topic: my-topic Partition: 2 Leader: 11 Replicas: 10,11,12 Isr: 11,12,10 Elr: LastKnownElr: 117 | ``` 118 | 119 | Then, we deploy Cruise Control with the auto-rebalancing feature enabled. 120 | 121 | > [!NOTE] 122 | > The auto-rebalancing feature will automatically generate and execute KafkaRebalance resources on cluster scale up and down. 123 | > Each mode can be customized by adding custom KafkaRebalance templates. 124 | 125 | The Cluster Operator will trigger a rolling update of the brokers to add the metrics reporter, and then it will deploy Cruise Control. 126 | 127 | ```sh 128 | $ kubectl patch k my-cluster --type merge -p ' 129 | spec: 130 | cruiseControl: 131 | autoRebalance: 132 | - mode: add-brokers 133 | - mode: remove-brokers' 134 | kafka.kafka.strimzi.io/my-cluster patched 135 | ``` 136 | 137 | Wait some time for Cruise Control to build its internal workload model, and then scale up the Kafka cluster adding a new broker. 138 | 139 | ```sh 140 | $ kubectl patch knp broker --type merge -p ' 141 | spec: 142 | replicas: 4' 143 | kafkanodepool.kafka.strimzi.io/broker patched 144 | ``` 145 | 146 | Follow the KafkaRebalance execution from command line. 147 | 148 | ```sh 149 | $ kubectl get kr -w 150 | NAME CLUSTER TEMPLATE STATUS 151 | my-cluster-auto-rebalancing-add-brokers my-cluster PendingProposal 152 | my-cluster-auto-rebalancing-add-brokers my-cluster ProposalReady 153 | my-cluster-auto-rebalancing-add-brokers my-cluster Rebalancing 154 | my-cluster-auto-rebalancing-add-brokers my-cluster Ready 155 | ``` 156 | 157 | When KafkaRebalance is ready, we can see that the new broker now contains existing replicas. 
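Before looking at the partition layout, the KafkaRebalance status also records a summary of what Cruise Control executed. A sketch; the exact field names can vary between Strimzi versions:

```sh
# Print the optimization summary (replica/leader movements, data to move, and so on).
$ kubectl get kr my-cluster-auto-rebalancing-add-brokers -o yaml | yq .status.optimizationResult
```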
158 | 159 | ```sh 160 | $ kubectl-kafka bin/kafka-topics.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --describe --topic my-topic 161 | Topic: my-topic TopicId: w7uEJVDXSm22zscX2-9AYA PartitionCount: 3 ReplicationFactor: 3 Configs: min.insync.replicas=2,retention.bytes=1073741824 162 | Topic: my-topic Partition: 0 Leader: 11 Replicas: 11,12,10 Isr: 10,11,12 Elr: LastKnownElr: 163 | Topic: my-topic Partition: 1 Leader: 12 Replicas: 12,10,11 Isr: 10,11,12 Elr: LastKnownElr: 164 | Topic: my-topic Partition: 2 Leader: 10 Replicas: 10,11,13 Isr: 10,11,13 Elr: LastKnownElr: 165 | ``` 166 | -------------------------------------------------------------------------------- /sessions/010/README.md: -------------------------------------------------------------------------------- 1 | ## Run transactional applications 2 | 3 | First, use [this session](/sessions/001) to deploy a Kafka cluster on Kubernetes. 4 | 5 | Then, run a transactional application example (read-process-write). 6 | 7 | ```sh 8 | $ kubectl create -f install.yaml 9 | kafkatopic.kafka.strimzi.io/input-topic created 10 | kafkatopic.kafka.strimzi.io/output-topic created 11 | statefulset.apps/kafka-txn created 12 | 13 | $ kubectl get po -l app=kafka-txn 14 | NAME READY STATUS RESTARTS AGE 15 | kafka-txn-0 1/1 Running 0 9m56s 16 | kafka-txn-1 1/1 Running 0 9m53s 17 | kafka-txn-2 1/1 Running 0 9m50s 18 | ``` 19 | 20 | When the application is running, we send one sentence to the input topic and check the result from the output topic. 21 | 22 | ```sh 23 | $ kubectl-kafka bin/kafka-console-producer.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --topic input-topic 24 | >this is a test 25 | >^C 26 | 27 | $ kubectl-kafka bin/kafka-console-consumer.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --topic output-topic --from-beginning 28 | tset a si siht 29 | ^CProcessed a total of 1 messages 30 | ``` 31 | 32 | After that, we can take a look at partition content. 33 | Our output topic has one partition, but what are the `__consumer_offsets` and `__transaction_state` coordinating partitions? 34 | We can pass the `group.id` and `transactional.id` to the following function define in `init.sh` to find out. 35 | 36 | ```sh 37 | $ kafka-cp my-group 38 | 12 39 | 40 | $ kafka-cp kafka-txn-0 41 | 30 42 | ``` 43 | 44 | We now check what's happening inside all the partitions involved in this transaction. 45 | In `output-topic-0`, we see that the data batch is transactional (`isTransactional`) and contains the PID and epoch. 46 | This batch is followed by a control batch (`isControl`), which contains a single end transaction marker record (`endTxnMarker`). 47 | In `__consumer_offsets-12`, the consumer group's offset commit batch is followed by a similar control batch. 
48 | 49 | ```sh 50 | $ kubectl exec my-cluster-broker-10 -- bin/kafka-dump-log.sh --deep-iteration --print-data-log \ 51 | --files /var/lib/kafka/data/kafka-log10/output-topic-0/00000000000000000000.log 52 | Dumping /var/lib/kafka/data/kafka-log10/output-topic-0/00000000000000000000.log 53 | Log starting offset: 0 54 | baseOffset: 0 lastOffset: 0 count: 1 baseSequence: 0 lastSequence: 0 producerId: 1 producerEpoch: 0 partitionLeaderEpoch: 0 isTransactional: true isControl: false deleteHorizonMs: OptionalLong.empty position: 0 CreateTime: 1742739702864 size: 82 magic: 2 compresscodec: none crc: 758896000 isvalid: true 55 | | offset: 0 CreateTime: 1742739702864 keySize: -1 valueSize: 14 sequence: 0 headerKeys: [] payload: tset a si siht 56 | baseOffset: 1 lastOffset: 1 count: 1 baseSequence: -1 lastSequence: -1 producerId: 1 producerEpoch: 0 partitionLeaderEpoch: 0 isTransactional: true isControl: true deleteHorizonMs: OptionalLong.empty position: 82 CreateTime: 1742739703234 size: 78 magic: 2 compresscodec: none crc: 2557578104 isvalid: true 57 | | offset: 1 CreateTime: 1742739703234 keySize: 4 valueSize: 6 sequence: -1 headerKeys: [] endTxnMarker: COMMIT coordinatorEpoch: 0 58 | 59 | $ kubectl exec my-cluster-broker-10 -- bin/kafka-dump-log.sh --deep-iteration --print-data-log --offsets-decoder \ 60 | --files /var/lib/kafka/data/kafka-log10/__consumer_offsets-12/00000000000000000000.log 61 | Dumping /var/lib/kafka/data/kafka-log10/__consumer_offsets-12/00000000000000000000.log 62 | Log starting offset: 0 63 | ... 64 | baseOffset: 7 lastOffset: 7 count: 1 baseSequence: 0 lastSequence: 0 producerId: 1 producerEpoch: 0 partitionLeaderEpoch: 0 isTransactional: true isControl: false deleteHorizonMs: OptionalLong.empty position: 1974 CreateTime: 1742739703027 size: 121 magic: 2 compresscodec: none crc: 4292816145 isvalid: true 65 | | offset: 7 CreateTime: 1742739703027 keySize: 29 valueSize: 24 sequence: 0 headerKeys: [] key: {"type":"1","data":{"group":"my-group","topic":"input-topic","partition":0}} payload: {"version":"3","data":{"offset":1,"leaderEpoch":-1,"metadata":"","commitTimestamp":1742739702993}} 66 | baseOffset: 8 lastOffset: 8 count: 1 baseSequence: -1 lastSequence: -1 producerId: 1 producerEpoch: 0 partitionLeaderEpoch: 0 isTransactional: true isControl: true deleteHorizonMs: OptionalLong.empty position: 2095 CreateTime: 1742739703213 size: 78 magic: 2 compresscodec: none crc: 1231080676 isvalid: true 67 | | offset: 8 CreateTime: 1742739703213 keySize: 4 valueSize: 6 sequence: -1 headerKeys: [] endTxnMarker: COMMIT coordinatorEpoch: 0 68 | ... 69 | ``` 70 | 71 | That was straightforward, but how is the transaction state managed by the coordinator? 72 | In `__transaction_state-20` record payloads, we can see all transaction state changes keyed by TID `kafka-txn-0` (we also have PID+epoch). 73 | The transaction starts in the `Empty` state, then we have two `Ongoing` state changes (one for each partition registration). 74 | Then, when the commit is called, we have `PrepareCommit` state change, which means the broker is now committed to the transaction. 75 | This happens in the last batch, where the state is changed to `CompleteCommit`, terminating the transaction. 
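The full decoder output below shows every batch. To see just the state progression at a glance, we can filter the record payloads; this reuses the same dump command, assuming the coordinating partition 30 computed earlier:

```sh
# Extract only the state field from each transaction metadata record.
$ kubectl exec my-cluster-broker-10 -- bin/kafka-dump-log.sh --deep-iteration --print-data-log --transaction-log-decoder \
    --files /var/lib/kafka/data/kafka-log10/__transaction_state-30/00000000000000000000.log | grep -o "state=[A-Za-z]*"
state=Empty
state=Ongoing
state=Ongoing
state=PrepareCommit
state=CompleteCommit
```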
76 | 77 | ```sh 78 | $ kubectl exec my-cluster-broker-10 -- bin/kafka-dump-log.sh --deep-iteration --print-data-log --transaction-log-decoder \ 79 | --files /var/lib/kafka/data/kafka-log10/__transaction_state-30/00000000000000000000.log 80 | Dumping /var/lib/kafka/data/kafka-log10/__transaction_state-30/00000000000000000000.log 81 | Log starting offset: 0 82 | baseOffset: 0 lastOffset: 0 count: 1 baseSequence: -1 lastSequence: -1 producerId: -1 producerEpoch: -1 partitionLeaderEpoch: 0 isTransactional: false isControl: false deleteHorizonMs: OptionalLong.empty position: 0 CreateTime: 1742739549438 size: 120 magic: 2 compresscodec: none crc: 3663501755 isvalid: true 83 | | offset: 0 CreateTime: 1742739549438 keySize: 15 valueSize: 37 sequence: -1 headerKeys: [] key: transaction_metadata::transactionalId=kafka-txn-0 payload: producerId:1,producerEpoch:0,state=Empty,partitions=[],txnLastUpdateTimestamp=1742739549435,txnTimeoutMs=60000 84 | baseOffset: 1 lastOffset: 1 count: 1 baseSequence: -1 lastSequence: -1 producerId: -1 producerEpoch: -1 partitionLeaderEpoch: 0 isTransactional: false isControl: false deleteHorizonMs: OptionalLong.empty position: 120 CreateTime: 1742739702876 size: 143 magic: 2 compresscodec: none crc: 563111626 isvalid: true 85 | | offset: 1 CreateTime: 1742739702876 keySize: 15 valueSize: 59 sequence: -1 headerKeys: [] key: transaction_metadata::transactionalId=kafka-txn-0 payload: producerId:1,producerEpoch:0,state=Ongoing,partitions=[output-topic-0],txnLastUpdateTimestamp=1742739702876,txnTimeoutMs=60000 86 | baseOffset: 2 lastOffset: 2 count: 1 baseSequence: -1 lastSequence: -1 producerId: -1 producerEpoch: -1 partitionLeaderEpoch: 0 isTransactional: false isControl: false deleteHorizonMs: OptionalLong.empty position: 263 CreateTime: 1742739702882 size: 172 magic: 2 compresscodec: none crc: 1296972565 isvalid: true 87 | | offset: 2 CreateTime: 1742739702882 keySize: 15 valueSize: 87 sequence: -1 headerKeys: [] key: transaction_metadata::transactionalId=kafka-txn-0 payload: producerId:1,producerEpoch:0,state=Ongoing,partitions=[output-topic-0,__consumer_offsets-12],txnLastUpdateTimestamp=1742739702882,txnTimeoutMs=60000 88 | baseOffset: 3 lastOffset: 3 count: 1 baseSequence: -1 lastSequence: -1 producerId: -1 producerEpoch: -1 partitionLeaderEpoch: 0 isTransactional: false isControl: false deleteHorizonMs: OptionalLong.empty position: 435 CreateTime: 1742739703134 size: 172 magic: 2 compresscodec: none crc: 598474139 isvalid: true 89 | | offset: 3 CreateTime: 1742739703134 keySize: 15 valueSize: 87 sequence: -1 headerKeys: [] key: transaction_metadata::transactionalId=kafka-txn-0 payload: producerId:1,producerEpoch:0,state=PrepareCommit,partitions=[output-topic-0,__consumer_offsets-12],txnLastUpdateTimestamp=1742739703132,txnTimeoutMs=60000 90 | baseOffset: 4 lastOffset: 4 count: 1 baseSequence: -1 lastSequence: -1 producerId: -1 producerEpoch: -1 partitionLeaderEpoch: 0 isTransactional: false isControl: false deleteHorizonMs: OptionalLong.empty position: 607 CreateTime: 1742739703240 size: 120 magic: 2 compresscodec: none crc: 4205821491 isvalid: true 91 | | offset: 4 CreateTime: 1742739703240 keySize: 15 valueSize: 37 sequence: -1 headerKeys: [] key: transaction_metadata::transactionalId=kafka-txn-0 payload: producerId:1,producerEpoch:0,state=CompleteCommit,partitions=[],txnLastUpdateTimestamp=1742739703142,txnTimeoutMs=60000 92 | ``` 93 | 94 | ## Transaction rollback 95 | 96 | When there is a hanging transaction the LSO is stuck, which means that transactional 
consumers of this partition can't make any progress (CURRENT-OFFSET==LSO). 97 | 98 | ```sh 99 | # application log 100 | [Consumer clientId=my-client, groupId=my-group] The following partitions still have unstable offsets which are not cleared on the broker side: [__consumer_offsets-27], 101 | this could be either transactional offsets waiting for completion, or normal offsets waiting for replication after appending to local log 102 | 103 | # consumer lag grows 104 | $ kubectl-kafka bin/kafka-consumer-groups.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 --describe --group my-group 105 | GROUP TOPIC PARTITION CURRENT-OFFSET LOG-END-OFFSET LAG CONSUMER-ID HOST CLIENT-ID 106 | my-group __consumer_offsets-27 9 913095344 913097449 2105 my-client-0 /10.60.172.97 my-client 107 | ``` 108 | 109 | If the partition is part of a compacted topic like `__consumer_offsets`, compaction is also blocked, causing unbounded partition growth. 110 | The last cleaned offset never changes. 111 | 112 | ```sh 113 | $ kubectl exec -it my-cluster-broker-10 -- bash 114 | 115 | [kafka@my-cluster-broker-10 kafka]$ grep "__consumer_offsets 27" /var/lib/kafka/data/kafka-log10/cleaner-offset-checkpoint 116 | __consumer_offsets 27 913095344 117 | 118 | [kafka@my-cluster-broker-10 kafka]$ exit 119 | exit 120 | ``` 121 | 122 | In Kafka 3+ there is an official command line tool that you can use to identify and rollback hanging transactions. 123 | 124 | > [!IMPORTANT] 125 | > The `CLUSTER_ACTION` operation type is required when authorization is enabled. 126 | 127 | ```sh 128 | $ kubectl-kafka bin/kafka-transactions.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 find-hanging --broker 10 129 | Topic Partition ProducerId ProducerEpoch StartOffset LastTimestamp Duration(s) 130 | __consumer_offsets 27 171100 1 913095344 2022-06-06T03:16:47Z 209793 131 | 132 | $ kubectl-kafka bin/kafka-transactions.sh --bootstrap-server my-cluster-kafka-bootstrap:9092 abort \ 133 | --topic __consumer_offsets --partition 27 --start-offset 913095344 134 | ``` 135 | -------------------------------------------------------------------------------- /sessions/010/install.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kafka.strimzi.io/v1beta2 2 | kind: KafkaTopic 3 | metadata: 4 | name: input-topic 5 | labels: 6 | strimzi.io/cluster: my-cluster 7 | spec: 8 | partitions: 1 9 | replicas: 3 10 | config: 11 | retention.ms: 7200000 12 | segment.bytes: 1073741824 13 | --- 14 | apiVersion: kafka.strimzi.io/v1beta2 15 | kind: KafkaTopic 16 | metadata: 17 | name: output-topic 18 | labels: 19 | strimzi.io/cluster: my-cluster 20 | spec: 21 | partitions: 1 22 | replicas: 3 23 | config: 24 | retention.ms: 7200000 25 | segment.bytes: 1073741824 26 | --- 27 | # using sts because we need a stable identity (pod names) 28 | apiVersion: apps/v1 29 | kind: StatefulSet 30 | metadata: 31 | name: kafka-txn 32 | spec: 33 | replicas: 3 34 | serviceName: kafka-txn 35 | selector: 36 | matchLabels: 37 | app: kafka-txn 38 | template: 39 | metadata: 40 | labels: 41 | app: kafka-txn 42 | spec: 43 | containers: 44 | - name: kafka-txn 45 | image: ghcr.io/fvaleri/kafka-txn:latest 46 | imagePullPolicy: Always 47 | env: 48 | - name: BOOTSTRAP_SERVERS 49 | value: "my-cluster-kafka-bootstrap:9092" 50 | - name: GROUP_ID 51 | value: "my-group" 52 | - name: INSTANCE_ID 53 | valueFrom: 54 | fieldRef: 55 | fieldPath: metadata.name 56 | - name: INPUT_TOPIC 57 | value: "input-topic" 58 | - name: OUTPUT_TOPIC 59 | value: 
"output-topic" 60 | --------------------------------------------------------------------------------