├── docs
│   ├── index.md
│   ├── images
│   │   ├── debezium-iceberg.png
│   │   ├── rdbms-debezium-iceberg.png
│   │   ├── rdbms-debezium-iceberg_white.png
│   │   ├── debezium-iceberg-architecture.drawio.png
│   │   └── debezium-iceberg.drawio
│   ├── contributing.md
│   ├── icebergevents.md
│   ├── python-runner.md
│   ├── faq.md
│   └── migration.md
├── .dockerignore
├── examples
│   ├── lakekeeper
│   │   ├── notebooks
│   │   │   └── .gitignore
│   │   ├── config
│   │   │   └── application.properties
│   │   └── produce_data.py
│   └── nessie
│       ├── config
│       │   └── application.properties
│       ├── produce_data.py
│       └── docker-compose.yaml
├── debezium-server-iceberg-dist
│   ├── src
│   │   └── main
│   │       └── resources
│   │           ├── distro
│   │           │   ├── jmx
│   │           │   │   ├── jmxremote.access
│   │           │   │   ├── jmxremote.password
│   │           │   │   └── enable_jmx.sh
│   │           │   ├── lib_metrics
│   │           │   │   └── enable_exporter.sh
│   │           │   ├── run.sh
│   │           │   ├── config
│   │           │   │   └── metrics.yml
│   │           │   └── debezium.py
│   │           └── assemblies
│   │               └── server-distribution.xml
│   └── README.md
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── deploy-documentation.yml
│       ├── build.yml
│       ├── stale.yml
│       ├── codeql-analysis.yml
│       └── release.yml
├── debezium-server-iceberg-sink
│   └── src
│       ├── main
│       │   ├── resources
│       │   │   └── META-INF
│       │   │       └── beans.xml
│       │   └── java
│       │       └── io
│       │           └── debezium
│       │               └── server
│       │                   └── iceberg
│       │                       ├── mapper
│       │                       │   ├── IcebergTableMapper.java
│       │                       │   └── DefaultIcebergTableMapper.java
│       │                       ├── converter
│       │                       │   ├── SchemaConverter.java
│       │                       │   ├── AbstractVariantObject.java
│       │                       │   ├── IcebergSchemaInfo.java
│       │                       │   ├── EventConverter.java
│       │                       │   └── DateTimeUtils.java
│       │                       ├── batchsizewait
│       │                       │   ├── NoBatchSizeWait.java
│       │                       │   ├── BatchSizeWait.java
│       │                       │   └── MaxBatchSizeWait.java
│       │                       ├── GlobalConfig.java
│       │                       ├── history
│       │                       │   └── IcebergSchemaHistoryConfig.java
│       │                       ├── offset
│       │                       │   └── IcebergOffsetBackingStoreConfig.java
│       │                       ├── tableoperator
│       │                       │   ├── Operation.java
│       │                       │   ├── PartitionedAppendWriter.java
│       │                       │   ├── UnpartitionedDeltaWriter.java
│       │                       │   ├── PartitionedDeltaWriter.java
│       │                       │   ├── RecordWrapper.java
│       │                       │   ├── BaseDeltaTaskWriter.java
│       │                       │   └── IcebergTableWriterFactory.java
│       │                       ├── BatchConfig.java
│       │                       ├── storage
│       │                       │   └── BaseIcebergStorageConfig.java
│       │                       └── IcebergConfig.java
│       └── test
│           ├── resources
│           │   ├── json
│           │   │   ├── serde-unnested-delete-key-withschema.json
│           │   │   ├── serde-unnested-order-key-withschema.json
│           │   │   ├── serde-update.json
│           │   │   ├── unwrap-with-schema.json
│           │   │   ├── serde-with-array.json
│           │   │   └── serde-with-schema_geom.json
│           │   ├── mongodb
│           │   │   └── Dockerfile
│           │   └── META-INF
│           │       └── services
│           │           └── org.eclipse.microprofile.config.spi.ConfigSource
│           └── java
│               └── io
│                   └── debezium
│                       └── server
│                           └── iceberg
│                               ├── GlobalConfigProducer.java
│                               ├── IcebergConfigProducer.java
│                               ├── DebeziumConfigProducer.java
│                               ├── mapper
│                               │   ├── CustomMapper.java
│                               │   └── CustomMapperTest.java
│                               ├── converter
│                               │   ├── JsonEventConverterSchemaDataTest.java
│                               │   └── JsonEventConverterBuilderTest.java
│                               ├── testresources
│                               │   ├── CatalogJdbc.java
│                               │   ├── TestUtil.java
│                               │   ├── CatalogRest.java
│                               │   ├── SourceMongoDB.java
│                               │   ├── CatalogNessie.java
│                               │   ├── SourceMysqlDB.java
│                               │   ├── SourcePostgresqlDB.java
│                               │   └── S3Minio.java
│                               ├── tableoperator
│                               │   ├── UnpartitionedDeltaWriterTest.java
│                               │   └── BaseWriterTest.java
│                               ├── IcebergChangeConsumerJdbcCatalogTest.java
│                               ├── GlobalConfigTest.java
│                               ├── IcebergChangeConsumerRestCatalogTest.java
│                               ├── IcebergChangeConsumerConnectTest.java
│                               ├── IcebergChangeConsumerNessieCatalogTest.java
│                               ├── IcebergChangeConsumerDecimalTest.java
│                               ├── IcebergEventsChangeConsumerTest.java
│                               ├── IcebergChangeConsumerMongodbTest.java
│                               ├── IcebergChangeConsumerExcludedColumnsTest.java
│                               ├── batchsizewait
│                               │   └── MaxBatchSizeWaitTest.java
│                               ├── history
│                               │   └── IcebergSchemaHistoryTest.java
│                               ├── IcebergChangeConsumerMysqlTest.java
│                               ├── IcebergChangeConsumerTestUnwraapped.java
│                               └── IcebergChangeConsumerTemporalIsoStringTest.java
├── python
│   ├── debezium
│   │   ├── __main__.py
│   │   └── __init__.py
│   └── pyproject.toml
├── .run
│   ├── IcebergChangeConsumerTest.run.xml
│   ├── IcebergChangeConsumerTest.testSimpleUpload.run.xml
│   ├── All in debezium-server-iceberg-sink.run.xml
│   ├── package.run.xml
│   ├── dependency_tree.run.xml
│   └── clean,install.run.xml
├── mkdocs.yml
├── Dockerfile
├── README.md
└── .gitignore
/docs/index.md:
--------------------------------------------------------------------------------
1 | --8<-- "README.md"
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | **/target/
3 |
4 | .idea/
5 | .github/
6 | .run/
--------------------------------------------------------------------------------
/examples/lakekeeper/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | spark-warehouse
2 | .ipynb_checkpoints
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/distro/jmx/jmxremote.access:
--------------------------------------------------------------------------------
1 | monitor readonly
2 | admin readwrite
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/distro/jmx/jmxremote.password:
--------------------------------------------------------------------------------
1 | admin admin123
2 | monitor monitor123
--------------------------------------------------------------------------------
/docs/images/debezium-iceberg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/memiiso/debezium-server-iceberg/HEAD/docs/images/debezium-iceberg.png
--------------------------------------------------------------------------------
/docs/images/rdbms-debezium-iceberg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/memiiso/debezium-server-iceberg/HEAD/docs/images/rdbms-debezium-iceberg.png
--------------------------------------------------------------------------------
/docs/images/rdbms-debezium-iceberg_white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/memiiso/debezium-server-iceberg/HEAD/docs/images/rdbms-debezium-iceberg_white.png
--------------------------------------------------------------------------------
/docs/images/debezium-iceberg-architecture.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/memiiso/debezium-server-iceberg/HEAD/docs/images/debezium-iceberg-architecture.drawio.png
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/README.md:
--------------------------------------------------------------------------------
1 | A copy of the
2 | Debezium [debezium-server-dist](https://github.com/debezium/debezium/tree/master/debezium-server/debezium-server-dist)
3 | project.
4 |
5 | Authors: Debezium Authors
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "github-actions"
4 | directory: "/"
5 | schedule:
6 | interval: "weekly"
7 | - package-ecosystem: "maven"
8 | directory: "/"
9 | schedule:
10 | interval: "weekly"
11 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/resources/META-INF/beans.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/mapper/IcebergTableMapper.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.mapper;
2 |
3 | import org.apache.iceberg.catalog.TableIdentifier;
4 |
5 | public interface IcebergTableMapper {
6 | TableIdentifier mapDestination(String destination);
7 | }
8 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/json/serde-unnested-delete-key-withschema.json:
--------------------------------------------------------------------------------
1 | {
2 | "schema": {
3 | "type": "struct",
4 | "fields": [
5 | {
6 | "type": "int32",
7 | "optional": false,
8 | "field": "id"
9 | }
10 | ],
11 | "optional": false,
12 | "name": "testc.inventory.customers.Key"
13 | },
14 | "payload": {
15 | "id": 1004
16 | }
17 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/json/serde-unnested-order-key-withschema.json:
--------------------------------------------------------------------------------
1 | {
2 | "schema": {
3 | "type": "struct",
4 | "fields": [
5 | {
6 | "type": "int32",
7 | "optional": false,
8 | "field": "order_number"
9 | }
10 | ],
11 | "optional": false,
12 | "name": "testc.inventory.orders.Key"
13 | },
14 | "payload": {
15 | "order_number": 10004
16 | }
17 | }
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | The Debezium Iceberg consumer is a young project looking for new maintainers. There are definitely many
4 | improvements to make, large and small, ranging from documentation and new features to bug reports.
5 |
6 | Please feel free to send pull requests, report bugs, or open feature requests.
7 |
8 | ## License
9 |
10 | By contributing, you agree that your contributions will be licensed under the Apache 2.0 License.
11 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/distro/lib_metrics/enable_exporter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # To enable Prometheus JMX exporter, set JMX_EXPORTER_PORT environment variable
3 |
4 | if [ -n "${JMX_EXPORTER_PORT}" ]; then
5 | JMX_EXPORTER_CONFIG=${JMX_EXPORTER_CONFIG:-"config/metrics.yml"}
6 | JMX_EXPORTER_AGENT_JAR=$(find lib_metrics -name "jmx_prometheus_javaagent-*.jar")
7 | export JAVA_OPTS="-javaagent:${JMX_EXPORTER_AGENT_JAR}=0.0.0.0:${JMX_EXPORTER_PORT}:${JMX_EXPORTER_CONFIG} ${JAVA_OPTS}"
8 | fi
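9 |
10 | # Illustrative usage (assumes a jmx_prometheus_javaagent jar is present under lib_metrics):
11 | #   export JMX_EXPORTER_PORT=9404
12 | #   ./run.sh   # metrics are then served on 0.0.0.0:9404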
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/converter/SchemaConverter.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.converter;
2 |
3 | import org.apache.iceberg.Schema;
4 | import org.apache.iceberg.SortOrder;
5 |
6 | public interface SchemaConverter {
7 | @Override
8 | int hashCode();
9 |
10 | @Override
11 | boolean equals(Object o);
12 |
13 | Schema icebergSchema(boolean withIdentifierFields);
14 |
15 | default Schema icebergSchema() {
16 | return icebergSchema(true);
17 | }
18 |
19 | SortOrder sortOrder(Schema schema);
20 | }
21 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/mongodb/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mongo:6.0
2 |
3 | LABEL maintainer="Debezium Community"
4 |
5 | ENV REPLICA_SET_HOSTS="localhost"
6 |
7 | # Starting with MongoDB 4.4, a MongoDB deployment with authentication enabled requires a key file
8 | # for intra-replica-set communication
9 | RUN openssl rand -base64 756 > /etc/mongodb.keyfile &&\
10 | chown mongodb:mongodb /etc/mongodb.keyfile &&\
11 | chmod 400 /etc/mongodb.keyfile
12 |
13 | COPY start-mongodb.sh /usr/local/bin/
14 | RUN chmod +x /usr/local/bin/start-mongodb.sh
15 |
16 | ENTRYPOINT ["start-mongodb.sh"]
17 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/batchsizewait/NoBatchSizeWait.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.batchsizewait;
10 |
11 | import jakarta.enterprise.context.Dependent;
12 | import jakarta.inject.Named;
13 |
14 | /**
15 | * A no-op implementation that adds no wait between batches; records are processed as they arrive.
16 | *
17 | * @author Ismail Simsek
18 | */
19 | @Dependent
20 | @Named("NoBatchSizeWait")
21 | public class NoBatchSizeWait implements BatchSizeWait {
22 | }
23 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/batchsizewait/BatchSizeWait.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.batchsizewait;
10 |
11 | /**
12 | * When enabled, adds a wait to the consumer to control the batch size. It effectively turns the processing into batch processing.
13 | *
14 | * @author Ismail Simsek
15 | */
16 | public interface BatchSizeWait {
17 |
18 | default void initizalize() {
19 | }
20 |
21 | default void waitMs(Integer numRecordsProcessed, Integer processingTimeMs) throws InterruptedException {
22 | }
23 |
24 | }
25 |
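26 | // Implementations are CDI beans selected by their @Named value, e.g. (illustrative):
27 | //   debezium.sink.batch.batch-size-wait=MaxBatchSizeWait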
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/GlobalConfig.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 |
3 | import io.quarkus.runtime.annotations.ConfigRoot;
4 | import io.smallrye.config.ConfigMapping;
5 | import io.smallrye.config.WithDefault;
6 | import io.smallrye.config.WithName;
7 | import io.smallrye.config.WithParentName;
8 | import org.jboss.logging.Logger;
9 |
10 | @ConfigRoot
11 | @ConfigMapping
12 | public interface GlobalConfig {
13 |
14 | @WithParentName
15 | IcebergConfig iceberg();
16 |
17 | @WithParentName
18 | DebeziumConfig debezium();
19 |
20 | @WithParentName
21 | BatchConfig batch();
22 |
23 | @WithName("quarkus.log.level")
24 | @WithDefault("INFO")
25 | Logger.Level quarkusLogLevel();
26 |
27 | }
--------------------------------------------------------------------------------
/.github/workflows/deploy-documentation.yml:
--------------------------------------------------------------------------------
1 | name: deploy-mkdocs-documentation
2 | on:
3 | push:
4 | branches:
5 | - master
6 | - main
7 | - docs
8 | permissions:
9 | contents: write
10 | jobs:
11 | deploy:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 | - name: Configure Git Credentials
16 | run: |
17 | git config user.name github-actions[bot]
18 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com
19 | - uses: actions/setup-python@v6
20 | with:
21 | python-version: 3.x
22 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
23 | - run: pip install mkdocs-material
24 | - run: mkdocs gh-deploy --force
--------------------------------------------------------------------------------
/python/debezium/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from debezium import Debezium
4 |
5 |
6 | def main():
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--debezium_dir', type=str, default=None,
9 | help='Directory of debezium server application')
10 | parser.add_argument('--conf_dir', type=str, default=None,
11 | help='Directory of application.properties')
12 | parser.add_argument('--java_home', type=str, default=None,
13 | help='JAVA_HOME directory')
14 | _args, args = parser.parse_known_args()
15 | ds = Debezium(debezium_dir=_args.debezium_dir, conf_dir=_args.conf_dir, java_home=_args.java_home)
16 | ds.run(*args)
17 |
18 |
19 | if __name__ == '__main__':
20 | main()
21 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/history/IcebergSchemaHistoryConfig.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.history;
2 |
3 | import io.debezium.config.Configuration;
4 | import io.debezium.server.iceberg.storage.BaseIcebergStorageConfig;
5 |
6 |
7 | public class IcebergSchemaHistoryConfig extends BaseIcebergStorageConfig {
8 | public IcebergSchemaHistoryConfig(Configuration config, String configuration_field_prefix) {
9 | super(config, configuration_field_prefix);
10 | }
11 |
12 | @Override
13 | public String tableName() {
14 | return this.config.getProperty("table-name", "debezium_database_history_storage");
15 | }
16 | public String getMigrateHistoryFile() {
17 | return config.getProperty("migrate-history-file", "");
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/offset/IcebergOffsetBackingStoreConfig.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.offset;
2 |
3 | import io.debezium.config.Configuration;
4 | import io.debezium.server.iceberg.storage.BaseIcebergStorageConfig;
5 |
6 |
7 | public class IcebergOffsetBackingStoreConfig extends BaseIcebergStorageConfig {
8 | public IcebergOffsetBackingStoreConfig(Configuration config, String configuration_field_prefix) {
9 | super(config, configuration_field_prefix);
10 | }
11 |
12 | @Override
13 | public String tableName() {
14 | return this.config.getProperty("table-name", "debezium_offset_storage");
15 | }
16 |
17 | public String getMigrateOffsetFile() {
18 | return this.config.getProperty("migrate-offset-file","");
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/.run/IcebergChangeConsumerTest.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/json/serde-update.json:
--------------------------------------------------------------------------------
1 | {
2 | "op": "u",
3 | "ts_ms": 1465491411815,
4 | "before": {
5 | "id": 1004,
6 | "first_name": "Anne-Marie",
7 | "last_name": "Kretchmar",
8 | "email": "annek@noanswer.org"
9 | },
10 | "after": {
11 | "id": 1004,
12 | "first_name": "Anne",
13 | "last_name": "Kretchmar",
14 | "email": "annek@noanswer.org"
15 | },
16 | "source": {
17 | "version": "0.10.0.Final",
18 | "connector": "mysql",
19 | "name": "mysql-server-1",
20 | "ts_ms": 0,
21 | "snapshot": false,
22 | "db": "inventory",
23 | "table": "customers",
24 | "server_id": 0,
25 | "gtid": null,
26 | "file": "mysql-bin.000003",
27 | "pos": 154,
28 | "row": 0,
29 | "thread": 7,
30 | "query": "INSERT INTO customers (first_name, last_name, email) VALUES ('Anne', 'Kretchmar', 'annek@noanswer.org')"
31 | }
32 | }
--------------------------------------------------------------------------------
/.run/IcebergChangeConsumerTest.testSimpleUpload.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.run/All in debezium-server-iceberg-sink.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/python/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "setuptools-scm"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "debezium"
7 | version = "0.1.0"
8 | authors = [
9 | { name = "Memiiso Organization" },
10 | ]
11 | description = "Debezium Server Python runner"
12 | # readme = "README.md"
13 | requires-python = ">=3.8"
14 | keywords = ["Debezium", "Replication", "Apache", "Iceberg"]
15 | license = { text = "Apache License 2.0" }
16 | classifiers = [
17 | "Development Status :: 5 - Production/Stable",
18 | "Programming Language :: Python :: 3",
19 | ]
20 | dependencies = [
21 | "pyjnius==1.6.1"
22 | ]
23 | [project.scripts]
24 | debezium = "debezium.__main__:main"
25 |
26 | [project.urls]
27 | Homepage = "https://github.com/memiiso/debezium-server-iceberg"
28 | Documentation = "https://github.com/memiiso/debezium-server-iceberg"
29 | Repository = "https://github.com/memiiso/debezium-server-iceberg"
30 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/META-INF/services/org.eclipse.microprofile.config.spi.ConfigSource:
--------------------------------------------------------------------------------
1 | #
2 | # /*
3 | # * Copyright memiiso Authors.
4 | # *
5 | # * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | # */
7 | #
8 |
9 | io.debezium.server.iceberg.TestConfigSource
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/distro/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # /*
4 | # * Copyright memiiso Authors.
5 | # *
6 | # * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
7 | # */
8 | #
9 |
10 | LIB_PATH="lib/*"
11 |
12 | if [ "$OSTYPE" = "msys" ] || [ "$OSTYPE" = "cygwin" ]; then
13 | PATH_SEP=";"
14 | else
15 | PATH_SEP=":"
16 | fi
17 |
18 | if [ -z "$JAVA_HOME" ]; then
19 | JAVA_BINARY="java"
20 | else
21 | JAVA_BINARY="$JAVA_HOME/bin/java"
22 | fi
23 |
24 | RUNNER=$(ls debezium-server-*runner.jar)
25 |
26 | ENABLE_DEBEZIUM_SCRIPTING=${ENABLE_DEBEZIUM_SCRIPTING:-false}
27 | if [[ "${ENABLE_DEBEZIUM_SCRIPTING}" == "true" ]]; then
28 | LIB_PATH=$LIB_PATH$PATH_SEP"lib_opt/*"
29 | fi
30 |
31 | source ./jmx/enable_jmx.sh
32 | source ./lib_metrics/enable_exporter.sh
33 |
34 | exec "$JAVA_BINARY" $DEBEZIUM_OPTS $JAVA_OPTS -cp \
35 | $RUNNER$PATH_SEP$LIB_PATH io.debezium.server.Main
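36 |
37 | # Illustrative invocation from the unpacked distribution directory:
38 | #   JMX_EXPORTER_PORT=9404 ENABLE_DEBEZIUM_SCRIPTING=true ./run.sh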
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/GlobalConfigProducer.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 |
3 | import io.smallrye.config.SmallRyeConfig;
4 | import jakarta.enterprise.context.ApplicationScoped;
5 | import jakarta.enterprise.inject.Produces;
6 | import jakarta.inject.Inject;
7 | import org.eclipse.microprofile.config.Config;
8 | import org.mockito.Mockito;
9 |
10 | /**
11 | * This class provides a mocked instance of GlobalConfig for testing purposes,
12 | * allowing selective overriding of configuration values while preserving the original
13 | * configuration.
14 | */
15 | public class GlobalConfigProducer {
16 | @Inject
17 | Config config;
18 |
19 | @Produces
20 | @ApplicationScoped
21 | @io.quarkus.test.Mock
22 | GlobalConfig appConfig() {
23 | GlobalConfig appConfig = config.unwrap(SmallRyeConfig.class).getConfigMapping(GlobalConfig.class);
24 | GlobalConfig appConfigSpy = Mockito.spy(appConfig);
25 | return appConfigSpy;
26 | }
27 |
28 | }
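29 |
30 | // Illustrative use in a test: the produced bean is a Mockito spy, so individual config values can be
31 | // overridden, e.g. Mockito.when(spy.quarkusLogLevel()).thenReturn(Logger.Level.DEBUG);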
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergConfigProducer.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 |
3 | import io.smallrye.config.SmallRyeConfig;
4 | import jakarta.enterprise.context.ApplicationScoped;
5 | import jakarta.enterprise.inject.Produces;
6 | import jakarta.inject.Inject;
7 | import org.eclipse.microprofile.config.Config;
8 | import org.mockito.Mockito;
9 |
10 | /**
11 | * This class provides a mocked instance of IcebergConfig for testing purposes,
12 | * allowing selective overriding of configuration values while preserving the original
13 | * configuration.
14 | */
15 | public class IcebergConfigProducer {
16 | @Inject
17 | Config config;
18 |
19 | @Produces
20 | @ApplicationScoped
21 | @io.quarkus.test.Mock
22 | IcebergConfig appConfig() {
23 | IcebergConfig appConfig = config.unwrap(SmallRyeConfig.class).getConfigMapping(IcebergConfig.class);
24 | IcebergConfig appConfigSpy = Mockito.spy(appConfig);
25 | return appConfigSpy;
26 | }
27 |
28 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/DebeziumConfigProducer.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 |
3 | import io.smallrye.config.SmallRyeConfig;
4 | import jakarta.enterprise.context.ApplicationScoped;
5 | import jakarta.enterprise.inject.Produces;
6 | import jakarta.inject.Inject;
7 | import org.eclipse.microprofile.config.Config;
8 | import org.mockito.Mockito;
9 |
10 | /**
11 | * This class provides a mocked instance of DebeziumConfig for testing purposes,
12 | * allowing selective overriding of configuration values while preserving the original
13 | * configuration.
14 | */
15 | public class DebeziumConfigProducer {
16 | @Inject
17 | Config config;
18 |
19 | @Produces
20 | @ApplicationScoped
21 | @io.quarkus.test.Mock
22 | DebeziumConfig appConfig() {
23 | DebeziumConfig appConfig = config.unwrap(SmallRyeConfig.class).getConfigMapping(DebeziumConfig.class);
24 | DebeziumConfig appConfigSpy = Mockito.spy(appConfig);
25 | return appConfigSpy;
26 | }
27 |
28 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/mapper/CustomMapper.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.mapper;
2 |
3 | import io.debezium.server.iceberg.GlobalConfig;
4 | import jakarta.enterprise.context.Dependent;
5 | import jakarta.inject.Inject;
6 | import jakarta.inject.Named;
7 | import org.apache.iceberg.catalog.Namespace;
8 | import org.apache.iceberg.catalog.TableIdentifier;
9 |
10 | @Named("custom-mapper")
11 | @Dependent
12 | public class CustomMapper implements IcebergTableMapper {
13 | @Inject
14 | GlobalConfig config;
15 |
16 | @Override
17 | public TableIdentifier mapDestination(String destination) {
18 | try {
19 | String[] parts = destination.split("\\.");
20 | String tableName = parts[parts.length - 1];
21 | return TableIdentifier.of(Namespace.of(config.iceberg().namespace()), "custom_mapper_" + tableName);
22 | } catch (Exception e) {
23 | System.out.println("Failed to map:" + destination);
24 | throw new RuntimeException(e);
25 | }
26 | }
27 | }
28 |
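29 | // Illustrative: this test mapper is activated by configuring the sink to use the CDI bean named
30 | // "custom-mapper"; a destination like "testc.inventory.orders" then maps to the table
31 | // <namespace>.custom_mapper_orders.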
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Debezium Server Iceberg Consumer
2 | site_url: http://memiiso.github.io/debezium-server-iceberg
3 | repo_url: https://github.com/memiiso/debezium-server-iceberg
4 | theme:
5 | name: material
6 | features:
7 | # - navigation.instant
8 | - navigation.indexes
9 | - navigation.tabs
10 | # - navigation.expand
11 | - toc.integrate
12 | - content.code.copy
13 | - content.tabs.link
14 | nav:
15 | - Home: index.md
16 | - iceberg Consumer: iceberg.md
17 | - icebergevents Consumer: icebergevents.md
18 | - Python Runner: python-runner.md
19 | - Migration Guideline: migration.md
20 | - FAQ: faq.md
21 | - Contributing: contributing.md
22 |
23 | markdown_extensions:
24 | - pymdownx.highlight:
25 | anchor_linenums: true
26 | line_spans: __span
27 | pygments_lang_class: true
28 | - pymdownx.inlinehilite
29 | - pymdownx.superfences
30 | - admonition
31 | - pymdownx.details
32 | - abbr
33 | - pymdownx.snippets:
34 |     base_path: [ !relative $config_dir ]
35 |     check_paths: true
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM maven:3.9.9-eclipse-temurin-21 AS builder
2 | ARG RELEASE_VERSION
3 | RUN apt-get -qq update && apt-get -qq install unzip
4 | COPY . /app
5 | WORKDIR /app
6 | RUN mvn clean package -Passembly -Dmaven.test.skip --quiet -Drevision=${RELEASE_VERSION}
7 | RUN unzip /app/debezium-server-iceberg-dist/target/debezium-server-iceberg-dist*.zip -d appdist
8 | RUN mkdir /app/appdist/debezium-server-iceberg/data && \
9 | chown -R 185 /app/appdist/debezium-server-iceberg && \
10 | chmod -R g+w,o+w /app/appdist/debezium-server-iceberg
11 |
12 | # Stage 2: Final image
13 | FROM registry.access.redhat.com/ubi8/openjdk-21
14 |
15 | ENV SERVER_HOME=/debezium
16 |
17 | USER root
18 | RUN microdnf clean all
19 |
20 | USER jboss
21 |
22 | COPY --from=builder /app/appdist/debezium-server-iceberg $SERVER_HOME
23 |
24 | # Set the working directory to the Debezium Server home directory
25 | WORKDIR $SERVER_HOME
26 |
27 | #
28 | # Expose the API port and set up volumes for the configuration and data
29 | #
30 | EXPOSE 8080
31 | VOLUME ["/debezium/config","/debezium/data"]
32 |
33 | CMD ["/debezium/run.sh"]
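34 |
35 | # Illustrative build/run (the RELEASE_VERSION value is hypothetical):
36 | #   docker build --build-arg RELEASE_VERSION=0.1.0 -t debezium-server-iceberg .
37 | #   docker run -v $PWD/config:/debezium/config -p 8080:8080 debezium-server-iceberg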
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/tableoperator/Operation.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 | package io.debezium.server.iceberg.tableoperator;
20 |
21 | public enum Operation {
22 | INSERT,
23 | UPDATE,
24 | DELETE,
25 | READ
26 | }
27 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/distro/jmx/enable_jmx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # To enable JMX functionality, export the JMX_HOST and JMX_PORT environment variables.
3 | # Modify the jmxremote.access and jmxremote.password files accordingly.
4 | if [ -n "${JMX_HOST}" -a -n "${JMX_PORT}" ]; then
5 | export JAVA_OPTS="-Dcom.sun.management.jmxremote.ssl=false \
6 | -Dcom.sun.management.jmxremote.port=${JMX_PORT} \
7 | -Dcom.sun.management.jmxremote.rmi.port=${JMX_PORT} \
8 | -Dcom.sun.management.jmxremote.local.only=false \
9 | -Djava.rmi.server.hostname=${JMX_HOST} \
10 | -Dcom.sun.management.jmxremote.verbose=true"
11 |
12 | if [ -f "jmx/jmxremote.access" -a -f "jmx/jmxremote.password" ]; then
13 | chmod 600 jmx/jmxremote.password
14 | export JAVA_OPTS="${JAVA_OPTS} -Dcom.sun.management.jmxremote.authenticate=true \
15 | -Dcom.sun.management.jmxremote.access.file=jmx/jmxremote.access \
16 | -Dcom.sun.management.jmxremote.password.file=jmx/jmxremote.password"
17 | else
18 | export JAVA_OPTS="${JAVA_OPTS} -Dcom.sun.management.jmxremote.authenticate=false"
19 | fi
20 | fi
21 |
--------------------------------------------------------------------------------
/.run/package.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.run/dependency_tree.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/converter/JsonEventConverterSchemaDataTest.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.converter;
2 |
3 | import org.junit.jupiter.api.Test;
4 | import org.junit.jupiter.api.condition.DisabledIfEnvironmentVariable;
5 |
6 | import java.util.Set;
7 |
8 | import static org.junit.jupiter.api.Assertions.assertEquals;
9 |
10 | @DisabledIfEnvironmentVariable(named = "DEBEZIUM_FORMAT_VALUE", matches = "connect")
11 | class JsonEventConverterSchemaDataTest {
12 |
13 | @Test
14 | void testIcebergSchemaConverterDataBehaviourAndCloning() {
15 |
16 | IcebergSchemaInfo test = new IcebergSchemaInfo(5);
17 | test.identifierFieldIds().add(3);
18 | assertEquals(6, test.nextFieldId().incrementAndGet());
19 | assertEquals(Set.of(3), test.identifierFieldIds());
20 |
21 | // verify the copy shares the nextFieldId counter with the original
22 | IcebergSchemaInfo copy = test.copyPreservingMetadata();
23 | assertEquals(6, test.nextFieldId().get());
24 | copy.nextFieldId().incrementAndGet();
25 | assertEquals(7, test.nextFieldId().get());
26 |
27 | // verify the copy shares the identifier-field set with the original
28 | assertEquals(Set.of(3), copy.identifierFieldIds());
29 | copy.identifierFieldIds().add(7);
30 | assertEquals(Set.of(3, 7), test.identifierFieldIds());
31 |
32 | }
33 |
34 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/BatchConfig.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 |
3 | import io.debezium.config.CommonConnectorConfig;
4 | import io.quarkus.runtime.annotations.ConfigRoot;
5 | import io.smallrye.config.ConfigMapping;
6 | import io.smallrye.config.WithDefault;
7 | import io.smallrye.config.WithName;
8 |
9 | @ConfigRoot
10 | @ConfigMapping
11 | public interface BatchConfig {
12 | @WithName("debezium.source.max.queue.size")
13 | @WithDefault(CommonConnectorConfig.DEFAULT_MAX_QUEUE_SIZE + "")
14 | int sourceMaxQueueSize();
15 |
16 | @WithName("debezium.source.max.batch.size")
17 | @WithDefault(CommonConnectorConfig.DEFAULT_MAX_BATCH_SIZE + "")
18 | int sourceMaxBatchSize();
19 |
20 | @WithName("debezium.sink.batch.batch-size-wait.max-wait-ms")
21 | @WithDefault("300000")
22 | int batchSizeWaitMaxWaitMs();
23 |
24 | @WithName("debezium.sink.batch.batch-size-wait.wait-interval-ms")
25 | @WithDefault("10000")
26 | int batchSizeWaitWaitIntervalMs();
27 |
28 | @WithName("debezium.sink.batch.batch-size-wait")
29 | @WithDefault("NoBatchSizeWait")
30 | String batchSizeWaitName();
31 |
32 | @WithName("debezium.sink.batch.concurrent-uploads")
33 | @WithDefault("1")
34 | int concurrentUploads();
35 |
36 | @WithName("debezium.sink.batch.concurrent-uploads.timeout-minutes")
37 | @WithDefault("60")
38 | int concurrentUploadsTimeoutMinutes();
39 |
40 |
41 | }
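42 |
43 | // Illustrative application.properties values mapped by this interface (values are examples only):
44 | //   debezium.source.max.batch.size=2048
45 | //   debezium.sink.batch.batch-size-wait=MaxBatchSizeWait
46 | //   debezium.sink.batch.batch-size-wait.max-wait-ms=30000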
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/CatalogJdbc.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.testresources;
10 |
11 | import io.quarkus.test.common.QuarkusTestResourceLifecycleManager;
12 |
13 | import java.util.Map;
14 | import java.util.concurrent.ConcurrentHashMap;
15 |
16 | import org.testcontainers.containers.MySQLContainer;
17 |
18 | public class CatalogJdbc implements QuarkusTestResourceLifecycleManager {
19 | public static final MySQLContainer<?> container = new MySQLContainer<>("mysql:8");
20 |
21 | @Override
22 | public Map<String, String> start() {
23 | container.start();
24 | System.out.println("Jdbc Catalog started: " + container.getJdbcUrl());
25 |
26 | Map<String, String> config = new ConcurrentHashMap<>();
27 |
28 | config.put("debezium.sink.iceberg.type", "jdbc");
29 | config.put("debezium.sink.iceberg.uri", container.getJdbcUrl());
30 | config.put("debezium.sink.iceberg.jdbc.user", container.getUsername());
31 | config.put("debezium.sink.iceberg.jdbc.password", container.getPassword());
32 | config.put("debezium.sink.iceberg.jdbc.schema-version", "V1");
33 |
34 | return config;
35 | }
36 |
37 | @Override
38 | public void stop() {
39 | container.stop();
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/tableoperator/PartitionedAppendWriter.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 |
3 | import org.apache.iceberg.FileFormat;
4 | import org.apache.iceberg.PartitionKey;
5 | import org.apache.iceberg.PartitionSpec;
6 | import org.apache.iceberg.Schema;
7 | import org.apache.iceberg.data.InternalRecordWrapper;
8 | import org.apache.iceberg.data.Record;
9 | import org.apache.iceberg.io.FileAppenderFactory;
10 | import org.apache.iceberg.io.FileIO;
11 | import org.apache.iceberg.io.OutputFileFactory;
12 | import org.apache.iceberg.io.PartitionedWriter;
13 |
14 | public class PartitionedAppendWriter extends PartitionedWriter<Record> {
15 | private final PartitionKey partitionKey;
16 | final InternalRecordWrapper wrapper;
17 |
18 | public PartitionedAppendWriter(PartitionSpec spec, FileFormat format,
19 | FileAppenderFactory<Record> appenderFactory,
20 | OutputFileFactory fileFactory, FileIO io, long targetFileSize,
21 | Schema schema) {
22 | super(spec, format, appenderFactory, fileFactory, io, targetFileSize);
23 | this.partitionKey = new PartitionKey(spec, schema);
24 | this.wrapper = new InternalRecordWrapper(schema.asStruct());
25 | }
26 |
27 | @Override
28 | protected PartitionKey partition(Record row) {
29 | partitionKey.partition(wrapper.wrap(row));
30 | return partitionKey;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/tableoperator/UnpartitionedDeltaWriterTest.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 |
3 | import org.apache.iceberg.data.GenericRecord;
4 | import org.apache.iceberg.data.Record;
5 | import org.apache.iceberg.io.WriteResult;
6 | import org.junit.jupiter.api.Assertions;
7 | import org.junit.jupiter.api.Test;
8 |
9 | import java.io.IOException;
10 |
11 | class UnpartitionedDeltaWriterTest extends BaseWriterTest {
12 |
13 | @Test
14 | public void testUnpartitionedDeltaWriter() throws IOException {
15 | UnpartitionedDeltaWriter writer = new UnpartitionedDeltaWriter(table.spec(), format, appenderFactory, fileFactory,
16 | table.io(),
17 | Long.MAX_VALUE, table.schema(), identifierFieldIds, true);
18 |
19 | Record row = GenericRecord.create(SCHEMA);
20 | row.setField("id", "123");
21 | row.setField("data", "hello world!");
22 | row.setField("id2", "123");
23 | row.setField("__op", "u");
24 |
25 | writer.write(new RecordWrapper(row, Operation.UPDATE));
26 | WriteResult result = writer.complete();
27 |
28 | // in upsert mode, each write is a delete + append, so we'll have 1 data file and 1 delete file
29 | Assertions.assertEquals(1, result.dataFiles().length);
30 | Assertions.assertEquals(format, result.dataFiles()[0].format());
31 | Assertions.assertEquals(1, result.deleteFiles().length);
32 | Assertions.assertEquals(format, result.deleteFiles()[0].format());
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/tableoperator/UnpartitionedDeltaWriter.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 |
3 | import java.io.IOException;
4 | import java.util.Set;
5 |
6 | import org.apache.iceberg.FileFormat;
7 | import org.apache.iceberg.PartitionSpec;
8 | import org.apache.iceberg.Schema;
9 | import org.apache.iceberg.data.Record;
10 | import org.apache.iceberg.io.FileAppenderFactory;
11 | import org.apache.iceberg.io.FileIO;
12 | import org.apache.iceberg.io.OutputFileFactory;
13 |
14 | class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter {
15 | private final RowDataDeltaWriter writer;
16 |
17 | UnpartitionedDeltaWriter(PartitionSpec spec,
18 | FileFormat format,
19 | FileAppenderFactory<Record> appenderFactory,
20 | OutputFileFactory fileFactory,
21 | FileIO io,
22 | long targetFileSize,
23 | Schema schema,
24 | Set<Integer> identifierFieldIds,
25 | boolean keepDeletes) {
26 | super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, identifierFieldIds, keepDeletes);
27 | this.writer = new RowDataDeltaWriter(null);
28 | }
29 |
30 | @Override
31 | RowDataDeltaWriter route(Record row) {
32 | return writer;
33 | }
34 |
35 | @Override
36 | public void close() throws IOException {
37 | writer.close();
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/mapper/DefaultIcebergTableMapper.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.mapper;
2 |
3 | import io.debezium.server.iceberg.GlobalConfig;
4 | import jakarta.enterprise.context.Dependent;
5 | import jakarta.inject.Inject;
6 | import jakarta.inject.Named;
7 | import org.apache.iceberg.catalog.Namespace;
8 | import org.apache.iceberg.catalog.TableIdentifier;
9 |
10 | @Named("default-mapper")
11 | @Dependent
12 | public class DefaultIcebergTableMapper implements IcebergTableMapper {
13 | @Inject
14 | GlobalConfig config;
15 |
16 | @Override
17 | public TableIdentifier mapDestination(String destination) {
18 | final String tableName = destination
19 | .replaceAll(config.iceberg().destinationRegexp().orElse(""), config.iceberg().destinationRegexpReplace().orElse(""))
20 | .replace(".", "_");
21 |
22 | if (config.iceberg().destinationUppercaseTableNames()) {
23 | return TableIdentifier.of(Namespace.of(config.iceberg().namespace()), (config.iceberg().tablePrefix().orElse("") + tableName).toUpperCase());
24 | } else if (config.iceberg().destinationLowercaseTableNames()) {
25 | return TableIdentifier.of(Namespace.of(config.iceberg().namespace()), (config.iceberg().tablePrefix().orElse("") + tableName).toLowerCase());
26 | } else {
27 | return TableIdentifier.of(Namespace.of(config.iceberg().namespace()), config.iceberg().tablePrefix().orElse("") + tableName);
28 | }
29 | }
30 | }
31 |
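32 | // Illustrative mapping, assuming namespace "dwh", table-prefix "dbz_" and no regexp configured:
33 | //   "testc.inventory.customers" -> dwh.dbz_testc_inventory_customers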
--------------------------------------------------------------------------------
/.run/clean,install.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/icebergevents.md:
--------------------------------------------------------------------------------
1 | # DEPRECATED
2 |
3 | Using the `iceberg` consumer with the following settings is recommended to achieve the same results:
4 |
5 | ```properties
6 | # Store nested data in variant fields
7 | debezium.sink.iceberg.nested-as-variant=true
8 | # Ensure event flattening is disabled (it is enabled by default)
9 | debezium.transforms=,
10 | ```
11 |
12 | # `icebergevents` Consumer
13 |
14 | This consumer appends all Change Data Capture (CDC) events as JSON strings to a single Iceberg table. The table is
15 | partitioned by `event_destination` and `event_sink_timestamptz` for efficient data organization and query performance.
16 |
17 | ````properties
18 | debezium.sink.type=icebergevents
19 | debezium.sink.iceberg.catalog-name=default
20 | ````
21 |
22 | Iceberg table definition:
23 |
24 | ```java
25 | static final String TABLE_NAME = "debezium_events";
26 | static final Schema TABLE_SCHEMA = new Schema(
27 | required(1, "event_destination", Types.StringType.get()),
28 | optional(2, "event_key", Types.StringType.get()),
29 | optional(3, "event_value", Types.StringType.get()),
30 | optional(4, "event_sink_epoch_ms", Types.LongType.get()),
31 | optional(5, "event_sink_timestamptz", Types.TimestampType.withZone())
32 | );
33 | static final PartitionSpec TABLE_PARTITION = PartitionSpec.builderFor(TABLE_SCHEMA)
34 | .identity("event_destination")
35 | .hour("event_sink_timestamptz")
36 | .build();
37 | static final SortOrder TABLE_SORT_ORDER = SortOrder.builderFor(TABLE_SCHEMA)
38 | .asc("event_sink_epoch_ms", NullOrder.NULLS_LAST)
39 | .build();
40 | ```
41 |
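42 | As a minimal sketch, the table can be read back with the Iceberg generics API; the identity partition on
43 | `event_destination` lets the filter below prune files. The `Catalog` handle and the `debeziumevents`
44 | namespace here are assumptions for illustration:
45 |
46 | ```java
47 | import org.apache.iceberg.Table;
48 | import org.apache.iceberg.catalog.Catalog;
49 | import org.apache.iceberg.catalog.TableIdentifier;
50 | import org.apache.iceberg.data.IcebergGenerics;
51 | import org.apache.iceberg.data.Record;
52 | import org.apache.iceberg.expressions.Expressions;
53 | import org.apache.iceberg.io.CloseableIterable;
54 |
55 | class EventsTableReader {
56 |   // Print the raw event JSON for a single destination.
57 |   static void printEvents(Catalog catalog) throws java.io.IOException {
58 |     Table table = catalog.loadTable(TableIdentifier.of("debeziumevents", "debezium_events"));
59 |     try (CloseableIterable<Record> rows = IcebergGenerics.read(table)
60 |         .where(Expressions.equal("event_destination", "testc.inventory.customers"))
61 |         .build()) {
62 |       rows.forEach(r -> System.out.println(r.getField("event_value")));
63 |     }
64 |   }
65 | }
66 | ```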
--------------------------------------------------------------------------------
/docs/python-runner.md:
--------------------------------------------------------------------------------
1 | # Python Runner for Debezium Server
2 |
3 | It is possible to use Python to run and operate Debezium Server.
4 |
5 | For convenience, this project additionally provides Python scripts to automate the startup, shutdown, and configuration
6 | of Debezium Server.
7 | Using Python, you can perform various Debezium Server operations and apply programmatic, dynamic Debezium configuration.
8 | Example:
9 |
10 | ```commandline
11 | pip install git+https://github.com/memiiso/debezium-server-iceberg.git@master#subdirectory=python
12 | debezium
13 | # running with custom arguments
14 | debezium --debezium_dir=/my/debezium_server/dir/ --java_home=/my/java/homedir/
15 | ```
16 |
17 | ```python
18 | from debezium import Debezium
19 |
20 | d = Debezium(debezium_dir="/dbz/server/dir", java_home='/java/home/dir')
21 | java_args = []
22 | java_args.append("-Dquarkus.log.file.enable=true")
23 | java_args.append("-Dquarkus.log.file.path=/logs/dbz_logfile.log")
24 | d.run(*java_args)
25 | ```
26 |
27 | ```python
28 | import os
29 | from debezium import DebeziumRunAsyn
30 |
31 | java_args = []
32 | # using python we can dynamically influence debezium
33 | # by changing its config within python
34 | if my_custom_condition_check is True:
35 | # Option 1: set config using java arg
36 | java_args.append("-Dsnapshot.mode=always")
37 | # Option 2: set config using ENV variable
38 | os.environ["SNAPSHOT_MODE"] = "always"
39 |
40 | java_args.append("-Dquarkus.log.file.enable=true")
41 | java_args.append("-Dquarkus.log.file.path=/logs/dbz_logfile.log")
42 | d = DebeziumRunAsyn(debezium_dir="/dbz/server/dir", java_home='/java/home/dir', java_args=java_args)
43 | d.run()
44 | d.join()
45 | ```
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build Java Project
2 |
3 | on:
4 | push:
5 | branches: [ master, '*.*' ]
6 | paths-ignore:
7 | - '.github/**'
8 | - '.idea/**'
9 | - '.run/**'
10 | pull_request:
11 | branches: [ master, '*.*' ]
12 | paths-ignore:
13 | - '.github/**'
14 | - '.idea/**'
15 | - '.run/**'
16 |
17 | env:
18 | SPARK_LOCAL_IP: 127.0.0.1
19 |
20 | jobs:
21 | build-java-project-json-format:
22 | name: Build-Test (Json Format)
23 | runs-on: ubuntu-latest
24 | env:
25 | DEBEZIUM_FORMAT_VALUE: json
26 | DEBEZIUM_FORMAT_KEY: json
27 | steps:
28 | - name: Checkout Repository
29 | uses: actions/checkout@v4
30 | - name: Set up Java
31 | uses: actions/setup-java@v5
32 | with:
33 | distribution: 'temurin'
34 | java-version: 21
35 | cache: 'maven'
36 | - name: Build with Maven
37 | run: mvn -B --no-transfer-progress package --file pom.xml -Dsurefire.skipAfterFailureCount=1
38 |
39 |
40 | build-java-project-connect-format:
41 | name: Build-Test (Connect Format)
42 | runs-on: ubuntu-latest
43 | needs: build-java-project-json-format
44 | env:
45 | DEBEZIUM_FORMAT_VALUE: connect
46 | DEBEZIUM_FORMAT_KEY: connect
47 | steps:
48 | - name: Checkout Repository
49 | uses: actions/checkout@v4
50 |
51 | - name: Set up Java
52 | uses: actions/setup-java@v5
53 | with:
54 | distribution: 'temurin'
55 | java-version: 21
56 | cache: 'maven'
57 | - name: Build with Maven (Connect Format)
58 | run: mvn -B --no-transfer-progress package --file pom.xml -Dsurefire.skipAfterFailureCount=1
59 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/TestUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.testresources;
10 |
11 | import io.debezium.embedded.EmbeddedEngineChangeEvent;
12 | import io.debezium.engine.DebeziumEngine;
13 |
14 | import java.security.SecureRandom;
15 |
16 | import org.apache.kafka.connect.source.SourceRecord;
17 |
18 | public class TestUtil {
19 | static final String AB = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
20 | static final SecureRandom rnd = new SecureRandom();
21 |
22 |
23 | public static int randomInt(int low, int high) {
24 | return rnd.nextInt(high - low) + low;
25 | }
26 |
27 | public static String randomString(int len) {
28 | StringBuilder sb = new StringBuilder(len);
29 | for (int i = 0; i < len; i++)
30 | sb.append(AB.charAt(rnd.nextInt(AB.length())));
31 | return sb.toString();
32 | }
33 |
34 | public static DebeziumEngine.RecordCommitter getCommitter() {
35 | return new DebeziumEngine.RecordCommitter() {
36 | public synchronized void markProcessed(SourceRecord record) {
37 | }
38 |
39 | @Override
40 | public void markProcessed(Object record) {
41 | }
42 |
43 | public synchronized void markBatchFinished() {
44 | }
45 |
46 | @Override
47 | public void markProcessed(Object record, DebeziumEngine.Offsets sourceOffsets) {
48 | }
49 |
50 | @Override
51 | public DebeziumEngine.Offsets buildOffsets() {
52 | return null;
53 | }
54 | };
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/json/unwrap-with-schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "schema": {
3 | "type": "struct",
4 | "fields": [
5 | {
6 | "type": "int32",
7 | "optional": false,
8 | "field": "id"
9 | },
10 | {
11 | "type": "int32",
12 | "optional": false,
13 | "name": "io.debezium.time.Date",
14 | "version": 1,
15 | "field": "order_date"
16 | },
17 | {
18 | "type": "int32",
19 | "optional": false,
20 | "field": "purchaser"
21 | },
22 | {
23 | "type": "int32",
24 | "optional": false,
25 | "field": "quantity"
26 | },
27 | {
28 | "type": "int32",
29 | "optional": false,
30 | "field": "product_id"
31 | },
32 | {
33 | "type": "string",
34 | "optional": true,
35 | "field": "__op"
36 | },
37 | {
38 | "type": "string",
39 | "optional": true,
40 | "field": "__table"
41 | },
42 | {
43 | "type": "int64",
44 | "optional": true,
45 | "field": "__lsn"
46 | },
47 | {
48 | "type": "int64",
49 | "optional": true,
50 | "field": "__source_ts_ms"
51 | },
52 | {
53 | "type": "string",
54 | "optional": true,
55 | "field": "__deleted"
56 | }
57 | ],
58 | "optional": false,
59 | "name": "testc.inventory.orders.Value"
60 | },
61 | "payload": {
62 | "id": 10003,
63 | "order_date": 16850,
64 | "purchaser": 1002,
65 | "quantity": 2,
66 | "product_id": 106,
67 | "__op": "r",
68 | "__table": "orders",
69 | "__lsn": 33832960,
70 | "__source_ts_ms": 1596309876678,
71 | "__deleted": "false"
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/CatalogRest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.testresources;
10 |
11 | import io.quarkus.test.common.QuarkusTestResourceLifecycleManager;
12 | import org.testcontainers.containers.GenericContainer;
13 | import org.testcontainers.containers.wait.strategy.Wait;
14 | import org.testcontainers.utility.DockerImageName;
15 |
16 | import java.util.Map;
17 | import java.util.concurrent.ConcurrentHashMap;
18 |
19 | public class CatalogRest implements QuarkusTestResourceLifecycleManager {
20 | public static final int REST_CATALOG_PORT = 8181;
21 | public static final String REST_CATALOG_IMAGE = "apache/iceberg-rest-fixture";
22 |
23 | public static final GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse(REST_CATALOG_IMAGE))
24 | .withExposedPorts(REST_CATALOG_PORT)
25 | .waitingFor(Wait.forLogMessage(".*Started Server.*", 1));
26 |
27 | public static String getHostUrl() {
28 | return String.format("http://%s:%s", container.getHost(), container.getMappedPort(REST_CATALOG_PORT));
29 | }
30 |
31 | @Override
32 | public Map<String, String> start() {
33 | container.start();
34 | System.out.println("Rest Catalog started: " + getHostUrl());
35 |
36 | Map<String, String> config = new ConcurrentHashMap<>();
37 |
38 | config.put("debezium.sink.iceberg.type", "rest");
39 | config.put("debezium.sink.iceberg.uri", CatalogRest.getHostUrl());
40 |
41 | return config;
42 | }
43 |
44 | @Override
45 | public void stop() {
46 | container.stop();
47 | }
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerJdbcCatalogTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import com.google.common.collect.Lists;
12 | import io.debezium.server.iceberg.testresources.CatalogJdbc;
13 | import io.debezium.server.iceberg.testresources.S3Minio;
14 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
15 | import io.quarkus.test.common.QuarkusTestResource;
16 | import io.quarkus.test.junit.QuarkusTest;
17 | import org.apache.iceberg.data.Record;
18 | import org.apache.iceberg.io.CloseableIterable;
19 | import org.awaitility.Awaitility;
20 | import org.junit.jupiter.api.Test;
21 |
22 | import java.time.Duration;
23 |
24 | /**
25 | * Integration test that verifies basic reading from PostgreSQL database and writing to iceberg destination.
26 | *
27 | * @author Ismail Simsek
28 | */
29 | @QuarkusTest
30 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
31 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
32 | @QuarkusTestResource(value = CatalogJdbc.class, restrictToAnnotatedClass = true)
33 | public class IcebergChangeConsumerJdbcCatalogTest extends BaseTest {
34 |
35 | @Test
36 | public void testSimpleUpload() {
37 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
38 | try {
39 | CloseableIterable<Record> result = getTableDataV2("testc.inventory.customers");
40 | return Lists.newArrayList(result).size() >= 3;
41 | } catch (Exception e) {
42 | return false;
43 | }
44 | });
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/json/serde-with-array.json:
--------------------------------------------------------------------------------
1 | {
2 | "schema": {
3 | "type": "struct",
4 | "fields": [
5 | {
6 | "type": "string",
7 | "optional": true,
8 | "field": "name"
9 | },
10 | {
11 | "type": "array",
12 | "items": {
13 | "type": "int32",
14 | "optional": true
15 | },
16 | "optional": true,
17 | "field": "pay_by_quarter"
18 | },
19 | {
20 | "type": "array",
21 | "items": {
22 | "type": "string",
23 | "optional": true
24 | },
25 | "optional": true,
26 | "field": "schedule"
27 | },
28 | {
29 | "type": "string",
30 | "optional": true,
31 | "field": "__op"
32 | },
33 | {
34 | "type": "string",
35 | "optional": true,
36 | "field": "__table"
37 | },
38 | {
39 | "type": "int64",
40 | "optional": true,
41 | "field": "__source_ts_ms"
42 | },
43 | {
44 | "type": "string",
45 | "optional": true,
46 | "field": "__db"
47 | },
48 | {
49 | "type": "string",
50 | "optional": true,
51 | "field": "__deleted"
52 | }
53 | ],
54 | "optional": false,
55 | "name": "testc.inventory.array_data.Value"
56 | },
57 | "payload": {
58 | "name": "Bill",
59 | "pay_by_quarter": [
60 | 10000,
61 | 10001,
62 | 10002,
63 | 10003
64 | ],
65 | "schedule": [
66 | "[Ljava.lang.String;@508917a0",
67 | "[Ljava.lang.String;@7412bd2"
68 | ],
69 | "__op": "c",
70 | "__table": "array_data",
71 | "__source_ts_ms": 1638128893618,
72 | "__db": "postgres",
73 | "__deleted": "false"
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/GlobalConfigTest.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 |
3 | import io.quarkus.test.junit.QuarkusTest;
4 | import io.quarkus.test.junit.QuarkusTestProfile;
5 | import io.quarkus.test.junit.TestProfile;
6 | import org.jboss.logging.Logger;
7 | import org.junit.jupiter.api.Assertions;
8 | import org.junit.jupiter.api.Test;
9 |
10 | import java.util.HashMap;
11 | import java.util.Map;
12 |
13 | import static io.debezium.server.iceberg.TestConfigSource.ICEBERG_CATALOG_NAME;
14 | import static io.debezium.server.iceberg.TestConfigSource.ICEBERG_WAREHOUSE_S3A;
15 |
16 | @QuarkusTest
17 | @TestProfile(GlobalConfigTest.TestProfile.class)
18 | public class GlobalConfigTest extends BaseTest {
19 |
20 | @Test
21 | void configLoadsCorrectly() {
22 | Assertions.assertEquals(ICEBERG_CATALOG_NAME, config.iceberg().catalogName());
23 | // tests are running with false
24 | Assertions.assertEquals(false, config.iceberg().upsert());
25 | Assertions.assertEquals(ICEBERG_WAREHOUSE_S3A, config.iceberg().warehouseLocation());
26 |
27 | Assertions.assertTrue(config.iceberg().icebergConfigs().containsKey("warehouse"));
28 | Assertions.assertTrue(config.iceberg().icebergConfigs().containsValue(ICEBERG_WAREHOUSE_S3A));
29 | Assertions.assertTrue(config.iceberg().icebergConfigs().containsKey("table-namespace"));
30 | Assertions.assertTrue(config.iceberg().icebergConfigs().containsKey("catalog-name"));
31 | Assertions.assertTrue(config.iceberg().icebergConfigs().containsValue(ICEBERG_CATALOG_NAME));
32 | Assertions.assertEquals(Logger.Level.ERROR, config.quarkusLogLevel());
33 | }
34 |
35 | public static class TestProfile implements QuarkusTestProfile {
36 | @Override
37 | public Map<String, String> getConfigOverrides() {
38 | Map<String, String> config = new HashMap<>();
39 | config.put("quarkus.log.level", "ERROR");
40 | return config;
41 | }
42 | }
43 |
44 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](http://www.apache.org/licenses/LICENSE-2.0.html)
2 | 
3 | 
4 |
5 | # Debezium Iceberg Consumer
6 |
7 | This project implements a Debezium Server Iceberg consumer,
8 | see [Debezium Server](https://debezium.io/documentation/reference/operations/debezium-server.html). It enables real-time
9 | replication of Change Data Capture (CDC) events from any database to Iceberg tables, without requiring Spark, Kafka, or a
10 | streaming platform in between.
11 |
12 | See the [Documentation Page](https://memiiso.github.io/debezium-server-iceberg/) for more details.
13 |
14 | 
15 |
16 | ## Installation
17 | - Requirements:
18 | - JDK 21
19 | - Maven
20 | ### Building from source code
21 |
22 | ```bash
23 | git clone https://github.com/memiiso/debezium-server-iceberg.git
24 | cd debezium-server-iceberg
25 | mvn -Passembly -Dmaven.test.skip package
26 | # unzip and run the application
27 | unzip debezium-server-iceberg-dist/target/debezium-server-iceberg-dist*.zip -d appdist
28 | cd appdist/debezium-server-iceberg
29 | mv config/application.properties.example config/application.properties
30 | bash run.sh
31 | ```
32 |
33 | ## Contributing
34 |
35 | The Memiiso community welcomes anyone that wants to help out in any way, whether that includes reporting problems,
36 | helping with documentation, or contributing code changes to fix bugs, add tests, or implement new features.
37 | See [contributing document](docs/contributing.md) for details.
38 |
39 | ### Contributors
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | name: "Close Stale Issues and PRs"
2 | on:
3 | schedule:
4 | - cron: '0 0 * * *'
5 |
6 | permissions:
7 | # All other permissions are set to none
8 | issues: write
9 | pull-requests: write
10 |
11 | jobs:
12 | stale:
13 | if: github.repository_owner == 'memiiso'
14 | runs-on: ubuntu-22.04
15 | steps:
16 | - uses: actions/stale@v10.1.0
17 | with:
18 | # stale issues
19 | stale-issue-label: 'stale'
20 | exempt-issue-labels: 'not-stale'
21 | days-before-issue-stale: 180
22 | days-before-issue-close: 14
23 | stale-issue-message: >
24 | This issue has been automatically marked as stale because it has been open for 180 days
25 | with no activity. It will be closed in the next 14 days if no further activity occurs. To
26 | permanently prevent this issue from being considered stale, add the label 'not-stale',
27 | but commenting on the issue is preferred when possible.
28 | close-issue-message: >
29 | This issue has been closed because it has not received any activity in the last 14 days
30 | since being marked as 'stale'.
31 | # stale PRs
32 | stale-pr-label: 'stale'
33 | exempt-pr-labels: 'not-stale,security'
34 | stale-pr-message: 'This pull request has been marked as stale due to 30 days of inactivity. It will be closed in 1 week if no further activity occurs. If you think that’s incorrect or this pull request requires a review, please simply write any comment. If closed, you can revive the PR at any time. Thank you for your contributions.'
35 | close-pr-message: 'This pull request has been closed due to lack of activity. This is not a judgement on the merit of the PR in any way. It is just a way of keeping the PR queue manageable. If you think that is incorrect, or the pull request requires review, you can revive the PR at any time.'
36 | days-before-pr-stale: 30
37 | days-before-pr-close: 7
38 | ascending: true
39 | operations-per-run: 200
40 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/distro/config/metrics.yml:
--------------------------------------------------------------------------------
1 | startDelaySeconds: 0
2 | ssl: false
3 | lowercaseOutputName: false
4 | lowercaseOutputLabelNames: false
5 | rules:
6 | - pattern: "kafka.producer]+)><>([^:]+)"
7 | name: "kafka_producer_metrics_$2"
8 | type: GAUGE
9 | labels:
10 | client: "$1"
11 | - pattern: "kafka.producer]+), node-id=([^>]+)><>([^:]+)"
12 | name: "kafka_producer_node_metrics_$3"
13 | type: GAUGE
14 | labels:
15 | client: "$1"
16 | node: "$2"
17 | - pattern: "kafka.producer]+), topic=([^>]+)><>([^:]+)"
18 | name: "kafka_producer_topic_metrics_$3"
19 | type: GAUGE
20 | labels:
21 | client: "$1"
22 | topic: "$2"
23 | - pattern: "kafka.connect([^:]+):"
24 | name: "kafka_connect_worker_metrics_$1"
25 | type: GAUGE
26 | - pattern: "kafka.connect<>([^:]+)"
27 | name: "kafka_connect_metrics_$2"
28 | type: GAUGE
29 | labels:
30 | client: "$1"
31 | - pattern: "debezium.([^:]+)]+)><>RowsScanned"
32 | name: "debezium_metrics_RowsScanned"
33 | type: GAUGE
34 | labels:
35 | plugin: "$1"
36 | name: "$3"
37 | context: "$2"
38 | table: "$4"
39 | - pattern: "debezium.([^:]+)]+)>([^:]+)"
40 | name: "debezium_metrics_$6"
41 | type: GAUGE
42 | labels:
43 | plugin: "$1"
44 | name: "$2"
45 | task: "$3"
46 | context: "$4"
47 | database: "$5"
48 | - pattern: "debezium.([^:]+)]+)>([^:]+)"
49 | name: "debezium_metrics_$5"
50 | type: GAUGE
51 | labels:
52 | plugin: "$1"
53 | name: "$2"
54 | task: "$3"
55 | context: "$4"
56 | - pattern: "debezium.([^:]+)]+)>([^:]+)"
57 | name: "debezium_metrics_$4"
58 | type: GAUGE
59 | labels:
60 | plugin: "$1"
61 | name: "$3"
62 | context: "$2"
63 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/SourceMongoDB.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.testresources;
10 |
11 | import io.quarkus.test.common.QuarkusTestResourceLifecycleManager;
12 |
13 | import java.time.Duration;
14 | import java.util.List;
15 | import java.util.Map;
16 | import java.util.concurrent.ConcurrentHashMap;
17 |
18 | import org.testcontainers.containers.GenericContainer;
19 | import org.testcontainers.containers.wait.strategy.Wait;
20 | import org.testcontainers.images.builder.ImageFromDockerfile;
21 |
22 | public class SourceMongoDB implements QuarkusTestResourceLifecycleManager {
23 |
24 | public static final int MONGODB_PORT = 27017;
25 | public static final GenericContainer<?> container = new GenericContainer<>(
26 | new ImageFromDockerfile("debezium_mongodb", false)
27 | .withFileFromClasspath("Dockerfile", "mongodb/Dockerfile")
28 | .withFileFromClasspath("start-mongodb.sh", "mongodb/start-mongodb.sh"))
29 |
30 | .waitingFor(Wait.forLogMessage(".*Successfully initialized inventory database.*", 1))
31 | .withStartupTimeout(Duration.ofSeconds(120L));
32 |
33 | @Override
34 | public Map<String, String> start() {
35 | container.setPortBindings(List.of(MONGODB_PORT+":"+MONGODB_PORT));
36 | container.withExposedPorts(MONGODB_PORT).start();
37 |
38 | Map<String, String> params = new ConcurrentHashMap<>();
39 | params.put("%mongodb.debezium.source.mongodb.connection.string",
40 | "mongodb://" + container.getHost() + ":" + container.getMappedPort(MONGODB_PORT) + "/?replicaSet=rs0"
41 | );
42 | params.put("%mongodb.debezium.source.mongodb.authsource", "admin");
43 | params.put("%mongodb.debezium.source.mongodb.user", "debezium");
44 | params.put("%mongodb.debezium.source.mongodb.password", "dbz");
45 | //params.put("%mongodb.debezium.source.mongodb.ssl.enabled", "false");
46 | return params;
47 | }
48 |
49 | @Override
50 | public void stop() {
51 | if (container != null) {
52 | container.stop();
53 | }
54 | }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/resources/json/serde-with-schema_geom.json:
--------------------------------------------------------------------------------
1 | {
2 | "schema": {
3 | "type": "struct",
4 | "fields": [
5 | {
6 | "type": "int32",
7 | "optional": false,
8 | "default": 0,
9 | "field": "id"
10 | },
11 | {
12 | "type": "struct",
13 | "fields": [
14 | {
15 | "type": "string",
16 | "optional": false,
17 | "field": "wkb"
18 | },
19 | {
20 | "type": "int32",
21 | "optional": true,
22 | "field": "srid"
23 | }
24 | ],
25 | "optional": true,
26 | "name": "io.debezium.data.geometry.Geometry",
27 | "version": 1,
28 | "doc": "Geometry",
29 | "field": "g"
30 | },
31 | {
32 | "type": "struct",
33 | "fields": [
34 | {
35 | "type": "string",
36 | "optional": false,
37 | "field": "wkb"
38 | },
39 | {
40 | "type": "int32",
41 | "optional": true,
42 | "field": "srid"
43 | }
44 | ],
45 | "optional": true,
46 | "name": "io.debezium.data.geometry.Geometry",
47 | "version": 1,
48 | "doc": "Geometry",
49 | "field": "h"
50 | },
51 | {
52 | "type": "string",
53 | "optional": true,
54 | "field": "__op"
55 | },
56 | {
57 | "type": "string",
58 | "optional": true,
59 | "field": "__table"
60 | },
61 | {
62 | "type": "int64",
63 | "optional": true,
64 | "field": "__source_ts_ms"
65 | },
66 | {
67 | "type": "string",
68 | "optional": true,
69 | "field": "__db"
70 | },
71 | {
72 | "type": "string",
73 | "optional": true,
74 | "field": "__deleted"
75 | }
76 | ],
77 | "optional": false,
78 | "name": "testc.inventory.geom.Value"
79 | },
80 | "payload": {
81 | "id": 1,
82 | "g": {
83 | "wkb": "AQEAAAAAAAAAAADwPwAAAAAAAPA/",
84 | "srid": 123
85 | },
86 | "h": null,
87 | "__op": "r",
88 | "__table": "geom",
89 | "__source_ts_ms": 1634844424986,
90 | "__db": "postgres",
91 | "__deleted": "false"
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/converter/AbstractVariantObject.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 | package io.debezium.server.iceberg.converter;
20 |
21 | import org.apache.iceberg.variants.ShreddedObject;
22 | import org.apache.iceberg.variants.VariantMetadata;
23 | import org.apache.iceberg.variants.VariantObject;
24 | import org.apache.iceberg.variants.VariantValue;
25 | import org.apache.iceberg.variants.Variants;
26 |
27 | import java.nio.ByteBuffer;
28 |
29 | public abstract class AbstractVariantObject implements VariantObject {
30 |
31 | protected final ShreddedObject shreddedObject;
32 | protected final VariantMetadata metadata;
33 |
34 | protected AbstractVariantObject(VariantMetadata metadata) {
35 | this.metadata = metadata;
36 | this.shreddedObject = Variants.object(this.metadata);
37 | }
38 |
39 | public VariantMetadata metadata() {
40 | return this.metadata;
41 | }
42 |
43 | @Override
44 | public VariantValue get(String name) {
45 | return shreddedObject.get(name);
46 | }
47 |
48 | @Override
49 | public Iterable<String> fieldNames() {
50 | return shreddedObject.fieldNames();
51 | }
52 |
53 | @Override
54 | public int numFields() {
55 | return shreddedObject.numFields();
56 | }
57 |
58 | @Override
59 | public int sizeInBytes() {
60 | return shreddedObject.sizeInBytes();
61 | }
62 |
63 | @Override
64 | public int writeTo(ByteBuffer buffer, int offset) {
65 | return shreddedObject.writeTo(buffer, offset);
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/CatalogNessie.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.testresources;
2 |
3 | import io.quarkus.test.common.QuarkusTestResourceLifecycleManager;
4 | import org.testcontainers.containers.GenericContainer;
5 | import org.testcontainers.containers.wait.strategy.HttpWaitStrategy;
6 | import org.testcontainers.utility.DockerImageName;
7 |
8 | import java.time.Duration;
9 | import java.util.Map;
10 | import java.util.concurrent.ConcurrentHashMap;
11 | import java.util.concurrent.TimeUnit;
12 |
13 | public class CatalogNessie implements QuarkusTestResourceLifecycleManager {
14 | private static final String NESSIE_IMAGE = "projectnessie/nessie:latest";
15 | private static final int NESSIE_PORT = 19120;
16 | private GenericContainer<?> nessieContainer = new GenericContainer<>(DockerImageName.parse(NESSIE_IMAGE))
17 | .withNetworkAliases("nessie")
18 | .withEnv("QUARKUS_PROFILE", "prod")
19 | .withEnv("QUARKUS_HTTP_PORT", String.valueOf(NESSIE_PORT))
20 | .withEnv("QUARKUS_LOG_LEVEL", "INFO")
21 | .withExposedPorts(NESSIE_PORT)
22 | .waitingFor(new HttpWaitStrategy()
23 | .forPort(NESSIE_PORT)
24 | .forPath("/q/health")
25 | .withStartupTimeout(Duration.ofSeconds(120)));
26 |
27 | @Override
28 | public Map<String, String> start() {
29 | long startTime = System.nanoTime(); // Get time before start
30 | nessieContainer.start();
31 | long endTime = System.nanoTime(); // Get time after start
32 | double durationSeconds = TimeUnit.NANOSECONDS.toMillis(endTime - startTime) / 1000.0; // Convert nanoseconds to seconds
33 | System.out.println("Nessie started: " + getNessieUri() + " duration: " + durationSeconds);
34 |
35 | Map<String, String> config = new ConcurrentHashMap<>();
36 |
37 | config.put("debezium.sink.iceberg.type", "nessie");
38 | config.put("debezium.sink.iceberg.uri", getNessieUri() + "/api/v2");
39 | config.put("debezium.sink.iceberg.ref", "main");
40 | return config;
41 | }
42 |
43 | @Override
44 | public void stop() {
45 | if (nessieContainer != null) {
46 | nessieContainer.stop();
47 | }
48 | }
49 |
50 | public String getNessieUri() {
51 | if (nessieContainer != null && nessieContainer.isRunning()) {
52 | return "http://" + nessieContainer.getHost() + ":" + nessieContainer.getMappedPort(NESSIE_PORT);
53 | }
54 | return null;
55 | }
56 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .gemini
2 | .flattened-pom.xml
3 |
4 | deploy.sh
5 | .ipynb_checkpoints
6 |
7 | # Debezium gitignore
8 |
9 | activemq-data/
10 | .idea/
11 | *.iml
12 | *.ipr
13 | *.iws
14 | .metadata/
15 | .recommenders/
16 | .classpath
17 | .project
18 | .cache
19 | .settings/
20 | .factorypath
21 | .checkstyle
22 | .gradle/
23 | .vscode/
24 | build/
25 | deploy/
26 | target/
27 | mods/
28 | *.swp
29 | epom
30 | log
31 | npm-debug.log
32 | .DS_Store
33 | phantomjsdriver.log
34 |
35 | generated-sources/
36 |
37 | /state/
38 | bin/
39 |
40 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
41 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
42 |
43 | # User-specific stuff
44 | .idea/**/workspace.xml
45 | .idea/**/tasks.xml
46 | .idea/**/usage.statistics.xml
47 | .idea/**/dictionaries
48 | .idea/**/shelf
49 |
50 | # Generated files
51 | .idea/**/contentModel.xml
52 |
53 | # Sensitive or high-churn files
54 | .idea/**/dataSources/
55 | .idea/**/dataSources.ids
56 | .idea/**/dataSources.local.xml
57 | .idea/**/sqlDataSources.xml
58 | .idea/**/dynamic.xml
59 | .idea/**/uiDesigner.xml
60 | .idea/**/dbnavigator.xml
61 |
62 | # Gradle
63 | .idea/**/gradle.xml
64 | .idea/**/libraries
65 |
66 | # Gradle and Maven with auto-import
67 | # When using Gradle or Maven with auto-import, you should exclude module files,
68 | # since they will be recreated, and may cause churn. Uncomment if using
69 | # auto-import.
70 | # .idea/artifacts
71 | # .idea/compiler.xml
72 | # .idea/jarRepositories.xml
73 | # .idea/modules.xml
74 | # .idea/*.iml
75 | # .idea/modules
76 | # *.iml
77 | # *.ipr
78 |
79 | # CMake
80 | cmake-build-*/
81 |
82 | # Mongo Explorer plugin
83 | .idea/**/mongoSettings.xml
84 |
85 | # File-based project format
86 | *.iws
87 |
88 | # IntelliJ
89 | out/
90 |
91 | # mpeltonen/sbt-idea plugin
92 | .idea_modules/
93 |
94 | # JIRA plugin
95 | atlassian-ide-plugin.xml
96 |
97 | # Cursive Clojure plugin
98 | .idea/replstate.xml
99 |
100 | # Crashlytics plugin (for Android Studio and IntelliJ)
101 | com_crashlytics_export_strings.xml
102 | crashlytics.properties
103 | crashlytics-build.properties
104 | fabric.properties
105 |
106 | # Editor-based Rest Client
107 | .idea/httpRequests
108 |
109 | # Android studio 3.1+ serialized cache file
110 | .idea/caches/build_file_checksums.ser
111 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/tableoperator/PartitionedDeltaWriter.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 |
3 | import java.io.IOException;
4 | import java.io.UncheckedIOException;
5 | import java.util.Map;
6 | import java.util.Set;
7 |
8 | import org.apache.iceberg.FileFormat;
9 | import org.apache.iceberg.PartitionKey;
10 | import org.apache.iceberg.PartitionSpec;
11 | import org.apache.iceberg.Schema;
12 | import org.apache.iceberg.data.Record;
13 | import org.apache.iceberg.io.FileAppenderFactory;
14 | import org.apache.iceberg.io.FileIO;
15 | import org.apache.iceberg.io.OutputFileFactory;
16 | import com.google.common.collect.Maps;
17 | import org.apache.iceberg.util.Tasks;
18 |
19 | class PartitionedDeltaWriter extends BaseDeltaTaskWriter {
20 |
21 | private final PartitionKey partitionKey;
22 |
23 | private final Map<PartitionKey, RowDataDeltaWriter> writers = Maps.newHashMap();
24 |
25 | PartitionedDeltaWriter(PartitionSpec spec,
26 | FileFormat format,
27 | FileAppenderFactory<Record> appenderFactory,
28 | OutputFileFactory fileFactory,
29 | FileIO io,
30 | long targetFileSize,
31 | Schema schema,
32 | Set<Integer> identifierFieldIds,
33 | boolean keepDeletes) {
34 | super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, identifierFieldIds, keepDeletes);
35 | this.partitionKey = new PartitionKey(spec, schema);
36 | }
37 |
38 | @Override
39 | RowDataDeltaWriter route(Record row) {
40 | partitionKey.partition(wrapper().wrap(row));
41 |
42 | RowDataDeltaWriter writer = writers.get(partitionKey);
43 | if (writer == null) {
44 | // NOTICE: copy the partition key here; reusing the mutable key would corrupt the entries already stored in the writers map.
45 | PartitionKey copiedKey = partitionKey.copy();
46 | writer = new RowDataDeltaWriter(copiedKey);
47 | writers.put(copiedKey, writer);
48 | }
49 |
50 | return writer;
51 | }
52 |
53 | @Override
54 | public void close() {
55 | try {
56 | Tasks.foreach(writers.values())
57 | .throwFailureWhenFinished()
58 | .noRetry()
59 | .run(RowDataDeltaWriter::close, IOException.class);
60 |
61 | writers.clear();
62 | } catch (IOException e) {
63 | throw new UncheckedIOException("Failed to close equality delta writer", e);
64 | }
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerRestCatalogTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import com.google.common.collect.Lists;
12 | import io.debezium.server.iceberg.testresources.CatalogRest;
13 | import io.debezium.server.iceberg.testresources.S3Minio;
14 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
15 | import io.quarkus.test.common.QuarkusTestResource;
16 | import io.quarkus.test.junit.QuarkusTest;
17 | import org.apache.iceberg.catalog.Namespace;
18 | import org.apache.iceberg.catalog.TableIdentifier;
19 | import org.apache.iceberg.data.Record;
20 | import org.apache.iceberg.io.CloseableIterable;
21 | import org.awaitility.Awaitility;
22 | import org.junit.jupiter.api.Assertions;
23 | import org.junit.jupiter.api.Test;
24 | import org.junit.jupiter.api.condition.DisabledIfEnvironmentVariable;
25 |
26 | import java.time.Duration;
27 | import java.util.List;
28 |
29 | /**
30 | * Integration test that verifies basic reading from PostgreSQL database and writing to iceberg destination.
31 | *
32 | * @author Ismail Simsek
33 | */
34 | @QuarkusTest
35 | @QuarkusTestResource(value = CatalogRest.class, restrictToAnnotatedClass = true)
36 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
37 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
38 | @DisabledIfEnvironmentVariable(named = "GITHUB_ACTIONS", matches = "true")
39 | public class IcebergChangeConsumerRestCatalogTest extends BaseTest {
40 |
41 | @Test
42 | public void testSimpleUpload() {
43 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
44 | try {
45 | CloseableIterable<Record> result = getTableDataV2("testc.inventory.customers");
46 | return Lists.newArrayList(result).size() >= 3;
47 | } catch (Exception e) {
48 | return false;
49 | }
50 | });
51 |
52 | List<TableIdentifier> tables = consumer.icebergCatalog.listTables(Namespace.of(consumer.config.iceberg().namespace()));
53 | Assertions.assertTrue(tables.contains(TableIdentifier.of(Namespace.of(consumer.config.iceberg().namespace()), "debezium_offset_storage_table")));
54 | Assertions.assertTrue(tables.contains(TableIdentifier.of(Namespace.of(consumer.config.iceberg().namespace()), "debeziumcdc_testc_inventory_customers")));
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/storage/BaseIcebergStorageConfig.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.storage;
2 |
3 | import com.google.common.collect.Maps;
4 | import io.debezium.config.Configuration;
5 | import io.debezium.server.iceberg.IcebergUtil;
6 | import org.apache.iceberg.CatalogUtil;
7 | import org.apache.iceberg.catalog.Catalog;
8 | import org.apache.iceberg.catalog.Namespace;
9 | import org.apache.iceberg.catalog.TableIdentifier;
10 | import org.eclipse.microprofile.config.ConfigProvider;
11 |
12 | import java.util.Map;
13 | import java.util.Properties;
14 |
15 |
16 | public abstract class BaseIcebergStorageConfig {
17 | private static final String PROP_SINK_PREFIX = "debezium.sink.";
18 | public Properties config = new Properties();
19 |
20 | public BaseIcebergStorageConfig(Configuration config, String configuration_field_prefix) {
21 | Configuration confIcebergSubset = config.subset(configuration_field_prefix + "iceberg.", true);
22 | confIcebergSubset.forEach(this.config::put);
23 |
24 | // Debezium filters the configuration before passing it down to this class,
25 | // so we additionally read the full iceberg config through ConfigProvider.
26 | Map<String, String> icebergConf = IcebergUtil.getConfigSubset(ConfigProvider.getConfig(), PROP_SINK_PREFIX + "iceberg.");
27 | icebergConf.forEach(this.config::putIfAbsent);
28 | }
29 |
30 | public String catalogName() {
31 | return this.config.getProperty("catalog-name", "default");
32 | }
33 |
34 | public String tableNamespace() {
35 | return this.config.getProperty("table-namespace", "default");
36 | }
37 |
38 | abstract public String tableName();
39 |
40 | public org.apache.hadoop.conf.Configuration hadoopConfig() {
41 | final org.apache.hadoop.conf.Configuration hadoopConfig = new org.apache.hadoop.conf.Configuration();
42 | config.forEach((key, value) -> hadoopConfig.set((String) key, (String) value));
43 | return hadoopConfig;
44 | }
45 |
46 | public Map<String, String> icebergProperties() {
47 | return Maps.fromProperties(config);
48 | }
49 |
50 | public Catalog icebergCatalog() {
51 | return CatalogUtil.buildIcebergCatalog(this.catalogName(),
52 | this.icebergProperties(), this.hadoopConfig());
53 | }
54 |
55 | public String tableFullName() {
56 | return String.format("%s.%s", this.tableNamespace(), this.tableName());
57 | }
58 |
59 | public TableIdentifier tableIdentifier() {
60 | return TableIdentifier.of(Namespace.of(this.tableNamespace()), this.tableName());
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/tableoperator/BaseWriterTest.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 |
3 | import io.debezium.server.iceberg.IcebergUtil;
4 | import org.apache.iceberg.*;
5 | import org.apache.iceberg.data.GenericAppenderFactory;
6 | import org.apache.iceberg.encryption.PlaintextEncryptionManager;
7 | import org.apache.iceberg.inmemory.InMemoryFileIO;
8 | import org.apache.iceberg.io.OutputFileFactory;
9 | import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
10 | import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
11 | import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
12 | import org.apache.iceberg.types.Types;
13 | import org.junit.jupiter.api.BeforeEach;
14 |
15 | import java.util.Set;
16 |
17 | import static org.mockito.Mockito.mock;
18 | import static org.mockito.Mockito.when;
19 |
20 | public class BaseWriterTest {
21 |
22 | protected InMemoryFileIO fileIO;
23 | protected Table table;
24 | FileFormat format;
25 | GenericAppenderFactory appenderFactory;
26 | OutputFileFactory fileFactory;
27 | Set<Integer> identifierFieldIds;
28 |
29 | protected static final Schema SCHEMA =
30 | new Schema(
31 | ImmutableList.of(
32 | Types.NestedField.required(1, "id", Types.StringType.get()),
33 | Types.NestedField.required(2, "data", Types.StringType.get()),
34 | Types.NestedField.required(3, "id2", Types.StringType.get()),
35 | Types.NestedField.required(4, "__op", Types.StringType.get())
36 | ),
37 | ImmutableSet.of(1, 3));
38 |
39 | protected static final PartitionSpec SPEC =
40 | PartitionSpec.builderFor(SCHEMA).identity("data").build();
41 |
42 | @BeforeEach
43 | public void before() {
44 | fileIO = new InMemoryFileIO();
45 |
46 | table = mock(Table.class);
47 | when(table.schema()).thenReturn(SCHEMA);
48 | when(table.spec()).thenReturn(PartitionSpec.unpartitioned());
49 | when(table.io()).thenReturn(fileIO);
50 | when(table.locationProvider())
51 | .thenReturn(LocationProviders.locationsFor("file", ImmutableMap.of()));
52 | when(table.encryption()).thenReturn(PlaintextEncryptionManager.instance());
53 | when(table.properties()).thenReturn(ImmutableMap.of());
54 |
55 | format = IcebergUtil.getTableFileFormat(table);
56 | appenderFactory = IcebergUtil.getTableAppender(table);
57 | fileFactory = IcebergUtil.getTableOutputFileFactory(table, format);
58 | identifierFieldIds = table.schema().identifierFieldIds();
59 | }
60 |
61 | }
62 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/tableoperator/RecordWrapper.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 | package io.debezium.server.iceberg.tableoperator;
20 |
21 | import org.apache.iceberg.data.Record;
22 | import org.apache.iceberg.types.Types.StructType;
23 |
24 | import java.util.Map;
25 |
26 | public class RecordWrapper implements Record {
27 |
28 | private final Record delegate;
29 | private final Operation op;
30 |
31 | public RecordWrapper(Record delegate, Operation op) {
32 | this.delegate = delegate;
33 | this.op = op;
34 | }
35 |
36 | public Operation op() {
37 | return op;
38 | }
39 |
40 | @Override
41 | public StructType struct() {
42 | return delegate.struct();
43 | }
44 |
45 | @Override
46 | public Object getField(String name) {
47 | return delegate.getField(name);
48 | }
49 |
50 | @Override
51 | public void setField(String name, Object value) {
52 | delegate.setField(name, value);
53 | }
54 |
55 | @Override
56 | public Object get(int pos) {
57 | return delegate.get(pos);
58 | }
59 |
60 | @Override
61 | public Record copy() {
62 | return new RecordWrapper(delegate.copy(), op);
63 | }
64 |
65 | @Override
66 | public Record copy(Map<String, Object> overwriteValues) {
67 | return new RecordWrapper(delegate.copy(overwriteValues), op);
68 | }
69 |
70 | @Override
71 | public int size() {
72 | return delegate.size();
73 | }
74 |
75 | @Override
76 | public <T> T get(int pos, Class<T> javaClass) {
77 | return delegate.get(pos, javaClass);
78 | }
79 |
80 | @Override
81 | public <T> void set(int pos, T value) {
82 | delegate.set(pos, value);
83 | }
84 |
85 | @Override
86 | public String toString() {
87 | return delegate.toString();
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/mapper/CustomMapperTest.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.mapper;
2 |
3 | import io.debezium.server.iceberg.BaseSparkTest;
4 | import io.debezium.server.iceberg.testresources.CatalogJdbc;
5 | import io.debezium.server.iceberg.testresources.S3Minio;
6 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
7 | import io.quarkus.test.common.QuarkusTestResource;
8 | import io.quarkus.test.junit.QuarkusTest;
9 | import io.quarkus.test.junit.QuarkusTestProfile;
10 | import io.quarkus.test.junit.TestProfile;
11 | import org.awaitility.Awaitility;
12 | import org.junit.jupiter.api.Assertions;
13 | import org.junit.jupiter.api.Test;
14 |
15 | import java.time.Duration;
16 | import java.util.HashMap;
17 | import java.util.Map;
18 |
19 | import static io.debezium.server.iceberg.TestConfigSource.ICEBERG_CATALOG_TABLE_NAMESPACE;
20 | import static org.junit.jupiter.api.Assertions.assertEquals;
21 |
22 | @QuarkusTest
23 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
24 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
25 | @QuarkusTestResource(value = CatalogJdbc.class, restrictToAnnotatedClass = true)
26 | @TestProfile(CustomMapperTest.TestProfile.class)
27 | public class CustomMapperTest extends BaseSparkTest {
28 |
29 | @Test
30 | public void testCustomMapper() throws Exception {
31 | assertEquals(sinkType, "iceberg");
32 | String sql = """
33 | DROP TABLE IF EXISTS inventory.sample;
34 | CREATE TABLE IF NOT EXISTS inventory.sample (id INTEGER, val INTEGER);
35 | """;
36 | SourcePostgresqlDB.runSQL(sql);
37 | SourcePostgresqlDB.runSQL("INSERT INTO inventory.sample (id, val) VALUES (1, 123)");
38 | Awaitility.await().atMost(Duration.ofSeconds(320)).until(() -> {
39 | try {
40 | var df = spark.newSession().table(ICEBERG_CATALOG_TABLE_NAMESPACE + ".custom_mapper_sample");
41 | Assertions.assertEquals(1, df.count());
42 |
43 | return true;
44 | } catch (Exception e) {
45 | e.printStackTrace();
46 | return false;
47 | }
48 | });
49 | }
50 |
51 | public static class TestProfile implements QuarkusTestProfile {
52 | @Override
53 | public Map<String, String> getConfigOverrides() {
54 | Map<String, String> config = new HashMap<>();
55 | config.put("debezium.sink.iceberg.table-mapper", "custom-mapper");
56 | return config;
57 | }
58 | }
59 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerConnectTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import io.debezium.server.iceberg.testresources.CatalogNessie;
12 | import io.debezium.server.iceberg.testresources.S3Minio;
13 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
14 | import io.quarkus.test.common.QuarkusTestResource;
15 | import io.quarkus.test.junit.QuarkusTest;
16 | import io.quarkus.test.junit.QuarkusTestProfile;
17 | import io.quarkus.test.junit.TestProfile;
18 | import org.apache.spark.sql.Dataset;
19 | import org.apache.spark.sql.Row;
20 | import org.awaitility.Awaitility;
21 | import org.junit.jupiter.api.Test;
22 | import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable;
23 |
24 | import java.time.Duration;
25 | import java.util.HashMap;
26 | import java.util.Map;
27 |
28 | import static org.junit.jupiter.api.Assertions.assertEquals;
29 |
30 | /**
31 | * Integration test that verifies basic reading from PostgreSQL database and writing to iceberg destination.
32 | *
33 | * @author Ismail Simsek
34 | */
35 | @QuarkusTest
36 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
37 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
38 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
39 | @TestProfile(IcebergChangeConsumerConnectTest.TestProfile.class)
40 | @EnabledIfEnvironmentVariable(named = "DEBEZIUM_FORMAT_VALUE", matches = "connect")
41 | public class IcebergChangeConsumerConnectTest extends BaseSparkTest {
42 |
43 | @Test
44 | public void testSimpleUpload() {
45 | assertEquals("connect", config.debezium().keyValueChangeEventFormat());
46 |
47 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
48 | try {
49 | Dataset<Row> ds = getTableData("testc.inventory.customers");
50 | ds.show(false);
51 | return ds.count() >= 3;
52 | } catch (Exception e) {
53 | e.printStackTrace();
54 | return false;
55 | }
56 | });
57 | }
58 |
59 | public static class TestProfile implements QuarkusTestProfile {
60 | @Override
61 | public Map<String, String> getConfigOverrides() {
62 | Map<String, String> config = new HashMap<>();
63 | config.put("debezium.format.value", "connect");
64 | config.put("debezium.format.key", "connect");
65 | return config;
66 | }
67 | }
68 |
69 | }
70 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/batchsizewait/MaxBatchSizeWait.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.batchsizewait;
10 |
11 | import io.debezium.DebeziumException;
12 | import io.debezium.server.DebeziumMetrics;
13 | import io.debezium.server.iceberg.BatchConfig;
14 | import jakarta.enterprise.context.Dependent;
15 | import jakarta.inject.Inject;
16 | import jakarta.inject.Named;
17 | import org.slf4j.Logger;
18 | import org.slf4j.LoggerFactory;
19 |
20 | /**
21 | * Optimizes batch size to stay around 85%-90% of max.batch.size using a dynamically calculated sleep (ms)
22 | *
23 | * @author Ismail Simsek
24 | */
25 | @Dependent
26 | @Named("MaxBatchSizeWait")
27 | public class MaxBatchSizeWait implements BatchSizeWait {
28 | protected static final Logger LOGGER = LoggerFactory.getLogger(MaxBatchSizeWait.class);
29 |
30 | @Inject
31 | BatchConfig config;
32 | @Inject
33 | DebeziumMetrics dbzMetrics;
34 |
35 | @Override
36 | public void initizalize() throws DebeziumException {
37 | assert config.batchSizeWaitWaitIntervalMs() < config.batchSizeWaitMaxWaitMs() : "`wait-interval-ms` cannot be bigger than `max-wait-ms`";
38 | }
39 |
40 | @Override
41 | public void waitMs(Integer numRecordsProcessed, Integer processingTimeMs) throws InterruptedException {
42 |
43 | // don't wait if snapshot process is running
44 | if (dbzMetrics.snapshotRunning()) {
45 | return;
46 | }
47 |
48 | LOGGER.debug("Processed {}, QueueCurrentSize:{}, QueueTotalCapacity:{}, SecondsBehindSource:{}, SnapshotCompleted:{}",
49 | numRecordsProcessed,
50 | dbzMetrics.streamingQueueCurrentSize(),
51 | config.sourceMaxQueueSize(),
52 | (int) (dbzMetrics.streamingMilliSecondsBehindSource() / 1000),
53 | dbzMetrics.snapshotCompleted()
54 | );
55 |
56 | int totalWaitMs = 0;
57 | while (totalWaitMs < config.batchSizeWaitMaxWaitMs() && dbzMetrics.streamingQueueCurrentSize() < config.sourceMaxBatchSize()) {
58 | totalWaitMs += config.batchSizeWaitWaitIntervalMs();
59 | LOGGER.debug("Sleeping {} Milliseconds, QueueCurrentSize:{} < maxBatchSize:{}",
60 | config.batchSizeWaitWaitIntervalMs(), dbzMetrics.streamingQueueCurrentSize(), config.sourceMaxBatchSize());
61 |
62 | Thread.sleep(config.batchSizeWaitWaitIntervalMs());
63 | }
64 |
65 | LOGGER.debug("Total wait {} Milliseconds, QueueCurrentSize:{} < maxBatchSize:{}",
66 | totalWaitMs, dbzMetrics.streamingQueueCurrentSize(), config.sourceMaxBatchSize());
67 |
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
1 | # Frequently Asked Questions (FAQ)
2 |
3 | ???+ question "How does the connector handle deletes, and what are the performance implications?"
4 |
5 | This connector writes data to Iceberg tables using the V2 specification. To optimize write performance, delete events are recorded in delete files, avoiding costly data file rewrites. While this approach significantly improves write performance, it can impact read performance, especially in `upsert` mode. In `append` mode, this performance trade-off is not applicable.
6 |
7 | To optimize read performance, you must run periodic table maintenance jobs to compact data and rewrite the delete files. This is especially critical for `upsert` mode.
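
A minimal sketch of such a maintenance job, assuming a Spark environment with the Iceberg runtime on the classpath and a catalog registered as `my_catalog` (the catalog and table names are illustrative, not part of this project):

```java
import org.apache.spark.sql.SparkSession;

public class IcebergTableMaintenance {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("iceberg-maintenance").getOrCreate();
    // compact the small data files produced by frequent CDC commits
    spark.sql("CALL my_catalog.system.rewrite_data_files(table => 'icebergdata.debeziumcdc_orders')");
    // rewrite position delete files so readers no longer merge them on every scan
    spark.sql("CALL my_catalog.system.rewrite_position_delete_files(table => 'icebergdata.debeziumcdc_orders')");
    spark.stop();
  }
}
```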
8 |
9 | ???+ question "Does the connector support schema evolution?"
10 |
11 | Full schema evolution, such as converting incompatible data types, is not currently supported. However, **schema expansion**—including adding new fields or promoting field data types—is supported. To enable this behavior, set the `debezium.sink.iceberg.allow-field-addition` configuration property to `true`.
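
For example:

```properties
# allow schema expansion (new fields, type promotion)
debezium.sink.iceberg.allow-field-addition=true
```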
12 |
13 | For a more robust way to handle schema changes, you can configure the connector to store all nested data in a `variant` field. This approach can seamlessly absorb many schema changes.
14 |
15 | ```properties
16 | # Store nested data in variant fields
17 | debezium.sink.iceberg.nested-as-variant=true
18 | # Ensure event flattening is disabled (flattening is the default behavior)
19 | debezium.transforms=,
20 | ```
21 |
22 | ???+ question "How can I replicate only specific tables from my source database?"
23 |
24 | By default, the Debezium connector replicates all tables in the database, which can result in unnecessary load. To avoid replicating tables you don't need, configure the `debezium.source.table.include.list` property to specify the exact tables to replicate. This will streamline your data pipeline and reduce overhead. For more details, refer to the [Debezium connector documentation](https://debezium.io/documentation/reference/stable/connectors/mysql.html#mysql-property-table-include-list). An example follows.
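
For example, to replicate only two tables from the `inventory` schema (the table names are illustrative):

```properties
# replicate only the listed tables
debezium.source.table.include.list=inventory.customers,inventory.orders
```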
25 |
26 | ???+ question "How do I configure AWS S3 credentials?"
27 |
28 | You can set up AWS credentials in one of the following ways (a short sketch follows the list):
29 |
30 | - **In `application.properties`**: Use the `debezium.sink.iceberg.fs.s3a.access.key` and `debezium.sink.iceberg.fs.s3a.secret.key` properties.
31 | - **As environment variables**: Set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
32 | - **Using Hadoop's configuration**: Set up the `HADOOP_HOME` environment variable and add S3A configuration to `core-site.xml`. More information can be found [here](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Authenticating_with_S3).
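
For the first option, a minimal sketch with placeholder values:

```properties
# placeholder values - replace with your own credentials
debezium.sink.iceberg.fs.s3a.access.key=MY_ACCESS_KEY
debezium.sink.iceberg.fs.s3a.secret.key=MY_SECRET_KEY
```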
33 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerNessieCatalogTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import io.debezium.server.iceberg.testresources.CatalogNessie;
12 | import io.debezium.server.iceberg.testresources.S3Minio;
13 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
14 | import io.quarkus.test.common.QuarkusTestResource;
15 | import io.quarkus.test.junit.QuarkusTest;
16 | import org.apache.spark.sql.Dataset;
17 | import org.apache.spark.sql.Row;
18 | import org.awaitility.Awaitility;
19 | import org.junit.jupiter.api.Test;
20 | import org.junit.jupiter.api.condition.DisabledIfEnvironmentVariable;
21 |
22 | import java.sql.SQLException;
23 | import java.time.Duration;
24 |
25 | /**
26 | * Integration test that verifies basic reading from PostgreSQL database and writing to iceberg destination.
27 | *
28 | * @author Ismail Simsek
29 | */
30 | @QuarkusTest
31 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
32 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
33 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
34 | @DisabledIfEnvironmentVariable(named = "GITHUB_ACTIONS", matches = "true")
35 | public class IcebergChangeConsumerNessieCatalogTest extends BaseSparkTest {
36 |
37 | @Test
38 | public void testSimpleUpload() throws InterruptedException, SQLException, ClassNotFoundException {
39 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
40 | try {
41 | Dataset<Row> df = getTableData("testc.inventory.customers");
42 | df.show(false);
43 | return df.count() >= 3;
44 | } catch (Exception e) {
45 | return false;
46 | }
47 | });
48 |
49 | int startId = 1005;
50 | int numberOfRecords = 5; // You can change this value
51 | for (int i = 0; i < numberOfRecords; i++) {
52 | int currentId = startId + i;
53 | String firstName = "FirstName" + currentId;
54 | String lastName = "LastName" + currentId;
55 | String email = "user" + currentId + "@example.com";
56 | String insertStatement = String.format(
57 | "INSERT INTO inventory.customers (id, first_name, last_name, email) VALUES (%d, '%s', '%s', '%s');",
58 | currentId,
59 | firstName,
60 | lastName,
61 | email
62 | );
63 | SourcePostgresqlDB.runSQL(insertStatement);
64 | Thread.sleep(3000);
65 | Dataset<Row> df = getTableData("testc.inventory.customers");
66 | df.show(false);
67 | }
68 | }
69 |
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/examples/nessie/config/application.properties:
--------------------------------------------------------------------------------
1 | # Use iceberg sink
2 | debezium.sink.type=iceberg
3 | # Iceberg sink config
4 | debezium.sink.iceberg.table-prefix=debeziumcdc_
5 | debezium.sink.iceberg.upsert=true
6 | debezium.sink.iceberg.upsert-keep-deletes=true
7 | debezium.sink.iceberg.write.format.default=parquet
8 | # S3 config using the Nessie catalog and S3FileIO
9 | debezium.sink.iceberg.type=nessie
10 | #debezium.sink.iceberg.catalog-impl=org.apache.iceberg.nessie.NessieCatalog
11 | debezium.sink.iceberg.uri=http://nessie:19120/api/v2
12 | debezium.sink.iceberg.ref=main
13 | debezium.sink.iceberg.warehouse=s3://warehouse
14 | debezium.sink.iceberg.table-namespace=icebergdata
15 | debezium.sink.iceberg.catalog-name=nessie
16 | # Use ResolvingFileIO (delegates to S3FileIO for s3:// paths)
17 | debezium.sink.iceberg.io-impl=org.apache.iceberg.io.ResolvingFileIO
18 | debezium.sink.iceberg.s3.endpoint=http://minio:9000
19 | debezium.sink.iceberg.s3.path-style-access=true
20 | debezium.sink.iceberg.s3.access-key-id=admin
21 | debezium.sink.iceberg.s3.secret-access-key=password
22 | # postgres source
23 | debezium.source.connector.class=io.debezium.connector.postgresql.PostgresConnector
24 | debezium.source.offset.flush.interval.ms=0
25 | debezium.source.database.hostname=postgresqlsourcedb
26 | debezium.source.database.port=5432
27 | debezium.source.database.user=postgres
28 | debezium.source.database.password=postgres
29 | debezium.source.database.dbname=postgres
30 | debezium.source.database.server.name=tutorial
31 | debezium.source.database.server.id=1234
32 | debezium.source.schema.include.list=inventory
33 | debezium.source.topic.prefix=dbz
34 | # saving debezium state data to destination, iceberg tables
35 | # see https://debezium.io/documentation/reference/stable/development/engine.html#advanced-consuming
36 | debezium.source.offset.storage=io.debezium.server.iceberg.offset.IcebergOffsetBackingStore
37 | debezium.source.offset.storage.iceberg.table-name=debezium_offset_storage_table
38 | # see https://debezium.io/documentation/reference/stable/development/engine.html#database-history-properties
39 | debezium.source.schema.history.internal=io.debezium.server.iceberg.history.IcebergSchemaHistory
40 | debezium.source.schema.history.internal.iceberg.table-name=debezium_database_history_storage_table
41 | # enable event schemas - mandatory
42 | debezium.format.value.schemas.enable=true
43 | debezium.format.key.schemas.enable=true
44 | debezium.format.value=connect
45 | debezium.format.key=connect
46 | # do event flattening. unwrap message!
47 | debezium.transforms=unwrap
48 | debezium.transforms.unwrap.type=io.debezium.transforms.ExtractNewRecordState
49 | debezium.transforms.unwrap.add.fields=op,table,source.ts_ns,db
50 | debezium.transforms.unwrap.delete.tombstone.handling.mode=rewrite
51 | debezium.transforms.unwrap.drop.tombstones=true
52 | # ############ SET LOG LEVELS ############
53 | quarkus.log.level=INFO
54 | quarkus.log.console.json=false
55 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/converter/IcebergSchemaInfo.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.converter;
2 |
3 | import org.apache.iceberg.types.Types;
4 |
5 | import java.util.ArrayList;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Set;
9 | import java.util.concurrent.atomic.AtomicInteger;
10 |
11 | /**
12 | * A record class (Java 14+) representing schema data for Iceberg records.
13 | * This class helps manage the fields, identifier fields, and the next available
14 | * field ID when building Iceberg schemas.
15 | *
16 | * @param fields A list of `Types.NestedField` objects representing the fields
17 | * in the schema. `Types.NestedField` contains information about
18 | * the field's ID, name, type, and nullability.
19 | * @param identifierFieldIds A set of integer IDs that identify the fields that are
20 | * part of the record's key or identifier.
21 | * @param nextFieldId An `AtomicInteger` that keeps track of the next available
22 | * field ID to ensure unique IDs are assigned to new fields. Using
23 | * an `AtomicInteger` makes this class thread-safe.
24 | */
25 |
26 | public record IcebergSchemaInfo(List<Types.NestedField> fields, Set<Integer> identifierFieldIds,
27 | AtomicInteger nextFieldId) {
28 |
29 | /**
30 | * Constructor for `IcebergSchemaInfo` that initializes the `fields` list and
31 | * `identifierFieldIds` set to empty and sets the `nextFieldId` to the provided
32 | * value.
33 | *
34 | * @param nextFieldId The starting ID to use for new fields.
35 | */
36 | public IcebergSchemaInfo(Integer nextFieldId) {
37 | this(new ArrayList<>(), new HashSet<>(), new AtomicInteger(nextFieldId));
38 | }
39 |
40 | /**
41 | * Default constructor for `IcebergSchemaInfo` that initializes the `fields`
42 | * list and `identifierFieldIds` set to empty and sets the `nextFieldId` to 1.
43 | */
44 | public IcebergSchemaInfo() {
45 | this(1);
46 | }
47 |
48 | /**
49 | * Creates a copy of this `IcebergSchemaInfo` object, but *keeps* the original's
50 | * `identifierFieldIds` and `nextFieldId`. This is useful when you want to
51 | * create a new schema builder based on an existing one but need to preserve
52 | * the identifier field information and the next field ID counter. The `fields`
53 | * list is initialized as a new empty list in the copy.
54 | *
55 | * @return A new `IcebergSchemaInfo` object with the same identifier fields and
56 | * next field ID, but an empty fields list.
57 | */
58 | public IcebergSchemaInfo copyPreservingMetadata() {
59 | return new IcebergSchemaInfo(new ArrayList<>(), this.identifierFieldIds, this.nextFieldId);
60 | }
61 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/converter/JsonEventConverterBuilderTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.converter;
10 |
11 | import io.debezium.server.iceberg.BaseTest;
12 | import io.debezium.server.iceberg.testresources.CatalogNessie;
13 | import io.quarkus.test.common.QuarkusTestResource;
14 | import io.quarkus.test.junit.QuarkusTest;
15 | import org.apache.iceberg.Schema;
16 | import org.apache.iceberg.types.Types;
17 | import org.junit.jupiter.api.Assertions;
18 | import org.junit.jupiter.api.Test;
19 | import org.junit.jupiter.api.condition.DisabledIfEnvironmentVariable;
20 |
21 | import java.util.List;
22 | import java.util.Set;
23 |
24 | import static org.apache.iceberg.types.Types.NestedField.optional;
25 | import static org.apache.iceberg.types.Types.NestedField.required;
26 |
27 |
28 | /**
29 | * @author Ismail Simsek
30 | */
31 | @QuarkusTest
32 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
33 | @DisabledIfEnvironmentVariable(named = "DEBEZIUM_FORMAT_VALUE", matches = "connect")
34 | class JsonEventConverterBuilderTest extends BaseTest {
35 |
36 | @Test
37 | public void testIcebergChangeEventBuilder() {
38 | Schema schema1 = new Schema(
39 | List.of(
40 | required(1, "id", Types.IntegerType.get()),
41 | optional(2, "data", Types.StringType.get()),
42 | optional(3, "preferences", Types.StructType.of(
43 | optional(4, "feature1", Types.BooleanType.get()),
44 | optional(5, "feature2", Types.BooleanType.get())
45 | ))
46 | )
47 | , Set.of(1)
48 | );
49 |
50 | JsonEventConverter t = eventBuilder
51 | .addKeyField("id", 1)
52 | .addField("data", "testdatavalue")
53 | .addField("preferences", "feature1", true)
54 | .addField("preferences", "feature2", true)
55 | .build();
56 | Assertions.assertTrue(schema1.sameSchema(t.icebergSchema()));
57 |
58 | Schema schema2 = new Schema(
59 | optional(1, "id", Types.IntegerType.get()),
60 | optional(2, "data", Types.StringType.get()),
61 | optional(3, "preferences", Types.StructType.of(
62 | optional(4, "feature1", Types.BooleanType.get()),
63 | optional(5, "feature2", Types.BooleanType.get())
64 | ))
65 | );
66 |
67 | JsonEventConverter t2 = eventBuilder
68 | .addField("id", 1)
69 | .addField("data", "testdatavalue")
70 | .addField("preferences", "feature1", true)
71 | .addField("preferences", "feature2", true)
72 | .build();
73 | Assertions.assertEquals(schema2.identifierFieldIds(), t2.icebergSchema().identifierFieldIds());
74 | Assertions.assertTrue(schema2.sameSchema(t2.icebergSchema()));
75 | }
76 |
77 |
78 | }
--------------------------------------------------------------------------------
/examples/lakekeeper/config/application.properties:
--------------------------------------------------------------------------------
1 | # Use iceberg sink
2 | debezium.sink.type=iceberg
3 | # Iceberg sink config
4 | debezium.sink.iceberg.table-prefix=debeziumcdc_
5 | debezium.sink.iceberg.upsert=true
6 | debezium.sink.iceberg.upsert-keep-deletes=true
7 | debezium.sink.iceberg.write.format.default=parquet
8 | # Catalog config: Lakekeeper REST catalog with S3 storage
9 | debezium.sink.iceberg.type=rest
11 | debezium.sink.iceberg.uri=http://lakekeeper:8181/catalog
12 | debezium.sink.iceberg.ref=main
13 | debezium.sink.iceberg.warehouse=iceberg_warehouse
14 | debezium.sink.iceberg.table-namespace=icebergdata
15 | debezium.sink.iceberg.catalog-name=lakekeeper
16 | # FileIO: ResolvingFileIO delegates to S3FileIO for s3:// paths
17 | debezium.sink.iceberg.io-impl=org.apache.iceberg.io.ResolvingFileIO
18 | debezium.sink.iceberg.s3.endpoint=http://minio:9000
19 | debezium.sink.iceberg.s3.path-style-access=true
20 | debezium.sink.iceberg.s3.access-key-id=admin
21 | debezium.sink.iceberg.s3.secret-access-key=password
22 | # postgres source
23 | debezium.source.connector.class=io.debezium.connector.postgresql.PostgresConnector
24 | debezium.source.offset.flush.interval.ms=0
25 | debezium.source.database.hostname=postgresqlsourcedb
26 | debezium.source.database.port=5432
27 | debezium.source.database.user=postgres
28 | debezium.source.database.password=postgres
29 | debezium.source.database.dbname=postgres
30 | debezium.source.database.server.name=tutorial
31 | debezium.source.database.server.id=1234
32 | debezium.source.schema.include.list=inventory
33 | debezium.source.topic.prefix=dbz
34 | # Persist Debezium state (offsets, schema history) in destination Iceberg tables
35 | # see https://debezium.io/documentation/reference/stable/development/engine.html#advanced-consuming
36 | debezium.source.offset.storage=io.debezium.server.iceberg.offset.IcebergOffsetBackingStore
37 | debezium.source.offset.storage.iceberg.table-name=debezium_offset_storage_table
38 | # see https://debezium.io/documentation/reference/stable/development/engine.html#database-history-properties
39 | debezium.source.schema.history.internal=io.debezium.server.iceberg.history.IcebergSchemaHistory
40 | debezium.source.schema.history.internal.iceberg.table-name=debezium_database_history_storage_table
41 | # enable event schemas - mandatory
42 | debezium.format.value.schemas.enable=true
43 | debezium.format.key.schemas.enable=true
44 | debezium.format.value=connect
45 | debezium.format.key=connect
46 | # Flatten events: unwrap the Debezium message envelope
47 | debezium.transforms=unwrap
48 | debezium.transforms.unwrap.type=io.debezium.transforms.ExtractNewRecordState
49 | debezium.transforms.unwrap.add.fields=op,table,source.ts_ns,db
50 | debezium.transforms.unwrap.delete.tombstone.handling.mode=rewrite
51 | debezium.transforms.unwrap.drop.tombstones=true
52 | # ############ SET LOG LEVELS ############
53 | quarkus.log.level=INFO
54 | quarkus.log.console.json=false
55 | quarkus.log.category."org.apache.hadoop".level=WARN
56 | #quarkus.log.category."org.apache.iceberg.SnapshotProducer".level=WARN
57 | quarkus.log.category."org.apache.iceberg.CatalogUtil".level=WARN
58 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/tableoperator/BaseDeltaTaskWriter.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 |
3 | import com.google.common.collect.Sets;
4 | import org.apache.iceberg.*;
5 | import org.apache.iceberg.data.InternalRecordWrapper;
6 | import org.apache.iceberg.data.Record;
7 | import org.apache.iceberg.io.BaseTaskWriter;
8 | import org.apache.iceberg.io.FileAppenderFactory;
9 | import org.apache.iceberg.io.FileIO;
10 | import org.apache.iceberg.io.OutputFileFactory;
11 | import org.apache.iceberg.types.TypeUtil;
12 |
13 | import java.io.IOException;
14 | import java.util.Set;
15 |
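  | /**
  | * Base task writer for CDC feeds, routing each {@link Record} by its CDC
  | * {@link Operation}: inserts are appended; deletes apply an equality delete on
  | * the identifier key (and, when keepDeletes is true, also keep the delete
  | * record as a row); updates delete the previous version of the key and then
  | * write the new row.
  | */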
16 | abstract class BaseDeltaTaskWriter extends BaseTaskWriter<Record> {
17 |
18 | private final Schema schema;
19 | private final Schema deleteSchema;
20 | private final InternalRecordWrapper wrapper;
21 | private final InternalRecordWrapper keyWrapper;
22 | private final boolean keepDeletes;
23 | private final RecordProjection keyProjection;
24 |
25 | BaseDeltaTaskWriter(PartitionSpec spec,
26 | FileFormat format,
27 | FileAppenderFactory<Record> appenderFactory,
28 | OutputFileFactory fileFactory,
29 | FileIO io,
30 | long targetFileSize,
31 | Schema schema,
32 | Set<Integer> identifierFieldIds,
33 | boolean keepDeletes) {
34 | super(spec, format, appenderFactory, fileFactory, io, targetFileSize);
35 | this.schema = schema;
36 | this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(identifierFieldIds));
37 | this.wrapper = new InternalRecordWrapper(schema.asStruct());
38 | this.keyWrapper = new InternalRecordWrapper(deleteSchema.asStruct());
39 | this.keyProjection = RecordProjection.create(schema, deleteSchema);
40 | this.keepDeletes = keepDeletes;
41 | }
42 |
43 | abstract RowDataDeltaWriter route(Record row);
44 |
45 | InternalRecordWrapper wrapper() {
46 | return wrapper;
47 | }
48 |
49 | @Override
50 | public void write(Record row) throws IOException {
51 | RowDataDeltaWriter writer = route(row);
52 | Operation rowOperation = ((RecordWrapper) row).op();
53 | if (rowOperation == Operation.INSERT) {
54 | // new row
55 | writer.write(row);
56 | } else if (rowOperation == Operation.DELETE && !keepDeletes) {
57 | // hard delete: when keepDeletes == false the deleted record is removed entirely
58 | writer.deleteKey(keyProjection.wrap(row));
59 | } else {
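  | // update -- or delete with keepDeletes == true: remove the previous version by key, then write this row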
60 | writer.deleteKey(keyProjection.wrap(row));
61 | writer.write(row);
62 | }
63 | }
64 |
65 | public class RowDataDeltaWriter extends BaseEqualityDeltaWriter {
66 | RowDataDeltaWriter(PartitionKey partition) {
67 | super(partition, schema, deleteSchema);
68 | }
69 |
70 | @Override
71 | protected StructLike asStructLike(Record data) {
72 | return wrapper.wrap(data);
73 | }
74 |
75 | @Override
76 | protected StructLike asStructLikeKey(Record data) {
77 | return keyWrapper.wrap(data);
78 | }
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ "master" ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ "master" ]
20 | schedule:
21 | - cron: '36 4 * * 3'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'java', 'python' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v4
42 | - name: Set up JDK
43 | uses: actions/setup-java@v5
44 | with:
45 | distribution: 'temurin'
46 | java-version: 21
47 | cache: 'maven'
48 |
49 | # Initializes the CodeQL tools for scanning.
50 | - name: Initialize CodeQL
51 | uses: github/codeql-action/init@v4
52 | with:
53 | languages: ${{ matrix.language }}
54 | # If you wish to specify custom queries, you can do so here or in a config file.
55 | # By default, queries listed here will override any specified in a config file.
56 | # Prefix the list here with "+" to use these queries and those in the config file.
57 |
58 | # For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
59 | # queries: security-extended,security-and-quality
60 |
61 |
62 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
63 | # If this step fails, then you should remove it and run the build manually (see below)
64 | - name: Autobuild
65 | uses: github/codeql-action/autobuild@v4
66 |
67 | # ℹ️ Command-line programs to run using the OS shell.
68 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
69 |
70 | # If the Autobuild fails above, remove it and uncomment the following three lines.
71 | # Modify them (or add more) to build your code; see the example below for guidance.
72 |
73 | # - run: |
74 | # echo "Run, Build Application using script"
75 | # ./location_of_script_within_repo/buildscript.sh
76 |
77 | - name: Perform CodeQL Analysis
78 | uses: github/codeql-action/analyze@v4
79 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerDecimalTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import io.debezium.server.iceberg.testresources.CatalogNessie;
12 | import io.debezium.server.iceberg.testresources.S3Minio;
13 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
14 | import io.quarkus.test.common.QuarkusTestResource;
15 | import io.quarkus.test.junit.QuarkusTest;
16 | import io.quarkus.test.junit.QuarkusTestProfile;
17 | import io.quarkus.test.junit.TestProfile;
18 | import org.apache.spark.sql.Dataset;
19 | import org.apache.spark.sql.Row;
20 | import org.awaitility.Awaitility;
21 | import org.junit.jupiter.api.Assertions;
22 | import org.junit.jupiter.api.Test;
23 |
24 | import java.time.Duration;
25 | import java.util.HashMap;
26 | import java.util.Map;
27 |
28 | import static org.junit.jupiter.api.Assertions.assertEquals;
29 |
30 | /**
31 | * Integration test that verifies basic reading from PostgreSQL database and writing to iceberg destination.
32 | *
33 | * @author Ismail Simsek
34 | */
35 | @QuarkusTest
36 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
37 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
38 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
39 | @TestProfile(IcebergChangeConsumerDecimalTest.TestProfile.class)
40 | public class IcebergChangeConsumerDecimalTest extends BaseSparkTest {
41 |
42 | @Test
43 | public void testConsumingNumerics() throws Exception {
44 | assertEquals(sinkType, "iceberg");
45 | String sql = "\n" +
46 | " DROP TABLE IF EXISTS inventory.data_types;\n" +
47 | " CREATE TABLE IF NOT EXISTS inventory.data_types (\n" +
48 | " c_id INTEGER ,\n" +
49 | " c_decimal DECIMAL(18,6)\n" +
50 | " );";
51 | SourcePostgresqlDB.runSQL(sql);
52 | sql = "INSERT INTO inventory.data_types (c_id, c_decimal) " +
53 | "VALUES (1, '1234566.34456'::decimal)";
54 | SourcePostgresqlDB.runSQL(sql);
55 | Awaitility.await().atMost(Duration.ofSeconds(320)).until(() -> {
56 | try {
57 | Dataset<Row> df = getTableData("testc.inventory.data_types");
58 | df.show(false);
59 |
60 | Assertions.assertEquals(1, df.count());
61 | Assertions.assertEquals(1, df.filter("c_id = 1 AND c_decimal = CAST('1234566.344560' AS DECIMAL(18,6))").count(), "c_decimal not matching");
62 | return true;
63 | } catch (Exception | AssertionError e) {
64 | e.printStackTrace();
65 | return false;
66 | }
67 | });
68 | }
69 |
70 | public static class TestProfile implements QuarkusTestProfile {
71 | @Override
72 | public Map<String, String> getConfigOverrides() {
73 | Map<String, String> config = new HashMap<>();
74 | config.put("debezium.sink.iceberg.destination-regexp", "\\d");
75 | config.put("debezium.source.decimal.handling.mode", "precise");
76 | return config;
77 | }
78 | }
79 |
80 | }
81 |
--------------------------------------------------------------------------------
/docs/images/debezium-iceberg.drawio:
--------------------------------------------------------------------------------
1 | 7Vxbc6O4Ev41eTQFSGB4TOLxzFRlqmaTqrM7+5KSQcFaA+KAHNvz648Ewlwk3xKc9UniqhlDIyRZX3fr65aUK3CbrL/mKJv/oCGOr2wzXF+ByZVt+9Dl/wvBphI4FqwEUU7CSmQ1ggfyG0uhKaVLEuKiU5BRGjOSdYUBTVMcsI4M5TlddYs90bjbaoYirAgeAhSr0j9JyOaV1HPMRv4Nk2het2yZ8kmC6sJSUMxRSFctEfhyBW5zSll1laxvcSzGrh6X6r3pjqfbjuU4Zce8MAWzxdfbb4vZ/d3jcnMfF9f3wQjIzj2jeCl/8W1Ml6HsMtvU41AsMAvEDzKvwA1dspik+HY75kIY5SgkvC+3NKY5l6U05e/ezFkS8zuLX67mhOGHDAWizhVXFy57oimTqFt2fS+bFbXyUcvEdbKOhH4ZaFVAI8rpMiub/M5x1z595JePQflTeCUspwtcd+zKBp7j3fhQNEfiuNfhZ5wzwhXgOiaRqJtR0RSSdzF+YqJG/itIGt2VdxNgyp7rmghRMceh/DkZzkmCGRaFcj52KI1i/LMR3qAlo0U1HqK8CrLEXfQSr1siCfpXTHlV+YYXkU+hKy1OWqDtSsxXjT5DW5rpvKXLdq0cSNpQtK27UTN+ITXtBK2DitIp+sYxTMNy2HZpTkuxBhklszNK1tg0xo46UMBRB8p1Def1I7Umf/1nE/wxs0Yg+9P9+f1vpxiNrMMjVVsISUpX1h4XvSrX0js0w/FPrm2MlEY0o4zRhBeIxYMbFCyiEoWWUj+VH16kbOy6yBrzR/XNE1kL3G5kfyZzxoSvvhYDYU+DMAUG4Vb7RDi+uRHwFu1piBjiX0Je8G82xyMc4wilbFRgxiW+K0vNUIGNLI0GgNx1unbhAQVuz1TR9gawCi3W4B1iDQ9iHdOIFuK3Ty2P4zX9sXn4425k2d5AKEMD2D2cva2oBbXrG2Csse1GPDjg41PcoHnYDVbTXE1TwDCO0fJ6ZqKZPmyosZNBpg/tuNU/ojVOOOSsTd7SnM25UqUo/tJIb7oj2ZS5o8JQyvH7BzO2kWREzMPd0eVDmG/+ku+XN7/EDXf98naybj+cbGoG00EF7kOloMs8wIenA4byCLM95byqnBiVvRjnOEaMPHe57uB4eRo9d2MmOVMHSPe/S1o/GFU8iDsT03KzdfOQX0Xie4Jn+DdZJnVts7x+8oBzruGqvJbwn1E13RWH5Lkvel0Pvwe8i3l0oFnhFNstD+sDesxVP08oDHk6NflnIBfiH55pbfMtp9ra0+5zvW/gUk5wDcOZsnz1JyWlZkuMgN+dJQHoUdrKN8m32mFmryJYg7urosp5KRVd5znatIplokCxu8PQ1Xd4Z7/2l+cXVQ8apdoO7ivouxpeH+R0Q1C2HTGslrZ1mJ2eyIW44PUY/8xoURhU+DMuqn1vyeCEZyNc9QXrK/sjvx/rco+2wz1kyekG8Sm2Z3V9iu8YAJrNR3UxjmM4muiNiwE4k5+x/NPxv3hOf6HxG/CgAvmbxm/2EWmN/wNjX61WozDHCaFGillMnjYGyjKBbhKVGJeUZiSsfrQibD5KUYKHM2xg98jCEYbtaax6bDjnsmqg4vqxyMLY7IXUsGdTx5IFpaK+cQ5EFpR2XPO0fnXLn4cs1HmGdxfWHgxXweDxqh5WC3Z9C7R7LmKH3g6G8OVEwr3I9IgQ+b1HpeBfj0rBEWzxY000/agUDhWVKhWdKSqFJ0al8C2iUvgZlb5RVAouMiqFL1hVvED8LyxQOQbr8wUq2nX294HzMospCo0VWZAEhwRJWxf3mbgvrTpJ6tTDMpnx77H4Zwr/yUnJHD9OY5IuHoUuGMWzeN3ic3u2HmkfD6ck/c0X1thXtMIyLcPX7L9wzrX9wn4XaoHmKEUGEU5+lY0CzlUFTLW6CF2wTbElamo6cjoYZTkuGB0OXdfrbRrx1bVRy9JQSDA2IDwTus67QJflJKUVuqgoMCtq2YgOaJ5jv0sLLY/7cAVCH2zn4jaI0D4Tgv9KHkBl/U1mwLC32YBfV620wY7UAF4T1n6P3/5qPWreEjf1SzvRbGcP9u0mO3cs4oy7xm6PPUNs96w/vQqPToFZ3QgA9CsaKDJxXL/bfW/YlJYWmhcsf1ygL3oRAXkS/0BDQB64ancIiGPZDf/oPh3OwfWV1vIsA9otkmqr3s4xxho2Av3tfsjheer7CEh5QGIEOUYcRrkulkj1KDDh7IRfpHjVYia82yzKcakQw8QkHbChaRuWqwCsQ9cxjfEA23/06B6xz+1jpbV8s2uVAL4wreXDAxUNNHns6vBZJw9rd7a83C51erLcrLPOjbDMPyv59Ndm5c1g620aIbAncCJm4V2tzWrBNdfrDfdMhZqev8cZ132SajL3P/hMQlLMW48xytNWGT78s4P5/aP2onE/xHRm1TtDoTlWcSDhn5AwLO1ct5LQtf0B/KQPumQImI5RH0xqx266fa2W6RnnYv72EcHbSScj2sdr3CvlkIrUyJ2Hd3qgbY+yiKpRHsia9/rSE3IlvWja1kTTULcgYw2xIqN3QccE0wdPRvUHHfDGvuwedGXVa/tC26zaPKZzxOggMdp/wCngHSpPIu3QIt0hLYUxaU9t5bia1qozWzf8Vnd6a7YMFpgNpFE99gvrBdi2RkEN1x0gN6PPvLvvgujK1LqByuBFhkNV1v17K+suw5iQiJNuVacxKgbC1ulTkjpubvtvR+MtoLnNtg7vL3QB74egLPdoxeu4Q5sD2+4/eUhHie2eg6r9QTse1+nwGSmILhb/ECp8u8wR4+juUePpBDG04ojP6bLAn4p+fKL9CEW3wNtqOjhmefST3F0quXPrPNaW3I3fjNzp9ekFB3g/yZ0eW78fCvqGp2Yy35zfATUefABn8NlbO32F1+4E68fmAc5BzMeqq3d1a+LnCuKBGnR9m0wfPmHrwuZdGmzqssHX20/Ueo7S7aOm5szeFjU1aX/9e5mLrPRNTGef8HVpsdPbQ+h6hqvSGF+zQfR8CB6xaH+Qtmz/2JV5gH3u2/hfr9VZGtKZ0GCxzIwE5eJL0HIdufbLzwmY7mDtA0ANfdvogc2Ziqc6WTDWoG2fjja/bf64WLUm1/yFNvDlfw==
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergEventsChangeConsumerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import io.debezium.server.iceberg.testresources.CatalogNessie;
12 | import io.debezium.server.iceberg.testresources.S3Minio;
13 | import io.debezium.server.iceberg.testresources.SourceMysqlDB;
14 | import io.quarkus.test.common.QuarkusTestResource;
15 | import io.quarkus.test.junit.QuarkusTest;
16 | import io.quarkus.test.junit.QuarkusTestProfile;
17 | import io.quarkus.test.junit.TestProfile;
18 | import org.apache.spark.sql.Dataset;
19 | import org.apache.spark.sql.Row;
20 | import org.awaitility.Awaitility;
21 | import org.eclipse.microprofile.config.inject.ConfigProperty;
22 | import org.junit.jupiter.api.Assertions;
23 | import org.junit.jupiter.api.Test;
24 | import org.junit.jupiter.api.condition.DisabledIfEnvironmentVariable;
25 |
26 | import java.time.Duration;
27 | import java.util.HashMap;
28 | import java.util.Map;
29 |
30 | import static io.debezium.server.iceberg.TestConfigSource.ICEBERG_CATALOG_TABLE_NAMESPACE;
31 |
32 | /**
33 | *
34 | * @author Ismail Simsek
35 | */
36 | @QuarkusTest
37 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
38 | @QuarkusTestResource(value = SourceMysqlDB.class, restrictToAnnotatedClass = true)
39 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
40 | @TestProfile(IcebergEventsChangeConsumerTest.TestProfile.class)
41 | @DisabledIfEnvironmentVariable(named = "DEBEZIUM_FORMAT_VALUE", matches = "connect")
42 | @Deprecated
43 | public class IcebergEventsChangeConsumerTest extends BaseSparkTest {
44 | @ConfigProperty(name = "debezium.sink.type")
45 | String sinkType;
46 |
47 | @Test
48 | public void testSimpleUpload() {
49 | Assertions.assertEquals(sinkType, "icebergevents");
50 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
51 | try {
52 | Dataset<Row> ds = spark.newSession().sql("SELECT * FROM "+ICEBERG_CATALOG_TABLE_NAMESPACE+".debezium_events");
53 | ds.show(false);
54 | return ds.count() >= 10
55 | && ds.select("event_destination").distinct().count() >= 2;
56 | } catch (Exception e) {
57 | return false;
58 | }
59 | });
60 |
61 | // S3Minio.listFiles();
62 | }
63 |
64 | public static class TestProfile implements QuarkusTestProfile {
65 | @Override
66 | public Map<String, String> getConfigOverrides() {
67 | Map<String, String> config = new HashMap<>();
68 | config.put("debezium.sink.type", "icebergevents");
69 | config.put("quarkus.profile", "mysql");
70 | config.put("debezium.format.value", "json");
71 | config.put("debezium.format.key", "json");
72 | config.put("%mysql.debezium.source.connector.class", "io.debezium.connector.mysql.MySqlConnector");
73 | config.put("%mysql.debezium.source.table.whitelist", "inventory.customers,inventory.test_delete_table");
74 | return config;
75 | }
76 |
77 | @Override
78 | public String getConfigProfile() {
79 | return "mysql";
80 | }
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerMongodbTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import io.debezium.server.iceberg.testresources.CatalogNessie;
12 | import io.debezium.server.iceberg.testresources.S3Minio;
13 | import io.debezium.server.iceberg.testresources.SourceMongoDB;
14 | import io.quarkus.test.common.QuarkusTestResource;
15 | import io.quarkus.test.junit.QuarkusTest;
16 | import io.quarkus.test.junit.QuarkusTestProfile;
17 | import io.quarkus.test.junit.TestProfile;
18 | import org.apache.spark.sql.Dataset;
19 | import org.apache.spark.sql.Row;
20 | import org.awaitility.Awaitility;
21 | import org.junit.jupiter.api.Test;
22 |
23 | import java.time.Duration;
24 | import java.util.HashMap;
25 | import java.util.Map;
26 |
27 | /**
28 | *
29 | * @author Ismail Simsek
30 | */
31 | @QuarkusTest
32 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
33 | @QuarkusTestResource(value = SourceMongoDB.class, restrictToAnnotatedClass = true)
34 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
35 | @TestProfile(IcebergChangeConsumerMongodbTest.TestProfile.class)
36 | public class IcebergChangeConsumerMongodbTest extends BaseSparkTest {
37 |
38 | @Test
39 | public void testSimpleUpload() {
40 |
41 | Awaitility.await().atMost(Duration.ofSeconds(180)).until(() -> {
42 | try {
43 | Dataset<Row> df = getTableData("testc.inventory.products");
44 | df.show();
45 | return df.filter("_id is not null").count() >= 4;
46 | } catch (Exception e) {
47 | //e.printStackTrace();
48 | return false;
49 | }
50 | });
51 | }
52 |
53 | public static class TestProfile implements QuarkusTestProfile {
54 | @Override
55 | public Map<String, String> getConfigOverrides() {
56 | Map<String, String> config = new HashMap<>();
57 | config.put("quarkus.profile", "mongodb");
58 | config.put("%mongodb.debezium.source.connector.class", "io.debezium.connector.mongodb.MongoDbConnector");
59 | config.put("%mongodb.debezium.transforms.unwrap.type", "io.debezium.connector.mongodb.transforms.ExtractNewDocumentState");
60 | config.put("%mongodb.debezium.transforms.unwrap.add.fields", "op,source.ts_ns,db");
61 | config.put("%mongodb.debezium.sink.iceberg.allow-field-addition", "false");
62 | config.put("%mongodb.debezium.source.topic.prefix", "testc");
63 | config.put("%mongodb.debezium.source.database.include.list", "inventory"); // ok
64 | config.put("%mongodb.debezium.source.collection.include.list", "inventory.products");
65 | // IMPORTANT: MongoDbConnector emits the key field as "id"; rename it to "_id" to match the documents
66 | config.put("%mongodb.debezium.transforms", "unwrap,renamekeyfield");
67 | config.put("%mongodb.debezium.transforms.renamekeyfield.type",
68 | "org.apache.kafka.connect.transforms.ReplaceField$Key");
69 | config.put("%mongodb.debezium.transforms.renamekeyfield.renames", "id:_id");
70 |
71 | return config;
72 | }
73 |
74 | @Override
75 | public String getConfigProfile() {
76 | return "mongodb";
77 | }
78 | }
79 |
80 | }
81 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/SourceMysqlDB.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.testresources;
10 |
11 | import io.quarkus.test.common.QuarkusTestResourceLifecycleManager;
12 |
13 | import java.sql.Connection;
14 | import java.sql.DriverManager;
15 | import java.sql.SQLException;
16 | import java.sql.Statement;
17 | import java.time.Duration;
18 | import java.util.Map;
19 | import java.util.concurrent.ConcurrentHashMap;
20 |
21 | import org.slf4j.Logger;
22 | import org.slf4j.LoggerFactory;
23 | import org.testcontainers.containers.GenericContainer;
24 | import org.testcontainers.containers.wait.strategy.Wait;
25 |
26 | public class SourceMysqlDB implements QuarkusTestResourceLifecycleManager {
27 |
28 | public static final String MYSQL_ROOT_PASSWORD = "debezium";
29 | public static final String MYSQL_USER = "mysqluser";
30 | public static final String MYSQL_PASSWORD = "mysqlpw";
31 | public static final String MYSQL_DEBEZIUM_USER = "debezium";
32 | public static final String MYSQL_DEBEZIUM_PASSWORD = "dbz";
33 | public static final String MYSQL_IMAGE = "debezium/example-mysql:3.0.0.Final";
34 | public static final String MYSQL_HOST = "127.0.0.1";
35 | public static final String MYSQL_DATABASE = "inventory";
36 | public static final Integer MYSQL_PORT_DEFAULT = 3306;
37 | private static final Logger LOGGER = LoggerFactory.getLogger(SourceMysqlDB.class);
38 |
39 | private static final GenericContainer<?> container = new GenericContainer<>(MYSQL_IMAGE)
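  | // mysqld logs "ready for connections" twice (bootstrap during init, then the final server), hence the count of 2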
40 | .waitingFor(Wait.forLogMessage(".*mysqld: ready for connections.*", 2))
41 | .withEnv("MYSQL_USER", MYSQL_USER)
42 | .withEnv("MYSQL_PASSWORD", MYSQL_PASSWORD)
43 | .withEnv("MYSQL_ROOT_PASSWORD", MYSQL_ROOT_PASSWORD)
44 | .withExposedPorts(MYSQL_PORT_DEFAULT)
45 | .withStartupTimeout(Duration.ofSeconds(30));
46 |
47 | public static void runSQL(String query) throws SQLException, ClassNotFoundException {
48 | String url = "jdbc:mysql://" + MYSQL_HOST + ":" + container.getMappedPort(MYSQL_PORT_DEFAULT) + "/" + MYSQL_DATABASE + "?useSSL=false";
49 | try {
50 | Class.forName("com.mysql.cj.jdbc.Driver");
51 | try (Connection con = DriverManager.getConnection(url, MYSQL_USER, MYSQL_PASSWORD);
52 | Statement st = con.createStatement()) {
53 | st.execute(query);
54 | }
55 | } catch (Exception e) {
56 | LOGGER.error("Failed to run query: {}", query, e);
57 | throw e;
58 | }
59 | }
60 |
61 | @Override
62 | public Map<String, String> start() {
63 | container.start();
64 |
65 | Map<String, String> params = new ConcurrentHashMap<>();
66 | params.put("%mysql.debezium.source.database.hostname", MYSQL_HOST);
67 | params.put("%mysql.debezium.source.database.port", container.getMappedPort(MYSQL_PORT_DEFAULT).toString());
68 | params.put("%mysql.debezium.source.database.user", MYSQL_DEBEZIUM_USER);
69 | params.put("%mysql.debezium.source.database.password", MYSQL_DEBEZIUM_PASSWORD);
70 | params.put("%mysql.debezium.source.database.dbname", MYSQL_DATABASE);
71 | return params;
72 | }
73 |
74 | @Override
75 | public void stop() {
76 | if (container != null) {
77 | container.stop();
78 | }
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/SourcePostgresqlDB.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.testresources;
10 |
11 | import io.quarkus.test.common.QuarkusTestResourceLifecycleManager;
12 |
13 | import java.sql.Connection;
14 | import java.sql.DriverManager;
15 | import java.sql.SQLException;
16 | import java.sql.Statement;
17 | import java.time.Duration;
18 | import java.util.Map;
19 | import java.util.concurrent.ConcurrentHashMap;
20 |
21 | import org.slf4j.Logger;
22 | import org.slf4j.LoggerFactory;
23 | import org.testcontainers.containers.GenericContainer;
24 | import org.testcontainers.containers.wait.strategy.Wait;
25 |
26 | public class SourcePostgresqlDB implements QuarkusTestResourceLifecycleManager {
27 |
28 | public static final String POSTGRES_USER = "postgres";
29 | public static final String POSTGRES_PASSWORD = "postgres";
30 | public static final String POSTGRES_DBNAME = "postgres";
31 | public static final String POSTGRES_IMAGE = "debezium/example-postgres:3.0.0.Final";
32 | public static final String POSTGRES_HOST = "localhost";
33 | public static final Integer POSTGRES_PORT_DEFAULT = 5432;
34 | private static final Logger LOGGER = LoggerFactory.getLogger(SourcePostgresqlDB.class);
35 |
36 | private static final GenericContainer<?> container = new GenericContainer<>(POSTGRES_IMAGE)
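  | // "ready to accept connections" is logged twice (initdb bootstrap, then the real server), hence the count of 2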
37 | .waitingFor(Wait.forLogMessage(".*database system is ready to accept connections.*", 2))
38 | .withEnv("POSTGRES_USER", POSTGRES_USER)
39 | .withEnv("POSTGRES_PASSWORD", POSTGRES_PASSWORD)
40 | .withEnv("POSTGRES_DB", POSTGRES_DBNAME)
41 | .withEnv("POSTGRES_INITDB_ARGS", "-E UTF8")
42 | .withEnv("LANG", "en_US.utf8")
43 | .withExposedPorts(POSTGRES_PORT_DEFAULT)
44 | .withStartupTimeout(Duration.ofSeconds(30));
45 |
46 | public static void runSQL(String query) throws SQLException, ClassNotFoundException {
47 | String url = "jdbc:postgresql://" + POSTGRES_HOST + ":" + container.getMappedPort(POSTGRES_PORT_DEFAULT) + "/" + POSTGRES_DBNAME;
48 | try {
49 | Class.forName("org.postgresql.Driver");
50 | try (Connection con = DriverManager.getConnection(url, POSTGRES_USER, POSTGRES_PASSWORD);
51 | Statement st = con.createStatement()) {
52 | st.execute(query);
53 | }
54 | } catch (Exception e) {
55 | LOGGER.error("Failed to run query: {}", query, e);
56 | throw e;
57 | }
59 | }
60 |
61 | @Override
62 | public Map<String, String> start() {
63 | container.start();
64 | try {
65 | SourcePostgresqlDB.runSQL("CREATE EXTENSION hstore;");
66 | } catch (SQLException | ClassNotFoundException e) {
67 | throw new RuntimeException(e);
68 | }
69 |
70 | Map<String, String> params = new ConcurrentHashMap<>();
71 | params.put("debezium.source.database.hostname", POSTGRES_HOST);
72 | params.put("debezium.source.database.port", container.getMappedPort(POSTGRES_PORT_DEFAULT).toString());
73 | params.put("debezium.source.database.user", POSTGRES_USER);
74 | params.put("debezium.source.database.password", POSTGRES_PASSWORD);
75 | params.put("debezium.source.database.dbname", POSTGRES_DBNAME);
76 | return params;
77 | }
78 |
79 | @Override
80 | public void stop() {
81 | if (container != null) {
82 | container.stop();
83 | }
84 | }
85 |
86 | }
87 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/IcebergConfig.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 |
3 | import io.quarkus.runtime.annotations.ConfigRoot;
4 | import io.smallrye.config.ConfigMapping;
5 | import io.smallrye.config.WithDefault;
6 | import io.smallrye.config.WithName;
7 | import org.apache.iceberg.CatalogProperties;
8 |
9 | import java.util.List;
10 | import java.util.Map;
11 | import java.util.Optional;
12 |
13 | import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
14 | import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT;
15 |
16 | @ConfigRoot
17 | @ConfigMapping
18 | public interface IcebergConfig {
19 | String PROP_PREFIX = "debezium.sink.iceberg";
20 |
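  | // Catch-all: every property under "debezium.sink.iceberg.*", used to configure the Iceberg catalog and FileIO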
21 | @WithName(PROP_PREFIX)
22 | Map<String, String> icebergConfigs();
23 |
24 | @WithName("debezium.sink.iceberg.upsert-op-field")
25 | @WithDefault("__op")
26 | String cdcOpField();
27 |
28 | @WithName("debezium.sink.iceberg.upsert-dedup-column")
29 | @WithDefault("__source_ts_ns")
30 | String cdcSourceTsField();
31 |
32 | @WithName("debezium.sink.iceberg.upsert")
33 | @WithDefault("false")
34 | boolean upsert();
35 |
36 | @WithName("debezium.sink.iceberg.upsert-keep-deletes")
37 | @WithDefault("true")
38 | boolean keepDeletes();
39 |
40 | @WithName("debezium.sink.iceberg." + CatalogProperties.WAREHOUSE_LOCATION)
41 | String warehouseLocation();
42 |
43 | @WithName("debezium.sink.iceberg.table-mapper")
44 | @WithDefault("default-mapper")
45 | String tableMapper();
46 |
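  | // Optional regexp rewrite of destination names; e.g. the tests use "\\d" with an empty replacement to strip digits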
47 | @WithName("debezium.sink.iceberg.destination-regexp")
48 | // @WithDefault("")
49 | Optional<String> destinationRegexp();
50 |
51 | @WithName("debezium.sink.iceberg.destination-regexp-replace")
52 | // @WithDefault("")
53 | Optional<String> destinationRegexpReplace();
54 |
55 | @WithName("debezium.sink.iceberg.destination-uppercase-table-names")
56 | @WithDefault("false")
57 | boolean destinationUppercaseTableNames();
58 |
59 | @WithName("debezium.sink.iceberg.destination-lowercase-table-names")
60 | @WithDefault("false")
61 | boolean destinationLowercaseTableNames();
62 |
63 | @WithName("debezium.sink.iceberg.table-prefix")
64 | // @WithDefault("")
65 | Optional<String> tablePrefix();
66 |
67 | @WithName("debezium.sink.iceberg.table-namespace")
68 | @WithDefault("default")
69 | String namespace();
70 |
71 | @WithName("debezium.sink.iceberg.catalog-name")
72 | @WithDefault("default")
73 | String catalogName();
74 |
75 | @WithName("debezium.sink.iceberg.create-identifier-fields")
76 | @WithDefault("true")
77 | boolean createIdentifierFields();
78 |
79 | @WithName("debezium.sink.iceberg." + DEFAULT_FILE_FORMAT)
80 | @WithDefault(DEFAULT_FILE_FORMAT_DEFAULT)
81 | String writeFormat();
82 |
83 | @WithName("debezium.sink.iceberg.allow-field-addition")
84 | @WithDefault("true")
85 | boolean allowFieldAddition();
86 |
87 | @WithName("debezium.sink.iceberg.excluded-columns")
88 | Optional<List<String>> excludedColumns();
89 |
90 | @WithName("debezium.sink.iceberg.io-impl")
91 | @WithDefault("org.apache.iceberg.io.ResolvingFileIO")
92 | String ioImpl();
93 |
94 | @WithName("debezium.sink.iceberg.preserve-required-property")
95 | @WithDefault("false")
96 | boolean preserveRequiredProperty();
97 |
98 | @WithName("debezium.sink.iceberg.nested-as-variant")
99 | @WithDefault("false")
100 | boolean nestedAsVariant();
101 |
102 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerExcludedColumnsTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import io.debezium.server.iceberg.testresources.CatalogNessie;
12 | import io.debezium.server.iceberg.testresources.S3Minio;
13 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
14 | import io.quarkus.test.common.QuarkusTestResource;
15 | import io.quarkus.test.junit.QuarkusTest;
16 | import io.quarkus.test.junit.QuarkusTestProfile;
17 | import io.quarkus.test.junit.TestProfile;
18 | import org.apache.spark.sql.Dataset;
19 | import org.apache.spark.sql.Row;
20 | import org.awaitility.Awaitility;
21 | import org.junit.jupiter.api.Assertions;
22 | import org.junit.jupiter.api.Test;
23 |
24 | import java.time.Duration;
25 | import java.util.Arrays;
26 | import java.util.HashMap;
27 | import java.util.List;
28 | import java.util.Map;
29 |
30 | /**
31 | * Integration test that verifies columns can be excluded from the written iceberg table
32 | *
33 | * @author Ismail Simsek
34 | */
35 | @QuarkusTest
36 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
37 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
38 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
39 | @TestProfile(IcebergChangeConsumerExcludedColumnsTest.TestProfile.class)
40 | public class IcebergChangeConsumerExcludedColumnsTest extends BaseSparkTest {
41 |
42 | @Test
43 | public void testSupportExcludedColumns() throws Exception {
44 | String sql =
45 | "DROP TABLE IF EXISTS inventory.table_with_excluded_column;\n" +
46 | "CREATE TABLE IF NOT EXISTS inventory.table_with_excluded_column (\n" +
47 | " c_id INTEGER ,\n" +
48 | " c_text TEXT ,\n" +
49 | " c_exclude_me TEXT\n" +
50 | ");";
51 | SourcePostgresqlDB.runSQL(sql);
52 | sql = "INSERT INTO inventory.table_with_excluded_column \n" +
53 | "(c_id, c_text, c_exclude_me) \n" +
54 | "VALUES \n" +
55 | "(1, 'one' , 'should_not_write_to_iceberg' ) \n" +
56 | ",(1, 'two' , 'should_not_write_to_iceberg' )";
57 |
58 | SourcePostgresqlDB.runSQL(sql);
59 | Awaitility.await().atMost(Duration.ofSeconds(320)).until(() -> {
60 | try {
61 | Dataset<Row> df = getTableData("testc.inventory.table_with_excluded_column");
62 | df.show(false);
63 | df.schema().printTreeString();
64 |
65 | List<String> columns = Arrays.asList(df.columns());
66 |
67 | Assertions.assertTrue(columns.contains("c_id"));
68 | Assertions.assertTrue(columns.contains("c_text"));
69 | Assertions.assertFalse(columns.contains("c_exclude_me"));
70 | Assertions.assertFalse(columns.contains("__table"));
71 | Assertions.assertFalse(columns.contains("__db"));
72 | return true;
73 | } catch (Exception | AssertionError e) {
74 | return false;
75 | }
76 | });
77 | }
78 |
79 | public static class TestProfile implements QuarkusTestProfile {
80 | @Override
81 | public Map<String, String> getConfigOverrides() {
82 | Map<String, String> config = new HashMap<>();
83 | config.put("debezium.sink.iceberg.excluded-columns", "c_exclude_me,__table,__db");
84 | return config;
85 | }
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/examples/nessie/produce_data.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 | import random
3 | import time
4 | from faker import Faker
5 |
6 | # --- Database Connection Details ---
7 | # Based on your Debezium configuration, the host/port/user/password for the *source* DB
8 | DB_HOST = "localhost"
9 | DB_NAME = "postgres"
10 | DB_USER = "postgres"
11 | DB_PASSWORD = "postgres"
12 | DB_PORT = 5432
13 |
14 | # --- Configuration ---
15 | INSERT_INTERVAL_SECONDS = 10
16 | fake = Faker()
17 |
18 |
19 | def get_db_connection():
20 | """Establishes and returns a PostgreSQL database connection."""
21 | try:
22 | conn = psycopg2.connect(
23 | host=DB_HOST,
24 | database=DB_NAME,
25 | user=DB_USER,
26 | password=DB_PASSWORD,
27 | port=DB_PORT
28 | )
29 | return conn
30 | except Exception as error:
31 | print(f"❌ Error connecting to the database: {error}")
32 | return None
33 |
34 |
35 | def insert_random_customer(conn):
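  | """Insert a random customer into inventory.customers and return the new id."""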
36 | first_name = f"test-{fake.first_name()}"
37 | last_name = fake.last_name()
38 | email = f"{first_name.lower()}.{last_name.lower()}{random.randint(1, 100)}@{fake.domain_name()}"
39 |
40 | insert_query = """
41 | INSERT INTO inventory.customers (first_name, last_name, email)
42 | VALUES (%s, %s, %s)
43 | RETURNING id;
44 | """
45 |
46 | try:
47 | with conn.cursor() as cur:
48 | cur.execute(insert_query, (first_name, last_name, email))
49 | new_id = cur.fetchone()[0]
50 | conn.commit()
51 | print(f"✅ Inserted new customer (ID: {new_id}): {first_name} {last_name}")
52 | return new_id
53 | except Exception as error:
54 | print(f"❌ Error inserting customer data: {error}")
55 | conn.rollback()
56 | raise error
57 |
58 |
59 | def insert_random_order(conn, purchaser_id):
60 | """Inserts a new random order into the inventory.orders table."""
61 | # purchaser_id must reference an existing customer (created just before in main())
62 | order_date = fake.date_time_this_year()
63 | quantity = random.randint(1, 10)
64 | product_id = random.randint(101, 109) # Assumes existing products
65 |
66 | insert_query = """
67 | INSERT INTO inventory.orders (order_date, purchaser, quantity, product_id)
68 | VALUES (%s, %s, %s, %s);
69 | """
70 |
71 | try:
72 | with conn.cursor() as cur:
73 | cur.execute(insert_query, (order_date, purchaser_id, quantity, product_id))
74 | conn.commit()
75 | print(f"✅ Inserted new order for purchaser ID {purchaser_id} (Product: {product_id}, Quantity: {quantity})")
76 | except Exception as error:
77 | print(f"❌ Error inserting order data: {error}")
78 | conn.rollback()
79 |
80 |
81 | def main():
82 | """Main function to run the continuous insertion loop."""
83 | print("🚀 Starting continuous data insertion script...")
84 |
85 | conn = get_db_connection()
  | if conn is None:
  |     return
86 | try:
87 | while True:
88 | print("-" * 30)
89 | purchaser_id = insert_random_customer(conn)
90 | num_orders = random.randint(0, 3)
91 | for _ in range(num_orders):
92 | insert_random_order(conn, purchaser_id)
93 |
94 | print(f"⏳ Waiting for {INSERT_INTERVAL_SECONDS} seconds...")
95 | time.sleep(INSERT_INTERVAL_SECONDS)
96 |
97 | except KeyboardInterrupt:
98 | print("\n\n🛑 Script stopped by user (Ctrl+C).")
99 | finally:
100 | if conn:
101 | conn.close()
102 | print("Database connection closed.")
103 |
104 |
105 | if __name__ == "__main__":
106 | main()
107 |
--------------------------------------------------------------------------------
/examples/lakekeeper/produce_data.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 | import random
3 | import time
4 | from faker import Faker
5 |
6 | # --- Database Connection Details ---
7 | # Based on your Debezium configuration, the host/port/user/password for the *source* DB
8 | DB_HOST = "localhost"
9 | DB_NAME = "postgres"
10 | DB_USER = "postgres"
11 | DB_PASSWORD = "postgres"
12 | DB_PORT = 5432
13 |
14 | # --- Configuration ---
15 | INSERT_INTERVAL_SECONDS = 10
16 | fake = Faker()
17 |
18 |
19 | def get_db_connection():
20 | """Establishes and returns a PostgreSQL database connection."""
21 | try:
22 | conn = psycopg2.connect(
23 | host=DB_HOST,
24 | database=DB_NAME,
25 | user=DB_USER,
26 | password=DB_PASSWORD,
27 | port=DB_PORT
28 | )
29 | return conn
30 | except Exception as error:
31 | print(f"❌ Error connecting to the database: {error}")
32 | return None
33 |
34 |
35 | def insert_random_customer(conn):
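  | """Insert a random customer into inventory.customers and return the new id."""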
36 | first_name = f"test-{fake.first_name()}"
37 | last_name = fake.last_name()
38 | email = f"{first_name.lower()}.{last_name.lower()}{random.randint(1, 100)}@{fake.domain_name()}"
39 |
40 | insert_query = """
41 | INSERT INTO inventory.customers (first_name, last_name, email)
42 | VALUES (%s, %s, %s)
43 | RETURNING id;
44 | """
45 |
46 | try:
47 | with conn.cursor() as cur:
48 | cur.execute(insert_query, (first_name, last_name, email))
49 | new_id = cur.fetchone()[0]
50 | conn.commit()
51 | print(f"✅ Inserted new customer (ID: {new_id}): {first_name} {last_name}")
52 | return new_id
53 | except Exception as error:
54 | print(f"❌ Error inserting customer data: {error}")
55 | conn.rollback()
56 | raise error
57 |
58 |
59 | def insert_random_order(conn, purchaser_id):
60 | """Inserts a new random order into the inventory.orders table."""
61 | # purchaser_id must reference an existing customer (created just before in main())
62 | order_date = fake.date_time_this_year()
63 | quantity = random.randint(1, 10)
64 | product_id = random.randint(101, 109) # Assumes existing products
65 |
66 | insert_query = """
67 | INSERT INTO inventory.orders (order_date, purchaser, quantity, product_id)
68 | VALUES (%s, %s, %s, %s);
69 | """
70 |
71 | try:
72 | with conn.cursor() as cur:
73 | cur.execute(insert_query, (order_date, purchaser_id, quantity, product_id))
74 | conn.commit()
75 | print(f"✅ Inserted new order for purchaser ID {purchaser_id} (Product: {product_id}, Quantity: {quantity})")
76 | except Exception as error:
77 | print(f"❌ Error inserting order data: {error}")
78 | conn.rollback()
79 |
80 |
81 | def main():
82 | """Main function to run the continuous insertion loop."""
83 | print("🚀 Starting continuous data insertion script...")
84 |
85 | conn = get_db_connection()
  | if conn is None:
  |     return
86 | try:
87 | while True:
88 | print("-" * 30)
89 | purchaser_id = insert_random_customer(conn)
90 | num_orders = random.randint(0, 3)
91 | for _ in range(num_orders):
92 | insert_random_order(conn, purchaser_id)
93 |
94 | print(f"⏳ Waiting for {INSERT_INTERVAL_SECONDS} seconds...")
95 | time.sleep(INSERT_INTERVAL_SECONDS)
96 |
97 | except KeyboardInterrupt:
98 | print("\n\n🛑 Script stopped by user (Ctrl+C).")
99 | finally:
100 | if conn:
101 | conn.close()
102 | print("Database connection closed.")
103 |
104 |
105 | if __name__ == "__main__":
106 | main()
107 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/batchsizewait/MaxBatchSizeWaitTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.batchsizewait;
10 |
11 | import io.debezium.server.iceberg.BaseSparkTest;
12 | import io.debezium.server.iceberg.testresources.CatalogNessie;
13 | import io.debezium.server.iceberg.testresources.S3Minio;
14 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
15 | import io.quarkus.test.common.QuarkusTestResource;
16 | import io.quarkus.test.junit.QuarkusTest;
17 | import io.quarkus.test.junit.QuarkusTestProfile;
18 | import io.quarkus.test.junit.TestProfile;
19 | import org.apache.spark.sql.Dataset;
20 | import org.apache.spark.sql.Row;
21 | import org.awaitility.Awaitility;
22 | import org.eclipse.microprofile.config.inject.ConfigProperty;
23 | import org.junit.jupiter.api.Disabled;
24 | import org.junit.jupiter.api.Test;
25 | import org.junit.jupiter.api.condition.DisabledIfEnvironmentVariable;
26 |
27 | import java.time.Duration;
28 | import java.util.HashMap;
29 | import java.util.Map;
30 |
31 | @QuarkusTest
32 | @TestProfile(MaxBatchSizeWaitTest.TestProfile.class)
33 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
34 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
35 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
36 | @DisabledIfEnvironmentVariable(named = "DEBEZIUM_FORMAT_VALUE", matches = "connect")
37 | @Disabled
38 | class MaxBatchSizeWaitTest extends BaseSparkTest {
39 |
40 | @ConfigProperty(name = "debezium.source.max.batch.size", defaultValue = "1000")
41 | Integer maxBatchSize;
42 |
43 |
44 | @Test
45 | public void testBatchsizeWait() throws Exception {
46 | int iteration = 100;
47 | PGCreateTestDataTable();
48 | for (int i = 0; i <= iteration; i++) {
49 | this.PGLoadTestDataTable(maxBatchSize / 2, true);
50 | }
51 | Awaitility.await().atMost(Duration.ofSeconds(180)).until(() -> {
52 | try {
53 | Dataset<Row> df = spark.sql("SELECT substring(input_file_name(),0,260) as input_file, " +
54 | "count(*) as batch_size FROM debeziumevents.debeziumcdc_testc_inventory_test_data group " +
55 | "by 1");
56 | df.show(false);
57 | // each committed batch should contain exactly maxBatchSize records:
58 | // the wait timeout is set very high, so batches are triggered only by hitting the maxBatchSize limit
59 | return df.filter("batch_size = " + maxBatchSize).count() >= 3;
60 | } catch (Exception e) {
61 | //e.printStackTrace();
62 | return false;
63 | }
64 | });
65 | }
66 |
67 | public static class TestProfile implements QuarkusTestProfile {
68 | @Override
69 | public Map getConfigOverrides() {
70 | Map config = new HashMap<>();
71 | // batch-size wait: block the consumer until max.batch.size records accumulate or max-wait-ms elapses
72 | config.put("debezium.sink.batch.batch-size-wait", "MaxBatchSizeWait");
73 | config.put("debezium.source.connector.class", "io.debezium.connector.postgresql.PostgresConnector");
74 | config.put("debezium.source.max.batch.size", "2000");
75 | config.put("debezium.source.max.queue.size", "70000");
76 | //config.put("debezium.source.poll.interval.ms", "1000");
77 | config.put("debezium.sink.batch.batch-size-wait.max-wait-ms", "999000");
78 | config.put("debezium.sink.batch.batch-size-wait.wait-interval-ms", "5000");
79 | config.put("quarkus.log.category.\"io.debezium.server.iceberg.batchsizewait\".level", "DEBUG");
80 | return config;
81 | }
82 | }
83 | }
--------------------------------------------------------------------------------
/examples/nessie/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.7"
2 |
3 | services:
4 | debezium-iceberg:
5 | image: ghcr.io/memiiso/debezium-server-iceberg:latest
6 | container_name: debezium-server-iceberg
7 | networks:
8 | - iceberg_nessie_net
9 | depends_on:
10 | - nessie
11 | - minio
12 | - postgresqlsourcedb
13 | - mc
14 | volumes:
15 | - ./config/application.properties:/debezium/config/application.properties
16 | command: >
17 | bash -c "
18 | mvn dependency:copy -Dartifact=org.apache.iceberg:iceberg-nessie:1.10.0:jar -DoutputDirectory=/debezium/lib/ \
19 | && mvn dependency:copy -Dartifact=org.projectnessie.nessie:nessie-client:0.104.5:jar -DoutputDirectory=/debezium/lib/ \
20 | && mvn dependency:copy -Dartifact=org.projectnessie.nessie:nessie-model:0.104.5:jar -DoutputDirectory=/debezium/lib/ \
21 | && /debezium/run.sh
22 | "
23 | environment:
24 | - AWS_ACCESS_KEY_ID=admin
25 | - AWS_SECRET_ACCESS_KEY=password
26 | - AWS_REGION=us-east-1
27 |
28 | nessie:
29 | image: projectnessie/nessie:latest
30 | container_name: nessie
31 | networks:
32 | - iceberg_nessie_net
33 | ports:
34 | - "19120:19120"
35 | depends_on:
36 | - minio
37 | environment:
38 | - QUARKUS_PROFILE=prod
39 | - QUARKUS_HTTP_PORT=19120
40 | - QUARKUS_LOG_LEVEL=DEBUG
41 | - QUARKUS_OTEL_SDK_DISABLED=true
42 | - NESSIE_CATALOG_DEFAULT_WAREHOUSE=warehouse
43 | - NESSIE_SERVER_AUTHENTICATION_ENABLED=false
44 | - NESSIE_CATALOG_SERVICE_S3_DEFAULT_OPTIONS_ENDPOINT=http://minio:9000
45 | - NESSIE_VERSION_STORE_TYPE=IN_MEMORY
46 | - NESSIE_CATALOG_WAREHOUSES_WAREHOUSE_LOCATION=s3a://warehouse/
47 | - NESSIE_CATALOG_SERVICE_S3_DEFAULT_OPTIONS_ACCESS_KEY=urn:nessie-secret:quarkus:nessie.catalog.secrets.access-key
48 | - NESSIE_CATALOG_SERVICE_S3_DEFAULT_OPTIONS_PATH_STYLE_ACCESS=true
49 | - NESSIE_CATALOG_SERVICE_S3_DEFAULT_OPTIONS_AUTH_TYPE=STATIC
50 | - NESSIE_CATALOG_SECRETS_ACCESS_KEY_NAME=admin
51 | - NESSIE_CATALOG_SECRETS_ACCESS_KEY_SECRET=password
52 | - NESSIE_CATALOG_SERVICE_S3_DEFAULT_OPTIONS_REGION=us-east-1
53 | - NESSIE_CLIENT_API_VERSION=2
54 |
55 | postgresqlsourcedb:
56 | image: debezium/example-postgres:2.5
57 | container_name: postgresql-source-db
58 | environment:
59 | - POSTGRES_USER=postgres
60 | - POSTGRES_PASSWORD=postgres
61 | - POSTGRES_DB=postgres
62 | - POSTGRES_INITDB_ARGS="-E UTF8"
63 | - LANG=en_US.utf8
64 | networks:
65 | - iceberg_nessie_net
66 | ports:
67 | - "5432:5432"
68 |
69 | # storage
70 | minio:
71 | image: minio/minio
72 | container_name: minio
73 | environment:
74 | - MINIO_ROOT_USER=admin
75 | - MINIO_ROOT_PASSWORD=password
76 | - MINIO_DOMAIN=minio
77 | - MINIO_BUCKET=warehouse
78 | networks:
79 | - iceberg_nessie_net
80 | ports:
81 | - "9001:9001"
82 | - "9000:9000"
83 | command: [ "server", "/data", "--console-address", ":9001" ]
84 | mc:
85 | image: minio/mc
86 | container_name: mc
87 | networks:
88 | - iceberg_nessie_net
89 | depends_on:
90 | - minio
91 | environment:
92 | - AWS_ACCESS_KEY_ID=demo
93 | - AWS_SECRET_ACCESS_KEY=password
94 | - AWS_REGION=us-east-1
95 | entrypoint: >
96 | /bin/sh -c "
97 | until (/usr/bin/mc alias set minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
98 | /usr/bin/mc mb --ignore-existing minio/warehouse;
99 | /usr/bin/mc anonymous set public minio/warehouse;
100 | exit 0;
101 | "
102 | networks:
103 | iceberg_nessie_net:
104 | name: iceberg_nessie_net
105 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/testresources/S3Minio.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.testresources;
10 |
11 | import io.debezium.server.iceberg.TestConfigSource;
12 | import io.minio.ListObjectsArgs;
13 | import io.minio.MakeBucketArgs;
14 | import io.minio.MinioClient;
15 | import io.minio.Result;
16 | import io.minio.messages.Bucket;
17 | import io.minio.messages.Item;
18 | import io.quarkus.test.common.QuarkusTestResourceLifecycleManager;
19 | import org.slf4j.Logger;
20 | import org.slf4j.LoggerFactory;
21 | import org.testcontainers.containers.MinIOContainer;
22 | import org.testcontainers.utility.DockerImageName;
23 |
24 | import java.util.List;
25 | import java.util.Map;
26 | import java.util.concurrent.ConcurrentHashMap;
27 |
28 | public class S3Minio implements QuarkusTestResourceLifecycleManager {
29 |
30 | protected static final Logger LOGGER = LoggerFactory.getLogger(S3Minio.class);
31 | static final String DEFAULT_IMAGE = "minio/minio:RELEASE.2025-04-08T15-41-24Z";
32 | public static MinioClient client;
33 |
34 | public static final MinIOContainer container = new MinIOContainer(DockerImageName.parse(DEFAULT_IMAGE))
35 | .withUserName(TestConfigSource.S3_MINIO_ACCESS_KEY)
36 | .withPassword(TestConfigSource.S3_MINIO_SECRET_KEY);
37 |
38 | public static void listFiles() {
39 | LOGGER.info("-----------------------------------------------------------------");
40 | try {
41 | List<Bucket> bucketList = client.listBuckets();
42 | for (Bucket bucket : bucketList) {
43 | System.out.printf("Bucket:%s ROOT\n", bucket.name());
44 | Iterable<Result<Item>> results = client.listObjects(ListObjectsArgs.builder().bucket(bucket.name()).recursive(true).build());
45 | for (Result<Item> result : results) {
46 | Item item = result.get();
47 | System.out.printf("Bucket:%s Item:%s Size:%s\n", bucket.name(), item.objectName(), item.size());
48 | }
49 | }
50 | } catch (Exception e) {
51 | LOGGER.error("Failed listing buckets", e);
52 | }
53 | LOGGER.info("-----------------------------------------------------------------");
54 |
55 | }
56 |
57 | @Override
58 | public void stop() {
59 | container.stop();
60 | }
61 |
62 | public static String getS3WebURL() {
63 | return String.format("http://%s:%s", container.getHost(), container.getMappedPort(9001));
64 | }
65 |
66 | @Override
67 | public Map<String, String> start() {
68 | container.start();
69 | client = MinioClient
70 | .builder()
71 | .endpoint(container.getS3URL())
72 | .credentials(container.getUserName(), container.getPassword())
73 | .build();
74 |
75 | try {
76 | client.ignoreCertCheck();
77 | client.makeBucket(MakeBucketArgs.builder()
78 | .region(TestConfigSource.S3_REGION)
79 | .bucket(TestConfigSource.S3_BUCKET_NAME)
80 | .build());
81 | } catch (Exception e) {
82 | LOGGER.error("Failed creating the test bucket", e);
83 | }
84 | LOGGER.info("Minio Started\nMinio UI: {}\nMinio S3 URL: {}", getS3WebURL(), container.getS3URL());
85 | Map<String, String> config = new ConcurrentHashMap<>();
86 | // FOR JDBC CATALOG
87 | config.put("debezium.sink.iceberg.s3.endpoint", container.getS3URL());
88 | config.put("debezium.sink.iceberg.s3.path-style-access", "true");
89 | config.put("debezium.sink.iceberg.s3.access-key-id", TestConfigSource.S3_MINIO_ACCESS_KEY);
90 | config.put("debezium.sink.iceberg.s3.secret-access-key", TestConfigSource.S3_MINIO_SECRET_KEY);
91 | config.put("debezium.sink.iceberg.client.region", TestConfigSource.S3_REGION);
92 | config.put("debezium.sink.iceberg.io-impl", TestConfigSource.ICEBERG_FILEIO);
93 |
94 | return config;
95 | }
96 |
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Create Release
2 |
3 | on:
4 | push:
5 | tags: [ '*.*.*.*' ]
6 | branches: [ master ]
7 |
8 | env:
9 | SPARK_LOCAL_IP: 127.0.0.1
10 |
11 | jobs:
12 | build-and-release:
13 | name: Build and Release
14 | if: github.repository_owner == 'memiiso'
15 | runs-on: ubuntu-latest
16 | permissions:
17 | contents: write
18 | packages: write
19 | actions: write
20 | steps:
21 | - name: Checkout Repository
22 | uses: actions/checkout@v4
23 |
24 | - name: Set up Java
25 | uses: actions/setup-java@v5
26 | with:
27 | distribution: 'temurin'
28 | java-version: 21
29 | cache: 'maven'
30 |
31 | - name: Determine Release Name
32 | id: set_release_version
33 | run: |
34 | if [[ "${{ github.ref_name }}" == "master" ]]; then
35 | echo "RELEASE_VERSION=latest" >> $GITHUB_ENV
36 | else
37 | echo "RELEASE_VERSION=${{ github.ref_name }}" >> $GITHUB_ENV
38 | fi
39 | shell: bash # Explicitly set shell to bash
40 |
41 | - name: Build Project
42 | run: mvn -B --no-transfer-progress clean package -Passembly --file pom.xml -Drevision=${{ env.RELEASE_VERSION }}
43 |
44 | - name: Delete Existing Release (if any)
45 | run: gh release delete ${{ env.RELEASE_VERSION }} --cleanup-tag --yes
46 | continue-on-error: true
47 | env:
48 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
49 |
50 | - name: Create GitHub Release
51 | id: create_release
52 | uses: softprops/action-gh-release@v2
53 | env:
54 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
55 | with:
56 | name: Release ${{ env.RELEASE_VERSION }}
57 | tag_name: ${{ env.RELEASE_VERSION }}
58 | body: Release ${{ env.RELEASE_VERSION }}
59 | draft: false
60 | prerelease: true
61 | - name: Delete Maven Package from Github
62 | uses: paulushcgcj/delete-github-package@1.0.0
63 | with:
64 | type: maven
65 | name: io.debezium.debezium-server-iceberg
66 | version: ${{ env.RELEASE_VERSION }}
67 | continue-on-error: true
68 | - name: Delete Maven Package Dist from Github
69 | uses: paulushcgcj/delete-github-package@1.0.0
70 | with:
71 | type: maven
72 | name: io.debezium.debezium-server-iceberg-dist
73 | version: ${{ env.RELEASE_VERSION }}
74 | continue-on-error: true
75 | - name: Delete Maven Package Sink from Github
76 | uses: paulushcgcj/delete-github-package@1.0.0
77 | with:
78 | type: maven
79 | name: io.debezium.debezium-server-iceberg-sink
80 | version: ${{ env.RELEASE_VERSION }}
81 | continue-on-error: true
82 | - name: Publish ${{ env.RELEASE_VERSION }} to GitHub Packages
83 | run: mvn --batch-mode clean package -Passembly deploy --file pom.xml -Drevision=${{ env.RELEASE_VERSION }} -Dmaven.test.skip=true
84 | env:
85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86 |
87 | - name: Login to GHCR
88 | uses: docker/login-action@v3
89 | with:
90 | registry: ghcr.io
91 | username: ${{ github.repository_owner }}
92 | password: ${{ secrets.GITHUB_TOKEN }}
93 |
94 | - name: Build and Push Docker Image
95 | uses: docker/build-push-action@v6
96 | with:
97 | context: ./
98 | file: ./Dockerfile
99 | push: true
100 | build-args: |
101 | RELEASE_VERSION=${{ env.RELEASE_VERSION }}
102 | tags: ghcr.io/${{ github.repository_owner }}/debezium-server-iceberg:${{ env.RELEASE_VERSION }}
103 |
104 | - name: Delete Untagged Docker Images
105 | uses: dylanratcliffe/delete-untagged-containers@main
106 | with:
107 | package_name: debezium-server-iceberg
108 | token: ${{ secrets.GITHUB_TOKEN }}
109 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/history/IcebergSchemaHistoryTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg.history;
10 |
11 | import io.debezium.server.iceberg.BaseSparkTest;
12 | import io.debezium.server.iceberg.testresources.CatalogNessie;
13 | import io.debezium.server.iceberg.testresources.S3Minio;
14 | import io.debezium.server.iceberg.testresources.SourceMysqlDB;
15 | import io.quarkus.test.common.QuarkusTestResource;
16 | import io.quarkus.test.junit.QuarkusTest;
17 | import io.quarkus.test.junit.QuarkusTestProfile;
18 | import io.quarkus.test.junit.TestProfile;
19 | import org.apache.spark.sql.Dataset;
20 | import org.apache.spark.sql.Row;
21 | import org.awaitility.Awaitility;
22 | import org.junit.jupiter.api.Test;
23 | import org.junit.jupiter.api.condition.DisabledIfEnvironmentVariable;
24 |
25 | import java.sql.SQLException;
26 | import java.time.Duration;
27 | import java.util.HashMap;
28 | import java.util.Map;
29 |
30 | import static io.debezium.server.iceberg.TestConfigSource.ICEBERG_CATALOG_TABLE_NAMESPACE;
31 |
32 | /**
33 |  * Integration test that verifies Debezium schema history (DDL changes) read from a MySQL database is stored in an Iceberg table.
34 |  *
35 |  * @author Ismail Simsek
36 |  */
37 | @QuarkusTest
38 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
39 | @QuarkusTestResource(value = SourceMysqlDB.class, restrictToAnnotatedClass = true)
40 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
41 | @TestProfile(IcebergSchemaHistoryTest.TestProfile.class)
42 | @DisabledIfEnvironmentVariable(named = "DEBEZIUM_FORMAT_VALUE", matches = "connect")
43 | public class IcebergSchemaHistoryTest extends BaseSparkTest {
44 | @Test
45 | public void testSimpleUpload() throws SQLException, ClassNotFoundException {
46 | String sqlCreate = "CREATE TABLE IF NOT EXISTS inventory.test_schema_history_ddl (" +
47 | " c_id INTEGER ," +
48 | " c_data TEXT," +
49 | " PRIMARY KEY (c_id)" +
50 | " );";
51 | SourceMysqlDB.runSQL(sqlCreate);
52 | String sqlInsert =
53 | "INSERT INTO inventory.test_schema_history_ddl (c_id, c_data ) " +
54 | "VALUES (1,'data-1'),(2,'data-2'),(3,'data-3');";
55 | SourceMysqlDB.runSQL(sqlInsert);
56 |
57 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
58 | try {
59 |         Dataset<Row> ds = getTableData("testc.inventory.test_schema_history_ddl");
60 | return ds.count() >= 2;
61 | } catch (Exception e) {
62 | return false;
63 | }
64 | });
65 |
66 |     // verify the DDL statement is stored in the schema history table
67 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
68 | try {
69 |         Dataset<Row> ds = getTableData(ICEBERG_CATALOG_TABLE_NAMESPACE, "debezium_database_history_storage_table");
70 | ds.show(10, false);
71 | return ds.count() > 1
72 | && ds.where("history_data ILIKE '%CREATE%TABLE%test_schema_history_ddl%'").count() == 1
73 | ;
74 | } catch (Exception e) {
75 | return false;
76 | }
77 | });
78 | }
79 |
80 |
81 | public static class TestProfile implements QuarkusTestProfile {
82 | @Override
83 | public Map<String, String> getConfigOverrides() {
84 | Map<String, String> config = new HashMap<>();
85 | config.put("quarkus.profile", "mysql");
86 | config.put("%mysql.debezium.source.connector.class", "io.debezium.connector.mysql.MySqlConnector");
87 | // config.put("%mysql.debezium.source.table.whitelist", "inventory.*");
88 | config.put("debezium.source.schema.history.internal", "io.debezium.server.iceberg.history.IcebergSchemaHistory");
89 | config.put("debezium.source.schema.history.internal.iceberg.table-name", "debezium_database_history_storage_table");
90 | config.put("debezium.source.table.whitelist", "inventory.test_schema_history_ddl");
91 | return config;
92 | }
93 |
94 | @Override
95 | public String getConfigProfile() {
96 | return "mysql";
97 | }
98 | }
99 | }
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerMysqlTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import com.google.common.collect.Lists;
12 | import io.debezium.jdbc.TemporalPrecisionMode;
13 | import io.debezium.server.iceberg.testresources.CatalogNessie;
14 | import io.debezium.server.iceberg.testresources.S3Minio;
15 | import io.debezium.server.iceberg.testresources.SourceMysqlDB;
16 | import io.quarkus.test.common.QuarkusTestResource;
17 | import io.quarkus.test.junit.QuarkusTest;
18 | import io.quarkus.test.junit.QuarkusTestProfile;
19 | import io.quarkus.test.junit.TestProfile;
20 | import org.apache.iceberg.data.Record;
21 | import org.apache.iceberg.io.CloseableIterable;
22 | import org.awaitility.Awaitility;
23 | import org.junit.jupiter.api.Test;
24 |
25 | import java.time.Duration;
26 | import java.util.HashMap;
27 | import java.util.Map;
28 |
29 | import static org.junit.jupiter.api.Assertions.assertEquals;
30 |
31 | /**
32 | * @author Ismail Simsek
33 | */
34 | @QuarkusTest
35 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
36 | @QuarkusTestResource(value = SourceMysqlDB.class, restrictToAnnotatedClass = true)
37 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
38 | @TestProfile(IcebergChangeConsumerMysqlTest.TestProfile.class)
39 | public class IcebergChangeConsumerMysqlTest extends BaseTest {
40 |
41 | @Test
42 | public void testSimpleUpload() throws Exception {
43 |     assertEquals(true, config.debezium().isEventFlatteningEnabled());
44 |     assertEquals(TemporalPrecisionMode.CONNECT, config.debezium().temporalPrecisionMode());
45 | String sqlCreate = "CREATE TABLE IF NOT EXISTS inventory.test_delete_table (" +
46 | " c_id INTEGER ," +
47 | " c_id2 INTEGER ," +
48 | " c_data TEXT," +
49 | " PRIMARY KEY (c_id, c_id2)" +
50 | " );";
51 | String sqlInsert =
52 | "INSERT INTO inventory.test_delete_table (c_id, c_id2, c_data ) " +
53 | "VALUES (1,1,'data'),(1,2,'data'),(1,3,'data'),(1,4,'data') ;";
54 | String sqlDelete = "DELETE FROM inventory.test_delete_table where c_id = 1 ;";
55 |
56 | SourceMysqlDB.runSQL(sqlCreate);
57 | SourceMysqlDB.runSQL(sqlInsert);
58 | Awaitility.await().atMost(Duration.ofSeconds(60)).until(() -> {
59 | try {
60 |
61 |         CloseableIterable<Record> result = getTableDataV2("testc.inventory.test_delete_table");
62 | return Lists.newArrayList(result).size() == 4;
63 | } catch (Exception e) {
64 | return false;
65 | }
66 | });
67 |
68 | SourceMysqlDB.runSQL(sqlDelete);
69 | SourceMysqlDB.runSQL(sqlInsert);
70 | SourceMysqlDB.runSQL(sqlDelete);
71 | SourceMysqlDB.runSQL(sqlInsert);
72 |
73 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
74 | try {
75 |         CloseableIterable<Record> result = getTableDataV2("testc.inventory.test_delete_table");
76 | //result.forEach(System.out::println);
77 | //System.out.println("======================");
78 | return Lists.newArrayList(result).size() >= 20;
79 | } catch (Exception e) {
80 | return false;
81 | }
82 | });
83 |
84 | }
85 |
86 | public static class TestProfile implements QuarkusTestProfile {
87 | @Override
88 | public Map<String, String> getConfigOverrides() {
89 | Map<String, String> config = new HashMap<>();
90 | config.put("quarkus.profile", "mysql");
91 | config.put("%mysql.debezium.source.connector.class", "io.debezium.connector.mysql.MySqlConnector");
92 | config.put("%mysql.debezium.source.table.whitelist", "inventory.customers,inventory.test_delete_table");
93 | //config.put("%mysql.debezium.source.include.schema.changes", "false");
94 | return config;
95 | }
96 |
97 | @Override
98 | public String getConfigProfile() {
99 | return "mysql";
100 | }
101 | }
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerTestUnwraapped.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import com.google.common.collect.Lists;
12 | import io.debezium.server.iceberg.testresources.CatalogNessie;
13 | import io.debezium.server.iceberg.testresources.S3Minio;
14 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
15 | import io.quarkus.test.common.QuarkusTestResource;
16 | import io.quarkus.test.junit.QuarkusTest;
17 | import io.quarkus.test.junit.QuarkusTestProfile;
18 | import io.quarkus.test.junit.TestProfile;
19 | import org.apache.iceberg.catalog.TableIdentifier;
20 | import org.apache.iceberg.data.Record;
21 | import org.apache.iceberg.io.CloseableIterable;
22 | import org.apache.spark.sql.Dataset;
23 | import org.apache.spark.sql.Row;
24 | import org.awaitility.Awaitility;
25 | import org.junit.jupiter.api.Test;
26 |
27 | import java.time.Duration;
28 | import java.util.HashMap;
29 | import java.util.Map;
30 |
31 | import static io.debezium.server.iceberg.TestConfigSource.ICEBERG_CATALOG_TABLE_NAMESPACE;
32 | import static org.junit.jupiter.api.Assertions.assertEquals;
33 | import static org.junit.jupiter.api.Assertions.assertTrue;
34 |
35 | /**
36 | * Integration test that verifies basic reading from PostgreSQL database and writing to iceberg destination.
37 | *
38 | * @author Ismail Simsek
39 | */
40 | @QuarkusTest
41 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
42 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
43 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
44 | @TestProfile(IcebergChangeConsumerTestUnwraapped.TestProfile.class)
45 | public class IcebergChangeConsumerTestUnwraapped extends BaseSparkTest {
46 |
47 |
48 | @Test
49 | public void testDebeziumConfig() {
50 | assertTrue(config.debezium().transformsConfigs().containsKey("unwrap.type"));
51 |     assertEquals(",", debeziumConfig.transforms());
52 | assertEquals(false, config.debezium().isEventFlatteningEnabled());
53 |
54 |     debeziumConfig.transformsConfigs().forEach((k, v) -> {
55 |       LOGGER.error("{} ==> {}", k, v);
56 |     });
57 | }
58 |
59 | @Test
60 | public void testSimpleUpload() {
61 |
62 |     // make sure it's not unwrapped
63 | assertEquals(config.debezium().isEventFlatteningEnabled(), false);
64 |
65 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
66 | try {
67 | Dataset ds = getTableData("testc.inventory.customers");
68 | ds.show(false);
69 | return ds.count() >= 3;
70 | } catch (Exception e) {
71 | return false;
72 | }
73 | });
74 |
75 | // test nested data(struct) consumed
76 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
77 | try {
78 | Dataset ds = getTableData("testc.inventory.geom");
79 | ds.show(false);
80 | return ds.count() >= 3;
81 | } catch (Exception e) {
82 | return false;
83 | }
84 | });
85 |
86 | Awaitility.await().atMost(Duration.ofSeconds(120)).until(() -> {
87 | try {
88 |         CloseableIterable<Record> d = getTableDataV2(TableIdentifier.of(ICEBERG_CATALOG_TABLE_NAMESPACE, "debezium_offset_storage_table"));
89 | System.out.println(Lists.newArrayList(d));
90 | return Lists.newArrayList(d).size() == 1;
91 | } catch (Exception e) {
92 | return false;
93 | }
94 | });
95 | }
96 |
97 | public static class TestProfile implements QuarkusTestProfile {
98 | @Override
99 | public Map<String, String> getConfigOverrides() {
100 | Map<String, String> config = new HashMap<>();
101 | config.put("debezium.sink.iceberg.write.format.default", "orc");
102 | config.put("debezium.sink.iceberg.destination-regexp", "\\d");
103 | config.put("debezium.source.hstore.handling.mode", "map");
104 | config.put("debezium.transforms", ",");
105 | config.put("debezium.sink.iceberg.create-identifier-fields", "false");
106 | return config;
107 | }
108 | }
109 |
110 | }
111 |
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/assemblies/server-distribution.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
3 |           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |           xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
5 |     <id>distribution</id>
6 |     <formats>
7 |         <format>zip</format>
8 |     </formats>
9 |     <includeBaseDirectory>false</includeBaseDirectory>
10 |     <dependencySets>
11 |         <dependencySet>
12 |             <outputDirectory>${project.parent.artifactId}/lib</outputDirectory>
13 |             <unpack>false</unpack>
14 |             <scope>runtime</scope>
15 |             <useProjectArtifact>false</useProjectArtifact>
16 |             <useTransitiveFiltering>true</useTransitiveFiltering>
17 |             <excludes>
18 |                 <exclude>org.apache.kafka:kafka-tools:*</exclude>
19 |                 <exclude>javax.ws.rs:javax.ws.rs-api:*</exclude>
20 |                 <exclude>org.apache.kafka:connect-file:*</exclude>
21 |                 <exclude>org.glassfish.jersey.*:*:*</exclude>
22 |                 <exclude>org.eclipse.jetty:*:*</exclude>
23 |                 <exclude>org.codehaus.plexus:*:*</exclude>
24 |                 <exclude>log4j:log4j:*</exclude>
25 |                 <exclude>ch.qos.reload4j:reload4j</exclude>
26 |                 <exclude>io.debezium:debezium-scripting</exclude>
27 |                 <exclude>io.debezium:debezium-scripting-languages</exclude>
28 |                 <exclude>io.prometheus.jmx:jmx_prometheus_javaagent:*</exclude>
29 |             </excludes>
30 |         </dependencySet>
31 |         <dependencySet>
32 |             <outputDirectory>${project.parent.artifactId}/lib_metrics</outputDirectory>
33 |             <unpack>false</unpack>
34 |             <scope>runtime</scope>
35 |             <useProjectArtifact>false</useProjectArtifact>
36 |             <useTransitiveFiltering>true</useTransitiveFiltering>
37 |             <includes>
38 |                 <include>io.prometheus.jmx:jmx_prometheus_javaagent:*</include>
39 |             </includes>
40 |         </dependencySet>
41 |         <dependencySet>
42 |             <outputDirectory>${project.parent.artifactId}/lib_opt</outputDirectory>
43 |             <unpack>false</unpack>
44 |             <scope>runtime</scope>
45 |             <useProjectArtifact>false</useProjectArtifact>
46 |             <useTransitiveFiltering>true</useTransitiveFiltering>
47 |             <includes>
48 |                 <include>io.debezium:debezium-scripting:*</include>
49 |                 <include>io.debezium:debezium-scripting-languages:*</include>
50 |             </includes>
51 |         </dependencySet>
52 |     </dependencySets>
53 |     <fileSets>
54 |         <fileSet>
55 |             <directory>${project.basedir}/..</directory>
56 |             <outputDirectory>${project.parent.artifactId}</outputDirectory>
57 |             <includes>
58 |                 <include>README*</include>
59 |                 <include>CHANGELOG*</include>
60 |                 <include>CONTRIBUTE*</include>
61 |                 <include>COPYRIGHT*</include>
62 |                 <include>LICENSE*</include>
63 |             </includes>
64 |             <useDefaultExcludes>true</useDefaultExcludes>
65 |         </fileSet>
66 |         <fileSet>
67 |             <directory>${project.build.directory}</directory>
68 |             <outputDirectory>${project.parent.artifactId}</outputDirectory>
69 |             <includes>
70 |                 <include>*-runner.jar</include>
71 |                 <include>LICENSE-3RD-PARTY.txt</include>
72 |             </includes>
73 |         </fileSet>
74 |         <fileSet>
75 |             <directory>src/main/resources/distro</directory>
76 |             <outputDirectory>${project.parent.artifactId}</outputDirectory>
77 |             <includes>
78 |                 <include>**/*</include>
79 |             </includes>
80 |         </fileSet>
81 |     </fileSets>
82 | </assembly>
--------------------------------------------------------------------------------
/debezium-server-iceberg-dist/src/main/resources/distro/debezium.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import jnius_config
3 | import logging
4 | import os
5 | import sys
6 | ##### logger
7 | import threading
8 | from pathlib import Path
9 |
10 | log = logging.getLogger(name="debezium")
11 | log.setLevel(logging.INFO)
12 | handler = logging.StreamHandler(sys.stdout)
13 | handler.setLevel(logging.INFO)
14 | formatter = logging.Formatter('%(asctime)s %(levelname)s [%(module)s] (%(funcName)s) %(message)s')
15 | handler.setFormatter(formatter)
16 | log.addHandler(handler)
17 |
18 |
19 | #####
20 |
21 | class Debezium:
22 |
23 | def __init__(self, debezium_dir: str = None, conf_dir: str = None, java_home: str = None):
24 | if debezium_dir is None:
25 | self.debezium_server_dir: Path = Path(__file__).resolve().parent
26 | else:
27 | if not Path(debezium_dir).is_dir():
28 | raise Exception("Debezium Server directory '%s' not found" % debezium_dir)
29 | self.debezium_server_dir: Path = Path(debezium_dir)
30 | log.info("Setting Debezium dir to:%s" % self.debezium_server_dir.as_posix())
31 |
32 | if conf_dir is None:
33 | self.conf_dir = self.debezium_server_dir.joinpath("config")
34 | else:
35 | if not Path(conf_dir).is_dir():
36 | raise Exception("Debezium conf directory '%s' not found" % conf_dir)
37 | self.conf_dir: Path = Path(conf_dir)
38 | log.info("Setting conf dir to:%s" % self.conf_dir.as_posix())
39 |
40 | ##### jnius
41 | if java_home:
42 | self.java_home(java_home=java_home)
43 |
44 | DEBEZIUM_CLASSPATH: list = [
45 | self.debezium_server_dir.joinpath('*').as_posix(),
46 | self.debezium_server_dir.joinpath("lib/*").as_posix(),
47 | self.conf_dir.as_posix()]
48 | self.add_classpath(*DEBEZIUM_CLASSPATH)
49 |
50 |     def add_classpath(self, *classpath):
51 | if jnius_config.vm_running:
52 | raise ValueError(
53 | "VM is already running, can't set classpath/options; VM started at %s" % jnius_config.vm_started_at)
54 |
55 |         jnius_config.add_classpath(*classpath)
56 | log.info("VM Classpath: %s" % jnius_config.get_classpath())
57 |
58 | def java_home(self, java_home: str):
59 | if jnius_config.vm_running:
60 | raise ValueError("VM is already running, can't set java home; VM started at" + jnius_config.vm_started_at)
61 |
62 | os.putenv("JAVA_HOME", java_home)
63 | os.environ["JAVA_HOME"] = java_home
64 | log.info("JAVA_HOME set to %s" % java_home)
65 |
66 | # pylint: disable=no-name-in-module
67 | def run(self, *args: str):
68 |
69 | try:
70 | jnius_config.add_options(*args)
71 | log.info("Configured jvm options:%s" % jnius_config.get_options())
72 |
73 | from jnius import autoclass
74 | DebeziumServer = autoclass('io.debezium.server.Main')
75 | _dbz = DebeziumServer()
76 | return _dbz.main()
77 | finally:
78 | from jnius import detach
79 | detach()
80 |
81 |
82 | class DebeziumRunAsyn(threading.Thread):
83 | def __init__(self, debezium_dir: str, java_args: list, java_home: str = None):
84 | threading.Thread.__init__(self)
85 | self.debezium_dir = debezium_dir
86 | self.java_args = java_args
87 | self.java_home = java_home
88 | self._dbz: Debezium = None
89 |
90 | def run(self):
91 | self._dbz = Debezium(debezium_dir=self.debezium_dir, java_home=self.java_home)
92 | return self._dbz.run(*self.java_args)
93 |
94 |
95 | def main():
96 | parser = argparse.ArgumentParser()
97 | parser.add_argument('--debezium_dir', type=str, default=None,
98 | help='Directory of debezium server application')
99 | parser.add_argument('--conf_dir', type=str, default=None,
100 | help='Directory of application.properties')
101 | parser.add_argument('--java_home', type=str, default=None,
102 | help='JAVA_HOME directory')
103 | _args, args = parser.parse_known_args()
104 | ds = Debezium(debezium_dir=_args.debezium_dir, conf_dir=_args.conf_dir, java_home=_args.java_home)
105 | ds.run(*args)
106 |
107 |
108 | if __name__ == '__main__':
109 | main()
110 |
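111 | # Example invocation (a minimal sketch; the paths and JVM option below are hypothetical):
112 | #
113 | #   python debezium.py --debezium_dir=/opt/debezium-server-iceberg \
114 | #       --conf_dir=/opt/debezium-server-iceberg/config \
115 | #       -Dquarkus.http.port=8080
116 | #
117 | # Arguments not recognized by argparse are passed through to the JVM as options.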
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/tableoperator/IcebergTableWriterFactory.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 |
3 | import io.debezium.server.iceberg.GlobalConfig;
4 | import io.debezium.server.iceberg.IcebergUtil;
5 | import jakarta.enterprise.context.Dependent;
6 | import jakarta.inject.Inject;
7 | import org.apache.iceberg.FileFormat;
8 | import org.apache.iceberg.Table;
9 | import org.apache.iceberg.data.GenericAppenderFactory;
10 | import org.apache.iceberg.data.Record;
11 | import org.apache.iceberg.io.BaseTaskWriter;
12 | import org.apache.iceberg.io.OutputFileFactory;
13 | import org.apache.iceberg.io.UnpartitionedWriter;
14 | import org.apache.iceberg.util.PropertyUtil;
15 | import org.slf4j.Logger;
16 | import org.slf4j.LoggerFactory;
17 |
18 | import java.util.Set;
19 |
20 | import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES;
21 | import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT;
22 |
23 | /**
24 |  * Factory that creates the Iceberg TaskWriter for a table. The upsert setting, together with the table's
25 |  * identifier fields (primary key) and partitioning, determines which writer is returned.
26 |  */
26 | @Dependent
27 | public class IcebergTableWriterFactory {
28 | private static final Logger LOGGER = LoggerFactory.getLogger(IcebergTableWriterFactory.class);
29 | @Inject
30 | GlobalConfig config;
31 |
32 |   public BaseTaskWriter<Record> create(Table icebergTable) {
33 |
34 | // file format of the table parquet, orc ...
35 | FileFormat format = IcebergUtil.getTableFileFormat(icebergTable);
36 | // appender factory
37 | GenericAppenderFactory appenderFactory = IcebergUtil.getTableAppender(icebergTable);
38 | OutputFileFactory fileFactory = IcebergUtil.getTableOutputFileFactory(icebergTable, format);
39 |     // target file size for written data files
40 | long targetFileSize =
41 | PropertyUtil.propertyAsLong(
42 | icebergTable.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
43 |
44 |     if (!config.iceberg().upsert()) {
45 |       // RUNNING APPEND MODE
46 |       return appendWriter(icebergTable, format, appenderFactory, fileFactory, targetFileSize);
47 |     } else if (icebergTable.schema().identifierFieldIds().isEmpty()) {
48 |       // UPSERT MODE, BUT THE TABLE HAS NO identifierFieldIds (PRIMARY KEY)
49 |       // without identifier fields upsert is not possible, so fall back to append
50 |       LOGGER.info("Table has no primary key (identifier fields) defined, upsert is not possible; falling back to append!");
51 |
52 |       return appendWriter(icebergTable, format, appenderFactory, fileFactory, targetFileSize);
53 |     } else {
54 |       // UPSERT MODE AND THE TABLE HAS identifierFieldIds (PRIMARY KEY)
55 |       // USE DELTA WRITERS
56 |       return deltaWriter(icebergTable, format, appenderFactory, fileFactory, targetFileSize);
57 |     }
58 | }
59 |
60 |   private BaseTaskWriter<Record> appendWriter(Table icebergTable, FileFormat format, GenericAppenderFactory appenderFactory, OutputFileFactory fileFactory, long targetFileSize) {
61 |
62 | if (icebergTable.spec().isUnpartitioned()) {
63 |       // table is unpartitioned; use the unpartitioned append writer
64 | return new UnpartitionedWriter<>(
65 | icebergTable.spec(), format, appenderFactory, fileFactory, icebergTable.io(), targetFileSize);
66 |
67 | } else {
68 |       // table is partitioned; use the partitioned append writer
69 | return new PartitionedAppendWriter(
70 | icebergTable.spec(), format, appenderFactory, fileFactory, icebergTable.io(), targetFileSize, icebergTable.schema());
71 | }
72 | }
73 |
74 |   private BaseTaskWriter<Record> deltaWriter(Table icebergTable, FileFormat format, GenericAppenderFactory appenderFactory, OutputFileFactory fileFactory, long targetFileSize) {
75 |
76 |     Set<Integer> identifierFieldIds = icebergTable.schema().identifierFieldIds();
77 | if (icebergTable.spec().isUnpartitioned()) {
78 |       // running with upsert mode + unpartitioned table
79 | return new UnpartitionedDeltaWriter(icebergTable.spec(), format, appenderFactory, fileFactory,
80 | icebergTable.io(),
81 | targetFileSize, icebergTable.schema(), identifierFieldIds, config.iceberg().keepDeletes());
82 | } else {
83 | // running with upsert mode + partitioned table
84 | return new PartitionedDeltaWriter(icebergTable.spec(), format, appenderFactory, fileFactory,
85 | icebergTable.io(),
86 | targetFileSize, icebergTable.schema(), identifierFieldIds, config.iceberg().keepDeletes());
87 | }
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/docs/migration.md:
--------------------------------------------------------------------------------
1 | # Migration Guide
2 |
3 | This document provides guidance on upgrading the Debezium Iceberg consumer and handling potential migration challenges.
4 |
5 | ## General Upgrade Process
6 |
7 | Please be aware that each release may include backward-incompatible changes. Thorough testing in a staging environment is strongly recommended before deploying any new version to production.
8 |
9 | If you encounter any issues not covered here, please feel free to report them as GitHub Issues.
10 |
11 | ## Handling Incompatible Data Type Changes
12 |
13 | An incompatible data type change can occur in two main scenarios:
14 | 1. **Upgrading the Connector:** A newer version of Debezium might improve how it handles certain data types. For example, it might change its representation of timestamps from a `long` (epoch milliseconds) to a logical `timestamp` type.
15 | 2. **Source Database Schema Change:** The schema in your source database might change in a way that results in an incompatible type change in the Debezium event.
16 |
17 | In either case, the Debezium Iceberg consumer will fail to write the new data and log an error similar to this:
18 |
19 | ```
20 | java.lang.IllegalArgumentException: Cannot change column type: order_created_ts_ms: long -> timestamp
21 | ```
22 |
23 | To handle such a change, you need to perform a manual migration step on your Iceberg table. The strategy is to rename the old column, allowing the consumer to create a new column with the correct type for incoming data.
24 |
25 | ### Migration Steps
26 |
27 | Let's use the example of a column `order_created_ts_ms` changing from `long` to `timestamp`, which occurs when migrating the consumer from 0.8.x to 0.9.x.
28 |
29 | 1. **Stop the Debezium Server** to prevent further write attempts.
30 |
31 | 2. **Adjust the Table Schema**
32 |
33 | You have two primary options to resolve the schema mismatch. Choose the one that best fits your table size and operational requirements.
34 |
35 | **Option 1: Rewrite the Table (for small tables)**
36 |
37 | If your table is small, you can rewrite its entire contents while converting the problematic column to the new data type. This approach avoids having separate columns for old and new data but can be very expensive for large tables.
38 |
39 | ⚠️ **Warning:** This operation rewrites the entire table and can be very slow and costly. It is generally not recommended for large production tables.
40 |
41 | Using Spark SQL, you can replace the table with the result of a query. The new table schema will be inferred from the `SELECT` statement.
42 |
43 | ```sql
44 | -- Make sure to include ALL columns from the original table to avoid data loss.
45 | INSERT OVERWRITE my_catalog.my_db.my_table
46 | SELECT
47 | id,
48 | -- other_column_1,
49 | -- other_column_2,
50 | timestamp_millis(order_created_ts_ms) AS order_created_ts_ms
51 | FROM my_catalog.my_db.my_table;
52 | ```
53 |
54 | **Option 2: Rename the Column (Recommended for large tables)**
55 |
56 | This is the **recommended approach for most production scenarios**. Renaming a column is a fast, metadata-only operation that does not require rewriting any data files. It is nearly instantaneous, making it ideal for large tables.
57 |
58 | You can use any tool that supports Iceberg table management, such as Spark, Flink, or the Iceberg REST catalog API.
59 |
60 | Using Spark SQL:
61 | ```sql
62 | ALTER TABLE my_catalog.my_db.my_table RENAME COLUMN order_created_ts_ms TO order_created_ts_ms_legacy;
63 | ```
64 |
65 | 3. **Upgrade and Restart the Debezium Server**.
66 |
67 | ### What Happens Next?
68 |
69 | When the consumer processes the new events, it will find that the `order_created_ts_ms` column no longer exists. It will then add it to the table schema as a new column with the correct `timestamp` type.
70 |
71 | After this process, your table will have both columns:
72 | - `order_created_ts_ms_legacy` (`long`): Contains the old data. New rows will have `null` in this column.
73 | - `order_created_ts_ms` (`timestamp`): Contains the new data. Old rows will have `null` in this column.
74 |
75 | This approach preserves all your data while allowing the schema to evolve to accommodate the new data type. You can later decide to backfill the data and consolidate it into a single column if needed.
76 |
77 | Alternatively, you can simply use `COALESCE` to read the consolidated data:
78 | ```sql
79 | SELECT COALESCE(timestamp_millis(order_created_ts_ms_legacy), order_created_ts_ms) AS order_created_ts_ms FROM my_catalog.my_db.my_table
80 | ```
81 |
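82 | ### Backfilling and Dropping the Legacy Column
83 |
84 | If you later decide to consolidate everything into the new column, you can backfill it from the legacy column and then drop the legacy column. The following Spark SQL is a minimal sketch, assuming the column names from the example above and an engine that supports Iceberg row-level updates:
85 |
86 | ```sql
87 | -- Backfill the new timestamp column from the legacy long column (epoch milliseconds)
88 | UPDATE my_catalog.my_db.my_table
89 | SET order_created_ts_ms = timestamp_millis(order_created_ts_ms_legacy)
90 | WHERE order_created_ts_ms IS NULL AND order_created_ts_ms_legacy IS NOT NULL;
91 |
92 | -- Once the backfill is verified, drop the legacy column (a metadata-only operation)
93 | ALTER TABLE my_catalog.my_db.my_table DROP COLUMN order_created_ts_ms_legacy;
94 | ```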
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/test/java/io/debezium/server/iceberg/IcebergChangeConsumerTemporalIsoStringTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * * Copyright memiiso Authors.
4 | * *
5 | * * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
6 | *
7 | */
8 |
9 | package io.debezium.server.iceberg;
10 |
11 | import io.debezium.server.iceberg.testresources.CatalogNessie;
12 | import io.debezium.server.iceberg.testresources.S3Minio;
13 | import io.debezium.server.iceberg.testresources.SourcePostgresqlDB;
14 | import io.quarkus.test.common.QuarkusTestResource;
15 | import io.quarkus.test.junit.QuarkusTest;
16 | import io.quarkus.test.junit.QuarkusTestProfile;
17 | import io.quarkus.test.junit.TestProfile;
18 | import org.apache.spark.sql.Dataset;
19 | import org.apache.spark.sql.Row;
20 | import org.apache.spark.sql.types.DataTypes;
21 | import org.awaitility.Awaitility;
22 | import org.junit.jupiter.api.Assertions;
23 | import org.junit.jupiter.api.Test;
24 |
25 | import java.time.Duration;
26 | import java.util.HashMap;
27 | import java.util.Map;
28 |
29 | /**
30 | * Integration test that verifies basic reading from PostgreSQL database and writing to iceberg destination.
31 | *
32 | * @author Ismail Simsek
33 | */
34 | @QuarkusTest
35 | @QuarkusTestResource(value = S3Minio.class, restrictToAnnotatedClass = true)
36 | @QuarkusTestResource(value = SourcePostgresqlDB.class, restrictToAnnotatedClass = true)
37 | @QuarkusTestResource(value = CatalogNessie.class, restrictToAnnotatedClass = true)
38 | @TestProfile(IcebergChangeConsumerTemporalIsoStringTest.TestProfile.class)
39 | public class IcebergChangeConsumerTemporalIsoStringTest extends BaseSparkTest {
40 |
41 | @Test
42 | public void testConsumingVariousDataTypes() throws Exception {
43 | String sql =
44 | "DROP TABLE IF EXISTS inventory.data_types;\n" +
45 | "CREATE TABLE IF NOT EXISTS inventory.data_types (\n" +
46 | " c_id INTEGER ,\n" +
47 | " c_date DATE,\n" +
48 | " c_time TIME,\n" +
49 | " c_timestamp TIMESTAMP,\n" +
50 | " c_timestamptz TIMESTAMPTZ\n" +
51 | ");";
52 | SourcePostgresqlDB.runSQL(sql);
53 | sql = "INSERT INTO inventory.data_types \n" +
54 | "(c_id, c_date, c_time, c_timestamp, c_timestamptz) \n" +
55 | "VALUES \n" +
56 | "(1, null, null, null, null) \n" +
57 | ",(2, CURRENT_DATE , CURRENT_TIME, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP ) \n" +
58 | ",(3, '2024-01-02'::DATE , '18:04:00'::TIME, '2023-10-11 10:30:00'::timestamp, '2023-11-12 10:30:00+02'::timestamptz ) ";
59 |
60 | SourcePostgresqlDB.runSQL(sql);
61 | Awaitility.await().atMost(Duration.ofSeconds(320)).until(() -> {
62 | try {
63 |         Dataset<Row> df = getTableData("testc.inventory.data_types");
64 | df.show(false);
65 | df.schema().printTreeString();
66 |
67 | Assertions.assertEquals(3, df.count(), "Incorrect row count");
68 | // Validate date field and values
69 | Assertions.assertEquals(DataTypes.DateType, getSchemaField(df, "c_date").dataType());
70 | Assertions.assertEquals(1, df.filter("c_id = 3 AND c_date = to_date('2024-01-02', 'yyyy-MM-dd')").count());
71 | // Validate time field and values
72 | System.out.println(getSchemaField(df, "c_timestamp").dataType());
73 | Assertions.assertEquals(DataTypes.TimestampNTZType, getSchemaField(df, "c_timestamp").dataType());
74 | Assertions.assertEquals(1, df.filter("c_id = 3 AND c_timestamp = to_timestamp('2023-10-11 10:30:00')").count());
75 | Assertions.assertEquals(DataTypes.TimestampType, getSchemaField(df, "c_timestamptz").dataType());
76 | Assertions.assertEquals(1, df.filter("c_id = 3 AND c_timestamptz = to_timestamp('2023-11-12 10:30:00+02')").count());
77 | // time type is kept as string, because spark does not support time type
78 | Assertions.assertEquals(DataTypes.StringType, getSchemaField(df, "c_time").dataType());
79 | Assertions.assertEquals(1, df.filter("c_id = 3 AND c_time = '18:04:00Z'").count());
80 | return true;
81 | } catch (Exception | AssertionError e) {
82 | e.printStackTrace();
83 | return false;
84 | }
85 | });
86 | }
87 |
88 | public static class TestProfile implements QuarkusTestProfile {
89 | @Override
90 | public Map<String, String> getConfigOverrides() {
91 | Map<String, String> config = new HashMap<>();
92 | config.put("debezium.source.hstore.handling.mode", "map");
93 | // config.put("debezium.source.table.whitelist", "inventory.data_types");
94 | config.put("debezium.source.time.precision.mode", "isostring");
95 | return config;
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/python/debezium/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | import threading
5 | from pathlib import Path
6 |
7 |
8 | class LoggerClass:
9 | def __init__(self):
10 | self._log = None
11 |
12 | @property
13 | def log(self):
14 | if not self._log:
15 | self._log = logging.getLogger(name="debezium")
16 | self._log.setLevel(logging.INFO)
17 | if not self._log.hasHandlers():
18 | handler = logging.StreamHandler(sys.stdout)
19 | handler.setLevel(logging.INFO)
20 | formatter = logging.Formatter('%(asctime)s %(levelname)s [%(module)s] (%(funcName)s) %(message)s')
21 | handler.setFormatter(formatter)
22 | self._log.addHandler(handler)
23 | return self._log
24 |
25 |
26 | class Debezium(LoggerClass):
27 |
28 | def __init__(self, debezium_dir: str = None, conf_dir: str = None, java_home: str = None):
29 | super().__init__()
30 | if debezium_dir is None:
31 | self.debezium_server_dir: Path = Path(__file__).resolve().parent
32 | else:
33 | if not Path(debezium_dir).is_dir():
34 | raise Exception("Debezium Server directory '%s' not found" % debezium_dir)
35 | self.debezium_server_dir: Path = Path(debezium_dir)
36 | self.log.info("Setting Debezium dir to:%s" % self.debezium_server_dir.as_posix())
37 |
38 | if conf_dir is None:
39 | self.conf_dir = self.debezium_server_dir.joinpath("config")
40 | else:
41 | if not Path(conf_dir).is_dir():
42 | raise Exception("Debezium conf directory '%s' not found" % conf_dir)
43 | self.conf_dir: Path = Path(conf_dir)
44 | self.log.info("Setting conf dir to:%s" % self.conf_dir.as_posix())
45 |
46 | if java_home:
47 | os.putenv("JAVA_HOME", java_home)
48 | os.environ["JAVA_HOME"] = java_home
49 | self.log.info("JAVA_HOME env variable set to %s" % java_home)
50 |
51 | def _jnius_config(self, *java_args):
52 | import jnius_config
53 |
54 | if jnius_config.vm_running:
55 | raise ValueError(
56 | "VM is already running, can't set classpath/options; VM started at %s" % jnius_config.vm_started_at)
57 |
58 | # NOTE this needs to be set before add_classpath
59 | jnius_config.add_options(*java_args)
60 |
61 | debezium_classpath: list = [
62 | self.debezium_server_dir.joinpath('*').as_posix(),
63 | self.debezium_server_dir.joinpath("lib/*").as_posix(),
64 | self.conf_dir.as_posix()]
65 |
66 | jnius_config.add_classpath(*debezium_classpath)
67 | self.log.info("VM Classpath: %s" % jnius_config.get_classpath())
68 | return jnius_config
69 |
70 | def _sanitize(self, jvm_option: str):
71 | """Sanitizes jvm argument like `my.property.secret=xyz` if it contains secret.
72 | >>> dbz = Debezium()
73 | >>> dbz._sanitize("source.pwd=pswd")
74 | 'source.pwd=*****'
75 | >>> dbz._sanitize("source.password=pswd")
76 | 'source.password=*****'
77 | >>> dbz._sanitize("source.secret=pswd")
78 | 'source.secret=*****'
79 | """
80 | if any(x in jvm_option.lower() for x in ['pwd', 'password', 'secret', 'apikey', 'apitoken']):
81 | head, sep, tail = jvm_option.partition('=')
82 | return head + '=*****'
83 | else:
84 | return jvm_option
85 |
86 | # pylint: disable=no-name-in-module
87 | def run(self, *java_args: str):
88 |         jnius_config = self._jnius_config(*java_args)
89 | try:
90 | __jvm_options: list = [self._sanitize(p) for p in jnius_config.get_options()]
91 | self.log.info("Configured jvm options:%s" % __jvm_options)
92 |
93 | from jnius import autoclass
94 | DebeziumServer = autoclass('io.debezium.server.Main')
95 | _dbz = DebeziumServer()
96 | return _dbz.main()
97 | finally:
98 | from jnius import detach
99 | detach()
100 |
101 |
102 | class DebeziumRunAsyn(threading.Thread):
103 | def __init__(self, debezium_dir: str, java_args: list, java_home: str = None):
104 | threading.Thread.__init__(self)
105 | self.debezium_dir = debezium_dir
106 | self.java_args = java_args
107 | self.java_home = java_home
108 | self._dbz: Debezium = None
109 |
110 | def run(self):
111 | self._dbz = Debezium(debezium_dir=self.debezium_dir, java_home=self.java_home)
112 | return self._dbz.run(*self.java_args)
113 |
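114 | # Example usage (a minimal sketch; the directory and JVM option below are hypothetical):
115 | #
116 | #   runner = DebeziumRunAsyn(debezium_dir="/opt/debezium-server-iceberg",
117 | #                            java_args=["-Dquarkus.http.port=8080"])
118 | #   runner.start()  # boots Debezium Server on a background thread via pyjnius
119 | #   runner.join()   # block until the server exits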
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/converter/EventConverter.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.converter;
2 | import io.debezium.server.iceberg.tableoperator.Operation;
3 | import io.debezium.server.iceberg.tableoperator.RecordWrapper;
4 | import jakarta.annotation.Nullable;
5 | import jakarta.validation.constraints.NotNull;
6 | import org.apache.iceberg.Schema;
7 | import org.apache.iceberg.SortOrder;
8 |
9 | /**
10 | * Interface for converting CDC events from various formats (e.g., Json, Debezium Connect format)
11 | * into Iceberg records and extracting relevant metadata.
12 | */
13 | public interface EventConverter {
14 |
15 | /**
16 | * Extracts the key part of the CDC event.
17 | * The actual type depends on the source event format (e.g., Struct, String).
18 | *
19 |    * @param <T> The expected type of the key.
20 | * @return The event key, or potentially null if the event has no key.
21 | */
22 | @Nullable
23 |   <T> T key();
24 |
25 | /**
26 | * Checks if the event contains key data. Useful for distinguishing
27 | * events with explicit null keys from events without keys.
28 | *
29 | * @return true if key data is present, false otherwise.
30 | */
31 | boolean hasKeyData();
32 |
33 |
34 | /**
35 | * Extracts the value/payload part of the CDC event.
36 | * The actual type depends on the source event format (e.g., Struct, String).
37 | *
38 |    * @param <T> The expected type of the value.
39 | * @return The event value, or null for delete events (tombstones).
40 | */
41 | @Nullable
42 | T value();
43 |
44 | /**
45 | * Extracts the source timestamp of the event.
46 | *
47 |    * @return The source timestamp value; never null.
48 | */
49 | @NotNull
50 | Long cdcSourceTsValue();
51 |
52 | /**
53 | * Extracts the CDC operation type (Create, Update, Delete, Read).
54 | *
55 | * @return The {@link Operation} enum value.
56 | */
57 | @NotNull
58 | Operation cdcOpValue();
59 |
60 | /**
61 | * Provides a converter capable of transforming the event's schema representation
62 | * into an Iceberg {@link Schema}.
63 | *
64 | * @return The schema converter instance.
65 | */
66 | @NotNull
67 | SchemaConverter schemaConverter();
68 |
69 | /**
70 | * Indicates whether this event represents a schema change event rather than a data change event.
71 | *
72 | * @return true if it's a schema change event, false otherwise.
73 | */
74 | boolean isSchemaChangeEvent();
75 |
76 | /**
77 | * Gets the Iceberg {@link Schema} that corresponds to the data payload (`value()`)
78 | * of this specific event, potentially derived from schema information embedded within the event.
79 | * This might differ from the target table's schema if schema evolution is occurring.
80 | * Returns null if the event is a schema change event or has no associated data schema.
81 | *
82 | * @return The Iceberg schema for the event's data, or null.
83 | */
84 | @Nullable
85 | Schema icebergSchema(boolean withIdentifierFields);
86 |
87 | default Schema icebergSchema() {
88 | return icebergSchema(true);
89 | }
90 |
91 | /**
92 | * Gets the Iceberg {@link SortOrder} that corresponds to the data key of this specific event.
93 | * @param schema The Iceberg schema for {@link SortOrder.Builder}.
94 | * @return The Iceberg {@link SortOrder}.
95 | */
96 | @Nullable
97 | SortOrder sortOrder(Schema schema);
98 |
99 | /**
100 | * Gets the destination identifier (e.g., logical table name) for this event.
101 | *
102 | * @return The destination string.
103 | */
104 | @NotNull
105 | String destination();
106 |
107 | /**
108 | * Converts the event data into a {@link RecordWrapper} suitable for direct append operations,
109 | * using the provided target Iceberg schema. This might optimize by only including necessary fields
110 | * for an append (e.g., the 'after' state).
111 | *
112 | * @param schema The target Iceberg schema to conform to.
113 | * @return A {@link RecordWrapper} containing the data formatted for appending.
114 | */
115 | @NotNull
116 |   RecordWrapper convertAsAppend(@NotNull Schema schema);
117 |
118 | /**
119 | * Converts the event data into a {@link RecordWrapper} suitable for general iceberg consumption
120 | * (Create, Update, Delete), using the provided target Iceberg schema.
121 | *
122 | * @param schema The target Iceberg schema to conform to.
123 | * @return A {@link RecordWrapper} containing the data formatted for iceberg table.
124 | */
125 | @NotNull
126 |   RecordWrapper convert(@NotNull Schema schema);
127 | }
128 |
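129 | // A minimal usage sketch (hypothetical; the concrete EventConverter implementation and its
130 | // construction are omitted). Only methods declared by this interface are used:
131 | //
132 | //   EventConverter event = ...; // e.g. built from an incoming Debezium change event
133 | //   if (!event.isSchemaChangeEvent()) {
134 | //     Schema schema = event.icebergSchema();
135 | //     RecordWrapper record = event.convert(schema);
136 | //     // route the record to the writer for event.destination()
137 | //   }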
--------------------------------------------------------------------------------
/debezium-server-iceberg-sink/src/main/java/io/debezium/server/iceberg/converter/DateTimeUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright Debezium Authors.
3 | *
4 | * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
5 | */
6 | package io.debezium.server.iceberg.converter;
7 |
8 | import io.debezium.time.Conversions;
9 | import org.apache.iceberg.util.DateTimeUtil;
10 |
11 | import java.sql.Timestamp;
12 | import java.time.Duration;
13 | import java.time.Instant;
14 | import java.time.LocalDate;
15 | import java.time.LocalDateTime;
16 | import java.time.LocalTime;
17 | import java.time.OffsetDateTime;
18 | import java.time.ZoneId;
19 | import java.time.ZoneOffset;
20 | import java.time.ZonedDateTime;
21 | import java.time.temporal.ChronoUnit;
22 | import java.util.Date;
23 | import java.util.TimeZone;
24 | import java.util.concurrent.TimeUnit;
25 |
26 | public class DateTimeUtils {
27 |
28 | private DateTimeUtils() {
29 | }
30 |
31 | public static Instant toInstantFromNanos(long epochNanos) {
32 | final long epochSeconds = TimeUnit.NANOSECONDS.toSeconds(epochNanos);
33 | final long adjustment = TimeUnit.NANOSECONDS.toNanos(epochNanos % TimeUnit.SECONDS.toNanos(1));
34 | return Instant.ofEpochSecond(epochSeconds, adjustment);
35 | }
36 |
37 | public static ZonedDateTime toZonedDateTimeFromDate(Date date, TimeZone timeZone) {
38 | return toZonedDateTimeFromDate(date, timeZone.toZoneId());
39 | }
40 |
41 | public static ZonedDateTime toZonedDateTimeFromDate(Date date, ZoneId zoneId) {
42 | return date.toInstant().atZone(zoneId);
43 | }
44 |
45 | public static ZonedDateTime toZonedDateTimeFromInstantEpochMicros(long epochMicros) {
46 | return Conversions.toInstantFromMicros(epochMicros).atZone(ZoneOffset.UTC);
47 | }
48 |
49 | public static ZonedDateTime toZonedDateTimeFromInstantEpochNanos(long epochNanos) {
50 | return ZonedDateTime.ofInstant(toInstantFromNanos(epochNanos), ZoneOffset.UTC);
51 | }
52 |
53 | public static LocalDate toLocalDateOfEpochDays(long epochDays) {
54 | return LocalDate.ofEpochDay(epochDays);
55 | }
56 |
57 | public static LocalDate toLocalDateFromDate(Date date) {
58 | return toLocalDateFromInstantEpochMillis(date.getTime());
59 | }
60 |
61 | public static LocalDate toLocalDateFromInstantEpochMillis(long epochMillis) {
62 | return LocalDate.ofInstant(Instant.ofEpochMilli(epochMillis), ZoneOffset.UTC);
63 | }
64 |
65 | public static LocalTime toLocalTimeFromDurationMilliseconds(long durationMillis) {
66 | return LocalTime.ofNanoOfDay(Duration.of(durationMillis, ChronoUnit.MILLIS).toNanos());
67 | }
68 |
69 | public static LocalTime toLocalTimeFromDurationMicroseconds(long durationMicros) {
70 | return LocalTime.ofNanoOfDay(Duration.of(durationMicros, ChronoUnit.MICROS).toNanos());
71 | }
72 |
73 | public static LocalTime toLocalTimeFromDurationNanoseconds(long durationNanos) {
74 | return LocalTime.ofNanoOfDay(Duration.of(durationNanos, ChronoUnit.NANOS).toNanos());
75 | }
76 |
77 | public static LocalTime toLocalTimeFromUtcDate(Date date) {
78 | return date.toInstant().atOffset(ZoneOffset.UTC).toLocalTime();
79 | }
80 |
81 | public static LocalDateTime toLocalDateTimeFromDate(Date date) {
82 | return toLocalDateTimeFromInstantEpochMillis(date.getTime());
83 | }
84 |
85 | public static LocalDateTime toLocalDateTimeFromInstantEpochMillis(long epochMillis) {
86 | return LocalDateTime.ofInstant(Conversions.toInstantFromMillis(epochMillis), ZoneOffset.UTC);
87 | }
88 |
89 | public static LocalDateTime toLocalDateTimeFromInstantEpochMicros(long epochMicros) {
90 | return LocalDateTime.ofInstant(Conversions.toInstantFromMicros(epochMicros), ZoneOffset.UTC);
91 | }
92 |
93 | public static LocalDateTime toLocalDateTimeFromInstantEpochNanos(long epochNanos) {
94 | return LocalDateTime.ofInstant(toInstantFromNanos(epochNanos), ZoneOffset.UTC);
95 | }
96 |
97 | public static Timestamp toTimestampFromMillis(long epochMilliseconds) {
98 | final Instant instant = Conversions.toInstantFromMillis(epochMilliseconds);
99 | final Timestamp ts = new Timestamp(instant.toEpochMilli());
100 | ts.setNanos(instant.getNano());
101 | return ts;
102 | }
103 |
104 | protected static LocalDateTime timestampFromMillis(long millisFromEpoch) {
105 | return ChronoUnit.MILLIS.addTo(DateTimeUtil.EPOCH, millisFromEpoch).toLocalDateTime();
106 | }
107 |
108 | protected static OffsetDateTime timestamptzFromNanos(long nanosFromEpoch) {
109 | return ChronoUnit.NANOS.addTo(DateTimeUtil.EPOCH, nanosFromEpoch);
110 | }
111 |
112 | protected static OffsetDateTime timestamptzFromMillis(long millisFromEpoch) {
113 | return ChronoUnit.MILLIS.addTo(DateTimeUtil.EPOCH, millisFromEpoch);
114 | }
115 | }
116 |
--------------------------------------------------------------------------------