├── .gitignore
├── Makefile
├── README.md
├── assets
│   └── images
│       ├── cs1.png
│       ├── cs2.png
│       ├── cs3.png
│       ├── dag.png
│       ├── data_infra.png
│       ├── det.png
│       ├── det2.png
│       ├── fs.png
│       ├── proj_1.png
│       ├── proj_2.png
│       ├── secret.png
│       └── tn.png
├── connectors
│   ├── pg-src-connector.json
│   └── s3-sink.json
├── container
│   ├── connect
│   │   └── Dockerfile
│   └── datagen
│       └── Dockerfile
├── datagen
│   └── gen_user_payment_data.py
├── docker-compose.yml
├── example
│   └── duckdb_minio_product_scd2.sql
├── postgres
│   └── init.sql
└── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
### Vim ###
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~

### SublimeText ###
# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

# Python
__pycache__

# policy
trust-policy.json


# logs
logs/*
*.log

temp/*

__MACOSX

.dockerignore

*.pem

###### TERRAFORM IGNORE
# Local .terraform directories
**/.terraform/*

# .tfstate files
*.tfstate
*.tfstate.*

# Crash log files
crash.log
crash.*.log

# Exclude all .tfvars files, which are likely to contain sensitive data, such as
# password, private keys, and other secrets. These should not be part of version
# control as they are data points which are potentially sensitive and subject
# to change depending on the environment.
*.tfvars
*.tfvars.json

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
override.tf
override.tf.json
*_override.tf
*_override.tf.json

# Include override files you do wish to add to version control using negated pattern
# !example_override.tf

# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
# example: *tfplan*

# Ignore CLI configuration files
.terraformrc
terraform.rc

minio/data/*
kafka_*
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
up:
	docker compose up -d --build

remove-minio-data:
	rm -rf ./minio/data

compose-down:
	docker compose down -v

down: compose-down remove-minio-data

restart: down up

minio-ui:
	open http://localhost:9001

pg:
	pgcli -h localhost -p 5432 -U postgres -d postgres

s3-sink:
	curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '@./connectors/s3-sink.json'

pg-src:
	curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '@./connectors/pg-src-connector.json'

connectors: pg-src s3-sink
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

* [Change Data Capture](#change-data-capture)
* [Project Design](#project-design)
* [Run on codespaces](#run-on-codespaces)
* [Prerequisites](#prerequisites)
* [Setup](#setup)
* [Analyze data with duckDB](#analyze-data-with-duckdb)
* [References](#references)
* [duckDB](#duckdb)
* [Project Design](#project-design-1)

# Change Data Capture

Repository for the [Change Data Capture with Debezium](https://www.startdataengineering.com/post/change-data-capture-using-debezium-kafka-and-pg/) blog at startdataengineering.com.

# Project Design

# Run on codespaces

You can run this CDC data pipeline using GitHub codespaces. Follow the instructions below.

1. Create codespaces by going to the **[change_data_capture](https://github.com/josephmachado/change_data_capture)** repository, cloning it (or forking it) and then clicking on the `Create codespaces on main` button.
2. Wait for codespaces to start, then in the terminal type `make up && sleep 60 && make connectors && sleep 60`.
3. Wait for the above to complete; it can take up to a couple of minutes.
4. Go to the `ports` tab and click on the link exposing port `9001` to access the Minio (open source S3) UI.
5. In the minio UI, use `minio` and `minio123` as the username and password respectively. In the minio UI you will be able to see the paths `commerce/debezium.commerce.products` and `commerce/debezium.commerce.users`, which have json files in them. The json files contain data about the creates, updates and deletes in the respective products and users tables (a trimmed sample event is shown below).
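Each of these json records is a Debezium change event. The real payload carries more metadata than shown here, but a trimmed, illustrative sketch of a single products event (field values are made up) looks like this:

```json
{
  "value": {
    "before": null,
    "after": {"id": 1, "name": "Jane Doe", "description": "a sample product", "price": 42},
    "source": {"lsn": 37094912, "ts_ms": 1704067200000},
    "op": "c"
  }
}
```

Here `op` is `c`, `u` or `d` for create, update or delete (`r` for snapshot reads); the duckDB queries later in this README parse exactly these `value.before`, `value.after`, `value.op` and `value.source` fields.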
**NOTE**: The screenshots below show the general process to start codespaces; please follow the instructions shown above for this project.

![codespace start](./assets/images/cs1.png)
![codespace make up](./assets/images/cs2.png)
![codespace access ui](./assets/images/cs3.png)

**Note**: Make sure to switch off the codespaces instance, as you only have limited free usage; see docs [here](https://github.com/features/codespaces#pricing).


# Prerequisites

1. [git version >= 2.37.1](https://github.com/git-guides/install-git)
2. [Docker version >= 20.10.17](https://docs.docker.com/engine/install/) and [Docker compose v2 version >= v2.10.2](https://docs.docker.com/compose/#compose-v2-and-the-new-docker-compose-command). Make sure that docker is running using `docker ps`.
3. [pgcli](https://www.pgcli.com/install)

**Windows users**: please set up WSL and a local Ubuntu virtual machine following **[the instructions here](https://ubuntu.com/tutorials/install-ubuntu-on-wsl2-on-windows-10#1-overview)**. Install the above prerequisites in your Ubuntu terminal; if you have trouble installing docker, follow **[the steps here](https://www.digitalocean.com/community/tutorials/how-to-install-and-use-docker-on-ubuntu-22-04#step-1-installing-docker)** (only Step 1 is necessary). Please install the make command with `sudo apt install make -y` (if it's not already present).

# Setup

All the commands shown below are to be run via the terminal (use the Ubuntu terminal for WSL users). We will use docker to set up our containers. Clone and move into the lab repository, as shown below.

```bash
git clone https://github.com/josephmachado/change_data_capture.git
cd change_data_capture
```

We have some helpful make commands to make working with our systems easier. Shown below are the make commands and their definitions.

1. **make up**: Spin up the docker containers for Postgres, the data generator, Kafka Connect, Kafka, & minio (an open source S3 alternative). Note this also sets up the [Postgres tables](./postgres/init.sql) and starts a [python script](./datagen/gen_user_payment_data.py) that creates, updates and deletes rows in those tables.
2. **make connectors**: Set up the Debezium connector to start recording changes from Postgres and another connector to push this data into minio.
3. **make down**: Stop the docker containers.

You can see the commands in [this Makefile](./Makefile). If your terminal does not support **make** commands, please use the commands in [the Makefile](./Makefile) directly. All the commands below assume that you have the docker containers running.

In your terminal, do the following:

```bash
# Make sure docker is running using docker ps
make up # starts the docker containers
sleep 60 # wait 1 minute for all the containers to set up
make connectors # sets up the connectors
sleep 60 # wait 1 minute for some data to be pushed into minio
make minio-ui # opens localhost:9001
```

In the minio UI, use `minio` and `minio123` as the username and password respectively. You will be able to see the paths `commerce/debezium.commerce.products` and `commerce/debezium.commerce.users`, which have json files in them. The json files contain data about the creates, updates and deletes in the respective products and users tables.
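Before opening the minio UI, you can optionally confirm that both connectors were registered and are healthy by querying the Kafka Connect REST API (these are standard Connect endpoints; port `8083` is mapped in the docker-compose file):

```bash
# List registered connectors; expect ["pg-src-connector","s3-sink"]
curl -s -H "Accept:application/json" localhost:8083/connectors/
# Check that a connector and its task are in the RUNNING state
curl -s -H "Accept:application/json" localhost:8083/connectors/pg-src-connector/status
```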

# Analyze data with duckDB

## Access the data in minio via filesystem

We [mount a local folder into the minio container](./docker-compose.yml), which allows us to access the data in minio via the filesystem. Install the requirements (`pip install -r requirements.txt`, which includes duckDB) and start a Python REPL from the repository root, so that the relative path in the query below resolves:

```bash
python
```

Now let's create an SCD2 table for `products` from the data we have in minio. Note we are only looking at ids that have more than one change event, i.e. rows that were updated or deleted after creation (see the `where id in` filter in the below query).

```python
import duckdb as d
d.sql("""
WITH products_create_update_delete AS (
    SELECT
        COALESCE(CAST(json->'value'->'after'->'id' AS INT), CAST(json->'value'->'before'->'id' AS INT)) AS id,
        json->'value'->'before' AS before_row_value,
        json->'value'->'after' AS after_row_value,
        CASE
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"c"' THEN 'CREATE'
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"d"' THEN 'DELETE'
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"u"' THEN 'UPDATE'
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"r"' THEN 'SNAPSHOT'
            ELSE 'INVALID'
        END AS operation_type,
        CAST(json->'value'->'source'->'lsn' AS BIGINT) AS log_seq_num,
        epoch_ms(CAST(json->'value'->'source'->'ts_ms' AS BIGINT)) AS source_timestamp
    FROM
        read_ndjson_objects('minio/data/commerce/debezium.commerce.products/*/*/*.json')
    WHERE
        log_seq_num IS NOT NULL
)
SELECT
    id,
    CAST(after_row_value->'name' AS VARCHAR(255)) AS name,
    CAST(after_row_value->'description' AS TEXT) AS description,
    CAST(after_row_value->'price' AS NUMERIC(10, 2)) AS price,
    source_timestamp AS row_valid_start_timestamp,
    CASE
        WHEN LEAD(source_timestamp, 1) OVER lead_txn_timestamp IS NULL THEN CAST('9999-01-01' AS TIMESTAMP)
        ELSE LEAD(source_timestamp, 1) OVER lead_txn_timestamp
    END AS row_valid_expiration_timestamp
FROM products_create_update_delete
WHERE id IN (SELECT id FROM products_create_update_delete GROUP BY id HAVING COUNT(*) > 1)
WINDOW lead_txn_timestamp AS (PARTITION BY id ORDER BY log_seq_num)
ORDER BY id, row_valid_start_timestamp
LIMIT 200;
""").execute()
```

## Access data via s3 api

We can also access the data via the S3 API in duckdb, as shown in this [example SQL query](./example/duckdb_minio_product_scd2.sql).
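If you prefer to stay in the same Python REPL, you can point DuckDB at minio's S3 API instead of the mounted folder. This is a minimal sketch, assuming DuckDB's `httpfs` extension can be installed in your environment and that minio is reachable at `localhost:9000` with the credentials above (the settings mirror the example SQL file):

```python
import duckdb as d

# httpfs provides DuckDB's s3:// reader
d.sql("INSTALL httpfs")
d.sql("LOAD httpfs")
d.sql("SET s3_region='us-east-1'")
d.sql("SET s3_url_style='path'")
d.sql("SET s3_endpoint='localhost:9000'")
d.sql("SET s3_access_key_id='minio'")
d.sql("SET s3_secret_access_key='minio123'")
d.sql("SET s3_use_ssl=false")

# Sanity check: count the product change events the sink has written so far
d.sql("""
SELECT count(*) AS num_change_events
FROM read_ndjson_objects('s3://commerce/debezium.commerce.products/*/*/*.json')
""").show()
```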

# References

1. [Debezium Postgres docs](https://debezium.io/documentation/reference/2.1/connectors/postgresql.html)
2. [Redpanda CDC example](https://redpanda.com/blog/redpanda-debezium)
3. [duckDB docs](https://duckdb.org/docs/archive/0.2.9/)
4. [Kafka docs](https://kafka.apache.org/20/documentation.html)
5. [Minio DuckDB example](https://blog.min.io/duckdb-and-minio-for-a-modern-data-stack/)
--------------------------------------------------------------------------------
/assets/images/cs1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/cs1.png
--------------------------------------------------------------------------------
/assets/images/cs2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/cs2.png
--------------------------------------------------------------------------------
/assets/images/cs3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/cs3.png
--------------------------------------------------------------------------------
/assets/images/dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/dag.png
--------------------------------------------------------------------------------
/assets/images/data_infra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/data_infra.png
--------------------------------------------------------------------------------
/assets/images/det.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/det.png
--------------------------------------------------------------------------------
/assets/images/det2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/det2.png
--------------------------------------------------------------------------------
/assets/images/fs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/fs.png
--------------------------------------------------------------------------------
/assets/images/proj_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/proj_1.png
--------------------------------------------------------------------------------
/assets/images/proj_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/proj_2.png
--------------------------------------------------------------------------------
/assets/images/secret.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/secret.png
--------------------------------------------------------------------------------
/assets/images/tn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/change_data_capture/30cac18ad3bb2470a2841af6cfb63fce7346bd13/assets/images/tn.png
--------------------------------------------------------------------------------
/connectors/pg-src-connector.json:
--------------------------------------------------------------------------------
{
  "name": "pg-src-connector",
  "config": {
    "connector.class": "io.debezium.connector.postgresql.PostgresConnector",
    "tasks.max": "1",
    "database.hostname": "postgres",
    "database.port": "5432",
    "database.user": "postgres",
    "database.password": "postgres",
    "database.dbname": "postgres",
    "database.server.name": "postgres",
    "database.include.list": "postgres",
    "topic.prefix": "debezium",
    "schema.include.list": "commerce"
  }
}
--------------------------------------------------------------------------------
/connectors/s3-sink.json:
--------------------------------------------------------------------------------
{
  "name": "s3-sink",
  "config": {
    "connector.class": "io.aiven.kafka.connect.s3.AivenKafkaConnectS3SinkConnector",
    "aws.access.key.id": "minio",
    "aws.s3.bucket.name": "commerce",
    "aws.s3.endpoint": "http://minio:9000",
    "aws.s3.region": "us-east-1",
    "aws.secret.access.key": "minio123",
    "format.output.type": "jsonl",
    "topics": "debezium.commerce.users,debezium.commerce.products",
    "file.compression.type": "none",
    "flush.size": "20",
    "file.name.template": "/{{topic}}/{{timestamp:unit=yyyy}}-{{timestamp:unit=MM}}-{{timestamp:unit=dd}}/{{timestamp:unit=HH}}/{{partition:padding=true}}-{{start_offset:padding=true}}.json"
  }
}
--------------------------------------------------------------------------------
/container/connect/Dockerfile:
--------------------------------------------------------------------------------
FROM debezium/connect:latest

USER kafka

RUN curl --create-dirs -LO --output-dir /tmp/connector https://github.com/aiven/s3-connector-for-apache-kafka/releases/download/v2.12.0/aiven-kafka-connect-s3-2.12.0.zip && \
    unzip -o /tmp/connector/aiven-kafka-connect-s3-2.12.0.zip -d /tmp/connector && \
    mv /tmp/connector/aiven-kafka-connect-s3-2.12.0 /kafka/connect/aiven-kafka-connect-s3 && \
    rm /tmp/connector/aiven-kafka-connect-s3-2.12.0.zip
--------------------------------------------------------------------------------
/container/datagen/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.7-slim

RUN pip install \
    psycopg2-binary==2.9.3 \
    faker==13.3.2 \
    boto3


WORKDIR /opt/datagen

# The build context is the repository root (see docker-compose.yml), so this
# path is relative to the repo root, not to this Dockerfile's directory.
COPY datagen /opt/datagen

CMD ["python", "./gen_user_payment_data.py"]
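Note that docker compose builds this image with the repository root as the build context (`context: .` in docker-compose.yml), which is why the `COPY` path above is relative to the repo root. To build the image by hand you would run something like the following from the repo root (the image tag is illustrative):

```bash
docker build -f container/datagen/Dockerfile -t cdc-datagen .
```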
--------------------------------------------------------------------------------
/datagen/gen_user_payment_data.py:
--------------------------------------------------------------------------------
import argparse
import random
from time import sleep

import boto3
import psycopg2
from botocore.client import Config
from botocore.exceptions import ClientError
from faker import Faker


def create_s3_client(access_key, secret_key, endpoint, region):
    """
    Create a boto3 client configured for Minio or any S3-compatible service.

    :param access_key: S3 access key
    :param secret_key: S3 secret key
    :param endpoint: Endpoint URL for the S3 service
    :param region: Region to use
    :return: Configured S3 client
    """
    return boto3.client(
        "s3",
        region_name=region,
        endpoint_url=endpoint,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        config=Config(signature_version="s3v4"),
    )


def create_bucket_if_not_exists(s3_client, bucket_name):
    """
    Check if an S3 bucket exists, and if not, create it.

    :param s3_client: Configured S3 client
    :param bucket_name: Name of the bucket to create or check
    :return: None
    """
    try:
        s3_client.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' already exists.")
    except ClientError as e:
        error_code = int(e.response["Error"]["Code"])
        if error_code == 404:
            # Bucket does not exist, create it
            try:
                s3_client.create_bucket(Bucket=bucket_name)
                print(f"Bucket '{bucket_name}' created.")
            except ClientError as error:
                print(f"Failed to create bucket: {error}")
        else:
            print(f"Error: {e}")


fake = Faker()


def gen_user_product_data(num_records: int) -> None:
    # Open one connection for the whole run instead of reconnecting on every
    # iteration, and close it when done.
    conn = psycopg2.connect(
        "dbname='postgres' user='postgres' host='postgres' password='postgres'"
    )
    curr = conn.cursor()
    for id in range(num_records):
        sleep(0.5)
        curr.execute(
            "INSERT INTO commerce.users (id, username, password) VALUES (%s, %s, %s)",
            (id, fake.user_name(), fake.password()),
        )
        curr.execute(
            "INSERT INTO commerce.products (id, name, description, price) VALUES (%s, %s, %s, %s)",
            (id, fake.name(), fake.text(), fake.random_int(min=1, max=100)),
        )
        conn.commit()

        sleep(0.5)
        # update ~10 % of the time
        if random.randint(1, 100) >= 90:
            curr.execute(
                "UPDATE commerce.users SET username = %s WHERE id = %s",
                (fake.user_name(), id),
            )
            curr.execute(
                "UPDATE commerce.products SET name = %s WHERE id = %s",
                (fake.name(), id),
            )
            conn.commit()

        sleep(0.5)
        # delete ~5 % of the time
        if random.randint(1, 100) >= 95:
            curr.execute("DELETE FROM commerce.users WHERE id = %s", (id,))
            curr.execute("DELETE FROM commerce.products WHERE id = %s", (id,))

        conn.commit()

    curr.close()
    conn.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-n",
        "--num_records",
        type=int,
        help="Number of records to generate",
        default=1000,
    )
    args = parser.parse_args()

    # Credentials and connection info; must match the minio service in docker-compose.yml
    access_key = "minio"
    secret_key = "minio123"
    endpoint = "http://minio:9000"
    region = "us-east-1"

    s3_client = create_s3_client(access_key, secret_key, endpoint, region)
    bucket_name = "commerce"  # must match the bucket used by the s3-sink connector
    create_bucket_if_not_exists(s3_client, bucket_name)
    num_records = args.num_records

    gen_user_product_data(num_records)
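Once the stack is up, you can sanity-check the generator and Debezium from the pgcli session opened by `make pg` (a quick check; the replication slot only appears after `make connectors` registers the source connector):

```sql
-- rows written by the data generator so far
SELECT count(*) FROM commerce.users;
-- Debezium's logical replication slot should exist and be active
SELECT slot_name, plugin, active FROM pg_replication_slots;
```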
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "2"

services:
  zookeeper:
    image: docker.io/bitnami/zookeeper:3.8
    ports:
      - "2181:2181"
    volumes:
      - "zookeeper_data:/bitnami"
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes

  kafka:
    image: docker.io/bitnami/kafka:3.4
    ports:
      - "9093:9093"
    volumes:
      - "kafka_data:/bitnami"
    environment:
      - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes
      - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,EXTERNAL:PLAINTEXT
      - KAFKA_CFG_LISTENERS=CLIENT://:9092,EXTERNAL://:9093
      - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://kafka:9092,EXTERNAL://localhost:9093
      - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=CLIENT
    depends_on:
      - zookeeper

  connect:
    build:
      context: ./container/connect
      dockerfile: Dockerfile
    container_name: connect
    hostname: connect
    ports:
      - "8083:8083"
    environment:
      - GROUP_ID=1
      - CONFIG_STORAGE_TOPIC=my-connect-configs
      - OFFSET_STORAGE_TOPIC=my-connect-offsets
      - BOOTSTRAP_SERVERS=kafka:9092
    links:
      - zookeeper:zookeeper
      - kafka:kafka
    depends_on:
      - zookeeper
      - kafka

  postgres:
    image: debezium/postgres:15
    container_name: postgres
    hostname: postgres
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_DB=postgres
      - POSTGRES_PASSWORD=postgres
    ports:
      - "5432:5432"
    volumes:
      - ./postgres/init.sql:/docker-entrypoint-initdb.d/init.sql
      - postgres_data:/var/lib/postgresql/data

  minio:
    image: quay.io/minio/minio:RELEASE.2022-05-26T05-48-41Z
    hostname: minio
    container_name: minio
    ports:
      - '9000:9000'
      - '9001:9001'
    volumes:
      - ./minio/data:/data
    environment:
      MINIO_ACCESS_KEY: minio
      MINIO_SECRET_KEY: minio123
    command: server --console-address ":9001" /data

  createbuckets:
    image: minio/mc
    container_name: createbuckets
    links:
      - minio:minio
    depends_on:
      - minio
    entrypoint: >
      /bin/sh -c "
      /usr/bin/mc config host add myminio http://minio:9000 minio minio123;
      /usr/bin/mc rm -r --force myminio/commerce;
      /usr/bin/mc mb myminio/commerce;
      /usr/bin/mc policy download myminio/commerce;
      /usr/bin/mc cp /tmp/data myminio/commerce;
      exit 0;
      "

  datagen:
    build:
      context: .
      dockerfile: ./container/datagen/Dockerfile
    entrypoint:
      - python
      - ./gen_user_payment_data.py
    container_name: datagen
    restart: on-failure
    depends_on:
      - postgres

volumes:
  zookeeper_data:
    driver: local
  kafka_data:
    driver: local
  postgres_data:
    driver: local
--------------------------------------------------------------------------------
/example/duckdb_minio_product_scd2.sql:
--------------------------------------------------------------------------------
-- Minio ignores the region, so you can use any region name here
SET s3_region='us-east-1';
SET s3_url_style='path';
-- Set this endpoint to where minio is at
-- if you are following the blog, leave the endpoint as it is
SET s3_endpoint='localhost:9000';
-- this is your minio username
SET s3_access_key_id='minio';
-- this is your minio password
SET s3_secret_access_key='minio123';
SET s3_use_ssl=false;
WITH products_create_update_delete AS (
    SELECT
        COALESCE(CAST(json->'value'->'after'->'id' AS INT), CAST(json->'value'->'before'->'id' AS INT)) AS id,
        json->'value'->'before' AS before_row_value,
        json->'value'->'after' AS after_row_value,
        CASE
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"c"' THEN 'CREATE'
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"d"' THEN 'DELETE'
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"u"' THEN 'UPDATE'
            WHEN CAST(json->'value'->'$.op' AS CHAR(1)) = '"r"' THEN 'SNAPSHOT'
            ELSE 'INVALID'
        END AS operation_type,
        CAST(json->'value'->'source'->'lsn' AS BIGINT) AS log_seq_num,
        epoch_ms(CAST(json->'value'->'source'->'ts_ms' AS BIGINT)) AS source_timestamp
    FROM
        read_ndjson_objects('s3://commerce/debezium.commerce.products/*/*/*.json')
    WHERE
        log_seq_num IS NOT NULL
)
SELECT
    id,
    CAST(after_row_value->'name' AS VARCHAR(255)) AS name,
    CAST(after_row_value->'description' AS TEXT) AS description,
    CAST(after_row_value->'price' AS NUMERIC(10, 2)) AS price,
    source_timestamp AS row_valid_start_timestamp,
    -- LEAD(source_timestamp, 1) OVER lead_txn_timestamp AS row_valid_end_timestamp
    CASE
        WHEN LEAD(source_timestamp, 1) OVER lead_txn_timestamp IS NULL THEN CAST('9999-01-01' AS TIMESTAMP)
        ELSE LEAD(source_timestamp, 1) OVER lead_txn_timestamp
    END AS row_valid_expiration_timestamp
FROM products_create_update_delete
WHERE id IN (SELECT id FROM products_create_update_delete GROUP BY id HAVING COUNT(*) > 1)
WINDOW lead_txn_timestamp AS (PARTITION BY id ORDER BY log_seq_num)
ORDER BY id, row_valid_start_timestamp
LIMIT 200;
--------------------------------------------------------------------------------
/postgres/init.sql:
--------------------------------------------------------------------------------
-- create a commerce schema
CREATE SCHEMA commerce;

-- Use commerce schema
SET search_path TO commerce;

-- create a table named products
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    price REAL NOT NULL
);

-- create a users table
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(255) NOT NULL,
    password VARCHAR(255) NOT NULL
);

ALTER TABLE products REPLICA IDENTITY FULL;

ALTER TABLE users REPLICA IDENTITY FULL;
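-- Note: REPLICA IDENTITY FULL makes Postgres log the complete old row on
-- UPDATE and DELETE, which is what lets Debezium populate the 'before' field
-- in its change events. One way to verify the setting:
-- SELECT relname, relreplident FROM pg_class WHERE relname IN ('products', 'users');
-- ('f' in relreplident means FULL)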
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
duckdb==1.0.0
--------------------------------------------------------------------------------