├── .dockerignore
├── .github
│   ├── assets
│   │   └── Olake.jpg
│   ├── ISSUE_TEMPLATE
│   │   ├── new-feature.md
│   │   └── bug_report.md
│   ├── workflows
│   │   ├── unit-tests.yml
│   │   ├── issue-states.yml
│   │   ├── release-approval.yml
│   │   ├── todo.yml
│   │   ├── golang-ci.yml
│   │   ├── prevent-direct-master-prs.yml
│   │   ├── documentation.yaml
│   │   ├── build-and-release-driver.yml
│   │   ├── draft-release-and-changelog.yml
│   │   ├── security-ci.yaml
│   │   └── integration-tests.yml
│   └── pull_request_template.md
├── .trivyignore
├── examples
│   ├── trino-tablurarest-minio-mysql
│   │   └── etc
│   │       ├── node.properties
│   │       ├── config.properties
│   │       ├── jvm.config
│   │       └── catalog
│   │           └── iceberg.properties
│   ├── presto-tabularest-minio-mysql
│   │   └── etc
│   │       ├── config.properties
│   │       ├── jvm.config
│   │       └── catalog
│   │           └── iceberg.properties
│   └── README.md
├── go.work
├── types
│   ├── sync_mode.go
│   ├── adapter.go
│   ├── kafka_types.go
│   ├── message_type.go
│   ├── interface.go
│   ├── set.go
│   └── stream.go
├── destination
│   ├── iceberg
│   │   ├── local-test
│   │   │   ├── Dockerfile
│   │   │   ├── spark-defaults.conf
│   │   │   └── hive-site.conf
│   │   ├── olake-iceberg-java-writer
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   └── main
│   │   │   │       ├── java
│   │   │   │       │   └── io
│   │   │   │       │       └── debezium
│   │   │   │       │           └── server
│   │   │   │       │               └── iceberg
│   │   │   │       │                   ├── RecordSchemaData.java
│   │   │   │       │                   └── tableoperator
│   │   │   │       │                       ├── Operation.java
│   │   │   │       │                       ├── UnpartitionedDeltaWriter.java
│   │   │   │       │                       ├── PartitionedDeltaWriter.java
│   │   │   │       │                       ├── RecordWrapper.java
│   │   │   │       │                       └── BaseDeltaTaskWriter.java
│   │   │   │       └── resources
│   │   │   │           ├── log4j2.xml
│   │   │   │           └── record_ingest.proto
│   │   │   └── Olake-changes-notice.txt
│   │   └── proto
│   │       └── records_ingest.proto
│   ├── parquet
│   │   ├── config.go
│   │   └── resources
│   │       └── spec.json
│   └── interface.go
├── .githooks
│   ├── pre-commit
│   └── commit-msg
├── drivers
│   ├── kafka
│   │   ├── main.go
│   │   └── internal
│   │       ├── incremental.go
│   │       ├── backfill.go
│   │       └── config.go
│   ├── mysql
│   │   ├── docker-entrypoint-initdb.d
│   │   │   └── init.sql
│   │   ├── internal
│   │   │   ├── testdata
│   │   │   │   ├── source.json
│   │   │   │   ├── destination.json
│   │   │   │   └── test_streams.json
│   │   │   ├── mysql_test.go
│   │   │   ├── incremental.go
│   │   │   ├── datatype_conversion.go
│   │   │   ├── config.go
│   │   │   └── cdc.go
│   │   ├── main.go
│   │   ├── docker-compose.yml
│   │   └── go.mod
│   ├── oracle
│   │   ├── main.go
│   │   ├── internal
│   │   │   ├── cdc.go
│   │   │   ├── incremental.go
│   │   │   ├── datatype_conversion.go
│   │   │   └── config.go
│   │   └── resources
│   │       └── spec.json
│   ├── postgres
│   │   ├── main.go
│   │   ├── internal
│   │   │   ├── testdata
│   │   │   │   ├── source.json
│   │   │   │   ├── destination.json
│   │   │   │   └── test_streams.json
│   │   │   ├── postgres_test.go
│   │   │   ├── incremental.go
│   │   │   ├── config.go
│   │   │   └── datatype_conversion.go
│   │   ├── pg_hba.conf
│   │   ├── docker-compose.yml
│   │   └── go.mod
│   ├── mongodb
│   │   ├── main.go
│   │   ├── internal
│   │   │   ├── testdata
│   │   │   │   ├── source.json
│   │   │   │   ├── destination.json
│   │   │   │   └── test_streams.json
│   │   │   ├── mon_test.go
│   │   │   └── config.go
│   │   ├── docker-compose.yml
│   │   ├── mongodb-init.sh
│   │   ├── go.mod
│   │   └── resources
│   │       └── spec.json
│   └── abstract
│       ├── utils.go
│       ├── interface.go
│       └── backfill.go
├── docker-compose.yml
├── .gitignore
├── Makefile
├── connector.go
├── utils
│   ├── typeutils
│   │   ├── resolver.go
│   │   ├── time.go
│   │   ├── compare.go
│   │   ├── flatten.go
│   │   └── datatype.go
│   ├── error.go
│   ├── ssl.go
│   ├── validation.go
│   ├── safego
│   │   └── safego.go
│   ├── decryption.go
│   ├── ssh.go
│   ├── testutils
│   │   └── test_schema.go
│   └── concurrent.go
├── pkg
│   ├── kafka
│   │   ├── types.go
│   │   └── balancer.go
│   ├── binlog
│   │   └── types.go
│   ├── waljs
│   │   ├── types.go
│   │   ├── filter.go
│   │   └── waljs.go
│   └── jdbc
│       └── reader.go
├── Dockerfile
├── protocol
│   ├── spec.go
│   ├── check.go
│   ├── clear.go
│   └── discover.go
├── CONTRIBUTING.md
└── constants
    └── constants.go
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | test/
--------------------------------------------------------------------------------
/.github/assets/Olake.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datazip-inc/olake/HEAD/.github/assets/Olake.jpg
--------------------------------------------------------------------------------
/.trivyignore:
--------------------------------------------------------------------------------
1 | CVE-2025-30065
2 | CVE-2018-1320
3 | CVE-2019-0205
4 | CVE-2020-13949
5 | CVE-2025-46762
--------------------------------------------------------------------------------
/examples/trino-tablurarest-minio-mysql/etc/node.properties:
--------------------------------------------------------------------------------
1 | node.environment=testing
2 | node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
3 | node.data-dir=/var/lib/trino
--------------------------------------------------------------------------------
/go.work:
--------------------------------------------------------------------------------
1 | go 1.24.0
2 | 
3 | toolchain go1.24.0
4 | 
5 | use (
6 | 	.
7 | 	./drivers/mongodb
8 | 	./drivers/mysql
9 | 	./drivers/postgres
10 | 	./drivers/oracle
11 | 	./drivers/kafka
12 | )
--------------------------------------------------------------------------------
/examples/trino-tablurarest-minio-mysql/etc/config.properties:
--------------------------------------------------------------------------------
1 | coordinator=true
2 | node-scheduler.include-coordinator=true
3 | http-server.http.port=8080
4 | discovery.uri=http://localhost:8080
5 | web-ui.preview.enabled=true
--------------------------------------------------------------------------------
/examples/presto-tabularest-minio-mysql/etc/config.properties:
--------------------------------------------------------------------------------
1 | coordinator=true
2 | node-scheduler.include-coordinator=true
3 | http-server.http.port=8080
4 | discovery-server.enabled=true
5 | discovery.uri=http://localhost:8080
6 | node.environment=test
--------------------------------------------------------------------------------
/types/sync_mode.go:
--------------------------------------------------------------------------------
1 | package types
2 | 
3 | type SyncMode string
4 | 
5 | const (
6 | 	FULLREFRESH SyncMode = "full_refresh"
7 | 	INCREMENTAL SyncMode = "incremental"
8 | 	CDC         SyncMode = "cdc"
9 | 	STRICTCDC   SyncMode = "strict_cdc"
10 | )
--------------------------------------------------------------------------------
/destination/iceberg/local-test/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tabulario/spark-iceberg
2 | 
3 | ENV IVY2_CACHE_DIR=/root/.ivy2
4 | 
5 | RUN mkdir -p ${IVY2_CACHE_DIR}
6 | 
7 | COPY ./data/ivy-cache ${IVY2_CACHE_DIR}
8 | 
9 | COPY spark-defaults.conf /opt/spark/conf/spark-defaults.conf
--------------------------------------------------------------------------------
/.githooks/pre-commit:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # Run golangci-lint on staged files
4 | echo "Running golangci-lint..."
5 | make golangci
6 | 
7 | # Check if the linter passed
8 | if [ $? -ne 0 ]; then
9 |   echo "❌ golangci-lint found issues. Commit aborted."
10 |   exit 1
11 | fi
--------------------------------------------------------------------------------
/drivers/kafka/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | 	"github.com/datazip-inc/olake"
5 | 	driver "github.com/datazip-inc/olake/drivers/kafka/internal"
6 | )
7 | 
8 | func main() {
9 | 	driver := &driver.Kafka{}
10 | 	defer driver.Close()
11 | 	olake.RegisterDriver(driver)
12 | }
13 | 
--------------------------------------------------------------------------------
/examples/presto-tabularest-minio-mysql/etc/jvm.config:
--------------------------------------------------------------------------------
1 | -server
2 | -Xmx1G
3 | -XX:+UseG1GC
4 | -XX:G1HeapRegionSize=32M
5 | -XX:+UseGCOverheadLimit
6 | -XX:+ExplicitGCInvokesConcurrent
7 | -XX:+HeapDumpOnOutOfMemoryError
8 | -XX:+ExitOnOutOfMemoryError
9 | -Djdk.attach.allowAttachSelf=true
--------------------------------------------------------------------------------
/examples/trino-tablurarest-minio-mysql/etc/jvm.config:
--------------------------------------------------------------------------------
1 | -server
2 | -Xmx1G
3 | -XX:+UseG1GC
4 | -XX:G1HeapRegionSize=32M
5 | -XX:+UseGCOverheadLimit
6 | -XX:+ExplicitGCInvokesConcurrent
7 | -XX:+HeapDumpOnOutOfMemoryError
8 | -XX:+ExitOnOutOfMemoryError
9 | -Djdk.attach.allowAttachSelf=true
--------------------------------------------------------------------------------
/drivers/mysql/docker-entrypoint-initdb.d/init.sql:
--------------------------------------------------------------------------------
1 | DROP USER IF EXISTS 'mysql'@'%';
2 | CREATE USER 'mysql'@'%' IDENTIFIED BY 'secret1234';
3 | GRANT REPLICATION CLIENT ON *.* TO 'mysql'@'%';
4 | GRANT REPLICATION SLAVE ON *.* TO 'mysql'@'%';
5 | FLUSH PRIVILEGES;
6 | 
7 | GRANT ALL PRIVILEGES ON *.* TO 'mysql'@'%';
8 | FLUSH PRIVILEGES;
--------------------------------------------------------------------------------
/examples/presto-tabularest-minio-mysql/etc/catalog/iceberg.properties:
--------------------------------------------------------------------------------
1 | connector.name=iceberg
2 | iceberg.catalog.type=rest
3 | iceberg.rest.uri=http://rest:8181
4 | iceberg.catalog.warehouse=s3://warehouse/
5 | hive.s3.path-style-access=true
6 | hive.s3.endpoint=http://minio:9090
7 | hive.s3.aws-access-key=minio
8 | hive.s3.aws-secret-key=minio123
9 | 
--------------------------------------------------------------------------------
/types/adapter.go:
--------------------------------------------------------------------------------
1 | package types
2 | 
3 | type DestinationType string
4 | 
5 | const (
6 | 	Parquet DestinationType = "PARQUET"
7 | 	Iceberg DestinationType = "ICEBERG"
8 | )
9 | 
10 | // TODO: Add validations
11 | type WriterConfig struct {
12 | 	Type         DestinationType `json:"type"`
13 | 	WriterConfig any             `json:"writer"`
14 | }
--------------------------------------------------------------------------------
/drivers/mysql/internal/testdata/source.json:
--------------------------------------------------------------------------------
1 | {
2 |   "hosts": "host.docker.internal",
3 |   "username": "mysql",
4 |   "password": "secret1234",
5 |   "database": "olake_mysql_test",
6 |   "port": 3306,
7 |   "tls_skip_verify": true,
8 |   "update_method": {
9 |     "initial_wait_time": 10
10 |   },
11 |   "max_threads": 30,
12 |   "backoff_retry_count": 4
13 | }
--------------------------------------------------------------------------------
/drivers/mysql/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | 	"github.com/datazip-inc/olake"
5 | 	driver "github.com/datazip-inc/olake/drivers/mysql/internal"
6 | 	_ "github.com/jackc/pgx/v4/stdlib"
7 | )
8 | 
9 | func main() {
10 | 	driver := &driver.MySQL{
11 | 		CDCSupport: false,
12 | 	}
13 | 	defer driver.Close()
14 | 	olake.RegisterDriver(driver)
15 | }
16 | 
--------------------------------------------------------------------------------
/drivers/oracle/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | 	"github.com/datazip-inc/olake"
5 | 	driver "github.com/datazip-inc/olake/drivers/oracle/internal"
6 | 	_ "github.com/jackc/pgx/v4/stdlib"
7 | )
8 | 
9 | func main() {
10 | 	driver := &driver.Oracle{
11 | 		CDCSupport: false,
12 | 	}
13 | 	defer driver.Close()
14 | 	olake.RegisterDriver(driver)
15 | }
16 | 
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   mongodb_sync:
3 |     image: olakego/source-mongodb:latest
4 |     command: >
5 |       sync
6 |       --config /mnt/config/config.json
7 |       --catalog /mnt/config/catalog.json
8 |       --destination /mnt/config/writer.json
9 |       --state /mnt/config/state.json
10 |     volumes:
11 |       - ./drivers/mongodb/examples:/mnt/config
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/new-feature.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: New feature
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: Feature
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | ### Problem
11 | 
12 | What problem is this feature going to solve, and why is this problem important?
13 | 
14 | ### Solution
15 | 
16 | Proposed solution: architecture, implementation details, etc.
17 | 
--------------------------------------------------------------------------------
/drivers/postgres/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | 	"github.com/datazip-inc/olake"
5 | 	driver "github.com/datazip-inc/olake/drivers/postgres/internal"
6 | 	_ "github.com/jackc/pgx/v4/stdlib"
7 | )
8 | 
9 | func main() {
10 | 	driver := &driver.Postgres{
11 | 		CDCSupport: false,
12 | 	}
13 | 	defer driver.CloseConnection()
14 | 	olake.RegisterDriver(driver)
15 | }
16 | 
--------------------------------------------------------------------------------
/drivers/mongodb/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | 	"context"
5 | 
6 | 	"github.com/datazip-inc/olake"
7 | 	driver "github.com/datazip-inc/olake/drivers/mongodb/internal"
8 | 	_ "github.com/jackc/pgx/v4/stdlib"
9 | )
10 | 
11 | func main() {
12 | 	driver := &driver.Mongo{
13 | 		CDCSupport: false,
14 | 	}
15 | 	defer driver.Close(context.Background())
16 | 	olake.RegisterDriver(driver)
17 | }
18 | 
--------------------------------------------------------------------------------
/drivers/mongodb/internal/testdata/source.json:
--------------------------------------------------------------------------------
1 | {
2 |   "hosts": ["host.docker.internal:27017"],
3 |   "username": "mongodb",
4 |   "password": "secure_password123",
5 |   "authdb": "admin",
6 |   "replica-set": "rs0",
7 |   "read-preference": "secondaryPreferred",
8 |   "srv": false,
9 |   "server-ram": 16,
10 |   "database": "olake_mongodb_test",
11 |   "max_threads": 5,
12 |   "backoff_retry_count": 4
13 | }
--------------------------------------------------------------------------------
/.githooks/commit-msg:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | COMMIT_MSG_FILE=$1
4 | COMMIT_MSG=$(cat "$COMMIT_MSG_FILE")
5 | 
6 | # Regex pattern for Conventional Commits
7 | PATTERN="^(feat|fix|docs|style|refactor|test|chore|build|ci|perf|revert)(\(.+\))?: .+"
8 | 
9 | if ! echo "$COMMIT_MSG" | grep -Eq "$PATTERN"; then
echo "$COMMIT_MSG" | grep -Eq "$PATTERN"; then 10 | echo "❌ Commit message must follow Conventional Commits format:" 11 | echo " Example: feat(api): add new endpoint" 12 | exit 1 13 | fi -------------------------------------------------------------------------------- /types/kafka_types.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | // PartitionMetaData holds metadata about a Kafka partition for a specific stream reader 4 | type PartitionMetaData struct { 5 | ReaderID string 6 | Stream StreamInterface 7 | PartitionID int 8 | EndOffset int64 9 | } 10 | 11 | // PartitionKey represents a unique key for a Kafka partition and topic 12 | type PartitionKey struct { 13 | Topic string 14 | Partition int 15 | } 16 | -------------------------------------------------------------------------------- /examples/trino-tablurarest-minio-mysql/etc/catalog/iceberg.properties: -------------------------------------------------------------------------------- 1 | connector.name=iceberg 2 | iceberg.catalog.type=rest 3 | iceberg.rest-catalog.uri=http://rest:8181 4 | iceberg.rest-catalog.warehouse=s3://warehouse/ 5 | 6 | # Native S3 file system (required) 7 | fs.native-s3.enabled=true 8 | s3.endpoint=http://minio:9090 9 | s3.region=us-east-1 10 | s3.path-style-access=true 11 | s3.aws-access-key=minio 12 | s3.aws-secret-key=minio123 13 | 14 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | unit-tests: 9 | name: Run Unit Tests 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - uses: actions/setup-go@v3 15 | with: 16 | check-latest: true 17 | go-version: 1.24.x 18 | 19 | - name: Install Dependencies 20 | run: go mod download 21 | 22 | - name: Run All Unit Tests 23 | run: go test -v ./... 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | olake 2 | .vscode 3 | test 4 | .DS_Store 5 | __debug_bin* 6 | go.sum 7 | go.work.sum 8 | */**/examples 9 | olake-data/ 10 | olake-iceberg-java-writer-0.0.1-SNAPSHOT.jar 11 | destination/iceberg/olake-iceberg-java-writer/target/ 12 | destination/iceberg/olake-iceberg-java-writer/target/**/ 13 | destination/iceberg/local-test/data/ 14 | olake-iceberg-java-writer.jar 15 | local-releaser.sh 16 | destination/iceberg/olake-iceberg-java-writer/.idea/ 17 | destination/iceberg/olake-iceberg-java-writer/dependency-reduced-pom.xml 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GOPATH = $(shell go env GOPATH) 2 | 3 | gomod: 4 | find . -name go.mod -execdir go mod tidy \; 5 | 6 | golangci: 7 | go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest; 8 | $(GOPATH)/bin/golangci-lint run 9 | 10 | trivy: 11 | trivy fs --vuln-type os,library --severity HIGH,CRITICAL . 12 | 13 | gofmt: 14 | gofmt -l -s -w . 
15 | 
16 | pre-commit:
17 | 	chmod +x $(shell pwd)/.githooks/pre-commit
18 | 	chmod +x $(shell pwd)/.githooks/commit-msg
19 | 	git config core.hooksPath $(shell pwd)/.githooks
20 | 
--------------------------------------------------------------------------------
/drivers/postgres/internal/testdata/source.json:
--------------------------------------------------------------------------------
1 | {
2 |   "host": "host.docker.internal",
3 |   "port": 5433,
4 |   "database": "postgres",
5 |   "username": "postgres",
6 |   "password": "secret1234",
7 |   "ssl": {
8 |     "mode": "disable"
9 |   },
10 |   "update_method": {
11 |     "replication_slot": "olake_slot",
12 |     "initial_wait_time": 120
13 |   },
14 |   "reader_batch_size": 1,
15 |   "default_mode": "cdc",
16 |   "max_threads": 30,
17 |   "split_column": "",
18 |   "aws_region": "us-east-1"
19 | }
--------------------------------------------------------------------------------
/.github/workflows/issue-states.yml:
--------------------------------------------------------------------------------
1 | name: 'Issue States'
2 | 
3 | on:
4 |   project_card:
5 |     types: [created, edited, moved]
6 | 
7 | permissions:
8 |   repository-projects: read
9 |   issues: write
10 |   pull-requests: write
11 | 
12 | jobs:
13 |   action:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: dessant/issue-states@v2
17 |         with:
18 |           github-token: ${{ github.token }}
19 |           open-issue-columns: 'To do, In progress, In Beta'
20 |           closed-issue-columns: 'Closed, Done'
21 |           log-output: false
22 | 
--------------------------------------------------------------------------------
/drivers/kafka/internal/incremental.go:
--------------------------------------------------------------------------------
1 | package driver
2 | 
3 | import (
4 | 	"context"
5 | 
6 | 	"github.com/datazip-inc/olake/drivers/abstract"
7 | 	"github.com/datazip-inc/olake/types"
8 | 	"github.com/datazip-inc/olake/utils/logger"
9 | )
10 | 
11 | func (k *Kafka) StreamIncrementalChanges(_ context.Context, _ types.StreamInterface, _ abstract.BackfillMsgFn) error {
12 | 	logger.Debugf("StreamIncrementalChanges not supported for Kafka driver")
13 | 	return nil
14 | }
15 | 
16 | func (k *Kafka) FetchMaxCursorValues(ctx context.Context, stream types.StreamInterface) (any, any, error) {
17 | 	return nil, nil, nil
18 | }
--------------------------------------------------------------------------------
/drivers/mongodb/internal/testdata/destination.json:
--------------------------------------------------------------------------------
1 | {
2 |   "type": "ICEBERG",
3 |   "writer": {
4 |     "catalog_type": "jdbc",
5 |     "jdbc_url": "jdbc:postgresql://host.docker.internal:5432/iceberg",
6 |     "jdbc_username": "iceberg",
7 |     "jdbc_password": "password",
8 |     "iceberg_s3_path": "s3a://warehouse",
9 |     "s3_endpoint": "http://host.docker.internal:9000",
10 |     "s3_use_ssl": false,
11 |     "s3_path_style": true,
12 |     "aws_access_key": "admin",
13 |     "aws_secret_key": "password",
14 |     "iceberg_db": "olake_iceberg",
15 |     "aws_region": "us-east-1"
16 |   }
17 | }
--------------------------------------------------------------------------------
/types/message_type.go:
--------------------------------------------------------------------------------
1 | package types
2 | 
3 | type MessageType string
4 | 
5 | const (
6 | 	LogMessage              MessageType = "LOG"
7 | 	ConnectionStatusMessage MessageType = "CONNECTION_STATUS"
8 | 	StateMessage            MessageType = "STATE"
9 | 	RecordMessage           MessageType = "RECORD"
10 | 	CatalogMessage          MessageType = "CATALOG"
11 | 	SpecMessage             MessageType = "SPEC"
12 | 	ActionMessage           MessageType = "ACTION"
13 | )
14 | 
15 | type ConnectionStatus string
16 | 
17 | const (
18 | 	ConnectionSucceed ConnectionStatus = "SUCCEEDED"
19 | 	ConnectionFailed  ConnectionStatus = "FAILED"
20 | )
21 | 
--------------------------------------------------------------------------------
/drivers/mysql/internal/testdata/destination.json:
--------------------------------------------------------------------------------
1 | {
2 |   "type": "ICEBERG",
3 |   "writer": {
4 |     "catalog_type": "jdbc",
5 |     "jdbc_url": "jdbc:postgresql://host.docker.internal:5432/iceberg",
6 |     "jdbc_username": "iceberg",
7 |     "jdbc_password": "password",
8 |     "iceberg_s3_path": "s3a://warehouse",
9 |     "s3_endpoint": "http://host.docker.internal:9000",
10 |     "s3_use_ssl": false,
11 |     "s3_path_style": true,
12 |     "aws_access_key": "admin",
13 |     "aws_secret_key": "password",
14 |     "iceberg_db": "olake_iceberg",
15 |     "aws_region": "us-east-1"
16 |   }
17 | }
18 | 
--------------------------------------------------------------------------------
/drivers/postgres/internal/testdata/destination.json:
--------------------------------------------------------------------------------
1 | {
2 |   "type": "ICEBERG",
3 |   "writer": {
4 |     "catalog_type": "jdbc",
5 |     "jdbc_url": "jdbc:postgresql://host.docker.internal:5432/iceberg",
6 |     "jdbc_username": "iceberg",
7 |     "jdbc_password": "password",
8 |     "iceberg_s3_path": "s3a://warehouse",
9 |     "s3_endpoint": "http://host.docker.internal:9000",
10 |     "s3_use_ssl": false,
11 |     "s3_path_style": true,
12 |     "aws_access_key": "admin",
13 |     "aws_secret_key": "password",
14 |     "iceberg_db": "olake_iceberg",
15 |     "aws_region": "us-east-1"
16 |   }
17 | }
18 | 
--------------------------------------------------------------------------------
/drivers/kafka/internal/backfill.go:
--------------------------------------------------------------------------------
1 | package driver
2 | 
3 | import (
4 | 	"context"
5 | 	"fmt"
6 | 
7 | 	"github.com/datazip-inc/olake/destination"
8 | 	"github.com/datazip-inc/olake/drivers/abstract"
9 | 	"github.com/datazip-inc/olake/types"
10 | )
11 | 
12 | func (k *Kafka) GetOrSplitChunks(_ context.Context, _ *destination.WriterPool, _ types.StreamInterface) (*types.Set[types.Chunk], error) {
13 | 	return nil, fmt.Errorf("GetOrSplitChunks not supported for Kafka driver")
14 | }
15 | 
16 | func (k *Kafka) ChunkIterator(_ context.Context, _ types.StreamInterface, _ types.Chunk, _ abstract.BackfillMsgFn) error {
17 | 	return fmt.Errorf("ChunkIterator not supported for Kafka driver")
18 | }
19 | 
--------------------------------------------------------------------------------
/destination/parquet/config.go:
--------------------------------------------------------------------------------
1 | package parquet
2 | 
3 | import (
4 | 	"github.com/datazip-inc/olake/utils"
5 | )
6 | 
7 | type Config struct {
8 | 	Path      string `json:"local_path,omitempty"` // Local file path (for local file system usage)
9 | 	Bucket    string `json:"s3_bucket,omitempty"`
10 | 	Region    string `json:"s3_region,omitempty"`
11 | 	AccessKey string `json:"s3_access_key,omitempty"`
12 | 	SecretKey string `json:"s3_secret_key,omitempty"`
13 | 	Prefix    string `json:"s3_path,omitempty"`
14 | 	// S3 endpoint for custom S3-compatible services (like MinIO)
15 | 	S3Endpoint string `json:"s3_endpoint,omitempty"`
16 | }
17 | 
18 | func (c *Config) Validate() error {
19 | 	return utils.Validate(c)
20 | }
21 | 
--------------------------------------------------------------------------------
/connector.go:
--------------------------------------------------------------------------------
1 | package olake
2 | 
3 | import (
4 | 	"os"
5 | 
6 | 	_ "github.com/datazip-inc/olake/destination/iceberg" // registering iceberg destination
7 | 	_ "github.com/datazip-inc/olake/destination/parquet" // registering parquet destination
8 | 	"github.com/datazip-inc/olake/drivers/abstract"
9 | 	protocol "github.com/datazip-inc/olake/protocol"
10 | 	"github.com/datazip-inc/olake/utils/logger"
11 | 	"github.com/datazip-inc/olake/utils/safego"
12 | )
13 | 
14 | func RegisterDriver(driver abstract.DriverInterface) {
15 | 	defer safego.Recovery(true)
16 | 
17 | 	// Execute the root command
18 | 	err := protocol.CreateRootCommand(true, driver).Execute()
19 | 	if err != nil {
20 | 		logger.Fatal(err)
21 | 	}
22 | 
23 | 	os.Exit(0)
24 | }
25 | 
--------------------------------------------------------------------------------
/utils/typeutils/resolver.go:
--------------------------------------------------------------------------------
1 | package typeutils
2 | 
3 | import "github.com/datazip-inc/olake/types"
4 | 
5 | func Resolve(stream *types.Stream, objects ...map[string]interface{}) error {
6 | 	allfields := Fields{}
7 | 
8 | 	for _, object := range objects {
9 | 		fields := Fields{}
10 | 		// apply default typecast and define column types
11 | 		for k, v := range object {
12 | 			fields[k] = NewField(TypeFromValue(v))
13 | 		}
14 | 
15 | 		for fieldName, field := range allfields {
16 | 			if _, found := object[fieldName]; !found {
17 | 				field.setNullable()
18 | 			}
19 | 		}
20 | 
21 | 		allfields.Merge(fields)
22 | 	}
23 | 
24 | 	for column, field := range allfields {
25 | 		stream.UpsertField(column, *field.dataType, field.isNullable())
26 | 	}
27 | 
28 | 	return nil
29 | }
30 | 
--------------------------------------------------------------------------------
/drivers/mysql/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 | 
3 | services:
4 |   mysql:
5 |     image: mysql:8.0
6 |     container_name: olake_mysql-test
7 |     restart: unless-stopped
8 |     environment:
9 |       MYSQL_DATABASE: olake_mysql_test
10 |       MYSQL_USER: mysql
11 |       MYSQL_PASSWORD: secret1234
12 |       MYSQL_ROOT_PASSWORD: root1234
13 |     ports:
14 |       - "3306:3306"
15 |     volumes:
16 |       - ./docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d
17 |     command: >
18 |       --server-id=1
19 |       --log-bin=mysql-bin
20 |       --binlog-format=ROW
21 |       --binlog-row-image=FULL
22 |       --binlog_row_metadata=FULL
23 |       --max_allowed_packet=256M
24 |       --innodb_buffer_pool_size=512M
25 |       --default-authentication-plugin=mysql_native_password
--------------------------------------------------------------------------------
/drivers/abstract/utils.go:
--------------------------------------------------------------------------------
1 | package abstract
2 | 
3 | import (
4 | 	"strings"
5 | 	"time"
6 | 
7 | 	"github.com/datazip-inc/olake/constants"
8 | 	"github.com/datazip-inc/olake/utils/logger"
9 | )
10 | 
11 | func RetryOnBackoff(attempts int, sleep time.Duration, f func() error) (err error) {
12 | 	for cur := 0; cur < attempts; cur++ {
13 | 		if err = f(); err == nil {
14 | 			return nil
15 | 		}
16 | 
17 | 		// check if error is non retryable
18 | 		for _, nonRetryableError := range constants.NonRetryableErrors {
19 | 			if strings.Contains(err.Error(), nonRetryableError) {
20 | 				return err
21 | 			}
22 | 		}
23 | 
24 | 		if attempts > 1 && cur != attempts-1 {
25 | 			logger.Infof("retry attempt[%d], retrying after %.2f seconds due to err: %s", cur+1, sleep.Seconds(), err)
26 | 			time.Sleep(sleep)
27 | 			sleep = sleep * 2
28 | 		}
29 | 	}
30 | 
31 | 	return err
32 | }
33 | 
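A quick aside on the helper above: RetryOnBackoff doubles the sleep window after every failed attempt and gives up early on anything listed in constants.NonRetryableErrors. A minimal sketch of wrapping a flaky operation with it — the failing connect function and the durations are illustrative, not taken from the repository:

```go
package main

import (
	"fmt"
	"time"

	"github.com/datazip-inc/olake/drivers/abstract"
)

func main() {
	attempt := 0
	// Illustrative flaky operation: fails twice, then succeeds.
	connect := func() error {
		attempt++
		if attempt < 3 {
			return fmt.Errorf("transient network error (attempt %d)", attempt)
		}
		return nil
	}

	// Up to 4 attempts; pauses of 1s, 2s, 4s between retries.
	if err := abstract.RetryOnBackoff(4, time.Second, connect); err != nil {
		fmt.Println("all retries exhausted:", err)
		return
	}
	fmt.Println("connected on attempt", attempt)
}
```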
--------------------------------------------------------------------------------
/destination/iceberg/olake-iceberg-java-writer/README.md:
--------------------------------------------------------------------------------
1 | # Java Iceberg Sink
2 | 
3 | This project is a fork of the [debezium-server-iceberg](https://github.com/memiiso/debezium-server-iceberg) project, originally used to dump data from Debezium Server into Iceberg. The modifications make it compatible with Olake, which sends it data in the Debezium format.
4 | 
5 | ## Architecture
6 | 
7 | The data flow in this project is as follows:
8 | 
9 | 
10 | Golang Code --gRPC--> Java (This Project) --Write to Iceberg--> S3 + Iceberg Catalog
11 | 
12 | (Check out the Olake Iceberg Writer code to understand how data is sent to Java via gRPC.)
13 | 
14 | ## Development and Testing
15 | 
16 | For detailed instructions on setting up the development environment, prerequisites, running, debugging, and testing this component, please refer to the [CONTRIBUTING.md](./CONTRIBUTING.md) file.
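To make that gRPC hop concrete: the contract is the RecordIngestService defined in src/main/resources/record_ingest.proto (mirrored on the Go side in destination/iceberg/proto/records_ingest.proto, both dumped later in this listing). The sketch below is illustrative only — it assumes Go stubs generated by protoc-gen-go; the pb import path, listener port, and table/thread names are assumptions, and the nested type names follow protoc-gen-go's naming conventions:

```go
package main

import (
	"context"
	"log"

	pb "github.com/datazip-inc/olake/destination/iceberg/proto" // assumed stub location
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	// The Java writer is reached over a local, plaintext connection; port assumed.
	conn, err := grpc.NewClient("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	client := pb.NewRecordIngestServiceClient(conn)

	// One insert ("c") record with a single string column; the schema metadata
	// tells the Java side which Iceberg type each positional field carries.
	payload := &pb.IcebergPayload{
		Type: pb.IcebergPayload_RECORDS,
		Metadata: &pb.IcebergPayload_Metadata{
			DestTableName: "olake_iceberg.demo_table",
			ThreadId:      "thread-0",
			Schema:        []*pb.IcebergPayload_SchemaField{{IceType: "string", Key: "name"}},
		},
		Records: []*pb.IcebergPayload_IceRecord{{
			RecordType: "c",
			Fields: []*pb.IcebergPayload_IceRecord_FieldValue{
				{Value: &pb.IcebergPayload_IceRecord_FieldValue_StringValue{StringValue: "alice"}},
			},
		}},
	}

	resp, err := client.SendRecords(context.Background(), payload)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("success=%v result=%s", resp.GetSuccess(), resp.GetResult())
}
```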
--------------------------------------------------------------------------------
/utils/error.go:
--------------------------------------------------------------------------------
1 | package utils
2 | 
3 | import (
4 | 	"context"
5 | 	"fmt"
6 | 
7 | 	"github.com/hashicorp/go-multierror"
8 | 	"golang.org/x/sync/errgroup"
9 | )
10 | 
11 | func ErrExec(functions ...func() error) error {
12 | 	group, _ := errgroup.WithContext(context.Background())
13 | 	for _, one := range functions {
14 | 		group.Go(one)
15 | 	}
16 | 
17 | 	return group.Wait()
18 | }
19 | 
20 | func ErrExecSequential(functions ...func() error) error {
21 | 	var multErr error
22 | 	for _, one := range functions {
23 | 		err := one()
24 | 		if err != nil {
25 | 			multErr = multierror.Append(multErr, err)
26 | 		}
27 | 	}
28 | 
29 | 	return multErr
30 | }
31 | 
32 | func ErrExecFormat(format string, function func() error) func() error {
33 | 	return func() error {
34 | 		if err := function(); err != nil {
35 | 			return fmt.Errorf(format, err)
36 | 		}
37 | 
38 | 		return nil
39 | 	}
40 | }
41 | 
--------------------------------------------------------------------------------
/.github/workflows/release-approval.yml:
--------------------------------------------------------------------------------
1 | name: Olake Driver Releaser
2 | 
3 | on:
4 |   release:
5 |     types: [published]
6 |   workflow_dispatch:
7 |     inputs:
8 |       driver:
9 |         description: "Driver to build"
10 |         required: true
11 |       version:
12 |         description: "Version to release"
13 |         required: true
14 | 
15 | jobs:
16 |   build_all_drivers:
17 |     if: github.event_name == 'release' || (github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/master')
18 |     name: Build and Release Drivers
19 | 
20 |     strategy:
21 |       matrix:
22 |         driver: [mongodb, mysql, postgres, oracle, kafka] # Add new drivers here as they become available
23 | 
24 |     uses: ./.github/workflows/build-and-release-driver.yml
25 |     with:
26 |       driver: ${{ github.event.inputs.driver || matrix.driver }}
27 |       version: ${{ github.event.inputs.version || github.event.release.tag_name }}
28 |     secrets: inherit
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | ## Environment
11 | 
12 | - **Deploy method**: for instance `docker-compose.yml` or `built from sources`
13 | - **Olake version**: for instance `0.0.1`
14 | - **OS**: for instance `Mac OS BigSur` or `Ubuntu 20.04` or `Windows 10`
15 | - **Cloud Provider**: for instance `AWS EC2` or `AWS Lightsail` or `GCP`
16 | - **Docker Params (if deployed with docker)**: a list of env variables with values (please, hide sensitive credentials!), mapped volumes and ports
17 | 
18 | ## Description
19 | A short description of the bug and its impact
20 | 
21 | ## Steps to reproduce
22 | 
23 | Steps to reproduce the behavior:
24 | 
25 | ## Expected behavior
26 | 
27 | A clear and concise description of what you expected to happen.
28 | 
29 | ## Actual behavior
30 | 
31 | What's happening now and how it differs from the expected behavior
--------------------------------------------------------------------------------
/destination/iceberg/olake-iceberg-java-writer/src/main/java/io/debezium/server/iceberg/RecordSchemaData.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg;
2 | 
3 | import org.apache.iceberg.types.Types;
4 | 
5 | import java.util.ArrayList;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Set;
9 | import java.util.concurrent.atomic.AtomicInteger;
10 | 
11 | record RecordSchemaData(List<Types.NestedField> fields, Set<Integer> identifierFieldIds,
12 |                         AtomicInteger nextFieldId) {
13 | 
14 | 
15 |     public RecordSchemaData(Integer nextFieldId) {
16 |         this(new ArrayList<>(), new HashSet<>(), new AtomicInteger(nextFieldId));
17 |     }
18 | 
19 |     public RecordSchemaData() {
20 |         this(new ArrayList<>(), new HashSet<>(), new AtomicInteger(1));
21 |     }
22 | 
23 |     public RecordSchemaData copyKeepIdentifierFieldIdsAndNextFieldId() {
24 |         return new RecordSchemaData(new ArrayList<>(), this.identifierFieldIds, this.nextFieldId);
25 |     }
26 | 
27 | 
28 | }
29 | 
--------------------------------------------------------------------------------
/types/interface.go:
--------------------------------------------------------------------------------
1 | package types
2 | 
3 | type StreamInterface interface {
4 | 	ID() string
5 | 	Self() *ConfiguredStream
6 | 	Name() string
7 | 	Namespace() string
8 | 	Schema() *TypeSchema
9 | 	GetStream() *Stream
10 | 	GetSyncMode() SyncMode
11 | 	GetFilter() (Filter, error)
12 | 	SupportedSyncModes() *Set[SyncMode]
13 | 	Cursor() (string, string)
14 | 	Validate(source *Stream) error
15 | 	NormalizationEnabled() bool
16 | 	GetDestinationDatabase(icebergDB *string) string
17 | 	GetDestinationTable() string
18 | }
19 | 
20 | type StateInterface interface {
21 | 	ResetStreams()
22 | 	SetType(typ StateType)
23 | 	GetCursor(stream *ConfiguredStream, key string) any
24 | 	SetCursor(stream *ConfiguredStream, key, value any)
25 | 	GetChunks(stream *ConfiguredStream) *Set[Chunk]
26 | 	SetChunks(stream *ConfiguredStream, chunks *Set[Chunk])
27 | 	RemoveChunk(stream *ConfiguredStream, chunk Chunk)
28 | 	SetGlobal(globalState any, streams ...string)
29 | }
30 | 
31 | type Iterable interface {
32 | 	Next() bool
33 | 	Err() error
34 | }
--------------------------------------------------------------------------------
/drivers/oracle/internal/cdc.go:
--------------------------------------------------------------------------------
1 | package driver
2 | 
3 | import (
4 | 	"context"
5 | 
6 | 	"github.com/datazip-inc/olake/drivers/abstract"
7 | 	"github.com/datazip-inc/olake/types"
8 | )
9 | 
10 | // CDC is not supported yet
11 | 
12 | // PostCDC is called after CDC operation completes
13 | func (o *Oracle) PostCDC(ctx context.Context, stream types.StreamInterface, success bool, _ string) error {
14 | 	return nil
15 | }
16 | 
17 | // PreCDC is called before CDC operation starts
18 | func (o *Oracle) PreCDC(ctx context.Context, streams []types.StreamInterface) error {
19 | 	return nil
20 | }
21 | 
22 | // StreamChanges streams CDC changes for a given stream
23 | func (o *Oracle) StreamChanges(ctx context.Context, stream types.StreamInterface, processFn abstract.CDCMsgFn) error {
24 | 	return nil
25 | }
26 | 
27 | // CDCSupported returns whether CDC is supported
28 | func (o *Oracle) CDCSupported() bool {
29 | 	return o.CDCSupport // CDC is not supported yet
30 | }
31 | 
32 | // SetupState sets the state for the driver
33 | func (o *Oracle) SetupState(state *types.State) {
34 | 	o.state = state
35 | }
36 | 
--------------------------------------------------------------------------------
/destination/iceberg/olake-iceberg-java-writer/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
--------------------------------------------------------------------------------
/drivers/mongodb/internal/mon_test.go:
--------------------------------------------------------------------------------
1 | package driver
2 | 
3 | import (
4 | 	"testing"
5 | 
6 | 	"github.com/datazip-inc/olake/constants"
7 | 	"github.com/datazip-inc/olake/utils/testutils"
8 | )
9 | 
10 | func TestMongodbIntegration(t *testing.T) {
11 | 	t.Parallel()
12 | 	testConfig := &testutils.IntegrationTest{
13 | 		TestConfig:         testutils.GetTestConfig(string(constants.MongoDB)),
14 | 		Namespace:          "olake_mongodb_test",
15 | 		ExpectedData:       ExpectedMongoData,
16 | 		ExpectedUpdateData: ExpectedUpdatedMongoData,
17 | 		DataTypeSchema:     MongoToIcebergSchema,
18 | 		ExecuteQuery:       ExecuteQuery,
19 | 		IcebergDB:          "mongodb_olake_mongodb_test",
20 | 	}
21 | 	testConfig.TestIntegration(t)
22 | }
23 | 
24 | func TestMongodbPerformance(t *testing.T) {
25 | 	config := &testutils.PerformanceTest{
26 | 		TestConfig:      testutils.GetTestConfig(string(constants.MongoDB)),
27 | 		Namespace:       "twitter_data",
28 | 		BackfillStreams: []string{"tweets"},
29 | 		CDCStreams:      []string{"tweets_cdc"},
30 | 		ExecuteQuery:    ExecuteQuery,
31 | 	}
32 | 
33 | 	config.TestPerformance(t)
34 | }
35 | 
--------------------------------------------------------------------------------
/drivers/mysql/internal/mysql_test.go:
--------------------------------------------------------------------------------
1 | package driver
2 | 
3 | import (
4 | 	"testing"
5 | 
6 | 	"github.com/datazip-inc/olake/constants"
7 | 	"github.com/datazip-inc/olake/utils/testutils"
8 | )
9 | 
10 | func TestMySQLIntegration(t *testing.T) {
11 | 	t.Parallel()
12 | 	testConfig := &testutils.IntegrationTest{
13 | 		TestConfig:         testutils.GetTestConfig(string(constants.MySQL)),
14 | 		Namespace:          "olake_mysql_test",
15 | 		ExpectedData:       ExpectedMySQLData,
16 | 		ExpectedUpdateData: ExpectedUpdatedMySQLData,
17 | 		DataTypeSchema:     MySQLToIcebergSchema,
18 | 		ExecuteQuery:       ExecuteQuery,
19 | 		IcebergDB:          "mysql_olake_mysql_test",
20 | 	}
21 | 	testConfig.TestIntegration(t)
22 | }
23 | 
24 | func TestMySQLPerformance(t *testing.T) {
25 | 	config := &testutils.PerformanceTest{
26 | 		TestConfig:      testutils.GetTestConfig(string(constants.MySQL)),
27 | 		Namespace:       "benchmark",
28 | 		BackfillStreams: []string{"trips", "fhv_trips"},
29 | 		CDCStreams:      []string{"trips_cdc", "fhv_trips_cdc"},
30 | 		ExecuteQuery:    ExecuteQuery,
31 | 	}
32 | 
33 | 	config.TestPerformance(t)
34 | }
35 | 
--------------------------------------------------------------------------------
/destination/iceberg/olake-iceberg-java-writer/src/main/java/io/debezium/server/iceberg/tableoperator/Operation.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  *   http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing,
13 |  * software distributed under the License is distributed on an
14 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 |  * KIND, either express or implied. See the License for the
16 |  * specific language governing permissions and limitations
17 |  * under the License.
18 |  */
19 | package io.debezium.server.iceberg.tableoperator;
20 | 
21 | public enum Operation {
22 |     INSERT,
23 |     UPDATE,
24 |     DELETE,
25 |     READ
26 | }
27 | 
--------------------------------------------------------------------------------
/pkg/kafka/types.go:
--------------------------------------------------------------------------------
1 | package kafka
2 | 
3 | import (
4 | 	"github.com/datazip-inc/olake/types"
5 | 	"github.com/segmentio/kafka-go"
6 | )
7 | 
8 | // ReaderConfig holds configuration for creating Kafka readers
9 | type ReaderConfig struct {
10 | 	MaxThreads                  int
11 | 	ThreadsEqualTotalPartitions bool
12 | 	BootstrapServers            string
13 | 	ConsumerGroupID             string
14 | 	Dialer                      *kafka.Dialer
15 | 	AdminClient                 *kafka.Client
16 | }
17 | 
18 | // ReaderManager manages Kafka readers and their metadata
19 | type ReaderManager struct {
20 | 	config          ReaderConfig
21 | 	readers         map[string]*kafka.Reader           // for fast reader access
22 | 	partitionIndex  map[string]types.PartitionMetaData // get per-partition boundaries
23 | 	readerClientIDs map[string]string                  // reader's client id mapping
24 | }
25 | 
26 | // CustomGroupBalancer ensures proper consumer ID distribution according to requirements
27 | type CustomGroupBalancer struct {
28 | 	requiredConsumerIDs int
29 | 	readerIndex         int
30 | 	partitionIndex      map[string]types.PartitionMetaData
31 | }
32 | 
--------------------------------------------------------------------------------
/drivers/postgres/internal/postgres_test.go:
--------------------------------------------------------------------------------
1 | package driver
2 | 
3 | import (
4 | 	"testing"
5 | 
6 | 	"github.com/datazip-inc/olake/constants"
7 | 	"github.com/datazip-inc/olake/utils/testutils"
8 | 	_ "github.com/lib/pq"
9 | )
10 | 
11 | func TestPostgresIntegration(t *testing.T) {
12 | 	t.Parallel()
13 | 	testConfig := &testutils.IntegrationTest{
14 | 		TestConfig:         testutils.GetTestConfig(string(constants.Postgres)),
15 | 		Namespace:          "public",
16 | 		ExpectedData:       ExpectedPostgresData,
17 | 		ExpectedUpdateData: ExpectedUpdatedPostgresData,
18 | 		DataTypeSchema:     PostgresToIcebergSchema,
19 | 		ExecuteQuery:       ExecuteQuery,
20 | 		IcebergDB:          "postgres_postgres_public",
21 | 	}
22 | 	testConfig.TestIntegration(t)
23 | }
24 | 
25 | func TestPostgresPerformance(t *testing.T) {
26 | 	config := &testutils.PerformanceTest{
27 | 		TestConfig: testutils.GetTestConfig(string(constants.Postgres)),
28 | 		Namespace:  "public",
29 | 		BackfillStreams: []string{"trips", "fhv_trips"},
30 | 		CDCStreams:      []string{"trips_cdc", "fhv_trips_cdc"},
31 | 		ExecuteQuery:    ExecuteQuery,
32 | 	}
33 | 
34 | 	config.TestPerformance(t)
35 | }
36 | 
--------------------------------------------------------------------------------
/.github/workflows/todo.yml:
--------------------------------------------------------------------------------
1 | name: TODO Issue CI
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - "staging"
7 |   workflow_dispatch:
8 |     inputs:
9 |       importAll:
10 |         default: false
11 |         required: false
12 |         type: boolean
13 |         description: Enable, if you want to import all TODOs. Runs on checked out branch! Only use if you're sure what you are doing.
14 | 
15 | # jobs:
16 | #   todos:
17 | #     name: Create Issues from TODO
18 | #     runs-on: ubuntu-latest
19 | #     steps:
20 | #       - uses: actions/checkout@v3
21 | #       - name: todo-actions
22 | #         uses: dtinth/todo-actions@v0.2.0
23 | #         env:
24 | #           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
25 | #           TODO_ACTIONS_MONGO_URL: ${{ secrets.TODO_ACTIONS_MONGO_URL }}
26 | 
27 | permissions:
28 |   issues: write
29 |   repository-projects: read
30 |   contents: read
31 | 
32 | jobs:
33 |   todos:
34 |     name: Create Issues from TODO
35 |     runs-on: ubuntu-latest
36 |     steps:
37 |       - uses: actions/checkout@v3
38 |       - name: todo-issue
39 |         uses: DerJuulsn/todo-issue@v1.1.4
40 |         env:
41 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/golang-ci.yml:
--------------------------------------------------------------------------------
1 | name: Go Build and Lint
2 | on:
3 |   push:
4 |     branches:
5 |       - "master"
6 |   pull_request:
7 |     branches:
8 |       - "*"
9 |   workflow_dispatch:
10 |     inputs:
11 |       logLevel:
12 |         description: "Log level"
13 |         required: true
14 |         default: "warning"
15 | 
16 | jobs:
17 |   golangci:
18 |     name: golangci-lint
19 |     runs-on: ubuntu-latest
20 |     steps:
21 |       - uses: actions/checkout@v3
22 |       - uses: actions/setup-go@v3
23 |         with:
24 |           check-latest: "true"
25 |           go-version: "1.24.x"
26 |       - name: golangci-lint
27 |         uses: golangci/golangci-lint-action@v3
28 |         with:
29 |           # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version.
30 |           version: latest
31 |   build-check:
32 |     name: Build
33 |     runs-on: ubuntu-latest
34 |     steps:
35 |       - uses: actions/checkout@v3
36 |       - uses: actions/setup-go@v3
37 |         with:
38 |           check-latest: "true"
39 |           go-version: "1.24.x"
40 |       - name: Run build
41 |         run: make build
42 | 
--------------------------------------------------------------------------------
/pkg/binlog/types.go:
--------------------------------------------------------------------------------
1 | package binlog
2 | 
3 | import (
4 | 	"time"
5 | 
6 | 	"github.com/datazip-inc/olake/types"
7 | 	"github.com/go-mysql-org/go-mysql/mysql"
8 | 	"golang.org/x/crypto/ssh"
9 | )
10 | 
11 | // Config holds the configuration for the binlog syncer.
12 | type Config struct {
13 | 	ServerID        uint32
14 | 	Flavor          string
15 | 	Host            string
16 | 	Port            uint16
17 | 	User            string
18 | 	Password        string
19 | 	Charset         string
20 | 	VerifyChecksum  bool
21 | 	HeartbeatPeriod time.Duration
22 | 	InitialWaitTime time.Duration
23 | 	SSHClient       *ssh.Client
24 | }
25 | 
26 | // Binlog holds the current binlog position.
27 | type Binlog struct {
28 | 	Position mysql.Position `json:"position"`
29 | }
30 | 
31 | // CDCChange represents a change event captured from the binlog.
32 | type CDCChange struct {
33 | 	Stream    types.StreamInterface
34 | 	Timestamp time.Time
35 | 	Position  mysql.Position
36 | 	Kind      string
37 | 	Schema    string
38 | 	Table     string
39 | 	Data      map[string]interface{}
40 | }
41 | 
42 | // OnChange is a callback function type for processing CDC changes.
43 | type OnChange func(change CDCChange) error
44 | 
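OnChange above is the hook a CDC consumer supplies to receive each decoded row event. A minimal sketch of a handler — the logging is illustrative, not the repository's actual consumer:

```go
package main

import (
	"fmt"

	"github.com/datazip-inc/olake/pkg/binlog"
)

// logChange satisfies binlog.OnChange; returning an error aborts the sync.
func logChange(change binlog.CDCChange) error {
	// Position lets a consumer checkpoint exactly where it stopped reading.
	fmt.Printf("%s on %s.%s at %v: %v\n",
		change.Kind, change.Schema, change.Table, change.Position, change.Data)
	return nil
}

func main() {
	_ = binlog.OnChange(logChange) // compile-time check that the signature matches
}
```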
--------------------------------------------------------------------------------
/drivers/postgres/pg_hba.conf:
--------------------------------------------------------------------------------
1 | # This file is required to configure client authentication for PostgreSQL.
2 | # It defines access control rules, specifying which users can connect to which databases,
3 | # from which hosts, and using which authentication methods. Proper configuration ensures
4 | # secure and controlled access to the PostgreSQL server.
5 | 
6 | # TYPE  DATABASE        USER            ADDRESS                 METHOD
7 | local   all             all                                     trust
8 | host    all             all             127.0.0.1/32            trust
9 | host    all             all             ::1/128                 trust
10 | 
11 | # Allow replication for 'postgres'
12 | host    replication     postgres        172.22.0.0/16           trust
13 | host    replication     postgres        127.0.0.1/32            trust
14 | host    replication     postgres        ::1/128                 trust
15 | 
16 | # ✅ Fix: Allow replication from any IP
17 | host    replication     postgres        0.0.0.0/0               trust
18 | 
19 | # Allow SQL connections from Docker network
20 | host    all             postgres        172.22.0.0/16           trust
21 | host    all             postgres        0.0.0.0/0               trust
22 | 
--------------------------------------------------------------------------------
/utils/typeutils/time.go:
--------------------------------------------------------------------------------
1 | package typeutils
2 | 
3 | import (
4 | 	"strings"
5 | 	"time"
6 | )
7 | 
8 | type Time struct {
9 | 	time.Time
10 | }
11 | 
12 | // UnmarshalJSON overrides the default unmarshalling for Time
13 | func (ct *Time) UnmarshalJSON(b []byte) error {
14 | 	// Remove the quotes around the date string
15 | 	str := strings.Trim(string(b), "\"")
16 | 	time, err := parseStringTimestamp(str)
17 | 	if err != nil {
18 | 		return err
19 | 	}
20 | 
21 | 	*ct = Time{time}
22 | 	return nil
23 | }
24 | 
25 | // Before reports whether the time instant ct is before u
26 | func (ct Time) Before(u Time) bool {
27 | 	return ct.Time.Before(u.Time)
28 | }
29 | 
30 | // After reports whether the time instant ct is after u
31 | func (ct Time) After(u Time) bool {
32 | 	return ct.Time.After(u.Time)
33 | }
34 | 
35 | // Equal reports whether ct and u represent the same time instant
36 | func (ct Time) Equal(u Time) bool {
37 | 	return ct.Time.Equal(u.Time)
38 | }
39 | 
40 | // Compare compares the time instant ct with u. If ct is before u, it returns -1;
41 | // if ct is after u, it returns +1; if they're the same, it returns 0.
42 | func (ct Time) Compare(u Time) int {
43 | 	if ct.Before(u) {
44 | 		return -1
45 | 	}
46 | 	if ct.After(u) {
47 | 		return 1
48 | 	}
49 | 	return 0
50 | }
51 | 
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Description
2 | 
3 | 
4 | 
5 | Fixes # (issue)
6 | 
7 | ## Type of change
8 | 
9 | 
11 | 
12 | - [ ] Bug fix (non-breaking change which fixes an issue)
13 | - [ ] New feature (non-breaking change which adds functionality)
14 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
15 | - [ ] This change requires a documentation update
16 | 
17 | # How Has This Been Tested?
18 | 
19 | 
20 | 
21 | - [ ] Scenario A
22 | - [ ] Scenario B
23 | 
24 | # Screenshots or Recordings
25 | 
26 | 
27 | ## Documentation
28 | 
29 | 
30 | 
31 | - [ ] Documentation Link: [link to README, olake.io/docs, or olake-docs]
32 | - [ ] N/A (bug fix, refactor, or test changes only)
33 | 
34 | ## Related PRs (If Any):
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Build Stage
2 | FROM golang:1.24-alpine AS base
3 | 
4 | WORKDIR /home/app
5 | COPY . .
6 | 
7 | ARG DRIVER_NAME=olake
8 | # Build the Go binary
9 | WORKDIR /home/app/drivers/${DRIVER_NAME}
10 | RUN go build -o /olake main.go
11 | 
12 | # Final Runtime Stage
13 | FROM alpine:3.18
14 | 
15 | # Install Java 17 and iproute2 for ss command
16 | RUN apk add --no-cache openjdk17 iproute2
17 | 
18 | # Copy the binary from the build stage
19 | COPY --from=base /olake /home/olake
20 | 
21 | ARG DRIVER_VERSION=dev
22 | ARG DRIVER_NAME=olake
23 | 
24 | # Copy the pre-built JAR file from Maven
25 | # First try to copy from the source location (works after Maven build)
26 | COPY destination/iceberg/olake-iceberg-java-writer/target/olake-iceberg-java-writer-0.0.1-SNAPSHOT.jar /home/olake-iceberg-java-writer.jar
27 | 
28 | # Copy the spec files for driver and destinations
29 | COPY --from=base /home/app/drivers/${DRIVER_NAME}/resources/spec.json /drivers/${DRIVER_NAME}/resources/spec.json
30 | COPY --from=base /home/app/destination/iceberg/resources/spec.json /destination/iceberg/resources/spec.json
31 | COPY --from=base /home/app/destination/parquet/resources/spec.json /destination/parquet/resources/spec.json
32 | 
33 | # Metadata
34 | LABEL io.eggwhite.version=${DRIVER_VERSION}
35 | LABEL io.eggwhite.name=olake/source-${DRIVER_NAME}
36 | 
37 | # Set working directory
38 | WORKDIR /home
39 | 
40 | # Entrypoint
41 | ENTRYPOINT ["./olake"]
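One note on the Dockerfile above: because the driver is selected via build args, a single definition builds every connector image once the Java writer jar exists under destination/iceberg/olake-iceberg-java-writer/target/. For example (tag and version are illustrative): `docker build --build-arg DRIVER_NAME=postgres --build-arg DRIVER_VERSION=v0.1.0 -t olake/source-postgres:v0.1.0 .`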
--------------------------------------------------------------------------------
/destination/iceberg/olake-iceberg-java-writer/src/main/java/io/debezium/server/iceberg/tableoperator/UnpartitionedDeltaWriter.java:
--------------------------------------------------------------------------------
1 | package io.debezium.server.iceberg.tableoperator;
2 | 
3 | import java.io.IOException;
4 | import java.util.Set;
5 | 
6 | import org.apache.iceberg.FileFormat;
7 | import org.apache.iceberg.PartitionSpec;
8 | import org.apache.iceberg.Schema;
9 | import org.apache.iceberg.data.Record;
10 | import org.apache.iceberg.io.FileAppenderFactory;
11 | import org.apache.iceberg.io.FileIO;
12 | import org.apache.iceberg.io.OutputFileFactory;
13 | 
14 | class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter {
15 |     private final RowDataDeltaWriter writer;
16 | 
17 |     UnpartitionedDeltaWriter(PartitionSpec spec,
18 |                              FileFormat format,
19 |                              FileAppenderFactory<Record> appenderFactory,
20 |                              OutputFileFactory fileFactory,
21 |                              FileIO io,
22 |                              long targetFileSize,
23 |                              Schema schema,
24 |                              Set<Integer> identifierFieldIds,
25 |                              boolean keepDeletes) {
26 |         super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, identifierFieldIds, keepDeletes);
27 |         this.writer = new RowDataDeltaWriter(null);
28 |     }
29 | 
30 |     @Override
31 |     RowDataDeltaWriter route(Record row) {
32 |         return writer;
33 |     }
34 | 
35 |     @Override
36 |     public void close() throws IOException {
37 |         writer.close();
38 |     }
39 | }
40 | 
--------------------------------------------------------------------------------
/utils/ssl.go:
--------------------------------------------------------------------------------
1 | package utils
2 | 
3 | import "errors"
4 | 
5 | const (
6 | 	SSLModeRequire    = "require"
7 | 	SSLModeDisable    = "disable"
8 | 	SSLModeVerifyCA   = "verify-ca"
9 | 	SSLModeVerifyFull = "verify-full"
10 | 
11 | 	Unknown = ""
12 | )
13 | 
14 | // SSLConfig is a dto for deserialized SSL configuration for Postgres
15 | type SSLConfig struct {
16 | 	Mode       string `mapstructure:"mode,omitempty" json:"mode,omitempty" yaml:"mode,omitempty"`
17 | 	ServerCA   string `mapstructure:"server_ca,omitempty" json:"server_ca,omitempty" yaml:"server_ca,omitempty"`
18 | 	ClientCert string `mapstructure:"client_cert,omitempty" json:"client_cert,omitempty" yaml:"client_cert,omitempty"`
19 | 	ClientKey  string `mapstructure:"client_key,omitempty" json:"client_key,omitempty" yaml:"client_key,omitempty"`
20 | }
21 | 
22 | // Validate returns err if the ssl configuration is invalid
23 | func (sc *SSLConfig) Validate() error {
24 | 	// TODO: Add Proper validations and test
25 | 	if sc == nil {
26 | 		return errors.New("'ssl' config is required")
27 | 	}
28 | 
29 | 	if sc.Mode == Unknown {
30 | 		return errors.New("'ssl.mode' is required parameter")
31 | 	}
32 | 
33 | 	if sc.Mode == SSLModeVerifyCA || sc.Mode == SSLModeVerifyFull {
34 | 		if sc.ServerCA == "" {
35 | 			return errors.New("'ssl.server_ca' is required parameter")
36 | 		}
37 | 
38 | 		if sc.ClientCert == "" {
39 | 			return errors.New("'ssl.client_cert' is required parameter")
40 | 		}
41 | 
42 | 		if sc.ClientKey == "" {
43 | 			return errors.New("'ssl.client_key' is required parameter")
44 | 		}
45 | 	}
46 | 
47 | 	return nil
48 | }
49 | 
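SSLConfig above is what a source config's `ssl` block deserializes into; verify-ca and verify-full require all three certificate fields before Validate passes. A small sketch (the PEM contents are placeholders):

```go
package main

import (
	"fmt"

	"github.com/datazip-inc/olake/utils"
)

func main() {
	cfg := &utils.SSLConfig{
		Mode:       utils.SSLModeVerifyCA,
		ServerCA:   "-----BEGIN CERTIFICATE-----...",
		ClientCert: "-----BEGIN CERTIFICATE-----...",
		ClientKey:  "-----BEGIN PRIVATE KEY-----...",
	}
	if err := cfg.Validate(); err != nil {
		fmt.Println("invalid ssl config:", err)
		return
	}
	fmt.Println("ssl config ok, mode:", cfg.Mode)
}
```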
"$PR_TITLE" =~ $PATTERN ]]; then 33 | echo "::error::PR title must follow: type(scope): description" 34 | echo "Valid formats:" 35 | echo "- feat: add new feature" 36 | echo "- fix(login): resolve auth issue" 37 | echo "Allowed types: feat|fix|docs|style|refactor|test|chore|build|ci|perf|revert" 38 | exit 1 39 | fi 40 | echo "✅ Valid PR title format" -------------------------------------------------------------------------------- /destination/interface.go: -------------------------------------------------------------------------------- 1 | package destination 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/datazip-inc/olake/types" 7 | ) 8 | 9 | type Config interface { 10 | Validate() error 11 | } 12 | 13 | type Write = func(ctx context.Context, channel <-chan types.Record) error 14 | type FlattenFunction = func(record types.Record) (types.Record, error) 15 | 16 | type Writer interface { 17 | GetConfigRef() Config 18 | Spec() any 19 | Type() string 20 | // Sets up connections and perform checks; doesn't load Streams 21 | // 22 | // Note: Check shouldn't be called before Setup as they're composed at Connector level 23 | Check(ctx context.Context) error 24 | // Setup sets up an Adapter for dedicated use for a stream 25 | // avoiding the headover for different streams 26 | Setup(ctx context.Context, stream types.StreamInterface, schema any, opts *Options) (any, error) 27 | // Write function being used by drivers 28 | Write(ctx context.Context, record []types.RawRecord) error 29 | // flatten data and validates thread schema (return true if thread schema is different w.r.t records) 30 | FlattenAndCleanData(ctx context.Context, records []types.RawRecord) (bool, []types.RawRecord, any, error) 31 | // EvolveSchema updates the schema based on changes. 32 | // Need to pass olakeTimestamp as end argument to get the correct partition path based on record ingestion time. 
33 | EvolveSchema(ctx context.Context, globalSchema, recordsSchema any) (any, error) 34 | // DropStreams is used to clear the destination before re-writing the stream 35 | DropStreams(ctx context.Context, dropStreams []types.StreamInterface) error 36 | Close(ctx context.Context) error 37 | } 38 | -------------------------------------------------------------------------------- /destination/iceberg/olake-iceberg-java-writer/src/main/resources/record_ingest.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package io.debezium.server.iceberg.rpc; 4 | 5 | service RecordIngestService { 6 | rpc SendRecords(IcebergPayload) returns (RecordIngestResponse); 7 | } 8 | 9 | message IcebergPayload { 10 | enum PayloadType { 11 | RECORDS = 0; 12 | COMMIT = 1; 13 | EVOLVE_SCHEMA = 2; 14 | DROP_TABLE = 3; 15 | GET_OR_CREATE_TABLE = 4; 16 | REFRESH_TABLE_SCHEMA = 5; 17 | } 18 | 19 | PayloadType type = 1; 20 | 21 | message Metadata { 22 | string dest_table_name = 1; 23 | string thread_id = 2; 24 | optional string identifier_field = 3; 25 | repeated SchemaField schema = 4; 26 | } 27 | 28 | message SchemaField { 29 | string ice_type = 1; 30 | string key = 2; 31 | } 32 | 33 | // OPTIMIZED: Replace google.protobuf.Value with typed fields 34 | message IceRecord { 35 | // Use oneof for efficient memory usage - only one field type active per record field 36 | message FieldValue { 37 | oneof value { 38 | string string_value = 1; 39 | int32 int_value = 2; 40 | int64 long_value = 3; 41 | float float_value = 4; 42 | double double_value = 5; 43 | bool bool_value = 6; 44 | bytes bytes_value = 7; 45 | } 46 | } 47 | 48 | repeated FieldValue fields = 1; 49 | string record_type = 2; // "u", "c", "r" 50 | } 51 | 52 | Metadata metadata = 2; 53 | repeated IceRecord records = 3; 54 | } 55 | 56 | message RecordIngestResponse { 57 | string result = 1; 58 | bool success = 2; // Adding success boolean for better error handling 59 | } 60 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: Documentation Check 2 | 3 | on: 4 | pull_request: 5 | types: [ opened, edited, synchronize ] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | validate-documentation: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Validate Documentation 15 | uses: actions/github-script@v6 16 | with: 17 | script: | 18 | const { body } = context.payload.pull_request; 19 | 20 | // If N/A is checked, we're done 21 | if (/-\s*\[\s*x\s*\]\s*N\/A/.test(body)) return; 22 | 23 | // Check for valid documentation link in the Documentation Link line 24 | const docLine = body.match(/-\s*\[\s*x\s*\]\s*Documentation Link:[^\n]*/i); 25 | if (!docLine) { 26 | core.setFailed( 27 | 'Documentation Required: Please either check "Documentation Link" and provide a valid link, or check "N/A" for bug fixes, refactors, or test changes.' 28 | ); 29 | return; 30 | } 31 | 32 | const validLinkPatterns = [ 33 | /https:\/\/olake\.io\/docs[^\s)]*/, 34 | /https:\/\/github\.com\/datazip-inc\/olake-docs[^\s)]*/, 35 | /\[[^\]]*\]\([^)]*README[^)]*\)/i 36 | ]; 37 | 38 | const hasValidLink = validLinkPatterns.some(pattern => pattern.test(docLine[0])); 39 | if (!hasValidLink) { 40 | core.setFailed( 41 | 'Documentation Required: Please provide a valid documentation link next to the checked "Documentation Link" option. 
Valid sources: README files, olake.io/docs, or olake-docs repository.' 42 | ); 43 | } 44 | -------------------------------------------------------------------------------- /destination/iceberg/proto/records_ingest.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package io.debezium.server.iceberg.rpc; 4 | 5 | option go_package = "iceberg/proto"; 6 | 7 | service RecordIngestService { 8 | rpc SendRecords(IcebergPayload) returns (RecordIngestResponse); 9 | } 10 | 11 | message IcebergPayload { 12 | enum PayloadType { 13 | RECORDS = 0; 14 | COMMIT = 1; 15 | EVOLVE_SCHEMA = 2; 16 | DROP_TABLE = 3; 17 | GET_OR_CREATE_TABLE = 4; 18 | REFRESH_TABLE_SCHEMA = 5; 19 | } 20 | 21 | PayloadType type = 1; 22 | 23 | message Metadata { 24 | string dest_table_name = 1; 25 | string thread_id = 2; 26 | optional string identifier_field = 3; 27 | repeated SchemaField schema = 4; 28 | } 29 | 30 | message SchemaField { 31 | string ice_type = 1; 32 | string key = 2; 33 | } 34 | 35 | // OPTIMIZED: Replace google.protobuf.Value with typed fields 36 | message IceRecord { 37 | // Use oneof for efficient memory usage - only one field type active per record field 38 | message FieldValue { 39 | oneof value { 40 | string string_value = 1; 41 | int32 int_value = 2; 42 | int64 long_value = 3; 43 | float float_value = 4; 44 | double double_value = 5; 45 | bool bool_value = 6; 46 | bytes bytes_value = 7; 47 | } 48 | } 49 | 50 | repeated FieldValue fields = 1; 51 | string record_type = 2; // "u", "c", "r" 52 | } 53 | 54 | Metadata metadata = 2; 55 | repeated IceRecord records = 3; 56 | } 57 | 58 | message RecordIngestResponse { 59 | string result = 1; 60 | bool success = 2; // Adding success boolean for better error handling 61 | } 62 | -------------------------------------------------------------------------------- /drivers/postgres/internal/incremental.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/datazip-inc/olake/constants" 8 | "github.com/datazip-inc/olake/drivers/abstract" 9 | "github.com/datazip-inc/olake/pkg/jdbc" 10 | "github.com/datazip-inc/olake/types" 11 | ) 12 | 13 | func (p *Postgres) StreamIncrementalChanges(ctx context.Context, stream types.StreamInterface, processFn abstract.BackfillMsgFn) error { 14 | opts := jdbc.DriverOptions{ 15 | Driver: constants.Postgres, 16 | Stream: stream, 17 | State: p.state, 18 | } 19 | incrementalQuery, queryArgs, err := jdbc.BuildIncrementalQuery(ctx, opts) 20 | if err != nil { 21 | return fmt.Errorf("failed to build incremental condition: %s", err) 22 | } 23 | 24 | rows, err := p.client.QueryContext(ctx, incrementalQuery, queryArgs...) 
25 | if err != nil { 26 | return fmt.Errorf("failed to execute incremental query: %s", err) 27 | } 28 | defer rows.Close() 29 | 30 | for rows.Next() { 31 | record := make(types.Record) 32 | if err := jdbc.MapScan(rows, record, p.dataTypeConverter); err != nil { 33 | return fmt.Errorf("failed to scan record: %s", err) 34 | } 35 | 36 | if err := processFn(ctx, record); err != nil { 37 | return fmt.Errorf("process error: %s", err) 38 | } 39 | } 40 | return rows.Err() 41 | } 42 | 43 | func (p *Postgres) FetchMaxCursorValues(ctx context.Context, stream types.StreamInterface) (any, any, error) { 44 | maxPrimaryCursorValue, maxSecondaryCursorValue, err := jdbc.GetMaxCursorValues(ctx, p.client, constants.Postgres, stream) 45 | if err != nil { 46 | return nil, nil, err 47 | } 48 | return maxPrimaryCursorValue, maxSecondaryCursorValue, nil 49 | } 50 | -------------------------------------------------------------------------------- /protocol/spec.go: -------------------------------------------------------------------------------- 1 | package protocol 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | 8 | "github.com/datazip-inc/olake/utils" 9 | "github.com/datazip-inc/olake/utils/logger" 10 | "github.com/datazip-inc/olake/utils/spec" 11 | "github.com/spf13/cobra" 12 | ) 13 | 14 | var specCmd = &cobra.Command{ 15 | Use: "spec", 16 | Short: "spec command", 17 | RunE: func(_ *cobra.Command, _ []string) error { 18 | specPath, err := resolveSpecPath() 19 | if err != nil { 20 | return err 21 | } 22 | 23 | var specData map[string]interface{} 24 | if err := utils.UnmarshalFile(specPath, &specData, false); err != nil { 25 | return fmt.Errorf("failed to read spec file %s: %v", specPath, err) 26 | } 27 | 28 | schemaType := utils.Ternary(destinationType == "not-set", connector.Type(), destinationType).(string) 29 | uiSchema, err := spec.LoadUISchema(schemaType) 30 | if err != nil { 31 | return fmt.Errorf("failed to get ui schema: %v", err) 32 | } 33 | 34 | specSchema := map[string]interface{}{ 35 | "jsonschema": specData, 36 | "uischema": uiSchema, 37 | } 38 | 39 | logger.Info(specSchema) 40 | return nil 41 | }, 42 | } 43 | 44 | func resolveSpecPath() (string, error) { 45 | // pwd is olake/drivers/(driver) or olake/destination/(destination) 46 | pwd, err := os.Getwd() 47 | if err != nil { 48 | return "", err 49 | } 50 | // olakeRoot is olake's root path 51 | olakeRoot := filepath.Join(pwd, "..", "..") 52 | specPath := utils.Ternary(destinationType == "not-set", filepath.Join(olakeRoot, "drivers", connector.Type(), "resources/spec.json"), filepath.Join(olakeRoot, "destination", destinationType, "resources/spec.json")).(string) 53 | 54 | return specPath, nil 55 | } 56 | -------------------------------------------------------------------------------- /pkg/waljs/types.go: -------------------------------------------------------------------------------- 1 | package waljs 2 | 3 | import ( 4 | "crypto/tls" 5 | "net/url" 6 | "time" 7 | 8 | "github.com/datazip-inc/olake/types" 9 | "github.com/datazip-inc/olake/utils/typeutils" 10 | "github.com/jackc/pglogrepl" 11 | "golang.org/x/crypto/ssh" 12 | ) 13 | 14 | type Config struct { 15 | Tables *types.Set[types.StreamInterface] 16 | Connection url.URL 17 | SSHClient *ssh.Client 18 | ReplicationSlotName string 19 | InitialWaitTime time.Duration 20 | TLSConfig *tls.Config 21 | BatchSize int 22 | // Publications is used with pgoutput 23 | Publication string 24 | } 25 | 26 | type WALState struct { 27 | LSN string `json:"lsn"` 28 | } 29 | 30 | func (s *WALState) IsEmpty() 
bool { 31 | return s == nil || s.LSN == "" 32 | } 33 | 34 | type ReplicationSlot struct { 35 | SlotType string `db:"slot_type"` 36 | Plugin string `db:"plugin"` 37 | LSN pglogrepl.LSN `db:"confirmed_flush_lsn"` 38 | CurrentLSN pglogrepl.LSN `db:"current_lsn"` 39 | } 40 | 41 | type WALMessage struct { 42 | NextLSN string `json:"nextlsn"` 43 | Timestamp typeutils.Time `json:"timestamp"` 44 | Change []struct { 45 | Kind string `json:"kind"` 46 | Schema string `json:"schema"` 47 | Table string `json:"table"` 48 | Columnnames []string `json:"columnnames"` 49 | Columntypes []string `json:"columntypes"` 50 | Columnvalues []interface{} `json:"columnvalues"` 51 | Oldkeys struct { 52 | Keynames []string `json:"keynames"` 53 | Keytypes []string `json:"keytypes"` 54 | Keyvalues []interface{} `json:"keyvalues"` 55 | } `json:"oldkeys"` 56 | } `json:"change"` 57 | } 58 | -------------------------------------------------------------------------------- /drivers/mysql/internal/incremental.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | 8 | "github.com/datazip-inc/olake/constants" 9 | "github.com/datazip-inc/olake/drivers/abstract" 10 | "github.com/datazip-inc/olake/pkg/jdbc" 11 | "github.com/datazip-inc/olake/types" 12 | ) 13 | 14 | func (m *MySQL) StreamIncrementalChanges(ctx context.Context, stream types.StreamInterface, processFn abstract.BackfillMsgFn) error { 15 | opts := jdbc.DriverOptions{ 16 | Driver: constants.MySQL, 17 | Stream: stream, 18 | State: m.state, 19 | } 20 | incrementalQuery, queryArgs, err := jdbc.BuildIncrementalQuery(ctx, opts) 21 | if err != nil { 22 | return fmt.Errorf("failed to build incremental condition: %s", err) 23 | } 24 | 25 | var rows *sql.Rows 26 | rows, err = m.client.QueryContext(ctx, incrementalQuery, queryArgs...) 
27 | if err != nil { 28 | return fmt.Errorf("failed to execute incremental query: %s", err) 29 | } 30 | defer rows.Close() 31 | 32 | // Scan rows and process 33 | for rows.Next() { 34 | record := make(types.Record) 35 | if err := jdbc.MapScan(rows, record, m.dataTypeConverter); err != nil { 36 | return fmt.Errorf("failed to scan record: %s", err) 37 | } 38 | 39 | if err := processFn(ctx, record); err != nil { 40 | return fmt.Errorf("process error: %s", err) 41 | } 42 | } 43 | 44 | return rows.Err() 45 | } 46 | 47 | func (m *MySQL) FetchMaxCursorValues(ctx context.Context, stream types.StreamInterface) (any, any, error) { 48 | maxPrimaryCursorValue, maxSecondaryCursorValue, err := jdbc.GetMaxCursorValues(ctx, m.client, constants.MySQL, stream) 49 | if err != nil { 50 | return nil, nil, err 51 | } 52 | return maxPrimaryCursorValue, maxSecondaryCursorValue, nil 53 | } 54 | -------------------------------------------------------------------------------- /drivers/oracle/internal/incremental.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/datazip-inc/olake/constants" 8 | "github.com/datazip-inc/olake/drivers/abstract" 9 | "github.com/datazip-inc/olake/pkg/jdbc" 10 | "github.com/datazip-inc/olake/types" 11 | ) 12 | 13 | // StreamIncrementalChanges implements incremental sync for Oracle 14 | func (o *Oracle) StreamIncrementalChanges(ctx context.Context, stream types.StreamInterface, processFn abstract.BackfillMsgFn) error { 15 | opts := jdbc.DriverOptions{ 16 | Driver: constants.Oracle, 17 | Stream: stream, 18 | State: o.state, 19 | Client: o.client, 20 | } 21 | incrementalQuery, queryArgs, err := jdbc.BuildIncrementalQuery(ctx, opts) 22 | if err != nil { 23 | return fmt.Errorf("failed to build incremental condition: %s", err) 24 | } 25 | 26 | rows, err := o.client.QueryContext(ctx, incrementalQuery, queryArgs...) 27 | if err != nil { 28 | return fmt.Errorf("failed to execute incremental query: %s", err) 29 | } 30 | defer rows.Close() 31 | 32 | for rows.Next() { 33 | record := make(types.Record) 34 | if err := jdbc.MapScan(rows, record, o.dataTypeConverter); err != nil { 35 | return fmt.Errorf("failed to scan record: %s", err) 36 | } 37 | 38 | if err := processFn(ctx, record); err != nil { 39 | return fmt.Errorf("process error: %s", err) 40 | } 41 | } 42 | return rows.Err() 43 | } 44 | 45 | func (o *Oracle) FetchMaxCursorValues(ctx context.Context, stream types.StreamInterface) (any, any, error) { 46 | maxPrimaryCursorValue, maxSecondaryCursorValue, err := jdbc.GetMaxCursorValues(ctx, o.client, constants.Oracle, stream) 47 | if err != nil { 48 | return nil, nil, err 49 | } 50 | return maxPrimaryCursorValue, maxSecondaryCursorValue, nil 51 | } 52 | -------------------------------------------------------------------------------- /destination/iceberg/olake-iceberg-java-writer/Olake-changes-notice.txt: -------------------------------------------------------------------------------- 1 | NOTICE OF MODIFICATIONS TO DEBEZIUM-SERVER-ICEBERG 2 | 3 | This file documents modifications made by Datazip Inc. to the original debezium-server-iceberg project. 
4 | 5 | Original Project: 6 | Name: debezium-server-iceberg 7 | Repository: https://github.com/memiiso/debezium-server-iceberg/ 8 | License: Apache License, Version 2.0 9 | 10 | Modified Project: 11 | Name: debezium-server-iceberg (Olake fork) 12 | Repository: https://github.com/datazip-inc/debezium-server-iceberg 13 | License: Apache License, Version 2.0 14 | 15 | Modifications: 16 | The modifications made to the original project are documented in the pull request: 17 | https://github.com/datazip-inc/debezium-server-iceberg/pull/1 18 | 19 | These changes include but are not limited to: 20 | - Customizations to adapt the project for use within the Olake platform 21 | - Performance optimizations and feature enhancements 22 | - Compatibility adjustments for integration with Olake's infrastructure 23 | 24 | Future Modifications: 25 | All future modifications to this project will be tracked in the commit history of the repository. 26 | Detailed information about changes can be found by examining commit logs, pull requests, 27 | and release notes at https://github.com/datazip-inc/olake 28 | 29 | This notice is provided in compliance with Section 4(b) of the Apache License, Version 2.0, 30 | which requires that any modified files carry prominent notices stating that the files have been changed. 31 | 32 | The full text of the Apache License, Version 2.0 can be found at: 33 | http://www.apache.org/licenses/LICENSE-2.0 34 | 35 | We have also changed the name of the project to olake-iceberg-java-writer 36 | 37 | Copyright 2025 Datazip Inc. 38 | -------------------------------------------------------------------------------- /drivers/kafka/internal/config.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/datazip-inc/olake/constants" 7 | "github.com/datazip-inc/olake/utils" 8 | ) 9 | 10 | type Config struct { 11 | BootstrapServers string `json:"bootstrap_servers"` 12 | ConsumerGroupID string `json:"consumer_group_id,omitempty"` 13 | Protocol ProtocolConfig `json:"protocol"` 14 | MaxThreads int `json:"max_threads"` 15 | RetryCount int `json:"backoff_retry_count"` 16 | ThreadsEqualTotalPartitions bool `json:"threads_equal_total_partitions,omitempty"` 17 | } 18 | 19 | type ProtocolConfig struct { 20 | SecurityProtocol string `json:"security_protocol"` 21 | SASLMechanism string `json:"sasl_mechanism,omitempty"` 22 | SASLJAASConfig string `json:"sasl_jaas_config,omitempty"` 23 | } 24 | 25 | func (c *Config) Validate() error { 26 | if c.BootstrapServers == "" { 27 | return fmt.Errorf("bootstrap_servers is required") 28 | } 29 | 30 | if c.Protocol.SecurityProtocol == "" { 31 | return fmt.Errorf("security_protocol must be either PLAINTEXT or SASL_PLAINTEXT or SASL_SSL") 32 | } 33 | 34 | if c.Protocol.SecurityProtocol == "SASL_PLAINTEXT" || c.Protocol.SecurityProtocol == "SASL_SSL" { 35 | if c.Protocol.SASLMechanism == "" { 36 | return fmt.Errorf("sasl_mechanism must be either PLAIN or SCRAM-SHA-512") 37 | } 38 | if c.Protocol.SASLJAASConfig == "" { 39 | return fmt.Errorf("sasl_jaas_config must be provided") 40 | } 41 | } 42 | 43 | if c.MaxThreads <= 0 { 44 | c.MaxThreads = constants.DefaultThreadCount 45 | } 46 | 47 | if c.RetryCount <= 0 { 48 | c.RetryCount = constants.DefaultRetryCount 49 | } 50 | 51 | return utils.Validate(c) 52 | } 53 | -------------------------------------------------------------------------------- /drivers/postgres/docker-compose.yml: 
-------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | postgres: 5 | image: postgres:15 6 | container_name: olake_postgres-test 7 | restart: unless-stopped 8 | environment: 9 | POSTGRES_USER: postgres 10 | POSTGRES_PASSWORD: secret1234 11 | POSTGRES_DB: postgres 12 | ports: 13 | - "5433:5432" 14 | volumes: 15 | - ./pg_hba.conf:/etc/postgresql/pg_hba.conf 16 | command: 17 | - bash 18 | - -c 19 | - | 20 | # Install wal2json 21 | apt-get update && apt-get install -y postgresql-15-wal2json 22 | 23 | # Start PostgreSQL with basic config in background 24 | docker-entrypoint.sh postgres \ 25 | -c hba_file=/etc/postgresql/pg_hba.conf \ 26 | -c listen_addresses='*' \ 27 | -c wal_level=logical \ 28 | -c max_wal_senders=10 \ 29 | -c max_replication_slots=10 \ 30 | -c shared_preload_libraries=wal2json & 31 | 32 | # Wait for PostgreSQL to be ready 33 | while ! pg_isready -U postgres -h localhost -p 5432; do 34 | sleep 1 35 | done 36 | 37 | # Create replication slot if it doesn't exist 38 | psql -v ON_ERROR_STOP=1 -U postgres < 40 | [CLA]: https://docs.google.com/forms/d/e/1FAIpQLSdze2q6gn81fmbIp2bW5cIpAXcpv7Y5OQjQyXflNvoYWiO4OQ/viewform 41 | -------------------------------------------------------------------------------- /.github/workflows/build-and-release-driver.yml: -------------------------------------------------------------------------------- 1 | name: Olake Driver Build And Release 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | driver: 6 | description: "Driver to build" 7 | required: true 8 | version: 9 | description: "Version to release" 10 | required: true 11 | workflow_call: 12 | inputs: 13 | driver: 14 | description: "Driver to build" 15 | required: true 16 | type: string 17 | default: "" 18 | version: 19 | description: "Version to release" 20 | required: true 21 | type: string 22 | default: "" 23 | 24 | jobs: 25 | publish_drivers: 26 | name: Publish driver ${{ github.event.inputs.driver }} [manual] 27 | environment: Publish Driver 28 | runs-on: ubuntu-latest 29 | env: 30 | DOCKER_LOGIN: ${{ secrets.DOCKER_USERNAME }} 31 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 32 | DHID: ${{ secrets.DOCKER_REPO || 'olakego' }} 33 | DRIVER: ${{ inputs.driver || github.event.inputs.driver || 'mongodb' }} 34 | VERSION: ${{ inputs.version || github.event.inputs.version || 'v0.0.0.dev' }} 35 | steps: 36 | - name: Checkout code 37 | uses: actions/checkout@v3 38 | 39 | - name: Set up JDK 17 40 | uses: actions/setup-java@v3 41 | with: 42 | java-version: "17" 43 | distribution: "temurin" 44 | cache: maven 45 | 46 | - name: Set Driver 47 | run: echo "DRIVER=${{ env.DRIVER }}" >> $GITHUB_ENV 48 | - name: Set VERSION 49 | run: echo "VERSION=${{ env.VERSION }}" >> $GITHUB_ENV 50 | - name: Show VERSION 51 | run: echo "Building driver $DRIVER with version $VERSION" 52 | - name: Run Release tool 53 | run: | 54 | chmod +x ./release-tool.sh 55 | ./release-tool.sh 56 | shell: bash -------------------------------------------------------------------------------- /utils/validation.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "errors" 5 | "reflect" 6 | "strings" 7 | 8 | "github.com/go-playground/locales/en" 9 | ut "github.com/go-playground/universal-translator" 10 | "github.com/go-playground/validator/v10" 11 | en_translations "github.com/go-playground/validator/v10/translations/en" 12 | ) 13 | 14 | // use a single instance, it caches struct info 15 | var ( 16 | uni 
*ut.UniversalTranslator 17 | validate *validator.Validate 18 | trans ut.Translator 19 | ) 20 | 21 | func translateError(err error) (errs []string) { 22 | trans, _ := uni.GetTranslator("en") 23 | 24 | if err == nil { 25 | return nil 26 | } 27 | validatorErrs := err.(validator.ValidationErrors) 28 | for _, e := range validatorErrs { 29 | translatedErr := errors.New(e.Translate(trans)) 30 | errs = append(errs, translatedErr.Error()) 31 | } 32 | 33 | return errs 34 | } 35 | 36 | func Validate[T any](structure T) error { 37 | if err := validate.Struct(structure); err != nil { 38 | return errors.New(strings.Join(translateError(err), "; ")) 39 | } 40 | 41 | return nil 42 | } 43 | 44 | func init() { 45 | // NOTE: omitting a lot of error checking for brevity 46 | en := en.New() 47 | uni = ut.New(en, en) 48 | trans, _ = uni.GetTranslator("en") 49 | 50 | validate = validator.New(validator.WithRequiredStructEnabled()) 51 | validate.RegisterTagNameFunc(func(fld reflect.StructField) string { 52 | name := strings.SplitN(fld.Tag.Get("json"), ",", 2)[0] 53 | if name == "" { 54 | name = strings.SplitN(fld.Tag.Get("yaml"), ",", 2)[0] 55 | } 56 | 57 | fieldName := fld.Name 58 | if name == "-" { 59 | return fieldName 60 | } 61 | 62 | if name != "" { 63 | return name 64 | } 65 | 66 | return fieldName 67 | }) 68 | 69 | err := en_translations.RegisterDefaultTranslations(validate, trans) 70 | if err != nil { 71 | panic(err) 72 | } 73 | } 74 | --------------------------------------------------------------------------------
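A minimal usage sketch of `Validate` (the struct here is hypothetical; the json tag name is what appears in the translated error, per the `RegisterTagNameFunc` above):

```go
package main

import (
	"fmt"

	"github.com/datazip-inc/olake/utils"
)

// DemoConfig is a hypothetical struct used only for illustration.
type DemoConfig struct {
	Host string `json:"host" validate:"required"`
}

func main() {
	// Host is empty, so the required rule fails and the error is
	// translated using the json tag name.
	if err := utils.Validate(DemoConfig{}); err != nil {
		fmt.Println(err) // e.g. "host is a required field"
	}
}
```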
/drivers/mongodb/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | init-keyfile: 3 | image: mongo:8.0 4 | container_name: init_keyfile 5 | command: > 6 | sh -c " 7 | mkdir -p /etc/mongodb/pki && 8 | if [ ! -f /etc/mongodb/pki/keyfile ]; then 9 | echo 'Generating keyfile...'; 10 | openssl rand -base64 756 > /etc/mongodb/pki/keyfile && 11 | chmod 400 /etc/mongodb/pki/keyfile; 12 | else 13 | echo 'Keyfile already exists.'; 14 | fi 15 | " 16 | volumes: 17 | - mongo-keyfile-vol:/etc/mongodb/pki # Mount the volume that stores the keyfile. 18 | networks: 19 | - mongo-cluster 20 | restart: "on-failure" 21 | 22 | # Primary MongoDB container that sets up a replica set and creates an admin user. 23 | primary_mongo: 24 | container_name: primary_mongo 25 | image: mongo:8.0 26 | hostname: primary_mongo 27 | extra_hosts: 28 | - "host.docker.internal:host-gateway" 29 | ports: 30 | - "27017:27017" 31 | depends_on: 32 | - init-keyfile 33 | volumes: 34 | - mongo-keyfile-vol:/etc/mongodb/pki 35 | - ./mongodb-init.sh:/mongodb-init.sh:ro 36 | command: ["bash", "/mongodb-init.sh"] 37 | healthcheck: 38 | test: ["CMD", "mongosh", "--port", "27017", "--eval", "db.adminCommand('ping')"] 39 | interval: 10s 40 | timeout: 10s 41 | retries: 10 42 | networks: 43 | - mongo-cluster 44 | 45 | networks: 46 | mongo-cluster: 47 | 48 | volumes: 49 | mongo-keyfile-vol: -------------------------------------------------------------------------------- /drivers/abstract/interface.go: -------------------------------------------------------------------------------- 1 | package abstract 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/datazip-inc/olake/destination" 7 | "github.com/datazip-inc/olake/types" 8 | ) 9 | 10 | type BackfillMsgFn func(ctx context.Context, message map[string]any) error 11 | type CDCMsgFn func(ctx context.Context, message CDCChange) error 12 | 13 | type Config interface { 14 | Validate() error 15 | } 16 | 17 | type DriverInterface interface { 18 | GetConfigRef() Config 19 | Spec() any 20 | Type() string 21 | // specific to test & setup 22 | Setup(ctx context.Context) error 23 | SetupState(state *types.State) 24 | // sync artifacts 25 | MaxConnections() int 26 | MaxRetries() int 27 | // specific to discover 28 | GetStreamNames(ctx context.Context) ([]string, error) 29 | ProduceSchema(ctx context.Context, stream string) (*types.Stream, error) 30 | // specific to backfill 31 | GetOrSplitChunks(ctx context.Context, pool *destination.WriterPool, stream types.StreamInterface) (*types.Set[types.Chunk], error) 32 | ChunkIterator(ctx context.Context, stream types.StreamInterface, chunk types.Chunk, processFn BackfillMsgFn) error 33 | // incremental specific 34 | FetchMaxCursorValues(ctx context.Context, stream types.StreamInterface) (any, any, error) 35 | StreamIncrementalChanges(ctx context.Context, stream types.StreamInterface, cb BackfillMsgFn) error 36 | // specific to cdc 37 | CDCSupported() bool 38 | PreCDC(ctx context.Context, streams []types.StreamInterface) error // to init state 39 | StreamChanges(ctx context.Context, stream types.StreamInterface, processFn CDCMsgFn) error 40 | PostCDC(ctx context.Context, stream types.StreamInterface, success bool, readerID string) error // to save state 41 | } 42 | 43 | type KafkaInterface interface { 44 | DriverInterface 45 | GetReaderIDs() []string 46 | PartitionStreamChanges(ctx context.Context, readerID string, processFn CDCMsgFn) error 47 | } 48 | -------------------------------------------------------------------------------- /.github/workflows/draft-release-and-changelog.yml: -------------------------------------------------------------------------------- 1 | name: Draft Releaser From Master 2 | 3 | on: 4 | pull_request: 5 | types: [closed] 6 | branches: 7 | - master 8 | 9 | jobs: 10 | create_draft_release: 11 | if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'master' && github.event.pull_request.head.ref == 'staging' 12 | name: Create Draft Release 13 | runs-on: ubuntu-latest 14 | permissions: 15 | contents: write 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Get latest release 23 | id: latest-release 24 | run: | 25 | latest_tag=$(git tag -l | 
sort -V | tail -1) 26 | echo "LATEST_TAG=$latest_tag" >> $GITHUB_ENV 27 | 28 | - name: Generate next version 29 | id: generate-next-version 30 | run: | 31 | if [ -z "$LATEST_TAG" ]; then 32 | next_version="v0.0.0" 33 | else 34 | # Remove 'v' prefix and split version into array 35 | version=${LATEST_TAG#v} 36 | IFS='.' read -ra VERSION_PARTS <<< "$version" 37 | 38 | # Increment the last number (patch version) 39 | VERSION_PARTS[2]=$((VERSION_PARTS[2] + 1)) 40 | 41 | # Reconstruct the version with 'v' prefix 42 | next_version="v${VERSION_PARTS[0]}.${VERSION_PARTS[1]}.${VERSION_PARTS[2]}" 43 | fi 44 | echo "NEXT_VERSION=$next_version" >> $GITHUB_ENV 45 | 46 | - name: Create draft release 47 | id: create-draft-release 48 | uses: ncipollo/release-action@v1 49 | with: 50 | tag: ${{ env.NEXT_VERSION }} 51 | generateReleaseNotes: true 52 | draft: true 53 | env: 54 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 55 | -------------------------------------------------------------------------------- /constants/constants.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | const ( 8 | DefaultRetryCount = 3 9 | DefaultThreadCount = 3 10 | DefaultDiscoverTimeout = 5 * time.Minute 11 | DefaultRetryTimeout = 60 * time.Second 12 | DestError = "destination error" 13 | ParquetFileExt = "parquet" 14 | PartitionRegexIceberg = `\{([^,]+),\s*([^}]+)\}` 15 | PartitionRegexParquet = `\{([^}]+)\}` 16 | MongoPrimaryID = "_id" 17 | OlakeID = "_olake_id" 18 | OlakeTimestamp = "_olake_timestamp" 19 | OpType = "_op_type" 20 | CdcTimestamp = "_cdc_timestamp" 21 | DBName = "_db" 22 | StringifiedData = "data" 23 | DefaultReadPreference = "secondaryPreferred" 24 | EncryptionKey = "OLAKE_ENCRYPTION_KEY" 25 | ConfigFolder = "CONFIG_FOLDER" 26 | StatePath = "STATE_PATH" 27 | StreamsPath = "STREAMS_PATH" 28 | DifferencePath = "DIFFERENCE_STREAMS_PATH" 29 | LSNNotUpdatedError = "LSN not updated after 5 minutes" 30 | NoRecordsFoundError = "no records found in given initial wait time" 31 | // DestinationDatabasePrefix is used as a prefix for the destination database name 32 | DestinationDatabasePrefix = "DESTINATION_DATABASE_PREFIX" 33 | // EffectiveParquetSize is the effective in-memory size in bytes for a 256 MB target parquet file, assuming a compression ratio of 8 (256 MiB × 8) 34 | EffectiveParquetSize = int64(256) * 1024 * 1024 * int64(8) 35 | ) 36 | 37 | type DriverType string 38 | 39 | const ( 40 | MongoDB DriverType = "mongodb" 41 | Postgres DriverType = "postgres" 42 | MySQL DriverType = "mysql" 43 | Oracle DriverType = "oracle" 44 | Kafka DriverType = "kafka" 45 | ) 46 | 47 | var RelationalDrivers = []DriverType{Postgres, MySQL, Oracle} 48 | 49 | var NonRetryableErrors = []string{DestError, "context canceled", NoRecordsFoundError, LSNNotUpdatedError, "lsn mismatch"} 50 | -------------------------------------------------------------------------------- /destination/parquet/resources/spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "type": { 5 | "type": "string", 6 | "const": "PARQUET" 7 | }, 8 | "writer": { 9 | "type": "object", 10 | "properties": { 11 | "s3_access_key": { 12 | "type": "string", 13 | "title": "AWS Access Key", 14 | "description": "The AWS access key for authenticating S3 requests, typically a 20 character alphanumeric string", 15 | "format": "password" 16 | }, 17 | "s3_bucket": { 18 | "type": "string", 19 | "title": "S3 Bucket", 20 | "description": "The 
name of an existing Amazon S3 bucket with appropriate access permissions to store output files", 21 | "minLength": 1 22 | }, 23 | "s3_path": { 24 | "type": "string", 25 | "title": "S3 Path", 26 | "description": "Specify the S3 bucket path (prefix) where data files will be written, typically starting with a '/' (e.g., '/data')", 27 | "minLength": 1 28 | }, 29 | "s3_region": { 30 | "type": "string", 31 | "title": "S3 Region", 32 | "description": "Specify the AWS region where the S3 bucket is hosted", 33 | "minLength": 1 34 | }, 35 | "s3_secret_key": { 36 | "type": "string", 37 | "title": "AWS Secret Key", 38 | "description": "The AWS secret key for S3 authentication—typically 40+ characters long", 39 | "format": "password" 40 | }, 41 | "s3_endpoint": { 42 | "type": "string", 43 | "title": "S3 Endpoint", 44 | "description": "Specifies the endpoint URL for S3 compatible services (e.g., MinIO)" 45 | } 46 | }, 47 | "required": [ 48 | "s3_bucket", 49 | "s3_region", 50 | "s3_path" 51 | ] 52 | } 53 | }, 54 | "required": [ 55 | "writer" 56 | ] 57 | } -------------------------------------------------------------------------------- /protocol/check.go: -------------------------------------------------------------------------------- 1 | package protocol 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/datazip-inc/olake/destination" 7 | "github.com/datazip-inc/olake/types" 8 | "github.com/datazip-inc/olake/utils" 9 | "github.com/datazip-inc/olake/utils/logger" 10 | "github.com/spf13/cobra" 11 | ) 12 | 13 | // checkCmd represents the check command 14 | var checkCmd = &cobra.Command{ 15 | Use: "check", 16 | Short: "check command", 17 | PreRunE: func(_ *cobra.Command, _ []string) error { 18 | // If connector is not set, we are checking the destination 19 | if destinationConfigPath == "not-set" && configPath == "not-set" { 20 | return fmt.Errorf("no connector config or destination config provided") 21 | } 22 | 23 | // check for destination config 24 | if destinationConfigPath != "not-set" { 25 | destinationConfig = &types.WriterConfig{} 26 | return utils.UnmarshalFile(destinationConfigPath, destinationConfig, true) 27 | } 28 | 29 | // check for source config 30 | if configPath != "not-set" { 31 | return utils.UnmarshalFile(configPath, connector.GetConfigRef(), true) 32 | } 33 | 34 | return nil 35 | }, 36 | Run: func(cmd *cobra.Command, _ []string) { 37 | err := func() error { 38 | // If connector is not set, we are checking the destination 39 | if destinationConfigPath != "not-set" { 40 | _, err := destination.NewWriterPool(cmd.Context(), destinationConfig, nil, batchSize) 41 | return err 42 | } 43 | 44 | if configPath != "not-set" { 45 | return connector.Setup(cmd.Context()) 46 | } 47 | 48 | return nil 49 | }() 50 | 51 | // log success 52 | message := types.Message{ 53 | Type: types.ConnectionStatusMessage, 54 | ConnectionStatus: &types.StatusRow{ 55 | Status: types.ConnectionSucceed, 56 | }, 57 | } 58 | if err != nil { 59 | message.ConnectionStatus.Message = err.Error() 60 | message.ConnectionStatus.Status = types.ConnectionFailed 61 | } 62 | logger.Info(message) 63 | }, 64 | } 65 | -------------------------------------------------------------------------------- /utils/typeutils/compare.go: -------------------------------------------------------------------------------- 1 | package typeutils 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "strings" 7 | "time" 8 | ) 9 | 10 | // return 0 for equal, -1 if a < b else 1 if a>b 11 | func Compare(a, b any) int { 12 | // Handle nil cases first 13 | if a == nil && b == 
nil { 14 | return 0 15 | } 16 | if a == nil { 17 | return -1 18 | } 19 | if b == nil { 20 | return 1 21 | } 22 | 23 | switch aVal := a.(type) { 24 | case uint, uint8, uint16, uint32, uint64: 25 | aUint := reflect.ValueOf(a).Convert(reflect.TypeOf(uint64(0))).Uint() 26 | bUint := reflect.ValueOf(b).Convert(reflect.TypeOf(uint64(0))).Uint() 27 | if aUint < bUint { 28 | return -1 29 | } else if aUint > bUint { 30 | return 1 31 | } 32 | return 0 33 | case int, int8, int16, int32, int64: 34 | aInt := reflect.ValueOf(a).Convert(reflect.TypeOf(int64(0))).Int() 35 | bInt := reflect.ValueOf(b).Convert(reflect.TypeOf(int64(0))).Int() 36 | if aInt < bInt { 37 | return -1 38 | } else if aInt > bInt { 39 | return 1 40 | } 41 | return 0 42 | case float32, float64: 43 | aFloat := reflect.ValueOf(a).Convert(reflect.TypeOf(float64(0))).Float() 44 | bFloat := reflect.ValueOf(b).Convert(reflect.TypeOf(float64(0))).Float() 45 | if aFloat < bFloat { 46 | return -1 47 | } else if aFloat > bFloat { 48 | return 1 49 | } 50 | return 0 51 | case time.Time: 52 | bTime := b.(time.Time) 53 | if aVal.Before(bTime) { 54 | return -1 55 | } else if aVal.After(bTime) { 56 | return 1 57 | } 58 | return 0 59 | case bool: 60 | bBool := b.(bool) 61 | // false < true 62 | if !aVal && bBool { 63 | return -1 64 | } else if aVal && !bBool { 65 | return 1 66 | } 67 | return 0 68 | default: 69 | // check for custom timestamp 70 | aTime, aOk := a.(Time) 71 | bTime, bOk := b.(Time) 72 | 73 | if aOk && bOk { 74 | return aTime.Compare(bTime) 75 | } 76 | // For any other types, convert to string for comparison 77 | return strings.Compare(fmt.Sprintf("%v", a), fmt.Sprintf("%v", b)) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /utils/typeutils/flatten.go: -------------------------------------------------------------------------------- 1 | package typeutils 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "time" 7 | 8 | "github.com/goccy/go-json" 9 | 10 | "github.com/datazip-inc/olake/types" 11 | "github.com/datazip-inc/olake/utils" 12 | ) 13 | 14 | type Flattener interface { 15 | Flatten(json types.Record) (types.Record, error) 16 | } 17 | 18 | type FlattenerImpl struct { 19 | omitNilValues bool 20 | } 21 | 22 | func NewFlattener() Flattener { 23 | return &FlattenerImpl{ 24 | omitNilValues: true, 25 | } 26 | } 27 | 28 | func (f *FlattenerImpl) Flatten(json types.Record) (types.Record, error) { 29 | destination := make(types.Record) 30 | 31 | for key, value := range json { 32 | err := f.flatten(key, value, destination) 33 | if err != nil { 34 | return nil, err 35 | } 36 | } 37 | 38 | return destination, nil 39 | } 40 | 41 | // Reformat key 42 | func (f *FlattenerImpl) flatten(key string, value any, destination types.Record) error { 43 | key = utils.Reformat(key) 44 | t := reflect.ValueOf(value) 45 | switch t.Kind() { 46 | case reflect.Slice: // Stringify arrays 47 | b, err := json.Marshal(value) 48 | if err != nil { 49 | return fmt.Errorf("error marshaling array with key %s: %v", key, err) 50 | } 51 | destination[key] = string(b) 52 | case reflect.Map: // Stringify nested maps 53 | b, err := json.Marshal(value) 54 | if err != nil { 55 | return fmt.Errorf("error marshaling map with key[%s] and value %v: %v", key, value, err) 56 | } 57 | destination[key] = string(b) 58 | case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, 59 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, 60 | reflect.Float32, reflect.Float64, 
reflect.String: 61 | destination[key] = value 62 | default: 63 | if !f.omitNilValues || value != nil { 64 | // Handle time.Time values 65 | if tm, ok := value.(time.Time); ok { 66 | destination[key] = tm 67 | } else { 68 | destination[key] = fmt.Sprint(value) 69 | } 70 | } 71 | } 72 | 73 | return nil 74 | } 75 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Olake Examples 2 | 3 | This directory contains self-contained, end-to-end demo stacks for OLake. Each example is a complete combination of source database, storage, catalog, and query engine that runs alongside the [base OLake stack](https://raw.githubusercontent.com/datazip-inc/olake-ui/refs/heads/master/docker-compose.yml). 4 | 5 | ## How it works 6 | 7 | - First run the base Olake stack (OLake UI, Temporal worker, Temporal service, and dependencies). 8 | - Then start one example from this directory. 9 | 10 | ## Available examples 11 | 12 | - `presto-tabularest-minio-mysql` 13 | - MySQL → Olake → Iceberg (Tabular REST) on MinIO → Presto 14 | 15 | - `trino-tablurarest-minio-mysql` 16 | - MySQL → Olake → Iceberg (Tabular REST) on MinIO → Trino 17 | 18 | ## Quick start pattern 19 | 20 | ```bash 21 | # 1) Start base Olake stack 22 | curl -sSL https://raw.githubusercontent.com/datazip-inc/olake-ui/master/docker-compose.yml | docker compose -f - up -d 23 | 24 | # 2) Clone the repository and navigate to root directory 25 | git clone https://github.com/datazip-inc/olake.git 26 | cd olake 27 | 28 | # 3) Start an example 29 | cd examples/presto-tabularest-minio-mysql 30 | docker compose up -d 31 | 32 | # 4) Follow suggested steps in README.md for the example 33 | ``` 34 | 35 | Each example’s `README.md` includes: 36 | - Required ports and endpoints 37 | - Service access URLs 38 | - Pipeline setup steps in Olake 39 | - Sample queries and troubleshooting 40 | 41 | ## Contributing a new example 42 | 43 | - Naming: `()-()-()-()` 44 | - Example: `trino-lakekeeperest-minio-postgresql` 45 | - Include: 46 | - `docker-compose.yml` using the external network `olake-network` (a minimal sketch follows this list) 47 | - `README.md` with: 48 | - Prerequisite base-stack command 49 | - Port availability section (list host ports) 50 | - Step-by-step pipeline setup in Olake 51 | - Access URLs and sample queries 52 | - Prefer minimal images and clear, reproducible dataset bootstrapping. 53 | - Test the full flow end-to-end before submitting a PR.
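A minimal sketch of the shape such a `docker-compose.yml` could take (the service name and image are illustrative, not taken from this repository; `olake-network` is created by the base stack and only joined here):

```yaml
services:
  source-db:
    image: mysql:8.0            # illustrative source database
    environment:
      MYSQL_ROOT_PASSWORD: example
    networks:
      - olake-network

networks:
  olake-network:
    external: true              # joined, not created, by the example
```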
54 | -------------------------------------------------------------------------------- /drivers/mysql/internal/datatype_conversion.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import "github.com/datazip-inc/olake/types" 4 | 5 | // Define a mapping of MySQL data types to internal data types 6 | var mysqlTypeToDataTypes = map[string]types.DataType{ 7 | // Integer types 8 | "tinyint": types.Int32, 9 | "smallint": types.Int32, 10 | "mediumint": types.Int32, 11 | "int": types.Int32, 12 | "integer": types.Int32, 13 | "unsigned int": types.Int32, 14 | "unsigned smallint": types.Int32, 15 | "unsigned tinyint": types.Int32, 16 | "unsigned mediumint": types.Int32, 17 | "bit": types.Int32, 18 | "bigint": types.Int64, 19 | 20 | // Floating point types 21 | "float": types.Float32, 22 | "real": types.Float32, 23 | "decimal": types.Float32, 24 | "numeric": types.Float32, 25 | "double": types.Float64, 26 | 27 | // String types 28 | "char": types.String, 29 | "varchar": types.String, 30 | "tinytext": types.String, 31 | "text": types.String, 32 | "mediumtext": types.String, 33 | "longtext": types.String, 34 | 35 | // Binary types 36 | "binary": types.String, 37 | "varbinary": types.String, 38 | "tinyblob": types.String, 39 | "blob": types.String, 40 | "mediumblob": types.String, 41 | "longblob": types.String, 42 | 43 | // Date and time types 44 | "date": types.Timestamp, 45 | "timestamp": types.Timestamp, 46 | "datetime": types.Timestamp, 47 | "year": types.Int64, 48 | 49 | // time and datetime types treated as string for now 50 | "time": types.String, 51 | 52 | // JSON type 53 | "json": types.String, 54 | // Enum and Set types 55 | "enum": types.String, 56 | "set": types.String, 57 | 58 | // Geometry types 59 | "geometry": types.String, 60 | "point": types.String, 61 | "linestring": types.String, 62 | "polygon": types.String, 63 | "multipoint": types.String, 64 | "multilinestring": types.String, 65 | "multipolygon": types.String, 66 | "geometrycollection": types.String, 67 | } 68 | -------------------------------------------------------------------------------- /drivers/oracle/internal/datatype_conversion.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "database/sql" 5 | "strings" 6 | 7 | "github.com/datazip-inc/olake/types" 8 | ) 9 | 10 | // Oracle type mapping to our internal types 11 | var oracleTypeToDataTypes = map[string]types.DataType{ 12 | // Numeric types 13 | "int32": types.Int32, 14 | "int64": types.Int64, 15 | "ibfloat": types.Float32, 16 | "binary_float": types.Float32, 17 | "ibdouble": types.Float64, 18 | "number": types.Float64, 19 | "float": types.Float64, 20 | "binary_double": types.Float64, 21 | 22 | // String types 23 | "varchar2": types.String, 24 | "nvarchar2": types.String, 25 | "char": types.String, 26 | "nchar": types.String, 27 | "longvarchar": types.String, 28 | "clob": types.String, 29 | "nclob": types.String, 30 | "long": types.String, //LONG 31 | "raw": types.String, //RAW 32 | "longraw": types.String, //LONG RAW 33 | 34 | // Date/Time types 35 | "date": types.TimestampMicro, 36 | "timestampdty": types.TimestampMicro, 37 | "timestamptz_dty": types.TimestampMicro, 38 | "timestampltz_dty": types.TimestampMicro, 39 | 40 | // Interval types 41 | "intervalym_dty": types.String, 42 | "intervalds_dty": types.String, 43 | 44 | "xmltype": types.String, 45 | "blob": types.String, 46 | "bfile": types.String, 47 | } 48 | 49 | // 
reformatOracleDatatype removes extra information from type names for matching and returns the corresponding golang type 50 | func reformatOracleDatatype(dataType string, precision, scale sql.NullInt64) (types.DataType, bool) { 51 | switch { 52 | case strings.HasPrefix(dataType, "TIMESTAMP"): 53 | return types.TimestampMicro, true 54 | 55 | case strings.HasPrefix(dataType, "INTERVAL"): 56 | return types.String, true 57 | 58 | case strings.HasPrefix(dataType, "NUMBER"): 59 | if scale.Valid && scale.Int64 == 0 { 60 | if precision.Valid && precision.Int64 <= 9 { 61 | return types.Int32, true 62 | } 63 | return types.Int64, true 64 | } 65 | return types.Float64, true 66 | 67 | default: 68 | if val, found := oracleTypeToDataTypes[strings.ToLower(dataType)]; found { 69 | return val, true 70 | } 71 | // Treat unknown data types as strings 72 | return types.Unknown, false 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /drivers/mongodb/internal/config.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "net/url" 5 | "strings" 6 | 7 | "github.com/datazip-inc/olake/constants" 8 | "github.com/datazip-inc/olake/utils" 9 | "github.com/datazip-inc/olake/utils/logger" 10 | ) 11 | 12 | type Config struct { 13 | Hosts []string `json:"hosts"` 14 | Username string `json:"username"` 15 | Password string `json:"password"` 16 | AuthDB string `json:"authdb"` 17 | ReplicaSet string `json:"replica_set"` 18 | ReadPreference string `json:"read_preference"` 19 | Srv bool `json:"srv"` 20 | ServerRAM uint `json:"server_ram"` 21 | MaxThreads int `json:"max_threads"` 22 | Database string `json:"database"` 23 | RetryCount int `json:"backoff_retry_count"` 24 | ChunkingStrategy string `json:"chunking_strategy"` 25 | UseIAM bool `json:"use_iam"` 26 | } 27 | 28 | func (c *Config) URI() string { 29 | connectionPrefix := "mongodb" 30 | if c.Srv { 31 | connectionPrefix = "mongodb+srv" 32 | } 33 | 34 | if c.MaxThreads == 0 { 35 | // set default threads 36 | logger.Info("setting max threads to default[%d]", constants.DefaultThreadCount) 37 | c.MaxThreads = constants.DefaultThreadCount 38 | } 39 | 40 | // Build query parameters 41 | query := url.Values{} 42 | 43 | if c.UseIAM { 44 | query.Set("authSource", "$external") 45 | query.Set("authMechanism", "MONGODB-AWS") 46 | } else { 47 | query.Set("authSource", c.AuthDB) 48 | } 49 | 50 | if c.ReplicaSet != "" { 51 | query.Set("replicaSet", c.ReplicaSet) 52 | if c.ReadPreference == "" { 53 | c.ReadPreference = constants.DefaultReadPreference 54 | } 55 | query.Set("readPreference", c.ReadPreference) 56 | } 57 | 58 | host := strings.Join(c.Hosts, ",") 59 | 60 | // Construct final URI using url.URL 61 | u := &url.URL{ 62 | Scheme: connectionPrefix, 63 | Host: host, 64 | Path: "/", 65 | RawQuery: query.Encode(), 66 | } 67 | 68 | if !c.UseIAM { 69 | u.User = utils.Ternary(c.Password != "", url.UserPassword(c.Username, c.Password), url.User(c.Username)).(*url.Userinfo) 70 | } 71 | 72 | return u.String() 73 | } 74 | 75 | // TODO: Add go struct validation in Config 76 | func (c *Config) Validate() error { 77 | return utils.Validate(c) 78 | } 79 | --------------------------------------------------------------------------------
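A minimal sketch of what `Config.URI` produces (illustrative values; written as if inside this internal `driver` package, e.g. in a test file):

```go
package driver

import "fmt"

// ExampleConfigURI is an illustrative sketch; none of these values
// come from a real deployment.
func ExampleConfigURI() {
	cfg := &Config{
		Hosts:      []string{"localhost:27017"},
		Username:   "user",
		Password:   "pass",
		AuthDB:     "admin",
		ReplicaSet: "rs0",
		MaxThreads: 3,
	}
	fmt.Println(cfg.URI())
	// url.Values.Encode sorts keys, and readPreference falls back to
	// secondaryPreferred when a replica set is configured, so this prints:
	// mongodb://user:pass@localhost:27017/?authSource=admin&readPreference=secondaryPreferred&replicaSet=rs0
}
```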
/pkg/kafka/balancer.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/datazip-inc/olake/utils" 7 | "github.com/segmentio/kafka-go" 8 | ) 9 | 10 | // ProtocolName implements kafka.GroupBalancer interface 11 | func (b *CustomGroupBalancer) ProtocolName() string { 12 | return "olake-kafka-round-robin" 13 | } 14 | 15 | // UserData implements kafka.GroupBalancer interface 16 | func (b *CustomGroupBalancer) UserData() ([]byte, error) { 17 | return nil, nil 18 | } 19 | 20 | // AssignGroups implements kafka.GroupBalancer interface 21 | func (b *CustomGroupBalancer) AssignGroups(members []kafka.GroupMember, partitions []kafka.Partition) kafka.GroupMemberAssignments { 22 | assignments := make(kafka.GroupMemberAssignments) 23 | 24 | // number of consumers to use 25 | consumerIDCount := min(b.requiredConsumerIDs, len(members)) 26 | 27 | // active partitions with data in partition index 28 | activePartitions := make([]kafka.Partition, 0, len(partitions)) 29 | err := utils.ForEach(partitions, func(partition kafka.Partition) error { 30 | if _, exists := b.partitionIndex[fmt.Sprintf("%s:%d", partition.Topic, partition.ID)]; exists { 31 | activePartitions = append(activePartitions, partition) 32 | } 33 | return nil 34 | }) 35 | if err != nil { 36 | return assignments 37 | } 38 | 39 | // Assign partitions to consumers in round-robin 40 | for idx, partition := range activePartitions { 41 | consumerIndex := idx % consumerIDCount 42 | memberID := members[consumerIndex].ID 43 | if assignments[memberID] == nil { 44 | assignments[memberID] = make(map[string][]int) 45 | } 46 | assignments[memberID][partition.Topic] = append(assignments[memberID][partition.Topic], partition.ID) 47 | } 48 | 49 | return assignments 50 | } 51 | 52 | // custom balancer example: 53 | // | max_threads | total partitions | reader-IDs per stream (distinct) | reused? | 54 | // | ------------ | ---------------- | -------------------------------- | ------- | 55 | // | 6 | 6 (3+3) | 3 + 3 | no | 56 | // | 5 | 6 | 3 + 2 | 1 ID | 57 | // | 4 | 6 | 2 + 2 | 2 IDs | 58 | // | 3 | 6 | 2 + 1 | 3 IDs | 59 | // | 2 | 6 | 1 + 1 | 4 IDs | 60 | // | 1 | 6 | 1 + 1 | 5 IDs | 61 | -------------------------------------------------------------------------------- /utils/safego/safego.go: -------------------------------------------------------------------------------- 1 | package safego 2 | 3 | import ( 4 | "os" 5 | "runtime/debug" 6 | "strings" 7 | "time" 8 | 9 | "github.com/datazip-inc/olake/utils/logger" 10 | ) 11 | 12 | const defaultRestartTimeout = 2 * time.Second 13 | 14 | type RecoverHandler func(value interface{}) 15 | 16 | var GlobalRecoverHandler RecoverHandler = func(_ interface{}) {} 17 | 18 | var ( 19 | startTime time.Time 20 | ) 21 | 22 | type Execution struct { 23 | f func() 24 | recoverHandler RecoverHandler 25 | restartTimeout time.Duration 26 | } 27 | 28 | // Run runs a new goroutine and adds a panic handler (without restart) 29 | func Run(f func()) *Execution { 30 | exec := Execution{ 31 | f: f, 32 | recoverHandler: GlobalRecoverHandler, 33 | restartTimeout: 0, 34 | } 35 | return exec.run() 36 | } 37 | 38 | // RunWithRestart runs a new goroutine and adds a panic handler: 39 | // it writes logs, waits 2 seconds and restarts the goroutine 40 | func RunWithRestart(f func()) *Execution { 41 | exec := Execution{ 42 | f: f, 43 | recoverHandler: GlobalRecoverHandler, 44 | restartTimeout: defaultRestartTimeout, 45 | } 46 | return exec.run() 47 | } 48 | 49 | func (exec *Execution) run() *Execution { 50 | go func() { 51 | defer func() { 52 | if r := recover(); r != nil { 53 | exec.recoverHandler(r) 54 | 55 | if exec.restartTimeout > 0 { 56 | time.Sleep(exec.restartTimeout) 57 | exec.run() 58 | } 59 | } 60 | }() 61 | exec.f() 
62 | }() 63 | return exec 64 | } 65 | 66 | func (exec *Execution) WithRestartTimeout(timeout time.Duration) *Execution { 67 | exec.restartTimeout = timeout 68 | return exec 69 | } 70 | 71 | func Recovery(exit bool) { 72 | err := recover() 73 | if err != nil { 74 | logger.Error(err) 75 | // capture stack trace 76 | for _, str := range strings.Split(string(debug.Stack()), "\n") { 77 | logger.Error(strings.ReplaceAll(str, "\t", "")) 78 | } 79 | } 80 | if exit { 81 | logger.Infof("Time of execution %v", time.Since(startTime).String()) 82 | os.Exit(1) 83 | } 84 | } 85 | 86 | func Insert[T any](ch chan<- T, value T) bool { 87 | safeInsert := false 88 | func() { 89 | defer Recovery(false) 90 | ch <- value 91 | safeInsert = true 92 | }() 93 | 94 | return safeInsert 95 | } 96 | 97 | func Close[T any](ch chan T) { 98 | Run(func() { 99 | close(ch) 100 | }) 101 | } 102 | 103 | func init() { 104 | startTime = time.Now() 105 | } 106 | -------------------------------------------------------------------------------- /destination/iceberg/olake-iceberg-java-writer/src/main/java/io/debezium/server/iceberg/tableoperator/PartitionedDeltaWriter.java: -------------------------------------------------------------------------------- 1 | package io.debezium.server.iceberg.tableoperator; 2 | 3 | import java.io.IOException; 4 | import java.io.UncheckedIOException; 5 | import java.util.Map; 6 | import java.util.Set; 7 | 8 | import org.apache.iceberg.FileFormat; 9 | import org.apache.iceberg.PartitionKey; 10 | import org.apache.iceberg.PartitionSpec; 11 | import org.apache.iceberg.Schema; 12 | import org.apache.iceberg.data.Record; 13 | import org.apache.iceberg.io.FileAppenderFactory; 14 | import org.apache.iceberg.io.FileIO; 15 | import org.apache.iceberg.io.OutputFileFactory; 16 | import com.google.common.collect.Maps; 17 | import org.apache.iceberg.util.Tasks; 18 | 19 | class PartitionedDeltaWriter extends BaseDeltaTaskWriter { 20 | 21 | private final PartitionKey partitionKey; 22 | 23 | private final Map<PartitionKey, RowDataDeltaWriter> writers = Maps.newHashMap(); 24 | 25 | PartitionedDeltaWriter(PartitionSpec spec, 26 | FileFormat format, 27 | FileAppenderFactory<Record> appenderFactory, 28 | OutputFileFactory fileFactory, 29 | FileIO io, 30 | long targetFileSize, 31 | Schema schema, 32 | Set<Integer> identifierFieldIds, 33 | boolean keepDeletes) { 34 | super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, identifierFieldIds, keepDeletes); 35 | this.partitionKey = new PartitionKey(spec, schema); 36 | } 37 | 38 | @Override 39 | RowDataDeltaWriter route(Record row) { 40 | partitionKey.partition(wrapper().wrap(row)); 41 | 42 | RowDataDeltaWriter writer = writers.get(partitionKey); 43 | if (writer == null) { 44 | // NOTICE: we need to copy a new partition key here, in case of messing up the keys in writers. 
45 | PartitionKey copiedKey = partitionKey.copy(); 46 | writer = new RowDataDeltaWriter(copiedKey); 47 | writers.put(copiedKey, writer); 48 | } 49 | 50 | return writer; 51 | } 52 | 53 | @Override 54 | public void close() { 55 | try { 56 | Tasks.foreach(writers.values()) 57 | .throwFailureWhenFinished() 58 | .noRetry() 59 | .run(RowDataDeltaWriter::close, IOException.class); 60 | 61 | writers.clear(); 62 | } catch (IOException e) { 63 | throw new UncheckedIOException("Failed to close equality delta writer", e); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /pkg/jdbc/reader.go: -------------------------------------------------------------------------------- 1 | package jdbc 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | "strings" 8 | 9 | "github.com/datazip-inc/olake/types" 10 | "github.com/datazip-inc/olake/utils" 11 | "github.com/datazip-inc/olake/utils/typeutils" 12 | ) 13 | 14 | type Reader[T types.Iterable] struct { 15 | query string 16 | args []any 17 | offset int 18 | ctx context.Context 19 | 20 | exec func(ctx context.Context, query string, args ...any) (T, error) 21 | } 22 | 23 | func NewReader[T types.Iterable](ctx context.Context, baseQuery string, 24 | exec func(ctx context.Context, query string, args ...any) (T, error), args ...any) *Reader[T] { 25 | setter := &Reader[T]{ 26 | query: baseQuery, 27 | offset: 0, 28 | ctx: ctx, 29 | exec: exec, 30 | args: args, 31 | } 32 | 33 | return setter 34 | } 35 | 36 | func (o *Reader[T]) Capture(onCapture func(T) error) error { 37 | if strings.HasSuffix(o.query, ";") { 38 | return fmt.Errorf("base query ends with ';': %s", o.query) 39 | } 40 | 41 | rows, err := o.exec(o.ctx, o.query, o.args...) 42 | if err != nil { 43 | return err 44 | } 45 | 46 | for rows.Next() { 47 | err := onCapture(rows) 48 | if err != nil { 49 | return err 50 | } 51 | } 52 | 53 | err = rows.Err() 54 | if err != nil { 55 | return err 56 | } 57 | return nil 58 | } 59 | 60 | func MapScan(rows *sql.Rows, dest map[string]any, converter func(value interface{}, columnType string) (interface{}, error)) error { 61 | columns, err := rows.Columns() 62 | if err != nil { 63 | return err 64 | } 65 | 66 | types, err := rows.ColumnTypes() 67 | if err != nil { 68 | return err 69 | } 70 | 71 | scanValues := make([]any, len(columns)) 72 | for i := range scanValues { 73 | scanValues[i] = new(any) // Allocate pointers for scanning 74 | } 75 | 76 | if err := rows.Scan(scanValues...); err != nil { 77 | return err 78 | } 79 | 80 | for i, col := range columns { 81 | rawData := *(scanValues[i].(*any)) // Dereference pointer before storing 82 | if converter != nil { 83 | datatype := types[i].DatabaseTypeName() 84 | precision, scale, hasPrecisionScale := types[i].DecimalSize() 85 | if datatype == "NUMBER" && hasPrecisionScale && scale == 0 { 86 | datatype = utils.Ternary(precision > 9, "int64", "int32").(string) 87 | } 88 | conv, err := converter(rawData, datatype) 89 | if err != nil && err != typeutils.ErrNullValue { 90 | return err 91 | } 92 | dest[col] = conv 93 | } else { 94 | dest[col] = rawData 95 | } 96 | } 97 | 98 | return nil 99 | } 100 | -------------------------------------------------------------------------------- /destination/iceberg/olake-iceberg-java-writer/src/main/java/io/debezium/server/iceberg/tableoperator/RecordWrapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more 
contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package io.debezium.server.iceberg.tableoperator; 20 | 21 | import org.apache.iceberg.data.Record; 22 | import org.apache.iceberg.types.Types.StructType; 23 | 24 | import java.util.Map; 25 | 26 | public class RecordWrapper implements Record { 27 | 28 | private final Record delegate; 29 | private final Operation op; 30 | 31 | public RecordWrapper(Record delegate, Operation op) { 32 | this.delegate = delegate; 33 | this.op = op; 34 | } 35 | 36 | public Operation op() { 37 | return op; 38 | } 39 | 40 | @Override 41 | public StructType struct() { 42 | return delegate.struct(); 43 | } 44 | 45 | @Override 46 | public Object getField(String name) { 47 | return delegate.getField(name); 48 | } 49 | 50 | @Override 51 | public void setField(String name, Object value) { 52 | delegate.setField(name, value); 53 | } 54 | 55 | @Override 56 | public Object get(int pos) { 57 | return delegate.get(pos); 58 | } 59 | 60 | @Override 61 | public Record copy() { 62 | return new RecordWrapper(delegate.copy(), op); 63 | } 64 | 65 | @Override 66 | public Record copy(Map<String, Object> overwriteValues) { 67 | return new RecordWrapper(delegate.copy(overwriteValues), op); 68 | } 69 | 70 | @Override 71 | public int size() { 72 | return delegate.size(); 73 | } 74 | 75 | @Override 76 | public <T> T get(int pos, Class<T> javaClass) { 77 | return delegate.get(pos, javaClass); 78 | } 79 | 80 | @Override 81 | public <T> void set(int pos, T value) { 82 | delegate.set(pos, value); 83 | } 84 | 85 | @Override 86 | public String toString() { 87 | return delegate.toString(); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /drivers/mongodb/internal/testdata/test_streams.json: -------------------------------------------------------------------------------- 1 |
{"selected_streams":{"olake_mongodb_test":[{"partition_regex":"","stream_name":"test_collection","normalization":false},{"partition_regex":"","stream_name":"mongodb_test_table_olake","normalization":false}]},"streams":[{"stream":{"name":"test_collection","namespace":"olake_mongodb_test","type_schema":{"properties":{"_cdc_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_cdc_timestamp"},"_olake_id":{"type":["string","null"],"destination_column_name":"_olake_id"},"_olake_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_olake_timestamp"},"_op_type":{"type":["string","null"],"destination_column_name":"_op_type"}}},"supported_sync_modes":["cdc","strict_cdc","full_refresh","incremental"],"source_defined_primary_key":["_id"],"available_cursor_fields":[],"sync_mode":"cdc","destination_database":"mongodb:olake_mongodb_test","destination_table":"test_collection"}},{"stream":{"name":"mongodb_test_table_olake","namespace":"olake_mongodb_test","type_schema":{"properties":{"_cdc_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_cdc_timestamp"},"_id":{"type":["string"],"destination_column_name":"_id"},"_olake_id":{"type":["string","null"],"destination_column_name":"_olake_id"},"_olake_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_olake_timestamp"},"_op_type":{"type":["null","string"],"destination_column_name":"_op_type"},"created_timestamp":{"type":["integer_small"],"destination_column_name":"created_timestamp"},"id_bigint":{"type":["integer"],"destination_column_name":"id_bigint"},"id_bool":{"type":["boolean"],"destination_column_name":"id_bool"},"id_double":{"type":["number"],"destination_column_name":"id_double"},"id_int":{"type":["integer_small"],"destination_column_name":"id_int"},"id_maxkey":{"type":["unknown"],"destination_column_name":"id_maxkey"},"id_minkey":{"type":["unknown"],"destination_column_name":"id_minkey"},"id_nested":{"type":["object"],"destination_column_name":"id_nested"},"id_nil":{"type":["null"],"destination_column_name":"id_nil"},"id_regex":{"type":["unknown"],"destination_column_name":"id_regex"},"id_timestamp":{"type":["timestamp"],"destination_column_name":"id_timestamp"},"name_varchar":{"type":["string"],"destination_column_name":"name_varchar"}}},"supported_sync_modes":["full_refresh","incremental","cdc","strict_cdc"],"source_defined_primary_key":["_id"],"available_cursor_fields":["id_nested","name_varchar","created_timestamp","id_regex","id_minkey","id_int","id_bigint","id_timestamp","id_maxkey","id_bool","id_nil","_id","id_double"],"sync_mode":"cdc","destination_database":"mongodb:olake_mongodb_test","destination_table":"mongodb_test_table_olake"}}]} -------------------------------------------------------------------------------- /.github/workflows/security-ci.yaml: -------------------------------------------------------------------------------- 1 | name: Go Security CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | paths: 8 | - '**/*.go' 9 | - '**/*.java' 10 | pull_request: 11 | branches: 12 | - "*" 13 | paths: 14 | - '**/*.go' 15 | - '**/*.java' 16 | workflow_dispatch: 17 | inputs: 18 | logLevel: 19 | description: "Log level" 20 | required: true 21 | default: "warning" 22 | 23 | jobs: 24 | govulncheck: 25 | name: govulncheck 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v3 29 | - uses: actions/setup-go@v3 30 | with: 31 | check-latest: "true" 32 | go-version: "1.24.x" 33 | - name: Install govulncheck 34 | run: go install 
golang.org/x/vuln/cmd/govulncheck@latest 35 | - name: Run vulnerability checks 36 | run: govulncheck ./... 37 | 38 | gosec: 39 | name: GoSec Security Scanner 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/checkout@v3 43 | - uses: actions/setup-go@v3 44 | with: 45 | check-latest: "true" 46 | go-version: "1.24.x" 47 | - name: install gosec 48 | run: curl -sfL https://raw.githubusercontent.com/securego/gosec/master/install.sh | sh -s -- -b $(go env GOPATH)/bin 49 | - name: Run Gosec Security Scanner 50 | run: $(go env GOPATH)/bin/gosec -exclude=G115 -severity=high -confidence=medium ./... 51 | 52 | trivy-go: 53 | name: trivy-go 54 | runs-on: ubuntu-latest 55 | timeout-minutes: 5 56 | steps: 57 | - name: Checkout code 58 | uses: actions/checkout@v3 59 | - name: Run Trivy Go vulnerability scanner in repo mode 60 | uses: aquasecurity/trivy-action@master 61 | with: 62 | skip-dirs: './destination/iceberg/olake-iceberg-java-writer' 63 | exit-code: '1' 64 | scan-type: 'fs' 65 | scan-ref: './' 66 | scanners: 'vuln' 67 | severity: 'HIGH,CRITICAL' 68 | 69 | # TODO: Add Java Dependency Trivy (removed because trivy getting stuck or run for hours) 70 | # trivy-java: 71 | # name: trivy-java 72 | # runs-on: ubuntu-latest 73 | # timeout-minutes: 10 74 | # steps: 75 | # - name: Checkout code 76 | # uses: actions/checkout@v3 77 | # - name: Run Trivy Java vulnerability scanner in repo mode 78 | # uses: aquasecurity/trivy-action@master 79 | # with: 80 | # exit-code: '1' 81 | # scan-type: 'fs' 82 | # scan-ref: './destination/iceberg/olake-iceberg-java-writer' 83 | # scanners: 'vuln' 84 | # severity: 'HIGH,CRITICAL' -------------------------------------------------------------------------------- /protocol/clear.go: -------------------------------------------------------------------------------- 1 | package protocol 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/datazip-inc/olake/destination" 7 | "github.com/datazip-inc/olake/types" 8 | "github.com/datazip-inc/olake/utils" 9 | "github.com/datazip-inc/olake/utils/logger" 10 | "github.com/spf13/cobra" 11 | ) 12 | 13 | var clearCmd = &cobra.Command{ 14 | Use: "clear-destination", 15 | Short: "Olake clear command to clear destination data and state for selected streams", 16 | PersistentPreRunE: func(_ *cobra.Command, _ []string) error { 17 | if destinationConfigPath == "" { 18 | return fmt.Errorf("--destination not passed") 19 | } else if streamsPath == "" { 20 | return fmt.Errorf("--streams not passed") 21 | } 22 | 23 | destinationConfig = &types.WriterConfig{} 24 | if err := utils.UnmarshalFile(destinationConfigPath, destinationConfig, true); err != nil { 25 | return err 26 | } 27 | 28 | catalog = &types.Catalog{} 29 | if err := utils.UnmarshalFile(streamsPath, catalog, false); err != nil { 30 | return err 31 | } 32 | 33 | state = &types.State{ 34 | Type: types.StreamType, 35 | } 36 | if statePath != "" { 37 | if err := utils.UnmarshalFile(statePath, state, false); err != nil { 38 | return err 39 | } 40 | } 41 | return nil 42 | }, 43 | RunE: func(cmd *cobra.Command, _ []string) error { 44 | selectedStreamsMetadata, err := classifyStreams(catalog, nil, state) 45 | if err != nil { 46 | return fmt.Errorf("failed to get selected streams for clearing: %w", err) 47 | } 48 | dropStreams := []types.StreamInterface{} 49 | dropStreams = append(dropStreams, append(append(selectedStreamsMetadata.IncrementalStreams, selectedStreamsMetadata.FullLoadStreams...), selectedStreamsMetadata.CDCStreams...)...) 
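// dropStreams now holds every selected stream across the incremental, full-load, and CDC groups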
50 | if len(dropStreams) == 0 { 51 | logger.Infof("No streams selected for clearing") 52 | return nil 53 | } 54 | 55 | connector.SetupState(state) 56 | // clear state for selected streams 57 | newState, err := connector.ClearState(dropStreams) 58 | if err != nil { 59 | return fmt.Errorf("error clearing state: %w", err) 60 | } 61 | logger.Infof("State for selected streams cleared successfully.") 62 | // Setup new state after clear for connector 63 | connector.SetupState(newState) 64 | 65 | // drop/clear streams from destination 66 | cerr := destination.ClearDestination(cmd.Context(), destinationConfig, dropStreams) 67 | if cerr != nil { 68 | return fmt.Errorf("failed to clear destination: %s", cerr) 69 | } 70 | logger.Infof("Successfully cleared destination data for selected streams.") 71 | // save new state in state file 72 | newState.LogState() 73 | stateBytes, _ := newState.MarshalJSON() 74 | logger.Infof("New saved state: %s", stateBytes) 75 | return nil 76 | }, 77 | } 78 | -------------------------------------------------------------------------------- /utils/decryption.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "crypto/aes" 6 | "crypto/cipher" 7 | "crypto/sha256" 8 | "encoding/base64" 9 | "encoding/json" 10 | "errors" 11 | "fmt" 12 | "strings" 13 | 14 | "github.com/aws/aws-sdk-go-v2/config" 15 | "github.com/aws/aws-sdk-go-v2/service/kms" 16 | "github.com/datazip-inc/olake/constants" 17 | "github.com/spf13/viper" 18 | ) 19 | 20 | func getSecretKey() ([]byte, *kms.Client, error) { 21 | secretKey := viper.GetString(constants.EncryptionKey) 22 | if strings.TrimSpace(secretKey) == "" { 23 | return []byte{}, nil, nil // Encryption is disabled 24 | } 25 | 26 | if strings.HasPrefix(secretKey, "arn:aws:kms:") { 27 | cfg, err := config.LoadDefaultConfig(context.Background()) 28 | if err != nil { 29 | return nil, nil, fmt.Errorf("failed to load AWS config: %s", err) 30 | } 31 | return []byte(secretKey), kms.NewFromConfig(cfg), nil 32 | } 33 | 34 | // Local AES-GCM Mode with SHA-256 derived key 35 | hash := sha256.Sum256([]byte(secretKey)) 36 | return hash[:], nil, nil 37 | } 38 | 39 | func Decrypt(encryptedText string) (string, error) { 40 | if strings.TrimSpace(encryptedText) == "" { 41 | return "", fmt.Errorf("cannot decrypt empty or whitespace-only input") 42 | } 43 | 44 | key, kmsClient, err := getSecretKey() 45 | if err != nil || key == nil || len(key) == 0 { 46 | return encryptedText, err 47 | } 48 | 49 | var config string 50 | err = json.Unmarshal([]byte(encryptedText), &config) 51 | if err != nil { 52 | return "", fmt.Errorf("failed to unmarshal JSON string: %v", err) 53 | } 54 | 55 | encryptedData, err := base64.StdEncoding.DecodeString(config) 56 | if err != nil { 57 | return "", fmt.Errorf("failed to decode base64 data: %v", err) 58 | } 59 | 60 | // Use KMS if client is provided 61 | if kmsClient != nil { 62 | result, err := kmsClient.Decrypt(context.Background(), &kms.DecryptInput{ 63 | CiphertextBlob: encryptedData, 64 | }) 65 | if err != nil { 66 | return "", fmt.Errorf("failed to decrypt with KMS: %s", err) 67 | } 68 | return string(result.Plaintext), nil 69 | } 70 | 71 | // Local AES-GCM decryption 72 | block, err := aes.NewCipher(key) 73 | if err != nil { 74 | return "", fmt.Errorf("failed to create cipher: %s", err) 75 | } 76 | 77 | gcm, err := cipher.NewGCM(block) 78 | if err != nil { 79 | return "", fmt.Errorf("failed to create GCM: %s", err) 80 | } 81 | 82 | if len(encryptedData) < 
gcm.NonceSize() { 83 | return "", errors.New("ciphertext too short") 84 | } 85 | 86 | plaintext, err := gcm.Open(nil, encryptedData[:gcm.NonceSize()], encryptedData[gcm.NonceSize():], nil) 87 | if err != nil { 88 | return "", fmt.Errorf("failed to decrypt: %s", err) 89 | } 90 | return string(plaintext), nil 91 | } 92 | -------------------------------------------------------------------------------- /utils/ssh.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "net" 7 | "strconv" 8 | "time" 9 | 10 | "golang.org/x/crypto/ssh" 11 | ) 12 | 13 | type SSHConfig struct { 14 | Host string `json:"host,omitempty"` 15 | Port int `json:"port,omitempty"` 16 | Username string `json:"username,omitempty"` 17 | PrivateKey string `json:"private_key,omitempty"` 18 | Passphrase string `json:"passphrase,omitempty"` 19 | Password string `json:"password,omitempty"` 20 | } 21 | 22 | func (c *SSHConfig) Validate() error { 23 | if c.Host == "" { 24 | return errors.New("ssh host is required") 25 | } 26 | 27 | if c.Port <= 0 || c.Port > 65535 { 28 | return errors.New("invalid ssh port number: must be between 1 and 65535") 29 | } 30 | 31 | if c.Username == "" { 32 | return errors.New("ssh username is required") 33 | } 34 | 35 | if c.PrivateKey == "" && c.Password == "" { 36 | return errors.New("private key or password is required") 37 | } 38 | 39 | return nil 40 | } 41 | 42 | func (c *SSHConfig) SetupSSHConnection() (*ssh.Client, error) { 43 | err := c.Validate() 44 | if err != nil { 45 | return nil, fmt.Errorf("failed to validate ssh config: %s", err) 46 | } 47 | var authMethods []ssh.AuthMethod 48 | 49 | if c.Password != "" { 50 | authMethods = append(authMethods, ssh.Password(c.Password)) 51 | } 52 | 53 | if c.PrivateKey != "" { 54 | signer, err := ParsePrivateKey(c.PrivateKey, c.Passphrase) 55 | if err != nil { 56 | return nil, fmt.Errorf("failed to parse SSH private key: %s", err) 57 | } 58 | authMethods = append(authMethods, ssh.PublicKeys(signer)) 59 | } 60 | 61 | sshCfg := &ssh.ClientConfig{ 62 | User: c.Username, 63 | Auth: authMethods, 64 | // Accepts any server host key without verification, so the remote host is never authenticated 65 | // TODO: Add proper host key verification 66 | HostKeyCallback: ssh.InsecureIgnoreHostKey(), // #nosec G106 67 | Timeout: 30 * time.Second, 68 | } 69 | 70 | bastionAddr := net.JoinHostPort(c.Host, strconv.Itoa(c.Port)) 71 | sshClient, err := ssh.Dial("tcp", bastionAddr, sshCfg) 72 | if err != nil { 73 | return nil, fmt.Errorf("ssh dial bastion: %s", err) 74 | } 75 | 76 | return sshClient, nil 77 | } 78 | 79 | // ParsePrivateKey parses a private key from a PEM string 80 | func ParsePrivateKey(pemText, passphrase string) (ssh.Signer, error) { 81 | if passphrase != "" { 82 | return ssh.ParsePrivateKeyWithPassphrase([]byte(pemText), []byte(passphrase)) 83 | } 84 | 85 | signer, err := ssh.ParsePrivateKey([]byte(pemText)) 86 | if err == nil { 87 | return signer, nil 88 | } 89 | if _, ok := err.(*ssh.PassphraseMissingError); ok { 90 | return nil, fmt.Errorf("SSH private key appears encrypted, enter the passphrase") 91 | } 92 | return nil, err 93 | } 94 | -------------------------------------------------------------------------------- /drivers/mongodb/mongodb-init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "Waiting for keyfile..." 6 | while [ !
-f /etc/mongodb/pki/keyfile ]; do 7 | sleep 1 8 | done 9 | 10 | echo "Keyfile found, starting mongod without authentication first..." 11 | mongod --replSet rs0 --bind_ip_all --port 27017 & 12 | MONGO_PID=$! 13 | echo "MongoDB started with PID: $MONGO_PID" 14 | 15 | sleep 3 16 | 17 | echo "Waiting for MongoDB to start..." 18 | until mongosh --port 27017 --eval "db.runCommand({ ping: 1 })" >/dev/null 2>&1; do 19 | sleep 2 20 | done 21 | 22 | echo "Initializing replica set..." 23 | mongosh --port 27017 --eval "rs.initiate({_id: 'rs0', members: [{_id: 0, host: 'host.docker.internal:27017'}]})" 24 | 25 | echo "Waiting for PRIMARY..." 26 | for i in {1..30}; do 27 | STATE=$(mongosh --quiet --port 27017 --eval "rs.isMaster().ismaster" 2>/dev/null) 28 | if [ "$STATE" = "true" ]; then 29 | echo "PRIMARY elected" 30 | break 31 | fi 32 | echo "Still waiting for PRIMARY..." 33 | sleep 2 34 | done 35 | 36 | echo "Creating admin user..." 37 | mongosh --port 27017 --eval " 38 | db = db.getSiblingDB('admin'); 39 | db.createUser({ 40 | user: 'admin', 41 | pwd: 'password', 42 | roles: [{ role: 'root', db: 'admin' }] 43 | }); 44 | 45 | db = db.getSiblingDB('olake_mongodb_test'); 46 | db.createCollection('test_collection'); 47 | 48 | db = db.getSiblingDB('admin'); 49 | 50 | try { db.dropUser('mongodb'); } catch(e) { print('User mongodb does not exist, skipping drop'); } 51 | 52 | db.createUser({ 53 | user: 'mongodb', 54 | pwd: 'secure_password123', 55 | roles: [ 56 | { role: 'readWrite', db: 'olake_mongodb_test' } 57 | ] 58 | }); 59 | 60 | try { db.dropRole('splitVectorRole'); } catch(e) { print('Role splitVectorRole does not exist, skipping drop'); } 61 | 62 | db.createRole({ 63 | role: 'splitVectorRole', 64 | privileges: [ 65 | { 66 | resource: { db: '', collection: '' }, 67 | actions: [ 'splitVector' ] 68 | } 69 | ], 70 | roles: [] 71 | }); 72 | 73 | db.grantRolesToUser('mongodb', [ 74 | { role: 'splitVectorRole', db: 'admin' } 75 | ]); 76 | " 77 | 78 | echo "Created user and role" 79 | echo "Stopping MongoDB to restart with authentication..." 80 | if [ ! -z "$MONGO_PID" ] && kill -0 $MONGO_PID 2>/dev/null; then 81 | echo "Killing MongoDB process $MONGO_PID" 82 | kill $MONGO_PID 83 | wait $MONGO_PID 84 | else 85 | echo "MongoDB process not found, using pkill" 86 | pkill mongod 87 | sleep 2 88 | fi 89 | 90 | echo "Starting MongoDB with authentication..." 
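# exec replaces this shell so mongod runs as PID 1 and receives container stop signals directly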
91 | exec mongod --replSet rs0 --bind_ip_all --port 27017 --keyFile /etc/mongodb/pki/keyfile -------------------------------------------------------------------------------- /drivers/oracle/internal/config.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/datazip-inc/olake/constants" 8 | "github.com/datazip-inc/olake/utils" 9 | go_ora "github.com/sijms/go-ora/v2" 10 | ) 11 | 12 | type Config struct { 13 | Host string `json:"host"` 14 | Username string `json:"username"` 15 | Password string `json:"password"` 16 | ServiceName string `json:"service_name"` 17 | SID string `json:"sid"` 18 | Port int `json:"port"` 19 | MaxThreads int `json:"max_threads"` 20 | RetryCount int `json:"backoff_retry_count"` 21 | SSLConfiguration *utils.SSLConfig `json:"ssl"` 22 | JDBCURLParams map[string]string `json:"jdbc_url_params"` 23 | } 24 | 25 | func (c *Config) connectionString() string { 26 | urlOptions := make(map[string]string) 27 | // Add JDBC-style URL params 28 | for k, v := range c.JDBCURLParams { 29 | urlOptions[k] = v 30 | } 31 | 32 | // Add sid if provided 33 | if c.SID != "" { 34 | urlOptions["sid"] = c.SID 35 | } 36 | 37 | // Add SSL params if provided 38 | if c.SSLConfiguration != nil { 39 | sslmode := string(c.SSLConfiguration.Mode) 40 | if sslmode != "disable" { 41 | urlOptions["ssl"] = "true" 42 | urlOptions["ssl verify"] = "false" 43 | } 44 | // TODO: Add support for more SSL params 45 | } 46 | 47 | // Quote the username to handle case sensitivity 48 | quotedUsername := fmt.Sprintf("%q", c.Username) 49 | 50 | return go_ora.BuildUrl(c.Host, c.Port, c.ServiceName, quotedUsername, c.Password, urlOptions) 51 | } 52 | 53 | // Validate checks the configuration for any missing or invalid fields 54 | func (c *Config) Validate() error { 55 | if c.Host == "" { 56 | return fmt.Errorf("empty host name") 57 | } else if strings.Contains(c.Host, "https") || strings.Contains(c.Host, "http") { 58 | return fmt.Errorf("host should not contain http or https: %s", c.Host) 59 | } 60 | 61 | // Validate port 62 | if c.Port <= 0 || c.Port > 65535 { 63 | return fmt.Errorf("invalid port number: must be between 1 and 65535") 64 | } 65 | // Validate required fields 66 | if c.Username == "" { 67 | return fmt.Errorf("username is required") 68 | } 69 | if c.ServiceName == "" && c.SID == "" { 70 | return fmt.Errorf("service_name or sid is required") 71 | } 72 | 73 | // Set default number of threads if not provided 74 | if c.MaxThreads <= 0 { 75 | c.MaxThreads = constants.DefaultThreadCount 76 | } 77 | 78 | if c.SSLConfiguration == nil { 79 | c.SSLConfiguration = &utils.SSLConfig{ 80 | Mode: "disable", 81 | } 82 | } 83 | err := c.SSLConfiguration.Validate() 84 | if err != nil { 85 | return fmt.Errorf("failed to validate ssl config: %s", err) 86 | } 87 | return utils.Validate(c) 88 | } 89 | -------------------------------------------------------------------------------- /drivers/mysql/internal/config.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/go-sql-driver/mysql" 8 | 9 | "github.com/datazip-inc/olake/constants" 10 | "github.com/datazip-inc/olake/utils" 11 | ) 12 | 13 | // Config represents the configuration for connecting to a MySQL database 14 | type Config struct { 15 | Host string `json:"hosts"` 16 | Username string `json:"username"` 17 | Password string `json:"password"` 18 | Database 
string `json:"database"` 19 | Port int `json:"port"` 20 | TLSSkipVerify bool `json:"tls_skip_verify"` // skip TLS certificate verification when true 21 | UpdateMethod interface{} `json:"update_method"` 22 | MaxThreads int `json:"max_threads"` 23 | RetryCount int `json:"backoff_retry_count"` 24 | SSHConfig *utils.SSHConfig `json:"ssh_config"` 25 | } 26 | 27 | type CDC struct { 28 | InitialWaitTime int `json:"initial_wait_time"` 29 | } 30 | 31 | // URI generates the connection URI for the MySQL database 32 | func (c *Config) URI() string { 33 | // Set default port if not specified 34 | if c.Port == 0 { 35 | c.Port = 3306 36 | } 37 | // Construct host string 38 | hostStr := c.Host 39 | if c.Host == "" { 40 | hostStr = "localhost" 41 | } 42 | 43 | cfg := mysql.Config{ 44 | User: c.Username, 45 | Passwd: c.Password, 46 | Net: "tcp", 47 | Addr: fmt.Sprintf("%s:%d", hostStr, c.Port), 48 | DBName: c.Database, 49 | AllowNativePasswords: true, 50 | } 51 | 52 | return cfg.FormatDSN() 53 | } 54 | 55 | // Validate checks the configuration for any missing or invalid fields 56 | func (c *Config) Validate() error { 57 | if c.Host == "" { 58 | return fmt.Errorf("empty host name") 59 | } else if strings.Contains(c.Host, "https") || strings.Contains(c.Host, "http") { 60 | return fmt.Errorf("host should not contain http or https: %s", c.Host) 61 | } 62 | 63 | // Validate port 64 | if c.Port <= 0 || c.Port > 65535 { 65 | return fmt.Errorf("invalid port number: must be between 1 and 65535") 66 | } 67 | 68 | // Validate required fields 69 | if c.Username == "" { 70 | return fmt.Errorf("username is required") 71 | } 72 | if c.Password == "" { 73 | return fmt.Errorf("password is required") 74 | } 75 | 76 | // Optional database name, default to 'mysql' 77 | if c.Database == "" { 78 | c.Database = "mysql" 79 | } 80 | 81 | // Set default number of threads if not provided 82 | if c.MaxThreads <= 0 { 83 | c.MaxThreads = constants.DefaultThreadCount // Aligned with PostgreSQL default 84 | } 85 | 86 | // Set default retry count if not provided 87 | if c.RetryCount <= 0 { 88 | c.RetryCount = constants.DefaultRetryCount // Reasonable default for retries 89 | } 90 | 91 | return utils.Validate(c) 92 | } 93 | -------------------------------------------------------------------------------- /pkg/waljs/filter.go: -------------------------------------------------------------------------------- 1 | package waljs 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | 8 | "github.com/datazip-inc/olake/drivers/abstract" 9 | "github.com/datazip-inc/olake/types" 10 | "github.com/datazip-inc/olake/utils" 11 | "github.com/datazip-inc/olake/utils/typeutils" 12 | "github.com/goccy/go-json" 13 | "github.com/jackc/pglogrepl" 14 | ) 15 | 16 | type ChangeFilter struct { 17 | tables map[string]types.StreamInterface 18 | converter func(value interface{}, columnType string) (interface{}, error) 19 | } 20 | 21 | func NewChangeFilter(typeConverter func(value interface{}, columnType string) (interface{}, error), streams ...types.StreamInterface) ChangeFilter { 22 | filter := ChangeFilter{ 23 | converter: typeConverter, 24 | tables: make(map[string]types.StreamInterface), 25 | } 26 | 27 | for _, stream := range streams { 28 | filter.tables[stream.ID()] = stream 29 | } 30 | return filter 31 | } 32 | 33 | func (c ChangeFilter) FilterWalJsChange(ctx context.Context, change []byte, OnFiltered abstract.CDCMsgFn) (*pglogrepl.LSN, int, error) { 34 | var changes WALMessage 35 | if err := json.NewDecoder(bytes.NewReader(change)).Decode(&changes); err != nil { 36 | return nil, 0,
fmt.Errorf("failed to parse change received from wal logs: %s", err) 37 | } 38 | nextLSN, err := pglogrepl.ParseLSN(changes.NextLSN) 39 | if err != nil { 40 | return nil, 0, fmt.Errorf("failed to parse received lsn: %s", err) 41 | } 42 | 43 | if len(changes.Change) == 0 { 44 | return &nextLSN, 0, nil 45 | } 46 | buildChangesMap := func(values []interface{}, types []string, names []string) (map[string]any, error) { 47 | data := make(map[string]any) 48 | for i, val := range values { 49 | colType := types[i] 50 | conv, err := c.converter(val, colType) 51 | if err != nil && err != typeutils.ErrNullValue { 52 | return nil, err 53 | } 54 | data[names[i]] = conv 55 | } 56 | return data, nil 57 | } 58 | rowsCount := 0 59 | for _, ch := range changes.Change { 60 | stream, exists := c.tables[utils.StreamIdentifier(ch.Table, ch.Schema)] 61 | if !exists { 62 | continue 63 | } 64 | rowsCount++ 65 | var changesMap map[string]any 66 | var err error 67 | 68 | if ch.Kind == "delete" { 69 | changesMap, err = buildChangesMap(ch.Oldkeys.Keyvalues, ch.Oldkeys.Keytypes, ch.Oldkeys.Keynames) 70 | } else { 71 | changesMap, err = buildChangesMap(ch.Columnvalues, ch.Columntypes, ch.Columnnames) 72 | } 73 | 74 | if err != nil { 75 | return nil, rowsCount, fmt.Errorf("failed to convert change data: %s", err) 76 | } 77 | 78 | if err := OnFiltered(ctx, abstract.CDCChange{ 79 | Stream: stream, 80 | Kind: ch.Kind, 81 | Timestamp: changes.Timestamp.Time, 82 | Data: changesMap, 83 | }); err != nil { 84 | return nil, rowsCount, fmt.Errorf("failed to write filtered change: %s", err) 85 | } 86 | } 87 | return &nextLSN, rowsCount, nil 88 | } 89 | -------------------------------------------------------------------------------- /destination/iceberg/local-test/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars.packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.postgresql:postgresql:42.5.4,org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262,org.apache.spark:spark-connect_2.12:3.5.2 2 | spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 3 | spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog 4 | spark.sql.catalog.spark_catalog.type hive 5 | 6 | # Hive Metastore configuration 7 | spark.hadoop.hive.metastore.uris thrift://hive-metastore:9083 8 | 9 | # Driver classpath configuration 10 | spark.driver.extraClassPath /root/.ivy2/jars/org.postgresql_postgresql-42.5.4.jar 11 | spark.executor.extraClassPath /root/.ivy2/jars/org.postgresql_postgresql-42.5.4.jar 12 | 13 | # S3 Configuration for MinIO 14 | spark.hadoop.fs.s3a.endpoint http://minio:9000 15 | spark.hadoop.fs.s3a.access.key admin 16 | spark.hadoop.fs.s3a.secret.key password 17 | spark.hadoop.fs.s3a.path.style.access true 18 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem 19 | spark.hadoop.fs.s3a.connection.ssl.enabled false 20 | 21 | # Configure olake_iceberg catalog 22 | spark.sql.catalog.olake_iceberg org.apache.iceberg.spark.SparkCatalog 23 | spark.sql.catalog.olake_iceberg.catalog-impl org.apache.iceberg.jdbc.JdbcCatalog 24 | spark.sql.catalog.olake_iceberg.uri jdbc:postgresql://postgres:5432/iceberg?ssl=false 25 | spark.sql.catalog.olake_iceberg.jdbc.user iceberg 26 | spark.sql.catalog.olake_iceberg.jdbc.password password 27 | spark.sql.catalog.olake_iceberg.warehouse s3a://warehouse/olake_iceberg/ 28 | spark.sql.catalog.olake_iceberg.jdbc.driver org.postgresql.Driver 29 | 
spark.sql.catalog.olake_iceberg.jdbc.schema-version V1 30 | 31 | # Configure hive_catalog for Iceberg tables 32 | spark.sql.catalog.hive_catalog org.apache.iceberg.spark.SparkCatalog 33 | spark.sql.catalog.hive_catalog.type hive 34 | spark.sql.catalog.hive_catalog.uri thrift://hive-metastore:9083 35 | spark.sql.catalog.hive_catalog.warehouse s3a://warehouse/ 36 | 37 | # Configure rest catalog 38 | spark.sql.catalog.rest org.apache.iceberg.spark.SparkCatalog 39 | spark.sql.catalog.rest.catalog-impl org.apache.iceberg.rest.RESTCatalog 40 | spark.sql.catalog.rest.uri http://lakekeeper:8181/catalog 41 | spark.sql.catalog.rest.warehouse warehouse 42 | 43 | # Set default catalog 44 | spark.sql.defaultCatalog olake_iceberg -------------------------------------------------------------------------------- /destination/iceberg/local-test/hive-site.conf: -------------------------------------------------------------------------------- 1 | <configuration> 2 | <property> 3 | <name>hive.server2.enable.doAs</name> 4 | <value>false</value> 5 | </property> 6 | <property> 7 | <name>hive.tez.exec.inplace.progress</name> 8 | <value>false</value> 9 | </property> 10 | <property> 11 | <name>hive.exec.scratchdir</name> 12 | <value>/opt/hive/scratch_dir</value> 13 | </property> 14 | <property> 15 | <name>hive.user.install.directory</name> 16 | <value>/opt/hive/install_dir</value> 17 | </property> 18 | <property> 19 | <name>tez.runtime.optimize.local.fetch</name> 20 | <value>true</value> 21 | </property> 22 | <property> 23 | <name>hive.exec.submit.local.task.via.child</name> 24 | <value>false</value> 25 | </property> 26 | <property> 27 | <name>mapreduce.framework.name</name> 28 | <value>local</value> 29 | </property> 30 | <property> 31 | <name>tez.local.mode</name> 32 | <value>true</value> 33 | </property> 34 | <property> 35 | <name>hive.execution.engine</name> 36 | <value>tez</value> 37 | </property> 38 | <property> 39 | <name>metastore.metastore.event.db.notification.api.auth</name> 40 | <value>false</value> 41 | </property> 42 | <property> 43 | <name>hive.metastore.warehouse.dir</name> 44 | <value>s3a://warehouse/</value> 45 | </property> 46 | <property> 47 | <name>fs.s3a.endpoint</name> 48 | <value>http://minio:9000</value> 49 | </property> 50 | <property> 51 | <name>fs.s3a.access.key</name> 52 | <value>admin</value> 53 | </property> 54 | <property> 55 | <name>fs.s3a.secret.key</name> 56 | <value>password</value> 57 | </property> 58 | <property> 59 | <name>fs.s3a.path.style.access</name> 60 | <value>true</value> 61 | </property> 62 | <property> 63 | <name>fs.s3a.impl</name> 64 | <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value> 65 | </property> 66 | <property> 67 | <name>fs.s3a.connection.ssl.enabled</name> 68 | <value>false</value> 69 | </property> 70 | <property> 71 | <name>hive.metastore.authorization.storage.checks</name> 72 | <value>false</value> 73 | <description>Disables storage-based authorization checks to allow Hive better compatibility with MinIO.</description>
74 | </property> 75 | 76 | <property> 77 | <name>hive.metastore.pre.event.listeners</name> 78 | <value>org.apache.hadoop.hive.ql.security.authorization.AuthorizationPreEventListener</value> 79 | </property> 80 | <property> 81 | <name>hive.security.metastore.authorization.manager</name> 82 | <value>org.apache.hadoop.hive.ql.security.authorization.StorageBasedAuthorizationProvider</value> 83 | </property> 84 | </configuration> -------------------------------------------------------------------------------- /drivers/mysql/internal/cdc.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/datazip-inc/olake/drivers/abstract" 9 | "github.com/datazip-inc/olake/pkg/binlog" 10 | "github.com/datazip-inc/olake/types" 11 | "github.com/datazip-inc/olake/utils" 12 | ) 13 | 14 | func (m *MySQL) prepareBinlogConn(ctx context.Context, globalState MySQLGlobalState, streams []types.StreamInterface) (*binlog.Connection, error) { 15 | if !m.CDCSupport { 16 | return nil, fmt.Errorf("invalid call; %s not running in CDC mode", m.Type()) 17 | } 18 | 19 | // validate global state 20 | if globalState.ServerID == 0 { 21 | return nil, fmt.Errorf("invalid global state; server_id is missing") 22 | } 23 | // TODO: Support all flavours of MySQL 24 | config := &binlog.Config{ 25 | ServerID: globalState.ServerID, 26 | Flavor: "mysql", 27 | Host: m.config.Host, 28 | Port: uint16(m.config.Port), 29 | User: m.config.Username, 30 | Password: m.config.Password, 31 | Charset: "utf8mb4", 32 | VerifyChecksum: true, 33 | HeartbeatPeriod: 30 * time.Second, 34 | InitialWaitTime: time.Duration(m.cdcConfig.InitialWaitTime) * time.Second, 35 | SSHClient: m.sshClient, 36 | } 37 | 38 | return binlog.NewConnection(ctx, config, globalState.State.Position, streams, m.dataTypeConverter) 39 | } 40 | 41 | func (m *MySQL) PreCDC(ctx context.Context, streams []types.StreamInterface) error { 42 | // Load or initialize global state 43 | globalState := m.state.GetGlobal() 44 | if globalState == nil || globalState.State == nil { 45 | binlogPos, err := binlog.GetCurrentBinlogPosition(ctx, m.client) 46 | if err != nil { 47 | return fmt.Errorf("failed to get current binlog position: %s", err) 48 | } 49 | m.state.SetGlobal(MySQLGlobalState{ServerID: uint32(1000 + time.Now().UnixNano()%4294966295), State: binlog.Binlog{Position: binlogPos}}) 50 | m.state.ResetStreams() 51 | // reinit state 52 | globalState = m.state.GetGlobal() 53 | } 54 | 55 | var mySQLGlobalState MySQLGlobalState 56 | if err := utils.Unmarshal(globalState.State, &mySQLGlobalState); err != nil { 57 | return fmt.Errorf("failed to unmarshal global state: %s", err) 58 | } 59 | 60 | conn, err := m.prepareBinlogConn(ctx, mySQLGlobalState, streams) 61 | if err != nil { 62 | return fmt.Errorf("failed to prepare binlog conn: %s", err) 63 | } 64 | m.BinlogConn = conn 65 | return nil 66 | } 67 | 68 | func (m *MySQL) StreamChanges(ctx context.Context, _ types.StreamInterface, OnMessage abstract.CDCMsgFn) error { 69 | return m.BinlogConn.StreamMessages(ctx, m.client, OnMessage) 70 | } 71 | 72 | func (m *MySQL) PostCDC(ctx context.Context, stream types.StreamInterface, noErr bool, _ string) error { 73 | if noErr { 74 | m.state.SetGlobal(MySQLGlobalState{ 75 | ServerID: m.BinlogConn.ServerID, 76 | State: binlog.Binlog{ 77 | Position: m.BinlogConn.CurrentPos, 78 | }, 79 | }) 80 | // TODO: Research acknowledgment of binlogs in MySQL 81 | } 82 | m.BinlogConn.Cleanup() 83 | return nil 84 | } 85 | --------------------------------------------------------------------------------
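Note: the three hooks above (PreCDC, StreamChanges, PostCDC) are invoked by the abstract driver layer. A minimal sketch of the call order, written as if inside the mysql driver package and assuming a configured *MySQL instance and selected streams (the runCDCOnce helper is hypothetical, not part of the repo):

    // sketch: how the CDC hooks chain together for one streaming pass
    func runCDCOnce(ctx context.Context, m *MySQL, streams []types.StreamInterface) error {
    	if err := m.PreCDC(ctx, streams); err != nil {
    		return fmt.Errorf("pre cdc failed: %s", err)
    	}
    	streamErr := m.StreamChanges(ctx, streams[0], func(ctx context.Context, change abstract.CDCChange) error {
    		// every binlog event surfaces as an abstract.CDCChange carrying stream, kind, timestamp, and row data
    		logger.Infof("received %s on stream %s", change.Kind, change.Stream.ID())
    		return nil
    	})
    	// PostCDC persists the advanced binlog position only when streaming succeeded
    	return m.PostCDC(ctx, streams[0], streamErr == nil, "")
    }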
/drivers/postgres/internal/testdata/test_streams.json: -------------------------------------------------------------------------------- 1 | {"selected_streams":{"public":[{"partition_regex":"","stream_name":"postgres_test_table_olake","normalization":true}]},"streams":[{"stream":{"name":"postgres_test_table_olake","namespace":"public","type_schema":{"properties":{"_cdc_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_cdc_timestamp"},"_olake_id":{"type":["string","null"],"destination_column_name":"_olake_id"},"_olake_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_olake_timestamp"},"_op_type":{"type":["string","null"],"destination_column_name":"_op_type"},"col_bigint":{"type":["integer","null"],"destination_column_name":"col_bigint"},"col_bigserial":{"type":["integer"],"destination_column_name":"col_bigserial"},"col_bool":{"type":["boolean","null"],"destination_column_name":"col_bool"},"col_char":{"type":["string","null"],"destination_column_name":"col_char"},"col_character":{"type":["string","null"],"destination_column_name":"col_character"},"col_character_varying":{"type":["string","null"],"destination_column_name":"col_character_varying"},"col_date":{"type":["timestamp","null"],"destination_column_name":"col_date"},"col_decimal":{"type":["number","null"],"destination_column_name":"col_decimal"},"col_double_precision":{"type":["number","null"],"destination_column_name":"col_double_precision"},"col_float4":{"type":["null","number_small"],"destination_column_name":"col_float4"},"col_int":{"type":["integer_small","null"],"destination_column_name":"col_int"},"col_int2":{"type":["integer_small","null"],"destination_column_name":"col_int2"},"col_integer":{"type":["integer_small","null"],"destination_column_name":"col_integer"},"col_interval":{"type":["string","null"],"destination_column_name":"col_interval"},"col_json":{"type":["string","null"],"destination_column_name":"col_json"},"col_jsonb":{"type":["string","null"],"destination_column_name":"col_jsonb"},"col_name":{"type":["string","null"],"destination_column_name":"col_name"},"col_numeric":{"type":["number","null"],"destination_column_name":"col_numeric"},"col_real":{"type":["number_small","null"],"destination_column_name":"col_real"},"col_text":{"type":["string","null"],"destination_column_name":"col_text"},"col_timestamp":{"type":["timestamp","null"],"destination_column_name":"col_timestamp"},"col_timestamptz":{"type":["timestamp","null"],"destination_column_name":"col_timestamptz"},"col_uuid":{"type":["string","null"],"destination_column_name":"col_uuid"},"col_varbit":{"type":["string","null"],"destination_column_name":"col_varbit"},"col_xml":{"type":["string","null"],"destination_column_name":"col_xml"}}},"supported_sync_modes":["incremental","cdc","strict_cdc","full_refresh"],"source_defined_primary_key":["col_bigserial"],"available_cursor_fields":["col_text","col_timestamptz","col_bigserial","col_bool","col_character","col_int","col_numeric","col_uuid","col_xml","col_double_precision","col_integer","col_jsonb","col_name","col_timestamp","col_json","col_real","col_date","col_decimal","col_float4","col_int2","col_interval","col_bigint","col_char","col_character_varying","col_varbit"],"sync_mode":"cdc","destination_database":"postgres_postgres:public","destination_table":"postgres_test_table_olake"}}]} -------------------------------------------------------------------------------- 
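The testdata catalogs above use the same streams.json shape the protocol commands consume; a rough sketch (fixture path illustrative, error handling minimal) of loading one with the repo's own helper, as clear.go and discover.go do:

    // sketch: read a streams fixture into a catalog
    catalog := &types.Catalog{}
    if err := utils.UnmarshalFile("internal/testdata/test_streams.json", catalog, false); err != nil {
    	logger.Error(fmt.Errorf("failed to read streams fixture: %s", err))
    }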
/destination/iceberg/olake-iceberg-java-writer/src/main/java/io/debezium/server/iceberg/tableoperator/BaseDeltaTaskWriter.java: -------------------------------------------------------------------------------- 1 | package io.debezium.server.iceberg.tableoperator; 2 | 3 | import com.google.common.collect.Sets; 4 | import org.apache.iceberg.*; 5 | import org.apache.iceberg.data.InternalRecordWrapper; 6 | import org.apache.iceberg.data.Record; 7 | import org.apache.iceberg.io.BaseTaskWriter; 8 | import org.apache.iceberg.io.FileAppenderFactory; 9 | import org.apache.iceberg.io.FileIO; 10 | import org.apache.iceberg.io.OutputFileFactory; 11 | import org.apache.iceberg.types.TypeUtil; 12 | 13 | import java.io.IOException; 14 | import java.util.Set; 15 | 16 | abstract class BaseDeltaTaskWriter extends BaseTaskWriter<Record> { 17 | 18 | private final Schema schema; 19 | private final Schema deleteSchema; 20 | private final InternalRecordWrapper wrapper; 21 | private final InternalRecordWrapper keyWrapper; 22 | private final boolean keepDeletes; 23 | private final RecordProjection keyProjection; 24 | 25 | BaseDeltaTaskWriter(PartitionSpec spec, 26 | FileFormat format, 27 | FileAppenderFactory<Record> appenderFactory, 28 | OutputFileFactory fileFactory, 29 | FileIO io, 30 | long targetFileSize, 31 | Schema schema, 32 | Set<Integer> identifierFieldIds, 33 | boolean keepDeletes) { 34 | super(spec, format, appenderFactory, fileFactory, io, targetFileSize); 35 | this.schema = schema; 36 | this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(identifierFieldIds)); 37 | this.wrapper = new InternalRecordWrapper(schema.asStruct()); 38 | this.keyWrapper = new InternalRecordWrapper(deleteSchema.asStruct()); 39 | this.keyProjection = RecordProjection.create(schema, deleteSchema); 40 | this.keepDeletes = keepDeletes; 41 | } 42 | 43 | abstract RowDataDeltaWriter route(Record row); 44 | 45 | InternalRecordWrapper wrapper() { 46 | return wrapper; 47 | } 48 | 49 | @Override 50 | public void write(Record row) throws IOException { 51 | RowDataDeltaWriter writer = route(row); 52 | Operation rowOperation = ((RecordWrapper) row).op(); 53 | if (rowOperation == Operation.DELETE && !keepDeletes) { 54 | // hard delete:
when keepDeletes is false, the deleted record is dropped instead of being retained 55 | writer.deleteKey(keyProjection.wrap(row)); 56 | } else { 57 | // Delete the key even for insert operations, so that rows arriving while a full load is in progress do not create duplicates 58 | writer.deleteKey(keyProjection.wrap(row)); 59 | writer.write(row); 60 | } 61 | } 62 | 63 | public class RowDataDeltaWriter extends BaseEqualityDeltaWriter { 64 | RowDataDeltaWriter(PartitionKey partition) { 65 | super(partition, schema, deleteSchema); 66 | } 67 | 68 | @Override 69 | protected StructLike asStructLike(Record data) { 70 | return wrapper.wrap(data); 71 | } 72 | 73 | @Override 74 | protected StructLike asStructLikeKey(Record data) { 75 | return keyWrapper.wrap(data); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /protocol/discover.go: -------------------------------------------------------------------------------- 1 | package protocol 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "time" 8 | 9 | "github.com/datazip-inc/olake/constants" 10 | "github.com/datazip-inc/olake/types" 11 | "github.com/datazip-inc/olake/utils" 12 | "github.com/datazip-inc/olake/utils/logger" 13 | "github.com/datazip-inc/olake/utils/telemetry" 14 | "github.com/spf13/cobra" 15 | "github.com/spf13/viper" 16 | ) 17 | 18 | var discoverCmd = &cobra.Command{ 19 | Use: "discover", 20 | Short: "discover command", 21 | PreRunE: func(_ *cobra.Command, _ []string) error { 22 | if streamsPath != "" && differencePath != "" { 23 | return nil 24 | } 25 | if configPath == "" { 26 | return fmt.Errorf("--config not passed") 27 | } 28 | 29 | if err := utils.UnmarshalFile(configPath, connector.GetConfigRef(), true); err != nil { 30 | return err 31 | } 32 | destinationDatabasePrefix = utils.Ternary(destinationDatabasePrefix == "", connector.Type(), destinationDatabasePrefix).(string) 33 | viper.Set(constants.DestinationDatabasePrefix, destinationDatabasePrefix) 34 | if streamsPath != "" { 35 | if err := utils.UnmarshalFile(streamsPath, &catalog, false); err != nil { 36 | return fmt.Errorf("failed to read streams from %s: %s", streamsPath, err) 37 | } 38 | } 39 | return nil 40 | }, 41 | RunE: func(cmd *cobra.Command, _ []string) error { 42 | if streamsPath != "" && differencePath != "" { 43 | return compareStreams() 44 | } 45 | 46 | err := connector.Setup(cmd.Context()) 47 | if err != nil { 48 | return err 49 | } 50 | 51 | // build discover ctx 52 | discoverTimeout := utils.Ternary(timeout == -1, constants.DefaultDiscoverTimeout, time.Duration(timeout)*time.Second).(time.Duration) 53 | discoverCtx, cancel := context.WithTimeout(cmd.Context(), discoverTimeout) 54 | defer cancel() 55 | 56 | streams, err := connector.Discover(discoverCtx) 57 | if err != nil { 58 | return err 59 | } 60 | 61 | if len(streams) == 0 { 62 | return errors.New("no streams found in connector") 63 | } 64 | types.LogCatalog(streams, catalog, connector.Type()) 65 | 66 | // Discover Telemetry Tracking 67 | defer func() { 68 | telemetry.TrackDiscover(len(streams), connector.Type()) 69 | logger.Infof("Discover completed, waiting 5 seconds for cleanup...") 70 | time.Sleep(5 * time.Second) 71 | }() 72 | return nil 73 | }, 74 | } 75 | 76 | // compareStreams reads two streams.json files, computes the difference, and writes the result to difference_streams.json 77 | func compareStreams() error { 78 | var oldStreams, newStreams types.Catalog 79 | if serr := utils.UnmarshalFile(streamsPath, &oldStreams, false); serr != nil { 80 | return
fmt.Errorf("failed to read old catalog: %s", serr) 81 | } 82 | 83 | if derr := utils.UnmarshalFile(differencePath, &newStreams, false); derr != nil { 84 | return fmt.Errorf("failed to read new catalog: %s", derr) 85 | } 86 | 87 | diffCatalog := types.GetStreamsDelta(&oldStreams, &newStreams) 88 | if err := logger.FileLoggerWithPath(diffCatalog, viper.GetString(constants.DifferencePath)); err != nil { 89 | return fmt.Errorf("failed to write difference streams: %s", err) 90 | } 91 | logger.Infof("Successfully wrote stream differences") 92 | return nil 93 | } 94 | -------------------------------------------------------------------------------- /drivers/oracle/resources/spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "connection_type": { 5 | "type": "string", 6 | "title": "Connection Type", 7 | "description": "Method used to establish a connection between the Oracle database server", 8 | "enum": [ 9 | "sid", 10 | "service_name" 11 | ], 12 | "default": "service_name" 13 | }, 14 | "host": { 15 | "type": "string", 16 | "title": "Host", 17 | "description": "Database host addresses for connection" 18 | }, 19 | "port": { 20 | "type": "integer", 21 | "title": "Port", 22 | "description": "Database server listening port", 23 | "default": 1521 24 | }, 25 | "username": { 26 | "type": "string", 27 | "title": "Username", 28 | "description": "Username used to authenticate with the database" 29 | }, 30 | "password": { 31 | "type": "string", 32 | "title": "Password", 33 | "description": "Password for database authentication", 34 | "format": "password" 35 | }, 36 | "max_threads": { 37 | "type": "integer", 38 | "title": "Max Threads", 39 | "description": "Maximum concurrent threads for data sync", 40 | "default": 3 41 | }, 42 | "backoff_retry_count": { 43 | "type": "integer", 44 | "title": "Retry Count", 45 | "description": "Number of sync retries (exponential backoff on failure)", 46 | "default": 3 47 | }, 48 | "ssl": { 49 | "type": "object", 50 | "properties": { 51 | "mode": { 52 | "type": "string", 53 | "title": "SSL Mode", 54 | "description": "Database connection SSL configuration (e.g., SSL mode)", 55 | "default": "disable", 56 | "enum": [ 57 | "require", 58 | "disable", 59 | "verify-ca", 60 | "verify-full" 61 | ] 62 | } 63 | }, 64 | "required": [ 65 | "mode" 66 | ] 67 | }, 68 | "jdbc_url_params": { 69 | "type": "object", 70 | "title": "JDBC URL Parameters", 71 | "description": "Additional JDBC URL parameters for connection tuning", 72 | "additionalProperties": true 73 | } 74 | }, 75 | "required": [ 76 | "connection_type", 77 | "host", 78 | "port", 79 | "username" 80 | ], 81 | "dependencies": { 82 | "connection_type": { 83 | "oneOf": [ 84 | { 85 | "properties": { 86 | "connection_type": { 87 | "const": "sid" 88 | }, 89 | "sid": { 90 | "type": "string", 91 | "title": "SID", 92 | "description": "Unique name that identifies an instance of an Oracle database" 93 | } 94 | }, 95 | "required": [ 96 | "sid" 97 | ] 98 | }, 99 | { 100 | "properties": { 101 | "connection_type": { 102 | "const": "service_name" 103 | }, 104 | "service_name": { 105 | "type": "string", 106 | "title": "Service Name", 107 | "description": "A logical database service to connect to" 108 | } 109 | }, 110 | "required": [ 111 | "service_name" 112 | ] 113 | } 114 | ] 115 | } 116 | } 117 | } -------------------------------------------------------------------------------- /drivers/abstract/backfill.go: 
-------------------------------------------------------------------------------- 1 | package abstract 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "sort" 7 | "time" 8 | 9 | "github.com/datazip-inc/olake/constants" 10 | "github.com/datazip-inc/olake/destination" 11 | "github.com/datazip-inc/olake/types" 12 | "github.com/datazip-inc/olake/utils" 13 | "github.com/datazip-inc/olake/utils/logger" 14 | "github.com/datazip-inc/olake/utils/typeutils" 15 | ) 16 | 17 | func (a *AbstractDriver) Backfill(ctx context.Context, backfilledStreams chan string, pool *destination.WriterPool, stream types.StreamInterface) error { 18 | chunksSet := a.state.GetChunks(stream.Self()) 19 | var err error 20 | if chunksSet == nil || chunksSet.Len() == 0 { 21 | chunksSet, err = a.driver.GetOrSplitChunks(ctx, pool, stream) 22 | if err != nil { 23 | return fmt.Errorf("failed to get or split chunks: %s", err) 24 | } 25 | // set state chunks 26 | a.state.SetChunks(stream.Self(), chunksSet) 27 | } 28 | chunks := chunksSet.Array() 29 | if len(chunks) == 0 { 30 | if backfilledStreams != nil { 31 | backfilledStreams <- stream.ID() 32 | } 33 | return nil 34 | } 35 | 36 | // Sort chunks by their minimum value 37 | sort.Slice(chunks, func(i, j int) bool { 38 | return typeutils.Compare(chunks[i].Min, chunks[j].Min) < 0 39 | }) 40 | logger.Infof("Starting backfill for stream[%s] with %d chunks", stream.GetStream().Name, len(chunks)) 41 | // TODO: create writer instance again on retry 42 | chunkProcessor := func(ctx context.Context, chunk types.Chunk) (err error) { 43 | threadID := fmt.Sprintf("%s_%s", stream.ID(), utils.ULID()) 44 | inserter, err := pool.NewWriter(ctx, stream, destination.WithBackfill(true), destination.WithThreadID(threadID)) 45 | if err != nil { 46 | return fmt.Errorf("failed to create new writer thread: %s", err) 47 | } 48 | logger.Infof("Thread[%s]: created writer for chunk min[%s] and max[%s] of stream %s", threadID, chunk.Min, chunk.Max, stream.ID()) 49 | defer func() { 50 | // wait for chunk completion 51 | if writerErr := inserter.Close(ctx); writerErr != nil { 52 | err = fmt.Errorf("failed to insert chunk min[%s] and max[%s] of stream %s, insert func error: %s, thread error: %s", chunk.Min, chunk.Max, stream.ID(), err, writerErr) 53 | } 54 | 55 | // check for panics before saving state 56 | if r := recover(); r != nil { 57 | err = fmt.Errorf("panic recovered in backfill: %v, prev error: %s", r, err) 58 | } 59 | 60 | if err == nil { 61 | logger.Infof("finished chunk min[%v] and max[%v] of stream %s", chunk.Min, chunk.Max, stream.ID()) 62 | chunksLeft := a.state.RemoveChunk(stream.Self(), chunk) 63 | if chunksLeft == 0 && backfilledStreams != nil { 64 | backfilledStreams <- stream.ID() 65 | } 66 | } else { 67 | err = fmt.Errorf("thread[%s]: %s", threadID, err) 68 | } 69 | }() 70 | return RetryOnBackoff(a.driver.MaxRetries(), constants.DefaultRetryTimeout, func() error { 71 | return a.driver.ChunkIterator(ctx, stream, chunk, func(ctx context.Context, data map[string]any) error { 72 | olakeID := utils.GetKeysHash(data, stream.GetStream().SourceDefinedPrimaryKey.Array()...) 
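// olakeID is a deterministic hash of the record's primary-key values, so the same source row always maps to the same identifier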
73 | 74 | // persist cdc timestamp for cdc full load 75 | var cdcTimestamp *time.Time 76 | if stream.GetSyncMode() == types.CDC { 77 | t := time.Unix(0, 0) 78 | cdcTimestamp = &t 79 | } 80 | 81 | return inserter.Push(ctx, types.CreateRawRecord(olakeID, data, "r", cdcTimestamp)) 82 | }) 83 | }) 84 | } 85 | utils.ConcurrentInGroup(a.GlobalConnGroup, chunks, chunkProcessor) 86 | return nil 87 | } 88 | -------------------------------------------------------------------------------- /drivers/postgres/internal/config.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "strings" 7 | 8 | "github.com/datazip-inc/olake/constants" 9 | "github.com/datazip-inc/olake/utils" 10 | ) 11 | 12 | type Config struct { 13 | Connection *url.URL `json:"-"` 14 | Host string `json:"host"` 15 | Port int `json:"port"` 16 | Database string `json:"database"` 17 | Username string `json:"username"` 18 | Password string `json:"password"` 19 | JDBCURLParams map[string]string `json:"jdbc_url_params"` 20 | SSLConfiguration *utils.SSLConfig `json:"ssl"` 21 | UpdateMethod interface{} `json:"update_method"` 22 | MaxThreads int `json:"max_threads"` 23 | RetryCount int `json:"retry_count"` 24 | SSHConfig *utils.SSHConfig `json:"ssh_config"` 25 | } 26 | 27 | // Capture Write Ahead Logs 28 | type CDC struct { 29 | ReplicationSlot string `json:"replication_slot"` 30 | // initial wait time must be in range [120,2400), default value 1200 31 | InitialWaitTime int `json:"initial_wait_time"` 32 | // Publications used when OutputPlugin is pgoutput 33 | Publication string `json:"publication"` 34 | } 35 | 36 | func (c *Config) Validate() error { 37 | if c.Host == "" { 38 | return fmt.Errorf("empty host name") 39 | } else if strings.Contains(c.Host, "https") || strings.Contains(c.Host, "http") { 40 | return fmt.Errorf("host should not contain http or https") 41 | } 42 | 43 | // Validate port 44 | if c.Port <= 0 || c.Port > 65535 { 45 | return fmt.Errorf("invalid port number: must be between 1 and 65535") 46 | } 47 | 48 | // default number of threads 49 | if c.MaxThreads <= 0 { 50 | c.MaxThreads = constants.DefaultThreadCount 51 | } 52 | 53 | // Add the connection parameters to the url 54 | parsed := &url.URL{ 55 | Scheme: "postgres", 56 | User: utils.Ternary(c.Password != "", url.UserPassword(c.Username, c.Password), url.User(c.Username)).(*url.Userinfo), 57 | Host: fmt.Sprintf("%s:%d", c.Host, c.Port), 58 | Path: "/" + c.Database, 59 | } 60 | 61 | query := parsed.Query() 62 | 63 | // Set additional connection parameters if available 64 | if len(c.JDBCURLParams) > 0 { 65 | for k, v := range c.JDBCURLParams { 66 | query.Add(k, v) 67 | } 68 | } 69 | 70 | if c.SSLConfiguration == nil { 71 | c.SSLConfiguration = &utils.SSLConfig{ 72 | Mode: "disable", 73 | } 74 | } 75 | 76 | sslmode := string(c.SSLConfiguration.Mode) 77 | if sslmode != "" { 78 | query.Add("sslmode", sslmode) 79 | } 80 | 81 | err := c.SSLConfiguration.Validate() 82 | if err != nil { 83 | return fmt.Errorf("failed to validate ssl config: %s", err) 84 | } 85 | 86 | if c.SSLConfiguration.ServerCA != "" { 87 | query.Add("sslrootcert", c.SSLConfiguration.ServerCA) 88 | } 89 | 90 | if c.SSLConfiguration.ClientCert != "" { 91 | query.Add("sslcert", c.SSLConfiguration.ClientCert) 92 | } 93 | 94 | if c.SSLConfiguration.ClientKey != "" { 95 | query.Add("sslkey", c.SSLConfiguration.ClientKey) 96 | } 97 | parsed.RawQuery = query.Encode() 98 | c.Connection = parsed 99 | 100 | return nil 101 | } 
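// Table and ColumnDetails below map rows scanned by the schema-discovery queries (see the db struct tags)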
102 | 103 | type Table struct { 104 | Schema string `db:"table_schema"` 105 | Name string `db:"table_name"` 106 | } 107 | 108 | type ColumnDetails struct { 109 | Name string `db:"column_name"` 110 | DataType *string `db:"data_type"` 111 | IsNullable *string `db:"is_nullable"` 112 | } 113 | -------------------------------------------------------------------------------- /utils/typeutils/datatype.go: -------------------------------------------------------------------------------- 1 | package typeutils 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "strings" 7 | "time" 8 | 9 | "github.com/datazip-inc/olake/types" 10 | "github.com/datazip-inc/olake/utils" 11 | ) 12 | 13 | func TypeFromValue(v interface{}) types.DataType { 14 | if v == nil { 15 | return types.Null 16 | } 17 | 18 | // Check if v is a pointer and get the underlying element type if it is 19 | valType := reflect.TypeOf(v) 20 | if valType.Kind() == reflect.Pointer { 21 | val := reflect.ValueOf(v) 22 | if val.IsNil() { 23 | return types.Null 24 | } 25 | return TypeFromValue(val.Elem().Interface()) 26 | } 27 | 28 | switch valType.Kind() { 29 | case reflect.Invalid: 30 | return types.Null 31 | case reflect.Bool: 32 | return types.Bool 33 | case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, 34 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32: 35 | return types.Int32 36 | case reflect.Int64, reflect.Uint64: 37 | return types.Int64 38 | case reflect.Float32: 39 | return types.Float32 40 | case reflect.Float64: 41 | return types.Float64 42 | case reflect.String: 43 | t, err := ReformatDate(v) 44 | if err == nil { 45 | return detectTimestampPrecision(t) 46 | } 47 | return types.String 48 | case reflect.Slice, reflect.Array: 49 | return types.Array 50 | case reflect.Map: 51 | return types.Object 52 | default: 53 | // Check if the type is time.Time for timestamp detection 54 | if valType == reflect.TypeOf(time.Time{}) { 55 | return detectTimestampPrecision(v.(time.Time)) 56 | } 57 | 58 | return types.Unknown 59 | } 60 | } 61 | 62 | func MaximumOnDataType[T any](typ types.DataType, a, b T) (T, error) { 63 | switch typ { 64 | case types.Timestamp: 65 | adate, err := ReformatDate(a) 66 | if err != nil { 67 | return a, fmt.Errorf("failed to reformat[%v] while comparing: %s", a, err) 68 | } 69 | bdate, err := ReformatDate(b) 70 | if err != nil { 71 | return a, fmt.Errorf("failed to reformat[%v] while comparing: %s", b, err) 72 | } 73 | 74 | if utils.MaxDate(adate, bdate) == adate { 75 | return a, nil 76 | } 77 | 78 | return b, nil 79 | case types.Int64: 80 | aint, err := ReformatInt64(a) 81 | if err != nil { 82 | return a, fmt.Errorf("failed to reformat[%v] while comparing: %s", a, err) 83 | } 84 | 85 | bint, err := ReformatInt64(b) 86 | if err != nil { 87 | return a, fmt.Errorf("failed to reformat[%v] while comparing: %s", b, err) 88 | } 89 | 90 | if aint > bint { 91 | return a, nil 92 | } 93 | 94 | return b, nil 95 | default: 96 | return a, fmt.Errorf("comparison not supported for data type %v", typ) 97 | } 98 | } 99 | 100 | // Detect timestamp precision depending on time value 101 | func detectTimestampPrecision(t time.Time) types.DataType { 102 | nanos := t.Nanosecond() 103 | if nanos == 0 { // if there is no nanosecond component 104 | return types.Timestamp 105 | } 106 | switch { 107 | case nanos%int(time.Millisecond) == 0: 108 | return types.TimestampMilli // store in milliseconds 109 | case nanos%int(time.Microsecond) == 0: 110 | return types.TimestampMicro // store in microseconds 111 | default: 112 | return
115 | 116 | func ExtractAndMapColumnType(columnType string, typeMapping map[string]types.DataType) types.DataType { 117 | // extracts the base type (e.g., varchar(50) -> varchar) 118 | baseType := strings.ToLower(strings.TrimSpace(strings.Split(columnType, "(")[0])) 119 | return typeMapping[baseType] 120 | } 121 | -------------------------------------------------------------------------------- /drivers/postgres/internal/datatype_conversion.go: -------------------------------------------------------------------------------- 1 | package driver 2 | 3 | import ( 4 | "github.com/datazip-inc/olake/types" 5 | ) 6 | 7 | var pgTypeToDataTypes = map[string]types.DataType{ 8 | // TODO: add proper types (not only int64) 9 | "bigint": types.Int64, 10 | "int8": types.Int64, 11 | "tinyint": types.Int32, 12 | "integer": types.Int32, 13 | "smallint": types.Int32, 14 | "smallserial": types.Int32, 15 | "int": types.Int32, 16 | "int2": types.Int32, 17 | "int4": types.Int32, 18 | "serial": types.Int32, 19 | "serial2": types.Int32, 20 | "serial4": types.Int32, 21 | "serial8": types.Int64, 22 | "bigserial": types.Int64, 23 | 24 | // numbers 25 | "decimal": types.Float64, 26 | "numeric": types.Float64, 27 | "double precision": types.Float64, 28 | "float": types.Float32, 29 | "float4": types.Float32, 30 | "float8": types.Float64, 31 | "real": types.Float32, 32 | 33 | // boolean 34 | "bool": types.Bool, 35 | "boolean": types.Bool, 36 | 37 | // strings 38 | "bit varying": types.String, 39 | "box": types.String, 40 | "bytea": types.String, 41 | "character": types.String, 42 | "char": types.String, 43 | "varbit": types.String, 44 | "bit": types.String, 45 | "bit(n)": types.String, 46 | "varying(n)": types.String, 47 | "cidr": types.String, 48 | "inet": types.String, 49 | "macaddr": types.String, 50 | "macaddr8": types.String, 51 | "character varying": types.String, 52 | "text": types.String, 53 | "varchar": types.String, 54 | "longvarchar": types.String, 55 | "circle": types.String, 56 | "hstore": types.String, 57 | "name": types.String, 58 | "uuid": types.String, 59 | "json": types.String, 60 | "jsonb": types.String, 61 | "line": types.String, 62 | "lseg": types.String, 63 | "money": types.String, 64 | "path": types.String, 65 | "pg_lsn": types.String, 66 | "point": types.String, 67 | "polygon": types.String, 68 | "tsquery": types.String, 69 | "tsvector": types.String, 70 | "xml": types.String, 71 | "enum": types.String, 72 | "tsrange": types.String, 73 | "bpchar": types.String, // blank-padded character 74 | 75 | // date/time 76 | "time": types.String, 77 | "timez": types.String, 78 | "interval": types.String, 79 | "date": types.Timestamp, 80 | "timestamp": types.Timestamp, 81 | "timestampz": types.Timestamp, 82 | "timestamp with time zone": types.Timestamp, 83 | "timestamp without time zone": types.Timestamp, 84 | "timestamptz": types.Timestamp, 85 | 86 | // arrays 87 | "ARRAY": types.Array, 88 | "array": types.Array, 89 | "bool[]": types.Array, 90 | "int2[]": types.Array, 91 | "int4[]": types.Array, 92 | "text[]": types.Array, 93 | "bytea[]": types.Array, 94 | "int8[]": types.Array, 95 | "float4[]": types.Array, 96 | "float8[]": types.Array, 97 | "timestamp[]": types.Array, 98 | "date[]": types.Array, 99 | "timestamptz[]": types.Array, 100 | "numeric[]": types.Array, 101 | "uuid[]": types.Array, 102 | "jsonb[]": types.Array, 103 | } 104 | --------------------------------------------------------------------------------
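The two pieces above compose naturally: ExtractAndMapColumnType normalizes the type string reported by information_schema before it is looked up in a map such as pgTypeToDataTypes. An illustrative call (a sketch, not code from the repository):

    // "character varying(255)" -> base type "character varying" -> types.String
    dt := typeutils.ExtractAndMapColumnType("character varying(255)", pgTypeToDataTypes)
    _ = dt // an unmapped base type falls through to the map's zero value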
/drivers/postgres/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/datazip-inc/olake/drivers/postgres 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.3 6 | 7 | require ( 8 | github.com/datazip-inc/olake v0.0.0-20250414061859-a168ad00bb4b 9 | github.com/jackc/pglogrepl v0.0.0-20250322012620-f1e2b1498ed6 10 | github.com/lib/pq v1.10.9 11 | ) 12 | 13 | require ( 14 | github.com/andybalholm/brotli v1.1.1 // indirect 15 | github.com/apache/thrift v0.21.0 // indirect 16 | github.com/aws/aws-sdk-go v1.55.6 // indirect 17 | github.com/felixge/fgprof v0.9.5 // indirect 18 | github.com/fsnotify/fsnotify v1.8.0 // indirect 19 | github.com/gabriel-vasile/mimetype v1.4.8 // indirect 20 | github.com/go-playground/locales v0.14.1 // indirect 21 | github.com/go-playground/universal-translator v0.18.1 // indirect 22 | github.com/go-playground/validator/v10 v10.25.0 // indirect 23 | github.com/go-viper/mapstructure/v2 v2.3.0 // indirect 24 | github.com/goccy/go-json v0.10.5 // indirect 25 | github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e // indirect 26 | github.com/google/uuid v1.6.0 // indirect 27 | github.com/gorilla/mux v1.8.1 // indirect 28 | github.com/hashicorp/errwrap v1.1.0 // indirect 29 | github.com/hashicorp/go-multierror v1.1.1 // indirect 30 | github.com/jackc/chunkreader/v2 v2.0.1 // indirect 31 | github.com/jackc/pgconn v1.14.3 // indirect 32 | github.com/jackc/pgio v1.0.0 // indirect 33 | github.com/jackc/pgpassfile v1.0.0 // indirect 34 | github.com/jackc/pgproto3/v2 v2.3.3 // indirect 35 | github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect 36 | github.com/jackc/pgtype v1.14.0 // indirect 37 | github.com/jackc/pgx/v5 v5.7.3 // indirect 38 | github.com/jmespath/go-jmespath v0.4.0 // indirect 39 | github.com/klauspost/compress v1.18.0 // indirect 40 | github.com/leodido/go-urn v1.4.0 // indirect 41 | github.com/mattn/go-colorable v0.1.14 // indirect 42 | github.com/mattn/go-isatty v0.0.20 // indirect 43 | github.com/mattn/go-runewidth v0.0.16 // indirect 44 | github.com/mitchellh/hashstructure v1.1.0 // indirect 45 | github.com/oklog/ulid v1.3.1 // indirect 46 | github.com/olekukonko/tablewriter v0.0.5 // indirect 47 | github.com/parquet-go/parquet-go v0.25.0 // indirect 48 | github.com/pelletier/go-toml/v2 v2.2.3 // indirect 49 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 50 | github.com/rivo/uniseg v0.4.7 // indirect 51 | github.com/rs/zerolog v1.34.0 // indirect 52 | github.com/sagikazarmark/locafero v0.8.0 // indirect 53 | github.com/sourcegraph/conc v0.3.0 // indirect 54 | github.com/spf13/afero v1.14.0 // indirect 55 | github.com/spf13/cast v1.7.1 // indirect 56 | github.com/spf13/viper v1.20.1 // indirect 57 | github.com/subosito/gotenv v1.6.0 // indirect 58 | github.com/xitongsys/parquet-go v1.6.2 // indirect 59 | github.com/xitongsys/parquet-go-source v0.0.0-20241021075129-b732d2ac9c9b // indirect 60 | go.mongodb.org/mongo-driver v1.17.3 // indirect 61 | go.uber.org/multierr v1.11.0 // indirect 62 | golang.org/x/crypto v0.37.0 // indirect 63 | golang.org/x/net v0.38.0 // indirect 64 | golang.org/x/sync v0.13.0 // indirect 65 | golang.org/x/text v0.24.0 // indirect 66 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect 67 | gopkg.in/yaml.v3 v3.0.1 // indirect 68 | ) 69 | 70 | require ( 71 | github.com/brainicorn/ganno v0.0.0-20220304182003-e638228cd865 // indirect 72 | github.com/brainicorn/goblex v0.0.0-20220304181919-81f017b0ee95 // indirect 73 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 74 |
github.com/jackc/pgx/v4 v4.18.2 75 | github.com/jmoiron/sqlx v1.4.0 76 | github.com/spf13/cobra v1.9.1 // indirect 77 | github.com/spf13/pflag v1.0.6 // indirect 78 | golang.org/x/sys v0.32.0 // indirect 79 | golang.org/x/tools v0.30.0 // indirect 80 | sigs.k8s.io/yaml v1.4.0 // indirect 81 | ) -------------------------------------------------------------------------------- /drivers/mysql/internal/testdata/test_streams.json: -------------------------------------------------------------------------------- 1 | {"selected_streams":{"olake_mysql_test":[{"partition_regex":"","stream_name":"mysql_test_table_olake","normalization":true}]},"streams":[{"stream":{"name":"mysql_test_table_olake","namespace":"olake_mysql_test","type_schema":{"properties":{"_cdc_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_cdc_timestamp"},"_olake_id":{"type":["null","string"],"destination_column_name":"_olake_id"},"_olake_timestamp":{"type":["timestamp_micro","null"],"destination_column_name":"_olake_timestamp"},"_op_type":{"type":["string","null"],"destination_column_name":"_op_type"},"created_date":{"type":["timestamp","null"],"destination_column_name":"created_date"},"created_timestamp":{"type":["timestamp","null"],"destination_column_name":"created_timestamp"},"id":{"type":["integer_small"],"destination_column_name":"id"},"id_bigint":{"type":["integer","null"],"destination_column_name":"id_bigint"},"id_int":{"type":["integer_small","null"],"destination_column_name":"id_int"},"id_int_unsigned":{"type":["integer_small","null"],"destination_column_name":"id_int_unsigned"},"id_integer":{"type":["integer_small","null"],"destination_column_name":"id_integer"},"id_integer_unsigned":{"type":["integer_small","null"],"destination_column_name":"id_integer_unsigned"},"id_mediumint":{"type":["integer_small","null"],"destination_column_name":"id_mediumint"},"id_mediumint_unsigned":{"type":["integer_small","null"],"destination_column_name":"id_mediumint_unsigned"},"id_smallint":{"type":["integer_small","null"],"destination_column_name":"id_smallint"},"id_smallint_unsigned":{"type":["integer_small","null"],"destination_column_name":"id_smallint_unsigned"},"id_tinyint":{"type":["integer_small","null"],"destination_column_name":"id_tinyint"},"id_tinyint_unsigned":{"type":["integer_small","null"],"destination_column_name":"id_tinyint_unsigned"},"is_active":{"type":["integer_small","null"],"destination_column_name":"is_active"},"long_varchar":{"type":["string","null"],"destination_column_name":"long_varchar"},"name_bool":{"type":["null","integer_small"],"destination_column_name":"name_bool"},"name_char":{"type":["string","null"],"destination_column_name":"name_char"},"name_longtext":{"type":["null","string"],"destination_column_name":"name_longtext"},"name_mediumtext":{"type":["string","null"],"destination_column_name":"name_mediumtext"},"name_text":{"type":["string","null"],"destination_column_name":"name_text"},"name_tinytext":{"type":["string","null"],"destination_column_name":"name_tinytext"},"name_varchar":{"type":["string","null"],"destination_column_name":"name_varchar"},"price_decimal":{"type":["number_small","null"],"destination_column_name":"price_decimal"},"price_double":{"type":["number","null"],"destination_column_name":"price_double"},"price_double_precision":{"type":["number","null"],"destination_column_name":"price_double_precision"},"price_float":{"type":["number_small","null"],"destination_column_name":"price_float"},"price_numeric":{"type":["number_small","null"],"destination_column_name"
:"price_numeric"},"price_real":{"type":["number","null"],"destination_column_name":"price_real"}}},"supported_sync_modes":["incremental","cdc","strict_cdc","full_refresh"],"source_defined_primary_key":["id"],"available_cursor_fields":["id_mediumint","price_double","name_char","name_tinytext","name_longtext","id_int_unsigned","id_mediumint_unsigned","price_double_precision","price_real","id_integer_unsigned","price_float","name_mediumtext","price_decimal","price_numeric","id_bigint","id_int","id_integer","id_tinyint_unsigned","id","id_smallint_unsigned","name_varchar","is_active","name_bool","id_smallint","id_tinyint","name_text","created_timestamp","created_date","long_varchar"],"sync_mode":"cdc","destination_database":"mysql:olake_mysql_test","destination_table":"mysql_test_table_olake"}}]} -------------------------------------------------------------------------------- /utils/testutils/test_schema.go: -------------------------------------------------------------------------------- 1 | package testutils 2 | 3 | // GlobalTypeToDataType maps database-specific types (Postgres/MySQL/MongoDB) to internal standard types 4 | var GlobalTypeMapping = map[string]string{ 5 | // Integer Types 6 | "tinyint": "int", 7 | "smallint": "int", 8 | "mediumint": "int", 9 | "int": "int", 10 | "integer": "int", 11 | "unsigned int": "int", 12 | "unsigned smallint": "int", 13 | "unsigned tinyint": "int", 14 | "unsigned mediumint": "int", 15 | "int2": "int", 16 | "int4": "int", 17 | "smallserial": "int", 18 | "serial": "int", 19 | "serial2": "int", 20 | "serial4": "int", 21 | 22 | "bigint": "bigint", 23 | "int8": "bigint", 24 | "serial8": "bigint", 25 | "bigserial": "bigint", 26 | "year": "bigint", 27 | 28 | // Floating Point Types 29 | "float": "float", 30 | "real": "float", 31 | "decimal": "float", 32 | "numeric": "float", 33 | "float4": "float", 34 | "money": "float", 35 | 36 | "double": "double", 37 | "float8": "double", 38 | "double precision": "double", 39 | 40 | // Boolean Types 41 | "bool": "boolean", 42 | "boolean": "boolean", 43 | 44 | // String Types 45 | "char": "string", 46 | "varchar": "string", 47 | "tinytext": "string", 48 | "text": "string", 49 | "mediumtext": "string", 50 | "longtext": "string", 51 | "character": "string", 52 | "character varying": "string", 53 | "longvarchar": "string", 54 | "bpchar": "string", 55 | "name": "string", 56 | 57 | // Binary Types 58 | "binary": "string", 59 | "varbinary": "string", 60 | "tinyblob": "string", 61 | "blob": "string", 62 | "mediumblob": "string", 63 | "longblob": "string", 64 | "bytea": "string", 65 | 66 | // JSON and Document Types 67 | "json": "string", 68 | "jsonb": "string", 69 | "xml": "string", 70 | "hstore": "string", 71 | 72 | // Network Types 73 | "cidr": "string", 74 | "inet": "string", 75 | "macaddr": "string", 76 | "macaddr8": "string", 77 | 78 | // Spatial Types 79 | "geometry": "string", 80 | "point": "string", 81 | "linestring": "string", 82 | "polygon": "string", 83 | "multipoint": "string", 84 | "multilinestring": "string", 85 | "multipolygon": "string", 86 | "geometrycollection": "string", 87 | "circle": "string", 88 | "path": "string", 89 | "box": "string", 90 | "line": "string", 91 | "lseg": "string", 92 | 93 | // Full Text Search Types 94 | "tsvector": "string", 95 | "tsquery": "string", 96 | 97 | // UUID 98 | "uuid": "string", 99 | 100 | // Range Types 101 | "tsrange": "string", 102 | "tstzrange": "string", 103 | "int4range": "string", 104 | "numrange": "string", 105 | "daterange": "string", 106 | 107 | // Array 108 | "array": 
"string", 109 | "ARRAY": "string", 110 | "int2vector": "string", 111 | 112 | // Enum and Set 113 | "enum": "string", 114 | "set": "string", 115 | 116 | // Date/Time 117 | "date": "timestamp", 118 | "timestamp": "timestamp", 119 | "datetime": "timestamp", 120 | "timestamptz": "timestamp", 121 | "timestamp with time zone": "timestamp", 122 | "timestamp without time zone": "timestamp", 123 | 124 | "time": "string", 125 | "timez": "string", 126 | "interval": "string", 127 | 128 | // Misc 129 | "pg_lsn": "string", 130 | "bit varying": "string", 131 | "varbit": "string", 132 | "bit(n)": "string", 133 | "varying(n)": "string", 134 | } 135 | -------------------------------------------------------------------------------- /drivers/mongodb/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/datazip-inc/olake/drivers/mongodb 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.3 6 | 7 | require ( 8 | github.com/datazip-inc/olake v0.0.0-20241104091615-994075730612 9 | github.com/jackc/pgx/v4 v4.18.3 10 | ) 11 | 12 | require ( 13 | github.com/andybalholm/brotli v1.1.1 // indirect 14 | github.com/apache/thrift v0.21.0 // indirect 15 | github.com/aws/aws-sdk-go v1.55.6 // indirect 16 | github.com/felixge/fgprof v0.9.5 // indirect 17 | github.com/fsnotify/fsnotify v1.8.0 // indirect 18 | github.com/go-viper/mapstructure/v2 v2.3.0 // indirect 19 | github.com/golang/snappy v0.0.4 // indirect 20 | github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e // indirect 21 | github.com/google/uuid v1.6.0 // indirect 22 | github.com/gorilla/mux v1.8.1 // indirect 23 | github.com/hashicorp/errwrap v1.1.0 // indirect 24 | github.com/hashicorp/go-multierror v1.1.1 // indirect 25 | github.com/jmespath/go-jmespath v0.4.0 // indirect 26 | github.com/klauspost/compress v1.18.0 // indirect 27 | github.com/mattn/go-colorable v0.1.14 // indirect 28 | github.com/mattn/go-isatty v0.0.20 // indirect 29 | github.com/mattn/go-runewidth v0.0.16 // indirect 30 | github.com/montanaflynn/stats v0.7.1 // indirect 31 | github.com/oklog/ulid v1.3.1 // indirect 32 | github.com/olekukonko/tablewriter v0.0.5 // indirect 33 | github.com/parquet-go/parquet-go v0.25.0 // indirect 34 | github.com/pelletier/go-toml/v2 v2.2.3 // indirect 35 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 36 | github.com/rivo/uniseg v0.4.7 // indirect 37 | github.com/rs/zerolog v1.34.0 // indirect 38 | github.com/sagikazarmark/locafero v0.8.0 // indirect 39 | github.com/sourcegraph/conc v0.3.0 // indirect 40 | github.com/spf13/afero v1.14.0 // indirect 41 | github.com/spf13/cast v1.7.1 // indirect 42 | github.com/spf13/viper v1.20.1 // indirect 43 | github.com/subosito/gotenv v1.6.0 // indirect 44 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect 45 | github.com/xdg-go/scram v1.1.2 // indirect 46 | github.com/xdg-go/stringprep v1.0.4 // indirect 47 | github.com/xitongsys/parquet-go v1.6.2 // indirect 48 | github.com/xitongsys/parquet-go-source v0.0.0-20241021075129-b732d2ac9c9b // indirect 49 | github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect 50 | go.uber.org/multierr v1.11.0 // indirect 51 | golang.org/x/sync v0.13.0 // indirect 52 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect 53 | gopkg.in/yaml.v3 v3.0.1 // indirect 54 | ) 55 | 56 | require ( 57 | github.com/brainicorn/ganno v0.0.0-20220304182003-e638228cd865 // indirect 58 | github.com/brainicorn/goblex v0.0.0-20220304181919-81f017b0ee95 // indirect 59 | github.com/gabriel-vasile/mimetype v1.4.8 // indirect 60 | 
github.com/go-playground/locales v0.14.1 // indirect 61 | github.com/go-playground/universal-translator v0.18.1 // indirect 62 | github.com/go-playground/validator/v10 v10.25.0 // indirect 63 | github.com/goccy/go-json v0.10.5 // indirect 64 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 65 | github.com/jackc/chunkreader/v2 v2.0.1 // indirect 66 | github.com/jackc/pgconn v1.14.3 // indirect 67 | github.com/jackc/pgio v1.0.0 // indirect 68 | github.com/jackc/pgpassfile v1.0.0 // indirect 69 | github.com/jackc/pgproto3/v2 v2.3.3 // indirect 70 | github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect 71 | github.com/jackc/pgtype v1.14.0 // indirect 72 | github.com/leodido/go-urn v1.4.0 // indirect 73 | github.com/mitchellh/hashstructure v1.1.0 // indirect 74 | github.com/rogpeppe/go-internal v1.13.1 // indirect 75 | github.com/spf13/cobra v1.9.1 // indirect 76 | github.com/spf13/pflag v1.0.6 // indirect 77 | go.mongodb.org/mongo-driver v1.17.3 78 | golang.org/x/crypto v0.37.0 // indirect 79 | golang.org/x/net v0.38.0 // indirect 80 | golang.org/x/sys v0.32.0 // indirect 81 | golang.org/x/text v0.24.0 // indirect 82 | golang.org/x/tools v0.30.0 // indirect 83 | sigs.k8s.io/yaml v1.4.0 // indirect 84 | ) -------------------------------------------------------------------------------- /utils/concurrent.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "sync/atomic" 6 | 7 | "golang.org/x/sync/errgroup" 8 | ) 9 | 10 | type CtxFunc = func(ctx context.Context) error 11 | 12 | func Concurrent[T any](ctx context.Context, array []T, concurrency int, execute func(ctx context.Context, one T, executionNumber int) error) error { 13 | executor, ctx := errgroup.WithContext(ctx) 14 | executor.SetLimit(concurrency) 15 | 16 | for idx, one := range array { 17 | // schedule an execution 18 | // Go blocks here until a worker slot is available 19 | executor.Go(func() error { 20 | return execute(ctx, one, idx) 21 | }) 22 | } 23 | 24 | // wait for all scheduled executions to finish 25 | return executor.Wait() 26 | } 27 | 28 | func ConcurrentF(ctx context.Context, functions ...CtxFunc) error { 29 | executor, ctx := errgroup.WithContext(ctx) 30 | 31 | for _, one := range functions { 32 | // schedule an execution 33 | executor.Go(func() error { 34 | return one(ctx) 35 | }) 36 | } 37 | 38 | // wait for all scheduled executions to finish 39 | return executor.Wait() 40 | } 41 | 42 | func ConcurrentC[T any](ctx context.Context, next *Next[T], concurrency int, execute func(ctx context.Context, one T, sequence int64) error) error { 43 | ctx, cancel := context.WithCancel(ctx) 44 | defer cancel() 45 | 46 | executor, ctx := errgroup.WithContext(ctx) 47 | executor.SetLimit(concurrency) 48 | 49 | // Channel to signal that all tasks have been scheduled 50 | done := make(chan struct{}) 51 | 52 | go func() { 53 | defer close(done) 54 | counter := atomic.Int64{} // sequence counter for scheduled executions 55 | for next.Next() { 56 | select { 57 | case <-ctx.Done(): 58 | return 59 | default: 60 | one := next.curr 61 | sequence := counter.Add(1) 62 | // schedule an execution 63 | executor.Go(func() error { 64 | return execute(ctx, one, sequence) 65 | }) 66 | } 67 | } 68 | }() 69 | 70 | // block until scheduling finishes or the context is canceled 71 | select { 72 | case <-done: 73 | if next.err != nil { 74 | return next.err 75 | } 76 | 77 | return executor.Wait() 78 | case <-ctx.Done(): 79 | return executor.Wait() 80 | } 81 | } 82 |
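// Illustrative sketch (not part of the original file): ConcurrentC pairs with Yield,
// defined below, to stream work items without materializing them all up front, e.g.
// paging through page numbers up to a known last page (lastPage and processPage are
// hypothetical):
//
//	pages := Yield(func(prev int) (exit bool, next int, err error) {
//		next = prev + 1
//		return next > lastPage, next, nil
//	})
//	err := ConcurrentC(ctx, pages, 4, func(ctx context.Context, page int, seq int64) error {
//		return processPage(ctx, page)
//	})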
83 | type Next[T any] struct { 84 | closed bool 85 | err error 86 | curr T 87 | next func(curr T) (exit bool, one T, err error) 88 | } 89 | 90 | func (n *Next[T]) Next() bool { 91 | exit, next, err := n.next(n.curr) 92 | if err != nil { 93 | n.err = err 94 | return false 95 | } 96 | if exit { 97 | return false 98 | } 99 | 100 | n.curr = next 101 | return true 102 | } 103 | 104 | func (n *Next[T]) Close() { 105 | n.closed = true 106 | } 107 | 108 | func Yield[T any](next func(prev T) (bool, T, error)) *Next[T] { 109 | return &Next[T]{ 110 | next: next, 111 | } 112 | } 113 | 114 | type CxGroup struct { 115 | ctx context.Context 116 | executor *errgroup.Group 117 | } 118 | 119 | func NewCGroup(ctx context.Context) *CxGroup { 120 | return newCGroup(ctx, 0) 121 | } 122 | 123 | func NewCGroupWithLimit(ctx context.Context, limit int) *CxGroup { 124 | return newCGroup(ctx, limit) 125 | } 126 | 127 | func newCGroup(ctx context.Context, limit int) *CxGroup { 128 | group := &CxGroup{} 129 | group.executor, group.ctx = errgroup.WithContext(ctx) 130 | if limit > 0 { 131 | group.executor.SetLimit(limit) 132 | } 133 | 134 | return group 135 | } 136 | 137 | func (g *CxGroup) Ctx() context.Context { 138 | return g.ctx 139 | } 140 | 141 | func (g *CxGroup) Add(execute func(ctx context.Context) error) { 142 | g.executor.Go(func() error { 143 | return execute(g.ctx) 144 | }) 145 | } 146 | 147 | func (g *CxGroup) Block() error { 148 | return g.executor.Wait() 149 | } 150 | 151 | func ConcurrentInGroup[T any](group *CxGroup, array []T, execute func(ctx context.Context, one T) error) { 152 | for _, one := range array { 153 | select { 154 | case <-group.ctx.Done(): 155 | return // stop scheduling on cancellation; a bare break here would only exit the select 156 | default: 157 | // schedule an execution 158 | group.Add(func(ctx context.Context) error { 159 | return execute(ctx, one) 160 | }) 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /.github/workflows/integration-tests.yml: -------------------------------------------------------------------------------- 1 | name: Integration Tests 2 | on: 3 | push: 4 | branches: 5 | - "master" 6 | paths: 7 | - '**/*.go' 8 | - '**/*.java' 9 | pull_request: 10 | branches: 11 | - "*" 12 | paths: 13 | - '**/*.go' 14 | - '**/*.java' 15 | 16 | env: 17 | MYSQL_ROOT_PASSWORD: root1234 18 | 19 | jobs: 20 | integration-tests: 21 | runs-on: 32gb-runner 22 | timeout-minutes: 30 23 | steps: 24 | - name: Checkout code 25 | uses: actions/checkout@v3 26 | 27 | - name: Set up Go 28 | uses: actions/setup-go@v4 29 | with: 30 | go-version: '1.24.x' 31 | 32 | - name: Set up Java for Maven 33 | uses: actions/setup-java@v3 34 | with: 35 | distribution: 'temurin' 36 | java-version: '17' 37 | 38 | - name: Start Test Infrastructure 39 | run: | 40 | docker compose -f ./drivers/mysql/docker-compose.yml up -d 41 | docker compose -f ./drivers/postgres/docker-compose.yml up -d 42 | docker compose -f ./drivers/mongodb/docker-compose.yml up -d 43 | docker compose -f ./destination/iceberg/local-test/docker-compose.yml up minio mc postgres spark-iceberg -d 44 | 45 | - name: Wait for MySQL 46 | uses: nick-fields/retry@v2 47 | with: 48 | timeout_minutes: 5 49 | max_attempts: 30 50 | retry_wait_seconds: 5 51 | command: | 52 | docker exec olake_mysql-test mysql -h localhost -u root -p${{ env.MYSQL_ROOT_PASSWORD }} -e "SELECT 1" 53 | 54 | - name: Wait for PostgreSQL 55 | uses: nick-fields/retry@v2 56 | with: 57 | timeout_minutes: 5 58 | max_attempts: 30 59 | retry_wait_seconds: 5 60 | command: | 61 | docker exec olake_postgres-test psql -h localhost -U postgres -d postgres -c "SELECT 1" 62 | 63 | - name: Wait for
MongoDB 64 | uses: nick-fields/retry@v2 65 | with: 66 | timeout_minutes: 5 67 | max_attempts: 30 68 | retry_wait_seconds: 5 69 | command: | 70 | docker exec primary_mongo mongosh --host localhost --port 27017 -u mongodb -p secure_password123 --authenticationDatabase admin --eval "db.adminCommand('ping')" 71 | 72 | - name: Set up Data Directories 73 | run: | 74 | sudo mkdir -p /home/runner/work/olake/olake/destination/iceberg/local-test/data/postgres-data 75 | sudo mkdir -p /home/runner/work/olake/olake/destination/iceberg/local-test/data/minio-data 76 | sudo mkdir -p /home/runner/work/olake/olake/destination/iceberg/local-test/data/ivy-cache 77 | sudo chown -R 999:999 /home/runner/work/olake/olake/destination/iceberg/local-test/data 78 | sudo chmod -R 777 /home/runner/work/olake/olake/destination/iceberg/local-test/data 79 | 80 | - name: Install Go Dependencies 81 | run: go mod download 82 | 83 | - name: Build Project 84 | run: go build -v ./... 85 | 86 | - name: Cache Maven dependencies 87 | uses: actions/cache@v4 88 | with: 89 | path: | 90 | ~/.m2/repository 91 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 92 | restore-keys: | 93 | ${{ runner.os }}-maven- 94 | 95 | - name: Build Iceberg Sink 96 | working-directory: ./destination/iceberg/olake-iceberg-java-writer 97 | run: mvn clean package -DskipTests 98 | 99 | - name: Run Integration Tests 100 | run: go test -v -p 3 ./drivers/mysql/internal/... ./drivers/postgres/internal/... ./drivers/mongodb/internal/... -timeout 0 -run 'Integration' 101 | 102 | - name: Cleanup 103 | if: always() 104 | run: | 105 | docker compose -f ./destination/iceberg/local-test/docker-compose.yml down 106 | docker compose -f ./drivers/mysql/docker-compose.yml down 107 | docker compose -f ./drivers/postgres/docker-compose.yml down 108 | -------------------------------------------------------------------------------- /drivers/mongodb/resources/spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "authdb": { 5 | "type": "string", 6 | "title": "Auth DB", 7 | "description": "Authentication database (mostly admin)", 8 | "default": "admin" 9 | }, 10 | "backoff_retry_count": { 11 | "type": "integer", 12 | "title": "Retry Count", 13 | "description": "Number of sync retry attempts using exponential backoff", 14 | "default": 3 15 | }, 16 | "chunking_strategy": { 17 | "type": "string", 18 | "title": "Chunking Strategy", 19 | "description": "Chunking strategy (timestamp, uses Split Vector strategy if the field is left empty)", 20 | "default": "Split Vector", 21 | "enum": [ 22 | "Split Vector", 23 | "Timestamp" 24 | ] 25 | }, 26 | "database": { 27 | "type": "string", 28 | "title": "Database Name", 29 | "description": "Name of the mongodb database selected for replication" 30 | }, 31 | "hosts": { 32 | "type": "array", 33 | "title": "Hosts", 34 | "description": "Specifies the hostnames or IP addresses of MongoDB for connection", 35 | "items": { 36 | "type": "string" 37 | }, 38 | "minItems": 1 39 | }, 40 | "max_threads": { 41 | "type": "integer", 42 | "title": "Max Threads", 43 | "description": "Max parallel threads for chunk snapshotting", 44 | "default": 3 45 | }, 46 | "password": { 47 | "type": "string", 48 | "title": "Password", 49 | "description": "Password with the username for authentication", 50 | "format": "password" 51 | }, 52 | "read_preference": { 53 | "type": "string", 54 | "title": "Read Preference", 55 | "description": "Read preference for MongoDB (e.g., 
secondaryPreferred)" 56 | }, 57 | "replica_set": { 58 | "type": "string", 59 | "title": "Replica Set", 60 | "description": "MongoDB replica set name (if applicable)" 61 | }, 62 | "srv": { 63 | "type": "boolean", 64 | "title": "Use SRV", 65 | "description": "Enable this option if using DNS SRV connection strings. When set to true, the hosts field must contain only one entry - a DNS SRV address (['mongodataset.pigiy.mongodb.net'])", 66 | "default": false 67 | }, 68 | "username": { 69 | "type": "string", 70 | "title": "Username", 71 | "description": "Username for MongoDB authentication" 72 | }, 73 | "use_iam": { 74 | "type": "boolean", 75 | "title": "IAM Authentication", 76 | "description": "Enable this option to use IAM-based authentication instead of username and password", 77 | "default": false 78 | } 79 | }, 80 | "required": [ 81 | "hosts", 82 | "database" 83 | ], 84 | "dependencies": { 85 | "use_iam": { 86 | "oneOf": [ 87 | { 88 | "properties": { 89 | "use_iam": { 90 | "const": false 91 | }, 92 | "password": { "type": "string", "minLength": 1 } 93 | }, 94 | "required": [ 95 | "authdb", 96 | "username", 97 | "password" 98 | ] 99 | }, 100 | { 101 | "properties": { 102 | "use_iam": { 103 | "const": true 104 | } 105 | } 106 | } 107 | ] 108 | } 109 | } 110 | } -------------------------------------------------------------------------------- /drivers/mysql/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/datazip-inc/olake/drivers/mysql 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | github.com/datazip-inc/olake v0.0.0-20250312070222-ed705d25bc0c 7 | github.com/go-sql-driver/mysql v1.8.1 8 | github.com/jackc/pgproto3/v2 v2.3.3 // Added explicit requirement 9 | github.com/jackc/pgx/v4 v4.18.2 // Updated from v4.15.0 10 | ) 11 | 12 | require ( 13 | github.com/mattn/go-isatty v0.0.20 // indirect 14 | github.com/rs/zerolog v1.15.0 // indirect 15 | go.mongodb.org/mongo-driver v1.17.3 // indirect 16 | ) 17 | 18 | require ( 19 | // github.com/BurntSushi/toml v1.3.2 // indirect 20 | github.com/Masterminds/semver v1.5.0 // indirect 21 | github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb // indirect 22 | //github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 // indirect 23 | github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22 // indirect 24 | github.com/pingcap/tidb/pkg/parser v0.0.0-20241118164214-4f047be191be // indirect 25 | github.com/shopspring/decimal v1.2.0 // indirect 26 | github.com/siddontang/go-log v0.0.0-20180807004314-8d05993dda07 // indirect 27 | go.uber.org/atomic v1.11.0 // indirect 28 | go.uber.org/multierr v1.11.0 // indirect 29 | go.uber.org/zap v1.27.0 // indirect 30 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect 31 | ) 32 | 33 | require ( 34 | filippo.io/edwards25519 v1.1.0 // indirect 35 | github.com/andybalholm/brotli v1.1.0 // indirect 36 | github.com/apache/thrift v0.16.0 // indirect 37 | github.com/aws/aws-sdk-go v1.43.31 // indirect 38 | github.com/brainicorn/ganno v0.0.0-20220304182003-e638228cd865 // indirect 39 | github.com/brainicorn/goblex v0.0.0-20210908194630-cfe0cfdf87dd // indirect 40 | github.com/felixge/fgprof v0.9.5 // indirect 41 | github.com/fsnotify/fsnotify v1.5.1 // indirect 42 | github.com/gabriel-vasile/mimetype v1.4.3 // indirect 43 | github.com/go-mysql-org/go-mysql v1.11.0 44 | github.com/go-playground/locales v0.14.1 // indirect 45 | github.com/go-playground/universal-translator v0.18.1 // indirect 46 | github.com/go-playground/validator/v10 v10.22.1 // indirect 47 
| github.com/goccy/go-json v0.10.3 // indirect 48 | github.com/google/uuid v1.6.0 // indirect 49 | github.com/gorilla/mux v1.8.1 // indirect 50 | github.com/hashicorp/errwrap v1.0.0 // indirect 51 | github.com/hashicorp/go-multierror v1.1.1 // indirect 52 | github.com/hashicorp/hcl v1.0.0 // indirect 53 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 54 | github.com/jackc/chunkreader/v2 v2.0.1 // indirect 55 | github.com/jackc/pgconn v1.14.3 // indirect 56 | github.com/jackc/pgio v1.0.0 // indirect 57 | github.com/jackc/pgpassfile v1.0.0 // indirect 58 | github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect 59 | github.com/jackc/pgtype v1.14.0 // indirect 60 | github.com/jmespath/go-jmespath v0.4.0 // indirect 61 | github.com/klauspost/compress v1.17.9 // indirect 62 | github.com/leodido/go-urn v1.4.0 // indirect 63 | github.com/magiconair/properties v1.8.0 // indirect 64 | github.com/mattn/go-runewidth v0.0.15 // indirect 65 | github.com/mitchellh/hashstructure v1.1.0 // indirect 66 | github.com/mitchellh/mapstructure v1.4.3 // indirect 67 | github.com/oklog/ulid v1.3.1 // indirect 68 | github.com/olekukonko/tablewriter v0.0.5 // indirect 69 | github.com/parquet-go/parquet-go v0.24.0 // indirect 70 | github.com/pelletier/go-toml v1.2.0 // indirect 71 | github.com/pierrec/lz4/v4 v4.1.21 // indirect 72 | github.com/rivo/uniseg v0.4.7 // indirect 73 | github.com/spf13/afero v1.2.2 // indirect 74 | github.com/spf13/cast v1.3.0 // indirect 75 | github.com/spf13/cobra v1.8.0 // indirect 76 | github.com/spf13/jwalterweatherman v1.0.0 // indirect 77 | github.com/spf13/pflag v1.0.5 // indirect 78 | github.com/spf13/viper v1.3.2 // indirect 79 | github.com/xitongsys/parquet-go v1.6.2 // indirect 80 | github.com/xitongsys/parquet-go-source v0.0.0-20241021075129-b732d2ac9c9b // indirect 81 | golang.org/x/crypto v0.35.0 // indirect 82 | golang.org/x/net v0.36.0 // indirect 83 | golang.org/x/sync v0.11.0 // indirect 84 | golang.org/x/sys v0.31.0 // indirect 85 | golang.org/x/text v0.22.0 // indirect 86 | gopkg.in/yaml.v2 v2.4.0 // indirect 87 | sigs.k8s.io/yaml v1.3.0 // indirect 88 | ) 89 | -------------------------------------------------------------------------------- /pkg/waljs/waljs.go: -------------------------------------------------------------------------------- 1 | package waljs 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strings" 7 | "time" 8 | 9 | "github.com/datazip-inc/olake/constants" 10 | "github.com/datazip-inc/olake/drivers/abstract" 11 | "github.com/datazip-inc/olake/utils/logger" 12 | "github.com/jackc/pglogrepl" 13 | "github.com/jackc/pgx/v5/pgproto3" 14 | "github.com/jmoiron/sqlx" 15 | ) 16 | 17 | const AdvanceLSNTemplate = "SELECT * FROM pg_replication_slot_advance('%s', '%s')" 18 | 19 | var pluginArguments = []string{ 20 | "\"include-lsn\" 'on'", 21 | "\"pretty-print\" 'off'", 22 | "\"include-timestamp\" 'on'", 23 | } 24 | 25 | // wal2jsonReplicator implements Replicator for wal2json plugin 26 | type wal2jsonReplicator struct { 27 | socket *Socket 28 | } 29 | 30 | func (w *wal2jsonReplicator) Socket() *Socket { 31 | return w.socket 32 | } 33 | 34 | func (w *wal2jsonReplicator) StreamChanges(ctx context.Context, db *sqlx.DB, callback abstract.CDCMsgFn) error { 35 | // update current lsn information 36 | var slot ReplicationSlot 37 | if err := db.GetContext(ctx, &slot, fmt.Sprintf(ReplicationSlotTempl, w.socket.ReplicationSlot)); err != nil { 38 | return fmt.Errorf("failed to get replication slot: %s", err) 39 | } 40 | 41 | // update current wal 
lsn 42 | w.socket.CurrentWalPosition = slot.CurrentLSN 43 | 44 | // Start logical replication with wal2json plugin arguments. 45 | var tables []string 46 | for key := range w.socket.changeFilter.tables { 47 | tables = append(tables, key) 48 | } 49 | args := append(append([]string{}, pluginArguments...), fmt.Sprintf("\"add-tables\" '%s'", strings.Join(tables, ","))) // copy so the package-level slice is not mutated across calls 50 | if err := pglogrepl.StartReplication( 51 | ctx, 52 | w.socket.pgConn, 53 | w.socket.ReplicationSlot, 54 | w.socket.ConfirmedFlushLSN, 55 | pglogrepl.StartReplicationOptions{PluginArgs: args}, 56 | ); err != nil { 57 | return fmt.Errorf("starting replication slot failed: %s", err) 58 | } 59 | logger.Infof("Started logical replication for slot[%s] from lsn[%s] to lsn[%s]", w.socket.ReplicationSlot, w.socket.ConfirmedFlushLSN, w.socket.CurrentWalPosition) 60 | messageReceived := false 61 | cdcStartTime := time.Now() 62 | for { 63 | select { 64 | case <-ctx.Done(): 65 | return nil 66 | default: 67 | if !messageReceived && w.socket.initialWaitTime > 0 && time.Since(cdcStartTime) > w.socket.initialWaitTime { 68 | return fmt.Errorf("%s, try increasing it or run a full load", constants.NoRecordsFoundError) 69 | } 70 | 71 | if w.socket.ClientXLogPos >= w.socket.CurrentWalPosition { 72 | logger.Infof("finishing sync, reached wal position: %s", w.socket.CurrentWalPosition) 73 | return nil 74 | } 75 | 76 | msg, err := w.socket.pgConn.ReceiveMessage(ctx) 77 | if err != nil { 78 | if strings.Contains(err.Error(), "EOF") { 79 | return nil 80 | } 81 | return fmt.Errorf("failed to receive message from wal logs: %s", err) 82 | } 83 | 84 | // Process only CopyData messages. 85 | copyData, ok := msg.(*pgproto3.CopyData) 86 | if !ok { 87 | return fmt.Errorf("unexpected message type: %T", msg) 88 | } 89 | 90 | switch copyData.Data[0] { 91 | case pglogrepl.PrimaryKeepaliveMessageByteID: 92 | // Track server progress from keepalive messages; reply only when the server requests it. 93 | pkm, err := pglogrepl.ParsePrimaryKeepaliveMessage(copyData.Data[1:]) 94 | if err != nil { 95 | return fmt.Errorf("failed to parse primary keepalive message: %s", err) 96 | } 97 | w.socket.ClientXLogPos = pkm.ServerWALEnd 98 | if pkm.ReplyRequested { 99 | logger.Debugf("keep alive message received: %v", pkm) 100 | // send fake acknowledgement 101 | err := AcknowledgeLSN(ctx, db, w.socket, true) 102 | if err != nil { 103 | return fmt.Errorf("failed to ack lsn: %s", err) 104 | } 105 | } 106 | case pglogrepl.XLogDataByteID: 107 | // Decode WAL data and forward the contained changes. 108 | xld, err := pglogrepl.ParseXLogData(copyData.Data[1:]) 109 | if err != nil { 110 | return fmt.Errorf("failed to parse XLogData: %s", err) 111 | } 112 | // Process change with the provided callback.
113 | nextLSN, records, err := w.socket.changeFilter.FilterWalJsChange(ctx, xld.WALData, callback) 114 | if err != nil { 115 | return fmt.Errorf("failed to filter change: %s", err) 116 | } 117 | messageReceived = records > 0 || messageReceived 118 | w.socket.ClientXLogPos = *nextLSN 119 | default: 120 | logger.Warnf("received unhandled message type: %v", copyData.Data[0]) 121 | } 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /types/set.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/goccy/go-json" 8 | 9 | "github.com/mitchellh/hashstructure" 10 | ) 11 | 12 | type Hashable interface { 13 | Hash() string 14 | } 15 | 16 | type Identifier interface { 17 | ID() string 18 | } 19 | 20 | type ( 21 | Set[T comparable] struct { 22 | hash map[string]nothing 23 | storage map[string]T 24 | funcHash func(T) string 25 | } 26 | 27 | nothing struct{} 28 | ) 29 | 30 | // Create a new set 31 | func NewSet[T comparable](initial ...T) *Set[T] { 32 | s := &Set[T]{ 33 | hash: make(map[string]nothing), 34 | storage: make(map[string]T), 35 | } 36 | 37 | for _, v := range initial { 38 | s.Insert(v) 39 | } 40 | 41 | return s 42 | } 43 | 44 | func (st *Set[T]) WithHasher(f func(T) string) *Set[T] { 45 | st.funcHash = f 46 | 47 | return st 48 | } 49 | 50 | func (st *Set[T]) Hash(elem T) string { 51 | hashable, yes := any(elem).(Hashable) 52 | if yes { 53 | return hashable.Hash() 54 | } 55 | 56 | identifiable, yes := any(elem).(Identifier) 57 | if yes { 58 | return identifiable.ID() 59 | } 60 | 61 | if st.funcHash != nil { 62 | return st.funcHash(elem) 63 | } 64 | 65 | uniqueHash, err := hashstructure.Hash(elem, nil) 66 | if err != nil { 67 | // TODO: Handle st 68 | return "false" 69 | } 70 | 71 | return fmt.Sprintf("%d", uniqueHash) 72 | } 73 | 74 | // Find the difference between two sets 75 | func (st *Set[T]) Difference(set *Set[T]) *Set[T] { 76 | difference := NewSet[T]() 77 | 78 | for k := range st.hash { 79 | if _, exists := set.hash[k]; !exists { 80 | difference.Insert(st.storage[k]) 81 | } 82 | } 83 | 84 | return difference 85 | } 86 | 87 | // Call f for each item in the set 88 | func (st *Set[T]) Range(f func(T)) { 89 | for _, value := range st.storage { 90 | f(value) 91 | } 92 | } 93 | 94 | // Test to see whether or not the element is in the set 95 | func (st *Set[T]) Exists(element T) bool { 96 | _, exists := st.hash[st.Hash(element)] 97 | return exists 98 | } 99 | 100 | // Add an element to the set 101 | func (st *Set[T]) Insert(elements ...T) { 102 | for _, elem := range elements { 103 | if st.Exists(elem) { 104 | continue 105 | } 106 | 107 | hash := st.Hash(elem) 108 | 109 | st.hash[hash] = nothing{} 110 | st.storage[hash] = elem 111 | } 112 | } 113 | 114 | // Find the intersection of two sets 115 | func (st *Set[T]) Intersection(set *Set[T]) *Set[T] { 116 | subset := NewSet[T]() 117 | 118 | for k := range st.hash { 119 | if _, exists := set.hash[k]; exists { 120 | subset.Insert(set.storage[k]) 121 | } 122 | } 123 | 124 | return subset 125 | } 126 | 127 | // Return the number of items in the set 128 | func (st *Set[T]) Len() int { 129 | return len(st.hash) 130 | } 131 | 132 | // Test whether or not st set is a proper subset of "set" 133 | func (st *Set[T]) ProperSubsetOf(set *Set[T]) bool { 134 | return st.SubsetOf(set) && st.Len() < set.Len() 135 | } 136 | 137 | // Remove an element from the set 138 | func (st *Set[T]) 
Remove(element T) { 139 | hash := st.Hash(element) 140 | 141 | delete(st.hash, hash) 142 | delete(st.storage, hash) 143 | } 144 | 145 | // Test whether or not st set is a subset of "set" 146 | func (st *Set[T]) SubsetOf(set *Set[T]) bool { 147 | if st.Len() > set.Len() { 148 | return false 149 | } 150 | for k := range st.hash { 151 | if _, exists := set.hash[k]; !exists { 152 | return false 153 | } 154 | } 155 | return true 156 | } 157 | 158 | // Find the union of two sets 159 | func (st *Set[T]) Union(set *Set[T]) *Set[T] { 160 | union := NewSet[T]() 161 | 162 | for k := range st.hash { 163 | union.Insert(st.storage[k]) 164 | } 165 | for k := range set.hash { 166 | union.Insert(set.storage[k]) 167 | } 168 | 169 | return union 170 | } 171 | 172 | func (st *Set[T]) String() string { 173 | values := []string{} 174 | 175 | for _, value := range st.storage { 176 | values = append(values, fmt.Sprint(value)) 177 | } 178 | 179 | return fmt.Sprintf("[%s]", strings.Join(values, ", ")) 180 | } 181 | 182 | func (st *Set[T]) Array() []T { 183 | arr := []T{} 184 | 185 | for _, value := range st.storage { 186 | arr = append(arr, value) 187 | } 188 | 189 | return arr 190 | } 191 | 192 | func (st *Set[T]) UnmarshalJSON(data []byte) error { 193 | // to init underlying field during unmarshalling 194 | *st = *NewSet[T]() 195 | arr := []T{} 196 | err := json.Unmarshal(data, &arr) 197 | if err != nil { 198 | return err 199 | } 200 | 201 | for _, item := range arr { 202 | st.Insert(item) 203 | } 204 | 205 | return nil 206 | } 207 | 208 | func (st *Set[T]) MarshalJSON() ([]byte, error) { 209 | return json.Marshal(st.Array()) 210 | } 211 | -------------------------------------------------------------------------------- /types/stream.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | import ( 4 | "github.com/goccy/go-json" 5 | "github.com/spf13/viper" 6 | 7 | "github.com/datazip-inc/olake/constants" 8 | "github.com/datazip-inc/olake/utils" 9 | "github.com/datazip-inc/olake/utils/logger" 10 | ) 11 | 12 | // Output Stream Object for dsynk 13 | type Stream struct { 14 | // Name of the Stream 15 | Name string `json:"name,omitempty"` 16 | // Namespace of the Stream, or Database it belongs to 17 | // helps in identifying collections with same name in different database 18 | Namespace string `json:"namespace,omitempty"` 19 | // Possible Schema of the Stream 20 | Schema *TypeSchema `json:"type_schema,omitempty"` 21 | // Supported sync modes from driver for the respective Stream 22 | SupportedSyncModes *Set[SyncMode] `json:"supported_sync_modes,omitempty"` 23 | // Primary key if available 24 | SourceDefinedPrimaryKey *Set[string] `json:"source_defined_primary_key,omitempty"` 25 | // Available cursor fields supported by driver 26 | AvailableCursorFields *Set[string] `json:"available_cursor_fields,omitempty"` 27 | // Input of JSON Schema from Client to be parsed by driver 28 | AdditionalProperties string `json:"additional_properties,omitempty"` 29 | // Cursor field to be used for incremental sync 30 | CursorField string `json:"cursor_field,omitempty"` 31 | // Mode being used for syncing data 32 | SyncMode SyncMode `json:"sync_mode,omitempty"` 33 | // Normalized Destination Database and Table used as default values for destination database and table 34 | DestinationDatabase string `json:"destination_database,omitempty"` 35 | DestinationTable string `json:"destination_table,omitempty"` 36 | } 37 | 38 | func NewStream(name, namespace string, sourceDatabase *string) 
*Stream { 39 | DestDatabase, DestTable := utils.GenerateDestinationDetails(namespace, name, sourceDatabase) 40 | return &Stream{ 41 | Name: name, 42 | Namespace: namespace, 43 | SupportedSyncModes: NewSet[SyncMode](), 44 | SourceDefinedPrimaryKey: NewSet[string](), 45 | AvailableCursorFields: NewSet[string](), 46 | Schema: NewTypeSchema(), 47 | DestinationDatabase: DestDatabase, 48 | DestinationTable: DestTable, 49 | } 50 | } 51 | 52 | func (s *Stream) ID() string { 53 | return utils.StreamIdentifier(s.Name, s.Namespace) 54 | } 55 | 56 | func (s *Stream) WithSyncMode(modes ...SyncMode) *Stream { 57 | for _, mode := range modes { 58 | s.SupportedSyncModes.Insert(mode) 59 | } 60 | 61 | return s 62 | } 63 | 64 | func (s *Stream) WithPrimaryKey(keys ...string) *Stream { 65 | for _, key := range keys { 66 | s.SourceDefinedPrimaryKey.Insert(key) 67 | } 68 | 69 | return s 70 | } 71 | 72 | func (s *Stream) WithCursorField(columns ...string) *Stream { 73 | for _, column := range columns { 74 | s.AvailableCursorFields.Insert(column) 75 | } 76 | 77 | return s 78 | } 79 | 80 | func (s *Stream) WithSchema(schema *TypeSchema) *Stream { 81 | s.Schema = schema 82 | return s 83 | } 84 | 85 | // Add or Update Column in Stream Type Schema 86 | func (s *Stream) UpsertField(column string, typ DataType, nullable bool) { 87 | types := []DataType{typ} 88 | if nullable { 89 | types = append(types, Null) 90 | } 91 | 92 | s.Schema.AddTypes(column, types...) 93 | } 94 | 95 | func (s *Stream) Wrap(_ int) *ConfiguredStream { 96 | return &ConfiguredStream{ 97 | Stream: s, 98 | } 99 | } 100 | 101 | func (s *Stream) UnmarshalJSON(data []byte) error { 102 | // Define a type alias to avoid recursion 103 | type Alias Stream 104 | 105 | // Create a temporary alias value to unmarshal into 106 | var temp Alias 107 | 108 | temp.AvailableCursorFields = NewSet[string]() 109 | temp.SourceDefinedPrimaryKey = NewSet[string]() 110 | temp.SupportedSyncModes = NewSet[SyncMode]() 111 | 112 | err := json.Unmarshal(data, &temp) 113 | if err != nil { 114 | return err 115 | } 116 | 117 | *s = Stream(temp) 118 | return nil 119 | } 120 | 121 | func StreamsToMap(streams ...*Stream) map[string]*Stream { 122 | output := make(map[string]*Stream) 123 | for _, stream := range streams { 124 | output[stream.ID()] = stream 125 | } 126 | 127 | return output 128 | } 129 | 130 | func LogCatalog(streams []*Stream, oldCatalog *Catalog, driver string) { 131 | message := Message{ 132 | Type: CatalogMessage, 133 | Catalog: GetWrappedCatalog(streams, driver), 134 | } 135 | logger.Info(message) 136 | // write catalog to the specified file 137 | message.Catalog = mergeCatalogs(oldCatalog, message.Catalog) 138 | 139 | err := logger.FileLoggerWithPath(message.Catalog, viper.GetString(constants.StreamsPath)) 140 | if err != nil { 141 | logger.Fatalf("failed to create streams file: %s", err) 142 | } 143 | } 144 | --------------------------------------------------------------------------------
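A closing sketch (illustrative, not code from the repository) of how a driver typically assembles a Stream during discovery using the builder helpers above; the table and column names are hypothetical:

    s := types.NewStream("orders", "public", nil).
        WithSyncMode(types.FULLREFRESH, types.CDC).
        WithPrimaryKey("id").
        WithCursorField("updated_at")
    s.UpsertField("id", types.Int64, false)            // non-nullable column
    s.UpsertField("updated_at", types.Timestamp, true) // nullable adds types.Null to the type set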