├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ ├── integration.yml │ └── release-and-docker.yml ├── .gitignore ├── .golangci.yml ├── .goreleaser.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── cmd └── root.go ├── go.mod ├── go.sum ├── internal ├── docker-compose.yml ├── examples │ └── README.md ├── how-it-works.md ├── nats-server.conf ├── pg-flo.yaml ├── pg_flo_logo.png └── scripts │ ├── e2e_common.sh │ ├── e2e_copy_and_stream.sh │ ├── e2e_copy_only.sh │ ├── e2e_ddl.sh │ ├── e2e_multi_tenant.sh │ ├── e2e_order_test.rb │ ├── e2e_postgres.sh │ ├── e2e_postgres_data_type.sh │ ├── e2e_postgres_uniqueness_test.rb │ ├── e2e_resume_test.rb │ ├── e2e_routing.sh │ ├── e2e_stream_only.sh │ ├── e2e_test_local.sh │ ├── e2e_transform_filter.sh │ ├── multi_tenant_rules.yml │ ├── rules.yml │ └── webhook_test.sh ├── main.go └── pkg ├── pgflonats └── pgflonats.go ├── replicator ├── base_replicator.go ├── buffer.go ├── config.go ├── copy_and_stream_replicator.go ├── ddl_replicator.go ├── errors.go ├── factory.go ├── interfaces.go ├── json_encoder.go ├── replication_connection.go ├── standard_connection.go ├── stream_replicator.go ├── table_handling.go └── tests │ ├── base_replicator_test.go │ ├── buffer_test.go │ ├── copy_and_stream_replicator_test.go │ ├── ddl_replicator_test.go │ ├── json_encoder_test.go │ └── mocks_test.go ├── routing ├── README.md ├── router.go └── tests │ └── routing_test.go ├── rules ├── README.md ├── engine.go ├── rules.go ├── tests │ ├── engine_test.go │ ├── mocks_test.go │ └── rules_test.go └── types.go ├── sinks ├── README.md ├── file.go ├── postgres.go ├── shared.go ├── sink.go ├── stdout.go ├── types.go └── webhooks.go ├── utils ├── cdc_encoding.go ├── cdc_message.go ├── retry.go ├── shared.go ├── shared_types.go └── zerolog_logger.go └── worker └── worker.go /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "gomod" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Go 15 | uses: actions/setup-go@v4 16 | with: 17 | go-version: "1.21" 18 | 19 | - name: Install golangci-lint 20 | run: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.60.1 21 | 22 | - name: Lint 23 | run: make lint 24 | 25 | test: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | - name: Set up Go 31 | uses: actions/setup-go@v4 32 | with: 33 | go-version: "1.21" 34 | 35 | - name: Test 36 | run: make test 37 | build: 38 | needs: [lint, test] 39 | runs-on: ubuntu-latest 40 | steps: 41 | - uses: actions/checkout@v4 42 | 43 | - name: Set up Go 44 | uses: actions/setup-go@v4 45 | with: 46 | go-version: "1.21" 47 | 48 | - name: Build 49 | run: make build 50 | 51 | - name: Set up QEMU 52 | uses: docker/setup-qemu-action@v3 53 | 54 | - name: Set up Docker Buildx 55 | uses: docker/setup-buildx-action@v3 56 | with: 57 | buildkitd-flags: --debug 58 | 59 | - name: Set build timestamp 60 | id: timestamp 61 | run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT 62 | 63 | - name: Build Docker image 64 | uses: docker/build-push-action@v5 65 | with: 66 | context: . 
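          # Single-arch (amd64) image built only for verification: it is loaded into
          # the local Docker daemon and never pushed to a registry.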
67 | platforms: linux/amd64 68 | push: false 69 | load: true 70 | tags: pg_flo:test 71 | build-args: | 72 | VERSION=${{ github.sha }} 73 | COMMIT=${{ github.sha }} 74 | DATE=${{ steps.timestamp.outputs.timestamp }} 75 | 76 | - name: Verify Docker image version 77 | run: | 78 | docker run --rm pg_flo:test version | grep ${{ github.sha }} 79 | -------------------------------------------------------------------------------- /.github/workflows/integration.yml: -------------------------------------------------------------------------------- 1 | name: Integration Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | 8 | permissions: 9 | contents: read 10 | actions: write 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Go 19 | uses: actions/setup-go@v4 20 | with: 21 | go-version: "1.21" 22 | 23 | - name: Build 24 | run: make build 25 | 26 | - name: Upload binary 27 | uses: actions/upload-artifact@v4 28 | with: 29 | name: pg_flo-binary 30 | path: bin/pg_flo 31 | 32 | tests: 33 | needs: build 34 | runs-on: ubuntu-latest 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | test: 39 | [ 40 | stream_only, 41 | copy_only, 42 | transform_filter, 43 | ddl, 44 | postgres, 45 | postgres_data_type, 46 | multi_tenant, 47 | routing, 48 | copy_and_stream, 49 | order, 50 | resume, 51 | postgres_uniqueness, 52 | ] 53 | steps: 54 | - uses: actions/checkout@v4 55 | - name: Download binary 56 | uses: actions/download-artifact@v4 57 | with: 58 | name: pg_flo-binary 59 | path: bin 60 | - name: Make binary executable 61 | run: chmod +x bin/pg_flo 62 | - name: Install dependencies 63 | run: | 64 | sudo apt-get update 65 | sudo apt-get install -y postgresql-client jq ruby ruby-dev libpq-dev build-essential 66 | sudo gem install pg 67 | - name: Set up Docker Compose 68 | run: | 69 | sudo curl -L "https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose 70 | sudo chmod +x /usr/local/bin/docker-compose 71 | - name: Run test 72 | env: 73 | PG_HOST: localhost 74 | PG_PORT: 5433 75 | PG_USER: myuser 76 | PG_PASSWORD: mypassword!@#%1234 77 | PG_DB: mydb 78 | TARGET_PG_HOST: localhost 79 | TARGET_PG_PORT: 5434 80 | TARGET_PG_USER: targetuser 81 | TARGET_PG_PASSWORD: targetpassword!@#1234 82 | TARGET_PG_DB: targetdb 83 | run: | 84 | docker-compose -f internal/docker-compose.yml up -d 85 | sleep 10 86 | if [[ "${{ matrix.test }}" == "order" || "${{ matrix.test }}" == "resume" || "${{ matrix.test }}" == "postgres_uniqueness" ]]; then 87 | ruby ./internal/scripts/e2e_${{ matrix.test }}_test.rb 88 | else 89 | ./internal/scripts/e2e_${{ matrix.test }}.sh 90 | fi 91 | docker-compose -f internal/docker-compose.yml down -v 92 | -------------------------------------------------------------------------------- /.github/workflows/release-and-docker.yml: -------------------------------------------------------------------------------- 1 | name: Release and Docker 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | permissions: 9 | contents: write 10 | packages: write 11 | 12 | jobs: 13 | goreleaser: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Set up Go 22 | uses: actions/setup-go@v4 23 | with: 24 | go-version: "1.21" 25 | 26 | - name: Run GoReleaser 27 | uses: goreleaser/goreleaser-action@v5 28 | with: 29 | distribution: goreleaser 30 | version: latest 31 | args: release 
--clean 32 | env: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | 35 | docker: 36 | needs: goreleaser 37 | runs-on: ubuntu-latest 38 | steps: 39 | - name: Checkout 40 | uses: actions/checkout@v4 41 | 42 | - name: Set up QEMU 43 | uses: docker/setup-qemu-action@v3 44 | 45 | - name: Set up Docker Buildx 46 | uses: docker/setup-buildx-action@v3 47 | with: 48 | buildkitd-flags: --debug 49 | 50 | - name: Extract metadata (shayonj) 51 | id: meta_shayonj 52 | uses: docker/metadata-action@v5 53 | with: 54 | images: docker.io/shayonj/pg_flo 55 | tags: | 56 | type=semver,pattern={{version}} 57 | type=semver,pattern={{major}}.{{minor}} 58 | type=semver,pattern={{major}} 59 | 60 | - name: Login to DockerHub (shayonj) 61 | uses: docker/login-action@v3 62 | with: 63 | username: ${{ secrets.DOCKERHUB_USERNAME }} 64 | password: ${{ secrets.DOCKERHUB_TOKEN }} 65 | 66 | - name: Set build timestamp 67 | id: timestamp 68 | run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT 69 | 70 | - name: Build and push (shayonj) 71 | uses: docker/build-push-action@v5 72 | with: 73 | context: . 74 | platforms: linux/amd64,linux/arm64 75 | push: true 76 | tags: ${{ steps.meta_shayonj.outputs.tags }} 77 | labels: ${{ steps.meta_shayonj.outputs.labels }} 78 | build-args: | 79 | VERSION=${{ github.ref_name }} 80 | COMMIT=${{ github.sha }} 81 | DATE=${{ steps.timestamp.outputs.timestamp }} 82 | 83 | - name: Extract metadata (pgflo) 84 | id: meta_pgflo 85 | uses: docker/metadata-action@v5 86 | with: 87 | images: docker.io/pgflo/pg_flo 88 | tags: | 89 | type=semver,pattern={{version}} 90 | type=semver,pattern={{major}}.{{minor}} 91 | type=semver,pattern={{major}} 92 | 93 | - name: Login to DockerHub (pgflo) 94 | uses: docker/login-action@v3 95 | with: 96 | username: ${{ secrets.PG_FLO_DOCKER_HUB_USERNAME }} 97 | password: ${{ secrets.PG_FLO_DOCKER_HUB_TOKEN }} 98 | 99 | - name: Build and push (pgflo) 100 | uses: docker/build-push-action@v5 101 | with: 102 | context: . 
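          # Multi-arch (amd64/arm64) release image pushed to docker.io/pgflo/pg_flo
          # using the semver tags produced by the metadata step above.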
103 | platforms: linux/amd64,linux/arm64 104 | push: true 105 | tags: ${{ steps.meta_pgflo.outputs.tags }} 106 | labels: ${{ steps.meta_pgflo.outputs.labels }} 107 | build-args: | 108 | VERSION=${{ github.ref_name }} 109 | COMMIT=${{ github.sha }} 110 | DATE=${{ steps.timestamp.outputs.timestamp }} 111 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | # Go workspace file 18 | go.work 19 | 20 | # IDE-specific files 21 | .idea/ 22 | .vscode/ 23 | 24 | # OS-specific files 25 | .DS_Store 26 | Thumbs.db 27 | 28 | # Log files 29 | *.log 30 | 31 | # Binary output directory 32 | /bin/ 33 | 34 | # Environment variables file 35 | .env 36 | 37 | pg_flo 38 | 39 | bin/ 40 | coverage.txt 41 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | linters: 2 | enable: 3 | - gofmt 4 | - goimports 5 | - govet 6 | - errcheck 7 | - staticcheck 8 | - ineffassign 9 | - unconvert 10 | - misspell 11 | - gosec 12 | - revive 13 | 14 | linters-settings: 15 | govet: 16 | # Check-shadowing option removed 17 | revive: 18 | min-confidence: 0.8 19 | gocyclo: 20 | min-complexity: 15 21 | maligned: 22 | suggest-new: true 23 | dupl: 24 | threshold: 100 25 | goconst: 26 | min-len: 2 27 | min-occurrences: 2 28 | 29 | issues: 30 | exclude-rules: 31 | - path: _test\.go 32 | linters: 33 | - gocyclo 34 | - errcheck 35 | - dupl 36 | - gosec 37 | exclude-dirs: 38 | - vendor/ 39 | exclude-files: 40 | - ".*_test.go" 41 | 42 | output: 43 | formats: colored-line-number 44 | print-issued-lines: true 45 | print-linter-name: true 46 | -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | before: 2 | hooks: 3 | - go mod tidy 4 | 5 | builds: 6 | - main: . 7 | env: 8 | - CGO_ENABLED=0 9 | goos: 10 | - linux 11 | - darwin 12 | goarch: 13 | - amd64 14 | - arm64 15 | ldflags: 16 | - -s -w 17 | - -X github.com/pgflo/pg_flo/cmd.version={{.Version}} 18 | - -X github.com/pgflo/pg_flo/cmd.commit={{.Commit}} 19 | - -X github.com/pgflo/pg_flo/cmd.date={{.Date}} 20 | binary: pg_flo 21 | 22 | archives: 23 | - format: tar.gz 24 | name_template: >- 25 | {{ .ProjectName }}_ 26 | {{- title .Os }}_ 27 | {{- if eq .Arch "amd64" }}x86_64 28 | {{- else }}{{ .Arch }}{{ end }} 29 | format_overrides: 30 | - goos: windows 31 | format: zip 32 | 33 | changelog: 34 | sort: asc 35 | filters: 36 | exclude: 37 | - "^docs:" 38 | - "^test:" 39 | - "^ci:" 40 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.21-alpine AS builder 2 | RUN apk update && apk upgrade --no-cache 3 | WORKDIR /app 4 | COPY . . 
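# Build-time metadata (set by CI) that is embedded into the binary via -ldflags below.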
5 | ARG VERSION=dev 6 | ARG COMMIT=none 7 | ARG DATE=unknown 8 | RUN CGO_ENABLED=0 GOOS=linux go build -v \ 9 | -ldflags "-s -w \ 10 | -X 'github.com/pgflo/pg_flo/cmd.version=${VERSION}' \ 11 | -X 'github.com/pgflo/pg_flo/cmd.commit=${COMMIT}' \ 12 | -X 'github.com/pgflo/pg_flo/cmd.date=${DATE}'" \ 13 | -o pg_flo . 14 | 15 | FROM alpine:latest 16 | RUN apk update && apk upgrade --no-cache && \ 17 | apk add --no-cache postgresql15-client 18 | COPY --from=builder /app/pg_flo /usr/local/bin/ 19 | ENTRYPOINT ["pg_flo"] 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test lint build clean 2 | 3 | # Define the default goal 4 | .DEFAULT_GOAL := build 5 | 6 | # Build the application 7 | build: 8 | go build -o bin/pg_flo 9 | 10 | # Run tests with race detection and coverage 11 | test: 12 | go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... 13 | 14 | # Run linter 15 | lint: 16 | golangci-lint run --timeout=5m 17 | 18 | # Clean build artifacts 19 | clean: 20 | rm -rf bin/ coverage.txt 21 | 22 | # Run all checks (lint and test) 23 | check: lint test 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg_flo logo pg_flo 2 | 3 | [![CI](https://github.com/pgflo/pg_flo/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/pgflo/pg_flo/actions/workflows/ci.yml) 4 | [![Integration](https://github.com/pgflo/pg_flo/actions/workflows/integration.yml/badge.svg?branch=main)](https://github.com/pgflo/pg_flo/actions/workflows/integration.yml) 5 | [![Release](https://img.shields.io/github/v/release/pgflo/pg_flo?style=flat&color=#959DA5&sort=semver)](https://github.com/pgflo/pg_flo/releases/latest) 6 | [![Docker Image](https://img.shields.io/docker/v/pgflo/pg_flo?style=flat&label=docker&color=#959DA5&label=docker&sort=semver)](https://hub.docker.com/r/pgflo/pg_flo/tags) 7 | 8 | > The easiest way to move and transform data between PostgreSQL databases using Logical Replication. 9 | 10 | ℹ️ `pg_flo` is in active development. The design and architecture is continuously improving. PRs/Issues are very much welcome 🙏 11 | 12 | ## Key Features 13 | 14 | - **Real-time Data Streaming** - Capture inserts, updates, deletes, and DDL changes in near real-time 15 | - **Fast Initial Loads** - Parallel copy of existing data with automatic follow-up continuous replication 16 | - **Powerful Transformations** - Filter and transform data on-the-fly ([see rules](pkg/rules/README.md)) 17 | - **Flexible Routing** - Route to different tables and remap columns ([see routing](pkg/routing/README.md)) 18 | - **Production Ready** - Supports resumable streaming, DDL tracking, and more 19 | 20 | ## Common Use Cases 21 | 22 | - Real-time data replication between PostgreSQL databases 23 | - ETL pipelines with data transformation 24 | - Data re-routing, masking and filtering 25 | - Database migration with zero downtime 26 | - Event streaming from PostgreSQL 27 | 28 | [View detailed examples →](internal/examples/README.md) 29 | 30 | ## Quick Start 31 | 32 | ### Prerequisites 33 | 34 | - Docker 35 | - PostgreSQL database with `wal_level=logical` 36 | 37 | ### 1. Install 38 | 39 | ```shell 40 | docker pull pgflo/pg_flo:latest 41 | ``` 42 | 43 | ### 2. 
Configure 44 | 45 | Choose one: 46 | 47 | - Environment variables 48 | - YAML configuration file ([example](internal/pg-flo.yaml)) 49 | - CLI flags 50 | 51 | ### 3. Run 52 | 53 | ```shell 54 | # Start NATS server 55 | docker run -d --name pg_flo_nats \ 56 | --network host \ 57 | -v /path/to/nats-server.conf:/etc/nats/nats-server.conf \ 58 | nats:latest \ 59 | -c /etc/nats/nats-server.conf 60 | 61 | # Start replicator (using config file) 62 | docker run -d --name pg_flo_replicator \ 63 | --network host \ 64 | -v /path/to/config.yaml:/etc/pg_flo/config.yaml \ 65 | pgflo/pg_flo:latest \ 66 | replicator --config /etc/pg_flo/config.yaml 67 | 68 | # Start worker 69 | docker run -d --name pg_flo_worker \ 70 | --network host \ 71 | -v /path/to/config.yaml:/etc/pg_flo/config.yaml \ 72 | pgflo/pg_flo:latest \ 73 | worker postgres --config /etc/pg_flo/config.yaml 74 | ``` 75 | 76 | #### Example Configuration (config.yaml) 77 | 78 | ```yaml 79 | # Replicator settings 80 | host: "localhost" 81 | port: 5432 82 | dbname: "myapp" 83 | user: "replicator" 84 | password: "secret" 85 | group: "users" 86 | tables: 87 | - "users" 88 | 89 | # Worker settings (postgres sink) 90 | target-host: "dest-db" 91 | target-dbname: "myapp" 92 | target-user: "writer" 93 | target-password: "secret" 94 | 95 | # Common settings 96 | nats-url: "nats://localhost:4222" 97 | ``` 98 | 99 | [View full configuration options →](internal/pg-flo.yaml) 100 | 101 | ## Core Concepts 102 | 103 | ### Architecture 104 | 105 | pg_flo uses two main components: 106 | 107 | - **Replicator**: Captures PostgreSQL changes via logical replication 108 | - **Worker**: Processes and routes changes through NATS 109 | 110 | [Learn how it works →](internal/how-it-works.md) 111 | 112 | ### Groups 113 | 114 | Groups are used to: 115 | 116 | - Identify replication processes 117 | - Isolate replication slots and publications 118 | - Run multiple instances on same database 119 | - Maintain state for resumability 120 | - Enable parallel processing 121 | 122 | ```shell 123 | # Example: Separate groups for different tables 124 | pg_flo replicator --group users_orders --tables users,orders 125 | 126 | pg_flo replicator --group products --tables products 127 | ``` 128 | 129 | ### Streaming Modes 130 | 131 | 1. **Stream Only** (default) 132 | - Real-time streaming of changes 133 | 134 | ```shell 135 | pg_flo replicator --stream 136 | ``` 137 | 138 | 2. **Copy Only** 139 | - One-time parallel copy of existing data 140 | 141 | ```shell 142 | pg_flo replicator --copy --max-copy-workers-per-table 4 143 | ``` 144 | 145 | 3. 
**Copy and Stream** 146 | - Initial parallel copy followed by continuous streaming 147 | 148 | ```shell 149 | pg_flo replicator --copy-and-stream --max-copy-workers-per-table 4 150 | ``` 151 | 152 | ### Destinations 153 | 154 | - **stdout**: Console output 155 | - **file**: File writing 156 | - **postgres**: Database replication 157 | - **webhook**: HTTP endpoints 158 | 159 | [View destination details →](pkg/sinks/README.md) 160 | 161 | ## Advanced Features 162 | 163 | ### Message Routing 164 | 165 | Routing configuration is defined in a separate YAML file: 166 | 167 | ```yaml 168 | # routing.yaml 169 | users: 170 | source_table: users 171 | destination_table: customers 172 | column_mappings: 173 | - source: id 174 | destination: customer_id 175 | ``` 176 | 177 | ```shell 178 | # Apply routing configuration 179 | pg_flo worker postgres --routing-config /path/to/routing.yaml 180 | ``` 181 | 182 | [Learn about routing →](pkg/routing/README.md) 183 | 184 | ### Transformation Rules 185 | 186 | Rules are defined in a separate YAML file: 187 | 188 | ```yaml 189 | # rules.yaml 190 | users: 191 | - type: exclude_columns 192 | columns: [password, ssn] 193 | - type: mask_columns 194 | columns: [email] 195 | ``` 196 | 197 | ```shell 198 | # Apply transformation rules 199 | pg_flo worker file --rules-config /path/to/rules.yaml 200 | ``` 201 | 202 | [View transformation options →](pkg/rules/README.md) 203 | 204 | ### Combined Example 205 | 206 | ```shell 207 | pg_flo worker postgres --config /etc/pg_flo/config.yaml --routing-config routing.yaml --rules-config rules.yaml 208 | ``` 209 | 210 | ## Scaling Guide 211 | 212 | Best practices: 213 | 214 | - Run one worker per group 215 | - Use groups to replicate different tables independently 216 | - Scale horizontally using multiple groups 217 | 218 | Example scaling setup: 219 | 220 | ```shell 221 | # Group: sales 222 | pg_flo replicator --group sales --tables sales 223 | pg_flo worker postgres --group sales 224 | 225 | # Group: inventory 226 | pg_flo replicator --group inventory --tables inventory 227 | pg_flo worker postgres --group inventory 228 | ``` 229 | 230 | ## Limits and Considerations 231 | 232 | - NATS message size: 8MB (configurable) 233 | - One worker per group recommended 234 | - PostgreSQL logical replication prerequisites required 235 | - Tables must have one of the following for replication: 236 | - Primary key 237 | - Unique constraint with `NOT NULL` columns 238 | - `REPLICA IDENTITY FULL` set 239 | 240 | Example table configurations: 241 | 242 | ```sql 243 | -- Using primary key (recommended) 244 | CREATE TABLE users ( 245 | id SERIAL PRIMARY KEY, 246 | email TEXT, 247 | name TEXT 248 | ); 249 | 250 | -- Using unique constraint 251 | CREATE TABLE orders ( 252 | order_id TEXT NOT NULL, 253 | customer_id TEXT NOT NULL, 254 | data JSONB, 255 | CONSTRAINT orders_unique UNIQUE (order_id, customer_id) 256 | ); 257 | ALTER TABLE orders REPLICA IDENTITY USING INDEX orders_unique; 258 | 259 | -- Using all columns (higher overhead in terms of performance) 260 | CREATE TABLE audit_logs ( 261 | id SERIAL, 262 | action TEXT, 263 | data JSONB 264 | ); 265 | ALTER TABLE audit_logs REPLICA IDENTITY FULL; 266 | ``` 267 | 268 | ## Development 269 | 270 | ```shell 271 | make build 272 | make test 273 | make lint 274 | 275 | # E2E tests 276 | ./internal/scripts/e2e_local.sh 277 | ``` 278 | 279 | ## Contributing 280 | 281 | Contributions welcome! Please open an issue or submit a pull request. 282 | 283 | ## License 284 | 285 | Apache License 2.0. 
[View license →](LICENSE) 286 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/pgflo/pg_flo 2 | 3 | go 1.21.5 4 | 5 | require ( 6 | github.com/goccy/go-json v0.10.5 7 | github.com/jackc/pglogrepl v0.0.0-20240307033717-828fbfe908e9 8 | github.com/jackc/pgtype v1.14.4 9 | github.com/jackc/pgx/v5 v5.7.2 10 | github.com/nats-io/nats.go v1.38.0 11 | github.com/rs/zerolog v1.33.0 12 | github.com/shopspring/decimal v1.4.0 13 | github.com/spf13/cobra v1.9.1 14 | github.com/spf13/pflag v1.0.6 15 | github.com/spf13/viper v1.19.0 16 | github.com/stretchr/testify v1.10.0 17 | gopkg.in/yaml.v2 v2.4.0 18 | ) 19 | 20 | require ( 21 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 22 | github.com/fsnotify/fsnotify v1.7.0 // indirect 23 | github.com/hashicorp/hcl v1.0.0 // indirect 24 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 25 | github.com/jackc/pgio v1.0.0 // indirect 26 | github.com/jackc/pgpassfile v1.0.0 // indirect 27 | github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect 28 | github.com/jackc/puddle/v2 v2.2.2 // indirect 29 | github.com/klauspost/compress v1.17.9 // indirect 30 | github.com/magiconair/properties v1.8.7 // indirect 31 | github.com/mattn/go-colorable v0.1.13 // indirect 32 | github.com/mattn/go-isatty v0.0.20 // indirect 33 | github.com/mitchellh/mapstructure v1.5.0 // indirect 34 | github.com/nats-io/nkeys v0.4.9 // indirect 35 | github.com/nats-io/nuid v1.0.1 // indirect 36 | github.com/pelletier/go-toml/v2 v2.2.2 // indirect 37 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 38 | github.com/sagikazarmark/locafero v0.6.0 // indirect 39 | github.com/sagikazarmark/slog-shim v0.1.0 // indirect 40 | github.com/sourcegraph/conc v0.3.0 // indirect 41 | github.com/spf13/afero v1.11.0 // indirect 42 | github.com/spf13/cast v1.7.0 // indirect 43 | github.com/stretchr/objx v0.5.2 // indirect 44 | github.com/subosito/gotenv v1.6.0 // indirect 45 | go.uber.org/multierr v1.11.0 // indirect 46 | golang.org/x/crypto v0.31.0 // indirect 47 | golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa // indirect 48 | golang.org/x/sync v0.10.0 // indirect 49 | golang.org/x/sys v0.28.0 // indirect 50 | golang.org/x/text v0.21.0 // indirect 51 | gopkg.in/ini.v1 v1.67.0 // indirect 52 | gopkg.in/yaml.v3 v3.0.1 // indirect 53 | ) 54 | -------------------------------------------------------------------------------- /internal/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | postgres: 5 | image: postgres:14 6 | container_name: pg_logical_replication 7 | environment: 8 | POSTGRES_USER: myuser 9 | POSTGRES_PASSWORD: mypassword!@#%1234 10 | POSTGRES_DB: mydb 11 | volumes: 12 | - postgres_data:/var/lib/postgresql/data 13 | ports: 14 | - "5433:5432" 15 | command: 16 | - "postgres" 17 | - "-c" 18 | - "wal_level=logical" 19 | - "-c" 20 | - "max_replication_slots=5" 21 | - "-c" 22 | - "max_wal_senders=5" 23 | restart: unless-stopped 24 | 25 | target_postgres: 26 | image: postgres:14 27 | container_name: pg_target 28 | environment: 29 | POSTGRES_USER: targetuser 30 | POSTGRES_PASSWORD: targetpassword!@#1234 31 | POSTGRES_DB: targetdb 32 | volumes: 33 | - target_postgres_data:/var/lib/postgresql/data 34 | ports: 35 | - "5434:5432" 36 | restart: unless-stopped 37 | 38 | nats: 39 | image: 
nats:latest 40 | container_name: pg_flo_nats 41 | command: ["-c", "/etc/nats/nats-server.conf"] 42 | volumes: 43 | - ./nats-server.conf:/etc/nats/nats-server.conf 44 | - nats_data:/data 45 | ports: 46 | - "4222:4222" 47 | - "8222:8222" 48 | restart: unless-stopped 49 | 50 | volumes: 51 | postgres_data: 52 | target_postgres_data: 53 | nats_data: 54 | -------------------------------------------------------------------------------- /internal/examples/README.md: -------------------------------------------------------------------------------- 1 | # pg_flo Examples 2 | 3 | This guide demonstrates common use cases for pg_flo with practical examples. For full configuration options, see the [example config file](../pg-flo.yaml). 4 | 5 | ## Basic Replication 6 | 7 | Simple database-to-database replication: 8 | 9 | ```bash 10 | # Start NATS server 11 | docker run -d --name pg_flo_nats \ 12 | --network host \ 13 | -v /path/to/nats-server.conf:/etc/nats/nats-server.conf \ 14 | nats:latest \ 15 | -c /etc/nats/nats-server.conf 16 | 17 | # Start replicator 18 | docker run -d --name pg_flo_replicator \ 19 | --network host \ 20 | -v /path/to/config.yaml:/etc/pg_flo/config.yaml \ 21 | pgflo/pg_flo:latest \ 22 | replicator --config /etc/pg_flo/config.yaml 23 | 24 | # Start worker 25 | docker run -d --name pg_flo_worker \ 26 | --network host \ 27 | -v /path/to/config.yaml:/etc/pg_flo/config.yaml \ 28 | pgflo/pg_flo:latest \ 29 | worker postgres --config /etc/pg_flo/config.yaml 30 | ``` 31 | 32 | ## Data Masking and Transformation 33 | 34 | Mask sensitive data during replication: 35 | 36 | ```yaml 37 | # rules.yaml 38 | rules: 39 | - table: users 40 | type: transform 41 | column: email 42 | parameters: 43 | type: mask 44 | mask_char: "*" 45 | operations: [INSERT, UPDATE] 46 | - table: payments 47 | type: transform 48 | column: card_number 49 | parameters: 50 | type: regex_replace 51 | pattern: "(\d{12})(\d{4})" 52 | replacement: "************$2" 53 | ``` 54 | 55 | ```bash 56 | pg_flo worker postgres \ 57 | --group sensitive_data \ 58 | --rules-config /path/to/rules.yaml \ 59 | # ... other postgres connection flags 60 | ``` 61 | 62 | ## Custom Table Routing 63 | 64 | Route and rename tables/columns: 65 | 66 | ```yaml 67 | # routing.yaml 68 | users: 69 | source_table: users 70 | destination_table: customers 71 | column_mappings: 72 | - source: user_id 73 | destination: customer_id 74 | - source: created_at 75 | destination: signup_date 76 | operations: 77 | - INSERT 78 | - UPDATE 79 | ``` 80 | 81 | ```bash 82 | pg_flo worker postgres \ 83 | --group user_migration \ 84 | --routing-config /path/to/routing.yaml \ 85 | # ... other config flags 86 | ``` 87 | 88 | ## Initial Load Options 89 | 90 | ### Copy Only (One-time Data Copy) 91 | 92 | Copy existing data without streaming changes: 93 | 94 | ```bash 95 | pg_flo replicator \ 96 | --copy \ 97 | --max-copy-workers-per-table 4 \ 98 | --group initial_load \ 99 | # ... other config flags 100 | ``` 101 | 102 | ### Copy and Stream 103 | 104 | Perform parallel initial data load followed by continuous streaming: 105 | 106 | ```bash 107 | pg_flo replicator \ 108 | --copy-and-stream \ 109 | --max-copy-workers-per-table 4 \ 110 | --group full_sync \ 111 | # ... other config flags 112 | ``` 113 | 114 | ## Multi-Destination Pipeline 115 | 116 | Stream changes to multiple destinations simultaneously: 117 | 118 | ```bash 119 | # Terminal 1: Stream to PostgreSQL 120 | pg_flo worker postgres \ 121 | --group audit \ 122 | # ... 
other config flags 123 | 124 | # Terminal 2: Stream to files for archival 125 | pg_flo worker file \ 126 | --group audit \ 127 | --file-output-dir /archive/changes 128 | 129 | # Terminal 3: Stream to webhook for external processing 130 | pg_flo worker webhook \ 131 | --group audit \ 132 | --webhook-url https://api.example.com/changes \ 133 | --webhook-batch-size 100 134 | ``` 135 | 136 | ## Schema Tracking 137 | 138 | Enable DDL tracking to capture schema changes. DDLs are applied on the destination as they arrive: 139 | 140 | ```bash 141 | pg_flo replicator \ 142 | --track-ddl \ 143 | --group schema_sync \ 144 | # ... other config flags 145 | 146 | pg_flo worker postgres \ 147 | --group schema_sync \ 148 | --target-sync-schema true \ 149 | # ... other postgres connection flags 150 | ``` 151 | 152 | ## Configuration File 153 | 154 | Instead of CLI flags, you can use a configuration file: 155 | 156 | ```yaml 157 | # ~/.pg_flo.yaml 158 | host: "source-db.example.com" 159 | port: 5432 160 | dbname: "myapp" 161 | user: "replicator" 162 | password: "secret" 163 | group: "production" 164 | tables: 165 | - users 166 | - orders 167 | - payments 168 | nats-url: "nats://localhost:4222" 169 | target-host: "dest-db.example.com" 170 | target-dbname: "myapp" 171 | target-user: "writer" 172 | target-password: "secret" 173 | ``` 174 | 175 | ```bash 176 | pg_flo replicator --config /path/to/config.yaml 177 | pg_flo worker postgres --config /path/to/config.yaml 178 | ``` 179 | 180 | See the [example config file](../pg-flo.yaml) for more details. 181 | 182 | ## Environment Variables 183 | 184 | All configuration options can also be set via environment variables: 185 | 186 | ```bash 187 | export PG_FLO_HOST=source-db.example.com 188 | export PG_FLO_PORT=5432 189 | export PG_FLO_DBNAME=myapp 190 | export PG_FLO_USER=replicator 191 | export PG_FLO_PASSWORD=secret 192 | export PG_FLO_GROUP=production 193 | export PG_FLO_NATS_URL=nats://localhost:4222 194 | 195 | pg_flo replicator 196 | ``` 197 | -------------------------------------------------------------------------------- /internal/how-it-works.md: -------------------------------------------------------------------------------- 1 | # How it Works 2 | 3 | `pg_flo` leverages PostgreSQL's logical replication system to capture and stream data while applying transformations and filtrations to the data before it reaches the destination. It utilizes **NATS** as a message broker to decouple the replicator and worker processes, providing flexibility and scalability. 4 | 5 | 1. **Publication Creation**: Creates a PostgreSQL publication for the specified tables or all tables (per `group`). 6 | 7 | 2. **Replication Slot**: A replication slot is created to ensure no data is lost between streaming sessions. 8 | 9 | 3. **Operation Modes**: 10 | 11 | - **Copy-and-Stream**: Performs an initial bulk copy followed by streaming changes. 12 | - **Stream-Only**: Starts streaming changes immediately from the last known position. 13 | 14 | 4. **Initial Bulk Copy** (for Copy-and-Stream mode): 15 | 16 | - If no valid LSN is found in NATS, `pg_flo` performs an initial bulk copy of existing data. 17 | - This process is parallelized for fast data sync: 18 | - A snapshot is taken to ensure consistency. 19 | - Each table is divided into page ranges. 20 | - Multiple workers copy different ranges concurrently. 21 | 22 | 5. **Streaming Changes**: 23 | 24 | - After the initial copy (or immediately in Stream-Only mode), the replicator streams changes from PostgreSQL and publishes them to NATS. 
25 | - The last processed LSN is stored in NATS, allowing `pg_flo` to resume operations from where it left off in case of interruptions. 26 | 27 | 6. **Message Processing**: The worker processes various types of messages from NATS: 28 | 29 | - Relation messages to understand table structures 30 | - Insert, Update, and Delete messages containing actual data changes 31 | - Begin and Commit messages for transaction boundaries 32 | - DDL changes like ALTER TABLE, CREATE INDEX, etc. 33 | 34 | 7. **Data Transformation**: Received data is converted into a structured format, with type-aware conversions for different PostgreSQL data types. 35 | 36 | 8. **Rule Application**: If configured, transformation and filtering rules are applied to the data: 37 | 38 | - **Transform Rules**: 39 | - Regex: Apply regular expression transformations to string values. 40 | - Mask: Mask sensitive data, keeping the first and last characters visible. 41 | - **Filter Rules**: 42 | - Comparison: Filter based on equality, inequality, greater than, less than, etc. 43 | - Contains: Filter string values based on whether they contain a specific substring. 44 | - Rules can be applied selectively to insert, update, or delete operations. 45 | 46 | 9. **Buffering**: Processed data is buffered and written in batches to optimize write operations to the destination. 47 | 48 | 10. **Writing to Sink**: Data is periodically flushed from the buffer to the configured sink (e.g., stdout, file, or other destinations). 49 | 50 | 11. **State Management**: 51 | - The replicator keeps track of its progress by updating the Last LSN in NATS. 52 | - The worker maintains its progress to ensure data consistency. 53 | - This allows for resumable operations across multiple runs. 54 | - Periodic status updates are sent to PostgreSQL to maintain the replication connection. 
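
To make the resumability described above concrete, here is a minimal, illustrative Go sketch of checkpointing the last processed LSN in a NATS JetStream key-value bucket and reading it back on startup. It is not pg_flo's actual implementation: the bucket name `pg_flo_state` and the key `groups.users.last_lsn` are invented for the example, it uses the legacy JetStream KV API from `nats.go`, and error handling is kept deliberately terse.

```go
package main

import (
	"fmt"
	"log"

	"github.com/jackc/pglogrepl"
	"github.com/nats-io/nats.go"
)

// Hypothetical names used only for this sketch.
const (
	stateBucket = "pg_flo_state"
	lsnKey      = "groups.users.last_lsn"
)

func main() {
	nc, err := nats.Connect(nats.DefaultURL)
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Close()

	js, err := nc.JetStream()
	if err != nil {
		log.Fatal(err)
	}

	// Open (or create) the KV bucket that holds per-group replication state.
	kv, err := js.CreateKeyValue(&nats.KeyValueConfig{Bucket: stateBucket})
	if err != nil {
		log.Fatal(err)
	}

	// On startup: resume from the last checkpointed LSN if one exists;
	// with no stored LSN, copy-and-stream mode would fall back to an
	// initial bulk copy.
	startLSN := pglogrepl.LSN(0)
	if entry, err := kv.Get(lsnKey); err == nil {
		if lsn, perr := pglogrepl.ParseLSN(string(entry.Value())); perr == nil {
			startLSN = lsn
		}
	}
	fmt.Println("resuming replication from", startLSN)

	// After a batch of changes has been durably handled, checkpoint the new
	// position so a restart picks up exactly where this run left off.
	nextLSN := startLSN + 16384 // placeholder for the LSN reported by the stream
	if _, err := kv.Put(lsnKey, []byte(nextLSN.String())); err != nil {
		log.Fatal(err)
	}
}
```

The same pattern, applied separately by the replicator and the worker, is what allows both processes to restart independently without losing or re-applying changes.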
55 | -------------------------------------------------------------------------------- /internal/nats-server.conf: -------------------------------------------------------------------------------- 1 | jetstream: enabled 2 | store_dir: /data 3 | http_port: 8222 4 | max_payload: 8388608 5 | -------------------------------------------------------------------------------- /internal/pg-flo.yaml: -------------------------------------------------------------------------------- 1 | # [Replicator] PostgreSQL connection settings 2 | host: "localhost" # PostgreSQL host (env: PG_FLO_HOST) 3 | port: 5432 # PostgreSQL port (env: PG_FLO_PORT) 4 | dbname: "your_database" # PostgreSQL database name (env: PG_FLO_DBNAME) 5 | user: "your_user" # PostgreSQL user (env: PG_FLO_USER) 6 | password: "your_password" # PostgreSQL password (env: PG_FLO_PASSWORD) 7 | schema: "public" # PostgreSQL schema to replicate from (env: PG_FLO_SCHEMA) 8 | 9 | # Replication settings 10 | group: "your_group" # Group name to identify each replication (env: PG_FLO_GROUP) 11 | tables: # Tables to replicate (empty for all tables) (env: PG_FLO_TABLES) 12 | - "table1" 13 | - "table2" 14 | copy-and-stream: false # Enable copy and stream mode (env: PG_FLO_COPY_AND_STREAM) 15 | max-copy-workers-per-table: 4 # Maximum number of parallel workers for copy operation (env: PG_FLO_MAX_COPY_WORKERS_PER_TABLE) 16 | track-ddl: false # Enable tracking of DDL changes (env: PG_FLO_TRACK_DDL) 17 | 18 | # NATS settings 19 | nats-url: "nats://localhost:4222" # NATS server URL (env: PG_FLO_NATS_URL) 20 | 21 | # Worker settings 22 | batch-size: 1000 # Number of messages to process in a batch (env: PG_FLO_BATCH_SIZE) 23 | rules-config: "/path/to/rules.yaml" # Path to rules configuration file (env: PG_FLO_RULES_CONFIG) 24 | routing-config: "/path/to/routing.yaml" # Path to routing configuration file (env: PG_FLO_ROUTING_CONFIG) 25 | 26 | # File sink settings 27 | file-output-dir: "/tmp/pg_flo-output" # Output directory for file sink (env: PG_FLO_FILE_OUTPUT_DIR) 28 | 29 | # [Worker] Postgres sink settings 30 | target-host: "" # Target PostgreSQL host (env: PG_FLO_TARGET_HOST) 31 | target-port: 5432 # Target PostgreSQL port (env: PG_FLO_TARGET_PORT) 32 | target-dbname: "" # Target PostgreSQL database name (env: PG_FLO_TARGET_DBNAME) 33 | target-user: "" # Target PostgreSQL user (env: PG_FLO_TARGET_USER) 34 | target-password: "" # Target PostgreSQL password (env: PG_FLO_TARGET_PASSWORD) 35 | target-sync-schema: false # Sync schema from source to target (env: PG_FLO_TARGET_SYNC_SCHEMA) 36 | target-disable-foreign-keys: false # Disable foreign key constraints on target (env: PG_FLO_TARGET_DISABLE_FOREIGN_KEYS) 37 | 38 | # Source connection for schema sync (only needed with target-sync-schema: true) 39 | source-host: "" # Source PostgreSQL host (env: PG_FLO_SOURCE_HOST) 40 | source-port: 5432 # Source PostgreSQL port (env: PG_FLO_SOURCE_PORT) 41 | source-dbname: "" # Source PostgreSQL database name (env: PG_FLO_SOURCE_DBNAME) 42 | source-user: "" # Source PostgreSQL user (env: PG_FLO_SOURCE_USER) 43 | source-password: "" # Source PostgreSQL password (env: PG_FLO_SOURCE_PASSWORD) 44 | 45 | # Webhook sink settings 46 | webhook-url: "" # Webhook URL to send data (env: PG_FLO_WEBHOOK_URL) 47 | -------------------------------------------------------------------------------- /internal/pg_flo_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pgflo/pg_flo/e9be74c2ffaa91b13f9a4326d4b5d83c81e4b450/internal/pg_flo_logo.png -------------------------------------------------------------------------------- /internal/scripts/e2e_common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PG_HOST="${PG_HOST:-localhost}" 4 | PG_PORT="${PG_PORT:-5433}" 5 | PG_USER="${PG_USER:-myuser}" 6 | PG_PASSWORD="${PG_PASSWORD:-mypassword!@#%1234}" 7 | PG_DB="${PG_DB:-mydb}" 8 | 9 | TARGET_PG_HOST="${TARGET_PG_HOST:-localhost}" 10 | TARGET_PG_PORT="${TARGET_PG_PORT:-5434}" 11 | TARGET_PG_USER="${TARGET_PG_USER:-targetuser}" 12 | TARGET_PG_PASSWORD="${TARGET_PG_PASSWORD:-targetpassword!@#1234}" 13 | TARGET_PG_DB="${TARGET_PG_DB:-targetdb}" 14 | 15 | NATS_URL="${NATS_URL:-nats://localhost:4222}" 16 | 17 | pg_flo_BIN="./bin/pg_flo" 18 | OUTPUT_DIR="/tmp/pg_flo-output" 19 | pg_flo_LOG="/tmp/pg_flo.log" 20 | pg_flo_WORKER_LOG="/tmp/pg_flo_worker.log" 21 | 22 | # Helper functions 23 | log() { echo "🔹 $1"; } 24 | success() { echo "✅ $1"; } 25 | error() { echo "❌ $1"; } 26 | 27 | run_sql() { 28 | if [ ${#1} -gt 1000 ]; then 29 | local temp_file=$(mktemp) 30 | echo "$1" >"$temp_file" 31 | PGPASSWORD=$PG_PASSWORD psql -h "$PG_HOST" -U "$PG_USER" -d "$PG_DB" -p "$PG_PORT" -q -t -f "$temp_file" 32 | rm "$temp_file" 33 | else 34 | PGPASSWORD=$PG_PASSWORD psql -h "$PG_HOST" -U "$PG_USER" -d "$PG_DB" -p "$PG_PORT" -q -t -c "$1" 35 | fi 36 | } 37 | 38 | setup_postgres() { 39 | log "Ensuring PostgreSQL is ready..." 40 | for i in {1..30}; do 41 | if PGPASSWORD=$PG_PASSWORD psql -h "$PG_HOST" -U "$PG_USER" -d "$PG_DB" -p "$PG_PORT" -c '\q' >/dev/null 2>&1; then 42 | success "PostgreSQL is ready" 43 | return 0 44 | fi 45 | sleep 1 46 | done 47 | error "PostgreSQL is not ready after 30 seconds" 48 | exit 1 49 | } 50 | 51 | stop_pg_flo_gracefully() { 52 | log "Stopping pg_flo replicator..." 53 | if kill -0 "$pg_flo_PID" 2>/dev/null; then 54 | kill -TERM "$pg_flo_PID" 55 | wait "$pg_flo_PID" 2>/dev/null || true 56 | success "pg_flo replicator stopped" 57 | else 58 | log "pg_flo replicator process not found, it may have already completed" 59 | fi 60 | 61 | log "Stopping pg_flo worker..." 62 | if kill -0 "$pg_flo_WORKER_PID" 2>/dev/null; then 63 | kill -TERM "$pg_flo_WORKER_PID" 64 | wait "$pg_flo_WORKER_PID" 2>/dev/null || true 65 | success "pg_flo worker stopped" 66 | else 67 | log "pg_flo worker process not found, it may have already completed" 68 | fi 69 | } 70 | 71 | show_pg_flo_logs() { 72 | log "pg_flo replicator logs:" 73 | echo "----------------------------------------" 74 | cat $pg_flo_LOG* 75 | echo "----------------------------------------" 76 | 77 | log "pg_flo worker logs:" 78 | echo "----------------------------------------" 79 | cat $pg_flo_WORKER_LOG* 80 | echo "----------------------------------------" 81 | } 82 | 83 | run_sql_target() { 84 | if [ ${#1} -gt 1000 ]; then 85 | local temp_file=$(mktemp) 86 | echo "$1" >"$temp_file" 87 | PGPASSWORD=$TARGET_PG_PASSWORD psql -h "$TARGET_PG_HOST" -U "$TARGET_PG_USER" -d "$TARGET_PG_DB" -p "$TARGET_PG_PORT" -q -t -f "$temp_file" 88 | rm "$temp_file" 89 | else 90 | PGPASSWORD=$TARGET_PG_PASSWORD psql -h "$TARGET_PG_HOST" -U "$TARGET_PG_USER" -d "$TARGET_PG_DB" -p "$TARGET_PG_PORT" -q -t -c "$1" 91 | fi 92 | } 93 | 94 | setup_docker() { 95 | rm -Rf /tmp/pg* 96 | log "Setting up Docker environment..." 
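  # Recreate the compose stack (removing volumes) so every run starts with
  # clean source/target databases and fresh NATS state.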
97 | docker compose -f internal/docker-compose.yml down -v 98 | docker compose -f internal/docker-compose.yml up -d 99 | success "Docker environment is set up" 100 | } 101 | -------------------------------------------------------------------------------- /internal/scripts/e2e_copy_and_stream.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | create_users() { 7 | log "Creating test table..." 8 | run_sql "DROP TABLE IF EXISTS public.users;" 9 | run_sql "CREATE TABLE public.users ( 10 | id serial PRIMARY KEY, 11 | int_col integer, 12 | float_col float, 13 | text_col text, 14 | bool_col boolean, 15 | date_col date, 16 | timestamp_col timestamp with time zone, 17 | json_col jsonb, 18 | array_col integer[], 19 | bytea_col bytea 20 | );" 21 | success "Test table created" 22 | } 23 | 24 | populate_initial_data() { 25 | log "Populating initial data..." 26 | run_sql "INSERT INTO public.users ( 27 | int_col, float_col, text_col, bool_col, date_col, timestamp_col, json_col, array_col, bytea_col 28 | ) SELECT 29 | generate_series(1, 500000), 30 | random() * 100, 31 | 'Initial data ' || generate_series(1, 500000), 32 | (random() > 0.5), 33 | current_date + (random() * 365)::integer * interval '1 day', 34 | current_timestamp + (random() * 365 * 24 * 60 * 60)::integer * interval '1 second', 35 | json_build_object('key', 'value' || generate_series(1, 500000), 'number', generate_series(1, 500000)), 36 | ARRAY[generate_series(1, 3)], 37 | decode(lpad(to_hex(generate_series(1, 4)), 8, '0'), 'hex') 38 | ;" 39 | run_sql "UPDATE public.users SET text_col = text_col || ' - Updated';" 40 | 41 | log "Inserting large JSON data..." 42 | local large_json='{"data":[' 43 | for i in {1..10000}; do 44 | if [ "$i" -ne 1 ]; then 45 | large_json+=',' 46 | fi 47 | large_json+='{"id":'$i',"name":"Item '$i'","description":"This is a long description for item '$i'. It contains a lot of text to make the JSON larger.","attributes":{"color":"red","size":"large","weight":10.5,"tags":["tag1","tag2","tag3"]}}' 48 | done 49 | large_json+=']}' 50 | 51 | run_sql "INSERT INTO public.users (int_col, json_col) VALUES (1000001, '$large_json'::jsonb);" 52 | 53 | run_sql "ANALYZE public.users;" 54 | success "Initial data populated" 55 | } 56 | 57 | simulate_concurrent_changes() { 58 | log "Simulating concurrent changes..." 59 | for i in {1..3000}; do 60 | run_sql "INSERT INTO public.users ( 61 | int_col, float_col, text_col, bool_col, date_col, timestamp_col, json_col, array_col, bytea_col 62 | ) VALUES ( 63 | $i, 64 | $i * 1.5, 65 | 'Concurrent data $i', 66 | ($i % 2 = 0), 67 | current_date + ($i % 365) * interval '1 day', 68 | current_timestamp + ($i % (365 * 24)) * interval '1 hour', 69 | '{\"key\": \"concurrent_$i\", \"value\": $i}', 70 | ARRAY[$i, $i+1, $i+2], 71 | decode(lpad(to_hex($i), 8, '0'), 'hex') 72 | );" 73 | done 74 | success "Concurrent changes simulated" 75 | } 76 | 77 | start_pg_flo_replication() { 78 | log "Starting pg_flo replication..." 79 | $pg_flo_BIN replicator \ 80 | --host "$PG_HOST" \ 81 | --port "$PG_PORT" \ 82 | --dbname "$PG_DB" \ 83 | --user "$PG_USER" \ 84 | --password "$PG_PASSWORD" \ 85 | --group "test_group" \ 86 | --tables "users" \ 87 | --schema "public" \ 88 | --nats-url "$NATS_URL" \ 89 | --copy-and-stream \ 90 | --max-copy-workers-per-table 4 \ 91 | >"$pg_flo_LOG" 2>&1 & 92 | pg_flo_PID=$! 
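  # Record the replicator PID so stop_pg_flo_gracefully can signal it later.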
93 | log "pg_flo started with PID: $pg_flo_PID" 94 | success "pg_flo replication started" 95 | } 96 | 97 | start_pg_flo_worker() { 98 | log "Starting pg_flo worker with PostgreSQL sink..." 99 | $pg_flo_BIN worker postgres \ 100 | --group "test_group" \ 101 | --nats-url "$NATS_URL" \ 102 | --source-host "$PG_HOST" \ 103 | --source-port "$PG_PORT" \ 104 | --source-dbname "$PG_DB" \ 105 | --source-user "$PG_USER" \ 106 | --source-password "$PG_PASSWORD" \ 107 | --target-host "$TARGET_PG_HOST" \ 108 | --target-port "$TARGET_PG_PORT" \ 109 | --target-dbname "$TARGET_PG_DB" \ 110 | --target-user "$TARGET_PG_USER" \ 111 | --target-password "$TARGET_PG_PASSWORD" \ 112 | --batch-size 5000 \ 113 | --target-sync-schema \ 114 | >"$pg_flo_WORKER_LOG" 2>&1 & 115 | pg_flo_WORKER_PID=$! 116 | log "pg_flo worker started with PID: $pg_flo_WORKER_PID" 117 | success "pg_flo worker started" 118 | } 119 | 120 | compare_row_counts() { 121 | log "Comparing row counts..." 122 | SOURCE_COUNT=$(run_sql "SELECT COUNT(*) FROM public.users") 123 | TARGET_COUNT=$(run_sql_target "SELECT COUNT(*) FROM public.users") 124 | 125 | log "Source database row count: $SOURCE_COUNT" 126 | log "Target database row count: $TARGET_COUNT" 127 | 128 | EXPECTED_COUNT=503001 129 | 130 | if [ "$SOURCE_COUNT" -eq "$TARGET_COUNT" ] && [ "$SOURCE_COUNT" -eq "$EXPECTED_COUNT" ]; then 131 | success "Row counts match and total is correct ($EXPECTED_COUNT)" 132 | return 0 133 | else 134 | error "Row counts do not match or total is incorrect. Expected $EXPECTED_COUNT, Source: $SOURCE_COUNT, Target: $TARGET_COUNT" 135 | return 1 136 | fi 137 | } 138 | 139 | verify_large_json() { 140 | log "Verifying large JSON data..." 141 | local source_json_length=$(run_sql " 142 | SELECT jsonb_array_length(json_col->'data') 143 | FROM public.users 144 | WHERE int_col = 1000001 145 | ") 146 | local target_json_length=$(run_sql_target " 147 | SELECT jsonb_array_length(json_col->'data') 148 | FROM public.users 149 | WHERE int_col = 1000001 150 | ") 151 | 152 | log "Source JSON length: $source_json_length" 153 | log "Target JSON length: $target_json_length" 154 | 155 | if [ -n "$source_json_length" ] && [ -n "$target_json_length" ] && 156 | [ "$source_json_length" -eq "$target_json_length" ] && 157 | [ "$source_json_length" -eq 10000 ]; then 158 | success "Large JSON data verified successfully" 159 | return 0 160 | else 161 | error "Large JSON data verification failed. Expected length 10000, got Source: $source_json_length, Target: $target_json_length" 162 | return 1 163 | fi 164 | } 165 | 166 | verify_data_integrity() { 167 | log "Verifying data integrity..." 
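  # Dump both tables to CSV ordered by id and compare MD5 hashes, so the check
  # fails on any row-level difference between source and target.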
168 | 169 | generate_table_hash() { 170 | local db=$1 171 | local csv_file="/tmp/pg_flo_${db}_dump.csv" 172 | 173 | if [ "$db" = "source" ]; then 174 | PGPASSWORD=$PG_PASSWORD psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -d "$PG_DB" \ 175 | -c "\COPY (SELECT * FROM public.users ORDER BY id) TO '$csv_file' WITH CSV" 176 | else 177 | PGPASSWORD=$TARGET_PG_PASSWORD psql -h "$TARGET_PG_HOST" -p "$TARGET_PG_PORT" -U "$TARGET_PG_USER" -d "$TARGET_PG_DB" \ 178 | -c "\COPY (SELECT * FROM public.users ORDER BY id) TO '$csv_file' WITH CSV" 179 | fi 180 | 181 | if command -v md5 >/dev/null; then 182 | md5 -q "$csv_file" 183 | elif command -v md5sum >/dev/null; then 184 | md5sum "$csv_file" | awk '{ print $1 }' 185 | else 186 | echo "Neither md5 nor md5sum command found" >&2 187 | return 1 188 | fi 189 | } 190 | 191 | local source_hash=$(generate_table_hash "source") 192 | local target_hash=$(generate_table_hash "target") 193 | 194 | log "Source data hash: $source_hash" 195 | log "Target data hash: $target_hash" 196 | log "Source CSV file: /tmp/pg_flo_source_dump.csv" 197 | log "Target CSV file: /tmp/pg_flo_target_dump.csv" 198 | 199 | if [ "$source_hash" = "$target_hash" ]; then 200 | success "Data integrity verified: source and target databases match 100%" 201 | return 0 202 | else 203 | error "Data integrity check failed: source and target databases do not match" 204 | log "You can compare the dumps using: diff /tmp/pg_flo_source_dump.csv /tmp/pg_flo_target_dump.csv" 205 | return 1 206 | fi 207 | } 208 | 209 | test_pg_flo_cdc() { 210 | setup_postgres 211 | create_users 212 | populate_initial_data 213 | 214 | start_pg_flo_replication 215 | start_pg_flo_worker 216 | simulate_concurrent_changes 217 | 218 | log "Waiting for changes to replicate..." 219 | sleep 90 220 | stop_pg_flo_gracefully 221 | compare_row_counts || return 1 222 | verify_large_json || return 1 223 | verify_data_integrity || return 1 224 | } 225 | 226 | log "Starting pg_flo CDC test..." 227 | if test_pg_flo_cdc; then 228 | success "All tests passed! 🎉" 229 | exit 0 230 | else 231 | error "Some tests failed. Please check the logs." 232 | show_pg_flo_logs 233 | exit 1 234 | fi 235 | -------------------------------------------------------------------------------- /internal/scripts/e2e_copy_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | create_users() { 7 | log "Creating test table..." 8 | run_sql "DROP TABLE IF EXISTS public.users;" 9 | run_sql "CREATE TABLE public.users ( 10 | id serial PRIMARY KEY, 11 | int_col integer, 12 | float_col float, 13 | text_col text, 14 | bool_col boolean, 15 | date_col date, 16 | timestamp_col timestamp with time zone, 17 | json_col jsonb, 18 | array_col integer[], 19 | bytea_col bytea 20 | );" 21 | success "Test table created" 22 | } 23 | 24 | populate_initial_data() { 25 | log "Populating initial data..." 
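  # Seed 500,000 rows spanning numeric, text, boolean, date/time, JSON, array
  # and bytea columns in a single generate_series insert.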
26 | run_sql "INSERT INTO public.users ( 27 | int_col, float_col, text_col, bool_col, date_col, timestamp_col, json_col, array_col, bytea_col 28 | ) SELECT 29 | generate_series(1, 500000), 30 | random() * 100, 31 | 'Initial data ' || generate_series(1, 500000), 32 | (random() > 0.5), 33 | current_date + (random() * 365)::integer * interval '1 day', 34 | current_timestamp + (random() * 365 * 24 * 60 * 60)::integer * interval '1 second', 35 | json_build_object('key', 'value' || generate_series(1, 500000), 'number', generate_series(1, 500000)), 36 | ARRAY[generate_series(1, 3)], 37 | decode(lpad(to_hex(generate_series(1, 4)), 8, '0'), 'hex') 38 | ;" 39 | 40 | log "Inserting large JSON data..." 41 | local large_json='{"data":[' 42 | for i in {1..10000}; do 43 | if [ "$i" -ne 1 ]; then 44 | large_json+=',' 45 | fi 46 | large_json+='{"id":'$i',"name":"Item '$i'","description":"This is a long description for item '$i'. It contains a lot of text to make the JSON larger.","attributes":{"color":"red","size":"large","weight":10.5,"tags":["tag1","tag2","tag3"]}}' 47 | done 48 | large_json+=']}' 49 | 50 | run_sql "INSERT INTO public.users (int_col, json_col) VALUES (1000001, '$large_json'::jsonb);" 51 | 52 | run_sql "ANALYZE public.users;" 53 | success "Initial data populated" 54 | } 55 | 56 | start_pg_flo_copy_only() { 57 | log "Starting pg_flo in copy-only mode..." 58 | $pg_flo_BIN replicator \ 59 | --host "$PG_HOST" \ 60 | --port "$PG_PORT" \ 61 | --dbname "$PG_DB" \ 62 | --user "$PG_USER" \ 63 | --password "$PG_PASSWORD" \ 64 | --group "test_group" \ 65 | --tables "users" \ 66 | --schema "public" \ 67 | --nats-url "$NATS_URL" \ 68 | --copy \ 69 | --max-copy-workers-per-table 10 \ 70 | >"$pg_flo_LOG" 2>&1 & 71 | pg_flo_PID=$! 72 | log "pg_flo started with PID: $pg_flo_PID" 73 | success "pg_flo copy-only started" 74 | } 75 | 76 | start_pg_flo_worker() { 77 | log "Starting pg_flo worker with PostgreSQL sink..." 78 | $pg_flo_BIN worker postgres \ 79 | --group "test_group" \ 80 | --nats-url "$NATS_URL" \ 81 | --source-host "$PG_HOST" \ 82 | --source-port "$PG_PORT" \ 83 | --source-dbname "$PG_DB" \ 84 | --source-user "$PG_USER" \ 85 | --source-password "$PG_PASSWORD" \ 86 | --target-host "$TARGET_PG_HOST" \ 87 | --target-port "$TARGET_PG_PORT" \ 88 | --target-dbname "$TARGET_PG_DB" \ 89 | --target-user "$TARGET_PG_USER" \ 90 | --target-password "$TARGET_PG_PASSWORD" \ 91 | --target-sync-schema \ 92 | >"$pg_flo_WORKER_LOG" 2>&1 & 93 | pg_flo_WORKER_PID=$! 94 | log "pg_flo worker started with PID: $pg_flo_WORKER_PID" 95 | success "pg_flo worker started" 96 | } 97 | 98 | compare_row_counts() { 99 | log "Comparing row counts..." 100 | SOURCE_COUNT=$(run_sql "SELECT COUNT(*) FROM public.users") 101 | TARGET_COUNT=$(run_sql_target "SELECT COUNT(*) FROM public.users") 102 | 103 | log "Source database row count: $SOURCE_COUNT" 104 | log "Target database row count: $TARGET_COUNT" 105 | 106 | EXPECTED_COUNT=500001 # 500,000 regular rows + 1 large JSON row 107 | 108 | if [ "$SOURCE_COUNT" -eq "$TARGET_COUNT" ] && [ "$SOURCE_COUNT" -eq "$EXPECTED_COUNT" ]; then 109 | success "Row counts match and total is correct ($EXPECTED_COUNT)" 110 | return 0 111 | else 112 | error "Row counts do not match or total is incorrect. Expected $EXPECTED_COUNT, Source: $SOURCE_COUNT, Target: $TARGET_COUNT" 113 | return 1 114 | fi 115 | } 116 | 117 | verify_large_json() { 118 | log "Verifying large JSON data..." 
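  # The large-JSON row is identified by int_col = 1000001; compare the length
  # of its "data" array on source and target (expected: 10000 elements).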
119 | local source_json_length=$(run_sql " 120 | SELECT jsonb_array_length(json_col->'data') 121 | FROM public.users 122 | WHERE int_col = 1000001 123 | ") 124 | local target_json_length=$(run_sql_target " 125 | SELECT jsonb_array_length(json_col->'data') 126 | FROM public.users 127 | WHERE int_col = 1000001 128 | ") 129 | 130 | log "Source JSON length: $source_json_length" 131 | log "Target JSON length: $target_json_length" 132 | 133 | if [ -n "$source_json_length" ] && [ -n "$target_json_length" ] && 134 | [ "$source_json_length" -eq "$target_json_length" ] && 135 | [ "$source_json_length" -eq 10000 ]; then 136 | success "Large JSON data verified successfully" 137 | return 0 138 | else 139 | error "Large JSON data verification failed. Expected length 10000, got Source: $source_json_length, Target: $target_json_length" 140 | return 1 141 | fi 142 | } 143 | 144 | verify_data_integrity() { 145 | log "Verifying data integrity..." 146 | 147 | generate_table_hash() { 148 | local db=$1 149 | local csv_file="/tmp/pg_flo_${db}_dump.csv" 150 | 151 | if [ "$db" = "source" ]; then 152 | PGPASSWORD=$PG_PASSWORD psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -d "$PG_DB" \ 153 | -c "\COPY (SELECT * FROM public.users ORDER BY id) TO '$csv_file' WITH CSV" 154 | else 155 | PGPASSWORD=$TARGET_PG_PASSWORD psql -h "$TARGET_PG_HOST" -p "$TARGET_PG_PORT" -U "$TARGET_PG_USER" -d "$TARGET_PG_DB" \ 156 | -c "\COPY (SELECT * FROM public.users ORDER BY id) TO '$csv_file' WITH CSV" 157 | fi 158 | 159 | if command -v md5 >/dev/null; then 160 | md5 -q "$csv_file" 161 | elif command -v md5sum >/dev/null; then 162 | md5sum "$csv_file" | awk '{ print $1 }' 163 | else 164 | echo "Neither md5 nor md5sum command found" >&2 165 | return 1 166 | fi 167 | } 168 | 169 | local source_hash=$(generate_table_hash "source") 170 | local target_hash=$(generate_table_hash "target") 171 | 172 | log "Source data hash: $source_hash" 173 | log "Target data hash: $target_hash" 174 | 175 | if [ "$source_hash" = "$target_hash" ]; then 176 | success "Data integrity verified: source and target databases match 100%" 177 | return 0 178 | else 179 | error "Data integrity check failed: source and target databases do not match" 180 | log "You can compare the dumps using: diff /tmp/pg_flo_source_dump.csv /tmp/pg_flo_target_dump.csv" 181 | return 1 182 | fi 183 | } 184 | 185 | test_pg_flo_copy_only() { 186 | setup_postgres 187 | create_users 188 | populate_initial_data 189 | 190 | start_pg_flo_copy_only 191 | start_pg_flo_worker 192 | 193 | log "Waiting for changes to replicate..." 194 | sleep 180 195 | stop_pg_flo_gracefully 196 | 197 | compare_row_counts || return 1 198 | verify_large_json || return 1 199 | verify_data_integrity || return 1 200 | } 201 | 202 | log "Starting pg_flo copy-only test..." 203 | if test_pg_flo_copy_only; then 204 | success "All tests passed! 🎉" 205 | exit 0 206 | else 207 | error "Some tests failed. Please check the logs." 208 | show_pg_flo_logs 209 | exit 1 210 | fi 211 | -------------------------------------------------------------------------------- /internal/scripts/e2e_ddl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | create_test_tables() { 7 | log "Creating test schemas and tables..." 
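  # Rebuild the app and public schemas so the DDL test always starts from a
  # known-empty state.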
8 | run_sql "DROP SCHEMA IF EXISTS app CASCADE; CREATE SCHEMA app;" 9 | run_sql "DROP SCHEMA IF EXISTS public CASCADE; CREATE SCHEMA public;" 10 | 11 | run_sql "CREATE TABLE app.users (id serial PRIMARY KEY, data text);" 12 | run_sql "CREATE TABLE app.posts (id serial PRIMARY KEY, content text);" 13 | 14 | run_sql "CREATE TABLE app.comments (id serial PRIMARY KEY, text text);" 15 | run_sql "CREATE TABLE public.metrics (id serial PRIMARY KEY, value numeric);" 16 | success "Test tables created" 17 | } 18 | 19 | start_pg_flo_replication() { 20 | log "Starting pg_flo replication..." 21 | if [ -f "$pg_flo_LOG" ]; then 22 | mv "$pg_flo_LOG" "${pg_flo_LOG}.bak" 23 | log "Backed up previous replicator log to ${pg_flo_LOG}.bak" 24 | fi 25 | $pg_flo_BIN replicator \ 26 | --host "$PG_HOST" \ 27 | --port "$PG_PORT" \ 28 | --dbname "$PG_DB" \ 29 | --user "$PG_USER" \ 30 | --password "$PG_PASSWORD" \ 31 | --group "group_ddl" \ 32 | --schema "app" \ 33 | --tables "users,posts" \ 34 | --nats-url "$NATS_URL" \ 35 | --track-ddl \ 36 | >"$pg_flo_LOG" 2>&1 & 37 | pg_flo_PID=$! 38 | log "pg_flo replicator started with PID: $pg_flo_PID" 39 | success "pg_flo replication started" 40 | } 41 | 42 | start_pg_flo_worker() { 43 | log "Starting pg_flo worker with PostgreSQL sink..." 44 | if [ -f "$pg_flo_WORKER_LOG" ]; then 45 | mv "$pg_flo_WORKER_LOG" "${pg_flo_WORKER_LOG}.bak" 46 | log "Backed up previous worker log to ${pg_flo_WORKER_LOG}.bak" 47 | fi 48 | $pg_flo_BIN worker postgres \ 49 | --group "group_ddl" \ 50 | --nats-url "$NATS_URL" \ 51 | --source-host "$PG_HOST" \ 52 | --source-port "$PG_PORT" \ 53 | --source-dbname "$PG_DB" \ 54 | --source-user "$PG_USER" \ 55 | --source-password "$PG_PASSWORD" \ 56 | --target-host "$TARGET_PG_HOST" \ 57 | --target-port "$TARGET_PG_PORT" \ 58 | --target-dbname "$TARGET_PG_DB" \ 59 | --target-user "$TARGET_PG_USER" \ 60 | --target-password "$TARGET_PG_PASSWORD" \ 61 | --target-sync-schema \ 62 | >"$pg_flo_WORKER_LOG" 2>&1 & 63 | pg_flo_WORKER_PID=$! 64 | log "pg_flo worker started with PID: $pg_flo_WORKER_PID" 65 | success "pg_flo worker started" 66 | } 67 | 68 | perform_ddl_operations() { 69 | log "Performing DDL operations..." 
70 | 71 | # Column operations on tracked tables 72 | run_sql "ALTER TABLE app.users ADD COLUMN email text;" 73 | run_sql "ALTER TABLE app.users ADD COLUMN status varchar(50) DEFAULT 'active';" 74 | run_sql "ALTER TABLE app.posts ADD COLUMN category text;" 75 | 76 | # Index operations on tracked tables 77 | run_sql "CREATE INDEX CONCURRENTLY idx_users_email ON app.users (email);" 78 | run_sql "CREATE UNIQUE INDEX idx_posts_unique ON app.posts (content) WHERE content IS NOT NULL;" 79 | 80 | # Column modifications on tracked tables 81 | run_sql "ALTER TABLE app.users ALTER COLUMN status SET DEFAULT 'pending';" 82 | run_sql "ALTER TABLE app.posts ALTER COLUMN category TYPE varchar(100);" 83 | 84 | # Rename operations on tracked tables 85 | run_sql "ALTER TABLE app.users RENAME COLUMN data TO profile;" 86 | 87 | # Drop operations on tracked tables 88 | run_sql "DROP INDEX CONCURRENTLY IF EXISTS idx_users_email;" 89 | run_sql "ALTER TABLE app.posts DROP COLUMN IF EXISTS category;" 90 | 91 | # Operations on non-tracked tables (should be ignored) 92 | run_sql "ALTER TABLE app.comments ADD COLUMN author text;" 93 | run_sql "CREATE INDEX idx_comments_text ON app.comments (text);" 94 | run_sql "ALTER TABLE public.metrics ADD COLUMN timestamp timestamptz;" 95 | 96 | success "DDL operations performed" 97 | } 98 | 99 | verify_ddl_changes() { 100 | log "Verifying DDL changes in target database..." 101 | local failures=0 102 | 103 | check_column() { 104 | local table=$1 105 | local column=$2 106 | local expected_exists=$3 107 | local expected_type=${4:-""} 108 | local expected_default=${5:-""} 109 | local query=" 110 | SELECT COUNT(*), 111 | data_type, 112 | character_maximum_length, 113 | column_default 114 | FROM information_schema.columns 115 | WHERE table_schema='app' 116 | AND table_name='$table' 117 | AND column_name='$column' 118 | GROUP BY data_type, character_maximum_length, column_default;" 119 | 120 | local result 121 | result=$(run_sql_target "$query") 122 | 123 | if [ -z "$result" ]; then 124 | exists=0 125 | data_type="" 126 | char_length="" 127 | default_value="" 128 | else 129 | read exists data_type char_length default_value < <(echo "$result" | tr '|' ' ') 130 | fi 131 | 132 | exists=${exists:-0} 133 | 134 | if [ "$exists" -eq "$expected_exists" ]; then 135 | if [ "$expected_exists" -eq 1 ]; then 136 | local type_ok=true 137 | local default_ok=true 138 | 139 | if [ -n "$expected_type" ]; then 140 | # Handle character varying type specifically 141 | if [ "$expected_type" = "character varying" ]; then 142 | if [ "$data_type" = "character varying" ] || [ "$data_type" = "varchar" ] || [ "$data_type" = "character" ]; then 143 | type_ok=true 144 | else 145 | type_ok=false 146 | fi 147 | elif [ "$data_type" != "$expected_type" ]; then 148 | type_ok=false 149 | fi 150 | fi 151 | 152 | if [ -n "$expected_default" ]; then 153 | if [[ "$default_value" == *"$expected_default"* ]]; then 154 | default_ok=true 155 | else 156 | default_ok=false 157 | fi 158 | fi 159 | 160 | if [ "$type_ok" = true ] && [ "$default_ok" = true ]; then 161 | if [[ "$expected_type" == "character varying" && -n "$char_length" ]]; then 162 | success "Column app.$table.$column verification passed (type: $data_type($char_length), default: $default_value)" 163 | else 164 | success "Column app.$table.$column verification passed (type: $data_type, default: $default_value)" 165 | fi 166 | else 167 | if [ "$type_ok" = false ]; then 168 | error "Column app.$table.$column type mismatch (expected: $expected_type, got: $data_type)" 169 | 
failures=$((failures + 1)) 170 | fi 171 | if [ "$default_ok" = false ]; then 172 | error "Column app.$table.$column default value mismatch (expected: $expected_default, got: $default_value)" 173 | failures=$((failures + 1)) 174 | fi 175 | fi 176 | else 177 | success "Column app.$table.$column verification passed (not exists)" 178 | fi 179 | else 180 | error "Column app.$table.$column verification failed (expected: $expected_exists, got: $exists)" 181 | failures=$((failures + 1)) 182 | fi 183 | } 184 | 185 | check_index() { 186 | local index=$1 187 | local expected=$2 188 | local exists=$(run_sql_target "SELECT COUNT(*) FROM pg_indexes WHERE schemaname='app' AND indexname='$index';") 189 | 190 | if [ "$exists" -eq "$expected" ]; then 191 | success "Index app.$index verification passed (expected: $expected)" 192 | else 193 | error "Index app.$index verification failed (expected: $expected, got: $exists)" 194 | failures=$((failures + 1)) 195 | fi 196 | } 197 | 198 | # Verify app.users changes 199 | check_column "users" "email" 1 "text" 200 | check_column "users" "status" 1 "character varying" "'pending'" 201 | check_column "users" "data" 0 202 | check_column "users" "profile" 1 "text" 203 | 204 | # Verify app.posts changes 205 | check_column "posts" "category" 0 206 | check_column "posts" "content" 1 "text" 207 | check_index "idx_posts_unique" 1 "unique" 208 | 209 | # Verify non-tracked tables 210 | check_column "comments" "author" 0 211 | check_index "idx_comments_text" 0 212 | 213 | local remaining_rows=$(run_sql "SELECT COUNT(*) FROM internal_pg_flo.ddl_log;") 214 | if [ "$remaining_rows" -eq 0 ]; then 215 | success "internal_pg_flo.ddl_log table is empty" 216 | else 217 | error "internal_pg_flo.ddl_log table is not empty. Remaining rows: $remaining_rows" 218 | failures=$((failures + 1)) 219 | fi 220 | 221 | if [ "$failures" -eq 0 ]; then 222 | success "All DDL changes verified successfully" 223 | return 0 224 | else 225 | error "DDL verification failed with $failures errors" 226 | return 1 227 | fi 228 | } 229 | 230 | test_pg_flo_ddl() { 231 | setup_postgres 232 | create_test_tables 233 | start_pg_flo_worker 234 | sleep 5 235 | start_pg_flo_replication 236 | sleep 3 237 | perform_ddl_operations 238 | stop_pg_flo_gracefully 239 | verify_ddl_changes || return 1 240 | } 241 | 242 | log "Starting pg_flo CDC test with DDL tracking..." 243 | if test_pg_flo_ddl; then 244 | success "DDL tracking test passed! 🎉" 245 | exit 0 246 | else 247 | error "DDL tracking test failed. Please check the logs." 248 | show_pg_flo_logs 249 | exit 1 250 | fi 251 | -------------------------------------------------------------------------------- /internal/scripts/e2e_multi_tenant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | create_multi_tenant_table() { 7 | log "Creating multi-tenant test table..." 8 | run_sql "DROP TABLE IF EXISTS public.events;" 9 | run_sql "CREATE TABLE public.events ( 10 | id serial PRIMARY KEY, 11 | tenant_id int NOT NULL, 12 | name text, 13 | email text, 14 | created_at timestamp DEFAULT current_timestamp 15 | );" 16 | success "Multi-tenant test table created" 17 | } 18 | 19 | start_pg_flo_replication() { 20 | log "Starting pg_flo replication..." 
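# The replicator publishes every change on public.events; tenant filtering (tenant_id = 3)
# is applied by the worker via --rules-config multi_tenant_rules.yml.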
21 | $pg_flo_BIN replicator \ 22 | --host "$PG_HOST" \ 23 | --port "$PG_PORT" \ 24 | --dbname "$PG_DB" \ 25 | --user "$PG_USER" \ 26 | --password "$PG_PASSWORD" \ 27 | --group "group_multi_tenant" \ 28 | --tables "events" \ 29 | --schema "public" \ 30 | --nats-url "$NATS_URL" \ 31 | >"$pg_flo_LOG" 2>&1 & 32 | pg_flo_PID=$! 33 | log "pg_flo replicator started with PID: $pg_flo_PID" 34 | success "pg_flo replication started" 35 | } 36 | 37 | start_pg_flo_worker() { 38 | log "Starting pg_flo worker with PostgreSQL sink..." 39 | $pg_flo_BIN worker postgres \ 40 | --group "group_multi_tenant" \ 41 | --nats-url "$NATS_URL" \ 42 | --source-host "$PG_HOST" \ 43 | --source-port "$PG_PORT" \ 44 | --source-dbname "$PG_DB" \ 45 | --source-user "$PG_USER" \ 46 | --source-password "$PG_PASSWORD" \ 47 | --target-host "$TARGET_PG_HOST" \ 48 | --target-port "$TARGET_PG_PORT" \ 49 | --target-dbname "$TARGET_PG_DB" \ 50 | --target-user "$TARGET_PG_USER" \ 51 | --target-password "$TARGET_PG_PASSWORD" \ 52 | --target-sync-schema \ 53 | --rules-config "$(dirname "$0")/multi_tenant_rules.yml" \ 54 | >"$pg_flo_WORKER_LOG" 2>&1 & 55 | pg_flo_WORKER_PID=$! 56 | log "pg_flo worker started with PID: $pg_flo_WORKER_PID" 57 | success "pg_flo worker started" 58 | } 59 | 60 | simulate_multi_tenant_changes() { 61 | log "Simulating multi-tenant changes..." 62 | run_sql "INSERT INTO public.events (tenant_id, name, email) VALUES 63 | (1, 'Alice', 'alice@tenant1.com'), 64 | (2, 'Bob', 'bob@tenant2.com'), 65 | (3, 'Charlie', 'charlie@tenant3.com'), 66 | (3, 'David', 'david@tenant3.com'), 67 | (4, 'Eve', 'eve@tenant4.com'), 68 | (3, 'Frank', 'frank@tenant3.com');" 69 | success "Multi-tenant changes simulated" 70 | } 71 | 72 | verify_multi_tenant_changes() { 73 | log "Verifying multi-tenant changes in target database..." 74 | local tenant_3_count=$(run_sql_target "SELECT COUNT(*) FROM public.events WHERE tenant_id = 3;" | xargs) 75 | local total_count=$(run_sql_target "SELECT COUNT(*) FROM public.events;" | xargs) 76 | 77 | log "Tenant 3 count: $tenant_3_count (expected 3)" 78 | log "Total count: $total_count (expected 3)" 79 | 80 | if [ "$tenant_3_count" -eq 3 ] && [ "$total_count" -eq 3 ]; then 81 | success "Multi-tenant filtering verified successfully" 82 | return 0 83 | else 84 | error "Multi-tenant filtering verification failed" 85 | return 1 86 | fi 87 | } 88 | 89 | test_pg_flo_multi_tenant() { 90 | setup_postgres 91 | create_multi_tenant_table 92 | start_pg_flo_replication 93 | sleep 2 94 | start_pg_flo_worker 95 | simulate_multi_tenant_changes 96 | 97 | log "Waiting for pg_flo to process changes..." 98 | sleep 5 99 | 100 | stop_pg_flo_gracefully 101 | verify_multi_tenant_changes || return 1 102 | } 103 | 104 | # Run the test 105 | log "Starting pg_flo CDC test with multi-tenant filtering..." 106 | if test_pg_flo_multi_tenant; then 107 | success "All tests passed! 🎉" 108 | exit 0 109 | else 110 | error "Some tests failed. Please check the logs." 111 | show_pg_flo_logs 112 | exit 1 113 | fi 114 | -------------------------------------------------------------------------------- /internal/scripts/e2e_postgres.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | create_tables() { 7 | log "Creating test tables in source database..." 
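# users exercises NULL handling (nullable_column) and TOAST storage (toasted_column);
# toast_test holds a ~1 MB jsonb value to verify that large, out-of-line values replicate intact.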
8 | run_sql "DROP TABLE IF EXISTS public.users;" 9 | run_sql "CREATE TABLE public.users ( 10 | id serial PRIMARY KEY, 11 | data text, 12 | nullable_column text, 13 | toasted_column text, 14 | created_at timestamp DEFAULT current_timestamp 15 | );" 16 | run_sql "DROP TABLE IF EXISTS public.toast_test;" 17 | run_sql "CREATE TABLE public.toast_test (id serial PRIMARY KEY, large_jsonb jsonb, small_text text);" 18 | success "Test tables created in source database" 19 | } 20 | 21 | create_config_files() { 22 | log "Creating config files..." 23 | 24 | # Create replicator config 25 | cat >"/tmp/pg_flo_replicator.yml" <"/tmp/pg_flo_worker.yml" <"$pg_flo_LOG" 2>&1 & 74 | pg_flo_PID=$! 75 | log "pg_flo replicator started with PID: $pg_flo_PID" 76 | success "pg_flo replicator started" 77 | } 78 | 79 | start_pg_flo_worker() { 80 | log "Starting pg_flo worker with PostgreSQL sink..." 81 | if [ -f "$pg_flo_WORKER_LOG" ]; then 82 | mv "$pg_flo_WORKER_LOG" "${pg_flo_WORKER_LOG}.bak" 83 | log "Backed up previous worker log to ${pg_flo_WORKER_LOG}.bak" 84 | fi 85 | 86 | $pg_flo_BIN worker postgres --config "/tmp/pg_flo_worker.yml" >"$pg_flo_WORKER_LOG" 2>&1 & 87 | pg_flo_WORKER_PID=$! 88 | log "pg_flo worker started with PID: $pg_flo_WORKER_PID" 89 | success "pg_flo worker started" 90 | } 91 | 92 | simulate_changes() { 93 | log "Simulating changes..." 94 | local insert_count=6000 95 | 96 | for i in $(seq 1 "$insert_count"); do 97 | run_sql "INSERT INTO public.users (data, nullable_column, toasted_column) VALUES ('Data $i', 'Nullable $i', 'Toasted $i');" 98 | done 99 | 100 | # Insert specific rows for deletion 101 | run_sql "INSERT INTO public.users (id, data) VALUES (10001, 'To be deleted 1');" 102 | run_sql "INSERT INTO public.users (id, data) VALUES (10002, 'To be deleted 2');" 103 | run_sql "INSERT INTO public.users (id, data) VALUES (10003, 'To be deleted 3');" 104 | run_sql "INSERT INTO public.users (id, data) VALUES (10004, 'To be deleted 4');" 105 | run_sql "INSERT INTO public.users (id, data) VALUES (10005, 'To be deleted 5');" 106 | 107 | # Insert a row with potentially toasted data 108 | run_sql "INSERT INTO public.users (id, toasted_column) VALUES (10006, repeat('Large toasted data ', 1000));" 109 | 110 | # Update with various scenarios 111 | run_sql "UPDATE public.users SET data = 'Updated data' WHERE id = 1;" 112 | run_sql "UPDATE public.users SET nullable_column = NULL WHERE id = 2;" 113 | run_sql "UPDATE public.users SET data = 'Updated data', nullable_column = NULL WHERE id = 3;" 114 | run_sql "UPDATE public.users SET toasted_column = repeat('A', 10000) WHERE id = 4;" 115 | run_sql "UPDATE public.users SET data = 'Updated data' WHERE id = 5;" 116 | 117 | # Generate large JSONB data (approximately 1MB) 118 | log "Generating 1MB JSONB data..." 119 | local json_data='{"data":"' 120 | for i in {1..100000}; do 121 | json_data+="AAAAAAAAAA" 122 | done 123 | json_data+='"}' 124 | 125 | # Insert large JSONB data 126 | run_sql "INSERT INTO public.toast_test (large_jsonb, small_text) VALUES ('$json_data'::jsonb, 'Initial small text');" 127 | log "Inserted large JSONB data, waiting for replication..." 128 | 129 | # Update unrelated column 130 | run_sql "UPDATE public.toast_test SET small_text = 'Updated small text' WHERE id = 1;" 131 | log "Updated unrelated column, waiting for replication..." 
132 | 133 | # Delete operations 134 | run_sql "DELETE FROM public.users WHERE id = 10001;" 135 | run_sql "DELETE FROM public.users WHERE id IN (10002, 10003);" 136 | run_sql "DELETE FROM public.users WHERE id >= 10004 AND id <= 10005;" 137 | run_sql "DELETE FROM public.users WHERE id = 10006;" 138 | 139 | success "Changes simulated" 140 | } 141 | 142 | verify_changes() { 143 | log "Verifying changes in target database..." 144 | 145 | local updated_data=$(run_sql_target "SELECT data FROM public.users WHERE id = 1;" | xargs) 146 | log "Updated data for id 1: '$updated_data' (expected 'Updated data')" 147 | 148 | local null_column=$(run_sql_target "SELECT coalesce(nullable_column, 'NULL') FROM public.users WHERE id = 2;" | xargs) 149 | log "Nullable column for id 2: '$null_column' (expected 'NULL')" 150 | 151 | local mixed_update=$(run_sql_target "SELECT data || ' | ' || coalesce(nullable_column, 'NULL') FROM public.users WHERE id = 3;" | xargs) 152 | log "Mixed update for id 3: '$mixed_update' (expected 'Updated data | NULL')" 153 | 154 | local toast_length=$(run_sql_target "SELECT length(toasted_column) FROM public.users WHERE id = 4;" | xargs) 155 | log "TOAST column length for id 4: '$toast_length' (expected '10000')" 156 | 157 | local unrelated_column=$(run_sql_target "SELECT nullable_column FROM public.users WHERE id = 5;" | xargs) 158 | log "Unrelated column for id 5: '$unrelated_column' (expected 'Nullable 5')" 159 | 160 | local jsonb_length=$(run_sql_target "SELECT octet_length(large_jsonb::text) FROM public.toast_test LIMIT 1;" | xargs) 161 | log "JSONB column length: '$jsonb_length' bytes (expected > 1000000)" 162 | 163 | local small_text=$(run_sql_target "SELECT small_text FROM public.toast_test LIMIT 1;" | xargs) 164 | log "small_text content: '$small_text' (expected 'Updated small text')" 165 | 166 | local deleted_single=$(run_sql_target "SELECT COUNT(*) FROM public.users WHERE id = 10001;" | xargs) 167 | log "Count of deleted user (id 10001): '$deleted_single' (expected '0')" 168 | 169 | local deleted_multiple=$(run_sql_target "SELECT COUNT(*) FROM public.users WHERE id IN (10002, 10003);" | xargs) 170 | log "Count of deleted users (ids 10002, 10003): '$deleted_multiple' (expected '0')" 171 | 172 | local deleted_range=$(run_sql_target "SELECT COUNT(*) FROM public.users WHERE id >= 10004 AND id <= 10005;" | xargs) 173 | log "Count of deleted users (ids 10004-10005): '$deleted_range' (expected '0')" 174 | 175 | local deleted_toasted=$(run_sql_target "SELECT COUNT(*) FROM public.users WHERE id = 10006;" | xargs) 176 | log "Count of deleted user with toasted data (id 10006): '$deleted_toasted' (expected '0')" 177 | 178 | log "Detailed verification:" 179 | 180 | if [ "$updated_data" != "Updated data" ]; then 181 | log "updated_data: '$updated_data' != 'Updated data'" 182 | error "Verification failed: updated_data mismatch" 183 | return 1 184 | fi 185 | 186 | if [ "$null_column" != "NULL" ]; then 187 | log "null_column: '$null_column' != 'NULL'" 188 | error "Verification failed: null_column mismatch" 189 | return 1 190 | fi 191 | 192 | if [ "$mixed_update" != "Updated data | NULL" ]; then 193 | log "mixed_update: '$mixed_update' != 'Updated data | NULL'" 194 | error "Verification failed: mixed_update mismatch" 195 | return 1 196 | fi 197 | 198 | if [ "$toast_length" != "10000" ]; then 199 | log "toast_length: '$toast_length' != '10000'" 200 | error "Verification failed: toast_length mismatch" 201 | return 1 202 | fi 203 | 204 | if [ "$unrelated_column" != "Nullable 5" ]; then 205 | 
log "unrelated_column: '$unrelated_column' != 'Nullable 5'" 206 | error "Verification failed: unrelated_column mismatch" 207 | return 1 208 | fi 209 | 210 | if [ -z "$jsonb_length" ] || [ "$jsonb_length" -le 1000000 ]; then 211 | log "jsonb_length: '$jsonb_length' <= 1000000" 212 | error "Verification failed: jsonb_length mismatch" 213 | return 1 214 | fi 215 | 216 | if [ "$small_text" != "Updated small text" ]; then 217 | log "small_text: '$small_text' != 'Updated small text'" 218 | error "Verification failed: small_text mismatch" 219 | return 1 220 | fi 221 | 222 | if [ "$deleted_single" != "0" ]; then 223 | log "deleted_single: '$deleted_single' != '0'" 224 | error "Verification failed: deleted_single mismatch" 225 | return 1 226 | fi 227 | 228 | if [ "$deleted_multiple" != "0" ]; then 229 | log "deleted_multiple: '$deleted_multiple' != '0'" 230 | error "Verification failed: deleted_multiple mismatch" 231 | return 1 232 | fi 233 | 234 | if [ "$deleted_range" != "0" ]; then 235 | log "deleted_range: '$deleted_range' != '0'" 236 | error "Verification failed: deleted_range mismatch" 237 | return 1 238 | fi 239 | 240 | if [ "$deleted_toasted" != "0" ]; then 241 | log "deleted_toasted: '$deleted_toasted' != '0'" 242 | error "Verification failed: deleted_toasted mismatch" 243 | return 1 244 | fi 245 | 246 | success "All changes verified successfully in target database" 247 | return 0 248 | } 249 | 250 | test_pg_flo_postgres_sink() { 251 | setup_postgres 252 | create_tables 253 | create_config_files 254 | start_pg_flo_replication 255 | sleep 2 256 | start_pg_flo_worker 257 | simulate_changes 258 | 259 | log "Waiting for pg_flo to process changes..." 260 | 261 | stop_pg_flo_gracefully 262 | verify_changes || return 1 263 | } 264 | 265 | # Run the test 266 | log "Starting pg_flo CDC test with PostgreSQL sink..." 267 | if test_pg_flo_postgres_sink; then 268 | success "All tests passed! 🎉" 269 | exit 0 270 | else 271 | error "Some tests failed. Please check the logs." 272 | show_pg_flo_logs 273 | exit 1 274 | fi 275 | -------------------------------------------------------------------------------- /internal/scripts/e2e_stream_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | create_users() { 7 | log "Creating test table..." 8 | run_sql "DROP TABLE IF EXISTS public.users;" 9 | run_sql "CREATE TABLE public.users (id serial PRIMARY KEY, data text, created_at timestamp DEFAULT current_timestamp);" 10 | success "Test table created" 11 | } 12 | 13 | start_pg_flo_replication() { 14 | log "Starting pg_flo replication..." 15 | $pg_flo_BIN replicator \ 16 | --host "$PG_HOST" \ 17 | --port "$PG_PORT" \ 18 | --dbname "$PG_DB" \ 19 | --user "$PG_USER" \ 20 | --password "$PG_PASSWORD" \ 21 | --group "group-2" \ 22 | --tables "users" \ 23 | --schema "public" \ 24 | --nats-url "$NATS_URL" \ 25 | >"$pg_flo_LOG" 2>&1 & 26 | pg_flo_PID=$! 27 | log "pg_flo started with PID: $pg_flo_PID" 28 | success "pg_flo replication started" 29 | } 30 | 31 | start_pg_flo_worker() { 32 | log "Starting pg_flo worker with file sink..." 33 | $pg_flo_BIN worker file \ 34 | --group "group-2" \ 35 | --nats-url "$NATS_URL" \ 36 | --file-output-dir "$OUTPUT_DIR" \ 37 | >"$pg_flo_WORKER_LOG" 2>&1 & 38 | pg_flo_WORKER_PID=$! 39 | log "pg_flo worker started with PID: $pg_flo_WORKER_PID" 40 | success "pg_flo worker started" 41 | } 42 | 43 | simulate_changes() { 44 | log "Simulating changes..." 
45 | local insert_count=1000 46 | local update_count=500 47 | local delete_count=250 48 | 49 | log "Simulating inserts..." 50 | for i in $(seq 1 $insert_count); do 51 | run_sql "INSERT INTO public.users (data) VALUES ('Data $i');" 52 | done 53 | 54 | log "Simulating updates..." 55 | for i in $(seq 1 $update_count); do 56 | run_sql "UPDATE public.users SET data = 'Updated data $i' WHERE id = $i;" 57 | done 58 | 59 | log "Simulating deletes..." 60 | for i in $(seq 1 $delete_count); do 61 | run_sql "DELETE FROM public.users WHERE id = $i;" 62 | done 63 | 64 | success "Changes simulated" 65 | } 66 | 67 | verify_changes() { 68 | log "Verifying changes in ${OUTPUT_DIR}..." 69 | local insert_count=$(jq -s '[.[] | select(.Type == "INSERT")] | length' "$OUTPUT_DIR"/*.jsonl) 70 | local update_count=$(jq -s '[.[] | select(.Type == "UPDATE")] | length' "$OUTPUT_DIR"/*.jsonl) 71 | local delete_count=$(jq -s '[.[] | select(.Type == "DELETE")] | length' "$OUTPUT_DIR"/*.jsonl) 72 | 73 | log "INSERT count: $insert_count (expected 1000)" 74 | log "UPDATE count: $update_count (expected 500)" 75 | log "DELETE count: $delete_count (expected 250)" 76 | 77 | if [ "$insert_count" -eq 1000 ] && [ "$update_count" -eq 500 ] && [ "$delete_count" -eq 250 ]; then 78 | success "Change counts match expected values" 79 | return 0 80 | else 81 | error "Change counts do not match expected values" 82 | return 1 83 | fi 84 | } 85 | 86 | # Main test function 87 | test_pg_flo_cdc() { 88 | setup_postgres 89 | create_users 90 | start_pg_flo_replication 91 | start_pg_flo_worker 92 | log "Waiting for replicator to initialize..." 93 | sleep 2 94 | simulate_changes 95 | 96 | log "Waiting for pg_flo to process changes..." 97 | sleep 2 98 | 99 | stop_pg_flo_gracefully 100 | verify_changes || return 1 101 | } 102 | 103 | # Run the test 104 | log "Starting pg_flo CDC test with changes..." 105 | if test_pg_flo_cdc; then 106 | success "All tests passed! 🎉" 107 | exit 0 108 | else 109 | error "Some tests failed. Please check the logs." 110 | show_pg_flo_logs 111 | exit 1 112 | fi 113 | -------------------------------------------------------------------------------- /internal/scripts/e2e_test_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | setup_docker() { 7 | pkill -9 "pg_flo" || true 8 | rm -Rf /tmp/pg* 9 | log "Setting up Docker environment..." 10 | docker compose -f internal/docker-compose.yml down -v 11 | docker compose -f internal/docker-compose.yml up -d 12 | success "Docker environment is set up" 13 | } 14 | 15 | cleanup_data() { 16 | log "Cleaning up data..." 17 | run_sql "DROP TABLE IF EXISTS public.users;" 18 | run_sql "DROP SCHEMA IF EXISTS internal_pg_flo CASCADE;" 19 | rm -rf /tmp/pg_flo-output 20 | rm -f /tmp/pg_flo.log 21 | success "Data cleanup complete" 22 | } 23 | 24 | cleanup() { 25 | log "Cleaning up..." 26 | docker compose down -v 27 | success "Cleanup complete" 28 | } 29 | 30 | trap cleanup EXIT 31 | 32 | make build 33 | 34 | setup_docker 35 | 36 | log "Running e2e ddl tests..." 
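# Note: the log message above says "ddl tests", but this block actually runs the resume
# test (e2e_resume_test.rb) with CI=false.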
37 | if CI=false ruby ./internal/scripts/e2e_resume_test.rb; then 38 | success "e2e ddl tests completed successfully" 39 | else 40 | error "Original e2e tests failed" 41 | exit 1 42 | fi 43 | -------------------------------------------------------------------------------- /internal/scripts/e2e_transform_filter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | create_users() { 7 | log "Creating test table..." 8 | run_sql "DROP TABLE IF EXISTS public.users;" 9 | run_sql "CREATE TABLE public.users ( 10 | id serial PRIMARY KEY, 11 | email text, 12 | phone text, 13 | age int, 14 | ssn text, 15 | created_at timestamp DEFAULT current_timestamp 16 | );" 17 | success "Test table created" 18 | } 19 | 20 | start_pg_flo_replication() { 21 | log "Starting pg_flo replication..." 22 | if [ -f "$pg_flo_LOG" ]; then 23 | mv "$pg_flo_LOG" "${pg_flo_LOG}.bak" 24 | log "Backed up previous replicator log to ${pg_flo_LOG}.bak" 25 | fi 26 | $pg_flo_BIN replicator \ 27 | --host "$PG_HOST" \ 28 | --port "$PG_PORT" \ 29 | --dbname "$PG_DB" \ 30 | --user "$PG_USER" \ 31 | --password "$PG_PASSWORD" \ 32 | --group "group_transform_filter" \ 33 | --tables "users" \ 34 | --schema "public" \ 35 | --nats-url "$NATS_URL" \ 36 | >"$pg_flo_LOG" 2>&1 & 37 | pg_flo_PID=$! 38 | log "pg_flo replicator started with PID: $pg_flo_PID" 39 | success "pg_flo replication started" 40 | } 41 | 42 | start_pg_flo_worker() { 43 | log "Starting pg_flo worker with file sink..." 44 | if [ -f "$pg_flo_WORKER_LOG" ]; then 45 | mv "$pg_flo_WORKER_LOG" "${pg_flo_WORKER_LOG}.bak" 46 | log "Backed up previous worker log to ${pg_flo_WORKER_LOG}.bak" 47 | fi 48 | $pg_flo_BIN worker file \ 49 | --group "group_transform_filter" \ 50 | --nats-url "$NATS_URL" \ 51 | --file-output-dir "$OUTPUT_DIR" \ 52 | --rules-config "$(dirname "$0")/rules.yml" \ 53 | >"$pg_flo_WORKER_LOG" 2>&1 & 54 | pg_flo_WORKER_PID=$! 55 | log "pg_flo worker started with PID: $pg_flo_WORKER_PID" 56 | success "pg_flo worker started" 57 | } 58 | 59 | simulate_changes() { 60 | log "Simulating changes..." 61 | run_sql "INSERT INTO public.users (email, phone, age, ssn) VALUES 62 | ('john@example.com', '1234567890', 25, '123-45-6789'), 63 | ('jane@example.com', '9876543210', 17, '987-65-4321'), 64 | ('bob@example.com', '5551234567', 30, '555-12-3456');" 65 | 66 | run_sql "UPDATE public.users SET email = 'updated@example.com', phone = '1112223333' WHERE id = 1;" 67 | run_sql "DELETE FROM public.users WHERE age = 30;" 68 | run_sql "DELETE FROM public.users WHERE age = 17;" 69 | 70 | success "Changes simulated" 71 | } 72 | 73 | verify_changes() { 74 | log "Verifying changes..." 
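# Expected counts per rules.yml: 2 INSERTs (the age-17 row is dropped by the age >= 18 filter),
# 1 UPDATE, and 2 DELETEs. Both DELETEs are expected to pass because the rules set
# allow_empty_deletes, which lets DELETE events through even when the filtered column is
# absent from the old tuple.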
75 | local insert_count=$(jq -s '[.[] | select(.Type == "INSERT")] | length' "$OUTPUT_DIR"/*.jsonl) 76 | local update_count=$(jq -s '[.[] | select(.Type == "UPDATE")] | length' "$OUTPUT_DIR"/*.jsonl) 77 | local delete_count=$(jq -s '[.[] | select(.Type == "DELETE")] | length' "$OUTPUT_DIR"/*.jsonl) 78 | 79 | log "INSERT count: $insert_count (expected 2)" 80 | log "UPDATE count: $update_count (expected 1)" 81 | log "DELETE count: $delete_count (expected 2)" 82 | 83 | if [ "$insert_count" -eq 2 ] && [ "$update_count" -eq 1 ] && [ "$delete_count" -eq 2 ]; then 84 | success "Change counts match expected values" 85 | else 86 | error "Change counts do not match expected values" 87 | return 1 88 | fi 89 | 90 | # Verify transformations and filters 91 | local masked_email=$(jq -r 'select(.Type == "INSERT" and .NewTuple.id == 1) | .NewTuple.email' "$OUTPUT_DIR"/*.jsonl) 92 | local formatted_phone=$(jq -r 'select(.Type == "INSERT" and .NewTuple.id == 1) | .NewTuple.phone' "$OUTPUT_DIR"/*.jsonl) 93 | local filtered_insert=$(jq -r 'select(.Type == "INSERT" and .NewTuple.id == 2) | .NewTuple.id' "$OUTPUT_DIR"/*.jsonl) 94 | local updated_email=$(jq -r 'select(.Type == "UPDATE") | .NewTuple.email' "$OUTPUT_DIR"/*.jsonl) 95 | local masked_ssn=$(jq -r 'select(.Type == "INSERT" and .NewTuple.id == 1) | .NewTuple.ssn' "$OUTPUT_DIR"/*.jsonl) 96 | local filtered_age=$(jq -r 'select(.Type == "INSERT" and .NewTuple.id == 2) | .NewTuple.age' "$OUTPUT_DIR"/*.jsonl) 97 | 98 | if [[ "$masked_email" == "j**************m" ]] && 99 | [[ "$formatted_phone" == "(123) 456-7890" ]] && 100 | [[ -z "$filtered_insert" ]] && 101 | [[ "$updated_email" == "u*****************m" ]] && 102 | [[ "$masked_ssn" == "1XXXXXXXXX9" ]] && 103 | [[ -z "$filtered_age" ]]; then 104 | success "Transformations and filters applied correctly" 105 | else 106 | error "Transformations or filters not applied correctly" 107 | log "Masked email: $masked_email" 108 | log "Formatted phone: $formatted_phone" 109 | log "Filtered insert: $filtered_insert" 110 | log "Updated email: $updated_email" 111 | log "Masked SSN: $masked_ssn" 112 | log "Filtered age: $filtered_age" 113 | return 1 114 | fi 115 | } 116 | 117 | test_pg_flo_transform_filter() { 118 | setup_postgres 119 | create_users 120 | start_pg_flo_replication 121 | start_pg_flo_worker 122 | sleep 2 123 | simulate_changes 124 | 125 | log "Waiting for pg_flo to process changes..." 126 | 127 | stop_pg_flo_gracefully 128 | verify_changes || return 1 129 | } 130 | 131 | log "Starting pg_flo CDC test with transformations and filters..." 132 | if test_pg_flo_transform_filter; then 133 | success "All tests passed! 🎉" 134 | exit 0 135 | else 136 | error "Some tests failed. Please check the logs." 
137 | show_pg_flo_logs 138 | exit 1 139 | fi 140 | -------------------------------------------------------------------------------- /internal/scripts/multi_tenant_rules.yml: -------------------------------------------------------------------------------- 1 | tables: 2 | events: 3 | - type: filter 4 | column: tenant_id 5 | parameters: 6 | operator: "eq" 7 | value: 3 8 | operations: [INSERT, UPDATE, DELETE] 9 | -------------------------------------------------------------------------------- /internal/scripts/rules.yml: -------------------------------------------------------------------------------- 1 | tables: 2 | users: 3 | - type: transform 4 | column: email 5 | parameters: 6 | type: mask 7 | mask_char: "*" 8 | allow_empty_deletes: true 9 | operations: [INSERT, UPDATE, DELETE] 10 | - type: transform 11 | column: phone 12 | parameters: 13 | type: regex 14 | pattern: "^(\\d{3})(\\d{3})(\\d{4})$" 15 | replace: "($1) $2-$3" 16 | allow_empty_deletes: true 17 | operations: [INSERT, UPDATE, DELETE] 18 | - type: filter 19 | column: age 20 | parameters: 21 | operator: "gte" 22 | value: 18 23 | allow_empty_deletes: true 24 | operations: [INSERT, UPDATE, DELETE] 25 | - type: transform 26 | column: ssn 27 | parameters: 28 | type: mask 29 | mask_char: "X" 30 | allow_empty_deletes: true 31 | operations: [INSERT, UPDATE, DELETE] 32 | -------------------------------------------------------------------------------- /internal/scripts/webhook_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | source "$(dirname "$0")/e2e_common.sh" 5 | 6 | WEBHOOK_URL="https://deep-article-49.webhook.cool" 7 | 8 | create_users() { 9 | log "Creating initial test table..." 10 | run_sql "DROP TABLE IF EXISTS public.users;" 11 | run_sql "CREATE TABLE public.users (id serial PRIMARY KEY, data text);" 12 | success "Initial test table created" 13 | } 14 | 15 | start_pg_flo_replication() { 16 | log "Starting pg_flo replication..." 17 | $pg_flo_BIN stream webhook \ 18 | --host "$PG_HOST" \ 19 | --port "$PG_PORT" \ 20 | --dbname "$PG_DB" \ 21 | --user "$PG_USER" \ 22 | --password "$PG_PASSWORD" \ 23 | --group "group-webhook" \ 24 | --tables "users" \ 25 | --schema "public" \ 26 | --status-dir "/tmp" \ 27 | --webhook-url "$WEBHOOK_URL" \ 28 | --track-ddl >"$pg_flo_LOG" 2>&1 & 29 | pg_flo_PID=$! 30 | log "pg_flo started with PID: $pg_flo_PID" 31 | success "pg_flo replication started" 32 | } 33 | 34 | simulate_changes() { 35 | log "Simulating changes..." 36 | local insert_count=10 37 | local update_count=5 38 | local delete_count=3 39 | 40 | for i in $(seq 1 $insert_count); do 41 | run_sql "INSERT INTO public.users (data) VALUES ('Data $i');" 42 | done 43 | 44 | for i in $(seq 1 $update_count); do 45 | run_sql "UPDATE public.users SET data = 'Updated data $i' WHERE id = $i;" 46 | done 47 | 48 | for i in $(seq 1 $delete_count); do 49 | run_sql "DELETE FROM public.users WHERE id = $i;" 50 | done 51 | 52 | success "Changes simulated" 53 | } 54 | 55 | perform_ddl_operations() { 56 | log "Performing DDL operations..." 
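# These statements are emitted with --track-ddl enabled on the replicator; they reference
# the unqualified table name "users", which presumably resolves to public.users via the
# default search_path.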
57 | run_sql "ALTER TABLE users ADD COLUMN new_column int;" 58 | run_sql "CREATE INDEX CONCURRENTLY idx_users_data ON users (data);" 59 | run_sql "ALTER TABLE users RENAME COLUMN data TO old_data;" 60 | run_sql "DROP INDEX idx_users_data;" 61 | run_sql "ALTER TABLE users ADD COLUMN new_column_one int;" 62 | run_sql "ALTER TABLE users ALTER COLUMN old_data TYPE varchar(255);" 63 | success "DDL operations performed" 64 | } 65 | 66 | test_pg_flo_webhook() { 67 | setup_docker 68 | setup_postgres 69 | create_users 70 | start_pg_flo_replication 71 | sleep 2 72 | simulate_changes 73 | perform_ddl_operations 74 | 75 | log "Waiting for pg_flo to process changes..." 76 | sleep 10 77 | 78 | stop_pg_flo_gracefully 79 | log "Test completed. Please check https://webhook.site/#!/f5a9abdb-c779-44a2-98ce-0760b4a2fc5c for received events." 80 | } 81 | 82 | # Run the test 83 | log "Starting pg_flo CDC test with webhook sink..." 84 | if test_pg_flo_webhook; then 85 | success "Test completed successfully. Please verify the results on webhook.site" 86 | exit 0 87 | else 88 | error "Test failed. Please check the logs." 89 | show_pg_flo_logs 90 | exit 1 91 | fi 92 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/pgflo/pg_flo/cmd" 8 | ) 9 | 10 | func main() { 11 | if err := cmd.Execute(); err != nil { 12 | fmt.Println(err) 13 | os.Exit(1) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /pkg/pgflonats/pgflonats.go: -------------------------------------------------------------------------------- 1 | package pgflonats 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "time" 9 | 10 | "github.com/jackc/pglogrepl" 11 | "github.com/nats-io/nats.go" 12 | ) 13 | 14 | const ( 15 | defaultNATSURL = "nats://localhost:4222" 16 | envNATSURL = "PG_FLO_NATS_URL" 17 | ) 18 | 19 | // NATSClient represents a client for interacting with NATS 20 | type NATSClient struct { 21 | conn *nats.Conn 22 | js nats.JetStreamContext 23 | stream string 24 | stateBucket string 25 | } 26 | 27 | // State represents the current state of the replication process 28 | type State struct { 29 | LSN pglogrepl.LSN `json:"lsn"` 30 | LastProcessedSeq map[string]uint64 31 | } 32 | 33 | // NewNATSClient creates a new NATS client with the specified configuration, setting up the connection, main stream, and state bucket. 
34 | func NewNATSClient(url, stream, group string) (*NATSClient, error) { 35 | if url == "" { 36 | url = os.Getenv(envNATSURL) 37 | if url == "" { 38 | url = defaultNATSURL 39 | } 40 | } 41 | 42 | if stream == "" { 43 | stream = fmt.Sprintf("pgflo_%s_stream", group) 44 | } 45 | 46 | nc, err := nats.Connect(url, 47 | nats.RetryOnFailedConnect(true), 48 | nats.MaxReconnects(-1), 49 | nats.ReconnectWait(time.Second), 50 | nats.DisconnectErrHandler(func(_ *nats.Conn, err error) { 51 | fmt.Printf("Disconnected due to: %s, will attempt reconnects\n", err) 52 | }), 53 | nats.ReconnectHandler(func(nc *nats.Conn) { 54 | fmt.Printf("Reconnected [%s]\n", nc.ConnectedUrl()) 55 | }), 56 | nats.ClosedHandler(func(nc *nats.Conn) { 57 | fmt.Printf("Exiting: %v\n", nc.LastError()) 58 | }), 59 | ) 60 | if err != nil { 61 | return nil, fmt.Errorf("failed to connect to NATS: %w", err) 62 | } 63 | 64 | js, err := nc.JetStream() 65 | if err != nil { 66 | return nil, fmt.Errorf("failed to create JetStream context: %w", err) 67 | } 68 | 69 | // Create the main stream 70 | streamConfig := &nats.StreamConfig{ 71 | Name: stream, 72 | Subjects: []string{fmt.Sprintf("pgflo.%s", group)}, 73 | Storage: nats.FileStorage, 74 | Retention: nats.LimitsPolicy, 75 | MaxAge: 24 * time.Hour, 76 | } 77 | _, err = js.AddStream(streamConfig) 78 | if err != nil && !errors.Is(err, nats.ErrStreamNameAlreadyInUse) { 79 | return nil, fmt.Errorf("failed to create main stream: %w", err) 80 | } 81 | 82 | // Create the state bucket 83 | stateBucket := fmt.Sprintf("pg_flo_state_%s", group) 84 | _, kvErr := js.KeyValue(stateBucket) 85 | if kvErr != nil { 86 | if errors.Is(kvErr, nats.ErrBucketNotFound) { 87 | _, err = js.CreateKeyValue(&nats.KeyValueConfig{ 88 | Bucket: stateBucket, 89 | }) 90 | if err != nil { 91 | return nil, fmt.Errorf("failed to create state bucket: %w", err) 92 | } 93 | } else { 94 | return nil, fmt.Errorf("failed to access state bucket: %w", kvErr) 95 | } 96 | } 97 | 98 | return &NATSClient{ 99 | conn: nc, 100 | js: js, 101 | stream: stream, 102 | stateBucket: stateBucket, 103 | }, nil 104 | } 105 | 106 | // PublishMessage publishes a message to the specified NATS subject. 107 | func (nc *NATSClient) PublishMessage(subject string, data []byte) error { 108 | _, err := nc.js.Publish(subject, data) 109 | if err != nil { 110 | return fmt.Errorf("failed to publish message: %w", err) 111 | } 112 | return nil 113 | } 114 | 115 | // Close closes the NATS connection. 116 | func (nc *NATSClient) Close() error { 117 | nc.conn.Close() 118 | return nil 119 | } 120 | 121 | // SaveState saves the current replication state to NATS. 122 | func (nc *NATSClient) SaveState(state State) error { 123 | kv, err := nc.js.KeyValue(nc.stateBucket) 124 | if err != nil { 125 | return fmt.Errorf("failed to get KV bucket: %v", err) 126 | } 127 | 128 | data, err := json.Marshal(state) 129 | if err != nil { 130 | return fmt.Errorf("failed to marshal state: %v", err) 131 | } 132 | 133 | _, err = kv.Put("state", data) 134 | if err != nil { 135 | return fmt.Errorf("failed to save state: %v", err) 136 | } 137 | 138 | return nil 139 | } 140 | 141 | // GetState retrieves the last saved state from NATS, initializing a new state if none is found. 
142 | func (nc *NATSClient) GetState() (State, error) { 143 | kv, err := nc.js.KeyValue(nc.stateBucket) 144 | if err != nil { 145 | return State{}, fmt.Errorf("failed to get KV bucket: %v", err) 146 | } 147 | 148 | entry, err := kv.Get("state") 149 | if err != nil { 150 | if errors.Is(err, nats.ErrKeyNotFound) { 151 | initialState := State{LastProcessedSeq: make(map[string]uint64)} 152 | // Try to create initial state 153 | if err := nc.SaveState(initialState); err != nil { 154 | // If SaveState fails because the key already exists, fetch it again 155 | if errors.Is(err, nats.ErrKeyExists) || errors.Is(err, nats.ErrUpdateMetaDeleted) { 156 | entry, err = kv.Get("state") 157 | if err != nil { 158 | return State{}, fmt.Errorf("failed to get state after conflict: %v", err) 159 | } 160 | if err := json.Unmarshal(entry.Value(), &initialState); err != nil { 161 | return State{}, fmt.Errorf("failed to unmarshal state after conflict: %v", err) 162 | } 163 | return initialState, nil 164 | } 165 | return State{}, fmt.Errorf("failed to save initial state: %v", err) 166 | } 167 | return initialState, nil 168 | } 169 | return State{}, fmt.Errorf("failed to get state: %v", err) 170 | } 171 | 172 | var state State 173 | if err := json.Unmarshal(entry.Value(), &state); err != nil { 174 | return State{}, fmt.Errorf("failed to unmarshal state: %v", err) 175 | } 176 | 177 | if state.LastProcessedSeq == nil { 178 | state.LastProcessedSeq = make(map[string]uint64) 179 | } 180 | return state, nil 181 | } 182 | 183 | // JetStream returns the JetStream context. 184 | func (nc *NATSClient) JetStream() nats.JetStreamContext { 185 | return nc.js 186 | } 187 | -------------------------------------------------------------------------------- /pkg/replicator/buffer.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | ) 7 | 8 | // Buffer is a structure that holds data to be flushed periodically or when certain conditions are met 9 | type Buffer struct { 10 | data []interface{} 11 | maxRows int 12 | flushTimeout time.Duration 13 | lastFlush time.Time 14 | mutex sync.Mutex 15 | } 16 | 17 | // NewBuffer creates a new Buffer instance 18 | func NewBuffer(maxRows int, flushTimeout time.Duration) *Buffer { 19 | return &Buffer{ 20 | data: make([]interface{}, 0, maxRows), 21 | maxRows: maxRows, 22 | flushTimeout: flushTimeout, 23 | lastFlush: time.Now(), 24 | } 25 | } 26 | 27 | // Add adds an item to the buffer and returns true if the buffer should be flushed 28 | func (b *Buffer) Add(item interface{}) bool { 29 | b.mutex.Lock() 30 | defer b.mutex.Unlock() 31 | 32 | b.data = append(b.data, item) 33 | 34 | return b.shouldFlush() 35 | } 36 | 37 | // shouldFlush checks if the buffer should be flushed based on row count, or timeout 38 | func (b *Buffer) shouldFlush() bool { 39 | return len(b.data) >= b.maxRows || time.Since(b.lastFlush) >= b.flushTimeout 40 | } 41 | 42 | // Flush flushes the buffer and returns the data 43 | func (b *Buffer) Flush() []interface{} { 44 | b.mutex.Lock() 45 | defer b.mutex.Unlock() 46 | 47 | if len(b.data) == 0 { 48 | return nil 49 | } 50 | 51 | data := b.data 52 | b.data = make([]interface{}, 0, b.maxRows) 53 | b.lastFlush = time.Now() 54 | 55 | return data 56 | } 57 | -------------------------------------------------------------------------------- /pkg/replicator/config.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import "fmt" 4 | 5 | 
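// Illustrative example (hypothetical values): a Config with User "repl", Password "secret",
// Host "localhost", Port 5432, and Database "mydb" yields the connection string
// "postgres://repl:secret@localhost:5432/mydb" from ConnectionString below.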
// Config holds the configuration for the replicator 6 | type Config struct { 7 | Host string 8 | Port uint16 9 | Database string 10 | User string 11 | Password string 12 | Group string 13 | Schema string 14 | Tables []string 15 | TrackDDL bool 16 | } 17 | 18 | // ConnectionString generates and returns a PostgreSQL connection string 19 | func (c Config) ConnectionString() string { 20 | return fmt.Sprintf("postgres://%s:%s@%s:%d/%s", c.User, c.Password, c.Host, c.Port, c.Database) 21 | } 22 | -------------------------------------------------------------------------------- /pkg/replicator/ddl_replicator.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | "strings" 8 | "time" 9 | 10 | "github.com/jackc/pglogrepl" 11 | "github.com/jackc/pgtype" 12 | "github.com/pgflo/pg_flo/pkg/utils" 13 | ) 14 | 15 | type DDLReplicator struct { 16 | DDLConn StandardConnection 17 | BaseRepl *BaseReplicator 18 | Config Config 19 | } 20 | 21 | // NewDDLReplicator creates a new DDLReplicator instance 22 | func NewDDLReplicator(config Config, BaseRepl *BaseReplicator, ddlConn StandardConnection) (*DDLReplicator, error) { 23 | return &DDLReplicator{ 24 | Config: config, 25 | BaseRepl: BaseRepl, 26 | DDLConn: ddlConn, 27 | }, nil 28 | } 29 | 30 | // SetupDDLTracking sets up the necessary schema, table, and triggers for DDL tracking 31 | func (d *DDLReplicator) SetupDDLTracking(ctx context.Context) error { 32 | tables, err := d.BaseRepl.GetConfiguredTables(ctx) 33 | if err != nil { 34 | return fmt.Errorf("failed to get configured tables: %w", err) 35 | } 36 | 37 | tableConditions := make([]string, len(tables)) 38 | for i, table := range tables { 39 | parts := strings.Split(table, ".") 40 | if len(parts) != 2 { 41 | return fmt.Errorf("invalid table name format: %s", table) 42 | } 43 | tableConditions[i] = fmt.Sprintf("(nspname = '%s' AND relname = '%s')", 44 | parts[0], parts[1]) 45 | } 46 | tableFilter := strings.Join(tableConditions, " OR ") 47 | 48 | _, err = d.DDLConn.Exec(ctx, fmt.Sprintf(` 49 | CREATE SCHEMA IF NOT EXISTS internal_pg_flo; 50 | 51 | CREATE TABLE IF NOT EXISTS internal_pg_flo.ddl_log ( 52 | id SERIAL PRIMARY KEY, 53 | event_type TEXT NOT NULL, 54 | object_type TEXT, 55 | object_identity TEXT, 56 | table_name TEXT, 57 | ddl_command TEXT NOT NULL, 58 | created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP 59 | ); 60 | 61 | CREATE OR REPLACE FUNCTION internal_pg_flo.ddl_trigger() RETURNS event_trigger AS $$ 62 | DECLARE 63 | obj record; 64 | ddl_command text; 65 | table_name text; 66 | should_track boolean; 67 | BEGIN 68 | SELECT current_query() INTO ddl_command; 69 | 70 | IF TG_EVENT = 'ddl_command_end' THEN 71 | FOR obj IN SELECT * FROM pg_event_trigger_ddl_commands() 72 | LOOP 73 | should_track := false; 74 | -- Extract table name if object type is table or index 75 | IF obj.object_type IN ('table', 'table column') THEN 76 | SELECT nspname || '.' || relname, (%s) 77 | INTO table_name, should_track 78 | FROM pg_class c 79 | JOIN pg_namespace n ON c.relnamespace = n.oid 80 | WHERE c.oid = obj.objid; 81 | ELSIF obj.object_type = 'index' THEN 82 | WITH target_table AS ( 83 | SELECT t.oid as table_oid, n.nspname, t.relname 84 | FROM pg_index i 85 | JOIN pg_class t ON t.oid = i.indrelid 86 | JOIN pg_namespace n ON t.relnamespace = n.oid 87 | WHERE i.indexrelid = obj.objid 88 | ) 89 | SELECT nspname || '.' 
|| relname, (%s) 90 | INTO table_name, should_track 91 | FROM target_table; 92 | END IF; 93 | 94 | IF should_track THEN 95 | INSERT INTO internal_pg_flo.ddl_log (event_type, object_type, object_identity, table_name, ddl_command) 96 | VALUES (TG_EVENT, obj.object_type, obj.object_identity, table_name, ddl_command); 97 | END IF; 98 | END LOOP; 99 | END IF; 100 | END; 101 | $$ LANGUAGE plpgsql; 102 | 103 | DROP EVENT TRIGGER IF EXISTS pg_flo_ddl_trigger; 104 | CREATE EVENT TRIGGER pg_flo_ddl_trigger ON ddl_command_end 105 | EXECUTE FUNCTION internal_pg_flo.ddl_trigger(); 106 | `, tableFilter, tableFilter)) 107 | 108 | if err != nil { 109 | d.BaseRepl.Logger.Error().Err(err).Msg("Failed to setup DDL tracking") 110 | return err 111 | } 112 | return nil 113 | } 114 | 115 | // StartDDLReplication starts the DDL replication process 116 | func (d *DDLReplicator) StartDDLReplication(ctx context.Context) { 117 | ticker := time.NewTicker(1 * time.Second) 118 | defer ticker.Stop() 119 | 120 | for { 121 | select { 122 | case <-ctx.Done(): 123 | d.BaseRepl.Logger.Info().Msg("DDL replication stopping...") 124 | return 125 | case <-ticker.C: 126 | if err := d.ProcessDDLEvents(ctx); err != nil { 127 | if ctx.Err() != nil { 128 | // Context canceled, exit gracefully 129 | return 130 | } 131 | d.BaseRepl.Logger.Error().Err(err).Msg("Failed to process DDL events") 132 | } 133 | } 134 | } 135 | } 136 | 137 | // ProcessDDLEvents processes DDL events from the log table 138 | func (d *DDLReplicator) ProcessDDLEvents(ctx context.Context) error { 139 | rows, err := d.DDLConn.Query(ctx, ` 140 | SELECT id, event_type, object_type, object_identity, table_name, ddl_command, created_at 141 | FROM internal_pg_flo.ddl_log 142 | ORDER BY created_at ASC 143 | `) 144 | if err != nil { 145 | d.BaseRepl.Logger.Error().Err(err).Msg("Failed to query DDL log") 146 | return nil 147 | } 148 | defer rows.Close() 149 | 150 | var processedIDs []int 151 | seenCommands := make(map[string]bool) 152 | 153 | for rows.Next() { 154 | var id int 155 | var eventType, objectType, objectIdentity, ddlCommand string 156 | var tableName sql.NullString 157 | var createdAt time.Time 158 | if err := rows.Scan(&id, &eventType, &objectType, &objectIdentity, &tableName, &ddlCommand, &createdAt); err != nil { 159 | d.BaseRepl.Logger.Error().Err(err).Msg("Failed to scan DDL log row") 160 | return nil 161 | } 162 | 163 | if d.shouldSkipDDLEvent(ddlCommand) { 164 | processedIDs = append(processedIDs, id) 165 | continue 166 | } 167 | 168 | if seenCommands[ddlCommand] { 169 | processedIDs = append(processedIDs, id) 170 | continue 171 | } 172 | seenCommands[ddlCommand] = true 173 | 174 | var schema, table string 175 | if tableName.Valid { 176 | schema, table = splitSchemaAndTable(tableName.String) 177 | } else { 178 | schema, table = "public", "" 179 | } 180 | 181 | cdcMessage := utils.CDCMessage{ 182 | Type: utils.OperationDDL, 183 | Schema: schema, 184 | Table: table, 185 | EmittedAt: time.Now(), 186 | Columns: []*pglogrepl.RelationMessageColumn{ 187 | {Name: "event_type", DataType: pgtype.TextOID}, 188 | {Name: "object_type", DataType: pgtype.TextOID}, 189 | {Name: "object_identity", DataType: pgtype.TextOID}, 190 | {Name: "ddl_command", DataType: pgtype.TextOID}, 191 | {Name: "created_at", DataType: pgtype.TimestamptzOID}, 192 | }, 193 | NewTuple: &pglogrepl.TupleData{ 194 | Columns: []*pglogrepl.TupleDataColumn{ 195 | {Data: []byte(eventType)}, 196 | {Data: []byte(objectType)}, 197 | {Data: []byte(objectIdentity)}, 198 | {Data: []byte(ddlCommand)}, 199 | 
{Data: []byte(createdAt.Format(time.RFC3339))}, 200 | }, 201 | }, 202 | } 203 | 204 | if err := d.BaseRepl.PublishToNATS(cdcMessage); err != nil { 205 | d.BaseRepl.Logger.Error().Err(err).Msg("Error during publishing DDL event to NATS") 206 | return nil 207 | } 208 | 209 | processedIDs = append(processedIDs, id) 210 | } 211 | 212 | if err := rows.Err(); err != nil { 213 | d.BaseRepl.Logger.Error().Err(err).Msg("Error during DDL log iteration") 214 | return nil 215 | } 216 | 217 | if len(processedIDs) > 0 { 218 | _, err = d.DDLConn.Exec(ctx, "DELETE FROM internal_pg_flo.ddl_log WHERE id = ANY($1)", processedIDs) 219 | if err != nil { 220 | d.BaseRepl.Logger.Error().Err(err).Msg("Failed to clear processed DDL events") 221 | return nil 222 | } 223 | } 224 | 225 | return nil 226 | } 227 | 228 | // splitSchemaAndTable splits a full table name into schema and table parts 229 | func splitSchemaAndTable(fullName string) (string, string) { 230 | parts := strings.SplitN(fullName, ".", 2) 231 | if len(parts) == 2 { 232 | return parts[0], parts[1] 233 | } 234 | return "public", fullName 235 | } 236 | 237 | // Close closes the DDL connection 238 | func (d *DDLReplicator) Close(ctx context.Context) error { 239 | if d.DDLConn != nil { 240 | return d.DDLConn.Close(ctx) 241 | } 242 | return nil 243 | } 244 | 245 | // Shutdown performs a graceful shutdown of the DDL replicator 246 | func (d *DDLReplicator) Shutdown(ctx context.Context) error { 247 | d.BaseRepl.Logger.Info().Msg("Shutting down DDL replicator") 248 | 249 | // Process remaining events with the provided context 250 | if err := d.ProcessDDLEvents(ctx); err != nil { 251 | d.BaseRepl.Logger.Error().Err(err).Msg("Failed to process final DDL events") 252 | // Continue with shutdown even if processing fails 253 | } 254 | 255 | // Wait for any pending events with respect to context deadline 256 | ticker := time.NewTicker(100 * time.Millisecond) 257 | defer ticker.Stop() 258 | 259 | for { 260 | select { 261 | case <-ctx.Done(): 262 | d.BaseRepl.Logger.Warn().Msg("Context deadline exceeded while waiting for DDL events") 263 | return ctx.Err() 264 | case <-ticker.C: 265 | hasEvents, err := d.HasPendingDDLEvents(ctx) 266 | if err != nil { 267 | d.BaseRepl.Logger.Error().Err(err).Msg("Failed to check pending DDL events") 268 | return err 269 | } 270 | if !hasEvents { 271 | d.BaseRepl.Logger.Info().Msg("All DDL events processed") 272 | return d.Close(ctx) 273 | } 274 | } 275 | } 276 | } 277 | 278 | // HasPendingDDLEvents checks if there are pending DDL events in the log 279 | func (d *DDLReplicator) HasPendingDDLEvents(ctx context.Context) (bool, error) { 280 | var count int 281 | err := d.DDLConn.QueryRow(ctx, ` 282 | SELECT COUNT(*) FROM internal_pg_flo.ddl_log 283 | `).Scan(&count) 284 | if err != nil { 285 | return false, err 286 | } 287 | return count > 0, nil 288 | } 289 | 290 | // shouldSkipDDLEvent determines if a DDL event should be skipped from processing 291 | func (d *DDLReplicator) shouldSkipDDLEvent(ddlCommand string) bool { 292 | if strings.Contains(ddlCommand, "internal_pg_flo.") { 293 | return true 294 | } 295 | 296 | publicationName := GeneratePublicationName(d.Config.Group) 297 | if strings.Contains(ddlCommand, fmt.Sprintf("CREATE PUBLICATION %q", publicationName)) || 298 | strings.Contains(ddlCommand, fmt.Sprintf("DROP PUBLICATION %q", publicationName)) || 299 | strings.Contains(ddlCommand, "CREATE PUBLICATION pg_flo_") || 300 | strings.Contains(ddlCommand, "DROP PUBLICATION pg_flo_") { 301 | return true 302 | } 303 | 304 | return false 
305 | } 306 | -------------------------------------------------------------------------------- /pkg/replicator/errors.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | var ( 9 | ErrReplicatorAlreadyStarted = errors.New("replicator already started") 10 | ErrReplicatorNotStarted = errors.New("replicator not started") 11 | ErrReplicatorAlreadyStopped = errors.New("replicator already stopped") 12 | ) 13 | 14 | // ReplicationError represents an error that occurred during replication. 15 | type ReplicationError struct { 16 | Op string // The operation that caused the error 17 | Err error // The underlying error 18 | } 19 | 20 | // Error returns a formatted error message. 21 | func (e *ReplicationError) Error() string { 22 | return fmt.Sprintf("replication error during %s: %v", e.Op, e.Err) 23 | } 24 | -------------------------------------------------------------------------------- /pkg/replicator/factory.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | ) 7 | 8 | // ReplicatorFactory defines the interface for creating replicators 9 | type Factory interface { 10 | CreateReplicator(config Config, natsClient NATSClient) (Replicator, error) 11 | } 12 | 13 | // BaseFactory provides common functionality for factories 14 | type BaseFactory struct{} 15 | 16 | // CreateConnections creates replication and standard connections 17 | func (f *BaseFactory) CreateConnections(config Config) (ReplicationConnection, StandardConnection, error) { 18 | replicationConn := NewReplicationConnection(config) 19 | if err := replicationConn.Connect(context.Background()); err != nil { 20 | return nil, nil, fmt.Errorf("failed to connect for replication: %v", err) 21 | } 22 | 23 | standardConn, err := NewStandardConnection(config) 24 | if err != nil { 25 | return nil, nil, fmt.Errorf("failed to create standard connection: %v", err) 26 | } 27 | 28 | return replicationConn, standardConn, nil 29 | } 30 | 31 | // StreamReplicatorFactory creates `StreamReplicator` instances 32 | type StreamReplicatorFactory struct { 33 | BaseFactory 34 | } 35 | 36 | // CreateReplicator creates a new `StreamReplicator` 37 | func (f *StreamReplicatorFactory) CreateReplicator(config Config, natsClient NATSClient) (Replicator, error) { 38 | replicationConn, standardConn, err := f.CreateConnections(config) 39 | if err != nil { 40 | return nil, err 41 | } 42 | 43 | baseReplicator := NewBaseReplicator(config, replicationConn, standardConn, natsClient) 44 | return &StreamReplicator{BaseReplicator: baseReplicator}, nil 45 | } 46 | 47 | // CopyAndStreamReplicatorFactory creates `CopyAndStreamReplicator` instances 48 | type CopyAndStreamReplicatorFactory struct { 49 | BaseFactory 50 | MaxCopyWorkersPerTable int 51 | CopyOnly bool 52 | } 53 | 54 | // CreateReplicator creates a new `CopyAndStreamReplicator` 55 | func (f *CopyAndStreamReplicatorFactory) CreateReplicator(config Config, natsClient NATSClient) (Replicator, error) { 56 | replicationConn, standardConn, err := f.CreateConnections(config) 57 | if err != nil { 58 | return nil, err 59 | } 60 | 61 | baseReplicator := NewBaseReplicator(config, replicationConn, standardConn, natsClient) 62 | 63 | if f.MaxCopyWorkersPerTable <= 0 { 64 | f.MaxCopyWorkersPerTable = 4 65 | } 66 | 67 | return NewCopyAndStreamReplicator( 68 | baseReplicator, 69 | f.MaxCopyWorkersPerTable, 70 | f.CopyOnly, 71 | ), nil 
72 | } 73 | -------------------------------------------------------------------------------- /pkg/replicator/interfaces.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/jackc/pglogrepl" 7 | "github.com/jackc/pgx/v5" 8 | "github.com/jackc/pgx/v5/pgconn" 9 | "github.com/jackc/pgx/v5/pgproto3" 10 | "github.com/nats-io/nats.go" 11 | "github.com/pgflo/pg_flo/pkg/pgflonats" 12 | ) 13 | 14 | type Replicator interface { 15 | Start(ctx context.Context) error 16 | Stop(ctx context.Context) error 17 | } 18 | 19 | type ReplicationConnection interface { 20 | Connect(ctx context.Context) error 21 | Close(ctx context.Context) error 22 | CreateReplicationSlot(ctx context.Context, slotName string) (pglogrepl.CreateReplicationSlotResult, error) 23 | StartReplication(ctx context.Context, slotName string, startLSN pglogrepl.LSN, options pglogrepl.StartReplicationOptions) error 24 | ReceiveMessage(ctx context.Context) (pgproto3.BackendMessage, error) 25 | SendStandbyStatusUpdate(ctx context.Context, status pglogrepl.StandbyStatusUpdate) error 26 | } 27 | 28 | type StandardConnection interface { 29 | Connect(ctx context.Context) error 30 | Close(ctx context.Context) error 31 | Exec(ctx context.Context, sql string, arguments ...interface{}) (pgconn.CommandTag, error) 32 | Query(ctx context.Context, sql string, args ...interface{}) (pgx.Rows, error) 33 | QueryRow(ctx context.Context, sql string, args ...interface{}) pgx.Row 34 | BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) 35 | Acquire(ctx context.Context) (PgxPoolConn, error) 36 | } 37 | 38 | type PgxPoolConn interface { 39 | BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) 40 | Exec(ctx context.Context, sql string, arguments ...interface{}) (pgconn.CommandTag, error) 41 | Query(ctx context.Context, sql string, args ...interface{}) (pgx.Rows, error) 42 | QueryRow(ctx context.Context, sql string, args ...interface{}) pgx.Row 43 | Release() 44 | } 45 | 46 | type NATSClient interface { 47 | PublishMessage(subject string, data []byte) error 48 | Close() error 49 | SaveState(state pgflonats.State) error 50 | GetState() (pgflonats.State, error) 51 | JetStream() nats.JetStreamContext 52 | } 53 | -------------------------------------------------------------------------------- /pkg/replicator/json_encoder.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/pgflo/pg_flo/pkg/utils" 8 | ) 9 | 10 | // InitializeOIDMap initializes the OID to type name map with custom types from the database 11 | func InitializeOIDMap(ctx context.Context, conn StandardConnection) error { 12 | rows, err := conn.Query(ctx, ` 13 | SELECT oid, typname 14 | FROM pg_type 15 | WHERE typtype = 'b' AND oid > 10000 -- Only base types and custom types 16 | `) 17 | if err != nil { 18 | return fmt.Errorf("failed to query pg_type: %w", err) 19 | } 20 | defer rows.Close() 21 | 22 | for rows.Next() { 23 | var oid uint32 24 | var typeName string 25 | if err := rows.Scan(&oid, &typeName); err != nil { 26 | return fmt.Errorf("failed to scan row: %w", err) 27 | } 28 | utils.OidToTypeName[oid] = typeName 29 | } 30 | 31 | if err := rows.Err(); err != nil { 32 | return fmt.Errorf("error iterating over rows: %w", err) 33 | } 34 | 35 | return nil 36 | } 37 | -------------------------------------------------------------------------------- 
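The factory and interface definitions above compose at startup roughly as sketched below. This is not the project's actual wiring (the CLI and worker packages handle that); it is a minimal sketch that assumes a populated `replicator.Config` and an already-constructed `NATSClient` implementation, and uses only exported names that appear in this package.

```go
package example

import (
	"context"
	"fmt"

	"github.com/pgflo/pg_flo/pkg/replicator"
)

// runStream wires a StreamReplicator via its factory. The NATSClient is taken
// as a parameter because its concrete constructor (in pkg/pgflonats) is not
// shown here; this is an illustrative sketch, not the project's CLI wiring.
func runStream(ctx context.Context, cfg replicator.Config, nc replicator.NATSClient) error {
	factory := &replicator.StreamReplicatorFactory{}

	// CreateReplicator opens the replication connection and the pooled
	// standard connection via BaseFactory.CreateConnections.
	repl, err := factory.CreateReplicator(cfg, nc)
	if err != nil {
		return fmt.Errorf("create replicator: %w", err)
	}
	defer func() { _ = repl.Stop(context.Background()) }()

	// Start blocks until replication stops or ctx is cancelled.
	return repl.Start(ctx)
}
```

For an initial table copy followed by streaming, `CopyAndStreamReplicatorFactory` takes the same two arguments and additionally exposes `MaxCopyWorkersPerTable` (defaulting to 4 when unset) and `CopyOnly`.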
/pkg/replicator/replication_connection.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/jackc/pglogrepl" 8 | "github.com/jackc/pgx/v5" 9 | "github.com/jackc/pgx/v5/pgconn" 10 | "github.com/jackc/pgx/v5/pgproto3" 11 | ) 12 | 13 | // PostgresReplicationConnection implements the ReplicationConnection interface 14 | // for PostgreSQL databases. 15 | type PostgresReplicationConnection struct { 16 | Config Config 17 | Conn *pgconn.PgConn 18 | } 19 | 20 | // NewReplicationConnection creates a new PostgresReplicationConnection instance. 21 | func NewReplicationConnection(config Config) ReplicationConnection { 22 | return &PostgresReplicationConnection{ 23 | Config: config, 24 | } 25 | } 26 | 27 | // Connect establishes a connection to the PostgreSQL database for replication. 28 | func (rc *PostgresReplicationConnection) Connect(ctx context.Context) error { 29 | config, err := pgx.ParseConfig(fmt.Sprintf("host=%s port=%d dbname=%s user=%s password=%s", 30 | rc.Config.Host, 31 | rc.Config.Port, 32 | rc.Config.Database, 33 | rc.Config.User, 34 | rc.Config.Password)) 35 | if err != nil { 36 | return fmt.Errorf("failed to parse connection config: %v", err) 37 | } 38 | 39 | config.RuntimeParams["replication"] = "database" 40 | 41 | conn, err := pgx.ConnectConfig(ctx, config) 42 | if err != nil { 43 | return fmt.Errorf("failed to connect to PostgreSQL: %v", err) 44 | } 45 | 46 | rc.Conn = conn.PgConn() 47 | return nil 48 | } 49 | 50 | // Close terminates the connection to the PostgreSQL database. 51 | func (rc *PostgresReplicationConnection) Close(ctx context.Context) error { 52 | return rc.Conn.Close(ctx) 53 | } 54 | 55 | // CreateReplicationSlot creates a new replication slot in the PostgreSQL database. 56 | func (rc *PostgresReplicationConnection) CreateReplicationSlot(ctx context.Context, slotName string) (pglogrepl.CreateReplicationSlotResult, error) { 57 | return pglogrepl.CreateReplicationSlot(ctx, rc.Conn, slotName, "pgoutput", pglogrepl.CreateReplicationSlotOptions{Temporary: false}) 58 | } 59 | 60 | // StartReplication initiates the replication process from the specified LSN. 61 | func (rc *PostgresReplicationConnection) StartReplication(ctx context.Context, slotName string, startLSN pglogrepl.LSN, options pglogrepl.StartReplicationOptions) error { 62 | return pglogrepl.StartReplication(ctx, rc.Conn, slotName, startLSN, options) 63 | } 64 | 65 | // ReceiveMessage receives a message from the PostgreSQL replication stream. 66 | func (rc *PostgresReplicationConnection) ReceiveMessage(ctx context.Context) (pgproto3.BackendMessage, error) { 67 | return rc.Conn.ReceiveMessage(ctx) 68 | } 69 | 70 | // SendStandbyStatusUpdate sends a status update to the PostgreSQL server during replication. 71 | func (rc *PostgresReplicationConnection) SendStandbyStatusUpdate(ctx context.Context, status pglogrepl.StandbyStatusUpdate) error { 72 | return pglogrepl.SendStandbyStatusUpdate(ctx, rc.Conn, status) 73 | } 74 | -------------------------------------------------------------------------------- /pkg/replicator/standard_connection.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/jackc/pgx/v5" 8 | "github.com/jackc/pgx/v5/pgconn" 9 | "github.com/jackc/pgx/v5/pgxpool" 10 | ) 11 | 12 | // StandardConnectionImpl implements the StandardConnection interface for PostgreSQL databases. 
13 | type StandardConnectionImpl struct { 14 | pool *pgxpool.Pool 15 | } 16 | 17 | // NewStandardConnection creates a new StandardConnectionImpl instance and establishes a connection. 18 | func NewStandardConnection(config Config) (*StandardConnectionImpl, error) { 19 | connString := fmt.Sprintf("host=%s port=%d dbname=%s user=%s password=%s", 20 | config.Host, 21 | config.Port, 22 | config.Database, 23 | config.User, 24 | config.Password) 25 | 26 | poolConfig, err := pgxpool.ParseConfig(connString) 27 | if err != nil { 28 | return nil, fmt.Errorf("unable to parse connection string: %v", err) 29 | } 30 | 31 | poolConfig.MaxConns = 20 32 | 33 | pool, err := pgxpool.NewWithConfig(context.Background(), poolConfig) 34 | if err != nil { 35 | return nil, fmt.Errorf("unable to create connection pool: %v", err) 36 | } 37 | return &StandardConnectionImpl{pool: pool}, nil 38 | } 39 | 40 | // Connect establishes a connection to the PostgreSQL database. 41 | func (s *StandardConnectionImpl) Connect(ctx context.Context) error { 42 | return s.pool.Ping(ctx) 43 | } 44 | 45 | // Close terminates the connection to the PostgreSQL database. 46 | func (s *StandardConnectionImpl) Close(_ context.Context) error { 47 | s.pool.Close() 48 | return nil 49 | } 50 | 51 | // Exec executes a SQL query without returning any rows. 52 | func (s *StandardConnectionImpl) Exec(ctx context.Context, sql string, arguments ...interface{}) (pgconn.CommandTag, error) { 53 | return s.pool.Exec(ctx, sql, arguments...) 54 | } 55 | 56 | // BeginTx starts a new transaction with the specified options. 57 | func (s *StandardConnectionImpl) BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) { 58 | return s.pool.BeginTx(ctx, txOptions) 59 | } 60 | 61 | // QueryRow executes a query that is expected to return at most one row. 62 | func (s *StandardConnectionImpl) QueryRow(ctx context.Context, sql string, args ...interface{}) pgx.Row { 63 | return s.pool.QueryRow(ctx, sql, args...) 64 | } 65 | 66 | // Query executes a query that returns rows, typically a SELECT. 67 | func (s *StandardConnectionImpl) Query(ctx context.Context, sql string, args ...interface{}) (pgx.Rows, error) { 68 | return s.pool.Query(ctx, sql, args...) 69 | } 70 | 71 | // Acquire acquires a connection from the pool. 
72 | func (s *StandardConnectionImpl) Acquire(ctx context.Context) (PgxPoolConn, error) { 73 | conn, err := s.pool.Acquire(ctx) 74 | if err != nil { 75 | return nil, err 76 | } 77 | return &PgxPoolConnWrapper{Conn: conn}, nil 78 | } 79 | 80 | type PgxPoolConnWrapper struct { 81 | *pgxpool.Conn 82 | } 83 | -------------------------------------------------------------------------------- /pkg/replicator/stream_replicator.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/jackc/pglogrepl" 7 | ) 8 | 9 | type StreamReplicator struct { 10 | *BaseReplicator 11 | } 12 | 13 | func NewStreamReplicator(base *BaseReplicator) *StreamReplicator { 14 | return &StreamReplicator{ 15 | BaseReplicator: base, 16 | } 17 | } 18 | 19 | func (r *StreamReplicator) Start(ctx context.Context) error { 20 | if err := r.BaseReplicator.Start(ctx); err != nil { 21 | return err 22 | } 23 | 24 | startLSN, err := r.GetLastState() 25 | if err != nil { 26 | r.Logger.Warn().Err(err).Msg("Failed to get last LSN, starting from 0") 27 | startLSN = pglogrepl.LSN(0) 28 | } 29 | 30 | r.Logger.Info().Str("startLSN", startLSN.String()).Msg("Starting replication") 31 | 32 | errChan := make(chan error, 1) 33 | go func() { 34 | errChan <- r.StartReplicationFromLSN(ctx, startLSN, r.stopChan) 35 | }() 36 | 37 | select { 38 | case <-ctx.Done(): 39 | return ctx.Err() 40 | case err := <-errChan: 41 | return err 42 | } 43 | } 44 | 45 | func (r *StreamReplicator) Stop(ctx context.Context) error { 46 | return r.BaseReplicator.Stop(ctx) 47 | } 48 | -------------------------------------------------------------------------------- /pkg/replicator/table_handling.go: -------------------------------------------------------------------------------- 1 | package replicator 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/pgflo/pg_flo/pkg/utils" 8 | ) 9 | 10 | // AddPrimaryKeyInfo adds replication key information to the CDCMessage 11 | func (r *BaseReplicator) AddPrimaryKeyInfo(message *utils.CDCMessage, table string) { 12 | if key, ok := r.TableReplicationKeys[table]; ok { 13 | message.ReplicationKey = key 14 | } else { 15 | r.Logger.Error(). 16 | Str("table", table). 17 | Msg("No replication key information found for table. 
This should not happen as validation is done during initialization") 18 | } 19 | } 20 | 21 | // InitializePrimaryKeyInfo initializes primary key information for all tables 22 | func (r *BaseReplicator) InitializePrimaryKeyInfo() error { 23 | query := ` 24 | WITH table_info AS ( 25 | SELECT 26 | t.tablename, 27 | c.relreplident, 28 | ( 29 | SELECT array_agg(a.attname ORDER BY array_position(i.indkey, a.attnum)) 30 | FROM pg_index i 31 | JOIN pg_attribute a ON a.attrelid = c.oid AND a.attnum = ANY(i.indkey) 32 | WHERE i.indrelid = c.oid AND i.indisprimary 33 | ) as pk_columns, 34 | ( 35 | SELECT array_agg(a.attname ORDER BY array_position(i.indkey, a.attnum)) 36 | FROM pg_index i 37 | JOIN pg_attribute a ON a.attrelid = c.oid AND a.attnum = ANY(i.indkey) 38 | WHERE i.indrelid = c.oid AND i.indisunique AND NOT i.indisprimary 39 | LIMIT 1 40 | ) as unique_columns 41 | FROM pg_tables t 42 | JOIN pg_class c ON t.tablename = c.relname 43 | JOIN pg_namespace n ON c.relnamespace = n.oid 44 | WHERE t.schemaname = $1 45 | ) 46 | SELECT 47 | tablename, 48 | relreplident::text, 49 | COALESCE(pk_columns, ARRAY[]::text[]) as pk_columns, 50 | COALESCE(unique_columns, ARRAY[]::text[]) as unique_columns 51 | FROM table_info; 52 | ` 53 | 54 | rows, err := r.StandardConn.Query(context.Background(), query, r.Config.Schema) 55 | if err != nil { 56 | return fmt.Errorf("failed to query replication key info: %v", err) 57 | } 58 | defer rows.Close() 59 | 60 | r.TableReplicationKeys = make(map[string]utils.ReplicationKey) 61 | 62 | for rows.Next() { 63 | var ( 64 | tableName string 65 | replicaIdentity string 66 | pkColumns []string 67 | uniqueColumns []string 68 | ) 69 | 70 | if err := rows.Scan(&tableName, &replicaIdentity, &pkColumns, &uniqueColumns); err != nil { 71 | return fmt.Errorf("failed to scan row: %v", err) 72 | } 73 | 74 | key := utils.ReplicationKey{} 75 | 76 | switch { 77 | case len(pkColumns) > 0: 78 | key = utils.ReplicationKey{ 79 | Type: utils.ReplicationKeyPK, 80 | Columns: pkColumns, 81 | } 82 | case len(uniqueColumns) > 0: 83 | key = utils.ReplicationKey{ 84 | Type: utils.ReplicationKeyUnique, 85 | Columns: uniqueColumns, 86 | } 87 | case replicaIdentity == "f": 88 | key = utils.ReplicationKey{ 89 | Type: utils.ReplicationKeyFull, 90 | Columns: nil, 91 | } 92 | } 93 | 94 | if err := r.validateTableReplicationKey(tableName, key); err != nil { 95 | r.Logger.Warn(). 96 | Str("table", tableName). 97 | Str("replica_identity", replicaIdentity). 98 | Str("key_type", string(key.Type)). 99 | Strs("columns", key.Columns). 100 | Err(err). 101 | Msg("Invalid replication key configuration") 102 | continue 103 | } 104 | 105 | r.TableReplicationKeys[tableName] = key 106 | 107 | r.Logger.Debug(). 108 | Str("table", tableName). 109 | Str("key_type", string(key.Type)). 110 | Strs("columns", key.Columns). 111 | Str("replica_identity", replicaIdentity). 
112 | Msg("Initialized replication key configuration") 113 | } 114 | 115 | return rows.Err() 116 | } 117 | 118 | // GetConfiguredTables returns all tables based on configuration 119 | // If no specific tables are configured, returns all tables from the configured schema 120 | func (r *BaseReplicator) GetConfiguredTables(ctx context.Context) ([]string, error) { 121 | if len(r.Config.Tables) > 0 { 122 | fullyQualifiedTables := make([]string, len(r.Config.Tables)) 123 | for i, table := range r.Config.Tables { 124 | fullyQualifiedTables[i] = fmt.Sprintf("%s.%s", r.Config.Schema, table) 125 | } 126 | return fullyQualifiedTables, nil 127 | } 128 | 129 | rows, err := r.StandardConn.Query(ctx, ` 130 | SELECT schemaname || '.' || tablename 131 | FROM pg_tables 132 | WHERE schemaname = $1 133 | AND schemaname NOT IN ('pg_catalog', 'information_schema', 'internal_pg_flo') 134 | `, r.Config.Schema) 135 | if err != nil { 136 | return nil, fmt.Errorf("failed to query tables: %v", err) 137 | } 138 | defer rows.Close() 139 | 140 | var tables []string 141 | for rows.Next() { 142 | var tableName string 143 | if err := rows.Scan(&tableName); err != nil { 144 | return nil, fmt.Errorf("failed to scan table name: %v", err) 145 | } 146 | tables = append(tables, tableName) 147 | } 148 | 149 | return tables, nil 150 | } 151 | 152 | func (r *BaseReplicator) validateTableReplicationKey(tableName string, key utils.ReplicationKey) error { 153 | if !key.IsValid() { 154 | return fmt.Errorf( 155 | "table %q requires one of the following:\n"+ 156 | "\t1. A PRIMARY KEY constraint\n"+ 157 | "\t2. A UNIQUE constraint\n"+ 158 | "\t3. REPLICA IDENTITY FULL (ALTER TABLE %s REPLICA IDENTITY FULL)", 159 | tableName, tableName) 160 | } 161 | return nil 162 | } 163 | -------------------------------------------------------------------------------- /pkg/replicator/tests/buffer_test.go: -------------------------------------------------------------------------------- 1 | package replicator_test 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/pgflo/pg_flo/pkg/replicator" 8 | "github.com/stretchr/testify/assert" 9 | "github.com/stretchr/testify/mock" 10 | ) 11 | 12 | func TestBuffer(t *testing.T) { 13 | t.Run("NewBuffer", func(t *testing.T) { 14 | buffer := replicator.NewBuffer(10, 5*time.Second) 15 | assert.NotNil(t, buffer) 16 | }) 17 | 18 | t.Run("Add and Flush", func(t *testing.T) { 19 | buffer := replicator.NewBuffer(10, 5*time.Second) 20 | 21 | // Add items 22 | for i := 0; i < 5; i++ { 23 | shouldFlush := buffer.Add([]byte("test")) 24 | assert.False(t, shouldFlush) 25 | } 26 | 27 | // Flush 28 | data := buffer.Flush() 29 | assert.Len(t, data, 5) 30 | assert.Equal(t, []byte("test"), data[0]) 31 | 32 | // Buffer should be empty after flush 33 | emptyData := buffer.Flush() 34 | assert.Nil(t, emptyData) 35 | }) 36 | 37 | t.Run("Flush on MaxRows", func(t *testing.T) { 38 | buffer := replicator.NewBuffer(3, 5*time.Second) 39 | 40 | buffer.Add([]byte("test1")) 41 | buffer.Add([]byte("test2")) 42 | shouldFlush := buffer.Add([]byte("test3")) 43 | 44 | assert.True(t, shouldFlush) 45 | 46 | data := buffer.Flush() 47 | assert.Len(t, data, 3) 48 | }) 49 | 50 | t.Run("Flush on Timeout", func(t *testing.T) { 51 | buffer := replicator.NewBuffer(10, 100*time.Millisecond) 52 | 53 | buffer.Add([]byte("test")) 54 | time.Sleep(150 * time.Millisecond) 55 | 56 | shouldFlush := buffer.Add([]byte("test")) 57 | assert.True(t, shouldFlush) 58 | 59 | data := buffer.Flush() 60 | assert.Len(t, data, 2) 61 | }) 62 | 63 | t.Run("Concurrent Access", 
func(t *testing.T) { 64 | buffer := replicator.NewBuffer(100, 5*time.Second) 65 | 66 | done := make(chan bool) 67 | for i := 0; i < 10; i++ { 68 | go func() { 69 | for j := 0; j < 10; j++ { 70 | buffer.Add([]byte("test")) 71 | } 72 | done <- true 73 | }() 74 | } 75 | 76 | for i := 0; i < 10; i++ { 77 | <-done 78 | } 79 | 80 | data := buffer.Flush() 81 | assert.Len(t, data, 100) 82 | }) 83 | 84 | t.Run("BufferFlush", func(t *testing.T) { 85 | mockSink := new(MockSink) 86 | buffer := replicator.NewBuffer(5, 1*time.Second) 87 | 88 | mockSink.On("WriteBatch", mock.Anything).Return(nil) 89 | 90 | for i := 0; i < 5; i++ { 91 | shouldFlush := buffer.Add(i) 92 | if shouldFlush { 93 | data := buffer.Flush() 94 | err := mockSink.WriteBatch(data) 95 | assert.NoError(t, err) 96 | } 97 | } 98 | 99 | mockSink.AssertNumberOfCalls(t, "WriteBatch", 1) 100 | mockSink.AssertExpectations(t) 101 | }) 102 | } 103 | -------------------------------------------------------------------------------- /pkg/replicator/tests/ddl_replicator_test.go: -------------------------------------------------------------------------------- 1 | package replicator_test 2 | 3 | import ( 4 | "context" 5 | "strings" 6 | "testing" 7 | "time" 8 | 9 | "github.com/jackc/pgx/v5/pgconn" 10 | "github.com/pgflo/pg_flo/pkg/replicator" 11 | "github.com/pgflo/pg_flo/pkg/utils" 12 | "github.com/rs/zerolog" 13 | "github.com/stretchr/testify/assert" 14 | "github.com/stretchr/testify/mock" 15 | ) 16 | 17 | func TestDDLReplicator(t *testing.T) { 18 | t.Run("NewDDLReplicator", func(t *testing.T) { 19 | mockBaseReplicator := &replicator.BaseReplicator{ 20 | Logger: utils.NewZerologLogger(zerolog.New(nil)), 21 | } 22 | mockStandardConn := &MockStandardConnection{} 23 | config := replicator.Config{} 24 | 25 | ddlReplicator, err := replicator.NewDDLReplicator(config, mockBaseReplicator, mockStandardConn) 26 | 27 | assert.NoError(t, err) 28 | assert.NotNil(t, ddlReplicator) 29 | assert.Equal(t, config, ddlReplicator.Config) 30 | assert.Equal(t, mockStandardConn, ddlReplicator.DDLConn) 31 | }) 32 | 33 | t.Run("SetupDDLTracking", func(t *testing.T) { 34 | mockStandardConn := &MockStandardConnection{} 35 | mockBaseRepl := &replicator.BaseReplicator{ 36 | Logger: utils.NewZerologLogger(zerolog.New(zerolog.NewConsoleWriter()).With().Timestamp().Logger()), 37 | StandardConn: mockStandardConn, 38 | Config: replicator.Config{ 39 | Schema: "public", 40 | Tables: []string{"test_table"}, 41 | }, 42 | } 43 | 44 | ddlReplicator := &replicator.DDLReplicator{ 45 | DDLConn: mockStandardConn, 46 | BaseRepl: mockBaseRepl, 47 | } 48 | 49 | ctx := context.Background() 50 | 51 | mockStandardConn.On("Exec", ctx, mock.AnythingOfType("string"), mock.Anything).Return(pgconn.CommandTag{}, nil). 
52 | Run(func(args mock.Arguments) { 53 | sql := args.Get(1).(string) 54 | assert.Contains(t, sql, "CREATE SCHEMA IF NOT EXISTS internal_pg_flo") 55 | assert.Contains(t, sql, "CREATE TABLE IF NOT EXISTS internal_pg_flo.ddl_log") 56 | assert.Contains(t, sql, "CREATE OR REPLACE FUNCTION internal_pg_flo.ddl_trigger()") 57 | assert.Contains(t, sql, "CREATE EVENT TRIGGER pg_flo_ddl_trigger") 58 | }) 59 | 60 | err := ddlReplicator.SetupDDLTracking(ctx) 61 | 62 | assert.NoError(t, err) 63 | mockStandardConn.AssertExpectations(t) 64 | }) 65 | 66 | t.Run("StartDDLReplication", func(t *testing.T) { 67 | mockStandardConn := &MockStandardConnection{} 68 | mockBaseReplicator := &replicator.BaseReplicator{ 69 | Logger: utils.NewZerologLogger(zerolog.New(zerolog.NewConsoleWriter()).With().Timestamp().Logger()), 70 | } 71 | ddlReplicator := &replicator.DDLReplicator{ 72 | DDLConn: mockStandardConn, 73 | BaseRepl: mockBaseReplicator, 74 | } 75 | 76 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) 77 | defer cancel() 78 | 79 | mockRows := &MockRows{} 80 | mockStandardConn.On("Query", mock.Anything, mock.MatchedBy(func(sql string) bool { 81 | expectedParts := []string{ 82 | "SELECT id, event_type, object_type, object_identity, table_name, ddl_command, created_at", 83 | "FROM internal_pg_flo.ddl_log", 84 | "ORDER BY created_at ASC", 85 | } 86 | for _, part := range expectedParts { 87 | if !strings.Contains(sql, part) { 88 | return false 89 | } 90 | } 91 | return true 92 | }), mock.Anything).Return(mockRows, nil).Maybe() 93 | 94 | mockRows.On("Next").Return(false).Maybe() 95 | mockRows.On("Err").Return(nil).Maybe() 96 | mockRows.On("Close").Return().Maybe() 97 | 98 | mockStandardConn.On("QueryRow", mock.Anything, mock.MatchedBy(func(sql string) bool { 99 | return strings.Contains(sql, "SELECT COUNT(*) FROM internal_pg_flo.ddl_log") 100 | }), mock.Anything).Return(&MockRow{ 101 | scanFunc: func(dest ...interface{}) error { 102 | *dest[0].(*int) = 0 103 | return nil 104 | }, 105 | }).Maybe() 106 | 107 | go ddlReplicator.StartDDLReplication(ctx) 108 | 109 | time.Sleep(100 * time.Millisecond) 110 | 111 | cancel() 112 | 113 | time.Sleep(100 * time.Millisecond) 114 | 115 | mockStandardConn.AssertExpectations(t) 116 | mockRows.AssertExpectations(t) 117 | }) 118 | } 119 | -------------------------------------------------------------------------------- /pkg/replicator/tests/json_encoder_test.go: -------------------------------------------------------------------------------- 1 | package replicator_test 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/jackc/pglogrepl" 8 | "github.com/jackc/pgtype" 9 | "github.com/pgflo/pg_flo/pkg/utils" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | func TestOIDToString(t *testing.T) { 14 | t.Run("OIDToString function", func(t *testing.T) { 15 | assert.Equal(t, "int4", utils.OIDToString(pgtype.Int4OID)) 16 | assert.Equal(t, "text", utils.OIDToString(pgtype.TextOID)) 17 | assert.Equal(t, "unknown_99999", utils.OIDToString(99999)) 18 | }) 19 | } 20 | 21 | func TestCDCBinaryEncoding(t *testing.T) { 22 | t.Run("Encode and decode preserves CDC types", func(t *testing.T) { 23 | testData := utils.CDCMessage{ 24 | Type: utils.OperationInsert, 25 | Schema: "public", 26 | Table: "users", 27 | Columns: []*pglogrepl.RelationMessageColumn{ 28 | {Name: "id", DataType: pgtype.Int4OID}, 29 | {Name: "name", DataType: pgtype.TextOID}, 30 | }, 31 | NewTuple: &pglogrepl.TupleData{ 32 | Columns: []*pglogrepl.TupleDataColumn{ 33 | {Data: []byte("123")}, 34 | 
{Data: []byte("John Doe")}, 35 | }, 36 | }, 37 | } 38 | 39 | encoded, err := testData.MarshalBinary() 40 | assert.NoError(t, err) 41 | 42 | var decoded utils.CDCMessage 43 | err = decoded.UnmarshalBinary(encoded) 44 | assert.NoError(t, err) 45 | 46 | assert.Equal(t, testData.Type, decoded.Type) 47 | assert.Equal(t, testData.Schema, decoded.Schema) 48 | assert.Equal(t, testData.Table, decoded.Table) 49 | assert.Equal(t, testData.Columns, decoded.Columns) 50 | assert.Equal(t, testData.NewTuple, decoded.NewTuple) 51 | }) 52 | } 53 | 54 | func TestBinaryEncodingComplexTypes(t *testing.T) { 55 | t.Run("Encode and decode handles complex types", func(t *testing.T) { 56 | binaryData := []byte{0x01, 0x02, 0x03, 0x04} 57 | jsonbData := []byte(`{"key": "value", "nested": {"number": 42}}`) 58 | timestamp := time.Now().UTC() 59 | floatValue := []byte("3.14159") 60 | intValue := []byte("9876543210") 61 | boolValue := []byte("true") 62 | textArrayValue := []byte("{hello,world}") 63 | 64 | testData := utils.CDCMessage{ 65 | Type: utils.OperationInsert, 66 | Schema: "public", 67 | Table: "complex_types", 68 | Columns: []*pglogrepl.RelationMessageColumn{ 69 | {Name: "binary", DataType: pgtype.ByteaOID}, 70 | {Name: "jsonb", DataType: pgtype.JSONBOID}, 71 | {Name: "timestamp", DataType: pgtype.TimestamptzOID}, 72 | {Name: "float", DataType: pgtype.Float8OID}, 73 | {Name: "integer", DataType: pgtype.Int8OID}, 74 | {Name: "boolean", DataType: pgtype.BoolOID}, 75 | {Name: "text_array", DataType: pgtype.TextArrayOID}, 76 | }, 77 | NewTuple: &pglogrepl.TupleData{ 78 | Columns: []*pglogrepl.TupleDataColumn{ 79 | {Data: binaryData}, 80 | {Data: jsonbData}, 81 | {Data: []byte(timestamp.Format(time.RFC3339Nano))}, 82 | {Data: floatValue}, 83 | {Data: intValue}, 84 | {Data: boolValue}, 85 | {Data: textArrayValue}, 86 | }, 87 | }, 88 | OldTuple: &pglogrepl.TupleData{ 89 | Columns: []*pglogrepl.TupleDataColumn{ 90 | {Data: []byte{0x05, 0x06, 0x07, 0x08}}, 91 | {Data: []byte(`{"old": "data"}`)}, 92 | }, 93 | }, 94 | } 95 | 96 | encoded, err := testData.MarshalBinary() 97 | assert.NoError(t, err) 98 | 99 | var decoded utils.CDCMessage 100 | err = decoded.UnmarshalBinary(encoded) 101 | assert.NoError(t, err) 102 | 103 | assert.Equal(t, binaryData, decoded.NewTuple.Columns[0].Data) 104 | assert.Equal(t, jsonbData, decoded.NewTuple.Columns[1].Data) 105 | assert.Equal(t, []byte(timestamp.Format(time.RFC3339Nano)), decoded.NewTuple.Columns[2].Data) 106 | assert.Equal(t, floatValue, decoded.NewTuple.Columns[3].Data) 107 | assert.Equal(t, intValue, decoded.NewTuple.Columns[4].Data) 108 | assert.Equal(t, boolValue, decoded.NewTuple.Columns[5].Data) 109 | assert.Equal(t, textArrayValue, decoded.NewTuple.Columns[6].Data) 110 | 111 | assert.Equal(t, []byte{0x05, 0x06, 0x07, 0x08}, decoded.OldTuple.Columns[0].Data) 112 | assert.Equal(t, []byte(`{"old": "data"}`), decoded.OldTuple.Columns[1].Data) 113 | 114 | assert.Equal(t, testData.Type, decoded.Type) 115 | assert.Equal(t, testData.Schema, decoded.Schema) 116 | assert.Equal(t, testData.Table, decoded.Table) 117 | assert.Equal(t, testData.Columns, decoded.Columns) 118 | }) 119 | } 120 | -------------------------------------------------------------------------------- /pkg/replicator/tests/mocks_test.go: -------------------------------------------------------------------------------- 1 | package replicator_test 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/jackc/pglogrepl" 7 | "github.com/jackc/pgx/v5" 8 | "github.com/jackc/pgx/v5/pgconn" 9 | "github.com/jackc/pgx/v5/pgproto3" 10 
| "github.com/nats-io/nats.go" 11 | "github.com/pgflo/pg_flo/pkg/pgflonats" 12 | "github.com/pgflo/pg_flo/pkg/replicator" 13 | "github.com/stretchr/testify/mock" 14 | ) 15 | 16 | type MockReplicationConnection struct { 17 | mock.Mock 18 | } 19 | 20 | func (m *MockReplicationConnection) Connect(ctx context.Context) error { 21 | args := m.Called(ctx) 22 | return args.Error(0) 23 | } 24 | 25 | func (m *MockReplicationConnection) Close(ctx context.Context) error { 26 | args := m.Called(ctx) 27 | return args.Error(0) 28 | } 29 | 30 | func (m *MockReplicationConnection) CreateReplicationSlot(ctx context.Context, slotName string) (pglogrepl.CreateReplicationSlotResult, error) { 31 | args := m.Called(ctx, slotName) 32 | return args.Get(0).(pglogrepl.CreateReplicationSlotResult), args.Error(1) 33 | } 34 | 35 | func (m *MockReplicationConnection) StartReplication(ctx context.Context, slotName string, startLSN pglogrepl.LSN, options pglogrepl.StartReplicationOptions) error { 36 | args := m.Called(ctx, slotName, startLSN, options) 37 | return args.Error(0) 38 | } 39 | 40 | func (m *MockReplicationConnection) ReceiveMessage(ctx context.Context) (pgproto3.BackendMessage, error) { 41 | args := m.Called(ctx) 42 | msg := args.Get(0) 43 | if msg == nil { 44 | return nil, args.Error(1) 45 | } 46 | return msg.(pgproto3.BackendMessage), args.Error(1) 47 | } 48 | 49 | func (m *MockReplicationConnection) SendStandbyStatusUpdate(ctx context.Context, status pglogrepl.StandbyStatusUpdate) error { 50 | args := m.Called(ctx, status) 51 | return args.Error(0) 52 | } 53 | 54 | type MockStandardConnection struct { 55 | mock.Mock 56 | } 57 | 58 | func (m *MockStandardConnection) Connect(ctx context.Context) error { 59 | args := m.Called(ctx) 60 | return args.Error(0) 61 | } 62 | 63 | func (m *MockStandardConnection) Close(ctx context.Context) error { 64 | args := m.Called(ctx) 65 | return args.Error(0) 66 | } 67 | 68 | func (m *MockStandardConnection) Exec(ctx context.Context, sql string, arguments ...interface{}) (pgconn.CommandTag, error) { 69 | args := m.Called(ctx, sql, arguments) 70 | return args.Get(0).(pgconn.CommandTag), args.Error(1) 71 | } 72 | 73 | func (m *MockStandardConnection) Query(ctx context.Context, sql string, args ...interface{}) (pgx.Rows, error) { 74 | mockArgs := m.Called(ctx, sql, args) 75 | return mockArgs.Get(0).(pgx.Rows), mockArgs.Error(1) 76 | } 77 | 78 | func (m *MockStandardConnection) QueryRow(ctx context.Context, sql string, args ...interface{}) pgx.Row { 79 | mockArgs := m.Called(ctx, sql, args) 80 | return mockArgs.Get(0).(pgx.Row) 81 | } 82 | 83 | func (m *MockStandardConnection) BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) { 84 | args := m.Called(ctx, txOptions) 85 | return args.Get(0).(pgx.Tx), args.Error(1) 86 | } 87 | 88 | func (m *MockStandardConnection) Acquire(ctx context.Context) (replicator.PgxPoolConn, error) { 89 | args := m.Called(ctx) 90 | return args.Get(0).(replicator.PgxPoolConn), args.Error(1) 91 | } 92 | 93 | type MockSink struct { 94 | mock.Mock 95 | } 96 | 97 | func (m *MockSink) WriteBatch(data []interface{}) error { 98 | args := m.Called(data) 99 | return args.Error(0) 100 | } 101 | 102 | func (m *MockSink) Close() error { 103 | args := m.Called() 104 | return args.Error(0) 105 | } 106 | 107 | type MockPgxPoolConn struct { 108 | mock.Mock 109 | } 110 | 111 | func (m *MockPgxPoolConn) BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) { 112 | args := m.Called(ctx, txOptions) 113 | return args.Get(0).(pgx.Tx), 
args.Error(1) 114 | } 115 | 116 | func (m *MockPgxPoolConn) Exec(ctx context.Context, sql string, arguments ...interface{}) (pgconn.CommandTag, error) { 117 | args := m.Called(ctx, sql, arguments) 118 | return args.Get(0).(pgconn.CommandTag), args.Error(1) 119 | } 120 | 121 | func (m *MockPgxPoolConn) Query(ctx context.Context, sql string, args ...interface{}) (pgx.Rows, error) { 122 | mockArgs := m.Called(ctx, sql, args) 123 | return mockArgs.Get(0).(pgx.Rows), mockArgs.Error(1) 124 | } 125 | 126 | func (m *MockPgxPoolConn) QueryRow(ctx context.Context, sql string, args ...interface{}) pgx.Row { 127 | mockArgs := m.Called(ctx, sql, args) 128 | return mockArgs.Get(0).(pgx.Row) 129 | } 130 | 131 | func (m *MockPgxPoolConn) Release() { 132 | m.Called() 133 | } 134 | 135 | type MockTx struct { 136 | mock.Mock 137 | } 138 | 139 | func (m *MockTx) Begin(ctx context.Context) (pgx.Tx, error) { 140 | args := m.Called(ctx) 141 | return args.Get(0).(pgx.Tx), args.Error(1) 142 | } 143 | 144 | func (m *MockTx) Commit(ctx context.Context) error { 145 | args := m.Called(ctx) 146 | return args.Error(0) 147 | } 148 | 149 | func (m *MockTx) CopyFrom(ctx context.Context, tableName pgx.Identifier, columnNames []string, rowSrc pgx.CopyFromSource) (int64, error) { 150 | args := m.Called(ctx, tableName, columnNames, rowSrc) 151 | return args.Get(0).(int64), args.Error(1) 152 | } 153 | 154 | func (m *MockTx) SendBatch(ctx context.Context, b *pgx.Batch) pgx.BatchResults { 155 | args := m.Called(ctx, b) 156 | return args.Get(0).(pgx.BatchResults) 157 | } 158 | 159 | func (m *MockTx) LargeObjects() pgx.LargeObjects { 160 | args := m.Called() 161 | return args.Get(0).(pgx.LargeObjects) 162 | } 163 | 164 | func (m *MockTx) Prepare(ctx context.Context, name, sql string) (*pgconn.StatementDescription, error) { 165 | args := m.Called(ctx, name, sql) 166 | return args.Get(0).(*pgconn.StatementDescription), args.Error(1) 167 | } 168 | 169 | func (m *MockTx) Exec(ctx context.Context, sql string, arguments ...interface{}) (pgconn.CommandTag, error) { 170 | args := []interface{}{ctx, sql} 171 | args = append(args, arguments...) 172 | callArgs := m.Called(args...) 173 | return callArgs.Get(0).(pgconn.CommandTag), callArgs.Error(1) 174 | } 175 | 176 | func (m *MockTx) Query(ctx context.Context, sql string, args ...interface{}) (pgx.Rows, error) { 177 | mockArgs := m.Called(ctx, sql, args) 178 | return mockArgs.Get(0).(pgx.Rows), mockArgs.Error(1) 179 | } 180 | 181 | func (m *MockTx) QueryRow(ctx context.Context, sql string, args ...interface{}) pgx.Row { 182 | callArgs := []interface{}{ctx, sql} 183 | callArgs = append(callArgs, args...) 184 | mockArgs := m.Called(callArgs...) 185 | return mockArgs.Get(0).(pgx.Row) 186 | } 187 | 188 | func (m *MockTx) Conn() *pgx.Conn { 189 | args := m.Called() 190 | return args.Get(0).(*pgx.Conn) 191 | } 192 | 193 | func (m *MockTx) Rollback(ctx context.Context) error { 194 | args := m.Called(ctx) 195 | return args.Error(0) 196 | } 197 | 198 | type MockRow struct { 199 | scanFunc func(dest ...interface{}) error 200 | } 201 | 202 | func (m MockRow) Scan(dest ...interface{}) error { 203 | return m.scanFunc(dest...) 204 | } 205 | 206 | type MockRows struct { 207 | mock.Mock 208 | } 209 | 210 | func (m *MockRows) Next() bool { 211 | args := m.Called() 212 | return args.Bool(0) 213 | } 214 | 215 | func (m *MockRows) Scan(dest ...interface{}) error { 216 | args := m.Called(dest...) 
217 | return args.Error(0) 218 | } 219 | 220 | func (m *MockRows) Err() error { 221 | args := m.Called() 222 | return args.Error(0) 223 | } 224 | 225 | func (m *MockRows) Close() { 226 | m.Called() 227 | } 228 | 229 | func (m *MockRows) CommandTag() pgconn.CommandTag { 230 | args := m.Called() 231 | return args.Get(0).(pgconn.CommandTag) 232 | } 233 | 234 | func (m *MockRows) FieldDescriptions() []pgconn.FieldDescription { 235 | args := m.Called() 236 | return args.Get(0).([]pgconn.FieldDescription) 237 | } 238 | 239 | func (m *MockRows) Values() ([]interface{}, error) { 240 | args := m.Called() 241 | return args.Get(0).([]interface{}), args.Error(1) 242 | } 243 | 244 | func (m *MockRows) RawValues() [][]byte { 245 | args := m.Called() 246 | return args.Get(0).([][]byte) 247 | } 248 | 249 | func (m *MockRows) Conn() *pgx.Conn { 250 | args := m.Called() 251 | return args.Get(0).(*pgx.Conn) 252 | } 253 | 254 | // MockNATSClient mocks the NATSClient 255 | type MockNATSClient struct { 256 | mock.Mock 257 | } 258 | 259 | // PublishMessage mocks the PublishMessage method 260 | func (m *MockNATSClient) PublishMessage(subject string, data []byte) error { 261 | args := m.Called(subject, data) 262 | return args.Error(0) 263 | } 264 | 265 | // Close mocks the Close method 266 | func (m *MockNATSClient) Close() error { 267 | args := m.Called() 268 | return args.Error(0) 269 | } 270 | 271 | // SaveState mocks the SaveState method 272 | func (m *MockNATSClient) SaveState(state pgflonats.State) error { 273 | args := m.Called(state) 274 | return args.Error(0) 275 | } 276 | 277 | // GetState mocks the GetState method 278 | func (m *MockNATSClient) GetState() (pgflonats.State, error) { 279 | args := m.Called() 280 | return args.Get(0).(pgflonats.State), args.Error(1) 281 | } 282 | 283 | // JetStream mocks the JetStream method 284 | func (m *MockNATSClient) JetStream() nats.JetStreamContext { 285 | args := m.Called() 286 | return args.Get(0).(nats.JetStreamContext) 287 | } 288 | -------------------------------------------------------------------------------- /pkg/routing/README.md: -------------------------------------------------------------------------------- 1 | # Message Routing 2 | 3 | Table routing allows you to map source tables and columns to different destinations while preserving data types. 4 | 5 | ## Configuration 6 | 7 | Create a YAML file (e.g., `routing.yaml`) with your routing rules: 8 | 9 | ```yaml 10 | users: 11 | source_table: users 12 | destination_table: customers 13 | column_mappings: 14 | - source: id 15 | destination: customer_id 16 | - source: username 17 | destination: customer_name 18 | operations: 19 | - INSERT 20 | - UPDATE 21 | 22 | orders: 23 | source_table: orders 24 | destination_table: transactions 25 | column_mappings: 26 | - source: id 27 | destination: transaction_id 28 | - source: total_amount 29 | destination: amount 30 | operations: 31 | - INSERT 32 | - UPDATE 33 | - DELETE 34 | ``` 35 | 36 | ## Usage with Routing 37 | 38 | Start the worker with the routing configuration: 39 | 40 | ```shell 41 | pg_flo worker postgres --routing-config routing.yaml ... 
42 | ``` 43 | 44 | ## Routing Rules 45 | 46 | Each table configuration supports: 47 | 48 | - `source_table`: Original table name (required) 49 | - `destination_table`: Target table name (optional, defaults to source_table) 50 | - `column_mappings`: List of column name mappings (optional) 51 | - `source`: Original column name 52 | - `destination`: New column name in target 53 | - `operations`: List of operations to replicate (required) 54 | - Supported: `INSERT`, `UPDATE`, `DELETE` 55 | 56 | ## Important Notes 57 | 58 | - Column data types must match between source and destination 59 | - Primary keys are automatically mapped 60 | - All specified columns must exist in both tables 61 | - Operations not listed in `operations` will be ignored. Defaults to all operations. 62 | - Unlisted columns are preserved with their original names 63 | - Complex types (jsonb, arrays) are preserved during mapping 64 | 65 | ## Examples 66 | 67 | ### Basic Table Mapping 68 | 69 | ```yaml 70 | users: 71 | source_table: users 72 | destination_table: customers 73 | operations: 74 | - INSERT 75 | - UPDATE 76 | ``` 77 | 78 | ### Column Remapping 79 | 80 | ```yaml 81 | products: 82 | source_table: products 83 | destination_table: items 84 | column_mappings: 85 | - source: id 86 | destination: item_id 87 | - source: name 88 | destination: item_name 89 | operations: 90 | - INSERT 91 | - UPDATE 92 | - DELETE 93 | ``` 94 | -------------------------------------------------------------------------------- /pkg/routing/router.go: -------------------------------------------------------------------------------- 1 | package routing 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/jackc/pglogrepl" 7 | "github.com/pgflo/pg_flo/pkg/utils" 8 | "github.com/rs/zerolog" 9 | "github.com/rs/zerolog/log" 10 | ) 11 | 12 | type ColumnMapping struct { 13 | Source string `yaml:"source"` 14 | Destination string `yaml:"destination"` 15 | } 16 | 17 | type TableRoute struct { 18 | SourceTable string `yaml:"source_table"` 19 | DestinationTable string `yaml:"destination_table"` 20 | ColumnMappings []ColumnMapping `yaml:"column_mappings"` 21 | Operations []utils.OperationType `yaml:"operations"` 22 | } 23 | 24 | type Router struct { 25 | Routes map[string]TableRoute 26 | mutex sync.RWMutex 27 | logger zerolog.Logger 28 | } 29 | 30 | func NewRouter() *Router { 31 | return &Router{ 32 | Routes: make(map[string]TableRoute), 33 | logger: log.With().Str("component", "router").Logger(), 34 | } 35 | } 36 | 37 | func (r *Router) AddRoute(route TableRoute) { 38 | r.mutex.Lock() 39 | defer r.mutex.Unlock() 40 | r.Routes[route.SourceTable] = route 41 | } 42 | 43 | func (r *Router) ApplyRouting(message *utils.CDCMessage) (*utils.CDCMessage, error) { 44 | r.mutex.RLock() 45 | defer r.mutex.RUnlock() 46 | route, exists := r.Routes[message.Table] 47 | if !exists { 48 | return message, nil 49 | } 50 | 51 | if !ContainsOperation(route.Operations, message.Type) { 52 | return nil, nil 53 | } 54 | 55 | routedMessage := *message 56 | routedMessage.Table = route.DestinationTable 57 | 58 | if len(route.ColumnMappings) > 0 { 59 | newColumns := make([]*pglogrepl.RelationMessageColumn, len(message.Columns)) 60 | for i, col := range message.Columns { 61 | newCol := *col 62 | mappedName := GetMappedColumnName(route.ColumnMappings, col.Name) 63 | if mappedName != "" { 64 | newCol.Name = mappedName 65 | } 66 | newColumns[i] = &newCol 67 | } 68 | routedMessage.Columns = newColumns 69 | 70 | if routedMessage.ReplicationKey.Type != utils.ReplicationKeyFull { 71 | mappedColumns := 
make([]string, len(routedMessage.ReplicationKey.Columns)) 72 | for i, keyCol := range routedMessage.ReplicationKey.Columns { 73 | mappedName := GetMappedColumnName(route.ColumnMappings, keyCol) 74 | if mappedName != "" { 75 | mappedColumns[i] = mappedName 76 | } else { 77 | mappedColumns[i] = keyCol 78 | } 79 | } 80 | routedMessage.ReplicationKey.Columns = mappedColumns 81 | } 82 | } 83 | 84 | return &routedMessage, nil 85 | } 86 | 87 | // ContainsOperation checks if the given operation is in the list of operations 88 | func ContainsOperation(operations []utils.OperationType, operation utils.OperationType) bool { 89 | for _, op := range operations { 90 | if op == operation { 91 | return true 92 | } 93 | } 94 | return false 95 | } 96 | 97 | // GetMappedColumnName returns the destination column name for a given source column name 98 | func GetMappedColumnName(mappings []ColumnMapping, sourceName string) string { 99 | for _, mapping := range mappings { 100 | if mapping.Source == sourceName { 101 | return mapping.Destination 102 | } 103 | } 104 | return "" 105 | } 106 | 107 | // LoadRoutes loads routes from the provided configuration 108 | func (r *Router) LoadRoutes(config map[string]TableRoute) error { 109 | for sourceName, route := range config { 110 | r.logger.Info(). 111 | Str("source_table", sourceName). 112 | Str("destination_table", route.DestinationTable). 113 | Any("operations", route.Operations). 114 | Any("column_mappings", route.ColumnMappings). 115 | Msg("Loading route") 116 | 117 | route.SourceTable = sourceName 118 | if route.DestinationTable == "" { 119 | route.DestinationTable = sourceName 120 | } 121 | r.AddRoute(route) 122 | } 123 | return nil 124 | } 125 | -------------------------------------------------------------------------------- /pkg/routing/tests/routing_test.go: -------------------------------------------------------------------------------- 1 | package routing_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/jackc/pglogrepl" 7 | "github.com/pgflo/pg_flo/pkg/routing" 8 | "github.com/pgflo/pg_flo/pkg/utils" 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestRouter_ApplyRouting(t *testing.T) { 13 | tests := []struct { 14 | name string 15 | routes map[string]routing.TableRoute 16 | inputMessage *utils.CDCMessage 17 | expectedOutput *utils.CDCMessage 18 | expectNil bool 19 | }{ 20 | { 21 | name: "Simple table routing", 22 | routes: map[string]routing.TableRoute{ 23 | "source_table": { 24 | SourceTable: "source_table", 25 | DestinationTable: "dest_table", 26 | Operations: []utils.OperationType{utils.OperationInsert, utils.OperationUpdate, utils.OperationDelete}, 27 | }, 28 | }, 29 | inputMessage: &utils.CDCMessage{ 30 | Type: utils.OperationInsert, 31 | Table: "source_table", 32 | Columns: []*pglogrepl.RelationMessageColumn{ 33 | {Name: "id", DataType: 23}, 34 | {Name: "name", DataType: 25}, 35 | }, 36 | }, 37 | expectedOutput: &utils.CDCMessage{ 38 | Type: utils.OperationInsert, 39 | Table: "dest_table", 40 | Columns: []*pglogrepl.RelationMessageColumn{ 41 | {Name: "id", DataType: 23}, 42 | {Name: "name", DataType: 25}, 43 | }, 44 | }, 45 | }, 46 | { 47 | name: "Column mapping", 48 | routes: map[string]routing.TableRoute{ 49 | "users": { 50 | SourceTable: "users", 51 | DestinationTable: "customers", 52 | ColumnMappings: []routing.ColumnMapping{ 53 | {Source: "user_id", Destination: "customer_id"}, 54 | {Source: "user_name", Destination: "customer_name"}, 55 | }, 56 | Operations: []utils.OperationType{utils.OperationInsert, utils.OperationUpdate, 
utils.OperationDelete}, 57 | }, 58 | }, 59 | inputMessage: &utils.CDCMessage{ 60 | Type: utils.OperationUpdate, 61 | Table: "users", 62 | Columns: []*pglogrepl.RelationMessageColumn{ 63 | {Name: "user_id", DataType: 23}, 64 | {Name: "user_name", DataType: 25}, 65 | {Name: "email", DataType: 25}, 66 | }, 67 | }, 68 | expectedOutput: &utils.CDCMessage{ 69 | Type: utils.OperationUpdate, 70 | Table: "customers", 71 | Columns: []*pglogrepl.RelationMessageColumn{ 72 | {Name: "customer_id", DataType: 23}, 73 | {Name: "customer_name", DataType: 25}, 74 | {Name: "email", DataType: 25}, 75 | }, 76 | }, 77 | }, 78 | { 79 | name: "Operation filtering - allowed", 80 | routes: map[string]routing.TableRoute{ 81 | "orders": { 82 | SourceTable: "orders", 83 | DestinationTable: "processed_orders", 84 | Operations: []utils.OperationType{utils.OperationInsert, utils.OperationUpdate}, 85 | }, 86 | }, 87 | inputMessage: &utils.CDCMessage{ 88 | Type: utils.OperationUpdate, 89 | Table: "orders", 90 | }, 91 | expectedOutput: &utils.CDCMessage{ 92 | Type: utils.OperationUpdate, 93 | Table: "processed_orders", 94 | }, 95 | }, 96 | { 97 | name: "Operation filtering - not allowed", 98 | routes: map[string]routing.TableRoute{ 99 | "orders": { 100 | SourceTable: "orders", 101 | DestinationTable: "processed_orders", 102 | Operations: []utils.OperationType{utils.OperationInsert, utils.OperationUpdate}, 103 | }, 104 | }, 105 | inputMessage: &utils.CDCMessage{ 106 | Type: utils.OperationDelete, 107 | Table: "orders", 108 | }, 109 | expectNil: true, 110 | }, 111 | { 112 | name: "No route for table", 113 | routes: map[string]routing.TableRoute{}, 114 | inputMessage: &utils.CDCMessage{ 115 | Type: utils.OperationInsert, 116 | Table: "unknown_table", 117 | }, 118 | expectedOutput: &utils.CDCMessage{ 119 | Type: utils.OperationInsert, 120 | Table: "unknown_table", 121 | }, 122 | }, 123 | } 124 | 125 | for _, tt := range tests { 126 | t.Run(tt.name, func(t *testing.T) { 127 | router := routing.NewRouter() 128 | for _, route := range tt.routes { 129 | router.AddRoute(route) 130 | } 131 | 132 | result, err := router.ApplyRouting(tt.inputMessage) 133 | 134 | assert.NoError(t, err) 135 | 136 | if tt.expectNil { 137 | assert.Nil(t, result) 138 | } else { 139 | assert.NotNil(t, result) 140 | assert.Equal(t, tt.expectedOutput.Type, result.Type) 141 | assert.Equal(t, tt.expectedOutput.Table, result.Table) 142 | assert.Equal(t, len(tt.expectedOutput.Columns), len(result.Columns)) 143 | for i, col := range tt.expectedOutput.Columns { 144 | assert.Equal(t, col.Name, result.Columns[i].Name) 145 | assert.Equal(t, col.DataType, result.Columns[i].DataType) 146 | } 147 | } 148 | }) 149 | } 150 | } 151 | 152 | func TestRouter_LoadRoutes(t *testing.T) { 153 | router := routing.NewRouter() 154 | config := map[string]routing.TableRoute{ 155 | "table1": { 156 | SourceTable: "table1", 157 | DestinationTable: "dest_table1", 158 | ColumnMappings: []routing.ColumnMapping{ 159 | {Source: "col1", Destination: "dest_col1"}, 160 | }, 161 | Operations: []utils.OperationType{utils.OperationInsert, utils.OperationUpdate}, 162 | }, 163 | "table2": { 164 | SourceTable: "table2", 165 | DestinationTable: "dest_table2", 166 | Operations: []utils.OperationType{utils.OperationInsert, utils.OperationUpdate, utils.OperationDelete}, 167 | }, 168 | } 169 | 170 | err := router.LoadRoutes(config) 171 | assert.NoError(t, err) 172 | 173 | assert.Len(t, router.Routes, 2) 174 | assert.Contains(t, router.Routes, "table1") 175 | assert.Contains(t, router.Routes, "table2") 176 | 
177 | assert.Equal(t, "dest_table1", router.Routes["table1"].DestinationTable) 178 | assert.Equal(t, "dest_table2", router.Routes["table2"].DestinationTable) 179 | 180 | assert.Len(t, router.Routes["table1"].ColumnMappings, 1) 181 | assert.Len(t, router.Routes["table1"].Operations, 2) 182 | assert.Len(t, router.Routes["table2"].Operations, 3) 183 | } 184 | 185 | func TestRouter_AddRoute(t *testing.T) { 186 | router := routing.NewRouter() 187 | route := routing.TableRoute{ 188 | SourceTable: "source", 189 | DestinationTable: "destination", 190 | ColumnMappings: []routing.ColumnMapping{ 191 | {Source: "src_col", Destination: "dest_col"}, 192 | }, 193 | Operations: []utils.OperationType{utils.OperationInsert}, 194 | } 195 | 196 | router.AddRoute(route) 197 | 198 | assert.Len(t, router.Routes, 1) 199 | assert.Contains(t, router.Routes, "source") 200 | assert.Equal(t, route, router.Routes["source"]) 201 | } 202 | 203 | func TestContainsOperation(t *testing.T) { 204 | operations := []utils.OperationType{utils.OperationInsert, utils.OperationUpdate} 205 | 206 | assert.True(t, routing.ContainsOperation(operations, utils.OperationInsert)) 207 | assert.True(t, routing.ContainsOperation(operations, utils.OperationUpdate)) 208 | assert.False(t, routing.ContainsOperation(operations, utils.OperationDelete)) 209 | } 210 | 211 | func TestGetMappedColumnName(t *testing.T) { 212 | mappings := []routing.ColumnMapping{ 213 | {Source: "col1", Destination: "mapped_col1"}, 214 | {Source: "col2", Destination: "mapped_col2"}, 215 | } 216 | 217 | assert.Equal(t, "mapped_col1", routing.GetMappedColumnName(mappings, "col1")) 218 | assert.Equal(t, "mapped_col2", routing.GetMappedColumnName(mappings, "col2")) 219 | assert.Equal(t, "", routing.GetMappedColumnName(mappings, "col3")) 220 | } 221 | -------------------------------------------------------------------------------- /pkg/rules/README.md: -------------------------------------------------------------------------------- 1 | ## Transformation Rules 2 | 3 | There are two types of transformation rules available: 4 | 5 | 1. **Regex Transform** 6 | 7 | - Type: `"regex"` 8 | - Parameters: 9 | - `pattern`: The regular expression pattern to match 10 | - `replace`: The replacement string 11 | - Description: Applies a regular expression replacement on string values in the specified column. 12 | 13 | 2. **Mask Transform** 14 | - Type: `"mask"` 15 | - Parameters: 16 | - `mask_char`: The character to use for masking 17 | - Description: Masks the content of string values, keeping the first and last characters visible and replacing the rest with the specified mask character. 18 | 19 | ## Filtering Rules 20 | 21 | Filtering rules use various comparison operators to determine whether a row should be included in the output. The available operators are: 22 | 23 | 1. **Equality** (`"eq"`) 24 | 2. **Inequality** (`"ne"`) 25 | 3. **Greater Than** (`"gt"`) 26 | 4. **Less Than** (`"lt"`) 27 | 5. **Greater Than or Equal To** (`"gte"`) 28 | 6. **Less Than or Equal To** (`"lte"`) 29 | 7. **Contains** (`"contains"`) 30 | 31 | ## Rule Properties 32 | 33 | Both transformation and filtering rules share these common properties: 34 | 35 | - `type`: Specifies whether it's a "transform" or "filter" rule. 36 | - `column`: The name of the column to apply the rule to. 37 | - `operations`: An array of operations (INSERT, UPDATE, DELETE) to which the rule should be applied. If not specified, it applies to all operations. 
38 | - `allow_empty_deletes`: A boolean flag that, when set to true, allows the rule to process delete operations even if the column value is empty. 39 | 40 | ## Additional Notes 41 | 42 | - The rules support various data types, including integers, floats, strings, timestamps, booleans, and numeric (decimal) values. 43 | - For filtering rules, the comparison is type-aware, ensuring that values are compared appropriately based on their data type. 44 | - The `contains` operator for filtering only works on string values. 45 | - Transformation rules currently only work on string values. If a non-string value is encountered, the transformation is skipped and a warning is logged. 46 | 47 | To use these rules, you would define them in a YAML configuration file and specify the path to this file using the `--rules-config` flag when running `pg_flo`. The exact structure of the YAML file should match the rule properties and parameters described above. 48 | -------------------------------------------------------------------------------- /pkg/rules/engine.go: -------------------------------------------------------------------------------- 1 | package rules 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/pgflo/pg_flo/pkg/utils" 7 | ) 8 | 9 | // AddRule adds a new rule for the specified table 10 | func (re *RuleEngine) AddRule(tableName string, rule Rule) { 11 | re.mutex.Lock() 12 | defer re.mutex.Unlock() 13 | re.Rules[tableName] = append(re.Rules[tableName], rule) 14 | } 15 | 16 | // ApplyRules applies all rules for the specified table to the given data 17 | func (re *RuleEngine) ApplyRules(message *utils.CDCMessage) (*utils.CDCMessage, error) { 18 | re.mutex.RLock() 19 | defer re.mutex.RUnlock() 20 | 21 | rules, exists := re.Rules[message.Table] 22 | if !exists { 23 | return message, nil // No rules for this table 24 | } 25 | 26 | logger.Info(). 27 | Str("table", message.Table). 28 | Str("operation", string(message.Type)). 29 | Int("ruleCount", len(rules)). 30 | Msg("Applying rules") 31 | 32 | var err error 33 | for _, rule := range rules { 34 | message, err = rule.Apply(message) 35 | if err != nil { 36 | return nil, err 37 | } 38 | if message == nil { 39 | // Message filtered out 40 | return nil, nil 41 | } 42 | } 43 | return message, nil 44 | } 45 | 46 | // LoadRules loads rules from the provided configuration 47 | func (re *RuleEngine) LoadRules(config Config) error { 48 | for tableName, ruleConfigs := range config.Tables { 49 | logger.Info().Str("table", tableName).Msg("Loading rules for table") 50 | for i, ruleConfig := range ruleConfigs { 51 | rule, err := createRule(tableName, ruleConfig) 52 | if err != nil { 53 | return fmt.Errorf("error creating rule for table %s: %w", tableName, err) 54 | } 55 | logger.Info(). 56 | Str("table", tableName). 57 | Int("ruleIndex", i+1). 58 | Str("ruleType", fmt.Sprintf("%T", rule)). 59 | Msg("Created rule") 60 | re.AddRule(tableName, rule) 61 | } 62 | } 63 | return nil 64 | } 65 | 66 | // createRule creates a new rule based on the provided configuration 67 | func createRule(tableName string, config RuleConfig) (Rule, error) { 68 | logger.Info(). 69 | Str("table", tableName). 70 | Str("ruleType", config.Type). 
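	// Illustrative (hypothetical) rules-config YAML that would reach this
	// function one RuleConfig at a time; the structure follows Config and
	// RuleConfig in types.go, and the parameter keys mirror those used in the
	// engine tests below:
	//
	//   tables:
	//     users:
	//       - type: transform
	//         column: test_column
	//         parameters:
	//           type: mask
	//           mask_char: "*"
	//       - type: filter
	//         column: id
	//         parameters:
	//           operator: gt
	//           value: 100
	//         operations:
	//           - DELETE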
71 | Msg("Creating rule") 72 | switch config.Type { 73 | case "transform": 74 | return NewTransformRule(tableName, config.Column, config.Parameters) 75 | case "filter": 76 | return NewFilterRule(tableName, config.Column, config.Parameters) 77 | default: 78 | return nil, fmt.Errorf("unknown rule type: %s", config.Type) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /pkg/rules/tests/engine_test.go: -------------------------------------------------------------------------------- 1 | package rules_test 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "testing" 7 | 8 | "github.com/jackc/pglogrepl" 9 | "github.com/jackc/pgtype" 10 | "github.com/pgflo/pg_flo/pkg/rules" 11 | "github.com/pgflo/pg_flo/pkg/utils" 12 | "github.com/stretchr/testify/assert" 13 | ) 14 | 15 | func TestMain(m *testing.M) { 16 | log.SetOutput(os.Stdout) 17 | os.Exit(m.Run()) 18 | } 19 | 20 | func TestRuleEngine_AddRule(t *testing.T) { 21 | re := rules.NewRuleEngine() 22 | rule := &MockRule{ 23 | TableName: "users", 24 | ColumnName: "test_column", 25 | ApplyFunc: func(message *utils.CDCMessage) (*utils.CDCMessage, error) { 26 | return message, nil 27 | }, 28 | } 29 | re.AddRule("users", rule) 30 | 31 | message := &utils.CDCMessage{ 32 | Type: utils.OperationInsert, 33 | Schema: "public", 34 | Table: "users", 35 | Columns: []*pglogrepl.RelationMessageColumn{ 36 | {Name: "test_column", DataType: pgtype.TextOID}, 37 | }, 38 | NewTuple: &pglogrepl.TupleData{ 39 | Columns: []*pglogrepl.TupleDataColumn{ 40 | {Data: []byte("original")}, 41 | }, 42 | }, 43 | } 44 | 45 | result, err := re.ApplyRules(message) 46 | assert.NoError(t, err) 47 | assert.NotNil(t, result) 48 | } 49 | 50 | func TestRuleEngine_ApplyRules(t *testing.T) { 51 | re := rules.NewRuleEngine() 52 | rule := &MockRule{ 53 | TableName: "users", 54 | ColumnName: "test_column", 55 | ApplyFunc: func(message *utils.CDCMessage) (*utils.CDCMessage, error) { 56 | message.NewTuple.Columns[0].Data = []byte("transformed") 57 | return message, nil 58 | }, 59 | } 60 | re.AddRule("users", rule) 61 | 62 | message := &utils.CDCMessage{ 63 | Type: utils.OperationInsert, 64 | Schema: "public", 65 | Table: "users", 66 | Columns: []*pglogrepl.RelationMessageColumn{ 67 | {Name: "test_column", DataType: pgtype.TextOID}, 68 | }, 69 | NewTuple: &pglogrepl.TupleData{ 70 | Columns: []*pglogrepl.TupleDataColumn{ 71 | {Data: []byte("original")}, 72 | }, 73 | }, 74 | } 75 | 76 | result, err := re.ApplyRules(message) 77 | 78 | assert.NoError(t, err) 79 | value, err := result.GetColumnValue("test_column", false) 80 | assert.NoError(t, err) 81 | assert.Equal(t, "transformed", value) 82 | } 83 | 84 | func TestRuleEngine_ApplyRules_NoRules(t *testing.T) { 85 | re := rules.NewRuleEngine() 86 | message := &utils.CDCMessage{ 87 | Type: utils.OperationInsert, 88 | Schema: "public", 89 | Table: "users", 90 | Columns: []*pglogrepl.RelationMessageColumn{ 91 | {Name: "test_column", DataType: pgtype.TextOID}, 92 | }, 93 | NewTuple: &pglogrepl.TupleData{ 94 | Columns: []*pglogrepl.TupleDataColumn{ 95 | {Data: []byte("original")}, 96 | }, 97 | }, 98 | } 99 | 100 | result, err := re.ApplyRules(message) 101 | 102 | assert.NoError(t, err) 103 | assert.Equal(t, message, result) 104 | } 105 | 106 | func TestRuleEngine_LoadRules_Transform(t *testing.T) { 107 | re := rules.NewRuleEngine() 108 | config := rules.Config{ 109 | Tables: map[string][]rules.RuleConfig{ 110 | "users": { 111 | { 112 | Type: "transform", 113 | Column: "test_column", 114 | Parameters: 
map[string]interface{}{ 115 | "type": "mask", 116 | "mask_char": "*", 117 | }, 118 | Operations: []utils.OperationType{utils.OperationInsert, utils.OperationUpdate}, 119 | }, 120 | }, 121 | }, 122 | } 123 | 124 | err := re.LoadRules(config) 125 | assert.NoError(t, err) 126 | 127 | message := &utils.CDCMessage{ 128 | Type: utils.OperationInsert, 129 | Schema: "public", 130 | Table: "users", 131 | Columns: []*pglogrepl.RelationMessageColumn{ 132 | {Name: "test_column", DataType: pgtype.TextOID}, 133 | }, 134 | NewTuple: &pglogrepl.TupleData{ 135 | Columns: []*pglogrepl.TupleDataColumn{ 136 | {Data: []byte("test")}, 137 | }, 138 | }, 139 | } 140 | 141 | result, err := re.ApplyRules(message) 142 | assert.NoError(t, err) 143 | assert.NotNil(t, result) 144 | value, err := result.GetColumnValue("test_column", false) 145 | assert.NoError(t, err) 146 | assert.Equal(t, "t**t", value) 147 | } 148 | 149 | func TestRuleEngine_LoadRules_Filter(t *testing.T) { 150 | re := rules.NewRuleEngine() 151 | config := rules.Config{ 152 | Tables: map[string][]rules.RuleConfig{ 153 | "users": { 154 | { 155 | Type: "filter", 156 | Column: "id", 157 | Parameters: map[string]interface{}{ 158 | "operator": "gt", 159 | "value": int64(100), 160 | }, 161 | Operations: []utils.OperationType{utils.OperationDelete}, 162 | }, 163 | }, 164 | }, 165 | } 166 | 167 | err := re.LoadRules(config) 168 | assert.NoError(t, err) 169 | 170 | message := &utils.CDCMessage{ 171 | Type: utils.OperationDelete, 172 | Schema: "public", 173 | Table: "users", 174 | Columns: []*pglogrepl.RelationMessageColumn{ 175 | {Name: "id", DataType: pgtype.Int8OID}, 176 | }, 177 | OldTuple: &pglogrepl.TupleData{ 178 | Columns: []*pglogrepl.TupleDataColumn{ 179 | {Data: []byte("101")}, 180 | }, 181 | }, 182 | } 183 | 184 | result, err := re.ApplyRules(message) 185 | assert.NoError(t, err) 186 | assert.NotNil(t, result) 187 | value, err := result.GetColumnValue("id", true) 188 | assert.NoError(t, err) 189 | assert.Equal(t, int64(101), value) 190 | 191 | message.OldTuple.Columns[0].Data = []byte("99") 192 | result, err = re.ApplyRules(message) 193 | assert.NoError(t, err) 194 | assert.Nil(t, result) 195 | } 196 | 197 | func TestRuleEngine_LoadRules_EmptyDeletes(t *testing.T) { 198 | re := rules.NewRuleEngine() 199 | config := rules.Config{ 200 | Tables: map[string][]rules.RuleConfig{ 201 | "users": { 202 | { 203 | Type: "filter", 204 | Column: "id", 205 | AllowEmptyDeletes: true, 206 | Parameters: map[string]interface{}{ 207 | "operator": "eq", 208 | "value": int64(101), 209 | }, 210 | Operations: []utils.OperationType{utils.OperationDelete}, 211 | }, 212 | }, 213 | }, 214 | } 215 | 216 | err := re.LoadRules(config) 217 | assert.NoError(t, err) 218 | 219 | message := &utils.CDCMessage{ 220 | Type: utils.OperationDelete, 221 | Schema: "public", 222 | Table: "users", 223 | Columns: []*pglogrepl.RelationMessageColumn{ 224 | {Name: "id", DataType: pgtype.Int8OID}, 225 | }, 226 | OldTuple: &pglogrepl.TupleData{ 227 | Columns: []*pglogrepl.TupleDataColumn{ 228 | {Data: []byte("101")}, 229 | }, 230 | }, 231 | } 232 | 233 | result, err := re.ApplyRules(message) 234 | assert.NoError(t, err) 235 | assert.NotNil(t, result) 236 | value, err := result.GetColumnValue("id", true) 237 | assert.NoError(t, err) 238 | assert.Equal(t, int64(101), value) 239 | } 240 | 241 | func TestRuleEngine_ApplyRules_FilterRule(t *testing.T) { 242 | re := rules.NewRuleEngine() 243 | config := rules.Config{ 244 | Tables: map[string][]rules.RuleConfig{ 245 | "users": { 246 | { 247 | Type: 
"filter", 248 | Column: "id", 249 | Parameters: map[string]interface{}{ 250 | "operator": "gt", 251 | "value": int64(100), 252 | }, 253 | Operations: []utils.OperationType{utils.OperationUpdate}, 254 | }, 255 | }, 256 | }, 257 | } 258 | 259 | err := re.LoadRules(config) 260 | assert.NoError(t, err) 261 | 262 | message := &utils.CDCMessage{ 263 | Type: utils.OperationUpdate, 264 | Schema: "public", 265 | Table: "users", 266 | Columns: []*pglogrepl.RelationMessageColumn{ 267 | {Name: "id", DataType: pgtype.Int8OID}, 268 | }, 269 | NewTuple: &pglogrepl.TupleData{ 270 | Columns: []*pglogrepl.TupleDataColumn{ 271 | {Data: []byte("101")}, 272 | }, 273 | }, 274 | } 275 | result, err := re.ApplyRules(message) 276 | 277 | assert.NoError(t, err) 278 | assert.NotNil(t, result) 279 | idValue, err := result.GetColumnValue("id", false) 280 | assert.NoError(t, err) 281 | assert.Equal(t, int64(101), idValue) 282 | 283 | message.NewTuple.Columns[0].Data = []byte("99") 284 | result, err = re.ApplyRules(message) 285 | 286 | assert.NoError(t, err) 287 | assert.Nil(t, result) 288 | 289 | message.Type = utils.OperationInsert 290 | message.NewTuple.Columns[0].Data = []byte("101") 291 | result, err = re.ApplyRules(message) 292 | 293 | assert.NoError(t, err) 294 | assert.NotNil(t, result) 295 | idValue, err = result.GetColumnValue("id", false) 296 | assert.NoError(t, err) 297 | assert.Equal(t, int64(101), idValue) 298 | } 299 | -------------------------------------------------------------------------------- /pkg/rules/tests/mocks_test.go: -------------------------------------------------------------------------------- 1 | package rules_test 2 | 3 | import ( 4 | "github.com/pgflo/pg_flo/pkg/utils" 5 | ) 6 | 7 | type MockRule struct { 8 | TableName string 9 | ColumnName string 10 | ApplyFunc func(*utils.CDCMessage) (*utils.CDCMessage, error) 11 | } 12 | 13 | func (r *MockRule) Apply(message *utils.CDCMessage) (*utils.CDCMessage, error) { 14 | return r.ApplyFunc(message) 15 | } 16 | -------------------------------------------------------------------------------- /pkg/rules/types.go: -------------------------------------------------------------------------------- 1 | package rules 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/pgflo/pg_flo/pkg/utils" 7 | ) 8 | 9 | // Rule interface defines the methods that all rules must implement 10 | type Rule interface { 11 | Apply(message *utils.CDCMessage) (*utils.CDCMessage, error) 12 | } 13 | 14 | // RuleConfig represents the configuration for a single rule 15 | type RuleConfig struct { 16 | Type string `yaml:"type"` 17 | Column string `yaml:"column"` 18 | Parameters map[string]interface{} `yaml:"parameters"` 19 | Operations []utils.OperationType `yaml:"operations,omitempty"` 20 | AllowEmptyDeletes bool `yaml:"allow_empty_deletes,omitempty"` 21 | } 22 | 23 | // Config represents the overall configuration for rules 24 | type Config struct { 25 | Tables map[string][]RuleConfig `yaml:"tables"` 26 | } 27 | 28 | // TransformRule represents a rule that transforms data 29 | type TransformRule struct { 30 | TableName string 31 | ColumnName string 32 | Transform func(*utils.CDCMessage) (*utils.CDCMessage, error) 33 | Operations []utils.OperationType 34 | AllowEmptyDeletes bool 35 | } 36 | 37 | // FilterRule represents a rule that filters data 38 | type FilterRule struct { 39 | TableName string 40 | ColumnName string 41 | Condition func(*utils.CDCMessage) bool 42 | Operations []utils.OperationType 43 | AllowEmptyDeletes bool 44 | } 45 | 46 | // RuleEngine manages and applies rules to 
data 47 | type RuleEngine struct { 48 | Rules map[string][]Rule // map of table name to slice of rules 49 | mutex sync.RWMutex 50 | } 51 | 52 | // NewRuleEngine creates a new RuleEngine instance 53 | func NewRuleEngine() *RuleEngine { 54 | return &RuleEngine{ 55 | Rules: make(map[string][]Rule), 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /pkg/sinks/README.md: -------------------------------------------------------------------------------- 1 | # Supported Sinks in pg_flo 2 | 3 | pg_flo supports various sink types (destinations) for streaming data changes. This document provides an overview of the supported sinks and how to use them via the command-line interface. 4 | 5 | - [Available Sinks](#available-sinks) 6 | - [Common Flags](#common-flags) 7 | - [STDOUT Sink](#stdout-sink) 8 | - [Usage](#usage) 9 | - [Example](#example) 10 | - [File Sink](#file-sink) 11 | - [Usage](#usage-1) 12 | - [Additional Flags](#additional-flags) 13 | - [Example](#example-1) 14 | - [PostgreSQL Sink](#postgresql-sink) 15 | - [Usage](#usage-2) 16 | - [Additional Flags](#additional-flags-1) 17 | - [Example](#example-2) 18 | - [Additional Behavior](#additional-behavior) 19 | - [Webhook Sink](#webhook-sink) 20 | - [Usage](#usage-3) 21 | - [Additional Flags](#additional-flags-2) 22 | - [Example](#example-3) 23 | - [Additional Behavior](#additional-behavior-1) 24 | - [Sink Interface](#sink-interface) 25 | 26 | ## Available Sinks 27 | 28 | 1. STDOUT 29 | 2. File 30 | 3. PostgreSQL 31 | 4. Webhook 32 | 33 | ## Common Flags 34 | 35 | These flags are common to all sink types: 36 | 37 | - `--host`: PostgreSQL source host 38 | - `--port`: PostgreSQL source port 39 | - `--dbname`: PostgreSQL source database name 40 | - `--user`: PostgreSQL source user 41 | - `--password`: PostgreSQL source password 42 | - `--group`: Group name for replication 43 | - `--tables`: Tables to replicate (comma-separated) 44 | - `--status-dir`: Directory to store status files 45 | 46 | ## STDOUT Sink 47 | 48 | The STDOUT sink writes changes directly to the console output. 49 | 50 | ### Usage 51 | 52 | ```shell 53 | pg_flo stream stdout [common flags] 54 | ``` 55 | 56 | ### Example 57 | 58 | ```shell 59 | pg_flo stream stdout \ 60 | --host localhost \ 61 | --port 5432 \ 62 | --dbname your_database \ 63 | --user your_user \ 64 | --password your_password \ 65 | --group your_group \ 66 | --tables table1,table2 \ 67 | --status-dir /tmp/pg_flo-status 68 | ``` 69 | 70 | ## File Sink 71 | 72 | The File sink writes changes to files in the specified output directory. 73 | 74 | ### Usage 75 | 76 | ```shell 77 | pg_flo stream file [common flags] --output-dir 78 | ``` 79 | 80 | ### Additional Flags 81 | 82 | - `--output-dir`: Output directory for file sink 83 | 84 | ### Example 85 | 86 | ```shell 87 | pg_flo stream file \ 88 | --host localhost \ 89 | --port 5432 \ 90 | --dbname your_database \ 91 | --user your_user \ 92 | --password your_password \ 93 | --group your_group \ 94 | --tables table1,table2 \ 95 | --status-dir /tmp/pg_flo-status \ 96 | --output-dir /tmp/pg_flo-output 97 | ``` 98 | 99 | ## PostgreSQL Sink 100 | 101 | The PostgreSQL sink replicates changes to another PostgreSQL database. To ensure accurate replication of updates and deletes, all tables must have a primary key defined. 
102 | 103 | ### Usage 104 | 105 | ```shell 106 | pg_flo stream postgres [common flags] [postgres sink flags] 107 | ``` 108 | 109 | ### Additional Flags 110 | 111 | - `--target-host`: Target PostgreSQL host 112 | - `--target-port`: Target PostgreSQL port 113 | - `--target-dbname`: Target PostgreSQL database name 114 | - `--target-user`: Target PostgreSQL user 115 | - `--target-password`: Target PostgreSQL password 116 | - `--sync-schema`: Sync schema from source to target via `pg_dump` (boolean flag) 117 | 118 | ### Example 119 | 120 | ```shell 121 | pg_flo stream postgres \ 122 | --host localhost \ 123 | --port 5432 \ 124 | --dbname source_db \ 125 | --user source_user \ 126 | --password source_password \ 127 | --group replication_group \ 128 | --tables table1,table2 \ 129 | --schema public \ 130 | --status-dir /tmp/pg_flo-status \ 131 | --target-host target.host.com \ 132 | --target-port 5433 \ 133 | --target-dbname target_db \ 134 | --target-user target_user \ 135 | --target-password target_password \ 136 | --sync-schema 137 | ``` 138 | 139 | ### Additional Behavior 140 | 141 | - Supports schema synchronization between source and target databases using `pg_dump` when the `--sync-schema` flag is set. 142 | - Creates an `internal_pg_flo` schema and `lsn_status` table to keep track of the last processed LSN. 143 | - Handles `INSERT`, `UPDATE`, `DELETE`, and `DDL` operations. 144 | - Uses `UPSERT` (`INSERT ... ON CONFLICT DO UPDATE`) for handling both `INSERT` and `UPDATE` operations efficiently. 145 | - Executes operations within a transaction for each batch of changes. 146 | - Rolls back the transaction and logs an error if any operation in the batch fails. 147 | 148 | ## Webhook Sink 149 | 150 | The Webhook sink sends changes as HTTP POST requests to a specified URL. 151 | 152 | ### Usage 153 | 154 | ```shell 155 | pg_flo stream webhook [common flags] --webhook-url 156 | ``` 157 | 158 | ### Additional Flags 159 | 160 | - `--webhook-url`: URL to send webhook POST requests 161 | 162 | ### Example 163 | 164 | ```shell 165 | pg_flo stream webhook \ 166 | --host localhost \ 167 | --port 5432 \ 168 | --dbname your_database \ 169 | --user your_user \ 170 | --password your_password \ 171 | --group your_group \ 172 | --tables table1,table2 \ 173 | --schema public \ 174 | --status-dir /tmp/pg_flo-status \ 175 | --webhook-url https://your-webhook-endpoint.com/receive 176 | ``` 177 | 178 | ### Additional Behavior 179 | 180 | - Sends each change as a separate HTTP POST request to the specified webhook URL. 181 | - Implements a retry mechanism with up to 3 attempts for failed requests. 182 | - Considers both network errors and non-2xx status codes as failures that trigger retries. 183 | - Maintains a status file to keep track of the last processed LSN. 184 | - The status file is stored in the specified status directory with the name `pg_flo_webhook_last_lsn.json`. 185 | 186 | ## Sink Interface 187 | 188 | `pg_flo` uses a common interface for all sink types, allowing for easy implementation of new sinks. The `Sink` interface defines the following methods: 189 | 190 | - `WriteBatch(data []interface{}) error`: Writes a batch of changes to the sink. 191 | - `Close() error`: Closes the sink, releasing any resources or connections. 192 | 193 | Sinks can save the last processed `LSN` at the destination (as appropriate). This ensures that if a `pg_flo` process shuts down (for example, during a deployment) and starts again, it knows where to resume from. 
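As a rough illustration of how a new sink could be added, the sketch below implements the interface as it is declared in `pkg/sinks/sink.go` (`WriteBatch(data []*utils.CDCMessage) error`), plus the `Close` method that the built-in file and webhook sinks also provide. The `CountingSink` type and its constructor are hypothetical names invented for this example; they are not part of pg_flo.

```go
package sinks

import (
	"fmt"

	"github.com/pgflo/pg_flo/pkg/utils"
)

// CountingSink is an illustrative sink that tallies changes per table
// instead of persisting them anywhere.
type CountingSink struct {
	counts map[string]int
}

// NewCountingSink creates a new CountingSink instance.
func NewCountingSink() (*CountingSink, error) {
	return &CountingSink{counts: make(map[string]int)}, nil
}

// WriteBatch records each message in the batch; a real sink would write
// the batch to its destination and return an error on failure.
func (s *CountingSink) WriteBatch(messages []*utils.CDCMessage) error {
	for _, message := range messages {
		key := fmt.Sprintf("%s.%s", message.Schema, message.Table)
		s.counts[key]++
	}
	return nil
}

// Close releases any resources; this sink has none, so it only reports totals.
func (s *CountingSink) Close() error {
	for table, n := range s.counts {
		fmt.Printf("%s: %d changes\n", table, n)
	}
	return nil
}
```

A sink written this way satisfies the `Sink` interface and could be handed to the worker in place of the built-in stdout, file, PostgreSQL, or webhook sinks.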
194 | -------------------------------------------------------------------------------- /pkg/sinks/file.go: -------------------------------------------------------------------------------- 1 | package sinks 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | "sync" 8 | "time" 9 | 10 | "github.com/goccy/go-json" 11 | "github.com/pgflo/pg_flo/pkg/utils" 12 | 13 | "github.com/rs/zerolog" 14 | "github.com/rs/zerolog/log" 15 | ) 16 | 17 | func init() { 18 | log.Logger = log.Output(zerolog.ConsoleWriter{ 19 | Out: os.Stderr, 20 | TimeFormat: "15:04:05.000", 21 | }) 22 | } 23 | 24 | // FileSink represents a sink that writes data to files 25 | type FileSink struct { 26 | outputDir string 27 | currentFile *os.File 28 | currentSize int64 29 | maxFileSize int64 30 | rotateInterval time.Duration 31 | lastRotation time.Time 32 | mutex sync.Mutex 33 | } 34 | 35 | // NewFileSink creates a new FileSink instance 36 | func NewFileSink(outputDir string) (*FileSink, error) { 37 | sink := &FileSink{ 38 | outputDir: outputDir, 39 | maxFileSize: 100 * 1024 * 1024, // 100 MB 40 | rotateInterval: time.Hour, // Rotate every hour if size limit not reached 41 | } 42 | 43 | if err := os.MkdirAll(outputDir, 0755); err != nil { 44 | return nil, fmt.Errorf("failed to create output directory: %v", err) 45 | } 46 | 47 | if err := sink.rotateFile(); err != nil { 48 | return nil, fmt.Errorf("failed to create initial log file: %v", err) 49 | } 50 | 51 | return sink, nil 52 | } 53 | 54 | // rotateFile creates a new log file and updates the current file pointer 55 | func (s *FileSink) rotateFile() error { 56 | if s.currentFile != nil { 57 | err := s.currentFile.Close() 58 | if err != nil { 59 | return err 60 | } 61 | s.currentFile = nil 62 | } 63 | 64 | timestamp := time.Now().UTC().Format("20060102T150405Z") 65 | filename := fmt.Sprintf("pg_flo_log_%s.jsonl", timestamp) 66 | filepath := filepath.Join(s.outputDir, filename) 67 | 68 | file, err := os.OpenFile(filepath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 69 | if err != nil { 70 | return fmt.Errorf("failed to create new log file: %v", err) 71 | } 72 | 73 | s.currentFile = file 74 | s.currentSize = 0 75 | s.lastRotation = time.Now() 76 | 77 | log.Info().Str("file", filepath).Msg("Rotated to new log file") 78 | return nil 79 | } 80 | 81 | // WriteBatch writes a batch of data to the current log file 82 | func (s *FileSink) WriteBatch(messages []*utils.CDCMessage) error { 83 | s.mutex.Lock() 84 | defer s.mutex.Unlock() 85 | 86 | for _, message := range messages { 87 | decodedMessage, err := buildDecodedMessage(message) 88 | if err != nil { 89 | return fmt.Errorf("failed to build decoded message: %v", err) 90 | } 91 | 92 | jsonData, err := json.Marshal(decodedMessage) 93 | if err != nil { 94 | return fmt.Errorf("failed to marshal data to JSON: %v", err) 95 | } 96 | 97 | if s.currentFile == nil || s.currentSize >= s.maxFileSize || time.Since(s.lastRotation) >= s.rotateInterval { 98 | if err := s.rotateFile(); err != nil { 99 | return err 100 | } 101 | } 102 | 103 | jsonData = append(jsonData, '\n') 104 | n, err := s.currentFile.Write(jsonData) 105 | if err != nil { 106 | return fmt.Errorf("failed to write to log file: %v", err) 107 | } 108 | 109 | s.currentSize += int64(n) 110 | } 111 | return nil 112 | } 113 | 114 | // Close closes the current log file and performs any necessary cleanup 115 | func (s *FileSink) Close() error { 116 | s.mutex.Lock() 117 | defer s.mutex.Unlock() 118 | 119 | if s.currentFile != nil { 120 | err := s.currentFile.Close() 121 | s.currentFile = 
nil 122 | return err 123 | } 124 | return nil 125 | } 126 | -------------------------------------------------------------------------------- /pkg/sinks/shared.go: -------------------------------------------------------------------------------- 1 | package sinks 2 | 3 | import "github.com/pgflo/pg_flo/pkg/utils" 4 | 5 | func buildDecodedMessage(message *utils.CDCMessage) (map[string]interface{}, error) { 6 | decodedMessage := make(map[string]interface{}) 7 | decodedMessage["Type"] = message.Type 8 | decodedMessage["Schema"] = message.Schema 9 | decodedMessage["Table"] = message.Table 10 | decodedMessage["ReplicationKey"] = message.ReplicationKey 11 | decodedMessage["LSN"] = message.LSN 12 | decodedMessage["EmittedAt"] = message.EmittedAt 13 | 14 | if message.NewTuple != nil { 15 | newTuple := make(map[string]interface{}) 16 | for _, col := range message.Columns { 17 | value, err := message.GetColumnValue(col.Name, false) 18 | if err != nil { 19 | return nil, err 20 | } 21 | newTuple[col.Name] = value 22 | } 23 | decodedMessage["NewTuple"] = newTuple 24 | } 25 | 26 | if message.OldTuple != nil { 27 | oldTuple := make(map[string]interface{}) 28 | for _, col := range message.Columns { 29 | value, err := message.GetColumnValue(col.Name, true) 30 | if err != nil { 31 | return nil, err 32 | } 33 | oldTuple[col.Name] = value 34 | } 35 | decodedMessage["OldTuple"] = oldTuple 36 | } 37 | 38 | return decodedMessage, nil 39 | } 40 | -------------------------------------------------------------------------------- /pkg/sinks/sink.go: -------------------------------------------------------------------------------- 1 | package sinks 2 | 3 | import ( 4 | "github.com/pgflo/pg_flo/pkg/utils" 5 | ) 6 | 7 | type Sink interface { 8 | WriteBatch(data []*utils.CDCMessage) error 9 | } 10 | -------------------------------------------------------------------------------- /pkg/sinks/stdout.go: -------------------------------------------------------------------------------- 1 | package sinks 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/goccy/go-json" 7 | "github.com/pgflo/pg_flo/pkg/utils" 8 | ) 9 | 10 | // StdoutSink represents a sink that writes data to standard output 11 | type StdoutSink struct{} 12 | 13 | // NewStdoutSink creates a new StdoutSink instance 14 | func NewStdoutSink() (*StdoutSink, error) { 15 | return &StdoutSink{}, nil 16 | } 17 | 18 | // WriteBatch writes a batch of data to standard output 19 | func (s *StdoutSink) WriteBatch(messages []*utils.CDCMessage) error { 20 | for _, message := range messages { 21 | decodedMessage, err := buildDecodedMessage(message) 22 | if err != nil { 23 | return fmt.Errorf("failed to build decoded message: %v", err) 24 | } 25 | 26 | jsonData, err := json.Marshal(decodedMessage) 27 | if err != nil { 28 | return fmt.Errorf("failed to marshal data to JSON: %v", err) 29 | } 30 | 31 | if _, err := fmt.Println(string(jsonData)); err != nil { 32 | return err 33 | } 34 | } 35 | return nil 36 | } 37 | -------------------------------------------------------------------------------- /pkg/sinks/types.go: -------------------------------------------------------------------------------- 1 | package sinks 2 | 3 | import "github.com/jackc/pglogrepl" 4 | 5 | type Status struct { 6 | LastLSN pglogrepl.LSN `json:"last_lsn"` 7 | } 8 | -------------------------------------------------------------------------------- /pkg/sinks/webhooks.go: -------------------------------------------------------------------------------- 1 | package sinks 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "net/http" 
7 | "os" 8 | 9 | "github.com/goccy/go-json" 10 | "github.com/pgflo/pg_flo/pkg/utils" 11 | "github.com/rs/zerolog" 12 | "github.com/rs/zerolog/log" 13 | ) 14 | 15 | func init() { 16 | log.Logger = log.Output(zerolog.ConsoleWriter{ 17 | Out: os.Stderr, 18 | TimeFormat: "15:04:05.000", 19 | }) 20 | } 21 | 22 | // WebhookSink represents a sink that sends data to a webhook endpoint 23 | type WebhookSink struct { 24 | webhookURL string 25 | client *http.Client 26 | } 27 | 28 | // NewWebhookSink creates a new WebhookSink instance 29 | func NewWebhookSink(webhookURL string) (*WebhookSink, error) { 30 | sink := &WebhookSink{ 31 | webhookURL: webhookURL, 32 | client: &http.Client{}, 33 | } 34 | 35 | return sink, nil 36 | } 37 | 38 | // WriteBatch sends a batch of data to the webhook endpoint 39 | func (s *WebhookSink) WriteBatch(messages []*utils.CDCMessage) error { 40 | for _, message := range messages { 41 | decodedMessage, err := buildDecodedMessage(message) 42 | if err != nil { 43 | return fmt.Errorf("failed to build decoded message: %v", err) 44 | } 45 | 46 | jsonData, err := json.Marshal(decodedMessage) 47 | if err != nil { 48 | return fmt.Errorf("failed to marshal data to JSON: %v", err) 49 | } 50 | 51 | if err = s.sendWithRetry(jsonData); err != nil { 52 | return err 53 | } 54 | } 55 | return nil 56 | } 57 | 58 | // sendWithRetry sends data to the webhook endpoint with retry logic 59 | func (s *WebhookSink) sendWithRetry(jsonData []byte) error { 60 | maxRetries := 3 61 | for attempt := 1; attempt <= maxRetries; attempt++ { 62 | req, err := http.NewRequest("POST", s.webhookURL, bytes.NewBuffer(jsonData)) 63 | if err != nil { 64 | return fmt.Errorf("failed to create request: %v", err) 65 | } 66 | 67 | req.Header.Set("Content-Type", "application/json") 68 | 69 | resp, err := s.client.Do(req) 70 | if err != nil { 71 | if attempt == maxRetries { 72 | return fmt.Errorf("failed to send webhook after %d attempts: %v", maxRetries, err) 73 | } 74 | log.Warn().Err(err).Int("attempt", attempt).Msg("Webhook request failed, retrying...") 75 | continue 76 | } 77 | defer resp.Body.Close() 78 | 79 | if resp.StatusCode >= 200 && resp.StatusCode < 300 { 80 | return nil 81 | } 82 | 83 | if attempt == maxRetries { 84 | return fmt.Errorf("webhook request failed with status code: %d after %d attempts", resp.StatusCode, maxRetries) 85 | } 86 | log.Warn().Int("statusCode", resp.StatusCode).Int("attempt", attempt).Msg("Received non-2xx status code, retrying...") 87 | } 88 | return nil 89 | } 90 | 91 | // Close performs any necessary cleanup (no-op for WebhookSink) 92 | func (s *WebhookSink) Close() error { 93 | return nil 94 | } 95 | -------------------------------------------------------------------------------- /pkg/utils/cdc_encoding.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "reflect" 7 | "strconv" 8 | "strings" 9 | "time" 10 | 11 | "github.com/jackc/pgx/v5/pgtype" 12 | ) 13 | 14 | // ConvertToPgCompatibleOutput converts a Go value to its PostgreSQL output format. 
15 | func ConvertToPgCompatibleOutput(value interface{}, oid uint32) ([]byte, error) { 16 | if value == nil { 17 | return nil, nil 18 | } 19 | 20 | switch oid { 21 | case pgtype.BoolOID: 22 | return strconv.AppendBool(nil, value.(bool)), nil 23 | case pgtype.Int2OID, pgtype.Int4OID, pgtype.Int8OID: 24 | switch v := value.(type) { 25 | case int: 26 | return []byte(strconv.FormatInt(int64(v), 10)), nil 27 | case int32: 28 | return []byte(strconv.FormatInt(int64(v), 10)), nil 29 | case int64: 30 | return []byte(strconv.FormatInt(v, 10)), nil 31 | default: 32 | return []byte(fmt.Sprintf("%d", value)), nil 33 | } 34 | case pgtype.Float4OID, pgtype.Float8OID: 35 | return []byte(strconv.FormatFloat(value.(float64), 'f', -1, 64)), nil 36 | case pgtype.NumericOID: 37 | return []byte(fmt.Sprintf("%v", value)), nil 38 | case pgtype.TextOID, pgtype.VarcharOID: 39 | return []byte(value.(string)), nil 40 | case pgtype.ByteaOID: 41 | if byteaData, ok := value.([]byte); ok { 42 | return byteaData, nil 43 | } 44 | return nil, fmt.Errorf("invalid bytea data type") 45 | case pgtype.TimestampOID, pgtype.TimestamptzOID: 46 | return []byte(value.(time.Time).Format(time.RFC3339Nano)), nil 47 | case pgtype.DateOID: 48 | return []byte(value.(time.Time).Format("2006-01-02")), nil 49 | case pgtype.JSONOID: 50 | switch v := value.(type) { 51 | case string: 52 | return []byte(v), nil 53 | case []byte: 54 | return v, nil 55 | default: 56 | return nil, fmt.Errorf("unsupported type for JSON data: %T", value) 57 | } 58 | case pgtype.JSONBOID: 59 | if jsonBytes, ok := value.([]byte); ok { 60 | return jsonBytes, nil 61 | } 62 | return json.Marshal(value) 63 | case pgtype.TextArrayOID, pgtype.VarcharArrayOID, 64 | pgtype.Int2ArrayOID, pgtype.Int4ArrayOID, pgtype.Int8ArrayOID, 65 | pgtype.Float4ArrayOID, pgtype.Float8ArrayOID, pgtype.BoolArrayOID: 66 | return EncodeArray(value) 67 | default: 68 | return []byte(fmt.Sprintf("%v", value)), nil 69 | } 70 | } 71 | 72 | // EncodeArray encodes a slice of values into a PostgreSQL array format. 73 | func EncodeArray(value interface{}) ([]byte, error) { 74 | var elements []string 75 | 76 | switch slice := value.(type) { 77 | case []interface{}: 78 | for _, v := range slice { 79 | elem, err := encodeArrayElement(v) 80 | if err != nil { 81 | return nil, err 82 | } 83 | elements = append(elements, elem) 84 | } 85 | case []string: 86 | elements = append(elements, slice...) 87 | case []int, []int32, []int64, []float32, []float64, []bool: 88 | sliceValue := reflect.ValueOf(slice) 89 | for i := 0; i < sliceValue.Len(); i++ { 90 | elem, err := encodeArrayElement(sliceValue.Index(i).Interface()) 91 | if err != nil { 92 | return nil, err 93 | } 94 | elements = append(elements, elem) 95 | } 96 | default: 97 | return nil, fmt.Errorf("unsupported slice type: %T", value) 98 | } 99 | 100 | return []byte("{" + strings.Join(elements, ",") + "}"), nil 101 | } 102 | 103 | // encodeArrayElement encodes a single array element into a string representation. 
104 | func encodeArrayElement(v interface{}) (string, error) { 105 | if v == nil { 106 | return "NULL", nil 107 | } 108 | 109 | switch val := v.(type) { 110 | case string: 111 | return val, nil 112 | case int, int32, int64, float32, float64: 113 | return fmt.Sprintf("%v", val), nil 114 | case bool: 115 | return strconv.FormatBool(val), nil 116 | case time.Time: 117 | return val.Format(time.RFC3339Nano), nil 118 | case []byte: 119 | return fmt.Sprintf("\\x%x", val), nil 120 | default: 121 | jsonBytes, err := json.Marshal(val) 122 | if err != nil { 123 | return "", fmt.Errorf("failed to marshal array element to JSON: %w", err) 124 | } 125 | return string(jsonBytes), nil 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /pkg/utils/cdc_message.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | "encoding/hex" 7 | "encoding/json" 8 | "fmt" 9 | "strconv" 10 | "strings" 11 | "time" 12 | 13 | "github.com/jackc/pglogrepl" 14 | "github.com/jackc/pgx/v5/pgtype" 15 | ) 16 | 17 | // init registers types with the gob package for encoding/decoding 18 | func init() { 19 | gob.Register(json.RawMessage{}) 20 | gob.Register(time.Time{}) 21 | gob.Register(map[string]interface{}{}) 22 | gob.Register(pglogrepl.RelationMessageColumn{}) 23 | gob.Register(pglogrepl.LSN(0)) 24 | 25 | gob.Register(CDCMessage{}) 26 | gob.Register(pglogrepl.TupleData{}) 27 | gob.Register(pglogrepl.TupleDataColumn{}) 28 | } 29 | 30 | // CDCMessage represents a full message for Change Data Capture 31 | type CDCMessage struct { 32 | Type OperationType 33 | Schema string 34 | Table string 35 | Columns []*pglogrepl.RelationMessageColumn 36 | NewTuple *pglogrepl.TupleData 37 | OldTuple *pglogrepl.TupleData 38 | ReplicationKey ReplicationKey 39 | LSN string 40 | EmittedAt time.Time 41 | ToastedColumns map[string]bool 42 | } 43 | 44 | // MarshalBinary implements the encoding.BinaryMarshaler interface 45 | func (m CDCMessage) MarshalBinary() ([]byte, error) { 46 | return EncodeCDCMessage(m) 47 | } 48 | 49 | // UnmarshalBinary implements the encoding.BinaryUnmarshaler interface 50 | func (m *CDCMessage) UnmarshalBinary(data []byte) error { 51 | decodedMessage, err := DecodeCDCMessage(data) 52 | if err != nil { 53 | return err 54 | } 55 | *m = *decodedMessage 56 | return nil 57 | } 58 | 59 | func (m *CDCMessage) GetColumnIndex(columnName string) int { 60 | for i, col := range m.Columns { 61 | if col.Name == columnName { 62 | return i 63 | } 64 | } 65 | return -1 66 | } 67 | 68 | // GetColumnValue gets a column value, optionally using old values for DELETE/UPDATE 69 | func (m *CDCMessage) GetColumnValue(columnName string, useOldValues bool) (interface{}, error) { 70 | colIndex := m.GetColumnIndex(columnName) 71 | if colIndex == -1 { 72 | return nil, fmt.Errorf("column %s not found", columnName) 73 | } 74 | 75 | var data []byte 76 | if useOldValues && m.OldTuple != nil { 77 | data = m.OldTuple.Columns[colIndex].Data 78 | } else if m.NewTuple != nil { 79 | data = m.NewTuple.Columns[colIndex].Data 80 | } else { 81 | return nil, fmt.Errorf("no data available for column %s", columnName) 82 | } 83 | 84 | return DecodeValue(data, m.Columns[colIndex].DataType) 85 | } 86 | 87 | // SetColumnValue sets the value of a column, respecting its type 88 | func (m *CDCMessage) SetColumnValue(columnName string, value interface{}) error { 89 | colIndex := m.GetColumnIndex(columnName) 90 | if colIndex == -1 { 91 | 
return fmt.Errorf("column %s not found", columnName) 92 | } 93 | 94 | column := m.Columns[colIndex] 95 | encodedValue, err := EncodeValue(value, column.DataType) 96 | if err != nil { 97 | return err 98 | } 99 | 100 | if m.Type == OperationDelete { 101 | m.OldTuple.Columns[colIndex] = &pglogrepl.TupleDataColumn{Data: encodedValue} 102 | } else { 103 | m.NewTuple.Columns[colIndex] = &pglogrepl.TupleDataColumn{Data: encodedValue} 104 | } 105 | 106 | return nil 107 | } 108 | 109 | // EncodeCDCMessage encodes a CDCMessage into a byte slice 110 | func EncodeCDCMessage(m CDCMessage) ([]byte, error) { 111 | var buf bytes.Buffer 112 | enc := gob.NewEncoder(&buf) 113 | 114 | if err := enc.Encode(m.Type); err != nil { 115 | return nil, err 116 | } 117 | if err := enc.Encode(m.Schema); err != nil { 118 | return nil, err 119 | } 120 | if err := enc.Encode(m.Table); err != nil { 121 | return nil, err 122 | } 123 | if err := enc.Encode(m.Columns); err != nil { 124 | return nil, err 125 | } 126 | 127 | if err := enc.Encode(m.NewTuple != nil); err != nil { 128 | return nil, err 129 | } 130 | if m.NewTuple != nil { 131 | if err := enc.Encode(m.NewTuple); err != nil { 132 | return nil, err 133 | } 134 | } 135 | 136 | if err := enc.Encode(m.OldTuple != nil); err != nil { 137 | return nil, err 138 | } 139 | 140 | if m.OldTuple != nil { 141 | if err := enc.Encode(m.OldTuple); err != nil { 142 | return nil, err 143 | } 144 | } 145 | 146 | if err := enc.Encode(m.ReplicationKey); err != nil { 147 | return nil, err 148 | } 149 | 150 | if err := enc.Encode(m.LSN); err != nil { 151 | return nil, err 152 | } 153 | 154 | if err := enc.Encode(m.EmittedAt); err != nil { 155 | return nil, err 156 | } 157 | 158 | if err := enc.Encode(m.ToastedColumns); err != nil { 159 | return nil, err 160 | } 161 | 162 | return buf.Bytes(), nil 163 | } 164 | 165 | // DecodeCDCMessage decodes a byte slice into a CDCMessage 166 | func DecodeCDCMessage(data []byte) (*CDCMessage, error) { 167 | buf := bytes.NewBuffer(data) 168 | dec := gob.NewDecoder(buf) 169 | m := &CDCMessage{} 170 | 171 | if err := dec.Decode(&m.Type); err != nil { 172 | return nil, err 173 | } 174 | if err := dec.Decode(&m.Schema); err != nil { 175 | return nil, err 176 | } 177 | if err := dec.Decode(&m.Table); err != nil { 178 | return nil, err 179 | } 180 | if err := dec.Decode(&m.Columns); err != nil { 181 | return nil, err 182 | } 183 | 184 | var newTupleExists bool 185 | if err := dec.Decode(&newTupleExists); err != nil { 186 | return nil, err 187 | } 188 | if newTupleExists { 189 | m.NewTuple = &pglogrepl.TupleData{} 190 | if err := dec.Decode(m.NewTuple); err != nil { 191 | return nil, err 192 | } 193 | } 194 | 195 | var oldTupleExists bool 196 | if err := dec.Decode(&oldTupleExists); err != nil { 197 | return nil, err 198 | } 199 | if oldTupleExists { 200 | m.OldTuple = &pglogrepl.TupleData{} 201 | if err := dec.Decode(m.OldTuple); err != nil { 202 | return nil, err 203 | } 204 | } 205 | 206 | if err := dec.Decode(&m.ReplicationKey); err != nil { 207 | return nil, err 208 | } 209 | 210 | if err := dec.Decode(&m.LSN); err != nil { 211 | return nil, err 212 | } 213 | 214 | if err := dec.Decode(&m.EmittedAt); err != nil { 215 | return nil, err 216 | } 217 | 218 | if err := dec.Decode(&m.ToastedColumns); err != nil { 219 | return nil, err 220 | } 221 | 222 | return m, nil 223 | } 224 | 225 | // DecodeValue decodes a byte slice into a Go value based on the PostgreSQL data type 226 | func DecodeValue(data []byte, dataType uint32) (interface{}, error) { 227 | if data == 
nil { 228 | return nil, nil 229 | } 230 | strData := string(data) 231 | switch dataType { 232 | case pgtype.BoolOID: 233 | return strconv.ParseBool(string(data)) 234 | case pgtype.Int2OID, pgtype.Int4OID, pgtype.Int8OID: 235 | return strconv.ParseInt(string(data), 10, 64) 236 | case pgtype.Float4OID, pgtype.Float8OID: 237 | if strings.EqualFold(strData, "NULL") { 238 | return nil, nil 239 | } 240 | return strconv.ParseFloat(strData, 64) 241 | case pgtype.NumericOID: 242 | return string(data), nil 243 | case pgtype.TextOID, pgtype.VarcharOID: 244 | return string(data), nil 245 | case pgtype.ByteaOID: 246 | if strings.HasPrefix(strData, "\\x") { 247 | hexString := strData[2:] 248 | byteData, err := hex.DecodeString(hexString) 249 | if err != nil { 250 | return nil, fmt.Errorf("failed to decode bytea hex string: %v", err) 251 | } 252 | return byteData, nil 253 | } 254 | return data, nil 255 | case pgtype.TimestampOID, pgtype.TimestamptzOID: 256 | return ParseTimestamp(string(data)) 257 | case pgtype.DateOID: 258 | return time.Parse("2006-01-02", string(data)) 259 | case pgtype.JSONOID: 260 | return string(data), nil 261 | case pgtype.JSONBOID: 262 | var result interface{} 263 | err := json.Unmarshal(data, &result) 264 | return result, err 265 | case pgtype.TextArrayOID, pgtype.VarcharArrayOID: 266 | return DecodeTextArray(data) 267 | case pgtype.Int2ArrayOID, pgtype.Int4ArrayOID, pgtype.Int8ArrayOID, pgtype.Float4ArrayOID, pgtype.Float8ArrayOID, pgtype.BoolArrayOID: 268 | return DecodeArray(data, dataType) 269 | default: 270 | return string(data), nil 271 | } 272 | } 273 | 274 | // DecodeTextArray decodes a PostgreSQL text array into a []string 275 | func DecodeTextArray(data []byte) ([]string, error) { 276 | if len(data) < 2 || data[0] != '{' || data[len(data)-1] != '}' { 277 | return nil, fmt.Errorf("invalid array format") 278 | } 279 | elements := strings.Split(string(data[1:len(data)-1]), ",") 280 | for i, elem := range elements { 281 | elements[i] = strings.Trim(elem, "\"") 282 | } 283 | return elements, nil 284 | } 285 | 286 | // DecodeArray decodes a PostgreSQL array into a slice of the appropriate type 287 | func DecodeArray(data []byte, dataType uint32) (interface{}, error) { 288 | if len(data) < 2 || data[0] != '{' || data[len(data)-1] != '}' { 289 | return nil, fmt.Errorf("invalid array format") 290 | } 291 | elements := strings.Split(string(data[1:len(data)-1]), ",") 292 | 293 | switch dataType { 294 | case pgtype.Int2ArrayOID, pgtype.Int4ArrayOID, pgtype.Int8ArrayOID: 295 | result := make([]interface{}, len(elements)) 296 | for i, elem := range elements { 297 | if elem == "NULL" { 298 | result[i] = nil 299 | continue 300 | } 301 | val, err := strconv.ParseInt(elem, 10, 64) 302 | if err != nil { 303 | return nil, err 304 | } 305 | result[i] = val 306 | } 307 | return result, nil 308 | case pgtype.Float4ArrayOID, pgtype.Float8ArrayOID: 309 | result := make([]interface{}, len(elements)) 310 | for i, elem := range elements { 311 | if elem == "NULL" { 312 | result[i] = nil 313 | continue 314 | } 315 | val, err := strconv.ParseFloat(elem, 64) 316 | if err != nil { 317 | return nil, err 318 | } 319 | result[i] = val 320 | } 321 | return result, nil 322 | case pgtype.BoolArrayOID: 323 | result := make([]interface{}, len(elements)) 324 | for i, elem := range elements { 325 | if elem == "NULL" { 326 | result[i] = nil 327 | continue 328 | } 329 | val, err := strconv.ParseBool(elem) 330 | if err != nil { 331 | return nil, err 332 | } 333 | result[i] = val 334 | } 335 | return result, nil 336 
| default: 337 | return elements, nil 338 | } 339 | } 340 | 341 | // EncodeValue encodes a Go value into a byte slice based on the PostgreSQL data type 342 | func EncodeValue(value interface{}, dataType uint32) ([]byte, error) { 343 | return ConvertToPgCompatibleOutput(value, dataType) 344 | } 345 | 346 | // IsColumnToasted checks if a column was TOASTed 347 | func (m *CDCMessage) IsColumnToasted(columnName string) bool { 348 | return m.ToastedColumns[columnName] 349 | } 350 | -------------------------------------------------------------------------------- /pkg/utils/retry.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "time" 6 | ) 7 | 8 | type RetryConfig struct { 9 | MaxAttempts int 10 | InitialWait time.Duration 11 | MaxWait time.Duration 12 | } 13 | 14 | func WithRetry(ctx context.Context, cfg RetryConfig, operation func() error) error { 15 | wait := cfg.InitialWait 16 | for attempt := 1; attempt <= cfg.MaxAttempts; attempt++ { 17 | err := operation() 18 | if err == nil { 19 | return nil 20 | } 21 | 22 | if attempt == cfg.MaxAttempts { 23 | return err 24 | } 25 | 26 | select { 27 | case <-ctx.Done(): 28 | return ctx.Err() 29 | case <-time.After(wait): 30 | // Exponential backoff with max wait 31 | wait *= 2 32 | if wait > cfg.MaxWait { 33 | wait = cfg.MaxWait 34 | } 35 | } 36 | } 37 | return nil 38 | } 39 | -------------------------------------------------------------------------------- /pkg/utils/shared.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/jackc/pgtype" 10 | ) 11 | 12 | // ParseTimestamp attempts to parse a timestamp string using multiple layouts 13 | func ParseTimestamp(value string) (time.Time, error) { 14 | layouts := []string{ 15 | time.RFC3339Nano, 16 | "2006-01-02 15:04:05.999999-07", 17 | "2006-01-02 15:04:05.999999Z07:00", 18 | "2006-01-02 15:04:05.999999", 19 | "2006-01-02T15:04:05.999999Z", 20 | "2006-01-02 15:04:05", 21 | "2006-01-02T15:04:05Z", 22 | } 23 | 24 | for _, layout := range layouts { 25 | if t, err := time.Parse(layout, value); err == nil { 26 | return t, nil 27 | } 28 | } 29 | 30 | return time.Time{}, fmt.Errorf("unable to parse timestamp: %s", value) 31 | } 32 | 33 | // OidToTypeName maps PostgreSQL OIDs to their corresponding type names 34 | var OidToTypeName = map[uint32]string{ 35 | pgtype.BoolOID: "bool", 36 | pgtype.ByteaOID: "bytea", 37 | pgtype.Int8OID: "int8", 38 | pgtype.Int2OID: "int2", 39 | pgtype.Int4OID: "int4", 40 | pgtype.TextOID: "text", 41 | pgtype.JSONOID: "json", 42 | pgtype.Float4OID: "float4", 43 | pgtype.Float8OID: "float8", 44 | pgtype.BoolArrayOID: "bool[]", 45 | pgtype.Int2ArrayOID: "int2[]", 46 | pgtype.Int4ArrayOID: "int4[]", 47 | pgtype.TextArrayOID: "text[]", 48 | pgtype.ByteaArrayOID: "bytea[]", 49 | pgtype.Int8ArrayOID: "int8[]", 50 | pgtype.Float4ArrayOID: "float4[]", 51 | pgtype.Float8ArrayOID: "float8[]", 52 | pgtype.BPCharOID: "bpchar", 53 | pgtype.VarcharOID: "varchar", 54 | pgtype.DateOID: "date", 55 | pgtype.TimeOID: "time", 56 | pgtype.TimestampOID: "timestamp", 57 | pgtype.TimestampArrayOID: "timestamp[]", 58 | pgtype.DateArrayOID: "date[]", 59 | pgtype.TimestamptzOID: "timestamptz", 60 | pgtype.TimestamptzArrayOID: "timestamptz[]", 61 | pgtype.IntervalOID: "interval", 62 | pgtype.NumericArrayOID: "numeric[]", 63 | pgtype.BitOID: "bit", 64 | pgtype.VarbitOID: "varbit", 65 | 
pgtype.NumericOID: "numeric", 66 | pgtype.UUIDOID: "uuid", 67 | pgtype.UUIDArrayOID: "uuid[]", 68 | pgtype.JSONBOID: "jsonb", 69 | pgtype.JSONBArrayOID: "jsonb[]", 70 | } 71 | 72 | // OIDToString converts a PostgreSQL OID to its string representation 73 | func OIDToString(oid uint32) string { 74 | if typeName, ok := OidToTypeName[oid]; ok { 75 | return typeName 76 | } 77 | return fmt.Sprintf("unknown_%d", oid) 78 | } 79 | 80 | // StringToOID converts a type name to its PostgreSQL OID 81 | func StringToOID(typeName string) uint32 { 82 | for oid, name := range OidToTypeName { 83 | if name == typeName { 84 | return oid 85 | } 86 | } 87 | return 0 88 | } 89 | 90 | // ToInt64 converts an interface{} to int64 91 | func ToInt64(v interface{}) (int64, bool) { 92 | switch v := v.(type) { 93 | case int, int8, int16, int32, int64: 94 | return reflect.ValueOf(v).Int(), true 95 | case uint, uint8, uint16, uint32, uint64: 96 | return int64(reflect.ValueOf(v).Uint()), true 97 | case string: 98 | if i, err := strconv.ParseInt(v, 10, 64); err == nil { 99 | return i, true 100 | } 101 | } 102 | return 0, false 103 | } 104 | 105 | // ToFloat64 converts an interface{} to float64 106 | func ToFloat64(v interface{}) (float64, bool) { 107 | switch v := v.(type) { 108 | case int, int8, int16, int32, int64: 109 | return float64(reflect.ValueOf(v).Int()), true 110 | case uint, uint8, uint16, uint32, uint64: 111 | return float64(reflect.ValueOf(v).Uint()), true 112 | case float32, float64: 113 | return reflect.ValueOf(v).Float(), true 114 | case string: 115 | if f, err := strconv.ParseFloat(v, 64); err == nil { 116 | return f, true 117 | } 118 | } 119 | return 0, false 120 | } 121 | 122 | // ToBool converts various types to bool 123 | func ToBool(v interface{}) (bool, bool) { 124 | switch v := v.(type) { 125 | case bool: 126 | return v, true 127 | case string: 128 | if v == "true" || v == "1" { 129 | return true, true 130 | } 131 | if v == "false" || v == "0" { 132 | return false, true 133 | } 134 | case int, int8, int16, int32, int64: 135 | return reflect.ValueOf(v).Int() != 0, true 136 | case uint, uint8, uint16, uint32, uint64: 137 | return reflect.ValueOf(v).Uint() != 0, true 138 | case float32, float64: 139 | return reflect.ValueOf(v).Float() != 0, true 140 | } 141 | return false, false 142 | } 143 | 144 | // IsValid checks if the replication key is properly configured 145 | func (rk *ReplicationKey) IsValid() bool { 146 | if rk.Type == ReplicationKeyFull { 147 | return true // FULL doesn't require specific columns 148 | } 149 | 150 | return len(rk.Columns) > 0 && 151 | (rk.Type == ReplicationKeyPK || rk.Type == ReplicationKeyUnique) 152 | } 153 | -------------------------------------------------------------------------------- /pkg/utils/shared_types.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // OperationType represents the type of database operation 4 | type OperationType string 5 | 6 | const ( 7 | OperationInsert OperationType = "INSERT" 8 | OperationUpdate OperationType = "UPDATE" 9 | OperationDelete OperationType = "DELETE" 10 | OperationDDL OperationType = "DDL" 11 | ) 12 | 13 | // ReplicationKeyType represents the type of replication key 14 | type ReplicationKeyType string 15 | 16 | const ( 17 | ReplicationKeyPK ReplicationKeyType = "PRIMARY KEY" 18 | ReplicationKeyUnique ReplicationKeyType = "UNIQUE" 19 | ReplicationKeyFull ReplicationKeyType = "FULL" // Replica identity full 20 | ) 21 | 22 | // ReplicationKey represents a key used for 
replication (either PK or unique constraint) 23 | type ReplicationKey struct { 24 | Type ReplicationKeyType 25 | Columns []string 26 | } 27 | 28 | type Logger interface { 29 | Debug() LogEvent 30 | Info() LogEvent 31 | Warn() LogEvent 32 | Error() LogEvent 33 | Err(err error) LogEvent 34 | } 35 | 36 | type LogEvent interface { 37 | Str(key, val string) LogEvent 38 | Int(key string, val int) LogEvent 39 | Int64(key string, val int64) LogEvent 40 | Uint8(key string, val uint8) LogEvent 41 | Uint32(key string, val uint32) LogEvent 42 | Interface(key string, val interface{}) LogEvent 43 | Err(err error) LogEvent 44 | Strs(key string, vals []string) LogEvent 45 | Any(key string, val interface{}) LogEvent 46 | Type(key string, val interface{}) LogEvent 47 | Msg(msg string) 48 | Msgf(format string, v ...interface{}) 49 | } 50 | -------------------------------------------------------------------------------- /pkg/utils/zerolog_logger.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "github.com/rs/zerolog" 5 | ) 6 | 7 | type ZerologLogger struct { 8 | logger zerolog.Logger 9 | } 10 | 11 | func NewZerologLogger(logger zerolog.Logger) Logger { 12 | return &ZerologLogger{logger: logger} 13 | } 14 | 15 | type ZerologLogEvent struct { 16 | event *zerolog.Event 17 | } 18 | 19 | func (z *ZerologLogger) Debug() LogEvent { 20 | return &ZerologLogEvent{event: z.logger.Debug()} 21 | } 22 | 23 | func (z *ZerologLogger) Info() LogEvent { 24 | return &ZerologLogEvent{event: z.logger.Info()} 25 | } 26 | 27 | func (z *ZerologLogger) Warn() LogEvent { 28 | return &ZerologLogEvent{event: z.logger.Warn()} 29 | } 30 | 31 | func (z *ZerologLogger) Error() LogEvent { 32 | return &ZerologLogEvent{event: z.logger.Error()} 33 | } 34 | 35 | func (z *ZerologLogger) Err(err error) LogEvent { 36 | return &ZerologLogEvent{event: z.logger.Err(err)} 37 | } 38 | 39 | func (e *ZerologLogEvent) Str(key, val string) LogEvent { 40 | e.event = e.event.Str(key, val) 41 | return e 42 | } 43 | 44 | func (e *ZerologLogEvent) Int(key string, val int) LogEvent { 45 | e.event = e.event.Int(key, val) 46 | return e 47 | } 48 | 49 | func (e *ZerologLogEvent) Int64(key string, val int64) LogEvent { 50 | e.event = e.event.Int64(key, val) 51 | return e 52 | } 53 | 54 | func (e *ZerologLogEvent) Uint32(key string, val uint32) LogEvent { 55 | e.event = e.event.Uint32(key, val) 56 | return e 57 | } 58 | 59 | func (e *ZerologLogEvent) Interface(key string, val interface{}) LogEvent { 60 | e.event = e.event.Interface(key, val) 61 | return e 62 | } 63 | 64 | func (e *ZerologLogEvent) Err(err error) LogEvent { 65 | e.event = e.event.Err(err) 66 | return e 67 | } 68 | 69 | func (e *ZerologLogEvent) Msg(msg string) { 70 | e.event.Msg(msg) 71 | } 72 | 73 | func (e *ZerologLogEvent) Msgf(format string, v ...interface{}) { 74 | e.event.Msgf(format, v...) 
75 | } 76 | 77 | func (e *ZerologLogEvent) Strs(key string, vals []string) LogEvent { 78 | e.event = e.event.Strs(key, vals) 79 | return e 80 | } 81 | 82 | func (e *ZerologLogEvent) Any(key string, val interface{}) LogEvent { 83 | e.event = e.event.Interface(key, val) 84 | return e 85 | } 86 | 87 | func (e *ZerologLogEvent) Uint8(key string, val uint8) LogEvent { 88 | e.event = e.event.Uint8(key, val) 89 | return e 90 | } 91 | 92 | func (e *ZerologLogEvent) Type(key string, val interface{}) LogEvent { 93 | e.event = e.event.Type(key, val) 94 | return e 95 | } 96 | -------------------------------------------------------------------------------- /pkg/worker/worker.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "sync" 9 | "time" 10 | 11 | "github.com/nats-io/nats.go" // Use the standard NATS package 12 | "github.com/pgflo/pg_flo/pkg/pgflonats" 13 | "github.com/pgflo/pg_flo/pkg/routing" 14 | "github.com/pgflo/pg_flo/pkg/rules" 15 | "github.com/pgflo/pg_flo/pkg/sinks" 16 | "github.com/pgflo/pg_flo/pkg/utils" 17 | "github.com/rs/zerolog" 18 | "github.com/rs/zerolog/log" 19 | ) 20 | 21 | // Worker represents a worker that processes messages from NATS. 22 | type Worker struct { 23 | natsClient *pgflonats.NATSClient 24 | ruleEngine *rules.RuleEngine 25 | router *routing.Router 26 | sink sinks.Sink 27 | group string 28 | logger utils.Logger 29 | batchSize int 30 | buffer []*utils.CDCMessage 31 | lastSavedState uint64 32 | flushInterval time.Duration 33 | shutdownCh chan struct{} 34 | wg sync.WaitGroup 35 | } 36 | 37 | // Option is a function type that modifies Worker configuration 38 | type Option func(*Worker) 39 | 40 | // WithBatchSize sets the batch size for the worker 41 | func WithBatchSize(size int) Option { 42 | return func(w *Worker) { 43 | w.batchSize = size 44 | } 45 | } 46 | 47 | func init() { 48 | log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stdout, TimeFormat: "15:04:05.000"}) 49 | zerolog.TimeFieldFormat = "2006-01-02T15:04:05.000Z07:00" 50 | } 51 | 52 | // NewWorker creates and returns a new Worker instance with the provided NATS client 53 | func NewWorker(natsClient *pgflonats.NATSClient, ruleEngine *rules.RuleEngine, router *routing.Router, sink sinks.Sink, group string, opts ...Option) *Worker { 54 | logger := utils.NewZerologLogger(log.With().Str("component", "worker").Logger()) 55 | 56 | w := &Worker{ 57 | natsClient: natsClient, 58 | ruleEngine: ruleEngine, 59 | router: router, 60 | sink: sink, 61 | group: group, 62 | logger: logger, 63 | batchSize: 1000, 64 | buffer: make([]*utils.CDCMessage, 0, 1000), 65 | lastSavedState: 0, 66 | flushInterval: 500 * time.Millisecond, 67 | shutdownCh: make(chan struct{}), 68 | } 69 | 70 | for _, opt := range opts { 71 | opt(w) 72 | } 73 | w.buffer = make([]*utils.CDCMessage, 0, w.batchSize) 74 | 75 | return w 76 | } 77 | 78 | // Start begins the worker's message processing loop, setting up the NATS consumer and processing messages. 79 | func (w *Worker) Start(ctx context.Context) error { 80 | stream := fmt.Sprintf("pgflo_%s_stream", w.group) 81 | subject := fmt.Sprintf("pgflo.%s", w.group) 82 | 83 | w.logger.Info(). 84 | Str("stream", stream). 85 | Str("subject", subject). 86 | Str("group", w.group). 
87 | Msg("Starting worker") 88 | 89 | js := w.natsClient.JetStream() 90 | 91 | consumerName := fmt.Sprintf("pgflo_%s_consumer", w.group) 92 | 93 | consumerConfig := &nats.ConsumerConfig{ 94 | Durable: consumerName, 95 | FilterSubject: subject, 96 | AckPolicy: nats.AckExplicitPolicy, 97 | MaxDeliver: 1, 98 | AckWait: 25 * time.Minute, 99 | DeliverPolicy: nats.DeliverAllPolicy, 100 | } 101 | 102 | _, err := js.AddConsumer(stream, consumerConfig) 103 | if err != nil && !errors.Is(err, nats.ErrConsumerNameAlreadyInUse) { 104 | w.logger.Error().Err(err).Msg("Failed to add or update consumer") 105 | return fmt.Errorf("failed to add or update consumer: %w", err) 106 | } 107 | 108 | sub, err := js.PullSubscribe(subject, consumerName) 109 | if err != nil { 110 | w.logger.Error().Err(err).Msg("Failed to subscribe to subject") 111 | return fmt.Errorf("failed to subscribe to subject: %w", err) 112 | } 113 | 114 | w.wg.Add(1) 115 | go func() { 116 | defer w.wg.Done() 117 | if err := w.processMessages(ctx, sub); err != nil && err != context.Canceled { 118 | w.logger.Error().Err(err).Msg("Error processing messages") 119 | } 120 | }() 121 | 122 | <-ctx.Done() 123 | w.logger.Info().Msg("Received shutdown signal. Initiating graceful shutdown...") 124 | 125 | w.wg.Wait() 126 | w.logger.Debug().Msg("All goroutines finished") 127 | 128 | return w.flushBuffer() 129 | } 130 | 131 | // processMessages continuously processes messages from the NATS consumer. 132 | func (w *Worker) processMessages(ctx context.Context, sub *nats.Subscription) error { 133 | flushTicker := time.NewTicker(w.flushInterval) 134 | defer flushTicker.Stop() 135 | 136 | for { 137 | select { 138 | case <-ctx.Done(): 139 | w.logger.Info().Msg("Flushing remaining messages") 140 | return w.flushBuffer() 141 | case <-flushTicker.C: 142 | if err := w.flushBuffer(); err != nil { 143 | w.logger.Error().Err(err).Msg("Failed to flush buffer on interval") 144 | } 145 | default: 146 | msgs, err := sub.Fetch(10, nats.MaxWait(500*time.Millisecond)) 147 | if err != nil && !errors.Is(err, nats.ErrTimeout) { 148 | w.logger.Error().Err(err).Msg("Error fetching messages") 149 | continue 150 | } 151 | 152 | for _, msg := range msgs { 153 | if err := w.processMessage(msg); err != nil { 154 | w.logger.Error().Err(err).Msg("Failed to process message") 155 | } 156 | if err := msg.Ack(); err != nil { 157 | w.logger.Error().Err(err).Msg("Failed to acknowledge message") 158 | } 159 | } 160 | if len(w.buffer) >= w.batchSize { 161 | if err := w.flushBuffer(); err != nil { 162 | w.logger.Error().Err(err).Msg("Failed to flush buffer") 163 | } 164 | } 165 | } 166 | } 167 | } 168 | 169 | // processMessage handles a single message, applying rules, writing to the sink, and updating the last processed sequence. 
170 | func (w *Worker) processMessage(msg *nats.Msg) error { 171 | metadata, err := msg.Metadata() 172 | if err != nil { 173 | w.logger.Error().Err(err).Msg("Failed to get message metadata") 174 | return err 175 | } 176 | 177 | var cdcMessage utils.CDCMessage 178 | err = cdcMessage.UnmarshalBinary(msg.Data) 179 | if err != nil { 180 | w.logger.Error().Err(err).Msg("Failed to unmarshal message") 181 | return err 182 | } 183 | 184 | if w.ruleEngine != nil { 185 | processedMessage, err := w.ruleEngine.ApplyRules(&cdcMessage) 186 | if err != nil { 187 | w.logger.Error().Err(err).Msg("Failed to apply rules") 188 | return err 189 | } 190 | if processedMessage == nil { 191 | w.logger.Debug().Msg("Message filtered out by rules") 192 | return nil 193 | } 194 | cdcMessage = *processedMessage 195 | } 196 | 197 | if w.router != nil { 198 | routedMessage, err := w.router.ApplyRouting(&cdcMessage) 199 | if err != nil { 200 | w.logger.Error().Err(err).Msg("Failed to apply routing") 201 | return err 202 | } 203 | if routedMessage == nil { 204 | w.logger.Debug().Msg("Message filtered out by routing") 205 | return nil 206 | } 207 | cdcMessage = *routedMessage 208 | } 209 | 210 | w.buffer = append(w.buffer, &cdcMessage) 211 | w.lastSavedState = metadata.Sequence.Stream 212 | 213 | return nil 214 | } 215 | 216 | // flushBuffer writes the buffered messages to the sink and updates the last processed sequence. 217 | func (w *Worker) flushBuffer() error { 218 | if len(w.buffer) == 0 { 219 | return nil 220 | } 221 | 222 | w.logger.Debug(). 223 | Int("messages", len(w.buffer)). 224 | Int("batch_size", w.batchSize). 225 | Msg("Flushing buffer") 226 | 227 | err := w.sink.WriteBatch(w.buffer) 228 | if err != nil { 229 | w.logger.Error().Err(err).Msg("Failed to write batch to sink") 230 | return err 231 | } 232 | 233 | state, err := w.natsClient.GetState() 234 | if err != nil { 235 | w.logger.Error().Err(err).Msg("Failed to get current state") 236 | return err 237 | } 238 | 239 | state.LastProcessedSeq[w.group] = w.lastSavedState 240 | if err := w.natsClient.SaveState(state); err != nil { 241 | w.logger.Error().Err(err).Msg("Failed to save state") 242 | return err 243 | } 244 | 245 | w.buffer = w.buffer[:0] 246 | return nil 247 | } 248 | --------------------------------------------------------------------------------
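The worker above pulls batches from the NATS consumer, applies rules and routing, buffers messages, and flushes them to the configured sink. As a minimal sketch of how these pieces fit together outside the CLI, the hypothetical `runWorker` helper below wires a worker to the stdout sink; it assumes a `*pgflonats.NATSClient` has already been constructed elsewhere (its setup is not shown in this listing) and passes `nil` for the optional rule engine and router.

```go
package example

import (
	"context"
	"os/signal"
	"syscall"

	"github.com/pgflo/pg_flo/pkg/pgflonats"
	"github.com/pgflo/pg_flo/pkg/sinks"
	"github.com/pgflo/pg_flo/pkg/worker"
)

// runWorker is a hypothetical helper: it builds a stdout sink, creates a worker
// for the given group with a smaller batch size, and runs it until interrupted.
func runWorker(natsClient *pgflonats.NATSClient, group string) error {
	sink, err := sinks.NewStdoutSink()
	if err != nil {
		return err
	}

	// nil rule engine and nil router are allowed; processMessage skips both.
	w := worker.NewWorker(natsClient, nil, nil, sink, group, worker.WithBatchSize(500))

	// Start blocks until the context is cancelled, then flushes the buffer.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	return w.Start(ctx)
}
```

With this wiring, batching behaves exactly as in `pkg/worker/worker.go`: messages are buffered up to the batch size or the 500 ms flush interval, whichever comes first, and the last processed sequence is saved back to NATS after each successful flush.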