├── .dockerignore ├── .gitattributes ├── .github └── workflows │ └── build.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Cargo.toml ├── Dockerfile ├── LICENSE.txt ├── Makefile ├── README.adoc ├── bin ├── clean-example-data.sh ├── consume-example-json.sh ├── extract-example-json.sh └── localstack-setup_emails.sh ├── build.rs ├── contrib ├── Dockerfile.setup └── docker-compose-with-statsd.yml ├── doc ├── DESIGN.md ├── HACKING.adoc └── img │ ├── kafka-delta-ingest-interaction.png │ ├── kafka-delta-ingest-jobs-and-processes.png │ ├── kafka-delta-ingest-transaction-state.png │ └── kafka-delta-ingest-workflow.png ├── docker-compose.yml ├── src ├── coercions.rs ├── cursor.rs ├── dead_letters.rs ├── delta_helpers.rs ├── lib.rs ├── main.rs ├── metrics.rs ├── offsets.rs ├── serialization.rs ├── transforms.rs ├── value_buffers.rs └── writer.rs └── tests ├── buffer_flush_tests.rs ├── data ├── .gitignore ├── default_schema.avro ├── emails │ ├── .gitignore │ └── _delta_log │ │ └── 00000000000000000000.json ├── example │ ├── .gitignore │ └── _delta_log │ │ └── 00000000000000000000.json ├── web_requests │ ├── .gitignore │ └── _delta_log │ │ └── 00000000000000000000.json └── zero_offset │ ├── .gitignore │ └── _delta_log │ ├── 00000000000000000000.json │ └── 00000000000000000001.json ├── dead_letter_tests.rs ├── delta_partitions_tests.rs ├── deserialization_tests.rs ├── emails_azure_blob_tests.rs ├── emails_s3_tests.rs ├── helpers └── mod.rs ├── json ├── web_requests-100.json └── web_requests-100K.json.tar.gz ├── offset_tests.rs └── schema_update_tests.rs /.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | contrib/ 3 | target/ 4 | venv/ 5 | .github/ 6 | Dockerfile 7 | docker-compose.yml 8 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/json/web_requests-100K.json.tar.gz filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | format: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Install minimum stable with clippy and rustfmt 15 | uses: actions-rs/toolchain@v1 16 | with: 17 | profile: default 18 | toolchain: stable 19 | override: true 20 | - name: Format 21 | run: cargo fmt -- --check 22 | 23 | build: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@v2 27 | - name: Install minimum stable with clippy and rustfmt 28 | uses: actions-rs/toolchain@v1 29 | with: 30 | profile: default 31 | toolchain: stable 32 | override: true 33 | - name: Build and lint for S3 34 | run: cargo clippy --features s3 35 | - name: Build and lint for Azure 36 | run: cargo clippy --features azure 37 | 38 | test: 39 | runs-on: ubuntu-latest 40 | env: 41 | # Disable full debug symbol generation to speed up CI build and keep memory down 42 | RUSTFLAGS: -C debuginfo=none 43 | # Disable incremental builds by cargo for CI which should save disk space 44 | # and hopefully avoid final link "No space left on device" 45 | CARGO_INCREMENTAL: 0 46 | steps: 47 | - uses: actions/checkout@v2 48 | - name: Install minimum stable with clippy and rustfmt 49 | uses: actions-rs/toolchain@v1 50 | with: 51 | 
profile: default 52 | toolchain: stable 53 | override: true 54 | - name: Teststack setup 55 | run: docker compose up setup 56 | - name: Run s3 feature tests 57 | run: cargo test --features s3 58 | - name: Run azure feature tests 59 | run: cargo test --features azure 60 | 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /target 3 | 4 | tests/json/*.json 5 | 6 | .idea 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at opensource@scribd.com. 
All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "kafka-delta-ingest" 3 | version = "0.4.0" 4 | authors = ["R. Tyler Croy ", "Christian Williams "] 5 | edition = "2018" 6 | rust-version = "1.81" 7 | 8 | [dependencies] 9 | flate2 = "1.0" 10 | anyhow = "1" 11 | async-trait = "0.1" 12 | apache-avro = "^0.17" 13 | base64 = "0.13" 14 | bytes = "1" 15 | chrono = "0.4.31" 16 | clap = { version = "4", features = ["color", "env"] } 17 | dipstick = "0.9" 18 | env_logger = "0" 19 | futures = "0.3" 20 | jmespatch = { version = "0.3", features = ["sync"] } 21 | lazy_static = "1" 22 | log = "0" 23 | maplit = "1" 24 | rdkafka = { version = "0.37", features = ["ssl"] } 25 | schema_registry_converter = { version = "3.1.0", features = ["easy", "json", "avro"] } 26 | serde = { version = "1", features = ["derive"] } 27 | serde_json = "1" 28 | strum = "0.27.1" 29 | strum_macros = "0.27.1" 30 | thiserror = "1" 31 | tokio = { version = "1", features = ["full"] } 32 | tokio-stream = { version = "0", features = ["fs"] } 33 | tokio-util = "0.6.3" 34 | uuid = { version = "0.8", features = ["serde", "v4"] } 35 | url = "2.3" 36 | dashmap = "6.0.1" 37 | 38 | # datafusion feature is required for writer version 2 39 | deltalake-core = { version = "0.26.0", features = ["json", "datafusion"]} 40 | deltalake-aws = { version = "0.9.0", optional = true } 41 | deltalake-azure = { version = "0.9.0", optional = true } 42 | 43 | # s3 feature enabled, helps for locking interactions with DLQ 44 | dynamodb_lock = { version = "0.6.0", optional = true } 45 | # sentry 46 | sentry = { version = "0.23.0", optional = true } 47 | 48 | [features] 49 | default = [] 50 | sentry-ext = ["sentry"] 51 | dynamic-linking = [ "rdkafka/dynamic-linking" ] 52 | azure = [ 53 | "deltalake-azure", 54 | ] 55 | s3 = [ 56 | "deltalake-aws", 57 | "dynamodb_lock", 58 | ] 59 | 60 | [dev-dependencies] 61 | azure_core = { version = "0.18.0" } 62 | azure_storage = { version = "0.18.0" } 63 | azure_storage_blobs = { version = "0.18.0" } 64 | rusoto_core = { version = "0.47", default-features = false, features = ["rustls"]} 65 | rusoto_credential = { version = "0.47"} 66 | rusoto_s3 = { version = "0.47", default-features = false, features = ["rustls"]} 67 | serial_test = "*" 68 | tempfile = "3" 69 | time = "0.3.20" 70 | utime = "0.3" 71 | 72 | [profile.release] 73 | lto = true 74 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM rust:1.86 AS builder 2 | 3 | RUN mkdir /build 4 | WORKDIR /build 5 | 6 | COPY --link ./Cargo.toml . 7 | COPY --link ./build.rs . 8 | COPY --link ./src ./src 9 | COPY --link ./tests . 10 | 11 | RUN --mount=type=cache,target=/usr/local/cargo/registry,id=kdi-cargo-registry \ 12 | --mount=type=cache,target=/usr/local/cargo/git,id=kdi-cargo-git \ 13 | --mount=type=cache,target=target,id=kdi-target \ 14 | cargo build --release --features s3 && cp target/release/kafka-delta-ingest . 15 | 16 | FROM debian:12 17 | 18 | RUN apt-get update && apt-get -y install \ 19 | ca-certificates \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | WORKDIR /build 23 | 24 | COPY --from=builder /build/kafka-delta-ingest ./ 25 | ENTRYPOINT ["/build/kafka-delta-ingest"] 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (2021) Scribd Inc. All rights reserved. 2 | 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 
50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. 
You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | CARGO:=$(shell which cargo) 4 | APP=target/debug/kafka-delta-ingest 5 | SOURCES=$(shell find . -type f -iname '*.rs') 6 | 7 | $(APP): Cargo.toml $(SOURCES) 8 | $(CARGO) build 9 | 10 | .PHONY: check 11 | check: Cargo.toml $(SOURCES) 12 | $(CARGO) fmt 13 | $(CARGO) test --features azure,s3 14 | 15 | .PHONY: docker 16 | docker: Dockerfile ## Build the docker image 17 | docker build -t kafka-delta-ingest . 18 | 19 | 20 | .PHONY: help 21 | help: 22 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 23 | -------------------------------------------------------------------------------- /README.adoc: -------------------------------------------------------------------------------- 1 | = kafka-delta-ingest 2 | 3 | The kafka-delta-ingest project aims to build a highly efficient daemon for 4 | streaming data through link:https://kafka.apache.org[Apache Kafka] into 5 | link:https://delta.io[Delta Lake]. 6 | 7 | This project is currently in production in a number of organizations and is 8 | still actively evolving in tandem with the 9 | link:https://github.com/delta-io/delta-rs[delta-rs] bindings. 10 | 11 | To contribute please look at the link:https://github.com/delta-io/kafka-delta-ingest/blob/main/doc/HACKING.adoc[hacking document]. 12 | 13 | == Features 14 | 15 | * Multiple worker processes per stream 16 | * Basic transformations within message 17 | * Statsd metric output 18 | 19 | See the link:https://github.com/delta-io/kafka-delta-ingest/blob/main/doc/DESIGN.md[design doc] for more details. 
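The "basic transformations" feature works through `--transform 'PROPERTY: EXPRESSION'` arguments: the JMESPath `EXPRESSION` is evaluated against each incoming message (the example arguments below also reference `kafka.offset`, `kafka.partition` and `kafka.topic` for the message's Kafka coordinates) and the result is assigned to `PROPERTY` in the outgoing record. As a rough sketch only, the four transforms used in the example below would reshape the pretty-printed message from `doc/HACKING.adoc` into something like the following (the `meta.kafka.*` values are placeholders):

```json
{
  "status": 204,
  "method": "DELETE",
  "session_id": "7c28bcf9-be26-4d0b-931a-3374ab4bb458",
  "uuid": "831c6afa-375c-4988-b248-096f9ed101f8",
  "url": "http://www.youku.com",
  "meta": {
    "producer": { "timestamp": "2021-03-24T15:06:17.321710+00:00" },
    "kafka": { "offset": 42, "partition": 0, "topic": "web_requests" }
  },
  "date": "2021-03-24"
}
```

The derived `date` field is what the example Delta table is partitioned by, which is why data files land under `tests/data/web_requests/date=2021-03-24` after a local run.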
20 | 21 | === Example 22 | 23 | The repository includes an example for trying out the application locally with some fake web request data. 24 | 25 | The included docker-compose.yml contains link:https://github.com/wurstmeister/kafka-docker/issues[kafka] and link:https://github.com/localstack/localstack[localstack] services you can run `kafka-delta-ingest` against locally. 26 | 27 | ==== Starting Worker Processes 28 | 29 | 1. Launch test services - `docker-compose up setup` 30 | 1. Download and extract kafka: `curl -L https://dlcdn.apache.org/kafka/3.9.0/kafka_2.13-3.9.0.tgz | tar -xz` 31 | 1. Create kafka topic: `./kafka_2.13-3.9.0/bin/kafka-topics.sh --bootstrap-server localhost:9092 --topic web_requests --create --if-not-exists` 32 | 1. Ingest test messages into kafka: `./kafka_2.13-3.9.0/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic web_requests < tests/json/web_requests-100.json` 33 | 1. Compile: `cargo build --features s3`, MacOS: `brew install librdkafka && cargo build --features s3,dynamic-linking` 34 | 1. Run kafka-delta-ingest against the web_requests example topic and table (customize arguments as desired): 35 | 36 | ```bash 37 | export AWS_ENDPOINT_URL=http://0.0.0.0:4566 38 | export AWS_ACCESS_KEY_ID=test 39 | export AWS_SECRET_ACCESS_KEY=test 40 | 41 | RUST_LOG=debug cargo run --features s3,dynamic-linking ingest web_requests ./tests/data/web_requests \ 42 | --allowed_latency 60 \ 43 | --app_id web_requests \ 44 | --transform 'date: substr(meta.producer.timestamp, `0`, `10`)' \ 45 | --transform 'meta.kafka.offset: kafka.offset' \ 46 | --transform 'meta.kafka.partition: kafka.partition' \ 47 | --transform 'meta.kafka.topic: kafka.topic' \ 48 | --auto_offset_reset earliest 49 | ``` 50 | 51 | Notes: 52 | 53 | * The AWS_* environment variables are for S3 and are required by the delta-rs library. 54 | ** Above, AWS_ENDPOINT_URL points to localstack. 55 | * The Kafka broker is assumed to be at localhost:9092, use -k to override. 56 | * To clean data from previous local runs, execute `./bin/clean-example-data.sh`. You'll need to do this if you destroy your Kafka container between runs since your delta log directory will be out of sync with Kafka offsets. 57 | 58 | ==== Kafka SSL 59 | 60 | In case you have Kafka topics secured by SSL client certificates, you can specify these secrets as environment variables. 61 | 62 | For the cert chain include the PEM content as an environment variable named `KAFKA_DELTA_INGEST_CERT`. 63 | For the cert private key include the PEM content as an environment variable named `KAFKA_DELTA_INGEST_KEY`. 64 | 65 | These will be set as the `ssl.certificate.pem` and `ssl.key.pem` Kafka settings respectively. 66 | 67 | Make sure to provide the additional option: 68 | 69 | ``` 70 | -K security.protocol=SSL 71 | ``` 72 | 73 | when invoking the cli command as well. 74 | 75 | 76 | === Using Azure Event Hubs 77 | 78 | Azure Event Hubs (with pricing tier "Standard" or higher) has a Kafka Surface that can be used with kafka-delta-ingest. 79 | 80 | Azure Event Hubs doesn't have a local emulator, so an actual Azure Event Hubs resource is required. As a result, there's no need for the docker-compose application described above. 
81 | 82 | More info: 83 | 84 | * https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-migration-guide 85 | * https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-troubleshooting-guide 86 | * https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-configurations#librdkafka-configuration-properties 87 | * https://github.com/Azure/azure-event-hubs-for-kafka/blob/master/CONFIGURATION.md#librdkafka-configuration-properties 88 | * https://github.com/edenhill/librdkafka/wiki/Using-SASL-with-librdkafka 89 | * https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md 90 | * https://github.com/edenhill/librdkafka/issues/3109 91 | 92 | 93 | ==== Starting Worker Processes 94 | 95 | 1. link:https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-create[Create] an Azure Event Hubs Namespace and within it, an Event Hub (which corresponds to a Kafka topic). 96 | 97 | 2. Set these environment variables, they are required by the delta-rs library: 98 | * `AZURE_STORAGE_ACCOUNT_NAME` (just the storage account name, not the FQDN) 99 | * `AZURE_STORAGE_ACCOUNT_KEY` (just the key, not the connection string) 100 | 101 | 3. Create the `_delta_log` directory in the `web_requests` directory in Azure Storage and upload the link:https://github.com/delta-io/kafka-delta-ingest/blob/main/tests/data/web_requests/_delta_log/00000000000000000000.json[first Delta transaction containing the schema] to this directory. 102 | 103 | 4. In the docker command below, replace the following placeholders with your values: 104 | * `AZURE_STORAGE_ACCOUNT_NAME` (just the storage account name, not the FQDN) 105 | * `AZURE_STORAGE_ACCOUNT_KEY` (just the key, not the connection string) 106 | * `EVENTHUBS_NAMESPACE_NAME` (just the namespace name, not the FQDN) 107 | * `EVENTHUBS_KEY_NAME` 108 | * `EVENTHUBS_KEY` 109 | 110 | 5. Build the docker image 111 | 112 | ``` 113 | docker build -t kdi:0.1 . -f Dockerfile.Debian 114 | ``` 115 | 116 | Notes: 117 | 118 | * If this takes a long time, make sure that docker has enough memory 119 | 120 | 6. 
Execute this docker command to run kafka-delta-ingest 121 | 122 | ``` 123 | docker run -it --network=host ^ 124 | -e RUST_LOG="debug" ^ 125 | -e SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt ^ 126 | -e AZURE_STORAGE_ACCOUNT_NAME={AZURE_STORAGE_ACCOUNT_NAME} ^ 127 | -e "AZURE_STORAGE_ACCOUNT_KEY={AZURE_STORAGE_ACCOUNT_KEY}" ^ 128 | kdi:0.1 ^ 129 | ingest web_requests adls2://{AZURE_STORAGE_ACCOUNT_NAME}/{FILESYSTEM_NAME}/web_requests ^ 130 | --allowed_latency 5 ^ 131 | --kafka thovoll-kdi-eh.servicebus.windows.net:9093 ^ 132 | --Kafka security.protocol=SASL_SSL ^ 133 | --Kafka sasl.mechanism=PLAIN ^ 134 | --Kafka sasl.username=$ConnectionString ^ 135 | --Kafka sasl.password=Endpoint=sb://{EVENTHUBS_NAMESPACE_NAME}.servicebus.windows.net/;SharedAccessKeyName={EVENTHUBS_KEY_NAME};SharedAccessKey={EVENTHUBS_KEY} ^ 136 | --Kafka socket.keepalive.enable=true ^ 137 | --Kafka metadata.max.age.ms=180000 ^ 138 | --Kafka heartbeat.interval.ms=3000 ^ 139 | --Kafka session.timeout.ms=30000 ^ 140 | --Kafka debug=broker,security,protocol ^ 141 | --app_id web_requests ^ 142 | --transform "date: substr(meta.producer.timestamp, `0`, `10`)" ^ 143 | --transform "meta.kafka.offset: kafka.offset" ^ 144 | --transform "meta.kafka.partition: kafka.partition" ^ 145 | --transform "meta.kafka.topic: kafka.topic" ^ 146 | --auto_offset_reset earliest 147 | ``` 148 | 149 | Notes: 150 | 151 | * In the docker command: 152 | ** The `sasl.username` is the literal string `$ConnectionString` and not a placeholder. 153 | ** The following `--Kafka` arguments are taken from link:https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-configurations#librdkafka-configuration-properties[here]: 154 | *** `socket.keepalive.enable=true` 155 | *** `metadata.max.age.ms=180000` 156 | *** `heartbeat.interval.ms=3000` 157 | *** `session.timeout.ms=30000` 158 | 159 | ==== Sending data to Event Hubs 160 | 161 | On Windows, link:https://github.com/paolosalvatori/ServiceBusExplorer[Service Bus Explorer] can be used to send data to Event Hubs. 162 | 163 | The following payload should be sent for the web_requests Delta table: 164 | 165 | ```json 166 | { 167 | "status": 200, 168 | "session_id": "7c28bcf9-be26-4d0b-931a-3374ab4bb458", 169 | "method": "GET", 170 | "meta": { 171 | "producer": { 172 | "timestamp": "2021-03-24T15:06:17.321710+00:00" 173 | } 174 | }, 175 | "uuid": "831c6afa-375c-4988-b248-096f9ed101f8", 176 | "url": "http://www.example.com" 177 | } 178 | ``` 179 | 180 | ==== Verifying data from Event Hub using kcat 181 | 182 | kcat can be run on Windows via docker using this command, which will print the last message (-o -1). 
183 | 184 | Make sure to first replace the following placeholders: 185 | 186 | * `EVENTHUBS_NAMESPACE_NAME` (just the namespace name, not the FQDN) 187 | * `EVENTHUBS_KEY_NAME` 188 | * `EVENTHUBS_KEY` 189 | 190 | ``` 191 | docker run -it --network=host edenhill/kcat:1.7.1 -C -o -1 -b {EVENTHUBS_NAMESPACE_NAME}.servicebus.windows.net:9093 -t web_requests -X security.protocol=SASL_SSL -X sasl.mechanism=PLAIN -X sasl.username=$ConnectionString -X sasl.password=Endpoint=sb://{EVENTHUBS_NAMESPACE_NAME}.servicebus.windows.net/;SharedAccessKeyName={EVENTHUBS_KEY_NAME};SharedAccessKey={EVENTHUBS_KEY} -X socket.keepalive.enable=true -X metadata.max.age.ms=180000 -X heartbeat.interval.ms=3000 -X session.timeout.ms=30000 192 | ``` 193 | 194 | Notes: 195 | 196 | * The following configuration settings in the command above are taken from link:https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-configurations#librdkafka-configuration-properties[here]: 197 | `-X socket.keepalive.enable=true -X metadata.max.age.ms=180000 -X heartbeat.interval.ms=3000 -X session.timeout.ms=30000` 198 | 199 | == Kafka SSL 200 | 201 | In case you have Kafka topics secured by SSL client certificates, you can specify these secrets as environment variables. 202 | 203 | For the cert chain include the PEM content as an environment variable named `KAFKA_DELTA_INGEST_CERT`. 204 | For the cert private key include the PEM content as an environment variable named `KAFKA_DELTA_INGEST_KEY`. 205 | 206 | These will be set as the `ssl.certificate.pem` and `ssl.key.pem` Kafka settings respectively. 207 | 208 | Make sure to provide the additional option: 209 | 210 | ``` 211 | -K security.protocol=SSL 212 | ``` 213 | 214 | when invoking the cli command as well. 215 | 216 | == Gzip Compressed Messages 217 | 218 | kafka-delta-ingest now supports ingestion of gzip-compressed messages. This can be particularly useful when dealing with large volumes of data that benefit from compression. 219 | 220 | To enable gzip decompression, use the `--decompress_gzip` flag when starting the ingestion process. 221 | 222 | == Writing to S3 223 | 224 | When writing to S3, you may experience an error like `source: StorageError { source: S3Generic("dynamodb locking is not enabled") }`. 225 | 226 | A locking mechanism is need to prevent unsafe concurrent writes to a delta lake directory, and DynamoDB is an option for this. To use DynamoDB, set the `AWS_S3_LOCKING_PROVIDER` variable to `dynamodb` and create a table named `delta_rs_lock_table` in Dynamo. An example DynamoDB table creation snippet using the aws CLI follows, and should be customized for your environment's needs (e.g. read/write capacity modes): 227 | 228 | 229 | ```bash 230 | aws dynamodb create-table --table-name delta_rs_lock_table \ 231 | --attribute-definitions \ 232 | AttributeName=key,AttributeType=S \ 233 | --key-schema \ 234 | AttributeName=key,KeyType=HASH \ 235 | --provisioned-throughput \ 236 | ReadCapacityUnits=10,WriteCapacityUnits=10 237 | ``` 238 | 239 | == Schema Support 240 | This application has support for both avro and json format via command line arguments. If no format argument is provided, the default behavior is to use json. 241 | The table below indicates what will happen with respect to the provided arguments. 
242 | 243 | |=== 244 | | Argument | Value | Result | 245 | | ----------- | ----------- | ----------- | 246 | | | | default json behavior | 247 | | --json | | default json behavior | 248 | | --json | | will connect schema registry to deserialize json | 249 | | --avro | "" | expects all messages in avro format | 250 | | --avro | | will use the provided avro schema for deserialization | 251 | | --avro | | will connect schema registry to deserialize avro | 252 | |=== 253 | 254 | 255 | For more information, see link:https://github.com/delta-io/delta-rs/tree/dbc2994c5fddfd39fc31a8f9202df74788f59a01/dynamodb_lock[DynamoDB lock]. 256 | == Verifying data in Azure Storage 257 | 258 | Use the Azure Portal to browse the file system: 259 | 260 | * Data files: `web_requests/date=2021-03-24` 261 | * Delta log files: `web_requests/_delta_log` 262 | 263 | 264 | == Get Involved 265 | 266 | Join link:https://delta-users.slack.com/archives/C01Q2RXCVSQ[#kafka-delta-ingest in the Delta Lake Slack workspace] 267 | -------------------------------------------------------------------------------- /bin/clean-example-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WEB_REQUESTS_DIR=tests/data/web_requests 4 | 5 | find $WEB_REQUESTS_DIR/_delta_log -type f -not -name '00000000000000000000.json' -exec rm {} + 6 | find $WEB_REQUESTS_DIR -type d -name 'date=*' -exec rm -rf {} + 7 | 8 | -------------------------------------------------------------------------------- /bin/consume-example-json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export AWS_ENDPOINT_URL=http://0.0.0.0:4566 4 | export AWS_ACCESS_KEY_ID=test 5 | export AWS_SECRET_ACCESS_KEY=test 6 | 7 | RUST_LOG=debug ./target/debug/kafka-delta-ingest ingest web_requests ./tests/data/web_requests \ 8 | -l 60 \ 9 | -a web_requests \ 10 | -K "auto.offset.reset=earliest" \ 11 | -t 'date: substr(meta.producer.timestamp, `0`, `10`)' \ 12 | 'meta.kafka.offset: kafka.offset' \ 13 | 'meta.kafka.partition: kafka.partition' \ 14 | 'meta.kafka.topic: kafka.topic' \ 15 | -s "localhost:8125" 16 | 17 | 18 | -------------------------------------------------------------------------------- /bin/extract-example-json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | tar -xzvf tests/json/web_requests-100K.json.tar.gz -C tests/json 4 | 5 | -------------------------------------------------------------------------------- /bin/localstack-setup_emails.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export AWS_ACCESS_KEY_ID=test 4 | export AWS_SECRET_ACCESS_KEY=test 5 | export AWS_DEFAULT_REGION=us-east-2 6 | export ENDPOINT=http://localstack:4566 7 | export AZURE_CONNECTION_STRING="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite:10000/devstoreaccount1;QueueEndpoint=http://azurite:10001/devstoreaccount1;" 8 | 9 | function wait_for() { 10 | retries=10 11 | echo ">> running $2" 12 | set -x 13 | until eval $2 #> /dev/null 2>&1 14 | do 15 | if [ "$retries" -lt "0" ]; then 16 | echo "$1 is still offline after 10 retries"; 17 | exit 1; 18 | fi 19 | echo "Waiting on $1 to start..." 
20 | sleep 5 21 | retries=$((retries - 1)) 22 | done 23 | } 24 | 25 | wait_for "Azurite" "az storage container list --connection-string ${AZURE_CONNECTION_STRING}" 26 | az storage container create -n tests --connection-string ${AZURE_CONNECTION_STRING} 27 | az storage blob upload-batch -d tests -s /data/emails -t block --overwrite --destination-path emails --connection-string ${AZURE_CONNECTION_STRING} 28 | 29 | wait_for "S3" "aws s3api list-buckets --endpoint-url=$ENDPOINT" 30 | 31 | echo "Create delta table in S3" 32 | aws s3api create-bucket --bucket tests --endpoint-url=$ENDPOINT > /dev/null 2>&1 33 | aws s3 sync /data/emails s3://tests/emails/ --delete --endpoint-url=$ENDPOINT 34 | 35 | wait_for "DynamoDB" "aws dynamodb list-tables --endpoint-url=$ENDPOINT" 36 | 37 | echo "Create delta-rs lock table in dynamo" 38 | aws dynamodb delete-table --table-name locks --endpoint-url=$ENDPOINT > /dev/null 2>&1 39 | aws dynamodb create-table \ 40 | --endpoint-url=$ENDPOINT \ 41 | --table-name locks \ 42 | --attribute-definitions AttributeName=tablePath,AttributeType=S \ 43 | AttributeName=fileName,AttributeType=S \ 44 | --key-schema AttributeName=tablePath,KeyType=HASH \ 45 | AttributeName=fileName,KeyType=RANGE \ 46 | --billing-mode PAY_PER_REQUEST 47 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(any(feature = "s3", feature = "azure")))] 2 | compile_error!( 3 | "Either the \"s3\" or the \"azure\" feature must be enabled to compile kafka-delta-ingest" 4 | ); 5 | fn main() {} 6 | -------------------------------------------------------------------------------- /contrib/Dockerfile.setup: -------------------------------------------------------------------------------- 1 | FROM python:3-alpine3.18 2 | 3 | RUN apk add --no-cache aws-cli bash openssh ca-certificates jq curl openssl perl git zip \ 4 | && apk add --no-cache --virtual .build-deps gcc make openssl-dev libffi-dev musl-dev linux-headers \ 5 | && apk add --no-cache libintl icu-libs libc6-compat \ 6 | && apk add --no-cache bash-completion \ 7 | && update-ca-certificates 8 | RUN pip3 install azure-cli 9 | RUN az config set extension.use_dynamic_install=yes_without_prompt 10 | 11 | ENTRYPOINT ["/bin/bash"] 12 | -------------------------------------------------------------------------------- /contrib/docker-compose-with-statsd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.9' 3 | services: 4 | kafka: 5 | image: wurstmeister/kafka 6 | depends_on: 7 | - zookeeper 8 | ports: 9 | - 9092:9092 10 | environment: 11 | KAFKA_ADVERTISED_HOST_NAME: localhost 12 | KAFKA_CREATE_TOPICS: "example:3:1,web_requests:3:1" 13 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 14 | healthcheck: 15 | test: ["CMD", "bash", "-c", "unset", "JMX_PORT", ";", "kafka-topics.sh", "--zookeeper", "zookeeper:2181", "--list"] 16 | 17 | zookeeper: 18 | image: wurstmeister/zookeeper 19 | privileged: true 20 | ports: 21 | - 2181:2181 22 | 23 | localstack: 24 | image: localstack/localstack 25 | ports: 26 | - 4566:4566 27 | environment: 28 | - SERVICES=s3,dynamodb 29 | - DEBUG=1 30 | - DATA_DIR=/tmp/localstack/data 31 | - DOCKER_HOST=unix:///var/run/docker.sock 32 | - HOST_TMP_FOLDER=${TMPDIR} 33 | healthcheck: 34 | test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ] 35 | 36 | statsd: 37 | image: graphiteapp/graphite-statsd 38 | ports: 39 | - 80:80 40 | - 2003:2003 41 | - 2004:2004 
42 | - 2023:2023 43 | - 2024:2024 44 | - 8125:8125/udp 45 | - 8126:8126 46 | 47 | setup: 48 | image: localstack/localstack 49 | depends_on: 50 | - kafka 51 | - localstack 52 | entrypoint: "/bin/bash" 53 | command: 54 | - /localstack-create_dynamodb_test_tables.sh 55 | volumes: 56 | - "./bin/localstack-create_dynamodb_test_tables.sh:/localstack-create_dynamodb_test_tables.sh" 57 | 58 | -------------------------------------------------------------------------------- /doc/DESIGN.md: -------------------------------------------------------------------------------- 1 | 2 | ## Summary 3 | 4 | This document describes the design of the kafka-delta-ingest service. The purpose of kafka-delta-ingest is to consume messages from Kafka topics, perform a few standard transformations (primarily for deriving a date-based partition column and merging service metadata) and append them to delta lake tables. 5 | 6 | Application code shall be implemented in Rust to achieve a high level of efficiency. 7 | 8 | Only JSON Kafka message formats will be supported in the initial implementation. Databricks Spark clusters must be able to read from delta tables written to by kafka-delta-ingest as though they were written by a Spark writer. 9 | 10 | ## Architecture 11 | 12 | This document uses the term "job" to represent the full set of running kafka-delta-ingest resources required to sync a single Kafka topic to a Delta table. The term "process" is used to represent a single running resource that is part of a job. The relationship from topic-to-process is 1:M (to accommodate scalability and efficient resource allocation for both low-volume and high-volume topics), so a topic may have one or more processes handling it, but a single process will only handle a single topic. The relationship from job-to-process is 1:M, and the relationship from job-to-topic is 1:1. 13 | 14 | ### Process Parameters 15 | 16 | Basic required process properties (i.e. parameters with no default) must include: 17 | 18 | * app_id 19 | * unique per topic per environment 20 | * must be the same for all processes that are part of a single job 21 | * topic name 22 | * must be the same for all processes that are part of a single job 23 | * table path 24 | * must be the same for all processes that are part of a single job 25 | 26 | Properties with overrides include: 27 | 28 | * consumer_group_id 29 | * default is kafka_delta_ingest 30 | * must be the same for all processes that are part of a single job 31 | 32 | Properties for which overrides will not be expected or supported in v1 include: 33 | 34 | * compression type (always snappy) 35 | 36 | Figure 1 shows an example of some jobs and their composing processes handling a few different Kafka topic to Delta table streams. 37 | 38 | _Figure 1_ 39 | 40 | ![Jobs and Processes](./img/kafka-delta-ingest-jobs-and-processes.png) 41 | 42 | Each kafka-delta-ingest job is fully distributed with no coordinator process. Processes will coordinate on delta writes using optimistic concurrency. 43 | 44 | --- 45 | 46 | ## Application Design 47 | 48 | The design invariants listed below describe the guidelines considered for the design: 49 | 50 | * To support read efficiency, Kafka messages must be buffered into sizable record batches before they are written 51 | * Parquet file sizes should be as large as possible to limit optimization latency and pre-optimized query latency. 52 | * Messages should be transformable (pure message transformation only - no joins or aggregates) prior to delta table write. 
53 | * Clients should be capable of specifying their allowed message latency for specific jobs, and actual message latency should approximate this value. 54 | * We will assume “processing time == 0” and use allowed latency as a flush marker. 55 | 56 | The goals of achieving optimal size for low read latency and low write latency are in sharp contrast, so allowing for knobs and dials (i.e. configuration parameters) to tune this equation while simultaneously limiting code complexity is a key design factor. 57 | 58 | Key options that should be provided as program arguments include: 59 | 60 | * allowed_latency - max desired latency from when a message is received to when it is written and committed to the target delta table (in seconds). Note that this property is a guideline used internally for controlling frequency of flushes to delta and is not intended as a latency guarantee. The actual latency from the time a message is sent to the Kafka topic and becomes queryable from Delta Lake will exceed this value by the processing time. 61 | * max_messages_per_batch - number of messages to buffer before writing a record batch. This should be set to some number n of messages m, where approximate_compressed_parquet_size `n * m ~= min_bytes_per_file`. Benchmarking should be performed by the user to find the right value that most closely matches min_bytes_per_file. Since the number of bytes that will be written in each parquet file cannot be known prior to writing the compressed parquet bytes to a memory buffer, this option is used to approximate min_bytes_per_file for each RecordBatch based on what benchmarking indicates is appropriate on average. This means that, unless allowed_latency is reached, the actual bytes per parquet file should usually be ~= twice the value of min_bytes_per_file. 62 | * min_bytes_per_file - desired minimum number of compressed parquet bytes to buffer in memory before writing to storage and committing a transaction. Note that this property is trumped by allowed_latency. If allowed latency is reached before min_bytes_per_file, file size will correspond to the number of bytes received in the allowed latency interval. 63 | 64 | The primary run loop of kafka-delta-ingest should follow the pseudo-code outline described below: 65 | 66 | * Create an rdkafka StreamConsumer for the Kafka topic with the configured consumer group id. 67 | * Check assigned partitions and store them internally. 68 | * Register rebalance handler with ConsumerContext. 69 | * Handle each message from the topic in a loop. 70 | * Invoke a transformation function for each received message. 71 | * This allows the client context to modify the Kafka message and mutate it before it is written to Delta. 72 | * Buffer each transformed message along with its partition and offset until either options.allowed_latency or options.max_messages_per_batch is reached. 73 | * When either options.allowed_latency or options.max_messages_per_batch is reached, write a record batch to the underlying writer. 74 | * If options.allowed_latency is reached, complete the underlying writer, commit the Delta transaction, and commit Kafka offsets. 75 | * If options.allowed_latency is NOT MET, but options.message_buffer_length is, continue to hold the writer open so additional batches may be written to the same file. 76 | * After each record batch write, check current number of buffered messages. 
If this meets or exceeds options.min_bytes_per_file, complete the underlying writer, and write the buffer bytes to S3, and execute an end-to-end transaction (See “End-to-end Transactions / KafkaOffsetHandling”). 77 | 78 | When creating DeltaTransactions - these shall be created with a StreamingUpdate operation in [CommitInfo](https://github.com/delta-io/delta/blob/master/PROTOCOL.md#commit-provenance-information). See [StreamingUpdate](https://github.com/delta-io/delta/blob/7899c47dd6594128d80db341bcb8d89ef62a9b78/src/main/scala/org/apache/spark/sql/delta/DeltaOperations.scala#L70) in the reference implementation. 79 | 80 | ### Key Dependencies 81 | 82 | This section lists the crate repositories that are most important to the total job of kafka-delta-ingest. 83 | 84 | * [delta-rs](https://github.com/delta-io/delta-rs) 85 | * [arrow](https://github.com/apache/arrow-rs) 86 | * [parquet](https://github.com/apache/arrow/tree/master/rust/parquet) 87 | * [rdkafka](https://github.com/fede1024/rust-rdkafka) 88 | * [dipstick](https://github.com/fralalonde/dipstick) 89 | * [tokio](https://github.com/tokio-rs/tokio) 90 | 91 | 92 | ### End-to-end Transactions / Kafka Offset Handling 93 | 94 | This section describes how end-to-end transactions that account for Kafka partition offsets should be committed in a way that prevents duplication from re-processing of already written messages. 95 | 96 | On startup and after rebalance, each kafka-delta-ingest process must check its assigned partitions and re-seek the consumer when necessary. Each process must maintain an internal map of assigned partitions and current offsets. Kafka’s group management protocol guarantees that individual consumers within a group will be assigned a mutually exclusive set of partitions at any given time. 97 | 98 | Upon startup or partition assignment (in case of rebalance), to identify the last offset written to Delta Lake for each assigned partition, the kafka-delta-ingest process must locate the last txn action in the delta log for each of its assigned partitions and re-seek the consumer based on the txn.version attribute of the delta log. 99 | 100 | When performing a write, each process must commit the last offset of each partition within a txn action contained in the delta log entry. 101 | 102 | Figure 2 shows a workflow diagram of these steps. 103 | 104 | _Figure 2_ 105 | 106 | ![Process Workflow](./img/kafka-delta-ingest-workflow.png) 107 | 108 | -------------------------------------------------------------------------------- /doc/HACKING.adoc: -------------------------------------------------------------------------------- 1 | ifdef::env-github[] 2 | :tip-caption: :bulb: 3 | :note-caption: :information_source: 4 | :important-caption: :heavy_exclamation_mark: 5 | :caution-caption: :fire: 6 | :warning-caption: :warning: 7 | endif::[] 8 | :toc: macro 9 | 10 | = Hacking on kafka-delta-ingest 11 | 12 | kafka-delta-ingest is designed to work well with AWS and Azure, which can add 13 | some complexity to the development environment. This document outlines how to 14 | work with kafka-delta-ingest locally for new and existing Rust developers. 15 | 16 | toc::[] 17 | 18 | == Developing 19 | 20 | Make sure the docker-compose setup has been ran, and execute `cargo test` to run unit and integration tests. 
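A minimal sketch of that loop, mirroring what CI does in `.github/workflows/build.yml` (the `make check` target runs the same tests with both features enabled):

[source,bash]
----
# Bring up the local test stack (Redpanda/Kafka, localstack, azurite,
# schema-registry) and seed the example Delta tables and lock table.
docker compose up setup

# Run unit and integration tests. At least one of the "s3" or "azure"
# features must be enabled for the crate to compile (see build.rs).
cargo test --features s3
cargo test --features azure
----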
21 | 22 | == Environment Variables 23 | 24 | |=== 25 | | Name | Default | Notes 26 | 27 | | `KAFKA_BROKERS` 28 | | `0.0.0.0:9092` 29 | | A kafka broker string which can be used during integration testing 30 | 31 | 32 | | `AWS_ENDPOINT_URL` 33 | | `http://0.0.0.0:4056` 34 | | AWS endpoint URL for something that can provide stub S3 and DynamoDB operations (e.g. Localstack) 35 | 36 | | `AWS_S3_BUCKET` 37 | | `tests` 38 | | Bucket to use for test data at the given endpoint 39 | 40 | |=== 41 | 42 | 43 | == Example Data 44 | 45 | A tarball containing 100K line-delimited JSON messages is included in `tests/json/web_requests-100K.json.tar.gz`. Running `./bin/extract-example-json.sh` will unpack this to the expected location. 46 | 47 | 48 | .Pretty-printed example from the file 49 | [source,json] 50 | ---- 51 | { 52 | "meta": { 53 | "producer": { 54 | "timestamp": "2021-03-24T15:06:17.321710+00:00" 55 | } 56 | }, 57 | "method": "DELETE", 58 | "session_id": "7c28bcf9-be26-4d0b-931a-3374ab4bb458", 59 | "status": 204, 60 | "url": "http://www.youku.com", 61 | "uuid": "831c6afa-375c-4988-b248-096f9ed101f8" 62 | } 63 | ---- 64 | 65 | After extracting the example data, you'll need to play this onto the web_requests topic of the local Kafka container. 66 | 67 | NOTE: URLs sampled for the test data are sourced from Wikipedia's list of most popular websites - https://en.wikipedia.org/wiki/List_of_most_popular_websites. 68 | 69 | === Inspect example output 70 | 71 | * List data files - `ls tests/data/web_requests/date=2021-03-24` 72 | * List delta log files - `ls tests/data/web_requests/_delta_log` 73 | * Show some parquet data (using link:https://pypi.org/project/parquet-tools/[parquet-tools]) 74 | ** `parquet-tools show tests/data/web_requests/date=2021-03-24/` 75 | -------------------------------------------------------------------------------- /doc/img/kafka-delta-ingest-interaction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta-io/kafka-delta-ingest/da9c932be3a98649da74ed91f5e1593bece65e89/doc/img/kafka-delta-ingest-interaction.png -------------------------------------------------------------------------------- /doc/img/kafka-delta-ingest-jobs-and-processes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta-io/kafka-delta-ingest/da9c932be3a98649da74ed91f5e1593bece65e89/doc/img/kafka-delta-ingest-jobs-and-processes.png -------------------------------------------------------------------------------- /doc/img/kafka-delta-ingest-transaction-state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta-io/kafka-delta-ingest/da9c932be3a98649da74ed91f5e1593bece65e89/doc/img/kafka-delta-ingest-transaction-state.png -------------------------------------------------------------------------------- /doc/img/kafka-delta-ingest-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta-io/kafka-delta-ingest/da9c932be3a98649da74ed91f5e1593bece65e89/doc/img/kafka-delta-ingest-workflow.png -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.9' 3 | services: 4 | kafka: 5 | image: docker.redpanda.com/redpandadata/redpanda:v23.1.13 6 | command: 7 | - redpanda 8 | - start 9 | - 
--smp 10 | - '1' 11 | - --reserve-memory 12 | - 0M 13 | - --overprovisioned 14 | - --node-id 15 | - '0' 16 | - --kafka-addr 17 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 18 | - --advertise-kafka-addr 19 | - PLAINTEXT://redpanda:29092,OUTSIDE://localhost:9092 20 | - --pandaproxy-addr 21 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 22 | - --advertise-pandaproxy-addr 23 | - PLAINTEXT://redpanda:28082,OUTSIDE://localhost:8082 24 | networks: 25 | default: 26 | aliases: 27 | - redpanda 28 | ports: 29 | - 8082:8082 30 | - 9092:9092 31 | - 28082:28082 32 | - 29092:29092 33 | 34 | schema-registry: 35 | image: confluentinc/cp-schema-registry:7.4.0 36 | hostname: schema-registry 37 | container_name: schema-registry 38 | depends_on: 39 | - kafka 40 | ports: 41 | - "8081:8081" 42 | environment: 43 | SCHEMA_REGISTRY_HOST_NAME: schema-registry 44 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'redpanda:29092' 45 | SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081 46 | 47 | localstack: 48 | image: localstack/localstack:0.13.1 49 | ports: 50 | - "4566:4566" 51 | environment: 52 | - SERVICES=s3,dynamodb 53 | - DEBUG=1 54 | - DATA_DIR=/tmp/localstack/data 55 | - DOCKER_HOST=unix:///var/run/docker.sock 56 | - HOST_TMP_FOLDER=${TMPDIR} 57 | healthcheck: 58 | test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ] 59 | azurite: 60 | image: mcr.microsoft.com/azure-storage/azurite 61 | ports: 62 | - "10000:10000" 63 | - "10001:10001" 64 | setup: 65 | build: 66 | context: . 67 | dockerfile: contrib/Dockerfile.setup 68 | depends_on: 69 | - kafka 70 | - localstack 71 | - azurite 72 | - schema-registry 73 | entrypoint: "/bin/bash" 74 | command: 75 | - /localstack-setup_emails.sh 76 | volumes: 77 | - ./bin/localstack-setup_emails.sh:/localstack-setup_emails.sh 78 | - "./tests/data/emails/:/data/emails" 79 | -------------------------------------------------------------------------------- /src/coercions.rs: -------------------------------------------------------------------------------- 1 | use deltalake_core::kernel::Schema as DeltaSchema; 2 | use deltalake_core::kernel::{DataType, PrimitiveType}; 3 | 4 | use chrono::prelude::*; 5 | use serde_json::Value; 6 | use std::collections::HashMap; 7 | use std::str::FromStr; 8 | 9 | #[derive(Debug, Clone, PartialEq)] 10 | #[allow(unused)] 11 | enum CoercionNode { 12 | Coercion(Coercion), 13 | Tree(CoercionTree), 14 | ArrayTree(CoercionTree), 15 | ArrayPrimitive(Coercion), 16 | } 17 | 18 | #[derive(Debug, Clone, PartialEq)] 19 | enum Coercion { 20 | ToString, 21 | ToTimestamp, 22 | } 23 | 24 | #[derive(Debug, Clone, PartialEq)] 25 | pub(crate) struct CoercionTree { 26 | root: HashMap, 27 | } 28 | 29 | /// Returns a [`CoercionTree`] so the schema can be walked efficiently level by level when performing conversions. 
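///
/// For example (illustrative field names), a schema with `ts: timestamp`,
/// `meta: struct<id: string>`, `tags: array<string>` and `count: integer` produces a
/// tree shaped like `{ ts: ToTimestamp, meta: Tree { id: ToString }, tags: ArrayPrimitive(ToString) }`;
/// fields that never need coercion (such as `count`) simply do not appear in the tree.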
30 | pub(crate) fn create_coercion_tree(schema: &DeltaSchema) -> CoercionTree { 31 | let mut root = HashMap::new(); 32 | 33 | for field in schema.fields() { 34 | if let Some(node) = build_coercion_node(field.data_type()) { 35 | root.insert(field.name().to_string(), node); 36 | } 37 | } 38 | 39 | CoercionTree { root } 40 | } 41 | 42 | fn build_coercion_node(data_type: &DataType) -> Option { 43 | match data_type { 44 | DataType::Primitive(primitive) => match primitive { 45 | PrimitiveType::String => Some(CoercionNode::Coercion(Coercion::ToString)), 46 | PrimitiveType::Timestamp => Some(CoercionNode::Coercion(Coercion::ToTimestamp)), 47 | _ => None, 48 | }, 49 | DataType::Struct(st) => { 50 | let nested_context = create_coercion_tree(st); 51 | if !nested_context.root.is_empty() { 52 | Some(CoercionNode::Tree(nested_context)) 53 | } else { 54 | None 55 | } 56 | } 57 | DataType::Array(array) => { 58 | build_coercion_node(array.element_type()).and_then(|node| match node { 59 | CoercionNode::Coercion(c) => Some(CoercionNode::ArrayPrimitive(c)), 60 | CoercionNode::Tree(t) => Some(CoercionNode::ArrayTree(t)), 61 | _ => None, 62 | }) 63 | } 64 | _ => None, 65 | } 66 | } 67 | 68 | /// Applies all data coercions specified by the [`CoercionTree`] to the [`Value`]. 69 | /// Though it does not currently, this function should approximate or improve on the coercions applied by [Spark's `from_json`](https://spark.apache.org/docs/latest/api/sql/index.html#from_json) 70 | pub(crate) fn coerce(value: &mut Value, coercion_tree: &CoercionTree) { 71 | if let Some(context) = value.as_object_mut() { 72 | for (field_name, coercion) in coercion_tree.root.iter() { 73 | if let Some(value) = context.get_mut(field_name) { 74 | apply_coercion(value, coercion); 75 | } 76 | } 77 | } 78 | } 79 | 80 | fn apply_coercion(value: &mut Value, node: &CoercionNode) { 81 | match node { 82 | CoercionNode::Coercion(Coercion::ToString) => { 83 | if !value.is_string() { 84 | *value = Value::String(value.to_string()); 85 | } 86 | } 87 | CoercionNode::Coercion(Coercion::ToTimestamp) => { 88 | if let Some(as_str) = value.as_str() { 89 | if let Some(parsed) = string_to_timestamp(as_str) { 90 | *value = parsed 91 | } 92 | } 93 | } 94 | CoercionNode::Tree(tree) => { 95 | for (name, node) in tree.root.iter() { 96 | let fields = value.as_object_mut(); 97 | if let Some(fields) = fields { 98 | if let Some(value) = fields.get_mut(name) { 99 | apply_coercion(value, node); 100 | } 101 | } 102 | } 103 | } 104 | CoercionNode::ArrayPrimitive(coercion) => { 105 | let values = value.as_array_mut(); 106 | if let Some(values) = values { 107 | let node = CoercionNode::Coercion(coercion.clone()); 108 | for value in values { 109 | apply_coercion(value, &node); 110 | } 111 | } 112 | } 113 | CoercionNode::ArrayTree(tree) => { 114 | let values = value.as_array_mut(); 115 | if let Some(values) = values { 116 | let node = CoercionNode::Tree(tree.clone()); 117 | for value in values { 118 | apply_coercion(value, &node); 119 | } 120 | } 121 | } 122 | } 123 | } 124 | 125 | /// Convert a given datetime looking string into microseconds using chrono's [DateTime] parsing 126 | /// 127 | /// Since this will convert to microseconds, if the value is outside of the realm of conversion a None is returned 128 | fn string_to_timestamp(string: &str) -> Option { 129 | let parsed = DateTime::from_str(string); 130 | if let Err(e) = parsed { 131 | log::error!( 132 | "Error coercing timestamp from string. String: {}. 
Error: {}", 133 | string, 134 | e 135 | ) 136 | } 137 | parsed 138 | .ok() 139 | .map(|dt: DateTime| Value::Number(dt.timestamp_micros().into())) 140 | } 141 | 142 | #[cfg(test)] 143 | mod tests { 144 | use super::*; 145 | use serde_json::json; 146 | 147 | #[test] 148 | fn test_string_to_timestamp() { 149 | let result = string_to_timestamp("2010-01-01T22:11:58Z"); 150 | assert!(result.is_some()); 151 | // exceeds nanos size 152 | let result = string_to_timestamp("2400-01-01T22:11:58Z"); 153 | assert!(result.is_some()); 154 | } 155 | 156 | lazy_static! { 157 | static ref SCHEMA: Value = json!({ 158 | "type": "struct", 159 | "fields": [ 160 | { "name": "level1_string", "type": "string", "nullable": true, "metadata": {} }, 161 | { "name": "level1_integer", "type": "integer", "nullable": true, "metadata": {} }, 162 | { "name": "level1_timestamp", "type": "timestamp", "nullable": true, "metadata": {} }, 163 | { 164 | "name": "level2", 165 | "type": { 166 | "type": "struct", 167 | "fields": [ 168 | { 169 | "name": "level2_string", 170 | "type": "string", 171 | "nullable": true, "metadata": {} 172 | }, 173 | { 174 | "name": "level2_int", 175 | "type": "integer", 176 | "nullable": true, "metadata": {} 177 | }, 178 | { 179 | "name": "level2_timestamp", 180 | "type": "timestamp", 181 | "nullable": true, "metadata": {} 182 | }] 183 | }, 184 | "nullable": true, "metadata": {} 185 | }, 186 | { 187 | "name": "array_timestamp", 188 | "type": { 189 | "type": "array", 190 | "containsNull": true, 191 | "elementType": "timestamp", 192 | }, 193 | "nullable": true, "metadata": {}, 194 | }, 195 | { 196 | "name": "array_string", 197 | "type": { 198 | "type": "array", 199 | "containsNull": true, 200 | "elementType": "string", 201 | }, 202 | "nullable": true, "metadata": {}, 203 | }, 204 | { 205 | "name": "array_int", 206 | "type": { 207 | "type": "array", 208 | "containsNull": true, 209 | "elementType": "integer", 210 | }, 211 | "nullable": true, "metadata": {}, 212 | }, 213 | { 214 | "name": "array_struct", 215 | "type": { 216 | "type": "array", 217 | "containsNull": true, 218 | "elementType": { 219 | "type": "struct", 220 | "fields": [ 221 | { 222 | "name": "level2_string", 223 | "type": "string", 224 | "nullable": true, "metadata": {} 225 | }, 226 | { 227 | "name": "level2_int", 228 | "type": "integer", 229 | "nullable": true, "metadata": {} 230 | }, 231 | { 232 | "name": "level2_timestamp", 233 | "type": "timestamp", 234 | "nullable": true, "metadata": {} 235 | }, 236 | ], 237 | }, 238 | }, 239 | "nullable": true, "metadata": {}, 240 | } 241 | ] 242 | }); 243 | } 244 | 245 | #[test] 246 | fn test_coercion_tree() { 247 | let delta_schema: DeltaSchema = serde_json::from_value(SCHEMA.clone()).unwrap(); 248 | 249 | let tree = create_coercion_tree(&delta_schema); 250 | 251 | let mut top_level_keys: Vec<&String> = tree.root.keys().collect(); 252 | top_level_keys.sort(); 253 | 254 | let level2 = tree.root.get("level2"); 255 | let level2_root = match level2 { 256 | Some(CoercionNode::Tree(tree)) => tree.root.clone(), 257 | _ => unreachable!(""), 258 | }; 259 | let mut level2_keys: Vec<&String> = level2_root.keys().collect(); 260 | level2_keys.sort(); 261 | 262 | let array_struct = tree.root.get("array_struct"); 263 | let array_struct_root = match array_struct { 264 | Some(CoercionNode::ArrayTree(tree)) => tree.root.clone(), 265 | _ => unreachable!(""), 266 | }; 267 | 268 | assert_eq!( 269 | vec![ 270 | "array_string", 271 | "array_struct", 272 | "array_timestamp", 273 | "level1_string", 274 | "level1_timestamp", 275 
| "level2" 276 | ], 277 | top_level_keys 278 | ); 279 | 280 | assert_eq!(vec!["level2_string", "level2_timestamp"], level2_keys); 281 | 282 | assert_eq!( 283 | CoercionNode::Coercion(Coercion::ToString), 284 | tree.root.get("level1_string").unwrap().to_owned() 285 | ); 286 | assert_eq!( 287 | CoercionNode::Coercion(Coercion::ToTimestamp), 288 | tree.root.get("level1_timestamp").unwrap().to_owned() 289 | ); 290 | assert_eq!( 291 | CoercionNode::Coercion(Coercion::ToString), 292 | level2_root.get("level2_string").unwrap().to_owned() 293 | ); 294 | assert_eq!( 295 | CoercionNode::Coercion(Coercion::ToTimestamp), 296 | level2_root.get("level2_timestamp").unwrap().to_owned() 297 | ); 298 | assert_eq!( 299 | CoercionNode::ArrayPrimitive(Coercion::ToString), 300 | tree.root.get("array_string").unwrap().to_owned() 301 | ); 302 | assert_eq!( 303 | CoercionNode::ArrayPrimitive(Coercion::ToTimestamp), 304 | tree.root.get("array_timestamp").unwrap().to_owned() 305 | ); 306 | assert_eq!( 307 | CoercionNode::Coercion(Coercion::ToString), 308 | array_struct_root.get("level2_string").unwrap().to_owned() 309 | ); 310 | assert_eq!( 311 | CoercionNode::Coercion(Coercion::ToTimestamp), 312 | array_struct_root 313 | .get("level2_timestamp") 314 | .unwrap() 315 | .to_owned() 316 | ); 317 | } 318 | 319 | #[test] 320 | fn test_coercions() { 321 | let delta_schema: DeltaSchema = serde_json::from_value(SCHEMA.clone()).unwrap(); 322 | 323 | let coercion_tree = create_coercion_tree(&delta_schema); 324 | 325 | let mut messages = vec![ 326 | json!({ 327 | "level1_string": "a", 328 | "level1_integer": 0, 329 | // Timestamp passed in as an i64. We won't coerce it, but it will work anyway. 330 | "level1_timestamp": 1636668718000000i64, 331 | "level2": { 332 | "level2_string": { "x": "x", "y": "y" }, 333 | "level2_timestamp": "2021-11-11T22:11:58Z" 334 | }, 335 | "array_timestamp": ["2021-11-17T01:02:03Z", "2021-11-17T02:03:04Z"], 336 | "array_string": ["a", "b", {"a": 1}], 337 | "array_int": [1, 2, 3], 338 | "array_struct": [ 339 | { 340 | "level2_string": r#"{"a":1}"#, 341 | "level2_int": 1, 342 | "level2_timestamp": "2021-11-17T00:00:01Z" 343 | }, 344 | { 345 | "level2_string": { "a": 2 }, 346 | "level2_int": 2, 347 | "level2_timestamp": 1637107202000000i64 348 | }, 349 | ] 350 | }), 351 | json!({ 352 | "level1_string": { "a": "a", "b": "b"}, 353 | "level1_integer": 42, 354 | // Complies with ISO 8601 and RFC 3339. We WILL coerce it. 355 | "level1_timestamp": "2021-11-11T22:11:58Z" 356 | }), 357 | json!({ 358 | "level1_integer": 99, 359 | }), 360 | json!({ 361 | // Complies with ISO 8601 and RFC 3339. We WILL coerce it. 362 | "level1_timestamp": "2021-11-11T22:11:58+00:00", 363 | }), 364 | json!({ 365 | // RFC 3339 but not ISO 8601. We WILL coerce it. 366 | "level1_timestamp": "2021-11-11T22:11:58-00:00", 367 | }), 368 | json!({ 369 | // ISO 8601 but not RFC 3339. We WON'T coerce it. 370 | "level1_timestamp": "20211111T22115800Z", 371 | }), 372 | json!({ 373 | // This is a Java date style timestamp. We WON'T coerce it. 374 | "level1_timestamp": "2021-11-11 22:11:58", 375 | }), 376 | json!({ 377 | "level1_timestamp": "This definitely is not a timestamp", 378 | }), 379 | json!({ 380 | // This is valid epoch micros, but typed as a string on the way in. We WON'T coerce it. 
381 | "level1_timestamp": "1636668718000000", 382 | }), 383 | ]; 384 | 385 | for message in messages.iter_mut() { 386 | coerce(message, &coercion_tree); 387 | } 388 | 389 | let expected = vec![ 390 | json!({ 391 | "level1_string": "a", 392 | "level1_integer": 0, 393 | // Timestamp passed in as an i64. We won't coerce it, but it will work anyway. 394 | "level1_timestamp": 1636668718000000i64, 395 | "level2": { 396 | "level2_string": r#"{"x":"x","y":"y"}"#, 397 | "level2_timestamp": 1636668718000000i64 398 | }, 399 | "array_timestamp": [1637110923000000i64, 1637114584000000i64], 400 | "array_string": ["a", "b", r#"{"a":1}"#], 401 | "array_int": [1, 2, 3], 402 | "array_struct": [ 403 | { 404 | "level2_string": "{\"a\":1}", 405 | "level2_int": 1, 406 | "level2_timestamp": 1637107201000000i64 407 | }, 408 | { 409 | "level2_string": r#"{"a":2}"#, 410 | "level2_int": 2, 411 | "level2_timestamp": 1637107202000000i64 412 | }, 413 | ] 414 | }), 415 | json!({ 416 | "level1_string": r#"{"a":"a","b":"b"}"#, 417 | "level1_integer": 42, 418 | // Complies with ISO 8601 and RFC 3339. We WILL coerce it. 419 | "level1_timestamp": 1636668718000000i64 420 | }), 421 | json!({ 422 | "level1_integer": 99, 423 | }), 424 | json!({ 425 | // Complies with ISO 8601 and RFC 3339. We WILL coerce it. 426 | "level1_timestamp": 1636668718000000i64 427 | }), 428 | json!({ 429 | // RFC 3339 but not ISO 8601. We WILL coerce it. 430 | "level1_timestamp": 1636668718000000i64 431 | }), 432 | json!({ 433 | // ISO 8601 but not RFC 3339. We WON'T coerce it. 434 | "level1_timestamp": "20211111T22115800Z", 435 | }), 436 | json!({ 437 | // This is a Java date style timestamp. We WON'T coerce it. 438 | "level1_timestamp": "2021-11-11 22:11:58", 439 | }), 440 | json!({ 441 | "level1_timestamp": "This definitely is not a timestamp", 442 | }), 443 | json!({ 444 | // This is valid epoch micros, but typed as a string on the way in. We WON'T coerce it. 445 | "level1_timestamp": "1636668718000000", 446 | }), 447 | ]; 448 | 449 | for i in 0..messages.len() { 450 | assert_eq!(messages[i], expected[i]); 451 | } 452 | } 453 | } 454 | -------------------------------------------------------------------------------- /src/cursor.rs: -------------------------------------------------------------------------------- 1 | //! Cursors lifted from an older version of parquet. 2 | use std::io::{self, Cursor, Error, ErrorKind, Read, Seek, SeekFrom, Write}; 3 | use std::sync::{Arc, Mutex}; 4 | use std::{cmp, fmt}; 5 | 6 | #[allow(clippy::rc_buffer)] 7 | /// A SliceableCursor lifted from the legacy parquet implementation 8 | pub struct SliceableCursor { 9 | inner: Arc>, 10 | start: u64, 11 | length: usize, 12 | pos: u64, 13 | } 14 | 15 | impl fmt::Debug for SliceableCursor { 16 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 17 | f.debug_struct("SliceableCursor") 18 | .field("start", &self.start) 19 | .field("length", &self.length) 20 | .field("pos", &self.pos) 21 | .field("inner.len", &self.inner.len()) 22 | .finish() 23 | } 24 | } 25 | 26 | impl SliceableCursor { 27 | /// Create a new SliceableCursor 28 | pub fn new(content: impl Into>>) -> Self { 29 | let inner = content.into(); 30 | let size = inner.len(); 31 | SliceableCursor { 32 | inner, 33 | start: 0, 34 | pos: 0, 35 | length: size, 36 | } 37 | } 38 | 39 | /// Create a slice cursor using the same data as a current one. 
40 | pub fn slice(&self, start: u64, length: usize) -> io::Result { 41 | let new_start = self.start + start; 42 | if new_start >= self.inner.len() as u64 || new_start as usize + length > self.inner.len() { 43 | return Err(Error::new(ErrorKind::InvalidInput, "out of bound")); 44 | } 45 | Ok(SliceableCursor { 46 | inner: Arc::clone(&self.inner), 47 | start: new_start, 48 | pos: new_start, 49 | length, 50 | }) 51 | } 52 | 53 | fn remaining_slice(&self) -> &[u8] { 54 | let end = self.start as usize + self.length; 55 | let offset = cmp::min(self.pos, end as u64) as usize; 56 | &self.inner[offset..end] 57 | } 58 | 59 | /// Get the length of the current cursor slice 60 | pub fn len(&self) -> u64 { 61 | self.length as u64 62 | } 63 | 64 | /// return true if the cursor is empty (self.len() == 0) 65 | pub fn is_empty(&self) -> bool { 66 | self.len() == 0 67 | } 68 | } 69 | 70 | /// Implementation inspired by std::io::Cursor 71 | impl Read for SliceableCursor { 72 | fn read(&mut self, buf: &mut [u8]) -> io::Result { 73 | let n = Read::read(&mut self.remaining_slice(), buf)?; 74 | self.pos += n as u64; 75 | Ok(n) 76 | } 77 | } 78 | 79 | impl Seek for SliceableCursor { 80 | fn seek(&mut self, pos: SeekFrom) -> io::Result { 81 | let new_pos = match pos { 82 | SeekFrom::Start(pos) => pos as i64, 83 | SeekFrom::End(pos) => self.inner.len() as i64 + pos, 84 | SeekFrom::Current(pos) => self.pos as i64 + pos, 85 | }; 86 | 87 | if new_pos < 0 { 88 | Err(Error::new( 89 | ErrorKind::InvalidInput, 90 | format!( 91 | "Request out of bounds: cur position {} + seek {:?} < 0: {}", 92 | self.pos, pos, new_pos 93 | ), 94 | )) 95 | } else if new_pos >= self.inner.len() as i64 { 96 | Err(Error::new( 97 | ErrorKind::InvalidInput, 98 | format!( 99 | "Request out of bounds: cur position {} + seek {:?} >= length {}: {}", 100 | self.pos, 101 | pos, 102 | self.inner.len(), 103 | new_pos 104 | ), 105 | )) 106 | } else { 107 | self.pos = new_pos as u64; 108 | Ok(self.start) 109 | } 110 | } 111 | } 112 | 113 | #[derive(Debug, Default, Clone)] 114 | pub(crate) struct InMemoryWriteableCursor { 115 | buffer: Arc>>>, 116 | } 117 | 118 | impl InMemoryWriteableCursor { 119 | /// Returns a clone of the underlying buffer 120 | pub fn data(&self) -> Vec { 121 | let inner = self.buffer.lock().unwrap(); 122 | inner.get_ref().to_vec() 123 | } 124 | } 125 | 126 | impl Write for InMemoryWriteableCursor { 127 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 128 | let mut inner = self.buffer.lock().unwrap(); 129 | inner.write(buf) 130 | } 131 | 132 | fn flush(&mut self) -> std::io::Result<()> { 133 | let mut inner = self.buffer.lock().unwrap(); 134 | inner.flush() 135 | } 136 | } 137 | 138 | impl Seek for InMemoryWriteableCursor { 139 | fn seek(&mut self, pos: SeekFrom) -> std::io::Result { 140 | let mut inner = self.buffer.lock().unwrap(); 141 | inner.seek(pos) 142 | } 143 | } 144 | 145 | #[cfg(test)] 146 | mod tests { 147 | use super::*; 148 | 149 | /// Create a SliceableCursor of all u8 values in ascending order 150 | fn get_u8_range() -> SliceableCursor { 151 | let data: Vec = (0u8..=255).collect(); 152 | SliceableCursor::new(data) 153 | } 154 | 155 | /// Reads all the bytes in the slice and checks that it matches the u8 range from start to end_included 156 | fn check_read_all(mut cursor: SliceableCursor, start: u8, end_included: u8) { 157 | let mut target = vec![]; 158 | let cursor_res = cursor.read_to_end(&mut target); 159 | println!("{:?}", cursor_res); 160 | assert!(cursor_res.is_ok(), "reading error"); 161 | 
assert_eq!((end_included - start) as usize + 1, cursor_res.unwrap()); 162 | assert_eq!((start..=end_included).collect::>(), target); 163 | } 164 | 165 | #[test] 166 | fn read_all_whole() { 167 | let cursor = get_u8_range(); 168 | check_read_all(cursor, 0, 255); 169 | } 170 | 171 | #[test] 172 | fn read_all_slice() { 173 | let cursor = get_u8_range().slice(10, 10).expect("error while slicing"); 174 | check_read_all(cursor, 10, 19); 175 | } 176 | 177 | #[test] 178 | fn seek_cursor_start() { 179 | let mut cursor = get_u8_range(); 180 | 181 | cursor.seek(SeekFrom::Start(5)).unwrap(); 182 | check_read_all(cursor, 5, 255); 183 | } 184 | 185 | #[test] 186 | fn seek_cursor_current() { 187 | let mut cursor = get_u8_range(); 188 | cursor.seek(SeekFrom::Start(10)).unwrap(); 189 | cursor.seek(SeekFrom::Current(10)).unwrap(); 190 | check_read_all(cursor, 20, 255); 191 | } 192 | 193 | #[test] 194 | fn seek_cursor_end() { 195 | let mut cursor = get_u8_range(); 196 | 197 | cursor.seek(SeekFrom::End(-10)).unwrap(); 198 | check_read_all(cursor, 246, 255); 199 | } 200 | 201 | #[test] 202 | fn seek_cursor_error_too_long() { 203 | let mut cursor = get_u8_range(); 204 | let res = cursor.seek(SeekFrom::Start(1000)); 205 | let actual_error = res.expect_err("expected error").to_string(); 206 | let expected_error = 207 | "Request out of bounds: cur position 0 + seek Start(1000) >= length 256: 1000"; 208 | assert_eq!(actual_error, expected_error); 209 | } 210 | 211 | #[test] 212 | fn seek_cursor_error_too_short() { 213 | let mut cursor = get_u8_range(); 214 | let res = cursor.seek(SeekFrom::End(-1000)); 215 | let actual_error = res.expect_err("expected error").to_string(); 216 | let expected_error = "Request out of bounds: cur position 0 + seek End(-1000) < 0: -744"; 217 | assert_eq!(actual_error, expected_error); 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /src/dead_letters.rs: -------------------------------------------------------------------------------- 1 | use crate::transforms::Transformer; 2 | use async_trait::async_trait; 3 | use chrono::prelude::*; 4 | use core::fmt::Debug; 5 | use deltalake_core::parquet::errors::ParquetError; 6 | use deltalake_core::{DeltaTable, DeltaTableError}; 7 | #[cfg(feature = "s3")] 8 | use dynamodb_lock::dynamo_lock_options; 9 | use log::{error, info, warn}; 10 | #[cfg(feature = "s3")] 11 | use maplit::hashmap; 12 | use rdkafka::message::BorrowedMessage; 13 | use serde::{Deserialize, Serialize}; 14 | use serde_json::Value; 15 | use std::collections::HashMap; 16 | 17 | use crate::{transforms::TransformError, writer::*}; 18 | 19 | #[cfg(feature = "s3")] 20 | mod env_vars { 21 | pub(crate) const DEAD_LETTER_DYNAMO_LOCK_PARTITION_KEY_VALUE: &str = 22 | "DEAD_LETTER_DYNAMO_LOCK_PARTITION_KEY_VALUE"; 23 | } 24 | 25 | /// Struct that represents a dead letter record. 26 | #[derive(Clone, Debug, Serialize, Deserialize)] 27 | pub struct DeadLetter { 28 | /// Base64 encoded bytes captured when message deserialization from Kafka fails. 29 | pub base64_bytes: Option, 30 | /// JSON string captured when either transform or parquet write fails. 31 | pub json_string: Option, 32 | /// Error string that correlates with the failure. 33 | /// The error type will vary depending on the context. 34 | pub error: Option, 35 | /// Timestamp microseconds representing when the dead letter was created. 36 | /// Microseconds are used for compatability with the deltalake `timestamp` type. 
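/// For example, the constructors below derive it as
/// `Utc::now().timestamp_nanos_opt().expect(..) / 1000`.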
37 | pub timestamp: i64, 38 | } 39 | 40 | impl DeadLetter { 41 | /// Creates a dead letter from bytes that failed deserialization. 42 | /// `json_string` will always be `None`. 43 | pub(crate) fn from_failed_deserialization(bytes: &[u8], err: String) -> Self { 44 | let timestamp = Utc::now(); 45 | Self { 46 | base64_bytes: Some(base64::encode(bytes)), 47 | json_string: None, 48 | error: Some(err), 49 | timestamp: timestamp 50 | .timestamp_nanos_opt() 51 | .expect("Failed to convert timezone to nanoseconds") 52 | / 1000, 53 | } 54 | } 55 | 56 | /// Creates a dead letter from a failed transform. 57 | /// `base64_bytes` will always be `None`. 58 | pub(crate) fn from_failed_transform(value: &Value, err: TransformError) -> Self { 59 | let timestamp = Utc::now(); 60 | Self { 61 | base64_bytes: None, 62 | json_string: Some(value.to_string()), 63 | error: Some(err.to_string()), 64 | timestamp: timestamp 65 | .timestamp_nanos_opt() 66 | .expect("Failed to convert timezone to nanoseconds") 67 | / 1000, 68 | } 69 | } 70 | 71 | /// Creates a dead letter from a record that fails on parquet write. 72 | /// `base64_bytes` will always be `None`. 73 | /// `json_string` will contain the stringified JSON that was not writeable to parquet. 74 | pub(crate) fn from_failed_parquet_row(value: &Value, err: &ParquetError) -> Self { 75 | let timestamp = Utc::now(); 76 | Self { 77 | base64_bytes: None, 78 | json_string: Some(value.to_string()), 79 | error: Some(err.to_string()), 80 | timestamp: timestamp 81 | .timestamp_nanos_opt() 82 | .expect("Failed to convert timezone to nanoseconds") 83 | / 1000, 84 | } 85 | } 86 | 87 | /// Creates a vector of tuples where the first element is the 88 | /// stringified JSON value that was not writeable to parquet and 89 | /// the second element is the `ParquetError` that occurred for that record when attempting the write. 90 | pub(crate) fn vec_from_failed_parquet_rows(failed: Vec<(Value, ParquetError)>) -> Vec { 91 | failed 92 | .iter() 93 | .map(|(v, e)| Self::from_failed_parquet_row(v, e)) 94 | .collect() 95 | } 96 | } 97 | 98 | /// Error returned when a dead letter write fails. 99 | #[derive(thiserror::Error, Debug)] 100 | pub enum DeadLetterQueueError { 101 | /// Error returned when JSON serialization of a [DeadLetter] fails. 102 | #[error("JSON serialization failed: {source}")] 103 | SerdeJson { 104 | #[from] 105 | source: serde_json::Error, 106 | }, 107 | 108 | /// Error returned when a write to the dead letter delta table used by [DeltaSinkDeadLetterQueue] fails. 109 | #[error("Write failed: {source}")] 110 | Writer { 111 | #[from] 112 | source: Box, 113 | }, 114 | 115 | /// Error returned by the internal dead letter transformer. 116 | #[error("TransformError: {source}")] 117 | Transform { 118 | #[from] 119 | source: TransformError, 120 | }, 121 | 122 | /// DeltaTable returned an error. 123 | #[error("DeltaTable interaction failed: {source}")] 124 | DeltaTable { 125 | /// The wrapped [`DeltaTableError`] 126 | #[from] 127 | source: DeltaTableError, 128 | }, 129 | 130 | /// Error returned when the DeltaSinkDeadLetterQueue is used but no table uri is specified. 131 | #[error("No table_uri for DeltaSinkDeadLetterQueue")] 132 | NoTableUri, 133 | } 134 | 135 | /// Options that should be passed to `dlq_from_opts` to create the desired [DeadLetterQueue] instance. 136 | pub(crate) struct DeadLetterQueueOptions { 137 | /// Table URI of the delta table to write dead letters to. Implies usage of the DeltaSinkDeadLetterQueue. 
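/// For example, opting into the delta sink could look like the following sketch (the table
/// URI is hypothetical):
///
/// ```ignore
/// let options = DeadLetterQueueOptions {
///     delta_table_uri: Some("s3://my-bucket/dead_letters".to_string()),
///     dead_letter_transforms: HashMap::new(),
///     write_checkpoints: true,
/// };
/// let mut dlq = dlq_from_opts(options).await?;
/// ```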
138 | pub delta_table_uri: Option, 139 | /// A list of transforms to apply to dead letters before writing to delta. 140 | pub dead_letter_transforms: HashMap, 141 | /// Whether to write checkpoints on every 10th version of the dead letter table. 142 | pub write_checkpoints: bool, 143 | } 144 | 145 | /// Trait that defines a dead letter queue interface. 146 | /// Available implementations are: 147 | /// * [NoopDeadLetterQueue] (the default) 148 | /// * [DeltaSinkDeadLetterQueue] 149 | /// * [LoggingDeadLetterQueue] 150 | /// 151 | /// The [LoggingDeadLetterQueue] is intended for local development only 152 | /// and is not provided by the [dlq_from_opts] factory method. 153 | #[async_trait] 154 | pub(crate) trait DeadLetterQueue: Send { 155 | /// Writes one [DeadLetter] to the [DeadLetterQueue]. 156 | async fn write_dead_letter( 157 | &mut self, 158 | dead_letter: DeadLetter, 159 | ) -> Result<(), DeadLetterQueueError> { 160 | self.write_dead_letters(vec![dead_letter]).await 161 | } 162 | 163 | /// Writes a vector of [DeadLetter]s to the [DeadLetterQueue] 164 | async fn write_dead_letters( 165 | &mut self, 166 | dead_letters: Vec, 167 | ) -> Result<(), DeadLetterQueueError>; 168 | } 169 | 170 | /// Factory method for creating a [DeadLetterQueue] based on the passed options. 171 | /// The default implementation is [NoopDeadLetterQueue]. 172 | /// To opt-in for the [DeltaSinkDeadLetterQueue], the `delta_table_uri` should be set in options. 173 | pub(crate) async fn dlq_from_opts( 174 | options: DeadLetterQueueOptions, 175 | ) -> Result, DeadLetterQueueError> { 176 | if options.delta_table_uri.is_some() { 177 | Ok(Box::new( 178 | DeltaSinkDeadLetterQueue::from_options(options).await?, 179 | )) 180 | } else { 181 | Ok(Box::new(NoopDeadLetterQueue {})) 182 | } 183 | } 184 | 185 | /// Default implementation of [DeadLetterQueue] which does nothing. 186 | /// This is used as the default to avoid forcing users to setup additional infrastructure for capturing dead letters. 187 | /// and avoid any risk of exposing PII in logs, 188 | pub(crate) struct NoopDeadLetterQueue {} 189 | 190 | #[async_trait] 191 | impl DeadLetterQueue for NoopDeadLetterQueue { 192 | async fn write_dead_letters( 193 | &mut self, 194 | _dead_letters: Vec, 195 | ) -> Result<(), DeadLetterQueueError> { 196 | // noop 197 | Ok(()) 198 | } 199 | } 200 | 201 | /// Implementation of the [DeadLetterQueue] trait that writes dead letter content as warn logs. 202 | /// This implementation is currently only intended for debug development usage. 203 | /// Be mindful of your PII when using this implementation. 204 | #[allow(dead_code)] 205 | pub(crate) struct LoggingDeadLetterQueue {} 206 | 207 | #[async_trait] 208 | impl DeadLetterQueue for LoggingDeadLetterQueue { 209 | async fn write_dead_letters( 210 | &mut self, 211 | dead_letters: Vec, 212 | ) -> Result<(), DeadLetterQueueError> { 213 | for dead_letter in dead_letters { 214 | warn!("DeadLetter: {:?}", dead_letter); 215 | } 216 | 217 | Ok(()) 218 | } 219 | } 220 | 221 | /// Implementation of the [DeadLetterQueue] trait that writes dead letters to a delta table. 222 | /// NOTE: The delta table where dead letters are written must be created beforehand 223 | /// and be based on the [DeadLetter] struct. 224 | /// DeadLetter transforms may be specified to enrich the serialized [DeadLetter] before writing it to the table. 
225 | /// 226 | /// For example, given a delta table schema created from: 227 | /// 228 | /// ```sql 229 | /// CREATE TABLE `kafka_delta_ingest`.`dead_letters` ( 230 | /// `base64_bytes` STRING COMMENT 'Base 64 encoded bytes of a message that failed deserialization.', 231 | /// `json_string` STRING COMMENT 'JSON string captured when either transform or parquet write fails for a message.', 232 | /// `error` STRING COMMENT 'Error string captured when the dead letter failed.', 233 | /// `timestamp` TIMESTAMP COMMENT 'Timestamp when the dead letter was created.', 234 | /// `date` STRING COMMENT '(e.g. 2021-01-01) Date when the dead letter was created.') 235 | /// USING DELTA 236 | /// PARTITIONED BY (date) 237 | /// ``` 238 | /// 239 | /// A dead letter transform with key: `date` and value: `substr(epoch_micros_to_iso8601(timestamp),`0`,`10`)` should be provided to generate the `date` field. 240 | pub(crate) struct DeltaSinkDeadLetterQueue { 241 | table: DeltaTable, 242 | delta_writer: DataWriter, 243 | transformer: Transformer, 244 | write_checkpoints: bool, 245 | } 246 | 247 | impl DeltaSinkDeadLetterQueue { 248 | pub(crate) async fn from_options( 249 | options: DeadLetterQueueOptions, 250 | ) -> Result { 251 | match &options.delta_table_uri { 252 | Some(table_uri) => { 253 | #[cfg(feature = "s3")] 254 | let opts = hashmap! { 255 | dynamo_lock_options::DYNAMO_LOCK_PARTITION_KEY_VALUE.to_string() => std::env::var(env_vars::DEAD_LETTER_DYNAMO_LOCK_PARTITION_KEY_VALUE) 256 | .unwrap_or_else(|_| "kafka_delta_ingest-dead_letters".to_string()), 257 | }; 258 | #[cfg(all(feature = "azure", not(feature = "s3")))] 259 | let opts = HashMap::default(); 260 | 261 | let table = crate::delta_helpers::load_table(table_uri, opts.clone()).await?; 262 | let delta_writer = DataWriter::for_table(&table, opts)?; 263 | 264 | Ok(Self { 265 | table, 266 | delta_writer, 267 | transformer: Transformer::from_transforms(&options.dead_letter_transforms)?, 268 | write_checkpoints: options.write_checkpoints, 269 | }) 270 | } 271 | _ => Err(DeadLetterQueueError::NoTableUri), 272 | } 273 | } 274 | } 275 | 276 | #[async_trait] 277 | impl DeadLetterQueue for DeltaSinkDeadLetterQueue { 278 | /// Writes dead letters to the delta table specified in [DeadLetterQueueOptions]. 279 | /// Transforms specified in [DeadLetterQueueOptions] are applied before write. 280 | /// If the `write_checkpoints` options is specified - writes a checkpoint on every version divisible by 10. 
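/// A hedged usage sketch (the failing payload and error text are illustrative):
///
/// ```ignore
/// let dead_letter =
///     DeadLetter::from_failed_deserialization(b"not-json", "invalid JSON".to_string());
/// dlq.write_dead_letters(vec![dead_letter]).await?;
/// ```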
281 | async fn write_dead_letters( 282 | &mut self, 283 | dead_letters: Vec, 284 | ) -> Result<(), DeadLetterQueueError> { 285 | let values: Result, _> = dead_letters 286 | .iter() 287 | .map(|dl| { 288 | serde_json::to_value(dl) 289 | .map_err(|e| DeadLetterQueueError::SerdeJson { source: e }) 290 | .and_then(|mut v| { 291 | self.transformer 292 | .transform(&mut v, None as Option<&BorrowedMessage>)?; 293 | Ok(v) 294 | }) 295 | }) 296 | .collect(); 297 | let values = values?; 298 | 299 | let version = self 300 | .delta_writer 301 | .insert_all(&mut self.table, values) 302 | .await?; 303 | 304 | if self.write_checkpoints { 305 | crate::delta_helpers::try_create_checkpoint(&mut self.table, version).await?; 306 | } 307 | 308 | info!( 309 | "Inserted {} dead letters to {}", 310 | dead_letters.len(), 311 | self.table.table_uri(), 312 | ); 313 | 314 | Ok(()) 315 | } 316 | } 317 | -------------------------------------------------------------------------------- /src/delta_helpers.rs: -------------------------------------------------------------------------------- 1 | use crate::{DataTypeOffset, DataTypePartition}; 2 | use deltalake_core::kernel::{Action, Add, Transaction}; 3 | use deltalake_core::{DeltaTable, DeltaTableError}; 4 | use std::collections::HashMap; 5 | 6 | pub(crate) async fn load_table( 7 | table_uri: &str, 8 | options: HashMap, 9 | ) -> Result { 10 | let mut table = deltalake_core::open_table_with_storage_options(table_uri, options).await?; 11 | table.load().await?; 12 | Ok(table) 13 | } 14 | 15 | pub(crate) fn build_actions( 16 | partition_offsets: &HashMap, 17 | app_id: &str, 18 | mut add: Vec, 19 | ) -> Vec { 20 | partition_offsets 21 | .iter() 22 | .map(|(partition, offset)| { 23 | create_txn_action(txn_app_id_for_partition(app_id, *partition), *offset) 24 | }) 25 | .chain(add.drain(..).map(Action::Add)) 26 | .collect() 27 | } 28 | 29 | pub(crate) fn create_txn_action(txn_app_id: String, offset: DataTypeOffset) -> Action { 30 | Action::Txn(Transaction { 31 | app_id: txn_app_id, 32 | version: offset, 33 | last_updated: Some( 34 | std::time::SystemTime::now() 35 | .duration_since(std::time::UNIX_EPOCH) 36 | .unwrap() 37 | .as_millis() as i64, 38 | ), 39 | }) 40 | } 41 | 42 | pub(crate) async fn try_create_checkpoint( 43 | table: &mut DeltaTable, 44 | version: i64, 45 | ) -> Result<(), DeltaTableError> { 46 | if version % 10 == 0 { 47 | let table_version = table.version(); 48 | // if there's new version right after current commit, then we need to reset 49 | // the table right back to version to create the checkpoint 50 | let version_updated = table_version != version; 51 | if version_updated { 52 | table.load_version(version).await?; 53 | } 54 | 55 | deltalake_core::checkpoints::create_checkpoint(table, None).await?; 56 | log::info!("Created checkpoint version {}.", version); 57 | 58 | let removed = deltalake_core::checkpoints::cleanup_metadata(table, None).await?; 59 | if removed > 0 { 60 | log::info!("Metadata cleanup, removed {} obsolete logs.", removed); 61 | } 62 | 63 | if version_updated { 64 | table.update().await?; 65 | } 66 | } 67 | Ok(()) 68 | } 69 | 70 | pub(crate) fn txn_app_id_for_partition(app_id: &str, partition: DataTypePartition) -> String { 71 | format!("{}-{}", app_id, partition) 72 | } 73 | 74 | /// Returns the last transaction version for the given transaction id recorded in the delta table. 
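/// A small sketch (the app id is hypothetical); txn ids follow the `<app_id>-<partition>`
/// convention produced by [`txn_app_id_for_partition`]:
///
/// ```ignore
/// let txn_id = txn_app_id_for_partition("my_ingest_app", 0);
/// if let Some(offset) = last_txn_version(&table, &txn_id) {
///     // resume consuming partition 0 after `offset`
/// }
/// ```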
75 | pub(crate) fn last_txn_version(table: &DeltaTable, txn_id: &str) -> Option { 76 | table 77 | .get_app_transaction_version() 78 | .get(txn_id) 79 | .map(|t| t.version) 80 | } 81 | -------------------------------------------------------------------------------- /src/metrics.rs: -------------------------------------------------------------------------------- 1 | use dipstick::*; 2 | use log::error; 3 | use std::convert::TryInto; 4 | use std::time::Instant; 5 | 6 | /// The environment variable used to specify how many metrics should be written to the metrics queue before flushing to statsd. 7 | const METRICS_INPUT_QUEUE_SIZE_VAR_NAME: &str = "KDI_METRICS_INPUT_QUEUE_SIZE"; 8 | /// The environment variable used to specify a prefix for metrics. 9 | const METRICS_PREFIX_VAR_NAME: &str = "KDI_METRICS_PREFIX"; 10 | 11 | /// The default input queue size for sending metrics to statsd. 12 | const DEFAULT_INPUT_QUEUE_SIZE: usize = 100; 13 | 14 | /// Error returned when there is a failure in [`IngestMetrics`]. 15 | #[derive(thiserror::Error, Debug)] 16 | pub enum IngestMetricsError { 17 | /// Error returned when the environment variable provided for METRICS_INPUT_QUEUE_SIZE could not be parsed 18 | #[error("Could not parse {0} provided in METRICS_INPUT_QUEUE_SIZE env variable")] 19 | InvalidMetricsInputQueueSize(String), 20 | } 21 | 22 | /// Wraps a [`dipstick::queue::InputQueueScope`] to provide a higher level API for recording metrics. 23 | #[derive(Clone)] 24 | pub(crate) struct IngestMetrics { 25 | metrics: InputQueueScope, 26 | } 27 | 28 | impl IngestMetrics { 29 | /// Creates an instance of [`IngestMetrics`] for sending metrics to statsd. 30 | pub(crate) fn new(endpoint: &str) -> Result { 31 | let metrics = create_queue(endpoint)?; 32 | 33 | Ok(Self { metrics }) 34 | } 35 | 36 | /// increments a counter for message deserialized 37 | pub fn message_deserialized(&self) { 38 | self.record_one(StatType::MessageDeserialized); 39 | } 40 | 41 | /// increments a counter for message deserialization failed 42 | pub fn message_deserialization_failed(&self) { 43 | self.record_one(StatType::MessageDeserializationFailed); 44 | } 45 | 46 | /// records a guage stat for message size 47 | pub fn message_deserialized_size(&self, size: usize) { 48 | self.record_stat(StatType::MessageSize, size as i64); 49 | } 50 | 51 | /// increments a counter for message transformed 52 | pub fn message_transformed(&self) { 53 | self.record_one(StatType::MessageTransformed); 54 | } 55 | 56 | /// increments a counter for message transform failed 57 | pub fn message_transform_failed(&self) { 58 | self.record_one(StatType::MessageTransformFailed); 59 | } 60 | 61 | /// increments a counter for record batch started 62 | pub fn batch_started(&self) { 63 | self.record_one(StatType::RecordBatchStarted); 64 | } 65 | 66 | /// increments a counter for record batch completed. 67 | /// records a guage stat for buffered record batches. 68 | /// records a timer stat for record batch write duration. 
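/// A minimal calling sketch (the batch count and surrounding write loop are illustrative):
///
/// ```ignore
/// let timer = Instant::now();
/// // ... flush the record batch ...
/// metrics.batch_completed(buffered_record_batch_count, &timer);
/// ```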
69 | pub fn batch_completed(&self, buffered_record_batch_count: usize, timer: &Instant) { 70 | let duration = timer.elapsed().as_millis() as i64; 71 | self.record_one(StatType::RecordBatchCompleted); 72 | self.record_stat( 73 | StatType::BufferedRecordBatches, 74 | buffered_record_batch_count as i64, 75 | ); 76 | self.record_stat(StatType::RecordBatchWriteDuration, duration); 77 | } 78 | 79 | /// increments a counter for delta write started 80 | pub fn delta_write_started(&self) { 81 | self.record_one(StatType::DeltaWriteStarted); 82 | } 83 | 84 | /// increments a counter for delta write started. 85 | /// records a timer stat for delta write duration. 86 | pub fn delta_write_completed(&self, timer: &Instant) { 87 | let duration = timer.elapsed().as_millis() as i64; 88 | self.record_one(StatType::DeltaWriteCompleted); 89 | self.record_stat(StatType::DeltaWriteDuration, duration); 90 | } 91 | 92 | /// increments a counter for delta write failed. 93 | pub fn delta_write_failed(&self) { 94 | self.record_one(StatType::DeltaWriteFailed); 95 | } 96 | 97 | /// records a guage for delta file size. 98 | pub fn delta_file_size(&self, size: i64) { 99 | self.record_stat(StatType::DeltaAddFileSize, size); 100 | } 101 | 102 | /// records total, max, and min consumer lag for offsets held in buffer. 103 | /// also records the number of partitions represented by the buffer lag vector. 104 | pub fn buffer_lag(&self, buffer_lags: Vec) { 105 | let lag_metrics = self.calculate_lag_metrics(buffer_lags); 106 | 107 | self.record_stat(StatType::BufferNumPartitions, lag_metrics.num_partitions); 108 | self.record_stat(StatType::BufferLagTotal, lag_metrics.total); 109 | 110 | if let Some(max) = lag_metrics.max { 111 | self.record_stat(StatType::BufferLagMax, max); 112 | } 113 | 114 | if let Some(min) = lag_metrics.min { 115 | self.record_stat(StatType::BufferLagMin, min); 116 | } 117 | } 118 | 119 | /// records total, max, and min consumer lag for offsets written to delta. 120 | /// also records the number of partitions represented by the write lag vector. 121 | pub fn delta_lag(&self, write_lags: Vec) { 122 | let lag_metrics = self.calculate_lag_metrics(write_lags); 123 | 124 | self.record_stat( 125 | StatType::DeltaWriteNumPartitions, 126 | lag_metrics.num_partitions, 127 | ); 128 | self.record_stat(StatType::DeltaWriteLagTotal, lag_metrics.total); 129 | 130 | if let Some(max) = lag_metrics.max { 131 | self.record_stat(StatType::DeltaWriteLagMax, max); 132 | } 133 | 134 | if let Some(min) = lag_metrics.min { 135 | self.record_stat(StatType::DeltaWriteLagMin, min); 136 | } 137 | } 138 | 139 | /// Calculates total, max, min and num_partitions from the vector of lags. 140 | fn calculate_lag_metrics(&self, lags: Vec) -> LagMetrics { 141 | let total: i64 = lags.iter().sum(); 142 | let max = lags.iter().max().copied(); 143 | let min = lags.iter().min().copied(); 144 | let num_partitions = lags.len() as i64; 145 | 146 | LagMetrics { 147 | total, 148 | max, 149 | min, 150 | num_partitions, 151 | } 152 | } 153 | 154 | /// Records a count of 1 for the metric. 155 | fn record_one(&self, stat_type: StatType) { 156 | self.record_stat(stat_type, 1); 157 | } 158 | 159 | /// Records a metric for the given [`StatType`] with the given value. 
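/// For example, `StatType::MessageSize` is routed to the gauge branch below, while
/// `StatType::DeltaWriteDuration` is routed to the timer branch.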
160 | fn record_stat(&self, stat_type: StatType, val: i64) { 161 | match stat_type { 162 | // timers 163 | StatType::RecordBatchWriteDuration | StatType::DeltaWriteDuration => { 164 | self.handle_timer(stat_type, val); 165 | } 166 | 167 | // gauges 168 | StatType::BufferedRecordBatches 169 | | StatType::MessageSize 170 | | StatType::DeltaAddFileSize 171 | | StatType::BufferNumPartitions 172 | | StatType::BufferLagTotal 173 | | StatType::BufferLagMax 174 | | StatType::BufferLagMin 175 | | StatType::DeltaWriteNumPartitions 176 | | StatType::DeltaWriteLagTotal 177 | | StatType::DeltaWriteLagMax 178 | | StatType::DeltaWriteLagMin => { 179 | self.handle_gauge(stat_type, val); 180 | } 181 | 182 | // counters 183 | _ => { 184 | self.handle_counter(stat_type, val); 185 | } 186 | } 187 | } 188 | 189 | /// Records a timer metric for the given [`StatType`]. 190 | fn handle_timer(&self, stat_type: StatType, duration_us: i64) { 191 | let stat_string = stat_type.to_string(); 192 | 193 | if let Ok(duration) = duration_us.try_into() { 194 | self.metrics 195 | .timer(stat_string.as_str()) 196 | .interval_us(duration); 197 | } else { 198 | error!("Failed to report timer to statsd with an i64 that couldn't fit into u64."); 199 | } 200 | } 201 | 202 | /// Records a gauge metric for the given [`StatType`]. 203 | fn handle_gauge(&self, stat_type: StatType, count: i64) { 204 | let stat_string = stat_type.to_string(); 205 | let key = stat_string.as_str(); 206 | 207 | self.metrics.gauge(key).value(count); 208 | } 209 | 210 | /// Records a counter metric for the given [`StatType`]. 211 | fn handle_counter(&self, stat_type: StatType, count: i64) { 212 | let stat_string = stat_type.to_string(); 213 | let key = stat_string.as_str(); 214 | 215 | let sized_count: usize = count.try_into().expect("Could not convert to usize"); 216 | 217 | self.metrics.counter(key).count(sized_count); 218 | } 219 | } 220 | 221 | /// Stat types for the various metrics reported by the application 222 | #[derive(Debug, Display, Hash, PartialEq, Eq)] 223 | enum StatType { 224 | // 225 | // counters 226 | // 227 | /// Counter for a deserialized message. 228 | #[strum(serialize = "messages.deserialization.completed")] 229 | MessageDeserialized, 230 | /// Counter for a message that failed deserialization. 231 | #[strum(serialize = "messages.deserialization.failed")] 232 | MessageDeserializationFailed, 233 | /// Counter for a transformed message. 234 | #[strum(serialize = "messages.transform.completed")] 235 | MessageTransformed, 236 | /// Counter for a message that failed transformation. 237 | #[strum(serialize = "messages.transform.failed")] 238 | MessageTransformFailed, 239 | /// Counter for when a record batch is started. 240 | #[strum(serialize = "recordbatch.started")] 241 | RecordBatchStarted, 242 | /// Counter for when a record batch is completed. 243 | #[strum(serialize = "recordbatch.completed")] 244 | RecordBatchCompleted, 245 | /// Counter for when a delta write is started. 246 | #[strum(serialize = "delta.write.started")] 247 | DeltaWriteStarted, 248 | /// Counter for when a delta write is completed. 249 | #[strum(serialize = "delta.write.completed")] 250 | DeltaWriteCompleted, 251 | /// Counter for failed delta writes. 252 | #[strum(serialize = "delta.write.failed")] 253 | DeltaWriteFailed, 254 | 255 | // 256 | // timers 257 | // 258 | /// Timer for record batch write duration. 259 | #[strum(serialize = "recordbatch.write_duration")] 260 | RecordBatchWriteDuration, 261 | /// Timer for delta write duration. 
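/// Reported to statsd as `delta.write.duration` under the configured metrics prefix
/// (`kafka_delta_ingest` by default).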
262 | #[strum(serialize = "delta.write.duration")] 263 | DeltaWriteDuration, 264 | 265 | // 266 | // gauges 267 | // 268 | /// Gauge for number of Arrow record batches in buffer. 269 | #[strum(serialize = "buffered.record_batches")] 270 | BufferedRecordBatches, 271 | /// Gauge for message size. 272 | #[strum(serialize = "messages.size")] 273 | MessageSize, 274 | /// Gauge for Delta add file size. 275 | #[strum(serialize = "delta.add.size")] 276 | DeltaAddFileSize, 277 | /// Gauge for the number of partitions in buffer. 278 | #[strum(serialize = "buffer.lag.num_partitions")] 279 | BufferNumPartitions, 280 | /// Gauge for total buffer lag across all partitions. 281 | #[strum(serialize = "buffer.lag.total")] 282 | BufferLagTotal, 283 | /// Gauge for max buffer lag across all partitions. 284 | #[strum(serialize = "buffer.lag.max")] 285 | BufferLagMax, 286 | /// Gauge for min buffer lag across all partitions. 287 | #[strum(serialize = "buffer.lag.min")] 288 | BufferLagMin, 289 | /// Gauge for the number of partitions in the last delta write. 290 | #[strum(serialize = "delta.write.lag.num_partitions")] 291 | DeltaWriteNumPartitions, 292 | /// Gauge for total delta write lag across all partitions. 293 | #[strum(serialize = "delta.write.lag.total")] 294 | DeltaWriteLagTotal, 295 | /// Gauge for max delta write lag across all partitions. 296 | #[strum(serialize = "delta.write.lag.max")] 297 | DeltaWriteLagMax, 298 | /// Gauge for min delta write lag across all partitions. 299 | #[strum(serialize = "delta.write.lag.min")] 300 | DeltaWriteLagMin, 301 | } 302 | 303 | /// Struct representing aggregate lag metrics calculated from a vector of partition lags. 304 | struct LagMetrics { 305 | total: i64, 306 | max: Option, 307 | min: Option, 308 | num_partitions: i64, 309 | } 310 | 311 | /// Creates a statsd metric scope to send metrics to. 312 | fn create_queue(endpoint: &str) -> Result { 313 | let input_queue_size = if let Ok(val) = std::env::var(METRICS_INPUT_QUEUE_SIZE_VAR_NAME) { 314 | val.parse::() 315 | .map_err(|_| IngestMetricsError::InvalidMetricsInputQueueSize(val))? 316 | } else { 317 | DEFAULT_INPUT_QUEUE_SIZE 318 | }; 319 | 320 | let prefix = 321 | std::env::var(METRICS_PREFIX_VAR_NAME).unwrap_or_else(|_| "kafka_delta_ingest".to_string()); 322 | 323 | let scope = Statsd::send_to(endpoint) 324 | .unwrap() 325 | // don't send stats immediately - 326 | // wait to trigger on input queue size 327 | .queued(input_queue_size) 328 | .named(prefix) 329 | .metrics(); 330 | 331 | Ok(scope) 332 | } 333 | -------------------------------------------------------------------------------- /src/offsets.rs: -------------------------------------------------------------------------------- 1 | use crate::delta_helpers::*; 2 | use crate::{DataTypeOffset, DataTypePartition}; 3 | use deltalake_core::kernel::transaction::CommitBuilder; 4 | use deltalake_core::kernel::transaction::TableReference; 5 | use deltalake_core::kernel::Action; 6 | use deltalake_core::protocol::DeltaOperation; 7 | use deltalake_core::protocol::OutputMode; 8 | use deltalake_core::{DeltaTable, DeltaTableError}; 9 | use log::{error, info}; 10 | 11 | /// Errors returned by `write_offsets_to_delta` function. 12 | #[derive(thiserror::Error, Debug)] 13 | pub enum WriteOffsetsError { 14 | /// Error returned when stored offsets in delta table are lower than provided seek offsets. 
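/// The wrapped string lists each conflicting partition as
/// `<txn_app_id>:stored=<stored_offset>/seek=<provided_offset>`, e.g.
/// `[test-0:stored=5/seek=15]`.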
15 | #[error("Stored offsets are lower than provided: {0}")] 16 | InconsistentStoredOffsets(String), 17 | 18 | /// Error from [`deltalake::DeltaTable`] 19 | #[error("DeltaTable interaction failed: {source}")] 20 | DeltaTable { 21 | /// Wrapped [`deltalake::DeltaTableError`] 22 | #[from] 23 | source: DeltaTableError, 24 | }, 25 | } 26 | 27 | /// Write provided seeking offsets as a new delta log version with a set of `txn` actions. 28 | /// The `txn` id for each partition is constructed as `-` and used across 29 | /// kafka-delta-ingest to track messages offsets and protect from data duplication. 30 | /// 31 | /// However, if table has already stored offset for given app_id/partition then this action could 32 | /// be ignored if stored offsets are equals or greater than provided seeking offsets. 33 | /// But, if stored offsets are lower then the `InconsistentStoredOffsets` is returned since it 34 | /// could break the data integrity. 35 | /// Hence, one is advised to supply the new `app_id` if skipping offsets is what required. 36 | pub(crate) async fn write_offsets_to_delta( 37 | table: &mut DeltaTable, 38 | app_id: &str, 39 | offsets: &[(DataTypePartition, DataTypeOffset)], 40 | ) -> Result<(), WriteOffsetsError> { 41 | let offsets_as_str: String = offsets 42 | .iter() 43 | .map(|(p, o)| format!("{}:{}", p, o)) 44 | .collect::>() 45 | .join(","); 46 | 47 | info!("Writing offsets [{}]", offsets_as_str); 48 | 49 | let mapped_offsets: Vec<(String, DataTypeOffset)> = offsets 50 | .iter() 51 | .map(|(p, o)| (txn_app_id_for_partition(app_id, *p), *o)) 52 | .collect(); 53 | 54 | if is_safe_to_commit_transactions(table, &mapped_offsets) { 55 | // table has no stored offsets for given app_id/partitions so it is safe to write txn actions 56 | commit_partition_offsets(table, mapped_offsets, &offsets_as_str, app_id.to_owned()).await?; 57 | Ok(()) 58 | } else { 59 | // there's at least one app_id/partition stored in delta, 60 | // checking whether it's safe to proceed further 61 | let mut conflict_offsets = Vec::new(); 62 | 63 | for (txn_app_id, offset) in mapped_offsets { 64 | match table.get_app_transaction_version().get(&txn_app_id) { 65 | Some(stored_offset) if stored_offset.version < offset => { 66 | conflict_offsets.push((txn_app_id, stored_offset.version, offset)); 67 | } 68 | _ => (), 69 | } 70 | } 71 | 72 | if conflict_offsets.is_empty() { 73 | // there's no conflicted offsets in delta, e.g. 
it's either missing or is higher than seek offset 74 | info!("The provided offsets are already applied."); 75 | Ok(()) 76 | } else { 77 | let partitions = conflict_offsets 78 | .iter() 79 | .map(|p| p.0.split('-').next_back().unwrap_or("N/A")) 80 | .collect::>() 81 | .join(","); 82 | 83 | error!( 84 | "Stored offsets for partitions [{}] are lower than seek offsets.", 85 | partitions 86 | ); 87 | 88 | let detailed_error_msg = conflict_offsets 89 | .iter() 90 | .map(|(partition, stored, provided)| { 91 | format!("{}:stored={}/seek={}", partition, stored, provided) 92 | }) 93 | .collect::>() 94 | .join(", "); 95 | 96 | Err(WriteOffsetsError::InconsistentStoredOffsets(format!( 97 | "[{}]", 98 | detailed_error_msg 99 | ))) 100 | } 101 | } 102 | } 103 | 104 | async fn commit_partition_offsets( 105 | table: &mut DeltaTable, 106 | offsets: Vec<(String, DataTypeOffset)>, 107 | offsets_as_str: &str, 108 | app_id: String, 109 | ) -> Result<(), DeltaTableError> { 110 | let actions: Vec = offsets 111 | .iter() 112 | .map(|(txn_id, offset)| create_txn_action(txn_id.to_string(), *offset)) 113 | .collect(); 114 | let epoch_id = std::time::SystemTime::now() 115 | .duration_since(std::time::UNIX_EPOCH) 116 | .expect("Time went backwards") 117 | .as_millis() as i64; 118 | 119 | table.update().await?; 120 | let commit = CommitBuilder::default() 121 | .with_actions(actions) 122 | .build( 123 | table.state.as_ref().map(|s| s as &dyn TableReference), 124 | table.log_store().clone(), 125 | DeltaOperation::StreamingUpdate { 126 | output_mode: OutputMode::Complete, 127 | query_id: app_id, 128 | epoch_id, 129 | }, 130 | ) 131 | .await; 132 | match commit { 133 | Ok(v) => { 134 | info!( 135 | "Delta version {} completed with new txn offsets {}.", 136 | v.version, offsets_as_str 137 | ); 138 | Ok(()) 139 | } 140 | Err(e) => match e { 141 | DeltaTableError::VersionAlreadyExists(_) => { 142 | error!("Transaction attempt failed. 
Attempts exhausted beyond max_retry_commit_attempts of {} so failing", crate::DEFAULT_DELTA_MAX_RETRY_COMMIT_ATTEMPTS); 143 | Err(e) 144 | } 145 | _ => Err(e), 146 | }, 147 | } 148 | } 149 | 150 | fn is_safe_to_commit_transactions( 151 | table: &DeltaTable, 152 | offsets: &[(String, DataTypeOffset)], 153 | ) -> bool { 154 | offsets 155 | .iter() 156 | .all(|(id, _)| !table.get_app_transaction_version().contains_key(id)) 157 | } 158 | 159 | #[cfg(test)] 160 | mod tests { 161 | use super::*; 162 | use std::path::Path; 163 | use uuid::Uuid; 164 | 165 | const VERSION_0: &str = r#"{"commitInfo":{"timestamp":1564524295023,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"isBlindAppend":true}} 166 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 167 | {"metaData":{"id":"22ef18ba-191c-4c36-a606-3dad5cdf3830","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1564524294376}} 168 | "#; 169 | 170 | #[tokio::test] 171 | async fn write_offsets_to_delta_test() { 172 | env_logger::init(); 173 | 174 | let mut table = create_table().await; 175 | 176 | let offsets = vec![(0, 5), (1, 10)]; 177 | 178 | // Test successful write 179 | assert_eq!(table.version(), 0); 180 | write_offsets_to_delta(&mut table, "test", &offsets) 181 | .await 182 | .unwrap(); 183 | 184 | // verify that txn action is written 185 | table.update().await.unwrap(); 186 | assert_eq!(table.version(), 1); 187 | assert_eq!( 188 | table 189 | .get_app_transaction_version() 190 | .get("test-0") 191 | .unwrap() 192 | .version, 193 | 5 194 | ); 195 | assert_eq!( 196 | table 197 | .get_app_transaction_version() 198 | .get("test-1") 199 | .unwrap() 200 | .version, 201 | 10 202 | ); 203 | 204 | // Test ignored write 205 | write_offsets_to_delta(&mut table, "test", &offsets) 206 | .await 207 | .unwrap(); 208 | 209 | // verify that txn action is not written 210 | table.update().await.unwrap(); 211 | assert_eq!(table.version(), 1); 212 | 213 | // Test failed write (lower stored offsets) 214 | let offsets = vec![(0, 15)]; 215 | let err = write_offsets_to_delta(&mut table, "test", &offsets) 216 | .await 217 | .err() 218 | .unwrap(); 219 | let err = format!("{:?}", err); 220 | 221 | assert_eq!( 222 | err.as_str(), 223 | "InconsistentStoredOffsets(\"[test-0:stored=5/seek=15]\")" 224 | ); 225 | 226 | std::fs::remove_dir_all(table.table_uri()).unwrap(); 227 | } 228 | 229 | async fn create_table() -> DeltaTable { 230 | let table_path = format!("./tests/gen/table-{}", Uuid::new_v4()); 231 | let v0_path = format!("{}/_delta_log/00000000000000000000.json", &table_path); 232 | std::fs::create_dir_all(Path::new(&v0_path).parent().unwrap()).unwrap(); 233 | std::fs::write(&v0_path, VERSION_0).unwrap(); 234 | deltalake_core::open_table(&table_path).await.unwrap() 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /src/serialization.rs: -------------------------------------------------------------------------------- 1 | use crate::{dead_letters::DeadLetter, MessageDeserializationError, MessageFormat}; 2 | use async_trait::async_trait; 3 | use dashmap::DashMap; 4 | use flate2::read::GzDecoder; 5 | use schema_registry_converter::async_impl::{ 6 | easy_avro::EasyAvroDecoder, 
easy_json::EasyJsonDecoder, schema_registry::SrSettings, 7 | }; 8 | use serde_json::Value; 9 | 10 | // use crate::avro_canonical_schema_workaround::parse_into_canonical_form; 11 | use apache_avro::{rabin::Rabin, GenericSingleObjectReader, Schema}; 12 | use std::{ 13 | borrow::BorrowMut, 14 | convert::{TryFrom, TryInto}, 15 | io::{Cursor, Read}, 16 | path::PathBuf, 17 | }; 18 | 19 | use log::debug; 20 | 21 | #[async_trait] 22 | pub(crate) trait MessageDeserializer { 23 | async fn deserialize( 24 | &mut self, 25 | message_bytes: &[u8], 26 | ) -> Result; 27 | } 28 | 29 | pub(crate) struct MessageDeserializerFactory {} 30 | 31 | impl MessageDeserializerFactory { 32 | pub fn try_build( 33 | input_format: &MessageFormat, 34 | decompress_gzip: bool, // Add this parameter 35 | ) -> Result, anyhow::Error> { 36 | match input_format { 37 | MessageFormat::Json(data) => match data { 38 | crate::SchemaSource::None => Ok(Self::json_default(decompress_gzip)), 39 | crate::SchemaSource::SchemaRegistry(sr) => { 40 | match Self::build_sr_settings(sr).map(JsonDeserializer::from_schema_registry) { 41 | Ok(s) => Ok(Box::new(s)), 42 | Err(e) => Err(e), 43 | } 44 | } 45 | crate::SchemaSource::File(_) => Ok(Self::json_default(decompress_gzip)), 46 | }, 47 | MessageFormat::Avro(data) => match data { 48 | crate::SchemaSource::None => Ok(Box::::default()), 49 | crate::SchemaSource::SchemaRegistry(sr) => { 50 | match Self::build_sr_settings(sr).map(AvroDeserializer::from_schema_registry) { 51 | Ok(s) => Ok(Box::new(s)), 52 | Err(e) => Err(e), 53 | } 54 | } 55 | crate::SchemaSource::File(f) => { 56 | match AvroSchemaDeserializer::try_from_schema_file(f) { 57 | Ok(s) => Ok(Box::new(s)), 58 | Err(e) => Err(e), 59 | } 60 | } 61 | }, 62 | MessageFormat::SoeAvro(path) => match SoeAvroDeserializer::try_from_path(path) { 63 | Ok(s) => Ok(Box::new(s)), 64 | Err(e) => Err(e), 65 | }, 66 | _ => Ok(Box::new(DefaultDeserializer::new(decompress_gzip))), 67 | } 68 | } 69 | 70 | fn json_default(decompress_gzip: bool) -> Box { 71 | Box::new(DefaultDeserializer::new(decompress_gzip)) 72 | } 73 | 74 | fn build_sr_settings(registry_url: &url::Url) -> Result { 75 | let mut url_string = registry_url.as_str(); 76 | if url_string.ends_with('/') { 77 | url_string = &url_string[0..url_string.len() - 1]; 78 | } 79 | 80 | let mut builder = SrSettings::new_builder(url_string.to_owned()); 81 | if let Ok(username) = std::env::var("SCHEMA_REGISTRY_USERNAME") { 82 | builder.set_basic_authorization( 83 | username.as_str(), 84 | std::option_env!("SCHEMA_REGISTRY_PASSWORD"), 85 | ); 86 | } 87 | 88 | if let Ok(proxy_url) = std::env::var("SCHEMA_REGISTRY_PROXY") { 89 | builder.set_proxy(proxy_url.as_str()); 90 | } 91 | 92 | match builder.build() { 93 | Ok(s) => Ok(s), 94 | Err(e) => Err(anyhow::Error::new(e)), 95 | } 96 | } 97 | } 98 | 99 | struct DefaultDeserializer { 100 | decompress_gzip: bool, 101 | } 102 | 103 | impl DefaultDeserializer { 104 | pub fn new(decompress_gzip: bool) -> Self { 105 | DefaultDeserializer { decompress_gzip } 106 | } 107 | 108 | fn decompress(bytes: &[u8]) -> std::io::Result> { 109 | let mut decoder = GzDecoder::new(bytes); 110 | let mut decompressed_data = Vec::new(); 111 | decoder.read_to_end(&mut decompressed_data)?; 112 | Ok(decompressed_data) 113 | } 114 | } 115 | 116 | #[async_trait] 117 | impl MessageDeserializer for DefaultDeserializer { 118 | async fn deserialize(&mut self, payload: &[u8]) -> Result { 119 | let payload = if self.decompress_gzip { 120 | Self::decompress(payload).map_err(|e| { 121 | 
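                // NOTE: if gzip decompression fails, the dead letter captures the original
                // (still-compressed) payload bytes alongside the error string.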
MessageDeserializationError::JsonDeserialization { 122 | dead_letter: DeadLetter::from_failed_deserialization(payload, e.to_string()), 123 | } 124 | })? 125 | } else { 126 | payload.to_vec() 127 | }; 128 | 129 | let value: Value = match serde_json::from_slice(&payload) { 130 | Ok(v) => v, 131 | Err(e) => { 132 | return Err(MessageDeserializationError::JsonDeserialization { 133 | dead_letter: DeadLetter::from_failed_deserialization(&payload, e.to_string()), 134 | }); 135 | } 136 | }; 137 | 138 | Ok(value) 139 | } 140 | } 141 | 142 | struct AvroDeserializer { 143 | decoder: EasyAvroDecoder, 144 | } 145 | 146 | struct SoeAvroDeserializer { 147 | //Deserializer for avro single object encoding 148 | decoders: DashMap, 149 | } 150 | 151 | #[derive(Default)] 152 | struct AvroSchemaDeserializer { 153 | schema: Option, 154 | } 155 | 156 | struct JsonDeserializer { 157 | decoder: EasyJsonDecoder, 158 | } 159 | 160 | #[async_trait] 161 | impl MessageDeserializer for SoeAvroDeserializer { 162 | async fn deserialize( 163 | &mut self, 164 | message_bytes: &[u8], 165 | ) -> Result { 166 | let key = Self::extract_message_fingerprint(message_bytes).map_err(|e| { 167 | MessageDeserializationError::AvroDeserialization { 168 | dead_letter: DeadLetter::from_failed_deserialization(message_bytes, e.to_string()), 169 | } 170 | })?; 171 | 172 | let decoder = 173 | self.decoders 174 | .get(&key) 175 | .ok_or(MessageDeserializationError::AvroDeserialization { 176 | dead_letter: DeadLetter::from_failed_deserialization( 177 | message_bytes, 178 | format!( 179 | "Unkown schema with fingerprint {}", 180 | &message_bytes[2..10] 181 | .iter() 182 | .map(|byte| format!("{:02x}", byte)) 183 | .collect::>() 184 | .join("") 185 | ), 186 | ), 187 | })?; 188 | let mut reader = Cursor::new(message_bytes); 189 | 190 | match decoder.read_value(&mut reader) { 191 | Ok(drs) => match Value::try_from(drs) { 192 | Ok(v) => Ok(v), 193 | Err(e) => Err(MessageDeserializationError::AvroDeserialization { 194 | dead_letter: DeadLetter::from_failed_deserialization( 195 | message_bytes, 196 | e.to_string(), 197 | ), 198 | }), 199 | }, 200 | Err(e) => { 201 | return Err(MessageDeserializationError::AvroDeserialization { 202 | dead_letter: DeadLetter::from_failed_deserialization( 203 | message_bytes, 204 | e.to_string(), 205 | ), 206 | }); 207 | } 208 | } 209 | } 210 | } 211 | 212 | #[async_trait] 213 | impl MessageDeserializer for AvroDeserializer { 214 | async fn deserialize( 215 | &mut self, 216 | message_bytes: &[u8], 217 | ) -> Result { 218 | match self.decoder.decode_with_schema(Some(message_bytes)).await { 219 | Ok(drs) => match drs { 220 | Some(v) => match Value::try_from(v.value) { 221 | Ok(v) => Ok(v), 222 | Err(e) => Err(MessageDeserializationError::AvroDeserialization { 223 | dead_letter: DeadLetter::from_failed_deserialization( 224 | message_bytes, 225 | e.to_string(), 226 | ), 227 | }), 228 | }, 229 | None => return Err(MessageDeserializationError::EmptyPayload), 230 | }, 231 | Err(e) => { 232 | return Err(MessageDeserializationError::AvroDeserialization { 233 | dead_letter: DeadLetter::from_failed_deserialization( 234 | message_bytes, 235 | e.to_string(), 236 | ), 237 | }); 238 | } 239 | } 240 | } 241 | } 242 | 243 | #[async_trait] 244 | impl MessageDeserializer for AvroSchemaDeserializer { 245 | async fn deserialize( 246 | &mut self, 247 | message_bytes: &[u8], 248 | ) -> Result { 249 | let reader_result = match &self.schema { 250 | None => apache_avro::Reader::new(Cursor::new(message_bytes)), 251 | Some(schema) => 
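// Editor's note (added commentary): `apache_avro::Reader` expects the Avro object
// container file format, whose header embeds the writer schema. That is why
// `Reader::new` in the `None` arm above needs no configured schema, while the arm
// below passes the file-provided schema so it is used as the reader schema during
// schema resolution.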
apache_avro::Reader::with_schema(schema, Cursor::new(message_bytes)), 252 | }; 253 | 254 | match reader_result { 255 | Ok(mut reader) => { 256 | if let Some(r) = reader.next() { 257 | let v = match r { 258 | Err(_) => return Err(MessageDeserializationError::EmptyPayload), 259 | Ok(v) => Value::try_from(v), 260 | }; 261 | 262 | return match v { 263 | Ok(value) => Ok(value), 264 | Err(e) => Err(MessageDeserializationError::AvroDeserialization { 265 | dead_letter: DeadLetter::from_failed_deserialization( 266 | message_bytes, 267 | e.to_string(), 268 | ), 269 | }), 270 | }; 271 | } 272 | 273 | return Err(MessageDeserializationError::EmptyPayload); 274 | // TODO: Code to return multiple values from avro message 275 | /*let (values, errors): (Vec<_>, Vec<_>) = 276 | reader.into_iter().partition(Result::is_ok); 277 | if errors.len() > 0 { 278 | let error_string = errors 279 | .iter() 280 | .map(|m| m.err().unwrap().to_string()) 281 | .fold(String::new(), |current, next| current + "\n" + &next); 282 | return Err(MessageDeserializationError::AvroDeserialization { 283 | dead_letter: DeadLetter::from_failed_deserialization( 284 | message_bytes, 285 | error_string, 286 | ), 287 | }); 288 | } 289 | let (transformed, t_errors): (Vec<_>, Vec<_>) = values 290 | .into_iter() 291 | .map(|v| v.unwrap()) 292 | .map(Value::try_from) 293 | .partition(Result::is_ok); 294 | 295 | if t_errors.len() > 0 { 296 | let error_string = t_errors 297 | .iter() 298 | .map(|m| m.err().unwrap().to_string()) 299 | .fold(String::new(), |current, next| current + "\n" + &next); 300 | return Err(MessageDeserializationError::AvroDeserialization { 301 | dead_letter: DeadLetter::from_failed_deserialization( 302 | message_bytes, 303 | error_string, 304 | ), 305 | }); 306 | } 307 | 308 | Ok(transformed.into_iter().map(|m| m.unwrap()).collect())*/ 309 | } 310 | Err(e) => Err(MessageDeserializationError::AvroDeserialization { 311 | dead_letter: DeadLetter::from_failed_deserialization(message_bytes, e.to_string()), 312 | }), 313 | } 314 | } 315 | } 316 | 317 | #[async_trait] 318 | impl MessageDeserializer for JsonDeserializer { 319 | async fn deserialize( 320 | &mut self, 321 | message_bytes: &[u8], 322 | ) -> Result { 323 | let decoder = self.decoder.borrow_mut(); 324 | match decoder.decode(Some(message_bytes)).await { 325 | Ok(drs) => match drs { 326 | Some(v) => Ok(v.value), 327 | None => return Err(MessageDeserializationError::EmptyPayload), 328 | }, 329 | Err(e) => { 330 | return Err(MessageDeserializationError::AvroDeserialization { 331 | dead_letter: DeadLetter::from_failed_deserialization( 332 | message_bytes, 333 | e.to_string(), 334 | ), 335 | }); 336 | } 337 | } 338 | } 339 | } 340 | impl JsonDeserializer { 341 | pub(crate) fn from_schema_registry(sr_settings: SrSettings) -> Self { 342 | JsonDeserializer { 343 | decoder: EasyJsonDecoder::new(sr_settings), 344 | } 345 | } 346 | } 347 | 348 | impl AvroSchemaDeserializer { 349 | pub(crate) fn try_from_schema_file(file: &PathBuf) -> Result { 350 | match std::fs::read_to_string(file) { 351 | Ok(content) => match apache_avro::Schema::parse_str(&content) { 352 | Ok(s) => Ok(AvroSchemaDeserializer { schema: Some(s) }), 353 | Err(e) => Err(anyhow::format_err!("{}", e.to_string())), 354 | }, 355 | Err(e) => Err(anyhow::format_err!("{}", e.to_string())), 356 | } 357 | } 358 | } 359 | 360 | impl AvroDeserializer { 361 | pub(crate) fn from_schema_registry(sr_settings: SrSettings) -> Self { 362 | AvroDeserializer { 363 | decoder: EasyAvroDecoder::new(sr_settings), 364 | } 365 | } 366 | } 367 
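// Editor's note: the block below is an illustrative sketch added during review and
// is not part of the original crate. Avro single-object encoding frames each
// message as the two marker bytes 0xC3 0x01, followed by the 8-byte little-endian
// CRC-64-AVRO (Rabin) fingerprint of the writer schema, followed by the Avro
// binary body. That layout is what `SoeAvroDeserializer::extract_message_fingerprint`
// below relies on when it reads bytes 2..10 and decodes them with `i64::from_le_bytes`.
#[cfg(test)]
mod soe_header_layout_sketch {
    use std::convert::TryInto;

    #[test]
    fn fingerprint_occupies_bytes_2_to_10_little_endian() {
        // Hypothetical frame: marker + fingerprint 0x0102030405060708 + empty body.
        let mut frame: Vec<u8> = vec![0xC3, 0x01];
        frame.extend_from_slice(&0x0102030405060708i64.to_le_bytes());

        assert_eq!(&frame[0..2], &[0xC3, 0x01]);
        let fingerprint = i64::from_le_bytes(frame[2..10].try_into().unwrap());
        assert_eq!(fingerprint, 0x0102030405060708);
    }
}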
| 368 | impl SoeAvroDeserializer { 369 | pub(crate) fn try_from_path(path: &PathBuf) -> Result { 370 | if path.is_file() { 371 | let (key, seo_reader) = Self::read_single_schema_file(path)?; 372 | debug!( 373 | "Loaded schema {:?} with key (i64 rep of fingerprint) {:?}", 374 | path, key 375 | ); 376 | let map: DashMap = DashMap::with_capacity(1); 377 | map.insert(key, seo_reader); 378 | Ok(SoeAvroDeserializer { decoders: map }) 379 | } else if path.is_dir() { 380 | let decoders = path 381 | .read_dir()? 382 | .map(|file| { 383 | let file_path = file?.path(); 384 | let value = Self::read_single_schema_file(&file_path)?; 385 | Ok(value) 386 | }) 387 | .collect::>>()?; 388 | 389 | Ok(SoeAvroDeserializer { decoders }) 390 | } else { 391 | Err(anyhow::format_err!("Path '{:?}' does not exists", path)) 392 | } 393 | } 394 | 395 | fn read_single_schema_file( 396 | path: &PathBuf, 397 | ) -> Result<(i64, GenericSingleObjectReader), anyhow::Error> { 398 | match std::fs::read_to_string(path) { 399 | Ok(content) => match Schema::parse_str(&content) { 400 | Ok(s) => { 401 | let fingerprint = s.fingerprint::().bytes; 402 | let fingerprint = fingerprint 403 | .try_into() 404 | .expect("Rabin fingerprints are 8 bytes"); 405 | let key = Self::fingerprint_to_i64(fingerprint); 406 | match GenericSingleObjectReader::new(s) { 407 | Ok(decoder) => Ok((key, decoder)), 408 | Err(e) => Err(anyhow::format_err!( 409 | "Schema file '{:?}'; Error: {}", 410 | path, 411 | e.to_string() 412 | )), 413 | } 414 | } 415 | Err(e) => Err(anyhow::format_err!( 416 | "Schema file '{:?}'; Error: {}", 417 | path, 418 | e.to_string() 419 | )), 420 | }, 421 | Err(e) => Err(anyhow::format_err!( 422 | "Schema file '{:?}'; Error: {}", 423 | path, 424 | e.to_string() 425 | )), 426 | } 427 | } 428 | 429 | fn extract_message_fingerprint(msg: &[u8]) -> Result { 430 | msg.get(2..10) 431 | .ok_or(anyhow::anyhow!( 432 | "Message does not contain a valid fingerprint" 433 | )) 434 | .map(|x| Self::fingerprint_to_i64(x.try_into().expect("Slice must be 8 bytes long"))) 435 | } 436 | 437 | fn fingerprint_to_i64(msg: [u8; 8]) -> i64 { 438 | i64::from_le_bytes(msg) 439 | } 440 | } 441 | 442 | #[cfg(test)] 443 | mod tests {} 444 | -------------------------------------------------------------------------------- /src/value_buffers.rs: -------------------------------------------------------------------------------- 1 | use crate::{DataTypeOffset, DataTypePartition, IngestError}; 2 | use serde_json::Value; 3 | use std::collections::HashMap; 4 | 5 | /// Provides a single interface into the multiple [`ValueBuffer`] instances used to buffer data for each assigned partition. 6 | #[derive(Debug, Default)] 7 | pub(crate) struct ValueBuffers { 8 | buffers: HashMap, 9 | len: usize, 10 | } 11 | 12 | impl ValueBuffers { 13 | /// Adds a value to in-memory buffers and tracks the partition and offset. 14 | pub(crate) fn add( 15 | &mut self, 16 | partition: DataTypePartition, 17 | offset: DataTypeOffset, 18 | value: Value, 19 | ) -> Result<(), IngestError> { 20 | let buffer = self 21 | .buffers 22 | .entry(partition) 23 | .or_insert_with(ValueBuffer::new); 24 | 25 | // The streaming consumer might read the same offsets twice on rebalance/seek, 26 | // hence we protect the buffer from dupes by filtering out the already processed offsets. 27 | // Having this guarantees the at-most-once rule. 
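// Editor's note (added commentary): concretely, if partition 0 has already buffered
// up to offset 7 and a rebalance or seek redelivers offsets 5..=7, each redelivered
// message lands in the guard below and returns `AlreadyProcessedPartitionOffset`,
// so nothing is buffered twice.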
28 | if offset <= buffer.last_offset { 29 | return Err(IngestError::AlreadyProcessedPartitionOffset { partition, offset }); 30 | } 31 | 32 | buffer.add(value, offset); 33 | self.len += 1; 34 | Ok(()) 35 | } 36 | 37 | /// Returns the total number of items stored across each partition specific [`ValueBuffer`]. 38 | pub(crate) fn len(&self) -> usize { 39 | self.len 40 | } 41 | 42 | /// Returns values, partition offsets and partition counts currently held in buffer and resets buffers to empty. 43 | pub(crate) fn consume(&mut self) -> ConsumedBuffers { 44 | let mut partition_offsets = HashMap::new(); 45 | let mut partition_counts = HashMap::new(); 46 | 47 | let values = self 48 | .buffers 49 | .iter_mut() 50 | .filter_map(|(partition, buffer)| match buffer.consume() { 51 | Some((values, offset)) => { 52 | partition_offsets.insert(*partition, offset); 53 | partition_counts.insert(*partition, values.len()); 54 | Some(values) 55 | } 56 | None => None, 57 | }) 58 | .flatten() 59 | .collect(); 60 | 61 | self.len = 0; 62 | 63 | ConsumedBuffers { 64 | values, 65 | partition_offsets, 66 | partition_counts, 67 | } 68 | } 69 | 70 | /// Clears all value buffers currently held in memory. 71 | pub(crate) fn reset(&mut self) { 72 | self.len = 0; 73 | self.buffers.clear(); 74 | } 75 | } 76 | 77 | /// Buffer of values held in memory for a single Kafka partition. 78 | #[derive(Debug)] 79 | struct ValueBuffer { 80 | /// The offset of the last message stored in the buffer. 81 | last_offset: DataTypeOffset, 82 | /// The buffer of [`Value`] instances. 83 | values: Vec, 84 | } 85 | 86 | impl ValueBuffer { 87 | /// Creates a new [`ValueBuffer`] to store messages from a Kafka partition. 88 | pub(crate) fn new() -> Self { 89 | Self { 90 | // The -1 means that it has no stored offset and anything that is firstly passed 91 | // will be accepted, since message offsets starts with 0. 92 | // Hence, if the buffer is "consumed", the values list is emptied, but the last_offset 93 | // should always holds the value to prevent messages duplicates. 94 | last_offset: -1, 95 | values: Vec::new(), 96 | } 97 | } 98 | 99 | /// Adds the value to buffer and stores its offset as the `last_offset` of the buffer. 100 | pub(crate) fn add(&mut self, value: Value, offset: DataTypeOffset) { 101 | self.last_offset = offset; 102 | self.values.push(value); 103 | } 104 | 105 | /// Consumes and returns the buffer and last offset so it may be written to delta and clears internal state. 106 | pub(crate) fn consume(&mut self) -> Option<(Vec, DataTypeOffset)> { 107 | if !self.values.is_empty() { 108 | assert!(self.last_offset >= 0); 109 | Some((std::mem::take(&mut self.values), self.last_offset)) 110 | } else { 111 | None 112 | } 113 | } 114 | } 115 | 116 | /// A struct that wraps the data consumed from [`ValueBuffers`] before writing to a [`arrow::record_batch::RecordBatch`]. 117 | pub(crate) struct ConsumedBuffers { 118 | /// The vector of [`Value`] instances consumed. 119 | pub(crate) values: Vec, 120 | /// A [`HashMap`] from partition to last offset represented by the consumed buffers. 121 | pub(crate) partition_offsets: HashMap, 122 | /// A [`HashMap`] from partition to number of messages consumed for each partition. 
123 | pub(crate) partition_counts: HashMap, 124 | } 125 | 126 | #[cfg(test)] 127 | mod tests { 128 | use super::*; 129 | use maplit::hashmap; 130 | 131 | #[test] 132 | fn value_buffers_test() { 133 | let mut buffers = ValueBuffers::default(); 134 | let mut add = |p, o| { 135 | buffers 136 | .add(p, o, Value::String(format!("{}:{}", p, o))) 137 | .unwrap(); 138 | }; 139 | 140 | add(0, 0); 141 | add(1, 0); 142 | add(0, 1); 143 | add(0, 2); 144 | add(1, 1); 145 | 146 | assert_eq!(buffers.len, 5); 147 | assert_eq!(buffers.buffers.len(), 2); 148 | assert_eq!(buffers.buffers.get(&0).unwrap().last_offset, 2); 149 | assert_eq!(buffers.buffers.get(&0).unwrap().values.len(), 3); 150 | assert_eq!(buffers.buffers.get(&1).unwrap().last_offset, 1); 151 | assert_eq!(buffers.buffers.get(&1).unwrap().values.len(), 2); 152 | 153 | let consumed = buffers.consume(); 154 | 155 | assert_eq!(buffers.len, 0); 156 | assert_eq!(buffers.buffers.len(), 2); 157 | // last_offset is kept after consume 158 | assert_eq!(buffers.buffers.get(&0).unwrap().last_offset, 2); 159 | assert_eq!(buffers.buffers.get(&0).unwrap().values.len(), 0); 160 | assert_eq!(buffers.buffers.get(&1).unwrap().last_offset, 1); 161 | assert_eq!(buffers.buffers.get(&1).unwrap().values.len(), 0); 162 | 163 | assert_eq!( 164 | consumed.partition_counts, 165 | hashmap! { 166 | 0 => 3, 167 | 1 => 2 168 | } 169 | ); 170 | assert_eq!( 171 | consumed.partition_offsets, 172 | hashmap! { 173 | 0 => 2, 174 | 1 => 1 175 | } 176 | ); 177 | 178 | let mut values: Vec = consumed.values.iter().map(|j| j.to_string()).collect(); 179 | 180 | values.sort(); 181 | 182 | let expected: Vec = vec!["\"0:0\"", "\"0:1\"", "\"0:2\"", "\"1:0\"", "\"1:1\""] 183 | .iter() 184 | .map(|s| s.to_string()) 185 | .collect(); 186 | assert_eq!(values, expected); 187 | } 188 | 189 | #[test] 190 | fn value_buffers_conflict_offsets_test() { 191 | let mut buffers = ValueBuffers::default(); 192 | 193 | let verify_error = |res: Result<(), IngestError>, o: i64| { 194 | match res.err().unwrap() { 195 | IngestError::AlreadyProcessedPartitionOffset { partition, offset } => { 196 | assert_eq!(partition, 0); 197 | assert_eq!(offset, o); 198 | } 199 | other => panic!("{:?}", other), 200 | }; 201 | }; 202 | 203 | add(&mut buffers, 0).unwrap(); 204 | add(&mut buffers, 1).unwrap(); 205 | verify_error(add(&mut buffers, 0), 0); 206 | verify_error(add(&mut buffers, 1), 1); 207 | add(&mut buffers, 2).unwrap(); 208 | 209 | let consumed = buffers.consume(); 210 | 211 | assert_eq!(consumed.values.len(), 3); 212 | assert_eq!( 213 | consumed.partition_offsets, 214 | hashmap! { 215 | 0 => 2, 216 | } 217 | ); 218 | 219 | // Also value buffer should hold last_offset after consume 220 | verify_error(add(&mut buffers, 0), 0); 221 | verify_error(add(&mut buffers, 1), 1); 222 | verify_error(add(&mut buffers, 2), 2); 223 | add(&mut buffers, 3).unwrap(); 224 | add(&mut buffers, 4).unwrap(); 225 | 226 | let consumed_again = buffers.consume(); 227 | 228 | assert_eq!(consumed_again.values.len(), 2); 229 | assert_eq!( 230 | consumed_again.partition_offsets, 231 | hashmap! 
{ 232 | 0 => 4, 233 | } 234 | ); 235 | } 236 | 237 | fn add(buffers: &mut ValueBuffers, offset: i64) -> Result<(), IngestError> { 238 | buffers.add(0, offset, Value::Number(offset.into())) 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /tests/buffer_flush_tests.rs: -------------------------------------------------------------------------------- 1 | #[allow(dead_code)] 2 | mod helpers; 3 | 4 | use log::info; 5 | use serde::{Deserialize, Serialize}; 6 | use serde_json::json; 7 | use serial_test::serial; 8 | use tokio::time::{sleep, Duration}; 9 | 10 | use kafka_delta_ingest::IngestOptions; 11 | 12 | #[tokio::test] 13 | #[serial] 14 | async fn test_flush_when_latency_expires() { 15 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 16 | "flush_when_latency_expires", 17 | json!({ 18 | "id": "integer", 19 | "date": "string", 20 | }), 21 | vec!["date"], 22 | 1, 23 | Some(IngestOptions { 24 | app_id: "flush_when_latency_expires".to_string(), 25 | // buffer for 5 seconds before flush 26 | allowed_latency: 5, 27 | // large value - avoid flushing on num messages 28 | max_messages_per_batch: 5000, 29 | // large value - avoid flushing on file size 30 | min_bytes_per_file: 1000000, 31 | kafka_brokers: helpers::test_broker(), 32 | ..Default::default() 33 | }), 34 | ) 35 | .await; 36 | 37 | for m in create_generator(1).take(10) { 38 | info!("Writing test message"); 39 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 40 | } 41 | 42 | // wait for latency flush 43 | helpers::wait_until_version_created(&table, 1); 44 | 45 | for m in create_generator(11).take(10) { 46 | info!("Writing test message"); 47 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 48 | } 49 | 50 | // wait for latency flush 51 | helpers::wait_until_version_created(&table, 2); 52 | 53 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 54 | let v2_rows: Vec = helpers::read_table_content_as(&table).await; 55 | 56 | assert_eq!(v1_rows.len(), 10); 57 | assert_eq!(v2_rows.len(), 20); 58 | 59 | token.cancel(); 60 | kdi.await.unwrap(); 61 | rt.shutdown_background(); 62 | } 63 | 64 | #[tokio::test] 65 | #[serial] 66 | async fn test_dont_write_an_empty_buffer() { 67 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 68 | "dont_write_an_empty_buffer", 69 | json!({ 70 | "id": "integer", 71 | "date": "string", 72 | }), 73 | vec!["date"], 74 | 1, 75 | Some(IngestOptions { 76 | app_id: "dont_write_an_empty_buffer".to_string(), 77 | // buffer for 5 seconds before flush 78 | allowed_latency: 5, 79 | kafka_brokers: helpers::test_broker(), 80 | ..Default::default() 81 | }), 82 | ) 83 | .await; 84 | // write one version so we can make sure the stream is up and running. 85 | 86 | for m in create_generator(1).take(10) { 87 | info!("Writing test message"); 88 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 89 | } 90 | 91 | // wait for latency flush 92 | helpers::wait_until_version_created(&table, 1); 93 | 94 | // wait for the latency timer to trigger 95 | sleep(Duration::from_secs(6)).await; 96 | 97 | // verify that an empty version _was not_ created. 98 | // i.e. 
we should still be at version 1 99 | 100 | let t = deltalake_core::open_table(&table).await.unwrap(); 101 | 102 | assert_eq!(1, t.version()); 103 | 104 | token.cancel(); 105 | kdi.await.unwrap(); 106 | rt.shutdown_background(); 107 | } 108 | 109 | #[tokio::test] 110 | #[serial] 111 | async fn test_flush_on_size_without_latency_expiration() { 112 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 113 | "flush_on_size_without_latency_expiration", 114 | json!({ 115 | "id": "integer", 116 | "date": "string", 117 | }), 118 | vec!["date"], 119 | 1, 120 | Some(IngestOptions { 121 | app_id: "flush_on_size_without_latency_expiration".to_string(), 122 | // buffer for an hour 123 | allowed_latency: 3600, 124 | // create a record batch when we have 10 messages 125 | max_messages_per_batch: 10, 126 | // tiny buffer size for write flush 127 | min_bytes_per_file: 20, 128 | kafka_brokers: helpers::test_broker(), 129 | ..Default::default() 130 | }), 131 | ) 132 | .await; 133 | 134 | for m in create_generator(1).take(10) { 135 | info!("Writing test message"); 136 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 137 | } 138 | 139 | helpers::wait_until_version_created(&table, 1); 140 | 141 | let data: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 142 | 143 | assert_eq!(data.len(), 10); 144 | 145 | token.cancel(); 146 | kdi.await.unwrap(); 147 | rt.shutdown_background(); 148 | } 149 | 150 | #[derive(Clone, Serialize, Deserialize, Debug)] 151 | struct TestMsg { 152 | id: u64, 153 | date: String, 154 | } 155 | 156 | fn create_generator(staring_id: u64) -> impl Iterator { 157 | std::iter::successors(Some(staring_id), |n| Some(*n + 1)).map(|n| TestMsg { 158 | id: n, 159 | date: "2022-06-03".to_string(), 160 | }) 161 | } 162 | -------------------------------------------------------------------------------- /tests/data/.gitignore: -------------------------------------------------------------------------------- 1 | gen/ -------------------------------------------------------------------------------- /tests/data/default_schema.avro: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "test", 4 | "fields": [ 5 | {"name": "id", "type": "long"}, 6 | {"name": "name", "type": "string"}, 7 | {"name": "date", "type": "string"} 8 | ] 9 | } -------------------------------------------------------------------------------- /tests/data/emails/.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | 3 | _delta_log/*.json 4 | !/_delta_log/00000000000000000000.json 5 | -------------------------------------------------------------------------------- /tests/data/emails/_delta_log/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1621845641000,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"isBlindAppend":true}} 2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 3 | 
{"metaData":{"id":"ec285dbc-6479-4cc1-b038-1de97afabf9b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sender\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"recipient\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["date"],"configuration":{},"createdTime":1621845641001}} 4 | -------------------------------------------------------------------------------- /tests/data/example/.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | 3 | _delta_log/*.json 4 | !/_delta_log/00000000000000000000.json 5 | -------------------------------------------------------------------------------- /tests/data/example/_delta_log/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1564524295023,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"isBlindAppend":true}} 2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 3 | {"metaData":{"id":"22ef18ba-191c-4c36-a606-3dad5cdf3830","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"modified\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"modified_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["modified_date"],"configuration":{},"createdTime":1564524294376}} 4 | -------------------------------------------------------------------------------- /tests/data/web_requests/.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | 3 | _delta_log/*.json 4 | !/_delta_log/00000000000000000000.json 5 | -------------------------------------------------------------------------------- /tests/data/web_requests/_delta_log/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1564524295023,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"isBlindAppend":true}} 2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 3 | 
{"metaData":{"id":"22ef18ba-191c-4c36-a606-3dad5cdf3830","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"meta\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"producer\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"timestamp\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"kafka\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"offset\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"topic\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"partition\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"method\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"session_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"status\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"url\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"uuid\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["date"],"configuration":{},"createdTime":1564524294376}} 4 | -------------------------------------------------------------------------------- /tests/data/zero_offset/.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | 3 | _delta_log/*.json 4 | !/_delta_log/00000000000000000000.json 5 | -------------------------------------------------------------------------------- /tests/data/zero_offset/_delta_log/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1621845641000,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"isBlindAppend":true}} 2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 3 | {"metaData":{"id":"ec285dbc-6479-4cc1-b038-1de97afabf9b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"metadata\":{},\"name\":\"color\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"integer\"}]}","partitionColumns":["color"],"configuration":{},"createdTime":1621845641001}} 4 | -------------------------------------------------------------------------------- /tests/data/zero_offset/_delta_log/00000000000000000001.json: -------------------------------------------------------------------------------- 1 | {"txn":{"appId":"zero_offset-0","version":0,"lastUpdated":1628003413471}} 2 | {"add":{"path":"color=default/part-00000-0f324a9e-6066-4f14-970e-c174ca2299c6-c000.snappy.parquet","size":710,"partitionValues":{"color":"default"},"modificationTime":1628003413471,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"color\":\"default\",\"id\":0},\"maxValues\":{\"color\":\"default\",\"id\":0},\"nullCount\":{\"id\":0,\"color\":0}}","tags":null}} -------------------------------------------------------------------------------- /tests/dead_letter_tests.rs: -------------------------------------------------------------------------------- 1 | use kafka_delta_ingest::IngestOptions; 2 | use log::info; 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use serde_json::json; 6 | use serde_json::Value; 7 | use uuid::Uuid; 8 | 9 | #[allow(dead_code)] 10 | mod helpers; 11 | 12 | 
#[macro_use] 13 | extern crate maplit; 14 | 15 | #[derive(Clone, Serialize, Deserialize, Debug)] 16 | struct TestMsgNested { 17 | value: String, 18 | } 19 | 20 | #[derive(Clone, Serialize, Deserialize, Debug)] 21 | struct TestMsg { 22 | value: String, 23 | a_list_of_structs: Option>, 24 | date: String, 25 | } 26 | 27 | #[tokio::test] 28 | #[ignore] 29 | async fn test_dlq() { 30 | helpers::init_logger(); 31 | 32 | let table = create_table(); 33 | let data_topic = create_data_topic().await; 34 | let dlq_table = create_dlq_table().await; 35 | 36 | let allowed_latency = 5; 37 | let max_messages_per_batch = 6; 38 | let min_bytes_per_file = 20; 39 | 40 | let (kdi, token, rt) = helpers::create_kdi( 41 | &data_topic, 42 | &table, 43 | IngestOptions { 44 | app_id: "dlq_test".to_string(), 45 | dlq_table_uri: Some(dlq_table.clone()), 46 | dlq_transforms: hashmap! { 47 | "date".to_string() => "substr(epoch_micros_to_iso8601(timestamp),`0`,`10`)".to_string(), 48 | }, 49 | allowed_latency, 50 | max_messages_per_batch, 51 | min_bytes_per_file, 52 | ..Default::default() 53 | }, 54 | ); 55 | let producer = helpers::create_producer(); 56 | 57 | let good_generator = std::iter::repeat(TestMsg { 58 | value: "good".to_string(), 59 | a_list_of_structs: Some(vec![TestMsgNested { 60 | value: "abc".to_string(), 61 | }]), 62 | date: "2021-01-01".to_string(), 63 | }); 64 | 65 | let null_struct_generator = std::iter::repeat(TestMsg { 66 | value: "bad-null_struct".to_string(), 67 | a_list_of_structs: None, 68 | date: "2021-01-01".to_string(), 69 | }); 70 | 71 | let expected_date = chrono::Utc::now(); 72 | let expected_date = format!("{}", expected_date.format("%Y-%m-%d")); 73 | 74 | // 4 good messages 75 | // 2 bad messages that fail parquet write 76 | let batch_to_send: Vec = good_generator 77 | .clone() 78 | .take(2) 79 | .chain(null_struct_generator.take(2)) 80 | .chain(good_generator.clone().take(2)) 81 | .collect(); 82 | 83 | for m in batch_to_send.iter() { 84 | helpers::send_json(&producer, &data_topic, &serde_json::to_value(m).unwrap()).await; 85 | } 86 | 87 | info!("Sent {} records from structs", batch_to_send.len()); 88 | 89 | info!("Waiting for version 1 of dlq table"); 90 | helpers::wait_until_version_created(&dlq_table, 1); 91 | info!("Waiting for version 1 of data table"); 92 | helpers::wait_until_version_created(&table, 1); 93 | 94 | // 1 message with bad bytes 95 | let bad_bytes_generator = std::iter::repeat("bad bytes".as_bytes().to_vec()); 96 | 97 | for m in bad_bytes_generator.clone().take(1) { 98 | helpers::send_bytes(&producer, &data_topic, &m).await; 99 | } 100 | 101 | info!("Sent {} records from bytes", batch_to_send.len()); 102 | 103 | info!("Waiting for version 2 of dlq table - {}", dlq_table); 104 | helpers::wait_until_version_created(&dlq_table, 2); 105 | info!("Wait completed for dlq table - {}", dlq_table); 106 | 107 | // 6 more good messages just to make sure the stream keep working after hitting some bad 108 | let good_bytes_generator = good_generator.clone().map(|g| { 109 | let json = serde_json::to_string(&g).unwrap(); 110 | json.as_bytes().to_vec() 111 | }); 112 | 113 | for m in good_bytes_generator.clone().take(6) { 114 | helpers::send_bytes(&producer, &data_topic, &m).await; 115 | } 116 | 117 | info!("Waiting for version 2 of data table - {}", table); 118 | helpers::wait_until_version_created(&table, 2); 119 | info!("Wait completed for data table - {}", table); 120 | 121 | token.cancel(); 122 | 123 | for m in bad_bytes_generator.clone().take(1) { 124 | helpers::send_bytes(&producer, 
&data_topic, &m).await; 125 | } 126 | 127 | kdi.await.unwrap(); 128 | rt.shutdown_background(); 129 | 130 | // after above sequence - we should have 10 good messages and 3 dead letters 131 | // good messages should be in the data table and dead letters should be in the dlq_table 132 | 133 | let table_content: Vec = helpers::read_table_content_as_jsons(&table) 134 | .await 135 | .iter() 136 | .map(|v| serde_json::from_value(v.clone()).unwrap()) 137 | .collect(); 138 | 139 | assert_eq!(table_content.len(), 10); 140 | 141 | let dlq_content: Vec = helpers::read_table_content_as_jsons(&dlq_table).await; 142 | assert_eq!(dlq_content.len(), 3); 143 | 144 | println!("Dead letter table content{:#?}", dlq_content); 145 | 146 | let bad_serde_records: Vec = dlq_content 147 | .iter() 148 | .filter(|v| { 149 | v.get("base64_bytes") 150 | .map(|v| v.as_str() == Some("YmFkIGJ5dGVz")) 151 | .unwrap_or(false) 152 | }) 153 | .map(|v| v.to_owned()) 154 | .collect(); 155 | assert_eq!(bad_serde_records.len(), 1); 156 | 157 | let bad_null_struct_records: Vec = dlq_content 158 | .iter() 159 | .filter(|v| { 160 | v.get("error") 161 | .unwrap() 162 | .as_str() 163 | .unwrap() 164 | .contains("Inconsistent length of definition and repetition levels") 165 | }) 166 | .map(|v| v.to_owned()) 167 | .collect(); 168 | assert_eq!(bad_null_struct_records.len(), 2); 169 | 170 | assert!(dlq_content 171 | .iter() 172 | .all(|v| v.get("date").unwrap().as_str() == Some(expected_date.as_str()))); 173 | } 174 | 175 | fn create_table() -> String { 176 | helpers::create_local_table( 177 | json!({ 178 | "value": "string", 179 | "a_list_of_structs": [{ 180 | "value": "string" 181 | }], 182 | "date": "string", 183 | }), 184 | vec!["date"], 185 | "dlq", 186 | ) 187 | } 188 | 189 | async fn create_data_topic() -> String { 190 | let topic = format!("dlq_test_source_{}", Uuid::new_v4()); 191 | helpers::create_topic(&topic, 3).await; 192 | topic 193 | } 194 | 195 | async fn create_dlq_table() -> String { 196 | helpers::create_local_table( 197 | json! 
({ 198 | "base64_bytes": "string", 199 | "json_string": "string", 200 | "error": "string", 201 | "timestamp": "timestamp", 202 | "date": "string", 203 | }), 204 | vec!["date"], 205 | "dlq", 206 | ) 207 | } 208 | -------------------------------------------------------------------------------- /tests/delta_partitions_tests.rs: -------------------------------------------------------------------------------- 1 | #[allow(dead_code)] 2 | mod helpers; 3 | 4 | use deltalake_core::kernel::transaction::CommitBuilder; 5 | use deltalake_core::kernel::transaction::TableReference; 6 | use deltalake_core::kernel::{Action, Add}; 7 | use deltalake_core::protocol::{DeltaOperation, SaveMode}; 8 | use deltalake_core::DeltaTableError; 9 | use kafka_delta_ingest::writer::*; 10 | use serde::{Deserialize, Serialize}; 11 | use serde_json::{json, Value}; 12 | use std::collections::HashMap; 13 | 14 | #[derive(Debug, Serialize, Deserialize)] 15 | struct TestMsg { 16 | id: u32, 17 | color: Option, 18 | } 19 | 20 | impl TestMsg { 21 | fn new(id: u32, color: &str) -> Self { 22 | Self { 23 | id, 24 | color: Some(color.to_string()), 25 | } 26 | } 27 | 28 | fn new_color_null(id: u32) -> Self { 29 | Self { id, color: None } 30 | } 31 | } 32 | 33 | #[tokio::test] 34 | async fn test_delta_partitions() { 35 | let table_path = helpers::create_local_table( 36 | json!({ 37 | "id": "integer", 38 | "color": "string", 39 | }), 40 | vec!["color"], 41 | "test_delta_partitions", 42 | ); 43 | 44 | let table = deltalake_core::open_table(&table_path).await.unwrap(); 45 | let mut delta_writer = DataWriter::for_table(&table, HashMap::new()).unwrap(); 46 | 47 | let batch1 = vec![ 48 | TestMsg::new(1, "red"), 49 | TestMsg::new(2, "red"), 50 | TestMsg::new(3, "blue"), 51 | TestMsg::new(4, "red"), 52 | ]; 53 | 54 | let batch2 = vec![ 55 | TestMsg::new(5, "blue"), 56 | TestMsg::new(6, "red"), 57 | TestMsg::new(7, "blue"), 58 | TestMsg::new(8, "blue"), 59 | TestMsg::new_color_null(9), 60 | ]; 61 | 62 | delta_writer.write(msgs_to_values(batch1)).await.unwrap(); 63 | delta_writer.write(msgs_to_values(batch2)).await.unwrap(); 64 | 65 | let result = delta_writer 66 | .write_parquet_files(&table.table_uri()) 67 | .await 68 | .unwrap(); 69 | 70 | for add in result.iter() { 71 | match add 72 | .partition_values 73 | .get("color") 74 | .unwrap() 75 | .clone() 76 | .as_deref() 77 | { 78 | Some("red") => { 79 | assert!(add.path.starts_with("color=red")); 80 | assert_eq!(&get_stats_value(add, "numRecords"), "4"); 81 | assert_eq!(msg(get_stats_value(add, "minValues")).id, 1); 82 | assert_eq!(msg(get_stats_value(add, "maxValues")).id, 6); 83 | } 84 | Some("blue") => { 85 | assert!(add.path.starts_with("color=blue")); 86 | assert_eq!(&get_stats_value(add, "numRecords"), "4"); 87 | assert_eq!(msg(get_stats_value(add, "minValues")).id, 3); 88 | assert_eq!(msg(get_stats_value(add, "maxValues")).id, 8); 89 | } 90 | None => { 91 | assert!(add.path.starts_with("color=__HIVE_DEFAULT_PARTITION__")); 92 | assert_eq!(&get_stats_value(add, "numRecords"), "1"); 93 | assert_eq!(msg(get_stats_value(add, "minValues")).id, 9); 94 | assert_eq!(msg(get_stats_value(add, "maxValues")).id, 9); 95 | } 96 | other => { 97 | panic!("{:?}", other); 98 | } 99 | } 100 | } 101 | 102 | let operation = DeltaOperation::Write { 103 | mode: SaveMode::Append, 104 | partition_by: None, 105 | predicate: None, 106 | }; 107 | 108 | let version = CommitBuilder::default() 109 | .with_actions(result.iter().cloned().map(Action::Add).collect()) 110 | .build( 111 | table.state.as_ref().map(|s| s as &dyn 
TableReference), 112 | table.log_store().clone(), 113 | operation, 114 | ) 115 | .await 116 | .map_err(DeltaTableError::from) 117 | .expect("Failed to create transaction") 118 | .version; 119 | 120 | deltalake_core::checkpoints::create_checkpoint(&table, None) 121 | .await 122 | .unwrap(); 123 | 124 | let table = deltalake_core::open_table(&table_path).await.unwrap(); 125 | assert_eq!(table.version(), version); 126 | 127 | std::fs::remove_dir_all(&table_path).unwrap(); 128 | } 129 | 130 | fn msgs_to_values(values: Vec) -> Vec { 131 | values 132 | .iter() 133 | .map(|j| serde_json::to_value(j).unwrap()) 134 | .collect() 135 | } 136 | 137 | fn get_stats_value(add: &Add, key: &str) -> String { 138 | let v: Value = serde_json::from_str(add.stats.as_ref().unwrap()).unwrap(); 139 | v.as_object().unwrap().get(key).unwrap().to_string() 140 | } 141 | 142 | fn msg(s: String) -> TestMsg { 143 | serde_json::from_str(&s).unwrap() 144 | } 145 | -------------------------------------------------------------------------------- /tests/deserialization_tests.rs: -------------------------------------------------------------------------------- 1 | #[allow(dead_code)] 2 | mod helpers; 3 | 4 | use kafka_delta_ingest::{IngestOptions, MessageFormat, SchemaSource}; 5 | use log::info; 6 | use schema_registry_converter::{ 7 | async_impl::{ 8 | easy_avro::EasyAvroEncoder, 9 | easy_json::EasyJsonEncoder, 10 | schema_registry::{post_schema, SrSettings}, 11 | }, 12 | error::SRCError, 13 | schema_registry_common::{RegisteredSchema, SchemaType, SubjectNameStrategy, SuppliedSchema}, 14 | }; 15 | use serde::{Deserialize, Serialize}; 16 | use serde_json::json; 17 | use serial_test::serial; 18 | use std::path::PathBuf; 19 | use std::str::FromStr; 20 | use url::Url; 21 | 22 | const DEFAULT_AVRO_SCHEMA: &str = r#"{ 23 | "type": "record", 24 | "name": "test", 25 | "fields": [ 26 | {"name": "id", "type": "long"}, 27 | {"name": "name", "type": "string"}, 28 | {"name": "date", "type": "string"} 29 | ] 30 | }"#; 31 | const SCHEMA_PATH: &str = "tests/data/default_schema.avro"; 32 | const DEFAULT_ID: i64 = 1; 33 | const DEFAULT_DATE: &str = "2023-06-30"; 34 | const DEFAULT_NAME: &str = "test"; 35 | const SCHEMA_REGISTRY_ADDRESS: &str = "http://localhost:8081"; 36 | 37 | #[tokio::test] 38 | #[serial] 39 | async fn test_json_default() { 40 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 41 | "test_json_default", 42 | default_schema(), 43 | vec!["date"], 44 | 1, 45 | Some(IngestOptions { 46 | app_id: "test_json_default".to_string(), 47 | // buffer for 5 seconds before flush 48 | allowed_latency: 5, 49 | // large value - avoid flushing on num messages 50 | max_messages_per_batch: 1, 51 | // large value - avoid flushing on file size 52 | min_bytes_per_file: 1000000, 53 | ..Default::default() 54 | }), 55 | ) 56 | .await; 57 | 58 | let data = json!({ 59 | "name": DEFAULT_NAME, 60 | "id": DEFAULT_ID, 61 | "date": DEFAULT_DATE, 62 | }); 63 | info!("Writing test message"); 64 | helpers::send_json(&producer, &topic, &data).await; 65 | // wait for latency flush 66 | helpers::wait_until_version_created(&table, 1); 67 | 68 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 69 | assert_eq!(v1_rows.len(), 1); 70 | assert_defaults(&v1_rows[0]); 71 | 72 | token.cancel(); 73 | kdi.await.unwrap(); 74 | rt.shutdown_background(); 75 | } 76 | 77 | #[tokio::test] 78 | #[serial] 79 | async fn test_json_with_args() { 80 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 81 | 
"test_json_with_args", 82 | default_schema(), 83 | vec!["date"], 84 | 1, 85 | Some(IngestOptions { 86 | app_id: "test_json_with_args".to_string(), 87 | // buffer for 5 seconds before flush 88 | allowed_latency: 5, 89 | // large value - avoid flushing on num messages 90 | max_messages_per_batch: 1, 91 | // large value - avoid flushing on file size 92 | min_bytes_per_file: 1000000, 93 | input_format: MessageFormat::Json(SchemaSource::None), 94 | ..Default::default() 95 | }), 96 | ) 97 | .await; 98 | 99 | let data = json!({ 100 | "name": DEFAULT_NAME, 101 | "id": DEFAULT_ID, 102 | "date": DEFAULT_DATE, 103 | }); 104 | info!("Writing test message"); 105 | helpers::send_json(&producer, &topic, &data).await; 106 | // wait for latency flush 107 | helpers::wait_until_version_created(&table, 1); 108 | 109 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 110 | assert_eq!(v1_rows.len(), 1); 111 | assert_defaults(&v1_rows[0]); 112 | 113 | token.cancel(); 114 | kdi.await.unwrap(); 115 | rt.shutdown_background(); 116 | } 117 | 118 | #[tokio::test] 119 | #[serial] 120 | async fn test_json_with_registry() { 121 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 122 | "test_json_with_registry", 123 | default_schema(), 124 | vec!["date"], 125 | 1, 126 | Some(IngestOptions { 127 | app_id: "test_json_with_registry".to_string(), 128 | // buffer for 5 seconds before flush 129 | allowed_latency: 5, 130 | // large value - avoid flushing on num messages 131 | max_messages_per_batch: 1, 132 | // large value - avoid flushing on file size 133 | min_bytes_per_file: 1000000, 134 | input_format: MessageFormat::Json(SchemaSource::SchemaRegistry( 135 | Url::parse(SCHEMA_REGISTRY_ADDRESS).unwrap(), 136 | )), 137 | ..Default::default() 138 | }), 139 | ) 140 | .await; 141 | 142 | let data = json!({ 143 | "name": DEFAULT_NAME, 144 | "id": DEFAULT_ID, 145 | "date": DEFAULT_DATE, 146 | }); 147 | info!("Writing test message"); 148 | prepare_json_schema(topic.clone()).await.unwrap(); 149 | let encoded = json_encode(&data, topic.clone()).await.unwrap(); 150 | helpers::send_encoded(&producer, &topic, encoded).await; 151 | // wait for latency flush 152 | helpers::wait_until_version_created(&table, 1); 153 | 154 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 155 | assert_eq!(v1_rows.len(), 1); 156 | assert_defaults(&v1_rows[0]); 157 | token.cancel(); 158 | kdi.await.unwrap(); 159 | rt.shutdown_background(); 160 | } 161 | 162 | #[tokio::test] 163 | #[serial] 164 | async fn test_avro_default() { 165 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 166 | "test_avro_default", 167 | default_schema(), 168 | vec!["date"], 169 | 1, 170 | Some(IngestOptions { 171 | app_id: "test_avro_default".to_string(), 172 | // buffer for 5 seconds before flush 173 | allowed_latency: 5, 174 | // large value - avoid flushing on num messages 175 | max_messages_per_batch: 1, 176 | // large value - avoid flushing on file size 177 | min_bytes_per_file: 1000000, 178 | input_format: MessageFormat::Avro(SchemaSource::None), 179 | ..Default::default() 180 | }), 181 | ) 182 | .await; 183 | 184 | let schema = apache_avro::Schema::parse_str(DEFAULT_AVRO_SCHEMA).unwrap(); 185 | let mut writer = apache_avro::Writer::new(&schema, Vec::new()); 186 | let mut record = apache_avro::types::Record::new(writer.schema()).unwrap(); 187 | record.put("id", DEFAULT_ID); 188 | record.put("name", DEFAULT_NAME); 189 | record.put("date", DEFAULT_DATE); 190 | 
writer.append(record).unwrap(); 191 | let encoded = writer.into_inner().unwrap(); 192 | helpers::send_encoded(&producer, &topic, encoded).await; 193 | // wait for latency flush 194 | helpers::wait_until_version_created(&table, 1); 195 | 196 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 197 | assert_eq!(v1_rows.len(), 1); 198 | assert_defaults(&v1_rows[0]); 199 | 200 | token.cancel(); 201 | kdi.await.unwrap(); 202 | rt.shutdown_background(); 203 | } 204 | 205 | #[tokio::test] 206 | #[serial] 207 | async fn test_avro_with_file() { 208 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 209 | "test_avro_with_file", 210 | default_schema(), 211 | vec!["date"], 212 | 1, 213 | Some(IngestOptions { 214 | app_id: "test_avro_with_file".to_string(), 215 | // buffer for 5 seconds before flush 216 | allowed_latency: 5, 217 | // large value - avoid flushing on num messages 218 | max_messages_per_batch: 1, 219 | // large value - avoid flushing on file size 220 | min_bytes_per_file: 1000000, 221 | input_format: MessageFormat::Avro(SchemaSource::File( 222 | PathBuf::from_str(SCHEMA_PATH).unwrap(), 223 | )), 224 | ..Default::default() 225 | }), 226 | ) 227 | .await; 228 | 229 | let schema = apache_avro::Schema::parse_str(DEFAULT_AVRO_SCHEMA).unwrap(); 230 | let mut writer = apache_avro::Writer::new(&schema, Vec::new()); 231 | let mut record = apache_avro::types::Record::new(writer.schema()).unwrap(); 232 | record.put("id", DEFAULT_ID); 233 | record.put("name", DEFAULT_NAME); 234 | record.put("date", DEFAULT_DATE); 235 | writer.append(record).unwrap(); 236 | let encoded = writer.into_inner().unwrap(); 237 | helpers::send_encoded(&producer, &topic, encoded).await; 238 | // wait for latency flush 239 | helpers::wait_until_version_created(&table, 1); 240 | 241 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 242 | assert_eq!(v1_rows.len(), 1); 243 | assert_defaults(&v1_rows[0]); 244 | 245 | token.cancel(); 246 | kdi.await.unwrap(); 247 | rt.shutdown_background(); 248 | } 249 | 250 | #[tokio::test] 251 | #[serial] 252 | async fn test_avro_with_registry() { 253 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 254 | "test_avro_with_registry", 255 | default_schema(), 256 | vec!["date"], 257 | 1, 258 | Some(IngestOptions { 259 | app_id: "flush_when_latency_expires".to_string(), 260 | // buffer for 5 seconds before flush 261 | allowed_latency: 5, 262 | // large value - avoid flushing on num messages 263 | max_messages_per_batch: 1, 264 | // large value - avoid flushing on file size 265 | min_bytes_per_file: 1000000, 266 | input_format: MessageFormat::Avro(SchemaSource::SchemaRegistry( 267 | Url::parse(SCHEMA_REGISTRY_ADDRESS).unwrap(), 268 | )), 269 | ..Default::default() 270 | }), 271 | ) 272 | .await; 273 | 274 | let data = TestMsg { 275 | id: DEFAULT_ID, 276 | name: String::from(DEFAULT_NAME), 277 | date: String::from(DEFAULT_DATE), 278 | }; 279 | prepare_avro_schema(topic.clone()).await.unwrap(); 280 | let encoded = avro_encode(&data, topic.clone()).await.unwrap(); 281 | helpers::send_encoded(&producer, &topic, encoded).await; 282 | // wait for latency flush 283 | helpers::wait_until_version_created(&table, 1); 284 | 285 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 286 | assert_eq!(v1_rows.len(), 1); 287 | assert_defaults(&v1_rows[0]); 288 | 289 | token.cancel(); 290 | kdi.await.unwrap(); 291 | rt.shutdown_background(); 292 | } 293 | 294 | #[tokio::test] 295 
| #[serial] 296 | async fn test_avro_single_object_encoding_with_file() { 297 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 298 | "test_avro_seo_with_file", 299 | default_schema(), 300 | vec!["date"], 301 | 1, 302 | Some(IngestOptions { 303 | app_id: "test_avro_seo_with_file".to_string(), 304 | // buffer for 5 seconds before flush 305 | allowed_latency: 5, 306 | // large value - avoid flushing on num messages 307 | max_messages_per_batch: 1, 308 | // large value - avoid flushing on file size 309 | min_bytes_per_file: 1000000, 310 | input_format: MessageFormat::SoeAvro(PathBuf::from_str(SCHEMA_PATH).unwrap()), 311 | ..Default::default() 312 | }), 313 | ) 314 | .await; 315 | 316 | let schema = apache_avro::Schema::parse_str(DEFAULT_AVRO_SCHEMA).unwrap(); 317 | let mut writer = 318 | apache_avro::GenericSingleObjectWriter::new_with_capacity(&schema, 100).unwrap(); 319 | 320 | let mut record = apache_avro::types::Record::new(&schema).unwrap(); 321 | record.put("id", DEFAULT_ID); 322 | record.put("name", DEFAULT_NAME); 323 | record.put("date", DEFAULT_DATE); 324 | 325 | let mut write_buf = Vec::with_capacity(100); 326 | let length = writer.write_value(record.into(), &mut write_buf).unwrap(); 327 | 328 | let encoded = write_buf[0..length].to_owned(); 329 | helpers::send_encoded(&producer, &topic, encoded).await; 330 | // wait for latency flush 331 | helpers::wait_until_version_created(&table, 1); 332 | 333 | let v1_rows: Vec = helpers::read_table_content_at_version_as(&table, 1).await; 334 | assert_eq!(v1_rows.len(), 1); 335 | assert_defaults(&v1_rows[0]); 336 | 337 | token.cancel(); 338 | kdi.await.unwrap(); 339 | rt.shutdown_background(); 340 | } 341 | 342 | #[derive(Clone, Serialize, Deserialize, Debug)] 343 | struct TestMsg { 344 | id: i64, 345 | date: String, 346 | name: String, 347 | } 348 | 349 | fn default_settings() -> SrSettings { 350 | SrSettings::new(String::from(SCHEMA_REGISTRY_ADDRESS)) 351 | } 352 | 353 | async fn avro_encode(item: impl Serialize, topic: String) -> Result, SRCError> { 354 | EasyAvroEncoder::new(default_settings()) 355 | .encode_struct(item, &SubjectNameStrategy::RecordNameStrategy(topic)) 356 | .await 357 | } 358 | 359 | async fn json_encode(value: &serde_json::Value, topic: String) -> Result, SRCError> { 360 | EasyJsonEncoder::new(default_settings()) 361 | .encode(value, SubjectNameStrategy::RecordNameStrategy(topic)) 362 | .await 363 | } 364 | 365 | async fn prepare_json_schema(topic: String) -> Result { 366 | let settings = default_settings(); 367 | let schema = SuppliedSchema { 368 | name: None, 369 | schema_type: SchemaType::Json, 370 | schema: String::from( 371 | r#"{"schemaType": "JSON", "schema": "{\"type\": \"object\", \"properties\": {\"name\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}, \"id\": {\"type\": \"number\"}}}"}"#, 372 | ), 373 | references: vec![], 374 | }; 375 | post_schema(&settings, topic, schema).await 376 | } 377 | 378 | async fn prepare_avro_schema(topic: String) -> Result { 379 | let settings = default_settings(); 380 | let schema = SuppliedSchema { 381 | name: None, 382 | schema_type: SchemaType::Avro, 383 | schema: String::from(DEFAULT_AVRO_SCHEMA), 384 | references: vec![], 385 | }; 386 | post_schema(&settings, topic, schema).await 387 | } 388 | 389 | fn assert_defaults(msg: &TestMsg) { 390 | assert_eq!(msg.id, DEFAULT_ID); 391 | assert_eq!(msg.name, DEFAULT_NAME); 392 | assert_eq!(msg.date, DEFAULT_DATE); 393 | } 394 | 395 | fn default_schema() -> serde_json::Value { 396 | json!({ 397 | 
"name": "string", 398 | "id": "integer", 399 | "date": "string", 400 | }) 401 | } 402 | -------------------------------------------------------------------------------- /tests/emails_azure_blob_tests.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "azure")] 2 | #[allow(dead_code)] 3 | mod helpers; 4 | 5 | use std::collections::HashMap; 6 | use std::env; 7 | use std::thread; 8 | use std::time::Duration; 9 | use time::OffsetDateTime; 10 | 11 | use serial_test::serial; 12 | use uuid::Uuid; 13 | 14 | use kafka_delta_ingest::IngestOptions; 15 | 16 | use helpers::*; 17 | 18 | use azure_storage::{prelude::BlobSasPermissions, shared_access_signature::SasProtocol}; 19 | 20 | #[tokio::test(flavor = "multi_thread")] 21 | #[serial] 22 | async fn when_both_workers_started_simultaneously_azure() { 23 | run_emails_azure_tests(false).await; 24 | } 25 | 26 | #[tokio::test(flavor = "multi_thread")] 27 | #[serial] 28 | async fn when_rebalance_happens_azure() { 29 | run_emails_azure_tests(true).await; 30 | } 31 | 32 | async fn run_emails_azure_tests(initiate_rebalance: bool) { 33 | helpers::init_logger(); 34 | deltalake_azure::register_handlers(None); 35 | let topic = format!("emails_azure-{}", Uuid::new_v4()); 36 | let table = prepare_table(&topic).await; 37 | let options = create_options(); 38 | let scope = TestScope::new(&topic, &table, options).await; 39 | 40 | let w1 = scope.create_and_start(helpers::WORKER_1).await; 41 | 42 | // in order to initiate rebalance we first send messages, 43 | // ensure that worker 1 consumes some of them and then create worker 2, 44 | // otherwise, to proceed without rebalance the two workers has to be created simultaneously 45 | let w2 = if initiate_rebalance { 46 | scope.send_messages(TEST_TOTAL_MESSAGES).await; 47 | thread::sleep(Duration::from_secs(1)); 48 | scope.create_and_start(helpers::WORKER_2).await 49 | } else { 50 | let w = scope.create_and_start(helpers::WORKER_2).await; 51 | thread::sleep(Duration::from_secs(4)); 52 | scope.send_messages(TEST_TOTAL_MESSAGES).await; 53 | w 54 | }; 55 | 56 | // this will end up with more app_ids than actual, 57 | // since we're not sure which partitions will get each worker 58 | let partitions = create_partitions_app_ids(helpers::TEST_PARTITIONS); 59 | 60 | // wait until the destination table will get every expected message, we check this summing up 61 | // the each offset of each partition to get the TOTAL_MESSAGES value 62 | scope 63 | .wait_on_total_offset(partitions, helpers::TEST_TOTAL_MESSAGES) 64 | .await; 65 | 66 | println!("Waiting on workers futures to exit..."); 67 | // wait until workers are completely stopped 68 | let w1_result = w1.await; 69 | println!("Worker 1 finished - {:?}", w1_result); 70 | 71 | let w2_result = w2.await; 72 | println!("Worker 2 finished - {:?}", w2_result); 73 | 74 | scope.validate_data().await; 75 | scope.shutdown(); 76 | } 77 | 78 | async fn prepare_table(topic: &str) -> String { 79 | let container_client = azure_storage_blobs::prelude::ClientBuilder::emulator() 80 | .container_client(helpers::test_s3_bucket()); 81 | let source_blob = 82 | container_client.blob_client(format!("emails/_delta_log/00000000000000000000.json")); 83 | let sas_url = { 84 | let now = OffsetDateTime::now_utc(); 85 | let later = now + time::Duration::hours(1); 86 | let sas = source_blob 87 | .shared_access_signature( 88 | BlobSasPermissions { 89 | read: true, 90 | ..Default::default() 91 | }, 92 | later, 93 | ) 94 | .await 95 | .unwrap() 96 | .start(now) 97 | 
.protocol(SasProtocol::HttpHttps); 98 | source_blob.generate_signed_blob_url(&sas).unwrap() 99 | }; 100 | container_client 101 | .blob_client(format!("{}/_delta_log/00000000000000000000.json", topic)) 102 | .copy_from_url(sas_url) 103 | .await 104 | .unwrap(); 105 | 106 | format!("az://{}/{}", helpers::test_s3_bucket(), topic) 107 | } 108 | 109 | fn create_partitions_app_ids(num_p: i32) -> Vec { 110 | let mut vector = Vec::new(); 111 | for n in 0..num_p { 112 | vector.push(format!("{}-{}", helpers::TEST_APP_ID, n)); 113 | } 114 | vector 115 | } 116 | 117 | fn create_options() -> IngestOptions { 118 | env::set_var("AZURE_STORAGE_USE_EMULATOR", "true"); 119 | env::set_var("AZURE_ACCOUNT_NAME", "devstoreaccount1"); 120 | env::set_var( 121 | "AZURE_ACCESS_KEY", 122 | "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", 123 | ); 124 | env::set_var("AZURE_STORAGE_CONTAINER_NAME", "tests"); 125 | env::set_var("AZURE_STORAGE_ALLOW_HTTP", "1"); 126 | env::set_var("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000"); 127 | env::set_var( 128 | "AZURE_STORAGE_CONNECTION_STRING", 129 | "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://localhost:10000/devstoreaccount1;QueueEndpoint=http://localhost:10001/devstoreaccount1;"); 130 | 131 | let mut additional_kafka_settings = HashMap::new(); 132 | additional_kafka_settings.insert("auto.offset.reset".to_string(), "earliest".to_string()); 133 | let additional_kafka_settings = Some(additional_kafka_settings); 134 | 135 | let allowed_latency = 2; 136 | let max_messages_per_batch = 10; 137 | let min_bytes_per_file = 370; 138 | 139 | let mut transforms = HashMap::new(); 140 | transforms.insert("date".to_string(), "substr(timestamp,`0`,`10`)".to_string()); 141 | transforms.insert("_kafka_offset".to_string(), "kafka.offset".to_string()); 142 | 143 | IngestOptions { 144 | transforms, 145 | kafka_brokers: helpers::test_broker(), 146 | consumer_group_id: TEST_CONSUMER_GROUP_ID.to_string(), 147 | app_id: TEST_APP_ID.to_string(), 148 | additional_kafka_settings, 149 | allowed_latency, 150 | max_messages_per_batch, 151 | min_bytes_per_file, 152 | write_checkpoints: true, 153 | ..Default::default() 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /tests/emails_s3_tests.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "s3")] 2 | 3 | #[allow(dead_code)] 4 | mod helpers; 5 | 6 | use std::collections::HashMap; 7 | use std::env; 8 | use std::io::Read; 9 | use std::thread; 10 | use std::time::Duration; 11 | 12 | use serial_test::serial; 13 | use uuid::Uuid; 14 | 15 | use kafka_delta_ingest::IngestOptions; 16 | use rusoto_core::Region; 17 | use rusoto_s3::{CopyObjectRequest, PutObjectRequest, S3}; 18 | 19 | use helpers::*; 20 | 21 | #[tokio::test(flavor = "multi_thread")] 22 | #[serial] 23 | async fn when_both_workers_started_simultaneously() { 24 | run_emails_s3_tests(false).await; 25 | } 26 | 27 | #[tokio::test(flavor = "multi_thread")] 28 | #[serial] 29 | async fn when_rebalance_happens() { 30 | run_emails_s3_tests(true).await; 31 | } 32 | 33 | async fn run_emails_s3_tests(initiate_rebalance: bool) { 34 | deltalake_aws::register_handlers(None); 35 | helpers::init_logger(); 36 | let topic = format!("emails_s3-{}", Uuid::new_v4()); 37 | let table = prepare_table(&topic).await; 38 | let options = 
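// Editor's note (added commentary): the S3 variant of this two-worker test relies on
// the DynamoDB-based lock (`AWS_S3_LOCKING_PROVIDER=dynamodb` plus the
// `DYNAMO_LOCK_*` variables set in `create_options` below) so that both workers can
// commit concurrently to the same Delta table without overwriting each other's
// transaction log entries.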
create_options(helpers::WORKER_1); 39 | let scope = TestScope::new(&topic, &table, options).await; 40 | 41 | let w1 = scope.create_and_start(WORKER_1).await; 42 | 43 | // in order to initiate a rebalance we first send messages, 44 | // ensure that worker 1 consumes some of them and only then create worker 2; 45 | // otherwise, to proceed without a rebalance, the two workers have to be created simultaneously 46 | let w2 = if initiate_rebalance { 47 | scope.send_messages(TEST_TOTAL_MESSAGES).await; 48 | thread::sleep(Duration::from_secs(1)); 49 | scope.create_and_start(WORKER_2).await 50 | } else { 51 | let w = scope.create_and_start(WORKER_2).await; 52 | thread::sleep(Duration::from_secs(4)); 53 | scope.send_messages(TEST_TOTAL_MESSAGES).await; 54 | w 55 | }; 56 | 57 | // this will end up with more app_ids than are actually used, 58 | // since we're not sure which partitions each worker will get 59 | let partitions = create_partitions_app_ids(TEST_PARTITIONS); 60 | 61 | // wait until the destination table has received every expected message; we check this by summing 62 | // up the latest offset of each partition until we reach the TEST_TOTAL_MESSAGES value 63 | scope 64 | .wait_on_total_offset(partitions, TEST_TOTAL_MESSAGES) 65 | .await; 66 | 67 | println!("Waiting on worker futures to exit..."); 68 | // wait until workers are completely stopped 69 | let w1_result = w1.await; 70 | println!("Worker 1 finished - {:?}", w1_result); 71 | 72 | let w2_result = w2.await; 73 | println!("Worker 2 finished - {:?}", w2_result); 74 | 75 | scope.validate_data().await; 76 | scope.shutdown(); 77 | } 78 | 79 | fn create_options(name: &str) -> IngestOptions { 80 | env::set_var("AWS_ENDPOINT_URL", helpers::test_aws_endpoint()); 81 | env::set_var("AWS_S3_LOCKING_PROVIDER", "dynamodb"); 82 | env::set_var("AWS_REGION", "us-east-2"); 83 | env::set_var("AWS_STORAGE_ALLOW_HTTP", "true"); 84 | env::set_var("DELTA_DYNAMO_TABLE_NAME", "locks"); 85 | env::set_var("DYNAMO_LOCK_OWNER_NAME", name); 86 | env::set_var("DYNAMO_LOCK_PARTITION_KEY_VALUE", "emails_s3_tests"); 87 | env::set_var("DYNAMO_LOCK_REFRESH_PERIOD_MILLIS", "100"); 88 | env::set_var("DYNAMO_LOCK_ADDITIONAL_TIME_TO_WAIT_MILLIS", "100"); 89 | env::set_var("DYNAMO_LOCK_LEASE_DURATION", "2"); 90 | 91 | let mut additional_kafka_settings = HashMap::new(); 92 | additional_kafka_settings.insert("auto.offset.reset".to_string(), "earliest".to_string()); 93 | let additional_kafka_settings = Some(additional_kafka_settings); 94 | 95 | let allowed_latency = 2; 96 | let max_messages_per_batch = 10; 97 | let min_bytes_per_file = 370; 98 | 99 | let mut transforms = HashMap::new(); 100 | transforms.insert("date".to_string(), "substr(timestamp,`0`,`10`)".to_string()); 101 | transforms.insert("_kafka_offset".to_string(), "kafka.offset".to_string()); 102 | 103 | IngestOptions { 104 | transforms, 105 | kafka_brokers: helpers::test_broker(), 106 | consumer_group_id: helpers::TEST_CONSUMER_GROUP_ID.to_string(), 107 | app_id: helpers::TEST_APP_ID.to_string(), 108 | additional_kafka_settings, 109 | allowed_latency, 110 | max_messages_per_batch, 111 | min_bytes_per_file, 112 | write_checkpoints: true, 113 | ..Default::default() 114 | } 115 | } 116 | 117 | async fn prepare_table(topic: &str) -> String { 118 | match env::var("AWS_ACCESS_KEY_ID") { 119 | Err(_) => env::set_var("AWS_ACCESS_KEY_ID", "test"), 120 | Ok(_) => {} 121 | } 122 | match env::var("AWS_SECRET_ACCESS_KEY") { 123 | Err(_) => env::set_var("AWS_SECRET_ACCESS_KEY", "test"), 124 | Ok(_) => {} 125 | } 126 | 127 | let s3 = 
rusoto_s3::S3Client::new(Region::Custom { 128 | name: "custom".to_string(), 129 | endpoint: helpers::test_aws_endpoint(), 130 | }); 131 | 132 | /* 133 | * Copy the local fixture to create a simple delta table in storage. 134 | */ 135 | let mut buf = vec![]; 136 | let _original_log = 137 | std::fs::File::open("tests/data/emails/_delta_log/00000000000000000000.json") 138 | .unwrap() 139 | .read_to_end(&mut buf); 140 | 141 | s3.put_object(PutObjectRequest { 142 | bucket: helpers::test_s3_bucket(), 143 | body: Some(buf.into()), 144 | key: "emails/_delta_log/00000000000000000000.json".into(), 145 | ..Default::default() 146 | }) 147 | .await 148 | .unwrap(); 149 | 150 | s3.copy_object(CopyObjectRequest { 151 | bucket: helpers::test_s3_bucket(), 152 | key: format!("{}/_delta_log/00000000000000000000.json", topic), 153 | copy_source: format!( 154 | "/{}/emails/_delta_log/00000000000000000000.json", 155 | helpers::test_s3_bucket(), 156 | ), 157 | ..Default::default() 158 | }) 159 | .await 160 | .unwrap(); 161 | 162 | format!("s3://{}/{}", helpers::test_s3_bucket(), topic) 163 | } 164 | 165 | fn create_partitions_app_ids(num_p: i32) -> Vec<String> { 166 | let mut vector = Vec::new(); 167 | for n in 0..num_p { 168 | vector.push(format!("{}-{}", TEST_APP_ID, n)); 169 | } 170 | vector 171 | } 172 | -------------------------------------------------------------------------------- /tests/json/web_requests-100K.json.tar.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bf933bd75dfaffe95c9678a5c7be1a835531e0bb6e28d04907afccc4671d2b2b 3 | size 5641438 4 | -------------------------------------------------------------------------------- /tests/offset_tests.rs: -------------------------------------------------------------------------------- 1 | use deltalake_core::protocol::Stats; 2 | use deltalake_core::DeltaTable; 3 | use log::*; 4 | use rdkafka::{producer::Producer, util::Timeout}; 5 | use serde::{Deserialize, Serialize}; 6 | use serde_json::json; 7 | use serial_test::serial; 8 | use uuid::Uuid; 9 | 10 | use std::path::Path; 11 | 12 | use kafka_delta_ingest::{AutoOffsetReset, IngestOptions}; 13 | #[allow(dead_code)] 14 | mod helpers; 15 | 16 | #[derive(Debug, Serialize, Deserialize)] 17 | struct TestMsg { 18 | id: u64, 19 | color: String, 20 | } 21 | 22 | impl TestMsg { 23 | fn new(id: u64) -> Self { 24 | Self { 25 | id, 26 | color: "default".to_string(), 27 | } 28 | } 29 | } 30 | 31 | #[tokio::test] 32 | async fn zero_offset_issue() { 33 | let table = "./tests/data/zero_offset"; 34 | helpers::init_logger(); 35 | let topic = format!("zero_offset_issue_{}", Uuid::new_v4()); 36 | 37 | helpers::create_topic(&topic, 1).await; 38 | 39 | let (kdi, token, rt) = helpers::create_kdi( 40 | &topic, 41 | table, 42 | IngestOptions { 43 | app_id: "zero_offset".to_string(), 44 | allowed_latency: 5, 45 | max_messages_per_batch: 1, 46 | min_bytes_per_file: 20, 47 | ..Default::default() 48 | }, 49 | ); 50 | 51 | { 52 | // check that there's only 1 record in the table 53 | let table = deltalake_core::open_table(table).await.unwrap(); 54 | assert_eq!(table.version(), 1); 55 | assert_eq!(count_records(table), 1); 56 | } 57 | 58 | let producer = helpers::create_producer(); 59 | 60 | // submit 3 messages to kafka, but only the 2nd and 3rd should go in, since msg 0:0 is already in delta 61 | for i in 0..3 { 62 | helpers::send_json( 63 | &producer, 64 | &topic, 65 | &serde_json::to_value(TestMsg::new(i)).unwrap(), 66 | ) 67 | .await; 68 | } 69 | 70 | let v2 = 
Path::new("./tests/data/zero_offset/_delta_log/00000000000000000002.json"); 71 | let v3 = Path::new("./tests/data/zero_offset/_delta_log/00000000000000000003.json"); 72 | 73 | helpers::wait_until_file_created(v2); 74 | helpers::wait_until_file_created(v3); 75 | token.cancel(); 76 | // if this succeeds, it means we successfully sought to offset 0, i.e. Offset::Beginning 77 | kdi.await.unwrap(); 78 | rt.shutdown_background(); 79 | 80 | // check that there are only 3 records 81 | let table = deltalake_core::open_table(table).await.unwrap(); 82 | assert_eq!(table.version(), 3); 83 | assert_eq!(count_records(table), 3); 84 | 85 | // cleanup 86 | std::fs::remove_file(v2).unwrap(); 87 | std::fs::remove_file(v3).unwrap(); 88 | } 89 | 90 | fn count_records(table: DeltaTable) -> i64 { 91 | let mut count = 0; 92 | 93 | if let Ok(adds) = table.state.unwrap().file_actions() { 94 | for add in adds.iter() { 95 | if let Some(stats) = add.stats.as_ref() { 96 | // as of deltalake-core 0.18.0 get_stats_parsed() only returns data when loaded 97 | // from checkpoints, so manual parsing is necessary 98 | let stats: Stats = serde_json::from_str(stats).unwrap_or(Stats::default()); 99 | count += stats.num_records; 100 | } 101 | } 102 | } 103 | count 104 | } 105 | 106 | #[tokio::test] 107 | #[serial] 108 | async fn test_start_from_explicit() { 109 | helpers::init_logger(); 110 | 111 | let table = helpers::create_local_table( 112 | json!({ 113 | "id": "integer", 114 | "color": "string", 115 | }), 116 | vec!["color"], 117 | "starting_offsets_explicit", 118 | ); 119 | 120 | let topic = format!("starting_offsets_explicit_{}", uuid::Uuid::new_v4()); 121 | helpers::create_topic(&topic, 1).await; 122 | 123 | let producer = helpers::create_producer(); 124 | 125 | // Send messages to Kafka before starting kafka-delta-ingest 126 | for m in create_generator(1).take(10) { 127 | info!("Writing test message"); 128 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 129 | } 130 | 131 | producer.flush(Timeout::Never); 132 | 133 | debug!("Sent test messages to Kafka"); 134 | 135 | // Start ingest 136 | let (kdi, token, rt) = helpers::create_kdi( 137 | &topic, 138 | &table, 139 | IngestOptions { 140 | app_id: "starting_offsets_explicit".to_string(), 141 | allowed_latency: 20, 142 | max_messages_per_batch: 10, 143 | min_bytes_per_file: 10, 144 | seek_offsets: Some(vec![(0, 3)]), // starting offset is going to be 4 145 | ..Default::default() 146 | }, 147 | ); 148 | 149 | // Wait for the rebalance assignment 150 | std::thread::sleep(std::time::Duration::from_secs(8)); 151 | 152 | // Send more messages to Kafka now that kafka-delta-ingest is running 153 | for m in create_generator(11).take(5) { 154 | info!("Writing test message"); 155 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 156 | } 157 | 158 | info!("Waiting for version 1"); 159 | helpers::wait_until_version_created(&table, 1); 160 | 161 | token.cancel(); 162 | kdi.await.unwrap(); 163 | rt.shutdown_background(); 164 | 165 | let written_ids: Vec<u64> = helpers::read_table_content_as_jsons(&table) 166 | .await 167 | .iter() 168 | .map(|v| serde_json::from_value::<TestMsg>(v.clone()).unwrap().id) 169 | .collect(); 170 | 171 | assert_eq!((5u64..15).collect::<Vec<u64>>(), written_ids); 172 | 173 | helpers::cleanup_kdi(&topic, &table).await; 174 | } 175 | 176 | #[tokio::test] 177 | #[serial] 178 | async fn test_start_from_earliest() { 179 | helpers::init_logger(); 180 | 181 | let table = helpers::create_local_table( 182 | json!({ 183 | "id": 
"integer", 184 | "color": "string", 185 | }), 186 | vec!["color"], 187 | "starting_offsets_earliest", 188 | ); 189 | 190 | let topic = format!("starting_offsets_earliest{}", uuid::Uuid::new_v4()); 191 | helpers::create_topic(&topic, 3).await; 192 | 193 | let producer = helpers::create_producer(); 194 | 195 | let messages: Vec<TestMsg> = create_generator(1).take(10).collect(); 196 | 197 | // Send messages to Kafka before starting kafka-delta-ingest 198 | for m in messages.iter().take(15) { 199 | info!("Writing test message"); 200 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 201 | } 202 | 203 | // Start ingest 204 | let (kdi, token, rt) = helpers::create_kdi( 205 | &topic, 206 | &table, 207 | IngestOptions { 208 | app_id: "starting_offsets_earliest".to_string(), 209 | allowed_latency: 10, 210 | max_messages_per_batch: 10, 211 | min_bytes_per_file: 10, 212 | auto_offset_reset: AutoOffsetReset::Earliest, 213 | ..Default::default() 214 | }, 215 | ); 216 | 217 | info!("Waiting for version 1"); 218 | helpers::wait_until_version_created(&table, 1); 219 | 220 | token.cancel(); 221 | kdi.await.unwrap(); 222 | rt.shutdown_background(); 223 | 224 | let mut written_ids: Vec<u64> = helpers::read_table_content_as_jsons(&table) 225 | .await 226 | .iter() 227 | .map(|v| serde_json::from_value::<TestMsg>(v.clone()).unwrap().id) 228 | .collect(); 229 | written_ids.sort(); 230 | 231 | assert_eq!((1u64..11).collect::<Vec<u64>>(), written_ids); 232 | 233 | helpers::cleanup_kdi(&topic, &table).await; 234 | } 235 | 236 | #[tokio::test] 237 | #[serial] 238 | async fn test_start_from_latest() { 239 | helpers::init_logger(); 240 | 241 | let table = helpers::create_local_table( 242 | json!({ 243 | "id": "integer", 244 | "color": "string", 245 | }), 246 | vec!["color"], 247 | "starting_offsets_latest", 248 | ); 249 | 250 | let topic = format!("starting_offsets_latest{}", uuid::Uuid::new_v4()); 251 | helpers::create_topic(&topic, 1).await; 252 | 253 | let producer = helpers::create_producer(); 254 | 255 | // Send messages to Kafka before starting kafka-delta-ingest 256 | // offsets for this first set should be 0...4 257 | for m in create_generator(1).take(5) { 258 | info!("Writing test message"); 259 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 260 | } 261 | 262 | producer.flush(Timeout::Never); 263 | 264 | // Start ingest 265 | let (kdi, token, rt) = helpers::create_kdi( 266 | &topic, 267 | &table, 268 | IngestOptions { 269 | app_id: "starting_offsets_latest".to_string(), 270 | allowed_latency: 10, 271 | max_messages_per_batch: 10, 272 | min_bytes_per_file: 10, 273 | auto_offset_reset: AutoOffsetReset::Latest, 274 | ..Default::default() 275 | }, 276 | ); 277 | 278 | // Wait for the rebalance assignment so the position of latest is clear. 279 | // The precise starting offset in a production environment will depend on message rate, but "latest is what latest is". 280 | std::thread::sleep(std::time::Duration::from_secs(3)); 281 | 282 | // Send one message to trigger the seek to latest 283 | // This skips a message to account for the seek 284 | for m in create_generator(6).take(1) { 285 | info!("Writing test message"); 286 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 287 | } 288 | 289 | // Wait until the seek to latest has completed before sending the messages that should be ingested. 
290 | std::thread::sleep(std::time::Duration::from_secs(8)); 291 | 292 | // These 10 messages should be in the delta log 293 | for m in create_generator(7).take(10) { 294 | info!("Writing test message"); 295 | helpers::send_json(&producer, &topic, &serde_json::to_value(m).unwrap()).await; 296 | } 297 | 298 | info!("Waiting for version 1"); 299 | helpers::wait_until_version_created(&table, 1); 300 | 301 | token.cancel(); 302 | kdi.await.unwrap(); 303 | rt.shutdown_background(); 304 | 305 | let mut written_ids: Vec<u64> = helpers::read_table_content_as_jsons(&table) 306 | .await 307 | .iter() 308 | .map(|v| serde_json::from_value::<TestMsg>(v.clone()).unwrap().id) 309 | .collect(); 310 | written_ids.sort(); 311 | 312 | // ids should be 7-16 (offsets 6-15) 313 | assert_eq!((7u64..17).collect::<Vec<u64>>(), written_ids); 314 | 315 | helpers::cleanup_kdi(&topic, &table).await; 316 | } 317 | 318 | fn create_generator(starting_id: u64) -> impl Iterator<Item = TestMsg> { 319 | std::iter::successors(Some(starting_id), |n| Some(*n + 1)).map(|n| TestMsg { 320 | id: n, 321 | color: "red".to_string(), 322 | }) 323 | } 324 | 325 | #[derive(Debug, Serialize, Deserialize)] 326 | struct Msg { 327 | id: u32, 328 | city: String, 329 | } 330 | 331 | impl Msg { 332 | fn new(id: u32) -> Self { 333 | Self { 334 | id, 335 | city: "default".to_string(), 336 | } 337 | } 338 | } 339 | 340 | #[tokio::test] 341 | async fn end_at_initial_offsets() { 342 | helpers::init_logger(); 343 | let topic = format!("end_at_offset_{}", Uuid::new_v4()); 344 | 345 | let table = helpers::create_local_table( 346 | json!({ 347 | "id": "integer", 348 | "city": "string", 349 | }), 350 | vec!["city"], 351 | &topic, 352 | ); 353 | let table = table.as_str(); 354 | 355 | helpers::create_topic(&topic, 3).await; 356 | 357 | let producer = helpers::create_producer(); 358 | // submit 15 messages to kafka 359 | for i in 0..15 { 360 | helpers::send_json( 361 | &producer, 362 | &topic, 363 | &serde_json::to_value(Msg::new(i)).unwrap(), 364 | ) 365 | .await; 366 | } 367 | 368 | let (kdi, _token, rt) = helpers::create_kdi( 369 | &topic, 370 | table, 371 | IngestOptions { 372 | app_id: topic.clone(), 373 | allowed_latency: 5, 374 | max_messages_per_batch: 20, 375 | min_bytes_per_file: 20, 376 | end_at_last_offsets: true, 377 | ..Default::default() 378 | }, 379 | ); 380 | 381 | helpers::wait_until_version_created(table, 1); 382 | 383 | { 384 | // check that all 15 records are in the table 385 | let table = deltalake_core::open_table(table).await.unwrap(); 386 | assert_eq!(table.version(), 1); 387 | assert_eq!(count_records(table), 15); 388 | } 389 | 390 | // submit 15 more messages to kafka; these should not be ingested because the worker ends at the initial offsets 391 | for i in 16..31 { 392 | helpers::send_json( 393 | &producer, 394 | &topic, 395 | &serde_json::to_value(Msg::new(i)).unwrap(), 396 | ) 397 | .await; 398 | } 399 | 400 | helpers::expect_termination_within(kdi, 10).await; 401 | rt.shutdown_background(); 402 | 403 | // check that the table still has only 15 records and the version is unchanged 404 | let table = deltalake_core::open_table(table).await.unwrap(); 405 | assert_eq!(table.version(), 1); 406 | assert_eq!(count_records(table), 15); 407 | } 408 | -------------------------------------------------------------------------------- /tests/schema_update_tests.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use serde_json::{json, Value}; 3 | use serial_test::serial; 4 | use std::fs::File; 5 | use std::io::prelude::*; 6 | 7 | #[allow(dead_code)] 8 | mod helpers; 9 | 10 | #[derive(Debug, Serialize, Deserialize, Clone)] 11 | struct 
MsgV1 { 12 | id: u32, 13 | date: String, 14 | } 15 | 16 | #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Clone)] 17 | struct MsgV2 { 18 | id: u32, 19 | color: Option<String>, 20 | date: String, 21 | } 22 | 23 | #[tokio::test] 24 | #[serial] 25 | async fn schema_update_test() { 26 | let (topic, table, producer, kdi, token, rt) = helpers::create_and_run_kdi( 27 | "schema_update", 28 | json!({ 29 | "id": "integer", 30 | "date": "string", 31 | }), 32 | vec!["date"], 33 | 1, 34 | None, 35 | ) 36 | .await; 37 | 38 | let msg_v1 = MsgV1 { 39 | id: 1, 40 | date: "default".to_string(), 41 | }; 42 | 43 | let msg_v2_1 = MsgV2 { 44 | id: 2, 45 | color: Some("red".to_string()), 46 | date: "default".to_string(), 47 | }; 48 | 49 | let msg_v2_2 = MsgV2 { 50 | id: 3, 51 | color: Some("blue".to_string()), 52 | date: "default".to_string(), 53 | }; 54 | 55 | // send msg v1 56 | helpers::send_json( 57 | &producer, 58 | &topic, 59 | &serde_json::to_value(msg_v1.clone()).unwrap(), 60 | ) 61 | .await; 62 | helpers::wait_until_version_created(&table, 1); 63 | 64 | // update the delta schema with the new 'color' column 65 | let new_schema = json!({ 66 | "id": "integer", 67 | "color": "string", 68 | "date": "string", 69 | }); 70 | alter_schema(&table, 2, new_schema, vec!["date"]); 71 | 72 | // send a few messages with the new schema 73 | helpers::send_json( 74 | &producer, 75 | &topic, 76 | &serde_json::to_value(msg_v2_1.clone()).unwrap(), 77 | ) 78 | .await; 79 | helpers::send_json( 80 | &producer, 81 | &topic, 82 | &serde_json::to_value(msg_v2_2.clone()).unwrap(), 83 | ) 84 | .await; 85 | helpers::wait_until_version_created(&table, 4); 86 | 87 | token.cancel(); 88 | kdi.await.unwrap(); 89 | rt.shutdown_background(); 90 | 91 | // retrieve data from the table 92 | let content: Vec<MsgV2> = helpers::read_table_content_as_jsons(&table) 93 | .await 94 | .iter() 95 | .map(|v| serde_json::from_value(v.clone()).unwrap()) 96 | .collect(); 97 | 98 | // msg v1 should be read back as v2 with color: None 99 | let expected = vec![ 100 | MsgV2 { 101 | id: msg_v1.id, 102 | color: None, 103 | date: msg_v1.date.clone(), 104 | }, 105 | msg_v2_1, 106 | msg_v2_2, 107 | ]; 108 | 109 | // and compare the results 110 | assert_eq!(content, expected); 111 | 112 | helpers::cleanup_kdi(&topic, &table).await; 113 | } 114 | 115 | fn alter_schema(table: &str, version: i64, schema: Value, partitions: Vec<&str>) { 116 | let schema = helpers::create_metadata_action_json(schema, &partitions); 117 | let tmp = format!("{}/_delta_log/temp.json", table); 118 | { 119 | let mut file = File::create(&tmp).unwrap(); 120 | writeln!(file, "{}", schema).unwrap(); 121 | file.flush().unwrap(); 122 | } 123 | // rename is atomic, but create+write is not 124 | std::fs::rename(tmp, format!("{}/_delta_log/{:020}.json", table, version)).unwrap(); 125 | } 126 | --------------------------------------------------------------------------------