├── .github └── workflows │ ├── build.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── deltalake-rb.gemspec ├── ext └── deltalake │ ├── Cargo.toml │ ├── extconf.rb │ └── src │ ├── error.rs │ ├── features.rs │ ├── lib.rs │ ├── merge.rs │ ├── schema.rs │ └── utils.rs ├── lib ├── deltalake-rb.rb ├── deltalake.rb └── deltalake │ ├── field.rb │ ├── metadata.rb │ ├── schema.rb │ ├── table.rb │ ├── table_alterer.rb │ ├── table_merger.rb │ ├── table_optimizer.rb │ ├── utils.rb │ └── version.rb └── test ├── alter_test.rb ├── merge_test.rb ├── optimize_test.rb ├── table_test.rb ├── test_helper.rb ├── types_test.rb └── write_test.rb /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: actions/cache@v4 9 | with: 10 | path: | 11 | ~/.cargo/registry 12 | ~/.cargo/git 13 | tmp 14 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 15 | - uses: ruby/setup-ruby@v1 16 | with: 17 | ruby-version: 3.4 18 | bundler-cache: true 19 | - run: bundle exec rake compile 20 | - run: bundle exec rake test 21 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | on: workflow_dispatch 3 | jobs: 4 | release: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | include: 9 | - platform: x86_64-linux 10 | target: x86_64-unknown-linux-gnu 11 | - platform: x86_64-linux-musl 12 | target: x86_64-unknown-linux-musl 13 | - platform: aarch64-linux 14 | target: aarch64-unknown-linux-gnu 15 | - platform: aarch64-linux-musl 16 | target: aarch64-unknown-linux-musl 17 | - platform: x86_64-darwin 18 | target: x86_64-apple-darwin 19 | - platform: arm64-darwin 20 | target: aarch64-apple-darwin 21 | - platform: x64-mingw-ucrt 22 | target: x86_64-pc-windows-gnu 23 | runs-on: ubuntu-latest 24 | name: ${{ matrix.platform }} 25 | steps: 26 | - uses: actions/checkout@v4 27 | - run: | 28 | cargo install --locked --git https://github.com/ankane/cargo-3pl 29 | git clone https://github.com/ankane/3pl-source.git 30 | cargo 3pl --target ${{ matrix.target }} --require-files --source 3pl-source > LICENSE-THIRD-PARTY.txt 31 | - uses: ruby/setup-ruby@v1 32 | with: 33 | ruby-version: 3.3 34 | - uses: oxidize-rb/actions/cross-gem@v1 35 | id: cross-gem 36 | with: 37 | platform: ${{ matrix.platform }} 38 | ruby-versions: "3.4,3.3,3.2" 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | name: cross-gem-${{ matrix.platform }} 42 | path: ${{ steps.cross-gem.outputs.gem-path }} 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | /Gemfile.lock 10 | /target/ 11 | /Makefile 12 | *.bundle 13 | *.so 14 | *.dll 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.2.0 (unreleased) 2 | 3 | - Updated `deltalake` to 0.26.2 4 | - Fixed `to_polars` method excluding partitioning columns 5 | - Dropped support for Ruby < 3.2 6 | 7 | ## 0.1.7 
(2025-05-03) 8 | 9 | - Updated `deltalake` to 0.26.0 10 | 11 | ## 0.1.6 (2025-03-12) 12 | 13 | - Updated `deltalake` to 0.25.0 14 | 15 | ## 0.1.5 (2025-01-28) 16 | 17 | - Updated `deltalake` to 0.24.0 18 | 19 | ## 0.1.4 (2025-01-02) 20 | 21 | - Updated `deltalake` to 0.23.0 22 | 23 | ## 0.1.3 (2024-12-28) 24 | 25 | - Updated `deltalake` to 0.22.3 26 | - Added support for Ruby 3.4 27 | - Added `rechunk` and `columns` options to `to_polars` method 28 | 29 | ## 0.1.2 (2024-12-03) 30 | 31 | - Updated `deltalake` to 0.22.2 32 | - Added `merge` method to `Table` 33 | - Added `set_table_properties` method 34 | 35 | ## 0.1.1 (2024-11-22) 36 | 37 | - Added support for constraints 38 | - Added support for small file compaction 39 | - Added support for Z Ordering 40 | - Added `history`, `partitions`, `protocol`, `repair`, and `restore` methods to `Table` 41 | - Added experimental `load_cdf` method to `Table` 42 | - Fixed handling of unsigned integers 43 | - Fixed error with timestamps 44 | 45 | ## 0.1.0 (2024-11-20) 46 | 47 | - First release 48 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["ext/deltalake"] 3 | resolver = "2" 4 | 5 | [profile.release] 6 | strip = true 7 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | 5 | gem "rake" 6 | gem "rake-compiler" 7 | gem "minitest" 8 | gem "polars-df", ">= 0.15.0" 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (2020) QP Hou and a number of other contributors. All rights reserved. 2 | 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 
37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. 
You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # delta-ruby 2 | 3 | [Delta Lake](https://delta.io/) for Ruby 4 | 5 | Supports local files and Amazon S3 6 | 7 | [![Build Status](https://github.com/ankane/delta-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/delta-ruby/actions) 8 | 9 | ## Installation 10 | 11 | Add this line to your application’s Gemfile: 12 | 13 | ```ruby 14 | gem "deltalake-rb" 15 | ``` 16 | 17 | It can take 5-10 minutes to compile the gem. 
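For the Amazon S3 support mentioned above, tables are opened with an S3 URI. A minimal sketch, assuming the `storage_options` keyword mirrors the [Python API](https://delta-io.github.io/delta-rs/) (the bucket name and credential key here are illustrative):

```ruby
dt = DeltaLake::Table.new(
  "s3://my-bucket/events",
  storage_options: {"AWS_REGION" => "us-east-1"}
)
```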
18 | 19 | ## Getting Started 20 | 21 | Write data 22 | 23 | ```ruby 24 | df = Polars::DataFrame.new({"id" => [1, 2], "value" => [3.0, 4.0]}) 25 | DeltaLake.write("./events", df) 26 | ``` 27 | 28 | Load a table 29 | 30 | ```ruby 31 | dt = DeltaLake::Table.new("./events") 32 | df = dt.to_polars 33 | ``` 34 | 35 | Get a lazy frame 36 | 37 | ```ruby 38 | lf = dt.to_polars(eager: false) 39 | ``` 40 | 41 | Append rows 42 | 43 | ```ruby 44 | DeltaLake.write("./events", df, mode: "append") 45 | ``` 46 | 47 | Overwrite a table 48 | 49 | ```ruby 50 | DeltaLake.write("./events", df, mode: "overwrite") 51 | ``` 52 | 53 | Add a constraint 54 | 55 | ```ruby 56 | dt.alter.add_constraint({"id_gt_0" => "id > 0"}) 57 | ``` 58 | 59 | Drop a constraint 60 | 61 | ```ruby 62 | dt.alter.drop_constraint("id_gt_0") 63 | ``` 64 | 65 | Delete rows 66 | 67 | ```ruby 68 | dt.delete("id > 1") 69 | ``` 70 | 71 | Vacuum 72 | 73 | ```ruby 74 | dt.vacuum(dry_run: false) 75 | ``` 76 | 77 | Perform small file compaction 78 | 79 | ```ruby 80 | dt.optimize.compact 81 | ``` 82 | 83 | Colocate similar data in the same files 84 | 85 | ```ruby 86 | dt.optimize.z_order(["category"]) 87 | ``` 88 | 89 | Load a previous version of a table 90 | 91 | ```ruby 92 | dt = DeltaLake::Table.new("./events", version: 1) 93 | # or 94 | dt.load_as_version(1) 95 | ``` 96 | 97 | Get the schema 98 | 99 | ```ruby 100 | dt.schema 101 | ``` 102 | 103 | Get metadata 104 | 105 | ```ruby 106 | dt.metadata 107 | ``` 108 | 109 | Get history 110 | 111 | ```ruby 112 | dt.history 113 | ``` 114 | 115 | ## API 116 | 117 | This library follows the [Delta Lake Python API](https://delta-io.github.io/delta-rs/) (with a few changes to make it more Ruby-like). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems. 118 | 119 | ## History 120 | 121 | View the [changelog](https://github.com/ankane/delta-ruby/blob/master/CHANGELOG.md) 122 | 123 | ## Contributing 124 | 125 | Everyone is encouraged to help improve this project. Here are a few ways you can help: 126 | 127 | - [Report bugs](https://github.com/ankane/delta-ruby/issues) 128 | - Fix bugs and [submit pull requests](https://github.com/ankane/delta-ruby/pulls) 129 | - Write, clarify, or fix documentation 130 | - Suggest or add new features 131 | 132 | To get started with development: 133 | 134 | ```sh 135 | git clone https://github.com/ankane/delta-ruby.git 136 | cd delta-ruby 137 | bundle install 138 | bundle exec rake compile 139 | bundle exec rake test 140 | ``` 141 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rake/testtask" 3 | require "rake/extensiontask" 4 | 5 | task default: :test 6 | Rake::TestTask.new do |t| 7 | t.libs << "test" 8 | t.pattern = "test/**/*_test.rb" 9 | end 10 | 11 | platforms = [ 12 | "x86_64-linux", 13 | "x86_64-linux-musl", 14 | "aarch64-linux", 15 | "aarch64-linux-musl", 16 | "x86_64-darwin", 17 | "arm64-darwin", 18 | "x64-mingw-ucrt" 19 | ] 20 | 21 | gemspec = Bundler.load_gemspec("deltalake-rb.gemspec") 22 | Rake::ExtensionTask.new("deltalake", gemspec) do |ext| 23 | ext.lib_dir = "lib/deltalake" 24 | ext.cross_compile = true 25 | ext.cross_platform = platforms 26 | ext.cross_compiling do |spec| 27 | spec.dependencies.reject! { |dep| dep.name == "rb_sys" } 28 | spec.files.reject! 
{ |file| File.fnmatch?("ext/*", file, File::FNM_EXTGLOB) } 29 | end 30 | end 31 | 32 | task :remove_ext do 33 | path = "lib/deltalake/deltalake.bundle" 34 | File.unlink(path) if File.exist?(path) 35 | end 36 | 37 | Rake::Task["build"].enhance [:remove_ext] 38 | -------------------------------------------------------------------------------- /deltalake-rb.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/deltalake/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "deltalake-rb" 5 | spec.version = DeltaLake::VERSION 6 | spec.summary = "Delta Lake for Ruby" 7 | spec.homepage = "https://github.com/ankane/delta-ruby" 8 | spec.license = "Apache-2.0" 9 | 10 | spec.author = "Andrew Kane" 11 | spec.email = "andrew@ankane.org" 12 | 13 | spec.files = Dir["*.{md,txt}", "{ext,lib}/**/*", "Cargo.*"] 14 | spec.require_path = "lib" 15 | spec.extensions = ["ext/deltalake/extconf.rb"] 16 | 17 | spec.required_ruby_version = ">= 3.2" 18 | 19 | spec.add_dependency "rb_sys" 20 | end 21 | -------------------------------------------------------------------------------- /ext/deltalake/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "deltalake" 3 | version = "0.1.7" 4 | license = "Apache-2.0" 5 | authors = ["Andrew Kane "] 6 | edition = "2021" 7 | rust-version = "1.82.0" 8 | publish = false 9 | 10 | [lib] 11 | crate-type = ["cdylib"] 12 | 13 | [dependencies] 14 | arrow = { version = "55", features = ["ffi"] } 15 | arrow-schema = { version = "55", features = ["serde"] } 16 | chrono = "0.4" 17 | delta_kernel = "=0.10.0" 18 | deltalake = { version = "=0.26.2", features = ["azure", "datafusion", "gcs", "s3"] } 19 | futures = "0.3" 20 | magnus = "0.7" 21 | num_cpus = "1" 22 | serde = "1" 23 | serde_json = "1" 24 | tokio = { version = "1", features = ["rt-multi-thread"] } 25 | -------------------------------------------------------------------------------- /ext/deltalake/extconf.rb: -------------------------------------------------------------------------------- 1 | require "mkmf" 2 | require "rb_sys/mkmf" 3 | 4 | create_rust_makefile("deltalake/deltalake") 5 | -------------------------------------------------------------------------------- /ext/deltalake/src/error.rs: -------------------------------------------------------------------------------- 1 | use arrow_schema::ArrowError; 2 | use deltalake::datafusion::error::DataFusionError; 3 | use deltalake::protocol::ProtocolError; 4 | use deltalake::{errors::DeltaTableError, ObjectStoreError}; 5 | use magnus::{exception, Error as RbErr, Module, RModule, Ruby}; 6 | use std::borrow::Cow; 7 | 8 | macro_rules! 
create_exception { 9 | ($type:ident, $name:expr) => { 10 | pub struct $type {} 11 | 12 | impl $type { 13 | pub fn new_err(message: T) -> RbErr 14 | where 15 | T: Into>, 16 | { 17 | let class = Ruby::get() 18 | .unwrap() 19 | .class_object() 20 | .const_get::<_, RModule>("DeltaLake") 21 | .unwrap() 22 | .const_get($name) 23 | .unwrap(); 24 | RbErr::new(class, message) 25 | } 26 | } 27 | }; 28 | } 29 | 30 | create_exception!(DeltaError, "Error"); 31 | create_exception!(TableNotFoundError, "TableNotFoundError"); 32 | create_exception!(DeltaProtocolError, "DeltaProtocolError"); 33 | create_exception!(CommitFailedError, "CommitFailedError"); 34 | create_exception!(SchemaMismatchError, "SchemaMismatchError"); 35 | 36 | fn inner_to_rb_err(err: DeltaTableError) -> RbErr { 37 | match err { 38 | DeltaTableError::NotATable(msg) => TableNotFoundError::new_err(msg), 39 | DeltaTableError::InvalidTableLocation(msg) => TableNotFoundError::new_err(msg), 40 | 41 | // protocol errors 42 | DeltaTableError::InvalidJsonLog { .. } => DeltaProtocolError::new_err(err.to_string()), 43 | DeltaTableError::InvalidStatsJson { .. } => DeltaProtocolError::new_err(err.to_string()), 44 | DeltaTableError::InvalidData { violations } => { 45 | DeltaProtocolError::new_err(format!("Invariant violations: {:?}", violations)) 46 | } 47 | 48 | // commit errors 49 | DeltaTableError::Transaction { source } => CommitFailedError::new_err(source.to_string()), 50 | 51 | // ruby exceptions 52 | DeltaTableError::ObjectStore { source } => object_store_to_rb(source), 53 | DeltaTableError::Io { source } => RbIOError::new_err(source.to_string()), 54 | 55 | DeltaTableError::Arrow { source } => arrow_to_rb(source), 56 | 57 | _ => DeltaError::new_err(err.to_string()), 58 | } 59 | } 60 | 61 | fn object_store_to_rb(err: ObjectStoreError) -> RbErr { 62 | match err { 63 | ObjectStoreError::NotFound { .. } => RbIOError::new_err(err.to_string()), 64 | ObjectStoreError::Generic { source, .. 
} 65 | if source.to_string().contains("AWS_S3_ALLOW_UNSAFE_RENAME") => 66 | { 67 | DeltaProtocolError::new_err(source.to_string()) 68 | } 69 | _ => RbIOError::new_err(err.to_string()), 70 | } 71 | } 72 | 73 | fn arrow_to_rb(err: ArrowError) -> RbErr { 74 | match err { 75 | ArrowError::IoError(msg, _) => RbIOError::new_err(msg), 76 | ArrowError::DivideByZero => RbValueError::new_err("division by zero"), 77 | ArrowError::InvalidArgumentError(msg) => RbValueError::new_err(msg), 78 | ArrowError::NotYetImplemented(msg) => RbNotImplementedError::new_err(msg), 79 | ArrowError::SchemaError(msg) => SchemaMismatchError::new_err(msg), 80 | other => RbException::new_err(other.to_string()), 81 | } 82 | } 83 | 84 | fn checkpoint_to_rb(err: ProtocolError) -> RbErr { 85 | match err { 86 | ProtocolError::Arrow { source } => arrow_to_rb(source), 87 | ProtocolError::ObjectStore { source } => object_store_to_rb(source), 88 | ProtocolError::EndOfLog => DeltaProtocolError::new_err("End of log"), 89 | ProtocolError::NoMetaData => DeltaProtocolError::new_err("Table metadata missing"), 90 | ProtocolError::CheckpointNotFound => DeltaProtocolError::new_err(err.to_string()), 91 | ProtocolError::InvalidField(err) => RbValueError::new_err(err), 92 | ProtocolError::InvalidRow(err) => RbValueError::new_err(err), 93 | ProtocolError::InvalidDeletionVectorStorageType(err) => RbValueError::new_err(err), 94 | ProtocolError::SerializeOperation { source } => RbValueError::new_err(source.to_string()), 95 | ProtocolError::ParquetParseError { source } => RbIOError::new_err(source.to_string()), 96 | ProtocolError::IO { source } => RbIOError::new_err(source.to_string()), 97 | ProtocolError::Generic(msg) => DeltaError::new_err(msg), 98 | ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()), 99 | } 100 | } 101 | 102 | fn datafusion_to_rb(err: DataFusionError) -> RbErr { 103 | DeltaError::new_err(err.to_string()) 104 | } 105 | 106 | pub enum RubyError { 107 | DeltaTable(DeltaTableError), 108 | Protocol(ProtocolError), 109 | DataFusion(DataFusionError), 110 | } 111 | 112 | impl From for RubyError { 113 | fn from(err: DeltaTableError) -> Self { 114 | RubyError::DeltaTable(err) 115 | } 116 | } 117 | 118 | impl From for RubyError { 119 | fn from(err: ProtocolError) -> Self { 120 | RubyError::Protocol(err) 121 | } 122 | } 123 | 124 | impl From for RubyError { 125 | fn from(err: DataFusionError) -> Self { 126 | RubyError::DataFusion(err) 127 | } 128 | } 129 | 130 | impl From for RbErr { 131 | fn from(value: RubyError) -> Self { 132 | match value { 133 | RubyError::DeltaTable(err) => inner_to_rb_err(err), 134 | RubyError::Protocol(err) => checkpoint_to_rb(err), 135 | RubyError::DataFusion(err) => datafusion_to_rb(err), 136 | } 137 | } 138 | } 139 | 140 | macro_rules! 
create_builtin_exception { 141 | ($type:ident, $class:expr) => { 142 | pub struct $type {} 143 | 144 | impl $type { 145 | pub fn new_err(message: T) -> RbErr 146 | where 147 | T: Into>, 148 | { 149 | RbErr::new($class, message) 150 | } 151 | } 152 | }; 153 | } 154 | 155 | create_builtin_exception!(RbException, exception::runtime_error()); 156 | create_builtin_exception!(RbIOError, exception::io_error()); 157 | create_builtin_exception!(RbNotImplementedError, exception::not_imp_error()); 158 | create_builtin_exception!(RbValueError, exception::arg_error()); 159 | -------------------------------------------------------------------------------- /ext/deltalake/src/features.rs: -------------------------------------------------------------------------------- 1 | use crate::{RbResult, RbValueError}; 2 | use deltalake::kernel::TableFeatures as KernelTableFeatures; 3 | use magnus::{prelude::*, TryConvert, Value}; 4 | 5 | /// High level table features 6 | #[derive(Clone)] 7 | pub enum TableFeatures { 8 | /// Mapping of one column to another 9 | ColumnMapping, 10 | /// Deletion vectors for merge, update, delete 11 | DeletionVectors, 12 | /// timestamps without timezone support 13 | TimestampWithoutTimezone, 14 | /// version 2 of checkpointing 15 | V2Checkpoint, 16 | /// Append Only Tables 17 | AppendOnly, 18 | /// Table invariants 19 | Invariants, 20 | /// Check constraints on columns 21 | CheckConstraints, 22 | /// CDF on a table 23 | ChangeDataFeed, 24 | /// Columns with generated values 25 | GeneratedColumns, 26 | /// ID Columns 27 | IdentityColumns, 28 | /// Row tracking on tables 29 | RowTracking, 30 | /// domain specific metadata 31 | DomainMetadata, 32 | /// Iceberg compatibility support 33 | IcebergCompatV1, 34 | } 35 | 36 | impl From for KernelTableFeatures { 37 | fn from(value: TableFeatures) -> Self { 38 | match value { 39 | TableFeatures::ColumnMapping => KernelTableFeatures::ColumnMapping, 40 | TableFeatures::DeletionVectors => KernelTableFeatures::DeletionVectors, 41 | TableFeatures::TimestampWithoutTimezone => { 42 | KernelTableFeatures::TimestampWithoutTimezone 43 | } 44 | TableFeatures::V2Checkpoint => KernelTableFeatures::V2Checkpoint, 45 | TableFeatures::AppendOnly => KernelTableFeatures::AppendOnly, 46 | TableFeatures::Invariants => KernelTableFeatures::Invariants, 47 | TableFeatures::CheckConstraints => KernelTableFeatures::CheckConstraints, 48 | TableFeatures::ChangeDataFeed => KernelTableFeatures::ChangeDataFeed, 49 | TableFeatures::GeneratedColumns => KernelTableFeatures::GeneratedColumns, 50 | TableFeatures::IdentityColumns => KernelTableFeatures::IdentityColumns, 51 | TableFeatures::RowTracking => KernelTableFeatures::RowTracking, 52 | TableFeatures::DomainMetadata => KernelTableFeatures::DomainMetadata, 53 | TableFeatures::IcebergCompatV1 => KernelTableFeatures::IcebergCompatV1, 54 | } 55 | } 56 | } 57 | 58 | impl TryConvert for TableFeatures { 59 | fn try_convert(val: Value) -> RbResult { 60 | // TODO add more features 61 | let feature = match unsafe { val.to_r_string()?.as_str()? 
} { 62 | "append_only" => TableFeatures::AppendOnly, 63 | _ => return Err(RbValueError::new_err("Invalid feature")), 64 | }; 65 | Ok(feature) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /ext/deltalake/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod error; 2 | mod features; 3 | mod merge; 4 | mod schema; 5 | mod utils; 6 | 7 | use std::cell::RefCell; 8 | use std::collections::{HashMap, HashSet}; 9 | use std::future::IntoFuture; 10 | use std::str::FromStr; 11 | use std::sync::Arc; 12 | use std::time; 13 | 14 | use chrono::{DateTime, Duration, FixedOffset, Utc}; 15 | use delta_kernel::schema::StructField; 16 | use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; 17 | use deltalake::arrow::record_batch::RecordBatchIterator; 18 | use deltalake::checkpoints::{cleanup_metadata, create_checkpoint}; 19 | use deltalake::datafusion::catalog::TableProvider; 20 | use deltalake::datafusion::prelude::SessionContext; 21 | use deltalake::delta_datafusion::DeltaCdfTableProvider; 22 | use deltalake::errors::DeltaTableError; 23 | use deltalake::kernel::transaction::{CommitProperties, TableReference}; 24 | use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction}; 25 | use deltalake::logstore::IORuntime; 26 | use deltalake::operations::add_column::AddColumnBuilder; 27 | use deltalake::operations::add_feature::AddTableFeatureBuilder; 28 | use deltalake::operations::collect_sendable_stream; 29 | use deltalake::operations::constraints::ConstraintBuilder; 30 | use deltalake::operations::delete::DeleteBuilder; 31 | use deltalake::operations::drop_constraints::DropConstraintBuilder; 32 | use deltalake::operations::filesystem_check::FileSystemCheckBuilder; 33 | use deltalake::operations::load_cdf::CdfLoadBuilder; 34 | use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType}; 35 | use deltalake::operations::restore::RestoreBuilder; 36 | use deltalake::operations::set_tbl_properties::SetTablePropertiesBuilder; 37 | use deltalake::operations::vacuum::VacuumBuilder; 38 | use deltalake::parquet::basic::Compression; 39 | use deltalake::parquet::errors::ParquetError; 40 | use deltalake::parquet::file::properties::WriterProperties; 41 | use deltalake::partitions::PartitionFilter; 42 | use deltalake::{DeltaOps, DeltaResult}; 43 | use error::DeltaError; 44 | use futures::future::join_all; 45 | 46 | use magnus::{ 47 | function, method, prelude::*, typed_data::Obj, Error, Integer, Module, RArray, RHash, Ruby, 48 | TryConvert, Value, 49 | }; 50 | use serde_json::Map; 51 | 52 | use crate::error::DeltaProtocolError; 53 | use crate::error::RbValueError; 54 | use crate::error::RubyError; 55 | use crate::features::TableFeatures; 56 | use crate::merge::RbMergeBuilder; 57 | use crate::schema::{schema_to_rbobject, Field}; 58 | use crate::utils::rt; 59 | 60 | type RbResult = Result; 61 | 62 | enum PartitionFilterValue { 63 | Single(String), 64 | Multiple(Vec), 65 | } 66 | 67 | impl TryConvert for PartitionFilterValue { 68 | fn try_convert(val: Value) -> RbResult { 69 | if let Ok(v) = Vec::::try_convert(val) { 70 | Ok(PartitionFilterValue::Multiple(v)) 71 | } else { 72 | Ok(PartitionFilterValue::Single(String::try_convert(val)?)) 73 | } 74 | } 75 | } 76 | 77 | #[magnus::wrap(class = "DeltaLake::RawDeltaTable")] 78 | struct RawDeltaTable { 79 | _table: RefCell, 80 | } 81 | 82 | #[magnus::wrap(class = "DeltaLake::RawDeltaTableMetaData")] 83 | struct RawDeltaTableMetaData { 84 | id: 
String, 85 | name: Option, 86 | description: Option, 87 | partition_columns: Vec, 88 | created_time: Option, 89 | configuration: HashMap>, 90 | } 91 | 92 | impl RawDeltaTableMetaData { 93 | fn id(&self) -> String { 94 | self.id.clone() 95 | } 96 | 97 | fn name(&self) -> Option { 98 | self.name.clone() 99 | } 100 | 101 | fn description(&self) -> Option { 102 | self.description.clone() 103 | } 104 | 105 | fn partition_columns(&self) -> Vec { 106 | self.partition_columns.clone() 107 | } 108 | 109 | fn created_time(&self) -> Option { 110 | self.created_time 111 | } 112 | 113 | fn configuration(&self) -> HashMap> { 114 | self.configuration.clone() 115 | } 116 | } 117 | 118 | type StringVec = Vec; 119 | 120 | impl RawDeltaTable { 121 | pub fn new( 122 | table_uri: String, 123 | version: Option, 124 | storage_options: Option>, 125 | without_files: bool, 126 | log_buffer_size: Option, 127 | ) -> RbResult { 128 | let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri) 129 | .with_io_runtime(IORuntime::default()); 130 | 131 | if let Some(storage_options) = storage_options { 132 | builder = builder.with_storage_options(storage_options) 133 | } 134 | if let Some(version) = version { 135 | builder = builder.with_version(version) 136 | } 137 | if without_files { 138 | builder = builder.without_files() 139 | } 140 | if let Some(buf_size) = log_buffer_size { 141 | builder = builder 142 | .with_log_buffer_size(buf_size) 143 | .map_err(RubyError::from)?; 144 | } 145 | 146 | let table = rt().block_on(builder.load()).map_err(RubyError::from)?; 147 | Ok(RawDeltaTable { 148 | _table: RefCell::new(table), 149 | }) 150 | } 151 | 152 | pub fn is_deltatable( 153 | table_uri: String, 154 | storage_options: Option>, 155 | ) -> RbResult { 156 | let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri); 157 | if let Some(storage_options) = storage_options { 158 | builder = builder.with_storage_options(storage_options) 159 | } 160 | Ok(rt() 161 | .block_on(async { 162 | match builder.build() { 163 | Ok(table) => table.verify_deltatable_existence().await, 164 | Err(err) => Err(err), 165 | } 166 | }) 167 | .map_err(RubyError::from)?) 
168 | } 169 | 170 | pub fn table_uri(&self) -> RbResult { 171 | Ok(self._table.borrow().table_uri()) 172 | } 173 | 174 | pub fn version(&self) -> RbResult { 175 | Ok(self._table.borrow().version()) 176 | } 177 | 178 | pub fn has_files(&self) -> RbResult { 179 | Ok(self._table.borrow().config.require_files) 180 | } 181 | 182 | pub fn metadata(&self) -> RbResult { 183 | let binding = self._table.borrow(); 184 | let metadata = binding.metadata().map_err(RubyError::from)?; 185 | Ok(RawDeltaTableMetaData { 186 | id: metadata.id.clone(), 187 | name: metadata.name.clone(), 188 | description: metadata.description.clone(), 189 | partition_columns: metadata.partition_columns.clone(), 190 | created_time: metadata.created_time, 191 | configuration: metadata.configuration.clone(), 192 | }) 193 | } 194 | 195 | pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option, Option)> { 196 | let binding = self._table.borrow(); 197 | let table_protocol = binding.protocol().map_err(RubyError::from)?; 198 | Ok(( 199 | table_protocol.min_reader_version, 200 | table_protocol.min_writer_version, 201 | table_protocol 202 | .writer_features 203 | .as_ref() 204 | .and_then(|features| { 205 | let empty_set = !features.is_empty(); 206 | empty_set.then(|| { 207 | features 208 | .iter() 209 | .map(|v| v.to_string()) 210 | .collect::>() 211 | }) 212 | }), 213 | table_protocol 214 | .reader_features 215 | .as_ref() 216 | .and_then(|features| { 217 | let empty_set = !features.is_empty(); 218 | empty_set.then(|| { 219 | features 220 | .iter() 221 | .map(|v| v.to_string()) 222 | .collect::>() 223 | }) 224 | }), 225 | )) 226 | } 227 | 228 | pub fn load_version(&self, version: i64) -> RbResult<()> { 229 | Ok(rt() 230 | .block_on(self._table.borrow_mut().load_version(version)) 231 | .map_err(RubyError::from)?) 232 | } 233 | 234 | pub fn get_latest_version(&self) -> RbResult { 235 | Ok(rt() 236 | .block_on(self._table.borrow().get_latest_version()) 237 | .map_err(RubyError::from)?) 238 | } 239 | 240 | pub fn get_earliest_version(&self) -> RbResult { 241 | Ok(rt() 242 | .block_on(self._table.borrow().get_earliest_version()) 243 | .map_err(RubyError::from)?) 244 | } 245 | 246 | pub fn get_num_index_cols(&self) -> RbResult { 247 | Ok(self 248 | ._table 249 | .borrow() 250 | .snapshot() 251 | .map_err(RubyError::from)? 252 | .config() 253 | .num_indexed_cols()) 254 | } 255 | 256 | pub fn get_stats_columns(&self) -> RbResult>> { 257 | Ok(self 258 | ._table 259 | .borrow() 260 | .snapshot() 261 | .map_err(RubyError::from)? 262 | .config() 263 | .stats_columns() 264 | .map(|v| v.iter().map(|v| v.to_string()).collect::>())) 265 | } 266 | 267 | pub fn load_with_datetime(&self, ds: String) -> RbResult<()> { 268 | let datetime = 269 | DateTime::::from(DateTime::::parse_from_rfc3339(&ds).map_err( 270 | |err| RbValueError::new_err(format!("Failed to parse datetime string: {err}")), 271 | )?); 272 | Ok(rt() 273 | .block_on(self._table.borrow_mut().load_with_datetime(datetime)) 274 | .map_err(RubyError::from)?) 275 | } 276 | 277 | pub fn files( 278 | &self, 279 | partition_filters: Option>, 280 | ) -> RbResult> { 281 | if !self.has_files()? { 282 | return Err(DeltaError::new_err("Table is instantiated without files.")); 283 | } 284 | 285 | if let Some(filters) = partition_filters { 286 | let filters = convert_partition_filters(filters).map_err(RubyError::from)?; 287 | Ok(self 288 | ._table 289 | .borrow() 290 | .get_files_by_partitions(&filters) 291 | .map_err(RubyError::from)? 
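                // convert object store paths to plain strings for the Ruby side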
292 | .into_iter() 293 | .map(|p| p.to_string()) 294 | .collect()) 295 | } else { 296 | Ok(self 297 | ._table 298 | .borrow() 299 | .get_files_iter() 300 | .map_err(RubyError::from)? 301 | .map(|f| f.to_string()) 302 | .collect()) 303 | } 304 | } 305 | 306 | pub fn file_uris( 307 | &self, 308 | partition_filters: Option>, 309 | ) -> RbResult> { 310 | if !self._table.borrow().config.require_files { 311 | return Err(DeltaError::new_err("Table is initiated without files.")); 312 | } 313 | 314 | if let Some(filters) = partition_filters { 315 | let filters = convert_partition_filters(filters).map_err(RubyError::from)?; 316 | Ok(self 317 | ._table 318 | .borrow() 319 | .get_file_uris_by_partitions(&filters) 320 | .map_err(RubyError::from)?) 321 | } else { 322 | Ok(self 323 | ._table 324 | .borrow() 325 | .get_file_uris() 326 | .map_err(RubyError::from)? 327 | .collect()) 328 | } 329 | } 330 | 331 | pub fn schema(&self) -> RbResult { 332 | let binding = self._table.borrow(); 333 | let schema: &StructType = binding.get_schema().map_err(RubyError::from)?; 334 | schema_to_rbobject(schema.to_owned()) 335 | } 336 | 337 | pub fn vacuum( 338 | &self, 339 | dry_run: bool, 340 | retention_hours: Option, 341 | enforce_retention_duration: bool, 342 | commit_properties: Option, 343 | post_commithook_properties: Option, 344 | ) -> RbResult> { 345 | let mut cmd = VacuumBuilder::new( 346 | self._table.borrow().log_store(), 347 | self._table 348 | .borrow() 349 | .snapshot() 350 | .map_err(RubyError::from)? 351 | .clone(), 352 | ) 353 | .with_enforce_retention_duration(enforce_retention_duration) 354 | .with_dry_run(dry_run); 355 | if let Some(retention_period) = retention_hours { 356 | cmd = cmd.with_retention_period(Duration::hours(retention_period as i64)); 357 | } 358 | 359 | if let Some(commit_properties) = 360 | maybe_create_commit_properties(commit_properties, post_commithook_properties) 361 | { 362 | cmd = cmd.with_commit_properties(commit_properties); 363 | } 364 | let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 365 | self._table.borrow_mut().state = table.state; 366 | Ok(metrics.files_deleted) 367 | } 368 | 369 | #[allow(clippy::too_many_arguments)] 370 | pub fn compact_optimize( 371 | &self, 372 | partition_filters: Option>, 373 | target_size: Option, 374 | max_concurrent_tasks: Option, 375 | min_commit_interval: Option, 376 | writer_properties: Option, 377 | commit_properties: Option, 378 | post_commithook_properties: Option, 379 | ) -> RbResult { 380 | let mut cmd = OptimizeBuilder::new( 381 | self._table.borrow().log_store(), 382 | self._table 383 | .borrow() 384 | .snapshot() 385 | .map_err(RubyError::from)? 
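                // the builder operates on a clone of the current snapshot; the refreshed
                // table state is written back after the commit below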
386 | .clone(), 387 | ) 388 | .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get)); 389 | if let Some(size) = target_size { 390 | cmd = cmd.with_target_size(size); 391 | } 392 | if let Some(commit_interval) = min_commit_interval { 393 | cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval)); 394 | } 395 | 396 | if let Some(writer_props) = writer_properties { 397 | cmd = cmd.with_writer_properties( 398 | set_writer_properties(writer_props).map_err(RubyError::from)?, 399 | ); 400 | } 401 | 402 | if let Some(commit_properties) = 403 | maybe_create_commit_properties(commit_properties, post_commithook_properties) 404 | { 405 | cmd = cmd.with_commit_properties(commit_properties); 406 | } 407 | 408 | let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default()) 409 | .map_err(RubyError::from)?; 410 | cmd = cmd.with_filters(&converted_filters); 411 | 412 | let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 413 | self._table.borrow_mut().state = table.state; 414 | Ok(serde_json::to_string(&metrics).unwrap()) 415 | } 416 | 417 | #[allow(clippy::too_many_arguments)] 418 | pub fn z_order_optimize( 419 | &self, 420 | z_order_columns: Vec, 421 | partition_filters: Option>, 422 | target_size: Option, 423 | max_concurrent_tasks: Option, 424 | max_spill_size: usize, 425 | min_commit_interval: Option, 426 | writer_properties: Option, 427 | commit_properties: Option, 428 | post_commithook_properties: Option, 429 | ) -> RbResult { 430 | let mut cmd = OptimizeBuilder::new( 431 | self._table.borrow().log_store(), 432 | self._table 433 | .borrow() 434 | .snapshot() 435 | .map_err(RubyError::from)? 436 | .clone(), 437 | ) 438 | .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get)) 439 | .with_max_spill_size(max_spill_size) 440 | .with_type(OptimizeType::ZOrder(z_order_columns)); 441 | if let Some(size) = target_size { 442 | cmd = cmd.with_target_size(size); 443 | } 444 | if let Some(commit_interval) = min_commit_interval { 445 | cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval)); 446 | } 447 | 448 | if let Some(writer_props) = writer_properties { 449 | cmd = cmd.with_writer_properties( 450 | set_writer_properties(writer_props).map_err(RubyError::from)?, 451 | ); 452 | } 453 | 454 | if let Some(commit_properties) = 455 | maybe_create_commit_properties(commit_properties, post_commithook_properties) 456 | { 457 | cmd = cmd.with_commit_properties(commit_properties); 458 | } 459 | 460 | let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default()) 461 | .map_err(RubyError::from)?; 462 | cmd = cmd.with_filters(&converted_filters); 463 | 464 | let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 465 | self._table.borrow_mut().state = table.state; 466 | Ok(serde_json::to_string(&metrics).unwrap()) 467 | } 468 | 469 | pub fn add_columns(&self, fields: RArray) -> RbResult<()> { 470 | let fields = fields.typecheck::>()?; 471 | let mut cmd = AddColumnBuilder::new( 472 | self._table.borrow().log_store(), 473 | self._table 474 | .borrow() 475 | .snapshot() 476 | .map_err(RubyError::from)? 
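                // fields arrive as Ruby Field wrappers (see schema.rs); their inner
                // StructFields are handed to the builder below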
477 | .clone(), 478 | ); 479 | 480 | let new_fields = fields 481 | .iter() 482 | .map(|v| v.inner.clone()) 483 | .collect::>(); 484 | 485 | cmd = cmd.with_fields(new_fields); 486 | 487 | let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 488 | self._table.borrow_mut().state = table.state; 489 | Ok(()) 490 | } 491 | 492 | pub fn add_feature( 493 | &self, 494 | feature: RArray, 495 | allow_protocol_versions_increase: bool, 496 | ) -> RbResult<()> { 497 | let feature = feature 498 | .into_iter() 499 | .map(TableFeatures::try_convert) 500 | .collect::>>()?; 501 | let cmd = AddTableFeatureBuilder::new( 502 | self._table.borrow().log_store(), 503 | self._table 504 | .borrow() 505 | .snapshot() 506 | .map_err(RubyError::from)? 507 | .clone(), 508 | ) 509 | .with_features(feature) 510 | .with_allow_protocol_versions_increase(allow_protocol_versions_increase); 511 | 512 | let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 513 | self._table.borrow_mut().state = table.state; 514 | Ok(()) 515 | } 516 | 517 | pub fn add_constraints(&self, constraints: HashMap) -> RbResult<()> { 518 | let mut cmd = ConstraintBuilder::new( 519 | self._table.borrow().log_store(), 520 | self._table 521 | .borrow() 522 | .snapshot() 523 | .map_err(RubyError::from)? 524 | .clone(), 525 | ); 526 | 527 | for (col_name, expression) in constraints { 528 | cmd = cmd.with_constraint(col_name.clone(), expression.clone()); 529 | } 530 | 531 | let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 532 | self._table.borrow_mut().state = table.state; 533 | Ok(()) 534 | } 535 | 536 | pub fn drop_constraints(&self, name: String, raise_if_not_exists: bool) -> RbResult<()> { 537 | let cmd = DropConstraintBuilder::new( 538 | self._table.borrow().log_store(), 539 | self._table 540 | .borrow() 541 | .snapshot() 542 | .map_err(RubyError::from)? 543 | .clone(), 544 | ) 545 | .with_constraint(name) 546 | .with_raise_if_not_exists(raise_if_not_exists); 547 | 548 | let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 549 | self._table.borrow_mut().state = table.state; 550 | Ok(()) 551 | } 552 | 553 | pub fn load_cdf( 554 | &self, 555 | starting_version: i64, 556 | ending_version: Option, 557 | starting_timestamp: Option, 558 | ending_timestamp: Option, 559 | columns: Option>, 560 | ) -> RbResult { 561 | let ctx = SessionContext::new(); 562 | let mut cdf_read = CdfLoadBuilder::new( 563 | self._table.borrow().log_store(), 564 | self._table 565 | .borrow() 566 | .snapshot() 567 | .map_err(RubyError::from)? 568 | .clone(), 569 | ) 570 | .with_starting_version(starting_version); 571 | 572 | if let Some(ev) = ending_version { 573 | cdf_read = cdf_read.with_ending_version(ev); 574 | } 575 | if let Some(st) = starting_timestamp { 576 | let starting_ts: DateTime = DateTime::::from_str(&st) 577 | .map_err(|pe| RbValueError::new_err(pe.to_string()))? 578 | .to_utc(); 579 | cdf_read = cdf_read.with_starting_timestamp(starting_ts); 580 | } 581 | if let Some(et) = ending_timestamp { 582 | let ending_ts = DateTime::::from_str(&et) 583 | .map_err(|pe| RbValueError::new_err(pe.to_string()))? 
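                // NOTE: the ending timestamp parsed here is passed to with_starting_timestamp
                // below; with_ending_timestamp is presumably the intended call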
584 | .to_utc(); 585 | cdf_read = cdf_read.with_starting_timestamp(ending_ts); 586 | } 587 | 588 | let table_provider: Arc = 589 | Arc::new(DeltaCdfTableProvider::try_new(cdf_read).map_err(RubyError::from)?); 590 | 591 | let plan = rt() 592 | .block_on(async { 593 | let mut df = ctx.read_table(table_provider)?; 594 | if let Some(columns) = columns { 595 | let cols: Vec<_> = columns.iter().map(|c| c.as_ref()).collect(); 596 | df = df.select_columns(&cols)?; 597 | } 598 | df.create_physical_plan().await 599 | }) 600 | .map_err(RubyError::from)?; 601 | 602 | let mut tasks = vec![]; 603 | for p in 0..plan.properties().output_partitioning().partition_count() { 604 | let inner_plan = plan.clone(); 605 | let partition_batch = inner_plan.execute(p, ctx.task_ctx()).unwrap(); 606 | let handle = rt().spawn(collect_sendable_stream(partition_batch)); 607 | tasks.push(handle); 608 | } 609 | 610 | // This is unfortunate. 611 | let batches = rt() 612 | .block_on(join_all(tasks)) 613 | .into_iter() 614 | .flatten() 615 | .collect::>, _>>() 616 | .unwrap() 617 | .into_iter() 618 | .flatten() 619 | .map(Ok); 620 | let batch_iter = RecordBatchIterator::new(batches, plan.schema()); 621 | let ffi_stream = FFI_ArrowArrayStream::new(Box::new(batch_iter)); 622 | Ok(ArrowArrayStream { stream: ffi_stream }) 623 | } 624 | 625 | #[allow(clippy::too_many_arguments)] 626 | pub fn create_merge_builder( 627 | &self, 628 | source: RbArrowType, 629 | predicate: String, 630 | source_alias: Option, 631 | target_alias: Option, 632 | safe_cast: bool, 633 | writer_properties: Option, 634 | post_commithook_properties: Option, 635 | commit_properties: Option, 636 | ) -> RbResult { 637 | Ok(RbMergeBuilder::new( 638 | self._table.borrow().log_store(), 639 | self._table 640 | .borrow() 641 | .snapshot() 642 | .map_err(RubyError::from)? 643 | .clone(), 644 | source.0, 645 | predicate, 646 | source_alias, 647 | target_alias, 648 | safe_cast, 649 | writer_properties, 650 | post_commithook_properties, 651 | commit_properties, 652 | ) 653 | .map_err(RubyError::from)?) 654 | } 655 | 656 | pub fn merge_execute(&self, merge_builder: &RbMergeBuilder) -> RbResult { 657 | let (table, metrics) = merge_builder.execute().map_err(RubyError::from)?; 658 | self._table.borrow_mut().state = table.state; 659 | Ok(metrics) 660 | } 661 | 662 | pub fn restore( 663 | &self, 664 | target: Option, 665 | ignore_missing_files: bool, 666 | protocol_downgrade_allowed: bool, 667 | commit_properties: Option, 668 | ) -> RbResult { 669 | let mut cmd = RestoreBuilder::new( 670 | self._table.borrow().log_store(), 671 | self._table 672 | .borrow() 673 | .snapshot() 674 | .map_err(RubyError::from)? 675 | .clone(), 676 | ); 677 | if let Some(val) = target { 678 | if let Some(version) = Integer::from_value(val) { 679 | cmd = cmd.with_version_to_restore(version.to_i64()?) 
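            // an Integer target restores to a version number; a String target is parsed
            // as an RFC 3339 timestamp in the branch below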
680 | } 681 | if let Ok(ds) = String::try_convert(val) { 682 | let datetime = DateTime::::from( 683 | DateTime::::parse_from_rfc3339(ds.as_ref()).map_err(|err| { 684 | RbValueError::new_err(format!("Failed to parse datetime string: {err}")) 685 | })?, 686 | ); 687 | cmd = cmd.with_datetime_to_restore(datetime) 688 | } 689 | } 690 | cmd = cmd.with_ignore_missing_files(ignore_missing_files); 691 | cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed); 692 | 693 | if let Some(commit_properties) = maybe_create_commit_properties(commit_properties, None) { 694 | cmd = cmd.with_commit_properties(commit_properties); 695 | } 696 | 697 | let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 698 | self._table.borrow_mut().state = table.state; 699 | Ok(serde_json::to_string(&metrics).unwrap()) 700 | } 701 | 702 | pub fn history(&self, limit: Option) -> RbResult> { 703 | let history = rt() 704 | .block_on(self._table.borrow().history(limit)) 705 | .map_err(RubyError::from)?; 706 | Ok(history 707 | .iter() 708 | .map(|c| serde_json::to_string(c).unwrap()) 709 | .collect()) 710 | } 711 | 712 | pub fn update_incremental(&self) -> RbResult<()> { 713 | #[allow(deprecated)] 714 | Ok(rt() 715 | .block_on(self._table.borrow_mut().update_incremental(None)) 716 | .map_err(RubyError::from)?) 717 | } 718 | 719 | fn get_active_partitions(&self) -> RbResult { 720 | let binding = self._table.borrow(); 721 | let _column_names: HashSet<&str> = binding 722 | .get_schema() 723 | .map_err(|_| DeltaProtocolError::new_err("table does not yet have a schema"))? 724 | .fields() 725 | .map(|field| field.name().as_str()) 726 | .collect(); 727 | let partition_columns: HashSet<&str> = binding 728 | .metadata() 729 | .map_err(RubyError::from)? 730 | .partition_columns 731 | .iter() 732 | .map(|col| col.as_str()) 733 | .collect(); 734 | 735 | let converted_filters = Vec::new(); 736 | 737 | let partition_columns: Vec<&str> = partition_columns.into_iter().collect(); 738 | 739 | let adds = binding 740 | .snapshot() 741 | .map_err(RubyError::from)? 742 | .get_active_add_actions_by_partitions(&converted_filters) 743 | .map_err(RubyError::from)? 744 | .collect::, _>>() 745 | .map_err(RubyError::from)?; 746 | let active_partitions: HashSet)>> = adds 747 | .iter() 748 | .flat_map(|add| { 749 | Ok::<_, RubyError>( 750 | partition_columns 751 | .iter() 752 | .flat_map(|col| { 753 | Ok::<_, RubyError>(( 754 | *col, 755 | add.partition_values() 756 | .map_err(RubyError::from)? 757 | .get(*col) 758 | .map(|v| v.serialize()), 759 | )) 760 | }) 761 | .collect(), 762 | ) 763 | }) 764 | .collect(); 765 | 766 | Ok(RArray::from_iter(active_partitions)) 767 | } 768 | 769 | pub fn create_checkpoint(&self) -> RbResult<()> { 770 | rt().block_on(create_checkpoint(&self._table.borrow(), None)) 771 | .map_err(RubyError::from)?; 772 | 773 | Ok(()) 774 | } 775 | 776 | pub fn cleanup_metadata(&self) -> RbResult<()> { 777 | rt().block_on(cleanup_metadata(&self._table.borrow(), None)) 778 | .map_err(RubyError::from)?; 779 | 780 | Ok(()) 781 | } 782 | 783 | pub fn get_add_file_sizes(&self) -> RbResult> { 784 | Ok(self 785 | ._table 786 | .borrow() 787 | .snapshot() 788 | .map_err(RubyError::from)? 
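            // walk the Add actions in the eager snapshot and report (path, size) pairs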
789 | .eager_snapshot() 790 | .files() 791 | .map(|f| (f.path().to_string(), f.size())) 792 | .collect::>()) 793 | } 794 | 795 | pub fn delete( 796 | &self, 797 | predicate: Option, 798 | writer_properties: Option, 799 | commit_properties: Option, 800 | post_commithook_properties: Option, 801 | ) -> RbResult { 802 | let mut cmd = DeleteBuilder::new( 803 | self._table.borrow().log_store(), 804 | self._table 805 | .borrow() 806 | .snapshot() 807 | .map_err(RubyError::from)? 808 | .clone(), 809 | ); 810 | if let Some(predicate) = predicate { 811 | cmd = cmd.with_predicate(predicate); 812 | } 813 | if let Some(writer_props) = writer_properties { 814 | cmd = cmd.with_writer_properties( 815 | set_writer_properties(writer_props).map_err(RubyError::from)?, 816 | ); 817 | } 818 | if let Some(commit_properties) = 819 | maybe_create_commit_properties(commit_properties, post_commithook_properties) 820 | { 821 | cmd = cmd.with_commit_properties(commit_properties); 822 | } 823 | 824 | let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 825 | self._table.borrow_mut().state = table.state; 826 | Ok(serde_json::to_string(&metrics).unwrap()) 827 | } 828 | 829 | pub fn set_table_properties( 830 | &self, 831 | properties: HashMap, 832 | raise_if_not_exists: bool, 833 | ) -> RbResult<()> { 834 | let cmd = SetTablePropertiesBuilder::new( 835 | self._table.borrow().log_store(), 836 | self._table 837 | .borrow() 838 | .snapshot() 839 | .map_err(RubyError::from)? 840 | .clone(), 841 | ) 842 | .with_properties(properties) 843 | .with_raise_if_not_exists(raise_if_not_exists); 844 | 845 | let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 846 | self._table.borrow_mut().state = table.state; 847 | Ok(()) 848 | } 849 | 850 | pub fn repair( 851 | &self, 852 | dry_run: bool, 853 | commit_properties: Option, 854 | post_commithook_properties: Option, 855 | ) -> RbResult { 856 | let mut cmd = FileSystemCheckBuilder::new( 857 | self._table.borrow().log_store(), 858 | self._table 859 | .borrow() 860 | .snapshot() 861 | .map_err(RubyError::from)? 
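            // FSCK: drops references to data files that are missing from the object store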
862 | .clone(), 863 | ) 864 | .with_dry_run(dry_run); 865 | 866 | if let Some(commit_properties) = 867 | maybe_create_commit_properties(commit_properties, post_commithook_properties) 868 | { 869 | cmd = cmd.with_commit_properties(commit_properties); 870 | } 871 | 872 | let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?; 873 | self._table.borrow_mut().state = table.state; 874 | Ok(serde_json::to_string(&metrics).unwrap()) 875 | } 876 | 877 | pub fn transaction_versions(&self) -> RHash { 878 | RHash::from_iter( 879 | self._table 880 | .borrow() 881 | .get_app_transaction_version() 882 | .into_iter() 883 | .map(|(app_id, transaction)| (app_id, RbTransaction::from(transaction))), 884 | ) 885 | } 886 | } 887 | 888 | fn set_post_commithook_properties( 889 | mut commit_properties: CommitProperties, 890 | post_commithook_properties: RbPostCommitHookProperties, 891 | ) -> CommitProperties { 892 | commit_properties = 893 | commit_properties.with_create_checkpoint(post_commithook_properties.create_checkpoint); 894 | commit_properties = commit_properties 895 | .with_cleanup_expired_logs(post_commithook_properties.cleanup_expired_logs); 896 | commit_properties 897 | } 898 | 899 | fn set_writer_properties(writer_properties: RbWriterProperties) -> DeltaResult { 900 | let mut properties = WriterProperties::builder(); 901 | let data_page_size_limit = writer_properties.data_page_size_limit; 902 | let dictionary_page_size_limit = writer_properties.dictionary_page_size_limit; 903 | let data_page_row_count_limit = writer_properties.data_page_row_count_limit; 904 | let write_batch_size = writer_properties.write_batch_size; 905 | let max_row_group_size = writer_properties.max_row_group_size; 906 | let compression = writer_properties.compression; 907 | let statistics_truncate_length = writer_properties.statistics_truncate_length; 908 | let default_column_properties = writer_properties.default_column_properties; 909 | let column_properties = writer_properties.column_properties; 910 | 911 | if let Some(data_page_size) = data_page_size_limit { 912 | properties = properties.set_data_page_size_limit(data_page_size); 913 | } 914 | if let Some(dictionary_page_size) = dictionary_page_size_limit { 915 | properties = properties.set_dictionary_page_size_limit(dictionary_page_size); 916 | } 917 | if let Some(data_page_row_count) = data_page_row_count_limit { 918 | properties = properties.set_data_page_row_count_limit(data_page_row_count); 919 | } 920 | if let Some(batch_size) = write_batch_size { 921 | properties = properties.set_write_batch_size(batch_size); 922 | } 923 | if let Some(row_group_size) = max_row_group_size { 924 | properties = properties.set_max_row_group_size(row_group_size); 925 | } 926 | properties = properties.set_statistics_truncate_length(statistics_truncate_length); 927 | 928 | if let Some(compression) = compression { 929 | let compress: Compression = compression 930 | .parse() 931 | .map_err(|err: ParquetError| DeltaTableError::Generic(err.to_string()))?; 932 | 933 | properties = properties.set_compression(compress); 934 | } 935 | 936 | if let Some(default_column_properties) = default_column_properties { 937 | if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled { 938 | properties = properties.set_dictionary_enabled(dictionary_enabled); 939 | } 940 | if let Some(bloom_filter_properties) = default_column_properties.bloom_filter_properties { 941 | if let Some(set_bloom_filter_enabled) = bloom_filter_properties.set_bloom_filter_enabled 942 | { 943 | 
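            // table-wide bloom filter defaults; per-column overrides follow in the
            // column_properties loop below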
properties = properties.set_bloom_filter_enabled(set_bloom_filter_enabled); 944 | } 945 | if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp { 946 | properties = properties.set_bloom_filter_fpp(bloom_filter_fpp); 947 | } 948 | if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv { 949 | properties = properties.set_bloom_filter_ndv(bloom_filter_ndv); 950 | } 951 | } 952 | } 953 | if let Some(column_properties) = column_properties { 954 | for (column_name, column_prop) in column_properties { 955 | if let Some(column_prop) = column_prop { 956 | if let Some(dictionary_enabled) = column_prop.dictionary_enabled { 957 | properties = properties.set_column_dictionary_enabled( 958 | column_name.clone().into(), 959 | dictionary_enabled, 960 | ); 961 | } 962 | if let Some(bloom_filter_properties) = column_prop.bloom_filter_properties { 963 | if let Some(set_bloom_filter_enabled) = 964 | bloom_filter_properties.set_bloom_filter_enabled 965 | { 966 | properties = properties.set_column_bloom_filter_enabled( 967 | column_name.clone().into(), 968 | set_bloom_filter_enabled, 969 | ); 970 | } 971 | if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp { 972 | properties = properties.set_column_bloom_filter_fpp( 973 | column_name.clone().into(), 974 | bloom_filter_fpp, 975 | ); 976 | } 977 | if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv { 978 | properties = properties 979 | .set_column_bloom_filter_ndv(column_name.into(), bloom_filter_ndv); 980 | } 981 | } 982 | } 983 | } 984 | } 985 | Ok(properties.build()) 986 | } 987 | 988 | fn convert_partition_filters( 989 | partitions_filters: Vec<(String, String, PartitionFilterValue)>, 990 | ) -> Result, DeltaTableError> { 991 | partitions_filters 992 | .into_iter() 993 | .map(|filter| match filter { 994 | (key, op, PartitionFilterValue::Single(v)) => { 995 | let key: &'_ str = key.as_ref(); 996 | let op: &'_ str = op.as_ref(); 997 | let v: &'_ str = v.as_ref(); 998 | PartitionFilter::try_from((key, op, v)) 999 | } 1000 | (key, op, PartitionFilterValue::Multiple(v)) => { 1001 | let key: &'_ str = key.as_ref(); 1002 | let op: &'_ str = op.as_ref(); 1003 | let v: Vec<&'_ str> = v.iter().map(|v| v.as_ref()).collect(); 1004 | PartitionFilter::try_from((key, op, v.as_slice())) 1005 | } 1006 | }) 1007 | .collect() 1008 | } 1009 | 1010 | fn maybe_create_commit_properties( 1011 | maybe_commit_properties: Option, 1012 | post_commithook_properties: Option, 1013 | ) -> Option { 1014 | if maybe_commit_properties.is_none() && post_commithook_properties.is_none() { 1015 | return None; 1016 | } 1017 | let mut commit_properties = CommitProperties::default(); 1018 | 1019 | if let Some(commit_props) = maybe_commit_properties { 1020 | if let Some(metadata) = commit_props.custom_metadata { 1021 | let json_metadata: Map = 1022 | metadata.into_iter().map(|(k, v)| (k, v.into())).collect(); 1023 | commit_properties = commit_properties.with_metadata(json_metadata); 1024 | }; 1025 | 1026 | if let Some(max_retries) = commit_props.max_commit_retries { 1027 | commit_properties = commit_properties.with_max_retries(max_retries); 1028 | }; 1029 | 1030 | if let Some(app_transactions) = commit_props.app_transactions { 1031 | let app_transactions = app_transactions.iter().map(Transaction::from).collect(); 1032 | commit_properties = commit_properties.with_application_transactions(app_transactions); 1033 | } 1034 | } 1035 | 1036 | if let Some(post_commit_hook_props) = post_commithook_properties { 1037 | commit_properties = 1038 | 
set_post_commithook_properties(commit_properties, post_commit_hook_props)
1039 | }
1040 | Some(commit_properties)
1041 | }
1042 |
1043 | fn rust_core_version() -> String {
1044 | deltalake::crate_version().to_string()
1045 | }
1046 |
1047 | pub struct BloomFilterProperties {
1048 | pub set_bloom_filter_enabled: Option<bool>,
1049 | pub fpp: Option<f64>,
1050 | pub ndv: Option<u64>,
1051 | }
1052 |
1053 | impl TryConvert for BloomFilterProperties {
1054 | fn try_convert(val: Value) -> RbResult<Self> {
1055 | Ok(BloomFilterProperties {
1056 | set_bloom_filter_enabled: val.funcall("set_bloom_filter_enabled", ())?,
1057 | fpp: val.funcall("fpp", ())?,
1058 | ndv: val.funcall("ndv", ())?,
1059 | })
1060 | }
1061 | }
1062 |
1063 | pub struct ColumnProperties {
1064 | pub dictionary_enabled: Option<bool>,
1065 | pub max_statistics_size: Option<usize>,
1066 | pub bloom_filter_properties: Option<BloomFilterProperties>,
1067 | }
1068 |
1069 | impl TryConvert for ColumnProperties {
1070 | fn try_convert(val: Value) -> RbResult<Self> {
1071 | Ok(ColumnProperties {
1072 | dictionary_enabled: val.funcall("dictionary_enabled", ())?,
1073 | max_statistics_size: val.funcall("max_statistics_size", ())?,
1074 | bloom_filter_properties: val.funcall("bloom_filter_properties", ())?,
1075 | })
1076 | }
1077 | }
1078 |
1079 | pub struct RbWriterProperties {
1080 | data_page_size_limit: Option<usize>,
1081 | dictionary_page_size_limit: Option<usize>,
1082 | data_page_row_count_limit: Option<usize>,
1083 | write_batch_size: Option<usize>,
1084 | max_row_group_size: Option<usize>,
1085 | statistics_truncate_length: Option<usize>,
1086 | compression: Option<String>,
1087 | default_column_properties: Option<ColumnProperties>,
1088 | column_properties: Option<HashMap<String, Option<ColumnProperties>>>,
1089 | }
1090 |
1091 | impl TryConvert for RbWriterProperties {
1092 | fn try_convert(val: Value) -> RbResult<Self> {
1093 | Ok(RbWriterProperties {
1094 | data_page_size_limit: val.funcall("data_page_size_limit", ())?,
1095 | dictionary_page_size_limit: val.funcall("dictionary_page_size_limit", ())?,
1096 | data_page_row_count_limit: val.funcall("data_page_row_count_limit", ())?,
1097 | write_batch_size: val.funcall("write_batch_size", ())?,
1098 | max_row_group_size: val.funcall("max_row_group_size", ())?,
1099 | statistics_truncate_length: val.funcall("statistics_truncate_length", ())?,
1100 | compression: val.funcall("compression", ())?,
1101 | default_column_properties: val.funcall("default_column_properties", ())?,
1102 | // TODO fix
1103 | column_properties: None,
1104 | })
1105 | }
1106 | }
1107 |
1108 | pub struct RbPostCommitHookProperties {
1109 | create_checkpoint: bool,
1110 | cleanup_expired_logs: Option<bool>,
1111 | }
1112 |
1113 | impl TryConvert for RbPostCommitHookProperties {
1114 | fn try_convert(val: Value) -> RbResult<Self> {
1115 | Ok(RbPostCommitHookProperties {
1116 | create_checkpoint: val.funcall("create_checkpoint", ())?,
1117 | cleanup_expired_logs: val.funcall("cleanup_expired_logs", ())?,
1118 | })
1119 | }
1120 | }
1121 |
1122 | #[magnus::wrap(class = "DeltaLake::Transaction")]
1123 | pub struct RbTransaction {
1124 | pub app_id: String,
1125 | pub version: i64,
1126 | pub last_updated: Option<i64>,
1127 | }
1128 |
1129 | impl From<Transaction> for RbTransaction {
1130 | fn from(value: Transaction) -> Self {
1131 | RbTransaction {
1132 | app_id: value.app_id,
1133 | version: value.version,
1134 | last_updated: value.last_updated,
1135 | }
1136 | }
1137 | }
1138 |
1139 | impl From<&RbTransaction> for Transaction {
1140 | fn from(value: &RbTransaction) -> Self {
1141 | Transaction {
1142 | app_id: value.app_id.clone(),
1143 | version: value.version,
1144 | last_updated: value.last_updated,
1145
| } 1146 | } 1147 | } 1148 | 1149 | pub struct RbCommitProperties { 1150 | custom_metadata: Option>, 1151 | max_commit_retries: Option, 1152 | app_transactions: Option>, 1153 | } 1154 | 1155 | impl TryConvert for RbCommitProperties { 1156 | fn try_convert(val: Value) -> RbResult { 1157 | Ok(RbCommitProperties { 1158 | custom_metadata: val.funcall("custom_metadata", ())?, 1159 | max_commit_retries: val.funcall("max_commit_retries", ())?, 1160 | // TODO fix 1161 | app_transactions: None, 1162 | }) 1163 | } 1164 | } 1165 | 1166 | #[allow(clippy::too_many_arguments)] 1167 | fn write_to_deltalake( 1168 | table_uri: String, 1169 | data: RbArrowType, 1170 | mode: String, 1171 | table: Option<&RawDeltaTable>, 1172 | schema_mode: Option, 1173 | partition_by: Option>, 1174 | predicate: Option, 1175 | target_file_size: Option, 1176 | name: Option, 1177 | description: Option, 1178 | configuration: Option>>, 1179 | storage_options: Option>, 1180 | writer_properties: Option, 1181 | commit_properties: Option, 1182 | post_commithook_properties: Option, 1183 | ) -> RbResult<()> { 1184 | let batches = data.0.map(|batch| batch.unwrap()).collect::>(); 1185 | let save_mode = mode.parse().map_err(RubyError::from)?; 1186 | 1187 | let options = storage_options.clone().unwrap_or_default(); 1188 | let table = if let Some(table) = table { 1189 | DeltaOps(table._table.borrow().clone()) 1190 | } else { 1191 | rt().block_on(DeltaOps::try_from_uri_with_storage_options( 1192 | &table_uri, options, 1193 | )) 1194 | .map_err(RubyError::from)? 1195 | }; 1196 | 1197 | let mut builder = table.write(batches).with_save_mode(save_mode); 1198 | if let Some(schema_mode) = schema_mode { 1199 | builder = builder.with_schema_mode(schema_mode.parse().map_err(RubyError::from)?); 1200 | } 1201 | if let Some(partition_columns) = partition_by { 1202 | builder = builder.with_partition_columns(partition_columns); 1203 | } 1204 | 1205 | if let Some(writer_props) = writer_properties { 1206 | builder = builder 1207 | .with_writer_properties(set_writer_properties(writer_props).map_err(RubyError::from)?); 1208 | } 1209 | 1210 | if let Some(name) = &name { 1211 | builder = builder.with_table_name(name); 1212 | }; 1213 | 1214 | if let Some(description) = &description { 1215 | builder = builder.with_description(description); 1216 | }; 1217 | 1218 | if let Some(predicate) = predicate { 1219 | builder = builder.with_replace_where(predicate); 1220 | }; 1221 | 1222 | if let Some(target_file_size) = target_file_size { 1223 | builder = builder.with_target_file_size(target_file_size) 1224 | }; 1225 | 1226 | if let Some(config) = configuration { 1227 | builder = builder.with_configuration(config); 1228 | }; 1229 | 1230 | if let Some(commit_properties) = 1231 | maybe_create_commit_properties(commit_properties, post_commithook_properties) 1232 | { 1233 | builder = builder.with_commit_properties(commit_properties); 1234 | }; 1235 | 1236 | rt().block_on(builder.into_future()) 1237 | .map_err(RubyError::from)?; 1238 | 1239 | Ok(()) 1240 | } 1241 | 1242 | pub struct RbArrowType(pub T); 1243 | 1244 | impl TryConvert for RbArrowType { 1245 | fn try_convert(val: Value) -> RbResult { 1246 | let addr: usize = val.funcall("to_i", ())?; 1247 | 1248 | // use similar approach as Polars to consume pointer and avoid copy 1249 | let stream_ptr = 1250 | Box::new(unsafe { std::ptr::replace(addr as _, FFI_ArrowArrayStream::empty()) }); 1251 | 1252 | Ok(RbArrowType( 1253 | ArrowArrayStreamReader::try_new(*stream_ptr) 1254 | .map_err(|err| DeltaError::new_err(err.to_string()))?, 
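// Editor's note (descriptive comment, not in the original source): the Ruby side passes the raw
// integer address of an FFI_ArrowArrayStream (see ArrowArrayStream#to_i below). The
// std::ptr::replace call above swaps an empty stream into that slot, so Rust takes ownership of
// the C stream and reads the record batches without copying the underlying Arrow buffers.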
1255 | )) 1256 | } 1257 | } 1258 | 1259 | #[magnus::wrap(class = "DeltaLake::ArrowArrayStream")] 1260 | pub struct ArrowArrayStream { 1261 | stream: FFI_ArrowArrayStream, 1262 | } 1263 | 1264 | impl ArrowArrayStream { 1265 | pub fn to_i(&self) -> usize { 1266 | (&self.stream as *const _) as usize 1267 | } 1268 | } 1269 | 1270 | #[magnus::init] 1271 | fn init(ruby: &Ruby) -> RbResult<()> { 1272 | deltalake::aws::register_handlers(None); 1273 | deltalake::azure::register_handlers(None); 1274 | deltalake::gcp::register_handlers(None); 1275 | 1276 | let module = ruby.define_module("DeltaLake")?; 1277 | module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 15))?; 1278 | module.define_singleton_method("rust_core_version", function!(rust_core_version, 0))?; 1279 | 1280 | let class = module.define_class("RawDeltaTable", ruby.class_object())?; 1281 | class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?; 1282 | class.define_singleton_method("is_deltatable", function!(RawDeltaTable::is_deltatable, 2))?; 1283 | class.define_method("table_uri", method!(RawDeltaTable::table_uri, 0))?; 1284 | class.define_method("version", method!(RawDeltaTable::version, 0))?; 1285 | class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?; 1286 | class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?; 1287 | class.define_method( 1288 | "protocol_versions", 1289 | method!(RawDeltaTable::protocol_versions, 0), 1290 | )?; 1291 | class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?; 1292 | class.define_method( 1293 | "get_latest_version", 1294 | method!(RawDeltaTable::get_latest_version, 0), 1295 | )?; 1296 | class.define_method( 1297 | "get_earliest_version", 1298 | method!(RawDeltaTable::get_earliest_version, 0), 1299 | )?; 1300 | class.define_method( 1301 | "get_num_index_cols", 1302 | method!(RawDeltaTable::get_num_index_cols, 0), 1303 | )?; 1304 | class.define_method( 1305 | "get_stats_columns", 1306 | method!(RawDeltaTable::get_stats_columns, 0), 1307 | )?; 1308 | class.define_method( 1309 | "load_with_datetime", 1310 | method!(RawDeltaTable::load_with_datetime, 1), 1311 | )?; 1312 | class.define_method("files", method!(RawDeltaTable::files, 1))?; 1313 | class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?; 1314 | class.define_method("schema", method!(RawDeltaTable::schema, 0))?; 1315 | class.define_method("vacuum", method!(RawDeltaTable::vacuum, 5))?; 1316 | class.define_method( 1317 | "compact_optimize", 1318 | method!(RawDeltaTable::compact_optimize, 7), 1319 | )?; 1320 | class.define_method( 1321 | "z_order_optimize", 1322 | method!(RawDeltaTable::z_order_optimize, 9), 1323 | )?; 1324 | class.define_method("add_columns", method!(RawDeltaTable::add_columns, 1))?; 1325 | class.define_method("add_feature", method!(RawDeltaTable::add_feature, 2))?; 1326 | class.define_method( 1327 | "add_constraints", 1328 | method!(RawDeltaTable::add_constraints, 1), 1329 | )?; 1330 | class.define_method( 1331 | "drop_constraints", 1332 | method!(RawDeltaTable::drop_constraints, 2), 1333 | )?; 1334 | class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?; 1335 | class.define_method( 1336 | "create_merge_builder", 1337 | method!(RawDeltaTable::create_merge_builder, 8), 1338 | )?; 1339 | class.define_method("merge_execute", method!(RawDeltaTable::merge_execute, 1))?; 1340 | class.define_method("restore", method!(RawDeltaTable::restore, 4))?; 1341 | class.define_method("history", 
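// Editor's note (descriptive comment, not in the original source): the integer argument to the
// magnus method!/function! macros is the arity the Ruby method exposes, and it must match the
// number of arguments the wrapped Rust function accepts (excluding &self).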
method!(RawDeltaTable::history, 1))?; 1342 | class.define_method( 1343 | "update_incremental", 1344 | method!(RawDeltaTable::update_incremental, 0), 1345 | )?; 1346 | class.define_method( 1347 | "get_active_partitions", 1348 | method!(RawDeltaTable::get_active_partitions, 0), 1349 | )?; 1350 | class.define_method( 1351 | "create_checkpoint", 1352 | method!(RawDeltaTable::create_checkpoint, 0), 1353 | )?; 1354 | class.define_method( 1355 | "cleanup_metadata", 1356 | method!(RawDeltaTable::cleanup_metadata, 0), 1357 | )?; 1358 | class.define_method( 1359 | "get_add_file_sizes", 1360 | method!(RawDeltaTable::get_add_file_sizes, 0), 1361 | )?; 1362 | class.define_method("delete", method!(RawDeltaTable::delete, 4))?; 1363 | class.define_method( 1364 | "set_table_properties", 1365 | method!(RawDeltaTable::set_table_properties, 2), 1366 | )?; 1367 | class.define_method("repair", method!(RawDeltaTable::repair, 3))?; 1368 | class.define_method( 1369 | "transaction_versions", 1370 | method!(RawDeltaTable::transaction_versions, 0), 1371 | )?; 1372 | 1373 | let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?; 1374 | class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?; 1375 | class.define_method("name", method!(RawDeltaTableMetaData::name, 0))?; 1376 | class.define_method( 1377 | "description", 1378 | method!(RawDeltaTableMetaData::description, 0), 1379 | )?; 1380 | class.define_method( 1381 | "partition_columns", 1382 | method!(RawDeltaTableMetaData::partition_columns, 0), 1383 | )?; 1384 | class.define_method( 1385 | "created_time", 1386 | method!(RawDeltaTableMetaData::created_time, 0), 1387 | )?; 1388 | class.define_method( 1389 | "configuration", 1390 | method!(RawDeltaTableMetaData::configuration, 0), 1391 | )?; 1392 | 1393 | let class = module.define_class("ArrowArrayStream", ruby.class_object())?; 1394 | class.define_method("to_i", method!(ArrowArrayStream::to_i, 0))?; 1395 | 1396 | let class = module.define_class("Field", ruby.class_object())?; 1397 | class.define_method("name", method!(Field::name, 0))?; 1398 | class.define_method("type", method!(Field::get_type, 0))?; 1399 | class.define_method("nullable", method!(Field::nullable, 0))?; 1400 | 1401 | let class = module.define_class("RbMergeBuilder", ruby.class_object())?; 1402 | class.define_method("source_alias", method!(RbMergeBuilder::source_alias, 0))?; 1403 | class.define_method("target_alias", method!(RbMergeBuilder::target_alias, 0))?; 1404 | class.define_method( 1405 | "when_matched_update", 1406 | method!(RbMergeBuilder::when_matched_update, 2), 1407 | )?; 1408 | class.define_method( 1409 | "when_matched_delete", 1410 | method!(RbMergeBuilder::when_matched_delete, 1), 1411 | )?; 1412 | class.define_method( 1413 | "when_not_matched_insert", 1414 | method!(RbMergeBuilder::when_not_matched_insert, 2), 1415 | )?; 1416 | class.define_method( 1417 | "when_not_matched_by_source_update", 1418 | method!(RbMergeBuilder::when_not_matched_by_source_update, 2), 1419 | )?; 1420 | class.define_method( 1421 | "when_not_matched_by_source_delete", 1422 | method!(RbMergeBuilder::when_not_matched_by_source_delete, 1), 1423 | )?; 1424 | 1425 | Ok(()) 1426 | } 1427 | -------------------------------------------------------------------------------- /ext/deltalake/src/merge.rs: -------------------------------------------------------------------------------- 1 | use deltalake::arrow::array::RecordBatchReader; 2 | use deltalake::arrow::datatypes::Schema as ArrowSchema; 3 | use 
deltalake::arrow::ffi_stream::ArrowArrayStreamReader; 4 | use deltalake::datafusion::catalog::TableProvider; 5 | use deltalake::datafusion::datasource::MemTable; 6 | use deltalake::datafusion::prelude::SessionContext; 7 | use deltalake::logstore::LogStoreRef; 8 | use deltalake::operations::merge::MergeBuilder; 9 | use deltalake::table::state::DeltaTableState; 10 | use deltalake::{DeltaResult, DeltaTable}; 11 | use std::cell::RefCell; 12 | use std::collections::HashMap; 13 | use std::future::IntoFuture; 14 | use std::sync::Arc; 15 | 16 | use crate::error::RubyError; 17 | use crate::utils::rt; 18 | use crate::RbResult; 19 | use crate::{ 20 | maybe_create_commit_properties, set_writer_properties, RbCommitProperties, 21 | RbPostCommitHookProperties, RbWriterProperties, 22 | }; 23 | 24 | #[magnus::wrap(class = "DeltaLake::RbMergeBuilder")] 25 | pub(crate) struct RbMergeBuilder { 26 | _builder: RefCell>, 27 | source_alias: Option, 28 | target_alias: Option, 29 | #[allow(dead_code)] 30 | arrow_schema: Arc, 31 | } 32 | 33 | // getters 34 | impl RbMergeBuilder { 35 | pub fn source_alias(&self) -> Option { 36 | self.source_alias.clone() 37 | } 38 | 39 | pub fn target_alias(&self) -> Option { 40 | self.target_alias.clone() 41 | } 42 | } 43 | 44 | impl RbMergeBuilder { 45 | #[allow(clippy::too_many_arguments)] 46 | pub fn new( 47 | log_store: LogStoreRef, 48 | snapshot: DeltaTableState, 49 | source: ArrowArrayStreamReader, 50 | predicate: String, 51 | source_alias: Option, 52 | target_alias: Option, 53 | safe_cast: bool, 54 | writer_properties: Option, 55 | post_commithook_properties: Option, 56 | commit_properties: Option, 57 | ) -> DeltaResult { 58 | let ctx = SessionContext::new(); 59 | let schema = source.schema(); 60 | let batches = vec![source.map(|batch| batch.unwrap()).collect::>()]; 61 | let table_provider: Arc = 62 | Arc::new(MemTable::try_new(schema.clone(), batches).unwrap()); 63 | let source_df = ctx.read_table(table_provider).unwrap(); 64 | 65 | let mut cmd = 66 | MergeBuilder::new(log_store, snapshot, predicate, source_df).with_safe_cast(safe_cast); 67 | 68 | if let Some(src_alias) = &source_alias { 69 | cmd = cmd.with_source_alias(src_alias); 70 | } 71 | 72 | if let Some(trgt_alias) = &target_alias { 73 | cmd = cmd.with_target_alias(trgt_alias); 74 | } 75 | 76 | if let Some(writer_props) = writer_properties { 77 | cmd = cmd.with_writer_properties(set_writer_properties(writer_props)?); 78 | } 79 | 80 | if let Some(commit_properties) = 81 | maybe_create_commit_properties(commit_properties, post_commithook_properties) 82 | { 83 | cmd = cmd.with_commit_properties(commit_properties); 84 | } 85 | 86 | Ok(Self { 87 | _builder: RefCell::new(Some(cmd)), 88 | source_alias, 89 | target_alias, 90 | arrow_schema: schema, 91 | }) 92 | } 93 | 94 | pub fn execute(&self) -> DeltaResult<(DeltaTable, String)> { 95 | let (table, metrics) = rt().block_on(self._builder.take().unwrap().into_future())?; 96 | Ok((table, serde_json::to_string(&metrics).unwrap())) 97 | } 98 | } 99 | 100 | impl RbMergeBuilder { 101 | pub fn when_matched_update( 102 | &self, 103 | updates: HashMap, 104 | predicate: Option, 105 | ) -> RbResult<()> { 106 | let mut binding = self._builder.borrow_mut(); 107 | *binding = match binding.take() { 108 | Some(cmd) => Some( 109 | cmd.when_matched_update(|mut update| { 110 | for (column, expression) in updates { 111 | update = update.update(column, expression) 112 | } 113 | if let Some(predicate) = predicate { 114 | update = update.predicate(predicate) 115 | }; 116 | update 117 | }) 118 | 
.map_err(RubyError::from)?, 119 | ), 120 | None => unreachable!(), 121 | }; 122 | Ok(()) 123 | } 124 | 125 | pub fn when_matched_delete(&self, predicate: Option) -> RbResult<()> { 126 | let mut binding = self._builder.borrow_mut(); 127 | *binding = match binding.take() { 128 | Some(cmd) => Some( 129 | cmd.when_matched_delete(|mut delete| { 130 | if let Some(predicate) = predicate { 131 | delete = delete.predicate(predicate) 132 | }; 133 | delete 134 | }) 135 | .map_err(RubyError::from)?, 136 | ), 137 | None => unreachable!(), 138 | }; 139 | Ok(()) 140 | } 141 | 142 | pub fn when_not_matched_insert( 143 | &self, 144 | updates: HashMap, 145 | predicate: Option, 146 | ) -> RbResult<()> { 147 | let mut binding = self._builder.borrow_mut(); 148 | *binding = match binding.take() { 149 | Some(cmd) => Some( 150 | cmd.when_not_matched_insert(|mut insert| { 151 | for (column, expression) in updates { 152 | insert = insert.set(column, expression) 153 | } 154 | if let Some(predicate) = predicate { 155 | insert = insert.predicate(predicate) 156 | }; 157 | insert 158 | }) 159 | .map_err(RubyError::from)?, 160 | ), 161 | None => unreachable!(), 162 | }; 163 | Ok(()) 164 | } 165 | 166 | pub fn when_not_matched_by_source_update( 167 | &self, 168 | updates: HashMap, 169 | predicate: Option, 170 | ) -> RbResult<()> { 171 | let mut binding = self._builder.borrow_mut(); 172 | *binding = match binding.take() { 173 | Some(cmd) => Some( 174 | cmd.when_not_matched_by_source_update(|mut update| { 175 | for (column, expression) in updates { 176 | update = update.update(column, expression) 177 | } 178 | if let Some(predicate) = predicate { 179 | update = update.predicate(predicate) 180 | }; 181 | update 182 | }) 183 | .map_err(RubyError::from)?, 184 | ), 185 | None => unreachable!(), 186 | }; 187 | Ok(()) 188 | } 189 | 190 | pub fn when_not_matched_by_source_delete(&self, predicate: Option) -> RbResult<()> { 191 | let mut binding = self._builder.borrow_mut(); 192 | *binding = match binding.take() { 193 | Some(cmd) => Some( 194 | cmd.when_not_matched_by_source_delete(|mut delete| { 195 | if let Some(predicate) = predicate { 196 | delete = delete.predicate(predicate) 197 | }; 198 | delete 199 | }) 200 | .map_err(RubyError::from)?, 201 | ), 202 | None => unreachable!(), 203 | }; 204 | Ok(()) 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /ext/deltalake/src/schema.rs: -------------------------------------------------------------------------------- 1 | use deltalake::kernel::{StructField, StructType as DeltaStructType}; 2 | use magnus::{value::ReprValue, Module, RArray, RModule, Ruby, Value}; 3 | 4 | use crate::RbResult; 5 | 6 | pub fn schema_to_rbobject(schema: DeltaStructType) -> RbResult { 7 | let fields = schema.fields().map(|field| Field { 8 | inner: field.clone(), 9 | }); 10 | 11 | let rb_schema: Value = Ruby::get() 12 | .unwrap() 13 | .class_object() 14 | .const_get::<_, RModule>("DeltaLake")? 
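// Editor's note (descriptive comment, not in the original source): this chain looks up the Ruby
// DeltaLake::Schema class and constructs it with the wrapped Field objects, so Ruby callers see
// plain DeltaLake::Field values rather than raw kernel StructFields.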
15 | .const_get("Schema")?; 16 | 17 | rb_schema.funcall("new", (RArray::from_iter(fields),)) 18 | } 19 | 20 | #[magnus::wrap(class = "DeltaLake::Field")] 21 | pub struct Field { 22 | pub inner: StructField, 23 | } 24 | 25 | impl Field { 26 | pub fn name(&self) -> String { 27 | self.inner.name().to_string() 28 | } 29 | 30 | pub fn get_type(&self) -> String { 31 | self.inner.data_type().to_string() 32 | } 33 | 34 | pub fn nullable(&self) -> bool { 35 | self.inner.is_nullable() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /ext/deltalake/src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::sync::OnceLock; 2 | 3 | use tokio::runtime::Runtime; 4 | 5 | #[inline] 6 | pub fn rt() -> &'static Runtime { 7 | static TOKIO_RT: OnceLock = OnceLock::new(); 8 | static PID: OnceLock = OnceLock::new(); 9 | let pid = std::process::id(); 10 | let runtime_pid = *PID.get_or_init(|| pid); 11 | if pid != runtime_pid { 12 | panic!( 13 | "Forked process detected - current PID is {} but the tokio runtime was created by {}. The tokio \ 14 | runtime does not support forked processes https://github.com/tokio-rs/tokio/issues/4301. If you are \ 15 | seeing this message while using Ruby multithreading make sure to use the `spawn` or `forkserver` \ 16 | mode.", 17 | pid, runtime_pid 18 | ); 19 | } 20 | TOKIO_RT.get_or_init(|| Runtime::new().expect("Failed to create a tokio runtime.")) 21 | } 22 | -------------------------------------------------------------------------------- /lib/deltalake-rb.rb: -------------------------------------------------------------------------------- 1 | require_relative "deltalake" 2 | -------------------------------------------------------------------------------- /lib/deltalake.rb: -------------------------------------------------------------------------------- 1 | # ext 2 | begin 3 | require "deltalake/#{RUBY_VERSION.to_f}/deltalake" 4 | rescue LoadError 5 | require "deltalake/deltalake" 6 | end 7 | 8 | # stdlib 9 | require "json" 10 | require "time" 11 | 12 | # modules 13 | require_relative "deltalake/field" 14 | require_relative "deltalake/metadata" 15 | require_relative "deltalake/schema" 16 | require_relative "deltalake/table" 17 | require_relative "deltalake/table_alterer" 18 | require_relative "deltalake/table_merger" 19 | require_relative "deltalake/table_optimizer" 20 | require_relative "deltalake/utils" 21 | require_relative "deltalake/version" 22 | 23 | module DeltaLake 24 | class Error < StandardError; end 25 | class TableNotFoundError < Error; end 26 | class DeltaProtocolError < Error; end 27 | class CommitFailedError < Error; end 28 | class SchemaMismatchError < Error; end 29 | 30 | class Todo < Error 31 | def message 32 | "not implemented yet" 33 | end 34 | end 35 | 36 | ProtocolVersions = 37 | Struct.new( 38 | :min_reader_version, 39 | :min_writer_version, 40 | :writer_features, 41 | :reader_features 42 | ) 43 | 44 | CommitProperties = 45 | Struct.new( 46 | :custom_metadata, 47 | :max_commit_retries, 48 | # TODO 49 | # :app_transactions, 50 | keyword_init: true 51 | ) 52 | 53 | PostCommitHookProperties = 54 | Struct.new( 55 | :create_checkpoint, 56 | :cleanup_expired_logs, 57 | keyword_init: true 58 | ) 59 | 60 | class ArrowArrayStream 61 | def arrow_c_stream 62 | self 63 | end 64 | end 65 | 66 | class << self 67 | def write( 68 | table_or_uri, 69 | data, 70 | partition_by: nil, 71 | mode: "error", 72 | name: nil, 73 | description: nil, 74 | configuration: nil, 75 | 
schema_mode: nil, 76 | storage_options: nil, 77 | predicate: nil, 78 | target_file_size: nil, 79 | writer_properties: nil, 80 | commit_properties: nil, 81 | post_commithook_properties: nil 82 | ) 83 | table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options) 84 | 85 | if partition_by.is_a?(String) 86 | partition_by = [partition_by] 87 | end 88 | 89 | if !table.nil? && mode == "ignore" 90 | return 91 | end 92 | 93 | data = Utils.convert_data(data) 94 | 95 | write_deltalake_rust( 96 | table_uri, 97 | data, 98 | mode, 99 | table&._table, 100 | schema_mode, 101 | partition_by, 102 | predicate, 103 | target_file_size, 104 | name, 105 | description, 106 | configuration, 107 | storage_options, 108 | writer_properties, 109 | commit_properties, 110 | post_commithook_properties 111 | ) 112 | 113 | if table 114 | table.update_incremental 115 | end 116 | end 117 | 118 | private 119 | 120 | def try_get_table_and_table_uri(table_or_uri, storage_options) 121 | if !table_or_uri.is_a?(String) && !table_or_uri.is_a?(Table) 122 | raise ArgumentError, "table_or_uri must be a String or Table" 123 | end 124 | 125 | if table_or_uri.is_a?(String) 126 | table = try_get_deltatable(table_or_uri, storage_options) 127 | table_uri = table_or_uri.to_s 128 | else 129 | table = table_or_uri 130 | table_uri = table._table.table_uri 131 | end 132 | 133 | [table, table_uri] 134 | end 135 | 136 | def try_get_deltatable(table_uri, storage_options) 137 | Table.new(table_uri, storage_options: storage_options) 138 | rescue TableNotFoundError 139 | nil 140 | end 141 | end 142 | end 143 | -------------------------------------------------------------------------------- /lib/deltalake/field.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | class Field 3 | def inspect 4 | attributes = { 5 | name: name, 6 | type: type, 7 | nullable: nullable 8 | } 9 | "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>" 10 | end 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/deltalake/metadata.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | class Metadata 3 | def initialize(table) 4 | @metadata = table.metadata 5 | end 6 | 7 | def id 8 | @metadata.id 9 | end 10 | 11 | def name 12 | @metadata.name 13 | end 14 | 15 | def description 16 | @metadata.description 17 | end 18 | 19 | def partition_columns 20 | @metadata.partition_columns 21 | end 22 | 23 | def created_time 24 | @metadata.created_time 25 | end 26 | 27 | def configuration 28 | @metadata.configuration 29 | end 30 | 31 | def inspect 32 | attributes = { 33 | id: id, 34 | name: name, 35 | description: description, 36 | partition_columns: partition_columns, 37 | created_time: created_time, 38 | configuration: configuration 39 | } 40 | "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>" 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/deltalake/schema.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | class Schema 3 | attr_reader :fields 4 | 5 | def initialize(fields) 6 | @fields = fields 7 | end 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /lib/deltalake/table.rb: -------------------------------------------------------------------------------- 1 | module 
DeltaLake 2 | class Table 3 | FSCK_METRICS_FILES_REMOVED_LABEL = "files_removed" 4 | 5 | def initialize( 6 | table_uri, 7 | version: nil, 8 | storage_options: nil, 9 | without_files: false, 10 | log_buffer_size: nil 11 | ) 12 | @storage_options = storage_options 13 | @table = 14 | RawDeltaTable.new( 15 | table_uri, 16 | version, 17 | storage_options, 18 | without_files, 19 | log_buffer_size 20 | ) 21 | end 22 | 23 | def self.exists?(table_uri, storage_options: nil) 24 | RawDeltaTable.is_deltatable(table_uri, storage_options) 25 | end 26 | 27 | def version 28 | @table.version 29 | end 30 | 31 | def partitions 32 | partitions = [] 33 | @table.get_active_partitions.each do |partition| 34 | next unless partition 35 | partitions << partition.to_h 36 | end 37 | partitions 38 | end 39 | 40 | def files(partition_filters: nil) 41 | @table.files(_stringify_partition_values(partition_filters)) 42 | end 43 | 44 | def file_uris(partition_filters: nil) 45 | @table.file_uris(_stringify_partition_values(partition_filters)) 46 | end 47 | 48 | def load_as_version(version) 49 | if version.is_a?(Integer) 50 | @table.load_version(version) 51 | elsif version.is_a?(Time) 52 | @table.load_with_datetime(version.utc.iso8601(9)) 53 | elsif version.is_a?(String) 54 | @table.load_with_datetime(version) 55 | else 56 | raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted." 57 | end 58 | end 59 | 60 | def load_cdf( 61 | starting_version: 0, 62 | ending_version: nil, 63 | starting_timestamp: nil, 64 | ending_timestamp: nil, 65 | columns: nil 66 | ) 67 | @table.load_cdf( 68 | starting_version, 69 | ending_version, 70 | starting_timestamp, 71 | ending_timestamp, 72 | columns 73 | ) 74 | end 75 | 76 | def table_uri 77 | @table.table_uri 78 | end 79 | 80 | def schema 81 | @table.schema 82 | end 83 | 84 | def metadata 85 | Metadata.new(@table) 86 | end 87 | 88 | def protocol 89 | ProtocolVersions.new(*@table.protocol_versions) 90 | end 91 | 92 | def history(limit: nil) 93 | backwards_enumerate = lambda do |iterable, start_end, &block| 94 | n = start_end 95 | iterable.each do |elem| 96 | block.call(n, elem) 97 | n -= 1 98 | end 99 | end 100 | 101 | commits = @table.history(limit) 102 | history = [] 103 | backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw| 104 | commit = JSON.parse(commit_info_raw) 105 | commit["version"] = version 106 | history << commit 107 | end 108 | history 109 | end 110 | 111 | def vacuum( 112 | retention_hours: nil, 113 | dry_run: true, 114 | enforce_retention_duration: true, 115 | post_commithook_properties: nil, 116 | commit_properties: nil 117 | ) 118 | if retention_hours 119 | if retention_hours < 0 120 | raise ArgumentError, "The retention periods should be positive." 
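# Editor's note (descriptive comment, not in the original source): a negative retention period is
# rejected here; the underlying vacuum operation additionally enforces the table's minimum
# retention (168 hours by default) unless enforce_retention_duration is set to false.
# Illustrative call (dry run by default):
#   dt.vacuum(retention_hours: 0, enforce_retention_duration: false)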
121 | end 122 | end 123 | 124 | @table.vacuum( 125 | dry_run, 126 | retention_hours, 127 | enforce_retention_duration, 128 | commit_properties, 129 | post_commithook_properties 130 | ) 131 | end 132 | 133 | def optimize 134 | TableOptimizer.new(self) 135 | end 136 | 137 | def alter 138 | TableAlterer.new(self) 139 | end 140 | 141 | def merge( 142 | source, 143 | predicate, 144 | source_alias: nil, 145 | target_alias: nil, 146 | error_on_type_mismatch: true, 147 | writer_properties: nil, 148 | post_commithook_properties: nil, 149 | commit_properties: nil 150 | ) 151 | source = Utils.convert_data(source) 152 | 153 | rb_merge_builder = 154 | @table.create_merge_builder( 155 | source, 156 | predicate, 157 | source_alias, 158 | target_alias, 159 | !error_on_type_mismatch, 160 | writer_properties, 161 | post_commithook_properties, 162 | commit_properties 163 | ) 164 | TableMerger.new(rb_merge_builder, @table) 165 | end 166 | 167 | def restore( 168 | target, 169 | ignore_missing_files: false, 170 | protocol_downgrade_allowed: false, 171 | commit_properties: nil 172 | ) 173 | if target.is_a?(Time) 174 | metrics = 175 | @table.restore( 176 | target.utc.iso8601(9), 177 | ignore_missing_files, 178 | protocol_downgrade_allowed, 179 | commit_properties 180 | ) 181 | else 182 | metrics = 183 | @table.restore( 184 | target, 185 | ignore_missing_files, 186 | protocol_downgrade_allowed, 187 | commit_properties 188 | ) 189 | end 190 | JSON.parse(metrics) 191 | end 192 | 193 | def to_polars(eager: true, rechunk: false, columns: nil) 194 | require "polars-df" 195 | 196 | sources = file_uris 197 | if sources.empty? 198 | lf = Polars::LazyFrame.new 199 | else 200 | delta_keys = [ 201 | "AWS_S3_ALLOW_UNSAFE_RENAME", 202 | "AWS_S3_LOCKING_PROVIDER", 203 | "CONDITIONAL_PUT", 204 | "DELTA_DYNAMO_TABLE_NAME" 205 | ] 206 | storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) } 207 | lf = 208 | Polars.scan_parquet( 209 | sources, 210 | hive_partitioning: true, 211 | storage_options: storage_options, 212 | rechunk: rechunk 213 | ) 214 | 215 | if columns 216 | # by_name requires polars-df > 0.15.0 217 | lf = lf.select(Polars.cs.by_name(*columns)) 218 | end 219 | end 220 | 221 | eager ? lf.collect : lf 222 | end 223 | 224 | def update_incremental 225 | @table.update_incremental 226 | end 227 | 228 | def delete( 229 | predicate = nil, 230 | writer_properties: nil, 231 | post_commithook_properties: nil, 232 | commit_properties: nil 233 | ) 234 | metrics = 235 | @table.delete( 236 | predicate, 237 | writer_properties, 238 | post_commithook_properties, 239 | commit_properties 240 | ) 241 | JSON.parse(metrics).transform_keys(&:to_sym) 242 | end 243 | 244 | def repair( 245 | dry_run: false, 246 | post_commithook_properties: nil, 247 | commit_properties: nil 248 | ) 249 | metrics = 250 | @table.repair( 251 | dry_run, 252 | commit_properties, 253 | post_commithook_properties 254 | ) 255 | deserialized_metrics = JSON.parse(metrics) 256 | deserialized_metrics[FSCK_METRICS_FILES_REMOVED_LABEL] = JSON.parse( 257 | deserialized_metrics[FSCK_METRICS_FILES_REMOVED_LABEL] 258 | ) 259 | deserialized_metrics.transform_keys(&:to_sym) 260 | end 261 | 262 | def transaction_versions 263 | @table.transaction_versions 264 | end 265 | 266 | # private 267 | def _table 268 | @table 269 | end 270 | 271 | # private 272 | def _stringify_partition_values(partition_filters) 273 | if partition_filters.nil? 
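# Editor's note (descriptive comment, not in the original source): nil partition filters pass
# straight through below; any non-nil value raises DeltaLake::Todo, so files, file_uris, and the
# optimize operations currently support only unfiltered calls.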
274 | return partition_filters 275 | end 276 | 277 | raise Todo 278 | end 279 | end 280 | end 281 | -------------------------------------------------------------------------------- /lib/deltalake/table_alterer.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | class TableAlterer 3 | def initialize(table) 4 | @table = table 5 | end 6 | 7 | def add_feature( 8 | feature, 9 | allow_protocol_versions_increase: false 10 | ) 11 | if !feature.is_a?(Array) 12 | feature = [feature] 13 | end 14 | @table._table.add_feature( 15 | feature, 16 | allow_protocol_versions_increase 17 | ) 18 | end 19 | 20 | def add_columns(fields) 21 | if fields.is_a?(DeltaLake::Field) 22 | fields = [fields] 23 | end 24 | 25 | @table._table.add_columns( 26 | fields 27 | ) 28 | end 29 | 30 | def add_constraint(constraints) 31 | if constraints.length > 1 32 | raise ArgumentError, 33 | "add_constraints is limited to a single constraint addition at once for now." 34 | end 35 | 36 | @table._table.add_constraints( 37 | constraints 38 | ) 39 | end 40 | 41 | def drop_constraint(name, raise_if_not_exists: true) 42 | @table._table.drop_constraints( 43 | name, 44 | raise_if_not_exists 45 | ) 46 | end 47 | 48 | def set_table_properties( 49 | properties, 50 | raise_if_not_exists: true 51 | ) 52 | @table._table.set_table_properties( 53 | properties, 54 | raise_if_not_exists 55 | ) 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /lib/deltalake/table_merger.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | class TableMerger 3 | def initialize(builder, table) 4 | @builder = builder 5 | @table = table 6 | end 7 | 8 | def when_matched_update(updates, predicate: nil) 9 | @builder.when_matched_update(updates, predicate) 10 | self 11 | end 12 | 13 | def when_not_matched_insert(updates, predicate: nil) 14 | @builder.when_not_matched_insert(updates, predicate) 15 | self 16 | end 17 | 18 | def when_matched_delete(predicate: nil) 19 | @builder.when_matched_delete(predicate) 20 | self 21 | end 22 | 23 | def when_not_matched_by_source_update(updates, predicate: nil) 24 | @builder.when_not_matched_by_source_update(updates, predicate) 25 | self 26 | end 27 | 28 | def when_not_matched_by_source_delete(predicate: nil) 29 | @builder.when_not_matched_by_source_delete(predicate) 30 | self 31 | end 32 | 33 | def execute 34 | metrics = @table.merge_execute(@builder) 35 | JSON.parse(metrics).transform_keys(&:to_sym) 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /lib/deltalake/table_optimizer.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | class TableOptimizer 3 | def initialize(table) 4 | @table = table 5 | end 6 | 7 | def compact( 8 | partition_filters: nil, 9 | target_size: nil, 10 | max_concurrent_tasks: nil, 11 | min_commit_interval: nil, 12 | writer_properties: nil, 13 | post_commithook_properties: nil, 14 | commit_properties: nil 15 | ) 16 | metrics = 17 | @table._table.compact_optimize( 18 | @table._stringify_partition_values(partition_filters), 19 | target_size, 20 | max_concurrent_tasks, 21 | min_commit_interval, 22 | writer_properties, 23 | post_commithook_properties, 24 | commit_properties 25 | ) 26 | @table.update_incremental 27 | result = JSON.parse(metrics) 28 | ["filesAdded", "filesRemoved"].each do |key| 29 | result[key] = 
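# Editor's note (descriptive comment, not in the original source): the filesAdded/filesRemoved
# entries come back as JSON-encoded strings inside the metrics payload, so they are parsed here
# into hashes (avg/max/min/totalFiles/totalSize), as exercised by optimize_test.rb.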
JSON.parse(result[key]) if result[key].is_a?(String) 30 | end 31 | # TODO return underscore symbols like delete 32 | result 33 | end 34 | 35 | def z_order( 36 | columns, 37 | partition_filters: nil, 38 | target_size: nil, 39 | max_concurrent_tasks: nil, 40 | max_spill_size: 20 * 1024 * 1024 * 1024, 41 | min_commit_interval: nil, 42 | writer_properties: nil, 43 | post_commithook_properties: nil, 44 | commit_properties: nil 45 | ) 46 | metrics = 47 | @table._table.z_order_optimize( 48 | Array(columns), 49 | @table._stringify_partition_values(partition_filters), 50 | target_size, 51 | max_concurrent_tasks, 52 | max_spill_size, 53 | min_commit_interval, 54 | writer_properties, 55 | post_commithook_properties, 56 | commit_properties 57 | ) 58 | @table.update_incremental 59 | result = JSON.parse(metrics) 60 | ["filesAdded", "filesRemoved"].each do |key| 61 | result[key] = JSON.parse(result[key]) if result[key].is_a?(String) 62 | end 63 | # TODO return underscore symbols like delete 64 | result 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/deltalake/utils.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | module Utils 3 | def self.convert_data(data) 4 | if data.respond_to?(:arrow_c_stream) 5 | # TODO convert other object types 6 | # should probably move logic to Rust 7 | if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame) 8 | data = convert_polars_data(data) 9 | end 10 | 11 | data.arrow_c_stream 12 | else 13 | raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source." 14 | end 15 | end 16 | 17 | # unsigned integers are not part of the protocol 18 | # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types 19 | def self.convert_polars_data(data) 20 | new_schema = {} 21 | data.schema.each do |k, v| 22 | new_type = convert_polars_type(v) 23 | new_schema[k] = new_type if new_type 24 | end 25 | 26 | if new_schema.any? 27 | data.cast(new_schema) 28 | else 29 | data 30 | end 31 | end 32 | 33 | def self.convert_polars_type(t) 34 | case t 35 | when Polars::UInt8 36 | Polars::Int8 37 | when Polars::UInt16 38 | Polars::Int16 39 | when Polars::UInt32 40 | Polars::Int32 41 | when Polars::UInt64 42 | Polars::Int64 43 | when Polars::Datetime 44 | Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us" 45 | when Polars::List 46 | inner = convert_polars_type(t.inner) 47 | Polars::List.new(inner) if inner 48 | when Polars::Array 49 | inner = convert_polars_type(t.inner) 50 | Polars::Array.new(t.inner, t.width) if inner 51 | when Polars::Struct 52 | if t.fields.any? 
{ |f| convert_polars_type(f.dtype) } 53 | fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) } 54 | Polars::Struct.new(fields) 55 | end 56 | end 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/deltalake/version.rb: -------------------------------------------------------------------------------- 1 | module DeltaLake 2 | VERSION = "0.1.7" 3 | end 4 | -------------------------------------------------------------------------------- /test/alter_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class AlterTest < Minitest::Test 4 | def test_add_feature 5 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 6 | with_table(df) do |dt| 7 | error = assert_raises(DeltaLake::Error) do 8 | dt.alter.add_feature(:append_only) 9 | end 10 | assert_match "Table feature enables writer feature, but min_writer is not v7", error.message 11 | 12 | dt.alter.add_feature(:append_only, allow_protocol_versions_increase: true) 13 | 14 | protocol = dt.protocol 15 | assert_equal 1, protocol.min_reader_version 16 | assert_equal 7, protocol.min_writer_version 17 | assert_equal ["appendOnly"], protocol.writer_features 18 | assert_nil protocol.reader_features 19 | 20 | error = assert_raises(ArgumentError) do 21 | dt.alter.add_feature(:missing) 22 | end 23 | assert_equal "Invalid feature", error.message 24 | end 25 | end 26 | 27 | def test_add_columns 28 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 29 | with_table(df) do |dt| 30 | # TODO improve 31 | dt.alter.add_columns([]) 32 | 33 | assert_equal 1, dt.schema.fields.size 34 | end 35 | end 36 | 37 | def test_add_constraint 38 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 39 | with_table(df) do |dt| 40 | error = assert_raises(DeltaLake::DeltaProtocolError) do 41 | dt.alter.add_constraint({"a_gt_1" => "a > 1"}) 42 | end 43 | assert_match "Check or Invariant (a > 1) violated", error.message 44 | 45 | dt.alter.add_constraint({"a_gt_0" => "a > 0"}) 46 | 47 | df2 = Polars::DataFrame.new({"a" => [4, 5, -1]}) 48 | error = assert_raises(DeltaLake::DeltaProtocolError) do 49 | DeltaLake.write(dt, df2, mode: "append") 50 | end 51 | assert_match "Check or Invariant (a > 0) violated", error.message 52 | end 53 | end 54 | 55 | def test_drop_constraint 56 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 57 | with_table(df) do |dt| 58 | dt.alter.add_constraint({"a_gt_0" => "a > 0"}) 59 | 60 | dt.alter.drop_constraint("a_gt_0") 61 | 62 | df2 = Polars::DataFrame.new({"a" => [4, 5, -1]}) 63 | DeltaLake.write(dt, df2, mode: "append") 64 | end 65 | end 66 | 67 | def test_drop_constraint_missing 68 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 69 | with_table(df) do |dt| 70 | error = assert_raises(DeltaLake::Error) do 71 | dt.alter.drop_constraint("a_gt_0") 72 | end 73 | assert_equal "Generic DeltaTable error: Constraint with name: a_gt_0 doesn't exists", error.message 74 | 75 | dt.alter.drop_constraint("a_gt_0", raise_if_not_exists: false) 76 | end 77 | end 78 | 79 | def test_set_table_properties 80 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 81 | with_table(df) do |dt| 82 | dt.alter.set_table_properties({"delta.enableChangeDataFeed" => "true"}) 83 | 84 | assert_equal "true", dt.metadata.configuration["delta.enableChangeDataFeed"] 85 | 86 | error = assert_raises(DeltaLake::Error) do 87 | dt.alter.set_table_properties({"missing" => "true"}) 88 | end 89 | assert_equal "Kernel: Generic delta 
kernel error: Error parsing property 'missing':'true'", error.message 90 | end 91 | end 92 | end 93 | -------------------------------------------------------------------------------- /test/merge_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class MergeTest < Minitest::Test 4 | def test_when_matched_update 5 | df = Polars::DataFrame.new({"x" => [1, 2, 3], "y" => [4, 5, 6]}) 6 | with_table(df) do |dt| 7 | source = Polars::DataFrame.new({"x" => [2, 3], "y" => [5, 8]}) 8 | 9 | metrics = 10 | dt.merge(source, "target.x = source.x", source_alias: "source", target_alias: "target") 11 | .when_matched_update({"x" => "source.x", "y" => "source.y"}) 12 | .execute 13 | assert_equal 3, metrics[:num_output_rows] 14 | assert_equal 1, metrics[:num_target_files_added] 15 | assert_equal 1, metrics[:num_target_files_removed] 16 | 17 | expected = Polars::DataFrame.new({"x" => [1, 2, 3], "y" => [4, 5, 8]}) 18 | assert_equal expected, dt.to_polars 19 | end 20 | end 21 | 22 | def test_when_not_matched_insert 23 | df = Polars::DataFrame.new({"x" => [1, 2, 3], "y" => [4, 5, 6]}) 24 | with_table(df) do |dt| 25 | source = Polars::DataFrame.new({"x" => [2, 3, 7], "y" => [4, 5, 8]}) 26 | 27 | metrics = 28 | dt.merge(source, "target.x = source.x", source_alias: "source", target_alias: "target") 29 | .when_not_matched_insert({"x" => "source.x", "y" => "source.y"}) 30 | .execute 31 | assert_equal 1, metrics[:num_output_rows] 32 | assert_equal 1, metrics[:num_target_files_added] 33 | assert_equal 0, metrics[:num_target_files_removed] 34 | 35 | expected = Polars::DataFrame.new({"x" => [1, 2, 3, 7], "y" => [4, 5, 6, 8]}) 36 | assert_equal expected, dt.to_polars 37 | end 38 | end 39 | 40 | def test_when_matched_delete 41 | df = Polars::DataFrame.new({"x" => [1, 2, 3], "y" => [4, 5, 6]}) 42 | with_table(df) do |dt| 43 | source = Polars::DataFrame.new({"x" => [2, 3], "deleted" => [false, true]}) 44 | 45 | metrics = 46 | dt.merge(source, "target.x = source.x", source_alias: "source", target_alias: "target") 47 | .when_matched_delete(predicate: "source.deleted = true") 48 | .execute 49 | assert_equal 2, metrics[:num_output_rows] 50 | assert_equal 1, metrics[:num_target_files_added] 51 | assert_equal 1, metrics[:num_target_files_removed] 52 | 53 | expected = Polars::DataFrame.new({"x" => [1, 2], "y" => [4, 5]}) 54 | assert_equal expected, dt.to_polars 55 | end 56 | end 57 | 58 | def test_multiple 59 | df = Polars::DataFrame.new({"x" => [1, 2, 3], "y" => [4, 5, 6]}) 60 | with_table(df) do |dt| 61 | source = Polars::DataFrame.new({"x" => [2, 3, 5], "y" => [5, 8, 11]}) 62 | 63 | metrics = 64 | dt.merge(source, "target.x = source.x", source_alias: "source", target_alias: "target") 65 | .when_matched_delete(predicate: "source.x = target.x") 66 | .when_matched_update({"x" => "source.x", "y" => "source.y"}) 67 | .execute 68 | assert_equal 1, metrics[:num_output_rows] 69 | assert_equal 1, metrics[:num_target_files_added] 70 | assert_equal 1, metrics[:num_target_files_removed] 71 | 72 | expected = Polars::DataFrame.new({"x" => [1], "y" => [4]}) 73 | assert_equal expected, dt.to_polars 74 | end 75 | end 76 | 77 | def test_when_not_matched_by_source_update 78 | df = Polars::DataFrame.new({"x" => [1, 2, 3], "y" => [4, 5, 6]}) 79 | with_table(df) do |dt| 80 | source = Polars::DataFrame.new({"x" => [2, 3, 4]}) 81 | 82 | metrics = 83 | dt.merge(source, "target.x = source.x", source_alias: "source", target_alias: "target") 84 | 
.when_not_matched_by_source_update({"`y`" => "0"}, predicate: "`y` > 3") 85 | .execute 86 | assert_equal 3, metrics[:num_output_rows] 87 | assert_equal 1, metrics[:num_target_files_added] 88 | assert_equal 1, metrics[:num_target_files_removed] 89 | 90 | expected = Polars::DataFrame.new({"x" => [1, 2, 3], "y" => [0, 5, 6]}) 91 | assert_equal expected, dt.to_polars 92 | end 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /test/optimize_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class OptimizeTest < Minitest::Test 4 | def test_compact 5 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 6 | with_table(df) do |dt| 7 | 2.times do 8 | DeltaLake.write(dt, df, mode: "append") 9 | end 10 | 11 | metrics = dt.optimize.compact 12 | assert_equal 1, metrics["numFilesAdded"] 13 | assert_equal 3, metrics["numFilesRemoved"] 14 | assert_in_delta 528, metrics["filesAdded"]["avg"] 15 | assert_equal 528, metrics["filesAdded"]["max"] 16 | assert_equal 528, metrics["filesAdded"]["min"] 17 | assert_equal 1, metrics["filesAdded"]["totalFiles"] 18 | assert_equal 528, metrics["filesAdded"]["totalSize"] 19 | assert_in_delta 514, metrics["filesRemoved"]["avg"] 20 | assert_equal 514, metrics["filesRemoved"]["max"] 21 | assert_equal 514, metrics["filesRemoved"]["min"] 22 | assert_equal 3, metrics["filesRemoved"]["totalFiles"] 23 | assert_equal 1542, metrics["filesRemoved"]["totalSize"] 24 | assert_equal 1, metrics["partitionsOptimized"] 25 | assert_equal 3, metrics["numBatches"] 26 | assert_equal 3, metrics["totalConsideredFiles"] 27 | assert_equal 0, metrics["totalFilesSkipped"] 28 | assert_equal true, metrics["preserveInsertionOrder"] 29 | end 30 | end 31 | 32 | def test_z_order 33 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 34 | with_table(df) do |dt| 35 | 2.times do 36 | DeltaLake.write(dt, df, mode: "append") 37 | end 38 | 39 | metrics = dt.optimize.z_order(["a"]) 40 | assert_equal 1, metrics["numFilesAdded"] 41 | assert_equal 3, metrics["numFilesRemoved"] 42 | assert_in_delta 528, metrics["filesAdded"]["avg"] 43 | assert_equal 528, metrics["filesAdded"]["max"] 44 | assert_equal 528, metrics["filesAdded"]["min"] 45 | assert_equal 1, metrics["filesAdded"]["totalFiles"] 46 | assert_equal 528, metrics["filesAdded"]["totalSize"] 47 | assert_in_delta 514, metrics["filesRemoved"]["avg"] 48 | assert_equal 514, metrics["filesRemoved"]["max"] 49 | assert_equal 514, metrics["filesRemoved"]["min"] 50 | assert_equal 3, metrics["filesRemoved"]["totalFiles"] 51 | assert_equal 1542, metrics["filesRemoved"]["totalSize"] 52 | assert_equal 0, metrics["partitionsOptimized"] 53 | assert_equal 1, metrics["numBatches"] 54 | assert_equal 3, metrics["totalConsideredFiles"] 55 | assert_equal 0, metrics["totalFilesSkipped"] 56 | assert_equal true, metrics["preserveInsertionOrder"] 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /test/table_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class TableTest < Minitest::Test 4 | def test_to_polars 5 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 6 | with_table(df) do |dt| 7 | assert_equal df, dt.to_polars 8 | assert_equal df, dt.to_polars(eager: false).collect 9 | end 10 | end 11 | 12 | def test_files 13 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 14 | with_table(df) do |dt| 15 | assert_equal 1, 
dt.file_uris.length 16 | assert_equal 1, dt.files.length 17 | end 18 | end 19 | 20 | def test_partition_by 21 | df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 4, 5]}) 22 | with_table(df, partition_by: "b") do |dt| 23 | assert_equal df, dt.to_polars 24 | end 25 | end 26 | 27 | def test_metadata 28 | df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 4, 5]}) 29 | with_table(df, name: "hello", description: "world", partition_by: "b") do |dt| 30 | metadata = dt.metadata 31 | assert_kind_of String, metadata.id 32 | assert_equal "hello", metadata.name 33 | assert_equal "world", metadata.description 34 | assert_equal ["b"], metadata.partition_columns 35 | # consistent with Python 36 | assert_kind_of Integer, metadata.created_time 37 | assert_empty metadata.configuration 38 | 39 | partitions = dt.partitions 40 | assert_equal 2, partitions.size 41 | assert_includes partitions, {"b" => "4"} 42 | assert_includes partitions, {"b" => "5"} 43 | end 44 | end 45 | 46 | def test_protocol 47 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 48 | with_table(df) do |dt| 49 | protocol = dt.protocol 50 | assert_equal 1, protocol.min_reader_version 51 | assert_equal 2, protocol.min_writer_version 52 | assert_nil protocol.reader_features 53 | assert_nil protocol.writer_features 54 | end 55 | end 56 | 57 | def test_schema 58 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 59 | with_table(df) do |dt| 60 | schema = dt.schema 61 | assert_equal 1, schema.fields.length 62 | assert_equal "a", schema.fields[0].name 63 | assert_equal "long", schema.fields[0].type 64 | assert_equal true, schema.fields[0].nullable 65 | assert_match %!@fields=[]!, dt.schema.inspect 66 | end 67 | end 68 | 69 | def test_history 70 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 71 | with_table(df) do |dt| 72 | dt.delete("a > 1") 73 | 74 | history = dt.history 75 | assert_equal 2, history.size 76 | assert_equal "DELETE", history[0]["operation"] 77 | assert_equal 1, history[0]["version"] 78 | assert_equal "WRITE", history[1]["operation"] 79 | assert_equal 0, history[1]["version"] 80 | 81 | assert_equal 1, dt.history(limit: 1).size 82 | end 83 | end 84 | 85 | def test_load_cdf 86 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 87 | with_table(df) do |dt| 88 | DeltaLake.write(dt, df, mode: "overwrite") 89 | DeltaLake.write(dt, df, mode: "append") 90 | 91 | cdf = dt.load_cdf(starting_version: 1, ending_version: 2) 92 | assert_kind_of DeltaLake::ArrowArrayStream, cdf 93 | 94 | assert_equal 2, Polars::DataFrame.new(cdf).n_unique(subset: ["_commit_version"]) 95 | 96 | error = assert_raises(ArgumentError) do 97 | Polars::DataFrame.new(cdf) 98 | end 99 | assert_equal "the C stream was already released", error.message 100 | 101 | cdf = dt.load_cdf(starting_version: 1, ending_version: 2, columns: ["a", "_commit_timestamp"]) 102 | assert_equal ["a", "_commit_timestamp"], Polars::DataFrame.new(cdf).columns 103 | end 104 | end 105 | 106 | def test_delete 107 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 108 | with_table(df) do |dt| 109 | metrics = dt.delete("a > 1") 110 | assert_equal 1, metrics[:num_added_files] 111 | assert_equal 1, metrics[:num_removed_files] 112 | assert_equal 2, metrics[:num_deleted_rows] 113 | assert_equal 1, metrics[:num_copied_rows] 114 | 115 | expected = Polars::DataFrame.new({"a" => [1]}) 116 | assert_equal expected, dt.to_polars 117 | 118 | dt.delete 119 | assert_empty dt.to_polars 120 | end 121 | end 122 | 123 | def test_vacuum 124 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 125 | with_table(df) do |dt| 
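# Editor's note (descriptive comment, not in the original source): vacuum defaults to a dry run
# and returns the list of files that would be removed; with nothing expired yet, both the default
# call and the zero-retention call below return an empty list.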
126 |       assert_empty dt.vacuum
127 |       assert_empty dt.vacuum(retention_hours: 0, enforce_retention_duration: false)
128 | 
129 |       dt.delete
130 | 
131 |       assert_empty dt.vacuum
132 | 
133 |       # fix flakiness
134 |       sleep(0.001)
135 | 
136 |       assert_equal 1, dt.vacuum(retention_hours: 0, enforce_retention_duration: false).size
137 |       assert_equal 1, dt.vacuum(dry_run: false, retention_hours: 0, enforce_retention_duration: false).size
138 |       assert_empty dt.vacuum(retention_hours: 0, enforce_retention_duration: false)
139 | 
140 |       error = assert_raises(DeltaLake::Error) do
141 |         dt.vacuum(retention_hours: 0)
142 |       end
143 |       assert_match "minimum retention for vacuum is configured to be greater than 168 hours", error.message
144 |     end
145 |   end
146 | 
147 |   def test_vacuum_commit_properties
148 |     df = Polars::DataFrame.new({"a" => [1, 2, 3]})
149 |     with_table(df) do |dt|
150 |       dt.delete
151 | 
152 |       # fix flakiness
153 |       sleep(0.001)
154 | 
155 |       dt.vacuum(
156 |         dry_run: false,
157 |         retention_hours: 0,
158 |         enforce_retention_duration: false,
159 |         commit_properties: DeltaLake::CommitProperties.new(custom_metadata: {"hello" => "world"})
160 |       )
161 | 
162 |       history = dt.history(limit: 1)
163 |       assert_equal "world", history[0]["hello"]
164 |     end
165 |   end
166 | 
167 |   def test_repair
168 |     df = Polars::DataFrame.new({"a" => [1, 2, 3]})
169 |     with_table(df) do |dt|
170 |       metrics = dt.repair(dry_run: true)
171 |       assert_equal true, metrics[:dry_run]
172 |       assert_empty metrics[:files_removed]
173 | 
174 |       metrics = dt.repair
175 |       assert_equal false, metrics[:dry_run]
176 |       assert_empty metrics[:files_removed]
177 |     end
178 |   end
179 | 
180 |   def test_restore
181 |     df = Polars::DataFrame.new({"a" => [1, 2, 3]})
182 |     with_table(df) do |dt|
183 |       df2 = Polars::DataFrame.new({"a" => [4, 5, 6]})
184 |       DeltaLake.write(dt, df2, mode: "overwrite")
185 | 
186 |       metrics = dt.restore(dt.version - 1)
187 |       assert_equal 1, metrics["numRemovedFile"]
188 |       assert_equal 1, metrics["numRestoredFile"]
189 | 
190 |       assert_equal 2, dt.version
191 |       assert_equal df, dt.to_polars
192 |     end
193 |   end
194 | 
195 |   def test_missing
196 |     with_new_table do |table_uri|
197 |       error = assert_raises(DeltaLake::TableNotFoundError) do
198 |         DeltaLake::Table.new(table_uri)
199 |       end
200 |       assert_equal "no log files", error.message
201 |       assert_equal false, DeltaLake::Table.exists?(table_uri)
202 |     end
203 |   end
204 | end
205 | 
--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
1 | require "bundler/setup"
2 | Bundler.require(:default)
3 | require "minitest/autorun"
4 | require "polars-df"
5 | 
6 | class Minitest::Test
7 |   def with_new_table
8 |     prefix = ENV["CLOUD_PREFIX"]
9 | 
10 |     if prefix
11 |       if prefix.start_with?("s3://")
12 |         ENV["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
13 |       end
14 |       yield "#{prefix}/delta-ruby-test/#{Time.now.to_f}"
15 |     else
16 |       Dir.mktmpdir do |table_uri|
17 |         yield table_uri
18 |       end
19 |     end
20 |   end
21 | 
22 |   def with_table(df, **write_options)
23 |     with_new_table do |table_uri|
24 |       DeltaLake.write(table_uri, df, **write_options)
25 |       yield DeltaLake::Table.new(table_uri)
26 |     end
27 |   end
28 | end
29 | 
--------------------------------------------------------------------------------
/test/types_test.rb:
--------------------------------------------------------------------------------
1 | require_relative "test_helper"
2 | 
3 | class TypesTest < Minitest::Test
4 |   def test_types
5 |     schema = {
6 |       "int8" => Polars::Int8,
"int8" => Polars::Int8, 7 | "int16" => Polars::Int16, 8 | "int32" => Polars::Int32, 9 | "int64" => Polars::Int64, 10 | "uint8" => Polars::UInt8, 11 | "uint16" => Polars::UInt16, 12 | "uint32" => Polars::UInt32, 13 | "uint64" => Polars::UInt64, 14 | "float32" => Polars::Float32, 15 | "float64" => Polars::Float64, 16 | "decimal" => Polars::Decimal, 17 | "boolean" => Polars::Boolean, 18 | "date" => Polars::Date, 19 | "datetime_ms" => Polars::Datetime.new("ms"), 20 | "datetime_us" => Polars::Datetime.new("us"), 21 | "datetime_ns" => Polars::Datetime.new("ns"), 22 | "datetime_ms_tz" => Polars::Datetime.new("ms", "UTC"), 23 | "datetime_us_tz" => Polars::Datetime.new("us", "UTC"), 24 | "datetime_ns_tz" => Polars::Datetime.new("ns", "UTC"), 25 | "string" => Polars::String, 26 | "binary" => Polars::Binary, 27 | "list" => Polars::List.new(Polars::UInt32), 28 | "struct" => Polars::Struct.new([Polars::Field.new("a", Polars::UInt32)]) 29 | } 30 | row = {} 31 | schema.each_key do |k| 32 | row[k] = 33 | case k 34 | when "list" 35 | [1] 36 | when "struct" 37 | {"a" => 1} 38 | else 39 | 1 40 | end 41 | end 42 | df = Polars::DataFrame.new([row], schema: schema) 43 | with_table(df) do |dt| 44 | types = dt.schema.fields.to_h { |f| [f.name, f.type] } 45 | 46 | assert_equal "byte", types["int8"] 47 | assert_equal "short", types["int16"] 48 | assert_equal "integer", types["int32"] 49 | assert_equal "long", types["int64"] 50 | 51 | assert_equal "byte", types["uint8"] 52 | assert_equal "short", types["uint16"] 53 | assert_equal "integer", types["uint32"] 54 | assert_equal "long", types["uint64"] 55 | 56 | assert_equal "float", types["float32"] 57 | assert_equal "double", types["float64"] 58 | assert_equal "decimal(38,0)", types["decimal"] 59 | 60 | assert_equal "boolean", types["boolean"] 61 | 62 | assert_equal "date", types["date"] 63 | assert_equal "timestamp_ntz", types["datetime_ms"] 64 | assert_equal "timestamp_ntz", types["datetime_us"] 65 | assert_equal "timestamp_ntz", types["datetime_ns"] 66 | assert_equal "timestamp", types["datetime_ms_tz"] 67 | assert_equal "timestamp", types["datetime_us_tz"] 68 | assert_equal "timestamp", types["datetime_ns_tz"] 69 | 70 | assert_equal "string", types["string"] 71 | assert_equal "binary", types["binary"] 72 | 73 | assert_equal "array", types["list"] 74 | assert_equal "struct", types["struct"] 75 | 76 | pl_types = dt.to_polars.schema 77 | 78 | assert_equal Polars::Int8, pl_types["int8"] 79 | assert_equal Polars::Int16, pl_types["int16"] 80 | assert_equal Polars::Int32, pl_types["int32"] 81 | assert_equal Polars::Int64, pl_types["int64"] 82 | 83 | # unsigned integers are converted to signed 84 | assert_equal Polars::Int8, pl_types["uint8"] 85 | assert_equal Polars::Int16, pl_types["uint16"] 86 | assert_equal Polars::Int32, pl_types["uint32"] 87 | assert_equal Polars::Int64, pl_types["uint64"] 88 | 89 | assert_equal Polars::Float32, pl_types["float32"] 90 | assert_equal Polars::Float64, pl_types["float64"] 91 | assert_equal Polars::Decimal.new(38, 0), pl_types["decimal"] 92 | 93 | assert_equal Polars::Boolean, pl_types["boolean"] 94 | 95 | assert_equal Polars::Date, pl_types["date"] 96 | assert_equal Polars::Datetime.new("us"), pl_types["datetime_ms"] 97 | assert_equal Polars::Datetime.new("us"), pl_types["datetime_us"] 98 | assert_equal Polars::Datetime.new("us"), pl_types["datetime_ns"] 99 | assert_equal Polars::Datetime.new("us", "UTC"), pl_types["datetime_ms_tz"] 100 | assert_equal Polars::Datetime.new("us", "UTC"), pl_types["datetime_us_tz"] 101 | assert_equal 
Polars::Datetime.new("us", "UTC"), pl_types["datetime_ns_tz"] 102 | 103 | assert_equal Polars::String, pl_types["string"] 104 | assert_equal Polars::Binary, pl_types["binary"] 105 | 106 | assert_equal Polars::List.new(Polars::Int32), pl_types["list"] 107 | assert_equal Polars::Struct.new([Polars::Field.new("a", Polars::Int32)]), pl_types["struct"] 108 | end 109 | end 110 | 111 | def test_time 112 | with_new_table do |table_uri| 113 | df = Polars::DataFrame.new({"a" => [1]}, schema: {"a" => Polars::Time}) 114 | error = assert_raises(DeltaLake::SchemaMismatchError) do 115 | DeltaLake.write(table_uri, df) 116 | end 117 | assert_equal "Invalid data type for Delta Lake: Time64(Nanosecond)", error.message 118 | end 119 | end 120 | 121 | def test_duration 122 | with_new_table do |table_uri| 123 | df = Polars::DataFrame.new({"a" => [1]}, schema: {"a" => Polars::Duration}) 124 | error = assert_raises(DeltaLake::SchemaMismatchError) do 125 | DeltaLake.write(table_uri, df) 126 | end 127 | assert_equal "Invalid data type for Delta Lake: Duration(Microsecond)", error.message 128 | end 129 | end 130 | 131 | def test_unsigned_integer 132 | with_new_table do |table_uri| 133 | df = Polars::DataFrame.new({"a" => [255]}, schema: {"a" => Polars::UInt8}) 134 | error = assert_raises(Polars::InvalidOperationError) do 135 | DeltaLake.write(table_uri, df) 136 | end 137 | assert_match "conversion from `u8` to `i8` failed", error.message 138 | end 139 | end 140 | end 141 | -------------------------------------------------------------------------------- /test/write_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class WriteTest < Minitest::Test 4 | def test_mode 5 | with_new_table do |table_uri| 6 | df = Polars::DataFrame.new({"a" => [1, 2, 3]}) 7 | DeltaLake.write(table_uri, df) 8 | 9 | dt = DeltaLake::Table.new(table_uri) 10 | assert_equal 0, dt.version 11 | assert_equal df, dt.to_polars 12 | 13 | error = assert_raises(DeltaLake::Error) do 14 | DeltaLake.write(dt, df) 15 | end 16 | assert_match "table already exists", error.message 17 | 18 | DeltaLake.write(dt, df, mode: "overwrite") 19 | assert_equal 1, dt.version 20 | assert_equal df, dt.to_polars 21 | 22 | time = Time.now 23 | sleep(0.01) 24 | 25 | DeltaLake.write(dt, df, mode: "ignore") 26 | assert_equal 1, dt.version 27 | assert_equal df, dt.to_polars 28 | 29 | DeltaLake.write(dt, df, mode: "append") 30 | assert_equal 2, dt.version 31 | assert_equal Polars.concat([df, df]), dt.to_polars 32 | 33 | assert_empty dt.transaction_versions 34 | 35 | dt.load_as_version(dt.version - 1) 36 | assert_equal 1, dt.version 37 | assert_equal df, dt.to_polars 38 | 39 | dt.load_as_version(time) 40 | assert_equal 1, dt.version 41 | assert_equal df, dt.to_polars 42 | 43 | dt = DeltaLake::Table.new(table_uri, version: 1) 44 | assert_equal 1, dt.version 45 | assert_equal df, dt.to_polars 46 | end 47 | end 48 | 49 | def test_invalid_data 50 | with_new_table do |table_uri| 51 | error = assert_raises(TypeError) do 52 | DeltaLake.write(table_uri, Object.new) 53 | end 54 | assert_equal "Only objects implementing the Arrow C stream interface are valid inputs for source.", error.message 55 | end 56 | end 57 | 58 | def test_rust_core_version 59 | assert_match(/\A\d+\.\d+\.\d+\z/, DeltaLake.rust_core_version) 60 | end 61 | end 62 | --------------------------------------------------------------------------------