├── .asf.yaml ├── .devcontainer └── devcontainer.json ├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE ├── NOTICE ├── README.md ├── copyright.txt ├── crates └── paimon │ ├── Cargo.toml │ ├── src │ ├── error.rs │ ├── file_index │ │ ├── file_index_format.rs │ │ └── mod.rs │ ├── io │ │ ├── file_io.rs │ │ ├── mod.rs │ │ ├── storage.rs │ │ ├── storage_fs.rs │ │ └── storage_memory.rs │ ├── lib.rs │ └── spec │ │ ├── data_file.rs │ │ ├── index_file_meta.rs │ │ ├── index_manifest.rs │ │ ├── manifest_common.rs │ │ ├── manifest_entry.rs │ │ ├── manifest_file_meta.rs │ │ ├── mod.rs │ │ ├── objects_file.rs │ │ ├── schema.rs │ │ ├── schema_change.rs │ │ ├── snapshot.rs │ │ ├── stats.rs │ │ └── types.rs │ └── tests │ └── fixtures │ ├── array_type.json │ ├── array_type_nullable.json │ ├── bigint_type.json │ ├── bigint_type_nullable.json │ ├── binary_type.json │ ├── binary_type_nullable.json │ ├── boolean_type.json │ ├── boolean_type_nullable.json │ ├── char_type.json │ ├── char_type_nullable.json │ ├── date_type.json │ ├── date_type_nullable.json │ ├── decimal_type.json │ ├── decimal_type_nullable.json │ ├── double_type.json │ ├── double_type_nullable.json │ ├── float_type.json │ ├── float_type_nullable.json │ ├── highly_complex_nested_row_type.json │ ├── int_type.json │ ├── int_type_nullable.json │ ├── local_zoned_timestamp_type.json │ ├── local_zoned_timestamp_type_nullable.json │ ├── manifest │ ├── index-manifest-85cc6729-f5af-431a-a1c3-ef45319328fb-0 │ ├── manifest-8ded1f09-fcda-489e-9167-582ac0f9f846-0 │ └── manifest-list-5c7399a0-46ae-4a5e-9c13-3ab07212cdb6-0 │ ├── manifest_file_meta_schema.json │ ├── map_type.json │ ├── map_type_nullable.json │ ├── multiset_type.json │ ├── multiset_type_nullable.json │ ├── row_type.json │ ├── row_type_nullable.json │ ├── smallint_type.json │ ├── smallint_type_nullable.json │ ├── snapshot │ ├── snapshot-v3-none-field.json │ └── snapshot-v3.json │ ├── time_type.json │ ├── 
time_type_nullable.json │ ├── timestamp_type.json │ ├── timestamp_type_nullable.json │ ├── tinyint_type.json │ ├── tinyint_type_nullable.json │ ├── varbinary_type.json │ ├── varbinary_type_nullable.json │ ├── varchar_type.json │ └── varchar_type_nullable.json ├── rust-toolchain.toml └── rustfmt.toml /.asf.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # See: https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features 19 | 20 | github: 21 | description: "Apache Paimon Rust The rust implementation of Apache Paimon." 
22 | homepage: https://paimon.apache.org/ 23 | labels: 24 | - rust 25 | - paimon 26 | - streaming-datalake 27 | - real-time-analytics 28 | - data-ingestion 29 | - big-data 30 | - table-store 31 | enabled_merge_buttons: 32 | squash: true 33 | merge: false 34 | rebase: true 35 | features: 36 | issues: true 37 | discussions: true 38 | wiki: false 39 | projects: false 40 | gh-pages: 41 | whatever: Just a placeholder to make it take effects 42 | collaborators: 43 | - Xuanwo 44 | 45 | notifications: 46 | commits: commits@paimon.apache.org 47 | issues: issues@paimon.apache.org 48 | pullrequests: issues@paimon.apache.org 49 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/rust:1-1-bookworm" 3 | } 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | version: 2 19 | updates: 20 | # Maintain dependencies for GitHub Actions 21 | - package-ecosystem: "github-actions" 22 | directory: "/" 23 | schedule: 24 | interval: "monthly" 25 | 26 | # Maintain dependencies for rust 27 | - package-ecosystem: "cargo" 28 | directory: "/" 29 | schedule: 30 | interval: "monthly" 31 | 32 | - package-ecosystem: "devcontainers" 33 | directory: "/" 34 | schedule: 35 | interval: "monthly" -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | name: CI 19 | 20 | on: 21 | push: 22 | branches: 23 | - main 24 | pull_request: 25 | branches: 26 | - main 27 | 28 | concurrency: 29 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} 30 | cancel-in-progress: true 31 | 32 | jobs: 33 | check: 34 | runs-on: ubuntu-latest 35 | steps: 36 | - uses: actions/checkout@v4 37 | 38 | - name: Check License Header 39 | uses: apache/skywalking-eyes/header@v0.6.0 40 | 41 | - name: Format 42 | run: cargo fmt --all -- --check 43 | 44 | - name: Clippy 45 | run: cargo clippy --all-targets --workspace -- -D warnings 46 | 47 | build: 48 | runs-on: ${{ matrix.os }} 49 | strategy: 50 | matrix: 51 | os: 52 | - ubuntu-latest 53 | - macos-latest 54 | - windows-latest 55 | steps: 56 | - uses: actions/checkout@v4 57 | - name: Build 58 | run: cargo build 59 | 60 | unit: 61 | runs-on: ${{ matrix.os }} 62 | strategy: 63 | matrix: 64 | os: 65 | - ubuntu-latest 66 | - macos-latest 67 | - windows-latest 68 | steps: 69 | - uses: actions/checkout@v4 70 | - name: Test 71 | run: cargo test --all-targets --workspace 72 | env: 73 | RUST_LOG: DEBUG 74 | RUST_BACKTRACE: full 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | /target 19 | /Cargo.lock 20 | .idea 21 | .vscode 22 | **/.DS_Store 23 | dist/* 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Contributing 21 | 22 | ## Get Started 23 | This is a Rust project, so [rustup](https://rustup.rs/) is a great place to start. It provides an easy way to manage your Rust installation and toolchains. 24 | 25 | This is a pure Rust project, so only `cargo` is needed. Here are some common commands to get you started: 26 | - `cargo check`: Analyze the current package and report errors. This is a quick way to catch any obvious issues without a full compilation. 27 | - `cargo fmt`: Format the current code according to the Rust style guidelines. This helps maintain a consistent code style throughout the project. 28 | - `cargo build`: Compile the current package. This will build the project and generate executable binaries if applicable. 29 | - `cargo clippy`: Catch common mistakes and improve code quality. Clippy provides a set of lints that can help you write better Rust code. 30 | - `cargo test`: Run unit tests. This will execute all the tests defined in the project to ensure the functionality is correct. 31 | - `cargo bench`: Run benchmark tests. This is useful for measuring the performance of specific parts of the code. 32 | 33 | ### Setting up the Development Environment 34 | 1. Install Rust using `rustup`. 
Follow the instructions on the [rustup website](https://rustup.rs/) to install Rust on your system. 35 | 2. Clone the repository to your local machine. 36 | 3. Navigate to the project directory. 37 | 38 | ### Making Changes 39 | 1. Create a new branch for your changes. This helps keep your work separate from the main development branch and makes it easier to review and merge your changes. 40 | 2. Make your changes and ensure that the code still compiles and passes all tests. Use the commands mentioned above to check for errors and run tests. 41 | 3. Format your code using `cargo fmt` to ensure consistency with the project's code style. 42 | 43 | ### Submitting Changes 44 | 1. Once you are satisfied with your changes, push your branch to the remote repository. 45 | 2. Open a pull request on the project's GitHub page. Provide a clear description of your changes and why they are necessary. 46 | 3. Wait for reviews and address any feedback. Once the pull request is approved and merged, your changes will be part of the project. 47 | 48 | ### Read the design docs 49 | For a deeper understanding of the project, read the design documentation available on our [Paimon official website](https://paimon.apache.org/). 50 | 51 | Thank you for contributing to this project! 😊 52 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [workspace] 19 | resolver = "2" 20 | members = ["crates/paimon"] 21 | 22 | [workspace.package] 23 | version = "0.0.0" 24 | edition = "2021" 25 | homepage = "https://paimon.apache.org/" 26 | repository = "https://github.com/apache/paimon-rust" 27 | license = "Apache-2.0" 28 | rust-version = "1.86.0" 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Datafuse Labs 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Apache Paimon Rust 2 | Copyright 2024 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Apache Paimon Rust   [![Build Status]][actions] [![Latest Version]][crates.io] 21 | 22 | [Build Status]: https://img.shields.io/github/actions/workflow/status/apache/paimon-rust/ci.yml 23 | [actions]: https://github.com/apache/paimon-rust/actions?query=branch%3Amain 24 | [Latest Version]: https://img.shields.io/crates/v/paimon.svg 25 | [crates.io]: https://crates.io/crates/paimon 26 | 27 | The rust implementation of Apache Paimon. 
28 | 29 | ## Issue Tracker 30 | 31 | See [Tracking issues of 0.1.0 version for Apache Paimon Rust](https://github.com/apache/paimon-rust/issues/3) 32 | 33 | ## Contributing 34 | 35 | Apache Paimon Rust is an exciting project currently under active development. Whether you're looking to use it in your projects or contribute to its growth, there are several ways you can get involved: 36 | 37 | - Follow the [Contributing Guide](CONTRIBUTING.md) to contribute. 38 | - Create a new [Issue](https://github.com/apache/paimon-rust/issues/new) for a bug report or feature request. 39 | - Start a discussion thread on the [dev mailing list](mailto:dev@paimon.apache.org) ([subscribe](mailto:dev-subscribe@paimon.apache.org) / [unsubscribe](mailto:dev-unsubscribe@paimon.apache.org) / [archives](https://lists.apache.org/list.html?dev@paimon.apache.org)) 40 | - Talk to the community directly in the [Slack #paimon channel](https://join.slack.com/t/the-asf/shared_invite/zt-2l9rns8pz-H8PE2Xnz6KraVd2Ap40z4g). 41 | 42 | ## Getting help 43 | 44 | Submit [issues](https://github.com/apache/paimon-rust/issues/new/choose) for bug reports, or ask questions in a [discussion](https://github.com/apache/paimon-rust/discussions/new?category=q-a). 45 | 46 | ## License 47 | 48 | Licensed under Apache License, Version 2.0. 49 | 50 | -------------------------------------------------------------------------------- /copyright.txt: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ -------------------------------------------------------------------------------- /crates/paimon/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [package] 19 | categories = ["database"] 20 | description = "The rust implementation of Apache Paimon" 21 | documentation = "https://docs.rs/paimon" 22 | name = "paimon" 23 | 24 | repository.workspace = true 25 | edition.workspace = true 26 | license.workspace = true 27 | version.workspace = true 28 | 29 | [features] 30 | default = ["storage-memory", "storage-fs"] 31 | storage-all = ["storage-memory", "storage-fs"] 32 | 33 | storage-memory = ["opendal/services-memory"] 34 | storage-fs = ["opendal/services-fs"] 35 | 36 | [dependencies] 37 | url = "2.5.2" 38 | async-trait = "0.1.81" 39 | bytes = "1.7.1" 40 | bitflags = "2.6.0" 41 | tokio = { version = "1.39.2", features = ["macros"] } 42 | chrono = { version = "0.4.38", features = ["serde"] } 43 | serde = { version = "1", features = ["derive"] } 44 | serde_bytes = "0.11.15" 45 | serde_json = "1.0.120" 46 | serde_with = "3.9.0" 47 | serde_repr = "0.1" 48 | snafu = "0.8.3" 49 | typed-builder = "^0.19" 50 | opendal = { version = "0.49", features = ["services-fs"] } 51 | pretty_assertions = "1" 52 | apache-avro = { version = "0.17", features = ["snappy"] } 53 | indexmap = "2.5.0" 54 | 55 | [dev-dependencies] 56 | rand = "0.8.5" 57 | serde_avro_fast = { version = "1.1.2", features = ["snappy"] } 58 | -------------------------------------------------------------------------------- /crates/paimon/src/error.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use snafu::prelude::*; 19 | 20 | /// Result type used in paimon; the error type defaults to [`Error`]. 21 | pub type Result<T, E = Error> = std::result::Result<T, E>; 22 | 23 | /// Error type for paimon. 24 | #[derive(Debug, Snafu)] 25 | pub enum Error { 26 | #[snafu(display("Paimon data invalid for {}: {:?}", message, source))] 27 | DataInvalid { 28 | message: String, 29 | #[snafu(backtrace)] 30 | source: snafu::Whatever, 31 | }, 32 | #[snafu( 33 | visibility(pub(crate)), 34 | display("Paimon data type invalid for {}", message) 35 | )] 36 | DataTypeInvalid { message: String }, 37 | #[snafu( 38 | visibility(pub(crate)), 39 | display("Paimon hitting unexpected error {}: {:?}", message, source) 40 | )] 41 | IoUnexpected { 42 | message: String, 43 | source: opendal::Error, 44 | }, 45 | #[snafu( 46 | visibility(pub(crate)), 47 | display("Paimon hitting unsupported io error {}", message) 48 | )] 49 | IoUnsupported { message: String }, 50 | #[snafu( 51 | visibility(pub(crate)), 52 | display("Paimon hitting invalid config: {}", message) 53 | )] 54 | ConfigInvalid { message: String }, 55 | #[snafu( 56 | visibility(pub(crate)), 57 | display("Paimon hitting unexpected avro error {}: {:?}", message, source) 58 | )] 59 | DataUnexpected { 60 | message: String, 61 | source: apache_avro::Error, 62 | }, 63 | #[snafu( 64 | visibility(pub(crate)), 65 | display("Paimon hitting invalid file index format: {}", message) 66 | )] 67 | FileIndexFormatInvalid { message: String }, 68 | } 69 | 70 | impl From<opendal::Error> for Error { 71 | fn from(source: opendal::Error) -> Self { 72 | // TODO: Simple 
use IoUnexpected for now 73 | Error::IoUnexpected { 74 | message: "IO operation failed on underlying storage".to_string(), 75 | source, 76 | } 77 | } 78 | } 79 | 80 | impl From<apache_avro::Error> for Error { 81 | fn from(source: apache_avro::Error) -> Self { 82 | Error::DataUnexpected { 83 | message: "Avro operation failed".to_string(), 84 | source, 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /crates/paimon/src/file_index/file_index_format.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::collections::HashMap; 19 | 20 | use bytes::{Buf, BufMut, Bytes, BytesMut}; 21 | 22 | use crate::{ 23 | io::{FileIO, FileRead, FileStatus, InputFile, OutputFile}, 24 | Error, 25 | }; 26 | 27 | /// Default 1MB read block size 28 | const READ_BLOCK_SIZE: u64 = 1024 * 1024; 29 | 30 | /// Quoted from the Java implementation of the structure, 31 | /// `MAGIC` is used to mark the beginning of a FileFormat structure. 32 | pub const MAGIC: u64 = 1493475289347502; 33 | 34 | /// Used to mark an empty INDEX. 
35 | pub const EMPTY_INDEX_FLAG: i64 = -1; 36 | 37 | #[derive(Debug)] 38 | struct IndexInfo { 39 | start_pos: i64, 40 | length: i64, 41 | } 42 | 43 | #[repr(i32)] 44 | #[derive(Debug, PartialEq, Eq)] 45 | enum Version { 46 | V1, 47 | } 48 | 49 | /// File index file format. All columns and offsets are stored in the header. 50 | /// 51 | /// ```text 52 | /// _____________________________________ _____________________ 53 | /// | magic |version|head length | 54 | /// |-------------------------------------| 55 | /// | column number | 56 | /// |-------------------------------------| 57 | /// | column 1 | index number | 58 | /// |-------------------------------------| 59 | /// | index name 1 |start pos |length | 60 | /// |-------------------------------------| 61 | /// | index name 2 |start pos |length | 62 | /// |-------------------------------------| 63 | /// | index name 3 |start pos |length | 64 | /// |-------------------------------------| HEADER 65 | /// | column 2 | index number | 66 | /// |-------------------------------------| 67 | /// | index name 1 |start pos |length | 68 | /// |-------------------------------------| 69 | /// | index name 2 |start pos |length | 70 | /// |-------------------------------------| 71 | /// | index name 3 |start pos |length | 72 | /// |-------------------------------------| 73 | /// | ... | 74 | /// |-------------------------------------| 75 | /// | ... 
| 76 | /// |-------------------------------------| 77 | /// | redundant length |redundant bytes | 78 | /// |-------------------------------------| --------------------- 79 | /// | BODY | 80 | /// | BODY | 81 | /// | BODY | BODY 82 | /// | BODY | 83 | /// |_____________________________________| _____________________ 84 | /// 85 | /// - `magic`: 8 bytes long 86 | /// - `version`: 4-byte integer 87 | /// - `head length`: 4-byte integer 88 | /// - `column number`: 4-byte integer 89 | /// - `column x`: variable-length UTF-8 string (length + bytes) 90 | /// - `index number`: 4-byte integer (number of index items below) 91 | /// - `index name x`: variable-length UTF-8 string 92 | /// - `start pos`: 4-byte integer 93 | /// - `length`: 4-byte integer 94 | /// - `redundant length`: 4-byte integer (for compatibility with future versions; content is zero in this version) 95 | /// - `redundant bytes`: variable-length bytes (for compatibility with future versions; empty in this version) 96 | /// - `BODY`: sequence of index data (concatenated index data for each column) 97 | /// ``` 98 | /// 99 | /// Impl Reference: 100 | pub async fn write_column_indexes( 101 | path: &str, 102 | indexes: HashMap>, 103 | ) -> crate::Result { 104 | let file_io = FileIO::from_url(path)?.build()?; 105 | let output = file_io.new_output(path)?; 106 | let mut writer = output.writer().await?; 107 | 108 | let mut body_info: HashMap> = HashMap::new(); 109 | let mut total_data_size = 0; 110 | 111 | // Calculate the total data size 112 | for bytes_map in indexes.values() { 113 | for data in bytes_map.values() { 114 | if !data.is_empty() { 115 | total_data_size += data.len(); 116 | } 117 | } 118 | } 119 | 120 | let mut body = BytesMut::with_capacity(total_data_size); 121 | 122 | for (column_name, bytes_map) in indexes.into_iter() { 123 | let inner_map = body_info.entry(column_name.clone()).or_default(); 124 | for (index_name, data) in bytes_map { 125 | let start_position = body.len() as i64; 126 | if 
data.is_empty() { 127 | inner_map.insert( 128 | index_name, 129 | IndexInfo { 130 | start_pos: EMPTY_INDEX_FLAG, 131 | length: 0, 132 | }, 133 | ); 134 | } else { 135 | body.extend_from_slice(&data); 136 | inner_map.insert( 137 | index_name, 138 | IndexInfo { 139 | start_pos: start_position, 140 | length: body.len() as i64 - start_position, 141 | }, 142 | ); 143 | } 144 | } 145 | } 146 | 147 | // write_head(writer, &body_info).await?; 148 | let head_length = calculate_head_length(&body_info)?; 149 | let mut head_buffer = BytesMut::with_capacity(head_length); 150 | 151 | // Magic 152 | head_buffer.put_u64_le(MAGIC); 153 | // Version 154 | head_buffer.put_i32_le(Version::V1 as i32); 155 | // HeadLength 156 | head_buffer.put_i32_le(head_length as i32); 157 | // ColumnSize 158 | head_buffer.put_i32_le(body_info.len() as i32); 159 | 160 | for (column_name, index_info) in body_info { 161 | // ColumnName 162 | head_buffer.put_u16_le(column_name.len() as u16); 163 | head_buffer.put_slice(column_name.as_bytes()); 164 | // IndexTypeSize 165 | head_buffer.put_i32_le(index_info.len() as i32); 166 | // ColumnInfo,offset = headLength 167 | for (index_name, IndexInfo { start_pos, length }) in index_info { 168 | head_buffer.put_u16_le(index_name.len() as u16); 169 | head_buffer.put_slice(index_name.as_bytes()); 170 | let adjusted_start = if start_pos == EMPTY_INDEX_FLAG { 171 | EMPTY_INDEX_FLAG 172 | } else { 173 | start_pos + head_length as i64 174 | }; 175 | head_buffer.put_i64_le(adjusted_start); 176 | head_buffer.put_i64_le(length); 177 | } 178 | } 179 | 180 | // Redundant length for future compatibility 181 | head_buffer.put_i32_le(0); 182 | 183 | // Write into 184 | writer.write(head_buffer.freeze()).await?; 185 | writer.write(body.freeze()).await?; 186 | writer.close().await?; 187 | Ok(output) 188 | } 189 | 190 | fn calculate_head_length( 191 | body_info: &HashMap>, 192 | ) -> crate::Result { 193 | // Magic + Version + HeadLength + ColumnNumber + RedundantLength 194 | let 
base_length = 8 + 4 + 4 + 4 + 4; 195 | let mut total_length = base_length; 196 | 197 | for (column_name, index_info) in body_info { 198 | // Column name length + actual column name length 199 | total_length += 2 + column_name.len(); 200 | // IndexTypeSize (index number) 201 | total_length += 4; 202 | 203 | for index_name in index_info.keys() { 204 | // Index name length + actual index name length 205 | total_length += 2 + index_name.len(); 206 | // start_pos (8 bytes) + length (8 bytes) 207 | total_length += 16; 208 | } 209 | } 210 | 211 | Ok(total_length) 212 | } 213 | 214 | pub struct FileIndex { 215 | reader: Box, 216 | header: HashMap>, 217 | } 218 | 219 | impl FileIndex { 220 | pub async fn get_column_index( 221 | &self, 222 | column_name: &str, 223 | ) -> crate::Result> { 224 | if let Some(index_info) = self.header.get(column_name) { 225 | let mut result = HashMap::new(); 226 | for (index_name, info) in index_info { 227 | let bytes = self.get_bytes_with_start_and_length(info).await?; 228 | result.insert(index_name.clone(), bytes); 229 | } 230 | Ok(result) 231 | } else { 232 | Err(Error::FileIndexFormatInvalid { 233 | message: format!("Column '{}' not found in header", column_name), 234 | }) 235 | } 236 | } 237 | 238 | pub async fn get_index(&self) -> crate::Result>> { 239 | let mut result = HashMap::new(); 240 | for (column_name, index_info) in self.header.iter() { 241 | let mut column_index = HashMap::new(); 242 | for (index_name, info) in index_info { 243 | let bytes = self.get_bytes_with_start_and_length(info).await?; 244 | column_index.insert(index_name.clone(), bytes); 245 | } 246 | result.insert(column_name.clone(), column_index); 247 | } 248 | Ok(result) 249 | } 250 | 251 | async fn get_bytes_with_start_and_length( 252 | &self, 253 | index_info: &IndexInfo, 254 | ) -> crate::Result { 255 | let data_bytes = self 256 | .reader 257 | .read(index_info.start_pos as u64..(index_info.start_pos + index_info.length) as u64) 258 | .await?; 259 | 260 | 
Ok(data_bytes) 261 | } 262 | } 263 | 264 | pub struct FileIndexFormatReader { 265 | reader: Box, 266 | stat: FileStatus, 267 | } 268 | 269 | impl FileIndexFormatReader { 270 | pub async fn get_file_index(input_file: InputFile) -> crate::Result { 271 | let reader = input_file.reader().await?; 272 | let mut file_reader = Self { 273 | reader: Box::new(reader), 274 | stat: input_file.metadata().await?, 275 | }; 276 | let header = file_reader.read_header().await?; 277 | Ok(FileIndex { 278 | header, 279 | reader: file_reader.reader, 280 | }) 281 | } 282 | 283 | async fn read_header(&mut self) -> crate::Result>> { 284 | let read_size = if self.stat.size < READ_BLOCK_SIZE { 285 | self.stat.size 286 | } else { 287 | READ_BLOCK_SIZE 288 | }; 289 | let mut buffer = self.reader.read(0..read_size).await?; 290 | 291 | // Magic (8 bytes) 292 | let magic = buffer.get_u64_le(); 293 | if magic != MAGIC { 294 | return Err(Error::FileIndexFormatInvalid { 295 | message: format!("Expected MAGIC: {}, but found: {}", MAGIC, magic), 296 | }); 297 | } 298 | 299 | // Version (4 bytes) 300 | let version = buffer.get_i32_le(); 301 | if version != Version::V1 as i32 { 302 | return Err(Error::FileIndexFormatInvalid { 303 | message: format!( 304 | "Unsupported file index version: expected {}, but found: {}", 305 | Version::V1 as i32, 306 | version 307 | ), 308 | }); 309 | } 310 | 311 | // Head Length (4 bytes) 312 | let head_length = buffer.get_i32_le() as usize; 313 | 314 | // Ensure the header is fully contained in the buffer 315 | if buffer.len() < head_length { 316 | let remaining = head_length - buffer.len(); 317 | let mut remaining_head_buffer = BytesMut::with_capacity(remaining); 318 | let additional_data = self 319 | .reader 320 | .read(buffer.len() as u64..buffer.len() as u64 + remaining as u64) 321 | .await?; 322 | remaining_head_buffer.extend_from_slice(&additional_data); 323 | buffer = Bytes::from( 324 | [buffer.slice(0..), remaining_head_buffer.freeze().slice(0..)].concat(), 325 | ); 
326 | } 327 | 328 | // Column Number (4 bytes) 329 | let column_number = buffer.get_i32_le(); 330 | 331 | let mut current_offset = 20; 332 | let mut header = HashMap::new(); 333 | 334 | for _ in 0..column_number { 335 | // Column Name Length (2 bytes) 336 | let column_name_len = buffer.get_u16_le(); 337 | current_offset += 2; 338 | 339 | // Column Name (variable-length UTF-8 string) 340 | let column_name = String::from_utf8(buffer.split_to(column_name_len as usize).to_vec()) 341 | .map_err(|e| Error::FileIndexFormatInvalid { 342 | message: format!("Invalid UTF-8 sequence in column name: {}", e), 343 | })?; 344 | current_offset += column_name_len as u64; 345 | 346 | // Index Number (4 bytes) 347 | let index_number = buffer.get_i32_le(); 348 | current_offset += 4; 349 | 350 | let mut index_info_map = HashMap::new(); 351 | for _ in 0..index_number { 352 | // Index Name Length (2 bytes) 353 | let index_name_len = buffer.get_u16_le(); 354 | current_offset += 2; 355 | 356 | // Index Name (variable-length UTF-8 string) 357 | let index_name = 358 | String::from_utf8(buffer.split_to(index_name_len as usize).to_vec()).unwrap(); 359 | current_offset += index_name_len as u64; 360 | 361 | // Start Pos (8 bytes) 362 | let start_pos = buffer.get_i64_le(); 363 | current_offset += 4; 364 | 365 | // Length (8 bytes) 366 | let length = buffer.get_i64_le(); 367 | current_offset += 4; 368 | 369 | index_info_map.insert(index_name, IndexInfo { start_pos, length }); 370 | } 371 | 372 | header.insert(column_name, index_info_map); 373 | } 374 | 375 | let redundant_length = buffer.get_i32_le() as u64; 376 | current_offset += 4; 377 | 378 | if redundant_length > 0 { 379 | let redundant_bytes = buffer.split_to(redundant_length as usize); 380 | 381 | if redundant_bytes.len() as u64 != redundant_length { 382 | return Err(Error::FileIndexFormatInvalid { 383 | message: format!( 384 | "Expected to read {} redundant bytes, but found only {}, on offset {}", 385 | redundant_length, 386 | 
redundant_bytes.len(), 387 | current_offset 388 | ), 389 | }); 390 | } 391 | } 392 | 393 | Ok(header) 394 | } 395 | } 396 | 397 | #[cfg(test)] 398 | mod file_index_format_tests { 399 | 400 | use super::*; 401 | use bytes::Bytes; 402 | use std::collections::HashMap; 403 | 404 | #[tokio::test] 405 | async fn test_single_column_single_index() -> crate::Result<()> { 406 | let path = "memory:/tmp/test_single_column_single_index"; 407 | 408 | let mut indexes = HashMap::new(); 409 | let mut index_map = HashMap::new(); 410 | index_map.insert("index1".to_string(), Bytes::from("sample_data")); 411 | indexes.insert("column111".to_string(), index_map); 412 | 413 | let output = write_column_indexes(path, indexes.clone()).await?; 414 | 415 | let input = output.to_input_file(); 416 | 417 | let reader = FileIndexFormatReader::get_file_index(input).await?; 418 | let column_data = reader.get_column_index("column111").await?; 419 | assert_eq!( 420 | column_data.get("index1").unwrap(), 421 | &Bytes::from("sample_data") 422 | ); 423 | 424 | Ok(()) 425 | } 426 | 427 | #[tokio::test] 428 | async fn test_multiple_columns_multiple_indexes() -> crate::Result<()> { 429 | let path = "memory:/tmp/test_multiple_columns_multiple_indexes"; 430 | 431 | let mut indexes = HashMap::new(); 432 | for col_num in 1..5 { 433 | let column_name = format!("column{}", col_num); 434 | let mut index_map = HashMap::new(); 435 | for idx_num in 1..5 { 436 | index_map.insert( 437 | format!("index{}", idx_num), 438 | random_bytes(100 + col_num * idx_num), 439 | ); 440 | } 441 | indexes.insert(column_name, index_map); 442 | } 443 | 444 | let output = write_column_indexes(path, indexes.clone()).await?; 445 | 446 | let input = output.to_input_file(); 447 | 448 | let reader = FileIndexFormatReader::get_file_index(input).await?; 449 | for (column, index_map) in indexes { 450 | let column_data = reader.get_column_index(&column).await?; 451 | for (index_name, expected_data) in index_map { 452 | 
assert_eq!(column_data.get(&index_name).unwrap(), &expected_data); 453 | } 454 | } 455 | 456 | Ok(()) 457 | } 458 | 459 | #[tokio::test] 460 | async fn test_empty_file_index() -> crate::Result<()> { 461 | let path = "memory:/tmp/test_empty_file_index"; 462 | 463 | let mut indexes = HashMap::new(); 464 | let mut a_index = HashMap::new(); 465 | a_index.insert("b".to_string(), Bytes::new()); 466 | a_index.insert("c".to_string(), Bytes::new()); 467 | indexes.insert("a".to_string(), a_index); 468 | 469 | let output = write_column_indexes(path, indexes.clone()).await?; 470 | 471 | let input = output.to_input_file(); 472 | 473 | let reader = FileIndexFormatReader::get_file_index(input).await?; 474 | 475 | let column_indexes = reader.get_column_index("a").await?; 476 | assert_eq!(column_indexes.len(), 2); 477 | assert_eq!(column_indexes.get("b").unwrap(), &Bytes::new()); 478 | assert_eq!(column_indexes.get("c").unwrap(), &Bytes::new()); 479 | 480 | Ok(()) 481 | } 482 | 483 | #[tokio::test] 484 | async fn test_large_data_set() -> crate::Result<()> { 485 | let path = "memory:/tmp/test_large_data_set"; 486 | 487 | let mut indexes = HashMap::new(); 488 | let mut large_data = HashMap::new(); 489 | large_data.insert("large_index".to_string(), random_bytes(100_000_000)); // 100MB data 490 | indexes.insert("large_column".to_string(), large_data); 491 | 492 | write_column_indexes(path, indexes.clone()).await?; 493 | 494 | let output = write_column_indexes(path, indexes.clone()).await?; 495 | 496 | let input = output.to_input_file(); 497 | 498 | let reader = FileIndexFormatReader::get_file_index(input).await?; 499 | let column_data = reader.get_column_index("large_column").await?; 500 | assert_eq!( 501 | column_data.get("large_index").unwrap(), 502 | &indexes 503 | .get("large_column") 504 | .unwrap() 505 | .get("large_index") 506 | .unwrap() 507 | ); 508 | 509 | Ok(()) 510 | } 511 | 512 | fn random_bytes(len: usize) -> Bytes { 513 | use rand::RngCore; 514 | let mut rng = 
rand::thread_rng(); 515 | let mut bytes = vec![0u8; len]; 516 | rng.fill_bytes(&mut bytes); 517 | Bytes::from(bytes) 518 | } 519 | } 520 | -------------------------------------------------------------------------------- /crates/paimon/src/file_index/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | mod file_index_format; 19 | pub use file_index_format::*; 20 | -------------------------------------------------------------------------------- /crates/paimon/src/io/file_io.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::error::*; 19 | use std::collections::HashMap; 20 | use std::ops::Range; 21 | use std::sync::Arc; 22 | 23 | use bytes::Bytes; 24 | use chrono::{DateTime, Utc}; 25 | use opendal::Operator; 26 | use snafu::ResultExt; 27 | use url::Url; 28 | 29 | use super::Storage; 30 | 31 | #[derive(Clone, Debug)] 32 | pub struct FileIO { 33 | storage: Arc, 34 | } 35 | 36 | impl FileIO { 37 | /// Try to infer file io scheme from path. 38 | /// 39 | /// The input HashMap is paimon-java's [`Options`](https://github.com/apache/paimon/blob/release-0.8.2/paimon-common/src/main/java/org/apache/paimon/options/Options.java#L60) 40 | pub fn from_url(path: &str) -> crate::Result { 41 | let url = Url::parse(path).map_err(|_| Error::ConfigInvalid { 42 | message: format!("Invalid URL: {}", path), 43 | })?; 44 | 45 | Ok(FileIOBuilder::new(url.scheme())) 46 | } 47 | 48 | /// Create a new input file to read data. 49 | /// 50 | /// Reference: 51 | pub fn new_input(&self, path: &str) -> crate::Result { 52 | let (op, relative_path) = self.storage.create(path)?; 53 | let path = path.to_string(); 54 | let relative_path_pos = path.len() - relative_path.len(); 55 | Ok(InputFile { 56 | op, 57 | path, 58 | relative_path_pos, 59 | }) 60 | } 61 | 62 | /// Create a new output file to write data. 
63 | /// 64 | /// Reference: 65 | pub fn new_output(&self, path: &str) -> Result { 66 | let (op, relative_path) = self.storage.create(path)?; 67 | let path = path.to_string(); 68 | let relative_path_pos = path.len() - relative_path.len(); 69 | Ok(OutputFile { 70 | op, 71 | path, 72 | relative_path_pos, 73 | }) 74 | } 75 | 76 | /// Return a file status object that represents the path. 77 | /// 78 | /// Reference: 79 | pub async fn get_status(&self, path: &str) -> Result { 80 | let (op, relative_path) = self.storage.create(path)?; 81 | let meta = op.stat(relative_path).await.context(IoUnexpectedSnafu { 82 | message: format!("Failed to get file status for '{}'", path), 83 | })?; 84 | 85 | Ok(FileStatus { 86 | size: meta.content_length(), 87 | is_dir: meta.is_dir(), 88 | last_modified: meta.last_modified(), 89 | path: path.to_string(), 90 | }) 91 | } 92 | 93 | /// List the statuses of the files/directories in the given path if the path is a directory. 94 | /// 95 | /// References: 96 | /// 97 | /// FIXME: how to handle large dir? Better to return a stream instead? 98 | pub async fn list_status(&self, path: &str) -> Result> { 99 | let (op, relative_path) = self.storage.create(path)?; 100 | 101 | let entries = op.list(relative_path).await.context(IoUnexpectedSnafu { 102 | message: format!("Failed to list files in '{}'", path), 103 | })?; 104 | 105 | let mut statuses = Vec::new(); 106 | 107 | for entry in entries { 108 | let meta = entry.metadata(); 109 | statuses.push(FileStatus { 110 | size: meta.content_length(), 111 | is_dir: meta.is_dir(), 112 | path: path.to_string(), 113 | last_modified: meta.last_modified(), 114 | }); 115 | } 116 | 117 | Ok(statuses) 118 | } 119 | 120 | /// Check if exists. 
121 | /// 122 | /// References: 123 | pub async fn exists(&self, path: &str) -> Result { 124 | let (op, relative_path) = self.storage.create(path)?; 125 | 126 | op.is_exist(relative_path).await.context(IoUnexpectedSnafu { 127 | message: format!("Failed to check existence of '{}'", path), 128 | }) 129 | } 130 | 131 | /// Delete a file. 132 | /// 133 | /// Reference: 134 | pub async fn delete_file(&self, path: &str) -> Result<()> { 135 | let (op, relative_path) = self.storage.create(path)?; 136 | 137 | op.delete(relative_path).await.context(IoUnexpectedSnafu { 138 | message: format!("Failed to delete file '{}'", path), 139 | })?; 140 | 141 | Ok(()) 142 | } 143 | 144 | /// Delete a dir recursively. 145 | /// 146 | /// Reference: 147 | pub async fn delete_dir(&self, path: &str) -> Result<()> { 148 | let (op, relative_path) = self.storage.create(path)?; 149 | 150 | op.remove_all(relative_path) 151 | .await 152 | .context(IoUnexpectedSnafu { 153 | message: format!("Failed to delete directory '{}'", path), 154 | })?; 155 | 156 | Ok(()) 157 | } 158 | 159 | /// Make the given file and all non-existent parents into directories. 160 | /// 161 | /// Has the semantics of Unix 'mkdir -p'. Existence of the directory hierarchy is not an error. 162 | /// 163 | /// Reference: 164 | pub async fn mkdirs(&self, path: &str) -> Result<()> { 165 | let (op, relative_path) = self.storage.create(path)?; 166 | 167 | op.create_dir(relative_path) 168 | .await 169 | .context(IoUnexpectedSnafu { 170 | message: format!("Failed to create directory '{}'", path), 171 | })?; 172 | 173 | Ok(()) 174 | } 175 | 176 | /// Renames the file/directory src to dst. 
177 | /// 178 | /// Reference: 179 | pub async fn rename(&self, src: &str, dst: &str) -> Result<()> { 180 | let (op_src, relative_path_src) = self.storage.create(src)?; 181 | let (_, relative_path_dst) = self.storage.create(dst)?; 182 | 183 | op_src 184 | .rename(relative_path_src, relative_path_dst) 185 | .await 186 | .context(IoUnexpectedSnafu { 187 | message: format!("Failed to rename '{}' to '{}'", src, dst), 188 | })?; 189 | 190 | Ok(()) 191 | } 192 | } 193 | 194 | #[derive(Debug)] 195 | pub struct FileIOBuilder { 196 | scheme_str: Option, 197 | props: HashMap, 198 | } 199 | 200 | impl FileIOBuilder { 201 | pub fn new(scheme_str: impl ToString) -> Self { 202 | Self { 203 | scheme_str: Some(scheme_str.to_string()), 204 | props: HashMap::default(), 205 | } 206 | } 207 | 208 | pub(crate) fn into_parts(self) -> (String, HashMap) { 209 | (self.scheme_str.unwrap_or_default(), self.props) 210 | } 211 | 212 | pub fn with_prop(mut self, key: impl ToString, value: impl ToString) -> Self { 213 | self.props.insert(key.to_string(), value.to_string()); 214 | self 215 | } 216 | 217 | pub fn with_props( 218 | mut self, 219 | args: impl IntoIterator, 220 | ) -> Self { 221 | self.props 222 | .extend(args.into_iter().map(|e| (e.0.to_string(), e.1.to_string()))); 223 | self 224 | } 225 | 226 | pub fn build(self) -> crate::Result { 227 | let storage = Storage::build(self)?; 228 | Ok(FileIO { 229 | storage: Arc::new(storage), 230 | }) 231 | } 232 | } 233 | 234 | #[async_trait::async_trait] 235 | pub trait FileRead: Send + Unpin + 'static { 236 | async fn read(&self, range: Range) -> crate::Result; 237 | } 238 | 239 | #[async_trait::async_trait] 240 | impl FileRead for opendal::Reader { 241 | async fn read(&self, range: Range) -> crate::Result { 242 | Ok(opendal::Reader::read(self, range).await?.to_bytes()) 243 | } 244 | } 245 | 246 | #[async_trait::async_trait] 247 | pub trait FileWrite: Send + Unpin + 'static { 248 | async fn write(&mut self, bs: Bytes) -> crate::Result<()>; 249 | 
250 | async fn close(&mut self) -> crate::Result<()>; 251 | } 252 | 253 | #[async_trait::async_trait] 254 | impl FileWrite for opendal::Writer { 255 | async fn write(&mut self, bs: Bytes) -> crate::Result<()> { 256 | Ok(opendal::Writer::write(self, bs).await?) 257 | } 258 | 259 | async fn close(&mut self) -> crate::Result<()> { 260 | Ok(opendal::Writer::close(self).await?) 261 | } 262 | } 263 | 264 | #[derive(Clone, Debug)] 265 | pub struct FileStatus { 266 | pub size: u64, 267 | pub is_dir: bool, 268 | pub path: String, 269 | pub last_modified: Option>, 270 | } 271 | 272 | #[derive(Debug)] 273 | pub struct InputFile { 274 | op: Operator, 275 | path: String, 276 | relative_path_pos: usize, 277 | } 278 | 279 | impl InputFile { 280 | pub fn location(&self) -> &str { 281 | &self.path 282 | } 283 | 284 | pub async fn exists(&self) -> crate::Result { 285 | Ok(self 286 | .op 287 | .is_exist(&self.path[self.relative_path_pos..]) 288 | .await?) 289 | } 290 | 291 | pub async fn metadata(&self) -> crate::Result { 292 | let meta = self.op.stat(&self.path[self.relative_path_pos..]).await?; 293 | 294 | Ok(FileStatus { 295 | size: meta.content_length(), 296 | is_dir: meta.is_dir(), 297 | path: self.path.clone(), 298 | last_modified: meta.last_modified(), 299 | }) 300 | } 301 | 302 | pub async fn read(&self) -> crate::Result { 303 | Ok(self 304 | .op 305 | .read(&self.path[self.relative_path_pos..]) 306 | .await? 307 | .to_bytes()) 308 | } 309 | 310 | pub async fn reader(&self) -> crate::Result { 311 | Ok(self.op.reader(&self.path[self.relative_path_pos..]).await?) 312 | } 313 | } 314 | 315 | #[derive(Debug, Clone)] 316 | pub struct OutputFile { 317 | op: Operator, 318 | path: String, 319 | relative_path_pos: usize, 320 | } 321 | 322 | impl OutputFile { 323 | pub fn location(&self) -> &str { 324 | &self.path 325 | } 326 | 327 | pub async fn exists(&self) -> crate::Result { 328 | Ok(self 329 | .op 330 | .is_exist(&self.path[self.relative_path_pos..]) 331 | .await?) 
332 | } 333 | 334 | pub fn to_input_file(self) -> InputFile { 335 | InputFile { 336 | op: self.op, 337 | path: self.path, 338 | relative_path_pos: self.relative_path_pos, 339 | } 340 | } 341 | 342 | pub async fn write(&self, bs: Bytes) -> crate::Result<()> { 343 | let mut writer = self.writer().await?; 344 | writer.write(bs).await?; 345 | writer.close().await 346 | } 347 | 348 | pub async fn writer(&self) -> crate::Result> { 349 | Ok(Box::new( 350 | self.op.writer(&self.path[self.relative_path_pos..]).await?, 351 | )) 352 | } 353 | } 354 | 355 | #[cfg(test)] 356 | mod file_action_test { 357 | use std::fs; 358 | 359 | use super::*; 360 | use bytes::Bytes; 361 | 362 | fn setup_memory_file_io() -> FileIO { 363 | let storage = Storage::Memory; 364 | FileIO { 365 | storage: Arc::new(storage), 366 | } 367 | } 368 | 369 | fn setup_fs_file_io() -> FileIO { 370 | let storage = Storage::LocalFs; 371 | FileIO { 372 | storage: Arc::new(storage), 373 | } 374 | } 375 | 376 | async fn common_test_get_status(file_io: &FileIO, path: &str) { 377 | let output = file_io.new_output(path).unwrap(); 378 | let mut writer = output.writer().await.unwrap(); 379 | writer.write(Bytes::from("hello world")).await.unwrap(); 380 | writer.close().await.unwrap(); 381 | 382 | let status = file_io.get_status(path).await.unwrap(); 383 | assert_eq!(status.size, 11); 384 | 385 | file_io.delete_file(path).await.unwrap(); 386 | } 387 | 388 | async fn common_test_exists(file_io: &FileIO, path: &str) { 389 | let output = file_io.new_output(path).unwrap(); 390 | let mut writer = output.writer().await.unwrap(); 391 | writer.write(Bytes::from("hello world")).await.unwrap(); 392 | writer.close().await.unwrap(); 393 | 394 | let exists = file_io.exists(path).await.unwrap(); 395 | assert!(exists); 396 | 397 | file_io.delete_file(path).await.unwrap(); 398 | } 399 | 400 | async fn common_test_delete_file(file_io: &FileIO, path: &str) { 401 | let output = file_io.new_output(path).unwrap(); 402 | let mut writer = 
output.writer().await.unwrap();
        writer.write(Bytes::from("hello world")).await.unwrap();
        writer.close().await.unwrap();

        file_io.delete_file(path).await.unwrap();

        let exists = file_io.exists(path).await.unwrap();
        assert!(!exists);
    }

    /// Creates `dir_path` through `file_io`, asserts that it exists, then removes the
    /// directory from the local filesystem on a best-effort basis.
    async fn common_test_mkdirs(file_io: &FileIO, dir_path: &str) {
        file_io.mkdirs(dir_path).await.unwrap();

        let exists = file_io.exists(dir_path).await.unwrap();
        assert!(exists);

        // Strip only the scheme ("file:") so the leading '/' is kept. Stripping
        // "file:/" produced a path relative to the current working directory, so the
        // directory created under the filesystem root was never cleaned up.
        if let Some(local_dir) = dir_path.strip_prefix("file:") {
            // Best-effort cleanup: a failure here must not fail the test.
            let _ = fs::remove_dir_all(local_dir);
        }
    }

    /// Writes a file at `src`, renames it to `dst`, asserts that only `dst` exists
    /// afterwards, and finally deletes `dst`.
    async fn common_test_rename(file_io: &FileIO, src: &str, dst: &str) {
        let output = file_io.new_output(src).unwrap();
        let mut writer = output.writer().await.unwrap();
        writer.write(Bytes::from("hello world")).await.unwrap();
        writer.close().await.unwrap();

        file_io.rename(src, dst).await.unwrap();

        let exists_old = file_io.exists(src).await.unwrap();
        let exists_new = file_io.exists(dst).await.unwrap();
        assert!(!exists_old);
        assert!(exists_new);

        file_io.delete_file(dst).await.unwrap();
    }

    #[tokio::test]
    async fn test_delete_file_memory() {
        let file_io = setup_memory_file_io();
        common_test_delete_file(&file_io, "memory:/test_file_delete_mem").await;
    }

    #[tokio::test]
    async fn test_get_status_fs() {
        let file_io = setup_fs_file_io();
        common_test_get_status(&file_io, "file:/tmp/test_file_get_status_fs").await;
    }

    #[tokio::test]
    async fn test_exists_fs() {
        let file_io = setup_fs_file_io();
        common_test_exists(&file_io, "file:/tmp/test_file_exists_fs").await;
    }

    #[tokio::test]
    async fn test_delete_file_fs() {
        let file_io = setup_fs_file_io();
        common_test_delete_file(&file_io, "file:/tmp/test_file_delete_fs").await;
    }

    #[tokio::test]
    async fn test_mkdirs_fs() {
        let file_io = setup_fs_file_io();
        common_test_mkdirs(&file_io, "file:/tmp/test_fs_dir/").await;
    }

    #[tokio::test]
    async fn test_rename_fs() {
        let file_io = setup_fs_file_io();
        common_test_rename(
            &file_io,
            "file:/tmp/test_file_fs_z",
            "file:/tmp/new_test_file_fs_o",
        )
        .await;
    }
}

/// Round-trip tests for the output/input file handles obtained from a `FileIO`,
/// run against both the in-memory and the local-filesystem storage.
#[cfg(test)]
mod input_output_test {
    use super::*;
    use bytes::Bytes;

    /// Builds a `FileIO` backed by `Storage::Memory`.
    fn setup_memory_file_io() -> FileIO {
        let storage = Storage::Memory;
        FileIO {
            storage: Arc::new(storage),
        }
    }

    /// Builds a `FileIO` backed by `Storage::LocalFs`.
    fn setup_fs_file_io() -> FileIO {
        let storage = Storage::LocalFs;
        FileIO {
            storage: Arc::new(storage),
        }
    }

    /// Writes "hello world" to `path`, reads the whole file back through the matching
    /// input file and checks the content, then deletes the file.
    async fn common_test_output_file_write_and_read(file_io: &FileIO, path: &str) {
        let output = file_io.new_output(path).unwrap();
        let mut writer = output.writer().await.unwrap();
        writer.write(Bytes::from("hello world")).await.unwrap();
        writer.close().await.unwrap();

        let input = output.to_input_file();
        let content = input.read().await.unwrap();

        assert_eq!(&content[..], b"hello world");

        file_io.delete_file(path).await.unwrap();
    }

    /// Writes a file at `path` and checks that the output handle reports it as
    /// existing, then deletes the file.
    async fn common_test_output_file_exists(file_io: &FileIO, path: &str) {
        let output = file_io.new_output(path).unwrap();
        let mut writer = output.writer().await.unwrap();
        writer.write(Bytes::from("hello world")).await.unwrap();
        writer.close().await.unwrap();

        let exists = output.exists().await.unwrap();
        assert!(exists);

        file_io.delete_file(path).await.unwrap();
    }

    /// Writes the 11-byte payload to `path` and checks the size reported by the
    /// input file's metadata, then deletes the file.
    async fn common_test_input_file_metadata(file_io: &FileIO, path: &str) {
        let output = file_io.new_output(path).unwrap();
        let mut writer = output.writer().await.unwrap();
        writer.write(Bytes::from("hello world")).await.unwrap();
        writer.close().await.unwrap();

        let input = output.to_input_file();
        let metadata = input.metadata().await.unwrap();

        // "hello world" is 11 bytes long.
        assert_eq!(metadata.size, 11);

        file_io.delete_file(path).await.unwrap();
    }

    /// Writes a file at `path` and checks that a ranged read of bytes `0..5`
    /// returns just the prefix, then deletes the file.
    async fn common_test_input_file_partial_read(file_io: &FileIO, path: &str) {
        let output = file_io.new_output(path).unwrap();
        let mut writer = output.writer().await.unwrap();
        writer.write(Bytes::from("hello world")).await.unwrap();
        writer.close().await.unwrap();

        let input = output.to_input_file();
        let reader = input.reader().await.unwrap();
        let partial_content = reader.read(0..5).await.unwrap(); // read "hello"

        assert_eq!(&partial_content[..], b"hello");

        file_io.delete_file(path).await.unwrap();
    }

    #[tokio::test]
    async fn test_output_file_write_and_read_memory() {
        let file_io = setup_memory_file_io();
        common_test_output_file_write_and_read(&file_io, "memory:/test_file_rw_mem").await;
    }

    #[tokio::test]
    async fn test_output_file_exists_memory() {
        let file_io = setup_memory_file_io();
        common_test_output_file_exists(&file_io, "memory:/test_file_exist_mem").await;
    }

    #[tokio::test]
    async fn test_input_file_metadata_memory() {
        let file_io = setup_memory_file_io();
        common_test_input_file_metadata(&file_io, "memory:/test_file_meta_mem").await;
    }

    #[tokio::test]
    async fn test_input_file_partial_read_memory() {
        let file_io = setup_memory_file_io();
        common_test_input_file_partial_read(&file_io, "memory:/test_file_part_read_mem").await;
    }

    #[tokio::test]
    async fn test_output_file_write_and_read_fs() {
        let file_io = setup_fs_file_io();
        common_test_output_file_write_and_read(&file_io, "file:/tmp/test_file_fs_rw").await;
    }

    #[tokio::test]
    async fn test_output_file_exists_fs() {
        let file_io = setup_fs_file_io();
        common_test_output_file_exists(&file_io, "file:/tmp/test_file_exists").await;
    }

    #[tokio::test]
    async fn test_input_file_metadata_fs() {
        let file_io = setup_fs_file_io();
        common_test_input_file_metadata(&file_io, "file:/tmp/test_file_meta").await;
    }

    #[tokio::test]
    async fn test_input_file_partial_read_fs() {
        let file_io = setup_fs_file_io();
        common_test_input_file_partial_read(&file_io, "file:/tmp/test_file_read_fs").await;
    }
}

--------------------------------------------------------------------------------
/crates/paimon/src/io/mod.rs:
--------------------------------------------------------------------------------
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17 | 18 | mod file_io; 19 | pub use file_io::*; 20 | 21 | mod storage; 22 | pub use storage::*; 23 | 24 | #[cfg(feature = "storage-fs")] 25 | mod storage_fs; 26 | #[cfg(feature = "storage-fs")] 27 | use storage_fs::*; 28 | 29 | #[cfg(feature = "storage-memory")] 30 | mod storage_memory; 31 | #[cfg(feature = "storage-memory")] 32 | use storage_memory::*; 33 | -------------------------------------------------------------------------------- /crates/paimon/src/io/storage.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use opendal::{Operator, Scheme}; 19 | 20 | use crate::error; 21 | 22 | use super::FileIOBuilder; 23 | 24 | /// The storage carries all supported storage services in paimon 25 | #[derive(Debug)] 26 | pub enum Storage { 27 | #[cfg(feature = "storage-memory")] 28 | Memory, 29 | #[cfg(feature = "storage-fs")] 30 | LocalFs, 31 | } 32 | 33 | impl Storage { 34 | pub(crate) fn build(file_io_builder: FileIOBuilder) -> crate::Result { 35 | let (scheme_str, _) = file_io_builder.into_parts(); 36 | let scheme = Self::parse_scheme(&scheme_str)?; 37 | 38 | match scheme { 39 | #[cfg(feature = "storage-memory")] 40 | Scheme::Memory => Ok(Self::Memory), 41 | #[cfg(feature = "storage-fs")] 42 | Scheme::Fs => Ok(Self::LocalFs), 43 | _ => Err(error::Error::IoUnsupported { 44 | message: "Unsupported storage feature".to_string(), 45 | }), 46 | } 47 | } 48 | 49 | pub(crate) fn create<'a>(&self, path: &'a str) -> crate::Result<(Operator, &'a str)> { 50 | match self { 51 | #[cfg(feature = "storage-memory")] 52 | Storage::Memory => { 53 | let op = super::memory_config_build()?; 54 | 55 | if let Some(stripped) = path.strip_prefix("memory:/") { 56 | Ok((op, stripped)) 57 | } else { 58 | Ok((op, &path[1..])) 59 | } 60 | } 61 | #[cfg(feature = "storage-fs")] 62 | Storage::LocalFs => { 63 | let op = super::fs_config_build()?; 64 | 65 | if let Some(stripped) = path.strip_prefix("file:/") { 66 | Ok((op, stripped)) 67 | } else { 68 | Ok((op, &path[1..])) 69 | } 70 | } 71 | } 72 | } 73 | 74 | fn parse_scheme(scheme: &str) -> crate::Result { 75 | match scheme { 76 | "memory" => Ok(Scheme::Memory), 77 | "file" | "" => Ok(Scheme::Fs), 78 | s => Ok(s.parse::()?), 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /crates/paimon/src/io/storage_fs.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. 
See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use opendal::services::FsConfig; 19 | use opendal::Operator; 20 | 21 | use crate::Result; 22 | 23 | /// Build new opendal operator from give path. 24 | pub(crate) fn fs_config_build() -> Result { 25 | let mut cfg = FsConfig::default(); 26 | cfg.root = Some("/".to_string()); 27 | 28 | Ok(Operator::from_config(cfg)?.finish()) 29 | } 30 | -------------------------------------------------------------------------------- /crates/paimon/src/io/storage_memory.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use opendal::services::MemoryConfig; 19 | use opendal::Operator; 20 | 21 | use crate::Result; 22 | 23 | pub(crate) fn memory_config_build() -> Result { 24 | Ok(Operator::from_config(MemoryConfig::default())?.finish()) 25 | } 26 | -------------------------------------------------------------------------------- /crates/paimon/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | mod error; 19 | pub use error::Error; 20 | pub use error::Result; 21 | 22 | pub mod file_index; 23 | pub mod io; 24 | pub mod spec; 25 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/data_file.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::spec::stats::BinaryTableStats; 19 | use chrono::serde::ts_milliseconds::deserialize as from_millis; 20 | use chrono::serde::ts_milliseconds::serialize as to_millis; 21 | use chrono::{DateTime, Utc}; 22 | use serde::{Deserialize, Serialize}; 23 | use std::fmt::{Display, Formatter}; 24 | 25 | pub const EMPTY_BINARY_ROW: BinaryRow = BinaryRow::new(0); 26 | 27 | /// An implementation of InternalRow. 
28 | /// 29 | /// Impl Reference: 30 | #[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] 31 | #[serde(rename_all = "camelCase")] 32 | pub struct BinaryRow { 33 | arity: i32, 34 | null_bits_size_in_bytes: i32, 35 | } 36 | 37 | impl BinaryRow { 38 | pub const HEADER_SIZE_IN_BYTES: i32 = 8; 39 | pub const fn cal_bit_set_width_in_bytes(arity: i32) -> i32 { 40 | ((arity + 63 + Self::HEADER_SIZE_IN_BYTES) / 64) * 8 41 | } 42 | pub const fn cal_fix_part_size_in_bytes(arity: i32) -> i32 { 43 | Self::cal_bit_set_width_in_bytes(arity) + 8 * arity 44 | } 45 | pub const fn new(arity: i32) -> Self { 46 | Self { 47 | arity, 48 | null_bits_size_in_bytes: (arity + 7) / 8, 49 | } 50 | } 51 | } 52 | 53 | /// Metadata of a data file. 54 | /// 55 | /// Impl References: 56 | #[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] 57 | #[serde(rename_all = "camelCase")] 58 | pub struct DataFileMeta { 59 | #[serde(rename = "_FILE_NAME")] 60 | pub file_name: String, 61 | #[serde(rename = "_FILE_SIZE")] 62 | pub file_size: i64, 63 | // row_count tells the total number of rows (including add & delete) in this file. 
64 | #[serde(rename = "_ROW_COUNT")] 65 | pub row_count: i64, 66 | #[serde(rename = "_MIN_KEY", with = "serde_bytes")] 67 | pub min_key: Vec, 68 | #[serde(rename = "_MAX_KEY", with = "serde_bytes")] 69 | pub max_key: Vec, 70 | #[serde(rename = "_KEY_STATS")] 71 | pub key_stats: BinaryTableStats, 72 | #[serde(rename = "_VALUE_STATS")] 73 | pub value_stats: BinaryTableStats, 74 | #[serde(rename = "_MIN_SEQUENCE_NUMBER")] 75 | pub min_sequence_number: i64, 76 | #[serde(rename = "_MAX_SEQUENCE_NUMBER")] 77 | pub max_sequence_number: i64, 78 | #[serde(rename = "_SCHEMA_ID")] 79 | pub schema_id: i64, 80 | #[serde(rename = "_LEVEL")] 81 | pub level: i32, 82 | #[serde(rename = "_EXTRA_FILES")] 83 | pub extra_files: Vec, 84 | #[serde( 85 | rename = "_CREATION_TIME", 86 | serialize_with = "to_millis", 87 | deserialize_with = "from_millis" 88 | )] 89 | pub creation_time: DateTime, 90 | #[serde(rename = "_DELETE_ROW_COUNT")] 91 | // rowCount = add_row_count + delete_row_count. 92 | pub delete_row_count: Option, 93 | // file index filter bytes, if it is small, store in data file meta 94 | #[serde(rename = "_EMBEDDED_FILE_INDEX", with = "serde_bytes")] 95 | pub embedded_index: Option>, 96 | } 97 | 98 | impl Display for DataFileMeta { 99 | fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { 100 | todo!() 101 | } 102 | } 103 | 104 | #[allow(dead_code)] 105 | impl DataFileMeta {} 106 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/index_file_meta.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use serde::{Deserialize, Serialize}; 19 | use std::fmt::{Display, Formatter}; 20 | 21 | use indexmap::IndexMap; 22 | 23 | /// Metadata of index file. 24 | /// 25 | /// Impl Reference: 26 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] 27 | pub struct IndexFileMeta { 28 | #[serde(rename = "_INDEX_TYPE")] 29 | pub index_type: String, 30 | 31 | #[serde(rename = "_FILE_NAME")] 32 | pub file_name: String, 33 | 34 | #[serde(rename = "_FILE_SIZE")] 35 | pub file_size: i32, 36 | 37 | #[serde(rename = "_ROW_COUNT")] 38 | pub row_count: i32, 39 | 40 | // use Indexmap to ensure the order of deletion_vectors_ranges is consistent. 
41 | #[serde( 42 | default, 43 | with = "map_serde", 44 | rename = "_DELETIONS_VECTORS_RANGES", 45 | alias = "_DELETION_VECTORS_RANGES" 46 | )] 47 | pub deletion_vectors_ranges: Option>, 48 | } 49 | 50 | impl Display for IndexFileMeta { 51 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 52 | write!( 53 | f, 54 | "IndexFileMeta{{index_type={}, fileName={}, fileSize={}, rowCount={}, deletion_vectors_ranges={:?}}}", 55 | self.index_type, 56 | self.file_name, 57 | self.file_size, 58 | self.row_count, 59 | self.deletion_vectors_ranges, 60 | ) 61 | } 62 | } 63 | 64 | mod map_serde { 65 | use indexmap::IndexMap; 66 | use serde::{Deserialize, Deserializer, Serialize, Serializer}; 67 | 68 | #[derive(Deserialize, Serialize)] 69 | struct Temp { 70 | f0: String, 71 | f1: i32, 72 | f2: i32, 73 | } 74 | 75 | pub fn serialize( 76 | date: &Option>, 77 | s: S, 78 | ) -> Result 79 | where 80 | S: Serializer, 81 | { 82 | match *date { 83 | None => s.serialize_none(), 84 | Some(ref d) => s.collect_seq(d.iter().map(|(s, (i1, i2))| Temp { 85 | f0: s.into(), 86 | f1: *i1, 87 | f2: *i2, 88 | })), 89 | } 90 | } 91 | 92 | #[allow(clippy::type_complexity)] 93 | pub fn deserialize<'de, D>( 94 | deserializer: D, 95 | ) -> Result>, D::Error> 96 | where 97 | D: Deserializer<'de>, 98 | { 99 | match Option::deserialize(deserializer)? { 100 | None => Ok(None), 101 | Some::>(s) => Ok(Some( 102 | s.into_iter() 103 | .map(|t| (t.f0, (t.f1, t.f2))) 104 | .collect::>(), 105 | )), 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/index_manifest.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::spec::manifest_common::FileKind; 19 | use crate::spec::IndexFileMeta; 20 | use serde::{Deserialize, Serialize}; 21 | use std::fmt::{Display, Formatter}; 22 | 23 | /// Manifest entry for index file. 24 | /// 25 | /// Impl Reference: 26 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] 27 | pub struct IndexManifestEntry { 28 | #[serde(rename = "_KIND")] 29 | pub kind: FileKind, 30 | 31 | #[serde(rename = "_PARTITION", with = "serde_bytes")] 32 | pub partition: Vec, 33 | 34 | #[serde(rename = "_BUCKET")] 35 | pub bucket: i32, 36 | 37 | #[serde(flatten)] 38 | pub index_file: IndexFileMeta, 39 | 40 | #[serde(rename = "_VERSION")] 41 | pub version: i32, 42 | } 43 | 44 | impl Display for IndexManifestEntry { 45 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 46 | write!( 47 | f, 48 | "IndexManifestEntry{{kind={:?}, partition={:?}, bucket={}, index_file={}}}", 49 | self.kind, self.partition, self.bucket, self.index_file, 50 | ) 51 | } 52 | } 53 | 54 | #[cfg(test)] 55 | mod tests { 56 | use indexmap::IndexMap; 57 | 58 | use super::*; 59 | 60 | #[test] 61 | fn test_read_index_manifest_file() { 62 | let workdir = 63 | std::env::current_dir().unwrap_or_else(|err| panic!("current_dir must exist: {err}")); 64 | let path = workdir 65 | 
.join("tests/fixtures/manifest/index-manifest-85cc6729-f5af-431a-a1c3-ef45319328fb-0"); 66 | let source = std::fs::read(path.to_str().unwrap()).unwrap(); 67 | let mut reader = 68 | serde_avro_fast::object_container_file_encoding::Reader::from_slice(source.as_slice()) 69 | .unwrap(); 70 | let res: Vec<_> = reader 71 | .deserialize::() 72 | .collect::>() 73 | .unwrap(); 74 | assert_eq!( 75 | res, 76 | vec![ 77 | IndexManifestEntry { 78 | version: 1, 79 | kind: FileKind::Add, 80 | partition: vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 81 | bucket: 0, 82 | index_file: IndexFileMeta { 83 | index_type: "HASH".into(), 84 | file_name: "index-a984b43a-c3fb-40b4-ad29-536343c239a6-0".into(), 85 | file_size: 16, 86 | row_count: 4, 87 | deletion_vectors_ranges: None, 88 | } 89 | }, 90 | IndexManifestEntry { 91 | version: 1, 92 | kind: FileKind::Add, 93 | partition: vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 94 | bucket: 0, 95 | index_file: IndexFileMeta { 96 | index_type: "DELETION_VECTORS".into(), 97 | file_name: "index-3f0986c5-4398-449b-be82-95f019d7a748-0".into(), 98 | file_size: 33, 99 | row_count: 1, 100 | deletion_vectors_ranges: Some(IndexMap::from([( 101 | "data-9b76122c-6bb5-4952-a946-b5bce29694a1-0.orc".into(), 102 | (1, 24) 103 | )])), 104 | } 105 | } 106 | ] 107 | ); 108 | } 109 | 110 | #[test] 111 | fn test_single_object_serde() { 112 | let sample = IndexManifestEntry { 113 | version: 1, 114 | kind: FileKind::Delete, 115 | partition: vec![0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6], 116 | bucket: 0, 117 | index_file: IndexFileMeta { 118 | index_type: "DELETION_VECTORS".into(), 119 | file_name: "test1".into(), 120 | file_size: 33, 121 | row_count: 1, 122 | deletion_vectors_ranges: Some(IndexMap::from([("test1".into(), (1, 24))])), 123 | }, 124 | }; 125 | 126 | let schema: serde_avro_fast::Schema = r#"["null", { 127 | "type": "record", 128 | "name": "org.apache.paimon.avro.generated.record", 129 | "fields": [ 130 | {"name": "_VERSION", "type": "int"}, 131 | {"name": "_KIND", 
"type": "int"}, 132 | {"name": "_PARTITION", "type": "bytes"}, 133 | {"name": "_BUCKET", "type": "int"}, 134 | {"name": "_INDEX_TYPE", "type": "string"}, 135 | {"name": "_FILE_NAME", "type": "string"}, 136 | {"name": "_FILE_SIZE", "type": "long"}, 137 | {"name": "_ROW_COUNT", "type": "long"}, 138 | { 139 | "default": null, 140 | "name": "_DELETIONS_VECTORS_RANGES", 141 | "type": ["null", { 142 | "type": "array", 143 | "items": ["null", { 144 | "type": "record", 145 | "name": "org.apache.paimon.avro.generated.record__DELETIONS_VECTORS_RANGES", 146 | "fields": [ 147 | {"name": "f0", "type": "string"}, 148 | {"name": "f1", "type": "int"}, 149 | {"name": "f2", "type": "int"} 150 | ] 151 | }] 152 | }] 153 | } 154 | ] 155 | }]"# 156 | .parse().unwrap(); 157 | 158 | let serializer_config = &mut serde_avro_fast::ser::SerializerConfig::new(&schema); 159 | let encoded = serde_avro_fast::to_single_object_vec(&sample, serializer_config).unwrap(); 160 | let decoded: IndexManifestEntry = 161 | serde_avro_fast::from_single_object_slice(encoded.as_slice(), &schema).unwrap(); 162 | assert_eq!(sample, decoded); 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/manifest_common.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use serde_repr::{Deserialize_repr, Serialize_repr}; 19 | 20 | /// Kind of a file. 21 | /// Impl Reference: 22 | #[derive(PartialEq, Eq, Debug, Clone, Serialize_repr, Deserialize_repr)] 23 | #[repr(u8)] 24 | pub enum FileKind { 25 | Add = 0, 26 | Delete = 1, 27 | } 28 | 29 | /// The Source of a file. 30 | /// Impl References: 31 | #[derive(PartialEq, Eq, Debug, Clone, Serialize_repr, Deserialize_repr)] 32 | #[repr(u8)] 33 | pub enum FileSource { 34 | Append = 0, 35 | Compact = 1, 36 | } 37 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/manifest_entry.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::spec::manifest_common::FileKind; 19 | use crate::spec::DataFileMeta; 20 | use serde::Deserialize; 21 | use serde_with::serde_derive::Serialize; 22 | 23 | /// The same {@link Identifier} indicates that the {@link ManifestEntry} refers to the same data file. 24 | /// 25 | /// Impl Reference: 26 | #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] 27 | pub struct Identifier { 28 | pub partition: Vec, 29 | pub bucket: i32, 30 | pub level: i32, 31 | pub file_name: String, 32 | } 33 | 34 | /// Entry of a manifest file, representing an addition / deletion of a data file. 35 | /// Impl Reference: 36 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] 37 | pub struct ManifestEntry { 38 | #[serde(rename = "_KIND")] 39 | kind: FileKind, 40 | 41 | #[serde(rename = "_PARTITION", with = "serde_bytes")] 42 | partition: Vec, 43 | 44 | #[serde(rename = "_BUCKET")] 45 | bucket: i32, 46 | 47 | #[serde(rename = "_TOTAL_BUCKETS")] 48 | total_buckets: i32, 49 | 50 | #[serde(rename = "_FILE")] 51 | file: DataFileMeta, 52 | 53 | #[serde(rename = "_VERSION")] 54 | version: i32, 55 | } 56 | 57 | #[allow(dead_code)] 58 | impl ManifestEntry { 59 | fn kind(&self) -> &FileKind { 60 | &self.kind 61 | } 62 | 63 | fn partition(&self) -> &Vec { 64 | &self.partition 65 | } 66 | 67 | fn bucket(&self) -> i32 { 68 | self.bucket 69 | } 70 | 71 | fn level(&self) -> i32 { 72 | self.file.level 73 | } 74 | 75 | fn file_name(&self) -> &str { 76 | &self.file.file_name 77 | } 78 | 79 | fn min_key(&self) -> &Vec { 80 | &self.file.min_key 81 | } 82 | 83 | fn max_key(&self) -> &Vec { 84 | &self.file.max_key 85 | } 86 | 87 | fn identifier(&self) -> Identifier { 88 | Identifier { 89 | partition: self.partition.clone(), 90 | bucket: self.bucket, 91 | level: self.file.level, 92 | file_name: self.file.file_name.clone(), 93 | } 94 | } 95 | 96 | pub fn 
total_buckets(&self) -> i32 { 97 | self.total_buckets 98 | } 99 | 100 | pub fn file(&self) -> &DataFileMeta { 101 | &self.file 102 | } 103 | 104 | pub fn new( 105 | kind: FileKind, 106 | partition: Vec, 107 | bucket: i32, 108 | total_buckets: i32, 109 | file: DataFileMeta, 110 | version: i32, 111 | ) -> Self { 112 | ManifestEntry { 113 | kind, 114 | partition, 115 | bucket, 116 | total_buckets, 117 | file, 118 | version, 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/manifest_file_meta.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::spec::stats::BinaryTableStats; 19 | use serde::{Deserialize, Serialize}; 20 | use std::fmt::{Display, Formatter}; 21 | 22 | /// Metadata of a manifest file. 
23 | /// 24 | /// Impl Reference: 25 | #[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] 26 | pub struct ManifestFileMeta { 27 | #[serde(rename = "_VERSION")] 28 | version: i32, 29 | 30 | /// manifest file name 31 | #[serde(rename = "_FILE_NAME")] 32 | file_name: String, 33 | 34 | /// manifest file size. 35 | #[serde(rename = "_FILE_SIZE")] 36 | file_size: i64, 37 | 38 | /// number added files in manifest. 39 | #[serde(rename = "_NUM_ADDED_FILES")] 40 | num_added_files: i64, 41 | 42 | /// number deleted files in manifest. 43 | #[serde(rename = "_NUM_DELETED_FILES")] 44 | num_deleted_files: i64, 45 | 46 | /// partition stats, the minimum and maximum values of partition fields in this manifest are beneficial for skipping certain manifest files during queries, it is a SimpleStats. 47 | #[serde(rename = "_PARTITION_STATS")] 48 | partition_stats: BinaryTableStats, 49 | 50 | /// schema id when writing this manifest file. 51 | #[serde(rename = "_SCHEMA_ID")] 52 | schema_id: i64, 53 | } 54 | 55 | impl ManifestFileMeta { 56 | /// Get the manifest file name 57 | #[inline] 58 | pub fn file_name(&self) -> &str { 59 | self.file_name.as_str() 60 | } 61 | 62 | /// Get the manifest file size. 63 | #[inline] 64 | pub fn file_size(&self) -> i64 { 65 | self.file_size 66 | } 67 | 68 | /// Get the number added files in manifest. 69 | #[inline] 70 | pub fn num_added_files(&self) -> i64 { 71 | self.num_added_files 72 | } 73 | 74 | /// Get the number deleted files in manifest. 75 | #[inline] 76 | pub fn num_deleted_files(&self) -> i64 { 77 | self.num_deleted_files 78 | } 79 | 80 | /// Get the partition stats 81 | pub fn partition_stats(&self) -> &BinaryTableStats { 82 | &self.partition_stats 83 | } 84 | 85 | /// Get the schema id when writing this manifest file. 
86 | #[inline] 87 | pub fn schema_id(&self) -> i64 { 88 | self.schema_id 89 | } 90 | 91 | /// Get the version of this manifest file 92 | #[inline] 93 | pub fn version(&self) -> i32 { 94 | self.version 95 | } 96 | 97 | #[inline] 98 | pub fn new( 99 | file_name: String, 100 | file_size: i64, 101 | num_added_files: i64, 102 | num_deleted_files: i64, 103 | partition_stats: BinaryTableStats, 104 | schema_id: i64, 105 | ) -> ManifestFileMeta { 106 | Self { 107 | version: 2, 108 | file_name, 109 | file_size, 110 | num_added_files, 111 | num_deleted_files, 112 | partition_stats, 113 | schema_id, 114 | } 115 | } 116 | } 117 | 118 | impl Display for ManifestFileMeta { 119 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 120 | write!( 121 | f, 122 | "{{{}, {}, {}, {}, {:?}, {}}}", 123 | self.file_name, 124 | self.file_size, 125 | self.num_added_files, 126 | self.num_deleted_files, 127 | self.partition_stats, 128 | self.schema_id 129 | ) 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | //! Spec module for paimon. 19 | //! 20 | //! All paimon specs types are defined here. 21 | 22 | mod data_file; 23 | pub use data_file::*; 24 | 25 | mod schema; 26 | pub use schema::*; 27 | 28 | mod schema_change; 29 | pub use schema_change::*; 30 | 31 | mod snapshot; 32 | pub use snapshot::*; 33 | 34 | mod manifest_file_meta; 35 | pub use manifest_file_meta::*; 36 | 37 | mod index_file_meta; 38 | pub use index_file_meta::*; 39 | 40 | mod index_manifest; 41 | mod manifest_common; 42 | mod manifest_entry; 43 | mod objects_file; 44 | mod stats; 45 | mod types; 46 | 47 | pub use types::*; 48 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/objects_file.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use crate::Error; 19 | use apache_avro::types::Value; 20 | use apache_avro::{from_value, Reader}; 21 | use serde::de::DeserializeOwned; 22 | 23 | #[allow(dead_code)] 24 | pub fn from_avro_bytes(bytes: &[u8]) -> crate::Result> { 25 | let reader = Reader::new(bytes).map_err(Error::from)?; 26 | let records = reader 27 | .collect::, _>>() 28 | .map_err(Error::from)?; 29 | let values = Value::Array(records); 30 | from_value::>(&values).map_err(Error::from) 31 | } 32 | 33 | #[cfg(test)] 34 | mod tests { 35 | use crate::spec::manifest_common::FileKind; 36 | use crate::spec::manifest_entry::ManifestEntry; 37 | use crate::spec::objects_file::from_avro_bytes; 38 | use crate::spec::stats::BinaryTableStats; 39 | use crate::spec::{DataFileMeta, ManifestFileMeta}; 40 | use chrono::{DateTime, Utc}; 41 | 42 | #[tokio::test] 43 | async fn test_read_manifest_list() { 44 | let workdir = 45 | std::env::current_dir().unwrap_or_else(|err| panic!("current_dir must exist: {err}")); 46 | let path = workdir 47 | .join("tests/fixtures/manifest/manifest-list-5c7399a0-46ae-4a5e-9c13-3ab07212cdb6-0"); 48 | let v = std::fs::read(path.to_str().unwrap()).unwrap(); 49 | let res = from_avro_bytes::(&v).unwrap(); 50 | let value_bytes = vec![ 51 | 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 49, 0, 0, 0, 0, 0, 0, 129, 52 | ]; 53 | assert_eq!( 54 | res, 55 | vec![ 56 | ManifestFileMeta::new( 57 | "manifest-19d138df-233f-46f7-beb6-fadaf4741c0e".to_string(), 58 | 10, 59 | 10, 60 | 10, 61 | BinaryTableStats::new(value_bytes.clone(), value_bytes.clone(), vec![1, 2]), 62 | 1 63 | ), 64 | ManifestFileMeta::new( 65 | "manifest-a703ee48-c411-413e-b84e-c03bdb179631".to_string(), 66 | 11, 67 | 0, 68 | 10, 69 | BinaryTableStats::new(value_bytes.clone(), value_bytes.clone(), vec![1, 2]), 70 | 2 71 | ) 72 | ], 73 | ); 74 | } 75 | 76 | #[tokio::test] 77 | async fn test_read_manifest_entry() { 78 | let workdir = 79 | std::env::current_dir().unwrap_or_else(|err| panic!("current_dir must exist: 
{err}")); 80 | let path = 81 | workdir.join("tests/fixtures/manifest/manifest-8ded1f09-fcda-489e-9167-582ac0f9f846-0"); 82 | let v = std::fs::read(path.to_str().unwrap()).unwrap(); 83 | let res = from_avro_bytes::(&v).unwrap(); 84 | let value_bytes = vec![ 85 | 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 49, 0, 0, 0, 0, 0, 0, 129, 1, 0, 0, 0, 0, 0, 0, 0, 86 | ]; 87 | let single_value = vec![0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]; 88 | assert_eq!( 89 | res, 90 | vec![ 91 | ManifestEntry::new( 92 | FileKind::Delete, 93 | single_value.clone(), 94 | 1, 95 | 10, 96 | DataFileMeta { 97 | file_name: "f1.parquet".to_string(), 98 | 99 | file_size: 10, 100 | row_count: 100, 101 | min_key: single_value.clone(), 102 | max_key: single_value.clone(), 103 | key_stats: BinaryTableStats::new( 104 | value_bytes.clone(), 105 | value_bytes.clone(), 106 | vec![1, 2] 107 | ), 108 | value_stats: BinaryTableStats::new( 109 | value_bytes.clone(), 110 | value_bytes.clone(), 111 | vec![1, 2] 112 | ), 113 | min_sequence_number: 1, 114 | max_sequence_number: 100, 115 | schema_id: 0, 116 | level: 1, 117 | extra_files: vec![], 118 | creation_time: "2024-09-06T07:45:55.039+00:00" 119 | .parse::>() 120 | .unwrap(), 121 | delete_row_count: Some(0), 122 | embedded_index: None, 123 | }, 124 | 2 125 | ), 126 | ManifestEntry::new( 127 | FileKind::Add, 128 | single_value.clone(), 129 | 2, 130 | 10, 131 | DataFileMeta { 132 | file_name: "f2.parquet".to_string(), 133 | file_size: 10, 134 | row_count: 100, 135 | min_key: single_value.clone(), 136 | max_key: single_value.clone(), 137 | key_stats: BinaryTableStats::new( 138 | value_bytes.clone(), 139 | value_bytes.clone(), 140 | vec![1, 2] 141 | ), 142 | value_stats: BinaryTableStats::new( 143 | value_bytes.clone(), 144 | value_bytes.clone(), 145 | vec![1, 2] 146 | ), 147 | min_sequence_number: 1, 148 | max_sequence_number: 100, 149 | schema_id: 0, 150 | level: 1, 151 | extra_files: vec![], 152 | creation_time: "2024-09-06T07:45:55.039+00:00" 
153 | .parse::>() 154 | .unwrap(), 155 | delete_row_count: Some(1), 156 | embedded_index: None, 157 | }, 158 | 2 159 | ), 160 | ] 161 | ) 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/schema.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::spec::types::DataType; 19 | use serde::{Deserialize, Serialize}; 20 | use serde_with::serde_as; 21 | use std::collections::HashMap; 22 | 23 | /// The table schema for paimon table. 24 | /// 25 | /// Impl References: 26 | #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] 27 | #[serde(rename_all = "camelCase")] 28 | pub struct TableSchema { 29 | /// version of schema for paimon 30 | version: i32, 31 | id: i64, 32 | fields: Vec, 33 | highest_field_id: i32, 34 | partition_keys: Vec, 35 | primary_keys: Vec, 36 | options: HashMap, 37 | comment: Option, 38 | time_millis: i64, 39 | } 40 | 41 | /// Data field for paimon table. 
42 | /// 43 | /// Impl Reference: 44 | #[serde_as] 45 | #[derive(Debug, Clone, PartialEq, Hash, Eq, Deserialize, Serialize)] 46 | pub struct DataField { 47 | id: i32, 48 | name: String, 49 | #[serde(rename = "type")] 50 | typ: DataType, 51 | #[serde(skip_serializing_if = "Option::is_none")] 52 | description: Option, 53 | } 54 | 55 | impl DataField { 56 | pub fn new(id: i32, name: String, typ: DataType) -> Self { 57 | Self { 58 | id, 59 | name, 60 | typ, 61 | description: None, 62 | } 63 | } 64 | 65 | pub fn id(&self) -> i32 { 66 | self.id 67 | } 68 | 69 | pub fn name(&self) -> &str { 70 | &self.name 71 | } 72 | 73 | pub fn data_type(&self) -> &DataType { 74 | &self.typ 75 | } 76 | 77 | pub fn description(&self) -> Option<&str> { 78 | self.description.as_deref() 79 | } 80 | 81 | pub fn with_id(mut self, new_id: i32) -> Self { 82 | self.id = new_id; 83 | self 84 | } 85 | 86 | pub fn with_name(mut self, new_name: String) -> Self { 87 | self.name = new_name; 88 | self 89 | } 90 | 91 | pub fn with_description(mut self, new_description: Option) -> Self { 92 | self.description = new_description; 93 | self 94 | } 95 | } 96 | 97 | pub fn escape_identifier(identifier: &str) -> String { 98 | identifier.replace('"', "\"\"") 99 | } 100 | 101 | pub fn escape_single_quotes(text: &str) -> String { 102 | text.replace('\'', "''") 103 | } 104 | 105 | #[cfg(test)] 106 | mod tests { 107 | use crate::spec::IntType; 108 | 109 | use super::*; 110 | 111 | #[test] 112 | fn test_create_data_field() { 113 | let id = 1; 114 | let name = "field1".to_string(); 115 | let typ = DataType::Int(IntType::new()); 116 | let description = "test description".to_string(); 117 | 118 | let data_field = DataField::new(id, name.clone(), typ.clone()) 119 | .with_description(Some(description.clone())); 120 | 121 | assert_eq!(data_field.id(), id); 122 | assert_eq!(data_field.name(), name); 123 | assert_eq!(data_field.data_type(), &typ); 124 | assert_eq!(data_field.description(), Some(description).as_deref()); 125 
| } 126 | 127 | #[test] 128 | fn test_new_id() { 129 | let d_type = DataType::Int(IntType::new()); 130 | let new_data_field = DataField::new(1, "field1".to_string(), d_type.clone()).with_id(2); 131 | 132 | assert_eq!(new_data_field.id(), 2); 133 | assert_eq!(new_data_field.name(), "field1"); 134 | assert_eq!(new_data_field.data_type(), &d_type); 135 | assert_eq!(new_data_field.description(), None); 136 | } 137 | 138 | #[test] 139 | fn test_new_name() { 140 | let d_type = DataType::Int(IntType::new()); 141 | let new_data_field = 142 | DataField::new(1, "field1".to_string(), d_type.clone()).with_name("field2".to_string()); 143 | 144 | assert_eq!(new_data_field.id(), 1); 145 | assert_eq!(new_data_field.name(), "field2"); 146 | assert_eq!(new_data_field.data_type(), &d_type); 147 | assert_eq!(new_data_field.description(), None); 148 | } 149 | 150 | #[test] 151 | fn test_new_description() { 152 | let d_type = DataType::Int(IntType::new()); 153 | let new_data_field = DataField::new(1, "field1".to_string(), d_type.clone()) 154 | .with_description(Some("new description".to_string())); 155 | 156 | assert_eq!(new_data_field.id(), 1); 157 | assert_eq!(new_data_field.name(), "field1"); 158 | assert_eq!(new_data_field.data_type(), &d_type); 159 | assert_eq!(new_data_field.description(), Some("new description")); 160 | } 161 | 162 | #[test] 163 | fn test_escape_identifier() { 164 | let escaped_identifier = escape_identifier("\"identifier\""); 165 | assert_eq!(escaped_identifier, "\"\"identifier\"\""); 166 | } 167 | 168 | #[test] 169 | fn test_escape_single_quotes() { 170 | let escaped_text = escape_single_quotes("text with 'single' quotes"); 171 | assert_eq!(escaped_text, "text with ''single'' quotes"); 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/schema_change.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation 
(ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::spec::DataType; 19 | use serde::{Deserialize, Serialize}; 20 | 21 | /// Schema change to table. 22 | /// 23 | /// Reference: 24 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] 25 | #[serde(rename_all = "camelCase")] 26 | pub enum SchemaChange { 27 | /// A SchemaChange to set a table option. 28 | /// 29 | /// Reference: 30 | SetOption { key: String, value: String }, 31 | /// A SchemaChange to remove a table option. 32 | /// 33 | /// Reference: 34 | RemoveOption { key: String }, 35 | /// A SchemaChange to update a table comment. 36 | /// 37 | /// Reference: 38 | UpdateComment { comment: Option }, 39 | /// A SchemaChange to add a new field. 40 | /// 41 | /// Reference: 42 | #[serde(rename_all = "camelCase")] 43 | AddColumn { 44 | field_name: String, 45 | data_type: DataType, 46 | description: Option, 47 | #[serde(rename = "move")] 48 | column_move: Option, 49 | }, 50 | /// A SchemaChange to rename a field. 51 | /// 52 | /// Reference: 53 | #[serde(rename_all = "camelCase")] 54 | RenameColumn { 55 | field_name: String, 56 | new_name: String, 57 | }, 58 | /// A SchemaChange to drop a field. 
59 | /// 60 | /// Reference: 61 | #[serde(rename_all = "camelCase")] 62 | DropColumn { field_name: String }, 63 | /// A SchemaChange to update the field's type. 64 | /// 65 | /// Reference: 66 | #[serde(rename_all = "camelCase")] 67 | UpdateColumnType { 68 | field_name: String, 69 | data_type: DataType, 70 | }, 71 | /// A SchemaChange to update the field's position. 72 | /// 73 | /// Reference: 74 | #[serde(rename_all = "camelCase")] 75 | UpdateColumnPosition { 76 | #[serde(rename = "move")] 77 | column_move: ColumnMove, 78 | }, 79 | /// A SchemaChange to update the field's nullability. 80 | /// 81 | /// Reference: 82 | #[serde(rename_all = "camelCase")] 83 | UpdateColumnNullability { 84 | field_name: Vec, 85 | nullable: bool, 86 | }, 87 | /// A SchemaChange to update the (nested) field's comment. 88 | /// 89 | /// Reference: 90 | #[serde(rename_all = "camelCase")] 91 | UpdateColumnComment { 92 | field_names: Vec, 93 | new_description: String, 94 | }, 95 | } 96 | 97 | impl SchemaChange { 98 | /// impl the `set_option`. 99 | pub fn set_option(key: String, value: String) -> Self { 100 | SchemaChange::SetOption { key, value } 101 | } 102 | 103 | /// impl the `remove_option`. 104 | pub fn remove_option(key: String) -> Self { 105 | SchemaChange::RemoveOption { key } 106 | } 107 | 108 | /// impl the `update_comment`. 109 | pub fn update_comment(comment: Option) -> Self { 110 | SchemaChange::UpdateComment { comment } 111 | } 112 | 113 | /// impl the `add_column`. 114 | pub fn add_column(field_name: String, data_type: DataType) -> Self { 115 | SchemaChange::AddColumn { 116 | field_name, 117 | data_type, 118 | description: None, 119 | column_move: None, 120 | } 121 | } 122 | 123 | /// impl the `add_column_with_description`. 
124 | pub fn add_column_with_description( 125 | field_name: String, 126 | data_type: DataType, 127 | description: String, 128 | ) -> Self { 129 | SchemaChange::AddColumn { 130 | field_name, 131 | data_type, 132 | description: Some(description), 133 | column_move: None, 134 | } 135 | } 136 | 137 | /// impl the `add_column_with_description_and_column_move`. 138 | pub fn add_column_with_description_and_column_move( 139 | field_name: String, 140 | data_type: DataType, 141 | description: String, 142 | column_move: ColumnMove, 143 | ) -> Self { 144 | SchemaChange::AddColumn { 145 | field_name, 146 | data_type, 147 | description: Some(description), 148 | column_move: Some(column_move), 149 | } 150 | } 151 | 152 | /// impl the `rename_column`. 153 | pub fn rename_column(field_name: String, new_name: String) -> Self { 154 | SchemaChange::RenameColumn { 155 | field_name, 156 | new_name, 157 | } 158 | } 159 | 160 | /// impl the `drop_column`. 161 | pub fn drop_column(field_name: String) -> Self { 162 | SchemaChange::DropColumn { field_name } 163 | } 164 | 165 | /// impl the `update_column_type`. 166 | pub fn update_column_type(field_name: String, new_data_type: DataType) -> Self { 167 | SchemaChange::UpdateColumnType { 168 | field_name, 169 | data_type: new_data_type, 170 | } 171 | } 172 | 173 | /// impl the `update_column_position`. 174 | pub fn update_column_position(column_move: ColumnMove) -> Self { 175 | SchemaChange::UpdateColumnPosition { column_move } 176 | } 177 | 178 | /// impl the `update_column_position`. 179 | pub fn update_column_nullability(field_name: String, new_nullability: bool) -> Self { 180 | SchemaChange::UpdateColumnNullability { 181 | field_name: vec![field_name], 182 | nullable: new_nullability, 183 | } 184 | } 185 | 186 | /// impl the `update_columns_nullability`. 
187 | pub fn update_columns_nullability(field_names: Vec, new_nullability: bool) -> Self { 188 | SchemaChange::UpdateColumnNullability { 189 | field_name: field_names, 190 | nullable: new_nullability, 191 | } 192 | } 193 | 194 | /// impl the `update_column_comment`. 195 | pub fn update_column_comment(field_name: String, comment: String) -> Self { 196 | SchemaChange::UpdateColumnComment { 197 | field_names: vec![field_name], 198 | new_description: comment, 199 | } 200 | } 201 | 202 | /// impl the `update_columns_comment`. 203 | pub fn update_columns_comment(field_names: Vec, comment: String) -> Self { 204 | SchemaChange::UpdateColumnComment { 205 | field_names, 206 | new_description: comment, 207 | } 208 | } 209 | } 210 | 211 | /// The type of move. 212 | /// 213 | /// Reference: 214 | #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] 215 | pub enum ColumnMoveType { 216 | FIRST, 217 | AFTER, 218 | } 219 | 220 | /// Represents a requested column move in a struct. 221 | /// 222 | /// Reference: 223 | #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] 224 | #[serde(rename_all = "camelCase")] 225 | pub struct ColumnMove { 226 | pub field_name: String, 227 | pub referenced_field_name: Option, 228 | #[serde(rename = "type")] 229 | pub move_type: ColumnMoveType, 230 | } 231 | 232 | impl ColumnMove { 233 | /// Get the field name. 234 | pub fn field_name(&self) -> &str { 235 | &self.field_name 236 | } 237 | 238 | /// Get the referenced field name. 239 | pub fn referenced_field_name(&self) -> Option<&str> { 240 | self.referenced_field_name.as_deref() 241 | } 242 | 243 | /// Get the move type. 244 | pub fn move_type(&self) -> &ColumnMoveType { 245 | &self.move_type 246 | } 247 | 248 | /// Create a new `Move` with `FIRST` move type. 
249 | pub fn move_first(field_name: String) -> Self { 250 | ColumnMove { 251 | field_name, 252 | referenced_field_name: None, 253 | move_type: ColumnMoveType::FIRST, 254 | } 255 | } 256 | 257 | /// Create a new `Move` with `AFTER` move type. 258 | pub fn move_after(field_name: String, referenced_field_name: String) -> Self { 259 | ColumnMove { 260 | field_name, 261 | referenced_field_name: Some(referenced_field_name), 262 | move_type: ColumnMoveType::AFTER, 263 | } 264 | } 265 | } 266 | 267 | #[cfg(test)] 268 | mod tests { 269 | use super::*; 270 | use crate::spec::{DoubleType, IntType}; 271 | 272 | #[test] 273 | fn test_schema_change_serialize_deserialize() { 274 | let json_data = r#" 275 | [ 276 | { 277 | "setOption": { 278 | "key": "snapshot.time-retained", 279 | "value": "2h" 280 | } 281 | }, 282 | { 283 | "removeOption": { 284 | "key": "compaction.max.file-num" 285 | } 286 | }, 287 | { 288 | "updateComment": { 289 | "comment": "table.comment" 290 | } 291 | }, 292 | { 293 | "addColumn": { 294 | "fieldName": "col1", 295 | "dataType": "INT", 296 | "description": "col1_description", 297 | "move": { 298 | "fieldName": "col1_first", 299 | "referencedFieldName": null, 300 | "type": "FIRST" 301 | } 302 | } 303 | }, 304 | { 305 | "renameColumn": { 306 | "fieldName": "col3", 307 | "newName": "col3_new_name" 308 | } 309 | }, 310 | { 311 | "dropColumn": { 312 | "fieldName": "col1" 313 | } 314 | }, 315 | { 316 | "updateColumnType": { 317 | "fieldName": "col14", 318 | "dataType": "DOUBLE" 319 | } 320 | }, 321 | { 322 | "updateColumnPosition": { 323 | "move": { 324 | "fieldName": "col4_first", 325 | "referencedFieldName": null, 326 | "type": "FIRST" 327 | } 328 | } 329 | }, 330 | { 331 | "updateColumnNullability": { 332 | "fieldName": [ 333 | "col5", 334 | "f2" 335 | ], 336 | "nullable": false 337 | } 338 | }, 339 | { 340 | "updateColumnComment": { 341 | "fieldNames": [ 342 | "col5", 343 | "f1" 344 | ], 345 | "newDescription": "col5 f1 field" 346 | } 347 | } 348 | ]"#; 349 | 
350 | let schema_changes: Vec = 351 | serde_json::from_str(json_data).expect("Failed to deserialize SchemaChange."); 352 | 353 | assert_eq!( 354 | schema_changes, 355 | vec![ 356 | SchemaChange::SetOption { 357 | key: "snapshot.time-retained".to_string(), 358 | value: "2h".to_string(), 359 | }, 360 | SchemaChange::RemoveOption { 361 | key: "compaction.max.file-num".to_string(), 362 | }, 363 | SchemaChange::UpdateComment { 364 | comment: Some("table.comment".to_string()), 365 | }, 366 | SchemaChange::AddColumn { 367 | field_name: "col1".to_string(), 368 | data_type: DataType::Int(IntType::new()), 369 | description: Some("col1_description".to_string()), 370 | column_move: Some(ColumnMove { 371 | field_name: "col1_first".to_string(), 372 | referenced_field_name: None, 373 | move_type: ColumnMoveType::FIRST, 374 | }), 375 | }, 376 | SchemaChange::RenameColumn { 377 | field_name: "col3".to_string(), 378 | new_name: "col3_new_name".to_string(), 379 | }, 380 | SchemaChange::DropColumn { 381 | field_name: "col1".to_string(), 382 | }, 383 | SchemaChange::UpdateColumnType { 384 | field_name: "col14".to_string(), 385 | data_type: DataType::Double(DoubleType::new()), 386 | }, 387 | SchemaChange::UpdateColumnPosition { 388 | column_move: ColumnMove { 389 | field_name: "col4_first".to_string(), 390 | referenced_field_name: None, 391 | move_type: ColumnMoveType::FIRST, 392 | }, 393 | }, 394 | SchemaChange::UpdateColumnNullability { 395 | field_name: vec!["col5".to_string(), "f2".to_string()], 396 | nullable: false, 397 | }, 398 | SchemaChange::UpdateColumnComment { 399 | field_names: vec!["col5".to_string(), "f1".to_string()], 400 | new_description: "col5 f1 field".to_string(), 401 | }, 402 | ] 403 | ); 404 | } 405 | 406 | #[test] 407 | fn test_column_move_serialize_deserialize() { 408 | let json_data = r#" 409 | [ 410 | { 411 | "fieldName": "col1", 412 | "referencedFieldName": null, 413 | "type": "FIRST" 414 | }, 415 | { 416 | "fieldName": "col2_after", 417 | 
"referencedFieldName": "col2", 418 | "type": "AFTER" 419 | } 420 | ]"#; 421 | 422 | let column_moves: Vec = serde_json::from_str(json_data).unwrap(); 423 | assert_eq!( 424 | column_moves, 425 | vec![ 426 | ColumnMove::move_first("col1".to_string()), 427 | ColumnMove::move_after("col2_after".to_string(), "col2".to_string()), 428 | ] 429 | ); 430 | } 431 | } 432 | -------------------------------------------------------------------------------- /crates/paimon/src/spec/snapshot.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use serde::{Deserialize, Serialize}; 19 | use std::collections::HashMap; 20 | use typed_builder::TypedBuilder; 21 | 22 | /// Type of changes in this snapshot. 23 | #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] 24 | pub enum CommitKind { 25 | /// Changes flushed from the mem table. 26 | APPEND, 27 | 28 | /// Changes by compacting existing data files. 29 | COMPACT, 30 | 31 | /// Changes that clear up the whole partition and then add new records. 32 | OVERWRITE, 33 | 34 | /// Collect statistics. 
35 | ANALYZE, 36 | } 37 | 38 | /// Snapshot for paimon. 39 | /// 40 | /// Impl Reference: . 41 | #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, TypedBuilder)] 42 | #[serde(rename_all = "camelCase")] 43 | pub struct Snapshot { 44 | /// version of snapshot 45 | version: i32, 46 | id: i64, 47 | schema_id: i64, 48 | /// a manifest list recording all changes from the previous snapshots 49 | base_manifest_list: String, 50 | /// a manifest list recording all new changes occurred in this snapshot 51 | delta_manifest_list: String, 52 | /// a manifest list recording all changelog produced in this snapshot 53 | #[builder(default = None)] 54 | #[serde(skip_serializing_if = "Option::is_none")] 55 | changelog_manifest_list: Option, 56 | /// a manifest recording all index files of this table 57 | #[builder(default = None)] 58 | #[serde(skip_serializing_if = "Option::is_none")] 59 | index_manifest: Option, 60 | /// user who committed this snapshot 61 | commit_user: String, 62 | /// Mainly for snapshot deduplication. 63 | /// 64 | /// If multiple snapshots have the same commitIdentifier, reading from any of these snapshots 65 | /// must produce the same table. 66 | /// 67 | /// If snapshot A has a smaller commitIdentifier than snapshot B, then snapshot A must be 68 | /// committed before snapshot B, and thus snapshot A must contain older records than snapshot B. 
69 | commit_identifier: i64, 70 | commit_kind: CommitKind, 71 | /// timestamp of this snapshot 72 | time_millis: u64, 73 | /// log offsets of all changes occurred in this snapshot 74 | #[builder(default = None)] 75 | #[serde(skip_serializing_if = "Option::is_none")] 76 | log_offsets: Option>, 77 | /// record count of all changes occurred in this snapshot 78 | #[builder(default = None)] 79 | total_record_count: Option, 80 | /// record count of all new changes occurred in this snapshot 81 | #[builder(default = None)] 82 | delta_record_count: Option, 83 | /// record count of all changelog produced in this snapshot 84 | #[builder(default = None)] 85 | #[serde(skip_serializing_if = "Option::is_none")] 86 | changelog_record_count: Option, 87 | /// watermark for input records 88 | #[builder(default = None)] 89 | #[serde(skip_serializing_if = "Option::is_none")] 90 | watermark: Option, 91 | /// stats file name for statistics of this table 92 | #[builder(default = None)] 93 | #[serde(skip_serializing_if = "Option::is_none")] 94 | statistics: Option, 95 | } 96 | 97 | impl Snapshot { 98 | /// Get the version of this snapshot. 99 | #[inline] 100 | pub fn version(&self) -> i32 { 101 | self.version 102 | } 103 | 104 | /// Get the id of this snapshot. 105 | #[inline] 106 | pub fn id(&self) -> i64 { 107 | self.id 108 | } 109 | 110 | /// Get the schema id of this snapshot. 111 | #[inline] 112 | pub fn schema_id(&self) -> i64 { 113 | self.schema_id 114 | } 115 | 116 | /// Get the base manifest list of this snapshot. 117 | #[inline] 118 | pub fn base_manifest_list(&self) -> &str { 119 | &self.base_manifest_list 120 | } 121 | 122 | /// Get the delta manifest list of this snapshot. 123 | #[inline] 124 | pub fn delta_manifest_list(&self) -> &str { 125 | &self.delta_manifest_list 126 | } 127 | 128 | /// Get the changelog manifest list of this snapshot. 
129 | #[inline] 130 | pub fn changelog_manifest_list(&self) -> Option<&str> { 131 | self.changelog_manifest_list.as_deref() 132 | } 133 | 134 | /// Get the index manifest of this snapshot. 135 | #[inline] 136 | pub fn index_manifest(&self) -> Option<&str> { 137 | self.index_manifest.as_deref() 138 | } 139 | 140 | /// Get the commit user of this snapshot. 141 | #[inline] 142 | pub fn commit_user(&self) -> &str { 143 | &self.commit_user 144 | } 145 | 146 | /// Get the commit time of this snapshot. 147 | #[inline] 148 | pub fn time_millis(&self) -> u64 { 149 | self.time_millis 150 | } 151 | 152 | /// Get the commit identifier of this snapshot. 153 | #[inline] 154 | pub fn commit_identifier(&self) -> i64 { 155 | self.commit_identifier 156 | } 157 | 158 | /// Get the log offsets of this snapshot. 159 | #[inline] 160 | pub fn log_offsets(&self) -> Option<&HashMap> { 161 | self.log_offsets.as_ref() 162 | } 163 | 164 | /// Get the total record count of this snapshot. 165 | #[inline] 166 | pub fn total_record_count(&self) -> Option { 167 | self.total_record_count 168 | } 169 | 170 | /// Get the delta record count of this snapshot. 171 | #[inline] 172 | pub fn delta_record_count(&self) -> Option { 173 | self.delta_record_count 174 | } 175 | 176 | /// Get the changelog record count of this snapshot. 177 | #[inline] 178 | pub fn changelog_record_count(&self) -> Option { 179 | self.changelog_record_count 180 | } 181 | 182 | /// Get the watermark of this snapshot. 183 | #[inline] 184 | pub fn watermark(&self) -> Option { 185 | self.watermark 186 | } 187 | 188 | /// Get the statistics of this snapshot. 
#[inline]
pub fn statistics(&self) -> Option<&str> {
    self.statistics.as_deref()
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use std::path::Path;

    /// Read `tests/fixtures/snapshot/{name}.json` as a UTF-8 string.
    ///
    /// The path is anchored at `CARGO_MANIFEST_DIR` (the crate root at compile
    /// time) rather than the process working directory, so the test does not
    /// depend on where the test binary is launched from.
    ///
    /// Panics if the file cannot be read or is not valid UTF-8.
    fn load_fixture(name: &str) -> String {
        let path = Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("tests/fixtures/snapshot")
            .join(format!("{name}.json"));
        std::fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("fixtures {path:?} load failed: {err}"))
    }

    /// Pairs of (fixture name, expected `Snapshot`) used by the round-trip test.
    fn test_cases() -> Vec<(&'static str, Snapshot)> {
        vec![
            (
                "snapshot-v3",
                Snapshot::builder()
                    .version(3)
                    .id(2)
                    .schema_id(0)
                    .base_manifest_list(
                        "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-0".to_string(),
                    )
                    .delta_manifest_list(
                        "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-1".to_string(),
                    )
                    .commit_user("abbaac9e-4a17-43e3-b135-2269da263e3a".to_string())
                    .commit_identifier(9223372036854775807)
                    .changelog_manifest_list(Some(
                        "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-2".to_string(),
                    ))
                    .commit_kind(CommitKind::APPEND)
                    .time_millis(1724509030368)
                    .log_offsets(Some(HashMap::default()))
                    .total_record_count(Some(4))
                    .delta_record_count(Some(2))
                    .changelog_record_count(Some(2))
                    .statistics(Some("statistics_string".to_string()))
                    .build(),
            ),
            (
                // Same snapshot, but with `changelogManifestList` set to null
                // and no `statistics` key in the fixture.
                "snapshot-v3-none-field",
                Snapshot::builder()
                    .version(3)
                    .id(2)
                    .schema_id(0)
                    .base_manifest_list(
                        "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-0".to_string(),
                    )
                    .delta_manifest_list(
                        "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-1".to_string(),
                    )
                    .commit_user("abbaac9e-4a17-43e3-b135-2269da263e3a".to_string())
                    .commit_identifier(9223372036854775807)
                    .changelog_manifest_list(None)
                    .commit_kind(CommitKind::APPEND)
                    .time_millis(1724509030368)
                    .log_offsets(Some(HashMap::default()))
                    .total_record_count(Some(4))
                    .delta_record_count(Some(2))
                    .changelog_record_count(Some(2))
                    .build(),
            ),
        ]
    }

    /// Each fixture must deserialize to the expected value, and the value must
    /// survive a serialize -> deserialize round trip unchanged.
    #[test]
    fn test_snapshot_serialization_deserialization() {
        for (name, expect) in test_cases() {
            let content = load_fixture(name);
            let snapshot: Snapshot = serde_json::from_str(content.as_str())
                .unwrap_or_else(|err| panic!("Failed to deserialize Snapshot {name}: {err}"));
            assert_eq!(snapshot, expect, "fixture {name} does not match expectation");

            let serialized = serde_json::to_string(&snapshot)
                .unwrap_or_else(|err| panic!("Failed to serialize Snapshot {name}: {err}"));
            let deserialized: Snapshot = serde_json::from_str(&serialized).unwrap_or_else(|err| {
                panic!("Failed to deserialize serialized Snapshot {name}: {err}")
            });

            assert_eq!(snapshot, deserialized, "round trip changed fixture {name}");
        }
    }
}
-------------------------------------------------------------------------------- /crates/paimon/src/spec/stats.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied.
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use serde::{Deserialize, Serialize}; 19 | use std::fmt::{Display, Formatter}; 20 | 21 | /// The statistics for columns, supports the following stats. 22 | /// 23 | /// All statistics are stored in the form of a Binary, which can significantly reduce its memory consumption, but the cost is that the column type needs to be known when getting. 24 | /// 25 | /// Impl Reference: 26 | #[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] 27 | pub struct BinaryTableStats { 28 | /// the minimum values of the columns 29 | #[serde(rename = "_MIN_VALUES", with = "serde_bytes")] 30 | min_values: Vec, 31 | 32 | /// the maximum values of the columns 33 | #[serde(rename = "_MAX_VALUES", with = "serde_bytes")] 34 | max_values: Vec, 35 | 36 | /// the number of nulls of the columns 37 | #[serde(rename = "_NULL_COUNTS")] 38 | null_counts: Vec, 39 | } 40 | 41 | impl BinaryTableStats { 42 | /// Get the minimum values of the columns 43 | #[inline] 44 | pub fn min_values(&self) -> &[u8] { 45 | &self.min_values 46 | } 47 | 48 | /// Get the maximum values of the columns 49 | #[inline] 50 | pub fn max_values(&self) -> &[u8] { 51 | &self.max_values 52 | } 53 | 54 | /// Get the number of nulls of the columns 55 | #[inline] 56 | pub fn null_counts(&self) -> &Vec { 57 | &self.null_counts 58 | } 59 | 60 | pub fn new( 61 | min_values: Vec, 62 | max_values: Vec, 63 | null_counts: Vec, 64 | ) -> BinaryTableStats { 65 | Self { 66 | min_values, 67 | max_values, 68 | null_counts, 69 | } 70 | } 71 | } 72 | 73 | impl Display for BinaryTableStats { 74 | fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { 75 | todo!() 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/array_type.json: -------------------------------------------------------------------------------- 1 | {"type":"ARRAY NOT 
NULL","element":"INT NOT NULL"} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/array_type_nullable.json: -------------------------------------------------------------------------------- 1 | {"type":"ARRAY","element":"INT"} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/bigint_type.json: -------------------------------------------------------------------------------- 1 | "BIGINT NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/bigint_type_nullable.json: -------------------------------------------------------------------------------- 1 | "BIGINT" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/binary_type.json: -------------------------------------------------------------------------------- 1 | "BINARY(22) NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/binary_type_nullable.json: -------------------------------------------------------------------------------- 1 | "BINARY(22)" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/boolean_type.json: -------------------------------------------------------------------------------- 1 | "BOOLEAN NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/boolean_type_nullable.json: -------------------------------------------------------------------------------- 1 | "BOOLEAN" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/char_type.json: -------------------------------------------------------------------------------- 1 | "CHAR(33) NOT NULL" 
-------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/char_type_nullable.json: -------------------------------------------------------------------------------- 1 | "CHAR(33)" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/date_type.json: -------------------------------------------------------------------------------- 1 | "DATE NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/date_type_nullable.json: -------------------------------------------------------------------------------- 1 | "DATE" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/decimal_type.json: -------------------------------------------------------------------------------- 1 | "DECIMAL(10, 2) NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/decimal_type_nullable.json: -------------------------------------------------------------------------------- 1 | "DECIMAL(10, 2)" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/double_type.json: -------------------------------------------------------------------------------- 1 | "DOUBLE NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/double_type_nullable.json: -------------------------------------------------------------------------------- 1 | "DOUBLE" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/float_type.json: -------------------------------------------------------------------------------- 1 | "FLOAT NOT NULL" -------------------------------------------------------------------------------- 
/crates/paimon/tests/fixtures/float_type_nullable.json: -------------------------------------------------------------------------------- 1 | "FLOAT" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/highly_complex_nested_row_type.json: -------------------------------------------------------------------------------- 1 | {"type":"ROW","fields":[{"id":0,"name":"outer_row1","type":{"type":"ROW","fields":[{"id":0,"name":"middle1_decimal","type":"DECIMAL(12, 3)"},{"id":1,"name":"middle1_inner_row1","type":{"type":"ROW","fields":[{"id":0,"name":"inner1_boolean","type":"BOOLEAN"},{"id":1,"name":"inner1_int","type":"INT"},{"id":2,"name":"inner1_varchar","type":"VARCHAR(100)"}]}},{"id":2,"name":"middle1_array","type":{"type":"ARRAY","element":{"type":"MAP","key":"VARCHAR(50)","value":"INT"}}}]}},{"id":1,"name":"outer_row2","type":{"type":"ROW","fields":[{"id":0,"name":"middle2_multiset","type":{"type":"MULTISET","element":"TIMESTAMP(6)"}},{"id":1,"name":"middle2_inner_row2","type":{"type":"ROW","fields":[{"id":0,"name":"inner2_char","type":"CHAR(50)"},{"id":1,"name":"inner2_float","type":"FLOAT"},{"id":2,"name":"inner2_binary","type":"BINARY(256)"}]}},{"id":2,"name":"middle2_map","type":{"type":"MAP","key":"CHAR(10)","value":{"type":"ROW","fields":[{"id":0,"name":"inner1_boolean","type":"BOOLEAN"},{"id":1,"name":"inner1_int","type":"INT"},{"id":2,"name":"inner1_varchar","type":"VARCHAR(100)"}]}}}]}},{"id":2,"name":"outer_map","type":{"type":"MAP","key":"VARCHAR(30)","value":{"type":"ROW","fields":[{"id":0,"name":"middle1_decimal","type":"DECIMAL(12, 
3)"},{"id":1,"name":"middle1_inner_row1","type":{"type":"ROW","fields":[{"id":0,"name":"inner1_boolean","type":"BOOLEAN"},{"id":1,"name":"inner1_int","type":"INT"},{"id":2,"name":"inner1_varchar","type":"VARCHAR(100)"}]}},{"id":2,"name":"middle1_array","type":{"type":"ARRAY","element":{"type":"MAP","key":"VARCHAR(50)","value":"INT"}}}]}}},{"id":3,"name":"outer_array","type":{"type":"ARRAY","element":{"type":"ROW","fields":[{"id":0,"name":"middle2_multiset","type":{"type":"MULTISET","element":"TIMESTAMP(6)"}},{"id":1,"name":"middle2_inner_row2","type":{"type":"ROW","fields":[{"id":0,"name":"inner2_char","type":"CHAR(50)"},{"id":1,"name":"inner2_float","type":"FLOAT"},{"id":2,"name":"inner2_binary","type":"BINARY(256)"}]}},{"id":2,"name":"middle2_map","type":{"type":"MAP","key":"CHAR(10)","value":{"type":"ROW","fields":[{"id":0,"name":"inner1_boolean","type":"BOOLEAN"},{"id":1,"name":"inner1_int","type":"INT"},{"id":2,"name":"inner1_varchar","type":"VARCHAR(100)"}]}}}]}}},{"id":4,"name":"outer_multiset","type":{"type":"MULTISET","element":{"type":"ROW","fields":[{"id":0,"name":"deep_inner_decimal","type":"DECIMAL(10, 2)"},{"id":1,"name":"deep_inner_varbinary","type":"VARBINARY(128)"}]}}}]} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/int_type.json: -------------------------------------------------------------------------------- 1 | "INT NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/int_type_nullable.json: -------------------------------------------------------------------------------- 1 | "INT" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/local_zoned_timestamp_type.json: -------------------------------------------------------------------------------- 1 | "TIMESTAMP(3) WITH LOCAL TIME ZONE NOT NULL" 
-------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/local_zoned_timestamp_type_nullable.json: -------------------------------------------------------------------------------- 1 | "TIMESTAMP(6) WITH LOCAL TIME ZONE" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/manifest/index-manifest-85cc6729-f5af-431a-a1c3-ef45319328fb-0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/paimon-rust/1f69c4a6c24f5f7fa858cad57be81fc135c1da71/crates/paimon/tests/fixtures/manifest/index-manifest-85cc6729-f5af-431a-a1c3-ef45319328fb-0 -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/manifest/manifest-8ded1f09-fcda-489e-9167-582ac0f9f846-0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/paimon-rust/1f69c4a6c24f5f7fa858cad57be81fc135c1da71/crates/paimon/tests/fixtures/manifest/manifest-8ded1f09-fcda-489e-9167-582ac0f9f846-0 -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/manifest/manifest-list-5c7399a0-46ae-4a5e-9c13-3ab07212cdb6-0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/paimon-rust/1f69c4a6c24f5f7fa858cad57be81fc135c1da71/crates/paimon/tests/fixtures/manifest/manifest-list-5c7399a0-46ae-4a5e-9c13-3ab07212cdb6-0 -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/manifest_file_meta_schema.json: -------------------------------------------------------------------------------- 1 | {"type":"ROW","fields":[{"id":0,"name":"_FILE_NAME","type":"STRING NOT NULL"},{"id":1,"name":"_FILE_SIZE","type":"BIGINT NOT NULL"},{"id":2,"name":"_NUM_ADDED_FILES","type":"BIGINT NOT 
NULL"},{"id":3,"name":"_NUM_DELETED_FILES","type":"BIGINT NOT NULL"},{"id":4,"name":"_PARTITION_STATS","type":{"type":"ROW","fields":[{"id":0,"name":"_MIN_VALUES","type":"BYTES NOT NULL"},{"id":1,"name":"_MAX_VALUES","type":"BYTES NOT NULL"},{"id":2,"name":"_NULL_COUNTS","type":{"type":"ARRAY","element":"BIGINT"}}]}},{"id":5,"name":"_SCHEMA_ID","type":"BIGINT NOT NULL"}]} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/map_type.json: -------------------------------------------------------------------------------- 1 | {"type":"MAP NOT NULL","key":"VARCHAR(20)","value":"INT NOT NULL"} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/map_type_nullable.json: -------------------------------------------------------------------------------- 1 | {"type":"MAP","key":"VARCHAR(20)","value":"INT"} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/multiset_type.json: -------------------------------------------------------------------------------- 1 | {"type":"MULTISET NOT NULL","element":"INT NOT NULL"} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/multiset_type_nullable.json: -------------------------------------------------------------------------------- 1 | {"type":"MULTISET","element":"INT"} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/row_type.json: -------------------------------------------------------------------------------- 1 | {"type":"ROW NOT NULL","fields":[{"id":0,"name":"a","type":"INT NOT NULL"},{"id":1,"name":"b","type":"VARCHAR(20) NOT NULL"}]} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/row_type_nullable.json: 
-------------------------------------------------------------------------------- 1 | {"type":"ROW","fields":[{"id":0,"name":"a","type":"INT"},{"id":1,"name":"b","type":"VARCHAR(20)"}]} -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/smallint_type.json: -------------------------------------------------------------------------------- 1 | "SMALLINT NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/smallint_type_nullable.json: -------------------------------------------------------------------------------- 1 | "SMALLINT" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/snapshot/snapshot-v3-none-field.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "id": 2, 4 | "schemaId": 0, 5 | "baseManifestList": "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-0", 6 | "deltaManifestList": "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-1", 7 | "changelogManifestList": null, 8 | "commitUser": "abbaac9e-4a17-43e3-b135-2269da263e3a", 9 | "commitIdentifier": 9223372036854775807, 10 | "commitKind": "APPEND", 11 | "timeMillis": 1724509030368, 12 | "logOffsets": {}, 13 | "totalRecordCount": 4, 14 | "deltaRecordCount": 2, 15 | "changelogRecordCount": 2 16 | } -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/snapshot/snapshot-v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "id": 2, 4 | "schemaId": 0, 5 | "baseManifestList": "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-0", 6 | "deltaManifestList": "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-1", 7 | "changelogManifestList": "manifest-list-ea4b892d-edc8-4ee7-9eee-7068b83a947b-2", 8 | "commitUser": 
"abbaac9e-4a17-43e3-b135-2269da263e3a", 9 | "commitIdentifier": 9223372036854775807, 10 | "commitKind": "APPEND", 11 | "timeMillis": 1724509030368, 12 | "logOffsets": {}, 13 | "totalRecordCount": 4, 14 | "deltaRecordCount": 2, 15 | "changelogRecordCount": 2, 16 | "statistics": "statistics_string" 17 | } -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/time_type.json: -------------------------------------------------------------------------------- 1 | "TIME(9) NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/time_type_nullable.json: -------------------------------------------------------------------------------- 1 | "TIME(0)" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/timestamp_type.json: -------------------------------------------------------------------------------- 1 | "TIMESTAMP(6) NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/timestamp_type_nullable.json: -------------------------------------------------------------------------------- 1 | "TIMESTAMP(6)" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/tinyint_type.json: -------------------------------------------------------------------------------- 1 | "TINYINT NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/tinyint_type_nullable.json: -------------------------------------------------------------------------------- 1 | "TINYINT" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/varbinary_type.json: -------------------------------------------------------------------------------- 1 | "VARBINARY(233) NOT NULL" 
-------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/varbinary_type_nullable.json: -------------------------------------------------------------------------------- 1 | "VARBINARY(233)" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/varchar_type.json: -------------------------------------------------------------------------------- 1 | "VARCHAR(33) NOT NULL" -------------------------------------------------------------------------------- /crates/paimon/tests/fixtures/varchar_type_nullable.json: -------------------------------------------------------------------------------- 1 | "VARCHAR(33)" -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [toolchain] 19 | channel = "stable" 20 | components = ["rustfmt", "clippy"] 21 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | edition = "2021" 19 | reorder_imports = true 20 | --------------------------------------------------------------------------------