├── .github └── workflows │ ├── coverage.yml │ └── test.yml ├── .gitignore ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── src ├── error.rs ├── lib.md ├── lib.rs ├── proto.rs └── read │ ├── column.rs │ ├── decode │ ├── boolean_rle.rs │ ├── float.rs │ ├── mod.rs │ ├── rle_v2.rs │ └── variable_length.rs │ ├── decompress │ └── mod.rs │ └── mod.rs ├── tests └── it │ ├── deserialize.rs │ └── main.rs └── write.py /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | 3 | on: [pull_request, push] 4 | 5 | jobs: 6 | coverage: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Install Rust 11 | run: rustup toolchain install stable --component llvm-tools-preview 12 | - name: Install cargo-llvm-cov 13 | uses: taiki-e/install-action@cargo-llvm-cov 14 | - uses: Swatinem/rust-cache@v1 15 | - name: Setup ORC files 16 | run: | 17 | apt update && apt install python3-pip python3-venv -y -q 18 | python3 -m venv venv 19 | source venv/bin/activate 20 | pip install pip --upgrade 21 | pip install pyorc 22 | python write.py 23 | deactivate 24 | - name: Generate code coverage 25 | run: cargo llvm-cov --lcov --output-path lcov.info 26 | - name: Upload coverage to Codecov 27 | uses: codecov/codecov-action@v1 28 | with: 29 | token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos 30 | files: lcov.info 31 | fail_ci_if_error: true 32 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Test 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: Swatinem/rust-cache@v1 12 | - uses: actions-rs/toolchain@v1 13 | with: 14 | toolchain: stable 15 | - name: Setup ORC files 16 | run: | 17 | apt update && apt 
install python3-pip python3-venv -y -q 18 | python3 -m venv venv 19 | source venv/bin/activate 20 | pip install pip --upgrade 21 | pip install pyorc 22 | python write.py 23 | deactivate 24 | - name: test 25 | run: cargo test 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | venv 4 | 5 | *.orc 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "orc-format" 3 | version = "0.3.0" 4 | license = "MIT/Apache-2.0" 5 | description = "Unofficial implementation of Apache ORC spec in safe Rust" 6 | homepage = "https://github.com/DataEngineeringLabs/orc-format" 7 | repository = "https://github.com/DataEngineeringLabs/orc-format" 8 | authors = ["Jorge C. Leitao "] 9 | keywords = [ "orc", "analytics" ] 10 | edition = "2021" 11 | 12 | [dependencies] 13 | prost = { version = "0.9.0" } 14 | flate2 = "1" 15 | fallible-streaming-iterator = { version = "0.1" } 16 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | https://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | Copyright 2022 Jorge C. Leitão 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at 184 | 185 | https://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2022 Jorge C Leitao 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Read Apache ORC from Rust 2 | 3 | [![test](https://github.com/DataEngineeringLabs/orc-format/actions/workflows/test.yml/badge.svg)](https://github.com/DataEngineeringLabs/orc-format/actions/workflows/test.yml) 4 | [![codecov](https://codecov.io/gh/DataEngineeringLabs/orc-format/branch/main/graph/badge.svg?token=AgyTF60R3D)](https://codecov.io/gh/DataEngineeringLabs/orc-format) 5 | 6 | Read [Apache ORC](https://orc.apache.org/) in Rust. 7 | 8 | This repository is similar to [parquet2](https://github.com/jorgecarleitao/parquet2) and [Avro-schema](https://github.com/DataEngineeringLabs/avro-schema), providing a toolkit to: 9 | 10 | * Read ORC files (proto structures) 11 | * Read stripes (the conversion from proto metadata to memory regions) 12 | * Decode stripes (the math of decoding stripes into e.g. booleans, runs of RLE, etc.) 13 | 14 | It currently reads the following (logical) types: 15 | 16 | * booleans 17 | * strings 18 | * integers 19 | * floats 20 | 21 | What is not yet implemented: 22 | 23 | * Snappy, LZO decompression 24 | * RLE v2 `Patched Base` decoding 25 | * RLE v1 decoding 26 | * Utility functions to decode non-native logical types: 27 | * decimal 28 | * timestamp 29 | * struct 30 | * list 31 | * union 32 | 33 | ## Run tests 34 | 35 | ```bash 36 | python3 -m venv venv 37 | venv/bin/pip install -U pip 38 | venv/bin/pip install -U pyorc 39 | venv/bin/python write.py 40 | cargo test 41 | ``` 42 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | //! Contains [`Error`] 2 | use crate::proto::stream::Kind; 3 | 4 | /// Possible errors from this crate.
5 | #[derive(Debug, Clone)] 6 | pub enum Error { 7 | /// Generic error returned when the file is out of spec 8 | OutOfSpec, 9 | /// When a string column contains a value with invalid UTF8 10 | InvalidUtf8, 11 | /// When the user requests a column that does not exist 12 | InvalidColumn(u32), 13 | /// When the user requests a type that does not exist for the given column 14 | InvalidKind(u32, Kind), 15 | /// When decoding a float fails 16 | DecodeFloat, 17 | /// When decompression fails 18 | Decompression, 19 | /// When decoding the proto files fails 20 | InvalidProto, 21 | } 22 | 23 | impl From<prost::DecodeError> for Error { 24 | fn from(_: prost::DecodeError) -> Self { 25 | Self::InvalidProto 26 | } 27 | } 28 | 29 | impl From<std::io::Error> for Error { 30 | fn from(_: std::io::Error) -> Self { 31 | Self::OutOfSpec 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/lib.md: -------------------------------------------------------------------------------- 1 | Welcome to `orc-format` documentation. Thanks for checking it out! 2 | 3 | This Rust crate is a toolkit to read and deserialize ORC to your favourite in-memory format. 4 | 5 | Below is an example of how to read a column from ORC into memory: 6 | 7 | ```rust 8 | use std::fs::File; 9 | 10 | use orc_format::{error::Error, read, read::Column}; 11 | 12 | 13 | fn get_column(path: &str, column: u32) -> Result<Column, Error> { 14 | // open the file, as expected. buffering this is not necessary - we 15 | // are very careful about the number of `read`s we perform. 16 | let mut f = File::open(path).expect("no file found"); 17 | 18 | // read the file's metadata 19 | let metadata = read::read_metadata(&mut f)?; 20 | 21 | // the next step is to identify which stripe we want to read. Let's say it is the first one. 22 | let stripe = 0; 23 | 24 | // Each stripe has a footer - we need to read it to extract the location of each column on it.
25 | let stripe_footer = read::read_stripe_footer(&mut f, &metadata, stripe, &mut vec![])?; 26 | 27 | // Finally, we read the column into `Column` 28 | read::read_stripe_column(&mut f, &metadata, stripe, stripe_footer, column, vec![]) 29 | } 30 | ``` 31 | 32 | To deserialize the values of a column, use the items inside `read::decode`. 33 | For example, the snippet below deserializes the "Present" stream into a `Vec<bool>`. 34 | 35 | ```rust 36 | use orc_format::{error::Error, proto::stream::Kind, read::decode::BooleanIter, read::Column}; 37 | 38 | fn deserialize_present(column: &Column, scratch: &mut Vec<u8>) -> Result<Vec<bool>, Error> { 39 | let mut reader = column.get_stream(Kind::Present, std::mem::take(scratch))?; 40 | 41 | let mut validity = Vec::with_capacity(column.number_of_rows()); 42 | BooleanIter::new(&mut reader, column.number_of_rows()).try_for_each(|item| { 43 | validity.push(item?); 44 | Result::<(), Error>::Ok(()) 45 | })?; 46 | 47 | *scratch = std::mem::take(&mut reader.into_inner()); 48 | 49 | Ok(validity) 50 | } 51 | ``` 52 | 53 | Check out the integration tests of the crate to find deserialization of other types such 54 | as floats, integers, strings and dictionaries.
55 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("lib.md")] 2 | #![forbid(unsafe_code)] 3 | pub mod error; 4 | pub mod proto; 5 | pub mod read; 6 | -------------------------------------------------------------------------------- /src/proto.rs: -------------------------------------------------------------------------------- 1 | #[derive(Clone, PartialEq, ::prost::Message)] 2 | pub struct IntegerStatistics { 3 | #[prost(sint64, optional, tag = "1")] 4 | pub minimum: ::core::option::Option, 5 | #[prost(sint64, optional, tag = "2")] 6 | pub maximum: ::core::option::Option, 7 | #[prost(sint64, optional, tag = "3")] 8 | pub sum: ::core::option::Option, 9 | } 10 | #[derive(Clone, PartialEq, ::prost::Message)] 11 | pub struct DoubleStatistics { 12 | #[prost(double, optional, tag = "1")] 13 | pub minimum: ::core::option::Option, 14 | #[prost(double, optional, tag = "2")] 15 | pub maximum: ::core::option::Option, 16 | #[prost(double, optional, tag = "3")] 17 | pub sum: ::core::option::Option, 18 | } 19 | #[derive(Clone, PartialEq, ::prost::Message)] 20 | pub struct StringStatistics { 21 | #[prost(string, optional, tag = "1")] 22 | pub minimum: ::core::option::Option<::prost::alloc::string::String>, 23 | #[prost(string, optional, tag = "2")] 24 | pub maximum: ::core::option::Option<::prost::alloc::string::String>, 25 | /// sum will store the total length of all strings in a stripe 26 | #[prost(sint64, optional, tag = "3")] 27 | pub sum: ::core::option::Option, 28 | } 29 | #[derive(Clone, PartialEq, ::prost::Message)] 30 | pub struct BucketStatistics { 31 | #[prost(uint64, repeated, tag = "1")] 32 | pub count: ::prost::alloc::vec::Vec, 33 | } 34 | #[derive(Clone, PartialEq, ::prost::Message)] 35 | pub struct DecimalStatistics { 36 | #[prost(string, optional, tag = "1")] 37 | pub minimum: 
::core::option::Option<::prost::alloc::string::String>, 38 | #[prost(string, optional, tag = "2")] 39 | pub maximum: ::core::option::Option<::prost::alloc::string::String>, 40 | #[prost(string, optional, tag = "3")] 41 | pub sum: ::core::option::Option<::prost::alloc::string::String>, 42 | } 43 | #[derive(Clone, PartialEq, ::prost::Message)] 44 | pub struct DateStatistics { 45 | /// min,max values saved as days since epoch 46 | #[prost(sint32, optional, tag = "1")] 47 | pub minimum: ::core::option::Option, 48 | #[prost(sint32, optional, tag = "2")] 49 | pub maximum: ::core::option::Option, 50 | } 51 | #[derive(Clone, PartialEq, ::prost::Message)] 52 | pub struct TimestampStatistics { 53 | /// min,max values saved as milliseconds since epoch 54 | #[prost(sint64, optional, tag = "1")] 55 | pub minimum: ::core::option::Option, 56 | #[prost(sint64, optional, tag = "2")] 57 | pub maximum: ::core::option::Option, 58 | } 59 | #[derive(Clone, PartialEq, ::prost::Message)] 60 | pub struct BinaryStatistics { 61 | /// sum will store the total binary blob length in a stripe 62 | #[prost(sint64, optional, tag = "1")] 63 | pub sum: ::core::option::Option, 64 | } 65 | #[derive(Clone, PartialEq, ::prost::Message)] 66 | pub struct ColumnStatistics { 67 | #[prost(uint64, optional, tag = "1")] 68 | pub number_of_values: ::core::option::Option, 69 | #[prost(message, optional, tag = "2")] 70 | pub int_statistics: ::core::option::Option, 71 | #[prost(message, optional, tag = "3")] 72 | pub double_statistics: ::core::option::Option, 73 | #[prost(message, optional, tag = "4")] 74 | pub string_statistics: ::core::option::Option, 75 | #[prost(message, optional, tag = "5")] 76 | pub bucket_statistics: ::core::option::Option, 77 | #[prost(message, optional, tag = "6")] 78 | pub decimal_statistics: ::core::option::Option, 79 | #[prost(message, optional, tag = "7")] 80 | pub date_statistics: ::core::option::Option, 81 | #[prost(message, optional, tag = "8")] 82 | pub binary_statistics: 
::core::option::Option, 83 | #[prost(message, optional, tag = "9")] 84 | pub timestamp_statistics: ::core::option::Option, 85 | #[prost(bool, optional, tag = "10")] 86 | pub has_null: ::core::option::Option, 87 | } 88 | #[derive(Clone, PartialEq, ::prost::Message)] 89 | pub struct RowIndexEntry { 90 | #[prost(uint64, repeated, tag = "1")] 91 | pub positions: ::prost::alloc::vec::Vec, 92 | #[prost(message, optional, tag = "2")] 93 | pub statistics: ::core::option::Option, 94 | } 95 | #[derive(Clone, PartialEq, ::prost::Message)] 96 | pub struct RowIndex { 97 | #[prost(message, repeated, tag = "1")] 98 | pub entry: ::prost::alloc::vec::Vec, 99 | } 100 | #[derive(Clone, PartialEq, ::prost::Message)] 101 | pub struct BloomFilter { 102 | #[prost(uint32, optional, tag = "1")] 103 | pub num_hash_functions: ::core::option::Option, 104 | #[prost(fixed64, repeated, packed = "false", tag = "2")] 105 | pub bitset: ::prost::alloc::vec::Vec, 106 | } 107 | #[derive(Clone, PartialEq, ::prost::Message)] 108 | pub struct BloomFilterIndex { 109 | #[prost(message, repeated, tag = "1")] 110 | pub bloom_filter: ::prost::alloc::vec::Vec, 111 | } 112 | #[derive(Clone, PartialEq, ::prost::Message)] 113 | pub struct Stream { 114 | #[prost(enumeration = "stream::Kind", optional, tag = "1")] 115 | pub kind: ::core::option::Option, 116 | #[prost(uint32, optional, tag = "2")] 117 | pub column: ::core::option::Option, 118 | #[prost(uint64, optional, tag = "3")] 119 | pub length: ::core::option::Option, 120 | } 121 | /// Nested message and enum types in `Stream`. 
122 | pub mod stream { 123 | /// if you add new index stream kinds, you need to make sure to update 124 | /// StreamName to ensure it is added to the stripe in the right area 125 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] 126 | #[repr(i32)] 127 | pub enum Kind { 128 | Present = 0, 129 | Data = 1, 130 | Length = 2, 131 | DictionaryData = 3, 132 | DictionaryCount = 4, 133 | Secondary = 5, 134 | RowIndex = 6, 135 | BloomFilter = 7, 136 | } 137 | } 138 | #[derive(Clone, PartialEq, ::prost::Message)] 139 | pub struct ColumnEncoding { 140 | #[prost(enumeration = "column_encoding::Kind", optional, tag = "1")] 141 | pub kind: ::core::option::Option, 142 | #[prost(uint32, optional, tag = "2")] 143 | pub dictionary_size: ::core::option::Option, 144 | } 145 | /// Nested message and enum types in `ColumnEncoding`. 146 | pub mod column_encoding { 147 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] 148 | #[repr(i32)] 149 | pub enum Kind { 150 | Direct = 0, 151 | Dictionary = 1, 152 | DirectV2 = 2, 153 | DictionaryV2 = 3, 154 | } 155 | } 156 | #[derive(Clone, PartialEq, ::prost::Message)] 157 | pub struct StripeFooter { 158 | #[prost(message, repeated, tag = "1")] 159 | pub streams: ::prost::alloc::vec::Vec, 160 | #[prost(message, repeated, tag = "2")] 161 | pub columns: ::prost::alloc::vec::Vec, 162 | #[prost(string, optional, tag = "3")] 163 | pub writer_timezone: ::core::option::Option<::prost::alloc::string::String>, 164 | } 165 | #[derive(Clone, PartialEq, ::prost::Message)] 166 | pub struct Type { 167 | #[prost(enumeration = "r#type::Kind", optional, tag = "1")] 168 | pub kind: ::core::option::Option, 169 | #[prost(uint32, repeated, tag = "2")] 170 | pub subtypes: ::prost::alloc::vec::Vec, 171 | #[prost(string, repeated, tag = "3")] 172 | pub field_names: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, 173 | #[prost(uint32, optional, tag = "4")] 174 | pub maximum_length: 
::core::option::Option, 175 | #[prost(uint32, optional, tag = "5")] 176 | pub precision: ::core::option::Option, 177 | #[prost(uint32, optional, tag = "6")] 178 | pub scale: ::core::option::Option, 179 | } 180 | /// Nested message and enum types in `Type`. 181 | pub mod r#type { 182 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] 183 | #[repr(i32)] 184 | pub enum Kind { 185 | Boolean = 0, 186 | Byte = 1, 187 | Short = 2, 188 | Int = 3, 189 | Long = 4, 190 | Float = 5, 191 | Double = 6, 192 | String = 7, 193 | Binary = 8, 194 | Timestamp = 9, 195 | List = 10, 196 | Map = 11, 197 | Struct = 12, 198 | Union = 13, 199 | Decimal = 14, 200 | Date = 15, 201 | Varchar = 16, 202 | Char = 17, 203 | } 204 | } 205 | #[derive(Clone, PartialEq, ::prost::Message)] 206 | pub struct StripeInformation { 207 | #[prost(uint64, optional, tag = "1")] 208 | pub offset: ::core::option::Option, 209 | #[prost(uint64, optional, tag = "2")] 210 | pub index_length: ::core::option::Option, 211 | #[prost(uint64, optional, tag = "3")] 212 | pub data_length: ::core::option::Option, 213 | #[prost(uint64, optional, tag = "4")] 214 | pub footer_length: ::core::option::Option, 215 | #[prost(uint64, optional, tag = "5")] 216 | pub number_of_rows: ::core::option::Option, 217 | } 218 | #[derive(Clone, PartialEq, ::prost::Message)] 219 | pub struct UserMetadataItem { 220 | #[prost(string, optional, tag = "1")] 221 | pub name: ::core::option::Option<::prost::alloc::string::String>, 222 | #[prost(bytes = "vec", optional, tag = "2")] 223 | pub value: ::core::option::Option<::prost::alloc::vec::Vec>, 224 | } 225 | #[derive(Clone, PartialEq, ::prost::Message)] 226 | pub struct StripeStatistics { 227 | #[prost(message, repeated, tag = "1")] 228 | pub col_stats: ::prost::alloc::vec::Vec, 229 | } 230 | #[derive(Clone, PartialEq, ::prost::Message)] 231 | pub struct Metadata { 232 | #[prost(message, repeated, tag = "1")] 233 | pub stripe_stats: ::prost::alloc::vec::Vec, 
234 | } 235 | #[derive(Clone, PartialEq, ::prost::Message)] 236 | pub struct Footer { 237 | #[prost(uint64, optional, tag = "1")] 238 | pub header_length: ::core::option::Option, 239 | #[prost(uint64, optional, tag = "2")] 240 | pub content_length: ::core::option::Option, 241 | #[prost(message, repeated, tag = "3")] 242 | pub stripes: ::prost::alloc::vec::Vec, 243 | #[prost(message, repeated, tag = "4")] 244 | pub types: ::prost::alloc::vec::Vec, 245 | #[prost(message, repeated, tag = "5")] 246 | pub metadata: ::prost::alloc::vec::Vec, 247 | #[prost(uint64, optional, tag = "6")] 248 | pub number_of_rows: ::core::option::Option, 249 | #[prost(message, repeated, tag = "7")] 250 | pub statistics: ::prost::alloc::vec::Vec, 251 | #[prost(uint32, optional, tag = "8")] 252 | pub row_index_stride: ::core::option::Option, 253 | } 254 | /// Serialized length must be less than 255 bytes 255 | #[derive(Clone, PartialEq, ::prost::Message)] 256 | pub struct PostScript { 257 | #[prost(uint64, optional, tag = "1")] 258 | pub footer_length: ::core::option::Option, 259 | #[prost(enumeration = "CompressionKind", optional, tag = "2")] 260 | pub compression: ::core::option::Option, 261 | #[prost(uint64, optional, tag = "3")] 262 | pub compression_block_size: ::core::option::Option, 263 | /// the version of the file format 264 | /// [0, 11] = Hive 0.11 265 | /// [0, 12] = Hive 0.12 266 | #[prost(uint32, repeated, tag = "4")] 267 | pub version: ::prost::alloc::vec::Vec, 268 | #[prost(uint64, optional, tag = "5")] 269 | pub metadata_length: ::core::option::Option, 270 | /// Version of the writer: 271 | /// 0 (or missing) = original 272 | /// 1 = HIVE-8732 fixed 273 | #[prost(uint32, optional, tag = "6")] 274 | pub writer_version: ::core::option::Option, 275 | /// Leave this last in the record 276 | #[prost(string, optional, tag = "8000")] 277 | pub magic: ::core::option::Option<::prost::alloc::string::String>, 278 | } 279 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd,
Ord, ::prost::Enumeration)] 280 | #[repr(i32)] 281 | pub enum CompressionKind { 282 | None = 0, 283 | Zlib = 1, 284 | Snappy = 2, 285 | Lzo = 3, 286 | } 287 | -------------------------------------------------------------------------------- /src/read/column.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | error::Error, 3 | proto::{stream::Kind, ColumnEncoding, CompressionKind, StripeFooter}, 4 | }; 5 | 6 | use super::decompress::Decompressor; 7 | 8 | /// Helper struct used to access the streams associated with an ORC column. 9 | /// Its main use is [`Column::get_stream`], to get a stream. 10 | #[derive(Debug)] 11 | pub struct Column { 12 | data: Vec, 13 | column: u32, 14 | number_of_rows: u64, 15 | footer: StripeFooter, 16 | compression: CompressionKind, 17 | } 18 | 19 | impl Column { 20 | pub(crate) fn new( 21 | data: Vec, 22 | column: u32, 23 | number_of_rows: u64, 24 | footer: StripeFooter, 25 | compression: CompressionKind, 26 | ) -> Self { 27 | Self { 28 | data, 29 | column, 30 | number_of_rows, 31 | footer, 32 | compression, 33 | } 34 | } 35 | 36 | /// Returns the stream `kind` associated with this column as a [`Decompressor`]. 37 | /// `scratch` becomes owned by [`Decompressor`], which you can recover via `into_inner`.
38 | pub fn get_stream(&self, kind: Kind, scratch: Vec) -> Result { 39 | let mut start = 0; // the start of the stream 40 | self.footer 41 | .streams 42 | .iter() 43 | .filter(|stream| stream.column() == self.column && stream.kind() != Kind::RowIndex) 44 | .map(|stream| { 45 | start += stream.length() as usize; 46 | stream 47 | }) 48 | .find(|stream| stream.kind() == kind) 49 | .map(|stream| { 50 | let length = stream.length() as usize; 51 | let data = &self.data[start - length..start]; 52 | Decompressor::new(data, self.compression, scratch) 53 | }) 54 | .ok_or(Error::InvalidKind(self.column, kind)) 55 | } 56 | 57 | /// Returns the encoding of the column 58 | pub fn encoding(&self) -> &ColumnEncoding { 59 | &self.footer.columns[self.column as usize] 60 | } 61 | 62 | /// Returns the number of items in the dictionary, if any 63 | pub fn dictionary_size(&self) -> Option { 64 | self.footer.columns[self.column as usize] 65 | .dictionary_size 66 | .map(|x| x as usize) 67 | } 68 | 69 | /// The number of rows on this column 70 | pub fn number_of_rows(&self) -> usize { 71 | self.number_of_rows as usize 72 | } 73 | 74 | /// Returns the underlying footer and the pre-allocated memory region 75 | /// containing all (compressed) streams of this column. 
76 | pub fn into_inner(self) -> (StripeFooter, Vec) { 77 | (self.footer, self.data) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/read/decode/boolean_rle.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | 3 | use crate::error::Error; 4 | 5 | use super::read_u8; 6 | 7 | #[derive(Debug, Copy, Clone, PartialEq)] 8 | #[allow(clippy::large_enum_variant)] 9 | pub enum BooleanRun { 10 | Run(u8, u16), 11 | Literals([u8; 255]), 12 | } 13 | 14 | pub struct BooleanRleRunIter { 15 | reader: R, 16 | } 17 | 18 | impl BooleanRleRunIter { 19 | pub fn new(reader: R) -> Self { 20 | Self { reader } 21 | } 22 | 23 | pub fn into_inner(self) -> R { 24 | self.reader 25 | } 26 | } 27 | 28 | fn read_literals(reader: &mut R, header: i8) -> Result<[u8; 255], Error> { 29 | let length = (-header) as usize; 30 | 31 | let mut literals = [0u8; 255]; 32 | 33 | reader 34 | .take(length as u64) 35 | .read_exact(&mut literals[..length])?; 36 | 37 | Ok(literals) 38 | } 39 | 40 | impl Iterator for BooleanRleRunIter { 41 | type Item = Result; 42 | 43 | #[inline] 44 | fn next(&mut self) -> Option { 45 | let header = read_u8(&mut self.reader); 46 | let header = match header { 47 | Ok(header) => header as i8, 48 | Err(e) => return Some(Err(e.into())), 49 | }; 50 | if header < 0 { 51 | Some(read_literals(&mut self.reader, header).map(BooleanRun::Literals)) 52 | } else { 53 | let length = header as u16 + 3; 54 | // this is not ok - it may require more than one byte 55 | let value = read_u8(&mut self.reader); 56 | let value = match value { 57 | Ok(value) => value, 58 | Err(e) => return Some(Err(e.into())), 59 | }; 60 | Some(Ok(BooleanRun::Run(value, length))) 61 | } 62 | } 63 | } 64 | 65 | pub struct BooleanIter { 66 | iter: BooleanRleRunIter, 67 | current: Option, 68 | position: u8, 69 | byte_position: usize, 70 | remaining: usize, 71 | } 72 | 73 | impl<'a, R: Read> BooleanIter { 74 | pub 
fn new(reader: R, length: usize) -> Self { 75 | Self { 76 | iter: BooleanRleRunIter::new(reader), 77 | current: None, 78 | position: 0, 79 | byte_position: 0, 80 | remaining: length, 81 | } 82 | } 83 | 84 | pub fn into_inner(self) -> R { 85 | self.iter.into_inner() 86 | } 87 | } 88 | 89 | impl Iterator for BooleanIter { 90 | type Item = Result; 91 | 92 | #[inline] 93 | fn next(&mut self) -> Option { 94 | if let Some(run) = &self.current { 95 | match run { 96 | BooleanRun::Run(value, repetitions) => { 97 | let repetitions = *repetitions; 98 | let mask = 128u8 >> self.position; 99 | let result = value & mask == mask; 100 | self.position += 1; 101 | if self.remaining == 0 { 102 | self.current = None; 103 | return None; 104 | } else { 105 | self.remaining -= 1; 106 | } 107 | if self.position == 8 { 108 | if repetitions == 0 { 109 | self.current = None; 110 | } else { 111 | self.current = Some(BooleanRun::Run(*value, repetitions - 1)); 112 | } 113 | self.position = 0; 114 | } 115 | Some(Ok(result)) 116 | } 117 | BooleanRun::Literals(bytes) => { 118 | let mask = 128u8 >> self.position; 119 | let result = bytes[self.byte_position] & mask == mask; 120 | self.position += 1; 121 | if self.remaining == 0 { 122 | self.current = None; 123 | return None; 124 | } else { 125 | self.remaining -= 1; 126 | } 127 | if self.position == 8 { 128 | if bytes.len() == 1 { 129 | self.current = None; 130 | self.byte_position = 0; 131 | } else { 132 | self.byte_position += 1; 133 | } 134 | self.position = 0; 135 | } 136 | Some(Ok(result)) 137 | } 138 | } 139 | } else if self.remaining > 0 { 140 | match self.iter.next()? 
{ 141 | Ok(run) => { 142 | self.current = Some(run); 143 | self.next() 144 | } 145 | Err(e) => { 146 | self.remaining = 0; 147 | Some(Err(e)) 148 | } 149 | } 150 | } else { 151 | None 152 | } 153 | } 154 | 155 | fn size_hint(&self) -> (usize, Option) { 156 | (self.remaining, Some(self.remaining)) 157 | } 158 | } 159 | 160 | #[cfg(test)] 161 | mod test { 162 | use super::*; 163 | 164 | #[test] 165 | fn basic() { 166 | let data = [0x61u8, 0x00]; 167 | 168 | let data = &mut data.as_ref(); 169 | 170 | let iter = BooleanIter::new(data, 100) 171 | .collect::, Error>>() 172 | .unwrap(); 173 | assert_eq!(iter, vec![false; 100]) 174 | } 175 | 176 | #[test] 177 | fn literals() { 178 | let data = [0xfeu8, 0b01000100, 0b01000101]; 179 | 180 | let data = &mut data.as_ref(); 181 | 182 | let iter = BooleanIter::new(data, 16) 183 | .collect::, Error>>() 184 | .unwrap(); 185 | assert_eq!( 186 | iter, 187 | vec![ 188 | false, true, false, false, false, true, false, false, // 0b01000100 189 | false, true, false, false, false, true, false, true, // 0b01000101 190 | ] 191 | ) 192 | } 193 | 194 | #[test] 195 | fn another() { 196 | // "For example, the byte sequence [0xff, 0x80] would be one true followed by seven false values." 197 | let data = [0xff, 0x80]; 198 | 199 | let data = &mut data.as_ref(); 200 | 201 | let iter = BooleanIter::new(data, 8) 202 | .collect::, Error>>() 203 | .unwrap(); 204 | assert_eq!( 205 | iter, 206 | vec![true, false, false, false, false, false, false, false,] 207 | ) 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /src/read/decode/float.rs: -------------------------------------------------------------------------------- 1 | use crate::error::Error; 2 | 3 | /// Sealead trait to generically represent f32 and f64. 
4 | pub trait Float: Default + Copy + private::Sealed { 5 | type Bytes: AsRef<[u8]> + AsMut<[u8]> + Default; 6 | fn from_le_bytes(bytes: Self::Bytes) -> Self; 7 | } 8 | 9 | mod private { 10 | pub trait Sealed {} // Users in other crates cannot name this trait. 11 | impl Sealed for f32 {} 12 | impl Sealed for f64 {} 13 | } 14 | 15 | impl Float for f32 { 16 | type Bytes = [u8; 4]; 17 | 18 | #[inline] 19 | fn from_le_bytes(bytes: Self::Bytes) -> Self { 20 | Self::from_le_bytes(bytes) 21 | } 22 | } 23 | 24 | impl Float for f64 { 25 | type Bytes = [u8; 8]; 26 | 27 | #[inline] 28 | fn from_le_bytes(bytes: Self::Bytes) -> Self { 29 | Self::from_le_bytes(bytes) 30 | } 31 | } 32 | 33 | /// An iterator 34 | pub struct FloatIter { 35 | reader: R, 36 | remaining: usize, 37 | phantom: std::marker::PhantomData, 38 | } 39 | 40 | impl FloatIter { 41 | /// Returns a new [`FloatIter`] 42 | #[inline] 43 | pub fn new(reader: R, length: usize) -> Self { 44 | Self { 45 | reader, 46 | remaining: length, 47 | phantom: Default::default(), 48 | } 49 | } 50 | 51 | /// The number of items remaining 52 | #[inline] 53 | pub fn len(&self) -> usize { 54 | self.remaining 55 | } 56 | 57 | /// Whether the iterator is empty 58 | #[must_use] 59 | pub fn is_empty(&self) -> bool { 60 | self.len() == 0 61 | } 62 | 63 | /// Returns its internal reader 64 | pub fn into_inner(self) -> R { 65 | self.reader 66 | } 67 | } 68 | 69 | impl Iterator for FloatIter { 70 | type Item = Result; 71 | 72 | #[inline] 73 | fn next(&mut self) -> Option { 74 | if self.remaining == 0 { 75 | return None; 76 | } 77 | let mut chunk: T::Bytes = Default::default(); 78 | let error = self.reader.read_exact(chunk.as_mut()); 79 | if error.is_err() { 80 | return Some(Err(Error::DecodeFloat)); 81 | }; 82 | self.remaining -= 1; 83 | Some(Ok(T::from_le_bytes(chunk))) 84 | } 85 | 86 | #[inline] 87 | fn size_hint(&self) -> (usize, Option) { 88 | let remaining = self.len(); 89 | (remaining, Some(remaining)) 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /src/read/decode/mod.rs: -------------------------------------------------------------------------------- 1 | //! Contains different iterators that receive a reader ([`std::io::Read`]) 2 | //! and return values for each of ORC's physical types (e.g. boolean). 3 | mod boolean_rle; 4 | mod float; 5 | mod rle_v2; 6 | mod variable_length; 7 | 8 | pub use boolean_rle::{BooleanIter, BooleanRleRunIter, BooleanRun}; 9 | pub use float::{Float, FloatIter}; 10 | pub use rle_v2::{ 11 | SignedRleV2Iter, SignedRleV2Run, SignedRleV2RunIter, UnsignedRleV2Iter, UnsignedRleV2Run, 12 | UnsignedRleV2RunIter, 13 | }; 14 | pub use variable_length::Values; 15 | 16 | #[inline] 17 | fn read_u8(reader: &mut R) -> Result { 18 | let mut buf = [0; 1]; 19 | reader.read_exact(&mut buf)?; 20 | Ok(buf[0]) 21 | } 22 | -------------------------------------------------------------------------------- /src/read/decode/rle_v2.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | 3 | use crate::error::Error; 4 | 5 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 6 | enum EncodingTypeV2 { 7 | ShortRepeat, 8 | Direct, 9 | PatchedBase, 10 | Delta, 11 | } 12 | 13 | fn header_to_rle_v2_short_repeated_width(header: u8) -> u8 { 14 | (header & 0b00111000) >> 3 15 | } 16 | 17 | fn header_to_rle_v2_short_repeated_count(header: u8) -> u8 { 18 | header & 0b00000111 19 | } 20 | 21 | fn rle_v2_direct_bit_width(value: u8) -> u8 { 22 | match value { 23 | 0 => 1, 24 | 1 => 2, 25 | 3 => 4, 26 | 7 => 8, 27 | 15 => 16, 28 | 23 => 24, 29 | 27 => 32, 30 | 28 => 40, 31 | 29 => 48, 32 | 30 => 56, 33 | 31 => 64, 34 | other => todo!("{other}"), 35 | } 36 | } 37 | 38 | fn header_to_rle_v2_direct_bit_width(header: u8) -> u8 { 39 | let bit_width = (header & 0b00111110) >> 1; 40 | rle_v2_direct_bit_width(bit_width) 41 | } 42 | 43 | fn rle_v2_delta_bit_width(value: u8) -> u8 { 44 | match value 
{ 45 | 0 => 0, 46 | 1 => 2, 47 | 3 => 4, 48 | 7 => 8, 49 | 15 => 16, 50 | 23 => 24, 51 | 27 => 32, 52 | 28 => 40, 53 | 29 => 48, 54 | 30 => 56, 55 | 31 => 64, 56 | other => todo!("{other}"), 57 | } 58 | } 59 | 60 | fn header_to_rle_v2_delta_bit_width(header: u8) -> u8 { 61 | let bit_width = (header & 0b00111110) >> 1; 62 | rle_v2_delta_bit_width(bit_width) 63 | } 64 | 65 | fn header_to_rle_v2_direct_length(header: u8, header1: u8) -> u16 { 66 | let bit = header & 0b00000001; 67 | let r = u16::from_be_bytes([bit, header1]); 68 | 1 + r 69 | } 70 | 71 | fn unsigned_varint(reader: &mut R) -> Result { 72 | let mut i = 0u64; 73 | let mut buf = [0u8; 1]; 74 | let mut j = 0; 75 | loop { 76 | if j > 9 { 77 | // if j * 7 > 64 78 | return Err(Error::OutOfSpec); 79 | } 80 | reader.read_exact(&mut buf[..])?; 81 | i |= (u64::from(buf[0] & 0x7F)) << (j * 7); 82 | if (buf[0] >> 7) == 0 { 83 | break; 84 | } else { 85 | j += 1; 86 | } 87 | } 88 | Ok(i) 89 | } 90 | 91 | #[inline] 92 | fn zigzag(z: u64) -> i64 { 93 | if z & 0x1 == 0 { 94 | (z >> 1) as i64 95 | } else { 96 | !(z >> 1) as i64 97 | } 98 | } 99 | 100 | fn signed_varint(reader: &mut R) -> Result { 101 | unsigned_varint(reader).map(zigzag) 102 | } 103 | 104 | #[inline] 105 | fn unpack(bytes: &[u8], num_bits: u8, index: usize) -> u64 { 106 | if num_bits == 0 { 107 | return 0; 108 | }; 109 | let num_bits = num_bits as usize; 110 | let start = num_bits * index; // in bits 111 | let length = num_bits; // in bits 112 | let byte_start = start / 8; 113 | let byte_end = (start + length + 7) / 8; 114 | // copy swapped 115 | let slice = &bytes[byte_start..byte_end]; 116 | let mut a = [0u8; 8]; 117 | for (i, item) in slice.iter().rev().enumerate() { 118 | a[i] = *item; 119 | } 120 | let bits = u64::from_le_bytes(a); 121 | let offset = (slice.len() * 8 - num_bits) % 8 - start % 8; 122 | (bits >> offset) & (!0u64 >> (64 - num_bits)) 123 | } 124 | 125 | #[derive(Debug)] 126 | pub struct UnsignedDirectRun { 127 | data: Vec, 128 | 
bit_width: u8, 129 | index: usize, 130 | length: usize, 131 | } 132 | 133 | impl UnsignedDirectRun { 134 | #[inline] 135 | pub fn try_new( 136 | header: u8, 137 | reader: &mut R, 138 | mut scratch: Vec, 139 | ) -> Result { 140 | let mut header1 = [0u8]; 141 | reader.read_exact(&mut header1)?; 142 | let bit_width = header_to_rle_v2_direct_bit_width(header); 143 | 144 | let length = header_to_rle_v2_direct_length(header, header1[0]); 145 | 146 | let additional = ((bit_width as usize) * (length as usize) + 7) / 8; 147 | scratch.clear(); 148 | scratch.reserve(additional); 149 | reader.take(additional as u64).read_to_end(&mut scratch)?; 150 | 151 | Ok(Self { 152 | data: scratch, 153 | bit_width, 154 | index: 0, 155 | length: length as usize, 156 | }) 157 | } 158 | 159 | #[inline] 160 | pub fn len(&self) -> usize { 161 | self.length - self.index 162 | } 163 | } 164 | 165 | impl Iterator for UnsignedDirectRun { 166 | type Item = u64; 167 | 168 | #[inline] 169 | fn next(&mut self) -> Option { 170 | (self.index != self.length).then(|| { 171 | let index = self.index; 172 | self.index += 1; 173 | unpack(&self.data, self.bit_width, index) 174 | }) 175 | } 176 | 177 | #[inline] 178 | fn size_hint(&self) -> (usize, Option) { 179 | let remaining = self.len(); 180 | (remaining, Some(remaining)) 181 | } 182 | } 183 | 184 | pub struct UnsignedDeltaRun { 185 | encoded_deltas: Vec, 186 | bit_width: u8, 187 | index: usize, 188 | length: usize, 189 | base: u64, 190 | delta_base: i64, 191 | } 192 | 193 | impl UnsignedDeltaRun { 194 | #[inline] 195 | pub fn try_new( 196 | header: u8, 197 | reader: &mut R, 198 | mut scratch: Vec, 199 | ) -> Result { 200 | let mut header1 = [0u8]; 201 | reader.read_exact(&mut header1)?; 202 | let bit_width = header_to_rle_v2_delta_bit_width(header); 203 | 204 | let length = header_to_rle_v2_direct_length(header, header1[0]); 205 | 206 | let base = unsigned_varint(reader)?; 207 | let delta_base = signed_varint(reader)?; 208 | let additional = ((length as 
usize - 2) * bit_width as usize + 7) / 8; 209 | 210 | scratch.clear(); 211 | scratch.reserve(additional); 212 | reader.take(additional as u64).read_to_end(&mut scratch)?; 213 | 214 | Ok(Self { 215 | base, 216 | encoded_deltas: scratch, 217 | bit_width, 218 | index: 0, 219 | length: length as usize, 220 | delta_base, 221 | }) 222 | } 223 | 224 | #[inline] 225 | pub fn len(&self) -> usize { 226 | self.length - self.index 227 | } 228 | 229 | #[inline] 230 | pub fn into_inner(mut self) -> Vec { 231 | self.encoded_deltas.clear(); 232 | self.encoded_deltas 233 | } 234 | } 235 | 236 | impl Iterator for UnsignedDeltaRun { 237 | type Item = u64; 238 | 239 | #[inline] 240 | fn next(&mut self) -> Option { 241 | (self.index != self.length).then(|| { 242 | let index = self.index; 243 | if index == 0 { 244 | self.index += 1; 245 | return self.base; 246 | } 247 | if index == 1 || self.bit_width == 0 { 248 | self.index += 1; 249 | if self.delta_base > 0 { 250 | self.base += self.delta_base as u64; 251 | } else { 252 | self.base -= (-self.delta_base) as u64; 253 | } 254 | return self.base; 255 | } 256 | self.index += 1; 257 | let delta = unpack(&self.encoded_deltas, self.bit_width, index - 2); 258 | if self.delta_base > 0 { 259 | self.base += delta; 260 | } else { 261 | self.base -= delta; 262 | } 263 | self.base 264 | }) 265 | } 266 | 267 | #[inline] 268 | fn size_hint(&self) -> (usize, Option) { 269 | let remaining = self.len(); 270 | (remaining, Some(remaining)) 271 | } 272 | } 273 | 274 | #[derive(Debug)] 275 | pub struct UnsignedShortRepeat { 276 | value: u64, 277 | remaining: usize, 278 | scratch: Vec, 279 | } 280 | 281 | impl UnsignedShortRepeat { 282 | #[inline] 283 | fn try_new(header: u8, reader: &mut R, mut scratch: Vec) -> Result { 284 | let width = 1 + header_to_rle_v2_short_repeated_width(header); 285 | let count = 3 + header_to_rle_v2_short_repeated_count(header); 286 | 287 | scratch.clear(); 288 | scratch.reserve(width as usize); 289 | reader.take(width as 
u64).read_to_end(&mut scratch)?; 290 | 291 | let mut a = [0u8; 8]; 292 | a[8 - scratch.len()..].copy_from_slice(&scratch); 293 | let value = u64::from_be_bytes(a); 294 | scratch.clear(); 295 | 296 | Ok(Self { 297 | value, 298 | remaining: count as usize, 299 | scratch, 300 | }) 301 | } 302 | 303 | #[inline] 304 | pub fn len(&self) -> usize { 305 | self.remaining 306 | } 307 | 308 | #[inline] 309 | pub fn into_inner(self) -> Vec { 310 | self.scratch 311 | } 312 | } 313 | 314 | impl Iterator for UnsignedShortRepeat { 315 | type Item = u64; 316 | 317 | #[inline] 318 | fn next(&mut self) -> Option { 319 | (self.remaining != 0).then(|| { 320 | self.remaining -= 1; 321 | self.value 322 | }) 323 | } 324 | 325 | #[inline] 326 | fn size_hint(&self) -> (usize, Option) { 327 | (self.len(), Some(self.len())) 328 | } 329 | } 330 | 331 | #[derive(Debug)] 332 | pub struct SignedDeltaRun { 333 | encoded_deltas: Vec, 334 | bit_width: u8, 335 | index: usize, 336 | length: usize, 337 | base: i64, 338 | delta_base: i64, 339 | } 340 | 341 | impl SignedDeltaRun { 342 | #[inline] 343 | fn try_new(header: u8, reader: &mut R, mut scratch: Vec) -> Result { 344 | let mut header1 = [0u8]; 345 | reader.read_exact(&mut header1)?; 346 | let bit_width = header_to_rle_v2_delta_bit_width(header); 347 | 348 | let length = header_to_rle_v2_direct_length(header, header1[0]); 349 | 350 | let base = unsigned_varint(reader).map(zigzag)?; 351 | let delta_base = signed_varint(reader)?; 352 | let additional = ((length as usize - 2) * bit_width as usize + 7) / 8; 353 | 354 | scratch.clear(); 355 | scratch.reserve(additional); 356 | reader.take(additional as u64).read_to_end(&mut scratch)?; 357 | 358 | Ok(Self { 359 | base, 360 | encoded_deltas: scratch, 361 | bit_width, 362 | index: 0, 363 | length: length as usize, 364 | delta_base, 365 | }) 366 | } 367 | 368 | pub fn len(&self) -> usize { 369 | self.length - self.index 370 | } 371 | 372 | #[must_use] 373 | pub fn is_empty(&self) -> bool { 374 | self.len() 
== 0 375 | } 376 | } 377 | 378 | impl Iterator for SignedDeltaRun { 379 | type Item = i64; 380 | 381 | #[inline] 382 | fn next(&mut self) -> Option { 383 | (self.index != self.length).then(|| { 384 | let index = self.index; 385 | if index == 0 { 386 | self.index += 1; 387 | return self.base; 388 | } 389 | if index == 1 || self.bit_width == 0 { 390 | self.index += 1; 391 | if self.delta_base > 0 { 392 | self.base += self.delta_base as i64; 393 | } else { 394 | self.base -= (-self.delta_base) as i64; 395 | } 396 | return self.base; 397 | } 398 | self.index += 1; 399 | // edge case where `bit_width == 0`, where deltas are equal to base delta 400 | let delta = unpack(&self.encoded_deltas, self.bit_width, index - 2); 401 | if self.delta_base > 0 { 402 | self.base += delta as i64; 403 | } else { 404 | self.base -= delta as i64; 405 | } 406 | self.base 407 | }) 408 | } 409 | 410 | #[inline] 411 | fn size_hint(&self) -> (usize, Option) { 412 | let remaining = self.length - self.index; 413 | (remaining, Some(remaining)) 414 | } 415 | } 416 | 417 | #[inline] 418 | fn run_encoding(header: u8) -> EncodingTypeV2 { 419 | match (header & 128 == 128, header & 64 == 64) { 420 | // 11... = 3 421 | (true, true) => EncodingTypeV2::Delta, 422 | // 10... = 2 423 | (true, false) => EncodingTypeV2::PatchedBase, 424 | // 01... = 1 425 | (false, true) => EncodingTypeV2::Direct, 426 | // 00... = 0 427 | (false, false) => EncodingTypeV2::ShortRepeat, 428 | } 429 | } 430 | 431 | /// An enum describing one of the RLE v2 runs for unsigned integers 432 | pub enum UnsignedRleV2Run { 433 | /// Direct 434 | Direct(UnsignedDirectRun), 435 | /// Delta 436 | Delta(UnsignedDeltaRun), 437 | /// Short repeat 438 | ShortRepeat(UnsignedShortRepeat), 439 | } 440 | 441 | impl UnsignedRleV2Run { 442 | /// Returns a new [`UnsignedRleV2Run`] owning `scratch`. 
443 | pub fn try_new(reader: &mut R, scratch: Vec) -> Result { 444 | let mut header = [0u8]; 445 | reader.read_exact(&mut header)?; 446 | let header = header[0]; 447 | let encoding = run_encoding(header); 448 | 449 | match encoding { 450 | EncodingTypeV2::Direct => { 451 | UnsignedDirectRun::try_new(header, reader, scratch).map(Self::Direct) 452 | } 453 | EncodingTypeV2::Delta => { 454 | UnsignedDeltaRun::try_new(header, reader, scratch).map(Self::Delta) 455 | } 456 | EncodingTypeV2::ShortRepeat => { 457 | UnsignedShortRepeat::try_new(header, reader, scratch).map(Self::ShortRepeat) 458 | } 459 | other => todo!("{other:?}"), 460 | } 461 | } 462 | 463 | /// The number of items remaining 464 | pub fn len(&self) -> usize { 465 | match self { 466 | Self::Direct(run) => run.len(), 467 | Self::Delta(run) => run.len(), 468 | Self::ShortRepeat(run) => run.len(), 469 | } 470 | } 471 | 472 | /// Whether the iterator is empty 473 | #[must_use] 474 | pub fn is_empty(&self) -> bool { 475 | self.len() == 0 476 | } 477 | } 478 | 479 | /// A fallible [`Iterator`] of [`UnsignedRleV2Run`]. 480 | pub struct UnsignedRleV2RunIter { 481 | reader: R, 482 | scratch: Vec, 483 | length: usize, 484 | } 485 | 486 | impl UnsignedRleV2RunIter { 487 | /// Returns a new [`UnsignedRleV2RunIter`]. 488 | pub fn new(reader: R, length: usize, scratch: Vec) -> Self { 489 | Self { 490 | reader, 491 | scratch, 492 | length, 493 | } 494 | } 495 | 496 | /// Returns its internal buffer 497 | pub fn into_inner(mut self) -> (R, Vec) { 498 | self.scratch.clear(); 499 | (self.reader, self.scratch) 500 | } 501 | } 502 | 503 | impl Iterator for UnsignedRleV2RunIter { 504 | type Item = Result; 505 | 506 | fn next(&mut self) -> Option { 507 | (self.length != 0).then(|| { 508 | let run = 509 | UnsignedRleV2Run::try_new(&mut self.reader, std::mem::take(&mut self.scratch))?; 510 | self.length -= run.len(); 511 | Ok(run) 512 | }) 513 | } 514 | } 515 | 516 | /// A fallible [`Iterator`] of [`i64`]. 
517 | pub struct UnsignedRleV2Iter { 518 | current: Option, 519 | runs: UnsignedRleV2RunIter, 520 | } 521 | 522 | impl UnsignedRleV2Iter { 523 | /// Returns a new [`SignedRleV2Iter`]. 524 | pub fn new(reader: R, length: usize, scratch: Vec) -> Self { 525 | Self { 526 | runs: UnsignedRleV2RunIter::new(reader, length, scratch), 527 | current: None, 528 | } 529 | } 530 | 531 | /// Returns its internal buffer 532 | pub fn into_inner(self) -> (R, Vec) { 533 | self.runs.into_inner() 534 | } 535 | } 536 | 537 | impl Iterator for UnsignedRleV2Iter { 538 | type Item = Result; 539 | 540 | #[inline] 541 | fn next(&mut self) -> Option { 542 | let next = if let Some(run) = &mut self.current { 543 | match run { 544 | UnsignedRleV2Run::Direct(values_iter) => values_iter.next(), 545 | UnsignedRleV2Run::Delta(values_iter) => values_iter.next(), 546 | UnsignedRleV2Run::ShortRepeat(values_iter) => values_iter.next(), 547 | } 548 | } else { 549 | None 550 | }; 551 | 552 | if next.is_none() { 553 | match self.runs.next()? 
{ 554 | Ok(run) => self.current = Some(run), 555 | Err(e) => return Some(Err(e)), 556 | } 557 | self.next() 558 | } else { 559 | next.map(Ok) 560 | } 561 | } 562 | } 563 | 564 | #[derive(Debug)] 565 | pub struct SignedDirectRun(UnsignedDirectRun); 566 | 567 | impl SignedDirectRun { 568 | pub fn try_new(header: u8, reader: &mut R, scratch: Vec) -> Result { 569 | UnsignedDirectRun::try_new(header, reader, scratch).map(Self) 570 | } 571 | 572 | pub fn len(&self) -> usize { 573 | self.0.len() 574 | } 575 | 576 | /// Whether the iterator is empty 577 | #[must_use] 578 | pub fn is_empty(&self) -> bool { 579 | self.len() == 0 580 | } 581 | } 582 | 583 | impl Iterator for SignedDirectRun { 584 | type Item = i64; 585 | 586 | fn next(&mut self) -> Option { 587 | self.0.next().map(zigzag) 588 | } 589 | 590 | fn size_hint(&self) -> (usize, Option) { 591 | self.0.size_hint() 592 | } 593 | } 594 | 595 | #[derive(Debug)] 596 | pub struct SignedShortRepeat(UnsignedShortRepeat); 597 | 598 | impl SignedShortRepeat { 599 | pub fn try_new(header: u8, reader: &mut R, scratch: Vec) -> Result { 600 | UnsignedShortRepeat::try_new(header, reader, scratch).map(Self) 601 | } 602 | 603 | /// The number of items remaining 604 | pub fn len(&self) -> usize { 605 | self.0.len() 606 | } 607 | 608 | /// Whether the iterator is empty 609 | #[must_use] 610 | pub fn is_empty(&self) -> bool { 611 | self.len() == 0 612 | } 613 | } 614 | 615 | impl Iterator for SignedShortRepeat { 616 | type Item = i64; 617 | 618 | fn next(&mut self) -> Option { 619 | self.0.next().map(zigzag) 620 | } 621 | 622 | fn size_hint(&self) -> (usize, Option) { 623 | self.0.size_hint() 624 | } 625 | } 626 | 627 | /// An enum describing one of the RLE v2 runs for signed integers 628 | #[derive(Debug)] 629 | pub enum SignedRleV2Run { 630 | /// Direct 631 | Direct(SignedDirectRun), 632 | /// Delta 633 | Delta(SignedDeltaRun), 634 | /// Short repeat 635 | ShortRepeat(SignedShortRepeat), 636 | } 637 | 638 | impl SignedRleV2Run { 639 
| /// Returns a new [`SignedRleV2Run`], moving `scratch` to itself 640 | pub fn try_new(reader: &mut R, scratch: Vec) -> Result { 641 | let mut header = [0u8]; 642 | reader.read_exact(&mut header)?; 643 | let header = header[0]; 644 | let encoding = run_encoding(header); 645 | 646 | match encoding { 647 | EncodingTypeV2::Direct => { 648 | SignedDirectRun::try_new(header, reader, scratch).map(Self::Direct) 649 | } 650 | EncodingTypeV2::Delta => { 651 | SignedDeltaRun::try_new(header, reader, scratch).map(Self::Delta) 652 | } 653 | EncodingTypeV2::ShortRepeat => { 654 | SignedShortRepeat::try_new(header, reader, scratch).map(Self::ShortRepeat) 655 | } 656 | other => todo!("{other:?}"), 657 | } 658 | } 659 | 660 | /// The number of items remaining 661 | pub fn len(&self) -> usize { 662 | match self { 663 | Self::Direct(run) => run.len(), 664 | Self::Delta(run) => run.len(), 665 | Self::ShortRepeat(run) => run.len(), 666 | } 667 | } 668 | 669 | /// Whether the iterator is empty 670 | #[must_use] 671 | pub fn is_empty(&self) -> bool { 672 | self.len() == 0 673 | } 674 | } 675 | 676 | /// A fallible [`Iterator`] of [`SignedRleV2Run`]. 677 | pub struct SignedRleV2RunIter { 678 | reader: R, 679 | scratch: Vec, 680 | length: usize, 681 | } 682 | 683 | impl SignedRleV2RunIter { 684 | /// Returns a new [`SignedRleV2RunIter`]. 685 | pub fn new(reader: R, length: usize, scratch: Vec) -> Self { 686 | Self { 687 | reader, 688 | scratch, 689 | length, 690 | } 691 | } 692 | 693 | pub fn into_inner(mut self) -> (R, Vec) { 694 | self.scratch.clear(); 695 | (self.reader, self.scratch) 696 | } 697 | } 698 | 699 | impl Iterator for SignedRleV2RunIter { 700 | type Item = Result; 701 | 702 | #[inline] 703 | fn next(&mut self) -> Option { 704 | (self.length != 0).then(|| { 705 | let run = SignedRleV2Run::try_new(&mut self.reader, std::mem::take(&mut self.scratch))?; 706 | self.length -= run.len(); 707 | Ok(run) 708 | }) 709 | } 710 | } 711 | 712 | /// A fallible [`Iterator`] of [`i64`]. 
713 | pub struct SignedRleV2Iter { 714 | current: Option, 715 | runs: SignedRleV2RunIter, 716 | } 717 | 718 | impl SignedRleV2Iter { 719 | /// Returns a new [`SignedRleV2Iter`]. 720 | pub fn new(reader: R, length: usize, scratch: Vec) -> Self { 721 | Self { 722 | runs: SignedRleV2RunIter::new(reader, length, scratch), 723 | current: None, 724 | } 725 | } 726 | 727 | /// Returns its internal buffer 728 | pub fn into_inner(self) -> (R, Vec) { 729 | self.runs.into_inner() 730 | } 731 | } 732 | 733 | impl Iterator for SignedRleV2Iter { 734 | type Item = Result; 735 | 736 | #[inline] 737 | fn next(&mut self) -> Option { 738 | let next = if let Some(run) = &mut self.current { 739 | match run { 740 | SignedRleV2Run::Direct(values_iter) => values_iter.next(), 741 | SignedRleV2Run::Delta(values_iter) => values_iter.next(), 742 | SignedRleV2Run::ShortRepeat(values_iter) => values_iter.next(), 743 | } 744 | } else { 745 | None 746 | }; 747 | 748 | if next.is_none() { 749 | match self.runs.next()? { 750 | Ok(run) => self.current = Some(run), 751 | Err(e) => return Some(Err(e)), 752 | } 753 | self.next() 754 | } else { 755 | next.map(Ok) 756 | } 757 | } 758 | } 759 | 760 | #[cfg(test)] 761 | mod test { 762 | use super::*; 763 | 764 | #[test] 765 | fn test_zigzag() { 766 | assert_eq!(zigzag(2), 1); 767 | assert_eq!(zigzag(4), 2); 768 | } 769 | 770 | #[test] 771 | fn unpacking() { 772 | let bytes = [0b01000000u8]; 773 | assert_eq!(unpack(&bytes, 2, 0), 1); 774 | assert_eq!(unpack(&bytes, 2, 1), 0); 775 | } 776 | 777 | #[test] 778 | fn short_repeat() { 779 | // [10000, 10000, 10000, 10000, 10000] 780 | let data: [u8; 3] = [0x0a, 0x27, 0x10]; 781 | 782 | let a = UnsignedShortRepeat::try_new(data[0], &mut &data[1..], vec![]) 783 | .unwrap() 784 | .collect::>(); 785 | assert_eq!(a, vec![10000, 10000, 10000, 10000, 10000]); 786 | } 787 | 788 | #[test] 789 | fn direct() { 790 | // [23713, 43806, 57005, 48879] 791 | let data: [u8; 10] = [0x5e, 0x03, 0x5c, 0xa1, 0xab, 0x1e, 0xde, 0xad, 
0xbe, 0xef]; 792 | 793 | let data = &mut data.as_ref(); 794 | 795 | let a = UnsignedDirectRun::try_new(data[0], &mut &data[1..], vec![]) 796 | .unwrap() 797 | .collect::>(); 798 | assert_eq!(a, vec![23713, 43806, 57005, 48879]); 799 | } 800 | 801 | #[test] 802 | fn delta() { 803 | // [2, 3, 5, 7, 11, 13, 17, 19, 23, 29] 804 | // 0x22 = 34 805 | // 0x42 = 66 806 | // 0x46 = 70 807 | let data: [u8; 8] = [0xc6, 0x09, 0x02, 0x02, 0x22, 0x42, 0x42, 0x46]; 808 | 809 | let data = &mut data.as_ref(); 810 | 811 | let a = UnsignedDeltaRun::try_new(data[0], &mut &data[1..], vec![]) 812 | .unwrap() 813 | .collect::>(); 814 | assert_eq!(a, vec![2, 3, 5, 7, 11, 13, 17, 19, 23, 29]); 815 | } 816 | } 817 | -------------------------------------------------------------------------------- /src/read/decode/variable_length.rs: -------------------------------------------------------------------------------- 1 | use crate::error::Error; 2 | 3 | use std::io::Read; 4 | 5 | pub struct Values { 6 | reader: R, 7 | scratch: Vec, 8 | } 9 | 10 | impl Values { 11 | pub fn new(reader: R, scratch: Vec) -> Self { 12 | Self { reader, scratch } 13 | } 14 | 15 | pub fn next(&mut self, length: usize) -> Result<&[u8], Error> { 16 | self.scratch.clear(); 17 | self.scratch.reserve(length); 18 | (&mut self.reader) 19 | .take(length as u64) 20 | .read_to_end(&mut self.scratch)?; 21 | 22 | Ok(&self.scratch) 23 | } 24 | 25 | pub fn into_inner(self) -> Vec { 26 | self.scratch 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/read/decompress/mod.rs: -------------------------------------------------------------------------------- 1 | //! 
Contains [`Decompressor`] 2 | use std::io::Read; 3 | 4 | use fallible_streaming_iterator::FallibleStreamingIterator; 5 | 6 | use crate::error::Error; 7 | use crate::proto::CompressionKind; 8 | 9 | fn decode_header(bytes: &[u8]) -> (bool, usize) { 10 | let a: [u8; 3] = (&bytes[..3]).try_into().unwrap(); 11 | let a = [0, a[0], a[1], a[2]]; 12 | let length = u32::from_le_bytes(a); 13 | let is_original = a[1] & 1 == 1; 14 | let length = (length >> (8 + 1)) as usize; 15 | 16 | (is_original, length) 17 | } 18 | 19 | enum State<'a> { 20 | Original(&'a [u8]), 21 | Compressed(Vec), 22 | } 23 | 24 | struct DecompressorIter<'a> { 25 | stream: &'a [u8], 26 | current: Option>, // when we have compression but the value is original 27 | compression: CompressionKind, 28 | scratch: Vec, 29 | } 30 | 31 | impl<'a> DecompressorIter<'a> { 32 | pub fn new(stream: &'a [u8], compression: CompressionKind, scratch: Vec) -> Self { 33 | Self { 34 | stream, 35 | current: None, 36 | compression, 37 | scratch, 38 | } 39 | } 40 | 41 | pub fn into_inner(self) -> Vec { 42 | match self.current { 43 | Some(State::Compressed(some)) => some, 44 | _ => self.scratch, 45 | } 46 | } 47 | } 48 | 49 | impl<'a> FallibleStreamingIterator for DecompressorIter<'a> { 50 | type Item = [u8]; 51 | 52 | type Error = Error; 53 | 54 | #[inline] 55 | fn advance(&mut self) -> Result<(), Self::Error> { 56 | if self.stream.is_empty() { 57 | self.current = None; 58 | return Ok(()); 59 | } 60 | match self.compression { 61 | CompressionKind::None => { 62 | // todo: take stratch from current State::Compressed for re-use 63 | self.current = Some(State::Original(self.stream)); 64 | self.stream = &[]; 65 | } 66 | CompressionKind::Zlib => { 67 | // todo: take stratch from current State::Compressed for re-use 68 | let (is_original, length) = decode_header(self.stream); 69 | self.stream = &self.stream[3..]; 70 | let (maybe_compressed, remaining) = self.stream.split_at(length); 71 | self.stream = remaining; 72 | if is_original { 73 | 
self.current = Some(State::Original(maybe_compressed)); 74 | } else { 75 | let mut gz = flate2::read::DeflateDecoder::new(maybe_compressed); 76 | self.scratch.clear(); 77 | gz.read_to_end(&mut self.scratch)?; 78 | self.current = Some(State::Compressed(std::mem::take(&mut self.scratch))); 79 | } 80 | } 81 | _ => todo!(), 82 | }; 83 | Ok(()) 84 | } 85 | 86 | #[inline] 87 | fn get(&self) -> Option<&Self::Item> { 88 | self.current.as_ref().map(|x| match x { 89 | State::Original(x) => *x, 90 | State::Compressed(x) => x.as_ref(), 91 | }) 92 | } 93 | } 94 | 95 | /// A [`Read`]er fulfilling the ORC specification of reading compressed data. 96 | pub struct Decompressor<'a> { 97 | decompressor: DecompressorIter<'a>, 98 | offset: usize, 99 | is_first: bool, 100 | } 101 | 102 | impl<'a> Decompressor<'a> { 103 | /// Creates a new [`Decompressor`] that will use `scratch` as a temporary region. 104 | pub fn new(stream: &'a [u8], compression: CompressionKind, scratch: Vec) -> Self { 105 | Self { 106 | decompressor: DecompressorIter::new(stream, compression, scratch), 107 | offset: 0, 108 | is_first: true, 109 | } 110 | } 111 | 112 | /// Returns the internal memory region, so it can be re-used 113 | pub fn into_inner(self) -> Vec { 114 | self.decompressor.into_inner() 115 | } 116 | } 117 | 118 | impl<'a> std::io::Read for Decompressor<'a> { 119 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result { 120 | if self.is_first { 121 | self.is_first = false; 122 | self.decompressor.advance().unwrap(); 123 | } 124 | let current = self.decompressor.get(); 125 | let current = if let Some(current) = current { 126 | if current.len() == self.offset { 127 | self.decompressor.advance().unwrap(); 128 | self.offset = 0; 129 | let current = self.decompressor.get(); 130 | if let Some(current) = current { 131 | current 132 | } else { 133 | return Ok(0); 134 | } 135 | } else { 136 | &current[self.offset..] 
137 | } 138 | } else { 139 | return Ok(0); 140 | }; 141 | 142 | if current.len() >= buf.len() { 143 | buf.copy_from_slice(&current[..buf.len()]); 144 | self.offset += buf.len(); 145 | Ok(buf.len()) 146 | } else { 147 | buf[..current.len()].copy_from_slice(current); 148 | self.offset += current.len(); 149 | Ok(current.len()) 150 | } 151 | } 152 | } 153 | 154 | #[cfg(test)] 155 | mod tests { 156 | use super::*; 157 | 158 | #[test] 159 | fn decode_uncompressed() { 160 | // 5 uncompressed = [0x0b, 0x00, 0x00] = [0b1011, 0, 0] 161 | let bytes = &[0b1011, 0, 0, 0]; 162 | 163 | let (is_original, length) = decode_header(bytes); 164 | assert!(is_original); 165 | assert_eq!(length, 5); 166 | } 167 | 168 | #[test] 169 | fn decode_compressed() { 170 | // 100_000 compressed = [0x40, 0x0d, 0x03] = [0b01000000, 0b00001101, 0b00000011] 171 | let bytes = &[0b01000000, 0b00001101, 0b00000011, 0]; 172 | 173 | let (is_original, length) = decode_header(bytes); 174 | assert!(!is_original); 175 | assert_eq!(length, 100_000); 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/read/mod.rs: -------------------------------------------------------------------------------- 1 | //! APIs to read from ORC 2 | //! 3 | //! Reading from ORC is essentially composed by: 4 | //! 1. Identify the column type based on the file's schema 5 | //! 2. Read the stripe (or part of it in projection pushdown) 6 | //! 3. For each column, select the relevant region of the stripe 7 | //! 4. 
Attach an Iterator to the region 8 | 9 | use std::io::{Read, Seek, SeekFrom}; 10 | 11 | use prost::Message; 12 | 13 | use crate::error::Error; 14 | use crate::proto::stream::Kind; 15 | use crate::proto::{CompressionKind, Footer, Metadata, PostScript, StripeFooter}; 16 | 17 | mod column; 18 | pub mod decode; 19 | pub mod decompress; 20 | pub use column::Column; 21 | 22 | const DEFAULT_FOOTER_SIZE: u64 = 16 * 1024; 23 | 24 | // see (unstable) Seek::stream_len 25 | fn stream_len(seek: &mut impl Seek) -> std::result::Result { 26 | let old_pos = seek.seek(SeekFrom::Current(0))?; 27 | let len = seek.seek(SeekFrom::End(0))?; 28 | 29 | // Avoid seeking a third time when we were already at the end of the 30 | // stream. The branch is usually way cheaper than a seek operation. 31 | if old_pos != len { 32 | seek.seek(SeekFrom::Start(old_pos))?; 33 | } 34 | 35 | Ok(len) 36 | } 37 | 38 | /// The file's metadata. 39 | #[derive(Debug)] 40 | pub struct FileMetadata { 41 | pub postscript: PostScript, 42 | pub footer: Footer, 43 | pub metadata: Metadata, 44 | } 45 | 46 | pub fn read_metadata(reader: &mut R) -> Result 47 | where 48 | R: Read + Seek, 49 | { 50 | let file_len = stream_len(reader)?; 51 | 52 | // initial read of the footer 53 | let footer_len = if file_len < DEFAULT_FOOTER_SIZE { 54 | file_len 55 | } else { 56 | DEFAULT_FOOTER_SIZE 57 | }; 58 | 59 | reader.seek(SeekFrom::End(-(footer_len as i64)))?; 60 | let mut tail_bytes = Vec::with_capacity(footer_len as usize); 61 | reader.take(footer_len).read_to_end(&mut tail_bytes)?; 62 | 63 | // The final byte of the file contains the serialized length of the Postscript, 64 | // which must be less than 256 bytes. 
65 | let postscript_len = tail_bytes[tail_bytes.len() - 1] as usize; 66 | tail_bytes.truncate(tail_bytes.len() - 1); 67 | 68 | // next is the postscript 69 | let postscript = PostScript::decode(&tail_bytes[tail_bytes.len() - postscript_len..])?; 70 | tail_bytes.truncate(tail_bytes.len() - postscript_len); 71 | 72 | // next is the footer 73 | let footer_length = postscript.footer_length.ok_or(Error::OutOfSpec)? as usize; // todo: throw error 74 | 75 | let footer = &tail_bytes[tail_bytes.len() - footer_length..]; 76 | let footer = deserialize_footer(footer, postscript.compression())?; 77 | tail_bytes.truncate(tail_bytes.len() - footer_length); 78 | 79 | // finally the metadata 80 | let metadata_length = postscript.metadata_length.ok_or(Error::OutOfSpec)? as usize; // todo: throw error 81 | let metadata = &tail_bytes[tail_bytes.len() - metadata_length..]; 82 | let metadata = deserialize_footer_metadata(metadata, postscript.compression())?; 83 | 84 | Ok(FileMetadata { 85 | postscript, 86 | footer, 87 | metadata, 88 | }) 89 | } 90 | 91 | /// Reads, decompresses and deserializes the stripe's footer as [`StripeFooter`] using 92 | /// `scratch` as an intermediary memory region. 93 | /// # Implementation 94 | /// This function is guaranteed to perform exactly one seek and one read to `reader`. 95 | pub fn read_stripe_footer( 96 | reader: &mut R, 97 | metadata: &FileMetadata, 98 | stripe: usize, 99 | scratch: &mut Vec, 100 | ) -> Result { 101 | let stripe = &metadata.footer.stripes[stripe]; 102 | 103 | let start = stripe.offset() + stripe.index_length() + stripe.data_length(); 104 | let len = stripe.footer_length(); 105 | reader.seek(SeekFrom::Start(start))?; 106 | 107 | scratch.clear(); 108 | scratch.reserve(len as usize); 109 | reader.take(len).read_to_end(scratch)?; 110 | deserialize_stripe_footer(scratch, metadata.postscript.compression()) 111 | } 112 | 113 | /// Reads `column` from the stripe into a [`Column`]. 
114 | /// `scratch` becomes owned by [`Column`], which you can recover via `into_inner`. 115 | /// # Implementation 116 | /// This function is guaranteed to perform exactly one seek and one read to `reader`. 117 | pub fn read_stripe_column( 118 | reader: &mut R, 119 | metadata: &FileMetadata, 120 | stripe: usize, 121 | footer: StripeFooter, 122 | column: u32, 123 | mut scratch: Vec, 124 | ) -> Result { 125 | let stripe = &metadata.footer.stripes[stripe]; 126 | 127 | let mut start = 0; // the start of the stream 128 | 129 | let start = footer 130 | .streams 131 | .iter() 132 | .map(|stream| { 133 | start += stream.length(); 134 | (start, stream) 135 | }) 136 | .find(|(_, stream)| stream.column() == column && stream.kind() != Kind::RowIndex) 137 | .map(|(start, stream)| start - stream.length()) 138 | .ok_or(Error::InvalidColumn(column))?; 139 | 140 | let length = footer 141 | .streams 142 | .iter() 143 | .filter(|stream| stream.column() == column && stream.kind() != Kind::RowIndex) 144 | .fold(0, |acc, stream| acc + stream.length()); 145 | 146 | let start = stripe.offset() + start; 147 | reader.seek(SeekFrom::Start(start))?; 148 | 149 | scratch.clear(); 150 | scratch.reserve(length as usize); 151 | reader.take(length).read_to_end(&mut scratch)?; 152 | Ok(Column::new( 153 | scratch, 154 | column, 155 | stripe.number_of_rows(), 156 | footer, 157 | metadata.postscript.compression(), 158 | )) 159 | } 160 | 161 | fn deserialize_footer(bytes: &[u8], compression: CompressionKind) -> Result { 162 | let mut buffer = vec![]; 163 | decompress::Decompressor::new(bytes, compression, vec![]).read_to_end(&mut buffer)?; 164 | Ok(Footer::decode(&*buffer)?) 165 | } 166 | 167 | fn deserialize_footer_metadata( 168 | bytes: &[u8], 169 | compression: CompressionKind, 170 | ) -> Result { 171 | let mut buffer = vec![]; 172 | decompress::Decompressor::new(bytes, compression, vec![]).read_to_end(&mut buffer)?; 173 | Ok(Metadata::decode(&*buffer)?) 
174 | } 175 | 176 | fn deserialize_stripe_footer( 177 | bytes: &[u8], 178 | compression: CompressionKind, 179 | ) -> Result { 180 | let mut buffer = vec![]; 181 | decompress::Decompressor::new(bytes, compression, vec![]).read_to_end(&mut buffer)?; 182 | Ok(StripeFooter::decode(&*buffer)?) 183 | } 184 | -------------------------------------------------------------------------------- /tests/it/deserialize.rs: -------------------------------------------------------------------------------- 1 | use orc_format::{ 2 | error::Error, 3 | proto::{column_encoding::Kind as ColumnEncodingKind, stream::Kind}, 4 | read, 5 | read::decode::{ 6 | BooleanIter, SignedRleV2Iter, SignedRleV2Run, SignedRleV2RunIter, UnsignedRleV2Run, 7 | UnsignedRleV2RunIter, 8 | }, 9 | read::decompress::Decompressor, 10 | read::Column, 11 | }; 12 | 13 | fn deserialize_validity(column: &Column, scratch: &mut Vec) -> Result, Error> { 14 | let mut reader = column.get_stream(Kind::Present, std::mem::take(scratch))?; 15 | 16 | let mut validity = Vec::with_capacity(column.number_of_rows()); 17 | BooleanIter::new(&mut reader, column.number_of_rows()).try_for_each(|item| { 18 | validity.push(item?); 19 | Result::<(), Error>::Ok(()) 20 | })?; 21 | 22 | *scratch = std::mem::take(&mut reader.into_inner()); 23 | 24 | Ok(validity) 25 | } 26 | 27 | pub fn deserialize_f32_array(column: &Column) -> Result<(Vec, Vec), Error> { 28 | let mut scratch = vec![]; 29 | 30 | let validity = deserialize_validity(column, &mut scratch)?; 31 | 32 | let reader = column.get_stream(Kind::Data, scratch)?; 33 | 34 | let num_of_values: usize = validity.iter().map(|x| *x as usize).sum(); 35 | 36 | let mut valid_values = Vec::with_capacity(num_of_values); 37 | let mut iter = read::decode::FloatIter::::new(reader, num_of_values); 38 | iter.try_for_each(|item| { 39 | valid_values.push(item?); 40 | Result::<(), Error>::Ok(()) 41 | })?; 42 | 43 | let _ = iter.into_inner(); 44 | 45 | Ok((validity, valid_values)) 46 | } 47 | 48 | pub fn 
deserialize_int_array(column: &Column) -> Result<(Vec, Vec), Error> { 49 | let mut scratch = vec![]; 50 | 51 | let validity = deserialize_validity(column, &mut scratch)?; 52 | 53 | let num_of_values: usize = validity.iter().map(|x| *x as usize).sum(); 54 | 55 | let reader = column.get_stream(Kind::Data, scratch)?; 56 | 57 | let mut valid_values = Vec::with_capacity(num_of_values); 58 | 59 | let mut iter = SignedRleV2RunIter::new(reader, num_of_values, vec![]); 60 | 61 | iter.try_for_each(|run| { 62 | run.map(|run| match run { 63 | SignedRleV2Run::Direct(values) => valid_values.extend(values), 64 | SignedRleV2Run::Delta(values) => valid_values.extend(values), 65 | SignedRleV2Run::ShortRepeat(values) => valid_values.extend(values), 66 | }) 67 | })?; 68 | 69 | let (_, _) = iter.into_inner(); 70 | 71 | // test the other iterator 72 | let reader = column.get_stream(Kind::Data, vec![])?; 73 | 74 | let mut valid_values1 = Vec::with_capacity(num_of_values); 75 | SignedRleV2Iter::new(reader, num_of_values, vec![]).try_for_each(|item| { 76 | valid_values1.push(item?); 77 | Result::<(), Error>::Ok(()) 78 | })?; 79 | assert_eq!(valid_values1, valid_values); 80 | 81 | Ok((validity, valid_values)) 82 | } 83 | 84 | pub fn deserialize_bool_array(column: &Column) -> Result<(Vec, Vec), Error> { 85 | let mut scratch = vec![]; 86 | 87 | let validity = deserialize_validity(column, &mut scratch)?; 88 | 89 | let num_of_values: usize = validity.iter().map(|x| *x as usize).sum(); 90 | 91 | let reader = column.get_stream(Kind::Data, std::mem::take(&mut scratch))?; 92 | 93 | let mut valid_values = Vec::with_capacity(num_of_values); 94 | 95 | let mut iter = BooleanIter::new(reader, num_of_values); 96 | iter.try_for_each(|item| { 97 | valid_values.push(item?); 98 | Result::<(), Error>::Ok(()) 99 | })?; 100 | 101 | let _ = iter.into_inner(); 102 | 103 | Ok((validity, valid_values)) 104 | } 105 | 106 | pub fn deserialize_str( 107 | mut lengths: UnsignedRleV2RunIter, 108 | values: &mut 
read::decode::Values, 109 | num_of_values: usize, 110 | ) -> Result, Error> { 111 | let mut result = Vec::with_capacity(num_of_values); 112 | 113 | for run in lengths.by_ref() { 114 | let f = |length| { 115 | values.next(length as usize).and_then(|x| { 116 | std::str::from_utf8(x) 117 | .map(|x| x.to_string()) 118 | .map_err(|_| Error::InvalidUtf8) 119 | }) 120 | }; 121 | match run? { 122 | UnsignedRleV2Run::Direct(lengths) => lengths.map(f).try_for_each(|x| { 123 | result.push(x?); 124 | Result::<_, Error>::Ok(()) 125 | }), 126 | UnsignedRleV2Run::Delta(lengths) => lengths.map(f).try_for_each(|x| { 127 | result.push(x?); 128 | Result::<_, Error>::Ok(()) 129 | }), 130 | UnsignedRleV2Run::ShortRepeat(lengths) => lengths.map(f).try_for_each(|x| { 131 | result.push(x?); 132 | Result::<_, Error>::Ok(()) 133 | }), 134 | }? 135 | } 136 | 137 | let (_, _) = lengths.into_inner(); 138 | 139 | Ok(result) 140 | } 141 | 142 | pub fn deserialize_str_dict_array( 143 | column: &Column, 144 | scratch: Vec, 145 | num_of_values: usize, 146 | ) -> Result, Error> { 147 | let values = column.get_stream(Kind::DictionaryData, scratch)?; 148 | 149 | let mut values_iter = read::decode::Values::new(values, vec![]); 150 | 151 | let scratch2 = vec![]; 152 | let lengths = column.get_stream(Kind::Length, scratch2)?; 153 | 154 | let lengths = UnsignedRleV2RunIter::new(lengths, column.dictionary_size().unwrap(), vec![]); 155 | 156 | let values = deserialize_str(lengths, &mut values_iter, 0)?; 157 | let scratch = values_iter.into_inner(); 158 | 159 | let indices = column.get_stream(Kind::Data, scratch)?; 160 | let mut indices = UnsignedRleV2RunIter::new(indices, column.number_of_rows(), vec![]); 161 | 162 | let f = |x| values.get(x as usize).cloned().ok_or(Error::OutOfSpec); 163 | 164 | let mut result = Vec::with_capacity(num_of_values); 165 | for run in indices.by_ref() { 166 | run.and_then(|run| match run { 167 | UnsignedRleV2Run::Direct(values) => values.map(f).try_for_each(|x| { 168 | 
result.push(x?); 169 | Result::<_, Error>::Ok(()) 170 | }), 171 | UnsignedRleV2Run::Delta(values) => values.map(f).try_for_each(|x| { 172 | result.push(x?); 173 | Result::<_, Error>::Ok(()) 174 | }), 175 | UnsignedRleV2Run::ShortRepeat(values) => values.map(f).try_for_each(|x| { 176 | result.push(x?); 177 | Result::<_, Error>::Ok(()) 178 | }), 179 | })?; 180 | } 181 | 182 | let (_, _) = indices.into_inner(); 183 | 184 | Ok(result) 185 | } 186 | 187 | fn deserialize_str_array_direct( 188 | column: &Column, 189 | 190 | scratch: Vec, 191 | num_of_values: usize, 192 | ) -> Result, Error> { 193 | let values = column.get_stream(Kind::Data, scratch)?; 194 | let mut values = read::decode::Values::new(values, vec![]); 195 | 196 | let scratch1 = vec![]; 197 | let lengths = column.get_stream(Kind::Length, scratch1)?; 198 | let lengths = UnsignedRleV2RunIter::new(lengths, num_of_values, vec![]); 199 | 200 | deserialize_str(lengths, &mut values, num_of_values) 201 | } 202 | 203 | pub fn deserialize_str_array(column: &Column) -> Result<(Vec, Vec), Error> { 204 | let mut scratch = vec![]; 205 | 206 | let validity = deserialize_validity(column, &mut scratch)?; 207 | 208 | let num_of_values: usize = validity.iter().map(|x| *x as usize).sum(); 209 | 210 | // todo: generalize to other encodings 211 | let encoding = column.encoding(); 212 | let valid_values = match encoding.kind() { 213 | ColumnEncodingKind::DirectV2 => { 214 | deserialize_str_array_direct(column, scratch, num_of_values)? 215 | } 216 | ColumnEncodingKind::DictionaryV2 => { 217 | deserialize_str_dict_array(column, scratch, num_of_values)? 
218 | } 219 | other => todo!("{other:?}"), 220 | }; 221 | Ok((validity, valid_values)) 222 | } 223 | -------------------------------------------------------------------------------- /tests/it/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | mod deserialize; 4 | use deserialize::*; 5 | 6 | use orc_format::{error::Error, read, read::Column}; 7 | 8 | fn get_column(path: &str, column: u32) -> Result { 9 | // open the file, as expected. buffering this is not necessary - we 10 | // are very careful about the number of `read`s we perform. 11 | let mut f = File::open(path).expect("no file found"); 12 | 13 | // read the files' metadata 14 | let metadata = read::read_metadata(&mut f)?; 15 | 16 | // the next step is to identify which stripe we want to read. Let's say it is the first one. 17 | let stripe = 0; 18 | 19 | // Each stripe has a footer - we need to read it to extract the location of each column on it. 20 | let stripe_footer = read::read_stripe_footer(&mut f, &metadata, stripe, &mut vec![])?; 21 | 22 | // Finally, we read the column into `Column` 23 | read::read_stripe_column(&mut f, &metadata, stripe, stripe_footer, column, vec![]) 24 | } 25 | 26 | #[test] 27 | fn read_bool() -> Result<(), Error> { 28 | let column = get_column("test.orc", 2)?; 29 | 30 | let (a, b) = deserialize_bool_array(&column)?; 31 | assert_eq!(a, vec![true, true, false, true, true]); 32 | assert_eq!(b, vec![true, false, true, false]); 33 | 34 | let (_footer, _scratch) = column.into_inner(); 35 | Ok(()) 36 | } 37 | 38 | #[test] 39 | fn read_str_direct() -> Result<(), Error> { 40 | let column = get_column("test.orc", 3)?; 41 | 42 | let (a, b) = deserialize_str_array(&column)?; 43 | assert_eq!(a, vec![true, true, false, true, true]); 44 | assert_eq!(b, vec!["a", "cccccc", "ddd", "ee"]); 45 | Ok(()) 46 | } 47 | 48 | #[test] 49 | fn read_str_delta_plus() -> Result<(), Error> { 50 | let column = get_column("test.orc", 4)?; 51 | 
52 | let (a, b) = deserialize_str_array(&column)?; 53 | assert_eq!(a, vec![true, true, false, true, true]); 54 | assert_eq!(b, vec!["a", "bb", "ccc", "ddd"]); 55 | Ok(()) 56 | } 57 | 58 | #[test] 59 | fn read_str_delta_minus() -> Result<(), Error> { 60 | let column = get_column("test.orc", 5)?; 61 | 62 | let (a, b) = deserialize_str_array(&column)?; 63 | assert_eq!(a, vec![true, true, false, true, true]); 64 | assert_eq!(b, vec!["ddd", "cc", "bb", "a"]); 65 | Ok(()) 66 | } 67 | 68 | #[test] 69 | fn read_str_short_repeat() -> Result<(), Error> { 70 | let column = get_column("test.orc", 6)?; 71 | 72 | let (a, b) = deserialize_str_array(&column)?; 73 | assert_eq!(a, vec![true, true, false, true, true]); 74 | assert_eq!(b, vec!["aaaaa", "bbbbb", "ccccc", "ddddd"]); 75 | Ok(()) 76 | } 77 | 78 | #[test] 79 | fn read_f32() -> Result<(), Error> { 80 | let column = get_column("test.orc", 1)?; 81 | 82 | let (a, b) = deserialize_f32_array(&column)?; 83 | assert_eq!(a, vec![true, true, false, true, true]); 84 | assert_eq!(b, vec![1.0, 2.0, 4.0, 5.0]); 85 | Ok(()) 86 | } 87 | 88 | #[test] 89 | fn read_int_short_repeated() -> Result<(), Error> { 90 | let column = get_column("test.orc", 7)?; 91 | 92 | let (a, b) = deserialize_int_array(&column)?; 93 | assert_eq!(a, vec![true, true, false, true, true]); 94 | assert_eq!(b, vec![5, 5, 5, 5]); 95 | Ok(()) 96 | } 97 | 98 | #[test] 99 | fn read_int_neg_short_repeated() -> Result<(), Error> { 100 | let column = get_column("test.orc", 8)?; 101 | 102 | let (a, b) = deserialize_int_array(&column)?; 103 | assert_eq!(a, vec![true, true, false, true, true]); 104 | assert_eq!(b, vec![-5, -5, -5, -5]); 105 | Ok(()) 106 | } 107 | 108 | #[test] 109 | fn read_int_delta() -> Result<(), Error> { 110 | let column = get_column("test.orc", 9)?; 111 | 112 | let (a, b) = deserialize_int_array(&column)?; 113 | assert_eq!(a, vec![true, true, false, true, true]); 114 | assert_eq!(b, vec![1, 2, 4, 5]); 115 | Ok(()) 116 | } 117 | 118 | #[test] 119 | fn 
read_int_neg_delta() -> Result<(), Error> { 120 | let column = get_column("test.orc", 10)?; 121 | 122 | let (a, b) = deserialize_int_array(&column)?; 123 | assert_eq!(a, vec![true, true, false, true, true]); 124 | assert_eq!(b, vec![5, 4, 2, 1]); 125 | Ok(()) 126 | } 127 | 128 | #[test] 129 | fn read_int_direct() -> Result<(), Error> { 130 | let column = get_column("test.orc", 11)?; 131 | 132 | let (a, b) = deserialize_int_array(&column)?; 133 | assert_eq!(a, vec![true, true, false, true, true]); 134 | assert_eq!(b, vec![1, 6, 3, 2]); 135 | Ok(()) 136 | } 137 | 138 | #[test] 139 | fn read_int_neg_direct() -> Result<(), Error> { 140 | let column = get_column("test.orc", 12)?; 141 | 142 | let (a, b) = deserialize_int_array(&column)?; 143 | assert_eq!(a, vec![true, true, false, true, true]); 144 | assert_eq!(b, vec![-1, -6, -3, -2]); 145 | Ok(()) 146 | } 147 | 148 | #[test] 149 | fn read_bigint_direct() -> Result<(), Error> { 150 | let column = get_column("test.orc", 13)?; 151 | 152 | let (a, b) = deserialize_int_array(&column)?; 153 | assert_eq!(a, vec![true, true, false, true, true]); 154 | assert_eq!(b, vec![1, 6, 3, 2]); 155 | Ok(()) 156 | } 157 | 158 | #[test] 159 | fn read_bigint_neg_direct() -> Result<(), Error> { 160 | let column = get_column("test.orc", 14)?; 161 | 162 | let (a, b) = deserialize_int_array(&column)?; 163 | assert_eq!(a, vec![true, true, false, true, true]); 164 | assert_eq!(b, vec![-1, -6, -3, -2]); 165 | Ok(()) 166 | } 167 | 168 | #[test] 169 | fn read_bigint_other() -> Result<(), Error> { 170 | let column = get_column("test.orc", 15)?; 171 | 172 | let (a, b) = deserialize_int_array(&column)?; 173 | assert_eq!(a, vec![true, true, true, true, true]); 174 | assert_eq!(b, vec![5, -5, 1, 5, 5]); 175 | Ok(()) 176 | } 177 | 178 | #[test] 179 | fn read_boolean_long() -> Result<(), Error> { 180 | let column = get_column("long_bool.orc", 1)?; 181 | 182 | let (a, b) = deserialize_bool_array(&column)?; 183 | assert_eq!(a, vec![true; 32]); 184 | 
assert_eq!(b, vec![true; 32]); 185 | Ok(()) 186 | } 187 | 188 | #[test] 189 | fn read_bool_compressed() -> Result<(), Error> { 190 | let column = get_column("long_bool_gzip.orc", 1)?; 191 | 192 | let (a, b) = deserialize_bool_array(&column)?; 193 | assert_eq!(a, vec![true; 32]); 194 | assert_eq!(b, vec![true; 32]); 195 | Ok(()) 196 | } 197 | 198 | #[test] 199 | fn read_string_long() -> Result<(), Error> { 200 | let column = get_column("string_long.orc", 1)?; 201 | 202 | let (a, b) = deserialize_str_array(&column)?; 203 | assert_eq!(a, vec![true; 64]); 204 | assert_eq!( 205 | b, 206 | vec!["abcd", "efgh"] 207 | .into_iter() 208 | .cycle() 209 | .take(64) 210 | .collect::>() 211 | ); 212 | Ok(()) 213 | } 214 | 215 | #[test] 216 | fn read_string_dict() -> Result<(), Error> { 217 | let column = get_column("string_dict.orc", 1)?; 218 | 219 | let (a, b) = deserialize_str_array(&column)?; 220 | assert_eq!(a, vec![true; 64]); 221 | assert_eq!( 222 | b, 223 | vec!["abc", "efgh"] 224 | .into_iter() 225 | .cycle() 226 | .take(64) 227 | .collect::>() 228 | ); 229 | Ok(()) 230 | } 231 | 232 | #[test] 233 | fn read_string_dict_gzip() -> Result<(), Error> { 234 | let column = get_column("string_dict_gzip.orc", 1)?; 235 | 236 | let (a, b) = deserialize_str_array(&column)?; 237 | assert_eq!(a, vec![true; 64]); 238 | assert_eq!( 239 | b, 240 | vec!["abc", "efgh"] 241 | .into_iter() 242 | .cycle() 243 | .take(64) 244 | .collect::>() 245 | ); 246 | Ok(()) 247 | } 248 | 249 | #[test] 250 | fn read_string_long_long() -> Result<(), Error> { 251 | let column = get_column("string_long_long.orc", 1)?; 252 | 253 | let (a, b) = deserialize_str_array(&column)?; 254 | assert_eq!(a.len(), 10_000); 255 | assert_eq!(a, vec![true; 10_000]); 256 | assert_eq!(b.len(), 10_000); 257 | assert_eq!( 258 | b, 259 | vec!["abcd", "efgh"] 260 | .into_iter() 261 | .cycle() 262 | .take(10_000) 263 | .collect::>() 264 | ); 265 | Ok(()) 266 | } 267 | 268 | #[test] 269 | fn read_string_long_long_gzip() -> 
Result<(), Error> { 270 | let column = get_column("string_long_long_gzip.orc", 1)?; 271 | 272 | let (a, b) = deserialize_str_array(&column)?; 273 | assert_eq!(a.len(), 10_000); 274 | assert_eq!(a, vec![true; 10_000]); 275 | assert_eq!(b.len(), 10_000); 276 | assert_eq!( 277 | b, 278 | vec!["abcd", "efgh"] 279 | .into_iter() 280 | .cycle() 281 | .take(10_000) 282 | .collect::>() 283 | ); 284 | Ok(()) 285 | } 286 | 287 | #[test] 288 | fn read_f32_long_long_gzip() -> Result<(), Error> { 289 | let column = get_column("f32_long_long_gzip.orc", 1)?; 290 | 291 | let (a, b) = deserialize_f32_array(&column)?; 292 | assert_eq!(a.len(), 1_000_000); 293 | assert_eq!(a, vec![true; 1_000_000]); 294 | assert_eq!(b.len(), 1_000_000); 295 | Ok(()) 296 | } 297 | 298 | #[test] 299 | fn read_string_increase() -> Result<(), Error> { 300 | let column = get_column("test.orc", 16)?; 301 | 302 | let (a, b) = deserialize_str_array(&column)?; 303 | assert_eq!(a, vec![true; 5]); 304 | assert_eq!(b, vec!["a", "bb", "ccc", "dddd", "eeeee"]); 305 | Ok(()) 306 | } 307 | 308 | #[test] 309 | fn read_string_decrease() -> Result<(), Error> { 310 | let column = get_column("test.orc", 17)?; 311 | 312 | let (a, b) = deserialize_str_array(&column)?; 313 | assert_eq!(a, vec![true; 5]); 314 | assert_eq!(b, vec!["eeeee", "dddd", "ccc", "bb", "a"]); 315 | Ok(()) 316 | } 317 | -------------------------------------------------------------------------------- /write.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pyorc 4 | 5 | data = { 6 | "a": [1.0, 2.0, None, 4.0, 5.0], 7 | "b": [True, False, None, True, False], 8 | "str_direct": ["a", "cccccc", None, "ddd", "ee"], 9 | "d": ["a", "bb", None, "ccc", "ddd"], 10 | "e": ["ddd", "cc", None, "bb", "a"], 11 | "f": ["aaaaa", "bbbbb", None, "ccccc", "ddddd"], 12 | "int_short_repeated": [5, 5, None, 5, 5], 13 | "int_neg_short_repeated": [-5, -5, None, -5, -5], 14 | "int_delta": [1, 2, None, 4, 5], 15 
| "int_neg_delta": [5, 4, None, 2, 1], 16 | "int_direct": [1, 6, None, 3, 2], 17 | "int_neg_direct": [-1, -6, None, -3, -2], 18 | "bigint_direct": [1, 6, None, 3, 2], 19 | "bigint_neg_direct": [-1, -6, None, -3, -2], 20 | "bigint_other": [5, -5, 1, 5, 5], 21 | "utf8_increase": ["a", "bb", "ccc", "dddd", "eeeee"], 22 | "utf8_decrease": ["eeeee", "dddd", "ccc", "bb", "a"], 23 | } 24 | 25 | def infer_schema(data): 26 | schema = "struct<" 27 | for key, value in data.items(): 28 | dt = type(value[0]) 29 | if dt == float: 30 | dt = "float" 31 | elif dt == int: 32 | dt = "int" 33 | elif dt == bool: 34 | dt = "boolean" 35 | elif dt == str: 36 | dt = "string" 37 | else: 38 | raise NotImplementedError 39 | if key.startswith("double"): 40 | dt = "double" 41 | if key.startswith("bigint"): 42 | dt = "bigint" 43 | schema += key + ":" + dt + "," 44 | 45 | schema = schema[:-1] + ">" 46 | return schema 47 | 48 | 49 | 50 | def _write( 51 | schema: str, 52 | data, 53 | file_name: str, 54 | compression=pyorc.CompressionKind.NONE, 55 | dict_key_size_threshold=0.0, 56 | ): 57 | output = open(file_name, "wb") 58 | writer = pyorc.Writer( 59 | output, 60 | schema, 61 | dict_key_size_threshold=dict_key_size_threshold, 62 | # use a small number to ensure that compression crosses value boundaries 63 | compression_block_size=32, 64 | compression=compression, 65 | ) 66 | num_rows = len(list(data.values())[0]) 67 | for x in range(num_rows): 68 | row = tuple(values[x] for values in data.values()) 69 | writer.write(row) 70 | writer.close() 71 | 72 | with open(file_name, "rb") as f: 73 | reader = pyorc.Reader(f) 74 | list(reader) 75 | 76 | 77 | _write( 78 | infer_schema(data), 79 | data, 80 | "test.orc", 81 | ) 82 | 83 | data_boolean = { 84 | "long": [True] * 32, 85 | } 86 | 87 | _write("struct", data_boolean, "long_bool.orc") 88 | 89 | _write("struct", data_boolean, "long_bool_gzip.orc", pyorc.CompressionKind.ZLIB) 90 | 91 | data_dict = { 92 | "dict": ["abcd", "efgh"] * 32, 93 | } 94 | 95 | 
_write("struct", data_dict, "string_long.orc") 96 | 97 | data_dict = { 98 | "dict": ["abc", "efgh"] * 32, 99 | } 100 | 101 | _write("struct", data_dict, "string_dict.orc", dict_key_size_threshold=0.1) 102 | 103 | _write("struct", data_dict, "string_dict_gzip.orc", pyorc.CompressionKind.ZLIB) 104 | 105 | data_dict = { 106 | "dict": ["abcd", "efgh"] * (10**4 // 2), 107 | } 108 | 109 | _write("struct", data_dict, "string_long_long.orc") 110 | _write("struct", data_dict, "string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB) 111 | 112 | long_f32 = { 113 | "dict": [random.uniform(0, 1) for _ in range(10**6)], 114 | } 115 | 116 | _write("struct", long_f32, "f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB) 117 | --------------------------------------------------------------------------------