├── .config └── nextest.toml ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── benches └── arrow_reader.rs ├── codecov.yml ├── deny.toml ├── format └── orc_proto.proto ├── gen ├── Cargo.toml └── src │ └── main.rs ├── licenserc.toml ├── regen.sh ├── scripts ├── README.md ├── convert_tpch.py ├── generate-tpch.sh ├── generate_arrow.py ├── generate_orc.py ├── generate_orc_timestamps.py ├── prepare-test-data.sh ├── setup-venv.sh └── write.py ├── src ├── array_decoder │ ├── decimal.rs │ ├── list.rs │ ├── map.rs │ ├── mod.rs │ ├── string.rs │ ├── struct_decoder.rs │ ├── timestamp.rs │ └── union.rs ├── arrow_reader.rs ├── arrow_writer.rs ├── async_arrow_reader.rs ├── bin │ ├── orc-export.rs │ ├── orc-metadata.rs │ └── orc-stats.rs ├── column.rs ├── compression.rs ├── encoding │ ├── boolean.rs │ ├── byte.rs │ ├── decimal.rs │ ├── float.rs │ ├── integer │ │ ├── mod.rs │ │ ├── rle_v1.rs │ │ ├── rle_v2 │ │ │ ├── delta.rs │ │ │ ├── direct.rs │ │ │ ├── mod.rs │ │ │ ├── patched_base.rs │ │ │ └── short_repeat.rs │ │ └── util.rs │ ├── mod.rs │ ├── rle.rs │ ├── timestamp.rs │ └── util.rs ├── error.rs ├── lib.rs ├── memory.rs ├── projection.rs ├── proto.rs ├── reader │ ├── metadata.rs │ └── mod.rs ├── schema.rs ├── statistics.rs ├── stripe.rs └── writer │ ├── column.rs │ ├── mod.rs │ └── stripe.rs ├── taplo.toml ├── tests ├── basic │ ├── data │ │ ├── alltypes.lz4.orc │ │ ├── alltypes.lzo.orc │ │ ├── alltypes.none.orc │ │ ├── alltypes.snappy.orc │ │ ├── alltypes.zlib.orc │ │ ├── alltypes.zstd.orc │ │ ├── demo-11-zlib.orc │ │ ├── demo-12-zlib.orc │ │ ├── f32_long_long_gzip.orc │ │ ├── long_bool.orc │ │ ├── long_bool_gzip.orc │ │ ├── nested_array.orc │ │ ├── nested_array_float.orc │ │ ├── nested_array_struct.orc │ │ ├── nested_map.orc │ │ ├── nested_map_struct.orc │ │ ├── nested_struct.orc │ │ ├── overflowing_timestamps.orc │ │ ├── pyarrow_timestamps.orc │ │ ├── string_dict.orc │ │ ├── string_dict_gzip.orc │ │ ├── 
string_long.orc │ │ ├── string_long_long.orc │ │ ├── string_long_long_gzip.orc │ │ └── test.orc │ ├── main.rs │ └── misc.rs └── integration │ ├── data │ ├── README.md │ ├── TestCSVFileImport.test10rows.csv │ ├── TestCSVFileImport.testTimezoneOption.csv │ ├── TestOrcFile.columnProjection.orc │ ├── TestOrcFile.emptyFile.orc │ ├── TestOrcFile.metaData.orc │ ├── TestOrcFile.test1.orc │ ├── TestOrcFile.testDate1900.orc │ ├── TestOrcFile.testDate2038.orc │ ├── TestOrcFile.testMemoryManagementV11.orc │ ├── TestOrcFile.testMemoryManagementV12.orc │ ├── TestOrcFile.testPredicatePushdown.orc │ ├── TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc │ ├── TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc │ ├── TestOrcFile.testSeek.orc │ ├── TestOrcFile.testSnappy.orc │ ├── TestOrcFile.testStringAndBinaryStatistics.orc │ ├── TestOrcFile.testStripeLevelStats.orc │ ├── TestOrcFile.testTimestamp.orc │ ├── TestOrcFile.testUnionAndTimestamp.orc │ ├── TestOrcFile.testWithoutCompressionBlockSize.orc │ ├── TestOrcFile.testWithoutIndex.orc │ ├── TestStringDictionary.testRowIndex.orc │ ├── TestVectorOrcFile.testLz4.orc │ ├── TestVectorOrcFile.testLzo.orc │ ├── TestVectorOrcFile.testZstd.0.12.orc │ ├── bad_bloom_filter_1.6.0.orc │ ├── bad_bloom_filter_1.6.11.orc │ ├── complextypes_iceberg.orc │ ├── corrupt │ │ ├── missing_blob_stream_in_string_dict.orc │ │ ├── missing_length_stream_in_string_dict.orc │ │ ├── negative_dict_entry_lengths.orc │ │ └── stripe_footer_bad_column_encodings.orc │ ├── decimal.orc │ ├── decimal64_v2.orc │ ├── decimal64_v2_cplusplus.orc │ ├── demo-11-none.orc │ ├── demo-11-zlib.orc │ ├── demo-12-zlib.orc │ ├── encrypted │ │ ├── kms.keystore │ │ ├── sample1.orc │ │ └── sample2.orc │ ├── expected │ │ ├── TestOrcFile.columnProjection.jsn.gz │ │ ├── TestOrcFile.emptyFile.jsn.gz │ │ ├── TestOrcFile.metaData.jsn.gz │ │ ├── TestOrcFile.test1.jsn.gz │ │ ├── TestOrcFile.testDate1900.jsn.gz │ │ ├── TestOrcFile.testDate2038.jsn.gz │ │ ├── 
TestOrcFile.testMemoryManagementV11.jsn.gz │ │ ├── TestOrcFile.testMemoryManagementV12.jsn.gz │ │ ├── TestOrcFile.testPredicatePushdown.jsn.gz │ │ ├── TestOrcFile.testSeek.jsn.gz │ │ ├── TestOrcFile.testSnappy.jsn.gz │ │ ├── TestOrcFile.testStringAndBinaryStatistics.jsn.gz │ │ ├── TestOrcFile.testStripeLevelStats.jsn.gz │ │ ├── TestOrcFile.testTimestamp.jsn.gz │ │ ├── TestOrcFile.testUnionAndTimestamp.jsn.gz │ │ ├── TestOrcFile.testWithoutIndex.jsn.gz │ │ ├── TestStringDictionary.testRowIndex.jsn.gz │ │ ├── TestVectorOrcFile.testLz4.jsn.gz │ │ ├── TestVectorOrcFile.testLzo.jsn.gz │ │ ├── decimal.jsn.gz │ │ ├── demo-12-zlib.jsn.gz │ │ ├── nulls-at-end-snappy.jsn.gz │ │ ├── orc-file-11-format.jsn.gz │ │ ├── orc_index_int_string.jsn.gz │ │ ├── orc_split_elim.jsn.gz │ │ ├── orc_split_elim_cpp.jsn.gz │ │ ├── orc_split_elim_new.jsn.gz │ │ └── over1k_bloom.jsn.gz │ ├── expected_arrow │ │ ├── TestOrcFile.columnProjection.feather │ │ ├── TestOrcFile.emptyFile.feather │ │ ├── TestOrcFile.metaData.feather │ │ ├── TestOrcFile.test1.feather │ │ ├── TestOrcFile.testDate1900.feather │ │ ├── TestOrcFile.testDate2038.feather │ │ ├── TestOrcFile.testMemoryManagementV11.feather │ │ ├── TestOrcFile.testMemoryManagementV12.feather │ │ ├── TestOrcFile.testPredicatePushdown.feather │ │ ├── TestOrcFile.testSeek.feather │ │ ├── TestOrcFile.testSnappy.feather │ │ ├── TestOrcFile.testStringAndBinaryStatistics.feather │ │ ├── TestOrcFile.testStripeLevelStats.feather │ │ ├── TestOrcFile.testUnionAndTimestamp.feather │ │ ├── TestOrcFile.testWithoutIndex.feather │ │ ├── TestStringDictionary.testRowIndex.feather │ │ ├── TestVectorOrcFile.testLz4.feather │ │ ├── TestVectorOrcFile.testLzo.feather │ │ ├── decimal.feather │ │ ├── demo-12-zlib.feather │ │ ├── nulls-at-end-snappy.feather │ │ ├── orc-file-11-format.feather │ │ ├── orc_index_int_string.feather │ │ ├── orc_split_elim.feather │ │ ├── orc_split_elim_cpp.feather │ │ ├── orc_split_elim_new.feather │ │ └── over1k_bloom.feather │ ├── 
nulls-at-end-snappy.orc │ ├── orc-file-11-format.orc │ ├── orc_index_int_string.orc │ ├── orc_no_format.orc │ ├── orc_split_elim.orc │ ├── orc_split_elim_cpp.orc │ ├── orc_split_elim_new.orc │ ├── over1k_bloom.orc │ ├── version1999.orc │ └── zero.orc │ └── main.rs └── typos.toml /.config/nextest.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [profile.default] 19 | slow-timeout = { period = "60s", terminate-after = 3, grace-period = "30s" } 20 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | on: 19 | pull_request: 20 | types: [opened, synchronize, reopened, ready_for_review] 21 | paths-ignore: 22 | - 'docs/**' 23 | - 'config/**' 24 | - '**.md' 25 | - '.dockerignore' 26 | - 'docker/**' 27 | - '.gitignore' 28 | push: 29 | branches: 30 | - develop 31 | - main 32 | paths-ignore: 33 | - 'docs/**' 34 | - 'config/**' 35 | - '**.md' 36 | - '.dockerignore' 37 | - 'docker/**' 38 | - '.gitignore' 39 | workflow_dispatch: 40 | 41 | name: CI 42 | 43 | env: 44 | RUST_TOOLCHAIN: stable 45 | 46 | jobs: 47 | typos: 48 | name: Spell Check with Typos 49 | runs-on: ubuntu-latest 50 | steps: 51 | - uses: actions/checkout@v3 52 | - uses: crate-ci/typos@v1.13.10 53 | 54 | check: 55 | name: Check 56 | if: github.event.pull_request.draft == false 57 | runs-on: ubuntu-latest 58 | timeout-minutes: 60 59 | strategy: 60 | matrix: 61 | features: 62 | - '' 63 | - '--no-default-features' 64 | - '--all-features' 65 | steps: 66 | - uses: actions/checkout@v3 67 | - uses: dtolnay/rust-toolchain@master 68 | with: 69 | toolchain: ${{ env.RUST_TOOLCHAIN }} 70 | - name: Rust Cache 71 | uses: Swatinem/rust-cache@v2 72 | - name: Run cargo check 73 | run: cargo check --workspace --all-targets ${{ matrix.features }} 74 | 75 | toml: 76 | name: Toml Check 77 | if: github.event.pull_request.draft == false 78 | runs-on: ubuntu-latest 79 | timeout-minutes: 60 80 | steps: 81 | - uses: actions/checkout@v3 82 | - uses: dtolnay/rust-toolchain@master 83 | with: 84 | toolchain: ${{ env.RUST_TOOLCHAIN }} 85 | - name: Rust Cache 86 | uses: 
Swatinem/rust-cache@v2 87 | - name: Install taplo 88 | run: cargo install taplo-cli --version ^0.8 --locked 89 | - name: Run taplo 90 | run: taplo format --check 91 | 92 | fmt: 93 | name: Rustfmt 94 | if: github.event.pull_request.draft == false 95 | runs-on: ubuntu-latest 96 | timeout-minutes: 60 97 | steps: 98 | - uses: actions/checkout@v3 99 | - uses: dtolnay/rust-toolchain@master 100 | with: 101 | toolchain: ${{ env.RUST_TOOLCHAIN }} 102 | components: rustfmt 103 | - name: Rust Cache 104 | uses: Swatinem/rust-cache@v2 105 | - name: Run cargo fmt 106 | run: cargo fmt --all -- --check 107 | 108 | clippy: 109 | name: Clippy 110 | if: github.event.pull_request.draft == false 111 | runs-on: ubuntu-latest 112 | timeout-minutes: 60 113 | strategy: 114 | matrix: 115 | features: 116 | - '' 117 | - '--no-default-features' 118 | - '--all-features' 119 | steps: 120 | - uses: actions/checkout@v3 121 | - uses: dtolnay/rust-toolchain@master 122 | with: 123 | toolchain: ${{ env.RUST_TOOLCHAIN }} 124 | components: clippy 125 | - name: Rust Cache 126 | uses: Swatinem/rust-cache@v2 127 | - name: Run cargo clippy 128 | run: cargo clippy --workspace --all-targets ${{ matrix.features }} -- -D warnings 129 | 130 | license-header: 131 | name: Check license header 132 | if: github.event.pull_request.draft == false 133 | runs-on: ubuntu-latest 134 | steps: 135 | - uses: actions/checkout@v4 136 | - name: Check license headers 137 | uses: korandoru/hawkeye@v5 138 | 139 | cargo-deny: 140 | name: Cargo Deny License Check 141 | if: github.event.pull_request.draft == false 142 | runs-on: ubuntu-latest 143 | steps: 144 | - uses: actions/checkout@v3 145 | - uses: EmbarkStudios/cargo-deny-action@v1 146 | with: 147 | command: check license 148 | 149 | coverage: 150 | if: github.event.pull_request.draft == false 151 | runs-on: ubuntu-latest 152 | timeout-minutes: 60 153 | needs: [clippy] 154 | steps: 155 | - uses: actions/checkout@v3 156 | - uses: KyleMayes/install-llvm-action@v1 157 | with: 158 | 
version: "14.0" 159 | - name: Install toolchain 160 | uses: dtolnay/rust-toolchain@master 161 | with: 162 | toolchain: ${{ env.RUST_TOOLCHAIN }} 163 | components: llvm-tools-preview 164 | - name: Rust Cache 165 | uses: Swatinem/rust-cache@v2 166 | - name: Install latest nextest release 167 | uses: taiki-e/install-action@nextest 168 | - name: Install cargo-llvm-cov 169 | uses: taiki-e/install-action@cargo-llvm-cov 170 | - name: Collect coverage data 171 | run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info --all-features 172 | env: 173 | CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=lld" 174 | RUST_BACKTRACE: 1 175 | CARGO_INCREMENTAL: 0 176 | UNITTEST_LOG_DIR: "__unittest_logs" 177 | - name: Codecov upload 178 | uses: codecov/codecov-action@v2 179 | with: 180 | token: ${{ secrets.CODECOV_TOKEN }} 181 | files: ./lcov.info 182 | flags: rust 183 | fail_ci_if_error: false 184 | verbose: true 185 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb 15 | 16 | # RustRover 17 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 18 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 19 | # and can be added to the global gitignore or merged into this file. For a more nuclear 20 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
21 | .idea/ 22 | 23 | venv 24 | /benchmark_data 25 | 26 | private/ 27 | *.txt 28 | 29 | /perf.* 30 | /flamegraph.svg 31 | 32 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | [package] 19 | name = "orc-rust" 20 | version = "0.6.0" 21 | edition = "2021" 22 | homepage = "https://github.com/datafusion-contrib/orc-rust" 23 | repository = "https://github.com/datafusion-contrib/orc-rust" 24 | authors = ["Apache ORC "] 25 | license = "Apache-2.0" 26 | description = "Implementation of Apache ORC file format using Apache Arrow in-memory format" 27 | keywords = ["arrow", "orc", "arrow-rs"] 28 | include = ["src/**/*.rs", "Cargo.toml"] 29 | rust-version = "1.73" 30 | 31 | [package.metadata.docs.rs] 32 | all-features = true 33 | 34 | [dependencies] 35 | arrow = { version = ">= 53.1.0, < 55.0.0", features = ["prettyprint", "chrono-tz", "ipc_compression"] } 36 | bytemuck = { version = "1.18.0", features = ["must_cast"] } 37 | bytes = "1.4" 38 | chrono = { version = ">= 0.4.37, < 0.4.40", default-features = false, features = ["std"] } 39 | chrono-tz = "0.10" 40 | fallible-streaming-iterator = { version = "0.1" } 41 | flate2 = "1" 42 | lz4_flex = "0.11" 43 | lzokay-native = "0.1" 44 | num = "0.4.1" 45 | prost = { version = "0.13" } 46 | snafu = "0.8" 47 | snap = "1.1" 48 | zstd = "0.13" 49 | 50 | # async support 51 | async-trait = { version = "0.1.77", optional = true } 52 | futures = { version = "0.3", optional = true, default-features = false, features = ["std"] } 53 | futures-util = { version = "0.3", optional = true } 54 | tokio = { version = "1.28", optional = true, features = [ 55 | "io-util", 56 | "sync", 57 | "fs", 58 | "macros", 59 | "rt", 60 | "rt-multi-thread", 61 | ] } 62 | 63 | # cli 64 | anyhow = { version = "1.0", optional = true } 65 | clap = { version = "4.5.4", features = ["derive"], optional = true } 66 | 67 | # opendal 68 | opendal = { version = "0.50", optional = true, default-features = false } 69 | 70 | [dev-dependencies] 71 | arrow-ipc = { version = "53.0.0", features = ["lz4"] } 72 | arrow-json = "53.0.0" 73 | criterion = { version = "0.5", default-features = false, features = ["async_tokio"] } 74 | opendal = { version 
= "0.50", default-features = false, features = ["services-memory"] } 75 | pretty_assertions = "1.3.0" 76 | proptest = "1.0.0" 77 | serde_json = { version = "1.0", default-features = false, features = ["std"] } 78 | 79 | [features] 80 | default = ["async"] 81 | 82 | async = ["async-trait", "futures", "futures-util", "tokio"] 83 | cli = ["anyhow", "clap"] 84 | # Enable opendal support. 85 | opendal = ["dep:opendal"] 86 | 87 | [[bench]] 88 | name = "arrow_reader" 89 | harness = false 90 | required-features = ["async"] 91 | # Some issue when publishing and path isn't specified, so adding here 92 | path = "./benches/arrow_reader.rs" 93 | 94 | [profile.bench] 95 | debug = true 96 | 97 | [[bin]] 98 | name = "orc-metadata" 99 | required-features = ["cli"] 100 | 101 | [[bin]] 102 | name = "orc-export" 103 | required-features = ["cli"] 104 | 105 | [[bin]] 106 | name = "orc-stats" 107 | required-features = ["cli"] 108 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | .PHONY: fmt 19 | fmt: ## Format all the Rust code. 20 | cargo fmt --all 21 | 22 | 23 | .PHONY: clippy 24 | clippy: ## Check clippy rules. 25 | cargo clippy --workspace --all-targets -- -D warnings 26 | 27 | 28 | .PHONY: fmt-toml 29 | fmt-toml: ## Format all TOML files. 30 | taplo format --option "indent_string= " 31 | 32 | .PHONY: check-toml 33 | check-toml: ## Check all TOML files. 34 | taplo format --check --option "indent_string= " -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![test](https://github.com/datafusion-contrib/datafusion-orc/actions/workflows/ci.yml/badge.svg)](https://github.com/datafusion-contrib/datafusion-orc/actions/workflows/ci.yml) 2 | [![codecov](https://codecov.io/gh/WenyXu/orc-rs/branch/main/graph/badge.svg?token=2CSHZX02XM)](https://codecov.io/gh/WenyXu/orc-rs) 3 | [![Crates.io](https://img.shields.io/crates/v/orc-rust)](https://crates.io/crates/orc-rust) 4 | [![Crates.io](https://img.shields.io/crates/d/orc-rust)](https://crates.io/crates/orc-rust) 5 | 6 | # orc-rust 7 | 8 | A native Rust implementation of the [Apache ORC](https://orc.apache.org) file format, 9 | providing API's to read data into [Apache Arrow](https://arrow.apache.org) in-memory arrays. 10 | 11 | See the [documentation](https://docs.rs/orc-rust/latest/orc_rust/) for examples on how to use this crate. 12 | 13 | ## Supported features 14 | 15 | This crate currently only supports reading ORC files into Arrow arrays. Write support is planned 16 | (see [Roadmap](#roadmap)). The below features listed relate only to reading ORC files. 17 | At this time, we aim to support the [ORCv1](https://orc.apache.org/specification/ORCv1/) specification only. 
18 | 19 | - Read synchronously & asynchronously (using Tokio) 20 | - All compression types (Zlib, Snappy, Lzo, Lz4, Zstd) 21 | - All ORC data types 22 | - All encodings 23 | - Rudimentary support for retrieving statistics 24 | - Retrieving user metadata into Arrow schema metadata 25 | 26 | ## Roadmap 27 | 28 | The long term vision for this crate is to be feature complete enough to be donated to the 29 | [arrow-rs](https://github.com/apache/arrow-rs) project. 30 | 31 | The following lists the rough roadmap for features to be implemented, from highest to lowest priority. 32 | 33 | - Performance enhancements 34 | - Predicate pushdown 35 | - Row indices 36 | - Bloom filters 37 | - Write from Arrow arrays 38 | - Encryption 39 | 40 | A non-Arrow API interface is not planned at the moment. Feel free to raise an issue if there is such 41 | a use case. 42 | 43 | ## Version compatibility 44 | 45 | No guarantees are provided about stability across versions. We will endeavour to keep the top level API's 46 | (`ArrowReader` and `ArrowStreamReader`) as stable as we can, but other API's provided may change as we 47 | explore the interface we want the library to expose. 48 | 49 | Versions will be released on an ad-hoc basis (with no fixed schedule). 
50 | 51 | ## Mapping ORC types to Arrow types 52 | 53 | The following table lists how ORC data types are read into Arrow data types: 54 | 55 | | ORC Data Type | Arrow Data Type | Notes | 56 | | ----------------- | --------------------------- | ----- | 57 | | Boolean | Boolean | | 58 | | TinyInt | Int8 | | 59 | | SmallInt | Int16 | | 60 | | Int | Int32 | | 61 | | BigInt | Int64 | | 62 | | Float | Float32 | | 63 | | Double | Float64 | | 64 | | String | Utf8 | | 65 | | Char | Utf8 | | 66 | | VarChar | Utf8 | | 67 | | Binary | Binary | | 68 | | Decimal | Decimal128 | | 69 | | Date | Date32 | | 70 | | Timestamp | Timestamp(Nanosecond, None) | ¹ | 71 | | Timestamp instant | Timestamp(Nanosecond, UTC) | ¹ | 72 | | Struct | Struct | | 73 | | List | List | | 74 | | Map | Map | | 75 | | Union | Union(_, Sparse) | ² | 76 | 77 | ¹: `ArrowReaderBuilder::with_schema` allows configuring different time units or decoding to 78 | `Decimal128(38, 9)` (i128 of non-leap nanoseconds since UNIX epoch). 79 | Overflows may happen while decoding to a non-Seconds time unit, and results in `OrcError`. 80 | Loss of precision may happen while decoding to a non-Nanosecond time unit, and results in `OrcError`. 81 | `Decimal128(38, 9)` avoids both overflows and loss of precision. 82 | 83 | ²: Currently only supports a maximum of 127 variants 84 | 85 | ## Contributing 86 | 87 | All contributions are welcome! Feel free to raise an issue if you have a feature request, bug report, 88 | or a question. Feel free to raise a Pull Request without raising an issue first, as long as the Pull 89 | Request is descriptive enough. 
90 | 91 | Some tools we use in addition to the standard `cargo` that require installation are: 92 | 93 | - [taplo](https://taplo.tamasfe.dev/) 94 | - [typos](https://crates.io/crates/typos) 95 | 96 | ```shell 97 | cargo install typos-cli 98 | cargo install taplo-cli 99 | ``` 100 | 101 | ```shell 102 | # Building the crate 103 | cargo build 104 | 105 | # Running the test suite 106 | cargo test 107 | 108 | # Simple benchmarks 109 | cargo bench 110 | 111 | # Formatting TOML files 112 | taplo format 113 | 114 | # Detect any typos in the codebase 115 | typos 116 | ``` 117 | 118 | To regenerate/update the [proto.rs](src/proto.rs) file, execute the [regen.sh](regen.sh) script. 119 | 120 | ```shell 121 | ./regen.sh 122 | ``` 123 | 124 | -------------------------------------------------------------------------------- /benches/arrow_reader.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use std::fs::File; 19 | 20 | use criterion::{criterion_group, criterion_main, Criterion}; 21 | use futures_util::TryStreamExt; 22 | use orc_rust::arrow_reader::ArrowReaderBuilder; 23 | 24 | fn basic_path(path: &str) -> String { 25 | let dir = env!("CARGO_MANIFEST_DIR"); 26 | format!("{}/tests/basic/data/{}", dir, path) 27 | } 28 | 29 | // demo-12-zlib.orc 30 | // 1,920,800 total rows 31 | // Columns: 32 | // - Int32 33 | // - Dictionary(UInt64, Utf8) 34 | // - Dictionary(UInt64, Utf8) 35 | // - Dictionary(UInt64, Utf8) 36 | // - Int32 37 | // - Dictionary(UInt64, Utf8) 38 | // - Int32 39 | // - Int32 40 | // - Int32 41 | 42 | async fn async_read_all() { 43 | let file = "demo-12-zlib.orc"; 44 | let file_path = basic_path(file); 45 | let f = tokio::fs::File::open(file_path).await.unwrap(); 46 | let reader = ArrowReaderBuilder::try_new_async(f) 47 | .await 48 | .unwrap() 49 | .build_async(); 50 | let _ = reader.try_collect::>().await.unwrap(); 51 | } 52 | 53 | fn sync_read_all() { 54 | let file = "demo-12-zlib.orc"; 55 | let file_path = basic_path(file); 56 | let f = File::open(file_path).unwrap(); 57 | let reader = ArrowReaderBuilder::try_new(f).unwrap().build(); 58 | let _ = reader.collect::, _>>().unwrap(); 59 | } 60 | 61 | fn criterion_benchmark(c: &mut Criterion) { 62 | c.bench_function("sync reader", |b| b.iter(sync_read_all)); 63 | c.bench_function("async reader", |b| { 64 | b.to_async(tokio::runtime::Runtime::new().unwrap()) 65 | .iter(async_read_all); 66 | }); 67 | } 68 | 69 | criterion_group!(benches, criterion_benchmark); 70 | criterion_main!(benches); 71 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # codecov config 19 | coverage: 20 | status: 21 | project: 22 | default: 23 | threshold: 1% 24 | patch: off 25 | ignore: 26 | - "**/error*.rs" # ignore all error.rs files 27 | comment: # this is a top-level key 28 | layout: "diff" 29 | -------------------------------------------------------------------------------- /deny.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [licenses] 19 | allow = [ 20 | "Apache-2.0", 21 | "Apache-2.0 WITH LLVM-exception", 22 | "MIT", 23 | "BSD-2-Clause", 24 | "BSD-3-Clause", 25 | "CC0-1.0", 26 | "Unicode-3.0", 27 | ] 28 | version = 2 29 | -------------------------------------------------------------------------------- /gen/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "gen" 20 | description = "Code generation for datafusion-orc" 21 | version = "0.1.0" 22 | edition = "2021" 23 | rust-version = "1.70" 24 | license = "Apache-2.0" 25 | publish = false 26 | 27 | [dependencies] 28 | prost-build = { version = "=0.12.1", default-features = false } 29 | -------------------------------------------------------------------------------- /gen/src/main.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. 
See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::fs::{remove_file, OpenOptions}; 19 | use std::io::{Read, Write}; 20 | 21 | fn main() -> Result<(), Box> { 22 | prost_build::Config::new() 23 | .out_dir("src/") 24 | .compile_well_known_types() 25 | .extern_path(".google.protobuf", "::pbjson_types") 26 | .compile_protos(&["format/orc_proto.proto"], &["format"])?; 27 | 28 | // read file contents to string 29 | let mut file = OpenOptions::new().read(true).open("src/orc.proto.rs")?; 30 | let mut buffer = String::new(); 31 | file.read_to_string(&mut buffer)?; 32 | // append warning that file was auto-generate 33 | let mut file = OpenOptions::new() 34 | .write(true) 35 | .truncate(true) 36 | .create(true) 37 | .open("src/proto.rs")?; 38 | file.write_all("// This file was automatically generated through the regen.sh script, and should not be edited.\n\n".as_bytes())?; 39 | file.write_all(buffer.as_bytes())?; 40 | 41 | // since we renamed file to proto.rs to avoid period in the name 42 | remove_file("src/orc.proto.rs")?; 43 | 44 | // As the proto file is checked in, the build should not fail if the file is not found 45 | Ok(()) 46 | } 47 | -------------------------------------------------------------------------------- /licenserc.toml: 
-------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | headerPath = "Apache-2.0-ASF.txt" 19 | 20 | excludes = [ 21 | "**/*.md" 22 | ] 23 | 24 | -------------------------------------------------------------------------------- /regen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 21 | cd $SCRIPT_DIR && cargo run --manifest-path gen/Cargo.toml 22 | rustfmt src/proto.rs 23 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Generate data 2 | 3 | Setup the virtual environment with dependencies on PyArrow, PySpark and PyOrc 4 | to generate the reference data: 5 | 6 | ```bash 7 | # Run once 8 | ./scripts/setup-venv.sh 9 | ./scripts/prepare-test-data.sh 10 | ``` 11 | 12 | Then execute the tests: 13 | 14 | ```bash 15 | cargo test 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /scripts/convert_tpch.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
# Convert TPC-H .tbl files (produced by dbgen) in benchmark_data/ into ORC
# files with explicit schemas, for use by the arrow_reader benchmarks.

import pyarrow as pa
from pyarrow import orc
from pyarrow import csv

# The eight TPC-H tables; one .tbl input and one .orc output per table.
tables = [
    "customer",
    "lineitem",
    "nation",
    "orders",
    "part",
    "partsupp",
    "region",
    "supplier"
]

# Datatypes based on:
# https://github.com/apache/datafusion/blob/3b93cc952b889cec2364ad2490ae18ecddb3ca49/benchmarks/src/tpch/mod.rs#L50-L134
schemas = {
    "customer": pa.schema([
        pa.field("c_custkey", pa.int64()),
        pa.field("c_name", pa.string()),
        pa.field("c_address", pa.string()),
        pa.field("c_nationkey", pa.int64()),
        pa.field("c_phone", pa.string()),
        pa.field("c_acctbal", pa.decimal128(15, 2)),
        pa.field("c_mktsegment", pa.string()),
        pa.field("c_comment", pa.string()),
    ]),
    "lineitem": pa.schema([
        pa.field("l_orderkey", pa.int64()),
        pa.field("l_partkey", pa.int64()),
        pa.field("l_suppkey", pa.int64()),
        pa.field("l_linenumber", pa.int32()),
        pa.field("l_quantity", pa.decimal128(15, 2)),
        pa.field("l_extendedprice", pa.decimal128(15, 2)),
        pa.field("l_discount", pa.decimal128(15, 2)),
        pa.field("l_tax", pa.decimal128(15, 2)),
        pa.field("l_returnflag", pa.string()),
        pa.field("l_linestatus", pa.string()),
        pa.field("l_shipdate", pa.date32()),
        pa.field("l_commitdate", pa.date32()),
        pa.field("l_receiptdate", pa.date32()),
        pa.field("l_shipinstruct", pa.string()),
        pa.field("l_shipmode", pa.string()),
        pa.field("l_comment", pa.string()),
    ]),
    "nation": pa.schema([
        pa.field("n_nationkey", pa.int64()),
        pa.field("n_name", pa.string()),
        pa.field("n_regionkey", pa.int64()),
        pa.field("n_comment", pa.string()),
    ]),
    "orders": pa.schema([
        pa.field("o_orderkey", pa.int64()),
        pa.field("o_custkey", pa.int64()),
        pa.field("o_orderstatus", pa.string()),
        pa.field("o_totalprice", pa.decimal128(15, 2)),
        pa.field("o_orderdate", pa.date32()),
        pa.field("o_orderpriority", pa.string()),
        pa.field("o_clerk", pa.string()),
        pa.field("o_shippriority", pa.int32()),
        pa.field("o_comment", pa.string()),
    ]),
    "part": pa.schema([
        pa.field("p_partkey", pa.int64()),
        pa.field("p_name", pa.string()),
        pa.field("p_mfgr", pa.string()),
        pa.field("p_brand", pa.string()),
        pa.field("p_type", pa.string()),
        pa.field("p_size", pa.int32()),
        pa.field("p_container", pa.string()),
        pa.field("p_retailprice", pa.decimal128(15, 2)),
        pa.field("p_comment", pa.string()),
    ]),
    "partsupp": pa.schema([
        pa.field("ps_partkey", pa.int64()),
        pa.field("ps_suppkey", pa.int64()),
        pa.field("ps_availqty", pa.int32()),
        pa.field("ps_supplycost", pa.decimal128(15, 2)),
        pa.field("ps_comment", pa.string()),
    ]),
    "region": pa.schema([
        pa.field("r_regionkey", pa.int64()),
        pa.field("r_name", pa.string()),
        pa.field("r_comment", pa.string()),
    ]),
    "supplier": pa.schema([
        pa.field("s_suppkey", pa.int64()),
        pa.field("s_name", pa.string()),
        pa.field("s_address", pa.string()),
        pa.field("s_nationkey", pa.int64()),
        pa.field("s_phone", pa.string()),
        pa.field("s_acctbal", pa.decimal128(15, 2)),
        pa.field("s_comment", pa.string()),
    ]),
}

for table in tables:
    schema = schemas[table]
    # dbgen .tbl files have no header row and use '|' as the delimiter;
    # column names and types are supplied from the schema above.
    tbl = csv.read_csv(
        f"benchmark_data/{table}.tbl",
        read_options=csv.ReadOptions(column_names=schema.names),
        parse_options=csv.ParseOptions(delimiter="|"),
        convert_options=csv.ConvertOptions(column_types=schema),
    )
    orc.write_table(tbl, f"benchmark_data/{table}.orc")
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Generate TPC-H benchmark data as ORC files in benchmark_data/.
# Usage: generate-tpch.sh [SCALE_FACTOR]   (defaults to scale factor 1)
# Requires docker (for dbgen) and a venv prepared by setup-venv.sh.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
DATA_DIR=$BASE_DIR/benchmark_data
VENV_BIN=$BASE_DIR/venv/bin

SCALE_FACTOR=${1:-1}

# Generate TBL data
mkdir -p $DATA_DIR
docker run --rm \
    -v $DATA_DIR:/data \
    ghcr.io/scalytics/tpch-docker:main -vf -s $SCALE_FACTOR
# Removing trailing | (dbgen emits a trailing delimiter on every row)
sed -i 's/.$//' benchmark_data/*.tbl
# Convert the .tbl files to ORC with explicit schemas
$VENV_BIN/python $SCRIPT_DIR/convert_tpch.py
echo "Done"
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Convert the integration-test ORC files into Arrow feather files
# (expected_arrow/) so tests can compare decoded batches against a
# reference produced by PyArrow's own ORC reader.
# Requires pyarrow to be installed
import glob
from pyarrow import orc, feather

dir = "tests/integration/data"

# Derive the list of test case names from the expected/*.jsn.gz fixtures.
files = glob.glob(f"{dir}/expected/*")
files = [file.removeprefix(f"{dir}/expected/").removesuffix(".jsn.gz") for file in files]

ignore_files = [
    "TestOrcFile.testTimestamp"  # Root data type isn't struct
]

files = [file for file in files if file not in ignore_files]

for file in files:
    print(f"Converting {file} from ORC to feather")
    table = orc.read_table(f"{dir}/{file}.orc")
    feather.write_feather(table, f"{dir}/expected_arrow/{file}.feather")
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Generate the tests/basic/data/alltypes.<compression>.orc fixtures: one row
# set covering every primitive type (with nulls, extremes and non-ASCII
# strings), written once per supported compression codec.

import shutil
import glob
from datetime import date as dt
from decimal import Decimal as Dec
from pyspark.sql import SparkSession
from pyspark.sql.types import *

dir = "tests/basic/data"

# We're using Spark because it supports lzo compression writing
# (PyArrow supports all except lzo writing)

spark = SparkSession.builder.getOrCreate()

# TODO: how to do char and varchar?
# TODO: struct, list, map, union
df = spark.createDataFrame(
    [  # bool, int8, int16, int32, int64, float32, float64, decimal, binary, utf8, date32
        ( None, None, None,          None,          None,          None,          None,           None,                  None,              None,     None),
        ( True,    0,    0,             0,             0,           0.0,           0.0,           Dec(0),                "".encode(),       "",       dt(1970,  1,  1)),
        (False,    1,    1,             1,             1,           1.0,           1.0,           Dec(1),                "a".encode(),      "a",      dt(1970,  1,  2)),
        (False,   -1,   -1,            -1,            -1,          -1.0,          -1.0,           Dec(-1),               " ".encode(),      " ",      dt(1969, 12, 31)),
        # Type maxima / infinities / far-future date
        ( True,  127, (1 << 15) - 1, (1 << 31) - 1, (1 << 63) - 1, float("inf"),  float("inf"),   Dec(123456789.12345),  "encode".encode(), "encode", dt(9999, 12, 31)),
        # Type minima / negative infinities / first Gregorian calendar date
        ( True, -128, -(1 << 15),    -(1 << 31),    -(1 << 63),    float("-inf"), float("-inf"),  Dec(-999999999.99999), "decode".encode(), "decode", dt(1582, 10, 15)),
        ( True,   50,   50,            50,            50,           3.1415927,     3.14159265359, Dec(-31256.123),       "大熊和奏".encode(),  "大熊和奏",  dt(1582, 10, 16)),
        ( True,   51,   51,            51,            51,          -3.1415927,    -3.14159265359, Dec(1241000),          "斉藤朱夏".encode(),  "斉藤朱夏",  dt(2000,  1,  1)),
        ( True,   52,   52,            52,            52,           1.1,           1.1,           Dec(1.1),              "鈴原希実".encode(),  "鈴原希実",  dt(3000, 12, 31)),
        (False,   53,   53,            53,            53,          -1.1,          -1.1,           Dec(0.99999),          "🤔".encode(),      "🤔",     dt(1900,  1,  1)),
        ( None, None, None,          None,          None,          None,          None,           None,                  None,              None,     None),
    ],
    StructType(
        [
            StructField("boolean", BooleanType()),
            StructField(   "int8", ByteType()),
            StructField(  "int16", ShortType()),
            StructField(  "int32", IntegerType()),
            StructField(  "int64", LongType()),
            StructField("float32", FloatType()),
            StructField("float64", DoubleType()),
            StructField("decimal", DecimalType(15, 5)),
            StructField( "binary", BinaryType()),
            StructField(   "utf8", StringType()),
            StructField( "date32", DateType()),
        ]
    ),
).coalesce(1)  # single partition so Spark writes exactly one ORC file

compression = ["none", "snappy", "zlib", "lzo", "zstd", "lz4"]
for c in compression:
    df.write.format("orc")\
        .option("compression", c)\
        .mode("overwrite")\
        .save(f"{dir}/alltypes.{c}")
    # Since Spark saves into a directory
    # Move out and rename the expected single ORC file (because of coalesce above)
    orc_file = glob.glob(f"{dir}/alltypes.{c}/*.orc")[0]
    shutil.move(orc_file, f"{dir}/alltypes.{c}.orc")
    shutil.rmtree(f"{dir}/alltypes.{c}")
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Generate the timestamp test fixtures: pyarrow_timestamps.orc (tz-naive and
# UTC columns) and overflowing_timestamps.orc (values outside pyarrow's
# nanosecond-representable range, written via pyorc instead).

from datetime import datetime as dttm
import pyarrow as pa
from pyarrow import orc
from pyarrow import parquet  # NOTE(review): appears unused — confirm before removing
import pyorc

dir = "tests/basic/data"

schema = pa.schema([
    pa.field('timestamp_notz', pa.timestamp("ns")),
    pa.field('timestamp_utc', pa.timestamp("ns", tz="UTC")),
])

# TODO test with other non-UTC timezones
arr = pa.array([
    None,
    dttm(1970, 1, 1, 0, 0, 0),
    dttm(1970, 1, 2, 23, 59, 59),
    dttm(1969, 12, 31, 23, 59, 59),
    dttm(2262, 4, 11, 11, 47, 16),
    dttm(2001, 4, 13, 2, 14, 0),
    dttm(2000, 1, 1, 23, 10, 10),
    dttm(1900, 1, 1, 14, 25, 14),
])
# Same values written as both the naive and the UTC column.
table = pa.Table.from_arrays([arr, arr], schema=schema)
orc.write_table(table, f"{dir}/pyarrow_timestamps.orc")


# pyarrow overflows when trying to write this, so we have to use pyorc instead
class TimestampConverter:
    # Identity converter: pass (seconds, nanos) tuples straight through so we
    # can write raw values without datetime conversion.
    @staticmethod
    def from_orc(obj, tz):
        return obj
    @staticmethod
    def to_orc(obj, tz):
        return obj
schema = pyorc.Struct(
    id=pyorc.Int(),
    timestamp=pyorc.Timestamp()
)
with open(f"{dir}/overflowing_timestamps.orc", "wb") as f:
    with pyorc.Writer(
        f,
        schema,
        converters={pyorc.TypeKind.TIMESTAMP: TimestampConverter},
    ) as writer:
        # (id, (seconds, nanos)); the middle row predates year 1 CE in epoch
        # seconds and overflows a nanosecond-precision i64.
        writer.write((1, (12345678, 0)))
        writer.write((2, (-62135596800, 0)))
        writer.write((3, (12345678, 0)))
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Run all test-fixture generation scripts using the venv created by
# setup-venv.sh. Must be run from the repo root (cd below ensures this
# so the scripts' relative tests/... paths resolve).

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
VENV_BIN=$BASE_DIR/venv/bin

cd $BASE_DIR
$VENV_BIN/python $SCRIPT_DIR/write.py
$VENV_BIN/python $SCRIPT_DIR/generate_orc.py
$VENV_BIN/python $SCRIPT_DIR/generate_orc_timestamps.py
$VENV_BIN/python $SCRIPT_DIR/generate_arrow.py

echo "Done"
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Create a Python virtual environment at <repo>/venv with the packages the
# fixture-generation scripts need (pyorc, pyspark, pyarrow). Run once.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
VENV_BIN=$BASE_DIR/venv/bin

python3 -m venv $BASE_DIR/venv

$VENV_BIN/pip install -U pyorc pyspark pyarrow

echo "Done"
# Generate the tests/basic/data/*.orc fixtures exercising individual RLE
# encodings, nested types, dictionary encoding and compression via pyorc.
#
# NOTE(review): the pyorc schema string literals below (e.g. "struct>") appear
# truncated in this copy of the file — the contents of the angle brackets were
# stripped by whatever produced this text. Restore them from upstream before
# running; they are preserved here exactly as found.

# Copied from https://github.com/DataEngineeringLabs/orc-format/blob/416490db0214fc51d53289253c0ee91f7fc9bc17/write.py
import random
import datetime
import pyorc

dir = "tests/basic/data"

# Column name prefixes encode the intended ORC type and RLE sub-encoding
# (short_repeated / delta / direct), which infer_schema below relies on.
data = {
    "a": [1.0, 2.0, None, 4.0, 5.0],
    "b": [True, False, None, True, False],
    "str_direct": ["a", "cccccc", None, "ddd", "ee"],
    "d": ["a", "bb", None, "ccc", "ddd"],
    "e": ["ddd", "cc", None, "bb", "a"],
    "f": ["aaaaa", "bbbbb", None, "ccccc", "ddddd"],
    "int_short_repeated": [5, 5, None, 5, 5],
    "int_neg_short_repeated": [-5, -5, None, -5, -5],
    "int_delta": [1, 2, None, 4, 5],
    "int_neg_delta": [5, 4, None, 2, 1],
    "int_direct": [1, 6, None, 3, 2],
    "int_neg_direct": [-1, -6, None, -3, -2],
    "bigint_direct": [1, 6, None, 3, 2],
    "bigint_neg_direct": [-1, -6, None, -3, -2],
    "bigint_other": [5, -5, 1, 5, 5],
    "utf8_increase": ["a", "bb", "ccc", "dddd", "eeeee"],
    "utf8_decrease": ["eeeee", "dddd", "ccc", "bb", "a"],
    "timestamp_simple": [datetime.datetime(2023, 4, 1, 20, 15, 30, 2000), datetime.datetime.fromtimestamp(int('1629617204525777000')/1000000000), datetime.datetime(2023, 1, 1), datetime.datetime(2023, 2, 1), datetime.datetime(2023, 3, 1)],
    "date_simple": [datetime.date(2023, 4, 1), datetime.date(2023, 3, 1), datetime.date(2023, 1, 1), datetime.date(2023, 2, 1), datetime.date(2023, 3, 1)],
    "tinyint_simple": [-1, None, 1, 127, -127]
}

def infer_schema(data):
    """Build a pyorc struct schema string from a column-name -> values dict.

    The base type is inferred from each column's first value (assumed
    non-None); columns whose names start with timestamp/date/double/bigint/
    tinyint are overridden to that ORC type by prefix convention.
    """
    schema = "struct<"
    for key, value in data.items():
        dt = type(value[0])
        if dt == float:
            dt = "float"
        elif dt == int:
            dt = "int"
        elif dt == bool:
            dt = "boolean"
        elif dt == str:
            dt = "string"
        elif dt == dict:
            # Nested struct: recurse on the first element.
            dt = infer_schema(value[0])
        elif key.startswith("timestamp"):
            dt = "timestamp"
        elif key.startswith("date"):
            dt = "date"
        else:
            print(key,value,dt)
            raise NotImplementedError
        # Name-prefix overrides take precedence over the value-based guess.
        if key.startswith("double"):
            dt = "double"
        if key.startswith("bigint"):
            dt = "bigint"
        if key.startswith("tinyint"):
            dt = "tinyint"
        schema += key + ":" + dt + ","

    # Replace the trailing comma with the closing bracket.
    schema = schema[:-1] + ">"
    return schema



def _write(
    schema: str,
    data,
    file_name: str,
    compression=pyorc.CompressionKind.NONE,
    dict_key_size_threshold=0.0,
):
    """Write the column dict as rows to an ORC file, then read it back once
    with pyorc as a sanity check that the file is well-formed."""
    output = open(file_name, "wb")
    writer = pyorc.Writer(
        output,
        schema,
        dict_key_size_threshold=dict_key_size_threshold,
        # use a small number to ensure that compression crosses value boundaries
        compression_block_size=32,
        compression=compression,
    )
    num_rows = len(list(data.values())[0])
    for x in range(num_rows):
        # Row-major tuple in column-dict order.
        row = tuple(values[x] for values in data.values())
        writer.write(row)
    writer.close()

    # Sanity check: re-read the whole file with pyorc.
    with open(file_name, "rb") as f:
        reader = pyorc.Reader(f)
        list(reader)

nested_struct = {
    "nest": [
        (1.0,True),
        (3.0,None),
        (None,None),
        None,
        (-3.0,None)
    ],
}

_write("struct>", nested_struct, f"{dir}/nested_struct.orc")


nested_array = {
    "value": [
        [1, None, 3, 43, 5],
        [5, None, 32, 4, 15],
        [16, None, 3, 4, 5, 6],
        None,
        [3, None],
    ],
}

_write("struct>", nested_array, f"{dir}/nested_array.orc")


nested_array_float = {
    "value": [
        [1.0, 3.0],
        [None, 2.0],
    ],
}

_write("struct>", nested_array_float, f"{dir}/nested_array_float.orc")

nested_array_struct = {
    "value": [
        [(1.0, 1, "01"), (2.0, 2, "02")],
        [None, (3.0, 3, "03")],
    ],
}

_write("struct>>", nested_array_struct, f"{dir}/nested_array_struct.orc")

nested_map = {
    "map": [
        {"zero": 0, "one": 1},
        None,
        {"two": 2, "tree": 3},
        {"one": 1, "two": 2, "nill": None},
    ],
}

_write("struct>", nested_map, f"{dir}/nested_map.orc")

nested_map_struct = {
    "map": [
        {"01": (1.0, 1, "01"), "02": (2.0, 1, "02")},
        None,
        {"03": (3.0, 3, "03"), "04": (4.0, 4, "04")},
    ],
}

_write("struct>>", nested_map_struct, f"{dir}/nested_map_struct.orc")


# The main all-encodings fixture, schema inferred from column names/values.
_write(
    infer_schema(data),
    data,
    f"{dir}/test.orc",
)

data_boolean = {
    "long": [True] * 32,
}

_write("struct", data_boolean, f"{dir}/long_bool.orc")

_write("struct", data_boolean, f"{dir}/long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
    "dict": ["abcd", "efgh"] * 32,
}

_write("struct", data_dict, f"{dir}/string_long.orc")

data_dict = {
    "dict": ["abc", "efgh"] * 32,
}

# Non-zero threshold so the writer chooses DICTIONARY encoding here.
_write("struct", data_dict, f"{dir}/string_dict.orc", dict_key_size_threshold=0.1)

_write("struct", data_dict, f"{dir}/string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
    "dict": ["abcd", "efgh"] * (10**4 // 2),
}

_write("struct", data_dict, f"{dir}/string_long_long.orc")
_write("struct", data_dict, f"{dir}/string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)

long_f32 = {
    "dict": [random.uniform(0, 1) for _ in range(10**6)],
}

_write("struct", long_f32, f"{dir}/f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::cmp::Ordering; 19 | use std::sync::Arc; 20 | 21 | use arrow::array::ArrayRef; 22 | use arrow::buffer::NullBuffer; 23 | use arrow::datatypes::Decimal128Type; 24 | use snafu::ResultExt; 25 | 26 | use crate::encoding::decimal::UnboundedVarintStreamDecoder; 27 | use crate::encoding::integer::get_rle_reader; 28 | use crate::encoding::PrimitiveValueDecoder; 29 | use crate::error::ArrowSnafu; 30 | use crate::proto::stream::Kind; 31 | use crate::stripe::Stripe; 32 | use crate::{column::Column, error::Result}; 33 | 34 | use super::{ArrayBatchDecoder, PresentDecoder, PrimitiveArrayDecoder}; 35 | 36 | pub fn new_decimal_decoder( 37 | column: &Column, 38 | stripe: &Stripe, 39 | precision: u32, 40 | fixed_scale: u32, 41 | ) -> Result> { 42 | let varint_iter = stripe.stream_map().get(column, Kind::Data); 43 | let varint_iter = Box::new(UnboundedVarintStreamDecoder::new(varint_iter)); 44 | 45 | // Scale is specified on a per varint basis (in addition to being encoded in the type) 46 | let scale_iter = stripe.stream_map().get(column, Kind::Secondary); 47 | let scale_iter = get_rle_reader::(column, scale_iter)?; 48 | 49 | let present = PresentDecoder::from_stripe(stripe, column); 50 | 51 | let iter = DecimalScaleRepairDecoder { 52 | varint_iter, 53 | scale_iter, 54 | fixed_scale, 55 | }; 56 | let iter = Box::new(iter); 57 | 58 | 
Ok(Box::new(DecimalArrayDecoder::new( 59 | precision as u8, 60 | fixed_scale as i8, 61 | iter, 62 | present, 63 | ))) 64 | } 65 | 66 | /// Wrapper around PrimitiveArrayDecoder to allow specifying the precision and scale 67 | /// of the output decimal array. 68 | pub struct DecimalArrayDecoder { 69 | precision: u8, 70 | scale: i8, 71 | inner: PrimitiveArrayDecoder, 72 | } 73 | 74 | impl DecimalArrayDecoder { 75 | pub fn new( 76 | precision: u8, 77 | scale: i8, 78 | iter: Box + Send>, 79 | present: Option, 80 | ) -> Self { 81 | let inner = PrimitiveArrayDecoder::::new(iter, present); 82 | Self { 83 | precision, 84 | scale, 85 | inner, 86 | } 87 | } 88 | } 89 | 90 | impl ArrayBatchDecoder for DecimalArrayDecoder { 91 | fn next_batch( 92 | &mut self, 93 | batch_size: usize, 94 | parent_present: Option<&NullBuffer>, 95 | ) -> Result { 96 | let array = self 97 | .inner 98 | .next_primitive_batch(batch_size, parent_present)? 99 | .with_precision_and_scale(self.precision, self.scale) 100 | .context(ArrowSnafu)?; 101 | let array = Arc::new(array) as ArrayRef; 102 | Ok(array) 103 | } 104 | } 105 | 106 | /// This iter fixes the scales of the varints decoded as scale is specified on a per 107 | /// varint basis, and needs to align with type specified scale 108 | struct DecimalScaleRepairDecoder { 109 | varint_iter: Box + Send>, 110 | scale_iter: Box + Send>, 111 | fixed_scale: u32, 112 | } 113 | 114 | impl PrimitiveValueDecoder for DecimalScaleRepairDecoder { 115 | fn decode(&mut self, out: &mut [i128]) -> Result<()> { 116 | // TODO: can probably optimize, reuse buffers? 
117 | let mut varint = vec![0; out.len()]; 118 | let mut scale = vec![0; out.len()]; 119 | self.varint_iter.decode(&mut varint)?; 120 | self.scale_iter.decode(&mut scale)?; 121 | for (index, (&varint, &scale)) in varint.iter().zip(scale.iter()).enumerate() { 122 | out[index] = fix_i128_scale(varint, self.fixed_scale, scale); 123 | } 124 | Ok(()) 125 | } 126 | } 127 | 128 | fn fix_i128_scale(i: i128, fixed_scale: u32, varying_scale: i32) -> i128 { 129 | // TODO: Verify with C++ impl in ORC repo, which does this cast 130 | // Not sure why scale stream can be signed if it gets casted to unsigned anyway 131 | // https://github.com/apache/orc/blob/0014bec1e4cdd1206f5bae4f5c2000b9300c6eb1/c%2B%2B/src/ColumnReader.cc#L1459-L1476 132 | let varying_scale = varying_scale as u32; 133 | match fixed_scale.cmp(&varying_scale) { 134 | Ordering::Less => { 135 | // fixed_scale < varying_scale 136 | // Current scale of number is greater than scale of the array type 137 | // So need to divide to align the scale 138 | // TODO: this differs from C++ implementation, need to verify 139 | let scale_factor = varying_scale - fixed_scale; 140 | // TODO: replace with lookup table? 141 | let scale_factor = 10_i128.pow(scale_factor); 142 | i / scale_factor 143 | } 144 | Ordering::Equal => i, 145 | Ordering::Greater => { 146 | // fixed_scale > varying_scale 147 | // Current scale of number is smaller than scale of the array type 148 | // So need to multiply to align the scale 149 | // TODO: this differs from C++ implementation, need to verify 150 | let scale_factor = fixed_scale - varying_scale; 151 | // TODO: replace with lookup table? 
152 | let scale_factor = 10_i128.pow(scale_factor); 153 | i * scale_factor 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/array_decoder/list.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use std::sync::Arc; 19 | 20 | use arrow::array::{ArrayRef, ListArray}; 21 | use arrow::buffer::{NullBuffer, OffsetBuffer}; 22 | use arrow::datatypes::{Field, FieldRef}; 23 | use snafu::ResultExt; 24 | 25 | use crate::array_decoder::derive_present_vec; 26 | use crate::column::Column; 27 | use crate::encoding::integer::get_unsigned_rle_reader; 28 | use crate::encoding::PrimitiveValueDecoder; 29 | use crate::proto::stream::Kind; 30 | 31 | use crate::error::{ArrowSnafu, Result}; 32 | use crate::stripe::Stripe; 33 | 34 | use super::{array_decoder_factory, ArrayBatchDecoder, PresentDecoder}; 35 | 36 | pub struct ListArrayDecoder { 37 | inner: Box, 38 | present: Option, 39 | lengths: Box + Send>, 40 | field: FieldRef, 41 | } 42 | 43 | impl ListArrayDecoder { 44 | pub fn new(column: &Column, field: Arc, stripe: &Stripe) -> Result { 45 | let present = PresentDecoder::from_stripe(stripe, column); 46 | 47 | let child = &column.children()[0]; 48 | let inner = array_decoder_factory(child, field.clone(), stripe)?; 49 | 50 | let reader = stripe.stream_map().get(column, Kind::Length); 51 | let lengths = get_unsigned_rle_reader(column, reader); 52 | 53 | Ok(Self { 54 | inner, 55 | present, 56 | lengths, 57 | field, 58 | }) 59 | } 60 | } 61 | 62 | impl ArrayBatchDecoder for ListArrayDecoder { 63 | fn next_batch( 64 | &mut self, 65 | batch_size: usize, 66 | parent_present: Option<&NullBuffer>, 67 | ) -> Result { 68 | let present = 69 | derive_present_vec(&mut self.present, parent_present, batch_size).transpose()?; 70 | 71 | let mut lengths = vec![0; batch_size]; 72 | if let Some(present) = &present { 73 | self.lengths.decode_spaced(&mut lengths, present)?; 74 | } else { 75 | self.lengths.decode(&mut lengths)?; 76 | } 77 | let total_length: i64 = lengths.iter().sum(); 78 | // Fetch child array as one Array with total_length elements 79 | let child_array = self.inner.next_batch(total_length as usize, None)?; 80 | let offsets = 
OffsetBuffer::from_lengths(lengths.into_iter().map(|l| l as usize)); 81 | let null_buffer = present; 82 | 83 | let array = ListArray::try_new(self.field.clone(), offsets, child_array, null_buffer) 84 | .context(ArrowSnafu)?; 85 | let array = Arc::new(array); 86 | Ok(array) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/array_decoder/map.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use std::sync::Arc; 19 | 20 | use arrow::array::{ArrayRef, MapArray, StructArray}; 21 | use arrow::buffer::{NullBuffer, OffsetBuffer}; 22 | use arrow::datatypes::{Field, Fields}; 23 | use snafu::ResultExt; 24 | 25 | use crate::array_decoder::derive_present_vec; 26 | use crate::column::Column; 27 | use crate::encoding::integer::get_unsigned_rle_reader; 28 | use crate::encoding::PrimitiveValueDecoder; 29 | use crate::error::{ArrowSnafu, Result}; 30 | use crate::proto::stream::Kind; 31 | use crate::stripe::Stripe; 32 | 33 | use super::{array_decoder_factory, ArrayBatchDecoder, PresentDecoder}; 34 | 35 | pub struct MapArrayDecoder { 36 | keys: Box, 37 | values: Box, 38 | present: Option, 39 | lengths: Box + Send>, 40 | fields: Fields, 41 | } 42 | 43 | impl MapArrayDecoder { 44 | pub fn new( 45 | column: &Column, 46 | keys_field: Arc, 47 | values_field: Arc, 48 | stripe: &Stripe, 49 | ) -> Result { 50 | let present = PresentDecoder::from_stripe(stripe, column); 51 | 52 | let keys_column = &column.children()[0]; 53 | let keys = array_decoder_factory(keys_column, keys_field.clone(), stripe)?; 54 | 55 | let values_column = &column.children()[1]; 56 | let values = array_decoder_factory(values_column, values_field.clone(), stripe)?; 57 | 58 | let reader = stripe.stream_map().get(column, Kind::Length); 59 | let lengths = get_unsigned_rle_reader(column, reader); 60 | 61 | let fields = Fields::from(vec![keys_field, values_field]); 62 | 63 | Ok(Self { 64 | keys, 65 | values, 66 | present, 67 | lengths, 68 | fields, 69 | }) 70 | } 71 | } 72 | 73 | impl ArrayBatchDecoder for MapArrayDecoder { 74 | fn next_batch( 75 | &mut self, 76 | batch_size: usize, 77 | parent_present: Option<&NullBuffer>, 78 | ) -> Result { 79 | let present = 80 | derive_present_vec(&mut self.present, parent_present, batch_size).transpose()?; 81 | 82 | let mut lengths = vec![0; batch_size]; 83 | if let Some(present) = &present { 84 | self.lengths.decode_spaced(&mut lengths, present)?; 85 | } else { 
86 | self.lengths.decode(&mut lengths)?; 87 | } 88 | let total_length: i64 = lengths.iter().sum(); 89 | // Fetch key and value arrays, each with total_length elements 90 | // Fetch child array as one Array with total_length elements 91 | let keys_array = self.keys.next_batch(total_length as usize, None)?; 92 | let values_array = self.values.next_batch(total_length as usize, None)?; 93 | // Compose the keys + values array into a StructArray with two entries 94 | let entries = 95 | StructArray::try_new(self.fields.clone(), vec![keys_array, values_array], None) 96 | .context(ArrowSnafu)?; 97 | let offsets = OffsetBuffer::from_lengths(lengths.into_iter().map(|l| l as usize)); 98 | 99 | let field = Arc::new(Field::new_struct("entries", self.fields.clone(), false)); 100 | let array = 101 | MapArray::try_new(field, offsets, entries, present, false).context(ArrowSnafu)?; 102 | let array = Arc::new(array); 103 | Ok(array) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/array_decoder/struct_decoder.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::sync::Arc; 19 | 20 | use arrow::{ 21 | array::{ArrayRef, StructArray}, 22 | buffer::NullBuffer, 23 | datatypes::Fields, 24 | }; 25 | use snafu::ResultExt; 26 | 27 | use crate::error::Result; 28 | use crate::stripe::Stripe; 29 | use crate::{column::Column, error::ArrowSnafu}; 30 | 31 | use super::{array_decoder_factory, derive_present_vec, ArrayBatchDecoder, PresentDecoder}; 32 | 33 | pub struct StructArrayDecoder { 34 | fields: Fields, 35 | decoders: Vec>, 36 | present: Option, 37 | } 38 | 39 | impl StructArrayDecoder { 40 | pub fn new(column: &Column, fields: Fields, stripe: &Stripe) -> Result { 41 | let present = PresentDecoder::from_stripe(stripe, column); 42 | 43 | let decoders = column 44 | .children() 45 | .iter() 46 | .zip(fields.iter().cloned()) 47 | .map(|(child, field)| array_decoder_factory(child, field, stripe)) 48 | .collect::>>()?; 49 | 50 | Ok(Self { 51 | decoders, 52 | present, 53 | fields, 54 | }) 55 | } 56 | } 57 | 58 | impl ArrayBatchDecoder for StructArrayDecoder { 59 | fn next_batch( 60 | &mut self, 61 | batch_size: usize, 62 | parent_present: Option<&NullBuffer>, 63 | ) -> Result { 64 | let present = 65 | derive_present_vec(&mut self.present, parent_present, batch_size).transpose()?; 66 | 67 | let child_arrays = self 68 | .decoders 69 | .iter_mut() 70 | .map(|child| child.next_batch(batch_size, present.as_ref())) 71 | .collect::>>()?; 72 | 73 | let null_buffer = present; 74 | let array = StructArray::try_new(self.fields.clone(), child_arrays, null_buffer) 75 | .context(ArrowSnafu)?; 76 | let array = Arc::new(array); 77 | Ok(array) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/array_decoder/union.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or 
more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::sync::Arc; 19 | 20 | use arrow::array::{ArrayRef, BooleanBufferBuilder, UnionArray}; 21 | use arrow::buffer::{Buffer, NullBuffer}; 22 | use arrow::datatypes::UnionFields; 23 | use snafu::ResultExt; 24 | 25 | use crate::column::Column; 26 | use crate::encoding::byte::ByteRleDecoder; 27 | use crate::encoding::PrimitiveValueDecoder; 28 | use crate::error::ArrowSnafu; 29 | use crate::error::Result; 30 | use crate::proto::stream::Kind; 31 | use crate::stripe::Stripe; 32 | 33 | use super::{array_decoder_factory, derive_present_vec, ArrayBatchDecoder, PresentDecoder}; 34 | 35 | /// Decode ORC Union column into batches of Arrow Sparse UnionArrays. 
36 | pub struct UnionArrayDecoder { 37 | // fields and variants should have same length 38 | // TODO: encode this assumption into types 39 | fields: UnionFields, 40 | variants: Vec>, 41 | tags: Box + Send>, 42 | present: Option, 43 | } 44 | 45 | impl UnionArrayDecoder { 46 | pub fn new(column: &Column, fields: UnionFields, stripe: &Stripe) -> Result { 47 | let present = PresentDecoder::from_stripe(stripe, column); 48 | 49 | let tags = stripe.stream_map().get(column, Kind::Data); 50 | let tags = Box::new(ByteRleDecoder::new(tags)); 51 | 52 | let variants = column 53 | .children() 54 | .iter() 55 | .zip(fields.iter()) 56 | .map(|(child, (_id, field))| array_decoder_factory(child, field.clone(), stripe)) 57 | .collect::>>()?; 58 | 59 | Ok(Self { 60 | fields, 61 | variants, 62 | tags, 63 | present, 64 | }) 65 | } 66 | } 67 | 68 | impl ArrayBatchDecoder for UnionArrayDecoder { 69 | fn next_batch( 70 | &mut self, 71 | batch_size: usize, 72 | parent_present: Option<&NullBuffer>, 73 | ) -> Result { 74 | let present = 75 | derive_present_vec(&mut self.present, parent_present, batch_size).transpose()?; 76 | let mut tags = vec![0; batch_size]; 77 | match &present { 78 | Some(present) => { 79 | // Since UnionArrays don't have nullability, we rely on child arrays. 80 | // So we default to first child (tag 0) for any nulls from this parent Union. 81 | self.tags.decode_spaced(&mut tags, present)?; 82 | } 83 | None => { 84 | self.tags.decode(&mut tags)?; 85 | } 86 | } 87 | 88 | // Calculate nullability for children 89 | let mut children_nullability = (0..self.variants.len()) 90 | .map(|index| { 91 | let mut child_present = BooleanBufferBuilder::new(batch_size); 92 | child_present.append_n(batch_size, false); 93 | for idx in tags 94 | .iter() 95 | .enumerate() 96 | // Where the parent expects the value of the child, we set to non-null. 97 | // Otherwise for the sparse spots, we leave as null in children. 
98 | .filter_map(|(idx, &tag)| (tag as usize == index).then_some(idx)) 99 | { 100 | child_present.set_bit(idx, true); 101 | } 102 | child_present 103 | }) 104 | .collect::>(); 105 | // If parent says a slot is null, we need to ensure the first child (0-index) also 106 | // encodes this information, since as mentioned before, Arrow UnionArrays don't store 107 | // nullability and rely on their children. We default to first child to encode this 108 | // information so need to enforce that here. 109 | if let Some(present) = &present { 110 | let first_child = &mut children_nullability[0]; 111 | for idx in present 112 | .iter() 113 | .enumerate() 114 | .filter_map(|(idx, parent_present)| (!parent_present).then_some(idx)) 115 | { 116 | first_child.set_bit(idx, false); 117 | } 118 | } 119 | 120 | let child_arrays = self 121 | .variants 122 | .iter_mut() 123 | .zip(children_nullability) 124 | .map(|(decoder, mut present)| { 125 | let present = NullBuffer::from(present.finish()); 126 | decoder.next_batch(batch_size, Some(&present)) 127 | }) 128 | .collect::>>()?; 129 | 130 | // Currently default to decoding as Sparse UnionArray so no value offsets 131 | let type_ids = Buffer::from_vec(tags.clone()).into(); 132 | let array = UnionArray::try_new(self.fields.clone(), type_ids, None, child_arrays) 133 | .context(ArrowSnafu)?; 134 | let array = Arc::new(array); 135 | Ok(array) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/bin/orc-export.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::{fs::File, io, path::PathBuf}; 19 | 20 | use anyhow::Result; 21 | use arrow::{array::RecordBatch, csv, datatypes::DataType, error::ArrowError, json}; 22 | use clap::{Parser, ValueEnum}; 23 | use json::writer::{JsonFormat, LineDelimited}; 24 | use orc_rust::{projection::ProjectionMask, reader::metadata::read_metadata, ArrowReaderBuilder}; 25 | 26 | #[derive(Parser)] 27 | #[command(name = "orc-export")] 28 | #[command(version, about = "Export data from orc file to csv", long_about = None)] 29 | struct Cli { 30 | /// Path to the orc file 31 | file: PathBuf, 32 | /// Output file. If not provided output will be printed on console 33 | #[arg(short, long)] 34 | output_file: Option, 35 | /// Output format. If not provided then the output is csv 36 | #[arg(value_enum, short, long, default_value_t = FileFormat::Csv)] 37 | format: FileFormat, 38 | /// export only first N records 39 | #[arg(short, long, value_name = "N")] 40 | num_rows: Option, 41 | /// export only provided columns. 
Comma separated list 42 | #[arg(short, long, value_delimiter = ',')] 43 | columns: Option>, 44 | } 45 | 46 | #[derive(Clone, Debug, PartialEq, ValueEnum)] 47 | enum FileFormat { 48 | /// Output data in csv format 49 | Csv, 50 | /// Output data in json format 51 | Json, 52 | } 53 | 54 | #[allow(clippy::large_enum_variant)] 55 | enum OutputWriter { 56 | Csv(csv::Writer), 57 | Json(json::Writer), 58 | } 59 | 60 | impl OutputWriter 61 | where 62 | W: io::Write, 63 | F: JsonFormat, 64 | { 65 | fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { 66 | match self { 67 | OutputWriter::Csv(w) => w.write(batch), 68 | OutputWriter::Json(w) => w.write(batch), 69 | } 70 | } 71 | 72 | fn finish(&mut self) -> Result<(), ArrowError> { 73 | match self { 74 | OutputWriter::Csv(_) => Ok(()), 75 | OutputWriter::Json(w) => w.finish(), 76 | } 77 | } 78 | } 79 | 80 | fn main() -> Result<()> { 81 | let cli = Cli::parse(); 82 | 83 | // Prepare reader 84 | let mut f = File::open(&cli.file)?; 85 | let metadata = read_metadata(&mut f)?; 86 | 87 | // Select columns which should be exported (Binary and Decimal are not supported) 88 | let cols: Vec = metadata 89 | .root_data_type() 90 | .children() 91 | .iter() 92 | .enumerate() 93 | // TODO: handle nested types 94 | .filter(|(_, nc)| match nc.data_type().to_arrow_data_type() { 95 | DataType::Binary => false, 96 | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { 97 | matches!(cli.format, FileFormat::Csv) 98 | } 99 | _ => { 100 | if let Some(cols) = &cli.columns { 101 | cols.iter().any(|c| nc.name().eq(c)) 102 | } else { 103 | true 104 | } 105 | } 106 | }) 107 | .map(|(i, _)| i) 108 | .collect(); 109 | 110 | let projection = ProjectionMask::roots(metadata.root_data_type(), cols); 111 | let reader = ArrowReaderBuilder::try_new(f)? 112 | .with_projection(projection) 113 | .build(); 114 | 115 | // Prepare writer 116 | let writer: Box = if let Some(output) = cli.output_file { 117 | Box::new(File::create(output)?) 
118 | } else { 119 | Box::new(io::stdout()) 120 | }; 121 | 122 | let mut output_writer = match cli.format { 123 | FileFormat::Json => { 124 | OutputWriter::Json(json::WriterBuilder::new().build::<_, LineDelimited>(writer)) 125 | } 126 | _ => OutputWriter::Csv(csv::WriterBuilder::new().with_header(true).build(writer)), 127 | }; 128 | 129 | // Convert data 130 | let mut num_rows = cli.num_rows.unwrap_or(u64::MAX); 131 | for mut batch in reader.flatten() { 132 | // Restrict rows 133 | if num_rows < batch.num_rows() as u64 { 134 | batch = batch.slice(0, num_rows as usize); 135 | } 136 | 137 | // Save 138 | output_writer.write(&batch)?; 139 | 140 | // Have we reached limit on the number of rows? 141 | if num_rows > batch.num_rows() as u64 { 142 | num_rows -= batch.num_rows() as u64; 143 | } else { 144 | break; 145 | } 146 | } 147 | 148 | output_writer.finish()?; 149 | 150 | Ok(()) 151 | } 152 | -------------------------------------------------------------------------------- /src/bin/orc-metadata.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use std::{error::Error, fs::File, path::PathBuf, sync::Arc}; 19 | 20 | use clap::Parser; 21 | use orc_rust::{reader::metadata::read_metadata, stripe::Stripe}; 22 | 23 | #[derive(Parser)] 24 | #[command(version, about, long_about = None)] 25 | struct Cli { 26 | /// ORC file path 27 | file: PathBuf, 28 | 29 | /// Display data for all stripes 30 | #[arg(short, long)] 31 | stripes: bool, 32 | } 33 | 34 | fn main() -> Result<(), Box> { 35 | let cli = Cli::parse(); 36 | 37 | let mut f = File::open(cli.file)?; 38 | let metadata = Arc::new(read_metadata(&mut f)?); 39 | 40 | // TODO: better way to handle this printing? 41 | println!( 42 | "compression: {}", 43 | metadata 44 | .compression() 45 | .map(|c| c.to_string()) 46 | .unwrap_or("None".to_string()) 47 | ); 48 | println!("file format version: {}", metadata.file_format_version()); 49 | println!("number of rows: {}", metadata.number_of_rows()); 50 | println!("number of stripes: {}", metadata.stripe_metadatas().len()); 51 | 52 | // TODO: nesting types indentation is messed up 53 | println!("schema:\n{}", metadata.root_data_type()); 54 | if cli.stripes { 55 | println!("\n=== Stripes ==="); 56 | for (i, stripe_metadata) in metadata.stripe_metadatas().iter().enumerate() { 57 | let stripe = Stripe::new( 58 | &mut f, 59 | &metadata, 60 | metadata.root_data_type(), 61 | stripe_metadata, 62 | )?; 63 | println!("stripe index: {i}"); 64 | println!("number of rows: {}", stripe.number_of_rows()); 65 | println!( 66 | "writer timezone: {}", 67 | stripe 68 | .writer_tz() 69 | .map(|tz| tz.to_string()) 70 | .unwrap_or("None".to_string()) 71 | ); 72 | println!(); 73 | } 74 | } 75 | 76 | Ok(()) 77 | } 78 | -------------------------------------------------------------------------------- /src/bin/orc-stats.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. 
See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::{fs::File, path::PathBuf, sync::Arc}; 19 | 20 | use anyhow::Result; 21 | use arrow::temporal_conversions::{date32_to_datetime, timestamp_ms_to_datetime}; 22 | use clap::Parser; 23 | use orc_rust::{reader::metadata::read_metadata, statistics::ColumnStatistics}; 24 | 25 | #[derive(Parser)] 26 | #[command(name = "orc-stats")] 27 | #[command(version, about = "Print column and stripe stats from the orc file", long_about = None)] 28 | struct Cli { 29 | /// Path to the orc file 30 | file: PathBuf, 31 | } 32 | 33 | fn print_column_stats(col_stats: &ColumnStatistics) { 34 | if let Some(tstats) = col_stats.type_statistics() { 35 | match tstats { 36 | orc_rust::statistics::TypeStatistics::Integer { min, max, sum } => { 37 | println!("* Data type Integer"); 38 | println!("* Minimum: {}", min); 39 | println!("* Maximum: {}", max); 40 | if let Some(sum) = sum { 41 | println!("* Sum: {}", sum); 42 | } 43 | } 44 | orc_rust::statistics::TypeStatistics::Double { min, max, sum } => { 45 | println!("* Data type Double"); 46 | println!("* Minimum: {}", min); 47 | println!("* Maximum: {}", max); 48 | if let Some(sum) = sum { 49 | println!("* Sum: {}", sum); 50 | } 51 | } 52 | orc_rust::statistics::TypeStatistics::String { min, 
/// Entry point for the `orc-stats` CLI tool: prints file-level and
/// stripe-level column statistics for the ORC file given on the command line.
fn main() -> Result<()> {
    let cli = Cli::parse();

    // Read only the file metadata (footer/stripe footers); no row data is decoded.
    let mut f = File::open(&cli.file)?;
    let metadata = Arc::new(read_metadata(&mut f)?);

    // File-level statistics: one entry per column in the schema.
    println!("# Column stats");
    println!(
        "File {:?} has {} columns",
        cli.file,
        metadata.column_file_statistics().len()
    );
    println!();
    for (idx, col_stats) in metadata.column_file_statistics().iter().enumerate() {
        println!("## Column {idx}");
        print_column_stats(col_stats);
    }

    // Stripe-level statistics: same per-column layout, repeated per stripe.
    println!("# Stripe stats");
    println!(
        "File {:?} has {} stripes",
        cli.file,
        metadata.stripe_metadatas().len()
    );
    println!();
    for (idm, sm) in metadata.stripe_metadatas().iter().enumerate() {
        println!("----- Stripe {idm} -----\n");
        for (idc, col_stats) in sm.column_statistics().iter().enumerate() {
            println!("## Column {idc}");
            print_column_stats(col_stats);
        }
    }

    Ok(())
}
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::sync::Arc; 19 | 20 | use bytes::Bytes; 21 | use snafu::ResultExt; 22 | 23 | use crate::error::{IoSnafu, Result}; 24 | use crate::proto::{ColumnEncoding, StripeFooter}; 25 | use crate::reader::ChunkReader; 26 | use crate::schema::DataType; 27 | 28 | #[derive(Clone, Debug)] 29 | pub struct Column { 30 | footer: Arc, 31 | name: String, 32 | data_type: DataType, 33 | } 34 | 35 | impl Column { 36 | pub fn new(name: &str, data_type: &DataType, footer: &Arc) -> Self { 37 | Self { 38 | footer: footer.clone(), 39 | data_type: data_type.clone(), 40 | name: name.to_string(), 41 | } 42 | } 43 | 44 | pub fn dictionary_size(&self) -> usize { 45 | let column = self.data_type.column_index(); 46 | self.footer.columns[column] 47 | .dictionary_size 48 | .unwrap_or_default() as usize 49 | } 50 | 51 | pub fn encoding(&self) -> ColumnEncoding { 52 | let column = self.data_type.column_index(); 53 | self.footer.columns[column].clone() 54 | } 55 | 56 | pub fn data_type(&self) -> &DataType { 57 | &self.data_type 58 | } 59 | 60 | pub fn name(&self) -> &str { 61 | &self.name 62 | } 63 | 64 | pub fn column_id(&self) -> u32 { 65 | self.data_type.column_index() as u32 66 | } 67 | 68 | pub fn children(&self) -> Vec { 69 | match &self.data_type { 70 | DataType::Boolean { .. } 71 | | DataType::Byte { .. } 72 | | DataType::Short { .. } 73 | | DataType::Int { .. } 74 | | DataType::Long { .. } 75 | | DataType::Float { .. } 76 | | DataType::Double { .. } 77 | | DataType::String { .. } 78 | | DataType::Varchar { .. } 79 | | DataType::Char { .. } 80 | | DataType::Binary { .. } 81 | | DataType::Decimal { .. } 82 | | DataType::Timestamp { .. } 83 | | DataType::TimestampWithLocalTimezone { .. } 84 | | DataType::Date { .. } => vec![], 85 | DataType::Struct { children, .. 
} => children 86 | .iter() 87 | .map(|col| Column { 88 | footer: self.footer.clone(), 89 | name: col.name().to_string(), 90 | data_type: col.data_type().clone(), 91 | }) 92 | .collect(), 93 | DataType::List { child, .. } => { 94 | vec![Column { 95 | footer: self.footer.clone(), 96 | name: "item".to_string(), 97 | data_type: *child.clone(), 98 | }] 99 | } 100 | DataType::Map { key, value, .. } => { 101 | vec![ 102 | Column { 103 | footer: self.footer.clone(), 104 | name: "key".to_string(), 105 | data_type: *key.clone(), 106 | }, 107 | Column { 108 | footer: self.footer.clone(), 109 | name: "value".to_string(), 110 | data_type: *value.clone(), 111 | }, 112 | ] 113 | } 114 | DataType::Union { variants, .. } => { 115 | // TODO: might need corrections 116 | variants 117 | .iter() 118 | .enumerate() 119 | .map(|(index, data_type)| Column { 120 | footer: self.footer.clone(), 121 | name: format!("{index}"), 122 | data_type: data_type.clone(), 123 | }) 124 | .collect() 125 | } 126 | } 127 | } 128 | 129 | pub fn read_stream(reader: &mut R, start: u64, length: u64) -> Result { 130 | reader.get_bytes(start, length).context(IoSnafu) 131 | } 132 | 133 | #[cfg(feature = "async")] 134 | pub async fn read_stream_async( 135 | reader: &mut R, 136 | start: u64, 137 | length: u64, 138 | ) -> Result { 139 | reader.get_bytes(start, length).await.context(IoSnafu) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/encoding/boolean.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::io::Read; 19 | 20 | use arrow::{ 21 | array::BooleanBufferBuilder, 22 | buffer::{BooleanBuffer, NullBuffer}, 23 | }; 24 | use bytes::Bytes; 25 | 26 | use crate::{error::Result, memory::EstimateMemory}; 27 | 28 | use super::{ 29 | byte::{ByteRleDecoder, ByteRleEncoder}, 30 | PrimitiveValueDecoder, PrimitiveValueEncoder, 31 | }; 32 | 33 | pub struct BooleanDecoder { 34 | decoder: ByteRleDecoder, 35 | data: u8, 36 | bits_in_data: usize, 37 | } 38 | 39 | impl BooleanDecoder { 40 | pub fn new(reader: R) -> Self { 41 | Self { 42 | decoder: ByteRleDecoder::new(reader), 43 | bits_in_data: 0, 44 | data: 0, 45 | } 46 | } 47 | 48 | pub fn value(&mut self) -> bool { 49 | let value = (self.data & 0x80) != 0; 50 | self.data <<= 1; 51 | self.bits_in_data -= 1; 52 | 53 | value 54 | } 55 | } 56 | 57 | impl PrimitiveValueDecoder for BooleanDecoder { 58 | // TODO: can probably implement this better 59 | fn decode(&mut self, out: &mut [bool]) -> Result<()> { 60 | for x in out.iter_mut() { 61 | // read more data if necessary 62 | if self.bits_in_data == 0 { 63 | let mut data = [0]; 64 | self.decoder.decode(&mut data)?; 65 | self.data = data[0] as u8; 66 | self.bits_in_data = 8; 67 | } 68 | *x = self.value(); 69 | } 70 | Ok(()) 71 | } 72 | } 73 | 74 | /// ORC encodes validity starting from MSB, whilst Arrow encodes it 75 | /// from LSB. After bytes are filled with the present bits, they are 76 | /// further encoded via Byte RLE. 
77 | pub struct BooleanEncoder { 78 | // TODO: can we refactor to not need two separate buffers? 79 | byte_encoder: ByteRleEncoder, 80 | builder: BooleanBufferBuilder, 81 | } 82 | 83 | impl EstimateMemory for BooleanEncoder { 84 | fn estimate_memory_size(&self) -> usize { 85 | self.builder.len() / 8 86 | } 87 | } 88 | 89 | impl BooleanEncoder { 90 | pub fn new() -> Self { 91 | Self { 92 | byte_encoder: ByteRleEncoder::new(), 93 | builder: BooleanBufferBuilder::new(8), 94 | } 95 | } 96 | 97 | pub fn extend(&mut self, null_buffer: &NullBuffer) { 98 | let bb = null_buffer.inner(); 99 | self.extend_bb(bb); 100 | } 101 | 102 | pub fn extend_bb(&mut self, bb: &BooleanBuffer) { 103 | self.builder.append_buffer(bb); 104 | } 105 | 106 | /// Extend with n true bits. 107 | pub fn extend_present(&mut self, n: usize) { 108 | self.builder.append_n(n, true); 109 | } 110 | 111 | pub fn extend_boolean(&mut self, b: bool) { 112 | self.builder.append(b); 113 | } 114 | 115 | /// Produce ORC present stream bytes and reset internal builder. 116 | pub fn finish(&mut self) -> Bytes { 117 | // TODO: don't throw away allocation? 
118 | let bb = self.builder.finish(); 119 | // We use BooleanBufferBuilder so offset is 0 120 | let bytes = bb.values(); 121 | // Reverse bits as ORC stores from MSB 122 | let bytes = bytes.iter().map(|b| b.reverse_bits()).collect::>(); 123 | for &b in bytes.as_slice() { 124 | self.byte_encoder.write_one(b as i8); 125 | } 126 | self.byte_encoder.take_inner() 127 | } 128 | } 129 | 130 | #[cfg(test)] 131 | mod tests { 132 | use super::*; 133 | 134 | #[test] 135 | fn basic() { 136 | let expected = vec![false; 800]; 137 | let data = [0x61u8, 0x00]; 138 | let data = &mut data.as_ref(); 139 | let mut decoder = BooleanDecoder::new(data); 140 | let mut actual = vec![true; expected.len()]; 141 | decoder.decode(&mut actual).unwrap(); 142 | assert_eq!(actual, expected) 143 | } 144 | 145 | #[test] 146 | fn literals() { 147 | let expected = vec![ 148 | false, true, false, false, false, true, false, false, // 0b01000100 149 | false, true, false, false, false, true, false, true, // 0b01000101 150 | ]; 151 | let data = [0xfeu8, 0b01000100, 0b01000101]; 152 | let data = &mut data.as_ref(); 153 | let mut decoder = BooleanDecoder::new(data); 154 | let mut actual = vec![true; expected.len()]; 155 | decoder.decode(&mut actual).unwrap(); 156 | assert_eq!(actual, expected) 157 | } 158 | 159 | #[test] 160 | fn another() { 161 | // "For example, the byte sequence [0xff, 0x80] would be one true followed by seven false values." 
162 | let expected = vec![true, false, false, false, false, false, false, false]; 163 | let data = [0xff, 0x80]; 164 | let data = &mut data.as_ref(); 165 | let mut decoder = BooleanDecoder::new(data); 166 | let mut actual = vec![true; expected.len()]; 167 | decoder.decode(&mut actual).unwrap(); 168 | assert_eq!(actual, expected) 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/encoding/decimal.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::io::Read; 19 | 20 | use crate::error::Result; 21 | 22 | use super::{ 23 | integer::{read_varint_zigzagged, SignedEncoding}, 24 | PrimitiveValueDecoder, 25 | }; 26 | 27 | /// Read stream of zigzag encoded varints as i128 (unbound). 
28 | pub struct UnboundedVarintStreamDecoder { 29 | reader: R, 30 | } 31 | 32 | impl UnboundedVarintStreamDecoder { 33 | pub fn new(reader: R) -> Self { 34 | Self { reader } 35 | } 36 | } 37 | 38 | impl PrimitiveValueDecoder for UnboundedVarintStreamDecoder { 39 | fn decode(&mut self, out: &mut [i128]) -> Result<()> { 40 | for x in out.iter_mut() { 41 | *x = read_varint_zigzagged::(&mut self.reader)?; 42 | } 43 | Ok(()) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/encoding/float.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::marker::PhantomData; 19 | 20 | use bytemuck::{must_cast_slice, must_cast_slice_mut}; 21 | use bytes::{Bytes, BytesMut}; 22 | use snafu::ResultExt; 23 | 24 | use crate::{ 25 | error::{IoSnafu, Result}, 26 | memory::EstimateMemory, 27 | }; 28 | 29 | use super::{PrimitiveValueDecoder, PrimitiveValueEncoder}; 30 | 31 | /// Collect all the required traits we need on floats. 
32 | pub trait Float: 33 | num::Float + std::fmt::Debug + bytemuck::NoUninit + bytemuck::AnyBitPattern 34 | { 35 | } 36 | impl Float for f32 {} 37 | impl Float for f64 {} 38 | 39 | pub struct FloatDecoder { 40 | reader: R, 41 | phantom: std::marker::PhantomData, 42 | } 43 | 44 | impl FloatDecoder { 45 | pub fn new(reader: R) -> Self { 46 | Self { 47 | reader, 48 | phantom: Default::default(), 49 | } 50 | } 51 | } 52 | 53 | impl PrimitiveValueDecoder for FloatDecoder { 54 | fn decode(&mut self, out: &mut [F]) -> Result<()> { 55 | let bytes = must_cast_slice_mut::(out); 56 | self.reader.read_exact(bytes).context(IoSnafu)?; 57 | Ok(()) 58 | } 59 | } 60 | 61 | /// No special run encoding for floats/doubles, they are stored as their IEEE 754 floating 62 | /// point bit layout. This encoder simply copies incoming floats/doubles to its internal 63 | /// byte buffer. 64 | pub struct FloatEncoder { 65 | data: BytesMut, 66 | _phantom: PhantomData, 67 | } 68 | 69 | impl EstimateMemory for FloatEncoder { 70 | fn estimate_memory_size(&self) -> usize { 71 | self.data.len() 72 | } 73 | } 74 | 75 | impl PrimitiveValueEncoder for FloatEncoder { 76 | fn new() -> Self { 77 | Self { 78 | data: BytesMut::new(), 79 | _phantom: Default::default(), 80 | } 81 | } 82 | 83 | fn write_one(&mut self, value: F) { 84 | self.write_slice(&[value]); 85 | } 86 | 87 | fn write_slice(&mut self, values: &[F]) { 88 | let bytes = must_cast_slice::(values); 89 | self.data.extend_from_slice(bytes); 90 | } 91 | 92 | fn take_inner(&mut self) -> Bytes { 93 | std::mem::take(&mut self.data).into() 94 | } 95 | } 96 | 97 | #[cfg(test)] 98 | mod tests { 99 | use std::f32::consts as f32c; 100 | use std::f64::consts as f64c; 101 | use std::io::Cursor; 102 | 103 | use proptest::prelude::*; 104 | 105 | use super::*; 106 | 107 | fn roundtrip_helper(input: &[F]) -> Result> { 108 | let mut encoder = FloatEncoder::::new(); 109 | encoder.write_slice(input); 110 | let bytes = encoder.take_inner(); 111 | let bytes = 
Cursor::new(bytes); 112 | 113 | let mut iter = FloatDecoder::::new(bytes); 114 | let mut actual = vec![F::zero(); input.len()]; 115 | iter.decode(&mut actual)?; 116 | 117 | Ok(actual) 118 | } 119 | 120 | fn assert_roundtrip(input: Vec) { 121 | let actual = roundtrip_helper(&input).unwrap(); 122 | assert_eq!(input, actual); 123 | } 124 | 125 | proptest! { 126 | #[test] 127 | fn roundtrip_f32(values: Vec) { 128 | let out = roundtrip_helper(&values)?; 129 | prop_assert_eq!(out, values); 130 | } 131 | 132 | #[test] 133 | fn roundtrip_f64(values: Vec) { 134 | let out = roundtrip_helper(&values)?; 135 | prop_assert_eq!(out, values); 136 | } 137 | } 138 | 139 | #[test] 140 | fn test_float_edge_cases() { 141 | assert_roundtrip::(vec![]); 142 | assert_roundtrip::(vec![]); 143 | 144 | assert_roundtrip(vec![f32c::PI]); 145 | assert_roundtrip(vec![f64c::PI]); 146 | 147 | let actual = roundtrip_helper(&[f32::NAN]).unwrap(); 148 | assert!(actual[0].is_nan()); 149 | let actual = roundtrip_helper(&[f64::NAN]).unwrap(); 150 | assert!(actual[0].is_nan()); 151 | } 152 | 153 | #[test] 154 | fn test_float_many() { 155 | assert_roundtrip(vec![ 156 | f32::NEG_INFINITY, 157 | f32::MIN, 158 | -1.0, 159 | -0.0, 160 | 0.0, 161 | 1.0, 162 | f32c::SQRT_2, 163 | f32::MAX, 164 | f32::INFINITY, 165 | ]); 166 | 167 | assert_roundtrip(vec![ 168 | f64::NEG_INFINITY, 169 | f64::MIN, 170 | -1.0, 171 | -0.0, 172 | 0.0, 173 | 1.0, 174 | f64c::SQRT_2, 175 | f64::MAX, 176 | f64::INFINITY, 177 | ]); 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/encoding/integer/rle_v2/direct.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::io::Read; 19 | 20 | use bytes::{BufMut, BytesMut}; 21 | 22 | use crate::{ 23 | encoding::{ 24 | integer::{ 25 | rle_v2::{EncodingType, MAX_RUN_LENGTH}, 26 | util::{ 27 | extract_run_length_from_header, read_ints, rle_v2_decode_bit_width, 28 | rle_v2_encode_bit_width, write_aligned_packed_ints, 29 | }, 30 | EncodingSign, 31 | }, 32 | util::read_u8, 33 | }, 34 | error::{OutOfSpecSnafu, Result}, 35 | }; 36 | 37 | use super::NInt; 38 | 39 | pub fn read_direct_values( 40 | reader: &mut R, 41 | out_ints: &mut Vec, 42 | header: u8, 43 | ) -> Result<()> { 44 | let encoded_bit_width = (header >> 1) & 0x1F; 45 | let bit_width = rle_v2_decode_bit_width(encoded_bit_width); 46 | 47 | if (N::BYTE_SIZE * 8) < bit_width { 48 | return OutOfSpecSnafu { 49 | msg: "byte width of direct encoding exceeds byte size of integer being decoded to", 50 | } 51 | .fail(); 52 | } 53 | 54 | let second_byte = read_u8(reader)?; 55 | let length = extract_run_length_from_header(header, second_byte); 56 | 57 | // Write the unpacked values and zigzag decode to result buffer 58 | read_ints(out_ints, length, bit_width, reader)?; 59 | 60 | for lit in out_ints.iter_mut() { 61 | *lit = S::zigzag_decode(*lit); 62 | } 63 | 64 | Ok(()) 65 | } 66 | 67 | /// `values` and `max` must be zigzag encoded. 
If `max` is not provided, it is derived 68 | /// by iterating over `values`. 69 | pub fn write_direct(writer: &mut BytesMut, values: &[N], max: Option) { 70 | debug_assert!( 71 | (1..=MAX_RUN_LENGTH).contains(&values.len()), 72 | "direct run length cannot exceed 512 values" 73 | ); 74 | 75 | let max = max.unwrap_or_else(|| { 76 | // Assert guards that values is non-empty 77 | *values.iter().max_by_key(|x| x.bits_used()).unwrap() 78 | }); 79 | 80 | let bit_width = max.closest_aligned_bit_width(); 81 | let encoded_bit_width = rle_v2_encode_bit_width(bit_width); 82 | // From [1, 512] to [0, 511] 83 | let encoded_length = values.len() as u16 - 1; 84 | // No need to mask as we guarantee max length is 512 85 | let encoded_length_high_bit = (encoded_length >> 8) as u8; 86 | let encoded_length_low_bits = (encoded_length & 0xFF) as u8; 87 | 88 | let header1 = 89 | EncodingType::Direct.to_header() | (encoded_bit_width << 1) | encoded_length_high_bit; 90 | let header2 = encoded_length_low_bits; 91 | 92 | writer.put_u8(header1); 93 | writer.put_u8(header2); 94 | write_aligned_packed_ints(writer, bit_width, values); 95 | } 96 | 97 | #[cfg(test)] 98 | mod tests { 99 | use std::io::Cursor; 100 | 101 | use proptest::prelude::*; 102 | 103 | use crate::encoding::integer::{SignedEncoding, UnsignedEncoding}; 104 | 105 | use super::*; 106 | 107 | fn roundtrip_direct_helper(values: &[N]) -> Result> { 108 | let mut buf = BytesMut::new(); 109 | let mut out = vec![]; 110 | 111 | write_direct(&mut buf, values, None); 112 | let header = buf[0]; 113 | read_direct_values::<_, _, S>(&mut Cursor::new(&buf[1..]), &mut out, header)?; 114 | 115 | Ok(out) 116 | } 117 | 118 | #[test] 119 | fn test_direct_edge_case() { 120 | let values: Vec = vec![109, -17809, -29946, -17285]; 121 | let encoded = values 122 | .iter() 123 | .map(|&v| SignedEncoding::zigzag_encode(v)) 124 | .collect::>(); 125 | let out = roundtrip_direct_helper::<_, SignedEncoding>(&encoded).unwrap(); 126 | assert_eq!(out, values); 127 
| } 128 | 129 | proptest! { 130 | #[test] 131 | fn roundtrip_direct_i16(values in prop::collection::vec(any::(), 1..=512)) { 132 | let encoded = values.iter().map(|v| SignedEncoding::zigzag_encode(*v)).collect::>(); 133 | let out = roundtrip_direct_helper::<_, SignedEncoding>(&encoded)?; 134 | prop_assert_eq!(out, values); 135 | } 136 | 137 | #[test] 138 | fn roundtrip_direct_i32(values in prop::collection::vec(any::(), 1..=512)) { 139 | let encoded = values.iter().map(|v| SignedEncoding::zigzag_encode(*v)).collect::>(); 140 | let out = roundtrip_direct_helper::<_, SignedEncoding>(&encoded)?; 141 | prop_assert_eq!(out, values); 142 | } 143 | 144 | #[test] 145 | fn roundtrip_direct_i64(values in prop::collection::vec(any::(), 1..=512)) { 146 | let encoded = values.iter().map(|v| SignedEncoding::zigzag_encode(*v)).collect::>(); 147 | let out = roundtrip_direct_helper::<_, SignedEncoding>(&encoded)?; 148 | prop_assert_eq!(out, values); 149 | } 150 | 151 | #[test] 152 | fn roundtrip_direct_i64_unsigned(values in prop::collection::vec(0..=i64::MAX, 1..=512)) { 153 | let encoded = values.iter().map(|v| UnsignedEncoding::zigzag_encode(*v)).collect::>(); 154 | let out = roundtrip_direct_helper::<_, UnsignedEncoding>(&encoded)?; 155 | prop_assert_eq!(out, values); 156 | } 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/encoding/integer/rle_v2/short_repeat.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::io::Read; 19 | 20 | use bytes::{BufMut, BytesMut}; 21 | 22 | use crate::{ 23 | encoding::integer::{rle_v2::EncodingType, EncodingSign}, 24 | error::{OutOfSpecSnafu, Result}, 25 | }; 26 | 27 | use super::{NInt, SHORT_REPEAT_MIN_LENGTH}; 28 | 29 | pub fn read_short_repeat_values( 30 | reader: &mut R, 31 | out_ints: &mut Vec, 32 | header: u8, 33 | ) -> Result<()> { 34 | // Header byte: 35 | // 36 | // eeww_wccc 37 | // 7 0 LSB 38 | // 39 | // ee = Sub-encoding bits, always 00 40 | // www = Value width bits 41 | // ccc = Repeat count bits 42 | 43 | let byte_width = (header >> 3) & 0x07; // Encoded as 0 to 7 44 | let byte_width = byte_width as usize + 1; // Decode to 1 to 8 bytes 45 | 46 | if N::BYTE_SIZE < byte_width { 47 | return OutOfSpecSnafu { 48 | msg: 49 | "byte width of short repeat encoding exceeds byte size of integer being decoded to", 50 | } 51 | .fail(); 52 | } 53 | 54 | let run_length = (header & 0x07) as usize + SHORT_REPEAT_MIN_LENGTH; 55 | 56 | // Value that is being repeated is encoded as value_byte_width bytes in big endian format 57 | let val = N::read_big_endian(reader, byte_width)?; 58 | let val = S::zigzag_decode(val); 59 | 60 | out_ints.extend(std::iter::repeat(val).take(run_length)); 61 | 62 | Ok(()) 63 | } 64 | 65 | pub fn write_short_repeat(writer: &mut BytesMut, value: N, count: usize) { 66 | debug_assert!((SHORT_REPEAT_MIN_LENGTH..=10).contains(&count)); 67 | 68 | let value = S::zigzag_encode(value); 69 | 70 | // Take max in case value = 0 71 | let byte_size = 
value.bits_used().div_ceil(8).max(1) as u8; 72 | let encoded_byte_size = byte_size - 1; 73 | let encoded_count = (count - SHORT_REPEAT_MIN_LENGTH) as u8; 74 | 75 | let header = EncodingType::ShortRepeat.to_header() | (encoded_byte_size << 3) | encoded_count; 76 | let bytes = value.to_be_bytes(); 77 | let bytes = &bytes.as_ref()[N::BYTE_SIZE - byte_size as usize..]; 78 | 79 | writer.put_u8(header); 80 | writer.put_slice(bytes); 81 | } 82 | 83 | #[cfg(test)] 84 | mod tests { 85 | use std::io::Cursor; 86 | 87 | use proptest::prelude::*; 88 | 89 | use crate::encoding::integer::{SignedEncoding, UnsignedEncoding}; 90 | 91 | use super::*; 92 | 93 | fn roundtrip_short_repeat_helper( 94 | value: N, 95 | count: usize, 96 | ) -> Result> { 97 | let mut buf = BytesMut::new(); 98 | let mut out = vec![]; 99 | 100 | write_short_repeat::<_, S>(&mut buf, value, count); 101 | let header = buf[0]; 102 | read_short_repeat_values::<_, _, S>(&mut Cursor::new(&buf[1..]), &mut out, header)?; 103 | 104 | Ok(out) 105 | } 106 | 107 | proptest! 
{ 108 | #[test] 109 | fn roundtrip_short_repeat_i16(value: i16, count in 3_usize..=10) { 110 | let out = roundtrip_short_repeat_helper::<_, SignedEncoding>(value, count)?; 111 | prop_assert_eq!(out, vec![value; count]); 112 | } 113 | 114 | #[test] 115 | fn roundtrip_short_repeat_i32(value: i32, count in 3_usize..=10) { 116 | let out = roundtrip_short_repeat_helper::<_, SignedEncoding>(value, count)?; 117 | prop_assert_eq!(out, vec![value; count]); 118 | } 119 | 120 | #[test] 121 | fn roundtrip_short_repeat_i64(value: i64, count in 3_usize..=10) { 122 | let out = roundtrip_short_repeat_helper::<_, SignedEncoding>(value, count)?; 123 | prop_assert_eq!(out, vec![value; count]); 124 | } 125 | 126 | #[test] 127 | fn roundtrip_short_repeat_i64_unsigned(value in 0..=i64::MAX, count in 3_usize..=10) { 128 | let out = roundtrip_short_repeat_helper::<_, UnsignedEncoding>(value, count)?; 129 | prop_assert_eq!(out, vec![value; count]); 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/encoding/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | //! Encoding/decoding logic for writing/reading primitive values from ORC types. 19 | 20 | use arrow::buffer::NullBuffer; 21 | use bytes::Bytes; 22 | 23 | use crate::{error::Result, memory::EstimateMemory}; 24 | 25 | pub mod boolean; 26 | pub mod byte; 27 | pub mod decimal; 28 | pub mod float; 29 | pub mod integer; 30 | mod rle; 31 | pub mod timestamp; 32 | mod util; 33 | 34 | /// Encodes primitive values into an internal buffer, usually with a specialized run length 35 | /// encoding for better compression. 36 | pub trait PrimitiveValueEncoder: EstimateMemory 37 | where 38 | V: Copy, 39 | { 40 | fn new() -> Self; 41 | 42 | fn write_one(&mut self, value: V); 43 | 44 | fn write_slice(&mut self, values: &[V]) { 45 | for &value in values { 46 | self.write_one(value); 47 | } 48 | } 49 | 50 | /// Take the encoded bytes, replacing it with an empty buffer. 51 | // TODO: Figure out how to retain the allocation instead of handing 52 | // it off each time. 53 | fn take_inner(&mut self) -> Bytes; 54 | } 55 | 56 | pub trait PrimitiveValueDecoder { 57 | /// Decode out.len() values into out at a time, failing if it cannot fill 58 | /// the buffer. 59 | fn decode(&mut self, out: &mut [V]) -> Result<()>; 60 | 61 | /// Decode into `out` according to the `true` elements in `present`. 62 | /// 63 | /// `present` must be the same length as `out`. 64 | fn decode_spaced(&mut self, out: &mut [V], present: &NullBuffer) -> Result<()> { 65 | debug_assert_eq!(out.len(), present.len()); 66 | 67 | // First get all the non-null values into a contiguous range. 68 | let non_null_count = present.len() - present.null_count(); 69 | if non_null_count == 0 { 70 | // All nulls, don't bother decoding anything 71 | return Ok(()); 72 | } 73 | // We read into the back because valid_indices() below is not reversible, 74 | // so we just reverse our algorithm. 
75 | let range_start = out.len() - non_null_count; 76 | self.decode(&mut out[range_start..])?; 77 | if non_null_count == present.len() { 78 | // No nulls, don't need to space out 79 | return Ok(()); 80 | } 81 | 82 | // From the head of the contiguous range (at the end of the buffer) we swap 83 | // with the null elements to ensure it matches with the present buffer. 84 | let head_indices = range_start..out.len(); 85 | for (correct_index, head_index) in present.valid_indices().zip(head_indices) { 86 | // head_index points to the value we need to move to correct_index 87 | out.swap(correct_index, head_index); 88 | } 89 | 90 | Ok(()) 91 | } 92 | } 93 | 94 | #[cfg(test)] 95 | mod tests { 96 | use proptest::prelude::*; 97 | 98 | use super::*; 99 | 100 | /// Emits numbers increasing from 0. 101 | struct DummyDecoder; 102 | 103 | impl PrimitiveValueDecoder for DummyDecoder { 104 | fn decode(&mut self, out: &mut [i32]) -> Result<()> { 105 | let values = (0..out.len()).map(|x| x as i32).collect::>(); 106 | out.copy_from_slice(&values); 107 | Ok(()) 108 | } 109 | } 110 | 111 | fn gen_spaced_dummy_decoder_expected(present: &[bool]) -> Vec { 112 | let mut value = 0; 113 | let mut expected = vec![]; 114 | for &is_present in present { 115 | if is_present { 116 | expected.push(value); 117 | value += 1; 118 | } else { 119 | expected.push(-1); 120 | } 121 | } 122 | expected 123 | } 124 | 125 | proptest! 
{ 126 | #[test] 127 | fn decode_spaced_proptest(present: Vec) { 128 | let mut decoder = DummyDecoder; 129 | let mut out = vec![-1; present.len()]; 130 | decoder.decode_spaced(&mut out, &NullBuffer::from(present.clone())).unwrap(); 131 | let expected = gen_spaced_dummy_decoder_expected(&present); 132 | prop_assert_eq!(out, expected); 133 | } 134 | } 135 | 136 | #[test] 137 | fn decode_spaced_edge_cases() { 138 | let mut decoder = DummyDecoder; 139 | let len = 10; 140 | 141 | // all present 142 | let mut out = vec![-1; len]; 143 | let present = vec![true; len]; 144 | let present = NullBuffer::from(present); 145 | decoder.decode_spaced(&mut out, &present).unwrap(); 146 | let expected: Vec<_> = (0..len).map(|i| i as i32).collect(); 147 | assert_eq!(out, expected); 148 | 149 | // all null 150 | let mut out = vec![-1; len]; 151 | let present = vec![false; len]; 152 | let present = NullBuffer::from(present); 153 | decoder.decode_spaced(&mut out, &present).unwrap(); 154 | let expected = vec![-1; len]; 155 | assert_eq!(out, expected); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/encoding/rle.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use crate::error::{OutOfSpecSnafu, Result}; 19 | 20 | use super::PrimitiveValueDecoder; 21 | 22 | mod sealed { 23 | use std::io::Read; 24 | 25 | use crate::encoding::{ 26 | byte::ByteRleDecoder, 27 | integer::{rle_v1::RleV1Decoder, rle_v2::RleV2Decoder, EncodingSign, NInt}, 28 | }; 29 | 30 | pub trait Rle {} 31 | 32 | impl Rle for ByteRleDecoder {} 33 | impl Rle for RleV1Decoder {} 34 | impl Rle for RleV2Decoder {} 35 | } 36 | 37 | /// Generic decoding behaviour for run length encoded values, such as integers (v1 and v2) 38 | /// and bytes. 39 | /// 40 | /// Assumes an internal buffer which acts like a (single headed) queue where values are first 41 | /// decoded into, before being copied out into the output buffer (usually an Arrow array). 42 | pub trait GenericRle { 43 | /// Consume N elements from internal buffer to signify the values having been copied out. 44 | fn advance(&mut self, n: usize); 45 | 46 | /// All values available in internal buffer, respecting the current advancement level. 47 | fn available(&self) -> &[V]; 48 | 49 | /// This should clear the internal buffer and populate it with the next round of decoded 50 | /// values. 51 | // TODO: Have a version that copies directly into the output buffer (e.g. Arrow array). 52 | // Currently we always decode to the internal buffer first, even if we can copy 53 | // directly to the output and skip the middle man. Ideally the internal buffer 54 | // should only be used for leftovers between calls to PrimitiveValueDecoder::decode. 55 | fn decode_batch(&mut self) -> Result<()>; 56 | } 57 | 58 | impl + sealed::Rle> PrimitiveValueDecoder for G { 59 | fn decode(&mut self, out: &mut [V]) -> Result<()> { 60 | let available = self.available(); 61 | // If we have enough leftover to copy, can skip decoding more. 
62 | if available.len() >= out.len() { 63 | out.copy_from_slice(&available[..out.len()]); 64 | self.advance(out.len()); 65 | return Ok(()); 66 | } 67 | 68 | // Otherwise progressively decode and copy over chunks. 69 | let len_to_copy = out.len(); 70 | let mut copied = 0; 71 | while copied < len_to_copy { 72 | if self.available().is_empty() { 73 | self.decode_batch()?; 74 | } 75 | 76 | let copying = self.available().len(); 77 | // At most, we fill to exact length of output buffer (don't overflow). 78 | let copying = copying.min(len_to_copy - copied); 79 | 80 | let out = &mut out[copied..]; 81 | out[..copying].copy_from_slice(&self.available()[..copying]); 82 | 83 | copied += copying; 84 | self.advance(copying); 85 | } 86 | 87 | // We always expect to be able to fill the output buffer; it is up to the 88 | // caller to control that size. 89 | if copied != out.len() { 90 | // TODO: more descriptive error 91 | OutOfSpecSnafu { 92 | msg: "Array length less than expected", 93 | } 94 | .fail() 95 | } else { 96 | Ok(()) 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/encoding/timestamp.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. 
See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::marker::PhantomData; 19 | 20 | use arrow::datatypes::{ArrowTimestampType, TimeUnit}; 21 | use snafu::ensure; 22 | 23 | use crate::{ 24 | encoding::PrimitiveValueDecoder, 25 | error::{DecodeTimestampSnafu, Result}, 26 | }; 27 | 28 | const NANOSECONDS_IN_SECOND: i64 = 1_000_000_000; 29 | 30 | pub struct TimestampDecoder { 31 | base_from_epoch: i64, 32 | data: Box + Send>, 33 | secondary: Box + Send>, 34 | _marker: PhantomData, 35 | } 36 | 37 | impl TimestampDecoder { 38 | pub fn new( 39 | base_from_epoch: i64, 40 | data: Box + Send>, 41 | secondary: Box + Send>, 42 | ) -> Self { 43 | Self { 44 | base_from_epoch, 45 | data, 46 | secondary, 47 | _marker: PhantomData, 48 | } 49 | } 50 | } 51 | 52 | impl PrimitiveValueDecoder for TimestampDecoder { 53 | fn decode(&mut self, out: &mut [T::Native]) -> Result<()> { 54 | // TODO: can probably optimize, reuse buffers? 55 | let mut data = vec![0; out.len()]; 56 | let mut secondary = vec![0; out.len()]; 57 | self.data.decode(&mut data)?; 58 | self.secondary.decode(&mut secondary)?; 59 | for (index, (&seconds_since_orc_base, &nanoseconds)) in 60 | data.iter().zip(secondary.iter()).enumerate() 61 | { 62 | out[index] = 63 | decode_timestamp::(self.base_from_epoch, seconds_since_orc_base, nanoseconds)?; 64 | } 65 | Ok(()) 66 | } 67 | } 68 | 69 | /// Arrow TimestampNanosecond type cannot represent the full datetime range of 70 | /// the ORC Timestamp type, so this iterator provides the ability to decode the 71 | /// raw nanoseconds without restricting it to the Arrow TimestampNanosecond range. 
/// Arrow TimestampNanosecond type cannot represent the full datetime range of
/// the ORC Timestamp type, so this iterator provides the ability to decode the
/// raw nanoseconds without restricting it to the Arrow TimestampNanosecond range.
pub struct TimestampNanosecondAsDecimalDecoder {
    // Offset (in seconds) of the writer's ORC base from the UNIX epoch.
    base_from_epoch: i64,
    // Seconds stream, relative to the ORC base.
    data: Box<dyn PrimitiveValueDecoder<i64> + Send>,
    // Encoded nanoseconds stream (trailing-zero packed, see `decode` below).
    secondary: Box<dyn PrimitiveValueDecoder<i64> + Send>,
}

impl TimestampNanosecondAsDecimalDecoder {
    /// `base_from_epoch` is the ORC base expressed as seconds since the UNIX epoch.
    pub fn new(
        base_from_epoch: i64,
        data: Box<dyn PrimitiveValueDecoder<i64> + Send>,
        secondary: Box<dyn PrimitiveValueDecoder<i64> + Send>,
    ) -> Self {
        Self {
            base_from_epoch,
            data,
            secondary,
        }
    }
}

impl PrimitiveValueDecoder<i128> for TimestampNanosecondAsDecimalDecoder {
    // Decodes each (seconds, nanoseconds) pair into total nanoseconds since
    // the UNIX epoch as an i128, which cannot overflow for any i64 input pair.
    fn decode(&mut self, out: &mut [i128]) -> Result<()> {
        // TODO: can probably optimize, reuse buffers?
        let mut data = vec![0; out.len()];
        let mut secondary = vec![0; out.len()];
        self.data.decode(&mut data)?;
        self.secondary.decode(&mut secondary)?;
        for (index, (&seconds_since_orc_base, &nanoseconds)) in
            data.iter().zip(secondary.iter()).enumerate()
        {
            out[index] =
                decode_timestamp_as_i128(self.base_from_epoch, seconds_since_orc_base, nanoseconds);
        }
        Ok(())
    }
}

// Shared decode of an ORC (seconds, encoded nanoseconds) pair into nanoseconds
// since the UNIX epoch (as i128, since the result may exceed the i64 range).
// Also returns the intermediate seconds/nanoseconds for error messages.
fn decode(base: i64, seconds_since_orc_base: i64, nanoseconds: i64) -> (i128, i64, u64) {
    let data = seconds_since_orc_base;
    // TODO: is this a safe cast? (negative values from a corrupt stream would wrap)
    let mut nanoseconds = nanoseconds as u64;
    // Last 3 bits indicate how many trailing zeros were truncated
    let zeros = nanoseconds & 0x7;
    nanoseconds >>= 3;
    // Multiply by powers of 10 to get back the trailing zeros
    // TODO: would it be more efficient to unroll this? (if LLVM doesn't already do so)
    if zeros != 0 {
        // A non-zero code of N restores N+1 decimal zeros.
        nanoseconds *= 10_u64.pow(zeros as u32 + 1);
    }
    let seconds_since_epoch = data + base;
    // Timestamps below the UNIX epoch with nanoseconds > 999_999 need to be
    // adjusted to have 1 second subtracted due to ORC-763:
    // https://issues.apache.org/jira/browse/ORC-763
    let seconds = if seconds_since_epoch < 0 && nanoseconds > 999_999 {
        seconds_since_epoch - 1
    } else {
        seconds_since_epoch
    };
    // Convert into nanoseconds since epoch, which Arrow uses as native representation
    // of timestamps
    // The timestamp may overflow i64 as ORC encodes them as a pair of (seconds, nanoseconds)
    // while we encode them as a single i64 of nanoseconds in Arrow.
    let nanoseconds_since_epoch =
        (seconds as i128 * NANOSECONDS_IN_SECOND as i128) + (nanoseconds as i128);
    // Returning seconds & nanoseconds only for error message
    // TODO: does the error message really need those details? Can simplify by removing.
    (nanoseconds_since_epoch, seconds, nanoseconds)
}

// Decode a pair into a value of Arrow timestamp unit `T`, erroring on either
// precision loss (remainder in a coarser unit) or i64 overflow.
fn decode_timestamp<T: ArrowTimestampType>(
    base: i64,
    seconds_since_orc_base: i64,
    nanoseconds: i64,
) -> Result<T::Native> {
    let (nanoseconds_since_epoch, seconds, nanoseconds) =
        decode(base, seconds_since_orc_base, nanoseconds);

    // How many nanoseconds one tick of the target unit represents.
    let nanoseconds_in_timeunit = match T::UNIT {
        TimeUnit::Second => 1_000_000_000,
        TimeUnit::Millisecond => 1_000_000,
        TimeUnit::Microsecond => 1_000,
        TimeUnit::Nanosecond => 1,
    };

    // Error if loss of precision
    // TODO: make this configurable (e.g. can succeed but truncate)
    ensure!(
        nanoseconds_since_epoch % nanoseconds_in_timeunit == 0,
        DecodeTimestampSnafu {
            seconds,
            nanoseconds,
            to_time_unit: T::UNIT,
        }
    );

    // Convert to i64 and error if overflow
    let num_since_epoch = (nanoseconds_since_epoch / nanoseconds_in_timeunit)
        .try_into()
        .or_else(|_| {
            DecodeTimestampSnafu {
                seconds,
                nanoseconds,
                to_time_unit: T::UNIT,
            }
            .fail()
        })?;

    Ok(num_since_epoch)
}

// Infallible variant: the i128 result always fits, so only the combined
// nanoseconds value is returned.
fn decode_timestamp_as_i128(base: i64, seconds_since_orc_base: i64, nanoseconds: i64) -> i128 {
    let (nanoseconds_since_epoch, _, _) = decode(base, seconds_since_orc_base, nanoseconds);
    nanoseconds_since_epoch
}
25 | #[inline] 26 | pub fn read_u8(reader: &mut impl Read) -> Result { 27 | let mut byte = [0]; 28 | reader.read_exact(&mut byte).context(error::IoSnafu)?; 29 | Ok(byte[0]) 30 | } 31 | 32 | /// Like [`read_u8()`] but returns `Ok(None)` if reader has reached EOF. 33 | #[inline] 34 | pub fn try_read_u8(reader: &mut impl Read) -> Result> { 35 | let mut byte = [0]; 36 | let length = reader.read(&mut byte).context(error::IoSnafu)?; 37 | Ok((length > 0).then_some(byte[0])) 38 | } 39 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::io; 19 | 20 | use arrow::datatypes::DataType as ArrowDataType; 21 | use arrow::datatypes::TimeUnit; 22 | use arrow::error::ArrowError; 23 | use snafu::prelude::*; 24 | use snafu::Location; 25 | 26 | use crate::proto; 27 | use crate::schema::DataType; 28 | 29 | // TODO: consolidate error types? better to have a smaller set? 
/// All errors that can be produced while reading or writing ORC files.
// Display strings come from the snafu attributes; `location` captures the
// construction site for debugging.
#[derive(Debug, Snafu)]
#[snafu(visibility(pub))]
pub enum OrcError {
    #[snafu(display("Failed to read, source: {}", source))]
    IoError {
        source: std::io::Error,
        #[snafu(implicit)]
        location: Location,
    },

    #[snafu(display("Empty file"))]
    EmptyFile {
        #[snafu(implicit)]
        location: Location,
    },

    /// The file's contents violate the ORC specification.
    #[snafu(display("Out of spec, message: {}", msg))]
    OutOfSpec {
        msg: String,
        #[snafu(implicit)]
        location: Location,
    },

    #[snafu(display("Failed to decode float, source: {}", source))]
    DecodeFloat {
        #[snafu(implicit)]
        location: Location,
        source: std::io::Error,
    },

    /// An ORC timestamp could not be represented in the requested Arrow unit.
    #[snafu(display(
        "Overflow while decoding timestamp (seconds={}, nanoseconds={}) to {:?}",
        seconds,
        nanoseconds,
        to_time_unit,
    ))]
    DecodeTimestamp {
        #[snafu(implicit)]
        location: Location,
        seconds: i64,
        nanoseconds: u64,
        to_time_unit: TimeUnit,
    },

    #[snafu(display("Failed to decode proto, source: {}", source))]
    DecodeProto {
        #[snafu(implicit)]
        location: Location,
        source: prost::DecodeError,
    },

    #[snafu(display("No types found"))]
    NoTypes {
        #[snafu(implicit)]
        location: Location,
    },

    #[snafu(display("unsupported type variant: {}", msg))]
    UnsupportedTypeVariant {
        #[snafu(implicit)]
        location: Location,
        msg: &'static str,
    },

    /// ORC type cannot be mapped onto the Arrow type requested by the caller.
    #[snafu(display(
        "Cannot decode ORC type {:?} into Arrow type {:?}",
        orc_type,
        arrow_type,
    ))]
    MismatchedSchema {
        #[snafu(implicit)]
        location: Location,
        orc_type: DataType,
        arrow_type: ArrowDataType,
    },

    #[snafu(display("Invalid encoding for column '{}': {:?}", name, encoding))]
    InvalidColumnEncoding {
        #[snafu(implicit)]
        location: Location,
        name: String,
        encoding: proto::column_encoding::Kind,
    },

    #[snafu(display("Failed to convert to record batch: {}", source))]
    ConvertRecordBatch {
        #[snafu(implicit)]
        location: Location,
        source: ArrowError,
    },

    #[snafu(display("Varint being decoded is too large"))]
    VarintTooLarge {
        #[snafu(implicit)]
        location: Location,
    },

    #[snafu(display("unexpected: {}", msg))]
    Unexpected {
        #[snafu(implicit)]
        location: Location,
        msg: String,
    },

    #[snafu(display("Failed to build zstd decoder: {}", source))]
    BuildZstdDecoder {
        #[snafu(implicit)]
        location: Location,
        source: io::Error,
    },

    #[snafu(display("Failed to build snappy decoder: {}", source))]
    BuildSnappyDecoder {
        #[snafu(implicit)]
        location: Location,
        source: snap::Error,
    },

    #[snafu(display("Failed to build lzo decoder: {}", source))]
    BuildLzoDecoder {
        #[snafu(implicit)]
        location: Location,
        source: lzokay_native::Error,
    },

    #[snafu(display("Failed to build lz4 decoder: {}", source))]
    BuildLz4Decoder {
        #[snafu(implicit)]
        location: Location,
        source: lz4_flex::block::DecompressError,
    },

    #[snafu(display("Arrow error: {}", source))]
    Arrow {
        source: arrow::error::ArrowError,
        #[snafu(implicit)]
        location: Location,
    },
}

/// Crate-wide result alias defaulting the error to [`OrcError`].
pub type Result<T, E = OrcError> = std::result::Result<T, E>;

// Allows `?` conversion into ArrowError at the Arrow-facing API boundary.
impl From<OrcError> for ArrowError {
    fn from(value: OrcError) -> Self {
        ArrowError::ExternalError(Box::new(value))
    }
}
See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | //! A native Rust implementation of the [Apache ORC](https://orc.apache.org) file format, 19 | //! providing API's to read data into [Apache Arrow](https://arrow.apache.org) in-memory arrays. 20 | //! 21 | //! # Example read usage 22 | //! 23 | //! ```no_run 24 | //! # use std::fs::File; 25 | //! # use orc_rust::arrow_reader::ArrowReaderBuilder; 26 | //! let file = File::open("/path/to/file.orc").unwrap(); 27 | //! let reader = ArrowReaderBuilder::try_new(file).unwrap().build(); 28 | //! let record_batches = reader.collect::, _>>().unwrap(); 29 | //! ``` 30 | //! 31 | //! # Example write usage 32 | //! 33 | //! ```no_run 34 | //! # use std::fs::File; 35 | //! # use arrow::array::RecordBatch; 36 | //! # use orc_rust::arrow_writer::ArrowWriterBuilder; 37 | //! # fn get_record_batch() -> RecordBatch { 38 | //! # unimplemented!() 39 | //! # } 40 | //! let file = File::create("/path/to/file.orc").unwrap(); 41 | //! let batch = get_record_batch(); 42 | //! let mut writer = ArrowWriterBuilder::new(file, batch.schema()) 43 | //! .try_build() 44 | //! .unwrap(); 45 | //! writer.write(&batch).unwrap(); 46 | //! writer.close().unwrap(); 47 | //! 
``` 48 | 49 | pub mod array_decoder; 50 | pub mod arrow_reader; 51 | pub mod arrow_writer; 52 | #[cfg(feature = "async")] 53 | pub mod async_arrow_reader; 54 | mod column; 55 | pub mod compression; 56 | mod encoding; 57 | pub mod error; 58 | mod memory; 59 | pub mod projection; 60 | mod proto; 61 | pub mod reader; 62 | pub mod schema; 63 | pub mod statistics; 64 | pub mod stripe; 65 | mod writer; 66 | 67 | pub use arrow_reader::{ArrowReader, ArrowReaderBuilder}; 68 | pub use arrow_writer::{ArrowWriter, ArrowWriterBuilder}; 69 | #[cfg(feature = "async")] 70 | pub use async_arrow_reader::ArrowStreamReader; 71 | -------------------------------------------------------------------------------- /src/memory.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | /// Estimating memory usage is important when writing files, as we finish 19 | /// writing a stripe according to a set size threshold. 20 | pub trait EstimateMemory { 21 | /// Approximate current memory usage in bytes. 
use crate::schema::RootDataType;

// TODO: be able to nest project (project columns within struct type)

/// Specifies which column indices to project from an ORC type.
#[derive(Debug, Clone)]
pub struct ProjectionMask {
    /// Indices of column in ORC type, can refer to nested types
    /// (not only root level columns)
    indices: Option<Vec<usize>>,
}

impl ProjectionMask {
    /// Project all columns.
    pub fn all() -> Self {
        Self { indices: None }
    }

    /// Project only specific columns from the root type by column index.
    ///
    /// The root (index 0) is always projected; each matched child contributes
    /// all of its nested column indices.
    pub fn roots(root_data_type: &RootDataType, indices: impl IntoIterator<Item = usize>) -> Self {
        // TODO: return error if column index not found?
        let input_indices = indices.into_iter().collect::<Vec<_>>();
        // By default always project root
        let mut indices = vec![0];
        root_data_type
            .children()
            .iter()
            .filter(|col| input_indices.contains(&col.data_type().column_index()))
            .for_each(|col| indices.extend(col.data_type().all_indices()));
        Self {
            indices: Some(indices),
        }
    }

    /// Project only specific columns from the root type by column name.
    ///
    /// The root (index 0) is always projected; each matched child contributes
    /// all of its nested column indices.
    pub fn named_roots<T>(root_data_type: &RootDataType, names: &[T]) -> Self
    where
        T: AsRef<str>,
    {
        // TODO: return error if column name not found?
        // By default always project root
        let mut indices = vec![0];
        let names = names.iter().map(AsRef::as_ref).collect::<Vec<_>>();
        root_data_type
            .children()
            .iter()
            .filter(|col| names.contains(&col.name()))
            .for_each(|col| indices.extend(col.data_type().all_indices()));
        Self {
            indices: Some(indices),
        }
    }

    /// Check whether the ORC column at `index` is projected.
    /// A mask of `None` (from [`Self::all`]) projects every column.
    pub fn is_index_projected(&self, index: usize) -> bool {
        match &self.indices {
            Some(indices) => indices.contains(&index),
            None => true,
        }
    }
}
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | pub mod metadata; 19 | 20 | use std::fs::File; 21 | use std::io::{BufReader, Read, Seek, SeekFrom}; 22 | 23 | use bytes::{Buf, Bytes}; 24 | 25 | /// Primary source used for reading required bytes for operations. 26 | #[allow(clippy::len_without_is_empty)] 27 | pub trait ChunkReader { 28 | type T: Read; 29 | 30 | /// Get total length of bytes. Useful for parsing the metadata located at 31 | /// the end of the file. 32 | // TODO: this is only used for file tail, so replace with load_metadata? 33 | fn len(&self) -> u64; 34 | 35 | /// Get a reader starting at a specific offset. 36 | fn get_read(&self, offset_from_start: u64) -> std::io::Result; 37 | 38 | /// Read bytes from an offset with specific length. 39 | fn get_bytes(&self, offset_from_start: u64, length: u64) -> std::io::Result { 40 | let mut bytes = vec![0; length as usize]; 41 | self.get_read(offset_from_start)? 42 | .take(length) 43 | .read_exact(&mut bytes)?; 44 | Ok(bytes.into()) 45 | } 46 | } 47 | 48 | impl ChunkReader for File { 49 | type T = BufReader; 50 | 51 | fn len(&self) -> u64 { 52 | self.metadata().map(|m| m.len()).unwrap_or(0u64) 53 | } 54 | 55 | /// Care needs to be taken when using this simultaneously as underlying 56 | /// file descriptor is the same and will be affected by other invocations. 57 | /// 58 | /// See [`File::try_clone()`] for more details. 
59 | fn get_read(&self, offset_from_start: u64) -> std::io::Result { 60 | let mut reader = self.try_clone()?; 61 | reader.seek(SeekFrom::Start(offset_from_start))?; 62 | Ok(BufReader::new(self.try_clone()?)) 63 | } 64 | } 65 | 66 | impl ChunkReader for Bytes { 67 | type T = bytes::buf::Reader; 68 | 69 | fn len(&self) -> u64 { 70 | self.len() as u64 71 | } 72 | 73 | fn get_read(&self, offset_from_start: u64) -> std::io::Result { 74 | Ok(self.slice(offset_from_start as usize..).reader()) 75 | } 76 | } 77 | 78 | #[cfg(feature = "async")] 79 | mod async_chunk_reader { 80 | use super::*; 81 | 82 | use futures_util::future::BoxFuture; 83 | use futures_util::FutureExt; 84 | use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; 85 | 86 | #[allow(clippy::len_without_is_empty)] 87 | pub trait AsyncChunkReader: Send { 88 | // TODO: this is only used for file tail, so replace with load_metadata? 89 | fn len(&mut self) -> BoxFuture<'_, std::io::Result>; 90 | 91 | fn get_bytes( 92 | &mut self, 93 | offset_from_start: u64, 94 | length: u64, 95 | ) -> BoxFuture<'_, std::io::Result>; 96 | } 97 | 98 | impl AsyncChunkReader for T { 99 | fn len(&mut self) -> BoxFuture<'_, std::io::Result> { 100 | async move { self.seek(SeekFrom::End(0)).await }.boxed() 101 | } 102 | 103 | fn get_bytes( 104 | &mut self, 105 | offset_from_start: u64, 106 | length: u64, 107 | ) -> BoxFuture<'_, std::io::Result> { 108 | async move { 109 | self.seek(SeekFrom::Start(offset_from_start)).await?; 110 | let mut buffer = vec![0; length as usize]; 111 | self.read_exact(&mut buffer).await?; 112 | Ok(buffer.into()) 113 | } 114 | .boxed() 115 | } 116 | } 117 | 118 | impl AsyncChunkReader for Box { 119 | fn len(&mut self) -> BoxFuture<'_, std::io::Result> { 120 | self.as_mut().len() 121 | } 122 | 123 | fn get_bytes( 124 | &mut self, 125 | offset_from_start: u64, 126 | length: u64, 127 | ) -> BoxFuture<'_, std::io::Result> { 128 | self.as_mut().get_bytes(offset_from_start, length) 129 | } 130 | } 131 | } 
#[cfg(feature = "async")]
pub use async_chunk_reader::AsyncChunkReader;

#[cfg(all(feature = "async", feature = "opendal"))]
mod async_opendal_reader {
    use crate::reader::AsyncChunkReader;
    use bytes::Bytes;
    use futures_util::future::BoxFuture;
    use opendal::Operator;
    use std::sync::Arc;

    /// AsyncOpendalReader provides native support for [`opendal`]
    ///
    /// ```
    /// use opendal::Operator;
    /// use std::io::Result;
    /// use orc_rust::reader::AsyncOpendalReader;
    /// use orc_rust::reader::AsyncChunkReader;
    /// use opendal::services::MemoryConfig;
    ///
    /// # async fn test() -> Result<()> {
    /// let op = Operator::from_config(MemoryConfig::default())?.finish();
    /// op.write("test", "Hello, world!").await?;
    ///
    /// let mut reader = AsyncOpendalReader::new(op, "test");
    /// let len = reader.len().await?;
    /// let data = reader.get_bytes(0, len).await?;
    /// # Ok(())
    /// # }
    /// ```
    pub struct AsyncOpendalReader {
        op: Operator,
        // Arc<String> so the path can be cheaply cloned into returned futures.
        path: Arc<String>,
    }

    impl AsyncOpendalReader {
        /// Create a new async opendal reader.
        pub fn new(op: Operator, path: &str) -> Self {
            Self {
                op,
                path: Arc::new(path.to_string()),
            }
        }
    }

    impl AsyncChunkReader for AsyncOpendalReader {
        // Length comes from the object's stat metadata.
        fn len(&mut self) -> BoxFuture<'_, std::io::Result<u64>> {
            let path = self.path.clone();
            Box::pin(async move {
                let meta = self.op.stat(&path).await?;
                Ok(meta.content_length())
            })
        }

        // Ranged read of [offset_from_start, offset_from_start + length).
        fn get_bytes(
            &mut self,
            offset_from_start: u64,
            length: u64,
        ) -> BoxFuture<'_, std::io::Result<Bytes>> {
            let path = self.path.clone();

            Box::pin(async move {
                let reader = self
                    .op
                    .read_with(&path)
                    .range(offset_from_start..offset_from_start + length)
                    .await?;
                Ok(reader.to_bytes())
            })
        }
    }
}

#[cfg(all(feature = "async", feature = "opendal"))]
pub use async_opendal_reader::AsyncOpendalReader;
17 | 18 | use crate::{error, proto}; 19 | 20 | /// Contains statistics for a specific column, for the entire file 21 | /// or for a specific stripe. 22 | #[derive(Debug, Clone)] 23 | pub struct ColumnStatistics { 24 | number_of_values: u64, 25 | /// Use aid in 'IS NULL' predicates 26 | has_null: bool, 27 | type_statistics: Option, 28 | } 29 | 30 | impl ColumnStatistics { 31 | pub fn number_of_values(&self) -> u64 { 32 | self.number_of_values 33 | } 34 | 35 | pub fn has_null(&self) -> bool { 36 | self.has_null 37 | } 38 | 39 | pub fn type_statistics(&self) -> Option<&TypeStatistics> { 40 | self.type_statistics.as_ref() 41 | } 42 | } 43 | 44 | #[derive(Debug, Clone)] 45 | pub enum TypeStatistics { 46 | /// For TinyInt, SmallInt, Int and BigInt 47 | Integer { 48 | min: i64, 49 | max: i64, 50 | /// If sum overflows then recorded as None 51 | sum: Option, 52 | }, 53 | /// For Float and Double 54 | Double { 55 | min: f64, 56 | max: f64, 57 | /// If sum overflows then recorded as None 58 | sum: Option, 59 | }, 60 | String { 61 | min: String, 62 | max: String, 63 | /// Total length of all strings 64 | sum: i64, 65 | }, 66 | /// For Boolean 67 | Bucket { true_count: u64 }, 68 | Decimal { 69 | // TODO: use our own decimal type? 
70 | min: String, 71 | max: String, 72 | sum: String, 73 | }, 74 | Date { 75 | /// Days since epoch 76 | min: i32, 77 | max: i32, 78 | }, 79 | Binary { 80 | // Total number of bytes across all values 81 | sum: i64, 82 | }, 83 | Timestamp { 84 | /// Milliseconds since epoch 85 | /// These were used before ORC-135 86 | /// Where local timezone offset was included 87 | min: i64, 88 | max: i64, 89 | /// Milliseconds since UNIX epoch 90 | min_utc: i64, 91 | max_utc: i64, 92 | }, 93 | Collection { 94 | min_children: u64, 95 | max_children: u64, 96 | total_children: u64, 97 | }, 98 | } 99 | 100 | impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics { 101 | type Error = error::OrcError; 102 | 103 | fn try_from(value: &proto::ColumnStatistics) -> Result { 104 | let type_statistics = if let Some(stats) = &value.int_statistics { 105 | Some(TypeStatistics::Integer { 106 | min: stats.minimum(), 107 | max: stats.maximum(), 108 | sum: stats.sum, 109 | }) 110 | } else if let Some(stats) = &value.double_statistics { 111 | Some(TypeStatistics::Double { 112 | min: stats.minimum(), 113 | max: stats.maximum(), 114 | sum: stats.sum, 115 | }) 116 | } else if let Some(stats) = &value.string_statistics { 117 | Some(TypeStatistics::String { 118 | min: stats.minimum().to_owned(), 119 | max: stats.maximum().to_owned(), 120 | sum: stats.sum(), 121 | }) 122 | } else if let Some(stats) = &value.bucket_statistics { 123 | // TODO: false count? 
124 | Some(TypeStatistics::Bucket { 125 | true_count: stats.count[0], // TODO: safety check this 126 | }) 127 | } else if let Some(stats) = &value.decimal_statistics { 128 | Some(TypeStatistics::Decimal { 129 | min: stats.minimum().to_owned(), 130 | max: stats.maximum().to_owned(), 131 | sum: stats.sum().to_owned(), 132 | }) 133 | } else if let Some(stats) = &value.date_statistics { 134 | Some(TypeStatistics::Date { 135 | min: stats.minimum(), 136 | max: stats.maximum(), 137 | }) 138 | } else if let Some(stats) = &value.binary_statistics { 139 | Some(TypeStatistics::Binary { sum: stats.sum() }) 140 | } else if let Some(stats) = &value.timestamp_statistics { 141 | Some(TypeStatistics::Timestamp { 142 | min: stats.minimum(), 143 | max: stats.maximum(), 144 | min_utc: stats.minimum_utc(), 145 | max_utc: stats.maximum_utc(), 146 | }) 147 | } else { 148 | value 149 | .collection_statistics 150 | .as_ref() 151 | .map(|stats| TypeStatistics::Collection { 152 | min_children: stats.min_children(), 153 | max_children: stats.max_children(), 154 | total_children: stats.total_children(), 155 | }) 156 | }; 157 | Ok(Self { 158 | number_of_values: value.number_of_values(), 159 | has_null: value.has_null(), 160 | type_statistics, 161 | }) 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/writer/mod.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use std::fmt::Debug; 19 | 20 | use bytes::Bytes; 21 | 22 | use crate::proto; 23 | 24 | pub mod column; 25 | pub mod stripe; 26 | 27 | #[derive(Debug, Clone, Copy, Eq, PartialEq)] 28 | pub enum StreamType { 29 | Present, 30 | Data, 31 | Length, 32 | DictionaryData, 33 | Secondary, 34 | } 35 | 36 | impl From for proto::stream::Kind { 37 | fn from(value: StreamType) -> Self { 38 | match value { 39 | StreamType::Present => proto::stream::Kind::Present, 40 | StreamType::Data => proto::stream::Kind::Data, 41 | StreamType::Length => proto::stream::Kind::Length, 42 | StreamType::DictionaryData => proto::stream::Kind::DictionaryData, 43 | StreamType::Secondary => proto::stream::Kind::Secondary, 44 | } 45 | } 46 | } 47 | 48 | #[derive(Debug, Clone)] 49 | pub struct Stream { 50 | kind: StreamType, 51 | bytes: Bytes, 52 | } 53 | 54 | impl Stream { 55 | pub fn into_parts(self) -> (StreamType, Bytes) { 56 | (self.kind, self.bytes) 57 | } 58 | } 59 | 60 | #[derive(Debug, Clone, Copy, Eq, PartialEq)] 61 | pub enum ColumnEncoding { 62 | Direct, 63 | DirectV2, 64 | Dictionary { size: usize }, 65 | DictionaryV2 { size: usize }, 66 | } 67 | 68 | impl From<&ColumnEncoding> for proto::ColumnEncoding { 69 | fn from(value: &ColumnEncoding) -> Self { 70 | match value { 71 | ColumnEncoding::Direct => proto::ColumnEncoding { 72 | kind: Some(proto::column_encoding::Kind::Direct.into()), 73 | dictionary_size: None, 74 | bloom_encoding: None, 75 | }, 76 | ColumnEncoding::DirectV2 => proto::ColumnEncoding { 77 | kind: 
Some(proto::column_encoding::Kind::DirectV2.into()), 78 | dictionary_size: None, 79 | bloom_encoding: None, 80 | }, 81 | ColumnEncoding::Dictionary { size } => proto::ColumnEncoding { 82 | kind: Some(proto::column_encoding::Kind::Dictionary.into()), 83 | dictionary_size: Some(*size as u32), 84 | bloom_encoding: None, 85 | }, 86 | ColumnEncoding::DictionaryV2 { size } => proto::ColumnEncoding { 87 | kind: Some(proto::column_encoding::Kind::DictionaryV2.into()), 88 | dictionary_size: Some(*size as u32), 89 | bloom_encoding: None, 90 | }, 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /taplo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | ## https://taplo.tamasfe.dev/configuration/file.html 19 | 20 | include = ["**/Cargo.toml"] 21 | 22 | [formatting] 23 | # Align consecutive entries vertically. 24 | align_entries = false 25 | # Append trailing commas for multi-line arrays. 26 | array_trailing_comma = true 27 | # Expand arrays to multiple lines that exceed the maximum column width. 
28 | array_auto_expand = true 29 | # Collapse arrays that don't exceed the maximum column width and don't contain comments. 30 | array_auto_collapse = false 31 | # Omit white space padding from single-line arrays 32 | compact_arrays = true 33 | # Omit white space padding from the start and end of inline tables. 34 | compact_inline_tables = false 35 | # Maximum column width in characters, affects array expansion and collapse, this doesn't take whitespace into account. 36 | # Note that this is not set in stone, and works on a best-effort basis. 37 | column_width = 120 38 | # Indent based on tables and arrays of tables and their subtables, subtables out of order are not indented. 39 | indent_tables = false 40 | # The substring that is used for indentation, should be tabs or spaces (but technically can be anything). 41 | indent_string = ' ' 42 | # Add trailing newline at the end of the file if not present. 43 | trailing_newline = true 44 | # Alphabetically reorder keys that are not separated by empty lines. 45 | reorder_keys = false 46 | # Maximum amount of allowed consecutive blank lines. This does not affect the whitespace at the end of the document, as it is always stripped. 47 | allowed_blank_lines = 1 48 | # Use CRLF for line endings. 
49 | crlf = false 50 | 51 | [[rule]] 52 | keys = ["dependencies", "dev-dependencies", "build-dependencies"] 53 | formatting = { reorder_keys = true } 54 | -------------------------------------------------------------------------------- /tests/basic/data/alltypes.lz4.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/alltypes.lz4.orc -------------------------------------------------------------------------------- /tests/basic/data/alltypes.lzo.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/alltypes.lzo.orc -------------------------------------------------------------------------------- /tests/basic/data/alltypes.none.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/alltypes.none.orc -------------------------------------------------------------------------------- /tests/basic/data/alltypes.snappy.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/alltypes.snappy.orc -------------------------------------------------------------------------------- /tests/basic/data/alltypes.zlib.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/alltypes.zlib.orc -------------------------------------------------------------------------------- /tests/basic/data/alltypes.zstd.orc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/alltypes.zstd.orc -------------------------------------------------------------------------------- /tests/basic/data/demo-11-zlib.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/demo-11-zlib.orc -------------------------------------------------------------------------------- /tests/basic/data/demo-12-zlib.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/demo-12-zlib.orc -------------------------------------------------------------------------------- /tests/basic/data/f32_long_long_gzip.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/f32_long_long_gzip.orc -------------------------------------------------------------------------------- /tests/basic/data/long_bool.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/long_bool.orc -------------------------------------------------------------------------------- /tests/basic/data/long_bool_gzip.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/long_bool_gzip.orc -------------------------------------------------------------------------------- 
/tests/basic/data/nested_array.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/nested_array.orc -------------------------------------------------------------------------------- /tests/basic/data/nested_array_float.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/nested_array_float.orc -------------------------------------------------------------------------------- /tests/basic/data/nested_array_struct.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/nested_array_struct.orc -------------------------------------------------------------------------------- /tests/basic/data/nested_map.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/nested_map.orc -------------------------------------------------------------------------------- /tests/basic/data/nested_map_struct.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/nested_map_struct.orc -------------------------------------------------------------------------------- /tests/basic/data/nested_struct.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/nested_struct.orc 
-------------------------------------------------------------------------------- /tests/basic/data/overflowing_timestamps.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/overflowing_timestamps.orc -------------------------------------------------------------------------------- /tests/basic/data/pyarrow_timestamps.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/pyarrow_timestamps.orc -------------------------------------------------------------------------------- /tests/basic/data/string_dict.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/string_dict.orc -------------------------------------------------------------------------------- /tests/basic/data/string_dict_gzip.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/string_dict_gzip.orc -------------------------------------------------------------------------------- /tests/basic/data/string_long.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/string_long.orc -------------------------------------------------------------------------------- /tests/basic/data/string_long_long.orc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/string_long_long.orc -------------------------------------------------------------------------------- /tests/basic/data/string_long_long_gzip.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/string_long_long_gzip.orc -------------------------------------------------------------------------------- /tests/basic/data/test.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/basic/data/test.orc -------------------------------------------------------------------------------- /tests/basic/misc.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | pub const LONG_STRING_DICT_EXPECTED: &str = r#"+------+ 19 | | dict | 20 | +------+ 21 | | abc | 22 | | efgh | 23 | | abc | 24 | | efgh | 25 | | abc | 26 | | efgh | 27 | | abc | 28 | | efgh | 29 | | abc | 30 | | efgh | 31 | | abc | 32 | | efgh | 33 | | abc | 34 | | efgh | 35 | | abc | 36 | | efgh | 37 | | abc | 38 | | efgh | 39 | | abc | 40 | | efgh | 41 | | abc | 42 | | efgh | 43 | | abc | 44 | | efgh | 45 | | abc | 46 | | efgh | 47 | | abc | 48 | | efgh | 49 | | abc | 50 | | efgh | 51 | | abc | 52 | | efgh | 53 | | abc | 54 | | efgh | 55 | | abc | 56 | | efgh | 57 | | abc | 58 | | efgh | 59 | | abc | 60 | | efgh | 61 | | abc | 62 | | efgh | 63 | | abc | 64 | | efgh | 65 | | abc | 66 | | efgh | 67 | | abc | 68 | | efgh | 69 | | abc | 70 | | efgh | 71 | | abc | 72 | | efgh | 73 | | abc | 74 | | efgh | 75 | | abc | 76 | | efgh | 77 | | abc | 78 | | efgh | 79 | | abc | 80 | | efgh | 81 | | abc | 82 | | efgh | 83 | | abc | 84 | | efgh | 85 | +------+"#; 86 | 87 | pub const LONG_STRING_EXPECTED: &str = r#"+------+ 88 | | dict | 89 | +------+ 90 | | abcd | 91 | | efgh | 92 | | abcd | 93 | | efgh | 94 | | abcd | 95 | | efgh | 96 | | abcd | 97 | | efgh | 98 | | abcd | 99 | | efgh | 100 | | abcd | 101 | | efgh | 102 | | abcd | 103 | | efgh | 104 | | abcd | 105 | | efgh | 106 | | abcd | 107 | | efgh | 108 | | abcd | 109 | | efgh | 110 | | abcd | 111 | | efgh | 112 | | abcd | 113 | | efgh | 114 | | abcd | 115 | | efgh | 116 | | abcd | 117 | | efgh | 118 | | abcd | 119 | | efgh | 120 | | abcd | 121 | | efgh | 122 | | abcd | 123 | | efgh | 124 | | abcd | 125 | | efgh | 126 | | abcd | 127 | | efgh | 128 | | abcd | 129 | | efgh | 130 | | abcd | 131 | | efgh | 132 | | abcd | 133 | | efgh | 134 | | abcd | 135 | | efgh | 136 | | abcd | 137 | | efgh | 138 | | abcd | 139 | | efgh | 140 | | abcd | 141 | | efgh | 142 | | abcd | 143 | | efgh | 144 | | abcd | 145 | | efgh | 146 | | abcd | 147 | | efgh | 148 | | abcd | 149 | | efgh | 150 | | abcd | 151 | | efgh | 152 | | abcd | 
153 | | efgh | 154 | +------+"#; 155 | 156 | pub const LONG_BOOL_EXPECTED: &str = r#"+------+ 157 | | long | 158 | +------+ 159 | | true | 160 | | true | 161 | | true | 162 | | true | 163 | | true | 164 | | true | 165 | | true | 166 | | true | 167 | | true | 168 | | true | 169 | | true | 170 | | true | 171 | | true | 172 | | true | 173 | | true | 174 | | true | 175 | | true | 176 | | true | 177 | | true | 178 | | true | 179 | | true | 180 | | true | 181 | | true | 182 | | true | 183 | | true | 184 | | true | 185 | | true | 186 | | true | 187 | | true | 188 | | true | 189 | | true | 190 | | true | 191 | +------+"#; 192 | -------------------------------------------------------------------------------- /tests/integration/data/README.md: -------------------------------------------------------------------------------- 1 | These files are imported from [Apache ORC's examples](https://github.com/apache/orc/tree/207085de3722054485e685811f8e5f2e11aa4deb/examples) 2 | -------------------------------------------------------------------------------- /tests/integration/data/TestCSVFileImport.test10rows.csv: -------------------------------------------------------------------------------- 1 | 0,a,0.0 2 | 1,b,1.1 3 | 2,c,2.2 4 | 3,d, 5 | 4,,4.4 6 | ,f,5.5 7 | ,, 8 | 7,h,7.7 9 | 8,i,8.8 10 | 9,j,9.9 -------------------------------------------------------------------------------- /tests/integration/data/TestCSVFileImport.testTimezoneOption.csv: -------------------------------------------------------------------------------- 1 | 2021-12-27 00:00:00.000 -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.columnProjection.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.columnProjection.orc 
-------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.emptyFile.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.emptyFile.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.metaData.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.metaData.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.test1.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.test1.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testDate1900.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testDate1900.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testDate2038.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testDate2038.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testMemoryManagementV11.orc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testMemoryManagementV11.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testMemoryManagementV12.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testMemoryManagementV12.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testPredicatePushdown.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testPredicatePushdown.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc -------------------------------------------------------------------------------- 
/tests/integration/data/TestOrcFile.testSeek.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testSeek.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testSnappy.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testSnappy.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testStringAndBinaryStatistics.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testStringAndBinaryStatistics.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testStripeLevelStats.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testStripeLevelStats.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testTimestamp.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testTimestamp.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testUnionAndTimestamp.orc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testUnionAndTimestamp.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testWithoutCompressionBlockSize.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testWithoutCompressionBlockSize.orc -------------------------------------------------------------------------------- /tests/integration/data/TestOrcFile.testWithoutIndex.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestOrcFile.testWithoutIndex.orc -------------------------------------------------------------------------------- /tests/integration/data/TestStringDictionary.testRowIndex.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestStringDictionary.testRowIndex.orc -------------------------------------------------------------------------------- /tests/integration/data/TestVectorOrcFile.testLz4.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestVectorOrcFile.testLz4.orc -------------------------------------------------------------------------------- /tests/integration/data/TestVectorOrcFile.testLzo.orc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestVectorOrcFile.testLzo.orc -------------------------------------------------------------------------------- /tests/integration/data/TestVectorOrcFile.testZstd.0.12.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/TestVectorOrcFile.testZstd.0.12.orc -------------------------------------------------------------------------------- /tests/integration/data/bad_bloom_filter_1.6.0.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/bad_bloom_filter_1.6.0.orc -------------------------------------------------------------------------------- /tests/integration/data/bad_bloom_filter_1.6.11.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/bad_bloom_filter_1.6.11.orc -------------------------------------------------------------------------------- /tests/integration/data/complextypes_iceberg.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/complextypes_iceberg.orc -------------------------------------------------------------------------------- /tests/integration/data/corrupt/missing_blob_stream_in_string_dict.orc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/corrupt/missing_blob_stream_in_string_dict.orc -------------------------------------------------------------------------------- /tests/integration/data/corrupt/missing_length_stream_in_string_dict.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/corrupt/missing_length_stream_in_string_dict.orc -------------------------------------------------------------------------------- /tests/integration/data/corrupt/negative_dict_entry_lengths.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/corrupt/negative_dict_entry_lengths.orc -------------------------------------------------------------------------------- /tests/integration/data/corrupt/stripe_footer_bad_column_encodings.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/corrupt/stripe_footer_bad_column_encodings.orc -------------------------------------------------------------------------------- /tests/integration/data/decimal.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/decimal.orc -------------------------------------------------------------------------------- /tests/integration/data/decimal64_v2.orc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/decimal64_v2.orc -------------------------------------------------------------------------------- /tests/integration/data/decimal64_v2_cplusplus.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/decimal64_v2_cplusplus.orc -------------------------------------------------------------------------------- /tests/integration/data/demo-11-none.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/demo-11-none.orc -------------------------------------------------------------------------------- /tests/integration/data/demo-11-zlib.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/demo-11-zlib.orc -------------------------------------------------------------------------------- /tests/integration/data/demo-12-zlib.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/demo-12-zlib.orc -------------------------------------------------------------------------------- /tests/integration/data/encrypted/kms.keystore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/encrypted/kms.keystore -------------------------------------------------------------------------------- 
/tests/integration/data/encrypted/sample1.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/encrypted/sample1.orc -------------------------------------------------------------------------------- /tests/integration/data/encrypted/sample2.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/encrypted/sample2.orc -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.columnProjection.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.columnProjection.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.emptyFile.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.emptyFile.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.metaData.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.metaData.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.test1.jsn.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.test1.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testDate1900.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testDate1900.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testDate2038.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testDate2038.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testMemoryManagementV11.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testMemoryManagementV11.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testMemoryManagementV12.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testMemoryManagementV12.jsn.gz -------------------------------------------------------------------------------- 
/tests/integration/data/expected/TestOrcFile.testPredicatePushdown.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testPredicatePushdown.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testSeek.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testSeek.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testSnappy.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testSnappy.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testStringAndBinaryStatistics.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testStringAndBinaryStatistics.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testStripeLevelStats.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testStripeLevelStats.jsn.gz 
-------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testTimestamp.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testTimestamp.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testUnionAndTimestamp.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testUnionAndTimestamp.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestOrcFile.testWithoutIndex.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestOrcFile.testWithoutIndex.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestStringDictionary.testRowIndex.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestStringDictionary.testRowIndex.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestVectorOrcFile.testLz4.jsn.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestVectorOrcFile.testLz4.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/TestVectorOrcFile.testLzo.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/TestVectorOrcFile.testLzo.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/decimal.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/decimal.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/demo-12-zlib.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/demo-12-zlib.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/nulls-at-end-snappy.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/nulls-at-end-snappy.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/orc-file-11-format.jsn.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/orc-file-11-format.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/orc_index_int_string.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/orc_index_int_string.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/orc_split_elim.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/orc_split_elim.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/orc_split_elim_cpp.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/orc_split_elim_cpp.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/orc_split_elim_new.jsn.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/orc_split_elim_new.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected/over1k_bloom.jsn.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected/over1k_bloom.jsn.gz -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.columnProjection.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.columnProjection.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.emptyFile.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.emptyFile.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.metaData.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.metaData.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.test1.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.test1.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testDate1900.feather: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testDate1900.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testDate2038.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testDate2038.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV11.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV11.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV12.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV12.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testPredicatePushdown.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testPredicatePushdown.feather 
-------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testSeek.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testSeek.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testSnappy.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testSnappy.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testStringAndBinaryStatistics.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testStringAndBinaryStatistics.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testStripeLevelStats.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testStripeLevelStats.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testUnionAndTimestamp.feather: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testUnionAndTimestamp.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestOrcFile.testWithoutIndex.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestOrcFile.testWithoutIndex.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestStringDictionary.testRowIndex.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestStringDictionary.testRowIndex.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestVectorOrcFile.testLz4.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestVectorOrcFile.testLz4.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/TestVectorOrcFile.testLzo.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/TestVectorOrcFile.testLzo.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/decimal.feather: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/decimal.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/demo-12-zlib.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/demo-12-zlib.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/nulls-at-end-snappy.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/nulls-at-end-snappy.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/orc-file-11-format.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/orc-file-11-format.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/orc_index_int_string.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/orc_index_int_string.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/orc_split_elim.feather: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/orc_split_elim.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/orc_split_elim_cpp.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/orc_split_elim_cpp.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/orc_split_elim_new.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/orc_split_elim_new.feather -------------------------------------------------------------------------------- /tests/integration/data/expected_arrow/over1k_bloom.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/expected_arrow/over1k_bloom.feather -------------------------------------------------------------------------------- /tests/integration/data/nulls-at-end-snappy.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/nulls-at-end-snappy.orc -------------------------------------------------------------------------------- /tests/integration/data/orc-file-11-format.orc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/orc-file-11-format.orc -------------------------------------------------------------------------------- /tests/integration/data/orc_index_int_string.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/orc_index_int_string.orc -------------------------------------------------------------------------------- /tests/integration/data/orc_no_format.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/orc_no_format.orc -------------------------------------------------------------------------------- /tests/integration/data/orc_split_elim.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/orc_split_elim.orc -------------------------------------------------------------------------------- /tests/integration/data/orc_split_elim_cpp.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/orc_split_elim_cpp.orc -------------------------------------------------------------------------------- /tests/integration/data/orc_split_elim_new.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/orc_split_elim_new.orc 
-------------------------------------------------------------------------------- /tests/integration/data/over1k_bloom.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/over1k_bloom.orc -------------------------------------------------------------------------------- /tests/integration/data/version1999.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/version1999.orc -------------------------------------------------------------------------------- /tests/integration/data/zero.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datafusion-contrib/orc-rust/3134cab581a8e91b942d6a23aca2916ea965f6bb/tests/integration/data/zero.orc -------------------------------------------------------------------------------- /typos.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [default.extend-words] 19 | ue = "ue" 20 | datas = "datas" 21 | 22 | [files] 23 | extend-exclude = [ 24 | "tests/**/data/**", 25 | "format/orc_proto.proto", 26 | "src/proto.rs" 27 | ] 28 | --------------------------------------------------------------------------------