├── .gitignore
├── src
    ├── lib.rs
    ├── read
    │   ├── decode
    │   │   ├── variable_length.rs
    │   │   ├── mod.rs
    │   │   ├── float.rs
    │   │   ├── boolean_rle.rs
    │   │   └── rle_v2.rs
    │   ├── column.rs
    │   ├── decompress
    │   │   └── mod.rs
    │   └── mod.rs
    ├── error.rs
    ├── lib.md
    └── proto.rs
├── Cargo.toml
├── .github
    └── workflows
    │   ├── test.yml
    │   └── coverage.yml
├── LICENSE-MIT
├── README.md
├── write.py
├── tests
    └── it
    │   ├── deserialize.rs
    │   └── main.rs
└── LICENSE-APACHE


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | Cargo.lock
3 | venv
4 | 
5 | *.orc
6 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![doc = include_str!("lib.md")]
2 | #![forbid(unsafe_code)]
3 | pub mod error;
4 | pub mod proto;
5 | pub mod read;
6 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "orc-format"
 3 | version = "0.3.0"
 4 | license = "MIT/Apache-2.0"
 5 | description = "Unofficial implementation of Apache ORC spec in safe Rust"
 6 | homepage = "https://github.com/DataEngineeringLabs/orc-format"
 7 | repository = "https://github.com/DataEngineeringLabs/orc-format"
 8 | authors = ["Jorge C. Leitao <jorgecarleitao@gmail.com>"]
 9 | keywords = [ "orc", "analytics" ]
10 | edition = "2021"
11 | 
12 | [dependencies]
13 | prost = { version = "0.9.0" }
14 | flate2 = "1"
15 | fallible-streaming-iterator = { version = "0.1" }
16 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   test:
 7 |     name: Test
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: actions/checkout@v2
11 |       - uses: Swatinem/rust-cache@v1
12 |       - uses: actions-rs/toolchain@v1
13 |         with:
14 |           toolchain: stable
15 |       - name: Setup ORC files
16 |         run: |
17 |           # GitHub-hosted runners do not run as root, so apt requires sudo
18 |           sudo apt-get update && sudo apt-get install python3-pip python3-venv -y -q
19 |           python3 -m venv venv
20 |           source venv/bin/activate
21 |           pip install pip --upgrade && pip install pyorc
22 |           python write.py
23 |           deactivate
24 |       - name: test
25 |         run: cargo test
26 | 


--------------------------------------------------------------------------------
/src/read/decode/variable_length.rs:
--------------------------------------------------------------------------------
 1 | use crate::error::Error;
 2 | 
 3 | use std::io::Read;
 4 | 
 5 | /// Reads consecutive variable-length values from `reader`, re-using `scratch` as buffer.
 6 | pub struct Values<R: Read> {
 7 |     reader: R,
 8 |     scratch: Vec<u8>,
 9 | }
10 | 
11 | impl<R: Read> Values<R> {
12 |     /// Creates a new [`Values`] that reads from `reader` into `scratch`.
13 |     pub fn new(reader: R, scratch: Vec<u8>) -> Self {
14 |         Self { reader, scratch }
15 |     }
16 | 
17 |     /// Reads exactly `length` bytes into the scratch; a short read is [`Error::OutOfSpec`].
18 |     pub fn next(&mut self, length: usize) -> Result<&[u8], Error> {
19 |         self.scratch.resize(length, 0);
20 |         self.reader.read_exact(&mut self.scratch)?;
21 |         Ok(&self.scratch)
22 |     }
23 | 
24 |     /// Returns the scratch buffer, so that its allocation can be re-used.
25 |     pub fn into_inner(self) -> Vec<u8> {
26 |         self.scratch
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/src/read/decode/mod.rs:
--------------------------------------------------------------------------------
 1 | //! Contains different iterators that receive a reader ([`std::io::Read`])
 2 | //! and return values for each of ORC's physical types (e.g. boolean).
 3 | mod boolean_rle;
 4 | mod float;
 5 | mod rle_v2;
 6 | mod variable_length;
 7 | 
 8 | pub use boolean_rle::{BooleanIter, BooleanRleRunIter, BooleanRun};
 9 | pub use float::{Float, FloatIter};
10 | pub use rle_v2::{
11 |     SignedRleV2Iter, SignedRleV2Run, SignedRleV2RunIter, UnsignedRleV2Iter, UnsignedRleV2Run,
12 |     UnsignedRleV2RunIter,
13 | };
14 | pub use variable_length::Values;
15 | 
16 | /// Reads a single byte from `reader`, failing if none is available.
17 | #[inline]
18 | fn read_u8<R: std::io::Read>(reader: &mut R) -> Result<u8, std::io::Error> {
19 |     let mut byte = [0u8];
20 |     reader.read_exact(&mut byte).map(|()| byte[0])
21 | }
22 | 


--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
 1 | //! Contains [`Error`]
 2 | use crate::proto::stream::Kind;
 3 | 
 4 | /// Possible errors from this crate.
 5 | #[derive(Debug, Clone)]
 6 | pub enum Error {
 7 |     /// Generic error returned when the file is out of spec; I/O errors are also mapped to it
 8 |     OutOfSpec,
 9 |     /// When a string column contains a value with invalid UTF8
10 |     InvalidUtf8,
11 |     /// When the user requests a column that does not exist
12 |     InvalidColumn(u32),
13 |     /// When the user requests a stream kind that does not exist for the given column
14 |     InvalidKind(u32, Kind),
15 |     /// When decoding a float fails
16 |     DecodeFloat,
17 |     /// When decompression fails
18 |     Decompression,
19 |     /// When decoding a protobuf message fails
20 |     InvalidProto,
21 | }
22 | 
23 | impl From<prost::DecodeError> for Error {
24 |     fn from(_: prost::DecodeError) -> Self {
25 |         Self::InvalidProto // the details of the decode error are discarded
26 |     }
27 | }
28 | 
29 | impl From<std::io::Error> for Error {
30 |     fn from(_: std::io::Error) -> Self {
31 |         Self::OutOfSpec // NOTE: the kind and message of the I/O error are discarded
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/.github/workflows/coverage.yml:
--------------------------------------------------------------------------------
 1 | name: Coverage
 2 | 
 3 | on: [pull_request, push]
 4 | 
 5 | jobs:
 6 |   coverage:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v3
10 |       - name: Install Rust
11 |         run: rustup toolchain install stable --component llvm-tools-preview
12 |       - name: Install cargo-llvm-cov
13 |         uses: taiki-e/install-action@cargo-llvm-cov
14 |       - uses: Swatinem/rust-cache@v1
15 |       - name: Setup ORC files
16 |         run: |
17 |           # GitHub-hosted runners do not run as root, so apt requires sudo
18 |           sudo apt-get update && sudo apt-get install python3-pip python3-venv -y -q
19 |           python3 -m venv venv
20 |           source venv/bin/activate
21 |           pip install pip --upgrade && pip install pyorc
22 |           python write.py
23 |           deactivate
24 |       - name: Generate code coverage
25 |         run: cargo llvm-cov --lcov --output-path lcov.info
26 |       - name: Upload coverage to Codecov
27 |         uses: codecov/codecov-action@v1
28 |         with:
29 |           token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
30 |           files: lcov.info
31 |           fail_ci_if_error: true
32 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | 
 2 | MIT License
 3 | 
 4 | Copyright (c) 2022 Jorge C Leitao
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Read Apache ORC from Rust
 2 | 
 3 | [![test](https://github.com/DataEngineeringLabs/orc-format/actions/workflows/test.yml/badge.svg)](https://github.com/DataEngineeringLabs/orc-format/actions/workflows/test.yml)
 4 | [![codecov](https://codecov.io/gh/DataEngineeringLabs/orc-format/branch/main/graph/badge.svg?token=AgyTF60R3D)](https://codecov.io/gh/DataEngineeringLabs/orc-format)
 5 | 
 6 | Read [Apache ORC](https://orc.apache.org/) in Rust.
 7 | 
 8 | This repository is similar to [parquet2](https://github.com/jorgecarleitao/parquet2) and [Avro-schema](https://github.com/DataEngineeringLabs/avro-schema), providing a toolkit to:
 9 | 
10 | * Read ORC files (proto structures)
11 | * Read stripes (the conversion from proto metadata to memory regions)
12 | * Decode stripes (the math of decoding stripes into e.g. booleans, runs of RLE, etc.)
13 | 
14 | It currently reads the following (logical) types:
15 | 
16 | * booleans
17 | * strings
18 | * integers
19 | * floats
20 | 
21 | What is not yet implemented:
22 | 
23 | * Snappy, LZO decompression
24 | * RLE v2 `Patched Base` decoding
25 | * RLE v1 decoding
26 | * Utility functions to decode non-native logical types:
27 |     * decimal
28 |     * timestamp
29 |     * struct
30 |     * List
31 |     * Union
32 | 
33 | ## Run tests
34 | 
35 | ```bash
36 | python3 -m venv venv
37 | venv/bin/pip install -U pip
38 | venv/bin/pip install -U pyorc
39 | venv/bin/python write.py
40 | cargo test
41 | ```
42 | 


--------------------------------------------------------------------------------
/src/lib.md:
--------------------------------------------------------------------------------
 1 | Welcome to `orc-format` documentation. Thanks for checking it out!
 2 | 
 3 | This Rust crate is a toolkit to read and deserialize ORC to your favourite in-memory format.
 4 | 
 5 | Below is an example of how to read a column from ORC into memory:
 6 | 
 7 | ```rust
 8 | use std::fs::File;
 9 | 
10 | use orc_format::{error::Error, read, read::Column};
11 | 
12 | 
13 | fn get_column(path: &str, column: u32) -> Result<Column, Error> {
14 |     // open the file, as expected. buffering this is not necessary - we
15 |     // are very careful about the number of `read`s we perform.
16 |     let mut f = File::open(path).expect("no file found");
17 | 
18 |     // read the file's metadata
19 |     let metadata = read::read_metadata(&mut f)?;
20 | 
21 |     // the next step is to identify which stripe we want to read. Let's say it is the first one.
22 |     let stripe = 0;
23 | 
24 |     // Each stripe has a footer - we need to read it to extract the location of each column on it.
25 |     let stripe_footer = read::read_stripe_footer(&mut f, &metadata, stripe, &mut vec![])?;
26 | 
27 |     // Finally, we read the column into `Column`
28 |     read::read_stripe_column(&mut f, &metadata, stripe, stripe_footer, column, vec![])
29 | }
30 | ```
31 | 
32 | To deserialize the values of a column, use things inside `read::decode`.
33 | For example, the below contains the deserialization of the "Present" to a `Vec<bool>`.
34 | 
35 | ```rust
36 | use orc_format::{error::Error, proto::stream::Kind, read::decode::BooleanIter, read::Column};
37 | 
38 | fn deserialize_present(column: &Column, scratch: &mut Vec<u8>) -> Result<Vec<bool>, Error> {
39 |     let mut reader = column.get_stream(Kind::Present, std::mem::take(scratch))?;
40 | 
41 |     let mut validity = Vec::with_capacity(column.number_of_rows());
42 |     BooleanIter::new(&mut reader, column.number_of_rows()).try_for_each(|item| {
43 |         validity.push(item?);
44 |         Result::<(), Error>::Ok(())
45 |     })?;
46 | 
47 |     *scratch = std::mem::take(&mut reader.into_inner());
48 | 
49 |     Ok(validity)
50 | }
51 | ```
52 | 
53 | Check out the integration tests of the crate to find deserialization of other types such
54 | as floats, integers, strings and dictionaries.
55 | 


--------------------------------------------------------------------------------
/src/read/decode/float.rs:
--------------------------------------------------------------------------------
 1 | use crate::error::Error;
 2 | 
 3 | /// Sealed trait to generically represent `f32` and `f64`.
 4 | pub trait Float: private::Sealed + Copy + Default {
 5 |     /// The byte array holding a single value
 6 |     type Bytes: Default + AsRef<[u8]> + AsMut<[u8]>;
 7 |     /// Converts little-endian bytes into the value
 8 |     fn from_le_bytes(bytes: Self::Bytes) -> Self;
 9 | }
10 | 
11 | mod private {
12 |     pub trait Sealed {} // Not nameable from other crates, which seals `Float`.
13 |     impl Sealed for f64 {}
14 |     impl Sealed for f32 {}
15 | }
16 | 
17 | impl Float for f32 {
18 |     type Bytes = [u8; 4];
19 |     #[inline]
20 |     fn from_le_bytes(bytes: [u8; 4]) -> f32 {
21 |         f32::from_le_bytes(bytes)
22 |     }
23 | }
24 | 
25 | impl Float for f64 {
26 |     type Bytes = [u8; 8];
27 |     #[inline]
28 |     fn from_le_bytes(bytes: [u8; 8]) -> f64 {
29 |         f64::from_le_bytes(bytes)
30 |     }
31 | }
32 | 
33 | /// An iterator over `length` little-endian floats of type `T` read from `reader`.
34 | pub struct FloatIter<T: Float, R: std::io::Read> {
35 |     reader: R,
36 |     remaining: usize,
37 |     phantom: std::marker::PhantomData<T>,
38 | }
39 | 
40 | impl<T: Float, R: std::io::Read> FloatIter<T, R> {
41 |     /// Creates a [`FloatIter`] that yields up to `length` values from `reader`
42 |     #[inline]
43 |     pub fn new(reader: R, length: usize) -> Self {
44 |         Self {
45 |             phantom: std::marker::PhantomData,
46 |             remaining: length,
47 |             reader,
48 |         }
49 |     }
50 | 
51 |     /// How many items are still to be yielded
52 |     #[inline]
53 |     pub fn len(&self) -> usize {
54 |         self.remaining
55 |     }
56 | 
57 |     /// Whether there are no items left
58 |     #[must_use]
59 |     pub fn is_empty(&self) -> bool {
60 |         self.remaining == 0
61 |     }
62 | 
63 |     /// Consumes the iterator, handing back the reader it was created with
64 |     pub fn into_inner(self) -> R {
65 |         self.reader
66 |     }
67 | }
68 | 
69 | impl<T: Float, R: std::io::Read> Iterator for FloatIter<T, R> {
70 |     type Item = Result<T, Error>;
71 | 
72 |     /// Decodes the next value; a read error is yielded once and ends the iteration.
73 |     #[inline]
74 |     fn next(&mut self) -> Option<Self::Item> {
75 |         if self.remaining == 0 {
76 |             return None;
77 |         }
78 |         let mut chunk: T::Bytes = Default::default();
79 |         if self.reader.read_exact(chunk.as_mut()).is_err() {
80 |             self.remaining = 0; // otherwise skipping errors would loop forever
81 |             return Some(Err(Error::DecodeFloat));
82 |         }
83 |         self.remaining -= 1;
84 |         Some(Ok(T::from_le_bytes(chunk)))
85 |     }
86 | 
87 |     #[inline]
88 |     fn size_hint(&self) -> (usize, Option<usize>) {
89 |         (self.remaining, Some(self.remaining))
90 |     }
91 | }
92 | 


--------------------------------------------------------------------------------
/src/read/column.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     error::Error,
 3 |     proto::{stream::Kind, ColumnEncoding, CompressionKind, StripeFooter},
 4 | };
 5 | 
 6 | use super::decompress::Decompressor;
 7 | 
 8 | /// Helper struct used to access the streams associated to an ORC column.
 9 | /// Its main use is [`Column::get_stream`], to get a stream.
10 | #[derive(Debug)]
11 | pub struct Column {
12 |     data: Vec<u8>,
13 |     column: u32,
14 |     number_of_rows: u64,
15 |     footer: StripeFooter,
16 |     compression: CompressionKind,
17 | }
18 | 
19 | impl Column {
20 |     pub(crate) fn new(
21 |         data: Vec<u8>,
22 |         column: u32,
23 |         number_of_rows: u64,
24 |         footer: StripeFooter,
25 |         compression: CompressionKind,
26 |     ) -> Self {
27 |         Self {
28 |             data,
29 |             column,
30 |             number_of_rows,
31 |             footer,
32 |             compression,
33 |         }
34 |     }
35 | 
36 |     /// Returns the stream `kind` associated to this column as a [`Decompressor`].
37 |     /// `scratch` becomes owned by [`Decompressor`], which you can recover via `into_inner`.
38 |     /// # Errors
39 |     /// [`Error::InvalidKind`] if there is no such stream, [`Error::OutOfSpec`] if it is truncated.
40 |     pub fn get_stream(&self, kind: Kind, scratch: Vec<u8>) -> Result<Decompressor, Error> {
41 |         let mut start = 0usize; // offset of the current stream within `self.data`
42 |         for stream in &self.footer.streams {
43 |             if stream.column() != self.column || stream.kind() == Kind::RowIndex {
44 |                 continue;
45 |             }
46 |             let end = start.checked_add(stream.length() as usize).ok_or(Error::OutOfSpec)?;
47 |             if stream.kind() == kind {
48 |                 // `get` instead of indexing: data shorter than declared must not panic
49 |                 let data = self.data.get(start..end).ok_or(Error::OutOfSpec)?;
50 |                 return Ok(Decompressor::new(data, self.compression, scratch));
51 |             }
52 |             start = end;
53 |         }
54 |         Err(Error::InvalidKind(self.column, kind))
55 |     }
56 | 
57 |     /// Returns the encoding of the column
58 |     /// # Panics
59 |     /// Panics if the stripe footer does not contain the encoding of this column.
60 |     pub fn encoding(&self) -> &ColumnEncoding {
61 |         &self.footer.columns[self.column as usize]
62 |     }
63 | 
64 |     /// Returns the number of items in the dictionary, if any
65 |     pub fn dictionary_size(&self) -> Option<usize> {
66 |         self.encoding().dictionary_size.map(|x| x as usize)
67 |     }
68 | 
69 |     /// The number of rows on this column
70 |     pub fn number_of_rows(&self) -> usize {
71 |         self.number_of_rows as usize
72 |     }
73 | 
74 |     /// Returns the underlying footer and the pre-allocated memory region
75 |     /// containing all (compressed) streams of this column.
76 |     pub fn into_inner(self) -> (StripeFooter, Vec<u8>) {
77 |         (self.footer, self.data)
78 |     }
79 | }
80 | 


--------------------------------------------------------------------------------
/write.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | 
  3 | import pyorc
  4 | 
  5 | data = {
  6 |     "a": [1.0, 2.0, None, 4.0, 5.0],
  7 |     "b": [True, False, None, True, False],
  8 |     "str_direct": ["a", "cccccc", None, "ddd", "ee"],
  9 |     "d": ["a", "bb", None, "ccc", "ddd"],
 10 |     "e": ["ddd", "cc", None, "bb", "a"],
 11 |     "f": ["aaaaa", "bbbbb", None, "ccccc", "ddddd"],
 12 |     "int_short_repeated": [5, 5, None, 5, 5],
 13 |     "int_neg_short_repeated": [-5, -5, None, -5, -5],
 14 |     "int_delta": [1, 2, None, 4, 5],
 15 |     "int_neg_delta": [5, 4, None, 2, 1],
 16 |     "int_direct": [1, 6, None, 3, 2],
 17 |     "int_neg_direct": [-1, -6, None, -3, -2],
 18 |     "bigint_direct": [1, 6, None, 3, 2],
 19 |     "bigint_neg_direct": [-1, -6, None, -3, -2],
 20 |     "bigint_other": [5, -5, 1, 5, 5],
 21 |     "utf8_increase": ["a", "bb", "ccc", "dddd", "eeeee"],
 22 |     "utf8_decrease": ["eeeee", "dddd", "ccc", "bb", "a"],
 23 | }
 24 | 
 25 | def infer_schema(data):
 26 |     schema = "struct<"
 27 |     for key, value in data.items():
 28 |         dt = type(value[0])
 29 |         if dt == float:
 30 |             dt = "float"
 31 |         elif dt == int:
 32 |             dt = "int"
 33 |         elif dt == bool:
 34 |             dt = "boolean"
 35 |         elif dt == str:
 36 |             dt = "string"
 37 |         else:
 38 |             raise NotImplementedError
 39 |         if key.startswith("double"):
 40 |             dt = "double"
 41 |         if key.startswith("bigint"):
 42 |             dt = "bigint"
 43 |         schema += key + ":" + dt + ","
 44 | 
 45 |     schema = schema[:-1] + ">"
 46 |     return schema
 47 | 
 48 | 
 49 | 
 50 | def _write(
 51 |     schema: str,
 52 |     data,
 53 |     file_name: str,
 54 |     compression=pyorc.CompressionKind.NONE,
 55 |     dict_key_size_threshold=0.0,
 56 | ):
 57 |     output = open(file_name, "wb")
 58 |     writer = pyorc.Writer(
 59 |         output,
 60 |         schema,
 61 |         dict_key_size_threshold=dict_key_size_threshold,
 62 |         # use a small number to ensure that compression crosses value boundaries
 63 |         compression_block_size=32,
 64 |         compression=compression,
 65 |     )
 66 |     num_rows = len(list(data.values())[0])
 67 |     for x in range(num_rows):
 68 |         row = tuple(values[x] for values in data.values())
 69 |         writer.write(row)
 70 |     writer.close()
 71 | 
 72 |     with open(file_name, "rb") as f:
 73 |         reader = pyorc.Reader(f)
 74 |         list(reader)
 75 | 
 76 | 
 77 | _write(  # every column of `data`, with the default (no) compression
 78 |     infer_schema(data),
 79 |     data,
 80 |     "test.orc",
 81 | )
 82 | 
 83 | data_boolean = {
 84 |     "long": [True] * 32,
 85 | }
 86 | 
 87 | _write("struct<long:boolean>", data_boolean, "long_bool.orc")
 88 | 
 89 | _write("struct<long:boolean>", data_boolean, "long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)
 90 | 
 91 | data_dict = {
 92 |     "dict": ["abcd", "efgh"] * 32,
 93 | }
 94 | 
 95 | _write("struct<dict:string>", data_dict, "string_long.orc")
 96 | 
 97 | data_dict = {  # re-bound: from here on the two values have different lengths
 98 |     "dict": ["abc", "efgh"] * 32,
 99 | }
100 | 
101 | _write("struct<dict:string>", data_dict, "string_dict.orc", dict_key_size_threshold=0.1)
102 | 
103 | _write("struct<dict:string>", data_dict, "string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)
104 | 
105 | data_dict = {  # re-bound again: 10**4 rows
106 |     "dict": ["abcd", "efgh"] * (10**4 // 2),
107 | }
108 | 
109 | _write("struct<dict:string>", data_dict, "string_long_long.orc")
110 | _write("struct<dict:string>", data_dict, "string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
111 | 
112 | long_f32 = {
113 |     "dict": [random.uniform(0, 1) for _ in range(10**6)],  # NOTE(review): unseeded, differs per run
114 | }
115 | 
116 | _write("struct<dict:float>", long_f32, "f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
117 | 


--------------------------------------------------------------------------------
/src/read/decompress/mod.rs:
--------------------------------------------------------------------------------
  1 | //! Contains [`Decompressor`]
  2 | use std::io::Read;
  3 | 
  4 | use fallible_streaming_iterator::FallibleStreamingIterator;
  5 | 
  6 | use crate::error::Error;
  7 | use crate::proto::CompressionKind;
  8 | 
  9 | /// Decodes the 3-byte little-endian header at the start of `bytes` into
 10 | /// `(is_original, length)`: the lowest bit is the flag, the other 23 bits the length.
 11 | /// # Panics
 12 | /// Panics if `bytes` has fewer than 3 bytes.
 13 | fn decode_header(bytes: &[u8]) -> (bool, usize) {
 14 |     let header = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], 0]);
 15 |     let (flag, length) = (header & 1, header >> 1);
 16 |     (flag == 1, length as usize)
 17 | }
 18 | 
 19 | enum State<'a> {
 20 |     Original(&'a [u8]), // a region of the input stream that is exposed as-is
 21 |     Compressed(Vec<u8>), // an owned buffer holding the decompressed bytes of a chunk
 22 | }
 23 | 
 24 | struct DecompressorIter<'a> {
 25 |     stream: &'a [u8], // the bytes of the stream that were not consumed yet
 26 |     current: Option<State<'a>>, // the chunk currently exposed by `get`, if any
 27 |     compression: CompressionKind,
 28 |     scratch: Vec<u8>,
 29 | }
 30 | 
 31 | impl<'a> DecompressorIter<'a> {
 32 |     pub fn new(stream: &'a [u8], compression: CompressionKind, scratch: Vec<u8>) -> Self {
 33 |         Self {
 34 |             scratch,
 35 |             compression,
 36 |             current: None,
 37 |             stream,
 38 |         }
 39 |     }
 40 | 
 41 |     pub fn into_inner(self) -> Vec<u8> {
 42 |         // prefer the buffer that holds a decompressed chunk over the scratch
 43 |         if let Some(State::Compressed(buffer)) = self.current {
 44 |             return buffer;
 45 |         }
 46 |         self.scratch
 47 |     }
 48 | }
 48 | 
 49 | impl<'a> FallibleStreamingIterator for DecompressorIter<'a> {
 50 |     type Item = [u8];
 51 |     type Error = Error;
 52 | 
 53 |     /// Moves to the next chunk, re-using the previous buffer; malformed input is an error.
 54 |     #[inline]
 55 |     fn advance(&mut self) -> Result<(), Self::Error> {
 56 |         if let Some(State::Compressed(buffer)) = self.current.take() {
 57 |             self.scratch = buffer;
 58 |         }
 59 |         if self.stream.is_empty() {
 60 |             return Ok(());
 61 |         }
 62 |         match self.compression {
 63 |             CompressionKind::None => {
 64 |                 self.current = Some(State::Original(self.stream));
 65 |                 self.stream = &[];
 66 |             }
 67 |             CompressionKind::Zlib => {
 68 |                 let header = self.stream.get(..3).ok_or(Error::OutOfSpec)?;
 69 |                 let (is_original, length) = decode_header(header);
 70 |                 let chunk = self.stream.get(3..3 + length).ok_or(Error::OutOfSpec)?;
 71 |                 self.stream = &self.stream[3 + length..];
 72 |                 if is_original {
 73 |                     self.current = Some(State::Original(chunk));
 74 |                 } else {
 75 |                     self.scratch.clear();
 76 |                     let mut decoder = flate2::read::DeflateDecoder::new(chunk);
 77 |                     decoder.read_to_end(&mut self.scratch).map_err(|_| Error::Decompression)?;
 78 |                     self.current = Some(State::Compressed(std::mem::take(&mut self.scratch)));
 79 |                 }
 80 |             }
 81 |             _ => return Err(Error::Decompression), // unsupported compression
 82 |         };
 83 |         Ok(())
 84 |     }
 85 | 
 86 |     #[inline]
 87 |     fn get(&self) -> Option<&Self::Item> {
 88 |         self.current.as_ref().map(|state| match state {
 89 |             State::Original(bytes) => *bytes,
 90 |             State::Compressed(bytes) => bytes.as_ref(),
 91 |         })
 92 |     }
 93 | }
 94 | 
 95 | /// A [`Read`]er fulfilling the ORC specification of reading compressed data.
 96 | pub struct Decompressor<'a> {
 97 |     decompressor: DecompressorIter<'a>,
 98 |     offset: usize, // number of bytes of the current chunk already returned by `read`
 99 |     is_first: bool, // whether the first chunk still has to be loaded
100 | }
101 | 
102 | impl<'a> Decompressor<'a> {
103 |     /// Creates a new [`Decompressor`] that will use `scratch` as a temporary region.
104 |     pub fn new(stream: &'a [u8], compression: CompressionKind, scratch: Vec<u8>) -> Self {
105 |         Self {
106 |             decompressor: DecompressorIter::new(stream, compression, scratch),
107 |             offset: 0,
108 |             is_first: true,
109 |         }
110 |     }
111 | 
112 |     /// Returns the internal memory region, so it can be re-used
113 |     pub fn into_inner(self) -> Vec<u8> {
114 |         self.decompressor.into_inner()
115 |     }
116 | }
117 | 
118 | impl<'a> std::io::Read for Decompressor<'a> {
119 |     /// Copies decompressed bytes into `buf` and returns how many were copied.
120 |     /// A single call never crosses the boundary of a chunk; `Ok(0)` is only
121 |     /// returned once the stream is exhausted (or when `buf` is empty).
122 |     /// # Errors
123 |     /// Errors of the underlying chunk iterator are reported as
124 |     /// [`std::io::ErrorKind::InvalidData`] instead of unwinding via `unwrap`.
125 |     fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
126 |         // the message of the `io::Error` is the `Debug` representation of `Error`
127 |         fn invalid(error: Error) -> std::io::Error {
128 |             std::io::Error::new(std::io::ErrorKind::InvalidData, format!("{:?}", error))
129 |         }
130 | 
131 |         if self.is_first {
132 |             self.is_first = false;
133 |             self.decompressor.advance().map_err(invalid)?;
134 |         }
135 |         // skip fully-read chunks; empty ones must not be mistaken for the end
136 |         while self.decompressor.get().map_or(false, |chunk| chunk.len() == self.offset) {
137 |             self.decompressor.advance().map_err(invalid)?;
138 |             self.offset = 0;
139 |         }
140 | 
141 |         let current = match self.decompressor.get() {
142 |             Some(chunk) => &chunk[self.offset..],
143 |             None => return Ok(0),
144 |         };
145 | 
146 |         // copy as much of the current chunk as fits in `buf`
147 |         let length = current.len().min(buf.len());
148 |         buf[..length].copy_from_slice(&current[..length]);
149 |         self.offset += length;
150 |         Ok(length)
151 |     }
152 | }
153 | 
154 | #[cfg(test)]
155 | mod tests {
156 |     use super::*;
157 | 
158 |     #[test]
159 |     fn decode_uncompressed() {
160 |         // header of an uncompressed chunk of 5 bytes: [0x0b, 0x00, 0x00] = [0b1011, 0, 0]
161 |         let bytes = &[0b1011, 0, 0, 0]; // the 4th byte is not part of the header
162 | 
163 |         let (is_original, length) = decode_header(bytes);
164 |         assert!(is_original);
165 |         assert_eq!(length, 5);
166 |     }
167 | 
168 |     #[test]
169 |     fn decode_compressed() {
170 |         // header of a compressed chunk of 100_000 bytes: [0x40, 0x0d, 0x03]
171 |         let bytes = &[0b01000000, 0b00001101, 0b00000011, 0];
172 | 
173 |         let (is_original, length) = decode_header(bytes);
174 |         assert!(!is_original);
175 |         assert_eq!(length, 100_000);
176 |     }
177 | }
178 | 


--------------------------------------------------------------------------------
/src/read/mod.rs:
--------------------------------------------------------------------------------
  1 | //! APIs to read from ORC
  2 | //!
  3 | //! Reading from ORC is essentially composed by:
  4 | //! 1. Identify the column type based on the file's schema
  5 | //! 2. Read the stripe (or part of it in projection pushdown)
  6 | //! 3. For each column, select the relevant region of the stripe
  7 | //! 4. Attach an Iterator to the region
  8 | 
  9 | use std::io::{Read, Seek, SeekFrom};
 10 | 
 11 | use prost::Message;
 12 | 
 13 | use crate::error::Error;
 14 | use crate::proto::stream::Kind;
 15 | use crate::proto::{CompressionKind, Footer, Metadata, PostScript, StripeFooter};
 16 | 
 17 | mod column;
 18 | pub mod decode;
 19 | pub mod decompress;
 20 | pub use column::Column;
 21 | 
 22 | const DEFAULT_FOOTER_SIZE: u64 = 16 * 1024;
 23 | 
 24 | /// Returns the total length of the stream, leaving the cursor where it was.
 25 | /// See the (unstable) `Seek::stream_len`.
 26 | fn stream_len(seek: &mut impl Seek) -> std::result::Result<u64, std::io::Error> {
 27 |     let initial = seek.seek(SeekFrom::Current(0))?;
 28 |     let total = seek.seek(SeekFrom::End(0))?;
 29 |     if initial == total {
 30 |         // already at the end: skip the third seek, as a branch is
 31 |         // usually way cheaper than a seek operation
 32 |         return Ok(total);
 33 |     }
 34 |     seek.seek(SeekFrom::Start(initial))?;
 35 |     Ok(total)
 36 | }
 37 | 
 38 | /// The file's metadata: its decoded postscript, footer and metadata sections.
 39 | #[derive(Debug)]
 40 | pub struct FileMetadata {
 41 |     pub postscript: PostScript,
 42 |     pub footer: Footer,
 43 |     pub metadata: Metadata,
 44 | }
 45 | 
 46 | /// Reads the postscript, footer and metadata stored at the end of `reader`.
 47 | /// Only the last `DEFAULT_FOOTER_SIZE` bytes of the file are inspected.
 48 | /// # Errors
 49 | /// [`Error::OutOfSpec`] if the declared lengths do not fit in those bytes (e.g. an
 50 | /// empty or truncated file) and [`Error::InvalidProto`] if the postscript cannot be decoded.
 51 | pub fn read_metadata<R>(reader: &mut R) -> Result<FileMetadata, Error>
 52 | where
 53 |     R: Read + Seek,
 54 | {
 55 |     let file_len = stream_len(reader)?;
 56 | 
 57 |     // initial read of the tail of the file
 58 |     let tail_len = file_len.min(DEFAULT_FOOTER_SIZE);
 59 |     reader.seek(SeekFrom::End(-(tail_len as i64)))?;
 60 |     let mut tail_bytes = Vec::with_capacity(tail_len as usize);
 61 |     reader.take(tail_len).read_to_end(&mut tail_bytes)?;
 62 | 
 63 |     // The final byte of the file contains the serialized length of the Postscript,
 64 |     // which must be less than 256 bytes.
 65 |     // `pop` (rather than indexing) so that an empty file is an error, not a panic
 66 |     let postscript_len = tail_bytes.pop().ok_or(Error::OutOfSpec)? as usize;
 67 | 
 68 |     // next is the postscript
 69 |     let start = tail_bytes.len().checked_sub(postscript_len).ok_or(Error::OutOfSpec)?;
 70 |     let postscript = PostScript::decode(&tail_bytes[start..])?;
 71 |     tail_bytes.truncate(start);
 72 | 
 73 |     // next is the footer
 74 |     let footer_length = postscript.footer_length.ok_or(Error::OutOfSpec)? as usize;
 75 |     let start = tail_bytes.len().checked_sub(footer_length).ok_or(Error::OutOfSpec)?;
 76 |     let footer = deserialize_footer(&tail_bytes[start..], postscript.compression())?;
 77 |     tail_bytes.truncate(start);
 78 | 
 79 |     // finally the metadata
 80 |     let metadata_length = postscript.metadata_length.ok_or(Error::OutOfSpec)? as usize;
 81 |     let start = tail_bytes.len().checked_sub(metadata_length).ok_or(Error::OutOfSpec)?;
 82 |     let metadata = deserialize_footer_metadata(&tail_bytes[start..], postscript.compression())?;
 83 | 
 84 |     Ok(FileMetadata {
 85 |         postscript,
 86 |         footer,
 87 |         metadata,
 88 |     })
 89 | }
 90 | 
 91 | /// Reads, decompresses and deserializes the stripe's footer as [`StripeFooter`] using
 92 | /// `scratch` as an intermediary memory region.
 93 | /// # Implementation
 94 | /// This function is guaranteed to perform exactly one seek and one read to `reader`.
 95 | pub fn read_stripe_footer<R: Read + Seek>(
 96 |     reader: &mut R,
 97 |     metadata: &FileMetadata,
 98 |     stripe: usize,
 99 |     scratch: &mut Vec<u8>,
100 | ) -> Result<StripeFooter, Error> {
101 |     let stripe = &metadata.footer.stripes[stripe];
102 | 
103 |     let start = stripe.offset() + stripe.index_length() + stripe.data_length();
104 |     let len = stripe.footer_length();
105 |     reader.seek(SeekFrom::Start(start))?;
106 | 
107 |     scratch.clear();
108 |     scratch.reserve(len as usize);
109 |     reader.take(len).read_to_end(scratch)?;
110 |     deserialize_stripe_footer(scratch, metadata.postscript.compression())
111 | }
112 | 
113 | /// Reads `column` from the stripe into a [`Column`].
114 | /// `scratch` becomes owned by [`Column`], which you can recover via `into_inner`.
115 | /// # Implementation
116 | /// This function is guaranteed to perform exactly one seek and one read to `reader`.
117 | pub fn read_stripe_column<R: Read + Seek>(
118 |     reader: &mut R,
119 |     metadata: &FileMetadata,
120 |     stripe: usize,
121 |     footer: StripeFooter,
122 |     column: u32,
123 |     mut scratch: Vec<u8>,
124 | ) -> Result<Column, Error> {
125 |     let stripe = &metadata.footer.stripes[stripe];
126 | 
127 |     let mut start = 0; // the start of the stream
128 | 
129 |     let start = footer
130 |         .streams
131 |         .iter()
132 |         .map(|stream| {
133 |             start += stream.length();
134 |             (start, stream)
135 |         })
136 |         .find(|(_, stream)| stream.column() == column && stream.kind() != Kind::RowIndex)
137 |         .map(|(start, stream)| start - stream.length())
138 |         .ok_or(Error::InvalidColumn(column))?;
139 | 
140 |     let length = footer
141 |         .streams
142 |         .iter()
143 |         .filter(|stream| stream.column() == column && stream.kind() != Kind::RowIndex)
144 |         .fold(0, |acc, stream| acc + stream.length());
145 | 
146 |     let start = stripe.offset() + start;
147 |     reader.seek(SeekFrom::Start(start))?;
148 | 
149 |     scratch.clear();
150 |     scratch.reserve(length as usize);
151 |     reader.take(length).read_to_end(&mut scratch)?;
152 |     Ok(Column::new(
153 |         scratch,
154 |         column,
155 |         stripe.number_of_rows(),
156 |         footer,
157 |         metadata.postscript.compression(),
158 |     ))
159 | }
160 | 
161 | fn deserialize_footer(bytes: &[u8], compression: CompressionKind) -> Result<Footer, Error> {
162 |     let mut buffer = vec![];
163 |     decompress::Decompressor::new(bytes, compression, vec![]).read_to_end(&mut buffer)?;
164 |     Ok(Footer::decode(&*buffer)?)
165 | }
166 | 
167 | fn deserialize_footer_metadata(
168 |     bytes: &[u8],
169 |     compression: CompressionKind,
170 | ) -> Result<Metadata, Error> {
171 |     let mut buffer = vec![];
172 |     decompress::Decompressor::new(bytes, compression, vec![]).read_to_end(&mut buffer)?;
173 |     Ok(Metadata::decode(&*buffer)?)
174 | }
175 | 
176 | fn deserialize_stripe_footer(
177 |     bytes: &[u8],
178 |     compression: CompressionKind,
179 | ) -> Result<StripeFooter, Error> {
180 |     let mut buffer = vec![];
181 |     decompress::Decompressor::new(bytes, compression, vec![]).read_to_end(&mut buffer)?;
182 |     Ok(StripeFooter::decode(&*buffer)?)
183 | }
184 | 


--------------------------------------------------------------------------------
/src/read/decode/boolean_rle.rs:
--------------------------------------------------------------------------------
  1 | use std::io::Read;
  2 | 
  3 | use crate::error::Error;
  4 | 
  5 | use super::read_u8;
  6 | 
/// A single run produced by [`BooleanRleRunIter`].
#[derive(Debug, Copy, Clone, PartialEq)]
// The `Literals` buffer is stored inline, which makes this variant much larger than `Run`.
#[allow(clippy::large_enum_variant)]
pub enum BooleanRun {
    /// A byte value together with its repetition count, built by the run iterator
    /// as `header + 3`.
    Run(u8, u16),
    /// A fixed 255-byte buffer of literal bytes. Only a prefix is filled from the
    /// stream (as many bytes as the header declares); the rest stays zero.
    /// NOTE(review): the number of valid bytes is not recorded in this variant.
    Literals([u8; 255]),
}
 13 | 
 14 | pub struct BooleanRleRunIter<R: Read> {
 15 |     reader: R,
 16 | }
 17 | 
 18 | impl<R: Read> BooleanRleRunIter<R> {
 19 |     pub fn new(reader: R) -> Self {
 20 |         Self { reader }
 21 |     }
 22 | 
 23 |     pub fn into_inner(self) -> R {
 24 |         self.reader
 25 |     }
 26 | }
 27 | 
 28 | fn read_literals<R: Read>(reader: &mut R, header: i8) -> Result<[u8; 255], Error> {
 29 |     let length = (-header) as usize;
 30 | 
 31 |     let mut literals = [0u8; 255];
 32 | 
 33 |     reader
 34 |         .take(length as u64)
 35 |         .read_exact(&mut literals[..length])?;
 36 | 
 37 |     Ok(literals)
 38 | }
 39 | 
 40 | impl<R: Read> Iterator for BooleanRleRunIter<R> {
 41 |     type Item = Result<BooleanRun, Error>;
 42 | 
 43 |     #[inline]
 44 |     fn next(&mut self) -> Option<Self::Item> {
 45 |         let header = read_u8(&mut self.reader);
 46 |         let header = match header {
 47 |             Ok(header) => header as i8,
 48 |             Err(e) => return Some(Err(e.into())),
 49 |         };
 50 |         if header < 0 {
 51 |             Some(read_literals(&mut self.reader, header).map(BooleanRun::Literals))
 52 |         } else {
 53 |             let length = header as u16 + 3;
 54 |             // this is not ok - it may require more than one byte
 55 |             let value = read_u8(&mut self.reader);
 56 |             let value = match value {
 57 |                 Ok(value) => value,
 58 |                 Err(e) => return Some(Err(e.into())),
 59 |             };
 60 |             Some(Ok(BooleanRun::Run(value, length)))
 61 |         }
 62 |     }
 63 | }
 64 | 
 65 | pub struct BooleanIter<R: Read> {
 66 |     iter: BooleanRleRunIter<R>,
 67 |     current: Option<BooleanRun>,
 68 |     position: u8,
 69 |     byte_position: usize,
 70 |     remaining: usize,
 71 | }
 72 | 
 73 | impl<'a, R: Read> BooleanIter<R> {
 74 |     pub fn new(reader: R, length: usize) -> Self {
 75 |         Self {
 76 |             iter: BooleanRleRunIter::new(reader),
 77 |             current: None,
 78 |             position: 0,
 79 |             byte_position: 0,
 80 |             remaining: length,
 81 |         }
 82 |     }
 83 | 
 84 |     pub fn into_inner(self) -> R {
 85 |         self.iter.into_inner()
 86 |     }
 87 | }
 88 | 
 89 | impl<R: Read> Iterator for BooleanIter<R> {
 90 |     type Item = Result<bool, Error>;
 91 | 
 92 |     #[inline]
 93 |     fn next(&mut self) -> Option<Self::Item> {
 94 |         if let Some(run) = &self.current {
 95 |             match run {
 96 |                 BooleanRun::Run(value, repetitions) => {
 97 |                     let repetitions = *repetitions;
 98 |                     let mask = 128u8 >> self.position;
 99 |                     let result = value & mask == mask;
100 |                     self.position += 1;
101 |                     if self.remaining == 0 {
102 |                         self.current = None;
103 |                         return None;
104 |                     } else {
105 |                         self.remaining -= 1;
106 |                     }
107 |                     if self.position == 8 {
108 |                         if repetitions == 0 {
109 |                             self.current = None;
110 |                         } else {
111 |                             self.current = Some(BooleanRun::Run(*value, repetitions - 1));
112 |                         }
113 |                         self.position = 0;
114 |                     }
115 |                     Some(Ok(result))
116 |                 }
117 |                 BooleanRun::Literals(bytes) => {
118 |                     let mask = 128u8 >> self.position;
119 |                     let result = bytes[self.byte_position] & mask == mask;
120 |                     self.position += 1;
121 |                     if self.remaining == 0 {
122 |                         self.current = None;
123 |                         return None;
124 |                     } else {
125 |                         self.remaining -= 1;
126 |                     }
127 |                     if self.position == 8 {
128 |                         if bytes.len() == 1 {
129 |                             self.current = None;
130 |                             self.byte_position = 0;
131 |                         } else {
132 |                             self.byte_position += 1;
133 |                         }
134 |                         self.position = 0;
135 |                     }
136 |                     Some(Ok(result))
137 |                 }
138 |             }
139 |         } else if self.remaining > 0 {
140 |             match self.iter.next()? {
141 |                 Ok(run) => {
142 |                     self.current = Some(run);
143 |                     self.next()
144 |                 }
145 |                 Err(e) => {
146 |                     self.remaining = 0;
147 |                     Some(Err(e))
148 |                 }
149 |             }
150 |         } else {
151 |             None
152 |         }
153 |     }
154 | 
155 |     fn size_hint(&self) -> (usize, Option<usize>) {
156 |         (self.remaining, Some(self.remaining))
157 |     }
158 | }
159 | 
#[cfg(test)]
mod test {
    use super::*;

    /// A run header of 0x61 (97) announces 100 copies of the value byte 0x00.
    #[test]
    fn basic() {
        let mut bytes: &[u8] = &[0x61, 0x00];

        let decoded = BooleanIter::new(&mut bytes, 100)
            .collect::<Result<Vec<_>, Error>>()
            .unwrap();

        assert_eq!(decoded, vec![false; 100])
    }

    /// A header of 0xfe (-2) announces two literal bytes.
    #[test]
    fn literals() {
        let mut bytes: &[u8] = &[0xfe, 0b01000100, 0b01000101];

        let decoded = BooleanIter::new(&mut bytes, 16)
            .collect::<Result<Vec<_>, Error>>()
            .unwrap();

        let expected = vec![
            false, true, false, false, false, true, false, false, // 0b01000100
            false, true, false, false, false, true, false, true, // 0b01000101
        ];
        assert_eq!(decoded, expected)
    }

    /// "For example, the byte sequence [0xff, 0x80] would be one true followed by seven
    /// false values."
    #[test]
    fn another() {
        let mut bytes: &[u8] = &[0xff, 0x80];

        let decoded = BooleanIter::new(&mut bytes, 8)
            .collect::<Result<Vec<_>, Error>>()
            .unwrap();

        let expected = vec![true, false, false, false, false, false, false, false];
        assert_eq!(decoded, expected)
    }
}
210 | 


--------------------------------------------------------------------------------
/tests/it/deserialize.rs:
--------------------------------------------------------------------------------
  1 | use orc_format::{
  2 |     error::Error,
  3 |     proto::{column_encoding::Kind as ColumnEncodingKind, stream::Kind},
  4 |     read,
  5 |     read::decode::{
  6 |         BooleanIter, SignedRleV2Iter, SignedRleV2Run, SignedRleV2RunIter, UnsignedRleV2Run,
  7 |         UnsignedRleV2RunIter,
  8 |     },
  9 |     read::decompress::Decompressor,
 10 |     read::Column,
 11 | };
 12 | 
 13 | fn deserialize_validity(column: &Column, scratch: &mut Vec<u8>) -> Result<Vec<bool>, Error> {
 14 |     let mut reader = column.get_stream(Kind::Present, std::mem::take(scratch))?;
 15 | 
 16 |     let mut validity = Vec::with_capacity(column.number_of_rows());
 17 |     BooleanIter::new(&mut reader, column.number_of_rows()).try_for_each(|item| {
 18 |         validity.push(item?);
 19 |         Result::<(), Error>::Ok(())
 20 |     })?;
 21 | 
 22 |     *scratch = std::mem::take(&mut reader.into_inner());
 23 | 
 24 |     Ok(validity)
 25 | }
 26 | 
 27 | pub fn deserialize_f32_array(column: &Column) -> Result<(Vec<bool>, Vec<f32>), Error> {
 28 |     let mut scratch = vec![];
 29 | 
 30 |     let validity = deserialize_validity(column, &mut scratch)?;
 31 | 
 32 |     let reader = column.get_stream(Kind::Data, scratch)?;
 33 | 
 34 |     let num_of_values: usize = validity.iter().map(|x| *x as usize).sum();
 35 | 
 36 |     let mut valid_values = Vec::with_capacity(num_of_values);
 37 |     let mut iter = read::decode::FloatIter::<f32, _>::new(reader, num_of_values);
 38 |     iter.try_for_each(|item| {
 39 |         valid_values.push(item?);
 40 |         Result::<(), Error>::Ok(())
 41 |     })?;
 42 | 
 43 |     let _ = iter.into_inner();
 44 | 
 45 |     Ok((validity, valid_values))
 46 | }
 47 | 
 48 | pub fn deserialize_int_array(column: &Column) -> Result<(Vec<bool>, Vec<i64>), Error> {
 49 |     let mut scratch = vec![];
 50 | 
 51 |     let validity = deserialize_validity(column, &mut scratch)?;
 52 | 
 53 |     let num_of_values: usize = validity.iter().map(|x| *x as usize).sum();
 54 | 
 55 |     let reader = column.get_stream(Kind::Data, scratch)?;
 56 | 
 57 |     let mut valid_values = Vec::with_capacity(num_of_values);
 58 | 
 59 |     let mut iter = SignedRleV2RunIter::new(reader, num_of_values, vec![]);
 60 | 
 61 |     iter.try_for_each(|run| {
 62 |         run.map(|run| match run {
 63 |             SignedRleV2Run::Direct(values) => valid_values.extend(values),
 64 |             SignedRleV2Run::Delta(values) => valid_values.extend(values),
 65 |             SignedRleV2Run::ShortRepeat(values) => valid_values.extend(values),
 66 |         })
 67 |     })?;
 68 | 
 69 |     let (_, _) = iter.into_inner();
 70 | 
 71 |     // test the other iterator
 72 |     let reader = column.get_stream(Kind::Data, vec![])?;
 73 | 
 74 |     let mut valid_values1 = Vec::with_capacity(num_of_values);
 75 |     SignedRleV2Iter::new(reader, num_of_values, vec![]).try_for_each(|item| {
 76 |         valid_values1.push(item?);
 77 |         Result::<(), Error>::Ok(())
 78 |     })?;
 79 |     assert_eq!(valid_values1, valid_values);
 80 | 
 81 |     Ok((validity, valid_values))
 82 | }
 83 | 
 84 | pub fn deserialize_bool_array(column: &Column) -> Result<(Vec<bool>, Vec<bool>), Error> {
 85 |     let mut scratch = vec![];
 86 | 
 87 |     let validity = deserialize_validity(column, &mut scratch)?;
 88 | 
 89 |     let num_of_values: usize = validity.iter().map(|x| *x as usize).sum();
 90 | 
 91 |     let reader = column.get_stream(Kind::Data, std::mem::take(&mut scratch))?;
 92 | 
 93 |     let mut valid_values = Vec::with_capacity(num_of_values);
 94 | 
 95 |     let mut iter = BooleanIter::new(reader, num_of_values);
 96 |     iter.try_for_each(|item| {
 97 |         valid_values.push(item?);
 98 |         Result::<(), Error>::Ok(())
 99 |     })?;
100 | 
101 |     let _ = iter.into_inner();
102 | 
103 |     Ok((validity, valid_values))
104 | }
105 | 
106 | pub fn deserialize_str(
107 |     mut lengths: UnsignedRleV2RunIter<Decompressor>,
108 |     values: &mut read::decode::Values<Decompressor>,
109 |     num_of_values: usize,
110 | ) -> Result<Vec<String>, Error> {
111 |     let mut result = Vec::with_capacity(num_of_values);
112 | 
113 |     for run in lengths.by_ref() {
114 |         let f = |length| {
115 |             values.next(length as usize).and_then(|x| {
116 |                 std::str::from_utf8(x)
117 |                     .map(|x| x.to_string())
118 |                     .map_err(|_| Error::InvalidUtf8)
119 |             })
120 |         };
121 |         match run? {
122 |             UnsignedRleV2Run::Direct(lengths) => lengths.map(f).try_for_each(|x| {
123 |                 result.push(x?);
124 |                 Result::<_, Error>::Ok(())
125 |             }),
126 |             UnsignedRleV2Run::Delta(lengths) => lengths.map(f).try_for_each(|x| {
127 |                 result.push(x?);
128 |                 Result::<_, Error>::Ok(())
129 |             }),
130 |             UnsignedRleV2Run::ShortRepeat(lengths) => lengths.map(f).try_for_each(|x| {
131 |                 result.push(x?);
132 |                 Result::<_, Error>::Ok(())
133 |             }),
134 |         }?
135 |     }
136 | 
137 |     let (_, _) = lengths.into_inner();
138 | 
139 |     Ok(result)
140 | }
141 | 
142 | pub fn deserialize_str_dict_array(
143 |     column: &Column,
144 |     scratch: Vec<u8>,
145 |     num_of_values: usize,
146 | ) -> Result<Vec<String>, Error> {
147 |     let values = column.get_stream(Kind::DictionaryData, scratch)?;
148 | 
149 |     let mut values_iter = read::decode::Values::new(values, vec![]);
150 | 
151 |     let scratch2 = vec![];
152 |     let lengths = column.get_stream(Kind::Length, scratch2)?;
153 | 
154 |     let lengths = UnsignedRleV2RunIter::new(lengths, column.dictionary_size().unwrap(), vec![]);
155 | 
156 |     let values = deserialize_str(lengths, &mut values_iter, 0)?;
157 |     let scratch = values_iter.into_inner();
158 | 
159 |     let indices = column.get_stream(Kind::Data, scratch)?;
160 |     let mut indices = UnsignedRleV2RunIter::new(indices, column.number_of_rows(), vec![]);
161 | 
162 |     let f = |x| values.get(x as usize).cloned().ok_or(Error::OutOfSpec);
163 | 
164 |     let mut result = Vec::with_capacity(num_of_values);
165 |     for run in indices.by_ref() {
166 |         run.and_then(|run| match run {
167 |             UnsignedRleV2Run::Direct(values) => values.map(f).try_for_each(|x| {
168 |                 result.push(x?);
169 |                 Result::<_, Error>::Ok(())
170 |             }),
171 |             UnsignedRleV2Run::Delta(values) => values.map(f).try_for_each(|x| {
172 |                 result.push(x?);
173 |                 Result::<_, Error>::Ok(())
174 |             }),
175 |             UnsignedRleV2Run::ShortRepeat(values) => values.map(f).try_for_each(|x| {
176 |                 result.push(x?);
177 |                 Result::<_, Error>::Ok(())
178 |             }),
179 |         })?;
180 |     }
181 | 
182 |     let (_, _) = indices.into_inner();
183 | 
184 |     Ok(result)
185 | }
186 | 
187 | fn deserialize_str_array_direct(
188 |     column: &Column,
189 | 
190 |     scratch: Vec<u8>,
191 |     num_of_values: usize,
192 | ) -> Result<Vec<String>, Error> {
193 |     let values = column.get_stream(Kind::Data, scratch)?;
194 |     let mut values = read::decode::Values::new(values, vec![]);
195 | 
196 |     let scratch1 = vec![];
197 |     let lengths = column.get_stream(Kind::Length, scratch1)?;
198 |     let lengths = UnsignedRleV2RunIter::new(lengths, num_of_values, vec![]);
199 | 
200 |     deserialize_str(lengths, &mut values, num_of_values)
201 | }
202 | 
203 | pub fn deserialize_str_array(column: &Column) -> Result<(Vec<bool>, Vec<String>), Error> {
204 |     let mut scratch = vec![];
205 | 
206 |     let validity = deserialize_validity(column, &mut scratch)?;
207 | 
208 |     let num_of_values: usize = validity.iter().map(|x| *x as usize).sum();
209 | 
210 |     // todo: generalize to other encodings
211 |     let encoding = column.encoding();
212 |     let valid_values = match encoding.kind() {
213 |         ColumnEncodingKind::DirectV2 => {
214 |             deserialize_str_array_direct(column, scratch, num_of_values)?
215 |         }
216 |         ColumnEncodingKind::DictionaryV2 => {
217 |             deserialize_str_dict_array(column, scratch, num_of_values)?
218 |         }
219 |         other => todo!("{other:?}"),
220 |     };
221 |     Ok((validity, valid_values))
222 | }
223 | 


--------------------------------------------------------------------------------
/tests/it/main.rs:
--------------------------------------------------------------------------------
  1 | use std::fs::File;
  2 | 
  3 | mod deserialize;
  4 | use deserialize::*;
  5 | 
  6 | use orc_format::{error::Error, read, read::Column};
  7 | 
  8 | fn get_column(path: &str, column: u32) -> Result<Column, Error> {
  9 |     // open the file, as expected. buffering this is not necessary - we
 10 |     // are very careful about the number of `read`s we perform.
 11 |     let mut f = File::open(path).expect("no file found");
 12 | 
 13 |     // read the files' metadata
 14 |     let metadata = read::read_metadata(&mut f)?;
 15 | 
 16 |     // the next step is to identify which stripe we want to read. Let's say it is the first one.
 17 |     let stripe = 0;
 18 | 
 19 |     // Each stripe has a footer - we need to read it to extract the location of each column on it.
 20 |     let stripe_footer = read::read_stripe_footer(&mut f, &metadata, stripe, &mut vec![])?;
 21 | 
 22 |     // Finally, we read the column into `Column`
 23 |     read::read_stripe_column(&mut f, &metadata, stripe, stripe_footer, column, vec![])
 24 | }
 25 | 
 26 | #[test]
 27 | fn read_bool() -> Result<(), Error> {
 28 |     let column = get_column("test.orc", 2)?;
 29 | 
 30 |     let (a, b) = deserialize_bool_array(&column)?;
 31 |     assert_eq!(a, vec![true, true, false, true, true]);
 32 |     assert_eq!(b, vec![true, false, true, false]);
 33 | 
 34 |     let (_footer, _scratch) = column.into_inner();
 35 |     Ok(())
 36 | }
 37 | 
 38 | #[test]
 39 | fn read_str_direct() -> Result<(), Error> {
 40 |     let column = get_column("test.orc", 3)?;
 41 | 
 42 |     let (a, b) = deserialize_str_array(&column)?;
 43 |     assert_eq!(a, vec![true, true, false, true, true]);
 44 |     assert_eq!(b, vec!["a", "cccccc", "ddd", "ee"]);
 45 |     Ok(())
 46 | }
 47 | 
 48 | #[test]
 49 | fn read_str_delta_plus() -> Result<(), Error> {
 50 |     let column = get_column("test.orc", 4)?;
 51 | 
 52 |     let (a, b) = deserialize_str_array(&column)?;
 53 |     assert_eq!(a, vec![true, true, false, true, true]);
 54 |     assert_eq!(b, vec!["a", "bb", "ccc", "ddd"]);
 55 |     Ok(())
 56 | }
 57 | 
 58 | #[test]
 59 | fn read_str_delta_minus() -> Result<(), Error> {
 60 |     let column = get_column("test.orc", 5)?;
 61 | 
 62 |     let (a, b) = deserialize_str_array(&column)?;
 63 |     assert_eq!(a, vec![true, true, false, true, true]);
 64 |     assert_eq!(b, vec!["ddd", "cc", "bb", "a"]);
 65 |     Ok(())
 66 | }
 67 | 
 68 | #[test]
 69 | fn read_str_short_repeat() -> Result<(), Error> {
 70 |     let column = get_column("test.orc", 6)?;
 71 | 
 72 |     let (a, b) = deserialize_str_array(&column)?;
 73 |     assert_eq!(a, vec![true, true, false, true, true]);
 74 |     assert_eq!(b, vec!["aaaaa", "bbbbb", "ccccc", "ddddd"]);
 75 |     Ok(())
 76 | }
 77 | 
 78 | #[test]
 79 | fn read_f32() -> Result<(), Error> {
 80 |     let column = get_column("test.orc", 1)?;
 81 | 
 82 |     let (a, b) = deserialize_f32_array(&column)?;
 83 |     assert_eq!(a, vec![true, true, false, true, true]);
 84 |     assert_eq!(b, vec![1.0, 2.0, 4.0, 5.0]);
 85 |     Ok(())
 86 | }
 87 | 
 88 | #[test]
 89 | fn read_int_short_repeated() -> Result<(), Error> {
 90 |     let column = get_column("test.orc", 7)?;
 91 | 
 92 |     let (a, b) = deserialize_int_array(&column)?;
 93 |     assert_eq!(a, vec![true, true, false, true, true]);
 94 |     assert_eq!(b, vec![5, 5, 5, 5]);
 95 |     Ok(())
 96 | }
 97 | 
 98 | #[test]
 99 | fn read_int_neg_short_repeated() -> Result<(), Error> {
100 |     let column = get_column("test.orc", 8)?;
101 | 
102 |     let (a, b) = deserialize_int_array(&column)?;
103 |     assert_eq!(a, vec![true, true, false, true, true]);
104 |     assert_eq!(b, vec![-5, -5, -5, -5]);
105 |     Ok(())
106 | }
107 | 
108 | #[test]
109 | fn read_int_delta() -> Result<(), Error> {
110 |     let column = get_column("test.orc", 9)?;
111 | 
112 |     let (a, b) = deserialize_int_array(&column)?;
113 |     assert_eq!(a, vec![true, true, false, true, true]);
114 |     assert_eq!(b, vec![1, 2, 4, 5]);
115 |     Ok(())
116 | }
117 | 
118 | #[test]
119 | fn read_int_neg_delta() -> Result<(), Error> {
120 |     let column = get_column("test.orc", 10)?;
121 | 
122 |     let (a, b) = deserialize_int_array(&column)?;
123 |     assert_eq!(a, vec![true, true, false, true, true]);
124 |     assert_eq!(b, vec![5, 4, 2, 1]);
125 |     Ok(())
126 | }
127 | 
128 | #[test]
129 | fn read_int_direct() -> Result<(), Error> {
130 |     let column = get_column("test.orc", 11)?;
131 | 
132 |     let (a, b) = deserialize_int_array(&column)?;
133 |     assert_eq!(a, vec![true, true, false, true, true]);
134 |     assert_eq!(b, vec![1, 6, 3, 2]);
135 |     Ok(())
136 | }
137 | 
138 | #[test]
139 | fn read_int_neg_direct() -> Result<(), Error> {
140 |     let column = get_column("test.orc", 12)?;
141 | 
142 |     let (a, b) = deserialize_int_array(&column)?;
143 |     assert_eq!(a, vec![true, true, false, true, true]);
144 |     assert_eq!(b, vec![-1, -6, -3, -2]);
145 |     Ok(())
146 | }
147 | 
148 | #[test]
149 | fn read_bigint_direct() -> Result<(), Error> {
150 |     let column = get_column("test.orc", 13)?;
151 | 
152 |     let (a, b) = deserialize_int_array(&column)?;
153 |     assert_eq!(a, vec![true, true, false, true, true]);
154 |     assert_eq!(b, vec![1, 6, 3, 2]);
155 |     Ok(())
156 | }
157 | 
158 | #[test]
159 | fn read_bigint_neg_direct() -> Result<(), Error> {
160 |     let column = get_column("test.orc", 14)?;
161 | 
162 |     let (a, b) = deserialize_int_array(&column)?;
163 |     assert_eq!(a, vec![true, true, false, true, true]);
164 |     assert_eq!(b, vec![-1, -6, -3, -2]);
165 |     Ok(())
166 | }
167 | 
168 | #[test]
169 | fn read_bigint_other() -> Result<(), Error> {
170 |     let column = get_column("test.orc", 15)?;
171 | 
172 |     let (a, b) = deserialize_int_array(&column)?;
173 |     assert_eq!(a, vec![true, true, true, true, true]);
174 |     assert_eq!(b, vec![5, -5, 1, 5, 5]);
175 |     Ok(())
176 | }
177 | 
/// 32 valid `true` rows.
#[test]
fn read_boolean_long() -> Result<(), Error> {
    let column = get_column("long_bool.orc", 1)?;

    let (validity, values) = deserialize_bool_array(&column)?;
    assert_eq!(validity, vec![true; 32]);
    assert_eq!(values, vec![true; 32]);
    Ok(())
}

/// Same expectations as `read_boolean_long`, read from the gzip fixture.
#[test]
fn read_bool_compressed() -> Result<(), Error> {
    let column = get_column("long_bool_gzip.orc", 1)?;

    let (validity, values) = deserialize_bool_array(&column)?;
    assert_eq!(validity, vec![true; 32]);
    assert_eq!(values, vec![true; 32]);
    Ok(())
}

/// 64 rows alternating between two strings of equal length.
#[test]
fn read_string_long() -> Result<(), Error> {
    let column = get_column("string_long.orc", 1)?;

    let (validity, values) = deserialize_str_array(&column)?;
    assert_eq!(validity, vec![true; 64]);

    let expected = vec!["abcd", "efgh"]
        .into_iter()
        .cycle()
        .take(64)
        .collect::<Vec<_>>();
    assert_eq!(values, expected);
    Ok(())
}

/// 64 rows alternating between two strings of different lengths.
#[test]
fn read_string_dict() -> Result<(), Error> {
    let column = get_column("string_dict.orc", 1)?;

    let (validity, values) = deserialize_str_array(&column)?;
    assert_eq!(validity, vec![true; 64]);

    let expected = vec!["abc", "efgh"]
        .into_iter()
        .cycle()
        .take(64)
        .collect::<Vec<_>>();
    assert_eq!(values, expected);
    Ok(())
}

/// Same expectations as `read_string_dict`, read from the gzip fixture.
#[test]
fn read_string_dict_gzip() -> Result<(), Error> {
    let column = get_column("string_dict_gzip.orc", 1)?;

    let (validity, values) = deserialize_str_array(&column)?;
    assert_eq!(validity, vec![true; 64]);

    let expected = vec!["abc", "efgh"]
        .into_iter()
        .cycle()
        .take(64)
        .collect::<Vec<_>>();
    assert_eq!(values, expected);
    Ok(())
}

/// 10 000 rows alternating between two strings of equal length.
#[test]
fn read_string_long_long() -> Result<(), Error> {
    let column = get_column("string_long_long.orc", 1)?;

    let (validity, values) = deserialize_str_array(&column)?;
    assert_eq!(validity.len(), 10_000);
    assert_eq!(validity, vec![true; 10_000]);
    assert_eq!(values.len(), 10_000);

    let expected = vec!["abcd", "efgh"]
        .into_iter()
        .cycle()
        .take(10_000)
        .collect::<Vec<_>>();
    assert_eq!(values, expected);
    Ok(())
}

/// Same expectations as `read_string_long_long`, read from the gzip fixture.
#[test]
fn read_string_long_long_gzip() -> Result<(), Error> {
    let column = get_column("string_long_long_gzip.orc", 1)?;

    let (validity, values) = deserialize_str_array(&column)?;
    assert_eq!(validity.len(), 10_000);
    assert_eq!(validity, vec![true; 10_000]);
    assert_eq!(values.len(), 10_000);

    let expected = vec!["abcd", "efgh"]
        .into_iter()
        .cycle()
        .take(10_000)
        .collect::<Vec<_>>();
    assert_eq!(values, expected);
    Ok(())
}

/// One million valid `f32` rows; only the lengths and the validity are checked.
#[test]
fn read_f32_long_long_gzip() -> Result<(), Error> {
    let column = get_column("f32_long_long_gzip.orc", 1)?;

    let (validity, values) = deserialize_f32_array(&column)?;
    assert_eq!(validity.len(), 1_000_000);
    assert_eq!(validity, vec![true; 1_000_000]);
    assert_eq!(values.len(), 1_000_000);
    Ok(())
}
297 | 
/// String column without nulls whose lengths grow by one per row.
#[test]
fn read_string_increase() -> Result<(), Error> {
    let column = get_column("test.orc", 16)?;

    let (validity, values) = deserialize_str_array(&column)?;
    assert_eq!(validity, vec![true; 5]);
    assert_eq!(values, ["a", "bb", "ccc", "dddd", "eeeee"]);
    Ok(())
}

/// String column without nulls whose lengths shrink by one per row.
#[test]
fn read_string_decrease() -> Result<(), Error> {
    let column = get_column("test.orc", 17)?;

    let (validity, values) = deserialize_str_array(&column)?;
    assert_eq!(validity, vec![true; 5]);
    assert_eq!(values, ["eeeee", "dddd", "ccc", "bb", "a"]);
    Ok(())
}
317 | 


--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         https://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    Copyright 2022 Jorge C. Leitão
180 | 
181 |    Licensed under the Apache License, Version 2.0 (the "License");
182 |    you may not use this file except in compliance with the License.
183 |    You may obtain a copy of the License at
184 | 
185 |        https://www.apache.org/licenses/LICENSE-2.0
186 | 
187 |    Unless required by applicable law or agreed to in writing, software
188 |    distributed under the License is distributed on an "AS IS" BASIS,
189 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 |    See the License for the specific language governing permissions and
191 |    limitations under the License.
192 | 


--------------------------------------------------------------------------------
/src/proto.rs:
--------------------------------------------------------------------------------
/// Protobuf message with optional `sint64` minimum, maximum and sum fields (tags 1-3).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct IntegerStatistics {
    /// Smallest value, if recorded.
    #[prost(sint64, optional, tag = "1")]
    pub minimum: ::core::option::Option<i64>,
    /// Largest value, if recorded.
    #[prost(sint64, optional, tag = "2")]
    pub maximum: ::core::option::Option<i64>,
    /// Sum of the values, if recorded.
    #[prost(sint64, optional, tag = "3")]
    pub sum: ::core::option::Option<i64>,
}
/// Protobuf message with optional `double` minimum, maximum and sum fields (tags 1-3).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DoubleStatistics {
    /// Smallest value, if recorded.
    #[prost(double, optional, tag = "1")]
    pub minimum: ::core::option::Option<f64>,
    /// Largest value, if recorded.
    #[prost(double, optional, tag = "2")]
    pub maximum: ::core::option::Option<f64>,
    /// Sum of the values, if recorded.
    #[prost(double, optional, tag = "3")]
    pub sum: ::core::option::Option<f64>,
}
/// Protobuf message with optional string minimum/maximum and a `sint64` sum.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StringStatistics {
    /// Minimum string, if recorded.
    #[prost(string, optional, tag = "1")]
    pub minimum: ::core::option::Option<::prost::alloc::string::String>,
    /// Maximum string, if recorded.
    #[prost(string, optional, tag = "2")]
    pub maximum: ::core::option::Option<::prost::alloc::string::String>,
    /// sum will store the total length of all strings in a stripe
    #[prost(sint64, optional, tag = "3")]
    pub sum: ::core::option::Option<i64>,
}
/// Protobuf message holding a repeated `uint64` list of counts (tag 1).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct BucketStatistics {
    /// One count per bucket; the meaning of each bucket is not defined in this file.
    #[prost(uint64, repeated, tag = "1")]
    pub count: ::prost::alloc::vec::Vec<u64>,
}
/// Protobuf message whose minimum, maximum and sum are all carried as optional strings.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecimalStatistics {
    /// Minimum value in string form, if recorded.
    #[prost(string, optional, tag = "1")]
    pub minimum: ::core::option::Option<::prost::alloc::string::String>,
    /// Maximum value in string form, if recorded.
    #[prost(string, optional, tag = "2")]
    pub maximum: ::core::option::Option<::prost::alloc::string::String>,
    /// Sum in string form, if recorded.
    #[prost(string, optional, tag = "3")]
    pub sum: ::core::option::Option<::prost::alloc::string::String>,
}
/// Protobuf message with optional `sint32` minimum and maximum dates.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DateStatistics {
    /// min,max values saved as days since epoch
    #[prost(sint32, optional, tag = "1")]
    pub minimum: ::core::option::Option<i32>,
    /// Maximum, in the same unit as `minimum`.
    #[prost(sint32, optional, tag = "2")]
    pub maximum: ::core::option::Option<i32>,
}
/// Protobuf message with optional `sint64` minimum and maximum timestamps.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct TimestampStatistics {
    /// min,max values saved as milliseconds since epoch
    #[prost(sint64, optional, tag = "1")]
    pub minimum: ::core::option::Option<i64>,
    /// Maximum, in the same unit as `minimum`.
    #[prost(sint64, optional, tag = "2")]
    pub maximum: ::core::option::Option<i64>,
}
/// Protobuf message carrying a single optional `sint64` sum (tag 1).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct BinaryStatistics {
    /// sum will store the total binary blob length in a stripe
    #[prost(sint64, optional, tag = "1")]
    pub sum: ::core::option::Option<i64>,
}
/// Protobuf message aggregating per-column statistics.
///
/// Every type-specific statistics message is optional; which ones a writer
/// fills in is not determined by this declaration.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ColumnStatistics {
    /// Number of values, if recorded.
    #[prost(uint64, optional, tag = "1")]
    pub number_of_values: ::core::option::Option<u64>,
    /// Integer statistics (tag 2).
    #[prost(message, optional, tag = "2")]
    pub int_statistics: ::core::option::Option<IntegerStatistics>,
    /// Floating-point statistics (tag 3).
    #[prost(message, optional, tag = "3")]
    pub double_statistics: ::core::option::Option<DoubleStatistics>,
    /// String statistics (tag 4).
    #[prost(message, optional, tag = "4")]
    pub string_statistics: ::core::option::Option<StringStatistics>,
    /// Bucket statistics (tag 5).
    #[prost(message, optional, tag = "5")]
    pub bucket_statistics: ::core::option::Option<BucketStatistics>,
    /// Decimal statistics (tag 6).
    #[prost(message, optional, tag = "6")]
    pub decimal_statistics: ::core::option::Option<DecimalStatistics>,
    /// Date statistics (tag 7).
    #[prost(message, optional, tag = "7")]
    pub date_statistics: ::core::option::Option<DateStatistics>,
    /// Binary statistics (tag 8).
    #[prost(message, optional, tag = "8")]
    pub binary_statistics: ::core::option::Option<BinaryStatistics>,
    /// Timestamp statistics (tag 9).
    #[prost(message, optional, tag = "9")]
    pub timestamp_statistics: ::core::option::Option<TimestampStatistics>,
    /// Optional null-presence flag (tag 10).
    #[prost(bool, optional, tag = "10")]
    pub has_null: ::core::option::Option<bool>,
}
/// Protobuf message pairing a list of `uint64` positions with optional column statistics.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct RowIndexEntry {
    /// Repeated positions; their interpretation is defined by the reader, not by this type.
    #[prost(uint64, repeated, tag = "1")]
    pub positions: ::prost::alloc::vec::Vec<u64>,
    /// Statistics attached to this entry, if any.
    #[prost(message, optional, tag = "2")]
    pub statistics: ::core::option::Option<ColumnStatistics>,
}
/// Protobuf message holding a repeated list of [`RowIndexEntry`] (tag 1).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct RowIndex {
    /// The index entries, in serialized order.
    #[prost(message, repeated, tag = "1")]
    pub entry: ::prost::alloc::vec::Vec<RowIndexEntry>,
}
/// Protobuf message describing one bloom filter: a hash-function count and a bitset.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct BloomFilter {
    /// Number of hash functions, if recorded.
    #[prost(uint32, optional, tag = "1")]
    pub num_hash_functions: ::core::option::Option<u32>,
    /// Bitset words, encoded as non-packed repeated `fixed64`.
    #[prost(fixed64, repeated, packed = "false", tag = "2")]
    pub bitset: ::prost::alloc::vec::Vec<u64>,
}
/// Protobuf message holding a repeated list of [`BloomFilter`] (tag 1).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct BloomFilterIndex {
    /// The bloom filters, in serialized order.
    #[prost(message, repeated, tag = "1")]
    pub bloom_filter: ::prost::alloc::vec::Vec<BloomFilter>,
}
/// Protobuf message describing one stream: its kind, owning column and length.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Stream {
    /// Raw `i32` value of a [`stream::Kind`], if present.
    #[prost(enumeration = "stream::Kind", optional, tag = "1")]
    pub kind: ::core::option::Option<i32>,
    /// Column identifier, if present.
    #[prost(uint32, optional, tag = "2")]
    pub column: ::core::option::Option<u32>,
    /// Stream length, if present (presumably in bytes; confirm against the reader).
    #[prost(uint64, optional, tag = "3")]
    pub length: ::core::option::Option<u64>,
}
/// Nested message and enum types in `Stream`.
pub mod stream {
    /// Kind of a stream, stored in [`super::Stream::kind`] as its `i32` value.
    ///
    /// if you add new index stream kinds, you need to make sure to update
    /// StreamName to ensure it is added to the stripe in the right area
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum Kind {
        Present = 0,
        Data = 1,
        Length = 2,
        DictionaryData = 3,
        DictionaryCount = 4,
        Secondary = 5,
        RowIndex = 6,
        BloomFilter = 7,
    }
}
/// Protobuf message describing how a column is encoded.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ColumnEncoding {
    /// Raw `i32` value of a [`column_encoding::Kind`], if present.
    #[prost(enumeration = "column_encoding::Kind", optional, tag = "1")]
    pub kind: ::core::option::Option<i32>,
    /// Dictionary size, if present.
    #[prost(uint32, optional, tag = "2")]
    pub dictionary_size: ::core::option::Option<u32>,
}
/// Nested message and enum types in `ColumnEncoding`.
pub mod column_encoding {
    /// Encoding kind, stored in [`super::ColumnEncoding::kind`] as its `i32` value.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum Kind {
        Direct = 0,
        Dictionary = 1,
        DirectV2 = 2,
        DictionaryV2 = 3,
    }
}
/// Protobuf message describing one type node of a schema.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Type {
    /// Raw `i32` value of a [`r#type::Kind`], if present.
    #[prost(enumeration = "r#type::Kind", optional, tag = "1")]
    pub kind: ::core::option::Option<i32>,
    /// Repeated `uint32` subtype references (presumably indices of child types; confirm against the reader).
    #[prost(uint32, repeated, tag = "2")]
    pub subtypes: ::prost::alloc::vec::Vec<u32>,
    /// Repeated field names.
    #[prost(string, repeated, tag = "3")]
    pub field_names: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
    /// Optional maximum length.
    #[prost(uint32, optional, tag = "4")]
    pub maximum_length: ::core::option::Option<u32>,
    /// Optional precision.
    #[prost(uint32, optional, tag = "5")]
    pub precision: ::core::option::Option<u32>,
    /// Optional scale.
    #[prost(uint32, optional, tag = "6")]
    pub scale: ::core::option::Option<u32>,
}
/// Nested message and enum types in `Type`.
pub mod r#type {
    /// Type kind, stored in [`super::Type::kind`] as its `i32` value.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum Kind {
        Boolean = 0,
        Byte = 1,
        Short = 2,
        Int = 3,
        Long = 4,
        Float = 5,
        Double = 6,
        String = 7,
        Binary = 8,
        Timestamp = 9,
        List = 10,
        Map = 11,
        Struct = 12,
        Union = 13,
        Decimal = 14,
        Date = 15,
        Varchar = 16,
        Char = 17,
    }
}
/// Protobuf message locating one stripe: an offset, three section lengths and a row count.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StripeInformation {
    /// Stripe offset, if present.
    #[prost(uint64, optional, tag = "1")]
    pub offset: ::core::option::Option<u64>,
    /// Length of the index section, if present.
    #[prost(uint64, optional, tag = "2")]
    pub index_length: ::core::option::Option<u64>,
    /// Length of the data section, if present.
    #[prost(uint64, optional, tag = "3")]
    pub data_length: ::core::option::Option<u64>,
    /// Length of the stripe footer, if present.
    #[prost(uint64, optional, tag = "4")]
    pub footer_length: ::core::option::Option<u64>,
    /// Number of rows in the stripe, if present.
    #[prost(uint64, optional, tag = "5")]
    pub number_of_rows: ::core::option::Option<u64>,
}
/// Protobuf message holding one user metadata entry: a string name and a byte value.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct UserMetadataItem {
    /// Entry name, if present.
    #[prost(string, optional, tag = "1")]
    pub name: ::core::option::Option<::prost::alloc::string::String>,
    /// Entry value as raw bytes, if present.
    #[prost(bytes = "vec", optional, tag = "2")]
    pub value: ::core::option::Option<::prost::alloc::vec::Vec<u8>>,
}
/// Protobuf message holding a repeated list of [`ColumnStatistics`] (tag 1).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StripeStatistics {
    /// Column statistics, in serialized order.
    #[prost(message, repeated, tag = "1")]
    pub col_stats: ::prost::alloc::vec::Vec<ColumnStatistics>,
}
/// Protobuf message holding a repeated list of [`StripeStatistics`] (tag 1).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Metadata {
    /// Stripe statistics, in serialized order.
    #[prost(message, repeated, tag = "1")]
    pub stripe_stats: ::prost::alloc::vec::Vec<StripeStatistics>,
}
/// Protobuf message for the file footer: lengths, stripes, types, user metadata and statistics.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Footer {
    /// Header length, if present.
    #[prost(uint64, optional, tag = "1")]
    pub header_length: ::core::option::Option<u64>,
    /// Content length, if present.
    #[prost(uint64, optional, tag = "2")]
    pub content_length: ::core::option::Option<u64>,
    /// One [`StripeInformation`] per stripe.
    #[prost(message, repeated, tag = "3")]
    pub stripes: ::prost::alloc::vec::Vec<StripeInformation>,
    /// The schema's [`Type`] entries.
    #[prost(message, repeated, tag = "4")]
    pub types: ::prost::alloc::vec::Vec<Type>,
    /// User-supplied metadata entries.
    #[prost(message, repeated, tag = "5")]
    pub metadata: ::prost::alloc::vec::Vec<UserMetadataItem>,
    /// Total number of rows, if present.
    #[prost(uint64, optional, tag = "6")]
    pub number_of_rows: ::core::option::Option<u64>,
    /// Repeated [`ColumnStatistics`] entries.
    #[prost(message, repeated, tag = "7")]
    pub statistics: ::prost::alloc::vec::Vec<ColumnStatistics>,
    /// Row index stride, if present.
    #[prost(uint32, optional, tag = "8")]
    pub row_index_stride: ::core::option::Option<u32>,
}
/// Protobuf message for the file postscript.
///
/// Serialized length must be less than 255 bytes
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PostScript {
    /// Length of the footer, if present.
    #[prost(uint64, optional, tag = "1")]
    pub footer_length: ::core::option::Option<u64>,
    /// Raw `i32` value of a [`CompressionKind`], if present.
    #[prost(enumeration = "CompressionKind", optional, tag = "2")]
    pub compression: ::core::option::Option<i32>,
    /// Compression block size, if present.
    #[prost(uint64, optional, tag = "3")]
    pub compression_block_size: ::core::option::Option<u64>,
    /// the version of the file format
    ///   [0, 11] = Hive 0.11
    ///   [0, 12] = Hive 0.12
    #[prost(uint32, repeated, tag = "4")]
    pub version: ::prost::alloc::vec::Vec<u32>,
    /// Length of the metadata section, if present.
    #[prost(uint64, optional, tag = "5")]
    pub metadata_length: ::core::option::Option<u64>,
    /// Version of the writer:
    ///   0 (or missing) = original
    ///   1 = HIVE-8732 fixed
    #[prost(uint32, optional, tag = "6")]
    pub writer_version: ::core::option::Option<u32>,
    /// Leave this last in the record
    #[prost(string, optional, tag = "8000")]
    pub magic: ::core::option::Option<::prost::alloc::string::String>,
}
/// Compression kind, stored in [`PostScript::compression`] as its `i32` value.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum CompressionKind {
    None = 0,
    Zlib = 1,
    Snappy = 2,
    Lzo = 3,
}
287 | 


--------------------------------------------------------------------------------
/src/read/decode/rle_v2.rs:
--------------------------------------------------------------------------------
  1 | use std::io::Read;
  2 | 
  3 | use crate::error::Error;
  4 | 
/// The sub-encoding kinds of an RLE v2 run.
///
/// NOTE(review): how a header byte is mapped to a variant is not visible in
/// this part of the file; confirm against the dispatching code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum EncodingTypeV2 {
    ShortRepeat,
    Direct,
    PatchedBase,
    Delta,
}
 12 | 
 13 | fn header_to_rle_v2_short_repeated_width(header: u8) -> u8 {
 14 |     (header & 0b00111000) >> 3
 15 | }
 16 | 
 17 | fn header_to_rle_v2_short_repeated_count(header: u8) -> u8 {
 18 |     header & 0b00000111
 19 | }
 20 | 
 21 | fn rle_v2_direct_bit_width(value: u8) -> u8 {
 22 |     match value {
 23 |         0 => 1,
 24 |         1 => 2,
 25 |         3 => 4,
 26 |         7 => 8,
 27 |         15 => 16,
 28 |         23 => 24,
 29 |         27 => 32,
 30 |         28 => 40,
 31 |         29 => 48,
 32 |         30 => 56,
 33 |         31 => 64,
 34 |         other => todo!("{other}"),
 35 |     }
 36 | }
 37 | 
 38 | fn header_to_rle_v2_direct_bit_width(header: u8) -> u8 {
 39 |     let bit_width = (header & 0b00111110) >> 1;
 40 |     rle_v2_direct_bit_width(bit_width)
 41 | }
 42 | 
 43 | fn rle_v2_delta_bit_width(value: u8) -> u8 {
 44 |     match value {
 45 |         0 => 0,
 46 |         1 => 2,
 47 |         3 => 4,
 48 |         7 => 8,
 49 |         15 => 16,
 50 |         23 => 24,
 51 |         27 => 32,
 52 |         28 => 40,
 53 |         29 => 48,
 54 |         30 => 56,
 55 |         31 => 64,
 56 |         other => todo!("{other}"),
 57 |     }
 58 | }
 59 | 
 60 | fn header_to_rle_v2_delta_bit_width(header: u8) -> u8 {
 61 |     let bit_width = (header & 0b00111110) >> 1;
 62 |     rle_v2_delta_bit_width(bit_width)
 63 | }
 64 | 
 65 | fn header_to_rle_v2_direct_length(header: u8, header1: u8) -> u16 {
 66 |     let bit = header & 0b00000001;
 67 |     let r = u16::from_be_bytes([bit, header1]);
 68 |     1 + r
 69 | }
 70 | 
 71 | fn unsigned_varint<R: Read>(reader: &mut R) -> Result<u64, Error> {
 72 |     let mut i = 0u64;
 73 |     let mut buf = [0u8; 1];
 74 |     let mut j = 0;
 75 |     loop {
 76 |         if j > 9 {
 77 |             // if j * 7 > 64
 78 |             return Err(Error::OutOfSpec);
 79 |         }
 80 |         reader.read_exact(&mut buf[..])?;
 81 |         i |= (u64::from(buf[0] & 0x7F)) << (j * 7);
 82 |         if (buf[0] >> 7) == 0 {
 83 |             break;
 84 |         } else {
 85 |             j += 1;
 86 |         }
 87 |     }
 88 |     Ok(i)
 89 | }
 90 | 
 91 | #[inline]
 92 | fn zigzag(z: u64) -> i64 {
 93 |     if z & 0x1 == 0 {
 94 |         (z >> 1) as i64
 95 |     } else {
 96 |         !(z >> 1) as i64
 97 |     }
 98 | }
 99 | 
100 | fn signed_varint<R: Read>(reader: &mut R) -> Result<i64, Error> {
101 |     unsigned_varint(reader).map(zigzag)
102 | }
103 | 
/// Extracts the `index`-th value from `bytes`, interpreted as a sequence of
/// `num_bits`-wide unsigned integers packed back to back, most significant
/// bit first.
///
/// Values may start at any bit offset and may straddle byte boundaries.
/// A `num_bits` of zero always yields `0` without touching `bytes`.
///
/// # Panics
///
/// Panics if `bytes` is too short to contain the requested value. `num_bits`
/// must not exceed 64.
#[inline]
fn unpack(bytes: &[u8], num_bits: u8, index: usize) -> u64 {
    if num_bits == 0 {
        return 0;
    }
    let num_bits = num_bits as usize;
    let start = num_bits * index; // in bits
    let byte_start = start / 8;
    let byte_end = (start + num_bits + 7) / 8;
    let window = &bytes[byte_start..byte_end];

    // Big-endian accumulation. An unaligned value of up to 64 bits can touch
    // nine bytes, hence the 128-bit accumulator.
    let bits = window
        .iter()
        .fold(0u128, |acc, &byte| (acc << 8) | u128::from(byte));

    // Bits of the window that lie after the value. The window ends on the
    // byte containing the value's last bit, so this never underflows.
    let trailing = window.len() * 8 - start % 8 - num_bits;

    // Truncation drops the leading bits of the window; the mask removes the rest.
    ((bits >> trailing) as u64) & (u64::MAX >> (64 - num_bits))
}
124 | 
125 | #[derive(Debug)]
126 | pub struct UnsignedDirectRun {
127 |     data: Vec<u8>,
128 |     bit_width: u8,
129 |     index: usize,
130 |     length: usize,
131 | }
132 | 
133 | impl UnsignedDirectRun {
134 |     #[inline]
135 |     pub fn try_new<R: Read>(
136 |         header: u8,
137 |         reader: &mut R,
138 |         mut scratch: Vec<u8>,
139 |     ) -> Result<Self, Error> {
140 |         let mut header1 = [0u8];
141 |         reader.read_exact(&mut header1)?;
142 |         let bit_width = header_to_rle_v2_direct_bit_width(header);
143 | 
144 |         let length = header_to_rle_v2_direct_length(header, header1[0]);
145 | 
146 |         let additional = ((bit_width as usize) * (length as usize) + 7) / 8;
147 |         scratch.clear();
148 |         scratch.reserve(additional);
149 |         reader.take(additional as u64).read_to_end(&mut scratch)?;
150 | 
151 |         Ok(Self {
152 |             data: scratch,
153 |             bit_width,
154 |             index: 0,
155 |             length: length as usize,
156 |         })
157 |     }
158 | 
159 |     #[inline]
160 |     pub fn len(&self) -> usize {
161 |         self.length - self.index
162 |     }
163 | }
164 | 
165 | impl Iterator for UnsignedDirectRun {
166 |     type Item = u64;
167 | 
168 |     #[inline]
169 |     fn next(&mut self) -> Option<Self::Item> {
170 |         (self.index != self.length).then(|| {
171 |             let index = self.index;
172 |             self.index += 1;
173 |             unpack(&self.data, self.bit_width, index)
174 |         })
175 |     }
176 | 
177 |     #[inline]
178 |     fn size_hint(&self) -> (usize, Option<usize>) {
179 |         let remaining = self.len();
180 |         (remaining, Some(remaining))
181 |     }
182 | }
183 | 
184 | pub struct UnsignedDeltaRun {
185 |     encoded_deltas: Vec<u8>,
186 |     bit_width: u8,
187 |     index: usize,
188 |     length: usize,
189 |     base: u64,
190 |     delta_base: i64,
191 | }
192 | 
193 | impl UnsignedDeltaRun {
194 |     #[inline]
195 |     pub fn try_new<R: Read>(
196 |         header: u8,
197 |         reader: &mut R,
198 |         mut scratch: Vec<u8>,
199 |     ) -> Result<Self, Error> {
200 |         let mut header1 = [0u8];
201 |         reader.read_exact(&mut header1)?;
202 |         let bit_width = header_to_rle_v2_delta_bit_width(header);
203 | 
204 |         let length = header_to_rle_v2_direct_length(header, header1[0]);
205 | 
206 |         let base = unsigned_varint(reader)?;
207 |         let delta_base = signed_varint(reader)?;
208 |         let additional = ((length as usize - 2) * bit_width as usize + 7) / 8;
209 | 
210 |         scratch.clear();
211 |         scratch.reserve(additional);
212 |         reader.take(additional as u64).read_to_end(&mut scratch)?;
213 | 
214 |         Ok(Self {
215 |             base,
216 |             encoded_deltas: scratch,
217 |             bit_width,
218 |             index: 0,
219 |             length: length as usize,
220 |             delta_base,
221 |         })
222 |     }
223 | 
224 |     #[inline]
225 |     pub fn len(&self) -> usize {
226 |         self.length - self.index
227 |     }
228 | 
229 |     #[inline]
230 |     pub fn into_inner(mut self) -> Vec<u8> {
231 |         self.encoded_deltas.clear();
232 |         self.encoded_deltas
233 |     }
234 | }
235 | 
236 | impl Iterator for UnsignedDeltaRun {
237 |     type Item = u64;
238 | 
239 |     #[inline]
240 |     fn next(&mut self) -> Option<Self::Item> {
241 |         (self.index != self.length).then(|| {
242 |             let index = self.index;
243 |             if index == 0 {
244 |                 self.index += 1;
245 |                 return self.base;
246 |             }
247 |             if index == 1 || self.bit_width == 0 {
248 |                 self.index += 1;
249 |                 if self.delta_base > 0 {
250 |                     self.base += self.delta_base as u64;
251 |                 } else {
252 |                     self.base -= (-self.delta_base) as u64;
253 |                 }
254 |                 return self.base;
255 |             }
256 |             self.index += 1;
257 |             let delta = unpack(&self.encoded_deltas, self.bit_width, index - 2);
258 |             if self.delta_base > 0 {
259 |                 self.base += delta;
260 |             } else {
261 |                 self.base -= delta;
262 |             }
263 |             self.base
264 |         })
265 |     }
266 | 
267 |     #[inline]
268 |     fn size_hint(&self) -> (usize, Option<usize>) {
269 |         let remaining = self.len();
270 |         (remaining, Some(remaining))
271 |     }
272 | }
273 | 
274 | #[derive(Debug)]
275 | pub struct UnsignedShortRepeat {
276 |     value: u64,
277 |     remaining: usize,
278 |     scratch: Vec<u8>,
279 | }
280 | 
281 | impl UnsignedShortRepeat {
282 |     #[inline]
283 |     fn try_new<R: Read>(header: u8, reader: &mut R, mut scratch: Vec<u8>) -> Result<Self, Error> {
284 |         let width = 1 + header_to_rle_v2_short_repeated_width(header);
285 |         let count = 3 + header_to_rle_v2_short_repeated_count(header);
286 | 
287 |         scratch.clear();
288 |         scratch.reserve(width as usize);
289 |         reader.take(width as u64).read_to_end(&mut scratch)?;
290 | 
291 |         let mut a = [0u8; 8];
292 |         a[8 - scratch.len()..].copy_from_slice(&scratch);
293 |         let value = u64::from_be_bytes(a);
294 |         scratch.clear();
295 | 
296 |         Ok(Self {
297 |             value,
298 |             remaining: count as usize,
299 |             scratch,
300 |         })
301 |     }
302 | 
303 |     #[inline]
304 |     pub fn len(&self) -> usize {
305 |         self.remaining
306 |     }
307 | 
308 |     #[inline]
309 |     pub fn into_inner(self) -> Vec<u8> {
310 |         self.scratch
311 |     }
312 | }
313 | 
314 | impl Iterator for UnsignedShortRepeat {
315 |     type Item = u64;
316 | 
317 |     #[inline]
318 |     fn next(&mut self) -> Option<Self::Item> {
319 |         (self.remaining != 0).then(|| {
320 |             self.remaining -= 1;
321 |             self.value
322 |         })
323 |     }
324 | 
325 |     #[inline]
326 |     fn size_hint(&self) -> (usize, Option<usize>) {
327 |         (self.len(), Some(self.len()))
328 |     }
329 | }
330 | 
/// A delta-encoded run of signed integers.
#[derive(Debug)]
pub struct SignedDeltaRun {
    /// Bit-packed deltas read from the stream; looked up with `unpack`.
    encoded_deltas: Vec<u8>,
    /// Width in bits of each packed delta; `0` means every step equals `delta_base`.
    bit_width: u8,
    /// Position of the next value to yield.
    index: usize,
    /// Total number of values in the run.
    length: usize,
    /// The first value of the run, then updated to the most recently yielded value.
    base: i64,
    /// The signed difference between the first two values.
    delta_base: i64,
}
340 | 
341 | impl SignedDeltaRun {
342 |     #[inline]
343 |     fn try_new<R: Read>(header: u8, reader: &mut R, mut scratch: Vec<u8>) -> Result<Self, Error> {
344 |         let mut header1 = [0u8];
345 |         reader.read_exact(&mut header1)?;
346 |         let bit_width = header_to_rle_v2_delta_bit_width(header);
347 | 
348 |         let length = header_to_rle_v2_direct_length(header, header1[0]);
349 | 
350 |         let base = unsigned_varint(reader).map(zigzag)?;
351 |         let delta_base = signed_varint(reader)?;
352 |         let additional = ((length as usize - 2) * bit_width as usize + 7) / 8;
353 | 
354 |         scratch.clear();
355 |         scratch.reserve(additional);
356 |         reader.take(additional as u64).read_to_end(&mut scratch)?;
357 | 
358 |         Ok(Self {
359 |             base,
360 |             encoded_deltas: scratch,
361 |             bit_width,
362 |             index: 0,
363 |             length: length as usize,
364 |             delta_base,
365 |         })
366 |     }
367 | 
368 |     pub fn len(&self) -> usize {
369 |         self.length - self.index
370 |     }
371 | 
372 |     #[must_use]
373 |     pub fn is_empty(&self) -> bool {
374 |         self.len() == 0
375 |     }
376 | }
377 | 
378 | impl Iterator for SignedDeltaRun {
379 |     type Item = i64;
380 | 
381 |     #[inline]
382 |     fn next(&mut self) -> Option<Self::Item> {
383 |         (self.index != self.length).then(|| {
384 |             let index = self.index;
385 |             if index == 0 {
386 |                 self.index += 1;
387 |                 return self.base;
388 |             }
389 |             if index == 1 || self.bit_width == 0 {
390 |                 self.index += 1;
391 |                 if self.delta_base > 0 {
392 |                     self.base += self.delta_base as i64;
393 |                 } else {
394 |                     self.base -= (-self.delta_base) as i64;
395 |                 }
396 |                 return self.base;
397 |             }
398 |             self.index += 1;
399 |             // edge case where `bit_width == 0`, where deltas are equal to base delta
400 |             let delta = unpack(&self.encoded_deltas, self.bit_width, index - 2);
401 |             if self.delta_base > 0 {
402 |                 self.base += delta as i64;
403 |             } else {
404 |                 self.base -= delta as i64;
405 |             }
406 |             self.base
407 |         })
408 |     }
409 | 
410 |     #[inline]
411 |     fn size_hint(&self) -> (usize, Option<usize>) {
412 |         let remaining = self.length - self.index;
413 |         (remaining, Some(remaining))
414 |     }
415 | }
416 | 
417 | #[inline]
418 | fn run_encoding(header: u8) -> EncodingTypeV2 {
419 |     match (header & 128 == 128, header & 64 == 64) {
420 |         // 11... = 3
421 |         (true, true) => EncodingTypeV2::Delta,
422 |         // 10... = 2
423 |         (true, false) => EncodingTypeV2::PatchedBase,
424 |         // 01... = 1
425 |         (false, true) => EncodingTypeV2::Direct,
426 |         // 00... = 0
427 |         (false, false) => EncodingTypeV2::ShortRepeat,
428 |     }
429 | }
430 | 
/// An enum describing one of the RLE v2 runs for unsigned integers
///
/// Each variant wraps the iterator that yields the values of that run.
pub enum UnsignedRleV2Run {
    /// A run decoded by [`UnsignedDirectRun`]
    Direct(UnsignedDirectRun),
    /// A run decoded by [`UnsignedDeltaRun`]
    Delta(UnsignedDeltaRun),
    /// A run decoded by [`UnsignedShortRepeat`]
    ShortRepeat(UnsignedShortRepeat),
}
440 | 
441 | impl UnsignedRleV2Run {
442 |     /// Returns a new [`UnsignedRleV2Run`] owning `scratch`.
443 |     pub fn try_new<R: Read>(reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
444 |         let mut header = [0u8];
445 |         reader.read_exact(&mut header)?;
446 |         let header = header[0];
447 |         let encoding = run_encoding(header);
448 | 
449 |         match encoding {
450 |             EncodingTypeV2::Direct => {
451 |                 UnsignedDirectRun::try_new(header, reader, scratch).map(Self::Direct)
452 |             }
453 |             EncodingTypeV2::Delta => {
454 |                 UnsignedDeltaRun::try_new(header, reader, scratch).map(Self::Delta)
455 |             }
456 |             EncodingTypeV2::ShortRepeat => {
457 |                 UnsignedShortRepeat::try_new(header, reader, scratch).map(Self::ShortRepeat)
458 |             }
459 |             other => todo!("{other:?}"),
460 |         }
461 |     }
462 | 
463 |     /// The number of items remaining
464 |     pub fn len(&self) -> usize {
465 |         match self {
466 |             Self::Direct(run) => run.len(),
467 |             Self::Delta(run) => run.len(),
468 |             Self::ShortRepeat(run) => run.len(),
469 |         }
470 |     }
471 | 
472 |     /// Whether the iterator is empty
473 |     #[must_use]
474 |     pub fn is_empty(&self) -> bool {
475 |         self.len() == 0
476 |     }
477 | }
478 | 
/// A fallible [`Iterator`] of [`UnsignedRleV2Run`].
pub struct UnsignedRleV2RunIter<R: Read> {
    /// Source the runs are decoded from.
    reader: R,
    /// Byte buffer handed over to the next run that gets decoded.
    scratch: Vec<u8>,
    /// Number of values still expected; iteration stops when it reaches zero.
    length: usize,
}
485 | 
486 | impl<R: Read> UnsignedRleV2RunIter<R> {
487 |     /// Returns a new [`UnsignedRleV2RunIter`].
488 |     pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
489 |         Self {
490 |             reader,
491 |             scratch,
492 |             length,
493 |         }
494 |     }
495 | 
496 |     /// Returns its internal buffer
497 |     pub fn into_inner(mut self) -> (R, Vec<u8>) {
498 |         self.scratch.clear();
499 |         (self.reader, self.scratch)
500 |     }
501 | }
502 | 
503 | impl<R: Read> Iterator for UnsignedRleV2RunIter<R> {
504 |     type Item = Result<UnsignedRleV2Run, Error>;
505 | 
506 |     fn next(&mut self) -> Option<Self::Item> {
507 |         (self.length != 0).then(|| {
508 |             let run =
509 |                 UnsignedRleV2Run::try_new(&mut self.reader, std::mem::take(&mut self.scratch))?;
510 |             self.length -= run.len();
511 |             Ok(run)
512 |         })
513 |     }
514 | }
515 | 
/// A fallible [`Iterator`] of [`u64`].
pub struct UnsignedRleV2Iter<R: Read> {
    /// The run values are currently taken from; `None` until the first run is decoded.
    current: Option<UnsignedRleV2Run>,
    /// Supplies the next run once `current` has no values left.
    runs: UnsignedRleV2RunIter<R>,
}
521 | 
522 | impl<R: Read> UnsignedRleV2Iter<R> {
523 |     /// Returns a new [`SignedRleV2Iter`].
524 |     pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
525 |         Self {
526 |             runs: UnsignedRleV2RunIter::new(reader, length, scratch),
527 |             current: None,
528 |         }
529 |     }
530 | 
531 |     /// Returns its internal buffer
532 |     pub fn into_inner(self) -> (R, Vec<u8>) {
533 |         self.runs.into_inner()
534 |     }
535 | }
536 | 
537 | impl<R: Read> Iterator for UnsignedRleV2Iter<R> {
538 |     type Item = Result<u64, Error>;
539 | 
540 |     #[inline]
541 |     fn next(&mut self) -> Option<Self::Item> {
542 |         let next = if let Some(run) = &mut self.current {
543 |             match run {
544 |                 UnsignedRleV2Run::Direct(values_iter) => values_iter.next(),
545 |                 UnsignedRleV2Run::Delta(values_iter) => values_iter.next(),
546 |                 UnsignedRleV2Run::ShortRepeat(values_iter) => values_iter.next(),
547 |             }
548 |         } else {
549 |             None
550 |         };
551 | 
552 |         if next.is_none() {
553 |             match self.runs.next()? {
554 |                 Ok(run) => self.current = Some(run),
555 |                 Err(e) => return Some(Err(e)),
556 |             }
557 |             self.next()
558 |         } else {
559 |             next.map(Ok)
560 |         }
561 |     }
562 | }
563 | 
/// A direct run of signed integers: wraps an [`UnsignedDirectRun`], whose
/// values are passed through `zigzag` when iterated.
#[derive(Debug)]
pub struct SignedDirectRun(UnsignedDirectRun);
566 | 
567 | impl SignedDirectRun {
568 |     pub fn try_new<R: Read>(header: u8, reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
569 |         UnsignedDirectRun::try_new(header, reader, scratch).map(Self)
570 |     }
571 | 
572 |     pub fn len(&self) -> usize {
573 |         self.0.len()
574 |     }
575 | 
576 |     /// Whether the iterator is empty
577 |     #[must_use]
578 |     pub fn is_empty(&self) -> bool {
579 |         self.len() == 0
580 |     }
581 | }
582 | 
583 | impl Iterator for SignedDirectRun {
584 |     type Item = i64;
585 | 
586 |     fn next(&mut self) -> Option<Self::Item> {
587 |         self.0.next().map(zigzag)
588 |     }
589 | 
590 |     fn size_hint(&self) -> (usize, Option<usize>) {
591 |         self.0.size_hint()
592 |     }
593 | }
594 | 
/// A short-repeat run of signed integers: wraps an [`UnsignedShortRepeat`],
/// whose values are passed through `zigzag` when iterated.
#[derive(Debug)]
pub struct SignedShortRepeat(UnsignedShortRepeat);
597 | 
598 | impl SignedShortRepeat {
599 |     pub fn try_new<R: Read>(header: u8, reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
600 |         UnsignedShortRepeat::try_new(header, reader, scratch).map(Self)
601 |     }
602 | 
603 |     /// The number of items remaining
604 |     pub fn len(&self) -> usize {
605 |         self.0.len()
606 |     }
607 | 
608 |     /// Whether the iterator is empty
609 |     #[must_use]
610 |     pub fn is_empty(&self) -> bool {
611 |         self.len() == 0
612 |     }
613 | }
614 | 
615 | impl Iterator for SignedShortRepeat {
616 |     type Item = i64;
617 | 
618 |     fn next(&mut self) -> Option<Self::Item> {
619 |         self.0.next().map(zigzag)
620 |     }
621 | 
622 |     fn size_hint(&self) -> (usize, Option<usize>) {
623 |         self.0.size_hint()
624 |     }
625 | }
626 | 
/// An enum describing one of the RLE v2 runs for signed integers
///
/// Each variant wraps the iterator that yields the values of that run.
#[derive(Debug)]
pub enum SignedRleV2Run {
    /// A run decoded by [`SignedDirectRun`]
    Direct(SignedDirectRun),
    /// A run decoded by [`SignedDeltaRun`]
    Delta(SignedDeltaRun),
    /// A run decoded by [`SignedShortRepeat`]
    ShortRepeat(SignedShortRepeat),
}
637 | 
638 | impl SignedRleV2Run {
639 |     /// Returns a new [`SignedRleV2Run`], moving `scratch` to itself
640 |     pub fn try_new<R: Read>(reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
641 |         let mut header = [0u8];
642 |         reader.read_exact(&mut header)?;
643 |         let header = header[0];
644 |         let encoding = run_encoding(header);
645 | 
646 |         match encoding {
647 |             EncodingTypeV2::Direct => {
648 |                 SignedDirectRun::try_new(header, reader, scratch).map(Self::Direct)
649 |             }
650 |             EncodingTypeV2::Delta => {
651 |                 SignedDeltaRun::try_new(header, reader, scratch).map(Self::Delta)
652 |             }
653 |             EncodingTypeV2::ShortRepeat => {
654 |                 SignedShortRepeat::try_new(header, reader, scratch).map(Self::ShortRepeat)
655 |             }
656 |             other => todo!("{other:?}"),
657 |         }
658 |     }
659 | 
660 |     /// The number of items remaining
661 |     pub fn len(&self) -> usize {
662 |         match self {
663 |             Self::Direct(run) => run.len(),
664 |             Self::Delta(run) => run.len(),
665 |             Self::ShortRepeat(run) => run.len(),
666 |         }
667 |     }
668 | 
669 |     /// Whether the iterator is empty
670 |     #[must_use]
671 |     pub fn is_empty(&self) -> bool {
672 |         self.len() == 0
673 |     }
674 | }
675 | 
/// A fallible [`Iterator`] of [`SignedRleV2Run`].
pub struct SignedRleV2RunIter<R: Read> {
    /// Source the runs are decoded from.
    reader: R,
    /// Byte buffer handed over to the next run that gets decoded.
    scratch: Vec<u8>,
    /// Number of values still expected; iteration stops when it reaches zero.
    length: usize,
}
682 | 
683 | impl<R: Read> SignedRleV2RunIter<R> {
684 |     /// Returns a new [`SignedRleV2RunIter`].
685 |     pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
686 |         Self {
687 |             reader,
688 |             scratch,
689 |             length,
690 |         }
691 |     }
692 | 
693 |     pub fn into_inner(mut self) -> (R, Vec<u8>) {
694 |         self.scratch.clear();
695 |         (self.reader, self.scratch)
696 |     }
697 | }
698 | 
699 | impl<R: Read> Iterator for SignedRleV2RunIter<R> {
700 |     type Item = Result<SignedRleV2Run, Error>;
701 | 
702 |     #[inline]
703 |     fn next(&mut self) -> Option<Self::Item> {
704 |         (self.length != 0).then(|| {
705 |             let run = SignedRleV2Run::try_new(&mut self.reader, std::mem::take(&mut self.scratch))?;
706 |             self.length -= run.len();
707 |             Ok(run)
708 |         })
709 |     }
710 | }
711 | 
/// A fallible [`Iterator`] of [`i64`].
pub struct SignedRleV2Iter<R: Read> {
    /// The run values are currently taken from; `None` until the first run is decoded.
    current: Option<SignedRleV2Run>,
    /// Supplies the next run once `current` has no values left.
    runs: SignedRleV2RunIter<R>,
}
717 | 
718 | impl<R: Read> SignedRleV2Iter<R> {
719 |     /// Returns a new [`SignedRleV2Iter`].
720 |     pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
721 |         Self {
722 |             runs: SignedRleV2RunIter::new(reader, length, scratch),
723 |             current: None,
724 |         }
725 |     }
726 | 
727 |     /// Returns its internal buffer
728 |     pub fn into_inner(self) -> (R, Vec<u8>) {
729 |         self.runs.into_inner()
730 |     }
731 | }
732 | 
733 | impl<R: Read> Iterator for SignedRleV2Iter<R> {
734 |     type Item = Result<i64, Error>;
735 | 
736 |     #[inline]
737 |     fn next(&mut self) -> Option<Self::Item> {
738 |         let next = if let Some(run) = &mut self.current {
739 |             match run {
740 |                 SignedRleV2Run::Direct(values_iter) => values_iter.next(),
741 |                 SignedRleV2Run::Delta(values_iter) => values_iter.next(),
742 |                 SignedRleV2Run::ShortRepeat(values_iter) => values_iter.next(),
743 |             }
744 |         } else {
745 |             None
746 |         };
747 | 
748 |         if next.is_none() {
749 |             match self.runs.next()? {
750 |                 Ok(run) => self.current = Some(run),
751 |                 Err(e) => return Some(Err(e)),
752 |             }
753 |             self.next()
754 |         } else {
755 |             next.map(Ok)
756 |         }
757 |     }
758 | }
759 | 
#[cfg(test)]
mod test {
    use super::*;

    /// Two even zigzag inputs and their decoded values.
    #[test]
    fn test_zigzag() {
        for (encoded, decoded) in [(2, 1), (4, 2)] {
            assert_eq!(zigzag(encoded), decoded);
        }
    }

    /// Two 2-bit values packed into the top of one byte.
    #[test]
    fn unpacking() {
        let packed = [0b01000000u8];
        let values: Vec<_> = (0..2).map(|i| unpack(&packed, 2, i)).collect();
        assert_eq!(values, vec![1, 0]);
    }

    #[test]
    fn short_repeat() {
        // [10000, 10000, 10000, 10000, 10000]
        let stream: [u8; 3] = [0x0a, 0x27, 0x10];
        let (header, mut payload) = (stream[0], &stream[1..]);

        let run = UnsignedShortRepeat::try_new(header, &mut payload, vec![]).unwrap();
        let values = run.collect::<Vec<_>>();
        assert_eq!(values, vec![10000; 5]);
    }

    #[test]
    fn direct() {
        // [23713, 43806, 57005, 48879]
        let stream: [u8; 10] = [0x5e, 0x03, 0x5c, 0xa1, 0xab, 0x1e, 0xde, 0xad, 0xbe, 0xef];
        let (header, mut payload) = (stream[0], &stream[1..]);

        let run = UnsignedDirectRun::try_new(header, &mut payload, vec![]).unwrap();
        let values = run.collect::<Vec<_>>();
        assert_eq!(values, vec![23713, 43806, 57005, 48879]);
    }

    #[test]
    fn delta() {
        // [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
        // 0x22 = 34
        // 0x42 = 66
        // 0x46 = 70
        let stream: [u8; 8] = [0xc6, 0x09, 0x02, 0x02, 0x22, 0x42, 0x42, 0x46];
        let (header, mut payload) = (stream[0], &stream[1..]);

        let run = UnsignedDeltaRun::try_new(header, &mut payload, vec![]).unwrap();
        let values = run.collect::<Vec<_>>();
        assert_eq!(values, vec![2, 3, 5, 7, 11, 13, 17, 19, 23, 29]);
    }
}
817 | 


--------------------------------------------------------------------------------