├── .github
│   └── workflows
│       ├── coverage.yml
│       └── test.yml
├── .gitignore
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── src
│   ├── error.rs
│   ├── file.rs
│   ├── lib.md
│   ├── lib.rs
│   ├── read
│   │   ├── block.rs
│   │   ├── decode.rs
│   │   ├── decompress.rs
│   │   └── mod.rs
│   ├── read_async
│   │   ├── block.rs
│   │   ├── decode.rs
│   │   └── mod.rs
│   ├── schema
│   │   ├── de.rs
│   │   ├── mod.rs
│   │   └── se.rs
│   ├── write
│   │   ├── block.rs
│   │   ├── compression.rs
│   │   ├── encode.rs
│   │   ├── file.rs
│   │   └── mod.rs
│   └── write_async.rs
└── tests
    └── it
        ├── file.rs
        └── main.rs

/.github/workflows/coverage.yml:
--------------------------------------------------------------------------------
name: Coverage

on: [pull_request, push]

jobs:
  coverage:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install Rust
        run: rustup toolchain install stable --component llvm-tools-preview
      - name: Install cargo-llvm-cov
        uses: taiki-e/install-action@cargo-llvm-cov
      - uses: Swatinem/rust-cache@v1
      - name: Generate code coverage
        run: cargo llvm-cov --features full --lcov --output-path lcov.info
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        with:
          files: lcov.info
          fail_ci_if_error: true
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
name: Check and test

on: [push, pull_request]

jobs:
  ubuntu-latest:
    name: Test full
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Install Rust
        run: rustup update stable
      - uses: Swatinem/rust-cache@v1
      - name: Run
        run: cargo test --features full
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target
Cargo.lock
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "avro-schema"
version = "0.3.0"
license = "MIT/Apache-2.0"
description = "Apache Avro specification"
homepage = "https://github.com/DataEngineeringLabs/avro-schema"
repository = "https://github.com/DataEngineeringLabs/avro-schema"
authors = ["Jorge C. Leitao"]
keywords = [ "avro", "analytics" ]
edition = "2018"

[dependencies]
serde_json = { version = "1.0", default-features = false, features = ["std"] }
serde = { version = "1.0", default-features = false }

fallible-streaming-iterator = { version = "0.1" }

libflate = { version = "1.1.1", optional = true }
snap = { version = "1", optional = true }
crc = { version = "2", optional = true }

# for async
futures = { version = "0.3", optional = true }
async-stream = { version = "0.3.2", optional = true }

[features]
default = []
full = ["compression", "async"]
compression = [
    "libflate",
    "snap",
    "crc",
]
async = ["futures", "async-stream"]
--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------

                                 Apache License
                           Version 2.0, January 2004
                        https://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   Copyright 2022 Jorge C. Leitão

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       https://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------

MIT License

Copyright (c) 2022 Jorge C Leitao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Avro-schema - Avro schema in Rust

[![test](https://github.com/DataEngineeringLabs/avro-schema/actions/workflows/test.yml/badge.svg)](https://github.com/DataEngineeringLabs/avro-schema/actions/workflows/test.yml)
[![codecov](https://codecov.io/gh/DataEngineeringLabs/avro-schema/branch/main/graph/badge.svg)](https://codecov.io/gh/DataEngineeringLabs/avro-schema)

This crate contains a complete implementation of the schemas of
the [Avro specification](https://avro.apache.org/docs/current/spec.html) in
native Rust.

See the API documentation for examples of how to read and write.

## License

Licensed under either of

* Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)

at your option.

### Contribution

Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.
--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
//! Contains [`Error`]

/// Error from this crate
#[derive(Debug, Clone, Copy)]
pub enum Error {
    /// Generic error when the file is out of spec
    OutOfSpec,
    /// When reading or writing with compression but the feature flag "compression" is not active.
    RequiresCompression,
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self)
    }
}

impl From<std::io::Error> for Error {
    fn from(_: std::io::Error) -> Self {
        Error::OutOfSpec
    }
}
--------------------------------------------------------------------------------
/src/file.rs:
--------------------------------------------------------------------------------
//! Contains structs found in Avro files
use crate::schema::Record;

/// Avro file's Metadata
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct FileMetadata {
    /// The Record represented in the file's Schema
    pub record: Record,
    /// The file's compression
    pub compression: Option<Compression>,
    /// The file's marker, present in every block
    pub marker: [u8; 16],
}

/// A compressed Avro block.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CompressedBlock {
    /// The number of rows
    pub number_of_rows: usize,
    /// The compressed data
    pub data: Vec<u8>,
}

impl CompressedBlock {
    /// Creates a new CompressedBlock
    pub fn new(number_of_rows: usize, data: Vec<u8>) -> Self {
        Self {
            number_of_rows,
            data,
        }
    }
}

/// An uncompressed Avro block.
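/// For example, a block holding two rows of a single `f32` field (a sketch
/// mirroring the layout used in this crate's `lib.md` examples):
///
/// ```
/// use avro_schema::file::Block;
///
/// // two rows, each encoded as a 4-byte little-endian f32
/// let mut data = vec![];
/// data.extend(1.0f32.to_le_bytes());
/// data.extend(2.0f32.to_le_bytes());
/// let block = Block::new(2, data);
/// assert_eq!(block.data.len(), 8);
/// ```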
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Block {
    /// The number of rows
    pub number_of_rows: usize,
    /// The uncompressed data
    pub data: Vec<u8>,
}

impl Block {
    /// Creates a new Block
    pub fn new(number_of_rows: usize, data: Vec<u8>) -> Self {
        Self {
            number_of_rows,
            data,
        }
    }
}

/// Valid compressions
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum Compression {
    /// Deflate
    Deflate,
    /// Snappy
    Snappy,
}
--------------------------------------------------------------------------------
/src/lib.md:
--------------------------------------------------------------------------------
Welcome to avro-schema's documentation. Thanks for checking it out!

This is a library containing declarations of the
[Avro specification](https://avro.apache.org/docs/current/spec.html)
in Rust structs and enums, together with serialization and deserialization
implementations based on `serde_json`.

It also contains basic functionality to read and deserialize Avro files'
metadata and blocks.

Example of reading a file:

```rust
use std::convert::TryInto;
use std::fs::File;
use std::io::BufReader;

use avro_schema::error::Error;
use avro_schema::read::fallible_streaming_iterator::FallibleStreamingIterator;

fn read_avro(path: &str) -> Result<(), Error> {
    let file = &mut BufReader::new(File::open(path)?);

    let metadata = avro_schema::read::read_metadata(file)?;

    println!("{:#?}", metadata);

    let mut blocks =
        avro_schema::read::BlockStreamingIterator::new(file, metadata.compression, metadata.marker);

    while let Some(block) = blocks.next()? {
        let _fields = &metadata.record.fields;
        let length = block.number_of_rows;
        let mut block: &[u8] = block.data.as_ref();
        // at this point you can deserialize the block based on `_fields` according
        // to avro's specification. Note that `Block` is already decompressed.
        // for example, if there was a single field with f32, we would use
        for _ in 0..length {
            let (item, remaining) = block.split_at(4);
            block = remaining;
            let _value = f32::from_le_bytes(item.try_into().unwrap());
            // if there were more fields, we would need to consume (or skip) the remaining
            // here. You can use `avro_schema::read::decode::zigzag_i64` for integers :D
        }
    }

    Ok(())
}
```

Example of writing a file:

```rust
use std::fs::File;

use avro_schema::error::Error;
use avro_schema::file::Block;
use avro_schema::schema::{Field, Record, Schema};

fn write_avro(compression: Option<avro_schema::file::Compression>) -> Result<(), Error> {
    let mut file = File::create("test.avro")?;

    let record = Record::new("", vec![Field::new("value", Schema::Float)]);

    avro_schema::write::write_metadata(&mut file, record, compression)?;

    // given some data:
    let array = vec![1.0f32, 2.0];

    // we need to create a `Block`
    let mut data: Vec<u8> = vec![];
    for item in array.iter() {
        let bytes = item.to_le_bytes();
        data.extend(bytes);
    }
    let mut block = Block::new(array.len(), data);

    // once completed, we compress it
    let mut compressed_block = avro_schema::file::CompressedBlock::default();
    let _ = avro_schema::write::compress(&mut block, &mut compressed_block, compression)?;

    // and finally write it to the file
    avro_schema::write::write_block(&mut file, &compressed_block)?;

    Ok(())
}
```
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
#![doc = include_str!("lib.md")]
#![forbid(unsafe_code)]
#![forbid(missing_docs)]

pub mod error;
pub mod file;
pub mod schema;

pub mod read;
#[cfg(feature = "async")]
#[cfg_attr(docsrs, doc(cfg(feature = "async")))]
pub mod read_async;

pub mod write;
#[cfg(feature = "async")]
#[cfg_attr(docsrs, doc(cfg(feature = "async")))]
pub mod write_async;
--------------------------------------------------------------------------------
/src/read/block.rs:
--------------------------------------------------------------------------------
//! APIs to read Avro blocks from a file.
use std::io::Read;

use fallible_streaming_iterator::FallibleStreamingIterator;

use crate::{error::Error, file::CompressedBlock};

use super::decode;

fn read_size<R: Read>(reader: &mut R) -> Result<(usize, usize), Error> {
    let rows = match decode::internal_zigzag_i64(reader) {
        Ok(a) => a,
        Err(error) => match error {
            decode::DecodeError::EndOfFile => return Ok((0, 0)),
            decode::DecodeError::OutOfSpec => return Err(Error::OutOfSpec),
        },
    };
    let bytes = decode::zigzag_i64(reader)?;
    Ok((rows as usize, bytes as usize))
}

/// Reads a [`CompressedBlock`] from the `reader`.
/// # Error
/// This function errors iff either the block cannot be read or the sync marker does not match
fn read_block<R: Read>(
    reader: &mut R,
    block: &mut CompressedBlock,
    marker: [u8; 16],
) -> Result<(), Error> {
    let (rows, bytes) = read_size(reader)?;
    block.number_of_rows = rows;
    if rows == 0 {
        return Ok(());
    };

    block.data.clear();
    block
        .data
        .try_reserve(bytes)
        .map_err(|_| Error::OutOfSpec)?;
    reader.take(bytes as u64).read_to_end(&mut block.data)?;

    let mut block_marker = [0u8; 16];
    reader.read_exact(&mut block_marker)?;

    if block_marker != marker {
        return Err(Error::OutOfSpec);
    }
    Ok(())
}

/// [`FallibleStreamingIterator`] of [`CompressedBlock`].
pub struct CompressedBlockStreamingIterator<R: Read> {
    buf: CompressedBlock,
    reader: R,
    marker: [u8; 16],
}

impl<R: Read> CompressedBlockStreamingIterator<R> {
    /// Creates a new [`CompressedBlockStreamingIterator`].
    pub fn new(reader: R, marker: [u8; 16], scratch: Vec<u8>) -> Self {
        Self {
            reader,
            marker,
            buf: CompressedBlock::new(0, scratch),
        }
    }

    /// The buffer of [`CompressedBlockStreamingIterator`].
    pub fn buffer(&mut self) -> &mut CompressedBlock {
        &mut self.buf
    }

    /// Deconstructs itself
    pub fn into_inner(self) -> (R, Vec<u8>) {
        (self.reader, self.buf.data)
    }
}

impl<R: Read> FallibleStreamingIterator for CompressedBlockStreamingIterator<R> {
    type Error = Error;
    type Item = CompressedBlock;

    fn advance(&mut self) -> Result<(), Error> {
        read_block(&mut self.reader, &mut self.buf, self.marker)?;
        Ok(())
    }

    fn get(&self) -> Option<&Self::Item> {
        if self.buf.number_of_rows > 0 {
            Some(&self.buf)
        } else {
            None
        }
    }
}
--------------------------------------------------------------------------------
/src/read/decode.rs:
--------------------------------------------------------------------------------
use std::collections::HashMap;
use std::io::Read;

use crate::error::Error;
use crate::file::Compression;
use crate::schema::Schema;

use super::{avro_decode, read_header};

pub enum DecodeError {
    OutOfSpec,
    EndOfFile,
}

impl From<DecodeError> for Error {
    fn from(_: DecodeError) -> Self {
        Error::OutOfSpec
    }
}

pub fn internal_zigzag_i64<R: Read>(reader: &mut R) -> Result<i64, DecodeError> {
    let z = decode_variable(reader)?;
    Ok(if z & 0x1 == 0 {
        (z >> 1) as i64
    } else {
        !(z >> 1) as i64
    })
}

pub fn zigzag_i64<R: Read>(reader: &mut R) -> Result<i64, Error> {
    let z = decode_variable(reader)?;
    Ok(if z & 0x1 == 0 {
        (z >> 1) as i64
    } else {
        !(z >> 1) as i64
    })
}

#[inline]
fn decode_variable<R: Read>(reader: &mut R) -> Result<u64, DecodeError> {
    avro_decode!(reader)
}

fn _read_binary<R: Read>(reader: &mut R) -> Result<Vec<u8>, Error> {
    let len: usize = zigzag_i64(reader)? as usize;
    let mut buf = vec![];
    buf.try_reserve(len).map_err(|_| Error::OutOfSpec)?;
    reader.take(len as u64).read_to_end(&mut buf)?;
    Ok(buf)
}

pub fn read_header<R: Read>(reader: &mut R) -> Result<HashMap<String, Vec<u8>>, Error> {
    read_header!(reader)
}

pub(crate) fn read_file_marker<R: Read>(reader: &mut R) -> Result<[u8; 16], Error> {
    let mut marker = [0u8; 16];
    reader.read_exact(&mut marker)?;
    Ok(marker)
}

/// Deserializes the Avro header into an Avro [`Schema`] and optional [`Compression`].
pub(crate) fn deserialize_header(
    header: HashMap<String, Vec<u8>>,
) -> Result<(Schema, Option<Compression>), Error> {
    let schema = header
        .get("avro.schema")
        .ok_or(Error::OutOfSpec)
        .and_then(|bytes| serde_json::from_slice(bytes.as_ref()).map_err(|_| Error::OutOfSpec))?;

    let compression = header.get("avro.codec").and_then(|bytes| {
        let bytes: &[u8] = bytes.as_ref();
        match bytes {
            b"snappy" => Some(Compression::Snappy),
            b"deflate" => Some(Compression::Deflate),
            _ => None,
        }
    });
    Ok((schema, compression))
}
--------------------------------------------------------------------------------
/src/read/decompress.rs:
--------------------------------------------------------------------------------
//! APIs to decompress Avro blocks.
use std::io::Read;

use fallible_streaming_iterator::FallibleStreamingIterator;

use crate::error::Error;

use crate::file::Compression;
use crate::file::{Block, CompressedBlock};

use super::block::CompressedBlockStreamingIterator;

#[cfg(feature = "compression")]
const CRC_TABLE: crc::Crc<u32> = crc::Crc::<u32>::new(&crc::CRC_32_ISO_HDLC);

/// Decompresses a [`CompressedBlock`] into a [`Block`].
/// Returns whether the buffers were swapped.
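/// A minimal sketch of the intended call pattern (with `compression: None`,
/// the buffers are simply swapped rather than copied):
///
/// ```ignore
/// use avro_schema::file::{Block, CompressedBlock};
///
/// let mut compressed = CompressedBlock::new(10, vec![0u8; 40]);
/// let mut block = Block::new(0, vec![]);
/// let swapped = decompress_block(&mut compressed, &mut block, None)?;
/// assert!(swapped);
/// ```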
pub fn decompress_block(
    block: &mut CompressedBlock,
    decompressed: &mut Block,
    compression: Option<Compression>,
) -> Result<bool, Error> {
    decompressed.number_of_rows = block.number_of_rows;
    let block = &mut block.data;
    let decompressed = &mut decompressed.data;

    match compression {
        None => {
            std::mem::swap(block, decompressed);
            Ok(true)
        }
        #[cfg(feature = "compression")]
        Some(Compression::Deflate) => {
            decompressed.clear();
            let mut decoder = libflate::deflate::Decoder::new(&block[..]);
            decoder.read_to_end(decompressed)?;
            Ok(false)
        }
        #[cfg(feature = "compression")]
        Some(Compression::Snappy) => {
            let crc = &block[block.len() - 4..];
            let block = &block[..block.len() - 4];

            let len = snap::raw::decompress_len(block).map_err(|_| Error::OutOfSpec)?;
            decompressed.clear();
            decompressed.resize(len, 0);
            snap::raw::Decoder::new()
                .decompress(block, decompressed)
                .map_err(|_| Error::OutOfSpec)?;

            let expected_crc = u32::from_be_bytes([crc[0], crc[1], crc[2], crc[3]]);

            let actual_crc = CRC_TABLE.checksum(decompressed);
            if expected_crc != actual_crc {
                return Err(Error::OutOfSpec);
            }
            Ok(false)
        }
        #[cfg(not(feature = "compression"))]
        Some(Compression::Deflate) => Err(Error::RequiresCompression),
        #[cfg(not(feature = "compression"))]
        Some(Compression::Snappy) => Err(Error::RequiresCompression),
    }
}

/// [`FallibleStreamingIterator`] of decompressed [`Block`]
pub struct BlockStreamingIterator<R: Read> {
    blocks: CompressedBlockStreamingIterator<R>,
    compression: Option<Compression>,
    buf: Block,
    was_swapped: bool,
}

/// Returns a [`FallibleStreamingIterator`] of [`Block`].
pub fn block_iterator<R: Read>(
    reader: R,
    compression: Option<Compression>,
    marker: [u8; 16],
) -> BlockStreamingIterator<R> {
    BlockStreamingIterator::<R>::new(reader, compression, marker)
}

impl<R: Read> BlockStreamingIterator<R> {
    /// Returns a new [`BlockStreamingIterator`].
    pub fn new(reader: R, compression: Option<Compression>, marker: [u8; 16]) -> Self {
        Self {
            blocks: CompressedBlockStreamingIterator::new(reader, marker, vec![]),
            compression,
            buf: Block::new(0, vec![]),
            was_swapped: false,
        }
    }

    /// Deconstructs itself into its internal reader
    #[inline]
    pub fn into_inner(self) -> R {
        self.blocks.into_inner().0
    }
}

impl<R: Read> FallibleStreamingIterator for BlockStreamingIterator<R> {
    type Error = Error;
    type Item = Block;

    #[inline]
    fn advance(&mut self) -> Result<(), Error> {
        if self.was_swapped {
            std::mem::swap(&mut self.blocks.buffer().data, &mut self.buf.data);
        }
        self.blocks.advance()?;
        self.was_swapped = decompress_block(self.blocks.buffer(), &mut self.buf, self.compression)?;
        Ok(())
    }

    #[inline]
    fn get(&self) -> Option<&Self::Item> {
        if self.buf.number_of_rows > 0 {
            Some(&self.buf)
        } else {
            None
        }
    }
}
--------------------------------------------------------------------------------
/src/read/mod.rs:
--------------------------------------------------------------------------------
//! Functions to read and decompress files' metadata and blocks
mod block;
mod decode;
pub(crate) mod decompress;

use std::io::Read;

use crate::error::Error;
use crate::file::FileMetadata;
use crate::schema::Schema;

pub use fallible_streaming_iterator;

// macros that can operate in sync and async code.
macro_rules! avro_decode {
    ($reader:ident $($_await:tt)*) => {
        {
            let mut i = 0u64;
            let mut buf = [0u8; 1];
            let mut j = 0;
            loop {
                if j > 9 {
                    // if j * 7 > 64
                    return Err(DecodeError::OutOfSpec);
                }
                $reader.read_exact(&mut buf[..])$($_await)*.map_err(|_| DecodeError::EndOfFile)?;
                i |= (u64::from(buf[0] & 0x7F)) << (j * 7);
                if (buf[0] >> 7) == 0 {
                    break;
                } else {
                    j += 1;
                }
            }

            Ok(i)
        }
    }
}

macro_rules! read_header {
    ($reader:ident $($_await:tt)*) => {{
        let mut items = HashMap::new();

        loop {
            let len = zigzag_i64($reader)$($_await)*.map_err(|_| Error::OutOfSpec)? as usize;
            if len == 0 {
                break Ok(items);
            }

            items.reserve(len);
            for _ in 0..len {
                let key = _read_binary($reader)$($_await)*?;
                let key = String::from_utf8(key)
                    .map_err(|_| Error::OutOfSpec)?;
                let value = _read_binary($reader)$($_await)*?;
                items.insert(key, value);
            }
        }
    }};
}

macro_rules! read_metadata_macro {
    ($reader:ident $($_await:tt)*) => {{
        let mut magic_number = [0u8; 4];
        $reader.read_exact(&mut magic_number)$($_await)*.map_err(|_| Error::OutOfSpec)?;

        // see https://avro.apache.org/docs/current/spec.html#Object+Container+Files
        if magic_number != [b'O', b'b', b'j', 1u8] {
            return Err(Error::OutOfSpec);
        }

        let header = decode::read_header($reader)$($_await)*?;

        let (schema, compression) = deserialize_header(header)?;

        let marker = decode::read_file_marker($reader)$($_await)*?;

        let record = if let Schema::Record(record) = schema {
            record
        } else {
            return Err(Error::OutOfSpec)
        };

        Ok(FileMetadata {
            record,
            compression,
            marker,
        })
    }};
}

#[allow(unused_imports)]
pub(crate) use {
    avro_decode, decode::deserialize_header, decode::DecodeError, read_header, read_metadata_macro,
};

/// Reads the metadata from `reader` into [`FileMetadata`].
/// # Error
/// This function errors iff the header is not a valid avro file header.
pub fn read_metadata<R: Read>(reader: &mut R) -> Result<FileMetadata, Error> {
    read_metadata_macro!(reader)
}

pub use decompress::{block_iterator, BlockStreamingIterator};
--------------------------------------------------------------------------------
/src/read_async/block.rs:
--------------------------------------------------------------------------------
//! APIs to asynchronously read Avro blocks.
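//!
//! A minimal sketch of consuming [`block_stream`] (assuming a
//! `futures`-compatible reader and metadata already read via `read_metadata`):
//!
//! ```ignore
//! use futures::{pin_mut, StreamExt};
//!
//! let stream = block_stream(&mut reader, metadata.marker).await;
//! pin_mut!(stream);
//! while let Some(block) = stream.next().await {
//!     let block = block?; // a `CompressedBlock`
//!     // decompress with `decompress_block` and deserialize per the schema
//! }
//! ```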
use async_stream::try_stream;
use futures::AsyncRead;
use futures::AsyncReadExt;
use futures::Stream;

use crate::error::Error;
use crate::file::CompressedBlock;
use crate::read::DecodeError;

use super::decode::zigzag_i64;

async fn read_size<R: AsyncRead + Unpin>(reader: &mut R) -> Result<(usize, usize), Error> {
    let rows = match zigzag_i64(reader).await {
        Ok(a) => a,
        Err(error) => match error {
            DecodeError::EndOfFile => return Ok((0, 0)),
            DecodeError::OutOfSpec => return Err(Error::OutOfSpec),
        },
    };

    let bytes = zigzag_i64(reader).await?;
    Ok((rows as usize, bytes as usize))
}

/// Reads a [`CompressedBlock`] from the `reader`.
/// # Error
/// This function errors iff either the block cannot be read or the sync marker does not match
async fn read_block<R: AsyncRead + Unpin>(
    reader: &mut R,
    block: &mut CompressedBlock,
    marker: [u8; 16],
) -> Result<(), Error> {
    let (rows, bytes) = read_size(reader).await?;
    block.number_of_rows = rows;
    if rows == 0 {
        return Ok(());
    };

    block.data.clear();
    block
        .data
        .try_reserve(bytes)
        .map_err(|_| Error::OutOfSpec)?;
    reader
        .take(bytes as u64)
        .read_to_end(&mut block.data)
        .await?;

    let mut block_marker = [0u8; 16];
    reader.read_exact(&mut block_marker).await?;

    if block_marker != marker {
        return Err(Error::OutOfSpec);
    }
    Ok(())
}

/// Returns a fallible [`Stream`] of Avro blocks bound to `reader`
pub async fn block_stream<R: AsyncRead + Unpin>(
    reader: &mut R,
    marker: [u8; 16],
) -> impl Stream<Item = Result<CompressedBlock, Error>> + '_ {
    try_stream! {
        loop {
            let mut block = CompressedBlock::new(0, vec![]);
            read_block(reader, &mut block, marker).await?;
            if block.number_of_rows == 0 {
                break
            }
            yield block
        }
    }
}
--------------------------------------------------------------------------------
/src/read_async/decode.rs:
--------------------------------------------------------------------------------
use std::collections::HashMap;

use futures::AsyncRead;
use futures::AsyncReadExt;

use crate::error::Error;
use crate::read::DecodeError;
use crate::read::{avro_decode, read_header};

pub async fn zigzag_i64<R: AsyncRead + Unpin>(reader: &mut R) -> Result<i64, DecodeError> {
    let z = decode_variable(reader).await?;
    Ok(if z & 0x1 == 0 {
        (z >> 1) as i64
    } else {
        !(z >> 1) as i64
    })
}

async fn decode_variable<R: AsyncRead + Unpin>(reader: &mut R) -> Result<u64, DecodeError> {
    avro_decode!(reader.await)
}

/// Reads the file marker asynchronously
pub(crate) async fn read_file_marker<R: AsyncRead + Unpin>(
    reader: &mut R,
) -> Result<[u8; 16], Error> {
    let mut marker = [0u8; 16];
    reader.read_exact(&mut marker).await?;
    Ok(marker)
}

async fn _read_binary<R: AsyncRead + Unpin>(reader: &mut R) -> Result<Vec<u8>, Error> {
    let len: usize = zigzag_i64(reader).await? as usize;
    let mut buf = vec![];
    buf.try_reserve(len).map_err(|_| Error::OutOfSpec)?;
    reader.take(len as u64).read_to_end(&mut buf).await?;
    Ok(buf)
}

pub(crate) async fn read_header<R: AsyncRead + Unpin>(
    reader: &mut R,
) -> Result<HashMap<String, Vec<u8>>, Error> {
    read_header!(reader.await)
}
--------------------------------------------------------------------------------
/src/read_async/mod.rs:
--------------------------------------------------------------------------------
//! Async Avro
use futures::AsyncRead;
use futures::AsyncReadExt;

use crate::error::Error;
use crate::file::FileMetadata;

use crate::read::read_metadata_macro;
use crate::schema::Schema;

mod block;
mod decode;
use crate::read::deserialize_header;
use decode::*;

/// Reads the avro metadata from `reader` into [`FileMetadata`].
pub async fn read_metadata<R: AsyncRead + Unpin>(
    reader: &mut R,
) -> Result<FileMetadata, Error> {
    read_metadata_macro!(reader.await)
}

async fn _read_binary<R: AsyncRead + Unpin>(reader: &mut R) -> Result<Vec<u8>, Error> {
    let len: usize = zigzag_i64(reader).await? as usize;
    let mut buf = vec![];
    buf.try_reserve(len).map_err(|_| Error::OutOfSpec)?;
    reader.take(len as u64).read_to_end(&mut buf).await?;
    Ok(buf)
}

pub use super::read::decompress::decompress_block;
pub use block::block_stream;
--------------------------------------------------------------------------------
/src/schema/de.rs:
--------------------------------------------------------------------------------
use std::{collections::HashMap, fmt};

use serde::{
    de::{MapAccess, SeqAccess, Visitor},
    Deserialize, Deserializer,
};
use serde_json::Value;

use super::*;

fn to_primitive(v: &str) -> Option<Schema> {
    use Schema::*;
    Some(match v {
        "null" => Null,
        "boolean" => Boolean,
        "bytes" => Bytes(None),
        "string" => String(None),
        "int" => Int(None),
        "long" => Long(None),
        "float" => Float,
        "double" => Double,
        _ => return None,
    })
}

fn get_type<E: serde::de::Error>(map: &mut HashMap<String, Value>) -> Result<String, E> {
    if let Some(v) = map.remove("type") {
        if let Value::String(v) = v {
            Ok(v)
        } else if let Value::Null = v {
            Ok("null".to_string())
        } else {
            Err(serde::de::Error::custom("type must be a string"))
        }
    } else {
        Err(serde::de::Error::missing_field("type"))
    }
}

fn as_string<E: serde::de::Error>(v: Value, helper: &str) -> Result<String, E> {
    if let Value::String(v) = v {
        Ok(v)
    } else {
        Err(serde::de::Error::custom(format!(
            "{} must be a string",
            helper
        )))
    }
}

fn remove_string<E: serde::de::Error>(
    data: &mut HashMap<String, Value>,
    key: &str,
) -> Result<Option<String>, E> {
    match data.remove(key) {
        Some(s) => as_string(s, key).map(Some),
        None => Ok(None),
    }
}

fn remove_usize<E: serde::de::Error>(
    data: &mut HashMap<String, Value>,
    key: &str,
) -> Result<Option<usize>, E> {
    data.remove(key)
        .map(|x| serde_json::from_value::<usize>(x).map_err(serde::de::Error::custom))
        .transpose()
}

fn remove_vec_string<E: serde::de::Error>(
    data: &mut HashMap<String, Value>,
    key: &str,
) -> Result<Vec<String>, E> {
    match data.remove(key) {
        Some(s) => {
            if let Value::Array(x) = s {
                x.into_iter().map(|x| as_string(x, key)).collect()
            } else {
                Err(serde::de::Error::custom(format!(
                    "{} must be an array of strings",
                    key
                )))
            }
        }
        None => Ok(vec![]),
    }
}

fn to_enum<E: serde::de::Error>(data: &mut HashMap<String, Value>) -> Result<Schema, E> {
    Ok(Schema::Enum(Enum {
        name: remove_string(data, "name")?
            .ok_or_else(|| serde::de::Error::custom("name is required in enum"))?,
        namespace: remove_string(data, "namespace")?,
        aliases: remove_vec_string(data, "aliases")?,
        doc: remove_string(data, "doc")?,
        symbols: remove_vec_string(data, "symbols")?,
        default: remove_string(data, "default")?,
    }))
}

fn to_map<E: serde::de::Error>(data: &mut HashMap<String, Value>) -> Result<Schema, E> {
    let item = data
        .remove("values")
        .ok_or_else(|| serde::de::Error::custom("values is required in a map"))?;
    let schema: Schema = serde_json::from_value(item).map_err(serde::de::Error::custom)?;
    Ok(Schema::Map(Box::new(schema)))
}

fn to_schema<E: serde::de::Error>(
    data: &mut HashMap<String, Value>,
    key: &str,
) -> Result<Option<Schema>, E> {
    let schema = data.remove(key);
    schema
        .map(|schema| serde_json::from_value(schema).map_err(serde::de::Error::custom))
        .transpose()
}

fn to_array<E: serde::de::Error>(data: &mut HashMap<String, Value>) -> Result<Schema, E> {
    let schema =
        to_schema(data, "items")?.ok_or_else(|| E::custom("items is required in an array"))?;
    Ok(Schema::Array(Box::new(schema)))
}

fn to_field<E: serde::de::Error>(data: Value) -> Result<Field, E> {
    serde_json::from_value(data).map_err(E::custom)
}

fn to_vec_fields<E: serde::de::Error>(
    data: &mut HashMap<String, Value>,
    key: &str,
) -> Result<Vec<Field>, E> {
    match data.remove(key) {
        Some(s) => {
            if let Value::Array(x) = s {
                x.into_iter().map(to_field).collect()
            } else {
                Err(E::custom(format!("{} must be an array of fields", key)))
            }
        }
        None => Ok(vec![]),
    }
}

fn to_record<E: serde::de::Error>(data: &mut HashMap<String, Value>) -> Result<Schema, E> {
    Ok(Schema::Record(Record {
        name: remove_string(data, "name")?
            .ok_or_else(|| serde::de::Error::custom("name is required in record"))?,
        namespace: remove_string(data, "namespace")?,
        aliases: remove_vec_string(data, "aliases")?,
        doc: remove_string(data, "doc")?,
        fields: to_vec_fields(data, "fields")?,
    }))
}

fn to_fixed<E: serde::de::Error>(data: &mut HashMap<String, Value>) -> Result<Schema, E> {
    let size = remove_usize(data, "size")?
        .ok_or_else(|| serde::de::Error::custom("size is required in fixed"))?;

    let logical = remove_string(data, "logicalType")?.unwrap_or_default();
    let logical = match logical.as_ref() {
        "decimal" => {
            let precision = remove_usize(data, "precision")?;
            let scale = remove_usize(data, "scale")?.unwrap_or_default();
            precision.map(|p| FixedLogical::Decimal(p, scale))
        }
        "duration" => Some(FixedLogical::Duration),
        _ => None,
    };

    Ok(Schema::Fixed(Fixed {
        name: remove_string(data, "name")?
            .ok_or_else(|| serde::de::Error::custom("name is required in fixed"))?,
        namespace: remove_string(data, "namespace")?,
        aliases: remove_vec_string(data, "aliases")?,
        doc: remove_string(data, "doc")?,
        size,
        logical,
    }))
}

fn to_order<E: serde::de::Error>(
    data: &mut HashMap<String, Value>,
    key: &str,
) -> Result<Option<Order>, E> {
    remove_string(data, key)?
        .map(|x| {
            Ok(match x.as_ref() {
                "ascending" => Order::Ascending,
                "descending" => Order::Descending,
                "ignore" => Order::Ignore,
                _ => {
                    return Err(serde::de::Error::custom(
                        "order can only be one of {ascending, descending, ignore}",
                    ))
                }
            })
        })
        .transpose()
}

struct SchemaVisitor {}

impl<'de> Visitor<'de> for SchemaVisitor {
    type Value = Schema;

    // Format a message stating what data this Visitor expects to receive.
    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("a null, string, array or map describing an Avro schema")
    }

    fn visit_some<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_any(SchemaVisitor {})
    }

    fn visit_none<E>(self) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        Ok(Schema::Null)
    }

    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        to_primitive(v)
            .ok_or_else(|| serde::de::Error::custom("string must be a valid primitive Schema"))
    }

    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
    where
        A: SeqAccess<'de>,
    {
        let mut vec = Vec::with_capacity(seq.size_hint().unwrap_or(0));
        while let Some(item) = seq.next_element::<Value>()? {
            let schema: Schema = serde_json::from_value(item).map_err(serde::de::Error::custom)?;
            vec.push(schema)
        }
        Ok(Schema::Union(vec))
    }

    fn visit_map<M>(self, mut access: M) -> Result<Self::Value, M::Error>
    where
        M: MapAccess<'de>,
    {
        let mut map = HashMap::<String, Value>::with_capacity(access.size_hint().unwrap_or(0));

        // While there are entries remaining in the input, add them
        // into our map.
        while let Some((key, value)) = access.next_entry()? {
            map.insert(key, value);
        }

        let (schema, type_) = get_type(&mut map).map(|x| (to_primitive(&x), x))?;

        if let Some(schema) = schema {
            Ok(match type_.as_ref() {
                "string" => {
                    let logical = remove_string(&mut map, "logicalType")?.unwrap_or_default();
                    match logical.as_ref() {
                        "uuid" => Schema::String(Some(StringLogical::Uuid)),
                        _ => schema,
                    }
                }
                "int" => {
                    let logical = remove_string(&mut map, "logicalType")?.unwrap_or_default();
                    match logical.as_ref() {
                        "date" => Schema::Int(Some(IntLogical::Date)),
                        "time-millis" => Schema::Int(Some(IntLogical::Time)),
                        _ => schema,
                    }
                }
                "long" => {
                    let logical = remove_string(&mut map, "logicalType")?.unwrap_or_default();
                    match logical.as_ref() {
                        "time-micros" => Schema::Long(Some(LongLogical::Time)),
                        "timestamp-millis" => Schema::Long(Some(LongLogical::TimestampMillis)),
                        "timestamp-micros" => Schema::Long(Some(LongLogical::TimestampMicros)),
                        "local-timestamp-millis" => {
                            Schema::Long(Some(LongLogical::LocalTimestampMillis))
                        }
                        "local-timestamp-micros" => {
                            Schema::Long(Some(LongLogical::LocalTimestampMicros))
                        }
                        _ => schema,
                    }
                }
                "bytes" => {
                    let logical = remove_string(&mut map, "logicalType")?.unwrap_or_default();
                    match logical.as_ref() {
                        "decimal" => {
                            let precision = remove_usize(&mut map, "precision")?;
                            let scale = remove_usize(&mut map, "scale")?.unwrap_or_default();
                            Schema::Bytes(precision.map(|p| BytesLogical::Decimal(p, scale)))
                        }
                        _ => schema,
                    }
                }
                _ => schema,
            })
        } else {
            match type_.as_ref() {
                "enum" => to_enum(&mut map),
                "map" => to_map(&mut map),
                "array" => to_array(&mut map),
                "record" => to_record(&mut map),
                "fixed" => to_fixed(&mut map),
                other => todo!("{}", other),
            }
        }
    }
}

impl<'de> Deserialize<'de> for Schema {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_option(SchemaVisitor {})
    }
}

struct FieldVisitor {}

impl<'de> Visitor<'de> for FieldVisitor {
    type Value = Field;

    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("a map describing an Avro field")
    }

    fn visit_map<M>(self, mut access: M) -> Result<Self::Value, M::Error>
    where
        M: MapAccess<'de>,
    {
        let mut map = HashMap::<String, Value>::with_capacity(access.size_hint().unwrap_or(0));

        // While there are entries remaining in the input, add them
        // into our map.
        while let Some((key, value)) = access.next_entry()? {
            map.insert(key, value);
        }

        Ok(Field {
            name: remove_string(&mut map, "name")?
                .ok_or_else(|| serde::de::Error::custom("name is required in a field"))?,
            doc: remove_string(&mut map, "doc")?,
            schema: to_schema(&mut map, "type")?
                .ok_or_else(|| serde::de::Error::custom("type is required in Field"))?,
            default: to_schema(&mut map, "default")?,
            order: to_order(&mut map, "order")?,
            aliases: remove_vec_string(&mut map, "aliases")?,
        })
    }
}

impl<'de> Deserialize<'de> for Field {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_map(FieldVisitor {})
    }
}
--------------------------------------------------------------------------------
/src/schema/mod.rs:
--------------------------------------------------------------------------------
//! Contains structs defining Avro's logical types
mod de;
mod se;

/// An Avro Schema. It describes all _physical_ and _logical_ types.
/// See [the spec](https://avro.apache.org/docs/current/spec.html) for details.
#[derive(Debug, Clone, PartialEq, Hash)]
pub enum Schema {
    /// A null type
    Null,
    /// Boolean (physically represented as a single byte)
    Boolean,
    /// 32 bit signed integer (physically represented as a zigzag encoded variable number of bytes)
    Int(Option<IntLogical>),
    /// 64 bit signed integer (physically represented as a zigzag encoded variable number of bytes)
    Long(Option<LongLogical>),
    /// 32 bit float (physically represented as 4 bytes in little endian)
    Float,
    /// 64 bit float (physically represented as 8 bytes in little endian)
    Double,
    /// variable length bytes (physically represented by a zigzag encoded length followed by that many bytes)
    Bytes(Option<BytesLogical>),
    /// variable length utf8 (physically represented by a zigzag encoded length followed by that many bytes)
    String(Option<StringLogical>),
    /// Record
    Record(Record),
    /// Enum with a known number of variants
    Enum(Enum),
    /// Array of a uniform type with N entries
    Array(Box<Schema>),
    /// A map String -> type
    Map(Box<Schema>),
    /// A union of a heterogeneous number of types
    Union(Vec<Schema>),
    /// Fixed-length bytes
    Fixed(Fixed),
}

/// Order of a [`Field`].
#[derive(Debug, Clone, Copy, PartialEq, Hash)]
pub enum Order {
    /// Ascending order
    Ascending,
    /// Descending order
    Descending,
    /// Order is to be ignored
    Ignore,
}

/// An Avro field.
/// See [the spec](https://avro.apache.org/docs/current/spec.html) for details.
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Field {
    /// Its name
    pub name: String,
    /// Its optional documentation
    pub doc: Option<String>,
    /// Its Schema
    pub schema: Schema,
    /// Its default value
    pub default: Option<Schema>,
    /// Its optional order
    pub order: Option<Order>,
    /// Its aliases
    pub aliases: Vec<String>,
}

impl Field {
    /// Returns a new [`Field`] without a doc, default, order or aliases
    pub fn new<I: Into<String>>(name: I, schema: Schema) -> Self {
        Self {
            name: name.into(),
            doc: None,
            schema,
            default: None,
            order: None,
            aliases: vec![],
        }
    }
}

/// Struct to hold data from a [`Schema::Record`].
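/// For example, the single-`f32`-field record used in this crate's `lib.md`
/// examples:
///
/// ```
/// use avro_schema::schema::{Field, Record, Schema};
///
/// let record = Record::new("", vec![Field::new("value", Schema::Float)]);
/// assert_eq!(record.fields.len(), 1);
/// ```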
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Record {
    /// Its name
    pub name: String,
    /// Its optional namespace
    pub namespace: Option<String>,
    /// Its optional documentation
    pub doc: Option<String>,
    /// Its aliases
    pub aliases: Vec<String>,
    /// Its children fields
    pub fields: Vec<Field>,
}

impl Record {
    /// Returns a new [`Record`] without a namespace, doc or aliases
    pub fn new<I: Into<String>>(name: I, fields: Vec<Field>) -> Self {
        Self {
            name: name.into(),
            namespace: None,
            doc: None,
            fields,
            aliases: vec![],
        }
    }
}

/// Struct to hold data from a [`Schema::Fixed`].
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Fixed {
    /// Its name
    pub name: String,
    /// Its optional namespace
    pub namespace: Option<String>,
    /// Its optional documentation
    pub doc: Option<String>,
    /// Its aliases
    pub aliases: Vec<String>,
    /// Its size
    pub size: usize,
    /// Its optional logical type
    pub logical: Option<FixedLogical>,
}

impl Fixed {
    /// Returns a new [`Fixed`] without a namespace, doc or aliases
    pub fn new<I: Into<String>>(name: I, size: usize) -> Self {
        Self {
            name: name.into(),
            namespace: None,
            doc: None,
            size,
            aliases: vec![],
            logical: None,
        }
    }
}

/// Struct to hold data from a [`Schema::Enum`].
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Enum {
    /// Its name
    pub name: String,
    /// Its optional namespace
    pub namespace: Option<String>,
    /// Its aliases
    pub aliases: Vec<String>,
    /// Its optional documentation
    pub doc: Option<String>,
    /// Its set of symbols
    pub symbols: Vec<String>,
    /// Its default symbol
    pub default: Option<String>,
}

impl Enum {
    /// Returns a minimal [`Enum`].
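    /// For example (a small sketch with made-up symbols):
    ///
    /// ```
    /// use avro_schema::schema::Enum;
    ///
    /// let enum_ = Enum::new("suit", vec!["spades".to_string(), "hearts".to_string()]);
    /// assert_eq!(enum_.symbols.len(), 2);
    /// ```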
    pub fn new<I: Into<String>>(name: I, symbols: Vec<String>) -> Self {
        Self {
            name: name.into(),
            namespace: None,
            doc: None,
            symbols,
            aliases: vec![],
            default: None,
        }
    }
}

impl From<Enum> for Schema {
    fn from(enum_: Enum) -> Self {
        Schema::Enum(enum_)
    }
}

impl From<Record> for Schema {
    fn from(record: Record) -> Self {
        Schema::Record(record)
    }
}

impl From<Fixed> for Schema {
    fn from(fixed: Fixed) -> Self {
        Schema::Fixed(fixed)
    }
}

/// Enum of all logical types of [`Schema::Int`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum IntLogical {
    /// A date
    Date,
    /// A time
    Time,
}

/// Enum of all logical types of [`Schema::Long`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LongLogical {
    /// A time
    Time,
    /// A timestamp
    TimestampMillis,
    /// A timestamp
    TimestampMicros,
    /// A timestamp without timezone
    LocalTimestampMillis,
    /// A timestamp without timezone
    LocalTimestampMicros,
}

/// Enum of all logical types of [`Schema::String`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum StringLogical {
    /// A UUID
    Uuid,
}

/// Enum of all logical types of [`Schema::Fixed`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FixedLogical {
    /// A decimal
    Decimal(usize, usize),
    /// A duration
    Duration,
}

/// Enum of all logical types of [`Schema::Bytes`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BytesLogical {
    /// A decimal
    Decimal(usize, usize),
}
--------------------------------------------------------------------------------
/src/schema/se.rs:
--------------------------------------------------------------------------------
use serde::ser::{SerializeMap, SerializeSeq};
use serde::{Serialize, Serializer};

use super::*;

impl Serialize for Schema {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        match self {
            Schema::Null => serializer.serialize_str("null"),
            Schema::Boolean => serializer.serialize_str("boolean"),
            Schema::Int(logical) => match logical {
                None => serializer.serialize_str("int"),
                Some(logical) => {
                    let mut map = serializer.serialize_map(Some(2))?;
                    map.serialize_entry("type", "int")?;
                    let name = match logical {
                        IntLogical::Date => "date",
                        IntLogical::Time => "time-millis",
                    };
                    map.serialize_entry("logicalType", name)?;
                    map.end()
                }
            },
            Schema::Long(logical) => match logical {
                None => serializer.serialize_str("long"),
                Some(logical) => {
                    let mut map = serializer.serialize_map(Some(2))?;
                    map.serialize_entry("type", "long")?;
                    let name = match logical {
                        LongLogical::Time => "time-micros",
                        LongLogical::TimestampMillis => "timestamp-millis",
                        LongLogical::TimestampMicros => "timestamp-micros",
                        LongLogical::LocalTimestampMillis => "local-timestamp-millis",
                        LongLogical::LocalTimestampMicros => "local-timestamp-micros",
                    };
                    map.serialize_entry("logicalType", name)?;
                    map.end()
                }
            },
            Schema::Float => serializer.serialize_str("float"),
            Schema::Double => serializer.serialize_str("double"),
            Schema::Bytes(logical) => match logical {
--------------------------------------------------------------------------------
/src/schema/se.rs:
--------------------------------------------------------------------------------
1 | use serde::ser::{SerializeMap, SerializeSeq};
2 | use serde::{Serialize, Serializer};
3 | 
4 | use super::*;
5 | 
6 | impl Serialize for Schema {
7 |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
8 |     where
9 |         S: Serializer,
10 |     {
11 |         match self {
12 |             Schema::Null => serializer.serialize_str("null"),
13 |             Schema::Boolean => serializer.serialize_str("boolean"),
14 |             Schema::Int(logical) => match logical {
15 |                 None => serializer.serialize_str("int"),
16 |                 Some(logical) => {
17 |                     let mut map = serializer.serialize_map(Some(2))?;
18 |                     map.serialize_entry("type", "int")?;
19 |                     let name = match logical {
20 |                         IntLogical::Date => "date",
21 |                         IntLogical::Time => "time-millis",
22 |                     };
23 |                     map.serialize_entry("logicalType", name)?;
24 |                     map.end()
25 |                 }
26 |             },
27 |             Schema::Long(logical) => match logical {
28 |                 None => serializer.serialize_str("long"),
29 |                 Some(logical) => {
30 |                     let mut map = serializer.serialize_map(Some(2))?;
31 |                     map.serialize_entry("type", "long")?;
32 |                     let name = match logical {
33 |                         LongLogical::Time => "time-micros",
34 |                         LongLogical::TimestampMillis => "timestamp-millis",
35 |                         LongLogical::TimestampMicros => "timestamp-micros",
36 |                         LongLogical::LocalTimestampMillis => "local-timestamp-millis",
37 |                         LongLogical::LocalTimestampMicros => "local-timestamp-micros",
38 |                     };
39 |                     map.serialize_entry("logicalType", name)?;
40 |                     map.end()
41 |                 }
42 |             },
43 |             Schema::Float => serializer.serialize_str("float"),
44 |             Schema::Double => serializer.serialize_str("double"),
45 |             Schema::Bytes(logical) => match logical {
46 |                 None => serializer.serialize_str("bytes"),
47 |                 Some(logical) => match logical {
48 |                     BytesLogical::Decimal(precision, scale) => {
49 |                         let mut map = serializer.serialize_map(Some(4))?;
50 |                         map.serialize_entry("type", "bytes")?;
51 |                         map.serialize_entry("logicalType", "decimal")?;
52 |                         map.serialize_entry("precision", precision)?;
53 |                         if *scale > 0 {
54 |                             map.serialize_entry("scale", scale)?;
55 |                         }
56 |                         map.end()
57 |                     }
58 |                 },
59 |             },
60 |             Schema::String(logical) => match logical {
61 |                 None => serializer.serialize_str("string"),
62 |                 Some(logical) => match logical {
63 |                     StringLogical::Uuid => {
64 |                         let mut map = serializer.serialize_map(Some(1))?;
65 |                         map.serialize_entry("type", "string")?;
66 |                         map.serialize_entry("logicalType", "uuid")?;
67 |                         map.end()
68 |                     }
69 |                 },
70 |             },
71 |             Schema::Record(record) => {
72 |                 let Record {
73 |                     name,
74 |                     namespace,
75 |                     doc,
76 |                     aliases,
77 |                     fields,
78 |                 } = record;
79 |                 let mut map = serializer.serialize_map(Some(3))?;
80 |                 map.serialize_entry("type", "record")?;
81 |                 map.serialize_entry("name", name)?;
82 |                 if let Some(namespace) = namespace {
83 |                     map.serialize_entry("namespace", namespace)?;
84 |                 }
85 |                 if !aliases.is_empty() {
86 |                     map.serialize_entry("aliases", aliases)?;
87 |                 }
88 |                 if let Some(doc) = doc {
89 |                     map.serialize_entry("doc", doc)?;
90 |                 }
91 |                 map.serialize_entry("fields", fields)?;
92 |                 map.end()
93 |             }
94 |             Schema::Enum(enum_) => {
95 |                 let Enum {
96 |                     name,
97 |                     namespace,
98 |                     aliases,
99 |                     doc,
100 |                     symbols,
101 |                     default,
102 |                 } = enum_;
103 |                 let mut map = serializer.serialize_map(Some(3))?;
104 |                 map.serialize_entry("type", "enum")?;
105 |                 map.serialize_entry("name", name)?;
106 |                 if let Some(namespace) = namespace {
107 |                     map.serialize_entry("namespace", namespace)?;
108 |                 }
109 |                 if !aliases.is_empty() {
110 |                     map.serialize_entry("aliases", aliases)?;
111 |                 }
112 |                 if let Some(doc) = doc {
113 |                     map.serialize_entry("doc", doc)?;
114 |                 }
115 |                 if let Some(default) = default {
116 |                     map.serialize_entry("default", default)?;
117 |                 }
118 |                 map.serialize_entry("symbols", symbols)?;
119 |                 map.end()
120 |             }
121 |             Schema::Array(schema) => {
122 |                 let mut map = serializer.serialize_map(Some(2))?;
123 |                 map.serialize_entry("type", "array")?;
124 |                 map.serialize_entry("items", schema.as_ref())?;
125 |                 map.end()
126 |             }
127 |             Schema::Map(schema) => {
128 |                 let mut map = serializer.serialize_map(Some(2))?;
129 |                 map.serialize_entry("type", "map")?;
130 |                 map.serialize_entry("values", schema.as_ref())?;
131 |                 map.end()
132 |             }
133 |             Schema::Union(schemas) => {
134 |                 let mut seq = serializer.serialize_seq(Some(schemas.len()))?;
135 |                 for schema in schemas {
136 |                     seq.serialize_element(schema)?;
137 |                 }
138 |                 seq.end()
139 |             }
140 |             Schema::Fixed(fixed) => {
141 |                 let Fixed {
142 |                     name,
143 |                     namespace,
144 |                     doc,
145 |                     aliases,
146 |                     size,
147 |                     logical,
148 |                 } = fixed;
149 | 
150 |                 let mut map = serializer.serialize_map(None)?;
151 |                 map.serialize_entry("type", "fixed")?;
152 |                 map.serialize_entry("name", name)?;
153 |                 if let Some(namespace) = namespace {
154 |                     map.serialize_entry("namespace", namespace)?;
155 |                 }
156 |                 if !aliases.is_empty() {
157 |                     map.serialize_entry("aliases", aliases)?;
158 |                 }
159 |                 if let Some(doc) = doc {
160 |                     map.serialize_entry("doc", doc)?;
161 |                 }
162 |                 map.serialize_entry("size", size)?;
163 | 
164 |                 if let Some(logical) = logical {
165 |                     match logical {
166 |                         FixedLogical::Decimal(precision, scale) => {
167 |                             map.serialize_entry("logicalType", "decimal")?;
168 |                             map.serialize_entry("precision", precision)?;
169 |                             if *scale > 0 {
170 |                                 map.serialize_entry("scale", scale)?;
171 |                             }
172 |                         }
173 |                         FixedLogical::Duration => map.serialize_entry("logicalType", "duration")?,
174 |                     }
175 |                 }
176 | 
177 |                 map.end()
178 |             }
179 |         }
180 |     }
181 | }
182 | 
183 | impl Serialize for Field {
184 |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
185 |     where
186 |         S: Serializer,
187 |     {
188 |         let Field {
189 |             name,
190 |             doc,
191 |             schema,
192 |             default,
193 |             order,
194 |             aliases,
195 |         } = self;
196 | 
197 |         let mut map = serializer.serialize_map(None)?;
198 |         map.serialize_entry("name", name)?;
199 |         if !aliases.is_empty() {
200 |             map.serialize_entry("aliases", aliases)?;
201 |         }
202 |         if let Some(doc) = doc {
203 |             map.serialize_entry("doc", doc)?;
204 |         }
205 |         if let Some(default) = default {
206 |             map.serialize_entry("default", default)?;
207 |         }
208 |         map.serialize_entry("type", schema)?;
209 |         if let Some(order) = order {
210 |             let order = match order {
211 |                 Order::Ascending => "ascending",
212 |                 Order::Descending => "descending",
213 |                 Order::Ignore => "ignore",
214 |             };
215 |             map.serialize_entry("order", order)?;
216 |         }
217 | 
218 |         map.end()
219 |     }
220 | }
221 | 
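Since `Schema` and `Field` both implement `Serialize`, a schema can be rendered to Avro's JSON form directly with `serde_json` (already a dependency of this crate). A short sketch; the expected strings follow from the impl above, which writes map entries in declaration order and omits `scale` when it is zero:

use avro_schema::schema::{BytesLogical, Schema};

fn bytes_decimal_to_json() -> serde_json::Result<()> {
    let schema = Schema::Bytes(Some(BytesLogical::Decimal(10, 2)));
    assert_eq!(
        serde_json::to_string(&schema)?,
        r#"{"type":"bytes","logicalType":"decimal","precision":10,"scale":2}"#
    );

    // a zero scale is skipped entirely
    let schema = Schema::Bytes(Some(BytesLogical::Decimal(10, 0)));
    assert_eq!(
        serde_json::to_string(&schema)?,
        r#"{"type":"bytes","logicalType":"decimal","precision":10}"#
    );
    Ok(())
}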
--------------------------------------------------------------------------------
/src/write/block.rs:
--------------------------------------------------------------------------------
1 | use std::io::Write;
2 | 
3 | use crate::error::Error;
4 | 
5 | use crate::file::CompressedBlock;
6 | 
7 | use super::{encode::zigzag_encode, file::SYNC_NUMBER};
8 | 
9 | /// Writes a [`CompressedBlock`] to `writer`
10 | pub fn write_block<W: Write>(writer: &mut W, block: &CompressedBlock) -> Result<(), Error> {
11 |     // write the number of rows, then the block's size in bytes
12 |     zigzag_encode(block.number_of_rows as i64, writer)?;
13 |     zigzag_encode(block.data.len() as i64, writer)?;
14 | 
15 |     writer.write_all(&block.data)?;
16 | 
17 |     writer.write_all(&SYNC_NUMBER)?;
18 | 
19 |     Ok(())
20 | }
21 | 
--------------------------------------------------------------------------------
/src/write/compression.rs:
--------------------------------------------------------------------------------
1 | //! APIs to compress a [`Block`] into a [`CompressedBlock`].
2 | 
3 | use crate::error::Error;
4 | 
5 | use crate::file::{Block, CompressedBlock, Compression};
6 | 
7 | #[cfg(feature = "compression")]
8 | const CRC_TABLE: crc::Crc<u32> = crc::Crc::<u32>::new(&crc::CRC_32_ISO_HDLC);
9 | 
10 | /// Compresses a [`Block`] to a [`CompressedBlock`].
11 | pub fn compress(
12 |     block: &mut Block,
13 |     compressed: &mut CompressedBlock,
14 |     compression: Option<Compression>,
15 | ) -> Result<bool, Error> {
16 |     compressed.number_of_rows = block.number_of_rows;
17 |     let block = &mut block.data;
18 |     let compressed = &mut compressed.data;
19 | 
20 |     match compression {
21 |         None => {
22 |             std::mem::swap(block, compressed);
23 |             Ok(true)
24 |         }
25 |         #[cfg(feature = "compression")]
26 |         Some(Compression::Deflate) => {
27 |             use std::io::Write;
28 |             compressed.clear();
29 |             let mut encoder = libflate::deflate::Encoder::new(compressed);
30 |             encoder.write_all(block)?;
31 |             encoder.finish().into_result()?;
32 |             Ok(false)
33 |         }
34 |         #[cfg(feature = "compression")]
35 |         Some(Compression::Snappy) => {
36 |             use snap::raw::{max_compress_len, Encoder};
37 | 
38 |             compressed.clear();
39 | 
40 |             let required_len = max_compress_len(block.len());
41 |             compressed.resize(required_len, 0);
42 |             let compressed_bytes = Encoder::new()
43 |                 .compress(block, compressed)
44 |                 .map_err(|_| Error::OutOfSpec)?;
45 |             compressed.truncate(compressed_bytes);
46 | 
47 |             compressed.extend(CRC_TABLE.checksum(block).to_be_bytes());
48 |             Ok(false)
49 |         }
50 |         #[cfg(not(feature = "compression"))]
51 |         Some(Compression::Deflate) => Err(Error::RequiresCompression),
52 |         #[cfg(not(feature = "compression"))]
53 |         Some(Compression::Snappy) => Err(Error::RequiresCompression),
54 |     }
55 | }
56 | 
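A sketch of how `compress` is driven (mirroring `write_avro` in this repository's tests): the caller owns both buffers, and the returned `bool` reports whether the `None` fast path was taken, in which case the two buffers were swapped instead of copied:

use avro_schema::file::{Block, CompressedBlock};

fn compress_uncompressed() -> Result<(), avro_schema::error::Error> {
    // two rows of f32, already encoded according to some schema
    let mut block = Block::new(2, vec![0u8; 8]);
    let mut compressed = CompressedBlock::default();

    let was_swapped = avro_schema::write::compress(&mut block, &mut compressed, None)?;
    assert!(was_swapped);
    Ok(())
}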
--------------------------------------------------------------------------------
/src/write/encode.rs:
--------------------------------------------------------------------------------
1 | //! Functions used to encode Avro physical types
2 | use crate::error::Error;
3 | 
4 | /// Zigzag encoding of a signed integer.
5 | #[inline]
6 | pub fn zigzag_encode<W: std::io::Write>(n: i64, writer: &mut W) -> Result<(), Error> {
7 |     _zigzag_encode(((n << 1) ^ (n >> 63)) as u64, writer)
8 | }
9 | 
10 | #[inline]
11 | fn _zigzag_encode<W: std::io::Write>(mut z: u64, writer: &mut W) -> Result<(), Error> {
12 |     loop {
13 |         if z <= 0x7F {
14 |             writer.write_all(&[(z & 0x7F) as u8])?;
15 |             break;
16 |         } else {
17 |             writer.write_all(&[(0x80 | (z & 0x7F)) as u8])?;
18 |             z >>= 7;
19 |         }
20 |     }
21 |     Ok(())
22 | }
23 | 
24 | pub(crate) fn write_binary<W: std::io::Write>(bytes: &[u8], writer: &mut W) -> Result<(), Error> {
25 |     zigzag_encode(bytes.len() as i64, writer)?;
26 |     writer.write_all(bytes)?;
27 |     Ok(())
28 | }
29 | 
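A worked example of the encoding above: zigzag maps 0, -1, 1, -2, 2, ... to 0, 1, 2, 3, 4, ..., and the result is written as a little-endian base-128 varint, seven bits per byte with the high bit as a continuation flag. For instance, 300 zigzags to 600 = 0b100_1011000, which splits into the two bytes 0xd8 and 0x04:

use avro_schema::write::encode::zigzag_encode;

fn zigzag_examples() -> Result<(), avro_schema::error::Error> {
    let mut buf = vec![];
    zigzag_encode(-1, &mut buf)?; // -1 zigzags to 1: a single byte
    assert_eq!(buf, [0x01]);

    buf.clear();
    zigzag_encode(300, &mut buf)?; // 300 zigzags to 600: two bytes
    assert_eq!(buf, [0xd8, 0x04]);
    Ok(())
}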
--------------------------------------------------------------------------------
/src/write/file.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashMap;
2 | 
3 | use crate::error::Error;
4 | use crate::file::Compression;
5 | use crate::schema::{Record, Schema};
6 | 
7 | use super::encode;
8 | 
9 | pub(crate) const SYNC_NUMBER: [u8; 16] = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4];
10 | // * Four bytes, ASCII 'O', 'b', 'j', followed by 1.
11 | pub(crate) const AVRO_MAGIC: [u8; 4] = [b'O', b'b', b'j', 1u8];
12 | 
13 | /// Serializes a [`Schema`] and an optional [`Compression`] into an Avro header.
14 | fn serialize_header(
15 |     schema: &Schema,
16 |     compression: Option<Compression>,
17 | ) -> Result<HashMap<String, Vec<u8>>, Error> {
18 |     let schema = serde_json::to_string(schema).map_err(|_| Error::OutOfSpec)?;
19 | 
20 |     let mut header = HashMap::<String, Vec<u8>>::default();
21 | 
22 |     header.insert("avro.schema".to_string(), schema.into_bytes());
23 |     if let Some(compression) = compression {
24 |         let value = match compression {
25 |             Compression::Snappy => b"snappy".to_vec(),
26 |             Compression::Deflate => b"deflate".to_vec(),
27 |         };
28 |         header.insert("avro.codec".to_string(), value);
29 |     };
30 | 
31 |     Ok(header)
32 | }
33 | 
34 | /// Writes Avro's metadata to `writer`.
35 | pub fn write_metadata<W: std::io::Write>(
36 |     writer: &mut W,
37 |     record: Record,
38 |     compression: Option<Compression>,
39 | ) -> Result<(), Error> {
40 |     writer.write_all(&AVRO_MAGIC)?;
41 | 
42 |     // * file metadata, including the schema.
43 |     let schema = Schema::Record(record);
44 | 
45 |     write_schema(writer, &schema, compression)?;
46 | 
47 |     // The 16-byte, randomly-generated sync marker for this file.
48 |     writer.write_all(&SYNC_NUMBER)?;
49 | 
50 |     Ok(())
51 | }
52 | 
53 | pub(crate) fn write_schema<W: std::io::Write>(
54 |     writer: &mut W,
55 |     schema: &Schema,
56 |     compression: Option<Compression>,
57 | ) -> Result<(), Error> {
58 |     let header = serialize_header(schema, compression)?;
59 | 
60 |     encode::zigzag_encode(header.len() as i64, writer)?;
61 |     for (name, item) in header {
62 |         encode::write_binary(name.as_bytes(), writer)?;
63 |         encode::write_binary(&item, writer)?;
64 |     }
65 |     writer.write_all(&[0])?;
66 |     Ok(())
67 | }
68 | 
--------------------------------------------------------------------------------
/src/write/mod.rs:
--------------------------------------------------------------------------------
1 | //! Functions to compress and write files' metadata and blocks
2 | mod compression;
3 | pub use compression::compress;
4 | mod block;
5 | pub mod encode;
6 | pub(crate) mod file;
7 | pub use block::write_block;
8 | pub use file::write_metadata;
9 | 
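A sketch of the writer entry point above: `write_metadata` emits the magic, the header map (schema plus optional codec), and the sync marker, after which blocks can be appended with `write_block`. Everything used below is public API shown elsewhere in this repository; the record itself is a placeholder:

use avro_schema::schema::{Field, Record, Schema};

fn write_header_only() -> Result<(), avro_schema::error::Error> {
    let record = Record::new("r", vec![Field::new("x", Schema::Long(None))]);

    let mut file = vec![];
    avro_schema::write::write_metadata(&mut file, record, None)?;

    // the file now starts with Avro's magic: "Obj" followed by version 1
    assert_eq!(&file[..4], b"Obj\x01");
    Ok(())
}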
--------------------------------------------------------------------------------
/src/write_async.rs:
--------------------------------------------------------------------------------
1 | //! Functions to asynchronously write files' metadata and blocks
2 | use futures::{AsyncWrite, AsyncWriteExt};
3 | 
4 | use crate::{
5 |     error::Error,
6 |     file::{CompressedBlock, Compression},
7 |     schema::{Record, Schema},
8 |     write::encode::zigzag_encode,
9 |     write::file::{write_schema, AVRO_MAGIC, SYNC_NUMBER},
10 | };
11 | 
12 | /// Writes Avro's metadata to `writer`.
13 | pub async fn write_metadata<W>(
14 |     writer: &mut W,
15 |     record: Record,
16 |     compression: Option<Compression>,
17 | ) -> Result<(), Error>
18 | where
19 |     W: AsyncWrite + Unpin,
20 | {
21 |     writer.write_all(&AVRO_MAGIC).await?;
22 | 
23 |     // * file metadata, including the schema.
24 |     let schema = Schema::Record(record);
25 | 
26 |     let mut scratch = vec![];
27 |     write_schema(&mut scratch, &schema, compression)?;
28 | 
29 |     writer.write_all(&scratch).await?;
30 | 
31 |     // The 16-byte, randomly-generated sync marker for this file.
32 |     writer.write_all(&SYNC_NUMBER).await?;
33 | 
34 |     Ok(())
35 | }
36 | 
37 | /// Writes a [`CompressedBlock`] to `writer`
38 | pub async fn write_block<W>(writer: &mut W, block: &CompressedBlock) -> Result<(), Error>
39 | where
40 |     W: AsyncWrite + Unpin,
41 | {
42 |     // write the number of rows, then the block's size in bytes
43 |     let mut scratch = Vec::with_capacity(10);
44 |     zigzag_encode(block.number_of_rows as i64, &mut scratch)?;
45 |     writer.write_all(&scratch).await?;
46 |     scratch.clear();
47 |     zigzag_encode(block.data.len() as i64, &mut scratch)?;
48 |     writer.write_all(&scratch).await?;
49 | 
50 |     writer.write_all(&block.data).await?;
51 | 
52 |     writer.write_all(&SYNC_NUMBER).await?;
53 | 
54 |     Ok(())
55 | }
56 | 
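The async variants mirror the sync ones over any `AsyncWrite + Unpin` writer. A sketch using `futures::io::Cursor` (which implements `AsyncWrite` for `Cursor<Vec<u8>>`) and the `futures` executor, both part of the `async` feature's `futures` dependency; the record is again a placeholder:

use avro_schema::schema::{Field, Record, Schema};

fn write_metadata_async() -> Result<(), avro_schema::error::Error> {
    let record = Record::new("r", vec![Field::new("x", Schema::Float)]);

    let mut writer = futures::io::Cursor::new(Vec::<u8>::new());
    futures::executor::block_on(avro_schema::write_async::write_metadata(
        &mut writer,
        record,
        None,
    ))?;

    // same layout as the sync writer: magic first
    assert_eq!(&writer.into_inner()[..4], b"Obj\x01");
    Ok(())
}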
"symbols": ["A", "B"]} 56 | }"#, 57 | Map(Box::new(Enum(avro_schema::schema::Enum::new( 58 | "Test", 59 | vec!["A".to_string(), "B".to_string()], 60 | )))), 61 | ), 62 | ( 63 | r#"{"type": "array", "items": "long"}"#, 64 | Array(Box::new(Long(None))), 65 | ), 66 | ( 67 | r#"{ 68 | "type": "array", 69 | "items": {"type": "enum", "name": "Test", "symbols": ["A", "B"]} 70 | }"#, 71 | Array(Box::new( 72 | avro_schema::schema::Enum::new("Test", vec!["A".to_string(), "B".to_string()]) 73 | .into(), 74 | )), 75 | ), 76 | ( 77 | r#"{ 78 | "type":"record", 79 | "name":"HandshakeResponse", 80 | "namespace":"org.apache.avro.ipc", 81 | "fields":[ 82 | { 83 | "name":"match", 84 | "type":{ 85 | "type":"enum", 86 | "name":"HandshakeMatch", 87 | "symbols":["BOTH", "CLIENT", "NONE"] 88 | } 89 | }, 90 | {"name":"serverProtocol", "type":["null", "string"]}, 91 | { 92 | "name":"serverHash", 93 | "type":["null", {"name":"MD5", "size":16, "type":"fixed"}] 94 | }, 95 | { 96 | "name": "meta", 97 | "type": ["null", {"type": "map", "values":"bytes"}] 98 | }, 99 | { 100 | "name": "duration", 101 | "type": { 102 | "logicalType": "duration", 103 | "name": "duration", 104 | "type": "fixed", 105 | "size": 12 106 | } 107 | } 108 | ] 109 | }"#, 110 | Record(avro_schema::schema::Record { 111 | name: "HandshakeResponse".to_string(), 112 | namespace: Some("org.apache.avro.ipc".to_string()), 113 | doc: None, 114 | aliases: vec![], 115 | fields: vec![ 116 | Field::new( 117 | "match", 118 | avro_schema::schema::Enum::new( 119 | "HandshakeMatch", 120 | vec!["BOTH".to_string(), "CLIENT".to_string(), "NONE".to_string()], 121 | ) 122 | .into(), 123 | ), 124 | Field::new("serverProtocol", Union(vec![Null, String(None)])), 125 | Field::new( 126 | "serverHash", 127 | Union(vec![ 128 | Null, 129 | avro_schema::schema::Fixed::new("MD5", 16).into(), 130 | ]), 131 | ), 132 | Field::new("meta", Union(vec![Null, Map(Box::new(Bytes(None)))])), 133 | Field::new( 134 | "duration", 135 | avro_schema::schema::Fixed { 136 | name: "duration".to_string(), 137 | size: 12, 138 | namespace: None, 139 | doc: None, 140 | aliases: vec![], 141 | logical: Some(avro_schema::schema::FixedLogical::Duration), 142 | } 143 | .into(), 144 | ), 145 | ], 146 | }), 147 | ), 148 | ] 149 | } 150 | 151 | #[test] 152 | fn test_deserialize() -> Result<()> { 153 | for (data, expected) in cases() { 154 | let v: avro_schema::schema::Schema = serde_json::from_str(data)?; 155 | assert_eq!(v, expected); 156 | } 157 | Ok(()) 158 | } 159 | --------------------------------------------------------------------------------