├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── examples └── demo.rs ├── src ├── checkedfile.rs ├── compression.rs ├── error.rs ├── lib.rs ├── swapvec.rs └── swapveciter.rs └── tests ├── compression.rs ├── custom_compression.rs ├── reset_iterator.rs └── write_read_back.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | name: Rust 4 | 5 | on: 6 | push: 7 | branches: [ "main" ] 8 | pull_request: 9 | branches: [ "main" ] 10 | 11 | env: 12 | CARGO_TERM_COLOR: always 13 | 14 | jobs: 15 | build: 16 | 17 | runs-on: self-hosted 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | - uses: actions-rs/toolchain@v1 22 | with: 23 | toolchain: stable 24 | - name: Build 25 | run: cargo build --verbose 26 | - name: Run tests 27 | run: cargo test --verbose 28 | - name: Run lint 29 | run: cargo clippy 30 | 31 | windows-check: 32 | runs-on: self-hosted 33 | steps: 34 | - uses: actions-rs/toolchain@v1 35 | with: 36 | target: x86_64-pc-windows-msvc 37 | toolchain: stable 38 | override: true 39 | - uses: actions/checkout@v2 40 | - name: Check Windows 41 | run: cargo check --target x86_64-pc-windows-msvc 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "bincode" 13 | version = "1.3.3" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" 16 | dependencies = [ 17 | "serde", 18 | ] 19 | 20 | [[package]] 21 | name = "bitflags" 22 | version = "1.3.2" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 25 | 26 | [[package]] 27 | name = "bitflags" 28 | version = "2.4.1" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" 31 | 32 | [[package]] 33 | name = "cfg-if" 34 | version = "1.0.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 37 | 38 | [[package]] 39 | name = "errno" 40 | version = "0.3.6" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "7c18ee0ed65a5f1f81cac6b1d213b69c35fa47d4252ad41f1486dbd8226fe36e" 43 | dependencies = [ 44 | "libc", 45 | "windows-sys", 46 | ] 47 | 48 | [[package]] 49 | name = "fastrand" 50 | version = "2.0.1" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" 53 | 54 | [[package]] 55 | name = "libc" 56 | version = "0.2.150" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" 59 | 60 | [[package]] 61 | name = "linux-raw-sys" 62 | version = "0.4.11" 63 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "969488b55f8ac402214f3f5fd243ebb7206cf82de60d3172994707a4bcc2b829" 65 | 66 | [[package]] 67 | name = "lz4_flex" 68 | version = "0.10.0" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83" 71 | dependencies = [ 72 | "twox-hash", 73 | ] 74 | 75 | [[package]] 76 | name = "miniz_oxide" 77 | version = "0.7.1" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" 80 | dependencies = [ 81 | "adler", 82 | ] 83 | 84 | [[package]] 85 | name = "proc-macro2" 86 | version = "1.0.69" 87 | source = "registry+https://github.com/rust-lang/crates.io-index" 88 | checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" 89 | dependencies = [ 90 | "unicode-ident", 91 | ] 92 | 93 | [[package]] 94 | name = "quote" 95 | version = "1.0.33" 96 | source = "registry+https://github.com/rust-lang/crates.io-index" 97 | checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" 98 | dependencies = [ 99 | "proc-macro2", 100 | ] 101 | 102 | [[package]] 103 | name = "redox_syscall" 104 | version = "0.4.1" 105 | source = "registry+https://github.com/rust-lang/crates.io-index" 106 | checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" 107 | dependencies = [ 108 | "bitflags 1.3.2", 109 | ] 110 | 111 | [[package]] 112 | name = "rustix" 113 | version = "0.38.22" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "80109a168d9bc0c7f483083244543a6eb0dba02295d33ca268145e6190d6df0c" 116 | dependencies = [ 117 | "bitflags 2.4.1", 118 | "errno", 119 | "libc", 120 | "linux-raw-sys", 121 | "windows-sys", 122 | ] 123 | 124 | [[package]] 125 | name = "serde" 126 | version = "1.0.192" 127 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "bca2a08484b285dcb282d0f67b26cadc0df8b19f8c12502c13d966bf9482f001" 129 | dependencies = [ 130 | "serde_derive", 131 | ] 132 | 133 | [[package]] 134 | name = "serde_derive" 135 | version = "1.0.192" 136 | source = "registry+https://github.com/rust-lang/crates.io-index" 137 | checksum = "d6c7207fbec9faa48073f3e3074cbe553af6ea512d7c21ba46e434e70ea9fbc1" 138 | dependencies = [ 139 | "proc-macro2", 140 | "quote", 141 | "syn", 142 | ] 143 | 144 | [[package]] 145 | name = "static_assertions" 146 | version = "1.1.0" 147 | source = "registry+https://github.com/rust-lang/crates.io-index" 148 | checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" 149 | 150 | [[package]] 151 | name = "swapvec" 152 | version = "0.4.2" 153 | dependencies = [ 154 | "bincode", 155 | "lz4_flex", 156 | "miniz_oxide", 157 | "serde", 158 | "tempfile", 159 | ] 160 | 161 | [[package]] 162 | name = "syn" 163 | version = "2.0.39" 164 | source = "registry+https://github.com/rust-lang/crates.io-index" 165 | checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" 166 | dependencies = [ 167 | "proc-macro2", 168 | "quote", 169 | "unicode-ident", 170 | ] 171 | 172 | [[package]] 173 | name = "tempfile" 174 | version = "3.8.1" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" 177 | dependencies = [ 178 | "cfg-if", 179 | "fastrand", 180 | "redox_syscall", 181 | "rustix", 182 | "windows-sys", 183 | ] 184 | 185 | [[package]] 186 | name = "twox-hash" 187 | version = "1.6.3" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" 190 | dependencies = [ 191 | "cfg-if", 192 | "static_assertions", 193 | ] 194 | 195 | [[package]] 196 | name = "unicode-ident" 197 | version = "1.0.12" 198 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 199 | checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" 200 | 201 | [[package]] 202 | name = "windows-sys" 203 | version = "0.48.0" 204 | source = "registry+https://github.com/rust-lang/crates.io-index" 205 | checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" 206 | dependencies = [ 207 | "windows-targets", 208 | ] 209 | 210 | [[package]] 211 | name = "windows-targets" 212 | version = "0.48.5" 213 | source = "registry+https://github.com/rust-lang/crates.io-index" 214 | checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" 215 | dependencies = [ 216 | "windows_aarch64_gnullvm", 217 | "windows_aarch64_msvc", 218 | "windows_i686_gnu", 219 | "windows_i686_msvc", 220 | "windows_x86_64_gnu", 221 | "windows_x86_64_gnullvm", 222 | "windows_x86_64_msvc", 223 | ] 224 | 225 | [[package]] 226 | name = "windows_aarch64_gnullvm" 227 | version = "0.48.5" 228 | source = "registry+https://github.com/rust-lang/crates.io-index" 229 | checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" 230 | 231 | [[package]] 232 | name = "windows_aarch64_msvc" 233 | version = "0.48.5" 234 | source = "registry+https://github.com/rust-lang/crates.io-index" 235 | checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" 236 | 237 | [[package]] 238 | name = "windows_i686_gnu" 239 | version = "0.48.5" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" 242 | 243 | [[package]] 244 | name = "windows_i686_msvc" 245 | version = "0.48.5" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" 248 | 249 | [[package]] 250 | name = "windows_x86_64_gnu" 251 | version = "0.48.5" 252 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" 254 | 255 | [[package]] 256 | name = "windows_x86_64_gnullvm" 257 | version = "0.48.5" 258 | source = "registry+https://github.com/rust-lang/crates.io-index" 259 | checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" 260 | 261 | [[package]] 262 | name = "windows_x86_64_msvc" 263 | version = "0.48.5" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" 266 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "swapvec" 3 | version = "0.4.2" 4 | edition = "2021" 5 | authors = ["Julian Büttner "] 6 | license = "MIT" 7 | repository = "https://github.com/julianbuettner/swapvec" 8 | readme = "README.md" 9 | description = "A Vector swapping to disk after exceeding a given length" 10 | 11 | [dependencies] 12 | bincode = "1.3.3" 13 | lz4_flex = "0.10.0" 14 | miniz_oxide = "0.7.1" 15 | serde = "1.0.160" 16 | tempfile = "3.5.0" 17 | 18 | [lib] 19 | name = "swapvec" 20 | crate-type = ["lib"] 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Julian Büttner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SwapVec 2 | 3 | A vector which swaps to disk when exceeding a certain length. 4 | 5 | Useful if you do not want to use a queue, but want to first collect 6 | all data and then consume it. 7 | 8 | Imagine multiple threads slowly producing giant vectors of data, 9 | passing it to a single consumer later on. 10 | 11 | Or a CSV upload of multiple gigabytes to an HTTP server, 12 | in which you want to validate every 13 | line while uploading, without directly starting a Database 14 | transaction or keeping everything in memory. 15 | 16 | ## Features 17 | - Multiplatform (Linux, Windows, MacOS) 18 | - Creates temporary file only after exceeding threshold 19 | - Works on `T: Serialize + Deserialize + Clone` 20 | - Temporary file removed even when terminating the program 21 | - Checksums to guarantee integrity 22 | - Can be moved across threads 23 | 24 | ## Limitations 25 | - Due to potentially doing IO, most actions are wrapped in a `Result` 26 | - Currently, no "start swapping after n MiB" is implemented 27 | - Would need element-wise space calculation due to heap elements (e.g. `String`) 28 | - `Compression` currently does not compress. 
It is there to keep the API stable. 29 | - No async support (yet) 30 | - When pushing elements or consuming iterators, SwapVec is "write only" 31 | - Only forward iteration 32 | - Can be reset though 33 | 34 | ## Examples 35 | 36 | ### Basic Usage 37 | 38 | ```rust 39 | use swapvec::SwapVec; 40 | let iterator = (0..9).into_iter(); 41 | let mut much_data = SwapVec::default(); 42 | // Starts using disk for big iterators 43 | much_data.consume(iterator).unwrap(); 44 | for value in much_data.into_iter() { 45 | println!("Read back: {}", value.unwrap()); 46 | } 47 | ``` 48 | 49 | ### Examples 50 | 51 | Currently there is only one simple example, 52 | which performs some basic operations and reports metrics such as 53 | the batches/bytes written to file. 54 | Run it with 55 | 56 | ```bash 57 | cargo run --example demo 58 | ``` 59 | -------------------------------------------------------------------------------- /examples/demo.rs: -------------------------------------------------------------------------------- 1 | use swapvec::{SwapVec, SwapVecConfig}; 2 | 3 | const DATA_MB: u64 = 20; 4 | 5 | fn main() { 6 | let element_count = DATA_MB / 8; 7 | let big_iterator = 0..element_count * 1024 * 1024; 8 | 9 | let config = swapvec::SwapVecConfig { 10 | batch_size: 8 * 1024, 11 | ..SwapVecConfig::default() 12 | }; 13 | let mut swapvec: SwapVec<_> = SwapVec::with_config(config); 14 | swapvec.consume(big_iterator.into_iter()).unwrap(); 15 | 16 | println!("Data size: {}MB", DATA_MB); 17 | println!("Done. Batches written: {}", swapvec.batches_written()); 18 | println!( 19 | "Filesize: {}MB", 20 | swapvec 21 | .file_size() 22 | .map(|x| x as f32 / 1024. / 1024.) 23 | .unwrap_or(0.) 
24 | ); 25 | println!("Read back"); 26 | 27 | let read_back: Vec<_> = swapvec.into_iter().map(|x| x.unwrap()).collect(); 28 | 29 | println!("Elements read back: {}", read_back.len()); 30 | } 31 | -------------------------------------------------------------------------------- /src/checkedfile.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | hash::{Hash, Hasher}, 3 | io::{self, BufReader, BufWriter, Error, Read, Seek, Write}, collections::hash_map::DefaultHasher, 4 | }; 5 | 6 | use crate::SwapVecError; 7 | 8 | #[derive(Debug)] 9 | pub struct BatchInfo { 10 | pub hash: u64, 11 | pub bytes: usize, 12 | } 13 | 14 | pub(crate) struct BatchWriter { 15 | inner: BufWriter, 16 | batch_infos: Vec, 17 | } 18 | 19 | pub(crate) struct BatchReader { 20 | inner: BufReader, 21 | batch_infos: Vec, 22 | batch_index: usize, 23 | buffer: Vec, 24 | } 25 | 26 | fn hash_bytes(bytes: &[u8]) -> u64 { 27 | let mut hasher = DefaultHasher::new(); 28 | bytes.hash(&mut hasher); 29 | hasher.finish() 30 | } 31 | 32 | impl BatchWriter { 33 | pub fn new(writer: T) -> Self { 34 | Self { 35 | batch_infos: Vec::new(), 36 | inner: BufWriter::new(writer), 37 | } 38 | } 39 | pub fn write_batch(&mut self, buffer: &[u8]) -> Result<(), io::Error> { 40 | self.inner.write_all(buffer)?; 41 | self.batch_infos.push(BatchInfo { 42 | hash: hash_bytes(buffer), 43 | bytes: buffer.len(), 44 | }); 45 | self.inner.flush() 46 | } 47 | pub fn bytes_written(&self) -> usize { 48 | self.batch_infos.iter().map(|b| b.bytes).sum() 49 | } 50 | pub fn batch_count(&self) -> usize { 51 | self.batch_infos.len() 52 | } 53 | } 54 | 55 | impl BatchReader { 56 | pub fn reset(&mut self) -> Result<(), Error> { 57 | self.inner.seek(io::SeekFrom::Start(0))?; 58 | self.batch_index = 0; 59 | self.buffer.clear(); 60 | Ok(()) 61 | } 62 | } 63 | 64 | impl BatchReader { 65 | pub fn read_batch(&mut self) -> Result, SwapVecError> { 66 | let batch_info = self.batch_infos.get(self.batch_index); 
67 | self.batch_index += 1; 68 | if batch_info.is_none() { 69 | return Ok(None); 70 | } 71 | let batch_info = batch_info.unwrap(); 72 | self.buffer.resize(batch_info.bytes, 0); 73 | self.inner.read_exact(self.buffer.as_mut_slice())?; 74 | if hash_bytes(self.buffer.as_slice()) != batch_info.hash { 75 | // return Err(SwapVecError::WrongChecksum); 76 | } 77 | Ok(Some(self.buffer.as_slice())) 78 | } 79 | } 80 | 81 | impl TryFrom> for BatchReader { 82 | type Error = std::io::Error; 83 | 84 | fn try_from(value: BatchWriter) -> Result { 85 | let mut inner = value 86 | .inner 87 | .into_inner() 88 | .map_err(|inner_error| inner_error.into_error())?; 89 | inner.seek(io::SeekFrom::Start(0))?; 90 | Ok(Self { 91 | inner: BufReader::new(inner), 92 | batch_infos: value.batch_infos, 93 | batch_index: 0, 94 | buffer: Vec::new(), 95 | }) 96 | } 97 | } 98 | 99 | #[cfg(test)] 100 | mod test { 101 | use std::io::Cursor; 102 | 103 | use super::*; 104 | 105 | #[test] 106 | fn read_write_checked_io() { 107 | let buffer = Cursor::new(vec![0; 128]); 108 | let mut batch_writer = BatchWriter::new(buffer); 109 | batch_writer 110 | .write_batch(&[1, 2, 3]) 111 | .expect("Could not write to IO buffer"); 112 | batch_writer 113 | .write_batch(&[44, 55]) 114 | .expect("Could not write to IO buffer"); 115 | 116 | // batch_writer.wtf(); 117 | // panic!() 118 | let mut reader: BatchReader<_> = batch_writer 119 | .try_into() 120 | .expect("Could not flush into IO buffer"); 121 | assert_eq!( 122 | reader 123 | .read_batch() 124 | .expect("Could not read batch") 125 | .expect("Batch was unexpectedly empty"), 126 | &[1, 2, 3] 127 | ); 128 | reader.reset().expect("Could not reset"); 129 | assert_eq!( 130 | reader 131 | .read_batch() 132 | .expect("Could not read batch") 133 | .expect("Batch was unexpectedly empty"), 134 | &[1, 2, 3] 135 | ); 136 | assert_eq!( 137 | reader 138 | .read_batch() 139 | .expect("Could not read batch") 140 | .expect("Batch was unexpectedly empty"), 141 | &[44, 55] 142 | ); 143 | 
} 144 | } 145 | -------------------------------------------------------------------------------- /src/compression.rs: -------------------------------------------------------------------------------- 1 | use lz4_flex::{compress_prepend_size, decompress_size_prepended}; 2 | 3 | use crate::{swapvec::CompressionLevel, Compression}; 4 | 5 | /// Provide your own compression algorithm by 6 | /// creating an empty struct implementing `compress` 7 | /// and `decompress`. 8 | /// 9 | /// Your compression algorithm is allowed to fail, 10 | /// but _must_ always decompress into the same 11 | /// bytes. Undefined behaviour otherwise. 12 | /// 13 | /// Note: You must always also implement 14 | /// CompressBoxedClone, to allow cloning 15 | /// and debugging of the configuration. 16 | /// 17 | /// ```rust 18 | /// use swapvec::Compress; 19 | /// struct DummyCompression; 20 | /// impl Compress for DummyCompression { 21 | /// fn compress(&self, block: Vec) -> Vec { 22 | /// block 23 | /// } 24 | /// fn decompress(&self, block: Vec) -> Result, ()> { 25 | /// Ok(block) 26 | /// } 27 | /// } 28 | /// 29 | /// let bytes = vec![1, 2, 3]; 30 | /// let compression = DummyCompression; 31 | /// assert_eq!(bytes, compression.decompress(compression.compress(bytes.clone())).unwrap()); 32 | /// ``` 33 | pub trait Compress { 34 | /// Compress bytes blockwise. The compressed block 35 | /// will be put into `self.decompress()` later. 36 | fn compress(&self, block: Vec) -> Vec; 37 | /// Receive block which was earlier `compress()`ed. 38 | /// If the result is `Ok`, the same bytes which were 39 | /// `compress()`es earlier are expected. 
40 | fn decompress(&self, block: Vec) -> Result, ()>; 41 | } 42 | 43 | impl Compress for Option { 44 | fn compress(&self, block: Vec) -> Vec { 45 | match self { 46 | Some(Compression::Lz4) => compress_prepend_size(&block).to_vec(), 47 | Some(Compression::Deflate(level)) => { 48 | let compression_level = match level { 49 | CompressionLevel::Fast => 2, 50 | CompressionLevel::Default => 6, 51 | CompressionLevel::Slow => 9, 52 | }; 53 | miniz_oxide::deflate::compress_to_vec(&block, compression_level) 54 | } 55 | Some(Compression::Custom(algo)) => algo.compress(block), 56 | None => block, 57 | } 58 | } 59 | fn decompress(&self, block: Vec) -> Result, ()> { 60 | match self { 61 | Some(Compression::Lz4) => decompress_size_prepended(&block).map_err(|_| ()), 62 | Some(Compression::Deflate(_)) => { 63 | miniz_oxide::inflate::decompress_to_vec(&block).map_err(|_| ()) 64 | } 65 | Some(Compression::Custom(algo)) => algo.decompress(block), 66 | None => Ok(block), 67 | } 68 | } 69 | } 70 | 71 | /// Your custom compression algorithm struct must be debugable 72 | /// and clonable. Implement this trait to keep the main 73 | /// configuration debugable and clonable. 74 | pub trait CompressBoxedClone: Compress + std::fmt::Debug { 75 | /// Clone your empty struct and return it as a new Box. 76 | fn boxed_clone(&self) -> Box; 77 | } 78 | 79 | #[cfg(test)] 80 | mod test { 81 | use super::*; 82 | 83 | #[test] 84 | fn test_lz4() { 85 | let compression = Some(Compression::Lz4); 86 | let data: Vec = (0..u8::MAX).collect(); 87 | let compressed = compression.compress(data.clone()); 88 | let decompressed = compression.decompress(compressed).unwrap(); 89 | assert_eq!(decompressed, data); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | /// A collection of all possible errors. 
2 | /// 3 | /// Errors could be divided into write and read 4 | /// errors, but this makes error handling a bit less 5 | /// comfortable, so they are united here. 6 | #[derive(Debug)] 7 | #[non_exhaustive] 8 | pub enum SwapVecError { 9 | /// The program is missing permissions to create a temporary file 10 | MissingPermissions, 11 | /// A batch could not be written due to a full disk 12 | OutOfDisk, 13 | /// A read back batch had a wrong checksum 14 | WrongChecksum, 15 | /// A batch could not be decompressed correctly. 16 | /// This also happens only if the file has been corrupted. 17 | Decompression, 18 | /// The batch was read back successfully, 19 | /// but the serialization failed. 20 | /// 21 | /// Take a look at the `Serialize` implementation 22 | /// of your type `T`. 23 | SerializationFailed(bincode::ErrorKind), 24 | /// Every other possibility 25 | Other(std::io::ErrorKind), 26 | } 27 | 28 | impl From for SwapVecError { 29 | fn from(_value: std::io::Error) -> Self { 30 | match _value.kind() { 31 | // TODO https://github.com/rust-lang/rust/issues/86442 32 | // std::io::ErrorKind::StorageFull => Self::OutOfDisk, 33 | std::io::ErrorKind::PermissionDenied => Self::MissingPermissions, 34 | e => Self::Other(e), 35 | } 36 | } 37 | } 38 | 39 | impl From> for SwapVecError { 40 | fn from(value: Box) -> Self { 41 | SwapVecError::SerializationFailed(*value) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(missing_docs)] 2 | #![doc = include_str!("../README.md")] 3 | 4 | mod compression; 5 | mod error; 6 | mod swapvec; 7 | mod swapveciter; 8 | mod checkedfile; 9 | 10 | pub use self::swapvec::{Compression, CompressionLevel, SwapVec, SwapVecConfig}; 11 | pub use compression::{Compress, CompressBoxedClone}; 12 | pub use error::SwapVecError; 13 | pub use swapveciter::SwapVecIter; 14 | 
-------------------------------------------------------------------------------- /src/swapvec.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::VecDeque, 3 | fmt::Debug, 4 | fs::File, 5 | }; 6 | 7 | use serde::{Deserialize, Serialize}; 8 | 9 | use crate::{ 10 | checkedfile::BatchWriter, 11 | compression::{Compress, CompressBoxedClone}, 12 | error::SwapVecError, 13 | swapveciter::SwapVecIter, 14 | }; 15 | 16 | /// Set compression level of the compression 17 | /// algorithm. This maps to different values 18 | /// depending on the chosen algortihm. 19 | #[derive(Clone, Debug, Copy)] 20 | pub enum CompressionLevel { 21 | /// Slower than default, higher compression. 22 | /// Might be useful for big amount of data 23 | /// which requires heavier compression. 24 | Slow, 25 | /// A good ratio of compression ratio to cpu time. 26 | Default, 27 | /// Accept worse compression for speed. 28 | /// Useful for easily compressable data with 29 | /// many repetitions. 30 | Fast, 31 | } 32 | 33 | /// Configure compression for the temporary 34 | /// file into which your data might be swapped out. 35 | #[derive(Debug)] 36 | #[non_exhaustive] 37 | pub enum Compression { 38 | /// Read more about LZ4 here: [LZ4] 39 | /// [LZ4]: https://github.com/lz4/lz4 40 | Lz4, 41 | /// Deflate, mostly known from gzip. 42 | Deflate(CompressionLevel), 43 | /// Provide your own compression algortihm by implementing 44 | /// `Compress`. 45 | Custom(Box), 46 | } 47 | 48 | impl Clone for Compression { 49 | fn clone(&self) -> Self { 50 | match &self { 51 | Self::Lz4 => Self::Lz4, 52 | Self::Deflate(n) => Self::Deflate(*n), 53 | Self::Custom(x) => Self::Custom(x.boxed_clone()), 54 | } 55 | } 56 | } 57 | 58 | /// Configure when and how the vector should swap. 59 | /// 60 | /// The file creation will happen after max(swap_after, batch_size) 61 | /// elements. 
62 | /// 63 | /// Keep in mind, that if the temporary file exists, 64 | /// after ever batch_size elements, at least one write (syscall) 65 | /// will happen. 66 | #[derive(Debug)] 67 | pub struct SwapVecConfig { 68 | /// The vector will create a temporary file and starting to 69 | /// swap after so many elements. 70 | /// If your elements have a certain size in bytes, you can 71 | /// multiply this value to calculate the required storage. 72 | /// 73 | /// If you want to start swapping with the first batch, 74 | /// set to batch_size or smaller. 75 | /// 76 | /// Default: 32 * 1024 * 1024 77 | pub swap_after: usize, 78 | /// How many elements at once should be written to disk. 79 | /// Keep in mind, that for every batch one hash (`u64`) 80 | /// and one bytecount (`usize`) 81 | /// will be kept in memory. 82 | /// 83 | /// One batch write will result in at least one syscall. 84 | /// 85 | /// Default: 32 * 1024 86 | pub batch_size: usize, 87 | /// If and how you want to compress your temporary file. 88 | /// This might be only useful for data which is compressable, 89 | /// like timeseries often are. 90 | /// 91 | /// Default: No compression 92 | pub compression: Option, 93 | } 94 | 95 | impl Default for SwapVecConfig { 96 | fn default() -> Self { 97 | Self { 98 | swap_after: 32 * 1024 * 1024, 99 | batch_size: 32 * 1024, 100 | compression: None, 101 | } 102 | } 103 | } 104 | 105 | /// An only growing array type 106 | /// which swaps to disk, based on it's initial configuration. 107 | /// 108 | /// Create a mutable instance, and then 109 | /// pass iterators or elements to grow it. 
110 | /// ```rust 111 | /// let mut bigvec = swapvec::SwapVec::default(); 112 | /// let iterator = (0..9); 113 | /// bigvec.consume(iterator); 114 | /// bigvec.push(99); 115 | /// let new_iterator = bigvec.into_iter(); 116 | /// ``` 117 | pub struct SwapVec 118 | where 119 | for<'a> T: Serialize + Deserialize<'a>, 120 | { 121 | tempfile: Option>, 122 | vector: VecDeque, 123 | config: SwapVecConfig, 124 | } 125 | 126 | impl Deserialize<'a>> Default for SwapVec { 127 | fn default() -> Self { 128 | Self { 129 | tempfile: None, 130 | vector: VecDeque::new(), 131 | config: SwapVecConfig::default(), 132 | } 133 | } 134 | } 135 | 136 | impl Deserialize<'a>> Debug for SwapVec { 137 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 138 | write!( 139 | f, 140 | "SwapVec {{elements_in_ram: {}, elements_in_file: {}}}", 141 | self.vector.len(), 142 | self.tempfile.as_ref().map(|x| x.batch_count()).unwrap_or(0) * self.config.batch_size, 143 | ) 144 | } 145 | } 146 | 147 | impl SwapVec 148 | where 149 | for<'a> T: Serialize + Deserialize<'a> + Clone, 150 | { 151 | /// Intialize with non-default configuration. 152 | pub fn with_config(config: SwapVecConfig) -> Self { 153 | Self { 154 | tempfile: None, 155 | vector: VecDeque::new(), 156 | config, 157 | } 158 | } 159 | 160 | /// Give away an entire iterator for consumption. 161 | /// Might return an error, due to possibly triggered batch flush (IO). 162 | pub fn consume(&mut self, it: impl Iterator) -> Result<(), SwapVecError> { 163 | for element in it { 164 | self.push(element)?; 165 | self.after_push_work()?; 166 | } 167 | Ok(()) 168 | } 169 | 170 | /// Push a single element. 171 | /// Might return an error, due to possibly triggered batch flush (IO). 172 | /// Will write at most one batch per insert. 
173 | /// If `swap_after` is bigger than `batch_size` and a file is created, 174 | /// every insert will 175 | /// write one batch to disk, until the elements in memory have a count 176 | /// smaller than or equal to batch size. 177 | pub fn push(&mut self, element: T) -> Result<(), SwapVecError> { 178 | self.vector.push_back(element); 179 | self.after_push_work() 180 | } 181 | 182 | /// Check if enough items have been pushed so that 183 | /// the temporary file has been created. 184 | /// Will be false if element count is below swap_after and below batch_size 185 | pub fn written_to_file(&self) -> bool { 186 | self.tempfile.is_some() 187 | } 188 | 189 | /// Get the file size in bytes of the temporary file. 190 | /// Might do IO and therefore could return some Result. 191 | pub fn file_size(&self) -> Option { 192 | self.tempfile.as_ref().map(|f| f.bytes_written()) 193 | } 194 | 195 | /// Basically int(elements pushed / batch size) 196 | pub fn batches_written(&self) -> usize { 197 | match self.tempfile.as_ref() { 198 | None => 0, 199 | Some(f) => f.batch_count(), 200 | } 201 | } 202 | 203 | fn after_push_work(&mut self) -> Result<(), SwapVecError> { 204 | if self.vector.len() <= self.config.batch_size { 205 | return Ok(()); 206 | } 207 | if self.tempfile.is_none() && self.vector.len() <= self.config.swap_after { 208 | return Ok(()); 209 | } 210 | 211 | // Flush batch 212 | if self.tempfile.is_none() { 213 | let tf = tempfile::Builder::new().tempfile_in(".")?.into_file(); 214 | self.tempfile = Some(BatchWriter::new(tf)); 215 | } 216 | assert!(self.tempfile.is_some()); 217 | let batch: Vec<_> = self.vector.drain(0..self.config.batch_size).collect(); 218 | 219 | let buffer = bincode::serialize(&batch)?; 220 | let compressed = self.config.compression.compress(buffer); 221 | self.tempfile.as_mut().unwrap().write_batch(&compressed)?; 222 | Ok(()) 223 | } 224 | } 225 | 226 | impl Deserialize<'a> + Clone> IntoIterator for SwapVec { 227 | type Item = Result; 228 | type 
IntoIter = SwapVecIter; 229 | 230 | fn into_iter(self) -> Self::IntoIter { 231 | SwapVecIter::new(self.tempfile, self.vector, self.config) 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/swapveciter.rs: -------------------------------------------------------------------------------- 1 | use std::collections::VecDeque; 2 | use std::fs::File; 3 | 4 | use serde::{Deserialize, Serialize}; 5 | 6 | use crate::checkedfile::{BatchReader, BatchWriter}; 7 | use crate::compression::Compress; 8 | use crate::error::SwapVecError; 9 | use crate::swapvec::SwapVecConfig; 10 | 11 | struct VecDequeIndex { 12 | value: VecDeque, 13 | } 14 | 15 | impl From> for VecDequeIndex { 16 | fn from(value: VecDeque) -> Self { 17 | Self { value } 18 | } 19 | } 20 | 21 | impl VecDequeIndex { 22 | fn get(&self, i: usize) -> Option { 23 | let (a, b) = self.value.as_slices(); 24 | if i < a.len() { 25 | a.get(i).cloned() 26 | } else { 27 | b.get(i - a.len()).cloned() 28 | } 29 | } 30 | } 31 | 32 | /// Iterator for SwapVec. 33 | /// 34 | /// Items might be read from disk, 35 | /// so every item is wrapped in a `Result`. 36 | /// The iterator aborts after the first error. 37 | /// 38 | /// Dropping the iterator removes the temporary file, if existing. 39 | /// Also quitting the program should remove the temporary file. 40 | pub struct SwapVecIter 41 | where 42 | for<'a> T: Serialize + Deserialize<'a> + Clone, 43 | { 44 | // Do not error on new, because into_iter() 45 | // is not allowed to fail. Fail at first try then. 46 | new_error: Option, 47 | current_batch_rev: Vec, 48 | tempfile: Option>, 49 | // last_elements are elements, 50 | // which have not been written to disk. 51 | // Therefore, for iterating from zero, 52 | // first read elements from disk and 53 | // then from last_elements. 
54 | last_elements: VecDequeIndex, 55 | last_elements_index: usize, 56 | config: SwapVecConfig, 57 | } 58 | 59 | impl Deserialize<'a> + Clone> SwapVecIter { 60 | pub(crate) fn new( 61 | tempfile_written: Option>, 62 | last_elements: VecDeque, 63 | config: SwapVecConfig, 64 | ) -> Self { 65 | let (tempfile, new_error) = match tempfile_written.map(|v| v.try_into()) { 66 | None => (None, None), 67 | Some(Ok(v)) => (Some(v), None), 68 | Some(Err(e)) => (None, Some(e)), 69 | }; 70 | 71 | let last_elements: VecDequeIndex<_> = last_elements.into(); 72 | Self { 73 | new_error, 74 | current_batch_rev: Vec::with_capacity(config.batch_size), 75 | last_elements, 76 | last_elements_index: 0, 77 | tempfile, 78 | config, 79 | } 80 | } 81 | 82 | fn read_batch(&mut self) -> Result>, SwapVecError> { 83 | if self.tempfile.is_none() { 84 | return Ok(None); 85 | } 86 | assert!(self.tempfile.is_some()); 87 | if let Some(err) = self.new_error.take() { 88 | return Err(err.into()); 89 | } 90 | 91 | let tempfile = self.tempfile.as_mut().unwrap(); 92 | let buffer = tempfile.read_batch()?; 93 | if buffer.is_none() { 94 | return Ok(None); 95 | } 96 | let buffer = buffer.unwrap(); 97 | let decompressed: Vec = self 98 | .config 99 | .compression 100 | .decompress(buffer.to_vec()) 101 | .map_err(|_| SwapVecError::Decompression)?; 102 | 103 | let batch: Vec = bincode::deserialize(&decompressed)?; 104 | 105 | Ok(Some(batch)) 106 | } 107 | 108 | fn next_in_batch(&mut self) -> Result, SwapVecError> { 109 | if let Some(v) = self.current_batch_rev.pop() { 110 | return Ok(Some(v)); 111 | } 112 | if let Some(mut new_batch) = self.read_batch()? { 113 | new_batch.reverse(); 114 | self.current_batch_rev = new_batch; 115 | Ok(self.current_batch_rev.pop()) 116 | } else { 117 | Ok(None) 118 | } 119 | } 120 | 121 | /// Resets the iteration, starting from the first element. 122 | /// If a file exists, it will be read from the beginning. 
123 | /// 124 | /// To use this feature, you probably don't want to consume 125 | /// the iterator (`bigvec.map(|x| x * 2)`), but to use 126 | /// [`Iterator::by_ref()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.by_ref) 127 | /// ```rust 128 | /// let mut bigvec = swapvec::SwapVec::default(); 129 | /// bigvec.consume(0..99); 130 | /// let mut new_iterator = bigvec.into_iter(); 131 | /// let sum: usize = new_iterator.by_ref().map(|v| v.unwrap()).sum(); 132 | /// new_iterator.reset(); 133 | /// let sum_double: usize = new_iterator.by_ref().map(|v| v.unwrap() * 2).sum(); 134 | /// ``` 135 | pub fn reset(&mut self) { 136 | self.current_batch_rev.clear(); 137 | self.last_elements_index = 0; 138 | if let Some(tempfile) = self.tempfile.as_mut() { 139 | if let Err(e) = tempfile.reset() { 140 | self.new_error = Some(e); 141 | } 142 | } 143 | } 144 | } 145 | 146 | impl Deserialize<'a> + Clone> Iterator for SwapVecIter { 147 | type Item = Result; 148 | 149 | fn next(&mut self) -> Option { 150 | if let Some(item) = self.current_batch_rev.pop() { 151 | return Some(Ok(item)); 152 | } 153 | 154 | match self.next_in_batch() { 155 | Err(err) => Some(Err(err)), 156 | Ok(Some(item)) => Some(Ok(item)), 157 | Ok(None) => { 158 | let index = self.last_elements_index; 159 | self.last_elements_index += 1; 160 | self.last_elements.get(index).map(|x| Ok(x)) 161 | } 162 | } 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /tests/compression.rs: -------------------------------------------------------------------------------- 1 | use swapvec::{Compression, CompressionLevel, SwapVec, SwapVecConfig}; 2 | 3 | #[test] 4 | fn write_and_read_back_with_compression() { 5 | let data: Vec = (0..999).collect(); 6 | 7 | let compression_configs: Vec> = vec![ 8 | None, 9 | Some(Compression::Lz4), 10 | Some(Compression::Deflate(CompressionLevel::Fast)), 11 | Some(Compression::Deflate(CompressionLevel::Default)), 12 | 
Some(Compression::Deflate(CompressionLevel::Slow)), 13 | ]; 14 | 15 | for compression in compression_configs { 16 | let config = SwapVecConfig { 17 | compression: compression.clone(), 18 | swap_after: 16, 19 | batch_size: 8, 20 | }; 21 | let mut v = SwapVec::with_config(config); 22 | v.consume(data.iter().copied()).unwrap(); 23 | let read_back: Vec = v 24 | .into_iter() 25 | .map(|x| { 26 | x.unwrap_or_else(|e| panic!("Failed for compression {:?} {:?}", compression, e)) 27 | }) 28 | .collect(); 29 | assert_eq!(read_back, data,); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /tests/custom_compression.rs: -------------------------------------------------------------------------------- 1 | use swapvec::{Compress, CompressBoxedClone, Compression, SwapVec, SwapVecConfig}; 2 | 3 | #[derive(Debug)] 4 | struct MyCompression; 5 | 6 | impl Compress for MyCompression { 7 | fn compress(&self, block: Vec) -> Vec { 8 | block 9 | } 10 | fn decompress(&self, block: Vec) -> Result, ()> { 11 | Ok(block) 12 | } 13 | } 14 | 15 | impl CompressBoxedClone for MyCompression { 16 | fn boxed_clone(&self) -> Box { 17 | Box::new(MyCompression) 18 | } 19 | } 20 | 21 | #[test] 22 | fn custom_compression() { 23 | let config = SwapVecConfig { 24 | compression: Some(Compression::Custom(Box::new(MyCompression))), 25 | swap_after: 16, 26 | batch_size: 5, 27 | }; 28 | 29 | let vector: Vec = (0..999).collect(); 30 | let mut v = SwapVec::with_config(config); 31 | v.consume(vector.clone().into_iter()).unwrap(); 32 | assert!(v.written_to_file()); 33 | let vector_read_back: Vec = v.into_iter().map(|x| x.unwrap()).collect(); 34 | assert_eq!(vector, vector_read_back); 35 | } 36 | -------------------------------------------------------------------------------- /tests/reset_iterator.rs: -------------------------------------------------------------------------------- 1 | use swapvec::{SwapVec, SwapVecConfig}; 2 | 3 | #[test] 4 | fn reset_with_file() { 5 | 
let config = SwapVecConfig { 6 | compression: None, 7 | swap_after: 16, 8 | batch_size: 5, 9 | }; 10 | 11 | let vector: Vec = (0..999).collect(); 12 | 13 | let mut v = SwapVec::with_config(config); 14 | v.consume(vector.clone().into_iter()).unwrap(); 15 | 16 | assert!(v.written_to_file()); 17 | 18 | let mut iterator = v.into_iter(); 19 | let vector_read_back: Vec = iterator.by_ref().map(|x| x.unwrap()).collect(); 20 | assert_eq!(vector, vector_read_back); 21 | 22 | iterator.reset(); 23 | let vector_read_back2: Vec = iterator.map(|x| x.unwrap()).collect(); 24 | assert_eq!(vector, vector_read_back2); 25 | } 26 | -------------------------------------------------------------------------------- /tests/write_read_back.rs: -------------------------------------------------------------------------------- 1 | use swapvec::{SwapVec, SwapVecConfig}; 2 | 3 | #[test] 4 | fn with_file() { 5 | let config = SwapVecConfig { 6 | compression: None, 7 | swap_after: 16, 8 | batch_size: 5, 9 | }; 10 | 11 | let vector: Vec = (0..999).collect(); 12 | 13 | let mut v = SwapVec::with_config(config); 14 | v.consume(vector.clone().into_iter()).unwrap(); 15 | 16 | assert!(v.written_to_file()); 17 | 18 | let vector_read_back: Vec = v.into_iter().map(|x| x.unwrap()).collect(); 19 | 20 | assert_eq!(vector, vector_read_back); 21 | } 22 | 23 | #[test] 24 | fn without_file() { 25 | let config = SwapVecConfig { 26 | compression: None, 27 | swap_after: 1001, 28 | batch_size: 5, 29 | }; 30 | 31 | let vector: Vec = (0..999).collect(); 32 | 33 | let mut v = SwapVec::with_config(config); 34 | v.consume(vector.clone().into_iter()).unwrap(); 35 | 36 | assert!(!v.written_to_file()); 37 | 38 | let vector_read_back: Vec = v.into_iter().map(|x| x.unwrap()).collect(); 39 | 40 | assert_eq!(vector, vector_read_back); 41 | } 42 | --------------------------------------------------------------------------------