├── .github └── workflows │ ├── release.yml │ └── test.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── examples ├── custom_serializer.rs ├── custom_type.rs └── quickstart.rs ├── rustfmt.toml └── src ├── buffer.rs ├── chunk.rs ├── lib.rs ├── main.rs ├── merger.rs └── sort.rs /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | release: 5 | types: 6 | - released 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Install Rust 14 | run: rustup update stable 15 | - name: Build and publish 16 | run: | 17 | cargo login ${{ secrets.CRATESIO_TOKEN }} 18 | cargo publish 19 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - dev 7 | - master 8 | push: 9 | branches: 10 | - master 11 | 12 | jobs: 13 | test: 14 | name: Run unit-tests 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Rust 19 | run: rustup update stable 20 | - name: Run tests 21 | run: cargo test --all-features 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | .idea -------------------------------------------------------------------------------- /Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "ext-sort" 3 | version = "0.1.5" 4 | edition = "2021" 5 | license = "Unlicense" 6 | description = "rust external sort algorithm implementation" 7 | readme = "README.md" 8 | 9 | homepage = "https://github.com/dapper91/ext-sort-rs" 10 | documentation = "https://docs.rs/ext-sort/" 11 | repository = "https://github.com/dapper91/ext-sort-rs" 12 | 13 | categories = ["algorithms"] 14 | keywords = ["algorithms", "sort", "sorting", "external-sort", "external"] 15 | 16 | [dependencies] 17 | bytesize = { version = "1.1.0", optional = true } 18 | clap = { version = "3.0.0", features = ["derive"], optional = true } 19 | deepsize = { version = "0.2.0", optional = true } 20 | env_logger = { version = "0.9.0", optional = true} 21 | log = "0.4.8" 22 | rayon = "1.5.0" 23 | rmp-serde = "1.1.1" 24 | serde = { version = "1.0.120", features = ["derive"] } 25 | tempfile = "3.2.0" 26 | 27 | [dev-dependencies] 28 | rstest = "0.12.0" 29 | rand = "0.8.0" 30 | 31 | [features] 32 | memory-limit = ["deepsize"] 33 | 34 | [[bin]] 35 | name = "ext-sort" 36 | required-features = ["bytesize", "clap", "env_logger", "memory-limit"] 37 | 38 | [[example]] 39 | name = "quickstart" 40 | required-features = ["bytesize", "env_logger", "memory-limit"] 41 | 42 | [[example]] 43 | name = "custom_serializer" 44 | required-features = ["env_logger"] 45 | 46 | [[example]] 47 | name = "custom_type" 48 | required-features = ["env_logger"] 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 
7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Crates.io][crates-badge]][crates-url] 2 | [![License][licence-badge]][licence-url] 3 | [![Test Status][test-badge]][test-url] 4 | [![Documentation][doc-badge]][doc-url] 5 | 6 | [crates-badge]: https://img.shields.io/crates/v/ext-sort.svg 7 | [crates-url]: https://crates.io/crates/ext-sort 8 | [licence-badge]: https://img.shields.io/badge/license-Unlicense-blue.svg 9 | [licence-url]: https://github.com/dapper91/ext-sort-rs/blob/master/LICENSE 10 | [test-badge]: https://github.com/dapper91/ext-sort-rs/actions/workflows/test.yml/badge.svg?branch=master 11 | [test-url]: https://github.com/dapper91/ext-sort-rs/actions/workflows/test.yml 12 | [doc-badge]: https://docs.rs/ext-sort/badge.svg 13 | [doc-url]: https://docs.rs/ext-sort 14 | 15 | 16 | # Rust external sort 17 | 18 | `ext-sort` is a rust external sort 
algorithm implementation. 19 | 20 | External sorting is a class of sorting algorithms that can handle massive amounts of data. External sorting 21 | is required when the data being sorted do not fit into the main memory (RAM) of a computer and instead must 22 | reside in slower external memory, usually a hard disk drive. Sorting is achieved in two passes. During the 23 | first pass it sorts chunks of data that each fit in RAM; during the second pass it merges the sorted chunks together. 24 | For more information see [External Sorting](https://en.wikipedia.org/wiki/External_sorting). 25 | 26 | ## Overview 27 | 28 | `ext-sort` supports the following features: 29 | 30 | * **Data agnostic:** 31 | it supports all data types that implement `serde` serialization/deserialization by default, 32 | otherwise you can implement your own serialization/deserialization mechanism. 33 | * **Serialization format agnostic:** 34 | the library uses `MessagePack` serialization format by default, but it can be easily substituted by your custom one 35 | if `MessagePack` serialization/deserialization performance is not sufficient for your task. 36 | * **Multithreading support:** 37 | multi-threaded sorting is supported, which means data is sorted in multiple threads utilizing maximum CPU resources 38 | and reducing sorting time. 39 | * **Memory limit support:** 40 | memory limited sorting is supported. It allows you to limit sorting memory consumption 41 | (`memory-limit` feature required). 
42 | 43 | # Basic example 44 | 45 | Activate `memory-limit` feature of the ext-sort crate on Cargo.toml: 46 | 47 | ```toml 48 | [dependencies] 49 | ext-sort = { version = "^0.1.5", features = ["memory-limit"] } 50 | ``` 51 | 52 | ```rust 53 | use std::fs; 54 | use std::io::{self, prelude::*}; 55 | use std::path; 56 | 57 | use bytesize::MB; 58 | use env_logger; 59 | use log; 60 | 61 | use ext_sort::{buffer::mem::MemoryLimitedBufferBuilder, ExternalSorter, ExternalSorterBuilder}; 62 | 63 | fn main() { 64 | env_logger::Builder::new().filter_level(log::LevelFilter::Debug).init(); 65 | 66 | let input_reader = io::BufReader::new(fs::File::open("input.txt").unwrap()); 67 | let mut output_writer = io::BufWriter::new(fs::File::create("output.txt").unwrap()); 68 | 69 | let sorter: ExternalSorter = ExternalSorterBuilder::new() 70 | .with_tmp_dir(path::Path::new("./")) 71 | .with_buffer(MemoryLimitedBufferBuilder::new(50 * MB)) 72 | .build() 73 | .unwrap(); 74 | 75 | let sorted = sorter.sort(input_reader.lines()).unwrap(); 76 | 77 | for item in sorted.map(Result::unwrap) { 78 | output_writer.write_all(format!("{}\n", item).as_bytes()).unwrap(); 79 | } 80 | output_writer.flush().unwrap(); 81 | } 82 | ``` 83 | -------------------------------------------------------------------------------- /examples/custom_serializer.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::fs::File; 3 | use std::io::{self, prelude::*, BufReader, BufWriter, Take}; 4 | use std::path; 5 | 6 | use env_logger; 7 | use log; 8 | 9 | use ext_sort::{ExternalChunk, ExternalSorter, ExternalSorterBuilder, LimitedBufferBuilder}; 10 | 11 | struct CustomExternalChunk { 12 | reader: io::Take>, 13 | } 14 | 15 | impl ExternalChunk for CustomExternalChunk { 16 | type SerializationError = io::Error; 17 | type DeserializationError = io::Error; 18 | 19 | fn new(reader: Take>) -> Self { 20 | CustomExternalChunk { reader } 21 | } 22 | 23 | fn dump( 24 | 
chunk_writer: &mut BufWriter, 25 | items: impl IntoIterator, 26 | ) -> Result<(), Self::SerializationError> { 27 | for item in items { 28 | chunk_writer.write_all(&item.to_le_bytes())?; 29 | } 30 | 31 | return Ok(()); 32 | } 33 | } 34 | 35 | impl Iterator for CustomExternalChunk { 36 | type Item = Result; 37 | 38 | fn next(&mut self) -> Option { 39 | if self.reader.limit() == 0 { 40 | None 41 | } else { 42 | let mut buf: [u8; 4] = [0; 4]; 43 | match self.reader.read_exact(&mut buf.as_mut_slice()) { 44 | Ok(_) => Some(Ok(u32::from_le_bytes(buf))), 45 | Err(err) => Some(Err(err)), 46 | } 47 | } 48 | } 49 | } 50 | 51 | fn main() { 52 | env_logger::Builder::new().filter_level(log::LevelFilter::Debug).init(); 53 | 54 | let input_reader = io::BufReader::new(fs::File::open("input.txt").unwrap()); 55 | let mut output_writer = io::BufWriter::new(fs::File::create("output.txt").unwrap()); 56 | 57 | let sorter: ExternalSorter = 58 | ExternalSorterBuilder::new() 59 | .with_tmp_dir(path::Path::new("./")) 60 | .with_buffer(LimitedBufferBuilder::new(1_000_000, true)) 61 | .build() 62 | .unwrap(); 63 | 64 | let sorted = sorter 65 | .sort(input_reader.lines().map(|line| { 66 | let line = line.unwrap(); 67 | let number = line.parse().unwrap(); 68 | 69 | return Ok(number); 70 | })) 71 | .unwrap(); 72 | 73 | for item in sorted.map(Result::unwrap) { 74 | output_writer.write_all(format!("{}\n", item).as_bytes()).unwrap(); 75 | } 76 | output_writer.flush().unwrap(); 77 | } 78 | -------------------------------------------------------------------------------- /examples/custom_type.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::error::Error; 3 | use std::fmt::{Display, Formatter}; 4 | use std::fs; 5 | use std::io::{self, prelude::*}; 6 | use std::path; 7 | 8 | use env_logger; 9 | use log; 10 | use serde; 11 | 12 | use ext_sort::{ExternalSorter, ExternalSorterBuilder, LimitedBufferBuilder}; 13 | 14 | 
#[derive(Debug)] 15 | enum CsvParseError { 16 | RowError(String), 17 | ColumnError(String), 18 | } 19 | 20 | impl Display for CsvParseError { 21 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 22 | match self { 23 | CsvParseError::ColumnError(err) => write!(f, "column format error: {}", err), 24 | CsvParseError::RowError(err) => write!(f, "row format error: {}", err), 25 | } 26 | } 27 | } 28 | 29 | impl Error for CsvParseError {} 30 | 31 | #[derive(PartialEq, Eq, serde::Serialize, serde::Deserialize)] 32 | struct Person { 33 | name: String, 34 | surname: String, 35 | age: u8, 36 | } 37 | 38 | impl Person { 39 | fn as_csv(&self) -> String { 40 | format!("{},{},{}", self.name, self.surname, self.age) 41 | } 42 | 43 | fn from_str(s: &str) -> Result { 44 | let parts: Vec<&str> = s.split(',').collect(); 45 | if parts.len() != 3 { 46 | Err(CsvParseError::RowError("wrong columns number".to_string())) 47 | } else { 48 | Ok(Person { 49 | name: parts[0].to_string(), 50 | surname: parts[1].to_string(), 51 | age: parts[2] 52 | .parse() 53 | .map_err(|err| CsvParseError::ColumnError(format!("age field format error: {}", err)))?, 54 | }) 55 | } 56 | } 57 | } 58 | 59 | impl PartialOrd for Person { 60 | fn partial_cmp(&self, other: &Self) -> Option { 61 | Some(self.cmp(&other)) 62 | } 63 | } 64 | 65 | impl Ord for Person { 66 | fn cmp(&self, other: &Self) -> Ordering { 67 | self.surname 68 | .cmp(&other.surname) 69 | .then(self.name.cmp(&other.name)) 70 | .then(self.age.cmp(&other.age)) 71 | } 72 | } 73 | 74 | fn main() { 75 | env_logger::Builder::new().filter_level(log::LevelFilter::Debug).init(); 76 | 77 | let input_reader = io::BufReader::new(fs::File::open("input.csv").unwrap()); 78 | let mut output_writer = io::BufWriter::new(fs::File::create("output.csv").unwrap()); 79 | 80 | let sorter: ExternalSorter = ExternalSorterBuilder::new() 81 | .with_tmp_dir(path::Path::new("./")) 82 | .with_buffer(LimitedBufferBuilder::new(1_000_000, true)) 83 | .build() 84 | 
.unwrap(); 85 | 86 | let sorted = sorter 87 | .sort( 88 | input_reader 89 | .lines() 90 | .map(|line| line.map(|line| Person::from_str(&line).unwrap())), 91 | ) 92 | .unwrap(); 93 | 94 | for item in sorted.map(Result::unwrap) { 95 | output_writer 96 | .write_all(format!("{}\n", item.as_csv()).as_bytes()) 97 | .unwrap(); 98 | } 99 | output_writer.flush().unwrap(); 100 | } 101 | -------------------------------------------------------------------------------- /examples/quickstart.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::io::{self, prelude::*}; 3 | use std::path; 4 | 5 | use bytesize::MB; 6 | use env_logger; 7 | use log; 8 | 9 | use ext_sort::{buffer::mem::MemoryLimitedBufferBuilder, ExternalSorter, ExternalSorterBuilder}; 10 | 11 | fn main() { 12 | env_logger::Builder::new().filter_level(log::LevelFilter::Debug).init(); 13 | 14 | let input_reader = io::BufReader::new(fs::File::open("input.txt").unwrap()); 15 | let mut output_writer = io::BufWriter::new(fs::File::create("output.txt").unwrap()); 16 | 17 | let sorter: ExternalSorter = ExternalSorterBuilder::new() 18 | .with_tmp_dir(path::Path::new("./")) 19 | .with_buffer(MemoryLimitedBufferBuilder::new(50 * MB)) 20 | .build() 21 | .unwrap(); 22 | 23 | let sorted = sorter.sort(input_reader.lines()).unwrap(); 24 | 25 | for item in sorted.map(Result::unwrap) { 26 | output_writer.write_all(format!("{}\n", item).as_bytes()).unwrap(); 27 | } 28 | output_writer.flush().unwrap(); 29 | } 30 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition = "2018" 2 | max_width = 120 3 | -------------------------------------------------------------------------------- /src/buffer.rs: -------------------------------------------------------------------------------- 1 | //! Limited chunk buffer. 
2 | 3 | use rayon; 4 | 5 | /// Limited buffer builder. Creates buffers using provided buffer parameters. 6 | pub trait ChunkBufferBuilder: Default { 7 | /// Building buffer type 8 | type Buffer: ChunkBuffer; 9 | 10 | /// Creates a new [`ChunkBuffer`] trait instance. 11 | fn build(&self) -> Self::Buffer; 12 | } 13 | 14 | /// Base limited buffer interface. Provides methods for pushing data to the buffer and checking buffer state. 15 | pub trait ChunkBuffer: IntoIterator + rayon::slice::ParallelSliceMut + Send { 16 | /// Adds a new element to the buffer. 17 | /// 18 | /// # Arguments 19 | /// * `item` - Item to be added to the buffer 20 | fn push(&mut self, item: T); 21 | 22 | /// Returns the buffer length. 23 | fn len(&self) -> usize; 24 | 25 | /// Checks if the buffer reached the limit. 26 | fn is_full(&self) -> bool; 27 | } 28 | 29 | /// [`LimitedBuffer`] builder. 30 | pub struct LimitedBufferBuilder { 31 | buffer_limit: usize, 32 | preallocate: bool, 33 | } 34 | 35 | impl LimitedBufferBuilder { 36 | /// Creates a new instance of a builder. 37 | /// 38 | /// # Arguments 39 | /// * `buffer_limit` - Buffer size limit in element count 40 | /// * `preallocate` - If buffer should be preallocated 41 | pub fn new(buffer_limit: usize, preallocate: bool) -> Self { 42 | LimitedBufferBuilder { 43 | buffer_limit, 44 | preallocate, 45 | } 46 | } 47 | } 48 | 49 | impl ChunkBufferBuilder for LimitedBufferBuilder { 50 | type Buffer = LimitedBuffer; 51 | 52 | fn build(&self) -> Self::Buffer { 53 | if self.preallocate { 54 | LimitedBuffer::new(self.buffer_limit) 55 | } else { 56 | LimitedBuffer::with_capacity(self.buffer_limit) 57 | } 58 | } 59 | } 60 | 61 | impl Default for LimitedBufferBuilder { 62 | fn default() -> Self { 63 | LimitedBufferBuilder { 64 | buffer_limit: usize::MAX, 65 | preallocate: false, 66 | } 67 | } 68 | } 69 | 70 | /// Buffer limited by elements count. 
71 | pub struct LimitedBuffer { 72 | limit: usize, 73 | inner: Vec, 74 | } 75 | 76 | impl LimitedBuffer { 77 | /// Creates a new buffer instance. 78 | /// 79 | /// # Arguments 80 | /// * `limit` - Buffer elements count limit 81 | pub fn new(limit: usize) -> Self { 82 | LimitedBuffer { 83 | limit, 84 | inner: Vec::new(), 85 | } 86 | } 87 | 88 | /// Creates a new buffer instance with provided capacity. 89 | /// 90 | /// # Arguments 91 | /// * `limit` - Buffer elements count limit 92 | pub fn with_capacity(limit: usize) -> Self { 93 | LimitedBuffer { 94 | limit, 95 | inner: Vec::with_capacity(limit), 96 | } 97 | } 98 | } 99 | 100 | impl ChunkBuffer for LimitedBuffer { 101 | fn push(&mut self, item: T) { 102 | self.inner.push(item); 103 | } 104 | 105 | fn len(&self) -> usize { 106 | self.inner.len() 107 | } 108 | 109 | fn is_full(&self) -> bool { 110 | self.inner.len() >= self.limit 111 | } 112 | } 113 | 114 | impl IntoIterator for LimitedBuffer { 115 | type Item = T; 116 | type IntoIter = as IntoIterator>::IntoIter; 117 | 118 | fn into_iter(self) -> Self::IntoIter { 119 | self.inner.into_iter() 120 | } 121 | } 122 | 123 | impl rayon::slice::ParallelSliceMut for LimitedBuffer { 124 | fn as_parallel_slice_mut(&mut self) -> &mut [T] { 125 | self.inner.as_mut_slice() 126 | } 127 | } 128 | 129 | #[cfg(test)] 130 | mod test { 131 | use super::{ChunkBuffer, ChunkBufferBuilder, LimitedBufferBuilder}; 132 | 133 | #[test] 134 | fn test_limited_buffer() { 135 | let builder = LimitedBufferBuilder::new(2, true); 136 | let mut buffer = builder.build(); 137 | 138 | buffer.push(0); 139 | assert_eq!(buffer.is_full(), false); 140 | buffer.push(1); 141 | assert_eq!(buffer.is_full(), true); 142 | 143 | let data = Vec::from_iter(buffer); 144 | assert_eq!(data, vec![0, 1]); 145 | } 146 | } 147 | 148 | #[cfg(feature = "memory-limit")] 149 | pub mod mem { 150 | use deepsize; 151 | use rayon; 152 | 153 | use super::{ChunkBuffer, ChunkBufferBuilder}; 154 | 155 | /// [`MemoryLimitedBuffer`] 
builder. 156 | pub struct MemoryLimitedBufferBuilder { 157 | buffer_limit: u64, 158 | } 159 | 160 | impl MemoryLimitedBufferBuilder { 161 | /// Creates a new instance of a builder. 162 | /// 163 | /// # Arguments 164 | /// * `buffer_limit` - Buffer size limit in bytes 165 | pub fn new(buffer_limit: u64) -> Self { 166 | MemoryLimitedBufferBuilder { buffer_limit } 167 | } 168 | } 169 | 170 | impl ChunkBufferBuilder for MemoryLimitedBufferBuilder 171 | where 172 | T: deepsize::DeepSizeOf, 173 | { 174 | type Buffer = MemoryLimitedBuffer; 175 | 176 | fn build(&self) -> Self::Buffer { 177 | MemoryLimitedBuffer::new(self.buffer_limit) 178 | } 179 | } 180 | 181 | impl Default for MemoryLimitedBufferBuilder { 182 | fn default() -> Self { 183 | MemoryLimitedBufferBuilder { buffer_limit: u64::MAX } 184 | } 185 | } 186 | 187 | /// Buffer limited by consumed memory. 188 | pub struct MemoryLimitedBuffer { 189 | limit: u64, 190 | current_size: u64, 191 | inner: Vec, 192 | } 193 | 194 | impl MemoryLimitedBuffer { 195 | /// Creates a new instance of a buffer. 196 | /// 197 | /// # Arguments 198 | /// * `limit` - Buffer size limit in bytes 199 | pub fn new(limit: u64) -> Self { 200 | MemoryLimitedBuffer { 201 | limit, 202 | current_size: 0, 203 | inner: Vec::new(), 204 | } 205 | } 206 | 207 | /// Returns buffer size in bytes. 
208 | pub fn mem_size(&self) -> u64 { 209 | self.current_size 210 | } 211 | } 212 | 213 | impl ChunkBuffer for MemoryLimitedBuffer 214 | where 215 | T: deepsize::DeepSizeOf, 216 | { 217 | fn push(&mut self, item: T) { 218 | self.current_size += item.deep_size_of() as u64; 219 | self.inner.push(item); 220 | } 221 | 222 | fn len(&self) -> usize { 223 | self.inner.len() 224 | } 225 | 226 | fn is_full(&self) -> bool { 227 | self.current_size >= self.limit 228 | } 229 | } 230 | 231 | impl IntoIterator for MemoryLimitedBuffer { 232 | type Item = T; 233 | type IntoIter = as IntoIterator>::IntoIter; 234 | 235 | fn into_iter(self) -> Self::IntoIter { 236 | self.inner.into_iter() 237 | } 238 | } 239 | 240 | impl rayon::slice::ParallelSliceMut for MemoryLimitedBuffer { 241 | fn as_parallel_slice_mut(&mut self) -> &mut [T] { 242 | self.inner.as_mut_slice() 243 | } 244 | } 245 | 246 | #[cfg(test)] 247 | mod test { 248 | use deepsize; 249 | 250 | use super::{ChunkBuffer, ChunkBufferBuilder, MemoryLimitedBufferBuilder}; 251 | 252 | #[derive(Debug, Clone, PartialEq, Eq, deepsize::DeepSizeOf)] 253 | struct MyType { 254 | number: i64, 255 | string: String, 256 | } 257 | 258 | #[test] 259 | fn test_memory_limited_buffer() { 260 | let builder = MemoryLimitedBufferBuilder::new(76); 261 | let mut buffer = builder.build(); 262 | 263 | let item1 = MyType { 264 | number: 0, // 8 bytes 265 | string: "hello!".into(), // 8 + 8 + 8 + 6 = 30 bytes 266 | }; 267 | buffer.push(item1.clone()); 268 | assert_eq!(buffer.mem_size(), 38); 269 | assert_eq!(buffer.is_full(), false); 270 | 271 | let item2 = MyType { 272 | number: 1, // 8 bytes 273 | string: "world!".into(), // 8 + 8 + 8 + 6 = 30 bytes 274 | }; 275 | buffer.push(item2.clone()); 276 | assert_eq!(buffer.mem_size(), 76); 277 | assert_eq!(buffer.is_full(), true); 278 | 279 | let actual_data = Vec::from_iter(buffer); 280 | let expected_data = vec![item1, item2]; 281 | assert_eq!(actual_data, expected_data); 282 | } 283 | } 284 | } 285 | 
-------------------------------------------------------------------------------- /src/chunk.rs: -------------------------------------------------------------------------------- 1 | //! External chunk. 2 | 3 | use std::error::Error; 4 | use std::fmt::{self, Display}; 5 | use std::fs; 6 | use std::io; 7 | use std::io::prelude::*; 8 | use std::marker::PhantomData; 9 | 10 | use tempfile; 11 | 12 | /// External chunk error 13 | #[derive(Debug)] 14 | pub enum ExternalChunkError { 15 | /// Common I/O error. 16 | IO(io::Error), 17 | /// Data serialization error. 18 | SerializationError(S), 19 | } 20 | 21 | impl Error for ExternalChunkError {} 22 | 23 | impl Display for ExternalChunkError { 24 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 25 | match self { 26 | ExternalChunkError::IO(err) => write!(f, "{}", err), 27 | ExternalChunkError::SerializationError(err) => write!(f, "{}", err), 28 | } 29 | } 30 | } 31 | 32 | impl From for ExternalChunkError { 33 | fn from(err: io::Error) -> Self { 34 | ExternalChunkError::IO(err) 35 | } 36 | } 37 | 38 | /// External chunk interface. Provides methods for creating a chunk stored on file system and reading data from it. 39 | pub trait ExternalChunk: Sized + Iterator> { 40 | /// Error returned when data serialization failed. 41 | type SerializationError: Error; 42 | /// Error returned when data deserialization failed. 43 | type DeserializationError: Error; 44 | 45 | /// Builds an instance of an external chunk creating file and dumping the items to it. 
46 | /// 47 | /// # Arguments 48 | /// * `dir` - Directory the chunk file is created in 49 | /// * `items` - Items to be dumped to the chunk 50 | /// * `buf_size` - File I/O buffer size 51 | fn build( 52 | dir: &tempfile::TempDir, 53 | items: impl IntoIterator, 54 | buf_size: Option, 55 | ) -> Result> { 56 | let tmp_file = tempfile::tempfile_in(dir)?; 57 | 58 | let mut chunk_writer = match buf_size { 59 | Some(buf_size) => io::BufWriter::with_capacity(buf_size, tmp_file.try_clone()?), 60 | None => io::BufWriter::new(tmp_file.try_clone()?), 61 | }; 62 | 63 | Self::dump(&mut chunk_writer, items).map_err(ExternalChunkError::SerializationError)?; 64 | 65 | chunk_writer.flush()?; 66 | 67 | let mut chunk_reader = match buf_size { 68 | Some(buf_size) => io::BufReader::with_capacity(buf_size, tmp_file.try_clone()?), 69 | None => io::BufReader::new(tmp_file.try_clone()?), 70 | }; 71 | 72 | chunk_reader.rewind()?; 73 | let file_len = tmp_file.metadata()?.len(); 74 | 75 | return Ok(Self::new(chunk_reader.take(file_len))); 76 | } 77 | 78 | /// Creates an instance of an external chunk. 79 | /// 80 | /// # Arguments 81 | /// * `reader` - The reader of the file the chunk is stored in 82 | fn new(reader: io::Take>) -> Self; 83 | 84 | /// Dumps items to an external file. 85 | /// 86 | /// # Arguments 87 | /// * `chunk_writer` - The writer of the file the data should be dumped in 88 | /// * `items` - Items to be dumped 89 | fn dump( 90 | chunk_writer: &mut io::BufWriter, 91 | items: impl IntoIterator, 92 | ) -> Result<(), Self::SerializationError>; 93 | } 94 | 95 | /// RMP (Rust MessagePack) external chunk implementation. 96 | /// It uses MessagePack as a data serialization format. 97 | /// For more information see [msgpack.org](https://msgpack.org/). 
98 | /// 99 | /// # Example 100 | /// 101 | /// ```no_run 102 | /// use tempfile::TempDir; 103 | /// use ext_sort::{ExternalChunk, RmpExternalChunk}; 104 | /// 105 | /// let dir = TempDir::new().unwrap(); 106 | /// let chunk: RmpExternalChunk = ExternalChunk::build(&dir, (0..1000), None).unwrap(); 107 | /// ``` 108 | pub struct RmpExternalChunk { 109 | reader: io::Take>, 110 | 111 | item_type: PhantomData, 112 | } 113 | 114 | impl ExternalChunk for RmpExternalChunk 115 | where 116 | T: serde::ser::Serialize + serde::de::DeserializeOwned, 117 | { 118 | type SerializationError = rmp_serde::encode::Error; 119 | type DeserializationError = rmp_serde::decode::Error; 120 | 121 | fn new(reader: io::Take>) -> Self { 122 | RmpExternalChunk { 123 | reader, 124 | item_type: PhantomData, 125 | } 126 | } 127 | 128 | fn dump( 129 | mut chunk_writer: &mut io::BufWriter, 130 | items: impl IntoIterator, 131 | ) -> Result<(), Self::SerializationError> { 132 | for item in items.into_iter() { 133 | rmp_serde::encode::write(&mut chunk_writer, &item)?; 134 | } 135 | 136 | return Ok(()); 137 | } 138 | } 139 | 140 | impl Iterator for RmpExternalChunk 141 | where 142 | T: serde::ser::Serialize + serde::de::DeserializeOwned, 143 | { 144 | type Item = Result>::DeserializationError>; 145 | 146 | fn next(&mut self) -> Option { 147 | if self.reader.limit() == 0 { 148 | None 149 | } else { 150 | match rmp_serde::decode::from_read(&mut self.reader) { 151 | Ok(result) => Some(Ok(result)), 152 | Err(err) => Some(Err(err)), 153 | } 154 | } 155 | } 156 | } 157 | 158 | #[cfg(test)] 159 | mod test { 160 | use rstest::*; 161 | 162 | use super::{ExternalChunk, RmpExternalChunk}; 163 | 164 | #[fixture] 165 | fn tmp_dir() -> tempfile::TempDir { 166 | tempfile::tempdir_in("./").unwrap() 167 | } 168 | 169 | #[rstest] 170 | fn test_rmp_chunk(tmp_dir: tempfile::TempDir) { 171 | let saved = Vec::from_iter(0..100); 172 | 173 | let chunk: RmpExternalChunk = ExternalChunk::build(&tmp_dir, saved.clone(), 
None).unwrap(); 174 | 175 | let restored: Result, _> = chunk.collect(); 176 | let restored = restored.unwrap(); 177 | 178 | assert_eq!(restored, saved); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! `ext-sort` is a rust external sort algorithm implementation. 2 | //! 3 | //! External sorting is a class of sorting algorithms that can handle massive amounts of data. External sorting 4 | //! is required when the data being sorted do not fit into the main memory (RAM) of a computer and instead must 5 | //! reside in slower external memory, usually a hard disk drive. Sorting is achieved in two passes. During the 6 | //! first pass it sorts chunks of data that each fit in RAM; during the second pass it merges the sorted chunks 7 | //! together. For more information see [External Sorting](https://en.wikipedia.org/wiki/External_sorting). 8 | //! 9 | //! # Overview 10 | //! 11 | //! `ext-sort` supports the following features: 12 | //! 13 | //! * **Data agnostic:** 14 | //! it supports all data types that implement `serde` serialization/deserialization by default, 15 | //! otherwise you can implement your own serialization/deserialization mechanism. 16 | //! * **Serialization format agnostic:** 17 | //! the library uses `MessagePack` serialization format by default, but it can be easily substituted by your custom 18 | //! one if `MessagePack` serialization/deserialization performance is not sufficient for your task. 19 | //! * **Multithreading support:** 20 | //! multi-threaded sorting is supported, which means data is sorted in multiple threads utilizing maximum CPU 21 | //! resources and reducing sorting time. 22 | //! * **Memory limit support:** 23 | //! memory limited sorting is supported. It allows you to limit sorting memory consumption 24 | //! (`memory-limit` feature required). 25 | //! 26 | //! 
# Example 27 | //! 28 | //! ```no_run 29 | //! use std::fs; 30 | //! use std::io::{self, prelude::*}; 31 | //! use std::path; 32 | //! 33 | //! use bytesize::MB; 34 | //! use env_logger; 35 | //! use log; 36 | //! 37 | //! use ext_sort::{buffer::mem::MemoryLimitedBufferBuilder, ExternalSorter, ExternalSorterBuilder}; 38 | //! 39 | //! fn main() { 40 | //! env_logger::Builder::new().filter_level(log::LevelFilter::Debug).init(); 41 | //! 42 | //! let input_reader = io::BufReader::new(fs::File::open("input.txt").unwrap()); 43 | //! let mut output_writer = io::BufWriter::new(fs::File::create("output.txt").unwrap()); 44 | //! 45 | //! let sorter: ExternalSorter = ExternalSorterBuilder::new() 46 | //! .with_tmp_dir(path::Path::new("./")) 47 | //! .with_buffer(MemoryLimitedBufferBuilder::new(50 * MB)) 48 | //! .build() 49 | //! .unwrap(); 50 | //! 51 | //! let sorted = sorter.sort(input_reader.lines()).unwrap(); 52 | //! 53 | //! for item in sorted.map(Result::unwrap) { 54 | //! output_writer.write_all(format!("{}\n", item).as_bytes()).unwrap(); 55 | //! } 56 | //! output_writer.flush().unwrap(); 57 | //! } 58 | //! 
``` 59 | 60 | pub mod buffer; 61 | pub mod chunk; 62 | pub mod merger; 63 | pub mod sort; 64 | 65 | pub use buffer::{ChunkBuffer, ChunkBufferBuilder, LimitedBuffer, LimitedBufferBuilder}; 66 | pub use chunk::{ExternalChunk, RmpExternalChunk}; 67 | pub use merger::BinaryHeapMerger; 68 | pub use sort::{ExternalSorter, ExternalSorterBuilder, SortError}; 69 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::io::{self, prelude::*}; 3 | use std::path; 4 | use std::process; 5 | 6 | use bytesize::ByteSize; 7 | use clap::ArgEnum; 8 | use env_logger; 9 | use log; 10 | 11 | use ext_sort::buffer::mem::MemoryLimitedBufferBuilder; 12 | use ext_sort::{ExternalSorter, ExternalSorterBuilder}; 13 | 14 | fn main() { 15 | let arg_parser = build_arg_parser(); 16 | 17 | let log_level: LogLevel = arg_parser.value_of_t_or_exit("log_level"); 18 | init_logger(log_level); 19 | 20 | let order: Order = arg_parser.value_of_t_or_exit("sort"); 21 | let tmp_dir: Option<&str> = arg_parser.value_of("tmp_dir"); 22 | let chunk_size = arg_parser.value_of("chunk_size").expect("value is required"); 23 | let threads: Option = arg_parser 24 | .is_present("threads") 25 | .then(|| arg_parser.value_of_t_or_exit("threads")); 26 | 27 | let input = arg_parser.value_of("input").expect("value is required"); 28 | let input_stream = match fs::File::open(input) { 29 | Ok(file) => io::BufReader::new(file), 30 | Err(err) => { 31 | log::error!("input file opening error: {}", err); 32 | process::exit(1); 33 | } 34 | }; 35 | 36 | let output = arg_parser.value_of("output").expect("value is required"); 37 | let mut output_stream = match fs::File::create(output) { 38 | Ok(file) => io::BufWriter::new(file), 39 | Err(err) => { 40 | log::error!("output file creation error: {}", err); 41 | process::exit(1); 42 | } 43 | }; 44 | 45 | let mut sorter_builder = 
ExternalSorterBuilder::new(); 46 | if let Some(threads) = threads { 47 | sorter_builder = sorter_builder.with_threads_number(threads); 48 | } 49 | 50 | if let Some(tmp_dir) = tmp_dir { 51 | sorter_builder = sorter_builder.with_tmp_dir(path::Path::new(tmp_dir)); 52 | } 53 | 54 | sorter_builder = sorter_builder.with_buffer(MemoryLimitedBufferBuilder::new( 55 | chunk_size.parse::().expect("value is pre-validated").as_u64(), 56 | )); 57 | 58 | let sorter: ExternalSorter = match sorter_builder.build() { 59 | Ok(sorter) => sorter, 60 | Err(err) => { 61 | log::error!("sorter initialization error: {}", err); 62 | process::exit(1); 63 | } 64 | }; 65 | 66 | let compare = |a: &String, b: &String| { 67 | if order == Order::Asc { 68 | a.cmp(&b) 69 | } else { 70 | a.cmp(&b).reverse() 71 | } 72 | }; 73 | 74 | let sorted_stream = match sorter.sort_by(input_stream.lines(), compare) { 75 | Ok(sorted_stream) => sorted_stream, 76 | Err(err) => { 77 | log::error!("data sorting error: {}", err); 78 | process::exit(1); 79 | } 80 | }; 81 | 82 | for line in sorted_stream { 83 | let line = match line { 84 | Ok(line) => line, 85 | Err(err) => { 86 | log::error!("sorting stream error: {}", err); 87 | process::exit(1); 88 | } 89 | }; 90 | if let Err(err) = output_stream.write_all(format!("{}\n", line).as_bytes()) { 91 | log::error!("data saving error: {}", err); 92 | process::exit(1); 93 | }; 94 | } 95 | 96 | if let Err(err) = output_stream.flush() { 97 | log::error!("data flushing error: {}", err); 98 | process::exit(1); 99 | } 100 | } 101 | 102 | #[derive(Copy, Clone, clap::ArgEnum)] 103 | enum LogLevel { 104 | Off, 105 | Error, 106 | Warn, 107 | Info, 108 | Debug, 109 | Trace, 110 | } 111 | 112 | impl LogLevel { 113 | pub fn possible_values() -> impl Iterator> { 114 | Self::value_variants().iter().filter_map(|v| v.to_possible_value()) 115 | } 116 | } 117 | 118 | impl std::str::FromStr for LogLevel { 119 | type Err = String; 120 | 121 | fn from_str(s: &str) -> Result { 122 | ::from_str(s, 
false) 123 | } 124 | } 125 | 126 | #[derive(Copy, Clone, PartialEq, clap::ArgEnum)] 127 | enum Order { 128 | Asc, 129 | Desc, 130 | } 131 | 132 | impl Order { 133 | pub fn possible_values() -> impl Iterator> { 134 | Order::value_variants().iter().filter_map(|v| v.to_possible_value()) 135 | } 136 | } 137 | 138 | impl std::str::FromStr for Order { 139 | type Err = String; 140 | 141 | fn from_str(s: &str) -> Result { 142 | ::from_str(s, false) 143 | } 144 | } 145 | 146 | fn build_arg_parser() -> clap::ArgMatches { 147 | clap::App::new("ext-sort") 148 | .author("Dmitry P. ") 149 | .about("external sorter") 150 | .arg( 151 | clap::Arg::new("input") 152 | .short('i') 153 | .long("input") 154 | .help("file to be sorted") 155 | .required(true) 156 | .takes_value(true), 157 | ) 158 | .arg( 159 | clap::Arg::new("output") 160 | .short('o') 161 | .long("output") 162 | .help("result file") 163 | .required(true) 164 | .takes_value(true), 165 | ) 166 | .arg( 167 | clap::Arg::new("sort") 168 | .short('s') 169 | .long("sort") 170 | .help("sorting order") 171 | .takes_value(true) 172 | .default_value("asc") 173 | .possible_values(Order::possible_values()), 174 | ) 175 | .arg( 176 | clap::Arg::new("log_level") 177 | .short('l') 178 | .long("loglevel") 179 | .help("logging level") 180 | .takes_value(true) 181 | .default_value("info") 182 | .possible_values(LogLevel::possible_values()), 183 | ) 184 | .arg( 185 | clap::Arg::new("threads") 186 | .short('t') 187 | .long("threads") 188 | .help("number of threads to use for parallel sorting") 189 | .takes_value(true), 190 | ) 191 | .arg( 192 | clap::Arg::new("tmp_dir") 193 | .short('d') 194 | .long("tmp-dir") 195 | .help("directory to be used to store temporary data") 196 | .takes_value(true), 197 | ) 198 | .arg( 199 | clap::Arg::new("chunk_size") 200 | .short('c') 201 | .long("chunk-size") 202 | .help("chunk size") 203 | .required(true) 204 | .takes_value(true) 205 | .validator(|v| match v.parse::() { 206 | Ok(_) => Ok(()), 207 | Err(err) 
=> Err(format!("Chunk size format incorrect: {}", err)), 208 | }), 209 | ) 210 | .get_matches() 211 | } 212 | 213 | fn init_logger(log_level: LogLevel) { 214 | env_logger::Builder::new() 215 | .filter_level(match log_level { 216 | LogLevel::Off => log::LevelFilter::Off, 217 | LogLevel::Error => log::LevelFilter::Error, 218 | LogLevel::Warn => log::LevelFilter::Warn, 219 | LogLevel::Info => log::LevelFilter::Info, 220 | LogLevel::Debug => log::LevelFilter::Debug, 221 | LogLevel::Trace => log::LevelFilter::Trace, 222 | }) 223 | .format_timestamp_millis() 224 | .init(); 225 | } 226 | -------------------------------------------------------------------------------- /src/merger.rs: -------------------------------------------------------------------------------- 1 | //! Binary heap merger. 2 | 3 | use std::cmp::Ordering; 4 | use std::collections::BinaryHeap; 5 | use std::error::Error; 6 | 7 | /// Value wrapper binding custom compare function to a value. 8 | struct OrderedWrapper 9 | where 10 | F: Fn(&T, &T) -> Ordering, 11 | { 12 | value: T, 13 | compare: F, 14 | } 15 | 16 | impl OrderedWrapper 17 | where 18 | F: Fn(&T, &T) -> Ordering, 19 | { 20 | fn wrap(value: T, compare: F) -> Self { 21 | OrderedWrapper { value, compare } 22 | } 23 | 24 | fn unwrap(self) -> T { 25 | self.value 26 | } 27 | } 28 | 29 | impl PartialEq for OrderedWrapper 30 | where 31 | F: Fn(&T, &T) -> Ordering, 32 | { 33 | fn eq(&self, other: &Self) -> bool { 34 | self.cmp(other) == Ordering::Equal 35 | } 36 | } 37 | 38 | impl Eq for OrderedWrapper where F: Fn(&T, &T) -> Ordering {} 39 | 40 | impl PartialOrd for OrderedWrapper 41 | where 42 | F: Fn(&T, &T) -> Ordering, 43 | { 44 | fn partial_cmp(&self, other: &Self) -> Option { 45 | Some(self.cmp(other)) 46 | } 47 | } 48 | impl Ord for OrderedWrapper 49 | where 50 | F: Fn(&T, &T) -> Ordering, 51 | { 52 | fn cmp(&self, other: &Self) -> Ordering { 53 | (self.compare)(&self.value, &other.value) 54 | } 55 | } 56 | 57 | /// Binary heap merger implementation. 
58 | /// Merges multiple sorted inputs into a single sorted output. 59 | /// Time complexity is *m* \* log(*n*) in worst case where *m* is the number of items, 60 | /// *n* is the number of chunks (inputs). 61 | pub struct BinaryHeapMerger 62 | where 63 | E: Error, 64 | F: Fn(&T, &T) -> Ordering, 65 | C: IntoIterator>, 66 | { 67 | // binary heap is max-heap by default so we reverse it to convert it to min-heap 68 | items: BinaryHeap<(std::cmp::Reverse>, std::cmp::Reverse)>, 69 | chunks: Vec, 70 | initiated: bool, 71 | compare: F, 72 | } 73 | 74 | impl BinaryHeapMerger 75 | where 76 | E: Error, 77 | F: Fn(&T, &T) -> Ordering, 78 | C: IntoIterator>, 79 | { 80 | /// Creates an instance of a binary heap merger using chunks as inputs. 81 | /// Every chunk must already be sorted according to `compare`, otherwise the output order is unspecified. 82 | /// 83 | /// # Arguments 84 | /// * `chunks` - Sorted chunks to be merged into a single sorted stream 85 | pub fn new(chunks: I, compare: F) -> Self 86 | where 87 | I: IntoIterator, 88 | { 89 | let chunks = Vec::from_iter(chunks.into_iter().map(|c| c.into_iter())); 90 | let items = BinaryHeap::with_capacity(chunks.len()); 91 | 92 | return BinaryHeapMerger { 93 | chunks, 94 | items, 95 | compare, 96 | initiated: false, 97 | }; 98 | } 99 | } 100 | 101 | impl Iterator for BinaryHeapMerger 102 | where 103 | E: Error, 104 | F: Fn(&T, &T) -> Ordering + Copy, 105 | C: IntoIterator>, 106 | { 107 | type Item = Result; 108 | 109 | /// Returns the next item from the inputs in ascending order. 
110 | fn next(&mut self) -> Option { 111 | if !self.initiated { 112 | for (idx, chunk) in self.chunks.iter_mut().enumerate() { 113 | if let Some(item) = chunk.next() { 114 | match item { 115 | Ok(item) => self 116 | .items 117 | .push((std::cmp::Reverse(OrderedWrapper::wrap(item, self.compare)), std::cmp::Reverse(idx))), 118 | Err(err) => return Some(Err(err)), 119 | } 120 | } 121 | } 122 | self.initiated = true; 123 | } 124 | 125 | let (result, idx) = self.items.pop()?; 126 | if let Some(item) = self.chunks[idx.0].next() { 127 | match item { 128 | Ok(item) => self 129 | .items 130 | .push((std::cmp::Reverse(OrderedWrapper::wrap(item, self.compare)), idx)), 131 | Err(err) => return Some(Err(err)), 132 | } 133 | } 134 | 135 | return Some(Ok(result.0.unwrap())); 136 | } 137 | } 138 | 139 | #[cfg(test)] 140 | mod test { 141 | use rstest::*; 142 | use std::error::Error; 143 | use std::io::{self, ErrorKind}; 144 | 145 | use super::BinaryHeapMerger; 146 | 147 | #[rstest] 148 | #[case( 149 | vec![], 150 | vec![], 151 | )] 152 | #[case( 153 | vec![ 154 | vec![], 155 | vec![] 156 | ], 157 | vec![], 158 | )] 159 | #[case( 160 | vec![ 161 | vec![Ok(4), Ok(5), Ok(7)], 162 | vec![Ok(1), Ok(6)], 163 | vec![Ok(3)], 164 | vec![], 165 | ], 166 | vec![Ok(1), Ok(3), Ok(4), Ok(5), Ok(6), Ok(7)], 167 | )] 168 | #[case( 169 | vec![ 170 | vec![Result::Err(io::Error::new(ErrorKind::Other, "test error"))] 171 | ], 172 | vec![ 173 | Result::Err(io::Error::new(ErrorKind::Other, "test error")) 174 | ], 175 | )] 176 | #[case( 177 | vec![ 178 | vec![Ok(3), Result::Err(io::Error::new(ErrorKind::Other, "test error"))], 179 | vec![Ok(1), Ok(2)], 180 | ], 181 | vec![ 182 | Ok(1), 183 | Ok(2), 184 | Result::Err(io::Error::new(ErrorKind::Other, "test error")), 185 | ], 186 | )] 187 | fn test_merger( 188 | #[case] chunks: Vec>>, 189 | #[case] expected_result: Vec>, 190 | ) { 191 | let merger = BinaryHeapMerger::new(chunks, i32::cmp); 192 | let actual_result = merger.collect(); 193 | assert!( 194 | 
compare_vectors_of_result::<_, io::Error>(&actual_result, &expected_result), 195 | "actual={:?}, expected={:?}", 196 | actual_result, 197 | expected_result 198 | ); 199 | } 200 | /// Returns `true` only if both vectors have the same length and every pair matches: `Ok` values by equality, `Err` values by their message. The length is compared explicitly because `zip` stops at the shorter input. 201 | fn compare_vectors_of_result( 202 | actual: &Vec>, 203 | expected: &Vec>, 204 | ) -> bool { 205 | actual.len() == expected.len() && actual 206 | .into_iter() 207 | .zip(expected) 208 | .all( 209 | |(actual_result, expected_result)| match (actual_result, expected_result) { 210 | (Ok(actual_result), Ok(expected_result)) if actual_result == expected_result => true, 211 | (Err(actual_err), Err(expected_err)) => actual_err.to_string() == expected_err.to_string(), 212 | _ => false, 213 | }, 214 | ) 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /src/sort.rs: -------------------------------------------------------------------------------- 1 | //! External sorter. 2 | 3 | use log; 4 | use std::cmp::Ordering; 5 | use std::error::Error; 6 | use std::fmt; 7 | use std::fmt::{Debug, Display}; 8 | use std::io; 9 | use std::marker::PhantomData; 10 | use std::path::Path; 11 | 12 | use crate::chunk::{ExternalChunk, ExternalChunkError, RmpExternalChunk}; 13 | use crate::merger::BinaryHeapMerger; 14 | use crate::{ChunkBuffer, ChunkBufferBuilder, LimitedBufferBuilder}; 15 | 16 | /// Sorting error. 17 | #[derive(Debug)] 18 | pub enum SortError { 19 | /// Temporary directory or file creation error. 20 | TempDir(io::Error), 21 | /// Workers thread pool initialization error. 22 | ThreadPoolBuildError(rayon::ThreadPoolBuildError), 23 | /// Common I/O error. 24 | IO(io::Error), 25 | /// Data serialization error. 26 | SerializationError(S), 27 | /// Data deserialization error. 
28 | DeserializationError(D), 29 | /// Input data stream error 30 | InputError(I), 31 | } 32 | 33 | impl Error for SortError 34 | where 35 | S: Error + 'static, 36 | D: Error + 'static, 37 | I: Error + 'static, 38 | { 39 | fn source(&self) -> Option<&(dyn Error + 'static)> { 40 | Some(match &self { 41 | SortError::TempDir(err) => err, 42 | SortError::ThreadPoolBuildError(err) => err, 43 | SortError::IO(err) => err, 44 | SortError::SerializationError(err) => err, 45 | SortError::DeserializationError(err) => err, 46 | SortError::InputError(err) => err, 47 | }) 48 | } 49 | } 50 | 51 | impl Display for SortError { 52 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 53 | match &self { 54 | SortError::TempDir(err) => write!(f, "temporary directory or file not created: {}", err), 55 | SortError::ThreadPoolBuildError(err) => write!(f, "thread pool initialization failed: {}", err), 56 | SortError::IO(err) => write!(f, "I/O operation failed: {}", err), 57 | SortError::SerializationError(err) => write!(f, "data serialization error: {}", err), 58 | SortError::DeserializationError(err) => write!(f, "data deserialization error: {}", err), 59 | SortError::InputError(err) => write!(f, "input data stream error: {}", err), 60 | } 61 | } 62 | } 63 | 64 | /// External sorter builder. Provides methods for [`ExternalSorter`] initialization. 65 | #[derive(Clone)] 66 | pub struct ExternalSorterBuilder> 67 | where 68 | T: Send, 69 | E: Error, 70 | B: ChunkBufferBuilder, 71 | C: ExternalChunk, 72 | { 73 | /// Number of threads to be used to sort data in parallel. 74 | threads_number: Option, 75 | /// Directory to be used to store temporary data. 76 | tmp_dir: Option>, 77 | /// Chunk file read/write buffer size. 78 | rw_buf_size: Option, 79 | /// Chunk buffer builder. 80 | buffer_builder: B, 81 | 82 | /// External chunk type. 83 | external_chunk_type: PhantomData, 84 | /// Input item type. 85 | item_type: PhantomData, 86 | /// Input error type. 
87 | input_error_type: PhantomData, 88 | } 89 | 90 | impl ExternalSorterBuilder 91 | where 92 | T: Send, 93 | E: Error, 94 | B: ChunkBufferBuilder, 95 | C: ExternalChunk, 96 | { 97 | /// Creates an instance of a builder with default parameters. 98 | pub fn new() -> Self { 99 | ExternalSorterBuilder::default() 100 | } 101 | 102 | /// Builds an [`ExternalSorter`] instance using provided configuration. 103 | pub fn build( 104 | self, 105 | ) -> Result, SortError> { 106 | ExternalSorter::new( 107 | self.threads_number, 108 | self.tmp_dir.as_deref(), 109 | self.buffer_builder, 110 | self.rw_buf_size, 111 | ) 112 | } 113 | 114 | /// Sets number of threads to be used to sort data in parallel. 115 | pub fn with_threads_number(mut self, threads_number: usize) -> ExternalSorterBuilder { 116 | self.threads_number = Some(threads_number); 117 | return self; 118 | } 119 | 120 | /// Sets directory to be used to store temporary data. 121 | pub fn with_tmp_dir(mut self, path: &Path) -> ExternalSorterBuilder { 122 | self.tmp_dir = Some(path.into()); 123 | return self; 124 | } 125 | 126 | /// Sets buffer builder. 127 | pub fn with_buffer(mut self, buffer_builder: B) -> ExternalSorterBuilder { 128 | self.buffer_builder = buffer_builder; 129 | return self; 130 | } 131 | 132 | /// Sets chunk read/write buffer size. 133 | pub fn with_rw_buf_size(mut self, buf_size: usize) -> ExternalSorterBuilder { 134 | self.rw_buf_size = Some(buf_size); 135 | return self; 136 | } 137 | } 138 | 139 | impl Default for ExternalSorterBuilder 140 | where 141 | T: Send, 142 | E: Error, 143 | B: ChunkBufferBuilder, 144 | C: ExternalChunk, 145 | { 146 | fn default() -> Self { 147 | ExternalSorterBuilder { 148 | threads_number: None, 149 | tmp_dir: None, 150 | rw_buf_size: None, 151 | buffer_builder: B::default(), 152 | external_chunk_type: PhantomData, 153 | item_type: PhantomData, 154 | input_error_type: PhantomData, 155 | } 156 | } 157 | } 158 | 159 | /// External sorter. 
160 | pub struct ExternalSorter> 161 | where 162 | T: Send, 163 | E: Error, 164 | B: ChunkBufferBuilder, 165 | C: ExternalChunk, 166 | { 167 | /// Sorting thread pool. 168 | thread_pool: rayon::ThreadPool, 169 | /// Directory to be used to store temporary data. 170 | tmp_dir: tempfile::TempDir, 171 | /// Chunk buffer builder. 172 | buffer_builder: B, 173 | /// Chunk file read/write buffer size. 174 | rw_buf_size: Option, 175 | 176 | /// External chunk type. 177 | external_chunk_type: PhantomData, 178 | /// Input item type. 179 | item_type: PhantomData, 180 | /// Input error type. 181 | input_error_type: PhantomData, 182 | } 183 | 184 | impl ExternalSorter 185 | where 186 | T: Send, 187 | E: Error, 188 | B: ChunkBufferBuilder, 189 | C: ExternalChunk, 190 | { 191 | /// Creates a new external sorter instance. 192 | /// 193 | /// # Arguments 194 | /// * `threads_number` - Number of threads to be used to sort data in parallel. If the parameter is [`None`], 195 | /// the threads number will be selected based on available CPU core number. 196 | /// * `tmp_path` - Directory to be used to store temporary data. If the parameter is [`None`], the default OS temporary 197 | /// directory will be used. 198 | /// * `buffer_builder` - An instance of a buffer builder that will be used for chunk buffer creation. 199 | /// * `rw_buf_size` - Chunk file read/write buffer size. 
200 | pub fn new( 201 | threads_number: Option, 202 | tmp_path: Option<&Path>, 203 | buffer_builder: B, 204 | rw_buf_size: Option, 205 | ) -> Result> { 206 | return Ok(ExternalSorter { 207 | rw_buf_size, 208 | buffer_builder, 209 | thread_pool: Self::init_thread_pool(threads_number)?, 210 | tmp_dir: Self::init_tmp_directory(tmp_path)?, 211 | external_chunk_type: PhantomData, 212 | item_type: PhantomData, 213 | input_error_type: PhantomData, 214 | }); 215 | } 216 | 217 | fn init_thread_pool( 218 | threads_number: Option, 219 | ) -> Result> { 220 | let mut thread_pool_builder = rayon::ThreadPoolBuilder::new(); 221 | 222 | if let Some(threads_number) = threads_number { 223 | log::info!("initializing thread-pool (threads: {})", threads_number); 224 | thread_pool_builder = thread_pool_builder.num_threads(threads_number); 225 | } else { 226 | log::info!("initializing thread-pool (threads: default)"); 227 | } 228 | let thread_pool = thread_pool_builder 229 | .build() 230 | .map_err(|err| SortError::ThreadPoolBuildError(err))?; 231 | 232 | return Ok(thread_pool); 233 | } 234 | 235 | fn init_tmp_directory( 236 | tmp_path: Option<&Path>, 237 | ) -> Result> { 238 | let tmp_dir = if let Some(tmp_path) = tmp_path { 239 | tempfile::tempdir_in(tmp_path) 240 | } else { 241 | tempfile::tempdir() 242 | } 243 | .map_err(|err| SortError::TempDir(err))?; 244 | 245 | log::info!("using {} as a temporary directory", tmp_dir.path().display()); 246 | 247 | return Ok(tmp_dir); 248 | } 249 | 250 | /// Sorts data from the input. 251 | /// Returns an iterator that can be used to get sorted data stream. 
252 | /// 253 | /// # Arguments 254 | /// * `input` - Input stream the data is fetched from 255 | pub fn sort( 256 | &self, 257 | input: I, 258 | ) -> Result< 259 | BinaryHeapMerger Ordering + Copy, C>, 260 | SortError, 261 | > 262 | where 263 | T: Ord, 264 | I: IntoIterator>, 265 | { 266 | self.sort_by(input, T::cmp) 267 | } 268 | 269 | /// Sorts data from the input using a custom compare function. 270 | /// Returns an iterator that can be used to get sorted data stream. 271 | /// 272 | /// # Arguments 273 | /// * `input` - Input stream the data is fetched from 274 | /// * `compare` - Function to be used to compare items 275 | pub fn sort_by( 276 | &self, 277 | input: I, 278 | compare: F, 279 | ) -> Result< 280 | BinaryHeapMerger, 281 | SortError, 282 | > 283 | where 284 | I: IntoIterator>, 285 | F: Fn(&T, &T) -> Ordering + Sync + Send + Copy, 286 | { 287 | let mut chunk_buf = self.buffer_builder.build(); 288 | let mut external_chunks = Vec::new(); 289 | 290 | for item in input.into_iter() { 291 | match item { 292 | Ok(item) => chunk_buf.push(item), 293 | Err(err) => return Err(SortError::InputError(err)), 294 | } 295 | 296 | if chunk_buf.is_full() { 297 | external_chunks.push(self.create_chunk(chunk_buf, compare)?); 298 | chunk_buf = self.buffer_builder.build(); 299 | } 300 | } 301 | 302 | if chunk_buf.len() > 0 { 303 | external_chunks.push(self.create_chunk(chunk_buf, compare)?); 304 | } 305 | 306 | log::debug!("external sort preparation done"); 307 | 308 | return Ok(BinaryHeapMerger::new(external_chunks, compare)); 309 | } 310 | /// Sorts the buffer with `compare` inside the sorter's thread pool and builds an external chunk from it in the temporary directory. 311 | fn create_chunk( 312 | &self, 313 | mut buffer: impl ChunkBuffer, 314 | compare: F, 315 | ) -> Result> 316 | where 317 | F: Fn(&T, &T) -> Ordering + Sync + Send, 318 | { 319 | log::debug!("sorting chunk data ..."); 320 | self.thread_pool.install(|| { 321 | buffer.par_sort_by(compare); 322 | }); 323 | 324 | log::debug!("saving chunk data"); 325 | let external_chunk = 326 | ExternalChunk::build(&self.tmp_dir, buffer, 
self.rw_buf_size).map_err(|err| match err { 327 | ExternalChunkError::IO(err) => SortError::IO(err), 328 | ExternalChunkError::SerializationError(err) => SortError::SerializationError(err), 329 | })?; 330 | 331 | return Ok(external_chunk); 332 | } 333 | } 334 | 335 | #[cfg(test)] 336 | mod test { 337 | use std::io; 338 | use std::path::Path; 339 | 340 | use rand::seq::SliceRandom; 341 | use rstest::*; 342 | 343 | use super::{ExternalSorter, ExternalSorterBuilder, LimitedBufferBuilder}; 344 | 345 | #[rstest] 346 | #[case(false)] 347 | #[case(true)] 348 | fn test_external_sorter(#[case] reversed: bool) { 349 | let input_sorted = 0..100; 350 | 351 | let mut input_shuffled = Vec::from_iter(input_sorted.clone()); 352 | input_shuffled.shuffle(&mut rand::thread_rng()); 353 | 354 | let input: Vec> = Vec::from_iter(input_shuffled.into_iter().map(|item| Ok(item))); 355 | 356 | let sorter: ExternalSorter = ExternalSorterBuilder::new() 357 | .with_buffer(LimitedBufferBuilder::new(8, true)) 358 | .with_threads_number(2) 359 | .with_tmp_dir(Path::new("./")) 360 | .build() 361 | .unwrap(); 362 | 363 | let compare = if reversed { 364 | |a: &i32, b: &i32| a.cmp(b).reverse() 365 | } else { 366 | |a: &i32, b: &i32| a.cmp(b) 367 | }; 368 | 369 | let result = sorter.sort_by(input, compare).unwrap(); 370 | 371 | let actual_result: Result, _> = result.collect(); 372 | let actual_result = actual_result.unwrap(); 373 | let expected_result = if reversed { 374 | Vec::from_iter(input_sorted.clone().rev()) 375 | } else { 376 | Vec::from_iter(input_sorted.clone()) 377 | }; 378 | 379 | assert_eq!(actual_result, expected_result) 380 | } 381 | 382 | #[rstest] 383 | #[case(false)] 384 | #[case(true)] 385 | fn test_external_sorter_stability(#[case] reversed: bool) { 386 | let input_sorted = (0..20).flat_map(|x|(0..5).map(move |y| (x, y))); 387 | 388 | let mut input_shuffled = Vec::from_iter(input_sorted.clone()); 389 | input_shuffled.shuffle(&mut rand::thread_rng()); 390 | // sort input by the 
second field to check sorting stability 391 | input_shuffled.sort_by(|a: &(i32, i32), b: &(i32, i32)| if reversed {a.1.cmp(&b.1).reverse()} else {a.1.cmp(&b.1)}); 392 | 393 | let input: Vec> = Vec::from_iter(input_shuffled.into_iter().map(|item| Ok(item))); 394 | 395 | let sorter: ExternalSorter<(i32, i32), _> = ExternalSorterBuilder::new() 396 | .with_buffer(LimitedBufferBuilder::new(8, true)) 397 | .with_threads_number(2) 398 | .with_tmp_dir(Path::new("./")) 399 | .build() 400 | .unwrap(); 401 | 402 | let compare = if reversed { 403 | |a: &(i32, i32), b: &(i32, i32)| a.0.cmp(&b.0).reverse() 404 | } else { 405 | |a: &(i32, i32), b: &(i32, i32)| a.0.cmp(&b.0) 406 | }; 407 | 408 | let result = sorter.sort_by(input, compare).unwrap(); 409 | 410 | let actual_result: Result, _> = result.collect(); 411 | let actual_result = actual_result.unwrap(); 412 | let expected_result = if reversed { 413 | Vec::from_iter(input_sorted.clone().rev()) 414 | } else { 415 | Vec::from_iter(input_sorted.clone()) 416 | }; 417 | 418 | assert_eq!(actual_result, expected_result) 419 | } 420 | } 421 | --------------------------------------------------------------------------------