├── crates ├── lsio_bench │ ├── README.md │ ├── Cargo.toml │ └── src │ │ └── main.rs ├── lsio_io │ ├── README.md │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── lsio_uring │ ├── README.md │ ├── src │ │ ├── lib.rs │ │ ├── opcode.rs │ │ ├── close.rs │ │ ├── io_uring.rs │ │ ├── user_data.rs │ │ ├── open_file.rs │ │ ├── get_range.rs │ │ ├── tracker.rs │ │ ├── operation.rs │ │ ├── get_ranges.rs │ │ ├── sqe.rs │ │ └── worker.rs │ ├── benches │ │ ├── fio.ini │ │ └── get.rs │ ├── Cargo.toml │ └── tests │ │ └── integration_test.rs ├── lsio_threadpool │ ├── src │ │ ├── lib.rs │ │ ├── shared_state.rs │ │ ├── park_manager.rs │ │ ├── worker.rs │ │ └── threadpool.rs │ ├── Cargo.toml │ └── README.md └── lsio_aligned_bytes │ ├── Cargo.toml │ ├── README.md │ └── src │ └── lib.rs ├── Cargo.toml ├── LICENSE ├── .gitignore ├── planned_design.md └── README.md /crates/lsio_bench/README.md: -------------------------------------------------------------------------------- 1 | Benchmark LSIO. 2 | 3 | -------------------------------------------------------------------------------- /crates/lsio_io/README.md: -------------------------------------------------------------------------------- 1 | Provides a common framework for all LSIO IO backends. 2 | -------------------------------------------------------------------------------- /crates/lsio_uring/README.md: -------------------------------------------------------------------------------- 1 | LSIO's IO backend for [io_uring](https://en.wikipedia.org/wiki/Io_uring). 2 | 3 | -------------------------------------------------------------------------------- /crates/lsio_threadpool/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | 3 | mod park_manager; 4 | mod shared_state; 5 | mod threadpool; 6 | mod worker; 7 | 8 | pub use threadpool::ThreadPool; 9 | pub use worker::WorkerThread; 10 | -------------------------------------------------------------------------------- /crates/lsio_threadpool/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lsio_threadpool" 3 | version = "0.0.0" 4 | publish = false 5 | edition.workspace = true 6 | license.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | readme = "README.md" 10 | authors.workspace = true 11 | 12 | [dependencies] 13 | crossbeam-deque.workspace = true 14 | -------------------------------------------------------------------------------- /crates/lsio_threadpool/README.md: -------------------------------------------------------------------------------- 1 | `lsio_threadpool` provides a simple [work stealing](https://en.wikipedia.org/wiki/Work_stealing) threadpool. 2 | 3 | `lsio_threadpool` is a fairly minimal wrapper around [`crossbeam_deque`]. The vast bulk of the fiddly, low-level implementation of work stealing is provided by [`crossbeam_deque`]! 4 | 5 | To get started, please read the documentation for [`ThreadPool::new`]. 
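The sketch below shows the general shape of a worker closure (illustrative only; the exact API is documented on [`ThreadPool::new`] and [`WorkerThread`], and the task type `u64` plus the closure body here are hypothetical):

```rust
use lsio_threadpool::{ThreadPool, WorkerThread};

// Spawn 4 worker threads. Each worker loops: find a task (popped from its
// local queue, or stolen from the global injector or another worker),
// process it, and park when there is nothing to do.
let pool = ThreadPool::new(4, |worker: WorkerThread<u64>| {
    while worker.keep_running() {
        match worker.find_task() {
            Some(task) => println!("processing task {task}"),
            None => worker.park(), // Unparked automatically when new tasks arrive.
        }
    }
});

// Tasks pushed onto the pool are picked up (or stolen) by the workers.
pool.push(42);
```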
6 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | 3 | pub(crate) mod close; 4 | pub(crate) mod get_range; 5 | pub(crate) mod get_ranges; 6 | pub(crate) mod io_uring; 7 | pub(crate) mod opcode; 8 | pub(crate) mod open_file; 9 | pub(crate) mod operation; 10 | pub(crate) mod sqe; 11 | pub(crate) mod tracker; 12 | pub(crate) mod user_data; 13 | pub(crate) mod worker; 14 | 15 | pub use io_uring::IoUring; 16 | -------------------------------------------------------------------------------- /crates/lsio_aligned_bytes/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lsio_aligned_bytes" 3 | version = "0.0.1" # Maybe we will publish this as a stand-alone crate. 4 | edition.workspace = true 5 | license.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | readme = "README.md" 9 | authors.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | anyhow.workspace = true 15 | -------------------------------------------------------------------------------- /crates/lsio_bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lsio_bench" 3 | version = "0.0.0" 4 | description = "Benchmark LSIO." 5 | publish = false 6 | edition.workspace = true 7 | license.workspace = true 8 | homepage.workspace = true 9 | repository.workspace = true 10 | readme = "README.md" 11 | authors.workspace = true 12 | 13 | [dependencies] 14 | clap = { version = "4.5.4", features = ["derive"] } 15 | indicatif = "0.17.8" 16 | lsio_uring = { path = "../lsio_uring" } 17 | lsio_io = { path = "../lsio_io" } 18 | -------------------------------------------------------------------------------- /crates/lsio_io/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lsio_io" 3 | version = "0.0.0" 4 | publish = false 5 | edition.workspace = true 6 | license.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | readme = "README.md" 10 | authors.workspace = true 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | 14 | [dependencies] 15 | anyhow = { workspace = true } 16 | lsio_aligned_bytes = { path = "../lsio_aligned_bytes" } 17 | crossbeam-channel = { workspace = true } 18 | 19 | -------------------------------------------------------------------------------- /crates/lsio_uring/benches/fio.ini: -------------------------------------------------------------------------------- 1 | [global] 2 | nrfiles=1000 3 | filesize=256Ki 4 | direct=1 5 | iodepth=64 6 | ioengine=io_uring 7 | numjobs=1 8 | thread=1 9 | directory=/tmp/fio 10 | registerfiles=1 11 | sqthread_poll=1 12 | fixedbufs=1 13 | 14 | [sequential_read_1000_files_each_256KiB] 15 | readwrite=read 16 | blocksize=256Ki 17 | 18 | [read_1000_files_each_256KiB_with_gaps] 19 | wait_for=sequential_read_1000_files_each_256KiB 20 | readwrite=read:32Ki 21 | blocksize=4Ki 22 | 23 | [read_1_file_of_1GiB_with_gaps] 24 | wait_for=read_1000_files_each_256KiB_with_gaps 25 | readwrite=read:64Ki 26 | nrfiles=1 27 | filesize=1Gi 28 | blocksize=4Ki 29 | 30 | [rand_read_1GiB_file] 31 | 
wait_for=read_1_file_of_1GiB_with_gaps 32 | readwrite=randread 33 | nrfiles=1 34 | filesize=1Gi 35 | blocksize=4Ki 36 | -------------------------------------------------------------------------------- /crates/lsio_uring/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lsio_uring" 3 | version = "0.0.0" 4 | publish = false 5 | authors = { workspace = true } 6 | edition = { workspace = true } 7 | homepage = { workspace = true } 8 | repository = { workspace = true } 9 | license = { workspace = true } 10 | readme = "README.md" 11 | 12 | [dependencies] 13 | lsio_aligned_bytes = { path = "../lsio_aligned_bytes" } 14 | lsio_io = { path = "../lsio_io" } 15 | lsio_threadpool = { path = "../lsio_threadpool" } 16 | anyhow = { workspace = true } 17 | crossbeam-channel = { workspace = true } 18 | io-uring = { workspace = true } 19 | libc = { workspace = true } 20 | nix = { workspace = true } 21 | 22 | [dev-dependencies] 23 | criterion = { workspace = true } 24 | tempfile = { workspace = true } 25 | rand = { workspace = true } 26 | 27 | [[bench]] # Yes, this is supposed to have double square brackets! 28 | name = "get" 29 | harness = false 30 | 31 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/opcode.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use io_uring::opcode; 4 | 5 | /// Simple wrapper around io_uring opcode::*::CODE; 6 | #[derive(PartialEq)] 7 | pub(crate) struct OpCode(u8); 8 | 9 | impl OpCode { 10 | pub(crate) const fn new(op: u8) -> Self { 11 | Self(op) 12 | } 13 | 14 | pub(crate) fn name(&self) -> &'static str { 15 | match self.0 { 16 | opcode::OpenAt::CODE => "openat", 17 | opcode::Read::CODE => "read", 18 | opcode::Close::CODE => "close", 19 | _ => "Un-recognised opcode", 20 | } 21 | } 22 | 23 | pub(crate) fn value(&self) -> u8 { 24 | self.0 25 | } 26 | } 27 | 28 | impl fmt::Debug for OpCode { 29 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 30 | f.debug_tuple("OpCode") 31 | .field(&self.0) 32 | .field(&self.name()) 33 | .finish() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["crates/*"] 3 | resolver = "2" 4 | 5 | [workspace.package] 6 | edition = "2021" 7 | license = "MIT" 8 | homepage = "https://github.com/JackKelly/light-speed-io" 9 | repository = "https://github.com/JackKelly/light-speed-io" 10 | readme = "README.md" 11 | authors=["Jack Kelly "] 12 | 13 | 14 | [workspace.dependencies] 15 | anyhow = "1.0.83" 16 | bytes = "1.6.0" 17 | criterion = { version = "0.5.1", features = ["html_reports", "async_tokio"] } 18 | crossbeam-deque = "0.8.5" 19 | crossbeam-channel = "0.5.12" 20 | io-uring = "0.6.4" 21 | libc = "0.2.153" # Used for filesystem flags 22 | nix = { version = "0.28.0", features = ["fs"] } 23 | object_store = "0.10.1" 24 | snafu = "0.8.2" 25 | tokio = { version = "1.37.0", features = ["rt-multi-thread"]} 26 | url = "2.5.0" 27 | tempfile = "3.10" 28 | rand = "0.8" 29 | 30 | [profile.bench] 31 | debug = true # Enable debuginfo when profiling with cargo flamegraph. 
32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-2024 Jack Kelly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/close.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use lsio_threadpool::WorkerThread; 4 | 5 | use crate::{ 6 | open_file::OpenFile, 7 | operation::{NextStep, Operation, UringOperation}, 8 | sqe::build_close_sqe, 9 | }; 10 | 11 | #[derive(Debug)] 12 | pub(crate) struct Close { 13 | file: Arc<OpenFile>, 14 | } 15 | 16 | impl Close { 17 | pub(crate) fn new(file: Arc<OpenFile>) -> Self { 18 | Self { file } 19 | } 20 | } 21 | 22 | impl UringOperation for Close { 23 | fn submit_first_step( 24 | &mut self, 25 | index_of_op: usize, 26 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 27 | ) -> Result<(), io_uring::squeue::PushError> { 28 | let entry = build_close_sqe(index_of_op, *self.file.file_descriptor()); 29 | unsafe { local_uring_submission_queue.push(&entry) } 30 | } 31 | 32 | fn process_opcode_and_submit_next_step( 33 | &mut self, 34 | idx_and_opcode: &crate::user_data::UringUserData, 35 | _cqe_result: i32, 36 | _local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 37 | _worker_thread: &WorkerThread<Operation>, 38 | _output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>, 39 | ) -> NextStep { 40 | if idx_and_opcode.opcode().value() != io_uring::opcode::Close::CODE { 41 | panic!("Unrecognised opcode!"); 42 | } 43 | NextStep::Done 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /crates/lsio_threadpool/src/shared_state.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{ 2 | atomic::{AtomicBool, Ordering::Relaxed}, 3 | mpsc, Arc, 4 | }; 5 | 6 | use crossbeam_deque as deque; 7 | 8 | use crate::park_manager::ParkManagerCommand; 9 | 10 | /// `ThreadPool` owns a `SharedState`, and each `WorkerThread` owns a cloned `SharedState`. 
11 | #[derive(Debug)] 12 | pub(crate) struct SharedState<T> 13 | where 14 | T: Send, 15 | { 16 | pub(crate) injector: Arc<deque::Injector<T>>, 17 | pub(crate) keep_running: Arc<AtomicBool>, 18 | pub(crate) chan_to_park_manager: mpsc::Sender<ParkManagerCommand>, 19 | pub(crate) at_least_one_thread_is_parked: Arc<AtomicBool>, 20 | } 21 | 22 | impl<T> SharedState<T> 23 | where 24 | T: Send, 25 | { 26 | pub(crate) fn unpark_at_most_n_threads(&self, n: u32) { 27 | if self.at_least_one_thread_is_parked.load(Relaxed) { 28 | self.chan_to_park_manager 29 | .send(ParkManagerCommand::WakeAtMostNThreads(n)) 30 | .unwrap(); 31 | } 32 | } 33 | } 34 | 35 | impl<T> Clone for SharedState<T> 36 | where 37 | T: Send, 38 | { 39 | fn clone(&self) -> Self { 40 | Self { 41 | injector: Arc::clone(&self.injector), 42 | keep_running: Arc::clone(&self.keep_running), 43 | chan_to_park_manager: self.chan_to_park_manager.clone(), 44 | at_least_one_thread_is_parked: Arc::clone(&self.at_least_one_thread_is_parked), 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/io_uring.rs: -------------------------------------------------------------------------------- 1 | use std::{ffi::CString, os::unix::ffi::OsStrExt}; 2 | 3 | use crate::get_ranges::GetRanges; 4 | use crate::operation::Operation; 5 | use crate::worker::UringWorker; 6 | use lsio_io::{Completion, Output, Reader}; 7 | use lsio_threadpool::{ThreadPool, WorkerThread}; 8 | 9 | pub struct IoUring { 10 | threadpool: ThreadPool<Operation>, 11 | output_rx: crossbeam_channel::Receiver<anyhow::Result<Output>>, 12 | } 13 | 14 | impl IoUring { 15 | pub fn new(n_worker_threads: usize) -> Self { 16 | let (output_tx, output_rx) = crossbeam_channel::bounded(1_024); 17 | Self { 18 | threadpool: ThreadPool::new( 19 | n_worker_threads, 20 | move |worker_thread: WorkerThread<Operation>| { 21 | let mut uring_worker = UringWorker::new(worker_thread, output_tx.clone()); 22 | uring_worker.run(); 23 | }, 24 | ), 25 | output_rx, 26 | } 27 | } 28 | } 29 | 30 | impl Completion for IoUring { 31 | fn completion(&self) -> &crossbeam_channel::Receiver<anyhow::Result<Output>> { 32 | &self.output_rx 33 | } 34 | } 35 | 36 | impl Reader for IoUring { 37 | fn get_ranges( 38 | &mut self, 39 | location: &std::path::Path, 40 | ranges: Vec<std::ops::Range<isize>>, 41 | user_data: Vec<u64>, 42 | ) -> anyhow::Result<()> { 43 | let location = CString::new(location.as_os_str().as_bytes()) 44 | .unwrap_or_else(|_| panic!("Failed to convert path {location:?} to CString")); 45 | let task = Operation::GetRanges(GetRanges::new(location, ranges, user_data)); 46 | self.threadpool.push(task); 47 | Ok(()) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/user_data.rs: -------------------------------------------------------------------------------- 1 | use crate::opcode::OpCode; 2 | 3 | /// The u64 io_uring user_data represents the index_of_op in the left-most 32 bits, 4 | /// and the io_uring opcode CODE (a u8) in the right-most 8 bits. 
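/// For example (illustrative values only): packing `index_of_op = 5` with the
/// `Read` opcode yields the u64 `(5 << 32) | (io_uring::opcode::Read::CODE as u64)`,
/// as exercised by the round-trip test at the bottom of this file.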
5 | #[derive(Debug)] 6 | pub(crate) struct UringUserData { 7 | index_of_op: u32, 8 | op: OpCode, 9 | } 10 | 11 | impl UringUserData { 12 | pub(crate) fn new(index_of_op: usize, op: u8) -> Self { 13 | Self { 14 | index_of_op: index_of_op.try_into().unwrap(), 15 | op: OpCode::new(op), 16 | } 17 | } 18 | 19 | pub(crate) const fn index_of_op(&self) -> u32 { 20 | self.index_of_op 21 | } 22 | 23 | pub(crate) const fn opcode(&self) -> &OpCode { 24 | &self.op 25 | } 26 | } 27 | 28 | impl From<u64> for UringUserData { 29 | fn from(value: u64) -> Self { 30 | let index_of_op: u32 = (value >> 32).try_into().unwrap(); 31 | let op = OpCode::new((value & 0xFF).try_into().unwrap()); 32 | Self { index_of_op, op } 33 | } 34 | } 35 | 36 | impl Into<u64> for UringUserData { 37 | fn into(self) -> u64 { 38 | let index_of_op: u64 = (self.index_of_op as u64) << 32; 39 | index_of_op | self.op.value() as u64 40 | } 41 | } 42 | 43 | #[cfg(test)] 44 | mod tests { 45 | use super::*; 46 | 47 | #[test] 48 | fn test_uring_user_data_round_trip() { 49 | const INDEX: usize = 100; 50 | const OPCODE: u8 = io_uring::opcode::Read::CODE; 51 | let uring_user_data = UringUserData::new(INDEX, OPCODE); 52 | let user_data_u64: u64 = uring_user_data.into(); 53 | let uring_user_data = UringUserData::from(user_data_u64); 54 | assert_eq!(uring_user_data.index_of_op, INDEX as u32); 55 | assert_eq!(uring_user_data.op, OpCode::new(OPCODE)); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /crates/lsio_threadpool/src/park_manager.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::VecDeque, 3 | sync::{ 4 | atomic::{AtomicBool, Ordering::Relaxed}, 5 | mpsc::{self, RecvError}, 6 | Arc, 7 | }, 8 | thread, 9 | }; 10 | 11 | pub(crate) enum ParkManagerCommand { 12 | WakeAtMostNThreads(u32), 13 | ThreadIsParked(thread::Thread), 14 | Stop, 15 | } 16 | 17 | pub(crate) struct ParkManager { 18 | rx: mpsc::Receiver<ParkManagerCommand>, 19 | at_least_one_thread_is_parked: Arc<AtomicBool>, 20 | parked_threads: VecDeque<thread::Thread>, 21 | } 22 | 23 | impl ParkManager { 24 | pub(crate) fn start( 25 | rx: mpsc::Receiver<ParkManagerCommand>, 26 | at_least_one_thread_is_parked: Arc<AtomicBool>, 27 | n_worker_threads: usize, 28 | ) -> thread::JoinHandle<()> { 29 | let mut park_manager = Self { 30 | rx, 31 | at_least_one_thread_is_parked, 32 | parked_threads: VecDeque::with_capacity(n_worker_threads), 33 | }; 34 | thread::Builder::new() 35 | .name("ParkManager".to_string()) 36 | .spawn(move || park_manager.main_loop()) 37 | .expect("Failed to spawn the ParkManager thread!") 38 | } 39 | 40 | fn main_loop(&mut self) { 41 | use ParkManagerCommand::*; 42 | loop { 43 | match self.rx.recv() { 44 | Ok(cmd) => match cmd { 45 | ThreadIsParked(t) => self.thread_is_parked(t), 46 | WakeAtMostNThreads(n) => self.wake_at_most_n_threads(n), 47 | Stop => break, 48 | }, 49 | Err(RecvError) => break, 50 | } 51 | } 52 | } 53 | 54 | fn thread_is_parked(&mut self, t: thread::Thread) { 55 | self.at_least_one_thread_is_parked.store(true, Relaxed); 56 | debug_assert!(!self.parked_threads.iter().any(|pt| pt.id() == t.id())); 57 | self.parked_threads.push_back(t); 58 | } 59 | 60 | fn wake_at_most_n_threads(&mut self, n: u32) { 61 | for _ in 0..n { 62 | match self.parked_threads.pop_front() { 63 | Some(thread) => thread.unpark(), 64 | None => break, 65 | } 66 | } 67 | if self.parked_threads.is_empty() { 68 | self.at_least_one_thread_is_parked.store(false, Relaxed); 69 | } 70 | } 71 | } 72 | 
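// A sketch (not part of the original file) of how the park/unpark protocol can
// be exercised. It relies only on the commands defined above, plus the fact
// that `Thread::unpark` stores a token, so the `thread::park()` below cannot
// deadlock regardless of whether the wake is processed before or after we park.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_wake_parked_thread() {
        let (tx, rx) = mpsc::channel();
        let parked_flag = Arc::new(AtomicBool::new(false));
        let handle = ParkManager::start(rx, Arc::clone(&parked_flag), 1);
        // Register this thread as "parked", then ask for one thread to be woken.
        tx.send(ParkManagerCommand::ThreadIsParked(thread::current()))
            .unwrap();
        tx.send(ParkManagerCommand::WakeAtMostNThreads(1)).unwrap();
        // Returns immediately once the unpark token from the wake arrives.
        thread::park();
        tx.send(ParkManagerCommand::Stop).unwrap();
        handle.join().unwrap();
    }
}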
-------------------------------------------------------------------------------- /crates/lsio_uring/src/open_file.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::CString; 2 | 3 | #[derive(Debug)] 4 | pub(crate) struct OpenFile { 5 | location: CString, 6 | file_descriptor: io_uring::types::Fd, 7 | /// The file size in bytes. 8 | /// Note that we always have to `statx` the file to get the `alignment`, so we'll always get 9 | /// the file size, too. 10 | size: u64, 11 | alignment: u32, 12 | } 13 | 14 | impl OpenFile { 15 | pub(crate) fn file_descriptor(&self) -> &io_uring::types::Fd { 16 | &self.file_descriptor 17 | } 18 | 19 | pub(crate) fn size(&self) -> u64 { 20 | self.size 21 | } 22 | 23 | pub(crate) fn alignment(&self) -> u32 { 24 | self.alignment 25 | } 26 | } 27 | 28 | /// Used to build an [`OpenFile`]. 29 | #[derive(Debug)] 30 | pub(crate) struct OpenFileBuilder { 31 | location: CString, 32 | file_descriptor: Option<io_uring::types::Fd>, 33 | statx: libc::statx, 34 | assume_statx_is_initialised: bool, 35 | } 36 | 37 | impl OpenFileBuilder { 38 | pub(crate) fn new(location: CString) -> Self { 39 | Self { 40 | location, 41 | file_descriptor: None, 42 | statx: unsafe { std::mem::zeroed() }, 43 | assume_statx_is_initialised: false, 44 | } 45 | } 46 | 47 | pub(crate) const fn location(&self) -> &CString { 48 | &self.location 49 | } 50 | 51 | pub(crate) fn set_file_descriptor(&mut self, file_descriptor: io_uring::types::Fd) { 52 | self.file_descriptor = Some(file_descriptor); 53 | } 54 | 55 | pub(crate) fn get_statx_ptr(&mut self) -> *mut libc::statx { 56 | &mut self.statx as *mut libc::statx 57 | } 58 | 59 | pub(crate) unsafe fn assume_statx_is_initialised(&mut self) { 60 | self.assume_statx_is_initialised = true; 61 | } 62 | 63 | pub(crate) fn is_ready(&self) -> bool { 64 | self.file_descriptor.is_some() && self.assume_statx_is_initialised 65 | } 66 | 67 | /// Safety: [`Self::is_ready`] must return `true` before calling `build`! 68 | /// Panics: If `build` is called while [`Self::is_ready`] is still false. 69 | pub(crate) fn build(self) -> OpenFile { 70 | assert!(self.is_ready()); 71 | OpenFile { 72 | location: self.location, 73 | file_descriptor: self.file_descriptor.unwrap(), 74 | size: self.statx.stx_size, 75 | alignment: self.statx.stx_dio_mem_align, 76 | // TODO: Maybe also use `statx.stx_dio_offset_align`. 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /crates/lsio_io/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | 3 | use lsio_aligned_bytes::AlignedBytes; 4 | use std::ops::Range; 5 | 6 | // TODO: Consider how to *group* instructions, such that LSIO guarantees that all operations in 7 | // group _n_ will be completed before any operations in group _n+1_ are started. See: 8 | // https://github.com/JackKelly/light-speed-io/issues/68 9 | 10 | /// All IO backends must expose their completion queue. 11 | pub trait Completion { 12 | fn completion(&self) -> &crossbeam_channel::Receiver<anyhow::Result<Output>>; 13 | } 14 | 15 | /// Methods for IO backends that can read from IO. 16 | pub trait Reader { 17 | /// Submit a GetRanges operation. 18 | /// 19 | /// `ranges` specify the byte ranges to read. Negative numbers are relative to the filesize. 20 | /// (Like indexing lists in Python.) For example: 21 | /// 0..-1 The entire file. 22 | /// 0..100 The first 100 bytes. 23 | /// -100..-1 The last 100 bytes. 
24 | /// 25 | /// `user_data` is used to identify each byte_range. 26 | /// One `user_data` instance per byte_range. 27 | /// For example, in Zarr, this would be used to identify the 28 | /// location at which this chunk appears in the merged array. 29 | /// 30 | /// # Errors: 31 | /// If the user submits a `get_ranges` operation with an invalid filename then 32 | /// the user will receive a single `std::io::Error(std::io::ErrorKind::NotFound)` with context 33 | /// that describes the filename that failed. If a subset of the `ranges` results in an error 34 | /// (e.g. reading beyond end of the file) then the user will receive a mixture of `Ok(Output)` 35 | /// and `Err`, where the `Err` will include context such as the filename and byte range. 36 | fn get_ranges( 37 | &mut self, 38 | // We take ownership because this function returns immediately. If we used references then 39 | // there would be nothing to stop the user from dropping the owned objects (and 40 | // invalidating the references!). 41 | location: &std::path::Path, 42 | ranges: Vec<Range<isize>>, 43 | user_data: Vec<u64>, 44 | ) -> anyhow::Result<()>; 45 | } 46 | 47 | /// `Chunk` is used throughout the LSIO stack. It is passed from the I/O layer to 48 | /// the compute layer, and to the application layer. (To be more precise: `Result` is usually 49 | /// what is passed around!). 50 | #[derive(Debug)] 51 | pub struct Chunk { 52 | pub buffer: AlignedBytes, 53 | /// `user_data` can be used to uniquely identify each chunk, for example by providing an index 54 | /// into an array that provides more information about each chunk. 55 | pub user_data: u64, 56 | } 57 | 58 | /// Holds the data that is output from each IO operation. 59 | #[derive(Debug)] 60 | pub enum Output { 61 | Chunk(Chunk), 62 | // Other variants could be: 63 | // `BytesWritten`, `Listing(Vec)`, etc. 64 | } 65 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/get_range.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | close::Close, 3 | open_file::OpenFile, 4 | operation::{NextStep, Operation, UringOperation}, 5 | sqe::build_read_range_sqe, 6 | user_data::UringUserData, 7 | }; 8 | use lsio_aligned_bytes::AlignedBytes; 9 | use lsio_io::{Chunk, Output}; 10 | use lsio_threadpool::WorkerThread; 11 | use std::{ops::Range, sync::Arc}; 12 | 13 | #[derive(Debug)] 14 | pub(crate) struct GetRange { 15 | file: Arc<OpenFile>, // TODO: Replace Arc with Atomic counter? 16 | range: Range<isize>, 17 | user_data: u64, 18 | buffer: Option<AlignedBytes>, // This is an `Option` so we can `take` it. 19 | } 20 | 21 | impl GetRange { 22 | pub(crate) fn new(file: Arc<OpenFile>, range: Range<isize>, user_data: u64) -> Self { 23 | // TODO: Split reads of more than 2 GiB into multiple smaller reads! See issue #99. 24 | if range.len() > 2_147_479_552 { 25 | panic!( 26 | "`read` will transfer at most 2 GiB but {} bytes were requested. \ 27 | See https://github.com/JackKelly/light-speed-io/issues/99", 28 | range.len() 29 | ); 30 | } 31 | Self { 32 | file, 33 | range, 34 | user_data, 35 | buffer: None, 36 | } 37 | } 38 | } 39 | 40 | impl UringOperation for GetRange { 41 | /// This method assumes that the file has already been opened (by the [`GetRanges`] operation). 
42 | fn submit_first_step( 43 | &mut self, 44 | index_of_op: usize, 45 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 46 | ) -> Result<(), io_uring::squeue::PushError> { 47 | let (entry, buffer) = build_read_range_sqe(index_of_op, &self.file, &self.range); 48 | self.buffer = Some(buffer); 49 | unsafe { local_uring_submission_queue.push(&entry) } // TODO: Does `entry` have to stay 50 | // alive for longer? 51 | } 52 | 53 | fn process_opcode_and_submit_next_step( 54 | &mut self, 55 | idx_and_opcode: &UringUserData, 56 | cqe_result: i32, 57 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 58 | _worker_thread: &WorkerThread<Operation>, 59 | output_channel: &mut crossbeam_channel::Sender<anyhow::Result<Output>>, 60 | ) -> NextStep { 61 | // Check that the opcode of the CQE is what we expected: 62 | if idx_and_opcode.opcode().value() != io_uring::opcode::Read::CODE { 63 | panic!("Unrecognised opcode!"); 64 | } 65 | if cqe_result >= 0 { 66 | // TODO: Check we've read the correct number of bytes: 67 | // Check `cqe_result_value == self.buffer.len()`. 68 | // TODO: Retry if we read less data than requested! See issue #100. 69 | 70 | output_channel 71 | .send(Ok(Output::Chunk(Chunk { 72 | buffer: self.buffer.take().unwrap(), 73 | user_data: self.user_data, 74 | }))) 75 | .unwrap(); 76 | }; 77 | // Check if it's time to close the file: 78 | if Arc::strong_count(&self.file) == 1 { 79 | // We're the last operation on this file, so it's time to close this file. 80 | let mut close_op = Close::new(Arc::clone(&self.file)); 81 | close_op 82 | .submit_first_step( 83 | idx_and_opcode.index_of_op() as _, 84 | local_uring_submission_queue, 85 | ) 86 | .unwrap(); 87 | NextStep::ReplaceWith(Operation::Close(close_op)) 88 | } else { 89 | NextStep::Done 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /crates/lsio_threadpool/src/worker.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | iter, 3 | sync::{atomic::Ordering::Relaxed, Arc}, 4 | thread, 5 | }; 6 | 7 | use crossbeam_deque as deque; 8 | 9 | use crate::{park_manager::ParkManagerCommand, shared_state::SharedState}; 10 | 11 | /// Provides methods that allow user-defined closures to find new tasks to work on, 12 | /// submit new tasks, park this thread, and check if the closure should continue looping. 13 | /// 14 | /// Users do not construct `WorkerThread`s. Instead, [`ThreadPool::new`](crate::ThreadPool::new) 15 | /// creates one `WorkerThread` per thread, and passes that thread's `WorkerThread` to the 16 | /// user-supplied closure for that thread. 17 | pub struct WorkerThread<T> 18 | where 19 | T: Send, 20 | { 21 | shared: SharedState<T>, 22 | 23 | /// Queues for implementing work-stealing: 24 | local_queue: deque::Worker<T>, 25 | stealers: Arc<Vec<deque::Stealer<T>>>, 26 | } 27 | 28 | impl<T> WorkerThread<T> 29 | where 30 | T: Send, 31 | { 32 | pub(crate) fn new( 33 | shared: SharedState<T>, 34 | local_queue: deque::Worker<T>, 35 | stealers: Arc<Vec<deque::Stealer<T>>>, 36 | ) -> Self { 37 | Self { 38 | shared, 39 | local_queue, 40 | stealers, 41 | } 42 | } 43 | 44 | /// Get the next task to work on. This function never blocks. 45 | pub fn find_task(&self) -> Option<T> { 46 | // Adapted from https://docs.rs/crossbeam-deque/latest/crossbeam_deque/#examples 47 | 48 | // Pop a task from the local queue, if not empty. 49 | self.local_queue.pop().or_else(|| { 50 | // Otherwise, we need to look for a task elsewhere. 51 | iter::repeat_with(|| { 52 | // Try stealing a batch of tasks from the global queue. 
53 | self.shared 54 | .injector 55 | .steal_batch_and_pop(&self.local_queue) 56 | // Or try stealing a task from one of the other threads. 57 | .or_else(|| self.stealers.iter().map(|s| s.steal()).collect()) 58 | }) 59 | // Loop while no task was stolen and any steal operation needs to be retried. 60 | .find(|s| !s.is_retry()) 61 | // Extract the stolen task, if there is one. 62 | .and_then(|s| s.success()) 63 | }) 64 | } 65 | 66 | /// Returns true if the task should keep running. 67 | pub fn keep_running(&self) -> bool { 68 | self.shared.keep_running.load(Relaxed) 69 | } 70 | 71 | /// Park this thread. 72 | /// 73 | /// Before parking, this function will register this thread with the `ParkManager` 74 | /// so that this thread can be automatically unparked when necessary. 75 | pub fn park(&self) { 76 | self.shared 77 | .chan_to_park_manager 78 | .send(ParkManagerCommand::ThreadIsParked(thread::current())) 79 | .unwrap_or_else(|e| { 80 | panic!( 81 | "failed to send ThreadIsParked({:?}) message to ParkManager! {e:?}", 82 | thread::current(), 83 | ) 84 | }); 85 | thread::park(); 86 | } 87 | 88 | /// Push a task onto this thread's local queue of tasks. 89 | /// 90 | /// Tasks on the local queue may be stolen by other threads! 91 | pub fn push(&self, task: T) { 92 | self.local_queue.push(task); 93 | self.maybe_unpark_other_threads(); 94 | } 95 | 96 | fn maybe_unpark_other_threads(&self) { 97 | let n = self.local_queue.len(); 98 | if n > 1 { 99 | self.shared.unpark_at_most_n_threads(n as _); 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/tracker.rs: -------------------------------------------------------------------------------- 1 | use std::collections::VecDeque; 2 | 3 | pub(crate) struct Tracker<T> { 4 | pub(crate) ops_in_flight: Vec<Option<T>>, 5 | pub(crate) next_index: VecDeque<usize>, 6 | len: usize, 7 | } 8 | 9 | impl<T> Tracker<T> { 10 | pub(crate) fn new(n: usize) -> Self { 11 | Self { 12 | ops_in_flight: (0..n).map(|_| None).collect(), 13 | next_index: (0..n).collect(), 14 | len: 0, 15 | } 16 | } 17 | 18 | pub(crate) fn get_next_index(&mut self) -> Option<usize> { 19 | self.next_index.pop_front() 20 | } 21 | 22 | pub(crate) fn put(&mut self, index: usize, op: T) { 23 | self.ops_in_flight[index].replace(op); 24 | self.len += 1; 25 | } 26 | 27 | pub(crate) fn get(&mut self, index: usize) -> Option<TrackerGuard<T>> { 28 | if self.ops_in_flight[index].is_none() { 29 | None 30 | } else { 31 | Some(TrackerGuard { 32 | index, 33 | tracker: self, 34 | }) 35 | } 36 | } 37 | 38 | pub(crate) fn is_empty(&self) -> bool { 39 | self.len == 0 40 | } 41 | 42 | pub(crate) fn is_full(&self) -> bool { 43 | self.next_index.is_empty() 44 | } 45 | } 46 | 47 | pub(crate) struct TrackerGuard<'a, T> { 48 | index: usize, 49 | tracker: &'a mut Tracker<T>, 50 | } 51 | 52 | impl<'a, T> TrackerGuard<'a, T> { 53 | /// Safety: If TrackerGuard exists, then we know that `self.index` is valid. 54 | /// So `as_mut` can never fail. 
55 | pub(crate) fn as_mut(&mut self) -> &mut T { 56 | self.tracker.ops_in_flight[self.index].as_mut().unwrap() 57 | } 58 | 59 | pub(crate) fn remove(&mut self) -> T { 60 | self.tracker.next_index.push_back(self.index); 61 | self.tracker.len -= 1; 62 | self.tracker.ops_in_flight[self.index].take().unwrap() 63 | } 64 | 65 | pub(crate) fn replace(&mut self, op: T) { 66 | self.tracker.ops_in_flight[self.index].replace(op); 67 | } 68 | } 69 | 70 | #[cfg(test)] 71 | mod tests { 72 | use super::*; 73 | 74 | #[test] 75 | fn test_op_tracker() { 76 | let mut tracker = Tracker::new(2); 77 | 78 | // Check that removing an item before inserting an item returns None. 79 | assert!(tracker.get(0).is_none()); 80 | 81 | // Put one string into the tracker, and then remove that string. 82 | let i0 = tracker.get_next_index().unwrap(); 83 | assert_eq!(i0, 0); 84 | let s0 = "string0".to_string(); 85 | tracker.put(i0, s0.clone()); 86 | assert_eq!(tracker.get(i0).unwrap().remove(), s0); 87 | // The tracker is now empty. 88 | 89 | // Put another string into the tracker. Don't remove it yet. 90 | let i1 = tracker.get_next_index().unwrap(); 91 | assert_eq!(i1, 1); 92 | let s1 = "string1".to_string(); 93 | tracker.put(i1, s1.clone()); 94 | 95 | // Put another string into the tracker. Don't remove it yet. 96 | let i2 = tracker.get_next_index().unwrap(); 97 | assert_eq!(i2, 0); 98 | let s2 = "string2".to_string(); 99 | tracker.put(i2, s2.clone()); 100 | 101 | // Check that we can't put any more strings into tracker 102 | assert!(tracker.get_next_index().is_none()); 103 | 104 | // Check the strings are correct 105 | assert_eq!(tracker.get(i1).unwrap().remove(), s1); 106 | assert_eq!(tracker.get(i2).unwrap().remove(), s2); 107 | } 108 | 109 | #[test] 110 | #[should_panic(expected = "index out of bounds")] 111 | fn test_panic_if_wrong_index() { 112 | let mut tracker: Tracker<String> = Tracker::new(2); 113 | tracker.get(100); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/operation.rs: -------------------------------------------------------------------------------- 1 | use lsio_threadpool::WorkerThread; 2 | 3 | use crate::{close::Close, get_range::GetRange, get_ranges::GetRanges, user_data::UringUserData}; 4 | 5 | /// We keep a `Tracker` in each thread to track progress of each operation: 6 | #[derive(Debug)] 7 | pub(crate) enum Operation { 8 | GetRanges(GetRanges), 9 | GetRange(GetRange), 10 | Close(Close), 11 | } 12 | 13 | impl Operation { 14 | fn apply_func_to_all_inner_structs<F, R>(&mut self, mut f: F) -> R 15 | where 16 | F: FnMut(&mut dyn UringOperation) -> R, 17 | { 18 | use Operation::*; 19 | match self { 20 | GetRanges(s) => f(s), 21 | GetRange(s) => f(s), 22 | Close(s) => f(s), 23 | } 24 | } 25 | } 26 | 27 | impl UringOperation for Operation { 28 | fn submit_first_step( 29 | &mut self, 30 | index_of_op: usize, 31 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 32 | ) -> Result<(), io_uring::squeue::PushError> { 33 | self.apply_func_to_all_inner_structs(|s| { 34 | UringOperation::submit_first_step(s, index_of_op, local_uring_submission_queue) 35 | }) 36 | } 37 | 38 | fn process_opcode_and_submit_next_step( 39 | &mut self, 40 | idx_and_opcode: &UringUserData, 41 | cqe_result: i32, 42 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 43 | worker_thread: &WorkerThread<Operation>, 44 | output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>, 45 | ) -> NextStep { 46 | self.apply_func_to_all_inner_structs(|s| { 47 | 
UringOperation::maybe_send_error(s, idx_and_opcode, cqe_result, output_channel); 48 | UringOperation::process_opcode_and_submit_next_step( 49 | s, 50 | idx_and_opcode, 51 | cqe_result, 52 | local_uring_submission_queue, 53 | worker_thread, 54 | output_channel, 55 | ) 56 | }) 57 | } 58 | } 59 | 60 | /// ------------------ COMMON TO ALL URING OPERATIONS --------------------- 61 | /// Some aims of this design: 62 | /// - Allocate on the stack 63 | /// - Cleanly separate the code that implements the state machine for handling each operation. 64 | /// - Gain the benefits of using the typestate pattern, whilst still allowing us to keep the types 65 | /// in a vector. See issue #117. 66 | pub(crate) trait UringOperation: std::fmt::Debug { 67 | fn submit_first_step( 68 | &mut self, 69 | index_of_op: usize, 70 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 71 | ) -> Result<(), io_uring::squeue::PushError>; 72 | 73 | fn process_opcode_and_submit_next_step( 74 | &mut self, 75 | idx_and_opcode: &UringUserData, 76 | cqe_result: i32, 77 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 78 | worker_thread: &WorkerThread<Operation>, 79 | output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>, 80 | ) -> NextStep; 81 | 82 | fn maybe_send_error( 83 | &self, 84 | idx_and_opcode: &UringUserData, 85 | cqe_result: i32, 86 | output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>, 87 | ) { 88 | if cqe_result < 0 { 89 | // TODO: We probably want a custom Error struct (or enum?) which has machine-readable 90 | // fields for filename, byte_range(s), user_data, error code, opcode. But this 91 | // `anyhow::Error` will do for now. 92 | let nix_err = nix::Error::from_raw(-cqe_result); 93 | let context = format!( 94 | "{nix_err} (reported by io_uring completion queue entry (CQE)). More details: \ 95 | idx_and_opcode: {idx_and_opcode:?}. cqe_result: {cqe_result}. 
self: {self:?}", 96 | ); 97 | let err = Err(anyhow::Error::new(nix_err).context(context)); 98 | output_channel.send(err).unwrap(); 99 | } 100 | } 101 | } 102 | 103 | pub(crate) enum NextStep { 104 | Pending, 105 | Done, 106 | ReplaceWith(Operation), 107 | } 108 | -------------------------------------------------------------------------------- /crates/lsio_uring/tests/integration_test.rs: -------------------------------------------------------------------------------- 1 | use crossbeam_channel::RecvTimeoutError; 2 | use lsio_aligned_bytes::AlignedBytes; 3 | use lsio_io::{Completion, Reader}; 4 | use lsio_uring::IoUring; 5 | use rand::Rng; 6 | use std::fs::File; 7 | use std::io::Read; 8 | use std::{io::Write, time::Duration}; 9 | 10 | const KIBIBYTE: usize = 1024; 11 | const MEBIBYTE: usize = KIBIBYTE * 1024; 12 | 13 | #[test] 14 | fn test_get_ranges() -> anyhow::Result<()> { 15 | const N_WORKER_THREADS: usize = 4; 16 | const FILE_SIZE: usize = MEBIBYTE; 17 | const CHUNK_SIZE: usize = KIBIBYTE * 4; 18 | const N_CHUNKS: usize = FILE_SIZE / CHUNK_SIZE; 19 | 20 | // Create random ASCII text (that we will write to disk later): 21 | println!("Creating random data..."); 22 | let distr = rand::distributions::Uniform::new_inclusive(32, 126); 23 | let file_contents: Vec<u8> = rand::thread_rng() 24 | .sample_iter(distr) 25 | .take(((CHUNK_SIZE as f32) * 1.5) as _) 26 | .collect::<Vec<_>>() 27 | .into_iter() 28 | .cycle() 29 | .take(FILE_SIZE) 30 | .collect(); 31 | assert_eq!(file_contents.len(), FILE_SIZE); 32 | 33 | // Create filename in temporary directory: 34 | let filename = 35 | std::env::temp_dir().join(format!("lsio_uring_tempfile_{}", rand::random::<u64>())); 36 | 37 | // Write file: 38 | println!("Writing random data to disk..."); 39 | { 40 | let mut file = File::create(&filename)?; 41 | file.write_all(&file_contents)?; 42 | file.flush()?; 43 | file.sync_all()?; 44 | } 45 | 46 | // Check file is correctly written to disk: 47 | { 48 | let mut file = File::open(&filename)?; 49 | let mut temp_buffer = Vec::with_capacity(FILE_SIZE); 50 | file.read_to_end(&mut temp_buffer)?; 51 | assert!(temp_buffer.eq(&file_contents)); 52 | assert_eq!(temp_buffer.len(), FILE_SIZE); 53 | } 54 | 55 | // Define byte ranges to load: 56 | let ranges = (0..N_CHUNKS) 57 | .map(|chunk_i| { 58 | let chunk_start = (chunk_i * CHUNK_SIZE) as isize; 59 | let chunk_end = chunk_start + (CHUNK_SIZE as isize); 60 | chunk_start..chunk_end 61 | }) 62 | .collect(); 63 | 64 | // Define user_data (so we can identify the chunks!) 65 | let user_data = (0..N_CHUNKS as u64).collect(); 66 | 67 | // Submit get_ranges operation: 68 | println!("Reading data using io_uring!!!"); 69 | let mut uring = IoUring::new(N_WORKER_THREADS); 70 | uring.get_ranges(&filename, ranges, user_data)?; 71 | 72 | // Re-assemble byte ranges: 73 | let mut vec_of_aligned_bytes: Vec<Option<AlignedBytes>> = (0..N_CHUNKS).map(|_| None).collect(); 74 | 75 | for i in 0..N_CHUNKS { 76 | match uring.completion().recv_timeout(Duration::from_millis(500)) { 77 | Ok(output) => match output { 78 | Ok(c) => { 79 | let lsio_io::Output::Chunk(c) = c; 80 | vec_of_aligned_bytes[c.user_data as usize] = Some(c.buffer); 81 | } 82 | Err(e) => panic!("Error reading chunk {i}! 
{e:?}"), 83 | }, 84 | Err(RecvTimeoutError::Timeout) => panic!("Timed out waiting for chunk {i}!"), 85 | Err(RecvTimeoutError::Disconnected) => { 86 | panic!("Disconnected whilst waiting for chunk {i}!") 87 | } 88 | }; 89 | } 90 | println!("Finished reading using io_uring!"); 91 | 92 | // Check that the completion queue does the right thing when IoUring is dropped: 93 | let completion = uring.completion().clone(); 94 | drop(uring); 95 | assert!(completion.recv().is_err()); 96 | drop(completion); 97 | 98 | // Re-assemble the chunks into the complete file: 99 | println!("Assembling buffer:"); 100 | let mut assembled_buf = Vec::with_capacity(FILE_SIZE); 101 | for aligned_bytes in vec_of_aligned_bytes { 102 | assembled_buf.extend_from_slice(aligned_bytes.unwrap().as_slice()); 103 | } 104 | 105 | println!( 106 | "Read from disk: {:?}", 107 | core::str::from_utf8(&assembled_buf[0..100]).unwrap() 108 | ); 109 | println!( 110 | "Ground truth : {:?}", 111 | core::str::from_utf8(&file_contents[0..100]).unwrap() 112 | ); 113 | 114 | assert!(assembled_buf.eq(&file_contents)); 115 | 116 | // Clean up: 117 | std::fs::remove_file(&filename)?; 118 | 119 | Ok(()) 120 | } 121 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/get_ranges.rs: -------------------------------------------------------------------------------- 1 | use std::{ffi::CString, iter::zip, ops::Range, sync::Arc}; 2 | 3 | use lsio_threadpool::WorkerThread; 4 | 5 | use crate::{ 6 | get_range::GetRange, 7 | open_file::OpenFileBuilder, 8 | operation::{NextStep, Operation, UringOperation}, 9 | sqe::{build_openat_sqe, build_statx_sqe}, 10 | }; 11 | 12 | const N_CQES_EXPECTED: u8 = 2; // We're expecting CQEs for `openat` and `statx`. 13 | 14 | #[derive(Debug)] 15 | pub(crate) struct GetRanges { 16 | open_file_builder: Option, 17 | ranges: Vec>, 18 | user_data: Vec, 19 | 20 | // If both CQEs succeed then we'll capture their outputs in `open_file_builder`. But, in case 21 | // one or more CQEs reports a failure, we need an additional mechanism to track how many CQEs 22 | // we've received. 23 | n_cqes_received: u8, 24 | } 25 | 26 | impl GetRanges { 27 | pub(crate) fn new(location: CString, ranges: Vec>, user_data: Vec) -> Self { 28 | assert_eq!(ranges.len(), user_data.len()); 29 | Self { 30 | open_file_builder: Some(OpenFileBuilder::new(location)), 31 | ranges, 32 | user_data, 33 | n_cqes_received: 0, 34 | } 35 | } 36 | 37 | // io_uring can't process multiple range requests in a single op. So, once we've opened the 38 | // file and gotten its metadata, we need to submit one `Operation::GetRange` per byte range. 
39 | fn submit_get_range_ops(&mut self, worker_thread: &WorkerThread<Operation>) { 40 | let file = Arc::new(self.open_file_builder.take().unwrap().build()); 41 | for (range, user_data) in zip(&self.ranges, &self.user_data) { 42 | let get_range_op = GetRange::new(file.clone(), range.to_owned(), *user_data); 43 | worker_thread.push(Operation::GetRange(get_range_op)); 44 | } 45 | } 46 | } 47 | 48 | impl UringOperation for GetRanges { 49 | fn submit_first_step( 50 | &mut self, 51 | index_of_op: usize, 52 | local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 53 | ) -> Result<(), io_uring::squeue::PushError> { 54 | let open_entry = build_openat_sqe( 55 | index_of_op, 56 | self.open_file_builder.as_ref().unwrap().location(), 57 | ); 58 | let statx_entry = 59 | build_statx_sqe(index_of_op, &mut self.open_file_builder.as_mut().unwrap()); 60 | unsafe { 61 | local_uring_submission_queue.push(&open_entry)?; 62 | local_uring_submission_queue.push(&statx_entry)?; 63 | }; 64 | Ok(()) 65 | } 66 | 67 | fn process_opcode_and_submit_next_step( 68 | &mut self, 69 | idx_and_opcode: &crate::user_data::UringUserData, 70 | cqe_result: i32, 71 | _local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue, 72 | worker_thread: &WorkerThread<Operation>, 73 | _output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>, 74 | ) -> NextStep { 75 | self.n_cqes_received += 1; 76 | if cqe_result >= 0 { 77 | match idx_and_opcode.opcode().value() { 78 | io_uring::opcode::OpenAt::CODE => { 79 | self.open_file_builder 80 | .as_mut() 81 | .unwrap() 82 | .set_file_descriptor(io_uring::types::Fd(cqe_result)); 83 | } 84 | io_uring::opcode::Statx::CODE => { 85 | unsafe { 86 | self.open_file_builder 87 | .as_mut() 88 | .unwrap() 89 | .assume_statx_is_initialised(); 90 | }; 91 | } 92 | _ => panic!("Unrecognised opcode! {idx_and_opcode:?}"), 93 | }; 94 | }; 95 | 96 | assert!(self.n_cqes_received <= N_CQES_EXPECTED); 97 | if self.n_cqes_received == N_CQES_EXPECTED { 98 | if self.open_file_builder.as_mut().unwrap().is_ready() { 99 | self.submit_get_range_ops(worker_thread); 100 | NextStep::Done 101 | } else { 102 | // We've seen all the CQEs we were expecting, but `open_file_builder` isn't ready. So 103 | // at least one of the CQEs must have resulted in an error. Nevertheless, we're "done". 104 | NextStep::Done 105 | } 106 | } else { 107 | // We're expecting one more CQE. 108 | NextStep::Pending 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################ START OF RUST GITIGNORE ############################ 2 | # Generated by Cargo 3 | 4 | # will have compiled files and executables 5 | debug/ 6 | target/ 7 | 8 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries. 
9 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 10 | Cargo.lock 11 | 12 | # These are backup files generated by rustfmt 13 | **/*.rs.bk 14 | 15 | # MSVC Windows builds of rustc generate these, which store debugging information 16 | *.pdb 17 | ################ END OF RUST GITIGNORE ############################## 18 | 19 | ################ START OF PYTHON GITIGNORE ########################## 20 | # Taken from github.com/github/gitignore/blob/main/Python.gitignore 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | share/python-wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .nox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *.cover 70 | *.py,cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | cover/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # PyBuilder 96 | .pybuilder/ 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | # For a library or package, you might want to ignore these files since the code is 108 | # intended to run in multiple environments; otherwise, check them in: 109 | # .python-version 110 | 111 | # pipenv 112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 115 | # install all needed dependencies. 116 | #Pipfile.lock 117 | 118 | # poetry 119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 120 | # This is especially recommended for binary packages to ensure reproducibility, and is more 121 | # commonly ignored for libraries. 122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 123 | #poetry.lock 124 | 125 | # pdm 126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 127 | #pdm.lock 128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 129 | # in version control. 130 | # https://pdm.fming.dev/#use-with-ide 131 | .pdm.toml 132 | 133 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 134 | __pypackages__/ 135 | 136 | # Celery stuff 137 | celerybeat-schedule 138 | celerybeat.pid 139 | 140 | # SageMath parsed files 141 | *.sage.py 142 | 143 | # Environments 144 | .env 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ 175 | 176 | # PyCharm 177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 179 | # and can be added to the global gitignore or merged into this file. For a more nuclear 180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 181 | #.idea/ 182 | ################ END OF PYTHON GITIGNORE ############################ 183 | 184 | .vscode/ 185 | perf.data 186 | perf.data.* 187 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/sqe.rs: -------------------------------------------------------------------------------- 1 | use io_uring::squeue; 2 | use io_uring::types; 3 | use lsio_aligned_bytes::AlignedBytes; 4 | use lsio_aligned_bytes::AlignedBytesMut; 5 | use std::ffi::CString; 6 | use std::ops::Range; 7 | 8 | use crate::open_file::OpenFile; 9 | use crate::open_file::OpenFileBuilder; 10 | use crate::user_data::UringUserData; 11 | 12 | const ALIGN: isize = 512; // TODO: Get ALIGN at runtime from statx. 13 | 14 | /// # Documentation about the openat operation in io_uring: 15 | /// - https://man7.org/linux/man-pages/man2/openat.2.html 16 | /// - https://man7.org/linux/man-pages/man3/io_uring_prep_openat.3.html 17 | pub(crate) fn build_openat_sqe(index_of_op: usize, location: &CString) -> squeue::Entry { 18 | // Prepare the "openat" submission queue entry (SQE): 19 | io_uring::opcode::OpenAt::new( 20 | // `dirfd` is ignored if the pathname is absolute. 21 | // See the "openat()" section in https://man7.org/linux/man-pages/man2/openat.2.html 22 | types::Fd(-1), 23 | location.as_ptr(), 24 | ) 25 | .flags(libc::O_RDONLY | libc::O_DIRECT) 26 | .build() 27 | .user_data(UringUserData::new(index_of_op, io_uring::opcode::OpenAt::CODE).into()) 28 | } 29 | 30 | /// Build a `statx` submission queue entry (SQE). 31 | /// 32 | /// # Safety 33 | /// Assumes the struct that `statx_ptr` points to exists and has been zeroed. 34 | /// 35 | /// # Documentation about the statx operation in io_uring: 36 | /// - https://man7.org/linux/man-pages/man2/statx.2.html 37 | /// - https://man7.org/linux/man-pages/man3/io_uring_prep_statx.3.html 38 | /// - https://docs.rs/io-uring/latest/io_uring/opcode/struct.Statx.html 39 | /// - https://docs.rs/libc/latest/libc/struct.statx.html 40 | pub(crate) fn build_statx_sqe( 41 | index_of_op: usize, 42 | open_file_builder: &mut OpenFileBuilder, 43 | ) -> squeue::Entry { 44 | // Prepare the "statx" submission queue entry (SQE): 45 | io_uring::opcode::Statx::new( 46 | // `dirfd` is ignored if the pathname is absolute. 
See: 47 | // https://man7.org/linux/man-pages/man2/statx.2.html 48 | types::Fd(-1), 49 | open_file_builder.location().as_ptr(), 50 | open_file_builder.get_statx_ptr() as *mut _, 51 | ) 52 | // See here for a description of the flags for statx: 53 | // https://man7.org/linux/man-pages/man2/statx.2.html 54 | .mask(libc::STATX_SIZE | libc::STATX_DIOALIGN) 55 | .build() 56 | .user_data(UringUserData::new(index_of_op, io_uring::opcode::Statx::CODE).into()) 57 | } 58 | 59 | pub(crate) fn build_read_range_sqe( 60 | index_of_op: usize, 61 | file: &OpenFile, 62 | range: &Range<isize>, 63 | ) -> (squeue::Entry, AlignedBytes) { 64 | let filesize: isize = file.size().try_into().unwrap(); 65 | let start_offset = if range.start >= 0 { 66 | range.start 67 | } else { 68 | // `range.start` is negative. We interpret a negative `range.start` 69 | // as an offset from the end of the file. 70 | filesize + range.start 71 | }; 72 | assert!(start_offset >= 0); 73 | 74 | let end_offset = if range.end >= 0 { 75 | range.end 76 | } else { 77 | // `range.end` is negative. We interpret a negative `range.end` 78 | // as an offset from the end of the file, where `range.end = -1` means the last byte. 79 | filesize + range.end + 1 80 | }; 81 | assert!(end_offset >= 0); 82 | 83 | let aligned_start_offset = (start_offset / ALIGN) * ALIGN; 84 | 85 | let mut buffer; 86 | { 87 | let buf_len = end_offset - aligned_start_offset; 88 | assert!(buf_len > 0); 89 | 90 | // Allocate vector. If `buf_len` is not exactly divisible by ALIGN, then 91 | // `AlignedBytesMut::new` will extend the length until it is aligned. 92 | buffer = AlignedBytesMut::new(buf_len as usize, ALIGN.try_into().unwrap()); 93 | // From now on, use `buffer.len()` as the correct length! 94 | // This code is in its own scope so that `buf_len` cannot be used in subsequent code. 95 | } 96 | 97 | // Prepare the "read" opcode: 98 | let read_op = io_uring::opcode::Read::new( 99 | *file.file_descriptor(), 100 | buffer.as_mut_ptr(), 101 | buffer.len().try_into().unwrap(), 102 | ) 103 | .offset(aligned_start_offset as _) 104 | .build() 105 | .user_data(UringUserData::new(index_of_op, io_uring::opcode::Read::CODE).into()); 106 | 107 | // If the `start_offset` is not aligned, then the start of the buffer will contain data that 108 | // the user did not request. 
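// (Worked example with illustrative numbers: with ALIGN = 512 and a requested
// range of 100..700, aligned_start_offset = 0, so we read into a buffer that
// AlignedBytesMut has padded to 1024 bytes, and the user's bytes live at
// offsets 100..700 of that buffer, which is the slice set below.)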
109 | if aligned_start_offset != start_offset { 110 | _ = buffer 111 | .split_to((start_offset - aligned_start_offset).try_into().unwrap()) 112 | .unwrap(); 113 | } 114 | 115 | // `freeze` the buffer, and set the slice to the slice requested by the user: 116 | let start_slice: usize = (start_offset - aligned_start_offset).try_into().unwrap(); 117 | let end_slice: usize = (end_offset - aligned_start_offset).try_into().unwrap(); 118 | let mut buffer = buffer.freeze().unwrap(); 119 | buffer.set_slice(start_slice..end_slice); 120 | 121 | (read_op, buffer) 122 | } 123 | 124 | /// # Documentation about the `close` operation: 125 | /// - https://man7.org/linux/man-pages/man2/close.2.html 126 | pub(crate) fn build_close_sqe( 127 | index_of_op: usize, 128 | file_descriptor: io_uring::types::Fd, 129 | ) -> squeue::Entry { 130 | io_uring::opcode::Close::new(file_descriptor) 131 | .build() 132 | .user_data(UringUserData::new(index_of_op, io_uring::opcode::Close::CODE).into()) 133 | } 134 | -------------------------------------------------------------------------------- /crates/lsio_uring/src/worker.rs: -------------------------------------------------------------------------------- 1 | use io_uring::{cqueue, squeue}; 2 | use lsio_io::Output; 3 | use lsio_threadpool::WorkerThread; 4 | 5 | use crate::{ 6 | operation::{NextStep, Operation, UringOperation}, 7 | tracker::Tracker, 8 | user_data::UringUserData, 9 | }; 10 | 11 | /// `MAX_SQ_ENTRIES_PER_ITERATION` describes the most SQEs that will be submitted to the io_uring SQ by 12 | /// a single iteration of the `run` loop. This constant is used to make sure we have enough 13 | /// headroom in the SQ before each iteration of the `run` loop. 14 | const MAX_SQ_ENTRIES_PER_ITERATION: usize = 2; 15 | 16 | /// Size of the io_uring submission queue (SQ). 17 | const SQ_RING_SIZE: usize = 64; 18 | 19 | /// We keep filling the SQ until we hit the "high water line" before we start draining the 20 | /// completion queue. This ensures that we allow io_uring to process as many operations in parallel 21 | /// as possible. 22 | const HIGH_WATER_LINE: usize = SQ_RING_SIZE / 2; 23 | 24 | pub struct UringWorker { 25 | uring: io_uring::IoUring, 26 | ops_in_flight: Tracker<Operation>, 27 | worker_thread: WorkerThread<Operation>, 28 | output_tx: crossbeam_channel::Sender<anyhow::Result<Output>>, 29 | } 30 | 31 | impl UringWorker { 32 | pub(crate) fn new( 33 | worker_thread: WorkerThread<Operation>, 34 | output_tx: crossbeam_channel::Sender<anyhow::Result<Output>>, 35 | ) -> Self { 36 | assert!(MAX_SQ_ENTRIES_PER_ITERATION < SQ_RING_SIZE); 37 | 38 | let ring: io_uring::IoUring = io_uring::IoUring::builder() 39 | // TODO: Allow the user to decide whether sqpoll is used. 40 | .setup_sqpoll(1000) // The kernel sqpoll thread will sleep after this many milliseconds. 41 | .build(SQ_RING_SIZE as _) 42 | .expect("Failed to initialise io_uring."); 43 | 44 | assert_eq!(ring.params().cq_entries(), ring.params().sq_entries() * 2); 45 | 46 | Self { 47 | uring: ring, 48 | ops_in_flight: Tracker::new(SQ_RING_SIZE), 49 | worker_thread, 50 | output_tx, 51 | } 52 | } 53 | 54 | /// The main loop for the thread. 55 | pub(crate) fn run(&mut self) { 56 | while self.worker_thread.keep_running() { 57 | if self.ops_in_flight.is_full() || self.uring_is_full() { 58 | if self.uring.completion().is_empty() { 59 | // The SQ is full but no completion events are ready! So we have no choice: 60 | // We *have* to wait for some completion events to complete: 61 | self.uring.submit_and_wait(1).unwrap(); 62 | } 63 | // The CQ has CQEs for us, so we fall through to the CQ processing loop. 
64 | } else { 65 | match self.worker_thread.find_task() { 66 | Some(mut operation) => { 67 | // Submit first step of `operation`, and track `operation`: 68 | let index_of_op = self 69 | .ops_in_flight 70 | .get_next_index() 71 | .expect("Failed to get_next_index on tracker!"); 72 | operation 73 | .submit_first_step(index_of_op, &mut self.uring.submission()) 74 | .expect("Failed to submit_first_step of Operation!"); 75 | // TODO: Instead of calling `submit()` on every loop, we should keep our 76 | // own check on how much time has elapsed since we last submitted to the SQ, 77 | // and only call `submit()` when we know the SQ has gone to sleep. 78 | // See issue #129. 79 | self.uring.submitter().submit().unwrap(); 80 | self.ops_in_flight.put(index_of_op, operation); 81 | if self.sq_len_plus_cq_len() < HIGH_WATER_LINE { 82 | // We want to "top up" the SQ before we process any CQEs. 83 | // Without this, we run the risk of submitting one SQE, then draining 84 | // that CQE, then submitting another SQE, and draining that CQE, etc. 85 | // In other words, we run the risk of not letting io_uring handle 86 | // multiple SQEs at once! 87 | continue; 88 | } 89 | } 90 | None => { 91 | // There are no new operations to submit, so let's work out if we need to 92 | // park or process the completion queue. 93 | if self.ops_in_flight.is_empty() { 94 | // There's nothing to do! So we have to sleep: 95 | self.worker_thread.park(); 96 | // When we wake, there definitely won't be anything in our uring, so 97 | // continue to the top of the while loop: 98 | continue; 99 | } 100 | } 101 | } 102 | } 103 | 104 | for cqe in unsafe { self.uring.completion_shared() } { 105 | let idx_and_opcode = UringUserData::from(cqe.user_data()); 106 | let idx_of_op = idx_and_opcode.index_of_op() as usize; 107 | let mut op_guard = self.ops_in_flight.get(idx_of_op).unwrap(); 108 | let next_step = op_guard.as_mut().process_opcode_and_submit_next_step( 109 | &idx_and_opcode, 110 | cqe.result(), 111 | &mut unsafe { self.uring.submission_shared() }, 112 | &self.worker_thread, 113 | &mut self.output_tx, 114 | ); 115 | match next_step { 116 | NextStep::Pending => (), // By default, op_guard will keep the operation.
117 | NextStep::ReplaceWith(op) => op_guard.replace(op), 118 | NextStep::Done => { 119 | let _ = op_guard.remove(); 120 | } 121 | }; 122 | } 123 | } 124 | assert!(self.ops_in_flight.is_empty()); 125 | } 126 | 127 | /// io_uring submission queue (SQ) length plus the io_uring completion queue (CQ) length: 128 | fn sq_len_plus_cq_len(&self) -> usize { 129 | unsafe { self.uring.submission_shared().len() + self.uring.completion_shared().len() } 130 | } 131 | 132 | fn uring_is_full(&self) -> bool { 133 | self.sq_len_plus_cq_len() >= SQ_RING_SIZE - MAX_SQ_ENTRIES_PER_ITERATION 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /crates/lsio_bench/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | env::temp_dir, 3 | fs::File, 4 | io::Write, 5 | ops::Range, 6 | path::{Path, PathBuf}, 7 | process::Command, 8 | time::{Duration, Instant}, 9 | }; 10 | 11 | use clap::{error::ErrorKind, CommandFactory, Parser}; 12 | use indicatif::{ProgressBar, ProgressStyle}; 13 | use lsio_io::{Completion, Reader}; 14 | use lsio_uring::IoUring; 15 | 16 | const FILENAME_PREFIX: &str = "lsio_bench_"; 17 | const MEBIBYTE: f64 = (1024 * 1024) as _; 18 | 19 | #[derive(Parser, Debug)] 20 | #[command(version, about, long_about = None)] 21 | struct Args { 22 | /// Prefix filenames with this directory. If not set, will default to the system's temporary 23 | /// directory. This directory must already exist. 24 | #[arg(short, long)] 25 | directory: Option<PathBuf>, 26 | 27 | /// The number of files to read from for this benchmark. 28 | #[arg(short, long, default_value_t = 1, value_parser = clap::value_parser!(u32).range(1..))] 29 | nrfiles: u32, 30 | 31 | /// The size of each file, in bytes 32 | #[arg(short, long, default_value_t = 1024 * 1024, value_parser = clap::value_parser!(u64).range(1..))] 33 | filesize: u64, 34 | 35 | /// The block size in bytes. By default, the block size will be the same as the filesize. 36 | #[arg(short, long, value_parser = clap::value_parser!(u64).range(1..))] 37 | blocksize: Option<u64>, 38 | 39 | /// The number of worker threads that lsio_uring uses: 40 | #[arg(short = 'w', long, default_value_t = 4, value_parser = clap::value_parser!(u64).range(1..1024))] 41 | nr_worker_threads: u64, 42 | } 43 | 44 | fn main() -> std::io::Result<()> { 45 | let args = Args::parse(); 46 | 47 | let directory = check_directory_or_use_temp_dir(&args.directory); 48 | 49 | let filenames: Vec<PathBuf> = (0..args.nrfiles) 50 | .map(|i| directory.join(format!("{FILENAME_PREFIX}{i}"))) 51 | .collect(); 52 | 53 | create_files_if_necessary(&filenames, args.filesize)?; 54 | 55 | clear_page_cache(&directory); 56 | 57 | read_files( 58 | &filenames, 59 | args.filesize, 60 | args.blocksize, 61 | args.nr_worker_threads as usize, 62 | ); 63 | 64 | Ok(()) 65 | } 66 | 67 | fn check_directory_or_use_temp_dir(directory: &Option<PathBuf>) -> PathBuf { 68 | // Check that the directory exists, or use temp_dir.
69 |     if let Some(directory) = directory.as_deref() { 70 | if directory.is_dir() { 71 | directory.to_path_buf() 72 | } else { 73 | let mut cmd = Args::command(); 74 | cmd.error( 75 | ErrorKind::ValueValidation, 76 | format!("Directory {directory:?} does not exist, or is not a directory"), 77 | ) 78 | .exit(); 79 | } 80 | } else { 81 | temp_dir() 82 | } 83 | } 84 | 85 | fn create_files_if_necessary(filenames: &[PathBuf], filesize: u64) -> std::io::Result<()> { 86 | // Create progress bar: 87 | println!( 88 | "Creating {} files (if necessary), each of filesize {filesize} bytes...", 89 | filenames.len() 90 | ); 91 | let pb = ProgressBar::new(filenames.len() as _); 92 | pb.set_style(get_progress_bar_style()); 93 | 94 | // Loop through files: 95 | let mut file_contents: Option<Vec<u8>> = None; 96 | for filename in filenames { 97 | if filename.exists() && get_filesize(&filename)? == filesize { 98 | pb.set_message(format!("exists: {filename:?}")); 99 | } else { 100 | pb.set_message(format!("creating: {filename:?}")); 101 | if file_contents.is_none() { 102 | file_contents = Some((0..filesize).map(|i| i as u8).collect()); 103 | } 104 | let mut file = File::create(&filename)?; 105 | file.write_all(file_contents.as_ref().unwrap())?; 106 | file.flush()?; 107 | } 108 | pb.inc(1); 109 | } 110 | pb.finish_with_message("done"); 111 | Ok(()) 112 | } 113 | 114 | fn get_filesize(filename: &Path) -> std::io::Result<u64> { 115 | Ok(File::open(&filename)?.metadata()?.len()) 116 | } 117 | 118 | fn get_progress_bar_style() -> ProgressStyle { 119 | ProgressStyle::with_template("[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}") 120 | .unwrap() 121 | .progress_chars("##-") 122 | } 123 | 124 | fn read_files( 125 | filenames: &[PathBuf], 126 | filesize: u64, 127 | blocksize: Option<u64>, 128 | n_worker_threads: usize, 129 | ) { 130 | let blocksize = if let Some(bs) = blocksize { 131 | bs 132 | } else { 133 | filesize 134 | }; 135 | 136 | // Calculate chunks 137 | let n_chunks = filesize / blocksize; 138 | let chunks: Vec<Range<isize>> = (0..n_chunks) 139 | .map(|chunk_i| { 140 | let chunk_start = (chunk_i * blocksize) as isize; 141 | let chunk_end = chunk_start + (blocksize as isize); 142 | chunk_start..chunk_end 143 | }) 144 | .collect(); 145 | assert_eq!(chunks.len(), n_chunks as _); 146 | 147 | // Define user_data (so we can identify the chunks!) 148 | let user_data: Vec<u64> = (0..n_chunks as u64).collect(); 149 | 150 | let mut uring = IoUring::new(n_worker_threads); 151 | 152 | // Set up progress bar: 153 | let n_files = filenames.len() as u64; 154 | let n_total_chunks = n_files * n_chunks; 155 | println!("Performing read benchmark for {n_files} files x {n_chunks} chunks per file = {n_total_chunks} total chunks:"); 156 | let pb = ProgressBar::new(n_total_chunks); 157 | pb.set_style(get_progress_bar_style()); 158 | 159 | let started = Instant::now(); 160 | 161 | // Submit all the get_ranges requests: 162 | for filename in filenames { 163 | uring 164 | .get_ranges(&filename, chunks.clone(), user_data.clone()) 165 | .unwrap(); 166 | } 167 | 168 | // Collect results 169 | for _ in 0..n_total_chunks { 170 | match uring 171 | .completion() 172 | .recv_timeout(Duration::from_millis(10000)) 173 | { 174 | Ok(_) => pb.inc(1), 175 | Err(e) => panic!("Error collecting chunk!
{e:?}"), 176 | } 177 | } 178 | pb.finish(); 179 | 180 | // Calculate bandwidth 181 | let total_secs = started.elapsed().as_secs_f64(); 182 | let total_bytes = (filesize * n_files) as f64; 183 | let bytes_per_sec = total_bytes / total_secs; 184 | println!("Total runtime: {} secs", total_secs); 185 | println!("Total mebibytes: {} MiB", total_bytes / MEBIBYTE); 186 | println!( 187 | "Total bandwidth = {} mebibytes per sec", 188 | bytes_per_sec / MEBIBYTE 189 | ); 190 | } 191 | 192 | fn clear_page_cache(directory: &Path) { 193 | println!("Clearing page cache for {directory:?}..."); 194 | let _ = Command::new("vmtouch") 195 | .arg("-e") 196 | .arg(directory) 197 | .output() 198 | .expect("vmtouch failed to start"); 199 | } 200 | -------------------------------------------------------------------------------- /planned_design.md: -------------------------------------------------------------------------------- 1 | # Planned Design for `light-speed-io` 2 | 3 | # Summary 4 | 5 | The ultimate aim is to load and process multi-dimensional arrays as quickly and as efficiently as modern hardware will allow! 6 | 7 | Why? Initially, to make life as easy as possible for folks who want to train ML models on large multi-dimensional datasets (like weather predictions). Specifically: 8 | 9 | - Enable folks to train directly from source datasets (instead of having to prepare batches ahead-of-time) at multiple GB/sec, 10 | - make it as easy as possible to combine different datasets on-the-fly (e.g. NWPs and satellite datasets, see [issue #142](https://github.com/JackKelly/light-speed-io/issues/142)), 11 | - use single on-disk datasets for as many ML experiments as possible (see [issue #141](https://github.com/JackKelly/light-speed-io/issues/141)). Stop people having to manually duplicate data (with different chunking schemes) for each ML model. 12 | 13 | LSIO will provide a suite of tools for loading and processing large, labelled, multi-dimensional datasets. Where "large" means datasets that are too large to fit into RAM, and where "labelled" means datasets where each array dimension can be associated with coordinates. For example, a dataset of satellite imagery might have 4 dimensions: x, y, time, and spectral channel. The x and y dimensions might be labelled with longitude and latitude coordinates, respectively. 14 | 15 | The main focus will be on processing data on a single machine. Hopefully tools like Dask could schedule LSIO across multiple machines. 16 | 17 | Please see [this blog post](https://jack-kelly.com/blog/2023-07-28-speeding-up-zarr) for more details of the background and motivations behind this project. 18 | 19 | This git repository contains multiple crates. Each crate implements "just one thing". Each crate will exist in one of five levels of abstraction. And there will be a Python API to each level of abstraction. See the "planned design" diagram below. 20 | 21 | ## Fitting into the ecosystem 22 | Today, there are many awesome software packages for working with large, labelled, multi-dimensional datasets (such as xarray, fsspec, dask, kerchunk, satpy, etc.). LSIO aims to help speed up this existing stack: Either by providing tools that existing Python packages can hook into, or by providing new tools which play nicely with the existing stack, or by creating new tools with very similar Python APIs to existing Python APIs. 23 | 24 | ## Why bother to build `light-speed-io`? What gap does it fill? 25 | LSIO is all about computational speed _and_ efficiency! 
Today, using existing packages, you can achieve high throughput by spinning up a large cluster. But that's expensive, power-hungry, and tedious! The aim of LSIO is to enable high throughput and low latency on a single machine. 26 | 27 | To look at this from the users' perspective: one of the main aims is to enable users to far more easily train ML models on huge multi-dimensional datasets. 28 | 29 | ## How to be efficient and fast? 30 | By being [sympathetic](https://dzone.com/articles/mechanical-sympathy) to the hardware! 31 | 32 | That sounds abstract! In concrete terms, one central aim is for the machine to do as little work as possible. Specifically: 33 | 34 | Minimise the number of: 35 | - round-trips to RAM, 36 | - system calls, 37 | - heap allocations, 38 | - network requests, 39 | - memory copies. 40 | 41 | Maximise the use of: 42 | - CPU caches, 43 | - all the levels of parallelism available within a single machine, 44 | - modern, efficient IO APIs like io_uring. 45 | 46 | ## Concrete goals 47 | Some example concrete goals include: 48 | - Compute summary statistics of a multi-terabyte dataset on a laptop, at a speed of about 5 minutes per terabyte (from a fast local SSD), with minimal RAM requirements. 49 | - Train a large machine learning model from two Zarr datasets (e.g. satellite imagery and numerical weather predictions) at a sustained bandwidth to the GPU of at least 1 gigabyte per second (from local SSDs or from a cloud storage bucket), whilst performing some light processing on the data on-the-fly. Crucially, each ML training example should be a random crop of the multi-dimensional dataset. (Random cropping is particularly slow on today's software stack.) 50 | 51 | ## Priorities 52 | The first area of focus is on high-speed IO for local SSDs on Linux, to speed up training ML models from sharded Zarr datasets. But we're definitely also interested in helping speed up access to data stored in cloud object storage (see [issue #10](https://github.com/JackKelly/light-speed-io/issues/10)), and in helping to speed up general data analytics tasks on multi-dimensional data. 53 | 54 | ## How long will this take? 55 | Implementing the complete design sketched out in this doc will take _years_! 56 | 57 | By the end of 2024, I hope to have MVP implementations of "level 1 (I/O)" and "level 2 (parallel compute on chunks)" and a basic Zarr implementation for level 4. But please don't hold me to that! 58 | 59 | # Which crates would live in this repo? What would they do? And how would they communicate with each other? 60 | 61 | ![Planned design for LSIO](planned_design.svg) 62 | ([Original Google Draw version of this diagram](https://docs.google.com/drawings/d/1cpRai2k9y2Y9v4ieaof33FT27uB4JlK_rJL9Lvbj4MM/edit?usp=sharing).) 63 | 64 | My hope is to categorise the crates into several different levels of abstraction: 65 | 66 | ## Abstraction level 1: Data input/output 67 | This is the lowest level of abstraction: the level closest to the hardware. 68 | 69 | ### Common interface 70 | These IO crates will share a common interface: 71 | - Instructions will be given to the IO crate via a common set of methods, defined in a `Reader` trait and a `Writer` trait. Under the hood, the `lsio_uring` crate will encode each of the user's commands into a `struct`, and put these structs into a multi-producer multi-consumer `crossbeam::channel`. An example of an instruction could be: "get 1,000,000 chunks of `/foo/bar`".
These `IoOperation`s will probably be grouped ([#68](https://github.com/JackKelly/light-speed-io/issues/68)), such that the IO crate will guarantee that all operations in group _n_ are completed before any IO operations in group _n+1_ are started. 72 | - Output channel: A multi-producer multi-consumer `crossbeam::channel` of `Chunk`s to return completed data to the user (these will also be grouped) (see [#105](https://github.com/JackKelly/light-speed-io/issues/105)). 73 | 74 | LSIO will also enable buffer recycling whereby the user can optionally tell the IO crate "re-use each IO buffer once I've dropped it" (to minimise the number of heap allocations). ([#38](https://github.com/JackKelly/light-speed-io/issues/38)). This will probably be implemented via the `drop` method on `AlignedBytes`. 75 | 76 | ### Crates 77 | - [ ] `aligned_bytes` 78 | - [ ] `lsio_uring` (this is what I'm currently working on): provide a small threadpool which performs IO using io_uring. 79 | - [ ] [`lsio_io_python_bridge` #39](https://github.com/JackKelly/light-speed-io/issues/39) 80 | - [ ] [`object_store_bridge` #107](https://github.com/JackKelly/light-speed-io/issues/107) (also see [Ideas for fast cloud storage #10](https://github.com/JackKelly/light-speed-io/issues/10)) 81 | - [ ] maybe other crates for high-performance local storage on MacOS and/or Windows. 82 | 83 | ## Abstraction level 2: Parallel compute on chunks 84 | 85 | ### Common interface 86 | These crates will all consume the `output channel` from the IO layer. 87 | 88 | ### Crates 89 | - [ ] `lsio_compute`: Perform parallel computation on data. Users can supply any function to be applied to each chunk. The actual computation will probably be orchestrated by Rayon. This crate will implement functions for operating on the `struct Chunks` that represents each buffer with its metadata (see #105). 90 | - [ ] `lsio_codecs`: Compression / decompression 91 | 92 | ## Abstraction level 3: Automatically scheduling compute & IO 93 | The aim is to do as little work as possible to satisfy the user's requests: don't repeat work (if we can avoid it) and don't do work that doesn't contribute to the final outcome. 94 | 95 | - [ ] `lsio_scheduler` 96 | 97 | ## Abstraction level 4: Crates that load / write to a specific file format 98 | 99 | These crates will each include a Python API. 100 | 101 | ### Crates 102 | - [ ] `lsio_zarr_codec_pipeline`: A pure-Rust implementation of [`zarr-python` v3's new `CodecPipeline`](https://github.com/zarr-developers/zarr-python/issues/1806#issuecomment-2085680824), that can be used as a storage and decompression backend for `zarr-python`. 103 | - [ ] `lsio_zarr` 104 | - [ ] `lsio_grib` 105 | - [ ] etc. 106 | 107 | ## Abstraction level 5: Domain-specific computation 108 | 109 | ### Crates 110 | - [ ] `lsio_rechunker` 111 | - [ ] `lsio_array` 112 | 113 | -------------------------------------------------------------------------------- /crates/lsio_aligned_bytes/README.md: -------------------------------------------------------------------------------- 1 | A memory buffer allocated on the heap. 2 | 3 | The start position and end position of the backing buffer are both aligned in memory. The user 4 | specifies the memory alignment at runtime. This is useful for working with `O_DIRECT` file IO, 5 | where the filesystem will often expect the buffer to be aligned to the logical block size of 6 | the filesystem[^o_direct] (typically 512 bytes).
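To make that alignment requirement concrete, here is a minimal, hypothetical sketch (it is not part of this crate; it assumes Linux and the `libc` crate, and the file path is made up) of the kind of direct-IO `open` call that imposes the requirement:

```rust
use std::fs::OpenOptions;
use std::os::unix::fs::OpenOptionsExt;

fn main() -> std::io::Result<()> {
    // O_DIRECT asks the kernel to bypass the page cache. In exchange, the
    // user-space buffer, the file offset and the transfer length must all be
    // aligned to the filesystem's logical block size (often 512 bytes). That
    // is exactly the alignment this crate lets you specify at runtime.
    let _file = OpenOptions::new()
        .read(true)
        .custom_flags(libc::O_DIRECT) // Assumes the `libc` crate; Linux-only.
        .open("/tmp/example_file")?; // Hypothetical path.
    Ok(())
}
```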
7 | 8 | The API is loosely inspired by the [`bytes`](https://docs.rs/bytes/latest/bytes/index.html) crate. 9 | To give a very quick overview of the `bytes` crate: The `bytes` crate has an (immutable) 10 | [`Bytes`](https://docs.rs/bytes/latest/bytes/struct.Bytes.html) struct and a (mutable) 11 | [`BytesMut`](https://docs.rs/bytes/latest/bytes/struct.BytesMut.html) struct. `BytesMut` can be 12 | `split` into multiple non-overlapping owned views of the same backing buffer. The backing 13 | buffer is dropped when all the views referencing that buffer are dropped. `BytesMut` can be 14 | [frozen](https://docs.rs/bytes/latest/bytes/struct.BytesMut.html#method.freeze) to produce an 15 | (immutable) `Bytes` struct which, in turn, can be sliced to produce (potentially overlapping) 16 | owned views of the same backing buffer. 17 | 18 | `aligned_bytes` follows a similar pattern: 19 | 20 | An [`AlignedBytesMut`] can be split (via [`AlignedBytesMut::split_to`]) to produce multiple non-overlapping mutable 21 | views of the same backing buffer without copying the memory (each `AlignedBytesMut` has its own 22 | `range` (which represents the byte range that this `AlignedBytesMut` has exclusive access to) 23 | and an `Arc`). The splitting process guarantees that views cannot overlap, so we 24 | do not have to use locks, whilst allowing multiple threads to write to (non-overlapping regions 25 | of) the same buffer. 26 | 27 | When you have finished writing into the buffer, drop all but one of the `AlignedBytesMut` 28 | objects, and call [`AlignedBytesMut::freeze`] on the last `AlignedBytesMut`. This will 29 | consume the `AlignedBytesMut` and return an (immutable) [`AlignedBytes`]. Then you 30 | can `clone` and [`AlignedBytes::set_slice`] to get (potentially overlapping) owned views of the 31 | same backing buffer. 32 | 33 | The backing buffer will be dropped when all views into the backing buffer are dropped. 34 | 35 | Unlike `bytes`, `aligned_bytes` does not use a `vtable`, nor does it allow users to grow the 36 | backing buffers. `aligned_bytes` implements the minimal set of features required for the rest 37 | of the LSIO project! In fact, `aligned_bytes` is _so_ minimal that the only way to write data 38 | into an `AlignedBytesMut` is via [`AlignedBytesMut::as_mut_ptr`] (because that's what the 39 | operating system expects!) 40 | 41 | # Examples and use-cases 42 | 43 | **Use case 1: The user requests multiple contiguous byte ranges from LSIO.** 44 | 45 | Let's say the user requests two byte ranges from a single file: `0..4096`, and `4096..8192`. 46 | 47 | Under the hood, LSIO will: 48 | 49 | - Notice that these two byte ranges are consecutive, and merge these two byte ranges into a 50 | single read operation. 51 | - Allocate a single 8,192 byte `AlignedBytesMut`, aligned to 512 bytes. 52 | - Submit a `read` operation to `io_uring` for all 8,192 bytes. 53 | - When the single read op completes, we `freeze` the buffer, which consumes the 54 | `AlignedBytesMut` and returns an `AlignedBytes`, which we then `reset_slice()` to view the 55 | entire 8,192-byte backing buffer. 56 | - Split the `AlignedBytes` into two owned `AlignedBytes`, and return these to the user. 57 | - The underlying buffer will be dropped when the user drops the two `AlignedBytes`.
58 | 59 | Here's a code sketch to show how this works: 60 | 61 | ```rust 62 | use lsio_aligned_bytes::AlignedBytesMut; 63 | 64 | // Allocate a single 8,192 byte `AlignedBytesMut`: 65 | const LEN: usize = 8_192; 66 | const ALIGN: usize = 512; 67 | let mut bytes = AlignedBytesMut::new(LEN, ALIGN); 68 | 69 | // Write into the buffer. (In this toy example, we'll write directly into the buffer. 70 | // But in "real" code, we'd pass the pointer to the operating system, which in turn 71 | // would write data into the buffer for us.) 72 | let ptr = bytes.as_mut_ptr(); 73 | for i in 0..LEN { 74 | unsafe { *ptr.offset(i as isize) = i as u8; } 75 | } 76 | 77 | // Freeze (to get a read-only `AlignedBytes`). We `unwrap` because `freeze` 78 | // will fail if there's more than one `AlignedBytesMut` referencing our backing buffer. 79 | let mut bytes = bytes.freeze().unwrap(); 80 | bytes.reset_slice(); 81 | let expected_byte_string: Vec<u8> = (0..LEN).map(|i| i as u8).collect(); 82 | assert_eq!(bytes.as_slice(), expected_byte_string); 83 | 84 | // Slice the buffer into two new buffers: 85 | let mut buffer_0 = bytes.clone(); 86 | buffer_0.set_slice(0..4_096); 87 | let mut buffer_1 = bytes.clone(); 88 | buffer_1.set_slice(4_096..8_192); 89 | assert_eq!(buffer_0.len(), 4_096); 90 | assert_eq!(buffer_1.len(), 4_096); 91 | assert_eq!(buffer_0.as_slice(), &expected_byte_string[0..4_096]); 92 | assert_eq!(buffer_1.as_slice(), &expected_byte_string[4_096..8_192]); 93 | 94 | // Check that the original `bytes` buffer is still valid: 95 | assert_eq!(bytes.as_slice(), &expected_byte_string); 96 | 97 | // Remove the original `bytes` and check that the two views of the same buffer 98 | // are still valid: 99 | drop(bytes); 100 | assert_eq!(buffer_0.as_slice(), &expected_byte_string[0..4_096]); 101 | assert_eq!(buffer_1.as_slice(), &expected_byte_string[4_096..8_192]); 102 | ``` 103 | 104 | **Use-case 2: The user requests a single 8 GiB file.** 105 | 106 | Linux can't read more than 2 GiB at once[^linux_read]. So we need to read the 8 GiB file in 107 | multiple chunks. 108 | 109 | LSIO will: 110 | - Allocate a single 8 GiB `AlignedBytesMut`. 111 | - Split this into a new 2 GiB `AlignedBytesMut`, leaving the old `AlignedBytesMut` reduced to 6 GiB. 112 | Both of these buffers must have their starts and ends aligned. Then repeat the process to 113 | get a total of 4 x 2 GiB `AlignedBytesMut`s. 114 | - Issue four `read` operations to the OS (one operation per `AlignedBytesMut`). 115 | - When the first, second, and third `read` ops complete, drop their `AlignedBytesMut` 116 | (but that won't drop the underlying storage, it just removes its reference). 117 | - When the last `read` op completes, `freeze` the last `AlignedBytesMut` to get an immutable `AlignedBytes`. 118 | `reset_slice` to get the 8 GiB slice requested by the user. Pass this 8 GiB `AlignedBytes` to the user. 119 | 120 | ```rust 121 | use lsio_aligned_bytes::AlignedBytesMut; 122 | 123 | // Allocate a single array (for this toy example, we'll just allocate 8 MiB, instead of 8 GiB!) 124 | const MiB: usize = 2_usize.pow(20); 125 | const LEN: usize = 8 * MiB; 126 | const ALIGN: usize = 512; 127 | let mut bytes_3 = AlignedBytesMut::new(LEN, ALIGN); 128 | // `bytes_3` will be the last of the four `bytes_*` arrays!
129 | 130 | // Split into a 2 MiB buffer, and a 6 MiB buffer: 131 | let mut bytes_0 = bytes_3.split_to(2 * MiB).unwrap(); 132 | assert_eq!(bytes_0.len(), 2 * MiB); 133 | assert_eq!(bytes_3.len(), 6 * MiB); 134 | 135 | // Continue splitting: 136 | let mut bytes_1 = bytes_3.split_to(4 * MiB).unwrap(); 137 | let mut bytes_2 = bytes_3.split_to(6 * MiB).unwrap(); 138 | assert_eq!(bytes_0.len(), 2 * MiB); 139 | assert_eq!(bytes_1.len(), 2 * MiB); 140 | assert_eq!(bytes_2.len(), 2 * MiB); 141 | assert_eq!(bytes_3.len(), 2 * MiB); 142 | 143 | // Write into the arrays: 144 | // Fill the first 2 MiB with zeros, fill the second 2 MiB with ones, etc. 145 | for i in 0..(2 * MiB) { 146 | unsafe { 147 | *bytes_0.as_mut_ptr().offset(i as isize) = 0; 148 | *bytes_1.as_mut_ptr().offset(i as isize) = 1; 149 | *bytes_2.as_mut_ptr().offset(i as isize) = 2; 150 | *bytes_3.as_mut_ptr().offset(i as isize) = 3; 151 | } 152 | } 153 | 154 | // Drop three of the four AlignedBytesMuts, in preparation for freezing: 155 | drop(bytes_0); 156 | drop(bytes_1); 157 | drop(bytes_2); 158 | 159 | // Needs to be `mut` so we can `reset_slice()`. Doesn't actually mutate the buffer! 160 | let mut bytes = bytes_3.freeze().unwrap(); 161 | bytes.reset_slice(); 162 | 163 | let expected: Vec<u8> = (0..LEN).map(|i| (i / (2 * MiB)) as u8).collect(); 164 | // We use `Iterator::eq` instead of `assert_eq!` to avoid `assert_eq!` printing out 165 | // 16 million numbers if the arrays aren't exactly equal! 166 | assert!(bytes.as_slice().iter().eq(expected.iter())); 167 | 168 | ``` 169 | 170 | [^o_direct]: For more information on `O_DIRECT`, including the memory alignment requirements, 171 | see all the mentions of `O_DIRECT` in the [`open(2)`](https://man7.org/linux/man-pages/man2/open.2.html) man page. 172 | [^linux_read]: Actually, the limit isn't exactly 2 GiB. On Linux, `read` will transfer at most 173 | 2,147,479,552 bytes. See the [`read`](https://man7.org/linux/man-pages/man2/read.2.html) man 174 | page! 175 | 176 | -------------------------------------------------------------------------------- /crates/lsio_aligned_bytes/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(missing_docs)] 2 | #![doc = include_str!("../README.md")] 3 | 4 | use anyhow; 5 | use std::{alloc, ops::Range, slice, sync::Arc}; 6 | 7 | /// A mutable aligned buffer. 8 | #[derive(Debug)] 9 | pub struct AlignedBytesMut { 10 | buf: Arc<InnerBuffer>, 11 | 12 | /// The slice requested by the user. 13 | range: Range<usize>, 14 | } 15 | 16 | unsafe impl Send for AlignedBytesMut {} 17 | unsafe impl Sync for AlignedBytesMut {} 18 | 19 | impl AlignedBytesMut { 20 | /// Creates a new `AlignedBytesMut`. 21 | /// 22 | /// Aligns the start and end of the buffer with `align`. 23 | /// `align` must not be zero, and must be a power of two. 24 | /// `len` is the length of the underlying buffer, in bytes. 25 | pub fn new(len: usize, align: usize) -> Self { 26 | let inner_buf = InnerBuffer::new(len, align); 27 | Self { 28 | buf: Arc::new(inner_buf), 29 | range: 0..len, 30 | } 31 | } 32 | 33 | /// Returns the length of the `range` requested by the user. The `range` is a view into the 34 | /// underlying buffer. The underlying buffer may be larger than `len`. 35 | pub fn len(&self) -> usize { 36 | self.range.len() 37 | } 38 | 39 | /// Returns a mutable pointer to the underlying buffer offset by `self.range.start`.
40 |     pub fn as_mut_ptr(&mut self) -> *mut u8 { 41 | let ptr = self.buf.as_mut_ptr(); 42 | unsafe { ptr.offset(self.range.start as isize) } 43 | } 44 | 45 | /// Split this view of the underlying buffer into two views at the given index. 46 | /// 47 | /// This does not allocate a new buffer. Instead, both `AlignedBytesMut` objects reference 48 | /// the same underlying backing buffer. 49 | /// 50 | /// `idx` indexes into the backing buffer. 51 | /// 52 | /// `idx` must not be zero. `idx` must be exactly divisible by the alignment of the underlying 53 | /// buffer. `idx` must be contained in `self.range`. 54 | /// 55 | /// Afterwards, `self` contains `[idx, range.end)`. The returned `AlignedBytesMut` 56 | /// contains elements `[range.start, idx)`. 57 | /// 58 | /// To show this graphically: 59 | /// 60 | /// Before calling `split_to`: 61 | /// 62 | /// ```text 63 | /// Underlying buffer: 0 1 2 3 4 5 6 7 8 9 64 | /// self.range : [2, 8) 65 | /// ``` 66 | /// 67 | /// After calling `split_to(6)`: 68 | /// 69 | /// ```text 70 | /// Underlying buffer: 0 1 2 3 4 5 6 7 8 9 71 | /// self.range : [6, 8) 72 | /// other.range : [2, 6) 73 | /// ``` 74 | pub fn split_to(&mut self, idx: usize) -> anyhow::Result<Self> { 75 | if !self.range.contains(&idx) { 76 | Err(anyhow::format_err!( 77 | "idx {idx} is not contained in this buffer's range {:?}", 78 | self.range, 79 | )) 80 | } else if idx == 0 { 81 | Err(anyhow::format_err!("idx must not be zero!")) 82 | } else if idx % self.buf.alignment() != 0 { 83 | Err(anyhow::format_err!( 84 | "idx {idx} must be exactly divisible by the alignment {}", 85 | self.buf.alignment() 86 | )) 87 | } else { 88 | let new_range = self.range.start..idx; 89 | self.range.start = idx; 90 | Ok(AlignedBytesMut { 91 | buf: self.buf.clone(), 92 | range: new_range, 93 | }) 94 | } 95 | } 96 | 97 | /// If this is the only `AlignedBytesMut` with access to the underlying buffer 98 | /// then `freeze` consumes `self` and returns a read-only `AlignedBytes` 99 | /// (wrapped in `Ok`), which contains a reference to the underlying buffer, 100 | /// and has its `range` set to the byte range of the `AlignedBytesMut`. 101 | /// If, on the other hand, other `AlignedBytesMut`s have access to 102 | /// the underlying buffer then `freeze` will return `Err(self)`. 103 | pub fn freeze(self) -> Result<AlignedBytes, Self> { 104 | if Arc::strong_count(&self.buf) == 1 { 105 | Ok(AlignedBytes { 106 | buf: self.buf, 107 | range: self.range, 108 | }) 109 | } else { 110 | Err(self) 111 | } 112 | } 113 | } 114 | 115 | /// Immutable. 116 | #[derive(Debug, Clone)] 117 | pub struct AlignedBytes { 118 | buf: Arc<InnerBuffer>, 119 | 120 | /// The slice requested by the user. 121 | range: Range<usize>, 122 | } 123 | 124 | unsafe impl Send for AlignedBytes {} 125 | unsafe impl Sync for AlignedBytes {} 126 | 127 | /// An immutable view of a memory buffer. 128 | /// 129 | /// The only way to make an `AlignedBytes` is by using [`AlignedBytesMut::freeze`]. 130 | impl AlignedBytes { 131 | /// Sets the slice for `self`. 132 | /// 133 | /// The requested `range` indexes into the entire underlying buffer. 134 | /// 135 | /// ## Panics 136 | /// Panics if `range.is_empty()` or if `range.end` > the size of the underlying buffer. 137 | pub fn set_slice(&mut self, range: Range<usize>) -> &Self { 138 | assert!(!range.is_empty()); 139 | assert!(range.end <= self.buf.len()); 140 | self.range = range; 141 | self 142 | } 143 | 144 | /// Resets this `AlignedBytes` range to be equal to the total extent of the underlying buffer.
145 |     pub fn reset_slice(&mut self) -> &Self { 146 | self.range = 0..self.buf.len(); 147 | self 148 | } 149 | 150 | /// Returns the length of the `range` requested by the user. The `range` is a view into the 151 | /// underlying buffer. The underlying buffer may be larger than `len`. 152 | pub fn len(&self) -> usize { 153 | self.range.len() 154 | } 155 | 156 | /// Returns a constant pointer to `self.range.start` of the underlying buffer. 157 | pub fn as_ptr(&self) -> *const u8 { 158 | let ptr = self.buf.as_ptr(); 159 | unsafe { ptr.offset(self.range.start as isize) } 160 | } 161 | 162 | /// Returns an immutable slice of the `range` view of the underlying buffer. 163 | pub fn as_slice(&self) -> &[u8] { 164 | unsafe { slice::from_raw_parts(self.as_ptr(), self.len()) } 165 | } 166 | } 167 | 168 | #[derive(Debug)] 169 | struct InnerBuffer { 170 | buf: *mut u8, // TODO: Replace `*mut u8` with `NonNull`. 171 | 172 | /// `layout.size()` gives the number of bytes _actually_ allocated, which will be 173 | /// a multiple of `align`. 174 | layout: alloc::Layout, 175 | } 176 | 177 | impl InnerBuffer { 178 | fn new(len: usize, align: usize) -> Self { 179 | assert_ne!(len, 0); 180 | let layout = alloc::Layout::from_size_align(len, align) 181 | .expect("failed to create Layout!") 182 | .pad_to_align(); 183 | let buf = unsafe { alloc::alloc(layout) }; 184 | if buf.is_null() { 185 | alloc::handle_alloc_error(layout); 186 | } 187 | Self { buf, layout } 188 | } 189 | 190 | /// Returns the total size of the underlying buffer. 191 | const fn len(&self) -> usize { 192 | self.layout.size() 193 | } 194 | 195 | /// Returns the alignment, in bytes. 196 | const fn alignment(&self) -> usize { 197 | self.layout.align() 198 | } 199 | 200 | fn as_mut_ptr(&self) -> *mut u8 { 201 | self.buf 202 | } 203 | 204 | fn as_ptr(&self) -> *const u8 { 205 | self.buf 206 | } 207 | } 208 | 209 | impl Drop for InnerBuffer { 210 | fn drop(&mut self) { 211 | unsafe { alloc::dealloc(self.buf, self.layout) }; 212 | } 213 | } 214 | 215 | #[cfg(test)] 216 | mod tests { 217 | use super::*; 218 | 219 | #[test] 220 | fn test_write_and_read() { 221 | // Create a new buffer: 222 | const LEN: usize = 16; 223 | let mut aligned_buf1 = AlignedBytesMut::new(LEN, 8); 224 | let mut aligned_buf2 = AlignedBytesMut::new(LEN, 8); 225 | 226 | // Set the values of the buffer: 227 | { 228 | let ptr1 = aligned_buf1.as_mut_ptr(); 229 | let ptr2 = aligned_buf2.as_mut_ptr(); 230 | unsafe { 231 | for i in 0..LEN { 232 | *ptr1.offset(i as _) = i as u8; 233 | *ptr2.offset(i as _) = i as u8; 234 | } 235 | } 236 | } 237 | // Read the values back out: 238 | { 239 | let slice1 = aligned_buf1.freeze().unwrap(); 240 | let slice2 = aligned_buf2.freeze().unwrap(); 241 | for i in 0..LEN { 242 | assert_eq!(slice1.as_slice()[i], i as u8); 243 | assert_eq!(slice2.as_slice()[i], i as u8); 244 | } 245 | assert_eq!( 246 | slice1.as_slice(), 247 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] 248 | ); 249 | } 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /crates/lsio_uring/benches/get.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use light_speed_io::object_store_adapter::ObjectStoreAdapter; 3 | use object_store::{path::Path as ObjectStorePath, ObjectStore}; 4 | use std::{ 5 | ops::Range, 6 | process::Command, 7 | time::{Duration, Instant}, 8 | }; 9 | use tokio::runtime::Runtime; 10 | 11 | const FILE_SIZE_BYTES:
usize = 262_144; 12 | const DATA_PATH: &str = "/mnt/t700-2tb/fio/"; 13 | const RANGE: Range<isize> = 0..(1024 * 16); 14 | 15 | async fn uring_get(filenames: &Vec<ObjectStorePath>, n_iterations: u64) -> Duration { 16 | let mut total_time = Duration::ZERO; 17 | for _ in 0..n_iterations { 18 | // Setup (not timed): 19 | let store = ObjectStoreAdapter::default(); 20 | clear_page_cache(); 21 | let mut futures = Vec::with_capacity(filenames.len()); 22 | 23 | // Timed code: 24 | let start_of_iter = Instant::now(); 25 | for filename in filenames { 26 | futures.push(store.get(filename)); 27 | } 28 | for f in futures { 29 | let b = f.await.expect("At least one Result was an Error"); 30 | assert_eq!(b.as_slice().len(), FILE_SIZE_BYTES); 31 | } 32 | total_time += start_of_iter.elapsed(); 33 | } 34 | total_time 35 | } 36 | 37 | async fn uring_get_range(filenames: &Vec<ObjectStorePath>, n_iterations: u64) -> Duration { 38 | let mut total_time = Duration::ZERO; 39 | for _ in 0..n_iterations { 40 | // Setup (not timed): 41 | let store = ObjectStoreAdapter::default(); 42 | clear_page_cache(); 43 | let mut futures = Vec::with_capacity(filenames.len()); 44 | 45 | // Timed code: 46 | let start_of_iter = Instant::now(); 47 | for filename in filenames { 48 | futures.push(store.get_range(filename, RANGE)); 49 | } 50 | for f in futures { 51 | let b = f.await.expect("At least one Result was an Error"); 52 | assert_eq!(b.as_slice().len(), RANGE.len()); 53 | } 54 | total_time += start_of_iter.elapsed(); 55 | } 56 | total_time 57 | } 58 | 59 | async fn local_file_system_get(filenames: &Vec<ObjectStorePath>, n_iterations: u64) -> Duration { 60 | // Unfortunately, I can't find a better way to share code between `load_files_with_io_uring_local` 61 | // and `load_files_with_local_file_system` because `ObjectStoreAdapter` doesn't yet `impl ObjectStore`. 62 | // And `ObjectStoreAdapter::get` and `LocalFileSystem::get` return slightly different types. 63 | // TODO: Reduce duplication if/when `ObjectStoreAdapter` implements `ObjectStore`. 64 | 65 | let mut total_time = Duration::ZERO; 66 | for _ in 0..n_iterations { 67 | // Setup (not timed): 68 | clear_page_cache(); 69 | let mut handles = Vec::with_capacity(filenames.len()); 70 | 71 | // Timed code: 72 | let start_of_iter = Instant::now(); 73 | for filename in filenames { 74 | let filename = filename.clone(); 75 | handles.push(tokio::spawn(async move { 76 | // We can't create the `store` outside of `spawn` and move it into `spawn`. 77 | // So we have to create the `store` _inside_ this `async` block. 78 | let store = object_store::local::LocalFileSystem::default(); 79 | let result = store.get(&filename).await.unwrap(); 80 | result.bytes().await 81 | })); 82 | } 83 | 84 | for h in handles { 85 | let bytes = h.await.unwrap().unwrap(); 86 | assert_eq!(bytes.len(), FILE_SIZE_BYTES); 87 | } 88 | 89 | total_time += start_of_iter.elapsed(); 90 | } 91 | total_time 92 | } 93 | 94 | async fn local_file_system_get_range( 95 | filenames: &Vec<ObjectStorePath>, 96 | n_iterations: u64, 97 | ) -> Duration { 98 | // Unfortunately, I can't find a better way to share code between `load_files_with_io_uring_local` 99 | // and `load_files_with_local_file_system` because `ObjectStoreAdapter` doesn't yet `impl ObjectStore`. 100 | // And `ObjectStoreAdapter::get` and `LocalFileSystem::get` return slightly different types. 101 | // TODO: Reduce duplication if/when `ObjectStoreAdapter` implements `ObjectStore`.
102 | 103 |     const RANGE_USIZE: Range<usize> = Range { 104 | start: RANGE.start as usize, 105 | end: RANGE.end as usize, 106 | }; 107 | 108 | let mut total_time = Duration::ZERO; 109 | for _ in 0..n_iterations { 110 | // Setup (not timed): 111 | clear_page_cache(); 112 | let mut handles = Vec::with_capacity(filenames.len()); 113 | 114 | // Timed code: 115 | let start_of_iter = Instant::now(); 116 | for filename in filenames { 117 | let filename = filename.clone(); 118 | handles.push(tokio::spawn(async move { 119 | // We can't create the `store` outside of `spawn` and move it into `spawn`. 120 | // So we have to create the `store` _inside_ this `async` block. 121 | let store = object_store::local::LocalFileSystem::default(); 122 | store.get_range(&filename, RANGE_USIZE).await.unwrap() 123 | })); 124 | } 125 | 126 | for h in handles { 127 | let bytes = h.await.unwrap(); 128 | assert_eq!(bytes.len(), RANGE.len()); 129 | } 130 | 131 | total_time += start_of_iter.elapsed(); 132 | } 133 | total_time 134 | } 135 | 136 | fn bench_get(c: &mut Criterion) { 137 | const N_FILES: usize = 1000; 138 | 139 | // Configure group: 140 | let mut group = c.benchmark_group(format!("get_{N_FILES}_whole_files")); 141 | group.sample_size(10); 142 | group.warm_up_time(Duration::from_millis(2000)); 143 | group.throughput(criterion::Throughput::Bytes( 144 | (FILE_SIZE_BYTES * N_FILES) as u64, 145 | )); 146 | 147 | let filenames = get_filenames(N_FILES); 148 | 149 | // Run function: 150 | group.bench_function("uring_get", |b| { 151 | // Insert a call to `to_async` to convert the bencher to async mode. 152 | // The timing loops are the same as with the normal bencher. 153 | b.to_async(Runtime::new().unwrap()) 154 | .iter_custom(|n_iterations| uring_get(&filenames, n_iterations)); 155 | }); 156 | 157 | // Run function: 158 | group.bench_function("local_file_system_get", |b| { 159 | // Insert a call to `to_async` to convert the bencher to async mode. 160 | // The timing loops are the same as with the normal bencher. 161 | b.to_async(Runtime::new().unwrap()) 162 | .iter_custom(|n_iterations| local_file_system_get(&filenames, n_iterations)); 163 | }); 164 | 165 | group.finish(); 166 | } 167 | 168 | fn bench_get_range(c: &mut Criterion) { 169 | const N_FILES: usize = 1000; 170 | 171 | // Configure group: 172 | let mut group = c.benchmark_group(format!("get_{}_bytes_from_{N_FILES}_files", RANGE.len())); 173 | group.sample_size(10); 174 | group.warm_up_time(Duration::from_millis(2000)); 175 | group.throughput(criterion::Throughput::Bytes((RANGE.len() * N_FILES) as u64)); 176 | 177 | let filenames = get_filenames(N_FILES); 178 | 179 | // Run function: 180 | group.bench_function("uring_get_range", |b| { 181 | // Insert a call to `to_async` to convert the bencher to async mode. 182 | // The timing loops are the same as with the normal bencher. 183 | b.to_async(Runtime::new().unwrap()) 184 | .iter_custom(|n_iterations| uring_get_range(&filenames, n_iterations)); 185 | }); 186 | 187 | // Run function: 188 | group.bench_function("local_file_system_get_range", |b| { 189 | // Insert a call to `to_async` to convert the bencher to async mode. 190 | // The timing loops are the same as with the normal bencher.
191 |         b.to_async(Runtime::new().unwrap()) 192 | .iter_custom(|n_iterations| local_file_system_get_range(&filenames, n_iterations)); 193 | }); 194 | 195 | group.finish(); 196 | } 197 | 198 | criterion_group!(benches, bench_get, bench_get_range); 199 | criterion_main!(benches); 200 | 201 | fn clear_page_cache() { 202 | let _ = Command::new("vmtouch") 203 | .arg("-e") 204 | .arg(DATA_PATH) 205 | .output() 206 | .expect("vmtouch failed to start"); 207 | 208 | // let _ = Command::new("sudo") 209 | // .arg("sysctl") 210 | // .arg("-w") 211 | // .arg("vm.drop_caches=3") 212 | // .output() 213 | // .expect("sudo sysctl failed to start"); 214 | } 215 | 216 | fn get_filenames(n: usize) -> Vec<ObjectStorePath> { 217 | // Create a vector of filenames (files created by `fio`) 218 | (0..n) 219 | .map(|i| { 220 | ObjectStorePath::from(format!( 221 | "//{DATA_PATH}sequential_read_1000_files_each_256KiB.0.{i}" 222 | )) 223 | }) 224 | .collect() 225 | } 226 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Light Speed IO (LSIO) 2 | 3 | > [!WARNING] 4 | > I've paused development on LSIO for now because it's clear that, for most users and most datasets in the world of weather forecasting, the data will increasingly be stored on cloud object storage, not local storage. And LSIO was mostly focused on speeding up local storage. 5 | > In its current state, LSIO is a very minimal proof-of-concept that `io_uring` is faster than `object_store` when reading many small chunks of files from local PCIe 5 SSDs on Linux. There is no Python API yet. 6 | 7 | The ultimate ambition is to enable folks to efficiently load and process large, multi-dimensional datasets as fast as modern CPUs & I/O subsystems will allow. 8 | 9 | For now, this repo is just a place for me to tinker with ideas. 10 | 11 | Under the hood, `light-speed-io` uses [`io_uring`](https://kernel.dk/io_uring.pdf) on Linux for local files, and will use [`object_store`](https://lib.rs/crates/object_store) for all other data I/O. 12 | 13 | My first use-case for light-speed-io is to help to speed up reading [Zarr](https://zarr.dev/). After that, I'm interested in helping to create fast readers for "native" geospatial file formats like GRIB2 and EUMETSAT native files. And, even further than that, I'm interested in efficient & fast _computation_ on [out-of-core](https://en.wikipedia.org/w/index.php?title=Out-of-core), chunked, labelled, multi-dimensional data. 14 | 15 | See [`planned_design.md`](planned_design.md) for more info on the planned design. And please see [this blogpost](https://jack-kelly.com/blog/2023-07-28-speeding-up-zarr) for my motivations for wanting to help speed up Zarr. 16 | 17 | # Benchmarks 18 | 19 | Hardware: 20 | - CPU: AMD [Epyc 9124](https://en.wikichip.org/wiki/amd/epyc/9124) (16 physical cores, 64 MB L3, 3 GHz base, Zen 4) 21 | - PCIe 5 SSD for benchmarking (1 TB Crucial T700 with heatsink) 22 | - 64 GB RAM (4 sticks of Kingston 16 GB DDR5 Pro RDIMM 4800 MT/s) 23 | 24 | Task: 25 | - Read data from 500 files, each file is 41,214,400 bytes, using a block size of 262,144 bytes. 26 | 27 | Results: 28 | - `object_store`: 6.045 GiB/s 29 | - `LSIO` (using 8 worker threads): 11.2 GiB/s 30 | 31 | See [this issue](https://github.com/JackKelly/light-speed-io/issues/50#issuecomment-1992230414) for more details of the benchmarking. 32 | 33 | # Roadmap 34 | 35 | (This will almost certainly change!)
36 | 37 | The list below is in (rough) chronological order. This roadmap is also represented in the [GitHub milestones for this project, when sorted alphabetically](https://github.com/JackKelly/light-speed-io/milestones?direction=asc&sort=title&state=open). 38 | 39 | ### Throw-away prototype 40 | - [x] Initial prototype where a single crate does the IO and compute 41 | - [x] `io_uring` prototype 42 | - [x] `io_uring` prototype using `Rayon` to loop through io_uring completion queue 43 | - [x] `io_uring` async/await implementation with `object_store`-like API 44 | - [x] Try mixing `Tokio` with `Rayon` 45 | - [x] Don't initialise buffers 46 | - [x] Use aligned buffers and `O_DIRECT` 47 | - [x] Benchmark against `object_store` using `criterion` 48 | - [x] Chain open, read, close ops in `io_uring` 49 | - [x] Build new workstation (with PCIe5 SSD) 50 | - [x] Try using trait objects vs enum vs `Box::into_raw` for tracking in-flight operations 51 | - [x] Try using fixed (registered) file descriptors 52 | - [x] Try using `Rayon` for the IO threadpool 53 | - [x] Investigate Rust's `Stream`. 54 | 55 | ### Fresh start. Laying the foundations. New crates: 56 | - [x] `lsio_aligned_bytes`: Shareable buffer which can be aligned to arbitrary boundaries at runtime 57 | - [x] `lsio_threadpool`: Work-stealing threadpool (based on `crossbeam-deque`) 58 | - [x] `lsio_io`: Traits for all LSIO IO backends 59 | 60 | ### MVP IO layer 61 | - [x] Implement minimal `lsio_uring` IO backend (for loading data from a local SSD) with user-defined number of worker threads 62 | - [x] [Benchmark `lsio_uring` backend](https://github.com/JackKelly/light-speed-io/milestone/3) 63 | - [ ] [Implement minimal `lsio_object_store_bridge` IO backend](https://github.com/JackKelly/light-speed-io/milestone/4) 64 | - [ ] [Compare benchmarks for `lsio_uring` vs `lsio_object_store_bridge`](https://github.com/JackKelly/light-speed-io/milestone/7) 65 | - [ ] [Improve usability and robustness](https://github.com/JackKelly/light-speed-io/milestone/8) 66 | - [ ] [Group operations](https://github.com/JackKelly/light-speed-io/milestone/9) 67 | 68 | ### MVP Compute layer 69 | - [ ] Build a general-purpose work-stealing framework for applying arbitrary functions to chunks of data in parallel. And respect groups. 70 | - [ ] Wrap a few decompression algorithms 71 | 72 | ### MVP File format layer: Read from Zarr 73 | - [ ] MVP Zarr library (just for _reading_ data) 74 | - [ ] Python API for `lsio_zarr` 75 | - [ ] Benchmark `lsio_zarr` vs `zarr-python v3` (from Python) 76 | 77 | ### Improve the IO layer: 78 | - [ ] Optimise (merge and split) IO operations 79 | 80 | ### Improve the compute layer 81 | - [ ] Investigate how xarray can "push down" chunkwise computation to LSIO 82 | 83 | ### MVP End-user applications! 84 | - [ ] Compute simple stats of a large dataset (to see if we hit our target of processing 1 TB per 5 mins on a laptop!) 85 | - [ ] Load Zarr into a PyTorch training pipeline 86 | - [ ] Implement merging multiple datasets on-the-fly (e.g. NWP and satellite). 87 | 88 | ### First release! 89 | - [ ] Docs; GitHub actions for Python releases; more rigorous automated testing; etc. 90 | - [ ] Release! 91 | - [ ] Enable Zarr-Python to use LSIO as a storage and codec pipeline?
92 | 93 | ### Implement writing 94 | - [ ] Implement writing using `lsio_uring` 95 | - [ ] Implement writing using `lsio_object_store_bridge` 96 | - [ ] Implement writing in `lsio_zarr` 97 | 98 | ### Improve IO: 99 | - [ ] [Speed up reading from cloud storage buckets](https://github.com/JackKelly/light-speed-io/issues/10) (using object_store) 100 | - [ ] Maybe experiment with [using io_uring for reading from cloud storage buckets](https://github.com/JackKelly/light-speed-io/issues/10#issuecomment-2178689758) 101 | - [ ] Re-use IO buffers 102 | - [ ] Register buffers with `io_uring` 103 | - [ ] Python API for LSIO's IO layer (and LSIO's compute layer?) 104 | 105 | ### Improve the file formats layer: Add GRIB support??? 106 | (Although maybe this won't be necessary because [dynamical.org](https://dynamical.org) are converting datasets to Zarr) 107 | - [ ] Implement simple GRIB reader? 108 | - [ ] Convert GRIB to Zarr? 109 | - [ ] Load GRIB into a PyTorch training pipeline? 110 | 111 | ### Grow the team? (Only if the preceding work has shown promise) 112 | - [ ] Try to raise grant funding? 113 | - [ ] Hire??? 114 | 115 | ### Future work (in no particular order, and no promise any of these will be done!) 116 | - [ ] [Multi-dataset abstraction layer](https://github.com/JackKelly/light-speed-io/issues/142) (under the hood, the same data would be chunked differently for different use-cases. But that complexity would be hidden from users. Users would just interact with a single "logical dataset".) 117 | - [ ] Allow xarray to "push down" all its operations to LSIO 118 | - [ ] xarray-like data structures implemented in Rust? ([notes](https://docs.google.com/document/d/1_T0ay9wXozgqq334E2w1SROdlAM7y6JSgL1rmXJnIO0/edit#heading=h.7ctns22vpab5)) 119 | - [ ] Fast indexing operations for xarray ([notes](https://docs.google.com/document/d/1_T0ay9wXozgqq334E2w1SROdlAM7y6JSgL1rmXJnIO0/edit#heading=h.kjphntldyaaw)) 120 | - [ ] Support for kerchunk / [VirtualiZarr](https://discourse.pangeo.io/t/pangeo-showcase-virtualizarr-create-virtual-zarr-stores-using-xarray-syntax/4127) / [Zarr Manifest Storage Transformer](https://github.com/zarr-developers/zarr-specs/issues/287) 121 | - [ ] Compute using SIMD / NPUs / GPUs, perhaps using [Bend](https://github.com/JackKelly/light-speed-io/issues/132) / [Mojo](https://github.com/JackKelly/light-speed-io/discussions/12) 122 | - [ ] Support many compression algorithms 123 | - [ ] Automatically tune performance 124 | - [ ] "Smart" scheduling of compute and IO (see [notes](https://docs.google.com/document/d/1_T0ay9wXozgqq334E2w1SROdlAM7y6JSgL1rmXJnIO0/edit#heading=h.bqhd2mq9o42t)) 125 | - [ ] Tile-based algorithms for numpy 126 | - [ ] EUMETSAT Native file format 127 | - [ ] NetCDF 128 | - [ ] Warping / spatial reprojection 129 | - [ ] Rechunking Zarr 130 | - [ ] Converting between formats (e.g. convert EUMETSAT `.nat` files to 10-bit per channel bit-packed Zarr). If there's no computation to be done on the data during conversion then do all the copying with `io_uring`: open source file -> read chunks from source -> write to destination -> etc. 
131 | - [ ] Write a wiki (or a book) on high-performance multi-dimensional data IO and compute 132 | - [ ] Integrate with Dask to run tasks across many machines 133 | - [ ] Use LSIO as the storage and compute backend for other software packages 134 | 135 | # Project structure 136 | 137 | Light Speed IO is organised as a [Cargo workspace](https://doc.rust-lang.org/book/ch14-03-cargo-workspaces.html) with multiple ([small](https://rust-unofficial.github.io/patterns/patterns/structural/small-crates.html)) crates. The crates are organised in a [flat crate structure](https://matklad.github.io/2021/08/22/large-rust-workspaces.html). The flat crate structure is used by projects such as [Ruff](https://github.com/astral-sh/ruff), [Polars](https://github.com/pola-rs/polars), and [rust-analyser](https://github.com/rust-lang/rust-analyzer). 138 | 139 | LSIO crate names use snake_case, following in the footsteps of the [Rust Book](https://doc.rust-lang.org/book/ch14-03-cargo-workspaces.html) and [Ruff](https://github.com/astral-sh/ruff/tree/main/crates). (The choice of snake_case versus hyphens is, as far as I can tell, entirely arbitrary: [Polars](https://github.com/pola-rs/polars/tree/main/crates) and [rust-analyser](https://github.com/rust-lang/rust-analyzer/tree/master/crates) both use hyphens. I just prefer the look of underscores!) 140 | -------------------------------------------------------------------------------- /crates/lsio_threadpool/src/threadpool.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | sync::{ 3 | atomic::{AtomicBool, Ordering::Relaxed}, 4 | mpsc::{self}, 5 | Arc, 6 | }, 7 | thread::{self, JoinHandle}, 8 | }; 9 | 10 | use crossbeam_deque as deque; 11 | 12 | use crate::{ 13 | park_manager::{ParkManager, ParkManagerCommand}, 14 | shared_state::SharedState, 15 | worker::WorkerThread, 16 | }; 17 | 18 | /// Manages a pool of worker threads. Each worker thread runs a clone of a user-supplied closure. 19 | #[derive(Debug)] 20 | pub struct ThreadPool<T> 21 | where 22 | T: Send, 23 | { 24 | worker_thread_handles: Vec<JoinHandle<()>>, 25 | park_manager_thread_handle: Option<JoinHandle<()>>, 26 | shared: SharedState<T>, 27 | } 28 | 29 | impl<T> ThreadPool<T> 30 | where 31 | T: Send + 'static, 32 | { 33 | /// Starts a new threadpool with `n_worker_threads` threads and runs a clone of `op` on 34 | /// each thread. `op` takes one argument: a [`WorkerThread`] which provides helpful methods 35 | /// for the operation. Worker threads will shut down when the `ThreadPool` goes out of scope. 36 | /// 37 | /// `new` also starts a separate thread which is responsible for tracking parked threads. 38 | /// 39 | /// Note that the `'static` lifetime constraint for `OP` basically just means that `op` can't 40 | /// capture any non-`'static` references. It's perfectly fine for `op` to capture owned types 41 | /// (such as `Vec`), as long as those owned types don't include any non-`'static` references. 42 | /// 43 | /// Typically, `op` will begin with any necessary setup (e.g. instantiating objects that will 44 | /// live for the lifetime of the thread) and will then enter a loop, something like: 45 | /// 46 | /// ``` 47 | /// use lsio_threadpool::ThreadPool; 48 | /// const N_THREADS: usize = 4; 49 | /// let pool = ThreadPool::new(N_THREADS, |worker_thread| { 50 | ///     /* Optional: Configure per-thread state.
*/ 51 | 52 |     /// while worker_thread.keep_running() { 53 | /// match worker_thread.find_task() { 54 | /// Some(task) => { 55 | /// process_task(task); 56 | /// // Optionally submit new tasks: 57 | /// // `worker_thread.push(new_task)` 58 | /// // Tasks might be "stolen" by other threads. 59 | /// }, 60 | /// 61 | /// // Park the thread. `worker_thread.park` automatically registers this worker 62 | /// // thread with the `ParkManager`. The thread will be automatically unparked if 63 | /// // necessary. As a user of the `lsio_threadpool` library, you don't have to 64 | /// // unpark threads manually! 65 | /// None => worker_thread.park(), 66 | /// } 67 | /// } 68 | /// }); 69 | /// 70 | /// fn process_task(task: u8) { 71 | /// /* do something */ 72 | /// } 73 | /// 74 | /// // `pool`'s threads will be shut down (by `ThreadPool` setting 75 | /// // `worker_thread.keep_running()` to false) when `pool` goes out of scope. 76 | /// ``` 77 | /// 78 | pub fn new<OP>(n_worker_threads: usize, op: OP) -> Self 79 | where 80 | OP: Fn(WorkerThread<T>) + Send + Clone + 'static, 81 | { 82 | let (chan_to_park_manager, rx_for_park_manager) = mpsc::channel(); 83 | let shared = SharedState { 84 | injector: Arc::new(deque::Injector::new()), 85 | keep_running: Arc::new(AtomicBool::new(true)), 86 | chan_to_park_manager, 87 | at_least_one_thread_is_parked: Arc::new(AtomicBool::new(false)), 88 | }; 89 | 90 | // Spawn ParkManager thread: 91 | let park_manager_thread_handle = Some(ParkManager::start( 92 | rx_for_park_manager, 93 | Arc::clone(&shared.at_least_one_thread_is_parked), 94 | n_worker_threads, 95 | )); 96 | 97 | // Create work stealing queues: 98 | let mut local_queues: Vec<deque::Worker<T>> = (0..n_worker_threads) 99 | .map(|_| deque::Worker::new_fifo()) 100 | .collect(); 101 | let stealers: Arc<Vec<deque::Stealer<T>>> = Arc::new( 102 | local_queues 103 | .iter() 104 | .map(|local_queue| local_queue.stealer()) 105 | .collect(), 106 | ); 107 | 108 | // Spawn worker threads: 109 | let worker_thread_handles = (0..n_worker_threads) 110 | .map(|_| { 111 | let work_stealer = WorkerThread::new( 112 | shared.clone(), 113 | local_queues.pop().unwrap(), 114 | Arc::clone(&stealers), 115 | ); 116 | 117 | let op_clone = op.clone(); 118 | thread::spawn(move || (op_clone)(work_stealer)) 119 | }) 120 | .collect(); 121 | 122 | Self { 123 | worker_thread_handles, 124 | park_manager_thread_handle, 125 | shared, 126 | } 127 | } 128 | 129 | /// Push a task from outside the threadpool into the global 130 | /// "[injector](crossbeam_deque::Injector)" queue. 131 | /// This is how users of `ThreadPool` submit tasks to the threadpool. 132 | /// 133 | /// `push` will automatically unpark worker threads if necessary. 134 | pub fn push(&self, task: T) { 135 | self.shared.injector.push(task); 136 | self.shared.unpark_at_most_n_threads(1); 137 | } 138 | } 139 | 140 | impl<T> Drop for ThreadPool<T> 141 | where 142 | T: Send, 143 | { 144 | fn drop(&mut self) { 145 | // Stop and join the worker threads: 146 | self.shared.keep_running.store(false, Relaxed); 147 | for handle in self.worker_thread_handles.drain(..)
    ///
    /// Typically, `op` will begin with any necessary setup (e.g. instantiating objects that will
    /// live for the lifetime of the thread) and will then enter a loop, something like:
    ///
    /// ```
    /// use lsio_threadpool::ThreadPool;
    /// const N_THREADS: usize = 4;
    /// let pool = ThreadPool::new(N_THREADS, |worker_thread| {
    ///     /* Optional: Configure per-thread state. */
    ///
    ///     while worker_thread.keep_running() {
    ///         match worker_thread.find_task() {
    ///             Some(task) => {
    ///                 process_task(task);
    ///                 // Optionally submit new tasks:
    ///                 // `worker_thread.push(new_task)`
    ///                 // Tasks might be "stolen" by other threads.
    ///             },
    ///
    ///             // Park the thread. `worker_thread.park` automatically registers this worker
    ///             // thread with the `ParkManager`. The thread will be automatically unparked if
    ///             // necessary. As a user of the `lsio_threadpool` library, you don't have to
    ///             // unpark threads manually!
    ///             None => worker_thread.park(),
    ///         }
    ///     }
    /// });
    ///
    /// fn process_task(task: u8) {
    ///     /* do something */
    /// }
    ///
    /// // `pool`'s threads will be shut down (by `ThreadPool` setting
    /// // `worker_thread.keep_running()` to false) when `pool` goes out of scope.
    /// ```
    pub fn new<OP>(n_worker_threads: usize, op: OP) -> Self
    where
        OP: Fn(WorkerThread<T>) + Send + Clone + 'static,
    {
        let (chan_to_park_manager, rx_for_park_manager) = mpsc::channel();
        let shared = SharedState {
            injector: Arc::new(deque::Injector::new()),
            keep_running: Arc::new(AtomicBool::new(true)),
            chan_to_park_manager,
            at_least_one_thread_is_parked: Arc::new(AtomicBool::new(false)),
        };

        // Spawn ParkManager thread:
        let park_manager_thread_handle = Some(ParkManager::start(
            rx_for_park_manager,
            Arc::clone(&shared.at_least_one_thread_is_parked),
            n_worker_threads,
        ));

        // Create work stealing queues:
        let mut local_queues: Vec<deque::Worker<T>> = (0..n_worker_threads)
            .map(|_| deque::Worker::new_fifo())
            .collect();
        let stealers: Arc<Vec<deque::Stealer<T>>> = Arc::new(
            local_queues
                .iter()
                .map(|local_queue| local_queue.stealer())
                .collect(),
        );

        // Spawn worker threads:
        let worker_thread_handles = (0..n_worker_threads)
            .map(|_| {
                let work_stealer = WorkerThread::new(
                    shared.clone(),
                    local_queues.pop().unwrap(),
                    Arc::clone(&stealers),
                );

                let op_clone = op.clone();
                thread::spawn(move || (op_clone)(work_stealer))
            })
            .collect();

        Self {
            worker_thread_handles,
            park_manager_thread_handle,
            shared,
        }
    }

    /// Push a task from outside the threadpool into the global
    /// "[injector](crossbeam_deque::Injector)" queue.
    /// This is how users of `ThreadPool` submit tasks to the threadpool.
    ///
    /// `push` will automatically unpark worker threads if necessary.
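    ///
    /// A minimal sketch of the producer side (the `usize` task type is purely illustrative):
    ///
    /// ```
    /// # use lsio_threadpool::{ThreadPool, WorkerThread};
    /// # let pool = ThreadPool::new(2, |worker_thread: WorkerThread<usize>| {
    /// #     while worker_thread.keep_running() {
    /// #         match worker_thread.find_task() {
    /// #             Some(_task) => { /* process the task */ }
    /// #             None => worker_thread.park(),
    /// #         }
    /// #     }
    /// # });
    /// for task in 0..8_usize {
    ///     pool.push(task);
    /// }
    /// ```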
    pub fn push(&self, task: T) {
        self.shared.injector.push(task);
        self.shared.unpark_at_most_n_threads(1);
    }
}

impl<T> Drop for ThreadPool<T>
where
    T: Send,
{
    fn drop(&mut self) {
        // Stop and join the worker threads. We unpark each thread *after* setting
        // `keep_running` to false, so parked threads wake up, re-check the flag, and exit:
        self.shared.keep_running.store(false, Relaxed);
        for handle in self.worker_thread_handles.drain(..) {
            handle.thread().unpark();
            handle
                .join()
                .unwrap_or_else(|e| println!("A worker thread panicked: {e:?}"));
        }

        // Stop and join the ParkManager:
        self.shared
            .chan_to_park_manager
            .send(ParkManagerCommand::Stop)
            .unwrap();
        self.park_manager_thread_handle
            .take()
            .unwrap()
            .join()
            .unwrap();
    }
}

#[cfg(test)]
mod tests {
    use std::{
        collections::HashMap,
        sync::{mpsc::TryRecvError, Mutex},
        thread::ThreadId,
        time::Duration,
    };

    use super::*;

    fn add_one_to_hash(hash: &Arc<Mutex<HashMap<ThreadId, usize>>>) {
        let mut log = hash.lock().unwrap();
        log.entry(thread::current().id())
            .and_modify(|count| *count += 1)
            .or_insert(1);
    }

    #[test]
    fn test_threadpool() {
        const N_THREADS: usize = 4;
        const MULTIPLIER: usize = 8;
        const N_TASKS: usize = N_THREADS * MULTIPLIER;

        let (output_tx, output_rx) = mpsc::channel::<usize>();

        // This HashMap maps from ThreadId to the number of times that thread gets Some(task).
        let n_tasks_per_thread = Arc::new(Mutex::new(HashMap::new()));

        // This HashMap maps from ThreadId to the number of times that thread has parked.
        let n_parks_per_thread = Arc::new(Mutex::new(HashMap::new()));

        let pool = ThreadPool::new(N_THREADS, {
            let n_tasks_per_thread = Arc::clone(&n_tasks_per_thread);
            let n_parks_per_thread = Arc::clone(&n_parks_per_thread);
            move |worker_thread: WorkerThread<usize>| {
                while worker_thread.keep_running() {
                    match worker_thread.find_task() {
                        Some(task) => {
                            output_tx.send(task).unwrap();
                            add_one_to_hash(&n_tasks_per_thread);
                            // Give other threads a chance to do work. Without this `sleep`,
                            // one thread tends to do the majority of the work!
                            thread::sleep(Duration::from_micros(1));
                        }
                        None => {
                            add_one_to_hash(&n_parks_per_thread);
                            worker_thread.park();
                        }
                    };
                }
            }
        });

        // Push tasks onto the global injector queue:
        for i in 0..N_TASKS {
            if i % N_THREADS == 0 {
                // Wait a moment to let the worker threads park, to check they wake up again!
                // Also wait at the start, to let the worker threads "come up".
                thread::sleep(Duration::from_millis(10));
            }
            pool.push(i);
        }

        // Collect the outputs, and stop the work once all the outputs have arrived:
        let mut outputs: Vec<usize> = output_rx.iter().take(N_TASKS).collect();
        outputs.sort();
        assert!(outputs.into_iter().eq(0..N_TASKS));
        assert!(matches!(
            output_rx.try_recv().unwrap_err(),
            TryRecvError::Empty
        ));
        drop(pool);
        assert!(matches!(
            output_rx.try_recv().unwrap_err(),
            TryRecvError::Disconnected
        ));

        // Check the n_tasks_per_thread and n_parks_per_thread statistics:
        let unwrap_and_check_len = |log: Arc<Mutex<HashMap<ThreadId, usize>>>| {
            let log = Mutex::into_inner(Arc::into_inner(log).unwrap()).unwrap();
            assert_eq!(log.len(), N_THREADS);
            log
        };
        let n_tasks_per_thread = unwrap_and_check_len(n_tasks_per_thread);
        let n_parks_per_thread = unwrap_and_check_len(n_parks_per_thread);

        const MIN_TASKS_PER_THREAD: usize = 2;
        for (thread_id, n_tasks) in n_tasks_per_thread.iter() {
            assert!(
                *n_tasks >= MIN_TASKS_PER_THREAD,
                "{thread_id:?} only did {n_tasks} tasks, which is below the threshold of {MIN_TASKS_PER_THREAD} tasks!"
            );
        }
        for (thread_id, n_parks) in n_parks_per_thread.iter() {
            assert!(
                *n_parks == MULTIPLIER || *n_parks == MULTIPLIER + 1,
                "{thread_id:?} did not park the correct number of times!"
            );
        }
    }
}
--------------------------------------------------------------------------------