├── crates
    ├── lsio_bench
    │   ├── README.md
    │   ├── Cargo.toml
    │   └── src
    │   │   └── main.rs
    ├── lsio_io
    │   ├── README.md
    │   ├── Cargo.toml
    │   └── src
    │   │   └── lib.rs
    ├── lsio_uring
    │   ├── README.md
    │   ├── src
    │   │   ├── lib.rs
    │   │   ├── opcode.rs
    │   │   ├── close.rs
    │   │   ├── io_uring.rs
    │   │   ├── user_data.rs
    │   │   ├── open_file.rs
    │   │   ├── get_range.rs
    │   │   ├── tracker.rs
    │   │   ├── operation.rs
    │   │   ├── get_ranges.rs
    │   │   ├── sqe.rs
    │   │   └── worker.rs
    │   ├── benches
    │   │   ├── fio.ini
    │   │   └── get.rs
    │   ├── Cargo.toml
    │   └── tests
    │   │   └── integration_test.rs
    ├── lsio_threadpool
    │   ├── src
    │   │   ├── lib.rs
    │   │   ├── shared_state.rs
    │   │   ├── park_manager.rs
    │   │   ├── worker.rs
    │   │   └── threadpool.rs
    │   ├── Cargo.toml
    │   └── README.md
    └── lsio_aligned_bytes
    │   ├── Cargo.toml
    │   ├── README.md
    │   └── src
    │       └── lib.rs
├── Cargo.toml
├── LICENSE
├── .gitignore
├── planned_design.md
└── README.md


/crates/lsio_bench/README.md:
--------------------------------------------------------------------------------
1 | Benchmark LSIO.
2 | 
3 | 


--------------------------------------------------------------------------------
/crates/lsio_io/README.md:
--------------------------------------------------------------------------------
1 | Provides a common framework for all LSIO IO backends.
2 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/README.md:
--------------------------------------------------------------------------------
1 | LSIO's IO backend for [io_uring](https://en.wikipedia.org/wiki/Io_uring).
2 | 
3 | 


--------------------------------------------------------------------------------
/crates/lsio_threadpool/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![doc = include_str!("../README.md")]
 2 | 
 3 | mod park_manager;
 4 | mod shared_state;
 5 | mod threadpool;
 6 | mod worker;
 7 | 
 8 | pub use threadpool::ThreadPool;
 9 | pub use worker::WorkerThread;
10 | 


--------------------------------------------------------------------------------
/crates/lsio_threadpool/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsio_threadpool"
 3 | version = "0.0.0"
 4 | publish = false
 5 | edition.workspace = true
 6 | license.workspace = true
 7 | homepage.workspace = true
 8 | repository.workspace = true
 9 | readme = "README.md"
10 | authors.workspace = true
11 | 
12 | [dependencies]
13 | crossbeam-deque.workspace = true
14 | 


--------------------------------------------------------------------------------
/crates/lsio_threadpool/README.md:
--------------------------------------------------------------------------------
1 | `lsio_threadpool` provides a simple [work stealing](https://en.wikipedia.org/wiki/Work_stealing) threadpool.
2 | 
3 | `lsio_threadpool` is a fairly minimal wrapper around [`crossbeam_deque`]. The vast bulk of the fiddly, low-level implementation of work stealing is provided by [`crossbeam_deque`]!
4 | 
5 | To get started, please read the documentation for [`ThreadPool::new`].
6 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![doc = include_str!("../README.md")]
 2 | 
 3 | pub(crate) mod close;
 4 | pub(crate) mod get_range;
 5 | pub(crate) mod get_ranges;
 6 | pub(crate) mod io_uring;
 7 | pub(crate) mod opcode;
 8 | pub(crate) mod open_file;
 9 | pub(crate) mod operation;
10 | pub(crate) mod sqe;
11 | pub(crate) mod tracker;
12 | pub(crate) mod user_data;
13 | pub(crate) mod worker;
14 | 
15 | pub use io_uring::IoUring;
16 | 


--------------------------------------------------------------------------------
/crates/lsio_aligned_bytes/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsio_aligned_bytes"
 3 | version = "0.0.1"  # Maybe we will publish this as a stand-alone crate.
 4 | edition.workspace = true
 5 | license.workspace = true
 6 | homepage.workspace = true
 7 | repository.workspace = true
 8 | readme = "README.md"
 9 | authors.workspace = true
10 | 
11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
12 | 
13 | [dependencies]
14 | anyhow.workspace = true
15 | 


--------------------------------------------------------------------------------
/crates/lsio_bench/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsio_bench"
 3 | version = "0.0.0"
 4 | description = "Benchmark LSIO."
 5 | publish = false
 6 | edition.workspace = true
 7 | license.workspace = true
 8 | homepage.workspace = true
 9 | repository.workspace = true
10 | readme = "README.md"
11 | authors.workspace = true
12 | 
13 | [dependencies]
14 | clap = { version = "4.5.4", features = ["derive"] }
15 | indicatif = "0.17.8"
16 | lsio_uring = { path = "../lsio_uring" }
17 | lsio_io = { path = "../lsio_io" }
18 | 


--------------------------------------------------------------------------------
/crates/lsio_io/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsio_io"
 3 | version = "0.0.0"
 4 | publish = false
 5 | edition.workspace = true
 6 | license.workspace = true
 7 | homepage.workspace = true
 8 | repository.workspace = true
 9 | readme = "README.md"
10 | authors.workspace = true
11 | 
12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
13 | 
14 | [dependencies]
15 | anyhow = { workspace = true }
16 | lsio_aligned_bytes = { path = "../lsio_aligned_bytes" }
17 | crossbeam-channel = { workspace = true }
18 | 
19 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/benches/fio.ini:
--------------------------------------------------------------------------------
 1 | [global]
 2 | nrfiles=1000
 3 | filesize=256Ki
 4 | direct=1
 5 | iodepth=64
 6 | ioengine=io_uring
 7 | numjobs=1
 8 | thread=1
 9 | directory=/tmp/fio
10 | registerfiles=1
11 | sqthread_poll=1
12 | fixedbufs=1
13 | 
14 | [sequential_read_1000_files_each_256KiB]
15 | readwrite=read
16 | blocksize=256Ki
17 | 
18 | [read_1000_files_each_256KiB_with_gaps]
19 | wait_for=sequential_read_1000_files_each_256KiB
20 | readwrite=read:32Ki
21 | blocksize=4Ki
22 | 
23 | [read_1_file_of_1GiB_with_gaps]
24 | wait_for=read_1000_files_each_256KiB_with_gaps
25 | readwrite=read:64Ki
26 | nrfiles=1
27 | filesize=1Gi
28 | blocksize=4Ki
29 | 
30 | [rand_read_1GiB_file]
31 | wait_for=read_1_file_of_1GiB_with_gaps
32 | readwrite=randread
33 | nrfiles=1
34 | filesize=1Gi
35 | blocksize=4Ki
36 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsio_uring"
 3 | version = "0.0.0"
 4 | publish = false
 5 | authors = { workspace = true }
 6 | edition = { workspace = true }
 7 | homepage = { workspace = true }
 8 | repository = { workspace = true }
 9 | license = { workspace = true }
10 | readme = "README.md"
11 | 
12 | [dependencies]
13 | lsio_aligned_bytes = { path = "../lsio_aligned_bytes" }
14 | lsio_io = { path = "../lsio_io" }
15 | lsio_threadpool = { path = "../lsio_threadpool" }
16 | anyhow = { workspace = true } 
17 | crossbeam-channel =  { workspace = true }
18 | io-uring =  { workspace = true } 
19 | libc =  { workspace = true } 
20 | nix =  { workspace = true } 
21 | 
22 | [dev-dependencies]
23 | criterion = { workspace = true }
24 | tempfile = { workspace = true }
25 | rand = { workspace = true }
26 | 
27 | [[bench]]  # Yes, this is supposed to have double square brackets!
28 | name = "get"
29 | harness = false
30 | 
31 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/opcode.rs:
--------------------------------------------------------------------------------
 1 | use std::fmt;
 2 | 
 3 | use io_uring::opcode;
 4 | 
 5 | /// Simple wrapper around io_uring opcode::*::CODE;
 6 | #[derive(PartialEq)]
 7 | pub(crate) struct OpCode(u8);
 8 | 
 9 | impl OpCode {
10 |     pub(crate) const fn new(op: u8) -> Self {
11 |         Self(op)
12 |     }
13 | 
14 |     pub(crate) fn name(&self) -> &'static str {
15 |         match self.0 {
16 |             opcode::OpenAt::CODE => "openat",
17 |             opcode::Read::CODE => "read",
18 |             opcode::Close::CODE => "close",
19 |             _ => "Un-recognised opcode",
20 |         }
21 |     }
22 | 
23 |     pub(crate) fn value(&self) -> u8 {
24 |         self.0
25 |     }
26 | }
27 | 
28 | impl fmt::Debug for OpCode {
29 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
30 |         f.debug_tuple("OpCode")
31 |             .field(&self.0)
32 |             .field(&self.name())
33 |             .finish()
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | members = ["crates/*"]
 3 | resolver = "2"
 4 | 
 5 | [workspace.package]
 6 | edition = "2021"
 7 | license = "MIT"
 8 | homepage = "https://github.com/JackKelly/light-speed-io"
 9 | repository = "https://github.com/JackKelly/light-speed-io"
10 | readme = "README.md"
11 | authors=["Jack Kelly <jack@openclimatefix.org>"]
12 | 
13 | 
14 | [workspace.dependencies]
15 | anyhow = "1.0.83"
16 | bytes = "1.6.0"
17 | criterion = { version = "0.5.1", features = ["html_reports", "async_tokio"] }
18 | crossbeam-deque = "0.8.5"
19 | crossbeam-channel = "0.5.12"
20 | io-uring = "0.6.4"
21 | libc = "0.2.153"  # Used for filesystem flags
22 | nix = { version = "0.28.0", features = ["fs"] }
23 | object_store = "0.10.1"
24 | snafu = "0.8.2"
25 | tokio = { version = "1.37.0", features = ["rt-multi-thread"]}
26 | url = "2.5.0"
27 | tempfile = "3.10"
28 | rand = "0.8"
29 | 
30 | [profile.bench]
31 | debug = true  # Enable debuginfo when profiling with cargo flamegraph.
32 | 
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023-2024 Jack Kelly
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/close.rs:
--------------------------------------------------------------------------------
 1 | use std::sync::Arc;
 2 | 
 3 | use lsio_threadpool::WorkerThread;
 4 | 
 5 | use crate::{
 6 |     open_file::OpenFile,
 7 |     operation::{NextStep, Operation, UringOperation},
 8 |     sqe::build_close_sqe,
 9 | };
10 | 
11 | #[derive(Debug)]
12 | pub(crate) struct Close {
13 |     file: Arc<OpenFile>,
14 | }
15 | 
16 | impl Close {
17 |     pub(crate) fn new(file: Arc<OpenFile>) -> Self {
18 |         Self { file }
19 |     }
20 | }
21 | 
22 | impl UringOperation for Close {
23 |     fn submit_first_step(
24 |         &mut self,
25 |         index_of_op: usize,
26 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
27 |     ) -> Result<(), io_uring::squeue::PushError> {
28 |         let entry = build_close_sqe(index_of_op, *self.file.file_descriptor());
29 |         unsafe { local_uring_submission_queue.push(&entry) }
30 |     }
31 | 
32 |     fn process_opcode_and_submit_next_step(
33 |         &mut self,
34 |         idx_and_opcode: &crate::user_data::UringUserData,
35 |         _cqe_result: i32,
36 |         _local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
37 |         _worker_thread: &WorkerThread<Operation>,
38 |         _output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>,
39 |     ) -> NextStep {
40 |         if idx_and_opcode.opcode().value() != io_uring::opcode::Close::CODE {
41 |             panic!("Unrecognised opcode!");
42 |         }
43 |         NextStep::Done
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/crates/lsio_threadpool/src/shared_state.rs:
--------------------------------------------------------------------------------
 1 | use std::sync::{
 2 |     atomic::{AtomicBool, Ordering::Relaxed},
 3 |     mpsc, Arc,
 4 | };
 5 | 
 6 | use crossbeam_deque as deque;
 7 | 
 8 | use crate::park_manager::ParkManagerCommand;
 9 | 
10 | /// `ThreadPool` owns a `SharedState<T>`, and each `WorkerThread` owns a cloned `SharedState<T>`.
11 | #[derive(Debug)]
12 | pub(crate) struct SharedState<T>
13 | where
14 |     T: Send,
15 | {
16 |     pub(crate) injector: Arc<deque::Injector<T>>,
17 |     pub(crate) keep_running: Arc<AtomicBool>,
18 |     pub(crate) chan_to_park_manager: mpsc::Sender<ParkManagerCommand>,
19 |     pub(crate) at_least_one_thread_is_parked: Arc<AtomicBool>,
20 | }
21 | 
22 | impl<T> SharedState<T>
23 | where
24 |     T: Send,
25 | {
26 |     pub(crate) fn unpark_at_most_n_threads(&self, n: u32) {
27 |         if self.at_least_one_thread_is_parked.load(Relaxed) {
28 |             self.chan_to_park_manager
29 |                 .send(ParkManagerCommand::WakeAtMostNThreads(n))
30 |                 .unwrap();
31 |         }
32 |     }
33 | }
34 | 
35 | impl<T> Clone for SharedState<T>
36 | where
37 |     T: Send,
38 | {
39 |     fn clone(&self) -> Self {
40 |         Self {
41 |             injector: Arc::clone(&self.injector),
42 |             keep_running: Arc::clone(&self.keep_running),
43 |             chan_to_park_manager: self.chan_to_park_manager.clone(),
44 |             at_least_one_thread_is_parked: Arc::clone(&self.at_least_one_thread_is_parked),
45 |         }
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/io_uring.rs:
--------------------------------------------------------------------------------
 1 | use std::{ffi::CString, os::unix::ffi::OsStrExt};
 2 | 
 3 | use crate::get_ranges::GetRanges;
 4 | use crate::operation::Operation;
 5 | use crate::worker::UringWorker;
 6 | use lsio_io::{Completion, Output, Reader};
 7 | use lsio_threadpool::{ThreadPool, WorkerThread};
 8 | 
 9 | pub struct IoUring {
10 |     threadpool: ThreadPool<Operation>,
11 |     output_rx: crossbeam_channel::Receiver<anyhow::Result<Output>>,
12 | }
13 | 
14 | impl IoUring {
15 |     pub fn new(n_worker_threads: usize) -> Self {
16 |         let (output_tx, output_rx) = crossbeam_channel::bounded(1_024);
17 |         Self {
18 |             threadpool: ThreadPool::new(
19 |                 n_worker_threads,
20 |                 move |worker_thread: WorkerThread<Operation>| {
21 |                     let mut uring_worker = UringWorker::new(worker_thread, output_tx.clone());
22 |                     uring_worker.run();
23 |                 },
24 |             ),
25 |             output_rx,
26 |         }
27 |     }
28 | }
29 | 
30 | impl Completion for IoUring {
31 |     fn completion(&self) -> &crossbeam_channel::Receiver<anyhow::Result<Output>> {
32 |         &self.output_rx
33 |     }
34 | }
35 | 
36 | impl Reader for IoUring {
37 |     fn get_ranges(
38 |         &mut self,
39 |         location: &std::path::Path,
40 |         ranges: Vec<std::ops::Range<isize>>,
41 |         user_data: Vec<u64>,
42 |     ) -> anyhow::Result<()> {
43 |         let location = CString::new(location.as_os_str().as_bytes())
44 |             .expect("Failed to convert path '{path}' to CString");
45 |         let task = Operation::GetRanges(GetRanges::new(location, ranges, user_data));
46 |         self.threadpool.push(task);
47 |         Ok(())
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/user_data.rs:
--------------------------------------------------------------------------------
 1 | use crate::opcode::OpCode;
 2 | 
 3 | /// The u64 io_uring user_data represents the index_of_op in the left-most 32 bits,
 4 | /// and represents the io_uring opcode CODE in the right-most 32 bits.
 5 | #[derive(Debug)]
 6 | pub(crate) struct UringUserData {
 7 |     index_of_op: u32,
 8 |     op: OpCode,
 9 | }
10 | 
11 | impl UringUserData {
12 |     pub(crate) fn new(index_of_op: usize, op: u8) -> Self {
13 |         Self {
14 |             index_of_op: index_of_op.try_into().unwrap(),
15 |             op: OpCode::new(op),
16 |         }
17 |     }
18 | 
19 |     pub(crate) const fn index_of_op(&self) -> u32 {
20 |         self.index_of_op
21 |     }
22 | 
23 |     pub(crate) const fn opcode(&self) -> &OpCode {
24 |         &self.op
25 |     }
26 | }
27 | 
28 | impl From<u64> for UringUserData {
29 |     fn from(value: u64) -> Self {
30 |         let index_of_op: u32 = (value >> 32).try_into().unwrap();
31 |         let op = OpCode::new((value & 0xFF).try_into().unwrap());
32 |         Self { index_of_op, op }
33 |     }
34 | }
35 | 
36 | impl Into<u64> for UringUserData {
37 |     fn into(self) -> u64 {
38 |         let index_of_op: u64 = (self.index_of_op as u64) << 32;
39 |         index_of_op | self.op.value() as u64
40 |     }
41 | }
42 | 
43 | #[cfg(test)]
44 | mod tests {
45 |     use super::*;
46 | 
47 |     #[test]
48 |     fn test_uring_user_data_round_trip() {
49 |         const INDEX: usize = 100;
50 |         const OPCODE: u8 = io_uring::opcode::Read::CODE;
51 |         let uring_user_data = UringUserData::new(INDEX, OPCODE);
52 |         let user_data_u64: u64 = uring_user_data.into();
53 |         let uring_user_data = UringUserData::from(user_data_u64);
54 |         assert_eq!(uring_user_data.index_of_op, INDEX as u32);
55 |         assert_eq!(uring_user_data.op, OpCode::new(OPCODE));
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/crates/lsio_threadpool/src/park_manager.rs:
--------------------------------------------------------------------------------
 1 | use std::{
 2 |     collections::VecDeque,
 3 |     sync::{
 4 |         atomic::{AtomicBool, Ordering::Relaxed},
 5 |         mpsc::{self, RecvError},
 6 |         Arc,
 7 |     },
 8 |     thread,
 9 | };
10 | 
11 | pub(crate) enum ParkManagerCommand {
12 |     WakeAtMostNThreads(u32),
13 |     ThreadIsParked(thread::Thread),
14 |     Stop,
15 | }
16 | 
17 | pub(crate) struct ParkManager {
18 |     rx: mpsc::Receiver<ParkManagerCommand>,
19 |     at_least_one_thread_is_parked: Arc<AtomicBool>,
20 |     parked_threads: VecDeque<thread::Thread>,
21 | }
22 | 
23 | impl ParkManager {
24 |     pub(crate) fn start(
25 |         rx: mpsc::Receiver<ParkManagerCommand>,
26 |         at_least_one_thread_is_parked: Arc<AtomicBool>,
27 |         n_worker_threads: usize,
28 |     ) -> thread::JoinHandle<()> {
29 |         let mut park_manager = Self {
30 |             rx,
31 |             at_least_one_thread_is_parked,
32 |             parked_threads: VecDeque::with_capacity(n_worker_threads),
33 |         };
34 |         thread::Builder::new()
35 |             .name("ParkManager".to_string())
36 |             .spawn(move || park_manager.main_loop())
37 |             .expect("Failed to spawn the ParkManager thread!")
38 |     }
39 | 
40 |     fn main_loop(&mut self) {
41 |         use ParkManagerCommand::*;
42 |         loop {
43 |             match self.rx.recv() {
44 |                 Ok(cmd) => match cmd {
45 |                     ThreadIsParked(t) => self.thread_is_parked(t),
46 |                     WakeAtMostNThreads(n) => self.wake_at_most_n_threads(n),
47 |                     Stop => break,
48 |                 },
49 |                 Err(RecvError) => break,
50 |             }
51 |         }
52 |     }
53 | 
54 |     fn thread_is_parked(&mut self, t: thread::Thread) {
55 |         self.at_least_one_thread_is_parked.store(true, Relaxed);
56 |         debug_assert!(!self.parked_threads.iter().any(|pt| pt.id() == t.id()));
57 |         self.parked_threads.push_back(t);
58 |     }
59 | 
60 |     fn wake_at_most_n_threads(&mut self, n: u32) {
61 |         for _ in 0..n {
62 |             match self.parked_threads.pop_front() {
63 |                 Some(thread) => thread.unpark(),
64 |                 None => break,
65 |             }
66 |         }
67 |         if self.parked_threads.is_empty() {
68 |             self.at_least_one_thread_is_parked.store(false, Relaxed);
69 |         }
70 |     }
71 | }
72 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/open_file.rs:
--------------------------------------------------------------------------------
 1 | use std::ffi::CString;
 2 | 
 3 | #[derive(Debug)]
 4 | pub(crate) struct OpenFile {
 5 |     location: CString,
 6 |     file_descriptor: io_uring::types::Fd,
 7 |     /// The file size in bytes.
 8 |     /// Note that we always have to `statx` the file to get the `alignment`, so we'll always get
 9 |     /// the file size, too.
10 |     size: u64,
11 |     alignment: u32,
12 | }
13 | 
14 | impl OpenFile {
15 |     pub(crate) fn file_descriptor(&self) -> &io_uring::types::Fd {
16 |         &self.file_descriptor
17 |     }
18 | 
19 |     pub(crate) fn size(&self) -> u64 {
20 |         self.size
21 |     }
22 | 
23 |     pub(crate) fn alignment(&self) -> u32 {
24 |         self.alignment
25 |     }
26 | }
27 | 
28 | /// Used to build an [`OpenFile`].
29 | #[derive(Debug)]
30 | pub(crate) struct OpenFileBuilder {
31 |     location: CString,
32 |     file_descriptor: Option<io_uring::types::Fd>,
33 |     statx: libc::statx,
34 |     assume_statx_is_initialised: bool,
35 | }
36 | 
37 | impl OpenFileBuilder {
38 |     pub(crate) fn new(location: CString) -> Self {
39 |         Self {
40 |             location,
41 |             file_descriptor: None,
42 |             statx: unsafe { std::mem::zeroed() },
43 |             assume_statx_is_initialised: false,
44 |         }
45 |     }
46 | 
47 |     pub(crate) const fn location(&self) -> &CString {
48 |         &self.location
49 |     }
50 | 
51 |     pub(crate) fn set_file_descriptor(&mut self, file_descriptor: io_uring::types::Fd) {
52 |         self.file_descriptor = Some(file_descriptor);
53 |     }
54 | 
55 |     pub(crate) fn get_statx_ptr(&mut self) -> *mut libc::statx {
56 |         &mut self.statx as *mut libc::statx
57 |     }
58 | 
59 |     pub(crate) unsafe fn assume_statx_is_initialised(&mut self) {
60 |         self.assume_statx_is_initialised = true;
61 |     }
62 | 
63 |     pub(crate) fn is_ready(&self) -> bool {
64 |         self.file_descriptor.is_some() && self.assume_statx_is_initialised
65 |     }
66 | 
67 |     /// Safety: [`Self::is_ready`] must return `true` before calling `build`!
68 |     /// Panics: If `build` is called while [`Self::is_ready`] is still false.
69 |     pub(crate) fn build(self) -> OpenFile {
70 |         assert!(self.is_ready());
71 |         OpenFile {
72 |             location: self.location,
73 |             file_descriptor: self.file_descriptor.unwrap(),
74 |             size: self.statx.stx_size,
75 |             alignment: self.statx.stx_dio_mem_align,
76 |             // TODO: Maybe also use `statx.stx_dio_offset_align`.
77 |         }
78 |     }
79 | }
80 | 


--------------------------------------------------------------------------------
/crates/lsio_io/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![doc = include_str!("../README.md")]
 2 | 
 3 | use lsio_aligned_bytes::AlignedBytes;
 4 | use std::ops::Range;
 5 | 
 6 | // TODO: Consider how to *group* instructions, such that LSIO guarantees that all operations in
 7 | // group _n_ will be completed before any operations in group _n+1_ are started. See:
 8 | // https://github.com/JackKelly/light-speed-io/issues/68
 9 | 
10 | /// All IO backends must expose their completion queue.
11 | pub trait Completion {
12 |     fn completion(&self) -> &crossbeam_channel::Receiver<anyhow::Result<Output>>;
13 | }
14 | 
15 | /// Methods for IO backends that can read from IO.
16 | pub trait Reader {
17 |     /// Submit a GetRanges operation.
18 |     ///
19 |     /// `ranges` specify the byte ranges to read. Negative numbers are relative to the filesize.
20 |     /// (Like indexing lists in Python.) For example:
21 |     ///        0..-1   The entire file.
22 |     ///        0..100  The first 100 bytes.
23 |     ///     -100..-1   The last 100 bytes.
24 |     ///
25 |     /// `user_data` is used to identify each byte_range.
26 |     /// One `user_data` instance per byte_range.
27 |     /// For example, in Zarr, this would be used to identify the
28 |     /// location at which this chunk appears in the merged array.
29 |     ///
30 |     /// # Errors:
31 |     /// If the user submits a `get_ranges` operation with an invalid filename then
32 |     /// the user will receive a single `std::io::Error(std::io::ErrorKind::NotFound)` with context
33 |     /// that describes the filename that failed. If a subset of the `ranges` results in an error
34 |     /// (e.g. reading beyond end of the file) then the user will receive a mixture of `Ok(Output)`
35 |     /// and `Err`, where the `Err` will include context such as the filename and byte range.
36 |     fn get_ranges(
37 |         &mut self,
38 |         // We take ownership because this function returns immediately. If we used references then
39 |         // there would be nothing to stop the user from dropping the owned objects (and
40 |         // invalidating the references!).
41 |         location: &std::path::Path,
42 |         ranges: Vec<Range<isize>>,
43 |         user_data: Vec<u64>,
44 |     ) -> anyhow::Result<()>;
45 | }
46 | 
47 | /// `Chunk` is used throughout the LSIO stack. It is passed from the I/O layer to
48 | /// the compute layer, and to the application layer. (To be more precise: `Result<Chunk>` is usually
49 | /// what is passed around!).
50 | #[derive(Debug)]
51 | pub struct Chunk {
52 |     pub buffer: AlignedBytes,
53 |     /// `user_data` can be used to uniquely identify each chunk, for example by providing an index
54 |     /// into an array that provides more information about each chunk.
55 |     pub user_data: u64,
56 | }
57 | 
58 | /// Holds the data that is output from each IO operation.
59 | #[derive(Debug)]
60 | pub enum Output {
61 |     Chunk(Chunk),
62 |     // Other variants could be:
63 |     // `BytesWritten`, `Listing(Vec<FileMetadata>)`, etc.
64 | }
65 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/get_range.rs:
--------------------------------------------------------------------------------
 1 | use crate::{
 2 |     close::Close,
 3 |     open_file::OpenFile,
 4 |     operation::{NextStep, Operation, UringOperation},
 5 |     sqe::build_read_range_sqe,
 6 |     user_data::UringUserData,
 7 | };
 8 | use lsio_aligned_bytes::AlignedBytes;
 9 | use lsio_io::{Chunk, Output};
10 | use lsio_threadpool::WorkerThread;
11 | use std::{ops::Range, sync::Arc};
12 | 
13 | #[derive(Debug)]
14 | pub(crate) struct GetRange {
15 |     file: Arc<OpenFile>, // TODO: Replace Arc with Atomic counter?
16 |     range: Range<isize>,
17 |     user_data: u64,
18 |     buffer: Option<AlignedBytes>, // This is an `Option` so we can `take` it.
19 | }
20 | 
21 | impl GetRange {
22 |     pub(crate) fn new(file: Arc<OpenFile>, range: Range<isize>, user_data: u64) -> Self {
23 |         // TODO: Split reads of more than 2 GiB into multiple smaller reads! See issue #99.
24 |         if range.len() > 2_147_479_552 {
25 |             panic!(
26 |                 "`read` will transfer at most 2 GiB but {} bytes were requested. \
27 |                      See https://github.com/JackKelly/light-speed-io/issues/99",
28 |                 range.len()
29 |             );
30 |         }
31 |         Self {
32 |             file,
33 |             range,
34 |             user_data,
35 |             buffer: None,
36 |         }
37 |     }
38 | }
39 | 
40 | impl UringOperation for GetRange {
41 |     /// This method assume that the file has already been opened (by the [`GetRanges`] operation).
42 |     fn submit_first_step(
43 |         &mut self,
44 |         index_of_op: usize,
45 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
46 |     ) -> Result<(), io_uring::squeue::PushError> {
47 |         let (entry, buffer) = build_read_range_sqe(index_of_op, &self.file, &self.range);
48 |         self.buffer = Some(buffer);
49 |         unsafe { local_uring_submission_queue.push(&entry) } // TODO: Does `entry` have to stay
50 |                                                              // alive for longer?
51 |     }
52 | 
53 |     fn process_opcode_and_submit_next_step(
54 |         &mut self,
55 |         idx_and_opcode: &UringUserData,
56 |         cqe_result: i32,
57 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
58 |         _worker_thread: &WorkerThread<Operation>,
59 |         output_channel: &mut crossbeam_channel::Sender<anyhow::Result<Output>>,
60 |     ) -> NextStep {
61 |         // Check that the opcode of the CQE is what we expected:
62 |         if idx_and_opcode.opcode().value() != io_uring::opcode::Read::CODE {
63 |             panic!("Unrecognised opcode!");
64 |         }
65 |         if cqe_result >= 0 {
66 |             // TODO: Check we've read the correct number of bytes:
67 |             //       Check `cqe_result_value == self.buffer.len()`.
68 |             // TODO: Retry if we read less data than requested! See issue #100.
69 | 
70 |             output_channel
71 |                 .send(Ok(Output::Chunk(Chunk {
72 |                     buffer: self.buffer.take().unwrap(),
73 |                     user_data: self.user_data,
74 |                 })))
75 |                 .unwrap();
76 |         };
77 |         // Check if it's time to close the file:
78 |         if Arc::strong_count(&self.file) == 1 {
79 |             // We're the last operation on this file, so it's time to close this file.
80 |             let mut close_op = Close::new(Arc::clone(&self.file));
81 |             close_op
82 |                 .submit_first_step(
83 |                     idx_and_opcode.index_of_op() as _,
84 |                     local_uring_submission_queue,
85 |                 )
86 |                 .unwrap();
87 |             NextStep::ReplaceWith(Operation::Close(close_op))
88 |         } else {
89 |             NextStep::Done
90 |         }
91 |     }
92 | }
93 | 


--------------------------------------------------------------------------------
/crates/lsio_threadpool/src/worker.rs:
--------------------------------------------------------------------------------
  1 | use std::{
  2 |     iter,
  3 |     sync::{atomic::Ordering::Relaxed, Arc},
  4 |     thread,
  5 | };
  6 | 
  7 | use crossbeam_deque as deque;
  8 | 
  9 | use crate::{park_manager::ParkManagerCommand, shared_state::SharedState};
 10 | 
 11 | /// Provides methods that allow user-defined closures to find new tasks to work on,
 12 | /// submit new tasks, park this thread, and check if the closure should continue looping.
 13 | ///
 14 | /// Uses do not construct `WorkerThread`s. Instead, [`ThreadPool::new`](crate::ThreadPool::new)
 15 | /// creates one `WorkerThread` per thread, and passes that thread's `WorkerThread` to the
 16 | /// user-supplied closure for that thread.
 17 | pub struct WorkerThread<T>
 18 | where
 19 |     T: Send,
 20 | {
 21 |     shared: SharedState<T>,
 22 | 
 23 |     /// Queues for implementing work-stealing:
 24 |     local_queue: deque::Worker<T>,
 25 |     stealers: Arc<Vec<deque::Stealer<T>>>,
 26 | }
 27 | 
 28 | impl<T> WorkerThread<T>
 29 | where
 30 |     T: Send,
 31 | {
 32 |     pub(crate) fn new(
 33 |         shared: SharedState<T>,
 34 |         local_queue: deque::Worker<T>,
 35 |         stealers: Arc<Vec<deque::Stealer<T>>>,
 36 |     ) -> Self {
 37 |         Self {
 38 |             shared,
 39 |             local_queue,
 40 |             stealers,
 41 |         }
 42 |     }
 43 | 
 44 |     /// Get the next task to work on. This function never blocks.
 45 |     pub fn find_task(&self) -> Option<T> {
 46 |         // Adapted from https://docs.rs/crossbeam-deque/latest/crossbeam_deque/#examples
 47 | 
 48 |         // Pop a task from the local queue, if not empty.
 49 |         self.local_queue.pop().or_else(|| {
 50 |             // Otherwise, we need to look for a task elsewhere.
 51 |             iter::repeat_with(|| {
 52 |                 // Try stealing a batch of tasks from the global queue.
 53 |                 self.shared
 54 |                     .injector
 55 |                     .steal_batch_and_pop(&self.local_queue)
 56 |                     // Or try stealing a task from one of the other threads.
 57 |                     .or_else(|| self.stealers.iter().map(|s| s.steal()).collect())
 58 |             })
 59 |             // Loop while no task was stolen and any steal operation needs to be retried.
 60 |             .find(|s| !s.is_retry())
 61 |             // Extract the stolen task, if there is one.
 62 |             .and_then(|s| s.success())
 63 |         })
 64 |     }
 65 | 
 66 |     /// Returns true if the task should keep running.
 67 |     pub fn keep_running(&self) -> bool {
 68 |         self.shared.keep_running.load(Relaxed)
 69 |     }
 70 | 
 71 |     /// Park this thread.
 72 |     ///
 73 |     /// Before parking, this function will register this thread with the `ParkManager`
 74 |     /// so that this thread can be automatically unparked when necessary.
 75 |     pub fn park(&self) {
 76 |         self.shared
 77 |             .chan_to_park_manager
 78 |             .send(ParkManagerCommand::ThreadIsParked(thread::current()))
 79 |             .unwrap_or_else(|e| {
 80 |                 panic!(
 81 |                     "failed to send ThreadIsParked({:?}) message to ParkManager! {e:?}",
 82 |                     thread::current(),
 83 |                 )
 84 |             });
 85 |         thread::park();
 86 |     }
 87 | 
 88 |     /// Push a task onto this thread's local queue of tasks.
 89 |     ///
 90 |     /// Tasks on the local queue may be stolen by other threads!
 91 |     pub fn push(&self, task: T) {
 92 |         self.local_queue.push(task);
 93 |         self.maybe_unpark_other_threads();
 94 |     }
 95 | 
 96 |     fn maybe_unpark_other_threads(&self) {
 97 |         let n = self.local_queue.len();
 98 |         if n > 1 {
 99 |             self.shared.unpark_at_most_n_threads(n as _);
100 |         }
101 |     }
102 | }
103 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/tracker.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::VecDeque;
  2 | 
  3 | pub(crate) struct Tracker<T> {
  4 |     pub(crate) ops_in_flight: Vec<Option<T>>,
  5 |     pub(crate) next_index: VecDeque<usize>,
  6 |     len: usize,
  7 | }
  8 | 
  9 | impl<T> Tracker<T> {
 10 |     pub(crate) fn new(n: usize) -> Self {
 11 |         Self {
 12 |             ops_in_flight: (0..n).map(|_| None).collect(),
 13 |             next_index: (0..n).collect(),
 14 |             len: 0,
 15 |         }
 16 |     }
 17 | 
 18 |     pub(crate) fn get_next_index(&mut self) -> Option<usize> {
 19 |         self.next_index.pop_front()
 20 |     }
 21 | 
 22 |     pub(crate) fn put(&mut self, index: usize, op: T) {
 23 |         self.ops_in_flight[index].replace(op);
 24 |         self.len += 1;
 25 |     }
 26 | 
 27 |     pub(crate) fn get(&mut self, index: usize) -> Option<TrackerGuard<T>> {
 28 |         if self.ops_in_flight[index].is_none() {
 29 |             None
 30 |         } else {
 31 |             Some(TrackerGuard {
 32 |                 index,
 33 |                 tracker: self,
 34 |             })
 35 |         }
 36 |     }
 37 | 
 38 |     pub(crate) fn is_empty(&self) -> bool {
 39 |         self.len == 0
 40 |     }
 41 | 
 42 |     pub(crate) fn is_full(&self) -> bool {
 43 |         self.next_index.is_empty()
 44 |     }
 45 | }
 46 | 
 47 | pub(crate) struct TrackerGuard<'a, T> {
 48 |     index: usize,
 49 |     tracker: &'a mut Tracker<T>,
 50 | }
 51 | 
 52 | impl<'a, T> TrackerGuard<'a, T> {
 53 |     /// Safety: If TrackerGuard exists, then we know that `self.index` is valid.
 54 |     /// So `as_mut` can never fail.
 55 |     pub(crate) fn as_mut(&mut self) -> &mut T {
 56 |         self.tracker.ops_in_flight[self.index].as_mut().unwrap()
 57 |     }
 58 | 
 59 |     pub(crate) fn remove(&mut self) -> T {
 60 |         self.tracker.next_index.push_back(self.index);
 61 |         self.tracker.len -= 1;
 62 |         self.tracker.ops_in_flight[self.index].take().unwrap()
 63 |     }
 64 | 
 65 |     pub(crate) fn replace(&mut self, op: T) {
 66 |         self.tracker.ops_in_flight[self.index].replace(op);
 67 |     }
 68 | }
 69 | 
 70 | #[cfg(test)]
 71 | mod tests {
 72 |     use super::*;
 73 | 
 74 |     #[test]
 75 |     fn test_op_tracker() {
 76 |         let mut tracker = Tracker::new(2);
 77 | 
 78 |         // Check that removing an item before inserting an item returns None.
 79 |         assert!(tracker.get(0).is_none());
 80 | 
 81 |         // Put one string into the tracker, and then remove that string.
 82 |         let i0 = tracker.get_next_index().unwrap();
 83 |         assert_eq!(i0, 0);
 84 |         let s0 = "string0".to_string();
 85 |         tracker.put(i0, s0.clone());
 86 |         assert_eq!(tracker.get(i0).unwrap().remove(), s0);
 87 |         // The tracker is now empty.
 88 | 
 89 |         // Put another string into the tracker. Don't remove it yet.
 90 |         let i1 = tracker.get_next_index().unwrap();
 91 |         assert_eq!(i1, 1);
 92 |         let s1 = "string1".to_string();
 93 |         tracker.put(i1, s1.clone());
 94 | 
 95 |         // Put another string into the tracker. Don't remove it yet.
 96 |         let i2 = tracker.get_next_index().unwrap();
 97 |         assert_eq!(i2, 0);
 98 |         let s2 = "string2".to_string();
 99 |         tracker.put(i2, s2.clone());
100 | 
101 |         // Check that we can't put any more strings into tracker
102 |         assert!(tracker.get_next_index().is_none());
103 | 
104 |         // Check the strings are correct
105 |         assert_eq!(tracker.get(i1).unwrap().remove(), s1);
106 |         assert_eq!(tracker.get(i2).unwrap().remove(), s2);
107 |     }
108 | 
109 |     #[test]
110 |     #[should_panic(expected = "index out of bounds")]
111 |     fn test_panic_if_wrong_index() {
112 |         let mut tracker: Tracker<String> = Tracker::new(2);
113 |         tracker.get(100);
114 |     }
115 | }
116 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/operation.rs:
--------------------------------------------------------------------------------
  1 | use lsio_threadpool::WorkerThread;
  2 | 
  3 | use crate::{close::Close, get_range::GetRange, get_ranges::GetRanges, user_data::UringUserData};
  4 | 
  5 | /// We keep a `Tracker<Operation>` in each thread to track progress of each operation:
  6 | #[derive(Debug)]
  7 | pub(crate) enum Operation {
  8 |     GetRanges(GetRanges),
  9 |     GetRange(GetRange),
 10 |     Close(Close),
 11 | }
 12 | 
 13 | impl Operation {
 14 |     fn apply_func_to_all_inner_structs<F, R>(&mut self, mut f: F) -> R
 15 |     where
 16 |         F: FnMut(&mut dyn UringOperation) -> R,
 17 |     {
 18 |         use Operation::*;
 19 |         match self {
 20 |             GetRanges(s) => f(s),
 21 |             GetRange(s) => f(s),
 22 |             Close(s) => f(s),
 23 |         }
 24 |     }
 25 | }
 26 | 
 27 | impl UringOperation for Operation {
 28 |     fn submit_first_step(
 29 |         &mut self,
 30 |         index_of_op: usize,
 31 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
 32 |     ) -> Result<(), io_uring::squeue::PushError> {
 33 |         self.apply_func_to_all_inner_structs(|s| {
 34 |             UringOperation::submit_first_step(s, index_of_op, local_uring_submission_queue)
 35 |         })
 36 |     }
 37 | 
 38 |     fn process_opcode_and_submit_next_step(
 39 |         &mut self,
 40 |         idx_and_opcode: &UringUserData,
 41 |         cqe_result: i32,
 42 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
 43 |         worker_thread: &WorkerThread<Operation>,
 44 |         output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>,
 45 |     ) -> NextStep {
 46 |         self.apply_func_to_all_inner_structs(|s| {
 47 |             UringOperation::maybe_send_error(s, idx_and_opcode, cqe_result, output_channel);
 48 |             UringOperation::process_opcode_and_submit_next_step(
 49 |                 s,
 50 |                 idx_and_opcode,
 51 |                 cqe_result,
 52 |                 local_uring_submission_queue,
 53 |                 worker_thread,
 54 |                 output_channel,
 55 |             )
 56 |         })
 57 |     }
 58 | }
 59 | 
 60 | /// ------------------ COMMON TO ALL URING OPERATIONS ---------------------
 61 | /// Some aims of this design:
 62 | /// - Allocate on the stack
 63 | /// - Cleanly separate the code that implements the state machine for handling each operation.
 64 | /// - Gain the benefits of using the typestate pattern, whilst still allowing us to keep the types
 65 | /// in a vector. See issue #117.
 66 | pub(crate) trait UringOperation: std::fmt::Debug {
 67 |     fn submit_first_step(
 68 |         &mut self,
 69 |         index_of_op: usize,
 70 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
 71 |     ) -> Result<(), io_uring::squeue::PushError>;
 72 | 
 73 |     fn process_opcode_and_submit_next_step(
 74 |         &mut self,
 75 |         idx_and_opcode: &UringUserData,
 76 |         cqe_result: i32,
 77 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
 78 |         worker_thread: &WorkerThread<Operation>,
 79 |         output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>,
 80 |     ) -> NextStep;
 81 | 
 82 |     fn maybe_send_error(
 83 |         &self,
 84 |         idx_and_opcode: &UringUserData,
 85 |         cqe_result: i32,
 86 |         output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>,
 87 |     ) {
 88 |         if cqe_result < 0 {
 89 |             // TODO: We probably want a custom Error struct (or enum?) which has machine-readable
 90 |             // fields for filename, byte_range(s), user_data, error code, opcode. But this
 91 |             // `anyhow::Error` will do for now.
 92 |             let nix_err = nix::Error::from_raw(-cqe_result);
 93 |             let context = format!(
 94 |                 "{nix_err} (reported by io_uring completion queue entry (CQE)). More details: \
 95 |                     idx_and_opcode: {idx_and_opcode:?}. cqe_result: {cqe_result}. self: {self:?}",
 96 |             );
 97 |             let err = Err(anyhow::Error::new(nix_err).context(context));
 98 |             output_channel.send(err).unwrap();
 99 |         }
100 |     }
101 | }
102 | 
103 | pub(crate) enum NextStep {
104 |     Pending,
105 |     Done,
106 |     ReplaceWith(Operation),
107 | }
108 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/tests/integration_test.rs:
--------------------------------------------------------------------------------
  1 | use crossbeam_channel::RecvTimeoutError;
  2 | use lsio_aligned_bytes::AlignedBytes;
  3 | use lsio_io::{Completion, Reader};
  4 | use lsio_uring::IoUring;
  5 | use rand::Rng;
  6 | use std::fs::File;
  7 | use std::io::Read;
  8 | use std::{io::Write, time::Duration};
  9 | 
 10 | const KIBIBYTE: usize = 1024;
 11 | const MEBIBYTE: usize = KIBIBYTE * 1024;
 12 | 
 13 | #[test]
 14 | fn test_get_ranges() -> anyhow::Result<()> {
 15 |     const N_WORKER_THREADS: usize = 4;
 16 |     const FILE_SIZE: usize = MEBIBYTE;
 17 |     const CHUNK_SIZE: usize = KIBIBYTE * 4;
 18 |     const N_CHUNKS: usize = FILE_SIZE / CHUNK_SIZE;
 19 | 
 20 |     // Create random ASCII text (that we will write to disk later):
 21 |     println!("Creating random data...");
 22 |     let distr = rand::distributions::Uniform::new_inclusive(32, 126);
 23 |     let file_contents: Vec<u8> = rand::thread_rng()
 24 |         .sample_iter(distr)
 25 |         .take(((CHUNK_SIZE as f32) * 1.5) as _)
 26 |         .collect::<Vec<u8>>()
 27 |         .into_iter()
 28 |         .cycle()
 29 |         .take(FILE_SIZE)
 30 |         .collect();
 31 |     assert_eq!(file_contents.len(), FILE_SIZE);
 32 | 
 33 |     // Create filename in temporary directory:
 34 |     let filename =
 35 |         std::env::temp_dir().join(format!("lsio_uring_tempfile_{}", rand::random::<u32>()));
 36 | 
 37 |     // Write file:
 38 |     println!("Writing random data to disk...");
 39 |     {
 40 |         let mut file = File::create(&filename)?;
 41 |         file.write_all(&file_contents)?;
 42 |         file.flush()?;
 43 |         file.sync_all()?;
 44 |     }
 45 | 
 46 |     // Check file is correctly written to disk:
 47 |     {
 48 |         let mut file = File::open(&filename)?;
 49 |         let mut temp_buffer = Vec::with_capacity(FILE_SIZE);
 50 |         file.read_to_end(&mut temp_buffer)?;
 51 |         assert!(temp_buffer.eq(&file_contents));
 52 |         assert_eq!(temp_buffer.len(), FILE_SIZE);
 53 |     }
 54 | 
 55 |     // Define byte ranges to load:
 56 |     let ranges = (0..N_CHUNKS)
 57 |         .map(|chunk_i| {
 58 |             let chunk_start = (chunk_i * CHUNK_SIZE) as isize;
 59 |             let chunk_end = chunk_start + (CHUNK_SIZE as isize);
 60 |             chunk_start..chunk_end
 61 |         })
 62 |         .collect();
 63 | 
 64 |     // Define user_data (so we can identify the chunks!)
 65 |     let user_data = (0..N_CHUNKS as u64).collect();
 66 | 
 67 |     // Submit get_ranges operation:
 68 |     println!("Reading data using io_uring!!!");
 69 |     let mut uring = IoUring::new(N_WORKER_THREADS);
 70 |     uring.get_ranges(&filename, ranges, user_data)?;
 71 | 
 72 |     // Re-assemble byte ranges:
 73 |     let mut vec_of_aligned_bytes: Vec<Option<AlignedBytes>> = (0..N_CHUNKS).map(|_| None).collect();
 74 | 
 75 |     for i in 0..N_CHUNKS {
 76 |         match uring.completion().recv_timeout(Duration::from_millis(500)) {
 77 |             Ok(output) => match output {
 78 |                 Ok(c) => {
 79 |                     let lsio_io::Output::Chunk(c) = c;
 80 |                     vec_of_aligned_bytes[c.user_data as usize] = Some(c.buffer);
 81 |                 }
 82 |                 Err(e) => panic!("Error reading chunk {i}! {e:?}"),
 83 |             },
 84 |             Err(RecvTimeoutError::Timeout) => panic!("Timed out waiting for chunk {i}!"),
 85 |             Err(RecvTimeoutError::Disconnected) => {
 86 |                 panic!("Disconnected whilst waiting for chunk {i}!")
 87 |             }
 88 |         };
 89 |     }
 90 |     println!("Finished reading using io_uring!");
 91 | 
 92 |     // Check that the completion queue does the right thing when IoUring is dropped:
 93 |     let completion = uring.completion().clone();
 94 |     drop(uring);
 95 |     assert!(completion.recv().is_err());
 96 |     drop(completion);
 97 | 
 98 |     // Re-assemble the chunks into the complete file:
 99 |     println!("Assembling buffer:");
100 |     let mut assembled_buf = Vec::with_capacity(FILE_SIZE);
101 |     for aligned_bytes in vec_of_aligned_bytes {
102 |         assembled_buf.extend_from_slice(aligned_bytes.unwrap().as_slice());
103 |     }
104 | 
105 |     println!(
106 |         "Read from disk: {:?}",
107 |         core::str::from_utf8(&assembled_buf[0..100]).unwrap()
108 |     );
109 |     println!(
110 |         "Ground truth  : {:?}",
111 |         core::str::from_utf8(&file_contents[0..100]).unwrap()
112 |     );
113 | 
114 |     assert!(assembled_buf.eq(&file_contents));
115 | 
116 |     // Clean up:
117 |     std::fs::remove_file(&filename)?;
118 | 
119 |     Ok(())
120 | }
121 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/get_ranges.rs:
--------------------------------------------------------------------------------
  1 | use std::{ffi::CString, iter::zip, ops::Range, sync::Arc};
  2 | 
  3 | use lsio_threadpool::WorkerThread;
  4 | 
  5 | use crate::{
  6 |     get_range::GetRange,
  7 |     open_file::OpenFileBuilder,
  8 |     operation::{NextStep, Operation, UringOperation},
  9 |     sqe::{build_openat_sqe, build_statx_sqe},
 10 | };
 11 | 
 12 | const N_CQES_EXPECTED: u8 = 2; // We're expecting CQEs for `openat` and `statx`.
 13 | 
 14 | #[derive(Debug)]
 15 | pub(crate) struct GetRanges {
 16 |     open_file_builder: Option<OpenFileBuilder>,
 17 |     ranges: Vec<Range<isize>>,
 18 |     user_data: Vec<u64>,
 19 | 
 20 |     // If both CQEs succeed then we'll capture their outputs in `open_file_builder`. But, in case
 21 |     // one or more CQEs reports a failure, we need an additional mechanism to track how many CQEs
 22 |     // we've received.
 23 |     n_cqes_received: u8,
 24 | }
 25 | 
 26 | impl GetRanges {
 27 |     pub(crate) fn new(location: CString, ranges: Vec<Range<isize>>, user_data: Vec<u64>) -> Self {
 28 |         assert_eq!(ranges.len(), user_data.len());
 29 |         Self {
 30 |             open_file_builder: Some(OpenFileBuilder::new(location)),
 31 |             ranges,
 32 |             user_data,
 33 |             n_cqes_received: 0,
 34 |         }
 35 |     }
 36 | 
 37 |     // io_uring can't process multiple range requests in a single op. So, once we've opened the
 38 |     // file and gotten its metadata, we need to submit one `Operation::GetRange` per byte range.
 39 |     fn submit_get_range_ops(&mut self, worker_thread: &WorkerThread<Operation>) {
 40 |         let file = Arc::new(self.open_file_builder.take().unwrap().build());
 41 |         for (range, user_data) in zip(&self.ranges, &self.user_data) {
 42 |             let get_range_op = GetRange::new(file.clone(), range.to_owned(), *user_data);
 43 |             worker_thread.push(Operation::GetRange(get_range_op));
 44 |         }
 45 |     }
 46 | }
 47 | 
 48 | impl UringOperation for GetRanges {
 49 |     fn submit_first_step(
 50 |         &mut self,
 51 |         index_of_op: usize,
 52 |         local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
 53 |     ) -> Result<(), io_uring::squeue::PushError> {
 54 |         let open_entry = build_openat_sqe(
 55 |             index_of_op,
 56 |             self.open_file_builder.as_ref().unwrap().location(),
 57 |         );
 58 |         let statx_entry =
 59 |             build_statx_sqe(index_of_op, &mut self.open_file_builder.as_mut().unwrap());
 60 |         unsafe {
 61 |             local_uring_submission_queue.push(&open_entry)?;
 62 |             local_uring_submission_queue.push(&statx_entry)?;
 63 |         };
 64 |         Ok(())
 65 |     }
 66 | 
 67 |     fn process_opcode_and_submit_next_step(
 68 |         &mut self,
 69 |         idx_and_opcode: &crate::user_data::UringUserData,
 70 |         cqe_result: i32,
 71 |         _local_uring_submission_queue: &mut io_uring::squeue::SubmissionQueue,
 72 |         worker_thread: &WorkerThread<Operation>,
 73 |         _output_channel: &mut crossbeam_channel::Sender<anyhow::Result<lsio_io::Output>>,
 74 |     ) -> NextStep {
 75 |         self.n_cqes_received += 1;
 76 |         if cqe_result >= 0 {
 77 |             match idx_and_opcode.opcode().value() {
 78 |                 io_uring::opcode::OpenAt::CODE => {
 79 |                     self.open_file_builder
 80 |                         .as_mut()
 81 |                         .unwrap()
 82 |                         .set_file_descriptor(io_uring::types::Fd(cqe_result));
 83 |                 }
 84 |                 io_uring::opcode::Statx::CODE => {
 85 |                     unsafe {
 86 |                         self.open_file_builder
 87 |                             .as_mut()
 88 |                             .unwrap()
 89 |                             .assume_statx_is_initialised();
 90 |                     };
 91 |                 }
 92 |                 _ => panic!("Unrecognised opcode! {idx_and_opcode:?}"),
 93 |             };
 94 |         };
 95 | 
 96 |         assert!(self.n_cqes_received <= N_CQES_EXPECTED);
 97 |         if self.n_cqes_received == N_CQES_EXPECTED {
 98 |             if self.open_file_builder.as_mut().unwrap().is_ready() {
 99 |                 self.submit_get_range_ops(worker_thread);
100 |                 NextStep::Done
101 |             } else {
102 |                 // We've seen all the CQEs we were expecting, but `open_file_builder` isn't ready. So
103 |                 // at least one of the CQEs must have resulted in an error. Nevertheless, we're "done".
104 |                 NextStep::Done
105 |             }
106 |         } else {
107 |             // We're expecting one more CQE.
108 |             NextStep::Pending
109 |         }
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ################ START OF RUST GITIGNORE ############################
  2 | # Generated by Cargo
  3 | 
  4 | # will have compiled files and executables
  5 | debug/
  6 | target/
  7 | 
  8 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries.
  9 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 10 | Cargo.lock
 11 | 
 12 | # These are backup files generated by rustfmt
 13 | **/*.rs.bk
 14 | 
 15 | # MSVC Windows builds of rustc generate these, which store debugging information
 16 | *.pdb
 17 | ################ END OF RUST GITIGNORE ##############################
 18 | 
 19 | ################ START OF PYTHON GITIGNORE ##########################
 20 | # Taken from github.com/github/gitignore/blob/main/Python.gitignore
 21 | 
 22 | # Byte-compiled / optimized / DLL files
 23 | __pycache__/
 24 | *.py[cod]
 25 | *$py.class
 26 | 
 27 | # C extensions
 28 | *.so
 29 | 
 30 | # Distribution / packaging
 31 | .Python
 32 | build/
 33 | develop-eggs/
 34 | dist/
 35 | downloads/
 36 | eggs/
 37 | .eggs/
 38 | lib/
 39 | lib64/
 40 | parts/
 41 | sdist/
 42 | var/
 43 | wheels/
 44 | share/python-wheels/
 45 | *.egg-info/
 46 | .installed.cfg
 47 | *.egg
 48 | MANIFEST
 49 | 
 50 | # PyInstaller
 51 | #  Usually these files are written by a python script from a template
 52 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 53 | *.manifest
 54 | *.spec
 55 | 
 56 | # Installer logs
 57 | pip-log.txt
 58 | pip-delete-this-directory.txt
 59 | 
 60 | # Unit test / coverage reports
 61 | htmlcov/
 62 | .tox/
 63 | .nox/
 64 | .coverage
 65 | .coverage.*
 66 | .cache
 67 | nosetests.xml
 68 | coverage.xml
 69 | *.cover
 70 | *.py,cover
 71 | .hypothesis/
 72 | .pytest_cache/
 73 | cover/
 74 | 
 75 | # Translations
 76 | *.mo
 77 | *.pot
 78 | 
 79 | # Django stuff:
 80 | *.log
 81 | local_settings.py
 82 | db.sqlite3
 83 | db.sqlite3-journal
 84 | 
 85 | # Flask stuff:
 86 | instance/
 87 | .webassets-cache
 88 | 
 89 | # Scrapy stuff:
 90 | .scrapy
 91 | 
 92 | # Sphinx documentation
 93 | docs/_build/
 94 | 
 95 | # PyBuilder
 96 | .pybuilder/
 97 | target/
 98 | 
 99 | # Jupyter Notebook
100 | .ipynb_checkpoints
101 | 
102 | # IPython
103 | profile_default/
104 | ipython_config.py
105 | 
106 | # pyenv
107 | #   For a library or package, you might want to ignore these files since the code is
108 | #   intended to run in multiple environments; otherwise, check them in:
109 | # .python-version
110 | 
111 | # pipenv
112 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
113 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
114 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
115 | #   install all needed dependencies.
116 | #Pipfile.lock
117 | 
118 | # poetry
119 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
120 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
121 | #   commonly ignored for libraries.
122 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
123 | #poetry.lock
124 | 
125 | # pdm
126 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
127 | #pdm.lock
128 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
129 | #   in version control.
130 | #   https://pdm.fming.dev/#use-with-ide
131 | .pdm.toml
132 | 
133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134 | __pypackages__/
135 | 
136 | # Celery stuff
137 | celerybeat-schedule
138 | celerybeat.pid
139 | 
140 | # SageMath parsed files
141 | *.sage.py
142 | 
143 | # Environments
144 | .env
145 | .venv
146 | env/
147 | venv/
148 | ENV/
149 | env.bak/
150 | venv.bak/
151 | 
152 | # Spyder project settings
153 | .spyderproject
154 | .spyproject
155 | 
156 | # Rope project settings
157 | .ropeproject
158 | 
159 | # mkdocs documentation
160 | /site
161 | 
162 | # mypy
163 | .mypy_cache/
164 | .dmypy.json
165 | dmypy.json
166 | 
167 | # Pyre type checker
168 | .pyre/
169 | 
170 | # pytype static type analyzer
171 | .pytype/
172 | 
173 | # Cython debug symbols
174 | cython_debug/
175 | 
176 | # PyCharm
177 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
178 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
179 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
180 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
181 | #.idea/
182 | ################ END OF PYTHON GITIGNORE ############################
183 | 
184 | .vscode/
185 | perf.data
186 | perf.data.*
187 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/sqe.rs:
--------------------------------------------------------------------------------
  1 | use io_uring::squeue;
  2 | use io_uring::types;
  3 | use lsio_aligned_bytes::AlignedBytes;
  4 | use lsio_aligned_bytes::AlignedBytesMut;
  5 | use std::ffi::CString;
  6 | use std::ops::Range;
  7 | 
  8 | use crate::open_file::OpenFile;
  9 | use crate::open_file::OpenFileBuilder;
 10 | use crate::user_data::UringUserData;
 11 | 
 12 | const ALIGN: isize = 512; // TODO: Get ALIGN at runtime from statx.
 13 | 
 14 | /// # Documentation about the openat operation in io_uring:
 15 | /// - https://man7.org/linux/man-pages/man2/openat.2.html
 16 | /// - https://man7.org/linux/man-pages/man3/io_uring_prep_openat.3.html
 17 | pub(crate) fn build_openat_sqe(index_of_op: usize, location: &CString) -> squeue::Entry {
 18 |     // Prepare the "openat" submission queue entry (SQE):
 19 |     io_uring::opcode::OpenAt::new(
 20 |         // `dirfd` is ignored if the pathname is absolute.
 21 |         // See the "openat()" section in https://man7.org/linux/man-pages/man2/openat.2.html
 22 |         types::Fd(-1),
 23 |         location.as_ptr(),
 24 |     )
 25 |     .flags(libc::O_RDONLY | libc::O_DIRECT)
 26 |     .build()
 27 |     .user_data(UringUserData::new(index_of_op, io_uring::opcode::OpenAt::CODE).into())
 28 | }
 29 | 
 30 | /// Build a `statx` submission queue entry (SQE).
 31 | ///
 32 | /// # Safety
 33 | /// Assumes the struct that `statx_ptr` points to exists and has been zeroed.
 34 | ///
 35 | /// # Documentation about the statx operation in io_uring:
 36 | /// - https://man7.org/linux/man-pages/man2/statx.2.html
 37 | /// - https://man7.org/linux/man-pages/man3/io_uring_prep_statx.3.html
 38 | /// - https://docs.rs/io-uring/latest/io_uring/opcode/struct.Statx.html
 39 | /// - https://docs.rs/libc/latest/libc/struct.statx.html
 40 | pub(crate) fn build_statx_sqe(
 41 |     index_of_op: usize,
 42 |     open_file_builder: &mut OpenFileBuilder,
 43 | ) -> squeue::Entry {
 44 |     // Prepare the "statx" submission queue entry (SQE):
 45 |     io_uring::opcode::Statx::new(
 46 |         // `dirfd` is ignored if the pathname is absolute. See:
 47 |         // https://man7.org/linux/man-pages/man2/statx.2.html
 48 |         types::Fd(-1),
 49 |         open_file_builder.location().as_ptr(),
 50 |         open_file_builder.get_statx_ptr() as *mut _,
 51 |     )
 52 |     // See here for a description of the flags for statx:
 53 |     // https://man7.org/linux/man-pages/man2/statx.2.html
 54 |     .mask(libc::STATX_SIZE | libc::STATX_DIOALIGN)
 55 |     .build()
 56 |     .user_data(UringUserData::new(index_of_op, io_uring::opcode::Statx::CODE).into())
 57 | }
 58 | 
 59 | pub(crate) fn build_read_range_sqe(
 60 |     index_of_op: usize,
 61 |     file: &OpenFile,
 62 |     range: &Range<isize>,
 63 | ) -> (squeue::Entry, AlignedBytes) {
 64 |     let filesize: isize = file.size().try_into().unwrap();
 65 |     let start_offset = if range.start >= 0 {
 66 |         range.start
 67 |     } else {
 68 |         // `range.start` is negative. We interpret a negative `range.start`
 69 |         // as an offset from the end of the file.
 70 |         filesize + range.start
 71 |     };
 72 |     assert!(start_offset >= 0);
 73 | 
 74 |     let end_offset = if range.end >= 0 {
 75 |         range.end
 76 |     } else {
 77 |         // `range.end` is negative. We interpret a negative `range.end`
 78 |         // as an offset from the end of the file, where `range.end = -1` means the last byte.
 79 |         filesize + range.end + 1
 80 |     };
 81 |     assert!(end_offset >= 0);
 82 | 
 83 |     let aligned_start_offset = (start_offset / ALIGN) * ALIGN;
 84 | 
 85 |     let mut buffer;
 86 |     {
 87 |         let buf_len = end_offset - aligned_start_offset;
 88 |         assert!(buf_len > 0);
 89 | 
 90 |         // Allocate vector. If `buf_len` is not exactly divisible by ALIGN, then
 91 |         // `AlignedBytesMut::new` will extend the length until it is aligned.
 92 |         buffer = AlignedBytesMut::new(buf_len as usize, ALIGN.try_into().unwrap());
 93 |         // From now on, use `buffer.len()` as the correct length!
 94 |         // This code is in its own scope so that `buf_len` cannot be used in subsequent code.
 95 |     }
 96 | 
 97 |     // Prepare the "read" opcode:
 98 |     let read_op = io_uring::opcode::Read::new(
 99 |         *file.file_descriptor(),
100 |         buffer.as_mut_ptr(),
101 |         buffer.len().try_into().unwrap(),
102 |     )
103 |     .offset(aligned_start_offset as _)
104 |     .build()
105 |     .user_data(UringUserData::new(index_of_op, io_uring::opcode::Read::CODE).into());
106 | 
107 |     // If the `start_offset` is not aligned, then the start of the buffer will contain data that
108 |     // the user did not request.
109 |     if aligned_start_offset != start_offset {
110 |         _ = buffer
111 |             .split_to((start_offset - aligned_start_offset).try_into().unwrap())
112 |             .unwrap();
113 |     }
114 | 
115 |     // `freeze` the buffer, and set the slice to the slice requested by the user:
116 |     let start_slice: usize = (start_offset - aligned_start_offset).try_into().unwrap();
117 |     let end_slice: usize = (end_offset - aligned_start_offset).try_into().unwrap();
118 |     let mut buffer = buffer.freeze().unwrap();
119 |     buffer.set_slice(start_slice..end_slice);
120 | 
121 |     (read_op, buffer)
122 | }
123 | 
124 | /// # Documentation about the `close` operation:
125 | /// - https://man7.org/linux/man-pages/man2/close.2.html
126 | pub(crate) fn build_close_sqe(
127 |     index_of_op: usize,
128 |     file_descriptor: io_uring::types::Fd,
129 | ) -> squeue::Entry {
130 |     io_uring::opcode::Close::new(file_descriptor)
131 |         .build()
132 |         .user_data(UringUserData::new(index_of_op, io_uring::opcode::Close::CODE).into())
133 | }
134 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/src/worker.rs:
--------------------------------------------------------------------------------
  1 | use io_uring::{cqueue, squeue};
  2 | use lsio_io::Output;
  3 | use lsio_threadpool::WorkerThread;
  4 | 
  5 | use crate::{
  6 |     operation::{NextStep, Operation, UringOperation},
  7 |     tracker::Tracker,
  8 |     user_data::UringUserData,
  9 | };
 10 | 
 11 | /// `MAX_SQ_ENTRIES_PER_ITERATION` describes the most SQEs that will be submitted to the io_uring SQ by
 12 | /// a single iteration of the `run` loop. This constant is used to make sure we have enough
 13 | /// headroom in the SQ before each iteration of the `run` loop.
 14 | const MAX_SQ_ENTRIES_PER_ITERATION: usize = 2;
 15 | 
 16 | /// Size of the io_uring submission queue (SQ).
 17 | const SQ_RING_SIZE: usize = 64;
 18 | 
 19 | /// We keep filling the SQ until we hit the "high water line" before we start draining the
 20 | /// completion queue. This ensures that we allow io_uring to process as many operations in parallel
 21 | /// as possible.
 22 | const HIGH_WATER_LINE: usize = SQ_RING_SIZE / 2;
 23 | 
 24 | pub struct UringWorker {
 25 |     uring: io_uring::IoUring,
 26 |     ops_in_flight: Tracker<Operation>,
 27 |     worker_thread: WorkerThread<Operation>,
 28 |     output_tx: crossbeam_channel::Sender<anyhow::Result<Output>>,
 29 | }
 30 | 
 31 | impl UringWorker {
 32 |     pub(crate) fn new(
 33 |         worker_thread: WorkerThread<Operation>,
 34 |         output_tx: crossbeam_channel::Sender<anyhow::Result<Output>>,
 35 |     ) -> Self {
 36 |         assert!(MAX_SQ_ENTRIES_PER_ITERATION < SQ_RING_SIZE);
 37 | 
 38 |         let ring: io_uring::IoUring<squeue::Entry, cqueue::Entry> = io_uring::IoUring::builder()
 39 |             // TODO: Allow the user to decide whether sqpoll is used.
 40 |             .setup_sqpoll(1000) // The kernel sqpoll thread will sleep after this many milliseconds.
 41 |             .build(SQ_RING_SIZE as _)
 42 |             .expect("Failed to initialise io_uring.");
 43 | 
 44 |         assert_eq!(ring.params().cq_entries(), ring.params().sq_entries() * 2);
 45 | 
 46 |         Self {
 47 |             uring: ring,
 48 |             ops_in_flight: Tracker::new(SQ_RING_SIZE),
 49 |             worker_thread,
 50 |             output_tx,
 51 |         }
 52 |     }
 53 | 
 54 |     /// The main loop for the thread.
 55 |     pub(crate) fn run(&mut self) {
 56 |         while self.worker_thread.keep_running() {
 57 |             if self.ops_in_flight.is_full() || self.uring_is_full() {
 58 |                 if self.uring.completion().is_empty() {
 59 |                     // The SQ is full but no completion events are ready! So we have no choice:
 60 |                     // We *have* to wait for some completion events to to complete:
 61 |                     self.uring.submit_and_wait(1).unwrap();
 62 |                 }
 63 |                 // The CQ has CQEs for us, so we fall through to the CQ processing loop.
 64 |             } else {
 65 |                 match self.worker_thread.find_task() {
 66 |                     Some(mut operation) => {
 67 |                         // Submit first step of `operation`, and track `operation`:
 68 |                         let index_of_op = self
 69 |                             .ops_in_flight
 70 |                             .get_next_index()
 71 |                             .expect("Failed to get_next_task on tracker!");
 72 |                         operation
 73 |                             .submit_first_step(index_of_op, &mut self.uring.submission())
 74 |                             .expect("Failed to submit_first_step of Operation!");
 75 |                         // TODO: Instead of calling `submit()` on every loop, we should keep our
 76 |                         // own check on how long has elapsed since we last submitted to the SQ,
 77 |                         // and only call `submit()` when we know the SQ has gone to sleep.
 78 |                         // See issue #129.
 79 |                         self.uring.submitter().submit().unwrap();
 80 |                         self.ops_in_flight.put(index_of_op, operation);
 81 |                         if self.sq_len_plus_cq_len() < HIGH_WATER_LINE {
 82 |                             // We want to "top up" the SQ before we process any CQEs.
 83 |                             // Without this, we run the risk of submitting one SQE, then draining
 84 |                             // that CQE, then submitting another SQE, and training that CQE, etc.
 85 |                             // In other words, we run the risk of not letting io_uring handle
 86 |                             // multiple SQEs at once!
 87 |                             continue;
 88 |                         }
 89 |                     }
 90 |                     None => {
 91 |                         // There are no new operations to submit, so let's work out if we need to
 92 |                         // park or process the completion queue.
 93 |                         if self.ops_in_flight.is_empty() {
 94 |                             // There's nothing to do! So we have to sleep:
 95 |                             self.worker_thread.park();
 96 |                             // When we wake, there definitely won't be anything in our uring, so
 97 |                             // continue to the top of the while loop:
 98 |                             continue;
 99 |                         }
100 |                     }
101 |                 }
102 |             }
103 | 
104 |             for cqe in unsafe { self.uring.completion_shared() } {
105 |                 let idx_and_opcode = UringUserData::from(cqe.user_data());
106 |                 let idx_of_op = idx_and_opcode.index_of_op() as usize;
107 |                 let mut op_guard = self.ops_in_flight.get(idx_of_op).unwrap();
108 |                 let next_step = op_guard.as_mut().process_opcode_and_submit_next_step(
109 |                     &idx_and_opcode,
110 |                     cqe.result(),
111 |                     &mut unsafe { self.uring.submission_shared() },
112 |                     &self.worker_thread,
113 |                     &mut self.output_tx,
114 |                 );
115 |                 match next_step {
116 |                     NextStep::Pending => (), // By default, op_guard will keep the operation.
117 |                     NextStep::ReplaceWith(op) => op_guard.replace(op),
118 |                     NextStep::Done => {
119 |                         let _ = op_guard.remove();
120 |                     }
121 |                 };
122 |             }
123 |         }
124 |         assert!(self.ops_in_flight.is_empty());
125 |     }
126 | 
127 |     /// io_uring submission queue (SQ) length plus the io_uring completion queue (CQ) length:
128 |     fn sq_len_plus_cq_len(&self) -> usize {
129 |         unsafe { self.uring.submission_shared().len() + self.uring.completion_shared().len() }
130 |     }
131 | 
132 |     fn uring_is_full(&self) -> bool {
133 |         self.sq_len_plus_cq_len() >= SQ_RING_SIZE - MAX_SQ_ENTRIES_PER_ITERATION
134 |     }
135 | }
136 | 


--------------------------------------------------------------------------------
/crates/lsio_bench/src/main.rs:
--------------------------------------------------------------------------------
  1 | use std::{
  2 |     env::temp_dir,
  3 |     fs::File,
  4 |     io::Write,
  5 |     ops::Range,
  6 |     path::{Path, PathBuf},
  7 |     process::Command,
  8 |     time::{Duration, Instant},
  9 | };
 10 | 
 11 | use clap::{error::ErrorKind, CommandFactory, Parser};
 12 | use indicatif::{ProgressBar, ProgressStyle};
 13 | use lsio_io::{Completion, Reader};
 14 | use lsio_uring::IoUring;
 15 | 
 16 | const FILENAME_PREFIX: &str = "lsio_bench_";
 17 | const MEBIBYTE: f64 = (1024 * 1024) as _;
 18 | 
 19 | #[derive(Parser, Debug)]
 20 | #[command(version, about, long_about = None)]
 21 | struct Args {
 22 |     /// Prefix filenames with this directory. If not set, will default to the system's temporary
 23 |     /// directory. This directory must already exist.
 24 |     #[arg(short, long)]
 25 |     directory: Option<PathBuf>,
 26 | 
 27 |     /// The number of files to read from for this benchmark.
 28 |     #[arg(short, long, default_value_t = 1, value_parser = clap::value_parser!(u32).range(1..))]
 29 |     nrfiles: u32,
 30 | 
 31 |     /// The size of each file, in bytes
 32 |     #[arg(short, long, default_value_t = 1024 * 1024, value_parser = clap::value_parser!(u64).range(1..))]
 33 |     filesize: u64,
 34 | 
 35 |     /// The chunk size in bytes. By default, the blocksize will be the same as the filesize.
 36 |     #[arg(short, long, value_parser = clap::value_parser!(u64).range(1..))]
 37 |     blocksize: Option<u64>,
 38 | 
 39 |     /// The number of worker threads that lsio_uring uses:
 40 |     #[arg(short = 'w', long, default_value_t = 4, value_parser = clap::value_parser!(u64).range(1..1024))]
 41 |     nr_worker_threads: u64,
 42 | }
 43 | 
 44 | fn main() -> std::io::Result<()> {
 45 |     let args = Args::parse();
 46 | 
 47 |     let directory = check_directory_or_use_temp_dir(&args.directory);
 48 | 
 49 |     let filenames: Vec<PathBuf> = (0..args.nrfiles)
 50 |         .map(|i| directory.join(format!("{FILENAME_PREFIX}{i}")))
 51 |         .collect();
 52 | 
 53 |     create_files_if_necessary(&filenames, args.filesize)?;
 54 | 
 55 |     clear_page_cache(&directory);
 56 | 
 57 |     read_files(
 58 |         &filenames,
 59 |         args.filesize,
 60 |         args.blocksize,
 61 |         args.nr_worker_threads as usize,
 62 |     );
 63 | 
 64 |     Ok(())
 65 | }
 66 | 
 67 | fn check_directory_or_use_temp_dir(directory: &Option<PathBuf>) -> PathBuf {
 68 |     // Check directory exists. Or use temp_dir.
 69 |     if let Some(directory) = directory.as_deref() {
 70 |         if directory.is_dir() {
 71 |             directory.to_path_buf()
 72 |         } else {
 73 |             let mut cmd = Args::command();
 74 |             cmd.error(
 75 |                 ErrorKind::ValueValidation,
 76 |                 format!("Directory {directory:?} does not exist, or is not a directory"),
 77 |             )
 78 |             .exit();
 79 |         }
 80 |     } else {
 81 |         temp_dir()
 82 |     }
 83 | }
 84 | 
 85 | fn create_files_if_necessary(filenames: &[PathBuf], filesize: u64) -> std::io::Result<()> {
 86 |     // Create progress bar:
 87 |     println!(
 88 |         "Creating {} files (if necessary), each of filesize {filesize} bytes...",
 89 |         filenames.len()
 90 |     );
 91 |     let pb = ProgressBar::new(filenames.len() as _);
 92 |     pb.set_style(get_progress_bar_style());
 93 | 
 94 |     // Loop through files:
 95 |     let mut file_contents: Option<Vec<u8>> = None;
 96 |     for filename in filenames {
 97 |         if filename.exists() && get_filesize(&filename)? == filesize {
 98 |             pb.set_message(format!("exists: {filename:?}"));
 99 |         } else {
100 |             pb.set_message(format!("creating: {filename:?}"));
101 |             if file_contents.is_none() {
102 |                 file_contents = Some((0..filesize).map(|i| i as u8).collect());
103 |             }
104 |             let mut file = File::create(&filename)?;
105 |             file.write_all(file_contents.as_ref().unwrap())?;
106 |             file.flush()?;
107 |         }
108 |         pb.inc(1);
109 |     }
110 |     pb.finish_with_message("done");
111 |     Ok(())
112 | }
113 | 
114 | fn get_filesize(filename: &Path) -> std::io::Result<u64> {
115 |     Ok(File::open(&filename)?.metadata()?.len())
116 | }
117 | 
118 | fn get_progress_bar_style() -> ProgressStyle {
119 |     ProgressStyle::with_template("[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}")
120 |         .unwrap()
121 |         .progress_chars("##-")
122 | }
123 | 
124 | fn read_files(
125 |     filenames: &[PathBuf],
126 |     filesize: u64,
127 |     blocksize: Option<u64>,
128 |     n_worker_threads: usize,
129 | ) {
130 |     let blocksize = if let Some(bs) = blocksize {
131 |         bs
132 |     } else {
133 |         filesize
134 |     };
135 | 
136 |     // Calculate chunks
137 |     let n_chunks = filesize / blocksize;
138 |     let chunks: Vec<Range<isize>> = (0..n_chunks)
139 |         .map(|chunk_i| {
140 |             let chunk_start = (chunk_i * blocksize) as isize;
141 |             let chunk_end = chunk_start + (blocksize as isize);
142 |             chunk_start..chunk_end
143 |         })
144 |         .collect();
145 |     assert_eq!(chunks.len(), n_chunks as _);
146 | 
147 |     // Define user_data (so we can identify the chunks!)
148 |     let user_data: Vec<u64> = (0..n_chunks as u64).collect();
149 | 
150 |     let mut uring = IoUring::new(n_worker_threads);
151 | 
152 |     // Set up progress bar:
153 |     let n_files = filenames.len() as u64;
154 |     let n_total_chunks = n_files * n_chunks;
155 |     println!("Performing read benchmark for {n_files} files x {n_chunks} chunks per file = {n_total_chunks} total chunks:");
156 |     let pb = ProgressBar::new(n_total_chunks);
157 |     pb.set_style(get_progress_bar_style());
158 | 
159 |     let started = Instant::now();
160 | 
161 |     // Submit all the get_ranges requests:
162 |     for filename in filenames {
163 |         uring
164 |             .get_ranges(&filename, chunks.clone(), user_data.clone())
165 |             .unwrap();
166 |     }
167 | 
168 |     // Collect results
169 |     for _ in 0..n_total_chunks {
170 |         match uring
171 |             .completion()
172 |             .recv_timeout(Duration::from_millis(10000))
173 |         {
174 |             Ok(_) => pb.inc(1),
175 |             Err(e) => panic!("Error collecting chunk! {e:?}"),
176 |         }
177 |     }
178 |     pb.finish();
179 | 
180 |     // Calculate bandwidth
181 |     let total_secs = started.elapsed().as_secs_f64();
182 |     let total_bytes = (filesize * n_files) as f64;
183 |     let bytes_per_sec = total_bytes / total_secs;
184 |     println!("Total runtime: {} secs", total_secs);
185 |     println!("Total mebibytes: {} MiB", total_bytes / MEBIBYTE);
186 |     println!(
187 |         "Total bandwidth = {} mebibytes per sec",
188 |         bytes_per_sec / MEBIBYTE
189 |     );
190 | }
191 | 
192 | fn clear_page_cache(directory: &Path) {
193 |     println!("Clearing page cache for {directory:?}...");
194 |     let _ = Command::new("vmtouch")
195 |         .arg("-e")
196 |         .arg(directory)
197 |         .output()
198 |         .expect("vmtouch failed to start");
199 | }
200 | 


--------------------------------------------------------------------------------
/planned_design.md:
--------------------------------------------------------------------------------
  1 | # Planned Design for `light-speed-io`
  2 | 
  3 | # Summary
  4 | 
  5 | The ultimate aim is to load and process multi-dimensional arrays as quickly and as efficiently as modern hardware will allow!
  6 | 
  7 | Why? Initially, to make life as easy as possible for folks who want to train ML models on large multi-dimensional datasets (like weather predictions). Specifically:
  8 | 
  9 | - Enable folks to train directly from source datasets (instead of having to prepare batches ahead-of-time) at multiple GB/sec,
 10 | - make it as easy as possible to combine different datasets on-the-fly (e.g. NWPs and satellite datasets, see [issue #142](https://github.com/JackKelly/light-speed-io/issues/142)),
 11 | - use single on-disk datasets for as many ML experiments as possible (see [issue #141](https://github.com/JackKelly/light-speed-io/issues/141)). Stop people having to manually duplicate data (with different chunking schemes) for each ML model.
 12 | 
 13 | LSIO will provide a suite of tools for loading and processing large, labelled, multi-dimensional datasets. Where "large" means datasets that are too large to fit into RAM, and where "labelled" means datasets where each array dimension can be associated with coordinates. For example, a dataset of satellite imagery might have 4 dimensions: x, y, time, and spectral channel. The x and y dimensions might be labelled with longitude and latitude coordinates, respectively.
 14 | 
 15 | The main focus will be on processing data on a single machine. Hopefully tools like Dask could schedule LSIO across multiple machines.
 16 | 
 17 | Please see [this blog post](https://jack-kelly.com/blog/2023-07-28-speeding-up-zarr) for more details of the background and motivations behind this project.
 18 | 
 19 | This git repository contains multiple crates. Each crate implements "just one thing". Each crate will exist in one of five levels of abstraction. And there will be a Python API to each level of abstraction. See the "planned design" diagram below.
 20 | 
 21 | ## Fitting into the ecosystem
 22 | Today, there are many awesome software packages for working with large, labelled, multi-dimensional datasets (such as xarray, fsspec, dask, kerchunk, satpy, etc.). LSIO aims to help speed up this existing stack: Either by providing tools that existing Python packages can hook into, or by providing new tools which play nicely with the existing stack, or by creating new tools with very similar Python APIs to existing Python APIs.
 23 | 
 24 | ## Why bother to build `light-speed-io`? What gap does it fill?
 25 | LSIO is all about computational speed _and_ efficiency! Today, using existing packages, you can achieve high throughput by spinning up a large cluster. But that's expensive, power-hungry, and tedious! The aim of LSIO is to enable high throughput and low latency on a single machine.
 26 | 
 27 | To look at this from the users' perspective: one of the main aims is to enable users to far more easily train ML models on huge multi-dimensional datasets.
 28 | 
 29 | ## How to be efficient and fast?
 30 | By being [sympathetic](https://dzone.com/articles/mechanical-sympathy) to the hardware!
 31 | 
 32 | That sounds abstract! In concrete terms, one central aim is for the machine to do as little work as possible. Specifically:
 33 | 
 34 | Minimise the number of:
 35 | - round-trips to RAM,
 36 | - system calls,
 37 | - heap allocations,
 38 | - network requests,
 39 | - memory copies.
 40 | 
 41 | Maximise the use of:
 42 | - CPU caches,
 43 | - all the levels of parallelism available within a single machine,
 44 | - modern, efficient IO APIs like io_uring.
 45 | 
 46 | ## Concrete goals
 47 | Some example concrete goals include:
 48 | - Compute summary statistics of multi-terabyte dataset on a laptop, at a speed of about 5 minutes per terabyte (from a fast local SSD), with minimal RAM requirements.
 49 | - Train a large machine learning model from two Zarr datasets (e.g. satellite imagery and numerical weather predictions) at a sustained bandwidth to the GPU of at least 1 gigabyte per second (from local SSDs or from a cloud storage bucket), whilst performing some light processing on the data on-the-fly. Crucially, each ML training example should be a random crop of the multi-dimensional dataset. (Random cropping is particularly slow on today's software stack.)
 50 | 
 51 | ## Priorities
 52 | The first area of focus is on high-speed IO for local SSDs on Linux, to speed up training ML models from sharded Zarr datasets. But we're definitely also interested in helping speed up access to data stored in cloud object storage (see [issue #10](https://github.com/JackKelly/light-speed-io/issues/10)), and in helping to speed up general data analytics tasks on multi-dimensional data.
 53 | 
 54 | ## How long will this take?
 55 | Implementing the complete design sketched out in this doc will take _years_!
 56 | 
 57 | By the end of 2024, I hope to have MVP implementations of "level 1 (I/O)" and "level 2 (parallel compute on chunks)" and a basic Zarr implementation for level 4. But please don't hold me to that!
 58 | 
 59 | # Which crates would live in this repo? What would they do? And how would they communicate with each other? 
 60 | 
 61 | ![Planned design for LSIO](planned_design.svg)
 62 | ([Original Google Draw version of this diagram](https://docs.google.com/drawings/d/1cpRai2k9y2Y9v4ieaof33FT27uB4JlK_rJL9Lvbj4MM/edit?usp=sharing).)
 63 | 
 64 | My hope is to categorise the crates into several different levels of abstraction:
 65 | 
 66 | ## Abstraction level 1: Data input/output
 67 | This is lowest level of abstraction: the level closest to the hardware.
 68 | 
 69 | ### Common interface
 70 | These IO crates will share a common interface: 
 71 | - Instructions will be given to the IO crate via a common set of methods, defined in a `Reader` trait and a `Writer` trait. Under the hood, the `lsio_uring` crate will encode each of the user's commands into a `struct`, and put these structs into a multi-producer multi-consumer `crossbeam::channel`. An example of an instruction could be: "get 1,000,000 chunks of `/foo/bar`". These `IoOperation`s will probably be grouped ([#68](https://github.com/JackKelly/light-speed-io/issues/68)), such that the IO crate will guarantee that all operations in group _n_ are completed before any IO operations in group _n+1_ are started.
 72 | - Output channel: A multi-producer multi-consumer `crossbeam::channel` of `Chunk`s to return completed data to the user (these will also be grouped) (see [#105](https://github.com/JackKelly/light-speed-io/issues/105)).
 73 | 
 74 | LSIO will also enable buffer recycling whereby the user can optionally tell the IO crate "re-use each IO buffer once I've dropped it" (to minimise the number of heap allocations). ([#38](https://github.com/JackKelly/light-speed-io/issues/38)). This will probably be implemented via the `drop` method on `AlignedBytes`.
 75 | 
 76 | ### Crates
 77 | - [ ] `aligned_bytes`
 78 | - [ ] `lsio_uring` (this is what I'm currently working on): provide a small threadpool which performs IO using io_uring.
 79 | - [ ] [`lsio_io_python_bridge` #39)[https://github.com/JackKelly/light-speed-io/issues/39]
 80 | - [ ] [`object_store_bridge` #107](https://github.com/JackKelly/light-speed-io/issues/107) (also see [Ideas for fast cloud storage #10](https://github.com/JackKelly/light-speed-io/issues/10))
 81 | - [ ] maybe other crates for high-performance local storage on MacOS and/or Windows.
 82 | 
 83 | ## Abstraction level 2: Parallel compute on chunks
 84 | 
 85 | ### Common interface
 86 | These crates will all consume the `output channel` from the IO layer.
 87 | 
 88 | ### Crates
 89 | - [ ] `lsio_compute`: Perform parallel computation on data. Users can supply any function to be applied to each chunk. The actual computation will probably be orchestrated by Rayon. This crate will implement functions for operating on the `struct Chunks` that represents each buffer with its metadata (see #105).
 90 | - [ ] `lsio_codecs`: Compression / decompression
 91 | 
 92 | ## Abstraction level 3: Automatically scheduling compute & IO
 93 | The aim is to do to as little work as possible to satisfy the user's requests: don't repeat work (if we can avoid it) and don't do work that doesn't contribute to the final outcome.
 94 | 
 95 | - [ ] `lsio_scheduler`
 96 | 
 97 | ## Abstraction level 4: Crates that load / write to a specific file format
 98 | 
 99 | These crates will each include a Python API.
100 | 
101 | ### Crates
102 | - [ ] `lsio_zarr_codec_pipeline`: A pure-Rust implementation of [`zarr-python` v3's new `CodecPipeline`](https://github.com/zarr-developers/zarr-python/issues/1806#issuecomment-2085680824), that can be used as a storage and decompression backend for `zarr-python`.
103 | - [ ] `lsio_zarr`
104 | - [ ] `lsio_grib`
105 | - [ ] etc.
106 | 
107 | ## Abstraction level 5: Domain-specific computation
108 | 
109 | ### Crates
110 | - [ ] `lsio_rechunker`
111 | - [ ] `lsio_array`
112 | 
113 | 


--------------------------------------------------------------------------------
/crates/lsio_aligned_bytes/README.md:
--------------------------------------------------------------------------------
  1 | A memory buffer allocated on the heap.
  2 | 
  3 | The start position and end position of the backing buffer are both aligned in memory. The user
  4 | specifies the memory alignment at runtime. This is useful for working with `O_DIRECT` file IO,
  5 | where the filesystem will often expect the buffer to be aligned to the logical block size of
  6 | the filesystem[^o_direct] (typically 512 bytes).
  7 | 
  8 | The API is loosely inspired by the [`bytes`](https://docs.rs/bytes/latest/bytes/index.html) crate.
  9 | To give a very quick overview of the `bytes` crate: The `bytes` crate has an (immutable)
 10 | [`Bytes`](https://docs.rs/bytes/latest/bytes/struct.Bytes.html) struct and a (mutable)
 11 | [`BytesMut`](https://docs.rs/bytes/latest/bytes/struct.BytesMut.html) struct. `BytesMut` can be
 12 | `split` into multiple non-overlapping owned views of the same backing buffer. The backing
 13 | buffer is dropped when all the views referencing that buffer are dropped. `BytesMut` can be
 14 | [frozen](https://docs.rs/bytes/latest/bytes/struct.BytesMut.html#method.freeze) to produce an
 15 | (immutable) `Bytes` struct which, in turn, can be sliced to produce (potentially overlapping)
 16 | owned views of the same backing buffer.
 17 | 
 18 | `aligned_bytes` follows a similar pattern:
 19 | 
 20 | [`AlignedBytesMut`] can be [`AlignedBytesMut::split_to`] to produce multiple non-overlapping mutable
 21 | views of the same backing buffer without copying the memory (each `AlignedBytesMut` has its own
 22 | `range` (which represents the byte range that this `AlignedBytesMut` has exclusive access to)
 23 | and an `Arc<InnerBuffer>`). The splitting process guarantees that views cannot overlap, so we
 24 | do not have to use locks, whilst allowing multiple threads to write to (non-overlapping regions
 25 | of) the same buffer.
 26 | 
 27 | When you have finished writing into the buffer, drop all but one of the `AlignedBytesMut`
 28 | objects, and call [`AlignedBytesMut::freeze`] on the last `AlignedByteMut`. This will
 29 | consume the `AlignedBytesMut` and return an (immutable) [`AlignedBytes`]. Then you
 30 | can `clone` and [`AlignedBytes::set_slice`] to get (potentially overlapping) owned views of the
 31 | same backing buffer.
 32 | 
 33 | The backing buffer will be dropped when all views into the backing buffer are dropped.
 34 | 
 35 | Unlike `bytes`, `aligned_bytes` does not use a `vtable`, nor does it allow users to grow the
 36 | backing buffers. `aligned_bytes` implements the minimal set of features required for the rest
 37 | of the LSIO project! In fact, `aligned_bytes` is _so_ minimal that the only way to write data
 38 | into an `AlignedBytesMut` is via [`AlignedBytesMut::as_mut_ptr`] (because that's what the
 39 | operating system expects!)
 40 | 
 41 | # Examples and use-cases
 42 | 
 43 | **Use case 1: The user requests multiple contiguous byte ranges from LSIO.**
 44 | 
 45 | Let's say the user requests two byte ranges from a single file: `0..4096`, and `4096..8192`.
 46 | 
 47 | Under the hood, LSIO will:
 48 | 
 49 | - Notice that these two byte ranges are consecutive, and merge these two byte ranges into a
 50 |   single read operation.
 51 | - Allocate a single 8,192 byte `AlignedBytesMut`, aligned to 512-bytes.
 52 | - Submit a `read` operation to `io_uring` for all 8,192 bytes.
 53 | - When the single read op completes, we `freeze` the buffer, which consumes the
 54 |   `AlignedBytesMut` and returns an `AlignedBytes`, which we then `reset_slice()` to view the
 55 |   entire 8,192 backing buffer.
 56 | - Split the `AlignedBytes` into two owned `AlignedBytes`, and return these to the user.
 57 | - The underlying buffer will be dropped when the user drops the two `AlignedBytes`.
 58 | 
 59 | Here's a code sketch to show how this works:
 60 | 
 61 | ```rust
 62 | use lsio_aligned_bytes::AlignedBytesMut;
 63 | 
 64 | // Allocate a single 8,192 byte `AlignedBytesMut`:
 65 | const LEN: usize = 8_192;
 66 | const ALIGN: usize = 512;
 67 | let mut bytes = AlignedBytesMut::new(LEN, ALIGN);
 68 | 
 69 | // Write into the buffer. (In this toy example, we'll write directly into the buffer.
 70 | // But in "real" code, we'd pass the pointer to the operating system, which in turn
 71 | // would write data into the buffer for us.)
 72 | let ptr = bytes.as_mut_ptr();
 73 | for i in 0..LEN {
 74 |     unsafe { *ptr.offset(i as isize) = i as u8; }
 75 | }
 76 | 
 77 | // Freeze (to get a read-only `AlignedBytes`). We `unwrap` because `freeze`
 78 | // will fail if there's more than one `AlignedBytesMut` referencing our backing buffer.
 79 | let mut bytes = bytes.freeze().unwrap();
 80 | bytes.reset_slice();
 81 | let expected_byte_string: Vec<u8> = (0..LEN).map(|i| i as u8).collect();
 82 | assert_eq!(bytes.as_slice(), expected_byte_string);
 83 | 
 84 | // Slice the buffer into two new buffers:
 85 | let mut buffer_0 = bytes.clone();
 86 | buffer_0.set_slice(0..4_096);
 87 | let mut buffer_1 = bytes.clone();
 88 | buffer_1.set_slice(4_096..8_192);
 89 | assert_eq!(buffer_0.len(), 4_096);
 90 | assert_eq!(buffer_1.len(), 4_096);
 91 | assert_eq!(buffer_0.as_slice(), &expected_byte_string[0..4_096]);
 92 | assert_eq!(buffer_1.as_slice(), &expected_byte_string[4_096..8_192]);
 93 | 
 94 | // Check that the original `bytes` buffer is still valid:
 95 | assert_eq!(bytes.as_slice(), &expected_byte_string);
 96 | 
 97 | // Remove the original `bytes` and check that the two views of the same buffer
 98 | // are still valid:
 99 | drop(bytes);
100 | assert_eq!(buffer_0.as_slice(), &expected_byte_string[0..4_096]);
101 | assert_eq!(buffer_1.as_slice(), &expected_byte_string[4_096..8_192]);
102 | ```
103 | 
104 | **Use-case 2: The user requests a single 8 GiB file.**
105 | 
106 | Linux can't read more than 2 GiB at once[^linux_read]. So we need to read the 8 GiB files in
107 | multiple chunks.
108 | 
109 | LSIO will:
110 | - Allocate a single 8 GiB `AlignedBytesMut`.
111 | - Split this into a new 2 GiB `AlignedBytesMut` and the old `AlignedBytesMut` is reduced to 6 GiB.
112 |   Both of these buffers must have their starts and ends aligned. Then repeat the process to
113 |   get a total of 4 x 2 GiB `AlignedBytesMut`s.
114 | - Issue four `read` operations to the OS (one operation per `AlignedBytesMut`).
115 | - When the first, second, and third `read` ops complete, drop their `AlignedBytesMut`
116 |   (but that won't drop the underlying storage, it just removes its reference).
117 | - When the last `read` op completes, `freeze` the last `AlignedBytesMut` to get an immutable `AlignedBytes`.
118 |   `reset_slice` to get the 8 GB slice requested by the user. Pass this 8 GiB `AlignedBytes` to the user.
119 | 
120 | ```rust
121 | use lsio_aligned_bytes::AlignedBytesMut;
122 | 
123 | // Allocate a single array (for this toy example, we'll just allocate 8 MiB, instead of 8 GiB!)
124 | const MiB: usize = 2_usize.pow(20);
125 | const LEN: usize = 8 * MiB;
126 | const ALIGN: usize = 512;
127 | let mut bytes_3 = AlignedBytesMut::new(LEN, ALIGN);
128 | // `bytes_3` will be the final of four bytes_<n> arrays!
129 | 
130 | // Split into a 2 MiB buffer, and a 6 MiB buffer:
131 | let mut bytes_0 = bytes_3.split_to(2 * MiB).unwrap();
132 | assert_eq!(bytes_0.len(), 2 * MiB);
133 | assert_eq!(bytes_3.len(), 6 * MiB);
134 | 
135 | // Continue splitting:
136 | let mut bytes_1 = bytes_3.split_to(4 * MiB).unwrap();
137 | let mut bytes_2 = bytes_3.split_to(6 * MiB).unwrap();
138 | assert_eq!(bytes_0.len(), 2 * MiB);
139 | assert_eq!(bytes_1.len(), 2 * MiB);
140 | assert_eq!(bytes_2.len(), 2 * MiB);
141 | assert_eq!(bytes_3.len(), 2 * MiB);
142 | 
143 | // Write into the arrays:
144 | // Fill the first 2 MiB with zeros, fill the second 2 MiB with ones, etc.
145 | for i in 0..(2 * MiB) {
146 |     unsafe {
147 |         *bytes_0.as_mut_ptr().offset(i as isize) = 0;
148 |         *bytes_1.as_mut_ptr().offset(i as isize) = 1;
149 |         *bytes_2.as_mut_ptr().offset(i as isize) = 2;
150 |         *bytes_3.as_mut_ptr().offset(i as isize) = 3;
151 |     }
152 | }
153 | 
154 | // Drop three of the four AlignedBytesMuts, in preparation for freezing:
155 | drop(bytes_0);
156 | drop(bytes_1);
157 | drop(bytes_2);
158 | 
159 | // Needs to be `mut` so we can `reset_slice()`. Doesn't actually mutate the buffer!
160 | let mut bytes = bytes_3.freeze().unwrap();
161 | bytes.reset_slice();
162 | 
163 | let expected: Vec<u8> = (0..LEN).map(|i| (i / (2 * MiB)) as u8).collect();
164 | // We use `Iterator::eq` instead of `assert_eq!` to avoid `assert_eq!` printing out
165 | // 16 million numbers if the arrays aren't exactly equal!
166 | assert!(bytes.as_slice().iter().eq(expected.iter()));
167 | 
168 | ```
169 | 
170 | [^o_direct]: For more information on `O_DIRECT`, including the memory alignment requirements,
171 |   see all the mentions of `O_DIRECT` in the [`open(2)`](https://man7.org/linux/man-pages/man2/open.2.html) man page.
172 | [^linux_read]: Actually, the limit isn't exactly 2 GiB. On Linux, `read` will transfer at most
173 |   2,147,479,552 bytes. See the [`read`](https://man7.org/linux/man-pages/man2/read.2.html) man
174 |   page!
175 | 
176 | 


--------------------------------------------------------------------------------
/crates/lsio_aligned_bytes/src/lib.rs:
--------------------------------------------------------------------------------
  1 | #![warn(missing_docs)]
  2 | #![doc = include_str!("../README.md")]
  3 | 
  4 | use anyhow;
  5 | use std::{alloc, ops::Range, slice, sync::Arc};
  6 | 
  7 | /// A mutable aligned buffer.
  8 | #[derive(Debug)]
  9 | pub struct AlignedBytesMut {
 10 |     buf: Arc<InnerBuffer>,
 11 | 
 12 |     /// The slice requested by the user.
 13 |     range: Range<usize>,
 14 | }
 15 | 
 16 | unsafe impl Send for AlignedBytesMut {}
 17 | unsafe impl Sync for AlignedBytesMut {}
 18 | 
 19 | impl AlignedBytesMut {
 20 |     /// Creates a new `AlignedBytesMut`.
 21 |     ///
 22 |     /// Aligns the start and end of the buffer with `align`.
 23 |     /// 'align' must not be zero, and must be a power of two.
 24 |     /// `len` is the length of the underlying buffer, in bytes.
 25 |     pub fn new(len: usize, align: usize) -> Self {
 26 |         let inner_buf = InnerBuffer::new(len, align);
 27 |         Self {
 28 |             buf: Arc::new(inner_buf),
 29 |             range: 0..len,
 30 |         }
 31 |     }
 32 | 
 33 |     /// Returns the length of the `range` requested by the user. The `range` is a view into the
 34 |     /// underlying buffer. The underlying buffer may be larger than `len`.
 35 |     pub fn len(&self) -> usize {
 36 |         self.range.len()
 37 |     }
 38 | 
 39 |     /// Returns a mutable pointer to the underlying buffer offset by `self.range.start`.
 40 |     pub fn as_mut_ptr(&mut self) -> *mut u8 {
 41 |         let ptr = self.buf.as_mut_ptr();
 42 |         unsafe { ptr.offset(self.range.start as isize) }
 43 |     }
 44 | 
 45 |     /// Split this view of the underlying buffer into two views at the given index.
 46 |     ///
 47 |     /// This does not allocate a new buffer. Instead, both `AlignedBytesMut` objects reference
 48 |     /// the same underlying backing buffer.
 49 |     ///
 50 |     /// `idx` indexes into the backing buffer.
 51 |     ///
 52 |     /// `idx` must not be zero. `idx` must be exactly divisible by the alignment of the underlying
 53 |     /// buffer. `idx` must be contained in `self.range`.
 54 |     ///
 55 |     /// Afterwards, `self` contains `[idx, range.end)`. The returned `AlignedBytesMut`
 56 |     /// contains elements `[range.start, idx)`.
 57 |     ///
 58 |     /// To show this graphically:
 59 |     ///
 60 |     /// Before calling `split_to`:
 61 |     ///
 62 |     /// ```text
 63 |     /// Underlying buffer:  0 1 2 3 4 5 6 7 8 9
 64 |     /// self.range       :     [2,          8)
 65 |     /// ```
 66 |     ///
 67 |     /// After calling `split_to(6)`:
 68 |     ///
 69 |     /// ```text
 70 |     /// Underlying buffer:  0 1 2 3 4 5 6 7 8 9
 71 |     /// self.range       :             [6,  8)
 72 |     /// other.range      :     [2,      6)
 73 |     /// ```
 74 |     pub fn split_to(&mut self, idx: usize) -> anyhow::Result<Self> {
 75 |         if !self.range.contains(&idx) {
 76 |             Err(anyhow::format_err!(
 77 |                 "idx {idx} is not contained in this buffer's range {:?}",
 78 |                 self.range,
 79 |             ))
 80 |         } else if idx == 0 {
 81 |             Err(anyhow::format_err!("idx must not be zero!"))
 82 |         } else if idx % self.buf.alignment() != 0 {
 83 |             Err(anyhow::format_err!(
 84 |                 "idx {idx} must be exactly divisible by the alignment {}",
 85 |                 self.buf.alignment()
 86 |             ))
 87 |         } else {
 88 |             let new_range = self.range.start..idx;
 89 |             self.range.start = idx;
 90 |             Ok(AlignedBytesMut {
 91 |                 buf: self.buf.clone(),
 92 |                 range: new_range,
 93 |             })
 94 |         }
 95 |     }
 96 | 
 97 |     /// If this is the only `AlignedBytesMut` with access to the underlying buffer
 98 |     /// then `freeze` consumes `self` and returns a read-only `AlignedBytes`
 99 |     /// (wrapped in `Ok`), which contains a reference to the underlying buffer,
100 |     /// and has its `range` set to byte range of the `AlignedBytesMut`.
101 |     /// If, on the other hand, other `AlignedBytesMut`s have access to
102 |     /// the underlying buffer then `freeze` will return `Err(self)`.
103 |     pub fn freeze(self) -> Result<AlignedBytes, Self> {
104 |         if Arc::strong_count(&self.buf) == 1 {
105 |             Ok(AlignedBytes {
106 |                 buf: self.buf,
107 |                 range: self.range,
108 |             })
109 |         } else {
110 |             Err(self)
111 |         }
112 |     }
113 | }
114 | 
115 | /// Immutable.
116 | #[derive(Debug, Clone)]
117 | pub struct AlignedBytes {
118 |     buf: Arc<InnerBuffer>,
119 | 
120 |     /// The slice requested by the user.
121 |     range: Range<usize>,
122 | }
123 | 
124 | unsafe impl Send for AlignedBytes {}
125 | unsafe impl Sync for AlignedBytes {}
126 | 
127 | /// An immutable view of a memory buffer.
128 | ///
129 | /// The only way to make is an `AlignedBytes` is using [`AlignedBytesMut::freeze`].
130 | impl AlignedBytes {
131 |     /// Sets the slice for `self`.
132 |     ///
133 |     /// The requested `range` indexes into the entire underlying buffer.
134 |     ///
135 |     /// ## Panics
136 |     /// Panics if `range.is_empty()` or if `range.end` > the size of the underlying buffer.
137 |     pub fn set_slice(&mut self, range: Range<usize>) -> &Self {
138 |         assert!(!range.is_empty());
139 |         assert!(range.end <= self.buf.len());
140 |         self.range = range;
141 |         self
142 |     }
143 | 
144 |     /// Resets this `AlignedBytes` range to be equal to the total extent of the underlying buffer.
145 |     pub fn reset_slice(&mut self) -> &Self {
146 |         self.range = 0..self.buf.len();
147 |         self
148 |     }
149 | 
150 |     /// Returns the length of the `range` requested by the user. The `range` is a view into the
151 |     /// underlying buffer. The underlying buffer may be larger than `len`.
152 |     pub fn len(&self) -> usize {
153 |         self.range.len()
154 |     }
155 | 
156 |     /// Returns a constant pointer to `self.range.start` of the underlying buffer.
157 |     pub fn as_ptr(&self) -> *const u8 {
158 |         let ptr = self.buf.as_ptr();
159 |         unsafe { ptr.offset(self.range.start as isize) }
160 |     }
161 | 
162 |     /// Returns an immutable slice of the `range` view of the underlying buffer.
163 |     pub fn as_slice(&self) -> &[u8] {
164 |         unsafe { slice::from_raw_parts(self.as_ptr(), self.len()) }
165 |     }
166 | }
167 | 
168 | #[derive(Debug)]
169 | struct InnerBuffer {
170 |     buf: *mut u8, // TODO: Replace `*mut u8` with `NotNull<u8>`.
171 | 
172 |     /// `layout.size()` gives the number of bytes _actually_ allocated, which will be
173 |     /// a multiple of `align`.
174 |     layout: alloc::Layout,
175 | }
176 | 
177 | impl InnerBuffer {
178 |     fn new(len: usize, align: usize) -> Self {
179 |         assert_ne!(len, 0);
180 |         let layout = alloc::Layout::from_size_align(len, align)
181 |             .expect("failed to create Layout!")
182 |             .pad_to_align();
183 |         let buf = unsafe { alloc::alloc(layout) };
184 |         if buf.is_null() {
185 |             alloc::handle_alloc_error(layout);
186 |         }
187 |         Self { buf, layout }
188 |     }
189 | 
190 |     /// Returns the total size of the underlying buffer.
191 |     const fn len(&self) -> usize {
192 |         self.layout.size()
193 |     }
194 | 
195 |     /// Returns the alignment, in bytes.
196 |     const fn alignment(&self) -> usize {
197 |         self.layout.align()
198 |     }
199 | 
200 |     fn as_mut_ptr(&self) -> *mut u8 {
201 |         self.buf
202 |     }
203 | 
204 |     fn as_ptr(&self) -> *const u8 {
205 |         self.buf
206 |     }
207 | }
208 | 
209 | impl Drop for InnerBuffer {
210 |     fn drop(&mut self) {
211 |         unsafe { alloc::dealloc(self.buf, self.layout) };
212 |     }
213 | }
214 | 
215 | #[cfg(test)]
216 | mod tests {
217 |     use super::*;
218 | 
219 |     #[test]
220 |     fn test_write_and_read() {
221 |         // Create a new buffer:
222 |         const LEN: usize = 16;
223 |         let mut aligned_buf1 = AlignedBytesMut::new(LEN, 8);
224 |         let mut aligned_buf2 = AlignedBytesMut::new(LEN, 8);
225 | 
226 |         // Set the values of the buffer:
227 |         {
228 |             let ptr1 = aligned_buf1.as_mut_ptr();
229 |             let ptr2 = aligned_buf2.as_mut_ptr();
230 |             unsafe {
231 |                 for i in 0..LEN {
232 |                     *ptr1.offset(i as _) = i as u8;
233 |                     *ptr2.offset(i as _) = i as u8;
234 |                 }
235 |             }
236 |         }
237 |         // Read the values back out:
238 |         {
239 |             let slice1 = aligned_buf1.freeze().unwrap();
240 |             let slice2 = aligned_buf2.freeze().unwrap();
241 |             for i in 0..LEN {
242 |                 assert_eq!(slice1.as_slice()[i], i as u8);
243 |                 assert_eq!(slice2.as_slice()[i], i as u8);
244 |             }
245 |             assert_eq!(
246 |                 slice1.as_slice(),
247 |                 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
248 |             );
249 |         }
250 |     }
251 | }
252 | 


--------------------------------------------------------------------------------
/crates/lsio_uring/benches/get.rs:
--------------------------------------------------------------------------------
  1 | use criterion::{criterion_group, criterion_main, Criterion};
  2 | use light_speed_io::object_store_adapter::ObjectStoreAdapter;
  3 | use object_store::{path::Path as ObjectStorePath, ObjectStore};
  4 | use std::{
  5 |     ops::Range,
  6 |     process::Command,
  7 |     time::{Duration, Instant},
  8 | };
  9 | use tokio::runtime::Runtime;
 10 | 
 11 | const FILE_SIZE_BYTES: usize = 262_144;
 12 | const DATA_PATH: &str = "/mnt/t700-2tb/fio/";
 13 | const RANGE: Range<isize> = 0..(1024 * 16);
 14 | 
 15 | async fn uring_get(filenames: &Vec<ObjectStorePath>, n_iterations: u64) -> Duration {
 16 |     let mut total_time = Duration::ZERO;
 17 |     for _ in 0..n_iterations {
 18 |         // Setup (not timed):
 19 |         let store = ObjectStoreAdapter::default();
 20 |         clear_page_cache();
 21 |         let mut futures = Vec::with_capacity(filenames.len());
 22 | 
 23 |         // Timed code:
 24 |         let start_of_iter = Instant::now();
 25 |         for filename in filenames {
 26 |             futures.push(store.get(filename));
 27 |         }
 28 |         for f in futures {
 29 |             let b = f.await.expect("At least one Result was an Error");
 30 |             assert_eq!(b.as_slice().len(), FILE_SIZE_BYTES);
 31 |         }
 32 |         total_time += start_of_iter.elapsed();
 33 |     }
 34 |     total_time
 35 | }
 36 | 
 37 | async fn uring_get_range(filenames: &Vec<ObjectStorePath>, n_iterations: u64) -> Duration {
 38 |     let mut total_time = Duration::ZERO;
 39 |     for _ in 0..n_iterations {
 40 |         // Setup (not timed):
 41 |         let store = ObjectStoreAdapter::default();
 42 |         clear_page_cache();
 43 |         let mut futures = Vec::with_capacity(filenames.len());
 44 | 
 45 |         // Timed code:
 46 |         let start_of_iter = Instant::now();
 47 |         for filename in filenames {
 48 |             futures.push(store.get_range(filename, RANGE));
 49 |         }
 50 |         for f in futures {
 51 |             let b = f.await.expect("At least one Result was an Error");
 52 |             assert_eq!(b.as_slice().len(), RANGE.len());
 53 |         }
 54 |         total_time += start_of_iter.elapsed();
 55 |     }
 56 |     total_time
 57 | }
 58 | 
 59 | async fn local_file_system_get(filenames: &Vec<ObjectStorePath>, n_iterations: u64) -> Duration {
 60 |     // Unfortunately, I can't find a better way to share code between `load_files_with_io_uring_local`
 61 |     // and `load_files_with_local_file_system` because `ObjectStoreAdapter` doesn't yet `impl ObjectStore`.
 62 |     // And `ObjectStoreAdapter::get` and `LocalFileSystem::get` return slightly different types.
 63 |     // TODO: Reduce duplication if/when `ObjectStoreAdapter` implements `ObjectStore`.
 64 | 
 65 |     let mut total_time = Duration::ZERO;
 66 |     for _ in 0..n_iterations {
 67 |         // Setup (not timed):
 68 |         clear_page_cache();
 69 |         let mut handles = Vec::with_capacity(filenames.len());
 70 | 
 71 |         // Timed code:
 72 |         let start_of_iter = Instant::now();
 73 |         for filename in filenames {
 74 |             let filename = filename.clone();
 75 |             handles.push(tokio::spawn(async move {
 76 |                 // We can't create the `store` outside of `spawn` and move it into `spawn`.
 77 |                 // So we have to create the `store` _inside_ this `async` block.
 78 |                 let store = object_store::local::LocalFileSystem::default();
 79 |                 let result = store.get(&filename).await.unwrap();
 80 |                 result.bytes().await
 81 |             }));
 82 |         }
 83 | 
 84 |         for h in handles {
 85 |             let bytes = h.await.unwrap().unwrap();
 86 |             assert_eq!(bytes.len(), FILE_SIZE_BYTES);
 87 |         }
 88 | 
 89 |         total_time += start_of_iter.elapsed();
 90 |     }
 91 |     total_time
 92 | }
 93 | 
 94 | async fn local_file_system_get_range(
 95 |     filenames: &Vec<ObjectStorePath>,
 96 |     n_iterations: u64,
 97 | ) -> Duration {
 98 |     // Unfortunately, I can't find a better way to share code between `load_files_with_io_uring_local`
 99 |     // and `load_files_with_local_file_system` because `ObjectStoreAdapter` doesn't yet `impl ObjectStore`.
100 |     // And `ObjectStoreAdapter::get` and `LocalFileSystem::get` return slightly different types.
101 |     // TODO: Reduce duplication if/when `ObjectStoreAdapter` implements `ObjectStore`.
102 | 
103 |     const RANGE_USIZE: Range<usize> = Range {
104 |         start: RANGE.start as usize,
105 |         end: RANGE.end as usize,
106 |     };
107 | 
108 |     let mut total_time = Duration::ZERO;
109 |     for _ in 0..n_iterations {
110 |         // Setup (not timed):
111 |         clear_page_cache();
112 |         let mut handles = Vec::with_capacity(filenames.len());
113 | 
114 |         // Timed code:
115 |         let start_of_iter = Instant::now();
116 |         for filename in filenames {
117 |             let filename = filename.clone();
118 |             handles.push(tokio::spawn(async move {
119 |                 // We can't create the `store` outside of `spawn` and move it into `spawn`.
120 |                 // So we have to create the `store` _inside_ this `async` block.
121 |                 let store = object_store::local::LocalFileSystem::default();
122 |                 store.get_range(&filename, RANGE_USIZE).await.unwrap()
123 |             }));
124 |         }
125 | 
126 |         for h in handles {
127 |             let bytes = h.await.unwrap();
128 |             assert_eq!(bytes.len(), RANGE.len());
129 |         }
130 | 
131 |         total_time += start_of_iter.elapsed();
132 |     }
133 |     total_time
134 | }
135 | 
136 | fn bench_get(c: &mut Criterion) {
137 |     const N_FILES: usize = 1000;
138 | 
139 |     // Configure group:
140 |     let mut group = c.benchmark_group(format!("get_{N_FILES}_whole_files"));
141 |     group.sample_size(10);
142 |     group.warm_up_time(Duration::from_millis(2000));
143 |     group.throughput(criterion::Throughput::Bytes(
144 |         (FILE_SIZE_BYTES * N_FILES) as u64,
145 |     ));
146 | 
147 |     let filenames = get_filenames(N_FILES);
148 | 
149 |     // Run function:
150 |     group.bench_function("uring_get", |b| {
151 |         // Insert a call to `to_async` to convert the bencher to async mode.
152 |         // The timing loops are the same as with the normal bencher.
153 |         b.to_async(Runtime::new().unwrap())
154 |             .iter_custom(|n_iterations| uring_get(&filenames, n_iterations));
155 |     });
156 | 
157 |     // Run function:
158 |     group.bench_function("local_file_system_get", |b| {
159 |         // Insert a call to `to_async` to convert the bencher to async mode.
160 |         // The timing loops are the same as with the normal bencher.
161 |         b.to_async(Runtime::new().unwrap())
162 |             .iter_custom(|n_iterations| local_file_system_get(&filenames, n_iterations));
163 |     });
164 | 
165 |     group.finish();
166 | }
167 | 
168 | fn bench_get_range(c: &mut Criterion) {
169 |     const N_FILES: usize = 1000;
170 | 
171 |     // Configure group:
172 |     let mut group = c.benchmark_group(format!("get_{}_bytes_from_{N_FILES}_files", RANGE.len()));
173 |     group.sample_size(10);
174 |     group.warm_up_time(Duration::from_millis(2000));
175 |     group.throughput(criterion::Throughput::Bytes((RANGE.len() * N_FILES) as u64));
176 | 
177 |     let filenames = get_filenames(N_FILES);
178 | 
179 |     // Run function:
180 |     group.bench_function("uring_get_range", |b| {
181 |         // Insert a call to `to_async` to convert the bencher to async mode.
182 |         // The timing loops are the same as with the normal bencher.
183 |         b.to_async(Runtime::new().unwrap())
184 |             .iter_custom(|n_iterations| uring_get_range(&filenames, n_iterations));
185 |     });
186 | 
187 |     // Run function:
188 |     group.bench_function("local_file_system_get_range", |b| {
189 |         // Insert a call to `to_async` to convert the bencher to async mode.
190 |         // The timing loops are the same as with the normal bencher.
191 |         b.to_async(Runtime::new().unwrap())
192 |             .iter_custom(|n_iterations| local_file_system_get_range(&filenames, n_iterations));
193 |     });
194 | 
195 |     group.finish();
196 | }
197 | 
198 | criterion_group!(benches, bench_get, bench_get_range);
199 | criterion_main!(benches);
200 | 
201 | fn clear_page_cache() {
202 |     let _ = Command::new("vmtouch")
203 |         .arg("-e")
204 |         .arg(DATA_PATH)
205 |         .output()
206 |         .expect("vmtouch failed to start");
207 | 
208 |     // let _ = Command::new("sudo")
209 |     //     .arg("sysctl")
210 |     //     .arg("-w")
211 |     //     .arg("vm.drop_caches=3")
212 |     //     .output()
213 |     //     .expect("sudo sysctl failed to start");
214 | }
215 | 
216 | fn get_filenames(n: usize) -> Vec<ObjectStorePath> {
217 |     // Create a vector of filenames (files created by `fio`)
218 |     (0..n)
219 |         .map(|i| {
220 |             ObjectStorePath::from(format!(
221 |                 "//{DATA_PATH}sequential_read_1000_files_each_256KiB.0.{i}"
222 |             ))
223 |         })
224 |         .collect()
225 | }
226 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Light Speed IO (LSIO)
  2 | 
  3 | > [!WARNING]
  4 | > I've paused development on LSIO for now because it's clear that, for most users and most datasets in the world of weather forecasting, the data will increasingly be stored on cloud object storage, not local storage. And LSIO was mostly focused on speeding up local storage.
  5 | > In its current state, LSIO is a very minimal proof-of-concept that `io_uring` is faster than `object_store` when reading many small chunks of files from local PCIe 5 SSDs on Linux. There is no Python API yet.
  6 | 
  7 | The ultimate ambition is to enable folks to efficiently load and process large, multi-dimensional datasets as fast as modern CPUs & I/O subsystems will allow.
  8 | 
  9 | For now, this repo is just a place for me to tinker with ideas.
 10 | 
 11 | Under the hood, `light-speed-io` uses [`io_uring`](https://kernel.dk/io_uring.pdf) on Linux for local files, and will use [`object_store`](https://lib.rs/crates/object_store) for all other data I/O.
 12 | 
 13 | My first use-case for light-speed-io is to help to speed up reading [Zarr](https://zarr.dev/). After that, I'm interested in helping to create fast readers for "native" geospatial file formats like GRIB2 and EUMETSAT native files. And, even further than that, I'm interested in efficient & fast _computation_ on [out-of-core](https://en.wikipedia.org/w/index.php?title=Out-of-core), chunked, labelled, multi-dimensional data.
 14 | 
 15 | See [`planned_design.md`](planned_design.md) for more info on the planned design. And please see [this blogpost](https://jack-kelly.com/blog/2023-07-28-speeding-up-zarr) for my motivations for wanting to help speed up Zarr.
 16 | 
 17 | # Benchmarks
 18 | 
 19 | Hardware:
 20 | - CPU: AMD [Epyc 9124](https://en.wikichip.org/wiki/amd/epyc/9124) (16 physical cores, 64 MB L3, 3 GHz base, Zen 4)
 21 | - PCIe 5 SSD for benchmarking (1 TB Crucial T700 with heatsink)
 22 | - 64 GB RAM (4 sticks of Kingston 16 GB DDR5 Pro RDIMM 4800 MT/s)
 23 | 
 24 | Task:
 25 | - Read data from 500 files, each file is 41,214,400 bytes, using a block size of 262,144 bytes.
 26 | 
 27 | Results:
 28 | - `object_store`: 6.045 GiB/s
 29 | - `LSIO` (using 8 worker threads): 11.2 GiB/s
 30 | 
 31 | See [this issue](https://github.com/JackKelly/light-speed-io/issues/50#issuecomment-1992230414) for more details of the benchmarking.
 32 | 
 33 | # Roadmap
 34 | 
 35 | (This will almost certainly change!)
 36 | 
 37 | The list below is in (rough) chronological order. This roadmap is also represented in the [GitHub milestones for this project, when sorted alphabetically](https://github.com/JackKelly/light-speed-io/milestones?direction=asc&sort=title&state=open).
 38 | 
 39 | ### Throw-away prototype
 40 | - [x] Initial prototype where a single crate does the IO and compute
 41 | - [x] `io_uring` prototype
 42 | - [x] `io_uring` prototype using `Rayon` to loop through io_uring completion queue
 43 | - [x] `io_uring` async/await implementation with `object_store`-like API
 44 | - [x] Try mixing `Tokio` with `Rayon`
 45 | - [x] Don't initialise buffers
 46 | - [x] Use aligned buffers and `O_DIRECT`
 47 | - [x] Benchmark against `object_store` using `criterion`
 48 | - [x] Chain open, read, close ops in `io_uring`
 49 | - [x] Build new workstation (with PCIe5 SSD)
 50 | - [x] Try using trait objects vs enum  vs `Box::into_raw` for tracking in-flight operations
 51 | - [x] Try using fixed (registered) file descriptors
 52 | - [x] Try using `Rayon` for the IO threadpool
 53 | - [x] Investigate Rust's `Stream`.
 54 | 
 55 | ### Fresh start. Laying the foundations. New crates:
 56 | - [x] `lsio_aligned_bytes`: Shareable buffer which can be aligned to arbitrary boundaries at runtime
 57 | - [x] `lsio_threadpool`: Work-stealing threadpool (based on `crossbeam-deque`)
 58 | - [x] `lsio_io`: Traits for all LSIO IO backends
 59 | 
 60 | ### MVP IO layer
 61 | - [x] Implement minimal `lsio_uring` IO backend (for loading data from a local SSD) with user-defined number of worker threads
 62 | - [x] [Benchmark `lsio_uring` backend](https://github.com/JackKelly/light-speed-io/milestone/3)
 63 | - [ ] [Implement minimal `lsio_object_store_bridge` IO backend](https://github.com/JackKelly/light-speed-io/milestone/4)
 64 | - [ ] [Compare benchmarks for `lsio_uring` vs `lsio_object_store_bridge`](https://github.com/JackKelly/light-speed-io/milestone/7)
 65 | - [ ] [Improve usability and robustness](https://github.com/JackKelly/light-speed-io/milestone/8)
 66 | - [ ] [Group operations](https://github.com/JackKelly/light-speed-io/milestone/9)
 67 | 
 68 | ### MVP Compute layer
 69 | - [ ] Build a general-purpose work-steeling framework for applying arbitrary functions to chunks of data in parallel. And respect groups.
 70 | - [ ] Wrap a few decompression algorithms
 71 | 
 72 | ### MVP File format layer: Read from Zarr
 73 | - [ ] MVP Zarr library (just for _reading_ data)
 74 | - [ ] Python API for `lsio_zarr`
 75 | - [ ] Benchmark `lsio_zarr` vs `zarr-python v3` (from Python)
 76 | 
 77 | ### Improve the IO layer:
 78 | - [ ] Optimise (merge and split) IO operations
 79 | 
 80 | ### Improve the compute layer
 81 | - [ ] Investigate how xarray can "push down" chunkwise computation to LSIO
 82 | 
 83 | ### MVP End-user applications!
 84 | - [ ] Compute simple stats of a large dataset (to see if we hit our target of processing 1 TB per 5 mins on a laptop!)
 85 | - [ ] Load Zarr into a PyTorch training pipeline
 86 | - [ ] Implement merging multiple datasets on-the-fly (e.g. NWP and satellite).
 87 | 
 88 | ### First release!
 89 | - [ ] Docs; GitHub actions for Python releases; more rigorous automated testing; etc.
 90 | - [ ] Release!
 91 | - [ ] Enable Zarr-Python to use LSIO as a storage and codec pipeline?
 92 | 
 93 | ### Implement writing
 94 | - [ ] Implement writing using `lsio_uring`
 95 | - [ ] Implement writing using `lsio_object_store_bridge`
 96 | - [ ] Implement writing in `lsio_zarr`
 97 | 
 98 | ### Improve IO:
 99 | - [ ] [Speed up reading from cloud storage buckets](https://github.com/JackKelly/light-speed-io/issues/10) (using object_store)
100 | - [ ] Maybe experiment with [using io_uring for reading from cloud storage buckets](https://github.com/JackKelly/light-speed-io/issues/10#issuecomment-2178689758)
101 | - [ ] Re-use IO buffers
102 | - [ ] Register buffers with `io_uring`
103 | - [ ] Python API for LSIO's IO layer (and LSIO's compute layer?)
104 | 
105 | ### Improve the file formats layer: Add GRIB support???
106 | (Although maybe this won't be necessary because [dynamical.org](https://dynamical.org) are converting datasets to Zarr)
107 | - [ ] Implement simple GRIB reader?
108 | - [ ] Convert GRIB to Zarr?
109 | - [ ] Load GRIB into a PyTorch training pipeline?
110 | 
111 | ### Grow the team? (Only if the preceding work has shown promise)
112 | - [ ] Try to raise grant funding?
113 | - [ ] Hire???
114 | 
115 | ### Future work (in no particular order, and no promise any of these will be done!)
116 | - [ ] [Multi-dataset abstraction layer](https://github.com/JackKelly/light-speed-io/issues/142) (under the hood, the same data would be chunked differently for different use-cases. But that complexity would be hidden from users. Users would just interact with a single "logical dataset".)
117 | - [ ] Allow xarray to "push down" all its operations to LSIO
118 | - [ ] xarray-like data structures implemented in Rust? ([notes](https://docs.google.com/document/d/1_T0ay9wXozgqq334E2w1SROdlAM7y6JSgL1rmXJnIO0/edit#heading=h.7ctns22vpab5))
119 | - [ ] Fast indexing operations for xarray ([notes](https://docs.google.com/document/d/1_T0ay9wXozgqq334E2w1SROdlAM7y6JSgL1rmXJnIO0/edit#heading=h.kjphntldyaaw))
120 | - [ ] Support for kerchunk / [VirtualiZarr](https://discourse.pangeo.io/t/pangeo-showcase-virtualizarr-create-virtual-zarr-stores-using-xarray-syntax/4127) / [Zarr Manifest Storage Transformer](https://github.com/zarr-developers/zarr-specs/issues/287)
121 | - [ ] Compute using SIMD / NPUs / GPUs, perhaps using [Bend](https://github.com/JackKelly/light-speed-io/issues/132) / [Mojo](https://github.com/JackKelly/light-speed-io/discussions/12)
122 | - [ ] Support many compression algorithms
123 | - [ ] Automatically tune performance
124 | - [ ] "Smart" scheduling of compute and IO (see [notes](https://docs.google.com/document/d/1_T0ay9wXozgqq334E2w1SROdlAM7y6JSgL1rmXJnIO0/edit#heading=h.bqhd2mq9o42t))
125 | - [ ] Tile-based algorithms for numpy
126 | - [ ] EUMETSAT Native file format
127 | - [ ] NetCDF
128 | - [ ] Warping / spatial reprojection
129 | - [ ] Rechunking Zarr
130 | - [ ] Converting between formats (e.g. convert EUMETSAT `.nat` files to 10-bit per channel bit-packed Zarr). If there's no computation to be done on the data during conversion then do all the copying with `io_uring`: open source file -> read chunks from source -> write to destination -> etc.
131 | - [ ] Write a wiki (or a book) on high-performance multi-dimensional data IO and compute
132 | - [ ] Integrate with Dask to run tasks across many machines
133 | - [ ] Use LSIO as the storage and compute backend for other software packages
134 | 
135 | # Project structure
136 | 
137 | Light Speed IO is organised as a [Cargo workspace](https://doc.rust-lang.org/book/ch14-03-cargo-workspaces.html) with multiple ([small](https://rust-unofficial.github.io/patterns/patterns/structural/small-crates.html)) crates. The crates are organised in a [flat crate structure](https://matklad.github.io/2021/08/22/large-rust-workspaces.html). The flat crate structure is used by projects such as [Ruff](https://github.com/astral-sh/ruff), [Polars](https://github.com/pola-rs/polars), and [rust-analyser](https://github.com/rust-lang/rust-analyzer).
138 | 
139 | LSIO crate names use snake_case, following in the footsteps of the [Rust Book](https://doc.rust-lang.org/book/ch14-03-cargo-workspaces.html) and [Ruff](https://github.com/astral-sh/ruff/tree/main/crates). (The choice of snake_case versus hyphens is, as far as I can tell, entirely arbitrary: [Polars](https://github.com/pola-rs/polars/tree/main/crates) and [rust-analyser](https://github.com/rust-lang/rust-analyzer/tree/master/crates) both use hyphens. I just prefer the look of underscores!)
140 | 


--------------------------------------------------------------------------------
/crates/lsio_threadpool/src/threadpool.rs:
--------------------------------------------------------------------------------
  1 | use std::{
  2 |     sync::{
  3 |         atomic::{AtomicBool, Ordering::Relaxed},
  4 |         mpsc::{self},
  5 |         Arc,
  6 |     },
  7 |     thread::{self, JoinHandle},
  8 | };
  9 | 
 10 | use crossbeam_deque as deque;
 11 | 
 12 | use crate::{
 13 |     park_manager::{ParkManager, ParkManagerCommand},
 14 |     shared_state::SharedState,
 15 |     worker::WorkerThread,
 16 | };
 17 | 
 18 | /// Manages a pool of worker threads. Each worker thread runs a clone of a user-supplied closure.
 19 | #[derive(Debug)]
 20 | pub struct ThreadPool<T>
 21 | where
 22 |     T: Send,
 23 | {
 24 |     worker_thread_handles: Vec<JoinHandle<()>>,
 25 |     park_manager_thread_handle: Option<JoinHandle<()>>,
 26 |     shared: SharedState<T>,
 27 | }
 28 | 
 29 | impl<T> ThreadPool<T>
 30 | where
 31 |     T: Send + 'static,
 32 | {
 33 |     /// Starts a new threadpool with `n_worker_threads` threads and runs a clone of `op` on
 34 |     /// each thread. `op` takes one argument: a [`WorkerThread<T>`] which provides helpful methods
 35 |     /// for the operation. Worker threads will shutdown when the `ThreadPool` goes out of scope.
 36 |     ///
 37 |     /// `new` also starts a separate thread which is responsible for tracking parked threads.
 38 |     ///
 39 |     /// Note that the `'static` lifetime constraint for `OP` basically just means that `op` can't
 40 |     /// capture any non-`'static` references. It's perfectly fine for `op` to capture owned types
 41 |     /// (such as `Vec`), as long as those owned types don't include any non-`'static` references.
 42 |     ///
 43 |     /// Typically, `op` will begin with any necessary setup (e.g. instantiating objects that will
 44 |     /// live for the lifetime of the thread) and will then enter a loop, something like:
 45 |     ///
 46 |     /// ```
 47 |     /// use lsio_threadpool::ThreadPool;
 48 |     /// const N_THREADS: usize = 4;
 49 |     /// let pool = ThreadPool::new(N_THREADS, |worker_thread| {
 50 |     ///     /* Optional: Configure per-thread state. */
 51 |     ///
 52 |     ///     while worker_thread.keep_running() {
 53 |     ///         match worker_thread.find_task() {
 54 |     ///             Some(task) => {
 55 |     ///                 process_task(task);
 56 |     ///                 // Optionally submit new tasks:
 57 |     ///                 // `worker_thread.push(new_task)`
 58 |     ///                 // Tasks might be "stolen" by other threads.
 59 |     ///             },
 60 |     ///
 61 |     ///             // Park the thread. `worker_thread.park` automatically registers this worker
 62 |     ///             // thread with the `ParkManager`. The thread will be automatically unparked if
 63 |     ///             // necessary. As a user of the `lsio_threadpool` library, you don't have to
 64 |     ///             // unpark threads manually!
 65 |     ///             None => worker_thread.park(),
 66 |     ///         }
 67 |     ///     }
 68 |     /// });
 69 |     ///
 70 |     /// fn process_task(task: u8) {
 71 |     ///     /* do something */
 72 |     /// }
 73 |     ///
 74 |     /// // `pool`'s threads will be shut down (by `ThreadPool` setting
 75 |     /// // `worker_thread.keep_running()` to false) when `pool` goes out of scope.
 76 |     /// ```
 77 |     ///
 78 |     pub fn new<OP>(n_worker_threads: usize, op: OP) -> Self
 79 |     where
 80 |         OP: Fn(WorkerThread<T>) + Send + Clone + 'static,
 81 |     {
 82 |         let (chan_to_park_manager, rx_for_park_manager) = mpsc::channel();
 83 |         let shared = SharedState {
 84 |             injector: Arc::new(deque::Injector::new()),
 85 |             keep_running: Arc::new(AtomicBool::new(true)),
 86 |             chan_to_park_manager,
 87 |             at_least_one_thread_is_parked: Arc::new(AtomicBool::new(false)),
 88 |         };
 89 | 
 90 |         // Spawn ParkManager thread:
 91 |         let park_manager_thread_handle = Some(ParkManager::start(
 92 |             rx_for_park_manager,
 93 |             Arc::clone(&shared.at_least_one_thread_is_parked),
 94 |             n_worker_threads,
 95 |         ));
 96 | 
 97 |         // Create work stealing queues:
 98 |         let mut local_queues: Vec<deque::Worker<T>> = (0..n_worker_threads)
 99 |             .map(|_| deque::Worker::new_fifo())
100 |             .collect();
101 |         let stealers: Arc<Vec<deque::Stealer<T>>> = Arc::new(
102 |             local_queues
103 |                 .iter()
104 |                 .map(|local_queue| local_queue.stealer())
105 |                 .collect(),
106 |         );
107 | 
108 |         // Spawn worker threads:
109 |         let worker_thread_handles = (0..n_worker_threads)
110 |             .map(|_| {
111 |                 let work_stealer = WorkerThread::new(
112 |                     shared.clone(),
113 |                     local_queues.pop().unwrap(),
114 |                     Arc::clone(&stealers),
115 |                 );
116 | 
117 |                 let op_clone = op.clone();
118 |                 thread::spawn(move || (op_clone)(work_stealer))
119 |             })
120 |             .collect();
121 | 
122 |         Self {
123 |             worker_thread_handles,
124 |             park_manager_thread_handle,
125 |             shared,
126 |         }
127 |     }
128 | 
129 |     /// Push a task from outside the threadpool into the global
130 |     /// "[injector](crossbeam_deque::Injector)" queue.
131 |     /// This is how users of `ThreadPool` submit tasks to the threadpool.
132 |     ///
133 |     /// `push` will automatically unpark worker threads if necessary.
134 |     pub fn push(&self, task: T) {
135 |         self.shared.injector.push(task);
136 |         self.shared.unpark_at_most_n_threads(1);
137 |     }
138 | }
139 | 
140 | impl<T> Drop for ThreadPool<T>
141 | where
142 |     T: Send,
143 | {
144 |     fn drop(&mut self) {
145 |         // Stop and join the worker threads:
146 |         self.shared.keep_running.store(false, Relaxed);
147 |         for handle in self.worker_thread_handles.drain(..) {
148 |             handle.thread().unpark();
149 |             handle
150 |                 .join()
151 |                 .unwrap_or_else(|e| println!("A worker thread panic: {e:?}"));
152 |         }
153 | 
154 |         // Stop and join the ParkManager:
155 |         self.shared
156 |             .chan_to_park_manager
157 |             .send(ParkManagerCommand::Stop)
158 |             .unwrap();
159 |         self.park_manager_thread_handle
160 |             .take()
161 |             .unwrap()
162 |             .join()
163 |             .unwrap();
164 |     }
165 | }
166 | 
167 | #[cfg(test)]
168 | 
169 | mod tests {
170 |     use std::{
171 |         collections::HashMap,
172 |         sync::{mpsc::TryRecvError, Mutex},
173 |         thread::ThreadId,
174 |         time::Duration,
175 |     };
176 | 
177 |     use super::*;
178 | 
179 |     fn add_one_to_hash(hash: &Arc<Mutex<HashMap<ThreadId, usize>>>) {
180 |         let mut log = hash.lock().unwrap();
181 |         log.entry(thread::current().id())
182 |             .and_modify(|count| *count += 1)
183 |             .or_insert(1);
184 |     }
185 | 
186 |     #[test]
187 |     fn test_threadpool() {
188 |         const N_THREADS: usize = 4;
189 |         const MULTIPLIER: usize = 8;
190 |         const N_TASKS: usize = N_THREADS * MULTIPLIER;
191 | 
192 |         let (output_tx, output_rx) = mpsc::channel::<usize>();
193 | 
194 |         // This HashMap maps from ThreadId to the number of times that thread gets Some(task).
195 |         let n_tasks_per_thread = Arc::new(Mutex::new(HashMap::new()));
196 | 
197 |         // This HashMap maps from ThreadId to the number of times that thread has parked.
198 |         let n_parks_per_thread = Arc::new(Mutex::new(HashMap::new()));
199 | 
200 |         let pool = ThreadPool::new(N_THREADS, {
201 |             let n_tasks_per_thread = Arc::clone(&n_tasks_per_thread);
202 |             let n_parks_per_thread = Arc::clone(&n_parks_per_thread);
203 |             move |worker_thread: WorkerThread<usize>| {
204 |                 while worker_thread.keep_running() {
205 |                     match worker_thread.find_task() {
206 |                         Some(task) => {
207 |                             output_tx.send(task).unwrap();
208 |                             add_one_to_hash(&n_tasks_per_thread);
209 |                             // Give other threads a chance to do work. Without this `sleep`,
210 |                             // one thread tends to to the majority of the work!
211 |                             thread::sleep(Duration::from_micros(1));
212 |                         }
213 |                         None => {
214 |                             add_one_to_hash(&n_parks_per_thread);
215 |                             worker_thread.park();
216 |                         }
217 |                     };
218 |                 }
219 |             }
220 |         });
221 | 
222 |         // Push tasks onto the global injector queue:
223 |         for i in 0..N_TASKS {
224 |             if i % N_THREADS == 0 {
225 |                 // Wait a moment to let the worker threads park, to check they wake up again!
226 |                 // Also wait at the start, to let the worker threads "come up".
227 |                 thread::sleep(Duration::from_millis(10));
228 |             }
229 |             pool.push(i);
230 |         }
231 | 
232 |         // Collect outputs and stop the work when all the outputs arrive:
233 |         let mut outputs: Vec<usize> = output_rx.iter().take(N_TASKS).collect();
234 |         outputs.sort();
235 |         assert!(outputs.into_iter().eq(0..N_TASKS));
236 |         assert!(matches!(
237 |             output_rx.try_recv().unwrap_err(),
238 |             TryRecvError::Empty
239 |         ));
240 |         drop(pool);
241 |         assert!(matches!(
242 |             output_rx.try_recv().unwrap_err(),
243 |             TryRecvError::Disconnected
244 |         ));
245 | 
246 |         // Check the n_tasks_per_thread and n_parks_per_thread statistics:
247 |         let unwrap_and_check_len = |log: Arc<Mutex<HashMap<ThreadId, usize>>>| {
248 |             let log = Mutex::into_inner(Arc::into_inner(log).unwrap()).unwrap();
249 |             assert_eq!(log.len(), N_THREADS);
250 |             log
251 |         };
252 |         let n_tasks_per_thread = unwrap_and_check_len(n_tasks_per_thread);
253 |         let n_parks_per_thread = unwrap_and_check_len(n_parks_per_thread);
254 | 
255 |         const MIN_TASKS_PER_THREAD: usize = 2;
256 |         for (thread_id, n_tasks) in n_tasks_per_thread.iter() {
257 |             assert!(
258 |                 *n_tasks >= MIN_TASKS_PER_THREAD,
259 |                 "{thread_id:?} only did {n_tasks} tasks, which is < the threshold {MIN_TASKS_PER_THREAD} tasks!"
260 |             );
261 |         }
262 |         for (thread_id, n_parks) in n_parks_per_thread.iter() {
263 |             assert!(
264 |                 *n_parks == MULTIPLIER || *n_parks == MULTIPLIER + 1,
265 |                 "{thread_id:?} did not park the correct number of times!"
266 |             );
267 |         }
268 |     }
269 | }
270 | 


--------------------------------------------------------------------------------