├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md └── src ├── context.rs ├── lib.rs └── schedule.rs /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | .vscode -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "schedwalk" 3 | version = "0.1.0" 4 | edition = "2021" 5 | license = "MIT" 6 | description = "Test futures under all possible polling schedules" 7 | repo = "https://github.com/mpdn/schedwalk" 8 | 9 | [dependencies] 10 | async-task = "4.2.0" 11 | 12 | [dev-dependencies] 13 | futures = "0.3.21" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Mike Pedersen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # schedwalk 2 | 3 | Test futures under all possible polling schedules. 4 | 5 | Concurrent systems are hard. It can be very easy to accidentally assume progress happens in some 6 | specific order (i.e. race conditions). For async systems in Rust, that might be an assumption on the 7 | order that futures are polled - an assumption of the polling schedule. 8 | 9 | Most async test runtimes only execute one schedule for your test and will never cover all possible 10 | schedules. `schedwalk` is an async test harness that allows you to reliably test all possible 11 | schedules. 12 | 13 | 14 | ## Example 15 | 16 | Suppose we are developing a web application and want to compute the average response time. 
We might 17 | model that with two tasks like this: 18 | 19 | ``` 20 | # use std::convert::identity as spawn; 21 | # futures::executor::block_on(async { 22 | use futures::{channel::mpsc, join}; 23 | 24 | let (sender, mut receiver) = mpsc::unbounded::(); 25 | 26 | let send_task = spawn(async move { 27 | sender.unbounded_send(23).unwrap(); 28 | sender.unbounded_send(20).unwrap(); 29 | sender.unbounded_send(54).unwrap(); 30 | }); 31 | 32 | let avg_task = spawn(async move { 33 | let mut sum = 0; 34 | let mut count = 0; 35 | while let Some(num) = receiver.try_next().unwrap() { 36 | sum += num; 37 | count += 1; 38 | } 39 | 40 | println!("average is {}", sum / count) 41 | }); 42 | 43 | join!(send_task, avg_task); 44 | # }) 45 | ``` 46 | 47 | But this has a race condition bug. What if `avg_task` executes before `send_task`? Then `count` will 48 | be 0 and we will thus divide by 0! We have implicitly assumed one task executes before the other. 49 | 50 | So how can we create a test that triggers the above race condition? We could try executing under 51 | an async runtime like Tokio, but the problem with this is that it does not actually guarantee that 52 | the failing schedule will be executed. And in fact, at the time of writing, it seems that the single 53 | threaded executor *never* triggers the failure. Using the multithreaded executor *may* trigger the 54 | failure, but there is no guarantee of that. At best, we have created a flaky test. 55 | 56 | Ideally, we want to test such code in a way where we fail deterministically every time in case of 57 | such bugs. 58 | 59 | Enter `schedwalk`: a library for testing futures under all possible schedules. 
Using `schedwalk` we 60 | can create a test like this: 61 | 62 | ```should_panic 63 | use schedwalk::{for_all_schedules, spawn}; 64 | use futures::{channel::mpsc, join}; 65 | 66 | for_all_schedules(|| async { 67 | let (sender, mut receiver) = mpsc::unbounded::(); 68 | 69 | let send_task = spawn(async move { 70 | sender.unbounded_send(23).unwrap(); 71 | sender.unbounded_send(20).unwrap(); 72 | sender.unbounded_send(54).unwrap(); 73 | }); 74 | 75 | let avg_task = spawn(async move { 76 | let mut sum = 0; 77 | let mut count = 0; 78 | while let Some(num) = receiver.try_next().unwrap() { 79 | sum += num; 80 | count += 1; 81 | } 82 | 83 | println!("average is {}", sum / count) 84 | }); 85 | 86 | join!(send_task, avg_task); 87 | }) 88 | ``` 89 | 90 | `schedwalk` will then execute the future under all possible schedules. In this case there are just 91 | two: one where `send_task` executes first and one where `avg_task` executes first. This will 92 | reliably trigger the bug in our tests. 93 | 94 | To make debugging easier, panics and deadlocks will print the polling schedule as a string to 95 | standard error. Setting the environment variable `SCHEDULE` to this will execute only the exact 96 | failing schedule. The above example will print `panic in SCHEDULE=01`. Executing the test again with 97 | `SCHEDULE=01 cargo test example` will then execute only that exact schedule. 98 | 99 | ## Caveats 100 | 101 | There are a few important caveats to `schedwalk`: 102 | - `schedwalk` assumes determinism. Futures must spawn and poll futures in the same order every time. 103 | I.e. there can be no thread-local or global state influencing the order futures are polled and no 104 | external IO can influence the system. 105 | - `schedwalk` will exhaustively walk all possible schedules. In cases of high amounts of 106 | futures that can be polled concurrently this can quickly become intractable. 
-------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | use std::cell::{Cell, RefCell, RefMut}; 2 | 3 | use async_task::Runnable; 4 | 5 | thread_local!( 6 | static CONTEXT: Context = Context { 7 | state: Cell::new(State::Stopped), 8 | runnables: RefCell::new(Vec::new()), 9 | } 10 | ); 11 | 12 | #[derive(Clone, Copy)] 13 | enum State { 14 | Running, 15 | Stopping, 16 | Stopped, 17 | } 18 | 19 | pub struct Context { 20 | state: Cell, 21 | runnables: RefCell>, 22 | } 23 | 24 | impl Context { 25 | pub fn schedule(runnable: Runnable) { 26 | CONTEXT.with(|context| match context.state.get() { 27 | State::Running => context.runnables.borrow_mut().push(runnable), 28 | State::Stopping => (), 29 | State::Stopped => panic!( 30 | "not within a schedwalk context, must be called from within a schedwalk context" 31 | ), 32 | }) 33 | } 34 | 35 | pub fn init(f: impl FnOnce(&Context) -> R) -> R { 36 | CONTEXT.with(|context| { 37 | assert!( 38 | matches!(context.state.get(), State::Stopped), 39 | "already within a schedwalk context, cannot start new context here" 40 | ); 41 | 42 | context.state.set(State::Running); 43 | 44 | struct DropGuard<'a>(&'a Context); 45 | 46 | impl Drop for DropGuard<'_> { 47 | fn drop(&mut self) { 48 | self.0.state.set(State::Stopping); 49 | self.0.runnables.borrow_mut().clear(); 50 | self.0.state.set(State::Stopped); 51 | } 52 | } 53 | 54 | let _drop_guard = DropGuard(context); 55 | 56 | f(context) 57 | }) 58 | } 59 | 60 | pub fn runnables(&self) -> RefMut> { 61 | self.runnables.borrow_mut() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![deny(missing_docs)] 2 | #![doc = include_str!("../README.md")] 3 | 4 | use std::{ 5 | env, 6 | future::Future, 7 | panic::{catch_unwind, 
resume_unwind}, 8 | pin::Pin, 9 | sync::atomic::{AtomicBool, Ordering}, 10 | task, 11 | }; 12 | 13 | use async_task::Task; 14 | use context::Context; 15 | 16 | mod context; 17 | mod schedule; 18 | 19 | const SCHEDULE_ENV: &str = "SCHEDULE"; 20 | 21 | /// A spawned future that can be awaited. 22 | /// 23 | /// This is the equivalent of Tokio's `tokio::task::JoinHandle`. 24 | /// 25 | /// A `JoinHandle` detaches when the handle is dropped. The underlying task will continue to run 26 | /// unless [`JoinHandle::abort`] was called. 27 | pub struct JoinHandle { 28 | task: Option>, 29 | abort: AtomicBool, 30 | } 31 | 32 | /// An error when joining a future via a [`JoinHandle`]. 33 | /// 34 | /// Currently, as panics are not handled by schedwalk, an error can only occur if 35 | /// [`JoinHandle::abort`] is called, but this may change in the future. 36 | pub struct JoinError(); 37 | 38 | impl JoinError { 39 | /// Whether this error is due to cancellation. 40 | pub fn is_cancelled(&self) -> bool { 41 | true 42 | } 43 | } 44 | 45 | impl JoinHandle { 46 | fn new(task: Task) -> Self { 47 | JoinHandle { 48 | task: Some(task), 49 | abort: AtomicBool::new(false), 50 | } 51 | } 52 | } 53 | 54 | impl JoinHandle { 55 | /// Aborts the underlying task. 56 | /// 57 | /// If the task is not complete, this will cause it to complete with a [`JoinError`]. 58 | /// Otherwise, it will not have an effect. 
59 | pub fn abort(&self) { 60 | self.abort.store(true, Ordering::Relaxed) 61 | } 62 | } 63 | 64 | impl Drop for JoinHandle { 65 | fn drop(&mut self) { 66 | if let Some(task) = self.task.take() { 67 | task.detach() 68 | } 69 | } 70 | } 71 | 72 | impl Future for JoinHandle { 73 | type Output = Result; 74 | 75 | #[inline] 76 | fn poll(mut self: Pin<&mut Self>, cx: &mut task::Context) -> task::Poll { 77 | let JoinHandle { task, abort } = &mut *self; 78 | 79 | match task { 80 | Some(task) if task.is_finished() || !*abort.get_mut() => { 81 | Pin::new(task).poll(cx).map(Ok) 82 | } 83 | _ => { 84 | task.take(); 85 | task::Poll::Ready(Err(JoinError())) 86 | } 87 | } 88 | } 89 | } 90 | 91 | /// Spawns a new asynchronous task and returns a [`JoinHandle`] to it. 92 | /// 93 | /// This must be called within a context created by [`for_all_schedules`]. Failure to do so will 94 | /// throw an exception. 95 | pub fn spawn(future: T) -> JoinHandle 96 | where 97 | T: Future + Send + 'static, 98 | T::Output: Send + 'static, 99 | { 100 | JoinHandle::new(spawn_task(future)) 101 | } 102 | 103 | fn spawn_task(future: T) -> Task 104 | where 105 | T: Future + Send + 'static, 106 | T::Output: Send + 'static, 107 | { 108 | let (runnable, task) = async_task::spawn(future, Context::schedule); 109 | runnable.schedule(); 110 | task 111 | } 112 | 113 | /// Executes the given future multiple times, each time under a new polling schedule, eventually 114 | /// executing it under all possible polling schedules. 115 | /// 116 | /// This can be used to deterministically test for what would otherwise be asynchronous race 117 | /// conditions. 118 | /// 119 | /// If a panic occurs when executing a schedule, it will be written to standard error. For ease of 120 | /// debugging, rerunning the test with `SCHEDULE` set to this string will execute that particular 121 | /// failing schedule only. 
122 | /// 123 | /// This assumes *determinism*; the spawned futures and the order they are polled in must not depend 124 | /// on anything external to the function such as network or thread locals. This function will panic 125 | /// in case non-determinism is detected, but it cannot do so reliably in all cases. 126 | #[inline] 127 | pub fn for_all_schedules(mut f: impl FnMut() -> T) 128 | where 129 | T: Future + 'static + Send, 130 | { 131 | fn walk(spawn: &mut dyn FnMut() -> Task<()>) { 132 | match env::var(SCHEDULE_ENV) { 133 | Ok(schedule) => walk_schedule(&schedule, spawn), 134 | Err(env::VarError::NotPresent) => walk_exhaustive(&mut Vec::new(), spawn), 135 | Err(env::VarError::NotUnicode(_)) => { 136 | panic!( 137 | "found a schedule in {}, but it was not valid unicode", 138 | SCHEDULE_ENV 139 | ) 140 | } 141 | } 142 | } 143 | 144 | // Defer to `dyn` as quickly as possible to minimize per-test compilation overhead 145 | walk(&mut || spawn_task(f())) 146 | } 147 | 148 | fn walk_schedule(schedule: &str, spawn: &mut dyn FnMut() -> Task<()>) { 149 | let mut schedule = schedule::Decoder::new(schedule); 150 | Context::init(|context| { 151 | let task = spawn(); 152 | loop { 153 | let runnable = { 154 | let mut runnables = context.runnables(); 155 | let choices = runnables.len(); 156 | 157 | if choices == 0 { 158 | assert!(task.is_finished(), "deadlock"); 159 | break; 160 | } else { 161 | runnables.swap_remove(schedule.read(choices)) 162 | } 163 | }; 164 | 165 | runnable.run(); 166 | } 167 | }) 168 | } 169 | 170 | fn walk_exhaustive(schedule: &mut Vec<(usize, usize)>, spawn: &mut dyn FnMut() -> Task<()>) { 171 | fn advance(schedule: &mut Vec<(usize, usize)>) -> bool { 172 | loop { 173 | if let Some((choice, len)) = schedule.pop() { 174 | let new_choice = choice + 1; 175 | if new_choice < len { 176 | schedule.push((new_choice, len)); 177 | return true; 178 | } 179 | } else { 180 | return false; 181 | } 182 | } 183 | } 184 | 185 | Context::init(|context| 'schedules: 
loop { 186 | let mut step = 0; 187 | let task = spawn(); 188 | 189 | loop { 190 | let runnable = { 191 | let mut runnables = context.runnables(); 192 | let choices = runnables.len(); 193 | 194 | let choice = if step < schedule.len() { 195 | let (choice, existing_choices) = schedule[step]; 196 | 197 | assert_eq!( 198 | choices, 199 | existing_choices, 200 | "nondeterminism: number of pollable futures ({}) did not equal number in previous executions ({})", 201 | choices, 202 | existing_choices, 203 | ); 204 | 205 | choice 206 | } else if choices == 0 { 207 | if task.is_finished() { 208 | if advance(schedule) { 209 | continue 'schedules; 210 | } else { 211 | break 'schedules; 212 | } 213 | } else { 214 | panic!( 215 | "deadlock in {}={}", 216 | SCHEDULE_ENV, 217 | schedule::encode(&schedule) 218 | ); 219 | } 220 | } else { 221 | schedule.push((0, choices)); 222 | 0 223 | }; 224 | 225 | runnables.swap_remove(choice) 226 | }; 227 | 228 | step += 1; 229 | let result = catch_unwind(|| runnable.run()); 230 | 231 | if let Err(panic) = result { 232 | eprintln!("panic in {}={}", SCHEDULE_ENV, schedule::encode(&schedule)); 233 | resume_unwind(panic) 234 | } 235 | } 236 | }) 237 | } 238 | 239 | #[cfg(test)] 240 | mod tests { 241 | use std::{ 242 | any::Any, 243 | fmt::Debug, 244 | panic::{panic_any, AssertUnwindSafe}, 245 | }; 246 | 247 | use futures::{ 248 | channel::{mpsc, oneshot}, 249 | future::{pending, select, Either}, 250 | }; 251 | 252 | use super::*; 253 | 254 | fn assert_panics(f: impl FnOnce() -> T) -> Box 255 | where 256 | T: Debug, 257 | { 258 | catch_unwind(AssertUnwindSafe(f)).expect_err("expected panic") 259 | } 260 | 261 | fn assert_finds_panicking_schedule(mut f: impl FnMut() -> T) -> String 262 | where 263 | T: Future + 'static + Send, 264 | { 265 | let mut schedule = Vec::new(); 266 | 267 | assert_panics(|| walk_exhaustive(&mut schedule, &mut || spawn_task(f()))) 268 | .downcast::() 269 | .expect("expected test panic"); 270 | 271 | let encoded_schedule = 
#[cfg(test)]
mod tests {
    use std::{
        any::Any,
        fmt::Debug,
        panic::{panic_any, AssertUnwindSafe},
    };

    use futures::{
        channel::{mpsc, oneshot},
        future::{pending, select, Either},
    };

    use super::*;

    /// Runs `f`, requires it to panic, and returns the panic payload.
    fn assert_panics<T>(f: impl FnOnce() -> T) -> Box<dyn Any + Send>
    where
        T: Debug,
    {
        catch_unwind(AssertUnwindSafe(f)).expect_err("expected panic")
    }

    /// Checks that the exhaustive walk hits a `PanicMarker` panic and that replaying the
    /// schedule it stopped on reproduces the same panic. Returns that encoded schedule.
    fn assert_finds_panicking_schedule<T>(mut f: impl FnMut() -> T) -> String
    where
        T: Future<Output = ()> + 'static + Send,
    {
        let mut schedule = Vec::new();

        assert_panics(|| walk_exhaustive(&mut schedule, &mut || spawn_task(f())))
            .downcast::<PanicMarker>()
            .expect("expected test panic");

        let encoded_schedule = schedule::encode(&schedule);

        assert_panics(|| walk_schedule(&encoded_schedule, &mut || spawn_task(f())))
            .downcast::<PanicMarker>()
            .expect("expected test panic");

        encoded_schedule
    }

    /// Panic payload that distinguishes the intended test panic from any other panic.
    struct PanicMarker;

    fn panic_target() {
        panic_any(PanicMarker);
    }

    #[test]
    fn basic() {
        assert_finds_panicking_schedule(|| async { panic_target() });
    }

    #[test]
    fn spawn_panic() {
        assert_finds_panicking_schedule(|| async {
            spawn(async { panic_target() });
        });
    }

    #[test]
    fn example() {
        let f = || async {
            // NOTE(review): the channel element type is an assumption; any integer type
            // satisfies this test.
            let (sender, mut receiver) = mpsc::unbounded::<u32>();

            spawn(async move {
                sender.unbounded_send(1).unwrap();
                sender.unbounded_send(3).unwrap();
                sender.unbounded_send(2).unwrap();
            });

            spawn(async move {
                let mut sum = 0;
                let mut count = 0;
                while let Some(num) = receiver.try_next().unwrap() {
                    sum += num;
                    count += 1;
                }

                println!("average is {}", sum / count)
            });
        };

        let mut schedule = Vec::new();
        assert_panics(|| walk_exhaustive(&mut schedule, &mut || spawn_task(f())));
        assert_eq!(schedule::encode(&schedule), "01")
    }

    #[test]
    fn channels() {
        assert_finds_panicking_schedule(|| async {
            let (sender_a, receiver_a) = oneshot::channel();
            let (sender_b, receiver_b) = oneshot::channel();

            spawn(async {
                drop(sender_a.send(()));
            });

            spawn(async {
                drop(sender_b.send(()));
            });

            // Only the schedule in which `sender_b` fires first reaches the panic.
            match select(receiver_a, receiver_b).await {
                Either::Left(_) => (),
                Either::Right(_) => panic_target(),
            }
        });
    }

    #[test]
    fn walk_basic() {
        for_all_schedules(|| async { () });
    }

    #[test]
    fn walk_channels() {
        for_all_schedules(|| async {
            let (sender_a, receiver_a) = oneshot::channel();
            let (sender_b, receiver_b) = oneshot::channel();

            spawn(async {
                sender_a.send(()).unwrap();
            });

            spawn(async {
                sender_b.send(()).unwrap();
            });

            receiver_a.await.unwrap();
            receiver_b.await.unwrap();
        });
    }

    #[test]
    #[should_panic]
    fn walk_deadlock() {
        for_all_schedules(|| pending::<()>())
    }

    #[test]
    #[should_panic]
    fn channel_deadlock() {
        for_all_schedules(|| async {
            let (sender, receiver) = oneshot::channel::<()>();

            // The sender is still alive while we wait, so nothing can ever wake the receiver.
            receiver.await.unwrap();
            drop(sender)
        });
    }
}
/// Radix of the textual schedule encoding: every character is one base-32 digit.
const RADIX: usize = 32;

/// Encodes a list of `(choice, number_of_choices)` steps as a schedule string.
///
/// Each step is written least-significant digit first. The number of digits is determined by
/// `number_of_choices` alone (one digit per division by `RADIX` until it reaches zero), so
/// [`Decoder::read`] can recover the step boundaries when given the same counts.
///
/// # Panics
///
/// Panics if any `choice` is not smaller than its `number_of_choices`.
pub fn encode(choices: &[(usize, usize)]) -> String {
    let mut schedule = String::new();

    for &(mut choice, mut len) in choices {
        assert!(choice < len);

        while len > 0 {
            let digit = char::from_digit((choice % RADIX) as u32, RADIX as u32)
                .expect("a value reduced modulo RADIX is a valid digit");
            schedule.push(digit);
            choice /= RADIX;
            len /= RADIX;
        }
    }

    schedule
}

/// Reads choices back out of a schedule string produced by [`encode`].
pub struct Decoder {
    schedule: std::vec::IntoIter<char>,
}

impl Decoder {
    /// Creates a decoder positioned at the start of `schedule`.
    pub fn new(schedule: &str) -> Decoder {
        Decoder {
            schedule: schedule.chars().collect::<Vec<char>>().into_iter(),
        }
    }

    /// Reads the next choice, given that `len` choices were available at this step.
    ///
    /// Consumes as many digits as [`encode`] writes for the same `len`. Once the schedule is
    /// exhausted, missing digits count as `0`. `read(0)` consumes nothing and returns `0`.
    ///
    /// # Panics
    ///
    /// Panics with `invalid schedule` if a character is not a base-32 digit or if the decoded
    /// choice is not smaller than `len`.
    pub fn read(&mut self, len: usize) -> usize {
        let mut choice: usize = 0;
        let mut offset: usize = 1;
        let mut remaining = len;

        while remaining > 0 {
            // Every position is a full base-32 digit. The range check happens once on the
            // assembled value, because a high-order digit may legitimately exceed the
            // remaining quotient (e.g. choice 32 out of 33 encodes as "01").
            let digit = self.schedule.next().map_or(0, |ch| {
                ch.to_digit(RADIX as u32).expect("invalid schedule") as usize
            });

            choice = offset
                .checked_mul(digit)
                .and_then(|term| choice.checked_add(term))
                .expect("invalid schedule");
            // Only the value computed after the final digit can exceed `usize`, and that
            // value is never used.
            offset = offset.saturating_mul(RADIX);
            remaining /= RADIX;
        }

        assert!(len == 0 || choice < len, "invalid schedule");
        choice
    }
}

#[cfg(test)]
mod tests {
    use super::{encode, Decoder};

    fn encode_decode(choices: &[(usize, usize)]) {
        let mut dec = Decoder::new(&encode(choices));

        for &(choice, len) in choices {
            assert_eq!(choice, dec.read(len))
        }
    }

    #[test]
    fn empty() {
        encode_decode(&[])
    }

    #[test]
    fn one() {
        encode_decode(&[(1, 2)])
    }

    #[test]
    fn two() {
        encode_decode(&[(1, 2), (4, 5)]);
    }

    #[test]
    fn long() {
        encode_decode(&[(1, 2), (4, 5), (0, 1), (10, 11), (2, 3), (0, 9)]);
    }

    #[test]
    fn big() {
        encode_decode(&[(12312312, 1231231234)]);
    }

    #[test]
    fn high_digit_exceeds_remaining_quotient() {
        encode_decode(&[(32, 33), (31, 32), (1023, 1024)]);
    }
}