├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── include └── atomic_queues.hpp └── src ├── atomic_queue_test.cpp ├── mpmc_bench.cpp └── spsc_bench.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .vscode 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | project(AtomicQueues LANGUAGES CXX) 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(CMAKE_CXX_EXTENSIONS OFF) 7 | 8 | add_library(atomic_queues INTERFACE) 9 | target_include_directories(atomic_queues INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) 10 | 11 | Include(FetchContent) 12 | 13 | FetchContent_Declare( 14 | Catch2 15 | GIT_REPOSITORY https://github.com/catchorg/Catch2.git 16 | GIT_TAG v3.4.0 17 | ) 18 | 19 | FetchContent_MakeAvailable(Catch2) 20 | 21 | add_executable(atomic_queue_test src/atomic_queue_test.cpp) 22 | target_link_libraries(atomic_queue_test atomic_queues Catch2::Catch2WithMain) 23 | 24 | add_custom_target(run_tests 25 | COMMAND atomic_queue_test 26 | DEPENDS atomic_queue_test 27 | WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} 28 | ) 29 | 30 | add_executable(spsc_bench src/spsc_bench.cpp) 31 | target_link_libraries(spsc_bench atomic_queues) 32 | target_compile_options(spsc_bench PRIVATE -O3) 33 | 34 | add_executable(mpmc_bench src/mpmc_bench.cpp) 35 | target_link_libraries(mpmc_bench atomic_queues) 36 | target_compile_options(mpmc_bench PRIVATE -O3) 37 | 38 | add_custom_target(run_spsc_bench 39 | COMMAND spsc_bench 40 | DEPENDS spsc_bench 41 | WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} 42 | ) 43 | 44 | add_custom_target(run_mpmc_bench 45 | COMMAND mpmc_bench 1 2 4 8 16 46 | DEPENDS mpmc_bench 47 | WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} 48 | ) 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Joad Nacer 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # atomic_queues 2 | 3 | This repository contains a single header-file, atomic_queues.hpp, which contains bounded MpmcQueue and SpscQueue implementations for C++20. The SPSC queue appears to be the best performing SPSC queue, while the MPMC queue appears to be the best performing implementation for N threads <= N cores (and is by far the best performing at low contention). 4 | 5 | If you require high performance at N threads > N cores, you may want to use a queue such as [moodycamel's ConcurrentQueue](https://github.com/cameron314/concurrentqueue). 
I am working on an original "composite" MPMC queue implementation which will eventually be added to this repo and will hopefully be the top performing queue at high contention. 6 | 7 | Note that performance varies greatly by system and the benchmarks in this README are not realistic, benchmark queues yourself if trying to optimize performance. 8 | 9 | The implementation of the MPMC queue is based on [Dmitry Vyukov's bounded MPMC queue](https://www.1024cores.net/home/lock-free-algorithms/queues/bounded-mpmc-queue), and the implementation of the SPSC queue is based on [Erik Rigtorp's SpscQueue](https://github.com/rigtorp/SPSCQueue). Modifications have been made to these implementations in order to improve performance and configurability. 10 | 11 | # Usage and Configuration 12 | 13 | Both queues' templates take the following parameters: 14 | ```c++ 15 | template <typename T, size_t N = 0, ValidSizeConstraint SizeConstraint = jdz::EnforcePowerOfTwo, BufferType BufType = jdz::UseHeapBuffer> 16 | ``` 17 | 18 | - `typename T`: The type stored in the queue. 19 | - `size_t N = 0`: the capacity of the queue - providing this as a template parameter allows the compiler to optimize the modulo operation, which may greatly improve performance at low contention (see benchmarks), and allows for compile-time checking of the parameter's validity*. 20 | - `ValidSizeConstraint SizeConstraint = jdz::EnforcePowerOfTwo`: Constraining the capacity to power of two sizes allows for the use of a bitwise AND operation instead of a ternary operation for SPSC or a modulo for MPMC, which may improve performance. Options are `jdz::EnforcePowerOfTwo` and `jdz::DoNotEnforcePowerOfTwo`. 21 | - `BufferType BufType = jdz::UseHeapBuffer`: Determines the buffer type - HeapBuffer or StackBuffer. Note that using StackBuffer may cause segfaults if the queue capacity is too large due to stack size limits. Pick an appropriate capacity, or adjust your stack size using ie `ulimit -s` on Linux. Options are `jdz::UseHeapBuffer` and `jdz::UseStackBuffer`. 
22 | 23 | The constructor's buffer_size argument must be provided if N is equal to 0, and follows the same restrictions as N*. Passing in a custom allocator allows for use of the queue with huge pages or in shared memory. 24 | 25 | *Buffer capacity must be greater than 1, and must respect the power of two size constraint. If N is provided, then the buffer_size_ constructor argument must be either 0 or equal to N. 26 | 27 | Both queues implement the following interface: 28 | 29 | ```c++ 30 | Queue(size_t buffer_size = N; 31 | std::allocator> allocator = std::allocator>()); 32 | 33 | template 34 | void emplace(Args &&... args) noexcept; // blocking 35 | 36 | void push(const T &data) noexcept 37 | requires std::is_nothrow_copy_constructible_v; // blocking 38 | 39 | template 40 | void push(P &&data) noexcept 41 | requires std::is_nothrow_constructible_v; // blocking 42 | 43 | template 44 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 45 | requires std::is_nothrow_constructible_v; // non-blocking 46 | 47 | bool try_push(const T &data) noexcept 48 | requires std::is_nothrow_copy_constructible_v; // non-blocking 49 | 50 | template 51 | [[nodiscard]] bool try_push(P &&data) noexcept 52 | requires std::is_nothrow_constructible_v; // non-blocking 53 | 54 | void pop(T &v) noexcept; // blocking; 55 | 56 | [[nodiscard]] bool try_pop(T &v) noexcept; // non-blocking 57 | 58 | [[nodiscard]] size_t size() const noexcept; 59 | 60 | [[nodiscard]] size_t capacity() const noexcept; 61 | 62 | [[nodiscard]] bool empty() const noexcept; 63 | ``` 64 | 65 | The blocking methods perform better than the non-blocking methods when not-preempted. If number of threads is larger than the number of cores and preemption becomes an issue, you may achieve better performance with a retry loop over the `try*` methods, although I would recommend using a different library in this situation. 
66 | 67 | # Benchmarks 68 | Benchmarks are included in the src/ folder of this repository, testing various versions of the queue implementations. These are currently not especially rigorous and will be improved. 69 | 70 | Posted below are the results of benchmarking the implementations found in this repository against other notable implementations. Benchmarks are run using the blocking read/write methods where possible, ie `pop` and `push` for jdz queues. Note that these may perform differently than the non-blocking methods - ie, for vyukov style queues* (such as jdz implementations and rigtorp), these perform better as long as N threads <= N cores but perform worse than the non-blocking methods below that. 71 | 72 | These benchmarks involve submitting small data types (uint64_t) between equal numbers of producers and consumers producing and consuming at max throughput - this is not a realistic benchmark. Contention in real systems is likely lower for same number of threads, and it is important to test your use case yourself. 73 | 74 | I also plan to add benchmarks of the queues used as SPMC/MPSC, which may produce interesting results. 75 | 76 | *Note that the original Vyukov implementation did not contain blocking methods. 77 | 78 | ## SPSC Benchmarks: 79 | These benchmarks measure the ops/ms of one producer transmitting 1 billion uint64_t to one consumer. Benchmarks were run 5 times and averaged. Benchmarks are run with a capacity of 65536, or 65537 for non-power of 2 jdz trials. The queues from this repository are the best performing, followed by drogalis's queue. 80 | 81 | Benchmarked queues are: 82 | 83 | - `jdz-cmp-pow2`: jdz queue: `jdz::SpscQueue` - compile-time power of 2 capacity. 84 | - `jdz-run-pow2`: jdz queue: `jdz::SpscQueue` - runtime power of 2 capacity. 85 | - `jdz-cmp`: jdz queue: `jdz::SpscQueue` - compile-time non-power of 2 capacity. 86 | - `jdz-run`: jdz queue: `jdz::SpscQueue` - runtime non-power of 2 capacity. 
87 | - `dro`: [Andrew Drogalis's SPSC-Queue](https://github.com/drogalis) 88 | - `rigtorp`: [Erik Rigtorp's SpscQueue](https://github.com/rigtorp/SPSCQueue). 89 | - `atomic_queue`: [Maxim Egorushkin's atomic_queue](https://github.com/max0x7ba/atomic_queue) with SPSC=true. 90 | - `deaod`: [deaod's spsc_queue](https://github.com/Deaod/spsc_queue). 91 | - `cml-rwcb`: [moodycamel's BlockingReaderWriterCircularBuffer](https://github.com/cameron314/readerwriterqueue). 92 | - `cml-rwq`: [moodycamel's ReaderWriterQueue](https://github.com/cameron314/readerwriterqueue). 93 | 94 | ### x86_64 - Intel i7-11800H 95 | 96 | ![spscl](https://i.imgur.com/vQdPhrc.png) 97 | 98 | ## MPMC Benchmarks 99 | These benchmarks show the throughput measured when transmitting 100 million uint64_t between equal numbers of producers and consumers. Benchmarks were run 5 times and averaged. Benchmarks are run with a capacity of 8192, or 8193 for non-power of 2 jdz trials. 100 | 101 | We can see clearly that moodycamel's queue is the best by far when N threads > N cores, but performs less well below this. 102 | 103 | Benchmarked queues are: 104 | 105 | - `jdz-cmp-pow2`: jdz queue: `jdz::MpmcQueue` - compile-time power of 2 capacity. 106 | - `jdz-run-pow2`: jdz queue: `jdz::MpmcQueue` - runtime power of 2 capacity. 107 | - `jdz-cmp`: jdz queue: `jdz::MpmcQueue` - compile-time non-power of 2 capacity. 108 | - `jdz-run`: jdz queue: `jdz::MpmcQueue` - runtime non-power of 2 capacity. 109 | - `rigtorp`: [Erik Rigtorp's MpmcQueue](https://github.com/rigtorp/MPMCQueue). 110 | - `atomic_queue`: [Maxim Egorushkin's atomic_queue](https://github.com/max0x7ba/atomic_queue). 111 | - `moodycamel`: [moodycamel's ConcurrentQueue](https://github.com/cameron314/concurrentqueue). 112 | - `es-mpmc`: [Erez Strauss's lockfree_mpmc_queue](https://github.com/erez-strauss). 113 | - `xenium-vyukov`: [Manuel Pöter's vyukov_bounded_queue](https://github.com/mpoeter/xenium/tree/master). 
114 | 115 | ### x86_64 - Intel i7-11800H 116 | ![1p1cl](https://i.imgur.com/2aVkRSG.png) 117 | ![2p2cl](https://i.imgur.com/2jvkYWb.png) 118 | ![4p4cl](https://i.imgur.com/hjwKwZA.png) 119 | ![8p8cl](https://i.imgur.com/0ij0eo8.png) 120 | ![16p16cl](https://i.imgur.com/1ZoUIlb.png) 121 | -------------------------------------------------------------------------------- /include/atomic_queues.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright Joad Nacer 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 5 | associated documentation files (the “Software”), to deal in the Software without restriction, 6 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 7 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all copies or 11 | substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 14 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 15 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 16 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | namespace jdz 28 | { 29 | 30 | // Determines whether capacity should be enforced to be a power of two size, allowing for use of bitwise instead of a modulo/ternary for mpmc/spsc. 
31 | struct EnforcePowerOfTwo {}; 32 | struct DoNotEnforcePowerOfTwo {}; 33 | 34 | // Passing as final template parameter to a queue will determine if using HeapBuffer or StackBuffer (default = HeapBuffer) 35 | // Note that using UseStackBuffer may cause a segfault when attempting to create too large of an array. 36 | // Ensure you pick an appropriate capacity for your stack size limit, or increase this using ie `ulimit -s` on linux 37 | struct UseHeapBuffer; 38 | struct UseStackBuffer; 39 | 40 | namespace details 41 | { 42 | 43 | #if defined(__cpp_lib_hardware_interference_size) 44 | static constexpr size_t cache_line = std::hardware_destructive_interference_size; 45 | #else 46 | static constexpr size_t cache_line = 64; 47 | #endif 48 | 49 | static constexpr bool is_power_of_two(size_t n) { 50 | return (n & (n-1)) == 0; 51 | } 52 | 53 | template 54 | concept IsEnforcePowerOfTwo = std::is_same_v; 55 | 56 | template 57 | concept IsDoNotEnforcePowerOfTwo = std::is_same_v; 58 | 59 | template 60 | concept IsValidSizeConstraint = IsEnforcePowerOfTwo || IsDoNotEnforcePowerOfTwo; 61 | 62 | template 63 | concept AlignedToCacheLine = alignof(T) % cache_line == 0; 64 | 65 | template 66 | concept SizeMultipleOfCacheLine = sizeof(T) % cache_line == 0; 67 | 68 | template 69 | concept FalseSharingSafe = AlignedToCacheLine && SizeMultipleOfCacheLine; 70 | 71 | template 72 | concept OptionalPowerOfTwo = IsValidSizeConstraint 73 | && (IsDoNotEnforcePowerOfTwo || is_power_of_two(N)); 74 | 75 | template 76 | concept ZeroOrGreaterThanOne = N == 0 || N > 1; 77 | 78 | template 79 | concept IsTrivialType = std::is_trivially_copyable_v 80 | && std::is_trivially_destructible_v 81 | && sizeof(T) <= sizeof(uint64_t); 82 | 83 | template 84 | concept BufferType = std::is_same_v || std::is_same_v; 85 | 86 | template 87 | constexpr size_t PlusOneIfNotPowerOfTwo = N == 0 ? 0 : (std::is_same_v ? N : N + 1); 88 | 89 | template 90 | size_t plus_one_if_not_pow2(const size_t n) { 91 | return n == 0 ? 
0 : (std::is_same_v ? n : n + 1); 92 | } 93 | 94 | template 95 | class HeapBuffer 96 | { 97 | private: 98 | T *buffer_; 99 | 100 | const size_t buffer_size_; 101 | const size_t buffer_mask_; 102 | 103 | std::allocator allocator_; 104 | 105 | public: 106 | explicit HeapBuffer(const size_t buffer_size, const std::allocator &allocator) 107 | : buffer_size_(buffer_size), 108 | buffer_mask_(buffer_size_ - 1), 109 | allocator_(allocator) { 110 | buffer_ = allocator_.allocate(buffer_size_ + 1); 111 | } 112 | 113 | ~HeapBuffer() noexcept { 114 | allocator_.deallocate(buffer_, buffer_size_ + 1); 115 | } 116 | 117 | T& operator[](const size_t index) noexcept { 118 | if constexpr (!ApplyModulo) { 119 | return buffer_[index]; 120 | } 121 | else if constexpr (IsEnforcePowerOfTwo) { 122 | return buffer_[index & buffer_mask_]; 123 | } 124 | else if constexpr (N != 0) { 125 | return buffer_[index % N]; 126 | } 127 | else { 128 | return buffer_[index % buffer_size_]; 129 | } 130 | } 131 | 132 | const T& operator[](const size_t index) const noexcept { 133 | if constexpr (!ApplyModulo) { 134 | return buffer_[index]; 135 | } 136 | if constexpr (IsEnforcePowerOfTwo) { 137 | return buffer_[index & buffer_mask_]; 138 | } 139 | else if constexpr (N != 0) { 140 | return buffer_[index % N]; 141 | } 142 | else { 143 | return buffer_[index % buffer_size_]; 144 | } 145 | } 146 | }; 147 | 148 | template 149 | class StackBuffer 150 | { 151 | private: 152 | std::array buffer_; 153 | 154 | public: 155 | StackBuffer(auto, auto) noexcept {} 156 | 157 | ~StackBuffer() noexcept = default; 158 | 159 | T& operator[](const size_t index) noexcept { 160 | if constexpr (ApplyModulo) { 161 | return buffer_[index % N]; 162 | } 163 | else { 164 | return buffer_[index]; 165 | } 166 | } 167 | 168 | const T& operator[](const size_t index) const noexcept { 169 | if constexpr (ApplyModulo) { 170 | return buffer_[index % N]; 171 | } 172 | else { 173 | return buffer_[index]; 174 | } 175 | } 176 | }; 177 | 178 | 
template 179 | struct SeqField; 180 | 181 | template <> 182 | struct SeqField { 183 | alignas(cache_line) std::atomic seq; 184 | 185 | SeqField(size_t i) : seq(i) {} 186 | }; 187 | 188 | template <> 189 | struct SeqField { 190 | SeqField(size_t) {} 191 | }; 192 | 193 | template 194 | struct IsConstructedField; 195 | 196 | template 197 | struct IsConstructedField { 198 | IsConstructedField(bool is_constructed) {} 199 | }; 200 | 201 | template<> 202 | struct IsConstructedField { 203 | bool is_constructed; 204 | 205 | IsConstructedField(bool is_constructed) : is_constructed(is_constructed) {} 206 | }; 207 | 208 | template<> 209 | struct IsConstructedField { 210 | #ifdef __aarch64__ 211 | alignas(cache_line) bool is_constructed; 212 | #else 213 | bool is_constructed; 214 | #endif 215 | 216 | IsConstructedField(bool is_constructed) : is_constructed(is_constructed) {} 217 | }; 218 | 219 | template 220 | using RawData = std::array; 221 | 222 | template 223 | class Cell; 224 | 225 | template 226 | requires IsTrivialType 227 | class Cell : public SeqField { 228 | private: 229 | T val_; 230 | 231 | public: 232 | Cell() : SeqField(0) {} 233 | 234 | Cell(size_t i) : SeqField(i) {} 235 | 236 | void construct(T val) noexcept { 237 | val_ = val; 238 | } 239 | 240 | T read() noexcept { 241 | return val_; 242 | } 243 | 244 | void destroy() {} 245 | }; 246 | 247 | template 248 | class Cell : public SeqField, 249 | public IsConstructedField> 250 | { 251 | private: 252 | static constexpr bool IsTriviallyDestructible = std::is_trivially_destructible_v; 253 | static constexpr bool IsNotTriviallyDestructible = !IsTriviallyDestructible; 254 | 255 | alignas(alignof(T)) RawData data_; 256 | 257 | public: 258 | Cell() : SeqField(0), IsConstructedField(false) {} 259 | 260 | Cell(size_t i) : SeqField(i), IsConstructedField(false) {} 261 | 262 | ~Cell() noexcept 263 | requires IsTriviallyDestructible {} 264 | 265 | ~Cell() noexcept { 266 | if constexpr (IsNotTriviallyDestructible) { 267 | if 
(this->is_constructed) destroy(); 268 | } 269 | } 270 | 271 | template 272 | void construct(Args &&...args) noexcept 273 | requires std::is_nothrow_constructible_v { 274 | new (&data_) T(std::forward(args)...); 275 | 276 | if constexpr (IsNotTriviallyDestructible) { 277 | this->is_constructed = true; 278 | } 279 | } 280 | 281 | void destroy() noexcept { 282 | if constexpr (IsNotTriviallyDestructible) { 283 | reinterpret_cast(&data_)->~T(); 284 | 285 | this->is_constructed = false; 286 | } 287 | } 288 | 289 | T &&read() noexcept { 290 | return reinterpret_cast(data_); 291 | } 292 | }; 293 | 294 | template 295 | concept IsValidQueue = ZeroOrGreaterThanOne 296 | && IsValidSizeConstraint 297 | && OptionalPowerOfTwo; 298 | 299 | template 300 | concept IsValidMpmcQueue = IsValidQueue && FalseSharingSafe>; 301 | 302 | template 303 | concept IsValidSpscQueue = IsValidQueue; 304 | 305 | template < 306 | typename DerivedImpl, 307 | bool UseSeq, 308 | bool ApplyModulo, 309 | typename T, 310 | size_t N = 0, 311 | IsValidSizeConstraint SizeConstraint = EnforcePowerOfTwo, 312 | BufferType BufType = UseHeapBuffer 313 | > 314 | requires IsValidQueue 315 | class BaseQueue 316 | { 317 | private: 318 | static constexpr bool UseStack = std::is_same_v; 319 | 320 | static_assert(!UseStack || (UseStack && N != 0), 321 | "Capacity must be set via comptime-parameter to a non-zero value if using UseStackBuffer"); 322 | 323 | using value_t = Cell; 324 | using heap_buf = HeapBuffer; 325 | using stack_buf = StackBuffer; 326 | 327 | using allocator_t = typename std::allocator; 328 | 329 | using buffer_t = typename std::conditional_t; 330 | 331 | protected: 332 | alignas(cache_line) buffer_t buffer_; 333 | 334 | alignas(cache_line) const size_t buffer_size_; 335 | 336 | public: 337 | explicit BaseQueue(const size_t buffer_size = N, const allocator_t &allocator = allocator_t()) 338 | : buffer_(buffer_size, allocator), buffer_size_(buffer_size) { 339 | 340 | if constexpr (N != 0) { 341 | 
assert(buffer_size == N 342 | && "Do not specify a constructor buffer size different from compile-time buffer size"); 343 | } 344 | else { 345 | if (buffer_size <= 1) { 346 | throw std::invalid_argument("buffer_size should be greater than 1"); 347 | } 348 | if (IsEnforcePowerOfTwo && !details::is_power_of_two(buffer_size)) { 349 | throw std::invalid_argument("buffer_size should be a power of 2"); 350 | } 351 | } 352 | 353 | for (size_t i = 0; i < buffer_size; i++) { 354 | new (&buffer_[i]) value_t(i); 355 | } 356 | 357 | } 358 | 359 | ~BaseQueue() noexcept { 360 | for (size_t i = 0; i < buffer_size_; i++) { 361 | buffer_[i].~Cell(); 362 | } 363 | } 364 | 365 | BaseQueue(const BaseQueue &) = delete; 366 | BaseQueue &operator=(const BaseQueue &) = delete; 367 | 368 | template 369 | void emplace(Args &&... args) noexcept 370 | requires std::is_nothrow_constructible_v { 371 | static_cast(this)->emplace(std::forward(args)...); 372 | } 373 | 374 | void push(const T &data) noexcept 375 | requires std::is_nothrow_copy_constructible_v { 376 | static_cast(this)->emplace(data); 377 | } 378 | 379 | template 380 | void push(P &&data) noexcept 381 | requires std::is_nothrow_constructible_v { 382 | static_cast(this)->emplace(std::forward

(data)); 383 | } 384 | 385 | template 386 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 387 | requires std::is_nothrow_constructible_v { 388 | return static_cast(this)->try_emplace(std::forward(args)...); 389 | } 390 | 391 | [[nodiscard]] bool try_push(const T &data) noexcept 392 | requires std::is_nothrow_copy_constructible_v { 393 | return static_cast(this)->try_emplace(data); 394 | } 395 | 396 | template 397 | [[nodiscard]] bool try_push(P &&data) noexcept 398 | requires std::is_nothrow_constructible_v { 399 | return static_cast(this)->try_emplace(std::forward

(data)); 400 | } 401 | 402 | void pop(T &v) noexcept { 403 | static_cast(this)->pop(v); 404 | } 405 | 406 | [[nodiscard]] bool try_pop(T &v) noexcept { 407 | return static_cast(this)->try_pop(v); 408 | } 409 | 410 | /// Will return a negative value if there are one or more readers waiting on an empty queue 411 | template 412 | [[nodiscard]]typename std::enable_if_t, size_t> 413 | size() const noexcept { 414 | std::ptrdiff_t diff = static_cast(this)->get_enqueue_pos() 415 | - static_cast(this)->get_dequeue_pos(); 416 | 417 | if (diff < 0) diff += buffer_size_; 418 | 419 | return static_cast(diff); 420 | } 421 | 422 | template 423 | [[nodiscard]] typename std::enable_if_t, size_t> 424 | size() const noexcept { 425 | return static_cast(this)->get_enqueue_pos() 426 | - static_cast(this)->get_dequeue_pos(); 427 | } 428 | 429 | [[nodiscard]] size_t capacity() const noexcept { 430 | return buffer_size_; 431 | } 432 | 433 | [[nodiscard]] bool empty() const noexcept { 434 | return size() <= 0; 435 | } 436 | 437 | }; 438 | 439 | static inline bool cas_add_one_relaxed(std::atomic &atomic, size_t& val) noexcept { 440 | return atomic.compare_exchange_weak( 441 | val, val + 1, std::memory_order::relaxed, std::memory_order::relaxed); 442 | } 443 | 444 | } // namespace details 445 | 446 | template < 447 | typename T, 448 | size_t N = 0, 449 | details::IsValidSizeConstraint SizeConstraint = EnforcePowerOfTwo, 450 | details::BufferType BufType = UseHeapBuffer 451 | > 452 | requires details::IsValidMpmcQueue 453 | class MpmcQueue : public details::BaseQueue, true, true, T, N, SizeConstraint, BufType> 454 | { 455 | private: 456 | using base_queue_t = details::BaseQueue, true, true, T, N, SizeConstraint, BufType>; 457 | using allocator_t = typename std::allocator>; 458 | 459 | alignas(details::cache_line) std::atomic enqueue_pos_{0}; 460 | alignas(details::cache_line) std::atomic dequeue_pos_{0}; 461 | 462 | public: 463 | explicit MpmcQueue(const size_t buffer_size = N, const 
allocator_t &allocator = allocator_t()) 464 | : base_queue_t(buffer_size, allocator) {} 465 | 466 | template 467 | void emplace(Args &&... args) noexcept 468 | requires std::is_nothrow_constructible_v { 469 | size_t pos = enqueue_pos_.fetch_add(1, std::memory_order::relaxed); 470 | auto &cell = this->buffer_[pos]; 471 | 472 | while (pos != cell.seq.load(std::memory_order::acquire)); 473 | 474 | cell.construct(std::forward(args)...); 475 | cell.seq.store(pos+1, std::memory_order::release); 476 | } 477 | 478 | template 479 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 480 | requires std::is_nothrow_constructible_v { 481 | while (true) { 482 | size_t pos = enqueue_pos_.load(std::memory_order::relaxed); 483 | auto &cell = this->buffer_[pos]; 484 | 485 | const size_t seq = cell.seq.load(std::memory_order::acquire); 486 | const int64_t diff = seq - pos; 487 | 488 | if (diff == 0 && details::cas_add_one_relaxed(enqueue_pos_, pos)) { 489 | cell.construct(std::forward(args)...); 490 | cell.seq.store(pos+1, std::memory_order::release); 491 | 492 | return true; 493 | } 494 | else if (diff < 0) { 495 | return false; 496 | } 497 | } 498 | } 499 | 500 | void pop(T &v) noexcept { 501 | const size_t pos = dequeue_pos_.fetch_add(1, std::memory_order::relaxed); 502 | auto &cell = this->buffer_[pos]; 503 | 504 | while (pos + 1 != cell.seq.load(std::memory_order::acquire)); 505 | 506 | v = cell.read(); 507 | cell.destroy(); 508 | 509 | cell.seq.store(pos + this->buffer_size_, std::memory_order::release); 510 | } 511 | 512 | [[nodiscard]] bool try_pop(T &v) noexcept { 513 | while (true) { 514 | size_t pos = dequeue_pos_.load(std::memory_order::relaxed); 515 | auto &cell = this->buffer_[pos]; 516 | 517 | const size_t seq = cell.seq.load(std::memory_order::acquire); 518 | const int64_t diff = seq - (pos + 1); 519 | 520 | if (diff == 0 && details::cas_add_one_relaxed(dequeue_pos_, pos)) { 521 | v = cell.read(); 522 | cell.destroy(); 523 | 524 | cell.seq.store(pos + 
this->buffer_size_, std::memory_order::release); 525 | 526 | return true; 527 | } 528 | else if (diff < 0) { 529 | return false; 530 | } 531 | } 532 | } 533 | 534 | [[nodiscard]] size_t get_enqueue_pos() const noexcept { 535 | return enqueue_pos_.load(std::memory_order::relaxed); 536 | } 537 | 538 | [[nodiscard]] size_t get_dequeue_pos() const noexcept { 539 | return dequeue_pos_.load(std::memory_order::relaxed); 540 | } 541 | }; 542 | 543 | template < 544 | typename T, 545 | size_t N = 0, 546 | details::IsValidSizeConstraint SizeConstraint = EnforcePowerOfTwo, 547 | details::BufferType BufType = UseHeapBuffer 548 | > 549 | requires details::IsValidSpscQueue 550 | class SpscQueue : public details::BaseQueue< 551 | SpscQueue, 552 | false, std::is_same_v, 554 | T, 555 | #ifdef __clang__ 556 | N, 557 | #else 558 | details::PlusOneIfNotPowerOfTwo, 559 | #endif 560 | SizeConstraint, BufType 561 | > 562 | { 563 | private: 564 | using base_queue_t = details::BaseQueue< 565 | SpscQueue, 566 | false, std::is_same_v, 568 | T, 569 | #ifdef __clang__ 570 | N, 571 | #else 572 | details::PlusOneIfNotPowerOfTwo, 573 | #endif 574 | SizeConstraint, BufType 575 | >; 576 | 577 | using allocator_t = typename std::allocator>; 578 | 579 | alignas(details::cache_line) std::atomic enqueue_pos_{0}; 580 | size_t cached_enqueue_limit_ = 0; 581 | 582 | alignas(details::cache_line) std::atomic dequeue_pos_{0}; 583 | size_t cached_dequeue_limit_ = 0; 584 | 585 | public: 586 | explicit SpscQueue(const size_t buffer_size = N, const allocator_t &allocator = allocator_t()) 587 | #ifdef __clang__ 588 | : base_queue_t(buffer_size, allocator) {} 589 | #else 590 | : base_queue_t(details::plus_one_if_not_pow2(buffer_size), allocator) {} 591 | #endif 592 | 593 | template 594 | void emplace(Args &&... 
// --- Review notes (documentation only; code below is byte-identical) ---
// This chunk is a flattened repository dump: the "NNN | " prefixes embedded in the
// lines below are line numbers from the dump tool, not program text.
// NOTE(review): the extraction that produced this text stripped the contents of
// every angle-bracket pair. All `template` headers have lost their parameter
// lists, `std::is_nothrow_constructible_v` has lost its arguments (presumably
// `<T, Args...>` -- confirm against the upstream header), `std::forward(args)`
// has lost `<Args>`, and every `std::enable_if_t` has lost its condition.
// Restore these from the original include/atomic_queues.hpp before compiling.
//
// Content of this span: the tail of a bounded queue class (presumably SpscQueue,
// per the README) -- blocking emplace, try_emplace, blocking pop, and the start
// of try_pop. The pattern visible in the code: the thread's own index
// (enqueue_pos_ / dequeue_pos_) is loaded relaxed; the opposite index is re-read
// with acquire only when a cached limit (cached_enqueue_limit_ /
// cached_dequeue_limit_) says the queue looks full (producer) or empty
// (consumer); the updated index is published with a release store. Cells expose
// construct()/read()/destroy(), so element lifetime is managed manually per slot.
args) noexcept 595 | requires std::is_nothrow_constructible_v { 596 | const size_t pos = enqueue_pos_.load(std::memory_order::relaxed); 597 | const size_t pos_tmp = get_pos_tmp(pos); 598 | auto &cell = this->buffer_[pos]; 599 | 600 | while (pos_tmp == cached_enqueue_limit_) { 601 | cached_enqueue_limit_ = dequeue_pos_.load(std::memory_order::acquire) + buffer_size_if_pow2(); 602 | } 603 | 604 | cell.construct(std::forward(args)...); 605 | enqueue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 606 | } 607 | 608 | template 609 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 610 | requires std::is_nothrow_constructible_v { 611 | const size_t pos = enqueue_pos_.load(std::memory_order::relaxed); 612 | const size_t pos_tmp = get_pos_tmp(pos); 613 | auto &cell = this->buffer_[pos]; 614 | 615 | if (pos_tmp == cached_enqueue_limit_) { 616 | cached_enqueue_limit_ = dequeue_pos_.load(std::memory_order::acquire) + buffer_size_if_pow2(); 617 | 618 | if (pos_tmp == cached_enqueue_limit_) return false; 619 | } 620 | 621 | cell.construct(std::forward(args)...); 622 | enqueue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 623 | 624 | return true; 625 | } 626 | 627 | void pop(T &v) noexcept { 628 | const size_t pos = dequeue_pos_.load(std::memory_order::relaxed); 629 | const size_t pos_tmp = get_pos_tmp(pos); 630 | auto &cell = this->buffer_[pos]; 631 | 632 | while (pos == cached_dequeue_limit_) { 633 | cached_dequeue_limit_ = enqueue_pos_.load(std::memory_order::acquire); 634 | } 635 | 636 | v = cell.read(); 637 | cell.destroy(); 638 | 639 | dequeue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 640 | } 641 | 642 | [[nodiscard]] bool try_pop(T &v) noexcept { 643 | const size_t pos = dequeue_pos_.load(std::memory_order::relaxed); 644 | const size_t pos_tmp = get_pos_tmp(pos); 645 | auto &cell = this->buffer_[pos]; 646 | 647 | if (pos == cached_dequeue_limit_) { 648 | cached_dequeue_limit_ =
// try_pop tail: one acquire re-read of the producer index, then read/destroy the
// cell. After that: relaxed getters for both indices, then the wrap/offset
// helpers. Under __clang__ a plain overload set is used; otherwise
// enable_if-selected overloads (their conditions were stripped by extraction)
// choose between: runtime-capacity wrap (`pos + 1 == this->buffer_size_`),
// compile-time-capacity wrap (`pos + 1 == (N+1)` -- note the buffer appears to be
// one slot larger than the requested capacity N), and -- presumably -- a
// power-of-two mode where get_pos_tmp/get_store_pos defer wrapping to masking
// done elsewhere (get_store_pos returns pos or pos + 1 depending on mode).
// TODO confirm the stripped enable_if conditions against the upstream header.
enqueue_pos_.load(std::memory_order::acquire); 649 | 650 | if (pos == cached_dequeue_limit_) return false; 651 | } 652 | 653 | v = cell.read(); 654 | cell.destroy(); 655 | 656 | dequeue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 657 | 658 | return true; 659 | } 660 | 661 | [[nodiscard]] size_t get_enqueue_pos() const noexcept { 662 | return enqueue_pos_.load(std::memory_order::relaxed); 663 | } 664 | 665 | [[nodiscard]] size_t get_dequeue_pos() const noexcept { 666 | return dequeue_pos_.load(std::memory_order::relaxed); 667 | } 668 | 669 | #ifdef __clang__ 670 | 671 | template 672 | [[nodiscard]] typename std::enable_if_t 673 | get_pos_tmp(const size_t pos) const noexcept { 674 | return pos + 1 == this->buffer_size_ ? 0 : pos + 1; 675 | } 676 | 677 | template 678 | [[nodiscard]] typename std::enable_if_t 679 | get_pos_tmp(const size_t pos) const noexcept { 680 | return pos + 1 == (N+1) ? 0 : pos + 1; 681 | } 682 | 683 | [[nodiscard]] size_t get_store_pos(const size_t pos) const noexcept { 684 | return pos; 685 | } 686 | 687 | [[nodiscard]] size_t buffer_size_if_pow2() const noexcept { 688 | return 0; 689 | } 690 | 691 | #else 692 | 693 | template 694 | [[nodiscard]] typename std::enable_if_t && NN == 0, size_t> 695 | get_pos_tmp(const size_t pos) const noexcept { 696 | return pos + 1 == this->buffer_size_ ? 0 : pos + 1; 697 | } 698 | 699 | template 700 | [[nodiscard]] typename std::enable_if_t && NN != 0, size_t> 701 | get_pos_tmp(const size_t pos) const noexcept { 702 | return pos + 1 == (N+1) ?
// Remaining helper overloads, the end of the queue class and of namespace jdz,
// followed by the head of src/atomic_queue_test.cpp: a Catch2 test file
// (#include targets were stripped by extraction; restore Catch2 and standard
// headers). The macros count constructor/destructor calls through the file-local
// globals below. NOTE(review): those counters are plain size_t, not atomic --
// fine here only because the counting (unit) tests run single-threaded; the
// fuzz tests use uint64_t payloads and never touch the counters concurrently.
0 : pos + 1; 703 | } 704 | 705 | template 706 | [[nodiscard]] typename std::enable_if_t, size_t> 707 | get_pos_tmp(const size_t pos) const noexcept { 708 | return pos; 709 | } 710 | 711 | template 712 | [[nodiscard]] typename std::enable_if_t, size_t> 713 | get_store_pos(const size_t pos) const noexcept { 714 | return pos; 715 | } 716 | 717 | template 718 | [[nodiscard]] typename std::enable_if_t, size_t> 719 | get_store_pos(const size_t pos) const noexcept { 720 | return pos + 1; 721 | } 722 | 723 | template 724 | [[nodiscard]] typename std::enable_if_t, size_t> 725 | buffer_size_if_pow2() const noexcept { 726 | return 0; 727 | } 728 | 729 | template 730 | [[nodiscard]] typename std::enable_if_t && NN == 0, size_t> 731 | buffer_size_if_pow2() const noexcept { 732 | return this->buffer_size_; 733 | } 734 | 735 | template 736 | [[nodiscard]] typename std::enable_if_t && NN != 0, size_t> 737 | buffer_size_if_pow2() const noexcept { 738 | return N; 739 | } 740 | 741 | #endif 742 | }; 743 | 744 | 745 | } // namespace jdz -------------------------------------------------------------------------------- /src/atomic_queue_test.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "atomic_queues.hpp" 11 | 12 | using namespace jdz; 13 | 14 | #define RESET_CONSTRUCTORS constructor_count = 0; destructor_count = 0; 15 | 16 | #define REQUIRE_CONSTRUCTORS(CONS, DES) REQUIRE(constructor_count == CONS); REQUIRE(destructor_count == DES); 17 | 18 | #define TEST_CONSTRUCTORS(METHOD, NUM) RESET_CONSTRUCTORS METHOD; REQUIRE_CONSTRUCTORS(NUM, NUM) 19 | 20 | #define FUZZ_ROUNDS 1'000'000 21 | 22 | static size_t constructor_count = 0; 23 | static size_t destructor_count = 0; 24 | 25 | class TestElement { 26 | public: 27 | TestElement() noexcept : value(0) { 28 | constructor_count++; 29 | } 30 | 31 | TestElement(int value)
// --- Review notes (documentation only; code below is byte-identical) ---
// Remainder of TestElement (every ctor bumps constructor_count, the dtor bumps
// destructor_count -- this is how the TEST_CONSTRUCTORS macro verifies the queue
// constructs and destroys exactly the expected number of elements), then the
// unit-test helpers and the fuzz harness. Template parameter lists and template
// arguments below were stripped by the extraction (e.g. the `template` headers
// and the `MpmcQueue`/`SpscQueue` instantiation lists in TEMPLATE_TEST_CASE are
// empty) -- restore from the upstream test file.
// NOTE(review): testDoubleFillAndEmpty compares the signed loop variable `j`
// against the unsigned `capacity` (`j <= capacity`) -- harmless for the tiny
// capacities used here, but it will trigger -Wsign-compare.
// NOTE(review): the file-scope `static TestElement elem;` increments
// constructor_count at static-init time; benign because RESET_CONSTRUCTORS zeroes
// the counters before each measurement.
noexcept: value(value) { 32 | constructor_count++; 33 | } 34 | 35 | TestElement(const TestElement& other) noexcept { 36 | constructor_count++; 37 | value = other.value; 38 | } 39 | 40 | ~TestElement() noexcept { 41 | destructor_count++; 42 | } 43 | 44 | private: 45 | int value; 46 | }; 47 | 48 | static TestElement elem; 49 | 50 | template 51 | void testDoubleFillAndEmpty(size_t capacity) { 52 | T queue(capacity); 53 | 54 | for (int i = 0; i < 2; i++) { 55 | REQUIRE(queue.empty()); 56 | REQUIRE(queue.size() == 0); 57 | 58 | for (int j = 1; j <= capacity; j++) { 59 | queue.emplace(); 60 | REQUIRE(queue.size() == j); 61 | } 62 | 63 | for (int j = capacity - 1; j >= 0; j--) { 64 | queue.pop(elem); 65 | REQUIRE(queue.size() == j); 66 | } 67 | } 68 | } 69 | 70 | template 71 | void testAllMethods(size_t capacity) { 72 | assert(capacity >= 2 && "test requires capacity >= 2"); 73 | 74 | T queue(capacity); 75 | 76 | REQUIRE(queue.empty()); 77 | REQUIRE(queue.size() == 0); 78 | 79 | queue.emplace(); 80 | queue.push(elem); 81 | 82 | REQUIRE(!queue.empty()); 83 | REQUIRE(queue.size() == 2); 84 | 85 | queue.pop(elem); 86 | 87 | REQUIRE(queue.size() == 1); 88 | REQUIRE(queue.try_pop(elem) == true); 89 | REQUIRE(queue.size() == 0); 90 | REQUIRE(queue.try_pop(elem) == false); 91 | 92 | REQUIRE(queue.try_emplace() == true); 93 | REQUIRE(queue.try_push(elem) == true); 94 | REQUIRE(queue.size() == 2); 95 | 96 | for (size_t i = 2; i < capacity; i++) { 97 | queue.emplace(); 98 | } 99 | 100 | REQUIRE(queue.size() == capacity); 101 | REQUIRE(queue.try_emplace() == false); 102 | REQUIRE(queue.try_push(elem) == false); 103 | REQUIRE(queue.size() == capacity); 104 | } 105 | 106 | template 107 | void fuzz_write_worker(T &queue, std::atomic &global_sum, size_t rounds, size_t start_val) { 108 | uint64_t sum = 0; 109 | 110 | for (size_t i = start_val; i < rounds + start_val; i++) { 111 | queue.emplace(i); 112 | sum += i; 113 | } 114 | 115 | global_sum += sum; 116 | } 117 | 118 | template 119 |
// fuzzTest: spawns num_threads writers and num_threads readers over one queue and
// checks the sum of everything written equals the sum of everything read -- a
// lost, duplicated, or corrupted element would break the equality.
// NOTE(review): `rand()` is never seeded, so start values are deterministic
// across runs -- presumably intentional for reproducibility; confirm.
// FUZZ_ROUNDS/num_threads truncates for non-divisor thread counts; the counts
// used here (1, 4, 8) divide 1'000'000 exactly, so writers and readers agree.
void fuzz_read_worker(T &queue, std::atomic &global_sum, size_t rounds) { 120 | uint64_t sum = 0; 121 | 122 | for (size_t i = 0; i < rounds; i++) { 123 | uint64_t val; 124 | 125 | queue.pop(val); 126 | 127 | sum += val; 128 | } 129 | 130 | global_sum += sum; 131 | } 132 | 133 | template 134 | void fuzzTest(size_t num_threads, size_t capacity) { 135 | std::atomic write_sum{0}; 136 | std::atomic read_sum{0}; 137 | 138 | std::vector threads(num_threads * 2); 139 | 140 | T queue(capacity); 141 | 142 | for (size_t i = 0; i < num_threads; i++) { 143 | size_t r = i + num_threads; 144 | 145 | uint64_t rand_start = rand(); 146 | 147 | threads[i] = std::thread(fuzz_write_worker, std::ref(queue), std::ref(write_sum), FUZZ_ROUNDS/num_threads, rand_start); 148 | threads[r] = std::thread(fuzz_read_worker, std::ref(queue), std::ref(read_sum), FUZZ_ROUNDS/num_threads); 149 | } 150 | 151 | for (size_t i = 0; i < num_threads * 2; i++) { 152 | threads[i].join(); 153 | } 154 | 155 | REQUIRE(write_sum == read_sum); 156 | } 157 | 158 | TEMPLATE_TEST_CASE("MpmcQueueTest PowerOfTwo", "[unit][mpmcqueue]", 159 | (MpmcQueue), 160 | (MpmcQueue), 161 | (MpmcQueue), 162 | (MpmcQueue), 163 | (MpmcQueue), 164 | (MpmcQueue)) { 165 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(4), 8); 166 | 167 | TEST_CONSTRUCTORS(testAllMethods(4), 6); 168 | } 169 | 170 | TEMPLATE_TEST_CASE("MpmcQueueTest NonPowerOfTwo", "[unit][mpmcqueue]", 171 | (MpmcQueue), 172 | (MpmcQueue), 173 | (MpmcQueue)) { 174 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(5), 10); 175 | 176 | TEST_CONSTRUCTORS(testAllMethods(5), 7); 177 | } 178 | 179 | TEMPLATE_TEST_CASE("SpscQueueTest PowerOfTwo", "[unit][spscqueue]", 180 | (SpscQueue), 181 | (SpscQueue), 182 | (SpscQueue), 183 | (SpscQueue), 184 | (SpscQueue), 185 | (SpscQueue)) { 186 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(4), 8); 187 | 188 | TEST_CONSTRUCTORS(testAllMethods(4), 6); 189 | } 190 | 191 | TEMPLATE_TEST_CASE("SpscQueueTest NonPowerOfTwo", "[unit][spscqueue]", 192 |
// Fuzz TEMPLATE_TEST_CASEs: power-of-two and non-power-of-two capacities, both
// large (8192/8193) and minimal (2/3), at 1/4/8 producer-consumer pairs for MPMC
// and a single pair for SPSC. The stripped instantiation lists presumably varied
// element type and the compile-time-vs-runtime capacity option -- confirm.
// NOTE(review): "SpscQueueTest PowerOfTwo" / "SpscQueueTest NonPowerOfTwo" (and
// MinSize variants) are used for BOTH the [unit] cases above and the [fuzz]
// cases below; Catch2 requires unique test-case names and will report duplicates
// at startup -- the fuzz cases likely need distinct names (e.g. "SpscFuzz...").
(SpscQueue), 193 | (SpscQueue), 194 | (SpscQueue)) { 195 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(5), 10); 196 | 197 | TEST_CONSTRUCTORS(testAllMethods(5), 7); 198 | } 199 | 200 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest PowerOfTwo", "[fuzz][mpmcqueue]", 201 | (MpmcQueue), 202 | (MpmcQueue), 203 | (MpmcQueue), 204 | (MpmcQueue), 205 | (MpmcQueue), 206 | (MpmcQueue)) { 207 | fuzzTest(1, 8192); 208 | fuzzTest(4, 8192); 209 | fuzzTest(8, 8192); 210 | } 211 | 212 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest PowerOfTwo MinSize", "[fuzz][mpmcqueue]", 213 | (MpmcQueue), 214 | (MpmcQueue), 215 | (MpmcQueue), 216 | (MpmcQueue), 217 | (MpmcQueue), 218 | (MpmcQueue)) { 219 | fuzzTest(1, 2); 220 | fuzzTest(4, 2); 221 | fuzzTest(8, 2); 222 | } 223 | 224 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest NonPowerOfTwo", "[fuzz][mpmcqueue]", 225 | (MpmcQueue), 226 | (MpmcQueue), 227 | (MpmcQueue)) { 228 | fuzzTest(1, 8193); 229 | fuzzTest(4, 8193); 230 | fuzzTest(8, 8193); 231 | } 232 | 233 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest NonPowerOfTwo MinSize", "[fuzz][mpmcqueue]", 234 | (MpmcQueue), 235 | (MpmcQueue), 236 | (MpmcQueue)) { 237 | fuzzTest(1, 3); 238 | fuzzTest(4, 3); 239 | fuzzTest(8, 3); 240 | } 241 | 242 | TEMPLATE_TEST_CASE("SpscQueueTest PowerOfTwo", "[fuzz][spscqueue]", 243 | (SpscQueue), 244 | (SpscQueue), 245 | (SpscQueue), 246 | (SpscQueue), 247 | (SpscQueue), 248 | (SpscQueue)) { 249 | fuzzTest(1, 8192); 250 | } 251 | 252 | TEMPLATE_TEST_CASE("SpscQueueTest PowerOfTwo MinSize", "[fuzz][spscqueue]", 253 | (SpscQueue), 254 | (SpscQueue), 255 | (SpscQueue), 256 | (SpscQueue), 257 | (SpscQueue), 258 | (SpscQueue)) { 259 | fuzzTest(1, 2); 260 | } 261 | 262 | TEMPLATE_TEST_CASE("SpscQueueTest NonPowerOfTwo", "[fuzz][spscqueue]", 263 | (SpscQueue), 264 | (SpscQueue), 265 | (SpscQueue)) { 266 | fuzzTest(1, 8193); 267 | } 268 | 269 | TEMPLATE_TEST_CASE("SpscQueueTest NonPowerOfTwo MinSize", "[fuzz][spscqueue]", 270 | (SpscQueue), 271 | (SpscQueue), 272 | (SpscQueue)) { 273 | fuzzTest(1,
// End of the test file, then the whole of src/mpmc_bench.cpp up to run_bench's
// signature (continued in the next span). The bench pushes/pops uint64_t values
// through four MpmcQueue configurations (compile-time vs runtime capacity,
// power-of-two vs not -- the template argument lists were stripped).
// NOTE(review): bounded_mpmc_queue_bench placement-news std::thread objects onto
// elements of a vector that has already default-constructed them
// (`new (&workers[i]) std::thread(...)`) -- constructing over a live object
// without destroying it first; plain assignment `workers[i] = std::thread(...)`
// is the correct form (the fuzzTest above does exactly that).
// NOTE(review): timing uses system_clock; steady_clock is the appropriate
// monotonic clock for benchmarking.
3); 274 | } -------------------------------------------------------------------------------- /src/mpmc_bench.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "atomic_queues.hpp" 10 | 11 | constexpr uint64_t total_rounds = 100'000'000; 12 | constexpr size_t capacity = 65536; 13 | 14 | using jdz_queue_cmp_pow2 = typename jdz::MpmcQueue; 15 | using jdz_queue_run_pow2 = typename jdz::MpmcQueue; 16 | using jdz_queue_cmp = typename jdz::MpmcQueue; 17 | using jdz_queue_run = typename jdz::MpmcQueue; 18 | 19 | struct QueueType { 20 | std::string name; 21 | std::function benchmark; 22 | std::size_t capacity; 23 | }; 24 | 25 | template 26 | void mpmc_read_worker(T &queue, uint64_t rounds) { 27 | uint64_t round = 0; 28 | uint64_t val = 0; 29 | 30 | while (round++ < rounds) { 31 | queue.pop(val); 32 | } 33 | } 34 | 35 | template 36 | void mpmc_write_worker(T &queue, uint64_t rounds) { 37 | uint64_t round = 0; 38 | 39 | while (round++ < rounds) { 40 | queue.push(round); 41 | } 42 | } 43 | 44 | template 45 | auto bounded_mpmc_queue_bench(int num_threads, size_t size) { 46 | T queue(size); 47 | 48 | std::vector workers(num_threads * 2); 49 | 50 | const auto begin_time = std::chrono::system_clock::now(); 51 | 52 | for (int i = 0; i < num_threads; i++) { 53 | new (&workers[i]) std::thread([&queue, num_threads]() { 54 | mpmc_write_worker(queue, total_rounds / num_threads); 55 | }); 56 | } 57 | 58 | for (int i = 0; i < num_threads; i++) { 59 | new (&workers[num_threads + i]) std::thread([&queue, num_threads]() { 60 | mpmc_read_worker(queue, total_rounds / num_threads); 61 | }); 62 | } 63 | 64 | for (int i = 0; i < num_threads * 2; i++) { 65 | workers[i].join(); 66 | } 67 | 68 | const auto end_time = std::chrono::system_clock::now(); 69 | 70 | return (end_time - begin_time).count(); 71 | } 72 | 73 | void run_bench(int num_threads, const
// --- Review notes (documentation only; code below is byte-identical) ---
// Tail of src/mpmc_bench.cpp: run_bench iterates the registered queue
// configurations, runs each benchmark, and prints an aligned name/time table;
// main() treats each CLI argument as a producer+consumer thread count.
// NOTE(review): `benchmark(...)` returns raw clock ticks and the result is
// divided by 1000 then labelled "us" -- that is only microseconds if
// system_clock's period is nanoseconds (true on common libstdc++/libc++ builds,
// but not guaranteed by the standard); casting via
// std::chrono::duration_cast<std::chrono::microseconds> would be exact.
// The element types of the std::vector declarations and std::function signatures
// were stripped by extraction (presumably QueueType and
// int64_t(int, size_t)-like signatures -- confirm upstream).
std::vector& queue_types) { 74 | fprintf(stdout, "=== Num Producers=%d - Num Consumers=%d ===\n", num_threads, num_threads); 75 | 76 | const int time_width = 15; 77 | const int label_width = 15; 78 | 79 | for (const auto& queue_type : queue_types) { 80 | auto time_us = queue_type.benchmark(num_threads, queue_type.capacity) / 1000; 81 | std::cout << std::setw(label_width) << std::left << queue_type.name 82 | << std::setw(time_width) << std::right << time_us << "us | rounds = " << total_rounds << std::endl; 83 | } 84 | } 85 | 86 | int main(int argc, char *argv[]) { 87 | std::vector queue_types = { 88 | {"jdz-cmp-pow2", bounded_mpmc_queue_bench, capacity}, 89 | {"jdz-run-pow2", bounded_mpmc_queue_bench, capacity}, 90 | {"jdz-cmp", bounded_mpmc_queue_bench, capacity + 1}, 91 | {"jdz-run", bounded_mpmc_queue_bench, capacity + 1} 92 | }; 93 | 94 | for (int i = 1; i < argc; i++) { 95 | run_bench(std::atoi(argv[i]), queue_types); 96 | } 97 | 98 | return 0; 99 | } -------------------------------------------------------------------------------- /src/spsc_bench.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "atomic_queues.hpp" 10 | 11 | constexpr uint64_t total_rounds = 100'000'000; 12 | constexpr size_t capacity = 65536; 13 | 14 | using jdz_queue_cmp_pow2 = typename jdz::SpscQueue; 15 | using jdz_queue_run_pow2 = typename jdz::SpscQueue; 16 | using jdz_queue_cmp = typename jdz::SpscQueue; 17 | using jdz_queue_run = typename jdz::SpscQueue; 18 | 19 | using jdz_mpmc_queue_cmp_pow2 = typename jdz::MpmcQueue; 20 | using jdz_mpmc_queue_run_pow2 = typename jdz::MpmcQueue; 21 | using jdz_mpmc_queue_cmp = typename jdz::MpmcQueue; 22 | using jdz_mpmc_queue_run = typename jdz::MpmcQueue; 23 | 24 | struct QueueType { 25 | std::string name; 26 | std::function benchmark; 27 | std::size_t capacity; 28 | }; 29 | 30 | template 31 | void
// Body of src/spsc_bench.cpp: one writer thread and one reader thread exchange
// total_rounds uint64_t values; both SpscQueue and MpmcQueue configurations are
// measured so the two can be compared under single-producer/single-consumer load.
// NOTE(review): same two issues as the MPMC bench -- placement-new over live
// std::thread elements of an already-constructed vector (prefer assignment), and
// system_clock ticks divided by 1000 printed as "us" (prefer steady_clock +
// duration_cast). Template argument lists in the using-declarations and
// std::vector/std::function declarations were stripped by extraction.
spsc_read_worker(T &queue, uint64_t rounds) { 32 | uint64_t round = 0; 33 | uint64_t val = 0; 34 | 35 | while (round++ < rounds) { 36 | queue.pop(val); 37 | } 38 | } 39 | 40 | template 41 | void spsc_write_worker(T &queue, uint64_t rounds) { 42 | uint64_t round = 0; 43 | 44 | while (round++ < rounds) { 45 | queue.push(round); 46 | } 47 | } 48 | 49 | template 50 | auto bounded_spsc_queue_bench(size_t size) { 51 | T queue(size); 52 | 53 | std::vector workers(2); 54 | 55 | const auto begin_time = std::chrono::system_clock::now(); 56 | 57 | new (&workers[0]) std::thread(spsc_write_worker, std::ref(queue), total_rounds); 58 | new (&workers[1]) std::thread(spsc_read_worker, std::ref(queue), total_rounds); 59 | 60 | workers[0].join(); 61 | workers[1].join(); 62 | 63 | const auto end_time = std::chrono::system_clock::now(); 64 | 65 | return (end_time - begin_time).count(); 66 | } 67 | 68 | int main() { 69 | fprintf(stdout, "=== Num Producers=1 - Num Consumers=1 ===\n"); 70 | 71 | const int time_width = 15; 72 | const int label_width = 20; 73 | 74 | std::vector queue_types = { 75 | {"jdz-cmp-pow2", bounded_spsc_queue_bench, capacity}, 76 | {"jdz-run-pow2", bounded_spsc_queue_bench, capacity}, 77 | {"jdz-cmp", bounded_spsc_queue_bench, capacity + 1}, 78 | {"jdz-run", bounded_spsc_queue_bench, capacity + 1}, 79 | {"jdz-mpmc-cmp-pow2", bounded_spsc_queue_bench, capacity}, 80 | {"jdz-mpmc-run-pow2", bounded_spsc_queue_bench, capacity}, 81 | {"jdz-mpmc-cmp", bounded_spsc_queue_bench, capacity + 1}, 82 | {"jdz-mpmc-run", bounded_spsc_queue_bench, capacity + 1} 83 | }; 84 | 85 | for (const auto& queue_type : queue_types) { 86 | auto time_us = queue_type.benchmark(queue_type.capacity) / 1000; 87 | std::cout << std::setw(label_width) << std::left << queue_type.name 88 | << std::setw(time_width) << std::right << time_us << "us | rounds = " << total_rounds << std::endl; 89 | } 90 | } 91 | 92 | --------------------------------------------------------------------------------