├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── include └── atomic_queues.hpp └── src ├── atomic_queue_test.cpp ├── mpmc_bench.cpp └── spsc_bench.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .vscode 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | project(AtomicQueues LANGUAGES CXX) 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(CMAKE_CXX_EXTENSIONS OFF) 7 | 8 | add_library(atomic_queues INTERFACE) 9 | target_include_directories(atomic_queues INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) 10 | 11 | Include(FetchContent) 12 | 13 | FetchContent_Declare( 14 | Catch2 15 | GIT_REPOSITORY https://github.com/catchorg/Catch2.git 16 | GIT_TAG v3.4.0 17 | ) 18 | 19 | FetchContent_MakeAvailable(Catch2) 20 | 21 | add_executable(atomic_queue_test src/atomic_queue_test.cpp) 22 | target_link_libraries(atomic_queue_test atomic_queues Catch2::Catch2WithMain) 23 | 24 | add_custom_target(run_tests 25 | COMMAND atomic_queue_test 26 | DEPENDS atomic_queue_test 27 | WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} 28 | ) 29 | 30 | add_executable(spsc_bench src/spsc_bench.cpp) 31 | target_link_libraries(spsc_bench atomic_queues) 32 | target_compile_options(spsc_bench PRIVATE -O3) 33 | 34 | add_executable(mpmc_bench src/mpmc_bench.cpp) 35 | target_link_libraries(mpmc_bench atomic_queues) 36 | target_compile_options(mpmc_bench PRIVATE -O3) 37 | 38 | add_custom_target(run_spsc_bench 39 | COMMAND spsc_bench 40 | DEPENDS spsc_bench 41 | WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} 42 | ) 43 | 44 | add_custom_target(run_mpmc_bench 45 | COMMAND mpmc_bench 1 2 4 8 16 46 | DEPENDS mpmc_bench 47 | WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} 48 | ) 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Joad Nacer 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # atomic_queues 2 | 3 | This repository contains a single header-file, atomic_queues.hpp, which contains bounded MpmcQueue and SpscQueue implementations for C++20. The SPSC queue appears to be the best performing SPSC queue, while the MPMC queue appears to be the best performing implementation for N threads <= N cores (and is by far the best performing at low contention). 4 | 5 | If you require high performance at N threads > N cores, you may want to use a queue such as [moodycamel's ConcurrentQueue](https://github.com/cameron314/concurrentqueue). 
I am working on an original "composite" MPMC queue implementation which will eventually be added to this repo and will hopefully be the top performing queue at high contention. 6 | 7 | Note that performance varies greatly by system and the benchmarks in this README are not realistic, benchmark queues yourself if trying to optimize performance. 8 | 9 | The implementation of the MPMC queue is based on [Dmitry Vyukov's bounded MPMC queue](https://www.1024cores.net/home/lock-free-algorithms/queues/bounded-mpmc-queue), and the implementation of the SPSC queue is based on [Erik Rigtorp's SpscQueue](https://github.com/rigtorp/SPSCQueue). Modifications have been made to these implementations in order to improve performance and configurability. 10 | 11 | # Usage and Configuration 12 | 13 | Both queues' templates take the following parameters: 14 | ```c++ 15 | template <typename T, size_t N = 0, ValidSizeConstraint SizeConstraint = jdz::EnforcePowerOfTwo, BufferType BufType = jdz::UseHeapBuffer> 16 | ``` 17 | 18 | - `typename T`: The type stored in the queue. 19 | - `size_t N = 0`: the capacity of the queue - providing this as a template parameter allows the compiler to optimize the modulo operation, which may greatly improve performance at low contention (see benchmarks), and allows for compile-time checking of the parameter's validity*. 20 | - `ValidSizeConstraint SizeConstraint = jdz::EnforcePowerOfTwo`: Constraining the capacity to power of two sizes allows for the use of a bitwise AND operation instead of a ternary operation for SPSC or a modulo for MPMC, which may improve performance. Options are `jdz::EnforcePowerOfTwo` and `jdz::DoNotEnforcePowerOfTwo`. 21 | - `BufferType BufType = jdz::UseHeapBuffer`: Determines the buffer type - HeapBuffer or StackBuffer. Note that using StackBuffer may cause segfaults if the queue capacity is too large due to stack size limits. Pick an appropriate capacity, or adjust your stack size using ie `ulimit -s` on Linux. Options are `jdz::UseHeapBuffer` and `jdz::UseStackBuffer`. 
22 | 23 | The constructor's buffer_size argument must be provided if N is equal to 0, and follows the same restrictions as N*. Passing in a custom allocator allows for use of the queue with huge pages or in shared memory. 24 | 25 | *Buffer capacity must be greater than 1, and must respect the power of two size constraint. If N is provided, then the buffer_size_ constructor argument must be either 0 or equal to N. 26 | 27 | Both queues implement the following interface: 28 | 29 | ```c++ 30 | Queue(size_t buffer_size = N; 31 | std::allocator> allocator = std::allocator>()); 32 | 33 | template 34 | void emplace(Args &&... args) noexcept; // blocking 35 | 36 | void push(const T &data) noexcept 37 | requires std::is_nothrow_copy_constructible_v; // blocking 38 | 39 | template 40 | void push(P &&data) noexcept 41 | requires std::is_nothrow_constructible_v; // blocking 42 | 43 | template 44 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 45 | requires std::is_nothrow_constructible_v; // non-blocking 46 | 47 | bool try_push(const T &data) noexcept 48 | requires std::is_nothrow_copy_constructible_v; // non-blocking 49 | 50 | template 51 | [[nodiscard]] bool try_push(P &&data) noexcept 52 | requires std::is_nothrow_constructible_v; // non-blocking 53 | 54 | void pop(T &v) noexcept; // blocking; 55 | 56 | [[nodiscard]] bool try_pop(T &v) noexcept; // non-blocking 57 | 58 | [[nodiscard]] size_t size() const noexcept; 59 | 60 | [[nodiscard]] size_t capacity() const noexcept; 61 | 62 | [[nodiscard]] bool empty() const noexcept; 63 | ``` 64 | 65 | The blocking methods perform better than the non-blocking methods when not-preempted. If number of threads is larger than the number of cores and preemption becomes an issue, you may achieve better performance with a retry loop over the `try*` methods, although I would recommend using a different library in this situation. 
66 | 67 | # Benchmarks 68 | Benchmarks are included in the src/ folder of this repository, testing various versions of the queue implementations. These are currently not especially rigorous and will be improved. 69 | 70 | Posted below are the results of benchmarking the implementations found in this repository against other notable implementations. Benchmarks are run using the blocking read/write methods where possible, ie `pop` and `push` for jdz queues. Note that these may perform differently than the non-blocking methods - ie, for vyukov style queues* (such as jdz implementations and rigtorp), these perform better as long as N threads <= N cores but perform worse than the non-blocking methods below that. 71 | 72 | These benchmarks involve submitting small data types (uint64_t) between equal numbers of producers and consumers producing and consuming at max throughput - this is not a realistic benchmark. Contention in real systems is likely lower for same number of threads, and it is important to test your use case yourself. 73 | 74 | I also plan to add benchmarks of the queues used as SPMC/MPSC, which may produce interesting results. 75 | 76 | *Note that the original Vyukov implementation did not contain blocking methods. 77 | 78 | ## SPSC Benchmarks: 79 | These benchmarks measure the ops/ms of one producer transmitting 1 billion uint64_t to one consumer. Benchmarks were run 5 times and averaged. Benchmarks are run with a capacity of 65536, or 65537 for non-power of 2 jdz trials. The queues from this repository are the best performing, followed by drogalis's queue. 80 | 81 | Benchmarked queues are: 82 | 83 | - `jdz-cmp-pow2`: jdz queue: `jdz::SpscQueue` - compile-time power of 2 capacity. 84 | - `jdz-run-pow2`: jdz queue: `jdz::SpscQueue` - runtime power of 2 capacity. 85 | - `jdz-cmp`: jdz queue: `jdz::SpscQueue` - compile-time non-power of 2 capacity. 86 | - `jdz-run`: jdz queue: `jdz::SpscQueue` - runtime non-power of 2 capacity. 
87 | - `dro`: [Andrew Drogalis's SPSC-Queue](https://github.com/drogalis) 88 | - `rigtorp`: [Erik Rigtorp's SpscQueue](https://github.com/rigtorp/SPSCQueue). 89 | - `atomic_queue`: [Maxim Egorushkin's atomic_queue](https://github.com/max0x7ba/atomic_queue) with SPSC=true. 90 | - `deaod`: [deaod's spsc_queue](https://github.com/Deaod/spsc_queue). 91 | - `cml-rwcb`: [moodycamel's BlockingReaderWriterCircularBuffer](https://github.com/cameron314/readerwriterqueue). 92 | - `cml-rwq`: [moodycamel's ReaderWriterQueue](https://github.com/cameron314/readerwriterqueue). 93 | 94 | ### x86_64 - Intel i7-11800H 95 | 96 | ![spscl](https://i.imgur.com/vQdPhrc.png) 97 | 98 | ## MPMC Benchmarks 99 | These benchmarks show the throughput measured when transmitting 100 million uint64_t between equal numbers of producers and consumers. Benchmarks were run 5 times and averaged. Benchmarks are run with a capacity of 8192, or 8193 for non-power of 2 jdz trials. 100 | 101 | We can see clearly that moodycamel's queue is the best by far when N threads > N cores, but performs less well below this. 102 | 103 | Benchmarked queues are: 104 | 105 | - `jdz-cmp-pow2`: jdz queue: `jdz::MpmcQueue` - compile-time power of 2 capacity. 106 | - `jdz-run-pow2`: jdz queue: `jdz::MpmcQueue` - runtime power of 2 capacity. 107 | - `jdz-cmp`: jdz queue: `jdz::MpmcQueue` - compile-time non-power of 2 capacity. 108 | - `jdz-run`: jdz queue: `jdz::MpmcQueue` - runtime non-power of 2 capacity. 109 | - `rigtorp`: [Erik Rigtorp's MpmcQueue](https://github.com/rigtorp/MPMCQueue). 110 | - `atomic_queue`: [Maxim Egorushkin's atomic_queue](https://github.com/max0x7ba/atomic_queue). 111 | - `moodycamel`: [moodycamel's ConcurrentQueue](https://github.com/cameron314/concurrentqueue). 112 | - `es-mpmc`: [Erez Strauss's lockfree_mpmc_queue](https://github.com/erez-strauss). 113 | - `xenium-vyukov`: [Manuel Pöter's vyukov_bounded_queue](https://github.com/mpoeter/xenium/tree/master). 
114 | 115 | ### x86_64 - Intel i7-11800H 116 | ![1p1cl](https://i.imgur.com/2aVkRSG.png) 117 | ![2p2cl](https://i.imgur.com/2jvkYWb.png) 118 | ![4p4cl](https://i.imgur.com/hjwKwZA.png) 119 | ![8p8cl](https://i.imgur.com/0ij0eo8.png) 120 | ![16p16cl](https://i.imgur.com/1ZoUIlb.png) 121 | -------------------------------------------------------------------------------- /include/atomic_queues.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright Joad Nacer 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 5 | associated documentation files (the “Software”), to deal in the Software without restriction, 6 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 7 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all copies or 11 | substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 14 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 15 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 16 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | namespace jdz 28 | { 29 | 30 | // Determines whether capacity should be enforced to be a power of two size, allowing for use of bitwise instead of a modulo/ternary for mpmc/spsc. 
31 | struct EnforcePowerOfTwo {}; 32 | struct DoNotEnforcePowerOfTwo {}; 33 | 34 | // Passing as final template parameter to a queue will determine if using HeapBuffer or StackBuffer (default = HeapBuffer) 35 | // Note that using UseStackBuffer may cause a segfault when attempting to create too large of an array. 36 | // Ensure you pick an appropriate capacity for your stack size limit, or increase this using ie `ulimit -s` on linux 37 | struct UseHeapBuffer; 38 | struct UseStackBuffer; 39 | 40 | namespace details 41 | { 42 | 43 | #if defined(__cpp_lib_hardware_interference_size) 44 | static constexpr size_t cache_line = std::hardware_destructive_interference_size; 45 | #else 46 | static constexpr size_t cache_line = 64; 47 | #endif 48 | 49 | static constexpr bool is_power_of_two(size_t n) { 50 | return (n & (n-1)) == 0; 51 | } 52 | 53 | template 54 | concept IsEnforcePowerOfTwo = std::is_same_v; 55 | 56 | template 57 | concept IsDoNotEnforcePowerOfTwo = std::is_same_v; 58 | 59 | template 60 | concept IsValidSizeConstraint = IsEnforcePowerOfTwo || IsDoNotEnforcePowerOfTwo; 61 | 62 | template 63 | concept AlignedToCacheLine = alignof(T) % cache_line == 0; 64 | 65 | template 66 | concept SizeMultipleOfCacheLine = sizeof(T) % cache_line == 0; 67 | 68 | template 69 | concept FalseSharingSafe = AlignedToCacheLine && SizeMultipleOfCacheLine; 70 | 71 | template 72 | concept OptionalPowerOfTwo = IsValidSizeConstraint 73 | && (IsDoNotEnforcePowerOfTwo || is_power_of_two(N)); 74 | 75 | template 76 | concept ZeroOrGreaterThanOne = N == 0 || N > 1; 77 | 78 | template 79 | concept IsTrivialType = std::is_trivially_copyable_v 80 | && std::is_trivially_destructible_v 81 | && sizeof(T) <= sizeof(uint64_t); 82 | 83 | template 84 | concept BufferType = std::is_same_v || std::is_same_v; 85 | 86 | template 87 | constexpr size_t PlusOneIfNotPowerOfTwo = N == 0 ? 0 : (std::is_same_v ? N : N + 1); 88 | 89 | template 90 | size_t plus_one_if_not_pow2(const size_t n) { 91 | return n == 0 ? 
0 : (std::is_same_v ? n : n + 1); 92 | } 93 | 94 | template 95 | class HeapBuffer 96 | { 97 | private: 98 | T *buffer_; 99 | 100 | const size_t buffer_size_; 101 | const size_t buffer_mask_; 102 | 103 | std::allocator allocator_; 104 | 105 | public: 106 | explicit HeapBuffer(const size_t buffer_size, const std::allocator &allocator) 107 | : buffer_size_(buffer_size), 108 | buffer_mask_(buffer_size_ - 1), 109 | allocator_(allocator) { 110 | buffer_ = allocator_.allocate(buffer_size_ + 1); 111 | } 112 | 113 | ~HeapBuffer() noexcept { 114 | allocator_.deallocate(buffer_, buffer_size_ + 1); 115 | } 116 | 117 | T& operator[](const size_t index) noexcept { 118 | if constexpr (!ApplyModulo) { 119 | return buffer_[index]; 120 | } 121 | else if constexpr (IsEnforcePowerOfTwo) { 122 | return buffer_[index & buffer_mask_]; 123 | } 124 | else if constexpr (N != 0) { 125 | return buffer_[index % N]; 126 | } 127 | else { 128 | return buffer_[index % buffer_size_]; 129 | } 130 | } 131 | 132 | const T& operator[](const size_t index) const noexcept { 133 | if constexpr (!ApplyModulo) { 134 | return buffer_[index]; 135 | } 136 | if constexpr (IsEnforcePowerOfTwo) { 137 | return buffer_[index & buffer_mask_]; 138 | } 139 | else if constexpr (N != 0) { 140 | return buffer_[index % N]; 141 | } 142 | else { 143 | return buffer_[index % buffer_size_]; 144 | } 145 | } 146 | }; 147 | 148 | template 149 | class StackBuffer 150 | { 151 | private: 152 | std::array buffer_; 153 | 154 | public: 155 | StackBuffer(auto, auto) noexcept {} 156 | 157 | ~StackBuffer() noexcept = default; 158 | 159 | T& operator[](const size_t index) noexcept { 160 | if constexpr (ApplyModulo) { 161 | return buffer_[index % N]; 162 | } 163 | else { 164 | return buffer_[index]; 165 | } 166 | } 167 | 168 | const T& operator[](const size_t index) const noexcept { 169 | if constexpr (ApplyModulo) { 170 | return buffer_[index % N]; 171 | } 172 | else { 173 | return buffer_[index]; 174 | } 175 | } 176 | }; 177 | 178 | 
template 179 | struct SeqField; 180 | 181 | template <> 182 | struct SeqField { 183 | alignas(cache_line) std::atomic seq; 184 | 185 | SeqField(size_t i) : seq(i) {} 186 | }; 187 | 188 | template <> 189 | struct SeqField { 190 | SeqField(size_t) {} 191 | }; 192 | 193 | template 194 | struct IsConstructedField; 195 | 196 | template 197 | struct IsConstructedField { 198 | IsConstructedField(bool is_constructed) {} 199 | }; 200 | 201 | template<> 202 | struct IsConstructedField { 203 | bool is_constructed; 204 | 205 | IsConstructedField(bool is_constructed) : is_constructed(is_constructed) {} 206 | }; 207 | 208 | template<> 209 | struct IsConstructedField { 210 | #ifdef __aarch64__ 211 | alignas(cache_line) bool is_constructed; 212 | #else 213 | bool is_constructed; 214 | #endif 215 | 216 | IsConstructedField(bool is_constructed) : is_constructed(is_constructed) {} 217 | }; 218 | 219 | template 220 | using RawData = std::array; 221 | 222 | template 223 | class Cell; 224 | 225 | template 226 | requires IsTrivialType 227 | class Cell : public SeqField { 228 | private: 229 | T val_; 230 | 231 | public: 232 | Cell() : SeqField(0) {} 233 | 234 | Cell(size_t i) : SeqField(i) {} 235 | 236 | void construct(T val) noexcept { 237 | val_ = val; 238 | } 239 | 240 | T read() noexcept { 241 | return val_; 242 | } 243 | 244 | void destroy() {} 245 | }; 246 | 247 | template 248 | class Cell : public SeqField, 249 | public IsConstructedField> 250 | { 251 | private: 252 | static constexpr bool IsTriviallyDestructible = std::is_trivially_destructible_v; 253 | static constexpr bool IsNotTriviallyDestructible = !IsTriviallyDestructible; 254 | 255 | alignas(alignof(T)) RawData data_; 256 | 257 | public: 258 | Cell() : SeqField(0), IsConstructedField(false) {} 259 | 260 | Cell(size_t i) : SeqField(i), IsConstructedField(false) {} 261 | 262 | ~Cell() noexcept 263 | requires IsTriviallyDestructible {} 264 | 265 | ~Cell() noexcept { 266 | if constexpr (IsNotTriviallyDestructible) { 267 | if 
(this->is_constructed) destroy(); 268 | } 269 | } 270 | 271 | template 272 | void construct(Args &&...args) noexcept 273 | requires std::is_nothrow_constructible_v { 274 | new (&data_) T(std::forward(args)...); 275 | 276 | if constexpr (IsNotTriviallyDestructible) { 277 | this->is_constructed = true; 278 | } 279 | } 280 | 281 | void destroy() noexcept { 282 | if constexpr (IsNotTriviallyDestructible) { 283 | reinterpret_cast(&data_)->~T(); 284 | 285 | this->is_constructed = false; 286 | } 287 | } 288 | 289 | T &&read() noexcept { 290 | return reinterpret_cast(data_); 291 | } 292 | }; 293 | 294 | template 295 | concept IsValidQueue = ZeroOrGreaterThanOne 296 | && IsValidSizeConstraint 297 | && OptionalPowerOfTwo; 298 | 299 | template 300 | concept IsValidMpmcQueue = IsValidQueue && FalseSharingSafe>; 301 | 302 | template 303 | concept IsValidSpscQueue = IsValidQueue; 304 | 305 | template < 306 | typename DerivedImpl, 307 | bool UseSeq, 308 | bool ApplyModulo, 309 | typename T, 310 | size_t N = 0, 311 | IsValidSizeConstraint SizeConstraint = EnforcePowerOfTwo, 312 | BufferType BufType = UseHeapBuffer 313 | > 314 | requires IsValidQueue 315 | class BaseQueue 316 | { 317 | private: 318 | static constexpr bool UseStack = std::is_same_v; 319 | 320 | static_assert(!UseStack || (UseStack && N != 0), 321 | "Capacity must be set via comptime-parameter to a non-zero value if using UseStackBuffer"); 322 | 323 | using value_t = Cell; 324 | using heap_buf = HeapBuffer; 325 | using stack_buf = StackBuffer; 326 | 327 | using allocator_t = typename std::allocator; 328 | 329 | using buffer_t = typename std::conditional_t; 330 | 331 | protected: 332 | alignas(cache_line) buffer_t buffer_; 333 | 334 | alignas(cache_line) const size_t buffer_size_; 335 | 336 | public: 337 | explicit BaseQueue(const size_t buffer_size = N, const allocator_t &allocator = allocator_t()) 338 | : buffer_(buffer_size, allocator), buffer_size_(buffer_size) { 339 | 340 | if constexpr (N != 0) { 341 | 
assert(buffer_size == N 342 | && "Do not specify a constructor buffer size different from compile-time buffer size"); 343 | } 344 | else { 345 | if (buffer_size <= 1) { 346 | throw std::invalid_argument("buffer_size should be greater than 1"); 347 | } 348 | if (IsEnforcePowerOfTwo && !details::is_power_of_two(buffer_size)) { 349 | throw std::invalid_argument("buffer_size should be a power of 2"); 350 | } 351 | } 352 | 353 | for (size_t i = 0; i < buffer_size; i++) { 354 | new (&buffer_[i]) value_t(i); 355 | } 356 | 357 | } 358 | 359 | ~BaseQueue() noexcept { 360 | for (size_t i = 0; i < buffer_size_; i++) { 361 | buffer_[i].~Cell(); 362 | } 363 | } 364 | 365 | BaseQueue(const BaseQueue &) = delete; 366 | BaseQueue &operator=(const BaseQueue &) = delete; 367 | 368 | template 369 | void emplace(Args &&... args) noexcept 370 | requires std::is_nothrow_constructible_v { 371 | static_cast(this)->emplace(std::forward(args)...); 372 | } 373 | 374 | void push(const T &data) noexcept 375 | requires std::is_nothrow_copy_constructible_v { 376 | static_cast(this)->emplace(data); 377 | } 378 | 379 | template 380 | void push(P &&data) noexcept 381 | requires std::is_nothrow_constructible_v { 382 | static_cast(this)->emplace(std::forward

(data)); 383 | } 384 | 385 | template 386 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 387 | requires std::is_nothrow_constructible_v { 388 | return static_cast(this)->try_emplace(std::forward(args)...); 389 | } 390 | 391 | [[nodiscard]] bool try_push(const T &data) noexcept 392 | requires std::is_nothrow_copy_constructible_v { 393 | return static_cast(this)->try_emplace(data); 394 | } 395 | 396 | template 397 | [[nodiscard]] bool try_push(P &&data) noexcept 398 | requires std::is_nothrow_constructible_v { 399 | return static_cast(this)->try_emplace(std::forward

(data)); 400 | } 401 | 402 | void pop(T &v) noexcept { 403 | static_cast(this)->pop(v); 404 | } 405 | 406 | [[nodiscard]] bool try_pop(T &v) noexcept { 407 | return static_cast(this)->try_pop(v); 408 | } 409 | 410 | /// Will return a negative value if there are one or more readers waiting on an empty queue 411 | template 412 | [[nodiscard]]typename std::enable_if_t, size_t> 413 | size() const noexcept { 414 | std::ptrdiff_t diff = static_cast(this)->get_enqueue_pos() 415 | - static_cast(this)->get_dequeue_pos(); 416 | 417 | if (diff < 0) diff += buffer_size_; 418 | 419 | return static_cast(diff); 420 | } 421 | 422 | template 423 | [[nodiscard]] typename std::enable_if_t, size_t> 424 | size() const noexcept { 425 | return static_cast(this)->get_enqueue_pos() 426 | - static_cast(this)->get_dequeue_pos(); 427 | } 428 | 429 | [[nodiscard]] size_t capacity() const noexcept { 430 | return buffer_size_; 431 | } 432 | 433 | [[nodiscard]] bool empty() const noexcept { 434 | return size() <= 0; 435 | } 436 | 437 | }; 438 | 439 | static inline bool cas_add_one_relaxed(std::atomic &atomic, size_t& val) noexcept { 440 | return atomic.compare_exchange_weak( 441 | val, val + 1, std::memory_order::relaxed, std::memory_order::relaxed); 442 | } 443 | 444 | } // namespace details 445 | 446 | template < 447 | typename T, 448 | size_t N = 0, 449 | details::IsValidSizeConstraint SizeConstraint = EnforcePowerOfTwo, 450 | details::BufferType BufType = UseHeapBuffer 451 | > 452 | requires details::IsValidMpmcQueue 453 | class MpmcQueue : public details::BaseQueue, true, true, T, N, SizeConstraint, BufType> 454 | { 455 | private: 456 | using base_queue_t = details::BaseQueue, true, true, T, N, SizeConstraint, BufType>; 457 | using allocator_t = typename std::allocator>; 458 | 459 | alignas(details::cache_line) std::atomic enqueue_pos_{0}; 460 | alignas(details::cache_line) std::atomic dequeue_pos_{0}; 461 | 462 | public: 463 | explicit MpmcQueue(const size_t buffer_size = N, const 
allocator_t &allocator = allocator_t()) 464 | : base_queue_t(buffer_size, allocator) {} 465 | 466 | template 467 | void emplace(Args &&... args) noexcept 468 | requires std::is_nothrow_constructible_v { 469 | size_t pos = enqueue_pos_.fetch_add(1, std::memory_order::relaxed); 470 | auto &cell = this->buffer_[pos]; 471 | 472 | while (pos != cell.seq.load(std::memory_order::acquire)); 473 | 474 | cell.construct(std::forward(args)...); 475 | cell.seq.store(pos+1, std::memory_order::release); 476 | } 477 | 478 | template 479 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 480 | requires std::is_nothrow_constructible_v { 481 | while (true) { 482 | size_t pos = enqueue_pos_.load(std::memory_order::relaxed); 483 | auto &cell = this->buffer_[pos]; 484 | 485 | const size_t seq = cell.seq.load(std::memory_order::acquire); 486 | const int64_t diff = seq - pos; 487 | 488 | if (diff == 0 && details::cas_add_one_relaxed(enqueue_pos_, pos)) { 489 | cell.construct(std::forward(args)...); 490 | cell.seq.store(pos+1, std::memory_order::release); 491 | 492 | return true; 493 | } 494 | else if (diff < 0) { 495 | return false; 496 | } 497 | } 498 | } 499 | 500 | void pop(T &v) noexcept { 501 | const size_t pos = dequeue_pos_.fetch_add(1, std::memory_order::relaxed); 502 | auto &cell = this->buffer_[pos]; 503 | 504 | while (pos + 1 != cell.seq.load(std::memory_order::acquire)); 505 | 506 | v = cell.read(); 507 | cell.destroy(); 508 | 509 | cell.seq.store(pos + this->buffer_size_, std::memory_order::release); 510 | } 511 | 512 | [[nodiscard]] bool try_pop(T &v) noexcept { 513 | while (true) { 514 | size_t pos = dequeue_pos_.load(std::memory_order::relaxed); 515 | auto &cell = this->buffer_[pos]; 516 | 517 | const size_t seq = cell.seq.load(std::memory_order::acquire); 518 | const int64_t diff = seq - (pos + 1); 519 | 520 | if (diff == 0 && details::cas_add_one_relaxed(dequeue_pos_, pos)) { 521 | v = cell.read(); 522 | cell.destroy(); 523 | 524 | cell.seq.store(pos + 
this->buffer_size_, std::memory_order::release); 525 | 526 | return true; 527 | } 528 | else if (diff < 0) { 529 | return false; 530 | } 531 | } 532 | } 533 | 534 | [[nodiscard]] size_t get_enqueue_pos() const noexcept { 535 | return enqueue_pos_.load(std::memory_order::relaxed); 536 | } 537 | 538 | [[nodiscard]] size_t get_dequeue_pos() const noexcept { 539 | return dequeue_pos_.load(std::memory_order::relaxed); 540 | } 541 | }; 542 | 543 | template < 544 | typename T, 545 | size_t N = 0, 546 | details::IsValidSizeConstraint SizeConstraint = EnforcePowerOfTwo, 547 | details::BufferType BufType = UseHeapBuffer 548 | > 549 | requires details::IsValidSpscQueue 550 | class SpscQueue : public details::BaseQueue< 551 | SpscQueue, 552 | false, std::is_same_v, 554 | T, 555 | #ifdef __clang__ 556 | N, 557 | #else 558 | details::PlusOneIfNotPowerOfTwo, 559 | #endif 560 | SizeConstraint, BufType 561 | > 562 | { 563 | private: 564 | using base_queue_t = details::BaseQueue< 565 | SpscQueue, 566 | false, std::is_same_v, 568 | T, 569 | #ifdef __clang__ 570 | N, 571 | #else 572 | details::PlusOneIfNotPowerOfTwo, 573 | #endif 574 | SizeConstraint, BufType 575 | >; 576 | 577 | using allocator_t = typename std::allocator>; 578 | 579 | alignas(details::cache_line) std::atomic enqueue_pos_{0}; 580 | size_t cached_enqueue_limit_ = 0; 581 | 582 | alignas(details::cache_line) std::atomic dequeue_pos_{0}; 583 | size_t cached_dequeue_limit_ = 0; 584 | 585 | public: 586 | explicit SpscQueue(const size_t buffer_size = N, const allocator_t &allocator = allocator_t()) 587 | #ifdef __clang__ 588 | : base_queue_t(buffer_size, allocator) {} 589 | #else 590 | : base_queue_t(details::plus_one_if_not_pow2(buffer_size), allocator) {} 591 | #endif 592 | 593 | template 594 | void emplace(Args &&... 
// --- Review notes (documentation only; code below is byte-identical) ---
// This chunk is a flattened repository dump: the "NNN | " prefixes embedded in the
// lines below are line numbers from the dump tool, not program text.
// NOTE(review): the extraction that produced this text stripped the contents of
// every angle-bracket pair. All `template` headers have lost their parameter
// lists, `std::is_nothrow_constructible_v` has lost its arguments (presumably
// `<T, Args...>` -- confirm against the upstream header), `std::forward(args)`
// has lost `<Args>`, and every `std::enable_if_t` has lost its condition.
// Restore these from the original include/atomic_queues.hpp before compiling.
//
// Content of this span: the tail of a bounded queue class (presumably SpscQueue,
// per the README) -- blocking emplace, try_emplace, blocking pop, and the start
// of try_pop. The pattern visible in the code: the thread's own index
// (enqueue_pos_ / dequeue_pos_) is loaded relaxed; the opposite index is re-read
// with acquire only when a cached limit (cached_enqueue_limit_ /
// cached_dequeue_limit_) says the queue looks full (producer) or empty
// (consumer); the updated index is published with a release store. Cells expose
// construct()/read()/destroy(), so element lifetime is managed manually per slot.
args) noexcept 595 | requires std::is_nothrow_constructible_v { 596 | const size_t pos = enqueue_pos_.load(std::memory_order::relaxed); 597 | const size_t pos_tmp = get_pos_tmp(pos); 598 | auto &cell = this->buffer_[pos]; 599 | 600 | while (pos_tmp == cached_enqueue_limit_) { 601 | cached_enqueue_limit_ = dequeue_pos_.load(std::memory_order::acquire) + buffer_size_if_pow2(); 602 | } 603 | 604 | cell.construct(std::forward(args)...); 605 | enqueue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 606 | } 607 | 608 | template 609 | [[nodiscard]] bool try_emplace(Args &&... args) noexcept 610 | requires std::is_nothrow_constructible_v { 611 | const size_t pos = enqueue_pos_.load(std::memory_order::relaxed); 612 | const size_t pos_tmp = get_pos_tmp(pos); 613 | auto &cell = this->buffer_[pos]; 614 | 615 | if (pos_tmp == cached_enqueue_limit_) { 616 | cached_enqueue_limit_ = dequeue_pos_.load(std::memory_order::acquire) + buffer_size_if_pow2(); 617 | 618 | if (pos_tmp == cached_enqueue_limit_) return false; 619 | } 620 | 621 | cell.construct(std::forward(args)...); 622 | enqueue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 623 | 624 | return true; 625 | } 626 | 627 | void pop(T &v) noexcept { 628 | const size_t pos = dequeue_pos_.load(std::memory_order::relaxed); 629 | const size_t pos_tmp = get_pos_tmp(pos); 630 | auto &cell = this->buffer_[pos]; 631 | 632 | while (pos == cached_dequeue_limit_) { 633 | cached_dequeue_limit_ = enqueue_pos_.load(std::memory_order::acquire); 634 | } 635 | 636 | v = cell.read(); 637 | cell.destroy(); 638 | 639 | dequeue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 640 | } 641 | 642 | [[nodiscard]] bool try_pop(T &v) noexcept { 643 | const size_t pos = dequeue_pos_.load(std::memory_order::relaxed); 644 | const size_t pos_tmp = get_pos_tmp(pos); 645 | auto &cell = this->buffer_[pos]; 646 | 647 | if (pos == cached_dequeue_limit_) { 648 | cached_dequeue_limit_ =
// try_pop tail: one acquire re-read of the producer index, then read/destroy the
// cell. After that: relaxed getters for both indices, then the wrap/offset
// helpers. Under __clang__ a plain overload set is used; otherwise
// enable_if-selected overloads (their conditions were stripped by extraction)
// choose between: runtime-capacity wrap (`pos + 1 == this->buffer_size_`),
// compile-time-capacity wrap (`pos + 1 == (N+1)` -- note the buffer appears to be
// one slot larger than the requested capacity N), and -- presumably -- a
// power-of-two mode where get_pos_tmp/get_store_pos defer wrapping to masking
// done elsewhere (get_store_pos returns pos or pos + 1 depending on mode).
// TODO confirm the stripped enable_if conditions against the upstream header.
enqueue_pos_.load(std::memory_order::acquire); 649 | 650 | if (pos == cached_dequeue_limit_) return false; 651 | } 652 | 653 | v = cell.read(); 654 | cell.destroy(); 655 | 656 | dequeue_pos_.store(get_store_pos(pos_tmp), std::memory_order::release); 657 | 658 | return true; 659 | } 660 | 661 | [[nodiscard]] size_t get_enqueue_pos() const noexcept { 662 | return enqueue_pos_.load(std::memory_order::relaxed); 663 | } 664 | 665 | [[nodiscard]] size_t get_dequeue_pos() const noexcept { 666 | return dequeue_pos_.load(std::memory_order::relaxed); 667 | } 668 | 669 | #ifdef __clang__ 670 | 671 | template 672 | [[nodiscard]] typename std::enable_if_t 673 | get_pos_tmp(const size_t pos) const noexcept { 674 | return pos + 1 == this->buffer_size_ ? 0 : pos + 1; 675 | } 676 | 677 | template 678 | [[nodiscard]] typename std::enable_if_t 679 | get_pos_tmp(const size_t pos) const noexcept { 680 | return pos + 1 == (N+1) ? 0 : pos + 1; 681 | } 682 | 683 | [[nodiscard]] size_t get_store_pos(const size_t pos) const noexcept { 684 | return pos; 685 | } 686 | 687 | [[nodiscard]] size_t buffer_size_if_pow2() const noexcept { 688 | return 0; 689 | } 690 | 691 | #else 692 | 693 | template 694 | [[nodiscard]] typename std::enable_if_t && NN == 0, size_t> 695 | get_pos_tmp(const size_t pos) const noexcept { 696 | return pos + 1 == this->buffer_size_ ? 0 : pos + 1; 697 | } 698 | 699 | template 700 | [[nodiscard]] typename std::enable_if_t && NN != 0, size_t> 701 | get_pos_tmp(const size_t pos) const noexcept { 702 | return pos + 1 == (N+1) ?
// Remaining helper overloads, the end of the queue class and of namespace jdz,
// followed by the head of src/atomic_queue_test.cpp: a Catch2 test file
// (#include targets were stripped by extraction; restore Catch2 and standard
// headers). The macros count constructor/destructor calls through the file-local
// globals below. NOTE(review): those counters are plain size_t, not atomic --
// fine here only because the counting (unit) tests run single-threaded; the
// fuzz tests use uint64_t payloads and never touch the counters concurrently.
0 : pos + 1; 703 | } 704 | 705 | template 706 | [[nodiscard]] typename std::enable_if_t, size_t> 707 | get_pos_tmp(const size_t pos) const noexcept { 708 | return pos; 709 | } 710 | 711 | template 712 | [[nodiscard]] typename std::enable_if_t, size_t> 713 | get_store_pos(const size_t pos) const noexcept { 714 | return pos; 715 | } 716 | 717 | template 718 | [[nodiscard]] typename std::enable_if_t, size_t> 719 | get_store_pos(const size_t pos) const noexcept { 720 | return pos + 1; 721 | } 722 | 723 | template 724 | [[nodiscard]] typename std::enable_if_t, size_t> 725 | buffer_size_if_pow2() const noexcept { 726 | return 0; 727 | } 728 | 729 | template 730 | [[nodiscard]] typename std::enable_if_t && NN == 0, size_t> 731 | buffer_size_if_pow2() const noexcept { 732 | return this->buffer_size_; 733 | } 734 | 735 | template 736 | [[nodiscard]] typename std::enable_if_t && NN != 0, size_t> 737 | buffer_size_if_pow2() const noexcept { 738 | return N; 739 | } 740 | 741 | #endif 742 | }; 743 | 744 | 745 | } // namespace jdz -------------------------------------------------------------------------------- /src/atomic_queue_test.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "atomic_queues.hpp" 11 | 12 | using namespace jdz; 13 | 14 | #define RESET_CONSTRUCTORS constructor_count = 0; destructor_count = 0; 15 | 16 | #define REQUIRE_CONSTRUCTORS(CONS, DES) REQUIRE(constructor_count == CONS); REQUIRE(destructor_count == DES); 17 | 18 | #define TEST_CONSTRUCTORS(METHOD, NUM) RESET_CONSTRUCTORS METHOD; REQUIRE_CONSTRUCTORS(NUM, NUM) 19 | 20 | #define FUZZ_ROUNDS 1'000'000 21 | 22 | static size_t constructor_count = 0; 23 | static size_t destructor_count = 0; 24 | 25 | class TestElement { 26 | public: 27 | TestElement() noexcept : value(0) { 28 | constructor_count++; 29 | } 30 | 31 | TestElement(int value)
// --- Review notes (documentation only; code below is byte-identical) ---
// Remainder of TestElement (every ctor bumps constructor_count, the dtor bumps
// destructor_count -- this is how the TEST_CONSTRUCTORS macro verifies the queue
// constructs and destroys exactly the expected number of elements), then the
// unit-test helpers and the fuzz harness. Template parameter lists and template
// arguments below were stripped by the extraction (e.g. the `template` headers
// and the `MpmcQueue`/`SpscQueue` instantiation lists in TEMPLATE_TEST_CASE are
// empty) -- restore from the upstream test file.
// NOTE(review): testDoubleFillAndEmpty compares the signed loop variable `j`
// against the unsigned `capacity` (`j <= capacity`) -- harmless for the tiny
// capacities used here, but it will trigger -Wsign-compare.
// NOTE(review): the file-scope `static TestElement elem;` increments
// constructor_count at static-init time; benign because RESET_CONSTRUCTORS zeroes
// the counters before each measurement.
noexcept: value(value) { 32 | constructor_count++; 33 | } 34 | 35 | TestElement(const TestElement& other) noexcept { 36 | constructor_count++; 37 | value = other.value; 38 | } 39 | 40 | ~TestElement() noexcept { 41 | destructor_count++; 42 | } 43 | 44 | private: 45 | int value; 46 | }; 47 | 48 | static TestElement elem; 49 | 50 | template 51 | void testDoubleFillAndEmpty(size_t capacity) { 52 | T queue(capacity); 53 | 54 | for (int i = 0; i < 2; i++) { 55 | REQUIRE(queue.empty()); 56 | REQUIRE(queue.size() == 0); 57 | 58 | for (int j = 1; j <= capacity; j++) { 59 | queue.emplace(); 60 | REQUIRE(queue.size() == j); 61 | } 62 | 63 | for (int j = capacity - 1; j >= 0; j--) { 64 | queue.pop(elem); 65 | REQUIRE(queue.size() == j); 66 | } 67 | } 68 | } 69 | 70 | template 71 | void testAllMethods(size_t capacity) { 72 | assert(capacity >= 2 && "test requires capacity >= 2"); 73 | 74 | T queue(capacity); 75 | 76 | REQUIRE(queue.empty()); 77 | REQUIRE(queue.size() == 0); 78 | 79 | queue.emplace(); 80 | queue.push(elem); 81 | 82 | REQUIRE(!queue.empty()); 83 | REQUIRE(queue.size() == 2); 84 | 85 | queue.pop(elem); 86 | 87 | REQUIRE(queue.size() == 1); 88 | REQUIRE(queue.try_pop(elem) == true); 89 | REQUIRE(queue.size() == 0); 90 | REQUIRE(queue.try_pop(elem) == false); 91 | 92 | REQUIRE(queue.try_emplace() == true); 93 | REQUIRE(queue.try_push(elem) == true); 94 | REQUIRE(queue.size() == 2); 95 | 96 | for (size_t i = 2; i < capacity; i++) { 97 | queue.emplace(); 98 | } 99 | 100 | REQUIRE(queue.size() == capacity); 101 | REQUIRE(queue.try_emplace() == false); 102 | REQUIRE(queue.try_push(elem) == false); 103 | REQUIRE(queue.size() == capacity); 104 | } 105 | 106 | template 107 | void fuzz_write_worker(T &queue, std::atomic &global_sum, size_t rounds, size_t start_val) { 108 | uint64_t sum = 0; 109 | 110 | for (size_t i = start_val; i < rounds + start_val; i++) { 111 | queue.emplace(i); 112 | sum += i; 113 | } 114 | 115 | global_sum += sum; 116 | } 117 | 118 | template 119 |
// fuzzTest: spawns num_threads writers and num_threads readers over one queue and
// checks the sum of everything written equals the sum of everything read -- a
// lost, duplicated, or corrupted element would break the equality.
// NOTE(review): `rand()` is never seeded, so start values are deterministic
// across runs -- presumably intentional for reproducibility; confirm.
// FUZZ_ROUNDS/num_threads truncates for non-divisor thread counts; the counts
// used here (1, 4, 8) divide 1'000'000 exactly, so writers and readers agree.
void fuzz_read_worker(T &queue, std::atomic &global_sum, size_t rounds) { 120 | uint64_t sum = 0; 121 | 122 | for (size_t i = 0; i < rounds; i++) { 123 | uint64_t val; 124 | 125 | queue.pop(val); 126 | 127 | sum += val; 128 | } 129 | 130 | global_sum += sum; 131 | } 132 | 133 | template 134 | void fuzzTest(size_t num_threads, size_t capacity) { 135 | std::atomic write_sum{0}; 136 | std::atomic read_sum{0}; 137 | 138 | std::vector threads(num_threads * 2); 139 | 140 | T queue(capacity); 141 | 142 | for (size_t i = 0; i < num_threads; i++) { 143 | size_t r = i + num_threads; 144 | 145 | uint64_t rand_start = rand(); 146 | 147 | threads[i] = std::thread(fuzz_write_worker, std::ref(queue), std::ref(write_sum), FUZZ_ROUNDS/num_threads, rand_start); 148 | threads[r] = std::thread(fuzz_read_worker, std::ref(queue), std::ref(read_sum), FUZZ_ROUNDS/num_threads); 149 | } 150 | 151 | for (size_t i = 0; i < num_threads * 2; i++) { 152 | threads[i].join(); 153 | } 154 | 155 | REQUIRE(write_sum == read_sum); 156 | } 157 | 158 | TEMPLATE_TEST_CASE("MpmcQueueTest PowerOfTwo", "[unit][mpmcqueue]", 159 | (MpmcQueue), 160 | (MpmcQueue), 161 | (MpmcQueue), 162 | (MpmcQueue), 163 | (MpmcQueue), 164 | (MpmcQueue)) { 165 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(4), 8); 166 | 167 | TEST_CONSTRUCTORS(testAllMethods(4), 6); 168 | } 169 | 170 | TEMPLATE_TEST_CASE("MpmcQueueTest NonPowerOfTwo", "[unit][mpmcqueue]", 171 | (MpmcQueue), 172 | (MpmcQueue), 173 | (MpmcQueue)) { 174 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(5), 10); 175 | 176 | TEST_CONSTRUCTORS(testAllMethods(5), 7); 177 | } 178 | 179 | TEMPLATE_TEST_CASE("SpscQueueTest PowerOfTwo", "[unit][spscqueue]", 180 | (SpscQueue), 181 | (SpscQueue), 182 | (SpscQueue), 183 | (SpscQueue), 184 | (SpscQueue), 185 | (SpscQueue)) { 186 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(4), 8); 187 | 188 | TEST_CONSTRUCTORS(testAllMethods(4), 6); 189 | } 190 | 191 | TEMPLATE_TEST_CASE("SpscQueueTest NonPowerOfTwo", "[unit][spscqueue]", 192 |
// Fuzz TEMPLATE_TEST_CASEs: power-of-two and non-power-of-two capacities, both
// large (8192/8193) and minimal (2/3), at 1/4/8 producer-consumer pairs for MPMC
// and a single pair for SPSC. The stripped instantiation lists presumably varied
// element type and the compile-time-vs-runtime capacity option -- confirm.
// NOTE(review): "SpscQueueTest PowerOfTwo" / "SpscQueueTest NonPowerOfTwo" (and
// MinSize variants) are used for BOTH the [unit] cases above and the [fuzz]
// cases below; Catch2 requires unique test-case names and will report duplicates
// at startup -- the fuzz cases likely need distinct names (e.g. "SpscFuzz...").
(SpscQueue), 193 | (SpscQueue), 194 | (SpscQueue)) { 195 | TEST_CONSTRUCTORS(testDoubleFillAndEmpty(5), 10); 196 | 197 | TEST_CONSTRUCTORS(testAllMethods(5), 7); 198 | } 199 | 200 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest PowerOfTwo", "[fuzz][mpmcqueue]", 201 | (MpmcQueue), 202 | (MpmcQueue), 203 | (MpmcQueue), 204 | (MpmcQueue), 205 | (MpmcQueue), 206 | (MpmcQueue)) { 207 | fuzzTest(1, 8192); 208 | fuzzTest(4, 8192); 209 | fuzzTest(8, 8192); 210 | } 211 | 212 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest PowerOfTwo MinSize", "[fuzz][mpmcqueue]", 213 | (MpmcQueue), 214 | (MpmcQueue), 215 | (MpmcQueue), 216 | (MpmcQueue), 217 | (MpmcQueue), 218 | (MpmcQueue)) { 219 | fuzzTest(1, 2); 220 | fuzzTest(4, 2); 221 | fuzzTest(8, 2); 222 | } 223 | 224 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest NonPowerOfTwo", "[fuzz][mpmcqueue]", 225 | (MpmcQueue), 226 | (MpmcQueue), 227 | (MpmcQueue)) { 228 | fuzzTest(1, 8193); 229 | fuzzTest(4, 8193); 230 | fuzzTest(8, 8193); 231 | } 232 | 233 | TEMPLATE_TEST_CASE("MpmcFuzzQueueTest NonPowerOfTwo MinSize", "[fuzz][mpmcqueue]", 234 | (MpmcQueue), 235 | (MpmcQueue), 236 | (MpmcQueue)) { 237 | fuzzTest(1, 3); 238 | fuzzTest(4, 3); 239 | fuzzTest(8, 3); 240 | } 241 | 242 | TEMPLATE_TEST_CASE("SpscQueueTest PowerOfTwo", "[fuzz][spscqueue]", 243 | (SpscQueue), 244 | (SpscQueue), 245 | (SpscQueue), 246 | (SpscQueue), 247 | (SpscQueue), 248 | (SpscQueue)) { 249 | fuzzTest(1, 8192); 250 | } 251 | 252 | TEMPLATE_TEST_CASE("SpscQueueTest PowerOfTwo MinSize", "[fuzz][spscqueue]", 253 | (SpscQueue), 254 | (SpscQueue), 255 | (SpscQueue), 256 | (SpscQueue), 257 | (SpscQueue), 258 | (SpscQueue)) { 259 | fuzzTest(1, 2); 260 | } 261 | 262 | TEMPLATE_TEST_CASE("SpscQueueTest NonPowerOfTwo", "[fuzz][spscqueue]", 263 | (SpscQueue), 264 | (SpscQueue), 265 | (SpscQueue)) { 266 | fuzzTest(1, 8193); 267 | } 268 | 269 | TEMPLATE_TEST_CASE("SpscQueueTest NonPowerOfTwo MinSize", "[fuzz][spscqueue]", 270 | (SpscQueue), 271 | (SpscQueue), 272 | (SpscQueue)) { 273 | fuzzTest(1,
// End of the test file, then the whole of src/mpmc_bench.cpp up to run_bench's
// signature (continued in the next span). The bench pushes/pops uint64_t values
// through four MpmcQueue configurations (compile-time vs runtime capacity,
// power-of-two vs not -- the template argument lists were stripped).
// NOTE(review): bounded_mpmc_queue_bench placement-news std::thread objects onto
// elements of a vector that has already default-constructed them
// (`new (&workers[i]) std::thread(...)`) -- constructing over a live object
// without destroying it first; plain assignment `workers[i] = std::thread(...)`
// is the correct form (the fuzzTest above does exactly that).
// NOTE(review): timing uses system_clock; steady_clock is the appropriate
// monotonic clock for benchmarking.
3); 274 | } -------------------------------------------------------------------------------- /src/mpmc_bench.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "atomic_queues.hpp" 10 | 11 | constexpr uint64_t total_rounds = 100'000'000; 12 | constexpr size_t capacity = 65536; 13 | 14 | using jdz_queue_cmp_pow2 = typename jdz::MpmcQueue; 15 | using jdz_queue_run_pow2 = typename jdz::MpmcQueue; 16 | using jdz_queue_cmp = typename jdz::MpmcQueue; 17 | using jdz_queue_run = typename jdz::MpmcQueue; 18 | 19 | struct QueueType { 20 | std::string name; 21 | std::function benchmark; 22 | std::size_t capacity; 23 | }; 24 | 25 | template 26 | void mpmc_read_worker(T &queue, uint64_t rounds) { 27 | uint64_t round = 0; 28 | uint64_t val = 0; 29 | 30 | while (round++ < rounds) { 31 | queue.pop(val); 32 | } 33 | } 34 | 35 | template 36 | void mpmc_write_worker(T &queue, uint64_t rounds) { 37 | uint64_t round = 0; 38 | 39 | while (round++ < rounds) { 40 | queue.push(round); 41 | } 42 | } 43 | 44 | template 45 | auto bounded_mpmc_queue_bench(int num_threads, size_t size) { 46 | T queue(size); 47 | 48 | std::vector workers(num_threads * 2); 49 | 50 | const auto begin_time = std::chrono::system_clock::now(); 51 | 52 | for (int i = 0; i < num_threads; i++) { 53 | new (&workers[i]) std::thread([&queue, num_threads]() { 54 | mpmc_write_worker(queue, total_rounds / num_threads); 55 | }); 56 | } 57 | 58 | for (int i = 0; i < num_threads; i++) { 59 | new (&workers[num_threads + i]) std::thread([&queue, num_threads]() { 60 | mpmc_read_worker(queue, total_rounds / num_threads); 61 | }); 62 | } 63 | 64 | for (int i = 0; i < num_threads * 2; i++) { 65 | workers[i].join(); 66 | } 67 | 68 | const auto end_time = std::chrono::system_clock::now(); 69 | 70 | return (end_time - begin_time).count(); 71 | } 72 | 73 | void run_bench(int num_threads, const
// --- Review notes (documentation only; code below is byte-identical) ---
// Tail of src/mpmc_bench.cpp: run_bench iterates the registered queue
// configurations, runs each benchmark, and prints an aligned name/time table;
// main() treats each CLI argument as a producer+consumer thread count.
// NOTE(review): `benchmark(...)` returns raw clock ticks and the result is
// divided by 1000 then labelled "us" -- that is only microseconds if
// system_clock's period is nanoseconds (true on common libstdc++/libc++ builds,
// but not guaranteed by the standard); casting via
// std::chrono::duration_cast<std::chrono::microseconds> would be exact.
// The element types of the std::vector declarations and std::function signatures
// were stripped by extraction (presumably QueueType and
// int64_t(int, size_t)-like signatures -- confirm upstream).
std::vector& queue_types) { 74 | fprintf(stdout, "=== Num Producers=%d - Num Consumers=%d ===\n", num_threads, num_threads); 75 | 76 | const int time_width = 15; 77 | const int label_width = 15; 78 | 79 | for (const auto& queue_type : queue_types) { 80 | auto time_us = queue_type.benchmark(num_threads, queue_type.capacity) / 1000; 81 | std::cout << std::setw(label_width) << std::left << queue_type.name 82 | << std::setw(time_width) << std::right << time_us << "us | rounds = " << total_rounds << std::endl; 83 | } 84 | } 85 | 86 | int main(int argc, char *argv[]) { 87 | std::vector queue_types = { 88 | {"jdz-cmp-pow2", bounded_mpmc_queue_bench, capacity}, 89 | {"jdz-run-pow2", bounded_mpmc_queue_bench, capacity}, 90 | {"jdz-cmp", bounded_mpmc_queue_bench, capacity + 1}, 91 | {"jdz-run", bounded_mpmc_queue_bench, capacity + 1} 92 | }; 93 | 94 | for (int i = 1; i < argc; i++) { 95 | run_bench(std::atoi(argv[i]), queue_types); 96 | } 97 | 98 | return 0; 99 | } -------------------------------------------------------------------------------- /src/spsc_bench.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "atomic_queues.hpp" 10 | 11 | constexpr uint64_t total_rounds = 100'000'000; 12 | constexpr size_t capacity = 65536; 13 | 14 | using jdz_queue_cmp_pow2 = typename jdz::SpscQueue; 15 | using jdz_queue_run_pow2 = typename jdz::SpscQueue; 16 | using jdz_queue_cmp = typename jdz::SpscQueue; 17 | using jdz_queue_run = typename jdz::SpscQueue; 18 | 19 | using jdz_mpmc_queue_cmp_pow2 = typename jdz::MpmcQueue; 20 | using jdz_mpmc_queue_run_pow2 = typename jdz::MpmcQueue; 21 | using jdz_mpmc_queue_cmp = typename jdz::MpmcQueue; 22 | using jdz_mpmc_queue_run = typename jdz::MpmcQueue; 23 | 24 | struct QueueType { 25 | std::string name; 26 | std::function benchmark; 27 | std::size_t capacity; 28 | }; 29 | 30 | template 31 | void
// Body of src/spsc_bench.cpp: one writer thread and one reader thread exchange
// total_rounds uint64_t values; both SpscQueue and MpmcQueue configurations are
// measured so the two can be compared under single-producer/single-consumer load.
// NOTE(review): same two issues as the MPMC bench -- placement-new over live
// std::thread elements of an already-constructed vector (prefer assignment), and
// system_clock ticks divided by 1000 printed as "us" (prefer steady_clock +
// duration_cast). Template argument lists in the using-declarations and
// std::vector/std::function declarations were stripped by extraction.
spsc_read_worker(T &queue, uint64_t rounds) { 32 | uint64_t round = 0; 33 | uint64_t val = 0; 34 | 35 | while (round++ < rounds) { 36 | queue.pop(val); 37 | } 38 | } 39 | 40 | template 41 | void spsc_write_worker(T &queue, uint64_t rounds) { 42 | uint64_t round = 0; 43 | 44 | while (round++ < rounds) { 45 | queue.push(round); 46 | } 47 | } 48 | 49 | template 50 | auto bounded_spsc_queue_bench(size_t size) { 51 | T queue(size); 52 | 53 | std::vector workers(2); 54 | 55 | const auto begin_time = std::chrono::system_clock::now(); 56 | 57 | new (&workers[0]) std::thread(spsc_write_worker, std::ref(queue), total_rounds); 58 | new (&workers[1]) std::thread(spsc_read_worker, std::ref(queue), total_rounds); 59 | 60 | workers[0].join(); 61 | workers[1].join(); 62 | 63 | const auto end_time = std::chrono::system_clock::now(); 64 | 65 | return (end_time - begin_time).count(); 66 | } 67 | 68 | int main() { 69 | fprintf(stdout, "=== Num Producers=1 - Num Consumers=1 ===\n"); 70 | 71 | const int time_width = 15; 72 | const int label_width = 20; 73 | 74 | std::vector queue_types = { 75 | {"jdz-cmp-pow2", bounded_spsc_queue_bench, capacity}, 76 | {"jdz-run-pow2", bounded_spsc_queue_bench, capacity}, 77 | {"jdz-cmp", bounded_spsc_queue_bench, capacity + 1}, 78 | {"jdz-run", bounded_spsc_queue_bench, capacity + 1}, 79 | {"jdz-mpmc-cmp-pow2", bounded_spsc_queue_bench, capacity}, 80 | {"jdz-mpmc-run-pow2", bounded_spsc_queue_bench, capacity}, 81 | {"jdz-mpmc-cmp", bounded_spsc_queue_bench, capacity + 1}, 82 | {"jdz-mpmc-run", bounded_spsc_queue_bench, capacity + 1} 83 | }; 84 | 85 | for (const auto& queue_type : queue_types) { 86 | auto time_us = queue_type.benchmark(queue_type.capacity) / 1000; 87 | std::cout << std::setw(label_width) << std::left << queue_type.name 88 | << std::setw(time_width) << std::right << time_us << "us | rounds = " << total_rounds << std::endl; 89 | } 90 | } 91 | 92 | --------------------------------------------------------------------------------