├── .gitignore ├── CMakeLists.txt ├── FastQueueIntegrityTest.cpp ├── LICENSE ├── README.md ├── deaod_spsc ├── LICENSE └── spsc_queue.hpp ├── dro ├── LICENSE └── spsc-queue.hpp ├── fast_queue_arm64.h ├── fast_queue_x86_64.h ├── fastqueue2.png ├── main.cpp ├── pin_thread.h ├── ring_buffer_concept.png └── ringbuffer.png /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | cmake-build-* 3 | build 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.22) 2 | project(fast_queue2) 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | 6 | find_package (Threads REQUIRED) 7 | 8 | #Make sure we only target ARM/x86 64 bit architectures 9 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") 10 | message("Building target x86_64") 11 | elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") 12 | message("Building target arm64") 13 | else () 14 | message("System CPU is: " ${CMAKE_SYSTEM_PROCESSOR}) 15 | message( FATAL_ERROR "Not supported architecture, X86_64 and arm64 is." ) 16 | endif () 17 | 18 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/deaod_spsc/) 19 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dro/) 20 | 21 | add_executable(fast_queue2 main.cpp) 22 | target_link_libraries(fast_queue2 Threads::Threads) 23 | 24 | add_executable(fast_queue_integrity_test FastQueueIntegrityTest.cpp) 25 | target_link_libraries(fast_queue_integrity_test Threads::Threads) 26 | -------------------------------------------------------------------------------- /FastQueueIntegrityTest.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Anders Cedronius 3 | // 4 | 5 | // Lock-free producer (one thread) and consumer (another thread) integrity test 6 | // The test is performed by the producer producing data at an irregular rate in time 7 | // containing random data and a simple checksum + counter. 8 | // And a consumer reading the data at an equally (same dynamic range in time) irregular rate 9 | // verifying the checksum and linearity of the counter. The queue is set shallow (2 entries) to 10 | // make the test face queue full/empty situations as often as possible. 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "pin_thread.h" 18 | #if __x86_64__ || _M_X64 19 | #include "fast_queue_x86_64.h" 20 | #elif __aarch64__ || _M_ARM64 21 | #include "fast_queue_arm64.h" 22 | #else 23 | #error Architecture not supported 24 | #endif 25 | 26 | #define QUEUE_MASK 0b1 27 | #define L1_CACHE_LINE 64 28 | #define TEST_TIME_DURATION_SEC 200 29 | 30 | bool gActiveProducer = true; 31 | std::atomic gActiveConsumer = 0; 32 | bool gStartBench = false; 33 | std::atomic gTransactions = 0; 34 | uint64_t gChk = 0; 35 | 36 | void producer(FastQueue*, QUEUE_MASK, L1_CACHE_LINE> *rQueue, int32_t aCPU) { 37 | std::random_device lRndDevice; 38 | std::mt19937 lMersenneEngine{lRndDevice()}; 39 | std::uniform_int_distribution lDist{1, 500}; 40 | auto lGen = [&lDist, &lMersenneEngine]() { 41 | return lDist(lMersenneEngine); 42 | }; 43 | if (!pinThread(aCPU)) { 44 | std::cout << "Pin CPU fail. 
" << std::endl; 45 | rQueue->stopQueue(); 46 | return; 47 | } 48 | while (!gStartBench) { 49 | #ifdef _MSC_VER 50 | __nop(); 51 | #else 52 | asm("NOP"); 53 | #endif 54 | } 55 | uint64_t lCounter = 0; 56 | while (gActiveProducer) { 57 | auto lpData = new std::vector(1000); 58 | std::generate(lpData->begin(), lpData->end(), lGen); 59 | *(uint64_t *) lpData->data() = lCounter++; 60 | uint64_t lSimpleSum = std::accumulate(lpData->begin() + 16, lpData->end(), 0); 61 | *(uint64_t *) (lpData->data() + 8) = lSimpleSum; 62 | rQueue->push(lpData); 63 | uint64_t lSleep = lDist(lMersenneEngine); 64 | std::this_thread::sleep_for(std::chrono::nanoseconds(lSleep)); 65 | } 66 | rQueue->stopQueue(); 67 | } 68 | 69 | void consumer(FastQueue*, QUEUE_MASK, L1_CACHE_LINE> *rQueue, int32_t aCPU) { 70 | uint64_t lCounter = 0; 71 | std::random_device lRndDevice; 72 | std::mt19937 lMersenneEngine{lRndDevice()}; 73 | std::uniform_int_distribution lDist{1, 500}; 74 | if (!pinThread(aCPU)) { 75 | std::cout << "Pin CPU fail. " << std::endl; 76 | gActiveConsumer--; 77 | return; 78 | } 79 | gActiveConsumer++; 80 | while (true) { 81 | std::vector* lResult = nullptr; 82 | rQueue->pop(lResult); 83 | if (lResult == nullptr) { 84 | break; 85 | } 86 | if (lCounter != *(uint64_t *) lResult->data()) { 87 | std::cout << "Test failed.. Not linear data. " << *(uint64_t *) lResult->data() << std::endl; 88 | gActiveConsumer--; 89 | return; 90 | } 91 | uint64_t lSimpleSum = std::accumulate(lResult->begin() + 16, lResult->end(), 0); 92 | if (lSimpleSum != *(uint64_t *) (lResult->data() + 8)) { 93 | std::cout << "Test failed.. Not consistent data. " << lSimpleSum << " " << lCounter << " " << gChk 94 | << std::endl; 95 | gActiveConsumer--; 96 | return; 97 | } 98 | delete lResult; 99 | lCounter++; 100 | uint64_t lSleep = lDist(lMersenneEngine); 101 | std::this_thread::sleep_for(std::chrono::nanoseconds(lSleep)); 102 | } 103 | gTransactions = lCounter; 104 | gActiveConsumer--; 105 | } 106 | 107 | int main() { 108 | auto lQueue1 = new FastQueue*, QUEUE_MASK, L1_CACHE_LINE>(); 109 | std::thread([lQueue1] { return consumer(lQueue1, 0); }).detach(); 110 | std::thread([lQueue1] { return producer(lQueue1, 2); }).detach(); 111 | std::cout << "Producer -> Consumer (start)" << std::endl; 112 | gStartBench = true; 113 | std::this_thread::sleep_for(std::chrono::seconds(TEST_TIME_DURATION_SEC)); 114 | gActiveProducer = false; 115 | lQueue1->stopQueue(); 116 | std::cout << "Producer -> Consumer (end)" << std::endl; 117 | while (gActiveConsumer) { 118 | std::this_thread::sleep_for(std::chrono::milliseconds(1)); 119 | } 120 | delete lQueue1; 121 | std::cout << "Test ended. Did " << gTransactions << " transactions." 
<< std::endl;
122 |     return EXIT_SUCCESS;
123 | }
124 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Anders Cedronius
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Logo](fastqueue2.png)
2 | 
3 | # FastQueue2
4 | 
5 | FastQueue2 is a rewrite of [FastQueue](https://github.com/andersc/fastqueue). It supports 8-byte transfers only.
6 | 
7 | ## But first
8 | 
9 | * Is this queue memory efficient?
10 | 
11 | No. This queue aims for speed, not memory efficiency.
12 | 
13 | * The queue is ‘dramatically under-synchronized’
14 | 
15 | Write a test and prove it (you can use FastQueueIntegrityTest.cpp as a boilerplate). Don’t just say stuff out of the blue, prove it!
16 | 
17 | * Why not use partial specialization for pointers, since that’s all you support?
18 | 
19 | This queue supports transport of 8 bytes from a producer to a consumer. It might be a pointer and it might not be, which is why no specialization is implemented. However, if we gain speed by specializing for pointers, then let’s implement that. I did not see any gain in my tests, and this queue is all about speed.
20 | 
21 | 
22 | ## Background
23 | 
24 | When I was playing around with benchmarking various SPSC queues, [deaod’s](https://github.com/Deaod/spsc_queue) queue was unbeatable. The titans: [Rigtorp](https://github.com/rigtorp/SPSCQueue), [Folly](https://github.com/facebook/folly/tree/main), [moodycamel](https://github.com/cameron314/concurrentqueue) and [boost](https://www.boost.org/doc/libs/1_66_0/doc/html/lockfree.html) were all left in the dust, and it was especially fast on Apple silicon. My previous attempt ([FastQueue](https://github.com/andersc/fastqueue)) at beating the titans places itself in the top tier, but not at #1. In my queue I also implemented a stop-queue mechanism that is missing from the other implementations. Anyhow…
25 | 
26 | (Update: added [Dro](https://github.com/drogalis/SPSC-Queue/tree/main), a promising SPSC kingpin. Run the benchmarks for results on your platform.)
27 | 
28 | So I took a new, egoistic approach: target my own use cases and investigate whether there were any fundamental changes to the design that could then be made.
I only work with 64-bit CPUs, so let’s target x86_64 and arm64 only. Also, in all my cases I pass pointers around, so limiting the object to an 8-byte payload is fine.
29 | 
30 | In the typical SPSC queue implementation there is a circular buffer where push decides whether it can insert an object by looking at the distance between the tail and head pointers/counters. The same goes for popping an object: if there is a distance between tail and head, there is at least one object to pop. That means that if push runs on one CPU and pop runs on another CPU, the two CPUs share the tail/head counters as well as the object itself.
31 | 
32 | ![(deaods ringbuffer picture)](ring_buffer_concept.png)
33 | 
34 | *The above picture is taken from Deaod’s repo*
35 | 
36 | I concluded that, given the way I limit the use case, it is possible to communicate the queue position through the object itself (I’m aware this is probably not revolutionary; most likely someone at Xerox PARC wrote a paper about it in the 70’s). That means the CPUs do not need to share their counters, they only need to share the object, and they want to share that object anyway. So it’s the absolute minimum amount of data.
37 | However, for this to work without sharing pointers/counters, a slot must unambiguously either contain an object or not. So when the reader thread pops an object it also needs to clear that position in the circular buffer by assigning nullptr, and it needs to do so without tearing, which is why an 8-byte object in a 64-bit environment works.
38 | 
39 | ![(my ringbuffer picture)](ringbuffer.png)
40 | 
41 | So the concept is exactly the same as before; the difference is that we know where we wrote an object last time, and we just check whether it is possible to write an object in the next position before actually committing to it. If that slot is nullptr we can write; if not, the buffer is full and we need to wait for the consumer.
42 | 
43 | So if the tail hits the head we will not write any objects, and if the head hits the tail there are no objects to pop.
44 | 
45 | Using that mechanism we only need to share the actual object between the threads/CPUs.
46 | 
47 | ## The need for speed
48 | 
49 | * So what speed do we get on my M1 Pro?
50 | 
51 | ```
52 | DeaodSPSC pointer test started.
53 | DeaodSPSC pointer test ended.
54 | DeaodSPSC Transactions -> 12389023/s
55 | FastQueue pointer test started.
56 | FastQueue pointer test ended.
57 | FastQueue Transactions -> 17516515/s
58 | ```
59 | 
60 | And that’s a significant improvement over my previous attempt, which sat at around 10+M transactions per second (on my M1 Pro), while Deaod is at 12M.
61 | 
62 | What sparked me initially was Deaod’s total dominance on Apple silicon; now that’s taken care of. Yes! So in my application, the way the compiler compiles the code, I beat Deaod by far.
63 | 
64 | * What about x86_64?
65 | 
66 | I don’t have access to a lot of x86 systems, but I ran the code on a 64-core AMD EPYC 7763. I had to modify the code slightly to beat Deaod.
67 | 
68 | ```
69 | DeaodSPSC pointer test started.
70 | DeaodSPSC pointer test ended.
71 | DeaodSPSC Transactions -> 12397808/s
72 | FastQueue pointer test started.
73 | FastQueue pointer test ended.
74 | FastQueue Transactions -> 13755427/s
75 | ```
76 | 
77 | Great! Still champagne, but it did not totally run over the competition. About 10% faster, so still significant.
78 | 
79 | The header file is under 60 lines of code and uses the combination of atomics and memory barriers that I found most effective.
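To make the mechanism concrete before looking at the actual push and pop, here is a minimal sketch of the idea: the slot itself doubles as the ready flag, with nullptr meaning empty. This is an illustration only, not the shipped header; the names (`MiniQueue`, `Slot`, `SIZE_MASK`, `SPACING`) are made up for the example, it uses plain `std::atomic` acquire/release instead of the fence combination shown below, and it omits the stop-queue mechanism, so it spins forever on a full or empty buffer.

```cpp
#include <atomic>
#include <cstddef>

// Illustrative sketch of the slot-as-flag concept (not the real fast_queue_*.h).
// SIZE_MASK must be (a power of two) - 1. An empty slot holds nullptr.
template <typename T, std::size_t SIZE_MASK, std::size_t SPACING = 64>
class MiniQueue {
    struct alignas(SPACING) Slot {        // space slots apart to avoid false sharing
        std::atomic<T*> mObj{nullptr};
    };
    Slot mRing[SIZE_MASK + 1];
    std::size_t mWritePosition = 0;       // touched only by the producer thread
    std::size_t mReadPosition = 0;        // touched only by the consumer thread

public:
    // Producer: wait until the consumer has cleared the next slot, then publish.
    void push(T* aItem) {
        auto& lSlot = mRing[mWritePosition & SIZE_MASK].mObj;
        while (lSlot.load(std::memory_order_acquire) != nullptr) {} // buffer full
        lSlot.store(aItem, std::memory_order_release);
        ++mWritePosition;
    }

    // Consumer: wait until the slot holds an object, take it, then clear the slot.
    T* pop() {
        auto& lSlot = mRing[mReadPosition & SIZE_MASK].mObj;
        T* lItem;
        while ((lItem = lSlot.load(std::memory_order_acquire)) == nullptr) {} // empty
        lSlot.store(nullptr, std::memory_order_release);
        ++mReadPosition;
        return lItem;
    }
};
```

A producer thread calls `push()` with a pointer and a consumer thread calls `pop()`; only the slot contents travel between the two cores. The real push/pop below follow the same pattern but add the exit checks so a blocked producer or consumer can bail out when the queue is stopped.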
80 | 
81 | Push looks like this:
82 | 
83 | ```cpp
84 | while(mRingBuffer[mWritePosition&RING_BUFFER_SIZE].mObj != nullptr) if (mExitThreadSemaphore) [[unlikely]] return;
85 | new(&mRingBuffer[mWritePosition++&RING_BUFFER_SIZE].mObj) T{std::forward<Args>(args)...};
86 | ```
87 | Simple. Is the slot free? If not, is the queue still operating?
88 | If it's OK to push the object, just put it in the queue.
89 | 
90 | Pop looks like this:
91 | 
92 | ```cpp
93 | std::atomic_thread_fence(std::memory_order_consume);
94 | while (!(aOut = mRingBuffer[mReadPosition & RING_BUFFER_SIZE].mObj)) {
95 |     if (mExitThread == mReadPosition) [[unlikely]] {
96 |         aOut = nullptr;
97 |         return;
98 |     }
99 | }
100 | mRingBuffer[mReadPosition++ & RING_BUFFER_SIZE].mObj = nullptr;
101 | ```
102 | Try popping the object. If successful, mark the slot free.
103 | If popping the object is not successful, is the queue still active?
104 | If it's not, just return nullptr.
105 | 
106 | 
107 | Regarding inline, noexcept and [[unlikely]]… it's there. Yes, I know -O3 always inlines, and I have read what people say about [[unlikely]].
108 | If you don't like it, remove it and send a pull request.
109 | 
110 | ## Usage
111 | 
112 | See the original FastQueue (the link above).
113 | 
114 | (Just move the header file(s) to your project, depending on architecture.)
115 | **fast_queue_arm64.h** / **fast_queue_x86_64.h**
116 | 
117 | ## Build and run the tests
118 | 
119 | ```
120 | git clone https://github.com/andersc/fastqueue2.git
121 | cd fastqueue2
122 | mkdir build
123 | cd build
124 | cmake -DCMAKE_BUILD_TYPE=Release ..
125 | cmake --build .
126 | ```
127 | 
128 | (Run the benchmark against Deaod)
129 | 
130 | **./fast_queue2**
131 | 
132 | (Run the integrity test)
133 | 
134 | **./fast_queue_integrity_test**
135 | 
136 | 
137 | ## Some thoughts
138 | There are a couple of findings that puzzled me.
139 | 
140 | 1. I had to increase the spacing between the objects to two times the cache-line length for x86_64 to gain speed over Deaod. Why? It does not make any sense to me.
141 | 2. Pre-loading the cache when popping (I commented out the code, but play around yourself in the ARM version) did nothing. Modern CPUs pre-load the data speculatively anyway.
142 | 3. I got good speed when the ring buffer size exceeded 1024 entries. Why? My guess is that it irons out the uneven behaviour between the producer and consumer. It's just that my queue saw a significant increase in efficiency from it, while Deaod's did not. Well.
143 | 4. We are micro-benchmarking, so the results should be taken with a grain of salt.
144 | 
145 | Can this be beaten? Yes it can… However, the free version of me is as fast as this. The paid version of me is faster ;-)
146 | 
147 | Have fun
148 | 
149 | 
--------------------------------------------------------------------------------
/deaod_spsc/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2019, Lukas Bagaric
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | - Redistributions of source code must retain the above copyright notice, this
8 |   list of conditions and the following disclaimer.
9 | - Redistributions in binary form must reproduce the above copyright notice,
10 |   this list of conditions and the following disclaimer in the documentation
11 |   and/or other materials provided with the distribution.
12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 19 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 20 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 21 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 22 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /deaod_spsc/spsc_queue.hpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | Copyright (c) 2019, Lukas Bagaric 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | - Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | - Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | ******************************************************************************* 25 | 26 | This file defines a single template, spsc_queue, which implements a bounded 27 | queue with at most one producer, and one consumer at the same time. 28 | 29 | spsc_queue is intended to be used in environments, where heap-allocation must 30 | never occur. While it is possible to use spsc_queue in real-time environments, 31 | the implementation trades a worse worst-case for a significantly better 32 | average-case. 33 | 34 | spsc_queue has highest throughput under contention if: 35 | * you have small (register sized) elements OR 36 | * if the total size of the queue (size of element times number of elements) 37 | will not exceed the size of your processors fastest cache. 38 | 39 | spsc_queue takes up to three template parameters: 40 | * T: The type of a single element 41 | * queue_size: The number of slots for elements within the queue. 42 | Note: Due to implementation details, one slot is reserved and 43 | cannot be used. 
44 | * align_log2: The number of bytes to align on, expressed as an exponent for 45 | two, so the actual alignment is (1 << align_log2) bytes. This 46 | number should be at least log2(alignof(size_t)). Ideal values 47 | avoid destructive hardware interference (false sharing). 48 | Default is 7. 49 | alignof(T) must not be greater than (1 << align_log2). 50 | 51 | Interface: 52 | General: 53 | bool is_empty() const; 54 | Returns true if there is currently no object in the queue. 55 | Returns false otherwise. 56 | 57 | bool is_full() const; 58 | Returns true if no more objects can be added to the queue. 59 | Returns false otherwise. 60 | 61 | Enqueue: 62 | bool push(const T& elem); 63 | bool push(T&& elem); 64 | Tries to insert elem into the queue. Returns true if successful, false 65 | otherwise. 66 | 67 | size_type push_n(size_type count, const T& elem); 68 | Tries to insert count copies of elem into the queue. Returns the 69 | number of copies successfully inserted. 70 | 71 | template 72 | size_type write(Iterator beg, Iterator end); 73 | Tries to copy elements into the queue from beg, until end is reached. 74 | Returns the number of elements copied into the queue. 75 | 76 | template 77 | size_type write(size_type count, Iterator elems); 78 | Tries to copy count elements into the queue from elems until either the 79 | queue is full or all have been copied. 80 | Returns the number of elements copied into the queue. 81 | 82 | template 83 | bool emplace(Args&&... args); 84 | Tries to insert an object of type T constructed from args into the 85 | queue. Returns true if successful, false otherwise. 86 | 87 | template 88 | size_type emplace_n(size_type count, Args&&... args); 89 | Tries to insert count objects of type T constructed from args into 90 | the queue. Returns the number of objects successfully inserted. 91 | 92 | template 93 | bool produce(Callable&& f); 94 | Tries to insert an object into the queue by calling Callable if there is 95 | space for an object. Returns true if there was space for an object, and 96 | Callable returned true. Returns false otherwise. 97 | Callable is an invocable with one parameter of type void*, and a return 98 | type of bool. Callable is expected to place a new object of type T at 99 | the address passed to it. 100 | 101 | template 102 | size_type produce_n(size_type count, Callable&& f); 103 | Tries to insert count objects into the queue by calling Callable as long 104 | as there is space in the queue, or until Callable returns false once. 105 | Returns the number of times Callable was invoked and returned true. 106 | Callable is an invocable with one parameter of type void*, and a return 107 | type of bool. Callable is expected to place a new object of type T at 108 | the address passed to it. 109 | 110 | Dequeue: 111 | const T* front() const; 112 | T* front(); 113 | Returns a pointer to the next object in the queue, if such an object 114 | exists. Returns nullptr if the queue is empty. 115 | 116 | void discard(); 117 | Removes the next object from the queue. This function must not be called 118 | if the queue is empty. 119 | 120 | bool pop(T& out); 121 | Tries to move the next object in the queue into out, if such an object 122 | exists. Returns true if out contains a new object. Returns false if the 123 | queue was empty. 124 | 125 | template 126 | size_type read(Iterator beg, Iterator end) 127 | Tries to move elements out of the queue to [beg .. end), until either 128 | all have been moved or the queue is empty. 
129 | Returns the number of elements that were moved. 130 | 131 | template 132 | size_type read(size_type count, Iterator elems) 133 | Tries to move elements out of the queue to [elems .. elems + count), 134 | until either count elements have been moved, or the queue is empty. 135 | Returns the number of elements that were moved. 136 | 137 | template 138 | bool consume(Callable&& f); 139 | Tries to remove an object from the queue by calling Callable and passing 140 | the object to it. Returns true if there was an object in the queue and 141 | Callable returned true. Returns false otherwise. 142 | Callable is an invocable with one parameter of type T*, and a return 143 | type of bool. 144 | 145 | template 146 | size_type consume_all(Callable&& f); 147 | Tries to remove all objects from the queue by calling Callable for each 148 | object, passing the address of each object to it, until either the queue 149 | is empty, or Callable returns false. Returns the number of times 150 | Callable was invoked and returned true. 151 | Callable is an invocable with one parameter of type T*, and a return 152 | type of bool. 153 | 154 | ******************************************************************************/ 155 | #pragma once 156 | 157 | #include // for std::copy_n 158 | #include // for std::array 159 | #include // for std::atomic and std::atomic_thread_fence 160 | #include // for std::byte 161 | #include // for std::invoke 162 | #include // for std::iterator_traits 163 | #include // for std::launder and placement-new operator 164 | #include // for std::forward, std::is_invocable_r, and 165 | // std::is_constructible 166 | 167 | namespace deaod { 168 | 169 | namespace detail { 170 | 171 | template 172 | using if_t = typename std::conditional::type; 173 | 174 | #if __cplusplus >= 201703L || __cpp_lib_byte > 0 175 | using std::byte; 176 | #else 177 | using byte = unsigned char; 178 | #endif 179 | 180 | #if __cplusplus >= 201703L || __cpp_lib_void_t > 0 181 | using std::void_t; 182 | #else 183 | 184 | template 185 | struct make_void { 186 | using type = void; 187 | }; 188 | 189 | template 190 | using void_t = typename make_void::type; 191 | 192 | #endif 193 | 194 | #if __cplusplus >= 201703L || __cpp_lib_launder > 0 195 | using std::launder; 196 | #else 197 | template 198 | constexpr T* launder(T* p) noexcept { 199 | static_assert( 200 | std::is_function::value == false && std::is_void::value == false, 201 | "launder is invalid for function pointers and pointers to cv void" 202 | ); 203 | return p; 204 | } 205 | #endif 206 | 207 | template 208 | struct is_reference_wrapper : std::false_type {}; 209 | 210 | template 211 | struct is_reference_wrapper> : std::true_type{}; 212 | 213 | #if __cplusplus >= 201703L || __cpp_lib_invoke > 0 214 | 215 | using std::invoke; 216 | 217 | #else 218 | 219 | struct fp_with_inst_ptr {}; 220 | struct fp_with_inst_val {}; 221 | struct fp_with_ref_wrap {}; 222 | 223 | struct dp_with_inst_ptr {}; 224 | struct dp_with_inst_val {}; 225 | struct dp_with_ref_wrap {}; 226 | 227 | template 228 | struct invoke_traits { 229 | using result_type = 230 | decltype(std::declval()(std::declval()...)); 231 | }; 232 | 233 | template 234 | struct invoke_traits { 235 | private: 236 | constexpr static bool _is_mem_func = 237 | std::is_member_function_pointer::value; 238 | constexpr static bool _is_a1_a_ptr = 239 | std::is_base_of::type>::value == false; 240 | constexpr static bool _is_a1_a_ref_wrap = is_reference_wrapper::value; 241 | 242 | public: 243 | using tag_type = if_t<_is_mem_func, 
244 | if_t<_is_a1_a_ptr, fp_with_inst_ptr, 245 | if_t<_is_a1_a_ref_wrap, fp_with_ref_wrap, 246 | /* else */ fp_with_inst_val>>, 247 | /* else */ 248 | if_t<_is_a1_a_ptr, dp_with_inst_ptr, 249 | if_t<_is_a1_a_ref_wrap, dp_with_ref_wrap, 250 | /* else */ dp_with_inst_val>> 251 | >; 252 | 253 | using result_type = decltype(invoke( 254 | std::declval(), 255 | std::declval(), 256 | std::declval(), 257 | std::declval()... 258 | )); 259 | }; 260 | 261 | template 262 | auto invoke(Callable&& f, Args&& ... args) 263 | -> decltype(std::forward(f)(std::forward(args)...)) { 264 | return std::forward(f)(std::forward(args)...); 265 | } 266 | 267 | 268 | 269 | template 270 | auto invoke(fp_with_inst_ptr, Type T::* f, A1&& a1, Args&& ... args) 271 | -> decltype((*std::forward(a1).*f)(std::forward(args)...)) { 272 | return (*std::forward(a1).*f)(std::forward(args)...); 273 | } 274 | 275 | template 276 | auto invoke(fp_with_inst_val, Type T::* f, A1&& a1, Args&& ... args) 277 | -> decltype((std::forward(a1).*f)(std::forward(args)...)) { 278 | return (std::forward(a1).*f)(std::forward(args)...); 279 | } 280 | 281 | template 282 | auto invoke(fp_with_ref_wrap, Type T::* f, A1&& a1, Args&& ... args) 283 | -> decltype((a1.get().*f)(std::forward(args)...)) { 284 | return (a1.get().*f)(std::forward(args)...); 285 | } 286 | 287 | template 288 | auto invoke(dp_with_inst_ptr, Type T::* f, A1&& a1, Args&& ...) 289 | -> typename std::decay::type { 290 | static_assert(sizeof...(Args) == 0, 291 | "invoke on data member pointer must not provide arguments other than " 292 | "instance pointer"); 293 | return *std::forward(a1).*f; 294 | } 295 | 296 | template 297 | auto invoke(dp_with_inst_val, Type T::* f, A1&& a1, Args&& ...) 298 | -> typename std::decay::type { 299 | static_assert(sizeof...(Args) == 0, 300 | "invoke on data member pointer must not provide arguments other than " 301 | "instance pointer"); 302 | return std::forward(a1).*f; 303 | } 304 | 305 | template 306 | auto invoke(dp_with_ref_wrap, Type T::* f, A1&& a1, Args&& ...) 307 | -> typename std::decay::type { 308 | static_assert(sizeof...(Args) == 0, 309 | "invoke on data member pointer must not provide arguments other than " 310 | "instance pointer"); 311 | return (a1.get().*f); 312 | } 313 | 314 | template 315 | auto invoke(Type T::* f, A1&& a1, Args&& ... args) 316 | -> typename invoke_traits< 317 | decltype(f), 318 | decltype(a1), 319 | decltype(args)... 320 | >::result_type { 321 | typename invoke_traits< 322 | decltype(f), 323 | decltype(a1), 324 | decltype(args)... 
325 | >::tag_type tag; 326 | 327 | return invoke(tag, f, std::forward(a1), std::forward(args)...); 328 | } 329 | 330 | #endif 331 | 332 | #if __cplusplus >= 201703L || __cpp_lib_is_invocable > 0 333 | 334 | using std::is_invocable; 335 | using std::is_invocable_r; 336 | 337 | #elif __has_include() 338 | 339 | #include 340 | using boost::callable_traits::is_invocable; 341 | using boost::callable_traits::is_invocable_r; 342 | 343 | #else 344 | 345 | // Dummy implementation because these are not used for correctness, 346 | // only for better error messages 347 | template 348 | struct is_invocable : std::true_type {}; 349 | template 350 | struct is_invocable_r : std::true_type {}; 351 | 352 | #endif 353 | 354 | template 355 | struct scope_guard { 356 | scope_guard(Callable&& f) : _f(std::forward(f)) {} 357 | ~scope_guard() { 358 | if (should_call()) { 359 | _f(); 360 | } 361 | }; 362 | 363 | scope_guard(const scope_guard&) = delete; 364 | scope_guard& operator=(const scope_guard&) = delete; 365 | 366 | #if __cplusplus >= 201703L || __cpp_guaranteed_copy_elision > 0 367 | 368 | private: 369 | bool should_call() const { 370 | return true; 371 | } 372 | 373 | #else 374 | 375 | scope_guard(scope_guard&& other) : _f(std::move(other._f)) { 376 | other._ignore = true; 377 | } 378 | 379 | scope_guard& operator=(scope_guard&& other) { 380 | _ignore = false; 381 | _f = std::move(other._f); 382 | 383 | other._ignore = true; 384 | } 385 | 386 | private: 387 | bool _ignore = false; 388 | bool should_call() const { 389 | return _ignore == false; 390 | } 391 | 392 | #endif 393 | Callable _f; 394 | }; 395 | 396 | template 397 | scope_guard make_scope_guard(Callable&& f) { 398 | return scope_guard(std::forward(f)); 399 | } 400 | 401 | } // namespace detail 402 | 403 | template 404 | struct alignas((size_t)1 << align_log2) spsc_queue { // gcc bug 89683 405 | using value_type = T; 406 | using size_type = size_t; 407 | 408 | static const auto size = queue_size; 409 | static const auto align = size_t(1) << align_log2; 410 | 411 | static_assert( 412 | alignof(T) <= align, 413 | "Type T must not be more aligned than this queue" 414 | ); 415 | 416 | spsc_queue() = default; 417 | 418 | ~spsc_queue() { 419 | std::atomic_thread_fence(std::memory_order_seq_cst); 420 | consume_all([](T*) { return true; }); 421 | } 422 | 423 | spsc_queue(const spsc_queue& other) { 424 | auto tail = 0; 425 | 426 | auto g = detail::make_scope_guard([&, this] { 427 | tail_cache = tail; 428 | _tail.store(tail); 429 | }); 430 | 431 | auto src_tail = other._tail.load(); 432 | auto src_head = other._head.load(); 433 | 434 | while (src_head != src_tail) { 435 | new(_buffer.data() + tail * sizeof(T)) 436 | T(*detail::launder(reinterpret_cast( 437 | other._buffer.data() + src_head * sizeof(T) 438 | ))); 439 | 440 | tail += 1; 441 | src_head += 1; 442 | if (src_head == size) src_head = 0; 443 | } 444 | } 445 | 446 | spsc_queue& operator=(const spsc_queue& other) { 447 | if (this == &other) return *this; 448 | 449 | { 450 | auto head = _head.load(); 451 | auto tail = _tail.load(); 452 | 453 | auto g = detail::make_scope_guard([&, this] { 454 | head_cache = head; 455 | _head.store(head); 456 | }); 457 | 458 | while (head != tail) { 459 | auto elem = detail::launder( 460 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 461 | ); 462 | elem->~T(); 463 | 464 | head += 1; 465 | if (head == size) head = 0; 466 | } 467 | } 468 | 469 | _tail.store(0); 470 | head_cache = 0; 471 | _head.store(0); 472 | tail_cache = 0; 473 | 474 | { 475 | auto tail = 0; 476 
| 477 | auto g = detail::make_scope_guard([&, this] { 478 | tail_cache = tail; 479 | _tail.store(tail); 480 | }); 481 | 482 | auto src_tail = other._tail.load(); 483 | auto src_head = other._head.load(); 484 | 485 | while (src_head != src_tail) { 486 | new(_buffer.data() + tail * sizeof(T)) 487 | T(*detail::launder(reinterpret_cast( 488 | other._buffer.data() + src_head * sizeof(T) 489 | ))); 490 | 491 | tail += 1; 492 | src_head += 1; 493 | if (src_head == size) src_head = 0; 494 | } 495 | } 496 | 497 | return *this; 498 | } 499 | 500 | bool is_empty() const { 501 | auto head = _head.load(std::memory_order_acquire); 502 | auto tail = _tail.load(std::memory_order_acquire); 503 | 504 | return head == tail; 505 | } 506 | 507 | bool is_full() const { 508 | auto head = _head.load(std::memory_order_acquire); 509 | auto tail = _tail.load(std::memory_order_acquire) + 1; 510 | if (tail == size) tail = 0; 511 | 512 | return head == tail; 513 | } 514 | 515 | // copies elem into queue, if theres space 516 | // returns true if successful, false otherwise 517 | bool push(const T& elem) { 518 | return this->emplace(elem); 519 | } 520 | 521 | // tries to move elem into queue, if theres space 522 | // returns true if successful, false otherwise 523 | bool push(T&& elem) { 524 | return this->emplace(std::move(elem)); 525 | } 526 | 527 | // tries to copy count elements into the queue 528 | // returns the number of elements that actually got copied 529 | size_type push_n(size_type count, const T& elem) { 530 | return this->emplace_n(count, elem); 531 | } 532 | 533 | 534 | // copies elements into queue until end is reached or queue is full, 535 | // whichever happens first 536 | // returns the number of elements copied into the queue 537 | template 538 | size_type write(Iterator beg, Iterator end) { 539 | static_assert( 540 | std::is_constructible::value, 541 | "T must be constructible from Iterator::reference" 542 | ); 543 | 544 | using traits = std::iterator_traits; 545 | 546 | constexpr bool is_random_access = std::is_same< 547 | typename traits::iterator_category, 548 | std::random_access_iterator_tag 549 | >::value; 550 | 551 | // std::contiguous_iterator_tag is a feature of C++20, so try to be 552 | // compatible with it. Fall back on an approximate implementation for 553 | // C++17 or earlier. The value to compare against was chosen such that 554 | // compilers that implement some features of future standards and 555 | // indicate that using the value of __cplusplus dont accidentally fall 556 | // into the requirement to implement std::contiguous_iterator_tag. 
557 | #if __cplusplus > 202000L 558 | constexpr bool is_contiguous = std::is_same< 559 | typename traits::iterator_category, 560 | std::contiguous_iterator_tag 561 | >::value; 562 | #else 563 | constexpr bool is_contiguous = std::is_pointer::value; 564 | #endif 565 | 566 | readwrite_tag< 567 | is_random_access || is_contiguous, 568 | std::is_trivially_constructible::value 569 | > tag; 570 | 571 | return this->write_fwd(tag, beg, end); 572 | } 573 | 574 | // copies elements into queue until count elements have been copied or 575 | // queue is full, whichever happens first 576 | // returns the number of elements copied into queue 577 | template 578 | size_type write(size_type count, Iterator elems) { 579 | static_assert( 580 | std::is_constructible::value, 581 | "T must be constructible from Iterator::reference" 582 | ); 583 | 584 | readwrite_tag< 585 | true, 586 | std::is_trivially_constructible::value 587 | > tag; 588 | 589 | return this->write_fwd(tag, count, elems); 590 | } 591 | 592 | private: 593 | template 594 | struct readwrite_tag {}; 595 | 596 | template 597 | size_type write_fwd( 598 | readwrite_tag, 599 | Iterator beg, 600 | Iterator end) 601 | { 602 | return this->write_internal(beg, end); 603 | } 604 | 605 | template 606 | size_type write_fwd( 607 | readwrite_tag, 608 | Iterator beg, 609 | Iterator end) 610 | { 611 | return this->write_internal(beg, end); 612 | } 613 | 614 | template 615 | size_type write_fwd( 616 | readwrite_tag, 617 | Iterator beg, 618 | Iterator end) 619 | { 620 | return this->write_copy(end - beg, beg); 621 | } 622 | 623 | template 624 | size_type write_fwd( 625 | readwrite_tag, 626 | Iterator beg, 627 | Iterator end) 628 | { 629 | return this->write_trivial(end - beg, beg); 630 | } 631 | 632 | template 633 | size_type write_fwd( 634 | readwrite_tag, 635 | size_type count, 636 | Iterator elems) 637 | { 638 | return this->write_copy(count, elems); 639 | } 640 | 641 | template 642 | size_type write_fwd( 643 | readwrite_tag, 644 | size_type count, 645 | Iterator elems) 646 | { 647 | return this->write_trivial(count, elems); 648 | } 649 | 650 | template 651 | size_type write_trivial(size_type count, Iterator elems) { 652 | auto tail = _tail.load(std::memory_order_relaxed); 653 | auto head = head_cache; 654 | auto free = size - (tail - head); 655 | if (free > size) free -= size; 656 | 657 | if (count >= free) { 658 | head = head_cache = _head.load(std::memory_order_acquire); 659 | free = size - (tail - head); 660 | if (free > size) free -= size; 661 | 662 | if (count >= free) { 663 | count = free - 1; 664 | } 665 | } 666 | 667 | auto next = tail + count; 668 | if (next >= size) { 669 | next -= size; 670 | auto split_pos = count - next; 671 | std::copy_n( 672 | elems, 673 | split_pos, 674 | reinterpret_cast(_buffer.data() + tail * sizeof(T)) 675 | ); 676 | std::copy_n( 677 | elems + split_pos, 678 | next, 679 | reinterpret_cast(_buffer.data()) 680 | ); 681 | } else { 682 | std::copy_n( 683 | elems, 684 | count, 685 | reinterpret_cast(_buffer.data() + tail * sizeof(T)) 686 | ); 687 | } 688 | 689 | _tail.store(next, std::memory_order_release); 690 | return count; 691 | } 692 | 693 | template 694 | size_type write_copy(size_type count, Iterator elems) { 695 | auto tail = _tail.load(std::memory_order_relaxed); 696 | auto head = head_cache; 697 | auto free = size - (tail - head); 698 | if (free > size) free -= size; 699 | 700 | if (count >= free) { 701 | head = head_cache = _head.load(std::memory_order_acquire); 702 | free = size - (tail - head); 703 | if (free > 
size) free -= size; 704 | 705 | if (count >= free) { 706 | count = free - 1; 707 | } 708 | } 709 | 710 | auto next = tail + count; 711 | if (next >= size) next -= size; 712 | 713 | auto g = detail::make_scope_guard([&, this] { 714 | _tail.store(tail, std::memory_order_release); 715 | }); 716 | 717 | while (tail != next) { 718 | new(_buffer.data() + tail * sizeof(T)) T(*elems); 719 | 720 | ++elems; 721 | tail += 1; 722 | if (tail == size) tail = 0; 723 | } 724 | 725 | return count; 726 | } 727 | 728 | template 729 | size_type write_internal(Iterator beg, Iterator end) { 730 | auto tail = _tail.load(std::memory_order_relaxed); 731 | 732 | auto g = detail::make_scope_guard([&, this] { 733 | _tail.store(tail, std::memory_order_release); 734 | }); 735 | 736 | auto count = size_type(0); 737 | for (; beg != end; ++beg) { 738 | auto next = tail + 1; 739 | if (next == size) next = 0; 740 | 741 | auto head = head_cache; 742 | if (next == head) { 743 | head = head_cache = _head.load(std::memory_order_acquire); 744 | if (next == head) { 745 | break; 746 | } 747 | } 748 | 749 | new(_buffer.data() + tail * sizeof(T)) T(*beg); 750 | tail = next; 751 | count += 1; 752 | } 753 | 754 | return count; 755 | } 756 | 757 | public: 758 | // constructs an element of type T in place using Args 759 | // returns true if successful, false otherwise 760 | template 761 | bool emplace(Args&&... args) { 762 | static_assert( 763 | std::is_constructible::value, 764 | "Type T must be constructible from Args..." 765 | ); 766 | 767 | auto tail = _tail.load(std::memory_order_relaxed); 768 | auto next = tail + 1; 769 | if (next == size) next = 0; 770 | 771 | auto head = head_cache; 772 | if (next == head) { 773 | head = head_cache = _head.load(std::memory_order_acquire); 774 | if (next == head) { 775 | return false; 776 | } 777 | } 778 | 779 | new(_buffer.data() + tail * sizeof(T)) T{ std::forward(args)... }; 780 | 781 | _tail.store(next, std::memory_order_release); 782 | return true; 783 | } 784 | 785 | // tries to construct count elements of type T in place using Args 786 | // returns the number of elements that got constructed 787 | template 788 | size_type emplace_n(size_type count, Args&&... args) { 789 | static_assert( 790 | std::is_constructible::value, 791 | "Type T must be constructible from Args..." 792 | ); 793 | 794 | auto tail = _tail.load(std::memory_order_relaxed); 795 | auto head = head_cache; 796 | auto free = size - (tail - head); 797 | if (free > size) free -= size; 798 | 799 | if (count >= free) { 800 | head = head_cache = _head.load(std::memory_order_acquire); 801 | free = size - (tail - head); 802 | if (free > size) free -= size; 803 | 804 | if (count >= free) { 805 | count = free - 1; 806 | } 807 | } 808 | 809 | auto next = tail + count; 810 | if (next >= size) next -= size; 811 | 812 | auto g = detail::make_scope_guard([&, this] { 813 | _tail.store(tail, std::memory_order_release); 814 | }); 815 | 816 | while (tail != next) { 817 | new(_buffer.data() + tail * sizeof(T)) T{ args... }; 818 | 819 | tail += 1; 820 | if (tail == size) tail = 0; 821 | } 822 | 823 | return count; 824 | } 825 | 826 | // Callable is an invocable that takes void* and returns bool 827 | // Callable must use placement new to construct an object of type T at the 828 | // pointer passed to it. If it cannot do so, it must return false. If it 829 | // returns false, an object of type T must not have been constructed. 
830 | // 831 | // This function returns true if there was space for at least one element, 832 | // and Callable returned true. Otherwise, false will be returned. 833 | template 834 | bool produce(Callable&& f) { 835 | static_assert( 836 | detail::is_invocable_r::value, 837 | "Callable must return bool, and take void*" 838 | ); 839 | 840 | auto tail = _tail.load(std::memory_order_relaxed); 841 | auto next = tail + 1; 842 | if (next == size) next = 0; 843 | 844 | auto head = head_cache; 845 | if (next == head) { 846 | head = head_cache = _head.load(std::memory_order_acquire); 847 | if (next == head) { 848 | return false; 849 | } 850 | } 851 | 852 | void* storage = _buffer.data() + tail * sizeof(T); 853 | if (detail::invoke(std::forward(f), storage)) { 854 | _tail.store(next, std::memory_order_release); 855 | return true; 856 | } 857 | 858 | return false; 859 | } 860 | 861 | // Callable is an invocable that takes void* and returns bool 862 | // Callable must use placement new to construct an object of type T at the 863 | // pointer passed to it. If it cannot do so, it must return false. If it 864 | // returns false, an object of type T must not have been constructed. 865 | // 866 | // This function tries to construct count elements by calling Callable for 867 | // each address where an object can be constructed. This function returns 868 | // the number of elements that were successfully constructed, that is the 869 | // number of times Callable returned true. 870 | template 871 | size_type produce_n(size_type count, Callable&& f) { 872 | static_assert( 873 | detail::is_invocable_r::value, 874 | "Callable must return bool, and take void*" 875 | ); 876 | 877 | auto tail = _tail.load(std::memory_order_relaxed); 878 | auto head = head_cache; 879 | auto free = size - (tail - head); 880 | if (free > size) free -= size; 881 | 882 | if (count >= free) { 883 | head = head_cache = _head.load(std::memory_order_acquire); 884 | free = size - (tail - head); 885 | if (free > size) free -= size; 886 | 887 | if (count >= free) { 888 | count = free - 1; 889 | } 890 | } 891 | 892 | auto next = tail + count; 893 | if (next >= size) next -= size; 894 | 895 | auto g = detail::make_scope_guard([&, this] { 896 | _tail.store(tail, std::memory_order_release); 897 | }); 898 | 899 | while (tail != next) { 900 | void* storage = _buffer.data() + tail * sizeof(T); 901 | if (!detail::invoke(f, storage)) { 902 | auto ret = next - tail; 903 | if (ret < 0) ret += size; 904 | return ret; 905 | } 906 | 907 | tail += 1; 908 | if (tail == size) tail = 0; 909 | } 910 | 911 | return count; 912 | } 913 | 914 | // Returns a pointer to the next element that can be dequeued, or nullptr 915 | // if the queue is empty. 916 | const T* front() const { 917 | auto head = _head.load(std::memory_order_relaxed); 918 | auto tail = tail_cache; 919 | 920 | if (head == tail) { 921 | tail = tail_cache = _tail.load(std::memory_order_acquire); 922 | if (head == tail) { 923 | return nullptr; 924 | } 925 | } 926 | 927 | return detail::launder( 928 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 929 | ); 930 | } 931 | 932 | // Returns a pointer to the next element that can be dequeued, or nullptr 933 | // if the queue is empty. 
934 | T* front() { 935 | auto head = _head.load(std::memory_order_relaxed); 936 | auto tail = tail_cache; 937 | 938 | if (head == tail) { 939 | tail = tail_cache = _tail.load(std::memory_order_acquire); 940 | if (head == tail) { 941 | return nullptr; 942 | } 943 | } 944 | 945 | return detail::launder( 946 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 947 | ); 948 | } 949 | 950 | // Discards the next element to be dequeued. The queue must contain at 951 | // least one element before calling this function. 952 | void discard() { 953 | auto head = _head.load(std::memory_order_relaxed); 954 | 955 | auto elem = detail::launder( 956 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 957 | ); 958 | elem->~T(); 959 | 960 | auto next = head + 1; 961 | if (next == size) next = 0; 962 | _head.store(next, std::memory_order_release); 963 | } 964 | 965 | // tries to move the next element to be dequeued into out. 966 | // Returns true if out was assigned to, false otherwise. 967 | bool pop(T& out) { 968 | auto head = _head.load(std::memory_order_relaxed); 969 | auto tail = tail_cache; 970 | 971 | if (head == tail) { 972 | tail = tail_cache = _tail.load(std::memory_order_acquire); 973 | if (head == tail) { 974 | return false; 975 | } 976 | } 977 | 978 | auto elem = detail::launder( 979 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 980 | ); 981 | 982 | out = std::move(*elem); 983 | elem->~T(); 984 | 985 | auto next = head + 1; 986 | if (next == size) next = 0; 987 | _head.store(next, std::memory_order_release); 988 | return true; 989 | } 990 | 991 | // tries to move elements to [beg .. end), or until the queue is empty 992 | // returns the number of elements moved 993 | template 994 | size_type read(Iterator beg, Iterator end) { 995 | static_assert( 996 | std::is_assignable::value, 997 | "You must be able to assign T&& to Iterator::reference" 998 | ); 999 | 1000 | using traits = std::iterator_traits; 1001 | 1002 | constexpr bool is_random_access = std::is_same< 1003 | typename traits::iterator_category, 1004 | std::random_access_iterator_tag 1005 | >::value; 1006 | 1007 | // std::contiguous_iterator_tag is a feature of C++20, so try to be 1008 | // compatible with it. Fall back on an approximate implementation for 1009 | // C++17 or earlier. The value to compare against was chosen such that 1010 | // compilers that implement some features of future standards and 1011 | // indicate that using the value of __cplusplus dont accidentally fall 1012 | // into the requirement to implement std::contiguous_iterator_tag. 1013 | #if __cplusplus > 202000L 1014 | constexpr bool is_contiguous = std::is_same< 1015 | typename traits::iterator_category, 1016 | std::contiguous_iterator_tag 1017 | >::value; 1018 | #else 1019 | constexpr bool is_contiguous = std::is_pointer::value; 1020 | #endif 1021 | 1022 | readwrite_tag< 1023 | is_random_access || is_contiguous, 1024 | std::is_trivially_constructible::value 1025 | > tag; 1026 | 1027 | return this->read_fwd(tag, beg, end); 1028 | } 1029 | 1030 | // tries to move elements to [elems .. 
elems + count) or until the queue is 1031 | // empty 1032 | // returns the number of elements moved 1033 | template 1034 | size_type read(size_type count, Iterator elems) { 1035 | static_assert( 1036 | std::is_assignable::value, 1037 | "You must be able to assign T&& to Iterator::reference" 1038 | ); 1039 | 1040 | readwrite_tag< 1041 | true, 1042 | std::is_trivially_constructible::value 1043 | > tag; 1044 | 1045 | return this->read_fwd(tag, count, elems); 1046 | } 1047 | 1048 | private: 1049 | template 1050 | size_type read_fwd( 1051 | readwrite_tag, 1052 | Iterator beg, 1053 | Iterator end) 1054 | { 1055 | return this->read_internal(beg, end); 1056 | } 1057 | 1058 | template 1059 | size_type read_fwd( 1060 | readwrite_tag, 1061 | Iterator beg, 1062 | Iterator end) 1063 | { 1064 | return this->read_internal(beg, end); 1065 | } 1066 | 1067 | template 1068 | size_type read_fwd( 1069 | readwrite_tag, 1070 | Iterator beg, 1071 | Iterator end) 1072 | { 1073 | return this->read_copy(end - beg, beg); 1074 | } 1075 | 1076 | template 1077 | size_type read_fwd( 1078 | readwrite_tag, 1079 | Iterator beg, 1080 | Iterator end) 1081 | { 1082 | return this->read_trivial(end - beg, beg); 1083 | } 1084 | 1085 | template 1086 | size_type read_fwd( 1087 | readwrite_tag, 1088 | size_type count, 1089 | Iterator elems) 1090 | { 1091 | return this->read_copy(count, elems); 1092 | } 1093 | 1094 | template 1095 | size_type read_fwd( 1096 | readwrite_tag, 1097 | size_type count, 1098 | Iterator elems) 1099 | { 1100 | return this->read_trivial(count, elems); 1101 | } 1102 | 1103 | template 1104 | size_type read_trivial(size_type count, Iterator elems) { 1105 | auto head = _head.load(std::memory_order_relaxed); 1106 | auto tail = tail_cache; 1107 | auto filled = (tail - head); 1108 | if (filled > size) filled += size; 1109 | 1110 | if (count >= filled) { 1111 | tail = tail_cache = _tail.load(std::memory_order_acquire); 1112 | filled = (tail - head); 1113 | if (filled > size) filled += size; 1114 | 1115 | if (count >= filled) { 1116 | count = filled; 1117 | } 1118 | } 1119 | 1120 | auto next = head + count; 1121 | if (next >= size) { 1122 | next -= size; 1123 | auto split_pos = count - next; 1124 | std::copy_n( 1125 | elems, 1126 | split_pos, 1127 | detail::launder( 1128 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 1129 | ) 1130 | ); 1131 | std::copy_n( 1132 | elems + split_pos, 1133 | next, 1134 | detail::launder(reinterpret_cast(_buffer.data())) 1135 | ); 1136 | } else { 1137 | std::copy_n( 1138 | elems, 1139 | count, 1140 | detail::launder( 1141 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 1142 | ) 1143 | ); 1144 | } 1145 | 1146 | _head.store(next, std::memory_order_release); 1147 | return count; 1148 | } 1149 | 1150 | template 1151 | size_type read_copy(size_type count, Iterator elems) { 1152 | auto head = _head.load(std::memory_order_relaxed); 1153 | auto tail = tail_cache; 1154 | auto filled = (tail - head); 1155 | if (filled > size) filled += size; 1156 | 1157 | if (count >= filled) { 1158 | tail = tail_cache = _tail.load(std::memory_order_acquire); 1159 | filled = (tail - head); 1160 | if (filled > size) filled += size; 1161 | 1162 | if (count >= filled) { 1163 | count = filled; 1164 | } 1165 | } 1166 | 1167 | auto next = head + count; 1168 | if (next >= size) next -= size; 1169 | 1170 | auto g = detail::make_scope_guard([&, this] { 1171 | _head.store(head, std::memory_order_release); 1172 | }); 1173 | 1174 | while (head != next) { 1175 | auto elem = detail::launder( 1176 | 
reinterpret_cast(_buffer.data() + head * sizeof(T)) 1177 | ); 1178 | 1179 | *elems = std::move(elem); 1180 | elem->~T(); 1181 | 1182 | head += 1; 1183 | if (head == size) head = 0; 1184 | } 1185 | 1186 | return count; 1187 | } 1188 | 1189 | template 1190 | size_type read_internal(Iterator beg, Iterator end) { 1191 | auto head = _head.load(std::memory_order_relaxed); 1192 | 1193 | auto g = detail::make_scope_guard([&, this] { 1194 | _head.store(head, std::memory_order_release); 1195 | }); 1196 | 1197 | auto count = size_type(0); 1198 | for (; beg != end; ++beg) { 1199 | auto tail = tail_cache; 1200 | if (head == tail) { 1201 | tail = tail_cache = _tail.load(std::memory_order_acquire); 1202 | if (head == tail) { 1203 | break; 1204 | } 1205 | } 1206 | 1207 | auto elem = detail::launder( 1208 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 1209 | ); 1210 | 1211 | *beg = std::move(elem); 1212 | elem->~T(); 1213 | 1214 | head += 1; 1215 | if (head == size) head = 0; 1216 | count += 1; 1217 | } 1218 | 1219 | return count; 1220 | } 1221 | 1222 | public: 1223 | // Callable is an invocable that takes T* and returns bool 1224 | // 1225 | // This function calls Callable with the address of the next element to be 1226 | // dequeued, if the queue is not empty. If Callable returns true, the 1227 | // element is removed from the queue and this function returns true. 1228 | // Otherwise this function returns false. 1229 | template 1230 | bool consume(Callable&& f) { 1231 | static_assert( 1232 | detail::is_invocable_r::value, 1233 | "Callable must return bool, and take T*" 1234 | ); 1235 | 1236 | auto head = _head.load(std::memory_order_relaxed); 1237 | auto tail = tail_cache; 1238 | 1239 | if (head == tail) { 1240 | tail = tail_cache = _tail.load(std::memory_order_acquire); 1241 | if (head == tail) { 1242 | return false; 1243 | } 1244 | } 1245 | 1246 | auto elem = detail::launder( 1247 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 1248 | ); 1249 | 1250 | if (detail::invoke(std::forward(f), elem)) { 1251 | elem->~T(); 1252 | auto next = head + 1; 1253 | if (next == size) next = 0; 1254 | _head.store(next, std::memory_order_release); 1255 | return true; 1256 | } 1257 | 1258 | return false; 1259 | } 1260 | 1261 | // Callable is an invocable that takes T* and returns bool 1262 | // 1263 | // This function calls Callable for each element currently in the queue, 1264 | // with the address of that element. If Callable returns true, the element 1265 | // is removed from the queue. If Callable returns false, the element is not 1266 | // removed, and this function returns. This function always returns the 1267 | // number of times Callable returned true. 
1268 | template 1269 | size_type consume_all(Callable&& f) { 1270 | static_assert( 1271 | detail::is_invocable_r::value, 1272 | "Callable must return bool, and take T*" 1273 | ); 1274 | 1275 | auto head = _head.load(std::memory_order_relaxed); 1276 | auto tail = tail_cache = _tail.load(std::memory_order_acquire); 1277 | auto old_head = head; 1278 | 1279 | auto g = detail::make_scope_guard([&, this] { 1280 | _head.store(head, std::memory_order_release); 1281 | }); 1282 | 1283 | while (head != tail) { 1284 | auto elem = detail::launder( 1285 | reinterpret_cast(_buffer.data() + head * sizeof(T)) 1286 | ); 1287 | 1288 | if (!detail::invoke(f, elem)) { 1289 | break; 1290 | } 1291 | 1292 | elem->~T(); 1293 | head += 1; 1294 | if (head == size) head = 0; 1295 | } 1296 | 1297 | ptrdiff_t ret = head - old_head; 1298 | if (ret < 0) ret += size; 1299 | return ret; 1300 | } 1301 | 1302 | private: 1303 | alignas(align) std::array _buffer; 1304 | 1305 | alignas(align) std::atomic _tail{ 0 }; 1306 | mutable size_t head_cache{ 0 }; 1307 | 1308 | alignas(align) std::atomic _head{ 0 }; 1309 | mutable size_t tail_cache{ 0 }; 1310 | }; 1311 | 1312 | } // namespace deaod 1313 | -------------------------------------------------------------------------------- /dro/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Andrew Drogalis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /dro/spsc-queue.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Andrew Drogalis 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the “Software”), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 
12 | 13 | #ifndef DRO_SPSC_QUEUE 14 | #define DRO_SPSC_QUEUE 15 | 16 | #include // for std::array 17 | #include // for atomic, memory_order 18 | #include // for concept, requires 19 | #include // for size_t 20 | #include // for numeric_limits 21 | #include // for std::hardware_destructive_interference_size 22 | #include // for std::logic_error 23 | #include // for std::is_default_constructible 24 | #include // for forward 25 | #include // for vector, allocator 26 | 27 | namespace dro { 28 | 29 | namespace details { 30 | 31 | #ifdef __cpp_lib_hardware_interference_size 32 | static constexpr std::size_t cacheLineSize = 33 | std::hardware_destructive_interference_size; 34 | #else 35 | static constexpr std::size_t cacheLineSize = 64; 36 | #endif 37 | 38 | static constexpr std::size_t MAX_BYTES_ON_STACK = 2'097'152; // 2 MBs 39 | 40 | template 41 | concept SPSC_Type = 42 | std::is_default_constructible::value && 43 | std::is_nothrow_destructible::value && 44 | (std::is_move_assignable_v || std::is_copy_assignable_v); 45 | 46 | template 47 | concept SPSC_NoThrow_Type = 48 | std::is_nothrow_constructible_v && 49 | ((std::is_nothrow_copy_assignable_v && std::is_copy_assignable_v) || 50 | (std::is_nothrow_move_assignable_v && std::is_move_assignable_v)); 51 | 52 | // Prevents Stack Overflow 53 | template 54 | concept MAX_STACK_SIZE = (N <= (MAX_BYTES_ON_STACK / sizeof(T))); 55 | 56 | // Memory Allocated on the Heap (Default Option) 57 | template > 58 | struct HeapBuffer { 59 | const std::size_t capacity_; 60 | std::vector buffer_; 61 | 62 | static constexpr std::size_t padding = ((cacheLineSize - 1) / sizeof(T)) + 1; 63 | static constexpr std::size_t MAX_SIZE_T = 64 | std::numeric_limits::max(); 65 | 66 | explicit HeapBuffer(const std::size_t capacity, 67 | const Allocator &allocator = Allocator()) 68 | // +1 prevents live lock e.g. reader and writer share 1 slot for size 1 69 | : capacity_(capacity + 1), buffer_(allocator) { 70 | if (capacity < 1) { 71 | throw std::logic_error("Capacity must be a positive number; Heap " 72 | "allocations require capacity argument"); 73 | } 74 | // (2 * padding) is for preventing cache contention between adjacent memory 75 | if (capacity_ > MAX_SIZE_T - (2 * padding)) { 76 | throw std::overflow_error( 77 | "Capacity with padding exceeds std::size_t. Reduce size of queue."); 78 | } 79 | buffer_.resize(capacity_ + (2 * padding)); 80 | } 81 | 82 | ~HeapBuffer() = default; 83 | // Non-Copyable and Non-Movable 84 | HeapBuffer(const HeapBuffer &lhs) = delete; 85 | HeapBuffer &operator=(const HeapBuffer &lhs) = delete; 86 | HeapBuffer(HeapBuffer &&lhs) = delete; 87 | HeapBuffer &operator=(HeapBuffer &&lhs) = delete; 88 | }; 89 | 90 | // Memory Allocated on the Stack 91 | template > 92 | struct StackBuffer { 93 | // +1 prevents live lock e.g. 
reader and writer share 1 slot for size 1 94 | static constexpr std::size_t capacity_{N + 1}; 95 | static constexpr std::size_t padding = ((cacheLineSize - 1) / sizeof(T)) + 1; 96 | // (2 * padding) is for preventing cache contention between adjacent memory 97 | std::array buffer_; 98 | 99 | explicit StackBuffer(const std::size_t capacity, 100 | const Allocator &allocator = Allocator()) { 101 | if (capacity) { 102 | throw std::invalid_argument( 103 | "Capacity in constructor is ignored for stack allocations"); 104 | } 105 | } 106 | 107 | ~StackBuffer() = default; 108 | // Non-Copyable and Non-Movable 109 | StackBuffer(const StackBuffer &lhs) = delete; 110 | StackBuffer &operator=(const StackBuffer &lhs) = delete; 111 | StackBuffer(StackBuffer &&lhs) = delete; 112 | StackBuffer &operator=(StackBuffer &&lhs) = delete; 113 | }; 114 | 115 | } // namespace details 116 | 117 | template > 119 | requires details::MAX_STACK_SIZE 120 | class SPSCQueue 121 | : public std::conditional_t, 122 | details::StackBuffer> { 123 | private: 124 | using base_type = 125 | std::conditional_t, 126 | details::StackBuffer>; 127 | static constexpr bool nothrow_v = details::SPSC_NoThrow_Type; 128 | 129 | struct alignas(details::cacheLineSize) WriterCacheLine { 130 | std::atomic writeIndex_{0}; 131 | std::size_t readIndexCache_{0}; 132 | // Reduces cache contention on very small queues 133 | const size_t paddingCache_ = base_type::padding; 134 | } writer_; 135 | 136 | struct alignas(details::cacheLineSize) ReaderCacheLine { 137 | std::atomic readIndex_{0}; 138 | std::size_t writeIndexCache_{0}; 139 | // Reduces cache contention on very small queues 140 | std::size_t capacityCache_{}; 141 | } reader_; 142 | 143 | public: 144 | explicit SPSCQueue(const std::size_t capacity = 0, 145 | const Allocator &allocator = Allocator()) 146 | : base_type(capacity, allocator) { 147 | reader_.capacityCache_ = base_type::capacity_; 148 | } 149 | 150 | ~SPSCQueue() = default; 151 | // Non-Copyable and Non-Movable 152 | SPSCQueue(const SPSCQueue &lhs) = delete; 153 | SPSCQueue &operator=(const SPSCQueue &lhs) = delete; 154 | SPSCQueue(SPSCQueue &&lhs) = delete; 155 | SPSCQueue &operator=(SPSCQueue &&lhs) = delete; 156 | 157 | template 158 | requires std::constructible_from 159 | void 160 | emplace(Args &&...args) noexcept(details::SPSC_NoThrow_Type) { 161 | const auto writeIndex = writer_.writeIndex_.load(std::memory_order_relaxed); 162 | const auto nextWriteIndex = 163 | (writeIndex == base_type::capacity_ - 1) ? 0 : writeIndex + 1; 164 | // Loop while waiting for reader to catch up 165 | while (nextWriteIndex == writer_.readIndexCache_) { 166 | writer_.readIndexCache_ = 167 | reader_.readIndex_.load(std::memory_order_acquire); 168 | } 169 | write_value(writeIndex, std::forward(args)...); 170 | writer_.writeIndex_.store(nextWriteIndex, std::memory_order_release); 171 | } 172 | 173 | template 174 | requires std::constructible_from 175 | void force_emplace(Args &&...args) noexcept( 176 | details::SPSC_NoThrow_Type) { 177 | const auto writeIndex = writer_.writeIndex_.load(std::memory_order_relaxed); 178 | const auto nextWriteIndex = 179 | (writeIndex == base_type::capacity_ - 1) ? 
0 : writeIndex + 1; 180 | write_value(writeIndex, std::forward(args)...); 181 | writer_.writeIndex_.store(nextWriteIndex, std::memory_order_release); 182 | } 183 | 184 | template 185 | requires std::constructible_from 186 | [[nodiscard]] bool try_emplace(Args &&...args) noexcept( 187 | details::SPSC_NoThrow_Type) { 188 | const auto writeIndex = writer_.writeIndex_.load(std::memory_order_relaxed); 189 | const auto nextWriteIndex = 190 | (writeIndex == base_type::capacity_ - 1) ? 0 : writeIndex + 1; 191 | // Check reader cache and if actually equal then fail to write 192 | if (nextWriteIndex == writer_.readIndexCache_) { 193 | writer_.readIndexCache_ = 194 | reader_.readIndex_.load(std::memory_order_acquire); 195 | if (nextWriteIndex == writer_.readIndexCache_) { 196 | return false; 197 | } 198 | } 199 | write_value(writeIndex, std::forward(args)...); 200 | writer_.writeIndex_.store(nextWriteIndex, std::memory_order_release); 201 | return true; 202 | } 203 | 204 | void push(const T &val) noexcept(nothrow_v) { emplace(val); } 205 | 206 | template 207 | requires std::constructible_from 208 | void push(P &&val) noexcept(details::SPSC_NoThrow_Type) { 209 | emplace(std::forward
<P>
(val)); 210 | } 211 | 212 | void force_push(const T &val) noexcept(nothrow_v) { force_emplace(val); } 213 | 214 | template 215 | requires std::constructible_from 216 | void force_push(P &&val) noexcept(details::SPSC_NoThrow_Type) { 217 | force_emplace(std::forward
<P>
(val)); 218 | } 219 | 220 | [[nodiscard]] bool try_push(const T &val) noexcept(nothrow_v) { 221 | return try_emplace(val); 222 | } 223 | 224 | template 225 | requires std::constructible_from 226 | [[nodiscard]] bool 227 | try_push(P &&val) noexcept(details::SPSC_NoThrow_Type) { 228 | return try_emplace(std::forward
<P>
(val)); 229 | } 230 | 231 | void pop(T &val) noexcept(nothrow_v) { 232 | const auto readIndex = reader_.readIndex_.load(std::memory_order_relaxed); 233 | // Loop while waiting for writer to enqueue 234 | while (readIndex == reader_.writeIndexCache_) { 235 | reader_.writeIndexCache_ = 236 | writer_.writeIndex_.load(std::memory_order_acquire); 237 | } 238 | val = read_value(readIndex); 239 | const auto nextReadIndex = 240 | (readIndex == reader_.capacityCache_ - 1) ? 0 : readIndex + 1; 241 | reader_.readIndex_.store(nextReadIndex, std::memory_order_release); 242 | } 243 | 244 | [[nodiscard]] bool try_pop(T &val) noexcept(nothrow_v) { 245 | const auto readIndex = reader_.readIndex_.load(std::memory_order_relaxed); 246 | // Check writer cache and if actually equal then fail to read 247 | if (readIndex == reader_.writeIndexCache_) { 248 | reader_.writeIndexCache_ = 249 | writer_.writeIndex_.load(std::memory_order_acquire); 250 | if (readIndex == reader_.writeIndexCache_) { 251 | return false; 252 | } 253 | } 254 | val = read_value(readIndex); 255 | const auto nextReadIndex = 256 | (readIndex == reader_.capacityCache_ - 1) ? 0 : readIndex + 1; 257 | reader_.readIndex_.store(nextReadIndex, std::memory_order_release); 258 | return true; 259 | } 260 | 261 | [[nodiscard]] std::size_t size() const noexcept { 262 | const auto writeIndex = writer_.writeIndex_.load(std::memory_order_acquire); 263 | const auto readIndex = reader_.readIndex_.load(std::memory_order_acquire); 264 | // This method prevents conversion to std::ptrdiff_t (a signed type) 265 | if (writeIndex >= readIndex) { 266 | return writeIndex - readIndex; 267 | } 268 | return (base_type::capacity_ - readIndex) + writeIndex; 269 | } 270 | 271 | [[nodiscard]] bool empty() const noexcept { 272 | return writer_.writeIndex_.load(std::memory_order_acquire) == 273 | reader_.readIndex_.load(std::memory_order_acquire); 274 | } 275 | 276 | [[nodiscard]] std::size_t capacity() const noexcept { 277 | return base_type::capacity_ - 1; 278 | } 279 | 280 | private: 281 | // Note: The "+ padding" is a constant offset used to prevent false sharing 282 | // with memory in front of the SPSC allocations 283 | T &read_value(const auto &readIndex) noexcept(nothrow_v) 284 | requires std::is_copy_assignable_v && (!std::is_move_assignable_v) 285 | { 286 | return base_type::buffer_[readIndex + base_type::padding]; 287 | } 288 | 289 | T &&read_value(const auto &readIndex) noexcept(nothrow_v) 290 | requires std::is_move_assignable_v 291 | { 292 | return std::move(base_type::buffer_[readIndex + base_type::padding]); 293 | } 294 | 295 | void write_value(const auto &writeIndex, T &val) noexcept(nothrow_v) 296 | requires std::is_copy_assignable_v && (!std::is_move_assignable_v) 297 | { 298 | base_type::buffer_[writeIndex + writer_.paddingCache_] = val; 299 | } 300 | 301 | void write_value(const auto &writeIndex, T &&val) noexcept(nothrow_v) 302 | requires std::is_move_assignable_v 303 | { 304 | base_type::buffer_[writeIndex + writer_.paddingCache_] = std::move(val); 305 | } 306 | 307 | template 308 | requires(std::constructible_from && 309 | std::is_copy_assignable_v && (!std::is_move_assignable_v)) 310 | void write_value(const auto &writeIndex, Args &&...args) noexcept( 311 | details::SPSC_NoThrow_Type) { 312 | T copyOnly{std::forward(args)...}; 313 | base_type::buffer_[writeIndex + writer_.paddingCache_] = copyOnly; 314 | } 315 | 316 | template 317 | requires(std::constructible_from && 318 | std::is_move_assignable_v) 319 | void write_value(const auto &writeIndex, Args 
&&...args) noexcept( 320 | details::SPSC_NoThrow_Type) { 321 | base_type::buffer_[writeIndex + writer_.paddingCache_] = 322 | T(std::forward(args)...); 323 | } 324 | }; 325 | 326 | } // namespace dro 327 | #endif 328 | -------------------------------------------------------------------------------- /fast_queue_arm64.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Anders Cedronius on 2023-06-27. 3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | //#include 12 | 13 | template 14 | class FastQueue { 15 | static_assert(sizeof(T) == 8, "Only 64 bit objects are supported"); 16 | static_assert(sizeof(void*) == 8, "The architecture is not 64-bits"); 17 | static_assert((RING_BUFFER_SIZE & (RING_BUFFER_SIZE + 1)) == 0, "RING_BUFFER_SIZE must be a number of contiguous bits set from LSB. Example: 0b00001111 not 0b01001111"); 18 | public: 19 | template 20 | inline void push(Args&&... args) noexcept { 21 | while(mRingBuffer[mWritePosition&RING_BUFFER_SIZE].mObj != nullptr) if (mExitThreadSemaphore) [[unlikely]] return; 22 | new(&mRingBuffer[mWritePosition++&RING_BUFFER_SIZE].mObj) T{std::forward(args)...}; 23 | } 24 | 25 | inline void pop(T& aOut) noexcept { 26 | std::atomic_thread_fence(std::memory_order_consume); 27 | while (!(aOut = mRingBuffer[mReadPosition & RING_BUFFER_SIZE].mObj)) { 28 | if (mExitThread == mReadPosition) [[unlikely]] { 29 | aOut = nullptr; 30 | return; 31 | } 32 | } 33 | mRingBuffer[mReadPosition++ & RING_BUFFER_SIZE].mObj = nullptr; 34 | //__builtin_prefetch(aOut, 1); 35 | //__pldx(0, 0, 1, (const void*)aOut); 36 | } 37 | 38 | //Stop queue (Maybe called from any thread) 39 | void stopQueue() { 40 | mExitThread = mWritePosition; 41 | mExitThreadSemaphore = true; 42 | } 43 | 44 | private: 45 | struct AlignedDataObjects { 46 | alignas(L1_CACHE_LNE) T mObj = nullptr; 47 | }; 48 | alignas(L1_CACHE_LNE) volatile std::atomic mReadPosition = 1; 49 | alignas(L1_CACHE_LNE) volatile std::atomic mWritePosition = 1; 50 | alignas(L1_CACHE_LNE) volatile uint64_t mExitThread = 0; 51 | alignas(L1_CACHE_LNE) volatile bool mExitThreadSemaphore = false; 52 | alignas(L1_CACHE_LNE) std::array mRingBuffer; 53 | alignas(L1_CACHE_LNE) volatile uint8_t mBorderDown[L1_CACHE_LNE]{}; 54 | }; -------------------------------------------------------------------------------- /fast_queue_x86_64.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Anders Cedronius on 2023-06-27. 3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | class FastQueue { 14 | static_assert(sizeof(T) == 8, "Only 64 bit objects are supported"); 15 | static_assert(sizeof(void*) == 8, "The architecture is not 64-bits"); 16 | static_assert((RING_BUFFER_SIZE & (RING_BUFFER_SIZE + 1)) == 0, "RING_BUFFER_SIZE must be a number of contiguous bits set from LSB. Example: 0b00001111 not 0b01001111"); 17 | public: 18 | template 19 | void push(Args&&... args) noexcept { 20 | while (mRingBuffer[mWritePosition & RING_BUFFER_SIZE].mObj) if (mExitThreadSemaphore) [[unlikely]] return; 21 | new(&mRingBuffer[mWritePosition++ & RING_BUFFER_SIZE].mObj) T{ std::forward(args)... 
}; 22 | } 23 | 24 | inline void pop(T& aOut) noexcept { 25 | std::atomic_thread_fence(std::memory_order_consume); 26 | while (!(aOut = mRingBuffer[mReadPosition & RING_BUFFER_SIZE].mObj)) { 27 | if ((mExitThread == mReadPosition) && mExitThreadSemaphore) [[unlikely]] { 28 | aOut = nullptr; 29 | return; 30 | } 31 | } 32 | mRingBuffer[mReadPosition++ & RING_BUFFER_SIZE].mObj = nullptr; 33 | } 34 | 35 | //Stop queue (Maybe called from any thread) 36 | void stopQueue() { 37 | mExitThread = mWritePosition; 38 | mExitThreadSemaphore = true; 39 | } 40 | 41 | private: 42 | struct AlignedDataObjects { 43 | alignas(L1_CACHE_LNE * 2) T mObj = nullptr; 44 | }; 45 | alignas(L1_CACHE_LNE) volatile uint64_t mReadPosition = 0; 46 | alignas(L1_CACHE_LNE) volatile std::atomic mWritePosition = 0; 47 | alignas(L1_CACHE_LNE) volatile uint64_t mExitThread = 0; 48 | alignas(L1_CACHE_LNE) volatile bool mExitThreadSemaphore = false; 49 | std::array mRingBuffer; 50 | alignas(L1_CACHE_LNE) volatile uint8_t mBorderDown[L1_CACHE_LNE]{}; 51 | }; -------------------------------------------------------------------------------- /fastqueue2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersc/fastqueue2/0d60e7e4f1126c69e8d594fb66f261f2b6ac9697/fastqueue2.png -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #if __x86_64__ || _M_X64 6 | #include "fast_queue_x86_64.h" 7 | #elif __aarch64__ || _M_ARM64 8 | #include "fast_queue_arm64.h" 9 | #else 10 | #error Architecture not supported 11 | #endif 12 | 13 | #include "spsc_queue.hpp" //Deaod 14 | #include "spsc-queue.hpp" //Dro 15 | #include "pin_thread.h" 16 | 17 | #define QUEUE_MASK 0b1111111111 18 | #define L1_CACHE_LINE 64 19 | #define TEST_TIME_DURATION_SEC 20 20 | //Run the consumer on CPU 21 | #define CONSUMER_CPU 1 22 | //Run the producer on CPU 23 | #define PRODUCER_CPU 3 24 | 25 | std::atomic gActiveConsumer = 0; 26 | std::atomic gCounter = 0; 27 | bool gStartBench = false; 28 | bool gActiveProducer = true; 29 | 30 | class MyObject { 31 | public: 32 | uint64_t mIndex; 33 | }; 34 | 35 | /// ----------------------------------------------------------- 36 | /// 37 | /// DroSPSC section Start 38 | /// 39 | /// ----------------------------------------------------------- 40 | 41 | 42 | void droSPSCProducer(dro::SPSCQueue *pQueue, int32_t aCPU) { 43 | if (!pinThread(aCPU)) { 44 | std::cout << "Pin CPU fail. " << std::endl; 45 | return; 46 | } 47 | while (!gStartBench) { 48 | #ifdef _MSC_VER 49 | __nop(); 50 | #else 51 | asm volatile ("NOP"); 52 | #endif 53 | } 54 | uint64_t lCounter = 0; 55 | while (gActiveProducer) { 56 | auto lTheObject = new MyObject(); 57 | lTheObject->mIndex = lCounter++; 58 | pQueue->emplace(lTheObject); 59 | } 60 | pQueue->emplace(nullptr); //Signal end 61 | } 62 | 63 | void droSPSCConsumer(dro::SPSCQueue *pQueue, int32_t aCPU) { 64 | if (!pinThread(aCPU)) { 65 | std::cout << "Pin CPU fail. 
" << std::endl; 66 | --gActiveConsumer; 67 | return; 68 | } 69 | uint64_t lCounter = 0; 70 | while (true) { 71 | MyObject* lResult = nullptr; 72 | pQueue->pop(lResult); 73 | if (lResult == nullptr) { 74 | break; 75 | } 76 | if (lResult->mIndex != lCounter) { 77 | std::cout << "Queue item error" << std::endl; 78 | } 79 | lCounter++; 80 | delete lResult; 81 | } 82 | gCounter += lCounter; 83 | --gActiveConsumer; 84 | } 85 | 86 | /// ----------------------------------------------------------- 87 | /// 88 | /// DroSPSC section End 89 | /// 90 | /// ----------------------------------------------------------- 91 | 92 | /// ----------------------------------------------------------- 93 | /// 94 | /// deaodSPSC section Start 95 | /// 96 | /// ----------------------------------------------------------- 97 | 98 | 99 | void deaodSPSCProducer(deaod::spsc_queue *pQueue, int32_t aCPU) { 100 | if (!pinThread(aCPU)) { 101 | std::cout << "Pin CPU fail. " << std::endl; 102 | return; 103 | } 104 | while (!gStartBench) { 105 | #ifdef _MSC_VER 106 | __nop(); 107 | #else 108 | asm volatile ("NOP"); 109 | #endif 110 | } 111 | uint64_t lCounter = 0; 112 | while (gActiveProducer) { 113 | auto lTheObject = new MyObject(); 114 | lTheObject->mIndex = lCounter++; 115 | bool lAbleToPush = false; 116 | while (!lAbleToPush && gActiveProducer) { 117 | lAbleToPush = pQueue->push(lTheObject); 118 | } 119 | } 120 | } 121 | 122 | void deaodSPSCConsumer(deaod::spsc_queue *pQueue, int32_t aCPU) { 123 | if (!pinThread(aCPU)) { 124 | std::cout << "Pin CPU fail. " << std::endl; 125 | gActiveConsumer--; 126 | return; 127 | } 128 | uint64_t lCounter = 0; 129 | while (true) { 130 | 131 | MyObject* lResult = nullptr; 132 | bool lAbleToPop = false; 133 | while (!lAbleToPop && gActiveProducer) { 134 | lAbleToPop = pQueue->pop(lResult); 135 | } 136 | if (lResult == nullptr) { 137 | break; 138 | } 139 | if (lResult->mIndex != lCounter) { 140 | std::cout << "Queue item error" << std::endl; 141 | } 142 | lCounter++; 143 | delete lResult; 144 | } 145 | gCounter += lCounter; 146 | gActiveConsumer--; 147 | } 148 | 149 | /// ----------------------------------------------------------- 150 | /// 151 | /// deaodSPSC section End 152 | /// 153 | /// ----------------------------------------------------------- 154 | 155 | 156 | /// ----------------------------------------------------------- 157 | /// 158 | /// FastQueue section Start 159 | /// 160 | /// ----------------------------------------------------------- 161 | 162 | void fastQueueProducer(FastQueue *pQueue, int32_t aCPU) { 163 | if (!pinThread(aCPU)) { 164 | std::cout << "Pin CPU fail. " << std::endl; 165 | return; 166 | } 167 | while (!gStartBench) { 168 | #ifdef _MSC_VER 169 | __nop(); 170 | #else 171 | asm volatile ("NOP"); 172 | #endif 173 | } 174 | uint64_t lCounter = 0; 175 | while (gActiveProducer) { 176 | auto lTheObject = new MyObject(); 177 | lTheObject->mIndex = lCounter++; 178 | pQueue->push(lTheObject); 179 | } 180 | pQueue->stopQueue(); 181 | } 182 | 183 | void fastQueueConsumer(FastQueue *pQueue, int32_t aCPU) { 184 | if (!pinThread(aCPU)) { 185 | std::cout << "Pin CPU fail. " << std::endl; 186 | --gActiveConsumer; 187 | return; 188 | } 189 | uint64_t lCounter = 0; 190 | while (true) { 191 | MyObject* pResult = nullptr; 192 | pQueue->pop(pResult); 193 | if (pResult == nullptr) { 194 | break; 195 | } 196 | if (pResult->mIndex != lCounter) { 197 | std::cout << "Queue item error. 
got: " << pResult->mIndex << " expected: " << lCounter << std::endl; 198 | } 199 | lCounter++; 200 | delete pResult; 201 | } 202 | gCounter += lCounter; 203 | --gActiveConsumer; 204 | } 205 | 206 | /// ----------------------------------------------------------- 207 | /// 208 | /// FastQueue section End 209 | /// 210 | /// ----------------------------------------------------------- 211 | 212 | int main() { 213 | 214 | /// 215 | /// Dro test -> 216 | /// 217 | 218 | // Create the queue 219 | auto droSPSC = new dro::SPSCQueue(QUEUE_MASK); 220 | 221 | // Start the consumer(s) / Producer(s) 222 | gActiveConsumer++; 223 | 224 | std::thread([droSPSC] { droSPSCConsumer(droSPSC, CONSUMER_CPU); }).detach(); 225 | std::thread([droSPSC] { droSPSCProducer(droSPSC, PRODUCER_CPU); }).detach(); 226 | 227 | // Wait for the OS to actually get it done. 228 | std::this_thread::sleep_for(std::chrono::milliseconds(100)); 229 | 230 | // Start the test 231 | std::cout << "DroSPSC pointer test started." << std::endl; 232 | gStartBench = true; 233 | std::this_thread::sleep_for(std::chrono::seconds(TEST_TIME_DURATION_SEC)); 234 | 235 | // End the test 236 | gActiveProducer = false; 237 | std::cout << "DroSPSC pointer test ended." << std::endl; 238 | 239 | // Wait for the consumers to 'join' 240 | // Why not the classic join? I prepared for a multi thread case I need this function for. 241 | while (gActiveConsumer) { 242 | std::this_thread::sleep_for(std::chrono::milliseconds(1)); 243 | } 244 | 245 | // Garbage collect the queue 246 | delete droSPSC; 247 | 248 | // Print the result. 249 | std::cout << "DroSPSC Transactions -> " << gCounter / TEST_TIME_DURATION_SEC << "/s" << std::endl; 250 | 251 | // Zero the test parameters. 252 | gStartBench = false; 253 | gActiveProducer = true; 254 | gCounter = 0; 255 | gActiveConsumer = 0; 256 | 257 | /// 258 | /// DeaodSPSC test -> 259 | /// 260 | 261 | // Create the queue 262 | auto deaodSPSC = new deaod::spsc_queue(); 263 | 264 | // Start the consumer(s) / Producer(s) 265 | gActiveConsumer++; 266 | 267 | std::thread([deaodSPSC] { deaodSPSCConsumer(deaodSPSC, CONSUMER_CPU); }).detach(); 268 | std::thread([deaodSPSC] { deaodSPSCProducer(deaodSPSC, PRODUCER_CPU); }).detach(); 269 | 270 | // Wait for the OS to actually get it done. 271 | std::this_thread::sleep_for(std::chrono::milliseconds(100)); 272 | 273 | // Start the test 274 | std::cout << "DeaodSPSC pointer test started." << std::endl; 275 | gStartBench = true; 276 | std::this_thread::sleep_for(std::chrono::seconds(TEST_TIME_DURATION_SEC)); 277 | 278 | // End the test 279 | gActiveProducer = false; 280 | std::cout << "DeaodSPSC pointer test ended." << std::endl; 281 | 282 | // Wait for the consumers to 'join' 283 | // Why not the classic join? I prepared for a multi thread case I need this function for. 284 | while (gActiveConsumer) { 285 | std::this_thread::sleep_for(std::chrono::milliseconds(1)); 286 | } 287 | 288 | // Garbage collect the queue 289 | delete deaodSPSC; 290 | 291 | // Print the result. 292 | std::cout << "DeaodSPSC Transactions -> " << gCounter / TEST_TIME_DURATION_SEC << "/s" << std::endl; 293 | 294 | // Zero the test parameters. 
295 | gStartBench = false; 296 | gActiveProducer = true; 297 | gCounter = 0; 298 | gActiveConsumer = 0; 299 | 300 | /// 301 | /// FastQueue test -> 302 | /// 303 | 304 | // Create the queue 305 | auto lFastQueue = new FastQueue(); 306 | 307 | // Start the consumer(s) / Producer(s) 308 | gActiveConsumer++; 309 | std::thread([lFastQueue] { return fastQueueConsumer(lFastQueue, CONSUMER_CPU); }).detach(); 310 | std::thread([lFastQueue] { return fastQueueProducer(lFastQueue, PRODUCER_CPU); }).detach(); 311 | 312 | // Wait for the OS to actually get it done. 313 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 314 | 315 | // Start the test 316 | std::cout << "FastQueue pointer test started." << std::endl; 317 | gStartBench = true; 318 | std::this_thread::sleep_for(std::chrono::seconds(TEST_TIME_DURATION_SEC)); 319 | 320 | // End the test 321 | gActiveProducer = false; 322 | std::cout << "FastQueue pointer test ended." << std::endl; 323 | 324 | // Wait for the consumers to 'join' 325 | // Why not the classic join? I prepared for a multi thread case I need this function for. 326 | while (gActiveConsumer) { 327 | std::this_thread::sleep_for(std::chrono::milliseconds(1)); 328 | } 329 | 330 | // Garbage collect the queue 331 | delete lFastQueue; 332 | 333 | // Print the result. 334 | std::cout << "FastQueue Transactions -> " << gCounter / TEST_TIME_DURATION_SEC << "/s" << std::endl; 335 | 336 | // Zero the test parameters. 337 | gStartBench = false; 338 | gActiveProducer = true; 339 | gCounter = 0; 340 | gActiveConsumer = 0; 341 | 342 | // Create the queue 343 | 344 | auto lObject = std::make_unique(8); 345 | 346 | //auto lFastQueueTest = new FastQueue, QUEUE_MASK, L1_CACHE_LINE>(); 347 | 348 | //std::cout << std::cref(lFastQueueTest) << std::endl; 349 | 350 | return 0; 351 | } 352 | -------------------------------------------------------------------------------- /pin_thread.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Anders Cedronius 3 | // 4 | 5 | #pragma once 6 | 7 | #ifdef __APPLE__ 8 | #include 9 | #ifdef TARGET_OS_MAC 10 | 11 | #include 12 | #include 13 | #import 14 | 15 | #define SYSCTL_CORE_COUNT "machdep.cpu.core_count" 16 | 17 | typedef struct cpu_set { 18 | uint32_t count; 19 | } cpu_set_t; 20 | 21 | static inline void 22 | CPU_ZERO(cpu_set_t *cs) { cs->count = 0; } 23 | 24 | static inline void 25 | CPU_SET(int num, cpu_set_t *cs) { cs->count |= (1 << num); } 26 | 27 | static inline int 28 | CPU_ISSET(int num, cpu_set_t *cs) { return (cs->count & (1 << num)); } 29 | 30 | bool raise_thread_priority() { 31 | /* raise the thread's priority */ 32 | thread_extended_policy_data_t extendedPolicy; 33 | thread_act_t this_thread = pthread_mach_thread_np(pthread_self()); 34 | 35 | extendedPolicy.timeshare = 0; 36 | kern_return_t error = thread_policy_set(this_thread, THREAD_EXTENDED_POLICY, 37 | (thread_policy_t)&extendedPolicy, 38 | THREAD_EXTENDED_POLICY_COUNT); 39 | if (error != KERN_SUCCESS) { 40 | std::cout << "Couldn't set thread timeshare policy" << std::endl; 41 | return false; 42 | } 43 | return true; 44 | } 45 | 46 | int sched_getaffinity(pid_t pid, size_t cpu_size, cpu_set_t *cpu_set) 47 | { 48 | int32_t core_count = 0; 49 | size_t len = sizeof(core_count); 50 | int ret = sysctlbyname(SYSCTL_CORE_COUNT, &core_count, &len, 0, 0); 51 | if (ret) { 52 | return -1; 53 | } 54 | cpu_set->count = 0; 55 | for (int i = 0; i < core_count; i++) { 56 | cpu_set->count |= (1 << i); 57 | } 58 | return 0; 59 | } 60 | 61 | int 
pthread_setaffinity_np(pthread_t thread, size_t cpu_size, 62 | cpu_set_t *cpu_set) { 63 | thread_port_t mach_thread; 64 | int core = 0; 65 | 66 | for (core = 0; core < 8 * cpu_size; core++) { 67 | if (CPU_ISSET(core, cpu_set)) break; 68 | } 69 | thread_affinity_policy_data_t policy = { core }; 70 | mach_thread = pthread_mach_thread_np(thread); 71 | thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, 72 | (thread_policy_t)&policy, 1); 73 | return 0; 74 | } 75 | 76 | bool pinThread(int32_t aCpu) { 77 | if (aCpu < 0) { 78 | return false; 79 | } 80 | cpu_set_t lCpuSet; 81 | CPU_ZERO(&lCpuSet); 82 | CPU_SET(aCpu, &lCpuSet); 83 | if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &lCpuSet)) { 84 | return false; 85 | } 86 | //return raise_thread_priority(); 87 | return true; 88 | } 89 | 90 | 91 | 92 | #else 93 | #error Only MacOS supported 94 | #endif 95 | #elif defined _WIN64 96 | #include 97 | bool pinThread(int32_t aCpu) { 98 | if (aCpu > 64) { 99 | throw std::runtime_error("Support for more than 64 CPU's under Windows is not implemented."); 100 | } 101 | HANDLE lThread = GetCurrentThread(); 102 | DWORD_PTR lThreadAffinityMask = 1ULL << aCpu; 103 | DWORD_PTR lReturn = SetThreadAffinityMask(lThread, lThreadAffinityMask); 104 | if (lReturn) { 105 | return true; 106 | } 107 | return false; 108 | } 109 | #elif __linux 110 | bool pinThread(int32_t aCpu) { 111 | if (aCpu < 0) { 112 | return false; 113 | } 114 | cpu_set_t lCpuSet; 115 | CPU_ZERO(&lCpuSet); 116 | CPU_SET(aCpu, &lCpuSet); 117 | if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &lCpuSet)) { 118 | return false; 119 | } 120 | return true; 121 | } 122 | #else 123 | #error OS not supported 124 | #endif 125 | 126 | -------------------------------------------------------------------------------- /ring_buffer_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersc/fastqueue2/0d60e7e4f1126c69e8d594fb66f261f2b6ac9697/ring_buffer_concept.png -------------------------------------------------------------------------------- /ringbuffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersc/fastqueue2/0d60e7e4f1126c69e8d594fb66f261f2b6ac9697/ringbuffer.png --------------------------------------------------------------------------------
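Taken together, the files above follow one pattern: a producer thread pushes heap-allocated objects through a single-producer/single-consumer queue while a consumer pops and frees them. A minimal sketch of that pattern with FastQueue is shown below; the Payload type, queue depth, and iteration count are illustrative, and a plain join() replaces the detach-and-poll shutdown used in main.cpp:

    #include <cstdint>
    #include <thread>
    #if __x86_64__ || _M_X64
    #include "fast_queue_x86_64.h"
    #elif __aarch64__ || _M_ARM64
    #include "fast_queue_arm64.h"
    #endif

    struct Payload { uint64_t mIndex; };          // illustrative 8-byte pointee

    int main() {
        // 0b1111 -> 16-slot ring, 64-byte cache lines. This satisfies the
        // static_asserts in the headers: pointer-sized T and a mask made of
        // contiguous low bits.
        auto* lQueue = new FastQueue<Payload*, 0b1111, 64>();

        std::thread lProducer([lQueue] {
            for (uint64_t i = 0; i < 100000; ++i) {
                lQueue->push(new Payload{i});     // spins while the ring is full
            }
            lQueue->stopQueue();                  // lets the consumer drain and exit
        });

        std::thread lConsumer([lQueue] {
            Payload* lItem = nullptr;
            while (true) {
                lQueue->pop(lItem);               // spins while the ring is empty
                if (lItem == nullptr) break;      // nullptr once stopped and drained
                delete lItem;
            }
        });

        lProducer.join();
        lConsumer.join();
        delete lQueue;
        return 0;
    }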