├── .travis.yml
├── CMakeLists.txt
├── LICENSE
├── README.md
├── build
│   └── .gitignore
├── example.cpp
├── latch.hpp
├── tests
│   ├── README.md
│   ├── test_latch.cpp
│   └── test_pool.cpp
├── threadpool.cpp
└── threadpool.hpp

/.travis.yml:
--------------------------------------------------------------------------------
1 | language: cpp
2 | compiler:
3 | - gcc
4 | before_install:
5 | - pip install --user cpp-coveralls
6 | script:
7 | - cd build && cmake .. && make && make test
8 | after_success:
9 | - coveralls --exclude tests -r .. -b . --gcov-options '\-lp'
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | project(threadpool)
2 | cmake_minimum_required(VERSION 3.1)
3 | 
4 | include(CTest)
5 | 
6 | set(CMAKE_CXX_STANDARD 11)
7 | 
8 | set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
9 | set(THREADS_PREFER_PTHREAD_FLAG TRUE)
10 | find_package(Threads REQUIRED)
11 | 
12 | 
13 | add_definitions(-Wall -Wextra)
14 | add_library(threadpool STATIC ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.cpp)
15 | set_target_properties(threadpool PROPERTIES POSITION_INDEPENDENT_CODE ON)
16 | target_link_libraries(threadpool Threads::Threads)
17 | 
18 | if (BUILD_TESTING)
19 | set(CMAKE_CXX_FLAGS "-g -O0 -Wall -fprofile-arcs -ftest-coverage")
20 | add_executable(check ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_pool.cpp)
21 | target_link_libraries(check threadpool)
22 | add_test(NAME threadpool_standalone_tests COMMAND check)
23 | 
24 | add_executable(check_latch ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_latch.cpp)
25 | target_link_libraries(check_latch Threads::Threads)
26 | add_test(NAME latch_standalone_tests COMMAND check_latch)
27 | 
28 | add_executable(example_latchpool ${CMAKE_CURRENT_SOURCE_DIR}/example.cpp)
29 | target_link_libraries(example_latchpool threadpool Threads::Threads)
30 | add_test(NAME latch_pool_example COMMAND example_latchpool)
31 | endif (BUILD_TESTING)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Nathaniel J. McClatchey, PhD
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/nmcclatchey/ThreadPool.svg?branch=master)](https://travis-ci.org/nmcclatchey/ThreadPool)
2 | [![Coverage Status](https://coveralls.io/repos/github/nmcclatchey/ThreadPool/badge.svg?branch=master)](https://coveralls.io/github/nmcclatchey/ThreadPool?branch=master)
3 | 
4 | # ThreadPool
5 | 
6 | Provides low-overhead concurrent scheduling in C++11 through [thread pools](https://en.wikipedia.org/wiki/Thread_pool "Wikipedia: Thread pool") and [work stealing](https://en.wikipedia.org/wiki/Work_stealing "Wikipedia: Work stealing"). The thread pool approach allows fine-grained parallelism by minimizing the overhead involved in scheduling a task. The work stealing approach allows efficient balancing of scheduled tasks across available threads.
7 | 
8 | ## Why use this library?
9 | 
10 | - It fulfills common scheduling needs:
11 |   + Performs multiple tasks concurrently.
12 |   + Tasks can be scheduled for an arbitrary later time-point. This provides an efficient replacement for timed waits.
13 |   + A task can spawn subtasks, with a hint that the pool ought to complete them as soon as possible.
14 | - It fulfills some uncommon scheduling needs:
15 |   + The pool can be paused, and later resumed.
16 | - It is designed for efficiency and scalability:
17 |   + Load balancing ensures that as long as there is work to do, it is being done by as many threads as possible.
18 |   + Lock-free data structures use [weak atomic orderings](https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering "C++ Reference: Release-Acquire ordering"); only the cores that require new tasks are synchronized.
19 |   + No busy-waiting. Idle threads use [condition variables](https://en.cppreference.com/w/cpp/thread/condition_variable "C++ Reference: condition_variable") to wait without using the CPU.
20 | - It is explicitly documented:
21 |   + Full generated documentation via [Doxygen](http://www.doxygen.nl/).
22 |   + Memory synchronization between task scheduling and execution is explicitly stated in terms of [C++11's memory model](https://en.cppreference.com/w/cpp/atomic/memory_order "C++ Reference: Memory order").
23 |   + Provides usage recommendations for maximizing performance.
24 | 
25 | ## Getting Started
26 | 
27 | This library consists of a single header and source file. One may compile the source file either as part of one's own project, or as a static library.
28 | 
29 | ### Prerequisites
30 | 
31 | You will require a C++11 compiler. If you are using MinGW-w64, you may require [MinGW STD Threads](https://github.com/meganz/mingw-std-threads "MinGW STD Threads") to supply `std::thread` and similar.
32 | 
33 | ### Installing
34 | 
35 | Either compile `threadpool.cpp` as part of your project, or [compile it as a static library](https://en.wikipedia.org/wiki/Static_library "Wikipedia: Static library").
36 | 
37 | ### Using the library
38 | 
39 | The library is designed to enable a simple use pattern:
40 | 1. Create a `ThreadPool` object.
41 | 2. Give tasks to the pool by calling the pool's `schedule()`, `schedule_subtask()`, or `schedule_after()` methods.
42 | 3. Wait for tasks to complete (for example, with the bundled `latch`; see the sketch below).
43 | 
44 | Full documentation for this library may be generated using Doxygen.
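Waiting (step 3) is demonstrated in `example.cpp` using the bundled `latch` from `latch.hpp` (roughly equivalent to C++20's `std::latch`): each task counts the latch down, and the waiting thread blocks until the count reaches zero. A minimal sketch along those lines, in which the 100-task count and the work done inside each task are only placeholders:
```
#include "threadpool.hpp"
#include "latch.hpp"   // bundled latch, roughly equivalent to C++20's std::latch

int main()
{
  ThreadPool pool;
  latch done (100);                // expect 100 count_down() calls

  for (int i = 0; i < 100; ++i)
    pool.schedule([&done]()
    {
      /* ...do some work... */     // placeholder for real work
      done.count_down(1);          // signal that one task has finished
    });

  done.wait();                     // block until all 100 tasks have counted down
  return 0;
}
```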
45 | 
46 | A simple example of how to use the library follows:
47 | ```
48 | #include "threadpool.hpp"
49 | 
50 | // Create a new thread pool, letting the implementation determine the number of worker threads to use.
51 | ThreadPool pool;
52 | 
53 | // Put a task into the pool. Because this isn't called from within a worker thread, it takes the scheduler's slow path.
54 | pool.schedule([](void)
55 | {
56 |   // Put a task into the pool. This is called from within a worker thread, so it takes the scheduler's fast path.
57 |   pool.schedule([](void) {
58 |     do_something();
59 |   });
60 | 
61 |   // Put a task into the pool, treated as if it were part of the currently running task. This is called from within a worker thread, so it takes the scheduler's fast path.
62 |   pool.schedule_subtask([](void) {
63 |     do_something();
64 |   });
65 | 
66 |   // Put a task into the pool, to be executed 2 seconds after it is scheduled.
67 |   using namespace std::chrono;
68 |   pool.schedule_after(seconds(2),
69 |     [](void) {
70 |       do_something();
71 |     });
72 | });
73 | 
74 | // When the thread pool is destroyed, remaining unexecuted tasks are forgotten.
75 | ```
76 | 
77 | ## Authors
78 | 
79 | * **Nathaniel J. McClatchey, PhD** - *Initial work*
80 | 
81 | ## License
82 | 
83 | To encourage people to use this library freely and without concern, this project is licensed under the [MIT License](LICENSE).
84 | 
--------------------------------------------------------------------------------
/build/.gitignore:
--------------------------------------------------------------------------------
1 | /*
2 | !.gitignore
--------------------------------------------------------------------------------
/example.cpp:
--------------------------------------------------------------------------------
1 | // Include this first to check for missed dependencies.
2 | #include "threadpool.hpp"
3 | #include "latch.hpp"
4 | 
5 | #if (!defined(__MINGW32__) || defined(_GLIBCXX_HAS_GTHREADS))
6 | #include <thread>
7 | #else
8 | #include "mingw.thread.h"
9 | #endif
10 | #include <atomic>
11 | #include <chrono>
12 | #include <iostream>
13 | 
14 | int main()
15 | {
16 |   ThreadPool pool;
17 |   latch continuation_guard (1024);
18 |   std::atomic<int> counter {0};
19 | 
20 |   // To wait without blocking threads, define a continuation function, to be
21 |   // executed after all other tasks complete.
22 |   std::function<void(void)> continuation = [&]()
23 |   {
24 |     // Check whether all other subtasks are complete. If not, push another
25 |     // call to this continuation function into the pool.
26 |     if (!continuation_guard.try_wait())
27 |     {
28 |       pool.schedule(continuation);
29 |       return;
30 |     }
31 |     if (counter.load(std::memory_order_relaxed) == 1024)
32 |       std::cout << "SUCCESS\n";
33 |     else
34 |       std::cout << "FAILED\n";
35 |   };
36 | 
37 |   pool.schedule([&](){
38 |     for (int j = 0; j < 1024; ++j)
39 |     {
40 |       pool.schedule_subtask([&](){
41 |         std::this_thread::sleep_for(std::chrono::milliseconds(5));
42 |         counter.fetch_add(1, std::memory_order_relaxed);
43 |         continuation_guard.count_down(1);
44 |       });
45 |     }
46 |     pool.schedule(continuation);
47 |   });
48 |   // Threads outside the pool can take a simpler approach, using the OS's
49 |   // preemptive scheduling or other waiting mechanisms.
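  // Block here: the latch was initialized to 1024, so wait() returns only after every subtask above has called count_down().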
50 | continuation_guard.wait(); 51 | std::cout << "Finishing.\n"; 52 | std::this_thread::sleep_for(std::chrono::milliseconds(200)); 53 | return 0; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /latch.hpp: -------------------------------------------------------------------------------- 1 | /// \file 2 | /// \brief Provides a `latch` class for synchronization, roughly equivalent to 3 | /// C++20's `std::latch`. 4 | 5 | #ifndef LATCH_HPP_ 6 | #define LATCH_HPP_ 7 | 8 | #if (__cplusplus >= 202002L) && __has_include() 9 | #include 10 | using std::latch; 11 | #elif LATCH_USE_WIN32_SYNCHAPI 12 | #include 13 | #include 14 | #include 15 | #include 16 | #define WIN32_LEAN_AND_MEAN 17 | #include 18 | #include 19 | /// \brief Allow threads to wait until a selection of tasks is completed by 20 | /// other threads. 21 | /// 22 | /// `latch`es allow threads to wait for multiple tasks to be completed by 23 | /// other threads. This is vital for applying the *fork-join* paradigm to 24 | /// concurrency models that do not naturally supply a means of joining, such as 25 | /// thread pools, and removes the need to store a tree of forked threads if one 26 | /// employs a large number of worker threads. \n 27 | /// In a typical use-case, a `latch` is locked *n* times by thread *0*, 28 | /// which then spawns worker threads *1, 2, ... n* and waits on the `latch`. 29 | /// Each worker thread completes its task, then unlocks the `latch`. In this 30 | /// example, thread *0* only progresses past the `latch` once all workers 31 | /// complete their tasks. 32 | class latch 33 | { 34 | LONG wait_for_ {0}; 35 | public: 36 | /// \brief Constructs a `latch`. Note: Diverges from `std::latch` in that it 37 | /// is not constexpr. 38 | constexpr explicit latch (std::ptrdiff_t expected = 1) 39 | : wait_for_(expected) 40 | { 41 | assert(expected >= 0); 42 | assert(expected <= (max)()); 43 | } 44 | 45 | ~latch (void) 46 | { 47 | } 48 | 49 | latch (const latch &) = delete; 50 | latch & operator= (const latch &) = delete; 51 | 52 | /// \brief Decreases the number of tasks remaining, and unlocks the `Barrier` 53 | /// if no tasks remain. 54 | void count_down (std::ptrdiff_t n = 1) 55 | { 56 | assert(n >= 0); 57 | assert(n <= (max)()); 58 | LONG previously_waiting = n; 59 | do { 60 | LONG new_waiting = InterlockedCompareExchangeRelease(&wait_for_, previously_waiting - n, previously_waiting); 61 | if (new_waiting == previously_waiting) 62 | break; 63 | previously_waiting = new_waiting; 64 | } while (true); 65 | assert(previously_waiting >= n); 66 | if (previously_waiting <= n) 67 | WakeByAddressAll(&wait_for_); 68 | } 69 | 70 | /// \brief Returns `true` if tasks remain incomplete. 71 | inline bool try_wait (void) const noexcept 72 | { 73 | return InterlockedCompareExchangeAcquire(const_cast(&wait_for_), 0, 0) == 0; 74 | } 75 | 76 | /// \brief Blocks until no tasks remain incomplete. 77 | void wait (void) const 78 | { 79 | do 80 | { 81 | LONG expected = InterlockedCompareExchangeAcquire(const_cast(&wait_for_), 0, 0); 82 | if (expected == 0) 83 | break; 84 | if (!WaitOnAddress(const_cast(&wait_for_), &expected, sizeof(LONG), INFINITE)) 85 | throw std::system_error(GetLastError(), std::system_category()); 86 | } while (true); 87 | } 88 | 89 | /// \brief Counts down, then waits until no tasks remain. 
90 | void arrive_and_wait (std::ptrdiff_t n = 1) 91 | { 92 | assert(n >= 0); 93 | assert(n <= (max)()); 94 | LONG previously_waiting = n; 95 | do { 96 | LONG new_waiting = InterlockedCompareExchangeRelease(&wait_for_, previously_waiting - n, previously_waiting); 97 | if (new_waiting == previously_waiting) 98 | break; 99 | previously_waiting = new_waiting; 100 | } while (true); 101 | assert(previously_waiting >= n); 102 | if (previously_waiting <= n) 103 | WakeByAddressAll(&wait_for_); 104 | else 105 | { 106 | LONG expected = previously_waiting - n; 107 | do { 108 | if (!WaitOnAddress(&wait_for_, &expected, sizeof(LONG), INFINITE)) 109 | throw std::system_error(GetLastError(), std::system_category()); 110 | expected = InterlockedCompareExchangeAcquire(&wait_for_, 0, 0); 111 | } while (expected != 0); 112 | } 113 | } 114 | 115 | static constexpr std::ptrdiff_t (max) (void) noexcept 116 | { 117 | return (std::numeric_limits::max)(); 118 | } 119 | }; 120 | #else 121 | #include 122 | #include 123 | #include 124 | #if defined(__MINGW32__) && !defined(_GLIBCXX_HAS_GTHREADS) 125 | #include 126 | #include 127 | #else 128 | #include 129 | #include 130 | #endif 131 | 132 | /// \brief Allow threads to wait until a selection of tasks is completed by 133 | /// other threads. 134 | /// 135 | /// `latch`es allow threads to wait for multiple tasks to be completed by 136 | /// other threads. This is vital for applying the *fork-join* paradigm to 137 | /// concurrency models that do not naturally supply a means of joining, such as 138 | /// thread pools, and removes the need to store a tree of forked threads if one 139 | /// employs a large number of worker threads. \n 140 | /// In a typical use-case, a `latch` is locked *n* times by thread *0*, 141 | /// which then spawns worker threads *1, 2, ... n* and waits on the `latch`. 142 | /// Each worker thread completes its task, then unlocks the `latch`. In this 143 | /// example, thread *0* only progresses past the `latch` once all workers 144 | /// complete their tasks. 145 | class latch 146 | { 147 | mutable std::condition_variable cv_ {}; 148 | mutable std::mutex mutex_ {}; 149 | std::atomic wait_for_ {0}; 150 | public: 151 | /// \brief Constructs a `latch`. Note: Diverges from `std::latch` in that it 152 | /// is not constexpr. 153 | explicit latch (std::ptrdiff_t expected = 1) 154 | : wait_for_(expected) 155 | { 156 | } 157 | 158 | ~latch (void) 159 | { 160 | } 161 | 162 | latch (const latch &) = delete; 163 | latch & operator= (const latch &) = delete; 164 | 165 | /// \brief Decreases the number of tasks remaining, and unlocks the `Barrier` 166 | /// if no tasks remain. 167 | void count_down (std::ptrdiff_t n = 1) 168 | { 169 | assert(n >= 0); 170 | auto previously_waiting = wait_for_.fetch_sub(n, std::memory_order_release); 171 | assert(previously_waiting >= n); 172 | if (previously_waiting <= n) 173 | { 174 | // Using this mutex synchronizes with the awakened thread, ensuring that the 175 | // barrier is seen to be open. 176 | std::lock_guard guard(mutex_); 177 | cv_.notify_all(); 178 | } 179 | } 180 | 181 | /// \brief Returns `true` if tasks remain incomplete. 182 | inline bool try_wait (void) const noexcept 183 | { 184 | return wait_for_.load(std::memory_order_acquire) == 0; 185 | } 186 | 187 | /// \brief Blocks until no tasks remain incomplete. 
188 | void wait (void) const 189 | { 190 | std::unique_lock lck (mutex_); 191 | cv_.wait(lck, [this]()->bool { return try_wait(); }); 192 | } 193 | 194 | /// \brief Counts down, then waits until no tasks remain. 195 | void arrive_and_wait (std::ptrdiff_t n = 1) 196 | { 197 | assert(n >= 0); 198 | auto previously_waiting = wait_for_.fetch_sub(n, std::memory_order_acq_rel); 199 | assert(previously_waiting >= n); 200 | std::unique_lock lck(mutex_); 201 | if (previously_waiting <= n) 202 | cv_.notify_all(); 203 | else 204 | cv_.wait(lck, [this]()->bool { return try_wait(); }); 205 | } 206 | 207 | static constexpr std::ptrdiff_t max (void) noexcept 208 | { 209 | return std::numeric_limits::max(); 210 | } 211 | }; 212 | #endif 213 | 214 | #endif // LATCH_HPP_ 215 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # ThreadPool tests 2 | 3 | To ensure that the ThreadPool library works properly, I have created a test of its capabilities. The file `tests.cpp` is intended to test 4 | * Compilation 5 | * Expected use cases 6 | * Load-balancing 7 | 8 | The test application will, when run, perform the following: 9 | * Create and destroy a `ThreadPool` without assigning any tasks. 10 | * Assign tasks to a `ThreadPool`. 11 | * Ensure that `ThreadPool`s idle when all tasks are complete, to avoid excessive CPU use. If it fails to idle quickly enough after completion, or does not complete within a reasonable period of time, the test application returns non-zero. 12 | * Restart an idling `ThreadPool` for a second round of tasks. 13 | * Test delayed scheduling of tasks. 14 | * Ensure that an active `ThreadPool` can be safely destroyed (losing its tasks in the process). 15 | * Measure how well tasks are balanced, by counting the minimum, maximum, and average number of tasks performed by each worker thread. Given that the tasks are (mostly) homogeneous, good balance is indicated by similarity of these numbers. 16 | * Pause and resume a `ThreadPool`. 17 | * Destroy a paused `ThreadPool`. -------------------------------------------------------------------------------- /tests/test_latch.cpp: -------------------------------------------------------------------------------- 1 | // Include this first to check for missed dependencies. 2 | #include "../latch.hpp" 3 | 4 | #if (!defined(__MINGW32__) || defined(_GLIBCXX_HAS_GTHREADS)) 5 | #include 6 | #include 7 | #include 8 | #else 9 | #include 10 | #include 11 | #include 12 | #endif 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define LOG(fmtString,...) 
std::printf(fmtString "\n", ##__VA_ARGS__); fflush(stdout) 21 | 22 | int main() 23 | { 24 | std::atomic result_code {0}; 25 | { 26 | latch test_latch (4); 27 | for (int i = 0; i < 4; ++i) 28 | { 29 | std::thread new_thread([i,&test_latch](){ 30 | std::this_thread::sleep_for(std::chrono::milliseconds(500 * (i + 1))); 31 | test_latch.count_down(1); 32 | }); 33 | new_thread.detach(); 34 | } 35 | std::this_thread::sleep_for(std::chrono::milliseconds(250)); 36 | for (int i = 0; i < 4; ++i) 37 | { 38 | if (test_latch.try_wait()) 39 | { 40 | LOG("Exiting far too early (probe point %d)", i); 41 | result_code.fetch_or(1, std::memory_order_relaxed); 42 | } 43 | std::this_thread::sleep_for(std::chrono::milliseconds(500)); 44 | } 45 | if (!test_latch.try_wait()) 46 | { 47 | LOG("%s", "Did not unlock when expected."); 48 | result_code.fetch_or(2, std::memory_order_relaxed); 49 | } 50 | } 51 | { 52 | latch test_latch (4); 53 | std::atomic counter {0}; 54 | for (int i = 0; i < 4; ++i) 55 | { 56 | std::thread new_thread([i,&test_latch, &counter, &result_code](){ 57 | std::this_thread::sleep_for(std::chrono::milliseconds(500 * (i + 1))); 58 | counter.fetch_add(1, std::memory_order_relaxed); 59 | try 60 | { 61 | test_latch.arrive_and_wait(1); 62 | } 63 | catch(const std::system_error & e) 64 | { 65 | std::cerr << "Arrive-and-wait error code " << e.code() << ": " << e.what() << '\n'; 66 | result_code.fetch_or(16, std::memory_order_relaxed); 67 | } 68 | if (counter.load(std::memory_order_relaxed) != 4) 69 | result_code.fetch_or(8, std::memory_order_relaxed); 70 | }); 71 | new_thread.detach(); 72 | } 73 | try 74 | { 75 | test_latch.wait(); 76 | } 77 | catch(const std::system_error & e) 78 | { 79 | std::cerr << "Wait error code " << e.code() << ": " << e.what() << '\n'; 80 | result_code.fetch_or(32, std::memory_order_relaxed); 81 | } 82 | if (counter.load(std::memory_order_relaxed) != 4) 83 | result_code.fetch_or(4, std::memory_order_relaxed); 84 | std::this_thread::sleep_for(std::chrono::milliseconds(500)); 85 | } 86 | return result_code.load(std::memory_order_relaxed); 87 | } 88 | 89 | -------------------------------------------------------------------------------- /tests/test_pool.cpp: -------------------------------------------------------------------------------- 1 | // Include this first to check for missed dependencies. 2 | #include "../threadpool.hpp" 3 | 4 | #if (!defined(__MINGW32__) || defined(_GLIBCXX_HAS_GTHREADS)) 5 | #include 6 | #include 7 | #include 8 | #else 9 | #include 10 | #include 11 | #include 12 | #endif 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define LOG(fmtString,...) 
printf(fmtString "\n", ##__VA_ARGS__); fflush(stdout) 20 | 21 | using namespace std; 22 | 23 | namespace 24 | { 25 | constexpr size_t kTestMaxThreads = 1024; 26 | constexpr size_t kTestRootTasks = 1000; 27 | constexpr size_t kTestBranchFactor = 800; 28 | constexpr uint_fast64_t kTestTotalTasks = kTestRootTasks * kTestBranchFactor; 29 | 30 | thread_local std::atomic * task_slot_local = nullptr; 31 | std::atomic task_slot_next(0); 32 | std::atomic executed_tasks [kTestMaxThreads * 64]; 33 | 34 | 35 | void perform_task (void) 36 | { 37 | if (task_slot_local == nullptr) 38 | { 39 | auto n = task_slot_next.fetch_add(1, std::memory_order_relaxed); 40 | assert(n < kTestMaxThreads); 41 | task_slot_local = executed_tasks + (n * 64 / sizeof(*executed_tasks)); 42 | } 43 | task_slot_local->fetch_add(1, std::memory_order_release); 44 | } 45 | 46 | std::condition_variable cv; 47 | std::mutex mtx; 48 | 49 | bool one_is_active = false; 50 | size_t alive_count = 0; 51 | void stay_active (ThreadPool & pool) 52 | { 53 | { 54 | std::lock_guard lk (mtx); 55 | one_is_active = true; 56 | /*++alive_count; 57 | if ((alive_count & (alive_count + 1)) == 0) 58 | std::printf("Alive, %llu\n", alive_count);*/ 59 | cv.notify_all(); 60 | } 61 | pool.schedule([&pool](void){ stay_active(pool); }); 62 | } 63 | 64 | void gather_statistics (uint_fast64_t & balance_min, 65 | uint_fast64_t & balance_max, 66 | uint_fast64_t & balance_total) 67 | { 68 | balance_min = ~static_cast(0); 69 | balance_max = balance_total = 0; 70 | for (uint_fast32_t n = 0; n < task_slot_next.load(std::memory_order_acquire); ++n) 71 | { 72 | auto it = executed_tasks[n * 8].load(std::memory_order_relaxed); 73 | if (balance_max < it) 74 | balance_max = it; 75 | if (balance_min > it) 76 | balance_min = it; 77 | balance_total += it; 78 | } 79 | } 80 | } 81 | 82 | int main() 83 | { 84 | int test_id = 0; 85 | { 86 | LOG("Test %u:\t%s",++test_id,"Query static information"); 87 | LOG("\tWorker queue capacity is %zu tasks.",ThreadPool::get_worker_capacity()); 88 | } 89 | { 90 | LOG("Test %u:\t%s",++test_id,"Construct and destroy empty threadpool."); 91 | { 92 | ThreadPool pool; 93 | LOG("\t%s","Construct successful."); 94 | } 95 | LOG("\t%s","Destroy successful."); 96 | } 97 | std::atomic logged_errors {0}; 98 | #ifndef NDEBUG 99 | { 100 | LOG("Test %u:\t%s", ++test_id, "Disallow null function pointers."); 101 | LOG("\t%s","Constructing a thread pool."); 102 | ThreadPool pool; 103 | LOG("\t\tDone.\tNote: Pool has %u worker threads.", pool.get_concurrency()); 104 | 105 | std::condition_variable schedule_cv; 106 | std::mutex schedule_mtx; 107 | bool ready = false; 108 | 109 | pool.schedule([&](void) 110 | { 111 | try { 112 | try { 113 | std::function null_func; 114 | pool.schedule_subtask(null_func); 115 | logged_errors |= 8; 116 | } catch (std::bad_function_call &) {} 117 | try { 118 | pool.schedule_subtask(std::function()); 119 | logged_errors |= 8; 120 | } catch (std::bad_function_call &) {} 121 | } catch (...) 
122 | { 123 | logged_errors |= 8; 124 | } 125 | std::lock_guard guard {schedule_mtx}; 126 | ready = true; 127 | schedule_cv.notify_all(); 128 | }); 129 | try { 130 | std::function null_func; 131 | pool.schedule(null_func); 132 | logged_errors |= 8; 133 | } catch (std::bad_function_call &) {} 134 | try { 135 | pool.schedule(std::function()); 136 | logged_errors |= 8; 137 | } catch (std::bad_function_call &) {} 138 | try { 139 | std::function null_func; 140 | pool.schedule_after(std::chrono::seconds(1), null_func); 141 | logged_errors |= 8; 142 | } catch (std::bad_function_call &) {} 143 | try { 144 | pool.schedule_after(std::chrono::seconds(1), std::function()); 145 | logged_errors |= 8; 146 | } catch (std::bad_function_call &) {} 147 | std::unique_lock lck {schedule_mtx}; 148 | schedule_cv.wait(lck,[&ready]()->bool { return ready; }); 149 | LOG("\t%s", "Destroying the thread pool."); 150 | } 151 | #endif 152 | 153 | { 154 | LOG("Test %u:\t%s",++test_id,"Use threadpool for tasks."); 155 | LOG("\t%s","Constructing a thread pool."); 156 | ThreadPool pool; 157 | LOG("\t\tDone.\tNote: Pool has %u worker threads.", pool.get_concurrency()); 158 | for (unsigned nn = 0; nn < 2; ++nn) 159 | { 160 | LOG("\t%s","Resetting task-recording utilities..."); 161 | for (unsigned i = 0; i < 8 * 64; ++i) 162 | executed_tasks[i].store(0, std::memory_order_release); 163 | bool already_idling = false; 164 | 165 | LOG("\tScheduling some %s tasks...", (nn == 0) ? "immediate" : "delayed"); 166 | pool.schedule([&](void) 167 | { 168 | for (unsigned i = 0; i < kTestRootTasks / 2; ++i) 169 | { 170 | pool.schedule_after(std::chrono::seconds(nn), [&](void) 171 | { 172 | for (unsigned j = 0; j < kTestBranchFactor; ++j) 173 | { 174 | pool.schedule_subtask(&perform_task); 175 | } 176 | }); 177 | } 178 | for (unsigned i = kTestRootTasks / 2; i < kTestRootTasks; ++i) 179 | { 180 | std::function lvalue_task ( [&](void) 181 | { 182 | for (unsigned j = 0; j < kTestBranchFactor; ++j) 183 | { 184 | pool.schedule_subtask(&perform_task); 185 | } 186 | } ); 187 | pool.schedule_after(std::chrono::seconds(nn), lvalue_task); 188 | } 189 | }); 190 | LOG("\t\t%s","Done. 
Tasks scheduled successfully."); 191 | LOG("\t%s","Waiting a bit while tasks complete..."); 192 | 193 | unsigned total_ms = 0; 194 | for (unsigned ii = 0; ii < 9; ++ii) 195 | { 196 | using namespace std::chrono; 197 | unsigned sleep_ms = (100u << ii); 198 | std::this_thread::sleep_for(milliseconds(sleep_ms)); 199 | total_ms += sleep_ms; 200 | 201 | LOG("\t\t%s","Checking whether tasks are completed..."); 202 | uint_fast64_t balance_min, balance_max, balance_total; 203 | gather_statistics(balance_min, balance_max, balance_total); 204 | LOG("\t\tCompleted %llu / %llu tasks so far.", static_cast(balance_total), static_cast(kTestTotalTasks)); 205 | if (pool.is_idle() && (balance_total == kTestTotalTasks)) 206 | { 207 | gather_statistics(balance_min, balance_max, balance_total); 208 | LOG("\tPool has idled, as expected, with all %llu tasks complete.", static_cast(kTestTotalTasks)); 209 | LOG("\tProcessor utilization [min / mean / max]:\t%llu / %llu / %llu", static_cast(balance_min), static_cast(balance_total / pool.get_concurrency()), static_cast(balance_max)); 210 | break; 211 | } 212 | else if (balance_total == kTestTotalTasks) 213 | { 214 | if (already_idling) 215 | { 216 | LOG("\t%s","Pool has not yet idled, despite all tasks being complete; this is probably an error."); 217 | logged_errors |= 1; 218 | } else 219 | already_idling = true; 220 | } 221 | } 222 | if (!pool.is_idle()) 223 | { 224 | LOG("\t\tPool failed to complete all tasks after %u seconds. There is probably an error.", total_ms / 1000); 225 | logged_errors |= 2; 226 | } 227 | } 228 | LOG("\t%s", "Destroying the thread pool."); 229 | } 230 | 231 | { 232 | LOG("Test %u:\t%s",++test_id,"Destroy a ThreadPool with running task-chains."); 233 | LOG("\t%s","Constructing a thread pool."); 234 | ThreadPool pool (2); 235 | LOG("\t\tDone.\tNote: Pool has %u worker threads.", pool.get_concurrency()); 236 | LOG("\t%s", "Scheduling several undying tasks..."); 237 | { 238 | std::unique_lock guard (mtx); 239 | one_is_active = false; 240 | for (unsigned n = 0; n < 16; ++n) 241 | pool.schedule([&pool](void) { stay_active(pool); }); 242 | cv.wait(guard, [](void)->bool { return one_is_active; }); 243 | } 244 | LOG("\t\t%s","Done. Tasks are running."); 245 | LOG("\t%s", "Destroying the thread pool."); 246 | } 247 | 248 | { 249 | LOG("Test %u:\t%s",++test_id,"Pause and resume a ThreadPool with running task-chains."); 250 | LOG("\t%s","Constructing a thread pool."); 251 | ThreadPool pool (3); 252 | LOG("\t\tDone.\tNote: Pool has %u worker threads.", pool.get_concurrency()); 253 | LOG("\t%s", "Scheduling several undying tasks..."); 254 | { 255 | std::unique_lock guard (mtx); 256 | one_is_active = false; 257 | alive_count = 0; 258 | for (unsigned n = 0; n < 16; ++n) 259 | pool.schedule([&pool](void) { stay_active(pool); }); 260 | cv.wait(guard, [](void)->bool { return one_is_active; }); 261 | } 262 | LOG("\t\t%s","Done. Tasks scheduled successfully."); 263 | LOG("\t%s","Pausing..."); 264 | pool.halt(); 265 | LOG("\t%s","Waiting for a bit..."); 266 | std::this_thread::sleep_for(std::chrono::milliseconds(250)); 267 | if (pool.is_halted()) 268 | { 269 | LOG("\t%s", "Pool did pause."); 270 | } 271 | else 272 | { 273 | LOG("\t%s", "Pool did not pause. 
This is most unusual!"); 274 | logged_errors |= 4; 275 | } 276 | LOG("\t%s","Unpausing..."); 277 | pool.resume(); 278 | LOG("\t%s","Waiting for 0.3 seconds..."); 279 | std::this_thread::sleep_for(std::chrono::milliseconds(300)); 280 | LOG("\t%s", "Destroying the thread pool."); 281 | } 282 | 283 | { 284 | LOG("Test %u:\t%s",++test_id,"Destroy a paused Threadpool."); 285 | LOG("\t%s","Constructing a thread pool."); 286 | ThreadPool pool (5); 287 | LOG("\t\tDone.\tNote: Pool has %u worker threads.", pool.get_concurrency()); 288 | LOG("\t%s", "Scheduling several undying tasks..."); 289 | { 290 | std::unique_lock guard (mtx); 291 | one_is_active = false; 292 | alive_count = 0; 293 | for (unsigned n = 0; n < 16; ++n) 294 | pool.schedule([&pool](void) { stay_active(pool); }); 295 | cv.wait(guard, [](void)->bool { return one_is_active; }); 296 | } 297 | LOG("\t\t%s","Done. Tasks scheduled successfully."); 298 | LOG("\t%s","Pausing..."); 299 | pool.halt(); 300 | while (!pool.is_halted()) 301 | std::this_thread::sleep_for(std::chrono::milliseconds(50)); 302 | LOG("\t%s", "Destroying the thread pool."); 303 | } 304 | 305 | { 306 | LOG("Test %u:\t%s",++test_id,"Attempt to pause from within a worker thread."); 307 | LOG("\t%s","Constructing a thread pool."); 308 | ThreadPool pool; 309 | LOG("\t\tDone.\tNote: Pool has %u worker threads.", pool.get_concurrency()); 310 | LOG("\t%s", "Scheduling a few tasks, including a pausing task."); 311 | { 312 | std::unique_lock guard (mtx); 313 | one_is_active = false; 314 | alive_count = 0; 315 | for (unsigned n = 0; n < 16; ++n) 316 | pool.schedule([&pool](void) { stay_active(pool); }); 317 | pool.schedule([&pool](void) { stay_active(pool); pool.halt(); }); 318 | cv.wait(guard, [](void)->bool { return one_is_active; }); 319 | } 320 | LOG("\t\t%s","Done. Tasks scheduled successfully."); 321 | LOG("\t\t%s","Done. Waiting for a bit..."); 322 | std::this_thread::sleep_for(std::chrono::milliseconds(250)); 323 | LOG("\t%s","Unpausing..."); 324 | pool.resume(); 325 | LOG("\t\t%s","Done. Waiting for a bit..."); 326 | std::this_thread::sleep_for(std::chrono::milliseconds(250)); 327 | LOG("\t%s", "Destroying the thread pool."); 328 | } 329 | 330 | { 331 | LOG("Test %u:\t%s",++test_id,"Attempt to pause from within a worker thread, and then destroy the pool."); 332 | LOG("\t%s","Constructing a thread pool."); 333 | ThreadPool pool; 334 | LOG("\t\tDone.\tNote: Pool has %u worker threads.", pool.get_concurrency()); 335 | LOG("\t%s", "Scheduling a few tasks, including a pausing task."); 336 | { 337 | std::unique_lock guard (mtx); 338 | one_is_active = false; 339 | alive_count = 0; 340 | for (unsigned n = 0; n < 16; ++n) 341 | pool.schedule([&pool](void) { stay_active(pool); }); 342 | pool.schedule([&pool](void) { stay_active(pool); pool.halt(); pool.halt(); pool.halt(); }); 343 | cv.wait(guard, [](void)->bool { return one_is_active; }); 344 | } 345 | LOG("\t\t%s","Done. Tasks scheduled successfully."); 346 | LOG("\t\t%s","Done. Waiting for a bit second..."); 347 | std::this_thread::sleep_for(std::chrono::milliseconds(250)); 348 | LOG("\t%s", "Destroying the thread pool."); 349 | } 350 | LOG("%s", "Exiting..."); 351 | return logged_errors; 352 | } 353 | 354 | -------------------------------------------------------------------------------- /threadpool.cpp: -------------------------------------------------------------------------------- 1 | /// \file threadpool.cpp 2 | /// \brief Implements `threadpool.hpp`. 3 | /// \author Nathaniel J. 
McClatchey, PhD
4 | /// \copyright Copyright (c) 2017-2019 Nathaniel J. McClatchey, PhD. \n
5 | /// Licensed under the MIT license. \n
6 | /// You should have received a copy of the license with this software.
7 | /// \note To compile for MinGW-w64 without linking against the *winpthreads*
8 | /// library, use the [*MinGW Windows STD Threads* library](https://github.com/meganz/mingw-std-threads "MinGW STD Threads").
9 | #include "threadpool.hpp"
10 | 
11 | #if !defined(__cplusplus)
12 | #error The implementation of ThreadPool requires C++11 or higher.
13 | #endif
14 | 
15 | // Debugging:
16 | #include <cassert>              // Fail deadly on internal library error.
17 | #ifndef NDEBUG
18 | #include                       // Warn on task queue overflow.
19 | #endif
20 | // Memory management (for allocate-once approach):
21 | #include <cstdlib>              // For std::malloc and std::free.
22 | #include <memory>               // For std::align and std::unique_ptr.
23 | #if (__cplusplus >= 201703L) && !defined(THREAD_POOL_FALSE_SHARING_ALIGNMENT)
24 | #include <new>                  // Used to detect cache size.
25 | #endif
26 | // Integers:
27 | #include <cstdint>              // Fixed-width integer types.
28 | #include <atomic>               // Relaxed memory orderings, for efficiency.
29 | #include <limits>               // Type sizes and maximum values.
30 | // Central queue management:
31 | #include <algorithm>            // Delayed-task sorting.
32 | #include <vector>               // Delayed-task storage.
33 | #include <deque>                // For central task queue.
34 | // Miscellaneous type information:
35 | #include <type_traits>          // Detect conditions needed for noexcept.
36 | #include <utility>              // For std::declval
37 | 
38 | // Threading facilities:
39 | #if (!defined(__MINGW32__) || defined(_GLIBCXX_HAS_GTHREADS))
40 | #include <thread>               // For threads. Duh.
41 | #include <mutex>                // For locking of central queue.
42 | #include <condition_variable>   // Let threads sleep instead of spin when idle.
43 | #else
44 | // This toolchain-specific workaround allows ThreadPool to be used with
45 | // MinGW-w64 even without linking the winpthreads library. If you lack these
46 | // headers, you can find them at https://github.com/nmcclatchey/mingw-std-threads .
47 | #include "mingw.thread.h"
48 | #include "mingw.mutex.h"
49 | #include "mingw.condition_variable.h"
50 | #endif
51 | 
52 | namespace {
53 | #ifdef THREAD_POOL_FALSE_SHARING_ALIGNMENT
54 | // If a user has supplied a false-sharing alignment, use it.
55 | constexpr std::size_t kFalseSharingAlignment = THREAD_POOL_FALSE_SHARING_ALIGNMENT;
56 | #elif defined(__cpp_lib_hardware_interference_size) && (__cpp_lib_hardware_interference_size >= 201703L)
57 | constexpr std::size_t kFalseSharingAlignment = std::hardware_destructive_interference_size;
58 | #else
59 | // No hints? Use a typical cache line size.
60 | constexpr std::size_t kFalseSharingAlignment = 64;
61 | #endif
62 | // Forward-declarations
63 | struct Worker;
64 | struct ThreadPoolImpl;
65 | 
66 | /// \brief Determines the capacity of each `Worker`'s queue. Larger values take
67 | /// more memory, but less processing power. The reverse holds for smaller
68 | /// values.
69 | /// \note Must be positive.
70 | constexpr std::uint_fast8_t kLog2Modulus = 12u;
71 | 
72 | static_assert(kLog2Modulus > 0, "Worker thread capacity must be positive.");
73 | 
74 | constexpr std::uint_fast32_t kModulus = 1ull << kLog2Modulus;
75 | 
76 | static_assert(kLog2Modulus < std::numeric_limits<std::uint_fast32_t>::digits, "Worker thread capacity must not be excessive.");
77 | 
78 | /// \brief Least-significant bit of an integer. Useful for alignment of arrays,
79 | /// because an alignment greater than the L.S.B. of the size of an element
80 | /// will be ruined on increment.
81 | template 82 | constexpr Integer lsb (Integer x) noexcept 83 | { 84 | return ((x - 1) & x) ^ x; 85 | } 86 | 87 | /// \brief Checks whether and integer is a power-of-2. Useful for alignment 88 | /// debugging. 89 | template 90 | constexpr bool is_pow2 (Integer x) noexcept 91 | { 92 | return ((x - 1) & x) == 0; 93 | } 94 | 95 | /// \brief Checks whether (n1 > n2) || (n1 == 0). Clang optimizes this, while 96 | /// GCC does not (even in 9.0) 97 | template 98 | inline constexpr bool greater_or_zero (Integer n1, Integer n2) noexcept 99 | { 100 | // return (n1 > n2) || (n1 == 0); 101 | static_assert(std::numeric_limits::is_signed == false, 102 | "This optimization depends on using unsigned comparison."); 103 | return (n1 - 1u >= n2); 104 | } 105 | 106 | static_assert(is_pow2(kFalseSharingAlignment), 107 | "Alignments must be integer powers of 2."); 108 | 109 | /// \brief Exactly what it says on the tin. I'd use `std::min`, but that's not 110 | /// `constexpr` until C++14. 111 | template 112 | constexpr typename std::common_type::type min (In1 x, In2 y) noexcept 113 | { 114 | using result_type = decltype(min(x,y)); 115 | return (static_cast(x) < static_cast(y)) ? x : y; 116 | } 117 | 118 | /// \brief Exactly what it says on the tin. I'd use `std::max`, but that's not 119 | /// `constexpr` until C++14. 120 | template 121 | constexpr typename std::common_type::type max (In1 x, In2 y) noexcept 122 | { 123 | using result_type = decltype(max(x,y)); 124 | return (static_cast(x) < static_cast(y)) ? y : x; 125 | } 126 | 127 | /// \brief Determines an alignment that minimizes the number of times that a 128 | /// densely-packed array of `T` would have an instance of `T` straddling a 129 | /// cache-line border. 130 | template 131 | constexpr std::size_t get_align (void) 132 | { 133 | return max(alignof(T), min(lsb(sizeof(T)), kFalseSharingAlignment)); 134 | } 135 | 136 | /// \brief Destructor that allows `std::unique_ptr` to be used with memory 137 | /// acquired using `malloc`. 138 | struct RawDeleter 139 | { 140 | void operator() (void * ptr) const 141 | { 142 | std::free(ptr); 143 | } 144 | }; 145 | 146 | /// \brief Provides O(1) access to the Worker that is handling the current 147 | /// function (if any). Used to provide a fast path for scheduling within the 148 | /// ThreadPool. 
149 | thread_local Worker * current_worker = nullptr; 150 | 151 | struct ThreadPoolImpl 152 | { 153 | using task_type = typename ThreadPool::task_type; 154 | using clock = std::chrono::steady_clock; 155 | using timed_task = std::pair; 156 | using index_type = std::uint_fast16_t; 157 | 158 | ThreadPoolImpl (Worker *, index_type); 159 | ~ThreadPoolImpl (void); 160 | 161 | // Returns number of allocated Workers (may differ from active workers later) 162 | inline index_type get_capacity (void) const noexcept 163 | { 164 | return num_workers_; 165 | } 166 | 167 | inline index_type get_concurrency (void) const noexcept 168 | { 169 | return num_threads_.load(std::memory_order_relaxed); 170 | } 171 | 172 | void halt (void); 173 | void resume (void); 174 | bool is_halted (void) const; 175 | 176 | template 177 | void schedule_overflow (Task &&); 178 | 179 | template 180 | void schedule_after (clock::duration const &, Task &&); 181 | 182 | bool is_idle (void) const; 183 | 184 | inline bool should_stop (void) const noexcept 185 | { 186 | return stop_.load(std::memory_order_relaxed) & 0x01; 187 | } 188 | 189 | 190 | inline void notify_if_idle (void) noexcept 191 | { 192 | if (idle_ > 0) 193 | cv_.notify_one(); 194 | } 195 | inline bool might_have_task (void) const noexcept 196 | { 197 | return !queue_.empty(); 198 | } 199 | // Note: Does no synchronization of its own. 200 | inline bool has_task (void) const noexcept 201 | { 202 | return !queue_.empty(); 203 | } 204 | // Note: Does no synchronization of its own. 205 | inline std::size_t size (void) const noexcept 206 | { 207 | return queue_.size(); 208 | } 209 | // Note: Does no synchronization of its own. 210 | void update_tasks (void) 211 | { 212 | if (time_queue_.empty()) 213 | return; 214 | auto time_now = clock::now(); 215 | 216 | while (time_now >= time_queue_.front().first) 217 | { 218 | // If an exception was thrown, it was thrown in `push`. Because of the strong 219 | // exception-safety guarantee, nothing actually happens. 220 | try { 221 | push(std::move(time_queue_.front().second)); 222 | } catch (std::bad_alloc &) { 223 | return; 224 | } 225 | // The pop_back method for a vector should be non-throwing. 226 | std::pop_heap(time_queue_.begin(), time_queue_.end(), TaskOrder{}); 227 | time_queue_.pop_back(); 228 | 229 | if (time_queue_.empty()) 230 | break; 231 | } 232 | } 233 | // Note: Does no synchronization of its own. 234 | task_type extract_task (void) 235 | { 236 | assert(!queue_.empty() && "Cannot retrieve a task from an empty queue."); 237 | task_type result = std::move(queue_.front()); 238 | queue_.pop_front(); 239 | return result; 240 | } 241 | 242 | /// \par Exception safety 243 | /// Provides the strong (rollback) guarantee, even with move semantics. 244 | template 245 | inline void push (Task && task) 246 | { 247 | queue_.push_back(std::forward(task)); 248 | } 249 | 250 | /// \par Exception safety 251 | /// Provides the strong (rollback) guarantee unless the task can only be moved 252 | /// and has a throwing move constructor. 253 | template 254 | inline void push_at (clock::time_point const & tp, Task && task) 255 | { 256 | time_queue_.push_back(timed_task{tp, std::forward(task)}); 257 | std::push_heap(time_queue_.begin(), time_queue_.end(), TaskOrder{}); 258 | } 259 | 260 | // Note: wait and wait_until don't throw in C++14 and later. 
261 | void wait_for_task (std::unique_lock & lk) 262 | { 263 | assert(lk.mutex() == &mutex_ &&"Incorrect mutex used for synchronization."); 264 | if (time_queue_.empty()) 265 | cv_.wait(lk); 266 | else 267 | cv_.wait_until(lk, time_queue_.front().first); 268 | } 269 | 270 | inline Worker * data (void) noexcept 271 | { 272 | return workers_; 273 | } 274 | private: 275 | struct TaskOrder { 276 | inline bool operator() (timed_task const & lhs, timed_task const & rhs) const noexcept 277 | { 278 | return lhs.first > rhs.first; 279 | } 280 | }; 281 | 282 | std::condition_variable cv_ {}; 283 | mutable std::mutex mutex_ {}; 284 | 285 | std::deque queue_ {}; 286 | std::vector time_queue_ {}; 287 | 288 | Worker * const workers_; 289 | 290 | index_type num_workers_ {0}, 291 | living_ {0}, idle_ {0}, paused_ {0}; 292 | std::atomic num_threads_ {0}; 293 | 294 | std::atomic stop_ {0x00}; 295 | 296 | ThreadPoolImpl (ThreadPoolImpl const &) = delete; 297 | ThreadPoolImpl & operator= (ThreadPoolImpl const &) = delete; 298 | 299 | void stop_threads (std::unique_lock&); 300 | 301 | friend struct Worker; 302 | }; 303 | 304 | // Notes: 305 | // - "front_" is always claimed for the worker. 306 | // - "back_" stores past-the-end markers both for writing and validity. If 307 | // they are unequal, the back is locked. 308 | // - For various reasons, it is possible for the front marker to be between 309 | // the write and valid pte markers. In such a case, the already-claimed task 310 | // may be read, but no further tasks will be read, even if claimed. 311 | struct alignas(kFalseSharingAlignment) Worker 312 | { 313 | using task_type = typename ThreadPool::task_type; 314 | using index_type = std::uint_fast32_t; 315 | 316 | Worker (ThreadPoolImpl &) noexcept; 317 | ~Worker (void); 318 | 319 | void operator() (void); 320 | 321 | bool is_alive (void) const noexcept 322 | { 323 | return thread_.joinable(); 324 | } 325 | 326 | void restart_thread (void) 327 | { 328 | assert(!pool_.should_stop() && "Start or stop new threads. 
Not both."); 329 | if (!thread_.joinable()) // noexcept 330 | { 331 | thread_ = std::thread(std::reference_wrapper(*this)); 332 | pool_.num_threads_.fetch_add(1, std::memory_order_relaxed); 333 | } 334 | } 335 | void stop_thread (void) 336 | { 337 | assert(pool_.should_stop() && "Spurious thread-stopping detected."); 338 | if (thread_.joinable()) // noexcept 339 | { 340 | thread_.join(); 341 | pool_.num_threads_.fetch_sub(1, std::memory_order_relaxed); 342 | } 343 | } 344 | 345 | inline bool belongs_to (ThreadPoolImpl const * ptr) const noexcept 346 | { 347 | return &pool_ == ptr; 348 | } 349 | 350 | inline bool get_paused (void) const noexcept 351 | { 352 | return paused_; 353 | } 354 | 355 | inline void set_paused (bool val) noexcept 356 | { 357 | paused_ = val; 358 | } 359 | 360 | template 361 | bool push (Task && tasks); 362 | 363 | template 364 | bool push_front (Task && tasks); 365 | 366 | index_type count_tasks (void) const noexcept; 367 | 368 | void canibalize (ThreadPoolImpl &); 369 | 370 | private: 371 | Worker (Worker const &) = delete; 372 | Worker & operator= (Worker const &) = delete; 373 | 374 | constexpr static std::size_t kValidShift = std::numeric_limits::digits / 2; 375 | constexpr static index_type kWriteMask = ~(~static_cast(0) << kValidShift); 376 | static_assert(kLog2Modulus <= kValidShift, \ 377 | "ThreadPool's local task queue size exceeds limit of selected index type."); 378 | 379 | inline static constexpr 380 | index_type get_distance (index_type left, index_type right) noexcept 381 | { 382 | return (right - left + kModulus) % kModulus; 383 | } 384 | 385 | inline static constexpr index_type get_valid (index_type b) noexcept 386 | { 387 | return b >> kValidShift; 388 | } 389 | 390 | inline static constexpr index_type get_write (index_type b) noexcept 391 | { 392 | static_assert((kWriteMask >> kValidShift) == 0, "WRITE and VALID regions must not intersect."); 393 | return b & kWriteMask; 394 | } 395 | 396 | inline static constexpr 397 | index_type make_back (index_type write, index_type valid) noexcept 398 | { 399 | return write | (valid << kValidShift); 400 | } 401 | 402 | inline static constexpr index_type make_back (index_type write) noexcept 403 | { 404 | return write | (write << kValidShift); 405 | } 406 | 407 | 408 | unsigned steal (void); 409 | unsigned steal_from (Worker & source) noexcept(std::is_nothrow_destructible::value && std::is_nothrow_move_constructible::value); 410 | bool pop (task_type & task) noexcept(std::is_nothrow_destructible::value && std::is_nothrow_move_assignable::value); 411 | unsigned push_front(ThreadPoolImpl &, unsigned number); 412 | bool execute (void); 413 | void refresh_tasks (ThreadPoolImpl &, unsigned number); 414 | 415 | /// \brief Activates a task slot within the queue, and fills it appropriately. 416 | template 417 | void place_task (index_type location, Task && task) 418 | noexcept(std::is_nothrow_constructible::value) 419 | { 420 | static_assert(std::is_trivially_destructible::value, 421 | "Implicit destruction is used here, and thus is required here."); 422 | new(std::addressof(tasks_[location].task_)) task_type(std::forward(task)); 423 | } 424 | /// \brief Deactivates a task slot, and returns what was inside before the 425 | /// deactivation. 426 | task_type remove_task (index_type location) 427 | noexcept(std::is_nothrow_destructible::value) 428 | { 429 | task_type result = std::move(tasks_[location].task_); 430 | tasks_[location].task_.~task_type(); 431 | // Set the new active member of the union. Should be a no-op. 
432 | static_assert(std::is_trivial::value, 433 | "The default value for implicit optional values must be trivial."); 434 | tasks_[location].empty_ = OptionalTask::Empty(); 435 | return result; 436 | } 437 | 438 | template 439 | void remove_all_and (Func const &); 440 | 441 | // These store information about the current state of the deque. 442 | // - front_ is modified only by the Worker's own thread. Reads and writes 443 | // must be atomic, however, to avoid torn writes. 444 | // - back_ is potentially modified by all threads. The top and bottom halves 445 | // store a past-the-end (PTE) marker for the occupied slots, and a PTE marker 446 | // for the slots this Worker is permitted to read, respectively. 447 | std::atomic front_ {0}, back_ {0}; 448 | // When this Worker runs out of tasks, it will search for more. A central 449 | // ThreadPool object will serve to coordinate work-stealing (that is, store the 450 | // addresses of other Workers), provide new tasks, and capture overflow should 451 | // a Worker generate more tasks than can fit in its deque. 452 | ThreadPoolImpl & pool_; 453 | // To avoid starvation for tasks in the overflow queue, I pull in its tasks 454 | // once every time a worker finishes a batch of tasks. The variable countdown_ 455 | // records the remaining size of the batch. A successfully scheduled subtask 456 | // will increment this to ensure the originally scheduled tasks are completed 457 | // as part of the batch. 458 | static_assert(kLog2Modulus < std::numeric_limits::digits - 2, 459 | "The behavior of the worker queue's starvation-avoidance algorithm has not yet \ 460 | been examined in the case that the countdown variable is small relative to the \ 461 | task-queue."); 462 | std::uint_fast32_t countdown_; 463 | // While a task is being executed, the front_ marker is not incremented. This 464 | // avoids early claiming of a new task (which would prevent that task from 465 | // being stolen), but makes the push-to-front process a bit more complicated. 466 | // In particular, the push-to-front should overwrite the front when first 467 | // called during an execution, but not afterward. 468 | bool front_invalid_; 469 | bool paused_; 470 | // Need to keep the thread's handle for later joining. I could work around 471 | // this, but the workaround would be less efficient. 472 | std::thread thread_ {}; 473 | // Task queue. When information about the cache is available, allocate so 474 | // that tasks aren't split across cache lines. Note: If splitting is 475 | // inevitable, make a best-effort attempt to reduce it. 476 | union OptionalTask { 477 | struct Empty {} empty_; 478 | task_type task_; 479 | 480 | OptionalTask (void) noexcept : empty_() {} 481 | ~OptionalTask (void) noexcept {} 482 | }; 483 | alignas(get_align()) OptionalTask tasks_ [kModulus]; 484 | }; 485 | 486 | Worker::Worker (ThreadPoolImpl & pool) noexcept 487 | : pool_(pool), 488 | countdown_(2), front_invalid_(false), paused_(false) 489 | { 490 | } 491 | 492 | // Only called after all workers have stopped. 493 | Worker::~Worker (void) 494 | { 495 | // If this assert fails, either synchronization wasn't performed, or a task 496 | // is actively running. Either way, the code would need a fix. 497 | assert(!front_invalid_ && "Attempting to destroy a running worker!"); 498 | 499 | // Remove tasks without using them in any way. 500 | remove_all_and([](task_type&&) noexcept {}); 501 | } 502 | 503 | // Removes each task from a Worker and applies func to it. 
Note: Must 504 | // not be called before the Worker's thread is fully stopped. 505 | /// \note Has exactly one possibly-throwing statement. 506 | template 507 | void Worker::remove_all_and (Func const & func) 508 | { 509 | index_type back = back_.load(std::memory_order_relaxed); 510 | 511 | // For safety, block stealing during this. Note: Won't block the worker that 512 | // is being destroyed. 513 | do { 514 | back = make_back(get_valid(back)); 515 | } while (!back_.compare_exchange_weak(back, make_back(1, 0), 516 | std::memory_order_acquire, std::memory_order_relaxed)); 517 | 518 | // If the worker is running a task, something is VERY wrong. 519 | assert(!front_invalid_ && "The worker is still running a task!"); 520 | 521 | back = get_valid(back); 522 | 523 | index_type front = front_.load(std::memory_order_acquire); 524 | // Ensure a consistent state, in the event of an exception. 525 | struct RAIIHelper 526 | { 527 | decltype(back_) & back_ref; 528 | index_type value; 529 | ~RAIIHelper (void) 530 | { 531 | back_ref.store(value, std::memory_order_release); 532 | } 533 | } raii_helper { back_, back }; 534 | while (front != raii_helper.value) 535 | { 536 | raii_helper.value = (raii_helper.value - 1 + kModulus) % kModulus; 537 | // Possibly-throwing: 538 | func(remove_task(raii_helper.value)); 539 | } 540 | } 541 | 542 | // Work-stealing will occur as follows: 543 | // 1. Determine the exact number of tasks that can be added to this queue. 544 | // - Note: Though stealing only occurs when the queue is empty, it can be 545 | // empty because of another process performing work-stealing. 546 | // - Note: This value need not be refreshed, as it can only decrease. This 547 | // is because only the Worker's thread will be allowed to add items to its 548 | // queue. 549 | // 2. Estimate the number of items in the the source queue. 550 | // - If the queue is already being edited, giving up is an option. The 551 | // worker will come back later or try a different source queue, effectively 552 | // creating a spin-lock. 553 | // 3. Set source's write to write - (items - stolen). Do not change valid. 554 | // - When write != valid, the write-head is locked. Moreover, reading 555 | // should not occur if read is in the interval [write, valid]. 556 | // 4. Check whether source's read is in the interval [write, valid]. If it is, 557 | // then the current interval is contended. Go to step 2. 558 | // 5. Now that the write interval is locked, copy to the front (reading side) 559 | // of this thread's queue. This is safe because only this thread affects this 560 | // part of the queue. 561 | // 6. Set source's VALID equal to its WRITE to unlock that part of the queue. 562 | 563 | // Steals approximately [available] / [divisor] tasks from source, if 564 | // possible. Returns number of successfully stolen tasks. 565 | /// \note noexcept if place_task and remove_task are both noexcept. 566 | unsigned Worker::steal_from (Worker & source) 567 | noexcept(std::is_nothrow_destructible::value && std::is_nothrow_move_constructible::value) 568 | { 569 | static constexpr unsigned kDivisor = 4; 570 | index_type this_front, this_back, writeable, stolen, 571 | source_front, source_back, source_valid, source_write; 572 | // Worker::steal_from may only be called from the Worker's owned thread. 
573 | assert(std::this_thread::get_id() == thread_.get_id() && "Worker::steal_from may only be called from the Worker's own thread."); 574 | assert(this != &source && "Worker may not steal from itself."); 575 | assert(!front_invalid_ && "Worker cannot steal while it is performing a task."); 576 | 577 | this_front = front_.load(std::memory_order_relaxed); 578 | this_back = back_.load(std::memory_order_acquire); 579 | 580 | writeable = get_distance(get_valid(this_back), this_front - 1); 581 | if (writeable == 0) 582 | return 0; 583 | 584 | // Maximum number of times to attempt to lock the victim before giving up. 585 | std::uint_fast8_t spins = 64; 586 | // Lock the source queue, reserving several tasks to steal. 587 | source_back = source.back_.load(std::memory_order_relaxed); 588 | do { 589 | source_valid = get_valid(source_back); 590 | // Already locked. Better to give up immediately, and try a different victim. 591 | if (source_valid != get_write(source_back)) 592 | return 0; 593 | source_front = source.front_.load(std::memory_order_relaxed); 594 | // Stolen is actually from WRITE, but WRITE and VALID are identical. 595 | index_type valid = get_distance(source_front, source_valid); 596 | // Must not attempt to claim the current front pointer, so require at least 2 597 | // items in source queue. 598 | if (valid < 2) 599 | return 0; 600 | stolen = min((valid + kDivisor - 2) / kDivisor, writeable); 601 | source_write = (source_valid - stolen + kModulus) % kModulus; 602 | 603 | if (source.back_.compare_exchange_weak(source_back, 604 | make_back(source_write, source_valid), 605 | std::memory_order_acq_rel, 606 | std::memory_order_relaxed)) 607 | break; 608 | // Spun too long. Better to try a different victim than lock forever. 609 | if (--spins == 0) 610 | return 0; 611 | } while (true); 612 | // Now that the lock has been acquired, read may advance at most one more 613 | // time. That is, simply ensuring that READ < WRITE will suffice to ensure 614 | // correct behavior. Unfortunately, the READ may already be in the claim. Only 615 | // READ <= VALID is certain until we enforce it. 616 | // Note that by including the one-increment error margin, the following 617 | // adjustment needs to be run at most once. 618 | { 619 | source_front = source.front_.load(std::memory_order_acquire); 620 | index_type valid = get_distance(source_front, source_valid); 621 | if (valid < 2) // Unlock. There aren't any unclaimed tasks to steal. 622 | { 623 | source.back_.store(make_back(source_valid)); 624 | return 0; 625 | } 626 | 627 | index_type readable = get_distance(source_front, source_write); 628 | // Even if READ <= VALID, (that is, normal behavior), if READ == WRITE then 629 | // we must increment WRITE as READ may be incremented during the write phase. 630 | if (greater_or_zero(readable, valid)) 631 | { 632 | stolen = (valid + kDivisor - 2) / kDivisor; 633 | // Thief's number of held tasks can only be reduced since last check, so 634 | // there is no reason to double-check whether thief can hold the tasks. 635 | source_write = (source_valid - stolen + kModulus) % kModulus; 636 | // This store is optional. It allows the victim queue to keep executing 637 | // while memory is copied. 
638 | source.back_.store(make_back(source_write, source_valid), 639 | std::memory_order_relaxed); 640 | } 641 | } 642 | 643 | #ifndef NDEBUG 644 | assert(source_write != source_valid); 645 | auto test_front = source.front_.load(std::memory_order_relaxed); 646 | assert(get_distance(test_front, source_write) <= get_distance(test_front, source_valid)); 647 | #endif 648 | do { 649 | source_valid = (source_valid - 1 + kModulus) % kModulus; 650 | this_front = (this_front - 1 + kModulus) % kModulus; 651 | place_task(this_front, source.remove_task(source_valid)); 652 | } while (source_valid != source_write); 653 | 654 | front_.store(this_front, std::memory_order_release); 655 | source.back_.store(make_back(source_valid), std::memory_order_release); 656 | return stolen; 657 | } 658 | 659 | // Removes a task from the front of the queue, if possible. Returns true or 660 | // false for success or failure, respectively. 661 | bool Worker::pop (task_type & task) 662 | noexcept(std::is_nothrow_destructible::value && 663 | std::is_nothrow_move_assignable::value) 664 | { 665 | //assert(std::this_thread::get_id() == thread_.get_id() && "Worker::pop may only be called from the Worker's own thread."); 666 | 667 | auto front = front_.load(std::memory_order_relaxed); 668 | auto back = back_.load(std::memory_order_acquire); 669 | 670 | auto readable = get_distance(front, get_write(back)); 671 | // Two circumstances can prevent reading: Either there is nothing to read, or 672 | // the current location is claimed. Even once the claim is resolved, there may 673 | // or may not be something to read. 674 | if (greater_or_zero(readable, get_distance(front, get_valid(back)))) 675 | return false; 676 | 677 | auto new_front = (front + 1) % kModulus; 678 | if (!front_invalid_) 679 | { 680 | task = remove_task(front); 681 | front_.store(new_front, std::memory_order_relaxed); 682 | // I need to release back_ so that the write to front_ is visible to thieves. 683 | back_.fetch_or(0, std::memory_order_release); 684 | return true; 685 | } 686 | else if (readable > 1) 687 | { 688 | front_.store(new_front, std::memory_order_relaxed); 689 | back = back_.fetch_or(0, std::memory_order_acq_rel); 690 | if (greater_or_zero(get_distance(new_front, get_write(back)), get_distance(new_front, get_valid(back)))) 691 | { 692 | // By the time we advanced the pointer, the task we intended to read was 693 | // already removed. Don't read it. 694 | front_invalid_ = false; 695 | return false; 696 | } 697 | else 698 | { 699 | task = remove_task(new_front); 700 | return true; 701 | } 702 | } 703 | else 704 | return false; 705 | } 706 | 707 | // Removes, then performs the task at the front of the queue. Claims the next 708 | // task after performing the current one. 709 | bool Worker::execute (void) 710 | { 711 | assert(!front_invalid_ && "Can't execute a task while already executing a different task."); 712 | assert(std::this_thread::get_id() == thread_.get_id() && "Worker::execute may only be called from the Worker's own thread."); 713 | 714 | auto front = front_.load(std::memory_order_relaxed); 715 | auto back = back_.load(std::memory_order_acquire); 716 | 717 | auto readable = get_distance(front, get_write(back)); 718 | // Two circumstances can prevent reading: Either there is nothing to read, or 719 | // the current location is claimed. Even once the claim is resolved, there may 720 | // or may not be something to read. 
721 | if (greater_or_zero(readable, get_distance(front, get_valid(back)))) 722 | return false; 723 | 724 | // Will ensure that the queue is restored to validity, even in the event of 725 | // an exception. 726 | struct Reservation 727 | { 728 | Reservation (Worker & worker) noexcept 729 | : worker_(worker) 730 | { 731 | worker_.front_invalid_ = true; 732 | } 733 | ~Reservation (void) 734 | { 735 | if (worker_.front_invalid_) 736 | { 737 | worker_.front_invalid_ = false; 738 | auto new_front = worker_.front_.load(std::memory_order_relaxed); 739 | worker_.front_.store((new_front+1)%kModulus, std::memory_order_relaxed); 740 | // I need to release back_ so that the write to front_ is visible to thieves. 741 | worker_.back_.fetch_or(0, std::memory_order_release); 742 | } 743 | } 744 | Reservation (Reservation const &) = delete; 745 | Reservation & operator= (Reservation const &) = delete; 746 | private: 747 | Worker & worker_; 748 | } reservation {*this}; 749 | // Potentially-throwing. 750 | task_type task = remove_task(front); 751 | // Potentially-throwing. 752 | task(); 753 | /// \todo Find a good way to unify this with the other validation. 754 | // If the slot was not already overwritten (eg. by the task pushing to the 755 | // task-queue), need to adjust the queue size. 756 | return true; 757 | } 758 | 759 | // Pulls some tasks into the local queue from the central queue, and returns 760 | // others. 761 | void Worker::refresh_tasks (ThreadPoolImpl & tasks, unsigned number) 762 | { 763 | unsigned num_pushed = push_front(tasks, number); 764 | if (num_pushed == 0) 765 | { 766 | auto cnt = tasks.size(); 767 | if (number > cnt) 768 | number = static_cast(cnt); 769 | task_type task; 770 | 771 | for (; number && pop(task); ++num_pushed, --number) 772 | tasks.push(std::move(task)); 773 | push_front(tasks, num_pushed); 774 | } 775 | } 776 | 777 | // Feeds all existing tasks to the ThreadPool. Used as a last resort. 778 | void Worker::canibalize (ThreadPoolImpl & tasks) 779 | { 780 | do { 781 | task_type task; 782 | if (pop(task)) 783 | tasks.push(std::move(task)); 784 | else 785 | { 786 | auto front = front_.load(std::memory_order_relaxed); 787 | auto back = back_.load(std::memory_order_relaxed); 788 | // If the queue is fully-depleted, our job is done. Otherwise, we need to 789 | // keep trying. 790 | if ((get_write(back) == get_valid(back)) && (get_valid(back) == front)) 791 | break; 792 | else 793 | std::this_thread::yield(); 794 | } 795 | } while (true); 796 | } 797 | 798 | // Pushes a task onto the back of the queue, if possible. If the back of the 799 | // queue is in contention, (eg. because of work stealing), pushes onto the 800 | // front of the queue instead. 801 | // Note: Only evaluates the task reference if there is room to insert the 802 | // task. 803 | /// \par Exception safety 804 | /// *Strong*: If an exception is thrown, the function has no effect. 805 | /// Applies only if `place_task()` also provides the strong guarantee. 
806 | template 807 | bool Worker::push (Task && task) 808 | { 809 | assert(std::this_thread::get_id() == thread_.get_id() && "Worker::push may only be called from the Worker's owned thread."); 810 | 811 | auto front = front_.load(std::memory_order_relaxed); 812 | auto back = back_.load(std::memory_order_acquire); 813 | 814 | auto valid = get_valid(back); 815 | if (((front - valid + kModulus) % kModulus) == 1) 816 | return false; 817 | 818 | index_type write = get_write(back); 819 | index_type new_back = (write + 1) % kModulus; 820 | index_type expected = make_back(write); 821 | if (back_.compare_exchange_strong(expected, make_back(write, new_back), 822 | std::memory_order_acquire, 823 | std::memory_order_relaxed)) 824 | { 825 | struct RAIIHelper 826 | { 827 | decltype(back_) & back_ref; 828 | index_type value; 829 | ~RAIIHelper (void) 830 | { 831 | back_ref.store(value, std::memory_order_release); 832 | } 833 | } raii_helper { back_, back }; 834 | place_task(write, std::forward(task)); // May throw. 835 | raii_helper.value = make_back(new_back); 836 | } 837 | else 838 | { 839 | write = front; 840 | front = (front - 1 + kModulus) % kModulus; 841 | if (!front_invalid_) 842 | write = front; 843 | place_task(write, std::forward(task)); 844 | front_.store(front, std::memory_order_release); 845 | } 846 | 847 | pool_.notify_if_idle(); 848 | return true; 849 | } 850 | 851 | // Places a new task at the front of the queue. Note that this skirts anti- 852 | // starvation precautions. 853 | // Note: Only evaluates the task reference if there is room to insert the 854 | // task. 855 | /// \par Exception safety 856 | /// *Strong*: If an exception is thrown, the function has no effect. 857 | /// Applies only if `place_task()` also provides the strong guarantee. 858 | template 859 | bool Worker::push_front (Task && task) 860 | { 861 | assert(std::this_thread::get_id() == thread_.get_id() && "Worker::push_front may only be called from the Worker's owned thread."); 862 | 863 | index_type front = front_.load(std::memory_order_relaxed); 864 | index_type back = back_.load(std::memory_order_acquire); 865 | 866 | if ((front - get_valid(back) + kModulus) % kModulus == 1) 867 | return false; 868 | index_type write = front; 869 | front = (front - 1 + kModulus) % kModulus; 870 | 871 | // Potentially-throwing 872 | place_task(front_invalid_ ? write : front, std::forward(task)); 873 | 874 | front_.store(front, std::memory_order_release); 875 | 876 | // Delay lower-level (central) queue from being accessed, to fully support 877 | // depth-first traversal of task tree. 878 | ++countdown_; 879 | pool_.notify_if_idle(); 880 | return true; 881 | } 882 | 883 | // Places multiple new tasks at the front of the queue. Note that this skirts 884 | // anti-starvation precautions. 885 | unsigned Worker::push_front (ThreadPoolImpl & tasks, unsigned number) 886 | { 887 | assert(std::this_thread::get_id() == thread_.get_id() && "Worker::push_front may only be called from the Worker's owned thread."); 888 | if (!tasks.has_task()) 889 | return 0; 890 | 891 | index_type front = front_.load(std::memory_order_relaxed); 892 | index_type back = back_.load(std::memory_order_acquire); 893 | 894 | auto written = (front - get_valid(back) - 1 + kModulus) % kModulus; 895 | if (number < written) 896 | written = number; 897 | if (written == 0) 898 | return 0; 899 | 900 | // In C++, bool converts implicitly to 0 (false) or 1 (true). 
901 | front += front_invalid_; 902 | auto n = written; 903 | do { 904 | front = (front - 1 + kModulus) % kModulus; 905 | place_task(front, tasks.extract_task()); 906 | if (!tasks.has_task()) 907 | { 908 | written -= n - 1; 909 | break; 910 | } 911 | } while (--n); 912 | front = (front - front_invalid_ + kModulus) % kModulus; 913 | front_.store(front, std::memory_order_release); 914 | return written; 915 | } 916 | 917 | // Returns an estimate of the number of tasks currently in the queue. 918 | typename Worker::index_type Worker::count_tasks (void) const noexcept 919 | { 920 | index_type front = front_.load(std::memory_order_relaxed); 921 | index_type back = back_.load(std::memory_order_relaxed); 922 | return get_distance(front, get_valid(back)); 923 | } 924 | 925 | // Attempts to steal work from other worker threads in the same pool. 926 | unsigned Worker::steal (void) 927 | { 928 | unsigned num_workers = pool_.get_capacity(); 929 | auto randomizer = front_.load(std::memory_order_relaxed); 930 | unsigned source = static_cast(randomizer); 931 | unsigned stolen_count = 0; 932 | for (auto n = num_workers; n--;) { 933 | source = (source + 1) % num_workers; 934 | Worker * victim = pool_.data() + source; 935 | if (victim == this) 936 | continue; 937 | stolen_count += steal_from(*victim); 938 | if (stolen_count > 0) 939 | break; 940 | } 941 | return stolen_count; 942 | } 943 | 944 | // Performs a loop of the form execute-steal-check_central_queue-repeat. 945 | // Sleeps if no work is available in this and other queues. 946 | void Worker::operator() (void) 947 | { 948 | static constexpr std::uint_fast32_t kPullFromQueue = 1 + (kModulus - 1) / 32; 949 | index_type last_size = 0; 950 | // This thread-local variable allows O(1) scheduling (allows pushing directly 951 | // to the local task queue). 952 | current_worker = this; 953 | using mutex_type = decltype(pool_.mutex_); 954 | mutex_type & mutex = pool_.mutex_; 955 | 956 | { 957 | std::unique_lock guard(mutex); 958 | ++pool_.living_; 959 | guard.unlock(); 960 | pool_.cv_.notify_all(); 961 | } 962 | // The thread is started after all workers are initialized; no need to wait. 963 | 964 | while (true) 965 | { 966 | if (--countdown_ == 0) 967 | { 968 | auto task_count = count_tasks(); 969 | index_type size = (task_count <= kModulus / 16) ? task_count * 16 : kModulus - 1; 970 | countdown_ = size + 2; 971 | 972 | // Periodically check whether the program is trying to destroy the pool. 973 | if (pool_.should_stop()) 974 | goto kill; 975 | 976 | if (mutex.try_lock()) 977 | { 978 | std::lock_guard guard (mutex, std::adopt_lock); 979 | pool_.update_tasks(); 980 | if (!pool_.has_task()) 981 | { 982 | // If the queue size has stabilized, it's likely that all tasks are waiting 983 | // on something (and thus continually re-adding themselves). Shake things up a 984 | // bit by re-shuffling tasks. 985 | if (size == last_size) 986 | size += steal(); 987 | last_size = size; 988 | continue; 989 | } 990 | refresh_tasks(pool_, (kPullFromQueue + 3) / 4); 991 | countdown_ += kPullFromQueue / 2; 992 | } 993 | else 994 | { 995 | // If the queue size has stabilized, it's probably full of infinite loops. 996 | if (size == last_size) 997 | countdown_ = 4; 998 | } 999 | last_size = size; 1000 | } 1001 | // Second, check for (and perform) any tasks in this thread's queue. 1002 | if (execute()) 1003 | continue; 1004 | // Make sure we don't exhaust the full queue when an exit is desired. 
1005 | if (pool_.should_stop()) 1006 | goto kill; 1007 | // Third, check whether there are common tasks available. This will also 1008 | // serve to jump-start the worker. 1009 | // Testing whether the task queue is empty may give an incorrect result, 1010 | // due to lack of synchronization, but is still a fast and easy test. 1011 | if (pool_.might_have_task() && mutex.try_lock()) 1012 | { 1013 | std::lock_guard guard (mutex, std::adopt_lock); 1014 | pool_.update_tasks(); 1015 | unsigned count = push_front(pool_, kPullFromQueue); 1016 | if (count > 0) 1017 | { 1018 | // If our new tasks are already from the queue, no need to refresh. 1019 | countdown_ += kPullFromQueue;//count; 1020 | continue; 1021 | } 1022 | } 1023 | // Fourth, try work stealing. 1024 | if (steal() > 0) 1025 | continue; 1026 | 1027 | // Fifth, wait a bit for something to change... 1028 | auto num_workers = pool_.get_capacity(); 1029 | bool should_idle = (count_tasks() == 0); 1030 | for (auto n = num_workers; n-- && should_idle;) 1031 | should_idle = (pool_.workers_[n].count_tasks() < 2); 1032 | if (should_idle && mutex.try_lock()) 1033 | { 1034 | std::unique_lock guard (mutex, std::adopt_lock); 1035 | if (pool_.should_stop()) 1036 | goto kill; 1037 | pool_.update_tasks(); 1038 | if (!pool_.has_task()) 1039 | { 1040 | ++pool_.idle_; 1041 | pool_.wait_for_task(guard); 1042 | --pool_.idle_; 1043 | if (pool_.should_stop()) 1044 | goto kill; 1045 | pool_.update_tasks(); 1046 | } 1047 | push_front(pool_, kPullFromQueue); 1048 | // If our new tasks are already from the queue, no need to refresh. 1049 | countdown_ += kPullFromQueue; 1050 | } 1051 | } 1052 | kill: 1053 | current_worker = nullptr; 1054 | { 1055 | std::unique_lock guard (mutex); 1056 | --pool_.living_; 1057 | guard.unlock(); 1058 | pool_.cv_.notify_all(); 1059 | } 1060 | } 1061 | 1062 | 1063 | 1064 | //////////////////////////////////////////////////////////////////////////////// 1065 | // ThreadPoolImpl // 1066 | //////////////////////////////////////////////////////////////////////////////// 1067 | 1068 | ThreadPoolImpl::ThreadPoolImpl (Worker * workers, index_type num_workers) 1069 | : workers_(workers), num_workers_(num_workers) 1070 | { 1071 | assert(num_workers > 0); 1072 | std::unique_lock guard (mutex_); 1073 | 1074 | // Construct the workers, after some safety-checks. 1075 | static_assert(std::is_nothrow_constructible::value,\ 1076 | "This loop is only exception-safe if Worker construction is non-throwing"); 1077 | for (index_type i = 0; i < get_capacity(); ++i) 1078 | new(workers_ + i) Worker(*this); 1079 | // Start the threads only after all initialization is complete. The Worker's 1080 | // loop will need no further synchronization for safe use. 1081 | // Note that a worker without an initialized thread will simply do nothing, 1082 | // because the threads are responsible for populating themselves with tasks. 1083 | std::exception_ptr eptr; 1084 | for (index_type i = 0; i < get_capacity(); ++i) 1085 | { 1086 | try { 1087 | workers_[i].restart_thread(); 1088 | } catch (std::system_error &) { 1089 | eptr = std::current_exception(); 1090 | } 1091 | } 1092 | // If no threads were able to start, give a meaningful error regarding why. 1093 | // However, if at least one thread was able to start, the ThreadPool will 1094 | // function properly. 1095 | if (get_concurrency() == 0) 1096 | std::rethrow_exception(eptr); 1097 | // Wait for the pool to be fully populated to ensure no weird behaviors. 
1098 | cv_.wait(guard, [this](void)->bool { 1099 | return (living_ == get_concurrency()) || should_stop(); 1100 | }); 1101 | } 1102 | 1103 | ThreadPoolImpl::~ThreadPoolImpl (void) 1104 | { 1105 | #ifndef NDEBUG 1106 | Worker * p_worker = current_worker; 1107 | if ((p_worker != nullptr) && p_worker->belongs_to(this)) 1108 | { 1109 | std::printf("ERROR!\tA worker thread may not destroy the ThreadPool to \ 1110 | which it belongs.\n"); 1111 | std::abort(); 1112 | } 1113 | #endif 1114 | std::unique_lock guard (mutex_); 1115 | stop_.store(0x05, std::memory_order_relaxed); 1116 | if (paused_ > 0) 1117 | { 1118 | // If the pool is in a "paused" state, it might be the case that one thread 1119 | // is still alive (and waiting for an "unpause" signal). Wake it up... 1120 | cv_.notify_all(); 1121 | cv_.wait(guard, [this](void)->bool { 1122 | return (living_ == 0); 1123 | }); 1124 | for (auto i = get_capacity(); i--;) 1125 | workers_[i].stop_thread(); 1126 | } else 1127 | stop_threads(guard); 1128 | 1129 | for (auto i = get_capacity(); i--;) 1130 | workers_[i].~Worker(); 1131 | } 1132 | 1133 | // Note: Because of the mutex, can be called from any thread at any time. 1134 | void ThreadPoolImpl::stop_threads (std::unique_lock & guard) 1135 | { 1136 | if (idle_ > 0) 1137 | cv_.notify_all(); 1138 | cv_.wait(guard, [this](void)->bool { 1139 | return (living_ == paused_) || !should_stop(); 1140 | }); 1141 | if (should_stop()) { 1142 | // At this point, all threads are either dead (need to be joined) or paused 1143 | // (must not be joined). Take action appropriately. 1144 | // Note that if multiple threads are paused simultaneously, they all reach 1145 | // this point (one at a time, though) 1146 | for (auto i = get_capacity(); i--;) 1147 | { 1148 | if (!workers_[i].get_paused()) 1149 | workers_[i].stop_thread(); 1150 | } 1151 | } 1152 | } 1153 | 1154 | void ThreadPoolImpl::halt (void) 1155 | { 1156 | std::unique_lock guard (mutex_); 1157 | // Note: Bit 0x04 is used to indicate that the destructor is ongoing. Do not 1158 | // interfere with it. 1159 | if (stop_.load(std::memory_order_relaxed) & 0x04) 1160 | return; 1161 | stop_.store(0x03, std::memory_order_relaxed); 1162 | Worker * p_worker = current_worker; 1163 | if ((p_worker != nullptr) && p_worker->belongs_to(this)) 1164 | { 1165 | p_worker->set_paused(true); 1166 | ++paused_; 1167 | } 1168 | stop_threads(guard); 1169 | // If the caller is part of the pool, block execution until unpaused. 1170 | if ((p_worker != nullptr) && p_worker->belongs_to(this)) 1171 | { 1172 | cv_.wait(guard, [this] (void) -> bool { 1173 | return (stop_.load(std::memory_order_relaxed) & 0x02) == 0; 1174 | }); 1175 | p_worker->set_paused(false); 1176 | --paused_; 1177 | } 1178 | } 1179 | 1180 | // Note: Because of the mutex, can call from any thread at any time. 1181 | void ThreadPoolImpl::resume (void) 1182 | { 1183 | std::unique_lock guard (mutex_); 1184 | 1185 | assert(living_ >= paused_); 1186 | // Note: Bit 0x04 will be used to indicate attempted destruction. Do not 1187 | // interfere. 1188 | if (stop_.load(std::memory_order_relaxed) & 0x04) 1189 | return; 1190 | 1191 | stop_.store(0x00, std::memory_order_relaxed); 1192 | cv_.notify_all(); // noexcept 1193 | 1194 | std::exception_ptr eptr; 1195 | for (unsigned i = 0; i < get_capacity(); ++i) 1196 | { 1197 | try { 1198 | workers_[i].restart_thread(); 1199 | } catch (std::system_error &) { 1200 | // Whenever a thread fails to start, remove all the tasks it would otherwise 1201 | // need to consume. 
This will prevent those tasks from becoming unreachable. 1202 | if (!workers_[i].is_alive()) 1203 | workers_[i].canibalize(*this); 1204 | eptr = std::current_exception(); 1205 | } 1206 | } 1207 | if (get_concurrency() == 0) 1208 | std::rethrow_exception(eptr); 1209 | 1210 | cv_.wait(guard, [this](void)->bool { 1211 | return (living_ >= get_concurrency()) || should_stop(); 1212 | }); 1213 | } 1214 | 1215 | bool ThreadPoolImpl::is_halted (void) const 1216 | { 1217 | std::lock_guard guard (mutex_); 1218 | // Include paused tasks to give more consistent behavior. 1219 | return (stop_.load(std::memory_order_relaxed) & 0x02) && (paused_ == living_); 1220 | } 1221 | 1222 | bool ThreadPoolImpl::is_idle (void) const 1223 | { 1224 | std::lock_guard guard (mutex_); 1225 | // Include paused tasks to give more consistent behavior. 1226 | return (idle_ + paused_) == living_; 1227 | } 1228 | 1229 | /// \par Exception safety 1230 | /// Provides the strong (rollback) guarantee. 1231 | template 1232 | void ThreadPoolImpl::schedule_overflow (Task && task) 1233 | { 1234 | bool idle; 1235 | { 1236 | std::lock_guard guard (mutex_); 1237 | push(std::forward(task)); // < Strong exception-safety guarantee. 1238 | idle = idle_ > 0; 1239 | } 1240 | if (idle) 1241 | cv_.notify_one(); 1242 | } 1243 | 1244 | /// \par Exception safety 1245 | /// Provides the strong (rollback) guarantee. 1246 | template 1247 | void ThreadPoolImpl::schedule_after (clock::duration const & dur, Task && task) 1248 | { 1249 | bool idle; 1250 | { 1251 | std::lock_guard guard (mutex_); 1252 | push_at(clock::now() + dur, std::forward(task)); 1253 | // Wake the waiters, just in case the scheduled time is earlier than that for 1254 | // which they were waiting. 1255 | idle = idle_ > 0; 1256 | } 1257 | if (idle) 1258 | cv_.notify_one(); 1259 | } 1260 | 1261 | 1262 | 1263 | #ifndef NDEBUG 1264 | void debug_warn_overflow (void) noexcept 1265 | { 1266 | static std::atomic_flag overflow_warning_given = ATOMIC_FLAG_INIT; 1267 | if (!overflow_warning_given.test_and_set()) 1268 | std::printf("Task queue overflow (more than %zu tasks in a single worker's \ 1269 | queue). May impact performance.", ThreadPool::get_worker_capacity()); 1270 | } 1271 | #endif 1272 | 1273 | template 1274 | void impl_schedule (Task && task, ThreadPoolImpl * impl) 1275 | { 1276 | #ifndef NDEBUG 1277 | // If a NULL task is passed, place the error message as close as possible to 1278 | // the error itself. 1279 | if (task == nullptr) 1280 | throw std::bad_function_call(); 1281 | #endif 1282 | Worker * worker = current_worker; 1283 | // If a thread is attempting to schedule in its own pool... 1284 | if ((worker != nullptr) && worker->belongs_to(impl)) 1285 | { 1286 | if (worker->push(std::forward(task))) 1287 | return; 1288 | #ifndef NDEBUG 1289 | else 1290 | debug_warn_overflow(); 1291 | #endif 1292 | } 1293 | impl->schedule_overflow(std::forward(task)); 1294 | } 1295 | 1296 | // Schedule at the front of the queue, if in fast path. 1297 | template 1298 | void impl_schedule_subtask (Task && task, ThreadPoolImpl * impl) 1299 | { 1300 | #ifndef NDEBUG 1301 | // If a NULL task is passed, place the error message as close as possible to 1302 | // the error itself. 1303 | if (task == nullptr) 1304 | throw std::bad_function_call(); 1305 | #endif 1306 | Worker * worker = current_worker; 1307 | // If a thread is attempting to schedule in its own pool, take the fast path. 
1308 | if ((worker != nullptr) && worker->belongs_to(impl)) 1309 | { 1310 | if (worker->push_front(std::forward(task))) 1311 | return; 1312 | #ifndef NDEBUG 1313 | else 1314 | debug_warn_overflow(); 1315 | #endif 1316 | } 1317 | impl->schedule_overflow(std::forward(task)); 1318 | } 1319 | 1320 | template 1321 | void impl_schedule_after (std::chrono::steady_clock::duration const & dur, 1322 | Task && task, ThreadPoolImpl * impl) 1323 | { 1324 | if (dur <= std::chrono::steady_clock::duration(0)) 1325 | impl_schedule(std::forward(task), impl); 1326 | else 1327 | { 1328 | #ifndef NDEBUG 1329 | // If a NULL task is passed, place the error message as close as possible to 1330 | // the error itself. 1331 | if (task == nullptr) 1332 | throw std::bad_function_call(); 1333 | #endif 1334 | impl->schedule_after(dur, std::forward(task)); 1335 | } 1336 | } 1337 | } // Namespace [anonymous] 1338 | 1339 | 1340 | 1341 | 1342 | 1343 | //////////////////////////////////////////////////////////////////////////////// 1344 | // ThreadPool // 1345 | //////////////////////////////////////////////////////////////////////////////// 1346 | 1347 | ThreadPool::ThreadPool (unsigned threads) 1348 | : impl_(nullptr) 1349 | { 1350 | if (threads == 0) 1351 | { 1352 | // Hardware concurrency of 0 indicates that it is unknown. Make sure we have 1353 | // a few threads running. 1354 | threads = max(2u, std::thread::hardware_concurrency()); 1355 | } 1356 | using thread_counter_type = decltype(std::declval().get_concurrency()); 1357 | threads = min(threads, min(std::numeric_limits::max(), 1358 | std::numeric_limits::max())); 1359 | // Alignment change during Worker allocation is an integer multiple of 1360 | // alignof(Worker). If (alignof(Worker) >= alignof(ThreadPoolImpl)), then 1361 | // the second align will not do anything, and the problem is solved. Otherwise, 1362 | // Alignment is off by at most alignof(ThreadPoolImpl) - alignof(Worker). 1363 | // Total alignment is off by at most the greater of the alignments. 1364 | std::size_t space = sizeof(ThreadPoolImpl) + threads * sizeof(Worker) + \ 1365 | max(alignof(ThreadPoolImpl), alignof(Worker)) + sizeof(void**); 1366 | 1367 | std::unique_ptr memory { std::malloc(space) }; 1368 | if (memory == nullptr) 1369 | throw std::bad_alloc(); 1370 | void * ptr = memory.get(); 1371 | 1372 | using std::align; 1373 | // Allocate space for a block of worker threads 1374 | if (!align(alignof(Worker), threads * sizeof(Worker), ptr, space)) 1375 | throw std::bad_alloc(); 1376 | Worker * workers = static_cast(ptr); 1377 | ptr = workers + threads; 1378 | 1379 | // Allocate space for the controller. 1380 | if (!align(alignof(ThreadPoolImpl), sizeof(ThreadPoolImpl), ptr, space)) 1381 | throw std::bad_alloc(); 1382 | ThreadPoolImpl * impl = static_cast(ptr); 1383 | ptr = impl + 1; 1384 | 1385 | new(impl) ThreadPoolImpl(workers, static_cast(threads)); 1386 | 1387 | impl_ = impl; 1388 | *reinterpret_cast(ptr) = memory.release(); 1389 | } 1390 | 1391 | ThreadPool::~ThreadPool (void) 1392 | { 1393 | ThreadPoolImpl * impl = static_cast(impl_); 1394 | std::unique_ptr memory {*reinterpret_cast(impl + 1)}; 1395 | impl->~ThreadPoolImpl(); 1396 | } 1397 | 1398 | unsigned ThreadPool::get_concurrency(void) const noexcept 1399 | { 1400 | return static_cast(impl_)->get_concurrency(); 1401 | } 1402 | 1403 | bool ThreadPool::is_idle (void) const 1404 | { 1405 | return static_cast(impl_)->is_idle(); 1406 | } 1407 | 1408 | // Schedules a task normally, at the back of the queue. 
1409 | void ThreadPool::schedule (task_type const & task) 1410 | { 1411 | impl_schedule(task, static_cast(impl_)); 1412 | } 1413 | void ThreadPool::schedule (task_type && task) 1414 | { 1415 | impl_schedule(std::move(task), static_cast(impl_)); 1416 | } 1417 | 1418 | // Schedules a task normally, at the back of the queue. 1419 | void ThreadPool::sched_impl(duration const & dur, task_type const & task) 1420 | { 1421 | impl_schedule_after(dur, task, static_cast(impl_)); 1422 | } 1423 | void ThreadPool::sched_impl(duration const & dur, task_type && task) 1424 | { 1425 | impl_schedule_after(dur, std::move(task),static_cast(impl_)); 1426 | } 1427 | 1428 | // Schedule at the front of the queue, if in fast path. 1429 | void ThreadPool::schedule_subtask (task_type const & task) 1430 | { 1431 | impl_schedule_subtask(task, static_cast(impl_)); 1432 | } 1433 | void ThreadPool::schedule_subtask (task_type && task) 1434 | { 1435 | impl_schedule_subtask(std::move(task), static_cast(impl_)); 1436 | } 1437 | 1438 | std::size_t ThreadPool::get_worker_capacity (void) noexcept 1439 | { 1440 | return kModulus - 1; 1441 | } 1442 | 1443 | void ThreadPool::halt (void) 1444 | { 1445 | static_cast(impl_)->halt(); 1446 | } 1447 | void ThreadPool::resume (void) 1448 | { 1449 | static_cast(impl_)->resume(); 1450 | } 1451 | bool ThreadPool::is_halted (void) const 1452 | { 1453 | return static_cast(impl_)->is_halted(); 1454 | } 1455 | -------------------------------------------------------------------------------- /threadpool.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | /// \file threadpool.hpp 3 | /// \brief Lightweight, fine-grained multitasking through thread pools. 4 | /// 5 | /// This header is part of a multi-tasking library that provides low-overhead 6 | /// concurrent scheduling. This is provided through a thread pool, and uses the 7 | /// [work-stealing method](https://en.wikipedia.org/wiki/Work_stealing "Wikipedia: Work stealing") 8 | /// for load balancing. \n 9 | /// In addition, the library provides a fast scheduling path for tasks spawned 10 | /// by another task within the same pool. \n 11 | /// These avert the majority of scheduling overhead for each new task, which 12 | /// makes fine-grained parallelism feasible. 13 | /// \code 14 | /// #include "threadpool.hpp" 15 | /// 16 | /// // Create a new thread pool, letting the implementation determine the 17 | /// // number of worker threads to use. 18 | /// ThreadPool pool; 19 | /// 20 | /// // Function pointers of type void (*) () can be passed as tasks directly. 21 | /// void task (void) 22 | /// { 23 | /// // ... 24 | /// } 25 | /// // Put a task into the pool. Because this isn't called from within a 26 | /// // worker thread, the worker threads synchronize to avoid calling it twice. 27 | /// pool.schedule([](void) 28 | /// { 29 | /// // Put a task into the pool. This is called from within a worker thread, 30 | /// // so no synchronization is required. 31 | /// pool.schedule(&task); 32 | /// 33 | /// // Put a task into the pool, treated as if it were part of the currently 34 | /// // running task. This is called from within a worker thread, so no 35 | /// // synchronization is required. 36 | /// pool.schedule_subtask([](void) { }); 37 | /// 38 | /// using namespace std::chrono; 39 | /// // Put a task into the pool, to be executed 2 seconds after it is scheduled. 
40 | /// pool.schedule_after(seconds(2),
41 | /// [](void) {
42 | /// do_something();
43 | /// });
44 | ///
45 | /// // Put a task into the pool, to be executed at the specified time.
46 | /// pool.schedule_after(steady_clock::now() + seconds(2),
47 | /// [](void) {
48 | /// do_something();
49 | /// });
50 | /// });
51 | ///
52 | /// // When the thread pool is destroyed, remaining tasks are forgotten.
53 | /// \endcode
54 | /// \note Tasks assigned to the pool from within one of its worker threads will
55 | /// take the fast scheduling path unless the worker already has
56 | /// `get_worker_capacity()` tasks scheduled. Tasks assigned from outside the
57 | /// pool will take the slow path.
58 | /// \warning If `get_concurrency()` active tasks (or more) simultaneously
59 | /// block, then all inactive tasks in the pool may be blocked. To prevent
60 | /// deadlock, it is recommended that tasks be constructed such that at least
61 | /// one active task makes progress.
62 | /// \note Users may define the macro `THREAD_POOL_FALSE_SHARING_ALIGNMENT` to
63 | /// specify L1 cache line size when compiling `threadpool.cpp`. If it is not
64 | /// specified, the library will attempt to use C++17's
65 | /// `hardware_destructive_interference_size`. If that feature is not supported
66 | /// by the compiler, an implementation-defined default value will be selected.
67 | /// \note Users may specify the capacity of each worker's fixed queue by
68 | /// changing the definition of `kLog2Modulus` in `threadpool.cpp`.
69 | /// \todo Allow tasks to return values, possibly using `std::packaged_task`.
70 | /// \todo Investigate delegates as a replacement for `std::function`:
71 | /// ["The Impossibly Fast C++ Delegates (Fixed)"](https://www.codeproject.com/Articles/1170503/The-Impossibly-Fast-Cplusplus-Delegates-Fixed "The Impossibly Fast C++ Delegates")
72 | /// \author Nathaniel J. McClatchey, PhD
73 | /// \version 2.0
74 | /// \copyright Copyright (c) 2017-2019 Nathaniel J. McClatchey, PhD. \n
75 | /// [Licensed under the MIT license.](https://github.com/nmcclatchey/ThreadPool/blob/master/LICENSE "MIT License") \n
76 | /// You should have received a copy of the license with this software.
77 | ////////////////////////////////////////////////////////////////////////////////
78 |
79 | #ifndef THREAD_POOL_HPP_
80 | #define THREAD_POOL_HPP_
81 |
82 | #if !defined(__cplusplus) || (__cplusplus < 201103L)
83 | #error "The ThreadPool library requires C++11 or higher."
84 | #endif
85 |
86 | // For a unified interface to Callable objects, I considered 3 options:
87 | // * Delegates (fast, but would need extra library and wouldn't allow return)
88 | // * std::function (universally available, but doesn't allow return)
89 | // * std::packaged_task (allows return, but may not be available. Eg. MinGW-w64
90 | // with Win32 threads).
91 | #include <functional>
92 | // For std::size_t
93 | #include <cstddef>
94 | // For timed waiting.
95 | #include <chrono>
96 |
97 | /// \brief A high-performance asynchronous task scheduler.
98 | /// \warning If `get_concurrency()` active tasks (or more) simultaneously
99 | /// block, then all inactive tasks in the pool may be blocked. To prevent
100 | /// deadlock, it is recommended that tasks be constructed such that at least
101 | /// one active task makes progress.
102 | /// \note Has a fast path and a slow path. If called by a worker thread,
103 | /// `schedule(const task_type &)` and `schedule_subtask(const task_type &)`
104 | /// take the fast path, placing the task into the worker thread's own queue
105 | /// and bypassing any synchronization. If any scheduling function is called by
106 | /// a thread not in the pool or if the worker's queue is at capacity, the slow
107 | /// path is taken, requiring synchronization of the `ThreadPool`'s central
108 | /// queue.
109 | /// \note If the worker's local queue is full, the slow path is taken. If one
110 | /// compiles `threadpool.cpp` without the macro `NDEBUG` defined, a warning
111 | /// will be printed when an over-full queue is first detected.
112 | // Implementer's note: The [pointer to implementation idiom](http://en.cppreference.com/w/cpp/language/pimpl "C++ Reference: pImpl idiom")
113 | // provides no significant disadvantage. It will impose a pointer lookup
114 | // penalty, but only on the slow path. Moreover, dynamic allocation is required
115 | // regardless, and all initial allocation is combined into a single allocation.
116 | struct ThreadPool
117 | {
118 | /// \brief A [Callable](https://en.cppreference.com/w/cpp/named_req/Callable "C++ Reference: Named requirements: Callable")
119 | /// type, taking no arguments and returning void. Used to store tasks for
120 | /// later execution.
121 | /// \note Will be called at most once, then destroyed.
122 | using task_type = std::function<void ()>;
123 |
124 | /// \brief Initializes a thread pool and starts a collection of worker threads.
125 | /// \param[in] worker_capacity The maximum number of worker threads that the
126 | /// pool will support.
127 | /// \exception Throws `std::system_error` if the pool was unable to start at
128 | /// least one thread.
129 | ///
130 | /// Creates a thread pool with up to *worker_capacity* worker threads, and
131 | /// attempts to start them. If *worker_capacity == 0*, the number of worker
132 | /// threads is positive, but otherwise implementation-defined.
133 | /// \note Use `get_concurrency()` to detect the number of worker threads that
134 | /// were able to start.
135 | ThreadPool (unsigned worker_capacity = 0);
136 |
137 | /// \brief Destroys the `ThreadPool`, terminating all of its worker threads.
138 | ///
139 | /// Notifies all worker threads that work is to be discontinued, and blocks
140 | /// until they terminate. Though any task that has already been started will be
141 | /// completed, any tasks that are not active when `~ThreadPool()` is called
142 | /// may be forgotten.
143 | /// \warning Using a worker thread to destroy its own `ThreadPool` results in
144 | /// undefined behavior.
145 | ~ThreadPool (void);
146 |
147 | // Thread pools cannot be copied or moved.
148 | ThreadPool (ThreadPool const &) = delete;
149 | ThreadPool & operator= (ThreadPool const &) = delete;
150 |
151 | /// \brief Schedules a task to be performed asynchronously.
152 | /// \param[in] task The task to be performed.
153 | ///
154 | /// Schedules a task to be performed asynchronously. The task will be called
155 | /// at most once.
156 | /// \par Memory order
157 | /// Execution of a task *synchronizes-with* (as in
158 | /// [`std::memory_order`](https://en.cppreference.com/w/cpp/atomic/memory_order "C++ Reference: Memory order")
159 | /// ) the call to `schedule()` that added it to the pool, using a
160 | /// [*Release-Acquire*](https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering "C++ Reference: Release-Acquire ordering")
161 | /// ordering.
162 | void schedule (task_type const & task);
163 | /// \overload
164 | void schedule (task_type && task);
165 |
166 | /// \brief Schedules a task to be run asynchronously after a specified wait
167 | /// duration.
168 | /// \param[in] rel_time The duration after which the task is to be run.
169 | /// \param[in] task The task to be performed.
170 | ///
171 | /// Schedules a task to be performed asynchronously, but only after waiting
172 | /// for a duration of *rel_time*. The task will be called at most once.
173 | /// \par Memory order
174 | /// Execution of a task *synchronizes-with* (as in
175 | /// [`std::memory_order`](https://en.cppreference.com/w/cpp/atomic/memory_order "C++ Reference: Memory order")
176 | /// ) the call to `schedule_after()` that added it to the pool, using a
177 | /// [*Release-Acquire*](https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering "C++ Reference: Release-Acquire ordering")
178 | /// ordering.
179 | template<typename Rep, typename Period, typename Task>
180 | void schedule_after ( std::chrono::duration<Rep, Period> const & rel_time,
181 | Task && task)
182 | {
183 | using namespace std;
184 | sched_impl(chrono::duration_cast<duration>(rel_time), forward<Task>(task));
185 | }
186 |
187 | /// \brief Schedules a task to be run asynchronously at (or after) a specified
188 | /// point in time.
189 | /// \param[in] time The time point after which the task is to be run.
190 | /// \param[in] task The task to be performed.
191 | ///
192 | /// Schedules a task to be performed asynchronously at a specified time point.
193 | /// The task will be called at most once.
194 | /// \par Memory order
195 | /// Execution of a task *synchronizes-with* (as in
196 | /// [`std::memory_order`](https://en.cppreference.com/w/cpp/atomic/memory_order "C++ Reference: Memory order")
197 | /// ) the call to `schedule_after()` that added it to the pool, using a
198 | /// [*Release-Acquire*](https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering "C++ Reference: Release-Acquire ordering")
199 | /// ordering.
200 | template<typename Clock, typename Duration, typename Task>
201 | void schedule_after ( std::chrono::time_point<Clock, Duration> const & time,
202 | Task && task)
203 | {
204 | using namespace std;
205 | using namespace std::chrono;
206 | sched_impl(duration_cast<duration>(time - Clock::now()), forward<Task>(task));
207 | }
208 |
209 | /// \brief Schedules a task to be run asynchronously, but with a hint that the
210 | /// task ought to be considered part of the currently-scheduled task.
211 | /// \param[in] task The task to be performed.
212 | /// \see `schedule(const task_type &)`
213 | ///
214 | /// Schedules a task to be performed asynchronously, but treats it as if it
215 | /// were part of the currently scheduled task. This gives the task a better
216 | /// chance of being performed soon after scheduling, but relaxes
217 | /// non-starvation guarantees. In particular, if the collective subtasks fail
218 | /// to terminate, then the original task is considered not to have terminated,
219 | /// and later tasks may fail to run. \n
220 | /// The `schedule_subtask()` method may be used to encourage (not force)
221 | /// depth-first execution -- rather than breadth-first execution -- if tasks
222 | /// exhibit significant branching. This can reduce the odds of a local queue
223 | /// overflow (the slow path) and reduce the memory needed for scheduled tasks.
224 | /// \n
225 | /// The task will be called at most once.
226 | /// \par Memory order 227 | /// Execution of a task *synchronizes-with* (as in 228 | /// [`std::memory_order`](https://en.cppreference.com/w/cpp/atomic/memory_order "C++ Reference: Memory order") 229 | /// ) the call to `schedule_subtask()` that added it to the pool, using a 230 | /// [*Release-Acquire*](https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering "C++ Reference: Release-Acquire ordering") 231 | /// ordering. 232 | /// \warning Because a subtask is considered as part of the task that spawned 233 | /// it, no guarantees of non-starvation are made should the collective 234 | /// subtasks not terminate. 235 | void schedule_subtask (task_type const & task); 236 | /// \overload 237 | void schedule_subtask (task_type && task); 238 | 239 | /// \brief Returns the number of threads in the pool. 240 | /// \return Number of threads in the pool. 241 | /// 242 | /// Returns the number of threads in the `ThreadPool`. That is, this function 243 | /// returns the number of tasks that can be truly executed concurrently or with 244 | /// preemption. 245 | /// \note If more than `get_concurrency()` tasks block simultaneously, the 246 | /// entire `ThreadPool` is blocked, and no further progress will be made. 247 | unsigned get_concurrency (void) const noexcept; 248 | 249 | /// \brief Maximum number of tasks that can be efficiently scheduled by a 250 | /// worker thread. 251 | /// \return Returns the number of tasks that a worker thread can retain in local 252 | /// storage. 253 | /// 254 | /// To reduce contention, each worker thread keeps its own queue of tasks. The 255 | /// queues are pre-allocated, and of constant size. The `get_worker_capacity()` 256 | /// function returns the number of tasks that each worker can keep in its own 257 | /// queue -- that is, the number of tasks that a worker can have scheduled 258 | /// before contention occurs. \n 259 | /// If the returned value is large, many tasks may be simultaneously scheduled 260 | /// without taking the slow path, but more memory is required. If it is small, 261 | /// task scheduling is more likely to take the slow path, but less memory is 262 | /// required. \n 263 | /// To select the size of the worker queues, edit the variable `kLog2Modulus` 264 | /// in `threadpool.cpp`. 265 | static std::size_t get_worker_capacity (void) noexcept; 266 | 267 | /// \brief Determines whether the pool is currently idle. 268 | /// \return `true` if the pool is idle, or `false` if not. 269 | /// 270 | /// Returns whether the pool is idle. That is, returns `true` if all threads 271 | /// in the pool are simultaneously idling, or `false` if at least one thread is 272 | /// active. If the pool is halted, the returned value is undefined. Calling this 273 | /// from within one of the `ThreadPool`'s tasks necessarily returns `false`. 274 | bool is_idle (void) const; 275 | 276 | /// \{ 277 | /// \brief Suspends execution of tasks in the `ThreadPool`. 278 | /// 279 | /// Halts all worker threads, blocking the caller until worker threads have 280 | /// fully halted. If `halt()` is called from within one of the pool's worker 281 | /// threads, the calling thread is halted either until `resume()` is called or 282 | /// until the `ThreadPool` is destroyed, whichever comes first. 283 | /// \see `resume()` 284 | void halt (void); 285 | 286 | /// \brief Resumes execution of tasks in the `ThreadPool` after a call to 287 | /// `halt()`, or starts threads that had previously failed to initialize. 
288 | /// 289 | /// Attempts to start, restart, or resume all worker threads. 290 | /// - If all allocated worker threads are already running, this function has no 291 | /// effect. 292 | /// - If execution is currently halted, or the number of active workers is less 293 | /// than that returned by `get_concurrency()`, attempts to re-start all inactive 294 | /// worker threads. 295 | /// . 296 | /// May start fewer worker threads than the total capacity of the pool. \n 297 | /// May block the caller until all started worker threads have resumed their 298 | /// tasks. 299 | /// \exception Throws `std::system_error` if the pool was unable to ensure at 300 | /// least one living thread. 301 | /// \see `halt()` 302 | void resume (void); 303 | 304 | /// \brief Returns whether the pool is currently halted. 305 | /// \return Returns `true` if all worker threads are halted, or `false` if not. 306 | /// 307 | /// Returns whether the pool is currently halted. Note that this function only 308 | /// begins to return `true` once all tasks have fully halted. Calling it from 309 | /// within one of the `ThreadPool`'s tasks necessarily returns `false`. 310 | bool is_halted (void) const; 311 | /// \} 312 | private: 313 | void * impl_; 314 | using duration = std::chrono::steady_clock::duration; 315 | void sched_impl (duration const &, task_type const &); 316 | void sched_impl (duration const &, task_type && task); 317 | }; 318 | 319 | #endif // THREAD_POOL_HPP_ 320 | --------------------------------------------------------------------------------
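For quick reference, below is a minimal usage sketch of the interface declared in threadpool.hpp, compiled together with threadpool.cpp. It exercises the documented calls only: schedule(), schedule_subtask(), both schedule_after() overloads, halt(), resume(), is_halted(), is_idle(), get_concurrency(), and get_worker_capacity(). The tasks_done counter, the leaf_task() helper, and the polling loop used to wait for completion are illustrative choices made for this sketch, not part of the library; the public interface above offers no blocking wait-for-all-tasks call, so a caller must arrange its own completion signal before letting the pool be destroyed (the destructor forgets tasks that have not started).

// usage_sketch.cpp -- illustrative only; link against threadpool.cpp.
#include "threadpool.hpp"

#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

namespace {
// Illustrative completion counter; not part of the library.
std::atomic<int> tasks_done(0);

void leaf_task (void)
{
  tasks_done.fetch_add(1, std::memory_order_relaxed);
}
} // namespace

int main (void)
{
  // Let the implementation choose the number of worker threads.
  ThreadPool pool;
  std::printf("Workers started: %u (per-worker queue capacity: %zu)\n",
              pool.get_concurrency(), ThreadPool::get_worker_capacity());

  // Slow path: scheduled from outside the pool.
  pool.schedule([&pool](void)
  {
    // Fast path: scheduled from inside a worker thread.
    pool.schedule(&leaf_task);
    // Hinted to be part of the current task; encourages depth-first execution.
    pool.schedule_subtask(&leaf_task);
    tasks_done.fetch_add(1, std::memory_order_relaxed);
  });

  // Run a task roughly 100 ms from now (duration overload)...
  pool.schedule_after(std::chrono::milliseconds(100), &leaf_task);
  // ...and another at an absolute time point (time_point overload).
  pool.schedule_after(std::chrono::steady_clock::now()
                        + std::chrono::milliseconds(100), &leaf_task);

  // halt() blocks until the workers have paused; resume() restarts them.
  pool.halt();
  std::printf("Halted: %s\n", pool.is_halted() ? "yes" : "no");
  pool.resume();

  // Five increments are expected: the outer task, its two subtasks, and the
  // two timed tasks. Poll until they have all run, so that the destructor
  // does not discard pending work.
  while (tasks_done.load(std::memory_order_relaxed) < 5)
    std::this_thread::sleep_for(std::chrono::milliseconds(10));

  // Once the queues drain, the pool will typically report itself idle.
  std::printf("Completed %d tasks; pool idle: %s\n",
              tasks_done.load(std::memory_order_relaxed),
              pool.is_idle() ? "yes" : "no");
  return 0;
}

A function pointer such as &leaf_task converts implicitly to task_type (std::function<void ()>), as does the capturing lambda; the capture of pool by reference is safe here only because the polling loop keeps main() alive until the scheduled work has finished.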