├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── include └── nanothread │ └── nanothread.h ├── src ├── nanothread.cpp ├── queue.cpp └── queue.h └── tests ├── CMakeLists.txt ├── test_01.c ├── test_02.cpp ├── test_03.cpp └── test_04.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | /.clangd 2 | /compile_commands.json 3 | *.cmake 4 | CMakeCache.txt 5 | CMakeFiles 6 | Makefile 7 | *.ninja 8 | \.cache 9 | \.ninja_* 10 | Testing 11 | build 12 | .vscode 13 | *.vcxproj 14 | *.vcxproj.filters 15 | nanothread.sln 16 | nanothread.dir 17 | Release 18 | Debug 19 | 20 | libnanothread.so 21 | libnanothread.dylib 22 | nanothread.dll 23 | tests/test_0[1-4] 24 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext/cmake-defaults"] 2 | path = ext/cmake-defaults 3 | url = https://github.com/mitsuba-renderer/cmake-defaults 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------- 2 | # Top-level nanothread CMake file, needs a recent version 3 | # ---------------------------------------------------------- 4 | cmake_minimum_required(VERSION 3.13...3.18) 5 | 6 | project(nanothread 7 | DESCRIPTION 8 | "nanothread" 9 | LANGUAGES 10 | CXX C 11 | ) 12 | 13 | # ---------------------------------------------------------- 14 | # Optional features available to users 15 | # ---------------------------------------------------------- 16 | 17 | option(NANOTHREAD_STATIC "Build as static library?" OFF) 18 | option(NANOTHREAD_ENABLE_TESTS "Build test suite?" OFF) 19 | 20 | # ---------------------------------------------------------- 21 | # Check if submodules have been checked out, or fail early 22 | # ---------------------------------------------------------- 23 | 24 | if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ext/cmake-defaults/CMakeLists.txt") 25 | message(FATAL_ERROR "The nanothread dependencies are missing! " 26 | "You probably did not clone the project with --recursive. 
It is possible to recover " 27 | "by invoking\n$ git submodule update --init --recursive") 28 | endif() 29 | 30 | # ---------------------------------------------------------- 31 | # Build defaults for projects by the Realistic Graphics Lab 32 | # ---------------------------------------------------------- 33 | 34 | include(ext/cmake-defaults/CMakeLists.txt) 35 | 36 | # ---------------------------------------------------------- 37 | # Compile the nanothread library 38 | # ---------------------------------------------------------- 39 | 40 | if(NANOTHREAD_STATIC) 41 | add_library(nanothread STATIC) 42 | target_compile_definitions(nanothread PUBLIC -DNANOTHREAD_STATIC) 43 | else() 44 | add_library(nanothread SHARED) 45 | endif() 46 | 47 | target_sources(nanothread PRIVATE 48 | include/nanothread/nanothread.h 49 | src/queue.cpp src/queue.h 50 | src/nanothread.cpp 51 | ) 52 | 53 | target_compile_features(nanothread PRIVATE cxx_std_11) 54 | target_include_directories(nanothread PRIVATE include) 55 | 56 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") 57 | if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") 58 | target_compile_options(nanothread PRIVATE -mcx16) 59 | endif() 60 | endif() 61 | 62 | if (CMAKE_CXX_COMPILER_ID MATCHES "GNU") 63 | # GCC needs libatomic for 16 byte CSA 64 | find_library(LIBATOMIC NAMES libatomic.so libatomic.so.1) 65 | if (NOT LIBATOMIC) 66 | message(FATAL_ERROR "libatomic could not be found!") 67 | endif() 68 | target_link_libraries(nanothread PRIVATE ${LIBATOMIC}) 69 | mark_as_advanced(LIBATOMIC) 70 | endif() 71 | 72 | target_include_directories(nanothread 73 | PUBLIC 74 | $ 75 | $) 76 | 77 | target_compile_definitions(nanothread PRIVATE -DNANOTHREAD_BUILD=1) 78 | set_target_properties(nanothread PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) 79 | 80 | if (NANOTHREAD_ENABLE_TESTS) 81 | add_subdirectory(tests) 82 | endif() 83 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Wenzel Jakob , All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | 3. Neither the name of the copyright holder nor the names of its contributors 14 | may be used to endorse or promote products derived from this software 15 | without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nanothread — Minimal thread pool for task parallelism 2 | 3 | ## Introduction 4 | 5 | This library provides a minimal cross-platform interface for task parallelism. 6 | Given a computation that is partitioned into a set of interdependent tasks, the 7 | library efficiently distributes this work to a thread pool using lock-free 8 | queues, while respecting dependencies between tasks. 9 | 10 | Each task is associated with a callback function that is potentially invoked 11 | multiple times if the task consists of multiple work units. This whole 12 | process is arbitrarily recursive: task callbacks can submit further jobs, wait 13 | for their completion, etc. Parallel loops, reductions, and more complex 14 | graph-based computations are easily realized using these abstractions. 15 | 16 | This project is internally implemented in C++11, but exposes the main 17 | functionality using a pure C99 API, along with a header-only C++11 convenience 18 | wrapper. It has no dependencies other than CMake and a C++11-capable compiler. 19 | The entire project requires less than 1000 lines of header and 20 | implementation code (according to [cloc](http://cloc.sourceforge.net/)). 21 | 22 | This library is part of the larger 23 | [Dr.Jit](https://github.com/mitsuba-renderer/drjit) project and parallelizes 24 | workloads generated by the 25 | [Dr.Jit-Core](https://github.com/mitsuba-renderer/drjit-core) library. However, 26 | this project has no dependencies on these parent projects and can be used in 27 | any other context. 28 | 29 | ## Why? 30 | 31 | Many of my previous projects have built on [Intel's Thread Building 32 | Blocks](https://software.intel.com/content/www/us/en/develop/tools/threading-building-blocks.html) 33 | for exactly this type of functionality. Unfortunately, large portions of TBB's 34 | task interface were recently deprecated as part of the oneAPI / oneTBB 35 | transition. Rather than struggling with this complex dependency, I decided to 36 | build something minimal and stable that satisfies my requirements. 37 | 38 | ## Examples (C++11 interface) 39 | 40 | The follow examples showcase the C++11 interface, which is a thin header-only 41 | layer over the C99 API. 42 | 43 | ### Parallel for loops (synchronous) 44 | ```cpp 45 | template 46 | void parallel_for(const blocked_range &range, Func &&func, Pool *pool = nullptr); 47 | ``` 48 | This function submits a single task consisting of a arbitrarily many work units 49 | that are processed in blocks of a specified size, and waits for their 50 | completion. If no thread pool ``Pool *`` is specified, the default pool will be 51 | used (and created on the fly, if needed). 
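Internally, the range is split into ``ceil((end - begin) / block_size)`` blocks (cf. ``blocked_range::blocks()``), and the callback is invoked once per block with the corresponding sub-range. A small sketch with illustrative values:

```cpp
// Illustrative values only: 100 elements processed in blocks of 20
dr::blocked_range<uint32_t> r(/* begin = */ 0, /* end = */ 100, /* block_size = */ 20);
// r.blocks() == 5, so a callback passed to dr::parallel_for(r, ...) is
// invoked five times, each time with a sub-range of 20 consecutive indices.
```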
52 | 53 | Example: 54 | 55 | ```cpp 56 | #include 57 | 58 | namespace dr = drjit; 59 | 60 | int main(int, char **) { 61 | int result[100]; 62 | 63 | // Call the provided lambda function 99 times with blocks of size 1 64 | dr::parallel_for( 65 | dr::blocked_range(/* begin = */ 0, /* end = */ 100, /* block_size = */ 1), 66 | 67 | // The callback is allowed to be a stateful lambda function 68 | [&](dr::blocked_range range) { 69 | for (uint32_t i = range.begin(); i != range.end(); ++i) { 70 | printf("Worker thread %u is starting to process work unit %u\n", 71 | pool_thread_id(), i); 72 | 73 | // Write to variables defined in the caller's frame 74 | result[i] = i; 75 | } 76 | } 77 | ); 78 | } 79 | ``` 80 | 81 | Small amounts of work that only consist of a single block will immediately be 82 | executed on the calling thread instead of involving the thread pool. Exceptions 83 | occurring during parallel execution will be captured and re-thrown by 84 | ``dr::parallel_for``. 85 | 86 | ### Parallel for loops (asynchronous) 87 | 88 | Parallel `for` loops can also run asynchronously—in that case, the function 89 | immediately returns a ``Task *`` handle that can be used to wait for 90 | completion, or to schedule *child tasks*, whose execution will be delayed until 91 | all parents have completed. 92 | 93 | ```cpp 94 | template 95 | Task *parallel_for_async(const blocked_range &range, Func &&func, 96 | std::initializer_list parents = { }, 97 | Pool *pool = nullptr); 98 | ``` 99 | 100 | The returned task handle must eventually be released using the functions 101 | ``task_release(Task *)`` (which is instantaneous) or 102 | ``task_wait_and_release(Task *)`` (which blocks until the task has terminated). 103 | A failure to do so will leak memory. 104 | 105 | Example: 106 | ```cpp 107 | #include 108 | 109 | namespace dr = drjit; 110 | 111 | int main(int, char **) { 112 | // Schedule task 1 113 | Task *task_1 = dr::parallel_for_async( 114 | dr::blocked_range(/* ... */), 115 | [&](dr::blocked_range range) { /* ... */ } 116 | ); 117 | 118 | // Schedule task 2 119 | Task *task_2 = dr::parallel_for_async( 120 | dr::blocked_range(/* ... */), 121 | [&](dr::blocked_range range) { /* ... */ } 122 | ); 123 | 124 | // Schedule task 3 ... 125 | Task *task_3 = dr::parallel_for_async( 126 | dr::blocked_range(/* ... */), 127 | [&](dr::blocked_range range) { /* ... */ }, 128 | { task_1, task_2 } // ... <- but don't execute until these tasks are done 129 | ); 130 | 131 | task_release(task_1); 132 | task_release(task_2); 133 | task_wait_and_release(task_3); 134 | } 135 | ``` 136 | 137 | If a task only consists of single-threaded work that cannot easily be converted 138 | into a parallel ``for`` loop, the function ``do_async`` provides an more 139 | convenient interface that is analogous to ``parallel_for_async`` with a 140 | ``blocked_range`` of size 1. 141 | 142 | ```cpp 143 | template 144 | Task *do_async(Func &&func, std::initializer_list parents = {}, 145 | Pool *pool = nullptr); 146 | ``` 147 | 148 | ## Examples (C99 interface) 149 | 150 | The following code fragment submits a single task consisting of 100 work units 151 | and waits for its completion. 152 | 153 | ```c 154 | #include 155 | #include 156 | #include 157 | 158 | // Task callback function. 
Will be called with index = 0..99 159 | void my_task(uint32_t index, void *payload) { 160 | printf("Worker thread %u is starting to process work unit %u\n", 161 | pool_thread_id(), index); 162 | 163 | // Use payload to communicate some data to the caller 164 | ((uint32_t *) payload)[index] = index; 165 | } 166 | 167 | int main(int argc, char** argv) { 168 | uint32_t temp[100]; 169 | 170 | // Create a worker per CPU thread 171 | Pool *pool = pool_create(NANOTHREAD_AUTO); 172 | 173 | // Synchronous interface: submit a task and wait for it to complete 174 | task_submit_and_wait( 175 | pool, 176 | 100, // How many work units does this task contain? 177 | my_task, // Function to be executed 178 | temp // Optional payload, will be passed to function 179 | ); 180 | 181 | // .. contents of 'temp' are now ready .. 182 | 183 | // Clean up used resources 184 | pool_destroy(pool); 185 | } 186 | ``` 187 | 188 | Tasks can also be executed *asynchronously*, in which case extra steps must be 189 | added to wait for tasks, and to release task handles. 190 | 191 | ```c 192 | /// Heap-allocate scratch space for inter-task communication 193 | uint32_t *payload = malloc(100 * sizeof(uint32_t)); 194 | 195 | /// Submit a task and return immediately 196 | Task *task_1 = task_submit( 197 | pool, 198 | 100, // How many work units does this task contain? 199 | my_task_1, // Function to be executed 200 | payload, // Optional payload, will be passed to function 201 | 0, // Size of the payload (only relevant if it should be copied) 202 | nullptr, // Payload deletion callback 203 | 0 // Enforce asynchronous execution even if task is small? 204 | ); 205 | 206 | /// Submit a task that is dependent on other tasks (specifically task_1) 207 | Task *task_2 = task_submit_dep( 208 | pool, 209 | &task_1, // Pointer to a list of parent tasks 210 | 1, // Number of parent tasks 211 | 100, // How many work units does this task contain? 212 | my_task_2, // Function to be executed 213 | payload, // Optional payload, will be passed to function 214 | 0, // Size of the payload (only relevant if it should be copied) 215 | free, // Call free(payload) once this task completes 216 | 0 // Enforce asynchronous execution even if task is small? 217 | ); 218 | 219 | /* Now that the parent-child relationship is specified, 220 | the handle of task 1 can be released */ 221 | task_release(task_1); 222 | 223 | // Wait for the completion of task 2 and also release its handle 224 | task_wait_and_release(task_2); 225 | ``` 226 | 227 | ## Documentation 228 | 229 | The complete API is documented in the file 230 | [nanothread/nanothread.h](https://github.com/mitsuba-renderer/nanothread/blob/master/include/nanothread/nanothread.h). 231 | 232 | ## Technical details 233 | 234 | This library follows a lock-free design: tasks that are ready for execution are 235 | stored in a [Michael-Scott 236 | queue](https://www.cs.rochester.edu/u/scott/papers/1996_PODC_queues.pdf) that 237 | is continuously polled by workers, and task submission/removal relies on atomic 238 | compare-and-swap (CAS) operations. Workers that idle for more than roughly 50 239 | milliseconds are put to sleep until more work becomes available. 240 | 241 | The lock-free design is important: the central data structures of a task 242 | submission system are heavily contended, and traditional abstractions (e.g. 243 | ``std::mutex``) will immediately put contending threads to sleep to defer lock 244 | resolution to the OS kernel. 
The associated context switches produce an 245 | extremely large overhead that can make a parallel program orders of magnitude 246 | slower than a single-threaded version. 247 | 248 | The implementation catches exception that occur while executing parallel work 249 | and re-throws them the caller's thread (this part is of no relevance for 250 | software written in C99). 251 | 252 | The functions ``task_wait()`` and ``task_wait_and_release()`` do not just 253 | wait---they spend the wait time fetching and executing work from the task 254 | queue, which has two implications: first, it is not wasteful to wait for the 255 | completion of another task while executing a task. Second, the thread pool can 256 | be set to a size of zero via ``pool_create(0)`` or ``pool_set_size(pool, 0)``, 257 | in which case the program will still run correctly without launching any 258 | additional threads. 259 | -------------------------------------------------------------------------------- /include/nanothread/nanothread.h: -------------------------------------------------------------------------------- 1 | /* 2 | nanothread/nanothread.h -- Simple thread pool with a task-based API 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #if defined(__cplusplus) 17 | # include 18 | #else 19 | # include 20 | #endif 21 | 22 | #if defined(NANOTHREAD_STATIC) 23 | # define NANOTHREAD_EXPORT 24 | #else 25 | # if defined(_MSC_VER) 26 | # if defined(NANOTHREAD_BUILD) 27 | # define NANOTHREAD_EXPORT __declspec(dllexport) 28 | # else 29 | # define NANOTHREAD_EXPORT __declspec(dllimport) 30 | # endif 31 | # else 32 | # define NANOTHREAD_EXPORT __attribute__ ((visibility("default"))) 33 | # endif 34 | #endif 35 | 36 | #if defined(__cplusplus) 37 | # define NANOTHREAD_DEF(x) = x 38 | #else 39 | # define NANOTHREAD_DEF(x) 40 | #endif 41 | 42 | #define NANOTHREAD_AUTO ((uint32_t) -1) 43 | 44 | typedef struct Pool Pool; 45 | typedef struct Task Task; 46 | 47 | #if defined(__cplusplus) 48 | #define NANOTHREAD_THROW noexcept(false) 49 | extern "C" { 50 | #else 51 | #define NANOTHREAD_THROW 52 | #endif 53 | 54 | /** 55 | * \brief Create a new thread pool 56 | * 57 | * \param size 58 | * Specifies the desired number of threads. The default value of 59 | * \c NANOTHREAD_AUTO choses a thread count equal to the number of 60 | * available cores. 61 | * 62 | * \param ftz 63 | * Should denormalized floating point numbers be flushed to zero? 64 | * The pool workers will initialize their floating point control 65 | * registers accordingly. 66 | */ 67 | extern NANOTHREAD_EXPORT Pool * 68 | pool_create(uint32_t size NANOTHREAD_DEF(NANOTHREAD_AUTO), 69 | int ftz NANOTHREAD_DEF(1)); 70 | 71 | /** 72 | * \brief Destroy the thread pool and discard remaining unfinished work. 73 | * 74 | * It is undefined behavior to destroy the thread pool while other threads 75 | * are waiting for the completion of scheduled work via \ref task_wait(). 76 | * 77 | * \param pool 78 | * The thread pool to destroy. \c nullptr refers to the default pool. 79 | */ 80 | extern NANOTHREAD_EXPORT void pool_destroy(Pool *pool NANOTHREAD_DEF(0)); 81 | 82 | /// Returns the number of available CPU cores. 
83 | extern NANOTHREAD_EXPORT uint32_t core_count(); 84 | 85 | /** 86 | * \brief Return the number of threads that are part of the pool 87 | * 88 | * \param pool 89 | * The thread pool to query. \c nullptr refers to the default pool. 90 | */ 91 | extern NANOTHREAD_EXPORT uint32_t pool_size(Pool *pool NANOTHREAD_DEF(0)); 92 | 93 | /** 94 | * \brief Resize the thread pool to the given number of threads 95 | * 96 | * \param pool 97 | * The thread pool to resize. \c nullptr refers to the default pool. 98 | */ 99 | extern NANOTHREAD_EXPORT void pool_set_size(Pool *pool, uint32_t size); 100 | 101 | /** 102 | * \brief Enable/disable time profiling 103 | * 104 | * Profiling must be enabled to use the \ref task_time() function. 105 | * 106 | * \param value 107 | * A nonzero value indicates that profiling should be enabled. 108 | */ 109 | extern NANOTHREAD_EXPORT void pool_set_profile(int value); 110 | 111 | /// Check whether time profiling is enabled (global setting) 112 | extern NANOTHREAD_EXPORT int pool_profile(); 113 | 114 | /** 115 | * \brief Return a unique number identifying the current worker thread 116 | * 117 | * When called from a thread pool worker (e.g. while executing a parallel 118 | * task), this function returns a unique identifying number between 1 and the 119 | * pool's total thread count. 120 | * 121 | * The IDs of separate thread pools overlap. When the current thread is not a 122 | * thread pool worker, the function returns zero. 123 | */ 124 | extern NANOTHREAD_EXPORT uint32_t pool_thread_id(); 125 | 126 | /** \brief Process work available within the pool until a stopping criterion is 127 | * satisified. 128 | * 129 | * This function repeatedly fetches work from ``pool`` until the stopping 130 | * criterion ``stopping_criterion(payload)`` evaluates to ``true``, at which 131 | * point the function returns. 132 | * 133 | * It provides a way for an ordinary thread to temporarily join the thread 134 | * pool. A function being called by a worker thread that needs to wait for an 135 | * event to take place can also call this function to avoid starvation issues. 136 | */ 137 | extern NANOTHREAD_EXPORT void 138 | pool_work_until(Pool *pool, bool (*stopping_criterion)(void *), void *payload); 139 | 140 | /* 141 | * \brief Submit a new task to a thread pool 142 | * 143 | * This function submits a new task consisting of \c size work units to the 144 | * thread pool \c pool. 145 | * 146 | * Callback: The task callback \c func will be invoked \c size times by 147 | * the various thread pool workers. Its first argument will range from 148 | * 0 to \c size - 1, and the second argument refers to a 149 | * payload memory region specified via the \c payload parameter. 150 | * 151 | * Parents: The \c parent and \c parent_count parameters can be used to 152 | * specify parent tasks that must be completed before execution of this task 153 | * can commence. If the task does not depend on any other tasks (e.g. 154 | * parent_count == 0 and parent == nullptr), or when all of 155 | * those other tasks have already finished executing, then it will be 156 | * immediately appended to the end of the task queue. Otherwise, the task will 157 | * be scheduled once all parent tasks have finished executing. 158 | * 159 | * Payload storage: The callback payload is handled using one of two 160 | * possible modes: 161 | * 162 | *
163 | *   1. When size == 0 or payload_deleter != nullptr, the
164 | *      value of the \c payload parameter is simply forwarded to the callback \c
165 | *      func. In the latter case, payload_deleter(payload) is invoked
166 | *      following completion of the task, which can carry out additional cleanup
167 | *      operations if needed. In both cases, the memory region targeted by \c
168 | *      payload may be accessed asynchronously and must remain valid until the
169 | *      task is done.
170 | *
171 | *   2. Otherwise, the function will internally create a copy of the payload
172 | *      and free it following completion of the task. In this case, it is fine to
173 | *      delete the memory region targeted by \c payload right after the
174 | *      function call.
175 | *
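 *      For example (placeholder names, assuming a valid pool and a callback
 *      func with the signature described above): when the task has work units
 *      (size > 0), a nonzero payload_size without a deleter selects the
 *      copying mode (2.), while supplying a deleter such as free() selects
 *      the forwarding mode (1.):
 *
 *         uint32_t data[4] = { 0, 1, 2, 3 };
 *         Task *a = task_submit_dep(pool, nullptr, 0, 4, func, data,
 *                                   sizeof(data), nullptr, 0); // copied internally
 *
 *         uint32_t *heap = (uint32_t *) malloc(4 * sizeof(uint32_t));
 *         Task *b = task_submit_dep(pool, nullptr, 0, 4, func, heap,
 *                                   0, free, 0); // forwarded; free(heap) runs on completion
 *
 *         task_release(a);
 *         task_release(b);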
176 | *
177 | * The function returns a task handle that can be used to schedule other
178 | * dependent tasks, and to wait for task completion if desired. This handle
179 | * must eventually be released using \ref task_release() or \ref
180 | * task_wait_and_release(). A failure to do so will result in memory leaks.
181 | *
182 | * Small task optimization: If desired, small tasks can be executed
183 | * right away without using the thread pool. This happens under the following
184 | * conditions:
185 | *
186 | *
187 | *   1. The task is "small" (\c size == 1).
188 | *   2. The task does not depend on any parent tasks.
189 | *   3. The \c always_async parameter is set to 0.
190 | *
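 *      For example (placeholder names), a single work unit submitted without
 *      parents and with always_async == 0 may run directly on the calling
 *      thread, in which case no separate task handle is created:
 *
 *         Task *t = task_submit_dep(pool, nullptr, 0, 1, func, payload, 0, nullptr, 0);
 *         task_release(t); // no-op when 't' is nullptr (synchronous execution)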
191 | * 192 | * \remark 193 | * Barriers and similar dependency relations can be encoded by via 194 | * artificial tasks using size == 0 and func == nullptr 195 | * along with a set of parent tasks. 196 | * 197 | * \param pool 198 | * The thread pool that should execute the specified task. \c nullptr 199 | * refers to the default pool. 200 | * 201 | * \param parent 202 | * List of parents of size \c parent_count. \c nullptr-valued elements 203 | * are ignored 204 | * 205 | * \param parent_count 206 | * Number of parent tasks 207 | * 208 | * \param size 209 | * Total number of work units; the callback \c func will be called this 210 | * many times if provided. 211 | * 212 | * \param func 213 | * Callback function that will be invoked to perform the actual computation. 214 | * If set to \c nullptr, the callback is ignored. This can be used to create 215 | * artificial tasks that only encode dependencies. 216 | * 217 | * \param payload 218 | * Optional payload that is passed to the function \c func 219 | * 220 | * \param payload_size 221 | * When \c payload_deleter is equal to \c nullptr and when \c size is 222 | * nonzero, a temporary copy of the payload will be made. This parameter is 223 | * necessary to specify the payload size in that case. 224 | * 225 | * \param payload_deleter 226 | * Optional callback that will be invoked to free the payload 227 | * 228 | * \param always_async 229 | * If set to a nonzero value, execution will always happen asynchronously, 230 | * even in cases where the task being scheduled has no parents, and 231 | * when only encodes a small amount of work (\c size == 1). Otherwise 232 | * it will be executed synchronously, and the function will return \c nullptr. 233 | * 234 | * \return 235 | * A task handle that must eventually be released via \ref task_release() 236 | * or \ref task_wait_and_release(). The function returns \c nullptr when 237 | * no task was generated (e.g. when there are no parent tasks, and either 238 | * size==0, or when size==1 and the task was executed 239 | * synchronously.) 240 | */ 241 | extern NANOTHREAD_EXPORT 242 | Task *task_submit_dep(Pool *pool, 243 | const Task * const *parent, 244 | uint32_t parent_count, 245 | uint32_t size NANOTHREAD_DEF(1), 246 | void (*func)(uint32_t, void *) NANOTHREAD_DEF(0), 247 | void *payload NANOTHREAD_DEF(0), 248 | uint32_t payload_size NANOTHREAD_DEF(0), 249 | void (*payload_deleter)(void *) NANOTHREAD_DEF(0), 250 | int always_async NANOTHREAD_DEF(0)); 251 | 252 | /* 253 | * \brief Release a task handle so that it can eventually be reused 254 | * 255 | * Releasing a task handle does not impact the tasks's execution, which could 256 | * be in one of three states: waiting, running, or complete. This operation is 257 | * important because it frees internal resources that would otherwise leak. 258 | * 259 | * Following a call to \ref task_release(), the associated task can no 260 | * longer be used as a direct parent of other tasks, and it is no longer 261 | * possible to wait for its completion using an operation like \ref 262 | * task_wait(). 263 | * 264 | * \param pool 265 | * The thread pool containing the task. \c nullptr refers to the default pool. 266 | * 267 | * \param task 268 | * The task in question. When equal to \c nullptr, the operation is a no-op. 
269 | */
270 | extern NANOTHREAD_EXPORT void task_release(Task *task);
271 |
272 | /*
273 |  * \brief Wait for the completion of the specified task
274 |  *
275 |  * This function causes the calling thread to sleep until all work units of
276 |  * 'task' have been completed.
277 |  *
278 |  * If an exception was caught during parallel execution of 'task', the
279 |  * function \ref task_wait() will re-raise this exception in the context of the
280 |  * caller. Note that if a parallel task raises many exceptions, only a single
281 |  * one of them will be captured in this way.
282 |  *
283 |  * \param task
284 |  *     The task in question. When equal to \c nullptr, the operation is a no-op.
285 |  */
286 | extern NANOTHREAD_EXPORT void task_wait(Task *task) NANOTHREAD_THROW;
287 |
288 | /*
289 |  * \brief Wait for the completion of the specified task and release its handle
290 |  *
291 |  * This function is equivalent to calling \ref task_wait() followed by \ref
292 |  * task_release().
293 |  *
294 |  * If an exception was caught during parallel execution of 'task', the
295 |  * function \ref task_wait_and_release() will perform the release step and then
296 |  * re-raise this exception in the context of the caller. Note that if a
297 |  * parallel task raises many exceptions, only a single one of them will be
298 |  * captured in this way.
299 |  *
300 |  * \param task
301 |  *     The task in question. When equal to \c nullptr, the operation is a no-op.
302 |  */
303 | extern NANOTHREAD_EXPORT void task_wait_and_release(Task *task) NANOTHREAD_THROW;
304 |
305 | /**
306 |  * \brief Return the time consumed by the task in milliseconds
307 |  *
308 |  * To use this function, you must first enable time profiling via \ref
309 |  * pool_set_profile() before launching tasks.
310 |  */
311 | extern NANOTHREAD_EXPORT double task_time(Task *task) NANOTHREAD_THROW;
312 |
313 | /*
314 |  * \brief Increase the reference count of a task
315 |  *
316 |  * In advanced use cases, it may be helpful if multiple parts of the system can
317 |  * hold references to a task (and e.g. query timing information or
318 |  * completeness). The \c task_retain operation enables this by increasing an
319 |  * internal reference counter so that \ref task_release() must be called
320 |  * multiple times before the task is actually released.
321 |  *
322 |  * \param task
323 |  *     The task in question. When equal to \c nullptr, the operation is a no-op.
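 *
 * For example (placeholder names), after one call to \c task_retain(), two
 * matching calls to \c task_release() are required before the handle is
 * actually recycled:
 *
 *     task_retain(task);            // e.g. a profiling subsystem keeps a reference
 *     double ms = task_time(task);  // ... which can later query timing information
 *     task_release(task);           // drop the extra reference
 *     task_release(task);           // drop the original reference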
324 | */ 325 | extern NANOTHREAD_EXPORT void task_retain(Task *task); 326 | 327 | /// Convenience wrapper around task_submit_dep(), but without dependencies 328 | static inline 329 | Task *task_submit(Pool *pool, 330 | uint32_t size NANOTHREAD_DEF(1), 331 | void (*func)(uint32_t, void *) NANOTHREAD_DEF(0), 332 | void *payload NANOTHREAD_DEF(0), 333 | uint32_t payload_size NANOTHREAD_DEF(0), 334 | void (*payload_deleter)(void *) NANOTHREAD_DEF(0), 335 | int always_async NANOTHREAD_DEF(0)) { 336 | 337 | return task_submit_dep(pool, 0, 0, size, func, payload, payload_size, 338 | payload_deleter, always_async); 339 | } 340 | 341 | /// Convenience wrapper around task_submit(), but fully synchronous 342 | static inline 343 | void task_submit_and_wait(Pool *pool, 344 | uint32_t size NANOTHREAD_DEF(1), 345 | void (*func)(uint32_t, void *) NANOTHREAD_DEF(0), 346 | void *payload NANOTHREAD_DEF(0)) { 347 | 348 | Task *task = task_submit(pool, size, func, payload, 0, 0, 0); 349 | task_wait_and_release(task); 350 | } 351 | 352 | #if defined(__cplusplus) 353 | } 354 | 355 | #include 356 | 357 | namespace drjit { 358 | template struct blocked_range { 359 | public: 360 | blocked_range(Int begin, Int end, Int block_size = 1) 361 | : m_begin(begin), m_end(end), m_block_size(block_size) { } 362 | 363 | struct iterator { 364 | Int value; 365 | 366 | iterator(Int value) : value(value) { } 367 | 368 | Int operator*() const { return value; } 369 | operator Int() const { return value;} 370 | 371 | void operator++() { value++; } 372 | bool operator==(const iterator &it) { return value == it.value; } 373 | bool operator!=(const iterator &it) { return value != it.value; } 374 | }; 375 | 376 | uint32_t blocks() const { 377 | return (uint32_t) ((m_end - m_begin + m_block_size - 1) / m_block_size); 378 | } 379 | 380 | iterator begin() const { return iterator(m_begin); } 381 | iterator end() const { return iterator(m_end); } 382 | Int block_size() const { return m_block_size; } 383 | 384 | private: 385 | Int m_begin; 386 | Int m_end; 387 | Int m_block_size; 388 | }; 389 | 390 | template 391 | void parallel_for(const blocked_range &range, Func &&func, 392 | Pool *pool = nullptr) { 393 | 394 | struct Payload { 395 | Func *f; 396 | Int begin, end, block_size; 397 | }; 398 | 399 | Payload payload{ &func, range.begin(), range.end(), 400 | range.block_size() }; 401 | 402 | auto callback = [](uint32_t index, void *payload) { 403 | Payload *p = (Payload *) payload; 404 | Int begin = p->begin + p->block_size * (Int) index, 405 | end = begin + p->block_size; 406 | 407 | if (end > p->end) 408 | end = p->end; 409 | 410 | (*p->f)(blocked_range(begin, end)); 411 | }; 412 | 413 | task_submit_and_wait(pool, range.blocks(), callback, &payload); 414 | } 415 | 416 | template 417 | Task *parallel_for_async(const blocked_range &range, Func &&func, 418 | const Task * const *parents, 419 | size_t parent_count, 420 | Pool *pool = nullptr) { 421 | using BaseFunc = typename std::decay::type; 422 | 423 | struct Payload { 424 | BaseFunc f; 425 | Int begin, end, block_size; 426 | }; 427 | 428 | auto callback = [](uint32_t index, void *payload) { 429 | Payload *p = (Payload *) payload; 430 | Int begin = p->begin + p->block_size * (Int) index, 431 | end = begin + p->block_size; 432 | 433 | if (end > p->end) 434 | end = p->end; 435 | 436 | p->f(blocked_range(begin, end)); 437 | }; 438 | 439 | if (std::is_trivially_copyable::value && 440 | std::is_trivially_destructible::value) { 441 | Payload payload{ std::forward(func), range.begin(), 442 | 
range.end(), range.block_size() }; 443 | 444 | return task_submit_dep(pool, parents, 445 | (uint32_t) parent_count, range.blocks(), 446 | callback, &payload, sizeof(Payload), nullptr, 1); 447 | } else { 448 | Payload *payload = new Payload{ std::forward(func), range.begin(), 449 | range.end(), range.block_size() }; 450 | 451 | auto deleter = [](void *payload) { 452 | delete (Payload *) payload; 453 | }; 454 | 455 | return task_submit_dep(pool, parents, 456 | (uint32_t) parent_count, range.blocks(), 457 | callback, payload, 0, deleter, 1); 458 | } 459 | } 460 | 461 | template 462 | Task *parallel_for_async(const blocked_range &range, Func &&func, 463 | std::initializer_list parents = { }, 464 | Pool *pool = nullptr) { 465 | return parallel_for_async(range, func, parents.begin(), parents.size(), 466 | pool); 467 | } 468 | 469 | template 470 | Task *do_async(Func &&func, const Task * const *parents, size_t parent_count, 471 | Pool *pool = nullptr) { 472 | using BaseFunc = typename std::decay::type; 473 | 474 | struct Payload { 475 | BaseFunc f; 476 | }; 477 | 478 | auto callback = [](uint32_t /* unused */, void *payload) { 479 | ((Payload *) payload)->f(); 480 | }; 481 | 482 | if (std::is_trivially_copyable::value && 483 | std::is_trivially_destructible::value) { 484 | Payload payload {std::forward(func) }; 485 | 486 | return task_submit_dep(pool, parents, 487 | (uint32_t) parent_count, 1, callback, 488 | &payload, sizeof(Payload), nullptr, 1); 489 | } else { 490 | Payload *payload = new Payload{ std::forward(func) }; 491 | 492 | auto deleter = [](void *payload) { delete (Payload *) payload; }; 493 | 494 | return task_submit_dep(pool, parents, 495 | (uint32_t) parent_count, 1, callback, 496 | payload, 0, deleter, 1); 497 | } 498 | } 499 | 500 | template 501 | Task *do_async(Func &&func, std::initializer_list parents = {}, 502 | Pool *pool = nullptr) { 503 | return do_async(func, parents.begin(), parents.size(), pool); 504 | } 505 | } 506 | #endif 507 | -------------------------------------------------------------------------------- /src/nanothread.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | src/pool.cpp -- Simple thread pool with task-based API 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 8 | */ 9 | 10 | #include 11 | #include "queue.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #if defined(__linux__) 18 | # include 19 | #elif defined(_WIN32) 20 | # include 21 | # include 22 | #endif 23 | 24 | #if defined(_MSC_VER) 25 | # include 26 | #elif defined(__SSE2__) 27 | # include 28 | #endif 29 | 30 | struct Worker; 31 | 32 | /// TLS variable storing an ID of each thread 33 | #if defined(_MSC_VER) 34 | static __declspec(thread) uint32_t thread_id_tls = 0; 35 | #else 36 | static __thread uint32_t thread_id_tls = 0; 37 | #endif 38 | 39 | /// Data structure describing a pool of workers 40 | struct Pool { 41 | /// Queue of scheduled tasks 42 | TaskQueue queue; 43 | 44 | /// List of currently running worker threads 45 | std::vector> workers; 46 | 47 | /// Number of idle workers that have gone to sleep 48 | std::atomic asleep; 49 | 50 | /// Should denormalized floating point numbers be flushed to zero? 
51 | bool ftz = true; 52 | }; 53 | 54 | struct Worker { 55 | Pool *pool; 56 | std::thread thread; 57 | uint32_t id; 58 | bool stop; 59 | bool ftz; 60 | 61 | Worker(Pool *pool, uint32_t id, bool ftz); 62 | ~Worker(); 63 | void run(); 64 | }; 65 | 66 | 67 | static Pool *pool_default_inst = nullptr; 68 | static Lock pool_default_lock; 69 | static uint32_t cached_core_count = 0; 70 | 71 | uint32_t core_count() { 72 | // assumes atomic word size memory access 73 | if (cached_core_count) 74 | return cached_core_count; 75 | 76 | // Determine the number of present cores 77 | uint32_t ncores = std::thread::hardware_concurrency(); 78 | 79 | #if defined(__linux__) 80 | // Don't try to query CPU affinity if running inside Valgrind 81 | if (getenv("VALGRIND_OPTS") == nullptr) { 82 | /* Some of the cores may not be available to the user 83 | (e.g. on certain cluster nodes) -- determine the number 84 | of actual available cores here. */ 85 | uint32_t ncores_logical = ncores; 86 | size_t size = 0; 87 | cpu_set_t *cpuset = nullptr; 88 | int retval = 0; 89 | 90 | /* The kernel may expect a larger cpu_set_t than would 91 | be warranted by the physical core count. Keep querying 92 | with increasingly larger buffers if the 93 | pthread_getaffinity_np operation fails */ 94 | for (uint32_t i = 0; i < 10; ++i) { 95 | size = CPU_ALLOC_SIZE(ncores_logical); 96 | cpuset = CPU_ALLOC(ncores_logical); 97 | if (!cpuset) { 98 | fprintf(stderr, "nanothread: core_count(): Could not allocate cpu_set_t.\n"); 99 | return ncores; 100 | } 101 | CPU_ZERO_S(size, cpuset); 102 | 103 | int retval = pthread_getaffinity_np(pthread_self(), size, cpuset); 104 | if (retval == 0) 105 | break; 106 | CPU_FREE(cpuset); 107 | ncores_logical *= 2; 108 | } 109 | 110 | if (retval) { 111 | fprintf(stderr, "nanothread: core_count(): Could not read thread affinity map.\n"); 112 | return ncores; 113 | } 114 | 115 | uint32_t ncores_avail = 0; 116 | for (uint32_t i = 0; i < ncores_logical; ++i) 117 | ncores_avail += CPU_ISSET_S(i, size, cpuset) ? 
1 : 0; 118 | ncores = ncores_avail; 119 | CPU_FREE(cpuset); 120 | } 121 | #endif 122 | cached_core_count = ncores; 123 | return ncores; 124 | } 125 | 126 | 127 | uint32_t pool_thread_id() { 128 | return thread_id_tls; 129 | } 130 | 131 | Pool *pool_default() { 132 | std::unique_lock guard(pool_default_lock); 133 | 134 | if (!pool_default_inst) 135 | pool_default_inst = pool_create(); 136 | 137 | return pool_default_inst; 138 | } 139 | 140 | Pool *pool_create(uint32_t size, int ftz) { 141 | Pool *pool = new Pool(); 142 | pool->ftz = ftz != 0; 143 | if (size == (uint32_t) -1) 144 | size = core_count(); 145 | NT_TRACE("pool_create(%p)", pool); 146 | pool_set_size(pool, size); 147 | return pool; 148 | } 149 | 150 | 151 | void pool_destroy(Pool *pool) { 152 | if (pool) { 153 | pool_set_size(pool, 0); 154 | delete pool; 155 | } else if (pool_default_inst) { 156 | pool_destroy(pool_default_inst); 157 | pool_default_inst = nullptr; 158 | } 159 | } 160 | 161 | uint32_t pool_size(Pool *pool) { 162 | if (!pool) { 163 | std::unique_lock guard(pool_default_lock); 164 | pool = pool_default_inst; 165 | } 166 | 167 | if (pool) 168 | return (uint32_t) pool->workers.size(); 169 | else 170 | return core_count(); 171 | } 172 | 173 | void pool_set_size(Pool *pool, uint32_t size) { 174 | if (!pool) { 175 | std::unique_lock guard(pool_default_lock); 176 | pool = pool_default_inst; 177 | 178 | if (!pool) { 179 | pool = pool_default_inst = new Pool(); 180 | NT_TRACE("pool_create(%p)", pool); 181 | } 182 | } 183 | 184 | NT_TRACE("pool_set_size(%p, %u)", pool, size); 185 | 186 | int diff = (int) size - (int) pool->workers.size(); 187 | if (diff > 0) { 188 | // Launch extra worker threads 189 | for (int i = 0; i < diff; ++i) 190 | pool->workers.push_back(std::unique_ptr( 191 | new Worker(pool, (uint32_t) pool->workers.size() + 1, pool->ftz))); 192 | } else if (diff < 0) { 193 | // Remove worker threads (destructor calls join()) 194 | for (int i = diff; i != 0; ++i) 195 | pool->workers[pool->workers.size() + i]->stop = true; 196 | pool->queue.wakeup(); 197 | for (int i = diff; i != 0; ++i) 198 | pool->workers.pop_back(); 199 | } 200 | } 201 | 202 | int profile_tasks = false; 203 | 204 | int pool_profile() { 205 | return (int) profile_tasks; 206 | } 207 | 208 | void pool_set_profile(int value) { 209 | profile_tasks = (bool) value; 210 | } 211 | 212 | Task *task_submit_dep(Pool *pool, const Task *const *parent, 213 | uint32_t parent_count, uint32_t size, 214 | void (*func)(uint32_t, void *), void *payload, 215 | uint32_t payload_size, void (*payload_deleter)(void *), 216 | int async) { 217 | 218 | if (size == 0) { 219 | // There is no work, so the payload is irrelevant 220 | func = nullptr; 221 | 222 | // The queue requires task size >= 1 223 | size = 1; 224 | } 225 | 226 | // Does the task have parent tasks 227 | bool has_parent = false; 228 | for (uint32_t i = 0; i < parent_count; ++i) 229 | has_parent |= parent[i] != nullptr; 230 | 231 | // If this is a small work unit, execute it right away 232 | if (size == 1 && !has_parent && async == 0) { 233 | NT_TRACE("task_submit_dep(): task is small, executing right away"); 234 | 235 | if (!profile_tasks) { 236 | if (func) 237 | func(0, payload); 238 | 239 | if (payload_deleter) 240 | payload_deleter(payload); 241 | 242 | // Don't even return a task.. 
243 | return nullptr; 244 | } else { 245 | if (!pool) 246 | pool = pool_default(); 247 | 248 | Task *task = pool->queue.alloc(size); 249 | 250 | if (profile_tasks) { 251 | #if defined(_WIN32) 252 | QueryPerformanceCounter(&task->time_start); 253 | #elif defined(__APPLE__) 254 | task->time_start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 255 | #else 256 | clock_gettime(CLOCK_MONOTONIC, &task->time_start); 257 | #endif 258 | } 259 | 260 | if (func) 261 | func(0, payload); 262 | 263 | if (profile_tasks) { 264 | #if defined(_WIN32) 265 | QueryPerformanceCounter(&task->time_end); 266 | #elif defined(__APPLE__) 267 | task->time_end = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 268 | #else 269 | clock_gettime(CLOCK_MONOTONIC, &task->time_end); 270 | #endif 271 | } 272 | 273 | if (payload_deleter) 274 | payload_deleter(payload); 275 | 276 | task->refcount.store(high_bit, std::memory_order_relaxed); 277 | task->exception_used.store(false, std::memory_order_relaxed); 278 | task->exception = nullptr; 279 | task->size = size; 280 | task->func = func; 281 | task->pool = pool; 282 | task->payload = nullptr; 283 | task->payload_deleter = nullptr; 284 | 285 | return task; 286 | } 287 | } 288 | 289 | // Size 0 is equivalent to size 1, but without the above optimization 290 | if (size == 0) 291 | size = 1; 292 | 293 | if (!pool) 294 | pool = pool_default(); 295 | 296 | Task *task = pool->queue.alloc(size); 297 | task->exception_used.store(false, std::memory_order_relaxed); 298 | task->exception = nullptr; 299 | 300 | if (has_parent) { 301 | // Prevent early job submission due to completion of parents 302 | task->wait_parents.store(1, std::memory_order_release); 303 | 304 | // Register dependencies in queue, will further increase child->wait_parents 305 | for (uint32_t i = 0; i < parent_count; ++i) 306 | pool->queue.add_dependency((Task *) parent[i], task); 307 | } 308 | 309 | task->size = size; 310 | task->func = func; 311 | task->pool = pool; 312 | 313 | if (payload) { 314 | if (payload_deleter || payload_size == 0) { 315 | task->payload = payload; 316 | task->payload_deleter = payload_deleter; 317 | } else if (payload_size <= sizeof(Task::payload_storage)) { 318 | task->payload = task->payload_storage; 319 | memcpy(task->payload_storage, payload, payload_size); 320 | task->payload_deleter = nullptr; 321 | } else { 322 | /* Payload doesn't fit into temporary storage, and no 323 | custom deleter was provided. Make a temporary copy. */ 324 | task->payload = malloc(payload_size); 325 | task->payload_deleter = free; 326 | NT_ASSERT(task->payload != nullptr); 327 | memcpy(task->payload, payload, payload_size); 328 | } 329 | } else { 330 | task->payload = nullptr; 331 | task->payload_deleter = nullptr; 332 | } 333 | 334 | bool push = true; 335 | if (has_parent) { 336 | /* Undo the earlier 'wait' increment. If the value is now zero, all 337 | parent tasks have completed and the job can be pushed. Otherwise, 338 | it's somebody else's job to carry out this step. 
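           Concretely, the store of 1 further above keeps 'wait_parents' nonzero
           while the dependencies are being registered; whichever decrement brings
           the counter to zero -- the fetch_sub below, or a parent's completion
           handler in TaskQueue::release() -- is the one that pushes the task.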
*/ 339 | push = task->wait_parents.fetch_sub(1) == 1; 340 | } 341 | 342 | if (push) 343 | pool->queue.push(task); 344 | 345 | return task; 346 | } 347 | 348 | static void pool_execute_task(Pool *pool, bool (*stopping_criterion)(void *), 349 | void *payload, bool may_sleep) { 350 | Task *task; 351 | uint32_t index; 352 | std::tie(task, index) = 353 | pool->queue.pop_or_sleep(stopping_criterion, payload, may_sleep); 354 | 355 | if (task) { 356 | if (task->func) { 357 | if (task->exception_used.load()) { 358 | NT_TRACE( 359 | "not running callback (task=%p, index=%u) because another " 360 | "work unit of this task generated an exception", 361 | task, index); 362 | } else { 363 | try { 364 | NT_TRACE("running callback (task=%p, index=%u, payload=%p)", task, index, task->payload); 365 | task->func(index, task->payload); 366 | } catch (...) { 367 | bool value = false; 368 | if (task->exception_used.compare_exchange_strong(value, true)) { 369 | NT_TRACE("exception caught, storing.."); 370 | task->exception = std::current_exception(); 371 | } else { 372 | NT_TRACE("exception caught, ignoring (an exception was already stored)"); 373 | } 374 | } 375 | } 376 | } 377 | 378 | pool->queue.release(task); 379 | } 380 | } 381 | 382 | void pool_work_until(Pool *pool, bool (*stopping_criterion)(void *), void *payload) { 383 | if (!pool) 384 | pool = pool_default_inst; 385 | if (!pool) 386 | return; 387 | while (!stopping_criterion(payload)) 388 | pool_execute_task(pool, stopping_criterion, payload, false); 389 | } 390 | 391 | #if defined(__SSE2__) 392 | struct FTZGuard { 393 | FTZGuard(bool enable) : enable(enable) { 394 | if (enable) { 395 | csr = _mm_getcsr(); 396 | _mm_setcsr(csr | (_MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON)); 397 | } 398 | } 399 | 400 | ~FTZGuard() { 401 | if (enable) 402 | _mm_setcsr(csr); 403 | } 404 | 405 | bool enable; 406 | int csr; 407 | }; 408 | #else 409 | struct FTZGuard { FTZGuard(bool) { } }; 410 | #endif 411 | 412 | void task_wait(Task *task) { 413 | if (task) { 414 | Pool *pool = task->pool; 415 | FTZGuard ftz_guard(pool->ftz); 416 | 417 | // Signal that we are waiting for this task 418 | task->wait_count++; 419 | 420 | auto stopping_criterion = [](void *ptr) -> bool { 421 | return (uint32_t)(((Task *) ptr)->refcount.load()) == 0; 422 | }; 423 | 424 | NT_TRACE("task_wait(%p)", task); 425 | 426 | // Help executing work units in the meantime 427 | while (!stopping_criterion(task)) 428 | pool_execute_task(pool, stopping_criterion, task, true); 429 | 430 | task->wait_count--; 431 | 432 | if (task->exception) 433 | std::rethrow_exception(task->exception); 434 | } 435 | } 436 | 437 | void task_retain(Task *task) { 438 | if (task) 439 | task->pool->queue.retain(task); 440 | } 441 | 442 | void task_release(Task *task) { 443 | if (task) 444 | task->pool->queue.release(task, true); 445 | } 446 | 447 | void task_wait_and_release(Task *task) NANOTHREAD_THROW { 448 | try { 449 | task_wait(task); 450 | } catch (...) 
{ 451 | task_release(task); 452 | throw; 453 | } 454 | task_release(task); 455 | } 456 | 457 | #if defined(_WIN32) 458 | static double timer_frequency_scale = 0.0; 459 | #endif 460 | 461 | NANOTHREAD_EXPORT double task_time(Task *task) NANOTHREAD_THROW { 462 | if (!task) 463 | return 0; 464 | 465 | #if defined(__APPLE__) 466 | return (task->time_end - task->time_start) * 1e-6; 467 | #elif !defined(_WIN32) 468 | return (task->time_end.tv_sec - task->time_start.tv_sec) * 1e3 + 469 | (task->time_end.tv_nsec - task->time_start.tv_nsec) * 1e-6; 470 | #else 471 | if (timer_frequency_scale == 0.0) { 472 | LARGE_INTEGER timer_frequency; 473 | QueryPerformanceFrequency(&timer_frequency); 474 | timer_frequency_scale = 1e3 / timer_frequency.QuadPart; 475 | } 476 | 477 | return timer_frequency_scale * 478 | (task->time_end.QuadPart - task->time_start.QuadPart); 479 | #endif 480 | } 481 | 482 | Worker::Worker(Pool *pool, uint32_t id, bool ftz) 483 | : pool(pool), id(id), stop(false), ftz(ftz) { 484 | thread = std::thread(&Worker::run, this); 485 | } 486 | 487 | Worker::~Worker() { thread.join(); } 488 | 489 | void Worker::run() { 490 | thread_id_tls = id; 491 | 492 | NT_TRACE("worker started"); 493 | 494 | #if defined(_WIN32) 495 | wchar_t buf[24]; 496 | _snwprintf(buf, sizeof(buf) / sizeof(wchar_t), L"nanothread worker %u", id); 497 | SetThreadDescription(GetCurrentThread(), buf); 498 | #else 499 | char buf[24]; 500 | snprintf(buf, sizeof(buf), "nanothread worker %u", id); 501 | #if defined(__APPLE__) 502 | pthread_setname_np(buf); 503 | #else 504 | pthread_setname_np(pthread_self(), buf); 505 | #endif 506 | #endif 507 | 508 | FTZGuard ftz_guard(ftz); 509 | while (!stop) 510 | pool_execute_task( 511 | pool, [](void *ptr) -> bool { return *((bool *) ptr); }, &stop, 512 | true); 513 | 514 | NT_TRACE("worker stopped"); 515 | 516 | thread_id_tls = 0; 517 | } 518 | -------------------------------------------------------------------------------- /src/queue.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | src/queue.cpp -- Lock-free task queue implementation used by nanothread 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 
8 | */ 9 | 10 | #include "queue.h" 11 | #include 12 | #include 13 | #include 14 | 15 | #if defined(_WIN32) 16 | # include 17 | #endif 18 | 19 | #if defined(_MSC_VER) 20 | # include 21 | #elif defined(__SSE2__) 22 | # include 23 | #endif 24 | 25 | /// Put worker threads to sleep after 500K attempts to get work 26 | #define NANOTHREAD_MAX_ATTEMPTS 500000 27 | 28 | /// Reduce power usage in busy-wait CAS loops 29 | static void cas_pause() { 30 | #if defined(_M_X64) || defined(__SSE2__) 31 | _mm_pause(); 32 | #endif 33 | } 34 | 35 | /// Atomic 16 byte compare-and-swap & release barrier on ARM 36 | static bool cas(Task::Ptr &ptr, Task::Ptr &expected, Task::Ptr desired) { 37 | #if defined(_MSC_VER) 38 | #if defined(_M_ARM64) 39 | return _InterlockedCompareExchange128_rel( 40 | (__int64 volatile *) &ptr, (__int64) desired.value, 41 | (__int64) desired.task, (__int64 *) &expected); 42 | #else 43 | return _InterlockedCompareExchange128( 44 | (__int64 volatile *) &ptr, (__int64) desired.value, 45 | (__int64) desired.task, (__int64 *) &expected); 46 | #endif 47 | #else 48 | return __atomic_compare_exchange(&ptr, &expected, &desired, true, 49 | __ATOMIC_RELEASE, __ATOMIC_ACQUIRE); 50 | #endif 51 | } 52 | 53 | // *Non-atomic* 16 byte load, acquire barrier on ARM 54 | static Task::Ptr ldar(Task::Ptr &source) { 55 | #if defined(_MSC_VER) 56 | using P = unsigned __int64 volatile *; 57 | #if defined(_M_ARM64) 58 | uint64_t value_1 = __ldar64((P) &source); 59 | uint64_t value_2 = __ldar64(((P) &source) + 1); 60 | #else 61 | uint64_t value_1 = *((P) &source); 62 | uint64_t value_2 = *(((P) &source) + 1); 63 | #endif 64 | return Task::Ptr{ (Task *) value_1, (uint64_t) value_2 }; 65 | #else 66 | uint64_t value_1 = __atomic_load_n((uint64_t *) &source, __ATOMIC_ACQUIRE); 67 | uint64_t value_2 = __atomic_load_n((((uint64_t *) &source) + 1), __ATOMIC_ACQUIRE); 68 | return Task::Ptr{ (Task *) value_1, value_2 }; 69 | #endif 70 | } 71 | 72 | TaskQueue::TaskQueue() : tasks_created(0), sleep_state(0) { 73 | head = Task::Ptr(alloc(0)); 74 | tail = head; 75 | } 76 | 77 | TaskQueue::~TaskQueue() { 78 | uint32_t created = tasks_created.load(), 79 | deleted = 0, incomplete = 0, 80 | incomplete_size = 0; 81 | 82 | // Free jobs that are still in the queue 83 | Task::Ptr ptr = head; 84 | while (ptr.task) { 85 | Task *task = ptr.task; 86 | 87 | if (ptr.remain() != 0) { 88 | incomplete_size += ptr.remain(); 89 | incomplete++; 90 | } 91 | 92 | for (Task *child : task->children) { 93 | uint32_t wait = child->wait_parents.fetch_sub(1); 94 | NT_ASSERT(wait != 0); 95 | if (wait == 1) 96 | push(child); 97 | } 98 | 99 | task->clear(); 100 | deleted++; 101 | ptr = task->next; 102 | delete task; 103 | } 104 | 105 | // Free jobs on the free-job stack 106 | ptr = recycle; 107 | while (ptr.task) { 108 | Task *task = ptr.task; 109 | NT_ASSERT(task->payload == nullptr && task->children.empty()); 110 | deleted++; 111 | ptr = task->next; 112 | delete task; 113 | } 114 | 115 | if (created != deleted) 116 | fprintf(stderr, 117 | "nanothread: %u/%u tasks were leaked! 
Did you forget to call " 118 | "task_release()?\n", created - deleted, created); 119 | 120 | if (incomplete > 0) 121 | fprintf(stderr, "nanothread: %u tasks with %u work units were not " 122 | "completed!\n", incomplete, incomplete_size); 123 | } 124 | 125 | Task *TaskQueue::alloc(uint32_t size) { 126 | Task::Ptr node = ldar(recycle); 127 | 128 | while (true) { 129 | // Stop if stack is empty 130 | if (!node) 131 | break; 132 | 133 | // Load the next node 134 | Task::Ptr next = ldar(node.task->next); 135 | 136 | // Next, try to move it to the stack head 137 | if (cas(recycle, node, node.update_task(next.task))) 138 | break; 139 | 140 | cas_pause(); 141 | } 142 | 143 | Task *task; 144 | 145 | if (node.task) { 146 | task = node.task; 147 | } else { 148 | task = new Task(); 149 | tasks_created++; 150 | } 151 | 152 | task->next = Task::Ptr(); 153 | task->refcount.store(size + (size == 0 ? high_bit : (3 * high_bit)), 154 | std::memory_order_relaxed); 155 | task->wait_parents.store(0, std::memory_order_relaxed); 156 | task->wait_count.store(0, std::memory_order_relaxed); 157 | task->size = size; 158 | memset(&task->time_start, 0, sizeof(task->time_start)); 159 | memset(&task->time_end, 0, sizeof(task->time_end)); 160 | 161 | NT_TRACE("created new task %p with size=%u", task, size); 162 | 163 | return task; 164 | } 165 | 166 | void TaskQueue::release(Task *task, bool high) { 167 | uint64_t result = task->refcount.fetch_sub(high ? high_bit : 1); 168 | uint32_t ref_lo = (uint32_t) result, 169 | ref_hi = (uint32_t) (result >> 32); 170 | 171 | NT_ASSERT((!high || ref_hi > 0) && (high || ref_lo > 0)); 172 | ref_hi -= (uint32_t) high; 173 | ref_lo -= (uint32_t) !high; 174 | 175 | NT_TRACE("dec_ref(%p, (%i, %i)) -> ref = (%u, %u)", task, (int) high, 176 | (int) !high, ref_hi, ref_lo); 177 | 178 | // If all work has completed: schedule children and free payload 179 | if (!high && ref_lo == 0) { 180 | NT_TRACE("all work associated with task %p has completed.", task); 181 | 182 | if (profile_tasks) { 183 | #if defined(_WIN32) 184 | QueryPerformanceCounter(&task->time_end); 185 | #elif defined(__APPLE__) 186 | task->time_end = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 187 | #else 188 | clock_gettime(CLOCK_MONOTONIC, &task->time_end); 189 | #endif 190 | } 191 | 192 | for (Task *child : task->children) { 193 | uint32_t wait = child->wait_parents.fetch_sub(1); 194 | 195 | NT_TRACE("notifying child %p of task %p: wait=%u", child, task, 196 | wait - 1); 197 | 198 | NT_ASSERT(wait > 0); 199 | 200 | if (task->exception_used.load()) { 201 | bool expected = false; 202 | if (child->exception_used.compare_exchange_strong(expected, true)) { 203 | NT_TRACE("propagating exception to child %p of task %p.", 204 | child, task); 205 | child->exception = task->exception; 206 | } else { 207 | NT_TRACE("not propagating exception to child %p of " 208 | "task %p (already stored).", child, task); 209 | } 210 | } 211 | 212 | if (wait == 1) { 213 | NT_TRACE("Child %p of task %p is ready for execution.", child, 214 | task); 215 | push(child); 216 | } 217 | } 218 | 219 | task->clear(); 220 | 221 | // Possible that waiting threads were put to sleep 222 | if (task->wait_count.load() > 0) 223 | wakeup(); 224 | 225 | release(task, true); 226 | } else if (high && ref_hi == 0) { 227 | // Nobody holds any references at this point, recycle task 228 | 229 | NT_ASSERT(ref_lo == 0); 230 | NT_TRACE("all usage of task %p is done, recycling.", task); 231 | 232 | Task::Ptr node = ldar(recycle); 233 | while (true) { 234 | task->next = node; 235 | 236 
| if (cas(recycle, node, node.update_task(task))) 237 | break; 238 | 239 | cas_pause(); 240 | } 241 | } 242 | } 243 | 244 | void TaskQueue::add_dependency(Task *parent, Task *child) { 245 | if (!parent) 246 | return; 247 | 248 | uint64_t refcount = 249 | parent->refcount.load(std::memory_order_relaxed); 250 | 251 | /* Increase the parent task's reference count to prevent the cleanup 252 | handler in release() from starting while the following executes. */ 253 | while (true) { 254 | if ((uint32_t) refcount == 0) { 255 | // Parent task has already completed 256 | if (parent->exception_used.load()) { 257 | bool expected = false; 258 | if (child->exception_used.compare_exchange_strong(expected, true)) { 259 | NT_TRACE("propagating exception to child %p of task %p.", 260 | child, parent); 261 | child->exception = parent->exception; 262 | } else { 263 | NT_TRACE("not propagating exception to child %p of " 264 | "task %p (already stored).", child, parent); 265 | } 266 | } 267 | return; 268 | } 269 | 270 | if (parent->refcount.compare_exchange_weak(refcount, refcount + 1, 271 | std::memory_order_release, 272 | std::memory_order_relaxed)) 273 | break; 274 | 275 | cas_pause(); 276 | } 277 | 278 | // Otherwise, register the child task with the parent 279 | parent->children.push_back(child); 280 | uint32_t wait = ++child->wait_parents; 281 | (void) wait; 282 | 283 | NT_TRACE("registering dependency: parent=%p, child=%p, child->wait=%u", 284 | parent, child, wait); 285 | 286 | /* Undo the parent->refcount change. If the task completed in the 287 | meantime, child->wait_parents will also be decremented by 288 | this call. */ 289 | release(parent); 290 | } 291 | 292 | void TaskQueue::retain(Task *task) { 293 | NT_TRACE("retain(task=%p)", task); 294 | task->refcount.fetch_add(high_bit); 295 | } 296 | 297 | void TaskQueue::push(Task *task) { 298 | uint32_t size = task->size; 299 | 300 | NT_TRACE("push(task=%p, size=%u)", task, size); 301 | 302 | while (true) { 303 | // Lead tail and tail->next, and double-check, in this order 304 | Task::Ptr tail_c = ldar(tail); 305 | Task::Ptr &next = tail_c.task->next; 306 | Task::Ptr next_c = ldar(next); 307 | Task::Ptr tail_c_2 = ldar(tail); 308 | 309 | // Detect inconsistencies due to contention 310 | if (tail_c == tail_c_2) { 311 | if (!next_c.task) { 312 | // Tail was pointing to last node, try to insert here 313 | if (cas(next, next_c, Task::Ptr(task, size))) { 314 | // Best-effort attempt to redirect tail to the added element 315 | cas(tail, tail_c, tail_c.update_task(task)); 316 | break; 317 | } 318 | } else { 319 | // Tail wasn't pointing to the last node, try to update 320 | cas(tail, tail_c, tail_c.update_task(next_c.task)); 321 | } 322 | } 323 | 324 | cas_pause(); 325 | } 326 | 327 | // Wake sleeping threads, if any 328 | if (sleep_state.load(std::memory_order_acquire) & low_mask) 329 | wakeup(); 330 | } 331 | 332 | std::pair TaskQueue::pop() { 333 | uint32_t index; 334 | Task *task; 335 | 336 | while (true) { 337 | // Lead head, tail, and next element, and double-check, in this order 338 | Task::Ptr head_c = ldar(head); 339 | Task::Ptr tail_c = ldar(tail); 340 | Task::Ptr &next = head_c.task->next; 341 | Task::Ptr next_c = ldar(next); 342 | Task::Ptr head_c_2 = ldar(head); 343 | 344 | // Detect inconsistencies due to contention 345 | if (head_c == head_c_2) { 346 | if (head_c.task != tail_c.task) { 347 | uint32_t remain = next_c.remain(); 348 | 349 | if (remain > 1) { 350 | // More than 1 remaining work units, update work counter 351 | if (cas(next, next_c, 
next_c.update_remain(remain - 1))) { 352 | task = next_c.task; 353 | index = task->size - remain; 354 | break; 355 | } 356 | } else { 357 | NT_ASSERT(remain == 1); 358 | // Head node is removed from the queue, reduce refcount 359 | if (cas(head, head_c, head_c.update_task(next_c.task))) { 360 | task = next_c.task; 361 | index = task->size - 1; 362 | release(head_c.task, true); 363 | break; 364 | } 365 | } 366 | } else { 367 | // Task queue was empty 368 | if (!next_c.task) { 369 | task = nullptr; 370 | index = 0; 371 | cas_pause(); 372 | break; 373 | } else { 374 | // Advance the tail, it's falling behind 375 | cas(tail, tail_c, tail_c.update_task(next_c.task)); 376 | } 377 | } 378 | } 379 | 380 | cas_pause(); 381 | } 382 | 383 | if (task) { 384 | NT_TRACE("pop(task=%p, index=%u)", task, index); 385 | 386 | if (index == 0 && profile_tasks) { 387 | #if defined(_WIN32) 388 | QueryPerformanceCounter(&task->time_start); 389 | #elif defined(__APPLE__) 390 | task->time_start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 391 | #else 392 | clock_gettime(CLOCK_MONOTONIC, &task->time_start); 393 | #endif 394 | } 395 | } 396 | 397 | return { task, index }; 398 | } 399 | 400 | void TaskQueue::wakeup() { 401 | std::unique_lock<std::mutex> guard(sleep_mutex); 402 | uint64_t value = sleep_state.load(); 403 | NT_TRACE("wakeup(): sleep_state := (%u, 0)", (uint32_t) (sleep_state >> 32) + 1); 404 | sleep_state = (value + high_bit) & high_mask; 405 | sleep_cv.notify_all(); 406 | } 407 | 408 | #if defined(NT_DEBUG) 409 | double time_milliseconds() { 410 | #if defined(_WIN32) 411 | LARGE_INTEGER ticks, ticks_per_sec; 412 | QueryPerformanceCounter(&ticks); 413 | QueryPerformanceFrequency(&ticks_per_sec); 414 | return (double) (ticks.QuadPart * 1000) / (double) ticks_per_sec.QuadPart; 415 | #elif defined(__APPLE__) 416 | return clock_gettime_nsec_np(CLOCK_UPTIME_RAW) / 1000000.0; 417 | #else 418 | struct timespec ts; 419 | clock_gettime(CLOCK_MONOTONIC, &ts); 420 | return ts.tv_sec * 1000 + ts.tv_nsec / 1000000.0; 421 | #endif 422 | } 423 | #endif 424 | 425 | std::pair<Task *, uint32_t> 426 | TaskQueue::pop_or_sleep(bool (*stopping_criterion)(void *), void *payload, 427 | bool may_sleep) { 428 | std::pair<Task *, uint32_t> result(nullptr, 0); 429 | uint32_t attempts = 0; 430 | 431 | #if defined(NT_DEBUG) 432 | double start = time_milliseconds(); 433 | #endif 434 | 435 | while (true) { 436 | result = pop(); 437 | 438 | if (result.first || stopping_criterion(payload)) 439 | break; 440 | 441 | attempts++; 442 | 443 | if (may_sleep && attempts >= NANOTHREAD_MAX_ATTEMPTS) { 444 | std::unique_lock<std::mutex> guard(sleep_mutex); 445 | 446 | uint64_t value = ++sleep_state, phase = value & high_mask; 447 | NT_TRACE("pop_or_sleep(): falling asleep after %.2f milliseconds, " 448 | "sleep_state := (%u, %u)!", 449 | time_milliseconds() - start, (uint32_t)(value >> 32), 450 | (uint32_t) value); 451 | 452 | // Try once more to fetch a job 453 | result = pop(); 454 | 455 | /* If the following is true, somebody added work, or the stopping 456 | criterion became active while this thread was about to go to sleep. */ 457 | if (result.first || stopping_criterion(payload)) { 458 | // Reduce sleep_state if we're still in the same phase. 459 | NT_TRACE("sleep aborted."); 460 | while (true) { 461 | if (sleep_state.compare_exchange_strong(value, value - 1)) 462 | break; 463 | if ((value & high_mask) != phase) 464 | break; 465 | cas_pause(); 466 | } 467 | break; 468 | } 469 | 470 | /* The push() code above has the structure 471 | 472 | - A1. Enqueue work 473 | - A2.
Check sleep_state, and wake threads if nonzero 474 | 475 | While the code here has the structure 476 | 477 | - B1. Increase sleep_state 478 | - B2. Try to dequeue work 479 | - B3. Wait for wakeup signal 480 | 481 | This ordering excludes the possibility that the thread sleeps 482 | erroneously while work is available or added later on. 483 | */ 484 | 485 | while ((sleep_state & high_mask) == phase) 486 | sleep_cv.wait(guard); 487 | 488 | value = sleep_state.load(); 489 | NT_TRACE("pop_or_sleep(): woke up -- sleep_state=(%u, %u)", 490 | (uint32_t)(value >> 32), (uint32_t) value); 491 | } 492 | } 493 | 494 | return result; 495 | } 496 | -------------------------------------------------------------------------------- /src/queue.h: -------------------------------------------------------------------------------- 1 | /* 2 | src/queue.h -- Lock-free task queue implementation used by nanothread 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include <nanothread/nanothread.h> 13 | #include <atomic> 14 | #include <condition_variable> 15 | #include <cstring> 16 | #include <exception> 17 | #include <utility> 18 | #include <vector> 19 | 20 | #if defined(_WIN32) 21 | # include <windows.h> 22 | # include <shared_mutex> 23 | using Lock = std::shared_mutex; // Prefer (more efficient) shared_mutex on Windows 24 | #else 25 | # include <mutex> 26 | using Lock = std::mutex; 27 | #endif 28 | 29 | 30 | struct Pool; 31 | 32 | constexpr uint64_t high_bit = (uint64_t) 0x0000000100000000ull; 33 | constexpr uint64_t high_mask = (uint64_t) 0xFFFFFFFF00000000ull; 34 | constexpr uint64_t low_mask = (uint64_t) 0x00000000FFFFFFFFull; 35 | 36 | inline uint64_t shift(uint32_t value) { return ((uint64_t) value) << 32; } 37 | 38 | struct Task { 39 | /** 40 | * \brief Wide 16 byte pointer to a task in the worker pool. In addition to the 41 | * pointer itself, it encapsulates two more pieces of information: 42 | * 43 | * 1. The lower 32 bit of the \c value field store how many remaining work 44 | * units the task contains 45 | * 46 | * 2. The upper 32 bit of the \c value field contain a counter to prevent 47 | * the ABA problem during atomic updates. 48 | */ 49 | struct alignas(16) Ptr { 50 | Task *task; 51 | uint64_t value; 52 | 53 | Ptr(Task *task = nullptr, uint64_t value = 0) : task(task), value(value) { } 54 | 55 | Task::Ptr update_remain(uint32_t remain = 0) const { 56 | return Ptr{ task, remain | ((value & high_mask) + high_bit) }; 57 | } 58 | 59 | Task::Ptr update_task(Task *new_task) const { 60 | return Ptr{ new_task, (value & high_mask) + high_bit }; 61 | } 62 | 63 | operator bool() const { return task != nullptr; } 64 | 65 | uint32_t remain() const { return (uint32_t) value; } 66 | 67 | bool operator==(const Task::Ptr &other) const { 68 | return task == other.task && value == other.value; 69 | } 70 | }; 71 | 72 | /// Singly linked list, points to the next element 73 | Task::Ptr next; 74 | 75 | /** 76 | * \brief Reference count of this instance 77 | * 78 | * The reference count is arranged as a 2-tuple of 32 bit counters. When 79 | * submitting a work unit, its reference count is initially set to (3, 80 | * size), where \c size is the number of associated work units. The 81 | * number '3' indicates three special references 82 | * 83 | * - 1. A reference by the user code, which may e.g. wait for task completion 84 | * - 2. A reference as part of the queue data structure 85 | * - 3.
A reference because the lower part is nonzero 86 | * 87 | * The function TaskQueue::release(task, high=true/false) can be 88 | * used to reduce the high and low parts separately. 89 | * 90 | * When the low part reaches zero, it is assumed that all associated work 91 | * units have been completed, at which point child tasks are scheduled 92 | * and the task's payload is cleared. When both high and low parts reach 93 | * zero, it is assumed that no part of the system holds a reference to the 94 | * task, and it can be recycled. 95 | */ 96 | std::atomic<uint64_t> refcount; 97 | 98 | /// Number of parent tasks that this task is waiting for 99 | std::atomic<uint32_t> wait_parents; 100 | 101 | /// Number of threads that are waiting for this task in task_wait() 102 | std::atomic<uint32_t> wait_count; 103 | 104 | /// Total number of work units in this task 105 | uint32_t size; 106 | 107 | /// Callback of the work unit 108 | void (*func)(uint32_t, void *); 109 | 110 | /// Pool that this task belongs to 111 | Pool *pool; 112 | 113 | /// Payload to be delivered to 'func' 114 | void *payload; 115 | 116 | /// Custom deleter used to free 'payload' 117 | void (*payload_deleter)(void *); 118 | 119 | /// Successor tasks that depend on this task 120 | std::vector<Task *> children; 121 | 122 | /// Atomic flag stating whether the 'exception' field is already used 123 | std::atomic<bool> exception_used; 124 | 125 | /// Pointer to an exception in case the task failed 126 | std::exception_ptr exception; 127 | 128 | #if defined(__APPLE__) 129 | uint64_t time_start, time_end; 130 | #elif !defined(_WIN32) 131 | timespec time_start, time_end; 132 | #else 133 | LARGE_INTEGER time_start, time_end; 134 | #endif 135 | 136 | /// Fixed-size payload storage region 137 | alignas(8) uint8_t payload_storage[256]; 138 | 139 | void clear() { 140 | if (payload_deleter) 141 | payload_deleter(payload); 142 | payload_deleter = nullptr; 143 | payload = nullptr; 144 | children.clear(); 145 | #if !defined(NDEBUG) 146 | memset(payload_storage, 0xFF, sizeof(payload_storage)); 147 | #endif 148 | } 149 | }; 150 | 151 | /** 152 | * Modified implementation of the lock-free queue presented in the paper 153 | * 154 | * "Simple, fast and practical non-blocking and blocking concurrent queue algorithms" 155 | * by Maged Michael and Michael Scott. 156 | * 157 | * The main difference compared to a Michael-Scott queue is that each queue 158 | * item also has a *size* \c N that effectively creates \c N adjacent copies of 159 | * the item (but using a counter, which is more efficient than naive 160 | * replication). The \ref pop() operation returns a pointer to the item and 161 | * a number in the range [0, N-1] indicating the item's index. 162 | * 163 | * Tasks can also have children. Following termination of a task, the queue 164 | * will push any children that don't depend on other unfinished work. 165 | * 166 | * The implementation here is designed to work on standard weakly ordered 167 | * memory architectures (e.g. AArch64), but would likely not work on a 168 | * completely weakly ordered architecture like the DEC Alpha. 169 | */ 170 | struct TaskQueue { 171 | public: 172 | /// Create an empty task queue 173 | TaskQueue(); 174 | 175 | /// Free the queue and delete any remaining tasks 176 | ~TaskQueue(); 177 | 178 | /** 179 | * \brief Allocate a new task record consisting of \c size work units 180 | * 181 | * The implementation tries to fetch an available task instance from a 182 | * pool of completed tasks, if possible. Otherwise, a new task is created.
183 | * 184 | * It is assumed that the caller will populate the remaining fields of the 185 | * returned task and then invoke \ref push() to submit the task to the 186 | * queue. The reference count of the returned task is initially set to 187 | * (2, size), where \c size is the number of associated work 188 | * units. The number '2' indicates two special references by user code and 189 | * by the queue itself, which don't correspond to outstanding work. 190 | * 191 | * Initializes the task's \c wait, \c size, \c refcount, and \c next fields. 192 | */ 193 | Task *alloc(uint32_t size); 194 | 195 | /** 196 | * \brief Decrease the reference count of a task. 197 | * 198 | * The implementation moves the task into a pool of completed tasks once 199 | * the task is no longer referenced by any thread or data structure. 200 | */ 201 | void release(Task *task, bool high = false); 202 | 203 | /// Increase the reference count of a task. 204 | void retain(Task *task); 205 | 206 | /// Append a task at the end of the queue 207 | void push(Task *task); 208 | 209 | /// Register an inter-task dependency 210 | void add_dependency(Task *parent, Task *child); 211 | 212 | /** 213 | * \brief Pop a task from the queue 214 | * 215 | * When the queue is nonempty, this function returns a task instance and a 216 | * number in the range [0, size - 1], where \c size is the number 217 | * of work units in the task. Otherwise, it returns \c nullptr and 0. 218 | */ 219 | std::pair<Task *, uint32_t> pop(); 220 | 221 | /** 222 | * \brief Fetch a task from the queue, or sleep 223 | * 224 | * This function repeatedly tries to fetch work from the queue and sleeps 225 | * if no work is available for an extended amount of time (~50 ms) and 226 | * the \c may_sleep parameter is set to \c true. 227 | * 228 | * The function stops trying to acquire work and returns (nullptr, 229 | * 0) when the supplied function stopping_criterion(payload) 230 | * evaluates to true. 231 | */ 232 | std::pair<Task *, uint32_t> pop_or_sleep(bool (*stopping_criterion)(void *), 233 | void *payload, bool may_sleep); 234 | 235 | /// Wake sleeping threads 236 | void wakeup(); 237 | 238 | private: 239 | /// Head and tail of a lock-free list data structure 240 | Task::Ptr head, tail; 241 | 242 | /// Head of a lock-free stack storing unused tasks 243 | Task::Ptr recycle; 244 | 245 | /// Number of task instances created (for debugging) 246 | std::atomic<uint32_t> tasks_created; 247 | 248 | /// Upper 32 bit: sleep phase, lower 32 bit: number of sleepers 249 | std::atomic<uint64_t> sleep_state; 250 | 251 | /// Mutex protecting the fields below 252 | std::mutex sleep_mutex; 253 | 254 | /// Condition variable used to manage workers that are asleep 255 | std::condition_variable sleep_cv; 256 | }; 257 | 258 | 259 | extern "C" uint32_t pool_thread_id(); 260 | 261 | extern int profile_tasks; 262 | 263 | #define NT_STR_2(x) #x 264 | #define NT_STR(x) NT_STR_2(x) 265 | 266 | // #define NT_DEBUG 267 | #if defined(NT_DEBUG) 268 | # define NT_TRACE(fmt, ...) \ 269 | fprintf(stderr, "%03u: " fmt "\n", pool_thread_id(), ##__VA_ARGS__) 270 | #else 271 | # define NT_TRACE(fmt, ...)
do { } while (0) 272 | #endif 273 | 274 | #define NT_ASSERT(x) \ 275 | if (!(x)) { \ 276 | fprintf(stderr, "Assertion failed in " __FILE__ \ 277 | ":" NT_STR(__LINE__) ": " #x "\n"); \ 278 | abort(); \ 279 | } 280 | 281 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(test_01 test_01.c) 2 | target_link_libraries(test_01 PRIVATE nanothread) 3 | 4 | add_executable(test_02 test_02.cpp) 5 | target_link_libraries(test_02 PRIVATE nanothread) 6 | target_compile_features(test_02 PRIVATE cxx_std_11) 7 | 8 | add_executable(test_03 test_03.cpp) 9 | target_link_libraries(test_03 PRIVATE nanothread) 10 | target_compile_features(test_03 PRIVATE cxx_std_14) 11 | 12 | add_executable(test_04 test_04.cpp) 13 | target_link_libraries(test_04 PRIVATE nanothread) 14 | target_compile_features(test_04 PRIVATE cxx_std_11) 15 | -------------------------------------------------------------------------------- /tests/test_01.c: -------------------------------------------------------------------------------- 1 | #include <nanothread/nanothread.h> 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <string.h> 5 | 6 | #if defined(_WIN32) 7 | # include <windows.h> 8 | #else 9 | # include <unistd.h> 10 | #endif 11 | 12 | // Task callback function. Will be called with index = 0..999 13 | void my_task(uint32_t index, void *payload) { 14 | printf("Worker thread %u: work unit %u\n", pool_thread_id(), index); 15 | 16 | // Sleep for a bit 17 | #if defined(_WIN32) 18 | Sleep(500); 19 | #else 20 | usleep(500000); 21 | #endif 22 | 23 | // Use payload to communicate some data to the caller 24 | ((uint32_t *) payload)[index] = index; 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | (void) argc; (void) argv; // Command line arguments unused 29 | 30 | uint32_t temp[10000]; 31 | 32 | memset(temp, 0, sizeof(int) * 1000); 33 | 34 | // Create a pool with 100 worker threads 35 | Pool *pool = pool_create(100, 0); 36 | 37 | // Synchronous interface: submit a task and wait for it to complete 38 | task_submit_and_wait( 39 | pool, 40 | 1000, // How many work units does this task contain? 41 | my_task, // Function to be executed 42 | temp // Optional payload, will be passed to function 43 | ); 44 | 45 | // .. contents of 'temp' are now ready .. 46 | for (uint32_t i = 0; i < 1000; ++i) { 47 | if (temp[i] != i) { 48 | fprintf(stderr, "Test failed!\n"); 49 | abort(); 50 | } 51 | } 52 | 53 | // Clean up used resources 54 | pool_destroy(pool); 55 | } 56 | -------------------------------------------------------------------------------- /tests/test_02.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #if defined(_WIN32) 6 | # include <windows.h> 7 | #else 8 | # include <unistd.h> 9 | #endif 10 | 11 | 12 | int main(int, char**) { 13 | uint32_t temp[10000]; 14 | 15 | memset(temp, 0, sizeof(int) * 1000); 16 | 17 | // Create a pool with 100 worker threads 18 | Pool *pool = pool_create(100); 19 | 20 | Task *task = drjit::parallel_for_async( 21 | drjit::blocked_range<uint32_t>(0, 1000, 1), 22 | 23 | // Task callback function.
Will be called with index = 0..999 24 | [&](drjit::blocked_range<uint32_t> range) { 25 | for (uint32_t i = range.begin(); i != range.end(); ++i) { 26 | printf("Worker thread %u: work unit %u\n", pool_thread_id(), i); 27 | 28 | // Sleep for a bit 29 | #if defined(_WIN32) 30 | Sleep(500); 31 | #else 32 | usleep(500000); 33 | #endif 34 | 35 | // Store some data that is communicated back to the caller 36 | temp[i] = i; 37 | } 38 | }, 39 | 40 | {}, pool 41 | ); 42 | 43 | // Wait for the asynchronous task to complete 44 | task_wait(task); 45 | 46 | // .. contents of 'temp' are now ready .. 47 | for (uint32_t i = 0; i < 1000; ++i) { 48 | if (temp[i] != i) { 49 | fprintf(stderr, "Test failed!\n"); 50 | abort(); 51 | } 52 | } 53 | 54 | // Clean up used resources 55 | pool_destroy(pool); 56 | } 57 | -------------------------------------------------------------------------------- /tests/test_03.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | Task *tetranacci(Pool *pool, uint32_t i, uint32_t *out) { 5 | if (i < 4) { 6 | *out = (i == 3) ? 1 : 0; 7 | return nullptr; 8 | } 9 | 10 | uint32_t *tmp = new uint32_t[4]; 11 | 12 | Task *task[4] = { 13 | tetranacci(pool, i - 1, tmp), 14 | tetranacci(pool, i - 2, tmp + 1), 15 | tetranacci(pool, i - 3, tmp + 2), 16 | tetranacci(pool, i - 4, tmp + 3) 17 | }; 18 | 19 | Task *rv = drjit::do_async( 20 | [tmp, out]() { 21 | *out = tmp[0] + tmp[1] + tmp[2] + tmp[3]; 22 | delete[] tmp; 23 | }, { task[0], task[1], task[2], task[3] }, 24 | pool 25 | ); 26 | 27 | task_release(task[0]); 28 | task_release(task[1]); 29 | task_release(task[2]); 30 | task_release(task[3]); 31 | 32 | return rv; 33 | } 34 | 35 | Task * tetranacci_2(Pool *pool, uint32_t i, uint32_t *out) { 36 | if (i < 4) { 37 | *out = (i == 3) ?
1 : 0; 38 | return nullptr; 39 | } 40 | 41 | return drjit::do_async( 42 | [pool, i, out]() { 43 | uint32_t tmp[4]; 44 | Task *task[4]; 45 | 46 | for (int k = 0; k < 4; ++k) 47 | task[k] = tetranacci_2(pool, i - k - 1, tmp + k); 48 | for (int k = 0; k < 4; ++k) 49 | task_wait_and_release(task[k]); 50 | 51 | *out = tmp[0] + tmp[1] + tmp[2] + tmp[3]; 52 | }, {}, pool 53 | ); 54 | } 55 | 56 | int main(int, char**) { 57 | // Test pools with an increasing number of worker threads 58 | for (int i = 0; i < 100; ++i) { 59 | printf("Testing with %i threads..\n", i); 60 | Pool *pool = pool_create(i); 61 | 62 | uint32_t out = 0; 63 | Task *task = tetranacci(pool, 16, &out); 64 | task_wait_and_release(task); 65 | if (out != 2872) 66 | abort(); 67 | 68 | task = tetranacci_2(pool, 16, &out); 69 | task_wait_and_release(task); 70 | if (out != 2872) 71 | abort(); 72 | 73 | // Clean up used resources 74 | pool_destroy(pool); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /tests/test_04.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #if defined(_WIN32) 5 | # include <windows.h> 6 | #else 7 | # include <unistd.h> 8 | #endif 9 | 10 | void my_sleep(uint32_t amt) { 11 | #if defined(_WIN32) 12 | Sleep(amt); 13 | #else 14 | usleep(amt * 1000); 15 | #endif 16 | } 17 | 18 | namespace dr = drjit; 19 | 20 | void test01() { 21 | try { 22 | dr::parallel_for( 23 | dr::blocked_range<uint32_t>(0, 1000, 5), 24 | [](dr::blocked_range<uint32_t> /* range */) { 25 | throw std::runtime_error("Hello world!"); 26 | } 27 | ); 28 | } catch (std::exception &e) { 29 | printf("Test 1: success: %s\n", e.what()); 30 | return; 31 | } 32 | abort(); 33 | } 34 | 35 | void test02(bool wait) { 36 | auto work1 = dr::parallel_for_async( 37 | dr::blocked_range<uint32_t>(0, 10, 1), 38 | [](dr::blocked_range<uint32_t> /* range */) { 39 | my_sleep(10); 40 | throw std::runtime_error("Hello world!"); 41 | } 42 | ); 43 | 44 | if (wait) 45 | my_sleep(100); 46 | 47 | auto work2 = dr::parallel_for_async( 48 | dr::blocked_range<uint32_t>(0, 10, 1), 49 | [](dr::blocked_range<uint32_t> /* range */) { 50 | printf("Should never get here!\n"); 51 | abort(); 52 | }, 53 | { work1 } 54 | ); 55 | 56 | task_release(work1); 57 | 58 | try { 59 | task_wait_and_release(work2); 60 | } catch (std::exception &e) { 61 | printf("Test 2: success: %s\n", e.what()); 62 | return; 63 | } 64 | abort(); 65 | } 66 | 67 | int main(int, char**) { 68 | test01(); 69 | test02(false); 70 | test02(true); 71 | } 72 | --------------------------------------------------------------------------------
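The following standalone sketch combines the pieces exercised by the tests above — pool_create(), drjit::parallel_for_async(), a dependent drjit::do_async(), task_release()/task_wait_and_release(), and pool_destroy() — into one minimal program. It relies only on API calls that already appear in tests/test_01.c through tests/test_04.cpp; the pool size of 4, the block size of 16, and the array contents are arbitrary illustration choices rather than anything prescribed by the repository.

#include <nanothread/nanothread.h>
#include <cstdio>

namespace dr = drjit;

int main(int, char **) {
    // Pool with 4 worker threads
    Pool *pool = pool_create(4);

    uint32_t data[1000];

    // Asynchronously fill 'data' in blocks of 16 work units
    Task *fill = dr::parallel_for_async(
        dr::blocked_range<uint32_t>(0, 1000, 16),
        [&](dr::blocked_range<uint32_t> range) {
            for (uint32_t i = range.begin(); i != range.end(); ++i)
                data[i] = i;
        },
        {}, pool
    );

    // This task is scheduled only once 'fill' has completed
    Task *report = dr::do_async(
        [&]() { printf("data[999] = %u\n", data[999]); },
        { fill }, pool
    );

    // Drop the handle to 'fill', wait for 'report', then shut down
    task_release(fill);
    task_wait_and_release(report);
    pool_destroy(pool);
}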