├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── include └── nanothread │ └── nanothread.h ├── src ├── nanothread.cpp ├── queue.cpp └── queue.h └── tests ├── CMakeLists.txt ├── test_01.c ├── test_02.cpp ├── test_03.cpp └── test_04.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | /.clangd 2 | /compile_commands.json 3 | *.cmake 4 | CMakeCache.txt 5 | CMakeFiles 6 | Makefile 7 | *.ninja 8 | \.cache 9 | \.ninja_* 10 | Testing 11 | build 12 | .vscode 13 | *.vcxproj 14 | *.vcxproj.filters 15 | nanothread.sln 16 | nanothread.dir 17 | Release 18 | Debug 19 | 20 | libnanothread.so 21 | libnanothread.dylib 22 | nanothread.dll 23 | tests/test_0[1-4] 24 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext/cmake-defaults"] 2 | path = ext/cmake-defaults 3 | url = https://github.com/mitsuba-renderer/cmake-defaults 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------- 2 | # Top-level nanothread CMake file, needs a recent version 3 | # ---------------------------------------------------------- 4 | cmake_minimum_required(VERSION 3.13...3.18) 5 | 6 | project(nanothread 7 | DESCRIPTION 8 | "nanothread" 9 | LANGUAGES 10 | CXX C 11 | ) 12 | 13 | # ---------------------------------------------------------- 14 | # Optional features available to users 15 | # ---------------------------------------------------------- 16 | 17 | option(NANOTHREAD_STATIC "Build as static library?" OFF) 18 | option(NANOTHREAD_ENABLE_TESTS "Build test suite?" OFF) 19 | 20 | # ---------------------------------------------------------- 21 | # Check if submodules have been checked out, or fail early 22 | # ---------------------------------------------------------- 23 | 24 | if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ext/cmake-defaults/CMakeLists.txt") 25 | message(FATAL_ERROR "The nanothread dependencies are missing! " 26 | "You probably did not clone the project with --recursive. 
It is possible to recover " 27 | "by invoking\n$ git submodule update --init --recursive") 28 | endif() 29 | 30 | # ---------------------------------------------------------- 31 | # Build defaults for projects by the Realistic Graphics Lab 32 | # ---------------------------------------------------------- 33 | 34 | include(ext/cmake-defaults/CMakeLists.txt) 35 | 36 | # ---------------------------------------------------------- 37 | # Compile the nanothread library 38 | # ---------------------------------------------------------- 39 | 40 | if(NANOTHREAD_STATIC) 41 | add_library(nanothread STATIC) 42 | target_compile_definitions(nanothread PUBLIC -DNANOTHREAD_STATIC) 43 | else() 44 | add_library(nanothread SHARED) 45 | endif() 46 | 47 | target_sources(nanothread PRIVATE 48 | include/nanothread/nanothread.h 49 | src/queue.cpp src/queue.h 50 | src/nanothread.cpp 51 | ) 52 | 53 | target_compile_features(nanothread PRIVATE cxx_std_11) 54 | target_include_directories(nanothread PRIVATE include) 55 | 56 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") 57 | if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") 58 | target_compile_options(nanothread PRIVATE -mcx16) 59 | endif() 60 | endif() 61 | 62 | if (CMAKE_CXX_COMPILER_ID MATCHES "GNU") 63 | # GCC needs libatomic for 16 byte CSA 64 | find_library(LIBATOMIC NAMES libatomic.so libatomic.so.1) 65 | if (NOT LIBATOMIC) 66 | message(FATAL_ERROR "libatomic could not be found!") 67 | endif() 68 | target_link_libraries(nanothread PRIVATE ${LIBATOMIC}) 69 | mark_as_advanced(LIBATOMIC) 70 | endif() 71 | 72 | target_include_directories(nanothread 73 | PUBLIC 74 | $ 75 | $) 76 | 77 | target_compile_definitions(nanothread PRIVATE -DNANOTHREAD_BUILD=1) 78 | set_target_properties(nanothread PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) 79 | 80 | if (NANOTHREAD_ENABLE_TESTS) 81 | add_subdirectory(tests) 82 | endif() 83 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Wenzel Jakob , All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | 3. Neither the name of the copyright holder nor the names of its contributors 14 | may be used to endorse or promote products derived from this software 15 | without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nanothread — Minimal thread pool for task parallelism 2 | 3 | ## Introduction 4 | 5 | This library provides a minimal cross-platform interface for task parallelism. 6 | Given a computation that is partitioned into a set of interdependent tasks, the 7 | library efficiently distributes this work to a thread pool using lock-free 8 | queues, while respecting dependencies between tasks. 9 | 10 | Each task is associated with a callback function that is potentially invoked 11 | multiple times if the task consists of multiple work units. This whole 12 | process is arbitrarily recursive: task callbacks can submit further jobs, wait 13 | for their completion, etc. Parallel loops, reductions, and more complex 14 | graph-based computations are easily realized using these abstractions. 15 | 16 | This project is internally implemented in C++11, but exposes the main 17 | functionality using a pure C99 API, along with a header-only C++11 convenience 18 | wrapper. It has no dependencies other than CMake and a C++11-capable compiler. 19 | The entire project requires less than 1000 lines of header and 20 | implementation code (according to [cloc](http://cloc.sourceforge.net/)). 21 | 22 | This library is part of the larger 23 | [Dr.Jit](https://github.com/mitsuba-renderer/drjit) project and parallelizes 24 | workloads generated by the 25 | [Dr.Jit-Core](https://github.com/mitsuba-renderer/drjit-core) library. However, 26 | this project has no dependencies on these parent projects and can be used in 27 | any other context. 28 | 29 | ## Why? 30 | 31 | Many of my previous projects have built on [Intel's Thread Building 32 | Blocks](https://software.intel.com/content/www/us/en/develop/tools/threading-building-blocks.html) 33 | for exactly this type of functionality. Unfortunately, large portions of TBB's 34 | task interface were recently deprecated as part of the oneAPI / oneTBB 35 | transition. Rather than struggling with this complex dependency, I decided to 36 | build something minimal and stable that satisfies my requirements. 37 | 38 | ## Examples (C++11 interface) 39 | 40 | The follow examples showcase the C++11 interface, which is a thin header-only 41 | layer over the C99 API. 42 | 43 | ### Parallel for loops (synchronous) 44 | ```cpp 45 | template 46 | void parallel_for(const blocked_range &range, Func &&func, Pool *pool = nullptr); 47 | ``` 48 | This function submits a single task consisting of a arbitrarily many work units 49 | that are processed in blocks of a specified size, and waits for their 50 | completion. If no thread pool ``Pool *`` is specified, the default pool will be 51 | used (and created on the fly, if needed). 
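Internally, the range is split into ``ceil((end - begin) / block_size)`` blocks (cf. ``blocked_range::blocks()``), and the callback is invoked once per block with the corresponding sub-range. A small sketch with illustrative values:

```cpp
// Illustrative values only: 100 elements processed in blocks of 20
dr::blocked_range<uint32_t> r(/* begin = */ 0, /* end = */ 100, /* block_size = */ 20);
// r.blocks() == 5, so a callback passed to dr::parallel_for(r, ...) is
// invoked five times, each time with a sub-range of 20 consecutive indices.
```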
52 | 53 | Example: 54 | 55 | ```cpp 56 | #include 57 | 58 | namespace dr = drjit; 59 | 60 | int main(int, char **) { 61 | int result[100]; 62 | 63 | // Call the provided lambda function 99 times with blocks of size 1 64 | dr::parallel_for( 65 | dr::blocked_range(/* begin = */ 0, /* end = */ 100, /* block_size = */ 1), 66 | 67 | // The callback is allowed to be a stateful lambda function 68 | [&](dr::blocked_range range) { 69 | for (uint32_t i = range.begin(); i != range.end(); ++i) { 70 | printf("Worker thread %u is starting to process work unit %u\n", 71 | pool_thread_id(), i); 72 | 73 | // Write to variables defined in the caller's frame 74 | result[i] = i; 75 | } 76 | } 77 | ); 78 | } 79 | ``` 80 | 81 | Small amounts of work that only consist of a single block will immediately be 82 | executed on the calling thread instead of involving the thread pool. Exceptions 83 | occurring during parallel execution will be captured and re-thrown by 84 | ``dr::parallel_for``. 85 | 86 | ### Parallel for loops (asynchronous) 87 | 88 | Parallel `for` loops can also run asynchronously—in that case, the function 89 | immediately returns a ``Task *`` handle that can be used to wait for 90 | completion, or to schedule *child tasks*, whose execution will be delayed until 91 | all parents have completed. 92 | 93 | ```cpp 94 | template 95 | Task *parallel_for_async(const blocked_range &range, Func &&func, 96 | std::initializer_list parents = { }, 97 | Pool *pool = nullptr); 98 | ``` 99 | 100 | The returned task handle must eventually be released using the functions 101 | ``task_release(Task *)`` (which is instantaneous) or 102 | ``task_wait_and_release(Task *)`` (which blocks until the task has terminated). 103 | A failure to do so will leak memory. 104 | 105 | Example: 106 | ```cpp 107 | #include 108 | 109 | namespace dr = drjit; 110 | 111 | int main(int, char **) { 112 | // Schedule task 1 113 | Task *task_1 = dr::parallel_for_async( 114 | dr::blocked_range(/* ... */), 115 | [&](dr::blocked_range range) { /* ... */ } 116 | ); 117 | 118 | // Schedule task 2 119 | Task *task_2 = dr::parallel_for_async( 120 | dr::blocked_range(/* ... */), 121 | [&](dr::blocked_range range) { /* ... */ } 122 | ); 123 | 124 | // Schedule task 3 ... 125 | Task *task_3 = dr::parallel_for_async( 126 | dr::blocked_range(/* ... */), 127 | [&](dr::blocked_range range) { /* ... */ }, 128 | { task_1, task_2 } // ... <- but don't execute until these tasks are done 129 | ); 130 | 131 | task_release(task_1); 132 | task_release(task_2); 133 | task_wait_and_release(task_3); 134 | } 135 | ``` 136 | 137 | If a task only consists of single-threaded work that cannot easily be converted 138 | into a parallel ``for`` loop, the function ``do_async`` provides an more 139 | convenient interface that is analogous to ``parallel_for_async`` with a 140 | ``blocked_range`` of size 1. 141 | 142 | ```cpp 143 | template 144 | Task *do_async(Func &&func, std::initializer_list parents = {}, 145 | Pool *pool = nullptr); 146 | ``` 147 | 148 | ## Examples (C99 interface) 149 | 150 | The following code fragment submits a single task consisting of 100 work units 151 | and waits for its completion. 152 | 153 | ```c 154 | #include 155 | #include 156 | #include 157 | 158 | // Task callback function. 
Will be called with index = 0..99 159 | void my_task(uint32_t index, void *payload) { 160 | printf("Worker thread %u is starting to process work unit %u\n", 161 | pool_thread_id(), index); 162 | 163 | // Use payload to communicate some data to the caller 164 | ((uint32_t *) payload)[index] = index; 165 | } 166 | 167 | int main(int argc, char** argv) { 168 | uint32_t temp[100]; 169 | 170 | // Create a worker per CPU thread 171 | Pool *pool = pool_create(NANOTHREAD_AUTO); 172 | 173 | // Synchronous interface: submit a task and wait for it to complete 174 | task_submit_and_wait( 175 | pool, 176 | 100, // How many work units does this task contain? 177 | my_task, // Function to be executed 178 | temp // Optional payload, will be passed to function 179 | ); 180 | 181 | // .. contents of 'temp' are now ready .. 182 | 183 | // Clean up used resources 184 | pool_destroy(pool); 185 | } 186 | ``` 187 | 188 | Tasks can also be executed *asynchronously*, in which case extra steps must be 189 | added to wait for tasks, and to release task handles. 190 | 191 | ```c 192 | /// Heap-allocate scratch space for inter-task communication 193 | uint32_t *payload = malloc(100 * sizeof(uint32_t)); 194 | 195 | /// Submit a task and return immediately 196 | Task *task_1 = task_submit( 197 | pool, 198 | 100, // How many work units does this task contain? 199 | my_task_1, // Function to be executed 200 | payload, // Optional payload, will be passed to function 201 | 0, // Size of the payload (only relevant if it should be copied) 202 | nullptr, // Payload deletion callback 203 | 0 // Enforce asynchronous execution even if task is small? 204 | ); 205 | 206 | /// Submit a task that is dependent on other tasks (specifically task_1) 207 | Task *task_2 = task_submit_dep( 208 | pool, 209 | &task_1, // Pointer to a list of parent tasks 210 | 1, // Number of parent tasks 211 | 100, // How many work units does this task contain? 212 | my_task_2, // Function to be executed 213 | payload, // Optional payload, will be passed to function 214 | 0, // Size of the payload (only relevant if it should be copied) 215 | free, // Call free(payload) once this task completes 216 | 0 // Enforce asynchronous execution even if task is small? 217 | ); 218 | 219 | /* Now that the parent-child relationship is specified, 220 | the handle of task 1 can be released */ 221 | task_release(task_1); 222 | 223 | // Wait for the completion of task 2 and also release its handle 224 | task_wait_and_release(task_2); 225 | ``` 226 | 227 | ## Documentation 228 | 229 | The complete API is documented in the file 230 | [nanothread/nanothread.h](https://github.com/mitsuba-renderer/nanothread/blob/master/include/nanothread/nanothread.h). 231 | 232 | ## Technical details 233 | 234 | This library follows a lock-free design: tasks that are ready for execution are 235 | stored in a [Michael-Scott 236 | queue](https://www.cs.rochester.edu/u/scott/papers/1996_PODC_queues.pdf) that 237 | is continuously polled by workers, and task submission/removal relies on atomic 238 | compare-and-swap (CAS) operations. Workers that idle for more than roughly 50 239 | milliseconds are put to sleep until more work becomes available. 240 | 241 | The lock-free design is important: the central data structures of a task 242 | submission system are heavily contended, and traditional abstractions (e.g. 243 | ``std::mutex``) will immediately put contending threads to sleep to defer lock 244 | resolution to the OS kernel. 
The associated context switches produce an 245 | extremely large overhead that can make a parallel program orders of magnitude 246 | slower than a single-threaded version. 247 | 248 | The implementation catches exception that occur while executing parallel work 249 | and re-throws them the caller's thread (this part is of no relevance for 250 | software written in C99). 251 | 252 | The functions ``task_wait()`` and ``task_wait_and_release()`` do not just 253 | wait---they spend the wait time fetching and executing work from the task 254 | queue, which has two implications: first, it is not wasteful to wait for the 255 | completion of another task while executing a task. Second, the thread pool can 256 | be set to a size of zero via ``pool_create(0)`` or ``pool_set_size(pool, 0)``, 257 | in which case the program will still run correctly without launching any 258 | additional threads. 259 | -------------------------------------------------------------------------------- /include/nanothread/nanothread.h: -------------------------------------------------------------------------------- 1 | /* 2 | nanothread/nanothread.h -- Simple thread pool with a task-based API 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #if defined(__cplusplus) 17 | # include 18 | #else 19 | # include 20 | #endif 21 | 22 | #if defined(NANOTHREAD_STATIC) 23 | # define NANOTHREAD_EXPORT 24 | #else 25 | # if defined(_MSC_VER) 26 | # if defined(NANOTHREAD_BUILD) 27 | # define NANOTHREAD_EXPORT __declspec(dllexport) 28 | # else 29 | # define NANOTHREAD_EXPORT __declspec(dllimport) 30 | # endif 31 | # else 32 | # define NANOTHREAD_EXPORT __attribute__ ((visibility("default"))) 33 | # endif 34 | #endif 35 | 36 | #if defined(__cplusplus) 37 | # define NANOTHREAD_DEF(x) = x 38 | #else 39 | # define NANOTHREAD_DEF(x) 40 | #endif 41 | 42 | #define NANOTHREAD_AUTO ((uint32_t) -1) 43 | 44 | typedef struct Pool Pool; 45 | typedef struct Task Task; 46 | 47 | #if defined(__cplusplus) 48 | #define NANOTHREAD_THROW noexcept(false) 49 | extern "C" { 50 | #else 51 | #define NANOTHREAD_THROW 52 | #endif 53 | 54 | /** 55 | * \brief Create a new thread pool 56 | * 57 | * \param size 58 | * Specifies the desired number of threads. The default value of 59 | * \c NANOTHREAD_AUTO choses a thread count equal to the number of 60 | * available cores. 61 | * 62 | * \param ftz 63 | * Should denormalized floating point numbers be flushed to zero? 64 | * The pool workers will initialize their floating point control 65 | * registers accordingly. 66 | */ 67 | extern NANOTHREAD_EXPORT Pool * 68 | pool_create(uint32_t size NANOTHREAD_DEF(NANOTHREAD_AUTO), 69 | int ftz NANOTHREAD_DEF(1)); 70 | 71 | /** 72 | * \brief Destroy the thread pool and discard remaining unfinished work. 73 | * 74 | * It is undefined behavior to destroy the thread pool while other threads 75 | * are waiting for the completion of scheduled work via \ref task_wait(). 76 | * 77 | * \param pool 78 | * The thread pool to destroy. \c nullptr refers to the default pool. 79 | */ 80 | extern NANOTHREAD_EXPORT void pool_destroy(Pool *pool NANOTHREAD_DEF(0)); 81 | 82 | /// Returns the number of available CPU cores. 
83 | extern NANOTHREAD_EXPORT uint32_t core_count(); 84 | 85 | /** 86 | * \brief Return the number of threads that are part of the pool 87 | * 88 | * \param pool 89 | * The thread pool to query. \c nullptr refers to the default pool. 90 | */ 91 | extern NANOTHREAD_EXPORT uint32_t pool_size(Pool *pool NANOTHREAD_DEF(0)); 92 | 93 | /** 94 | * \brief Resize the thread pool to the given number of threads 95 | * 96 | * \param pool 97 | * The thread pool to resize. \c nullptr refers to the default pool. 98 | */ 99 | extern NANOTHREAD_EXPORT void pool_set_size(Pool *pool, uint32_t size); 100 | 101 | /** 102 | * \brief Enable/disable time profiling 103 | * 104 | * Profiling must be enabled to use the \ref task_time() function. 105 | * 106 | * \param value 107 | * A nonzero value indicates that profiling should be enabled. 108 | */ 109 | extern NANOTHREAD_EXPORT void pool_set_profile(int value); 110 | 111 | /// Check whether time profiling is enabled (global setting) 112 | extern NANOTHREAD_EXPORT int pool_profile(); 113 | 114 | /** 115 | * \brief Return a unique number identifying the current worker thread 116 | * 117 | * When called from a thread pool worker (e.g. while executing a parallel 118 | * task), this function returns a unique identifying number between 1 and the 119 | * pool's total thread count. 120 | * 121 | * The IDs of separate thread pools overlap. When the current thread is not a 122 | * thread pool worker, the function returns zero. 123 | */ 124 | extern NANOTHREAD_EXPORT uint32_t pool_thread_id(); 125 | 126 | /** \brief Process work available within the pool until a stopping criterion is 127 | * satisified. 128 | * 129 | * This function repeatedly fetches work from ``pool`` until the stopping 130 | * criterion ``stopping_criterion(payload)`` evaluates to ``true``, at which 131 | * point the function returns. 132 | * 133 | * It provides a way for an ordinary thread to temporarily join the thread 134 | * pool. A function being called by a worker thread that needs to wait for an 135 | * event to take place can also call this function to avoid starvation issues. 136 | */ 137 | extern NANOTHREAD_EXPORT void 138 | pool_work_until(Pool *pool, bool (*stopping_criterion)(void *), void *payload); 139 | 140 | /* 141 | * \brief Submit a new task to a thread pool 142 | * 143 | * This function submits a new task consisting of \c size work units to the 144 | * thread pool \c pool. 145 | * 146 | * Callback: The task callback \c func will be invoked \c size times by 147 | * the various thread pool workers. Its first argument will range from 148 | * 0 to \c size - 1, and the second argument refers to a 149 | * payload memory region specified via the \c payload parameter. 150 | * 151 | * Parents: The \c parent and \c parent_count parameters can be used to 152 | * specify parent tasks that must be completed before execution of this task 153 | * can commence. If the task does not depend on any other tasks (e.g. 154 | * parent_count == 0 and parent == nullptr), or when all of 155 | * those other tasks have already finished executing, then it will be 156 | * immediately appended to the end of the task queue. Otherwise, the task will 157 | * be scheduled once all parent tasks have finished executing. 158 | * 159 | * Payload storage: The callback payload is handled using one of two 160 | * possible modes: 161 | * 162 | *
163 | *   1. When size == 0 or payload_deleter != nullptr, the
164 | *      value of the \c payload parameter is simply forwarded to the callback \c
165 | *      func. In the latter case, payload_deleter(payload) is invoked
166 | *      following completion of the task, which can carry out additional cleanup
167 | *      operations if needed. In both cases, the memory region targeted by \c
168 | *      payload may be accessed asynchronously and must remain valid until the
169 | *      task is done.
170 | *
171 | *   2. Otherwise, the function will internally create a copy of the payload
172 | *      and free it following completion of the task. In this case, it is fine to
173 | *      delete the memory region targeted by \c payload right after the
174 | *      function call.
175 | *
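 *      For example (placeholder names, assuming a valid pool and a callback
 *      func with the signature described above): when the task has work units
 *      (size > 0), a nonzero payload_size without a deleter selects the
 *      copying mode (2.), while supplying a deleter such as free() selects
 *      the forwarding mode (1.):
 *
 *         uint32_t data[4] = { 0, 1, 2, 3 };
 *         Task *a = task_submit_dep(pool, nullptr, 0, 4, func, data,
 *                                   sizeof(data), nullptr, 0); // copied internally
 *
 *         uint32_t *heap = (uint32_t *) malloc(4 * sizeof(uint32_t));
 *         Task *b = task_submit_dep(pool, nullptr, 0, 4, func, heap,
 *                                   0, free, 0); // forwarded; free(heap) runs on completion
 *
 *         task_release(a);
 *         task_release(b);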
176 | *
177 | * The function returns a task handle that can be used to schedule other
178 | * dependent tasks, and to wait for task completion if desired. This handle
179 | * must eventually be released using \ref task_release() or \ref
180 | * task_wait_and_release(). A failure to do so will result in memory leaks.
181 | *
182 | * Small task optimization: If desired, small tasks can be executed
183 | * right away without using the thread pool. This happens under the following
184 | * conditions:
185 | *
186 | *
187 | *   1. The task is "small" (\c size == 1).
188 | *   2. The task does not depend on any parent tasks.
189 | *   3. The \c always_async parameter is set to 0.
190 | *
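 *      For example (placeholder names), a single work unit submitted without
 *      parents and with always_async == 0 may run directly on the calling
 *      thread, in which case no separate task handle is created:
 *
 *         Task *t = task_submit_dep(pool, nullptr, 0, 1, func, payload, 0, nullptr, 0);
 *         task_release(t); // no-op when 't' is nullptr (synchronous execution)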
191 | * 192 | * \remark 193 | * Barriers and similar dependency relations can be encoded by via 194 | * artificial tasks using size == 0 and func == nullptr 195 | * along with a set of parent tasks. 196 | * 197 | * \param pool 198 | * The thread pool that should execute the specified task. \c nullptr 199 | * refers to the default pool. 200 | * 201 | * \param parent 202 | * List of parents of size \c parent_count. \c nullptr-valued elements 203 | * are ignored 204 | * 205 | * \param parent_count 206 | * Number of parent tasks 207 | * 208 | * \param size 209 | * Total number of work units; the callback \c func will be called this 210 | * many times if provided. 211 | * 212 | * \param func 213 | * Callback function that will be invoked to perform the actual computation. 214 | * If set to \c nullptr, the callback is ignored. This can be used to create 215 | * artificial tasks that only encode dependencies. 216 | * 217 | * \param payload 218 | * Optional payload that is passed to the function \c func 219 | * 220 | * \param payload_size 221 | * When \c payload_deleter is equal to \c nullptr and when \c size is 222 | * nonzero, a temporary copy of the payload will be made. This parameter is 223 | * necessary to specify the payload size in that case. 224 | * 225 | * \param payload_deleter 226 | * Optional callback that will be invoked to free the payload 227 | * 228 | * \param always_async 229 | * If set to a nonzero value, execution will always happen asynchronously, 230 | * even in cases where the task being scheduled has no parents, and 231 | * when only encodes a small amount of work (\c size == 1). Otherwise 232 | * it will be executed synchronously, and the function will return \c nullptr. 233 | * 234 | * \return 235 | * A task handle that must eventually be released via \ref task_release() 236 | * or \ref task_wait_and_release(). The function returns \c nullptr when 237 | * no task was generated (e.g. when there are no parent tasks, and either 238 | * size==0, or when size==1 and the task was executed 239 | * synchronously.) 240 | */ 241 | extern NANOTHREAD_EXPORT 242 | Task *task_submit_dep(Pool *pool, 243 | const Task * const *parent, 244 | uint32_t parent_count, 245 | uint32_t size NANOTHREAD_DEF(1), 246 | void (*func)(uint32_t, void *) NANOTHREAD_DEF(0), 247 | void *payload NANOTHREAD_DEF(0), 248 | uint32_t payload_size NANOTHREAD_DEF(0), 249 | void (*payload_deleter)(void *) NANOTHREAD_DEF(0), 250 | int always_async NANOTHREAD_DEF(0)); 251 | 252 | /* 253 | * \brief Release a task handle so that it can eventually be reused 254 | * 255 | * Releasing a task handle does not impact the tasks's execution, which could 256 | * be in one of three states: waiting, running, or complete. This operation is 257 | * important because it frees internal resources that would otherwise leak. 258 | * 259 | * Following a call to \ref task_release(), the associated task can no 260 | * longer be used as a direct parent of other tasks, and it is no longer 261 | * possible to wait for its completion using an operation like \ref 262 | * task_wait(). 263 | * 264 | * \param pool 265 | * The thread pool containing the task. \c nullptr refers to the default pool. 266 | * 267 | * \param task 268 | * The task in question. When equal to \c nullptr, the operation is a no-op. 
269 | */
270 | extern NANOTHREAD_EXPORT void task_release(Task *task);
271 |
272 | /*
273 |  * \brief Wait for the completion of the specified task
274 |  *
275 |  * This function causes the calling thread to sleep until all work units of
276 |  * 'task' have been completed.
277 |  *
278 |  * If an exception was caught during parallel execution of 'task', the
279 |  * function \ref task_wait() will re-raise this exception in the context of the
280 |  * caller. Note that if a parallel task raises many exceptions, only a single
281 |  * one of them will be captured in this way.
282 |  *
283 |  * \param task
284 |  *     The task in question. When equal to \c nullptr, the operation is a no-op.
285 |  */
286 | extern NANOTHREAD_EXPORT void task_wait(Task *task) NANOTHREAD_THROW;
287 |
288 | /*
289 |  * \brief Wait for the completion of the specified task and release its handle
290 |  *
291 |  * This function is equivalent to calling \ref task_wait() followed by \ref
292 |  * task_release().
293 |  *
294 |  * If an exception was caught during parallel execution of 'task', the
295 |  * function \ref task_wait_and_release() will perform the release step and then
296 |  * re-raise this exception in the context of the caller. Note that if a
297 |  * parallel task raises many exceptions, only a single one of them will be
298 |  * captured in this way.
299 |  *
300 |  * \param task
301 |  *     The task in question. When equal to \c nullptr, the operation is a no-op.
302 |  */
303 | extern NANOTHREAD_EXPORT void task_wait_and_release(Task *task) NANOTHREAD_THROW;
304 |
305 | /**
306 |  * \brief Return the time consumed by the task in milliseconds
307 |  *
308 |  * To use this function, you must first enable time profiling via \ref
309 |  * pool_set_profile() before launching tasks.
310 |  */
311 | extern NANOTHREAD_EXPORT double task_time(Task *task) NANOTHREAD_THROW;
312 |
313 | /*
314 |  * \brief Increase the reference count of a task
315 |  *
316 |  * In advanced use cases, it may be helpful if multiple parts of the system can
317 |  * hold references to a task (and e.g. query timing information or
318 |  * completeness). The \c task_retain operation enables this by increasing an
319 |  * internal reference counter so that \ref task_release() must be called
320 |  * multiple times before the task is actually released.
321 |  *
322 |  * \param task
323 |  *     The task in question. When equal to \c nullptr, the operation is a no-op.
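 *
 * For example (placeholder names), after one call to \c task_retain(), two
 * matching calls to \c task_release() are required before the handle is
 * actually recycled:
 *
 *     task_retain(task);            // e.g. a profiling subsystem keeps a reference
 *     double ms = task_time(task);  // ... which can later query timing information
 *     task_release(task);           // drop the extra reference
 *     task_release(task);           // drop the original reference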
324 | */ 325 | extern NANOTHREAD_EXPORT void task_retain(Task *task); 326 | 327 | /// Convenience wrapper around task_submit_dep(), but without dependencies 328 | static inline 329 | Task *task_submit(Pool *pool, 330 | uint32_t size NANOTHREAD_DEF(1), 331 | void (*func)(uint32_t, void *) NANOTHREAD_DEF(0), 332 | void *payload NANOTHREAD_DEF(0), 333 | uint32_t payload_size NANOTHREAD_DEF(0), 334 | void (*payload_deleter)(void *) NANOTHREAD_DEF(0), 335 | int always_async NANOTHREAD_DEF(0)) { 336 | 337 | return task_submit_dep(pool, 0, 0, size, func, payload, payload_size, 338 | payload_deleter, always_async); 339 | } 340 | 341 | /// Convenience wrapper around task_submit(), but fully synchronous 342 | static inline 343 | void task_submit_and_wait(Pool *pool, 344 | uint32_t size NANOTHREAD_DEF(1), 345 | void (*func)(uint32_t, void *) NANOTHREAD_DEF(0), 346 | void *payload NANOTHREAD_DEF(0)) { 347 | 348 | Task *task = task_submit(pool, size, func, payload, 0, 0, 0); 349 | task_wait_and_release(task); 350 | } 351 | 352 | #if defined(__cplusplus) 353 | } 354 | 355 | #include 356 | 357 | namespace drjit { 358 | template struct blocked_range { 359 | public: 360 | blocked_range(Int begin, Int end, Int block_size = 1) 361 | : m_begin(begin), m_end(end), m_block_size(block_size) { } 362 | 363 | struct iterator { 364 | Int value; 365 | 366 | iterator(Int value) : value(value) { } 367 | 368 | Int operator*() const { return value; } 369 | operator Int() const { return value;} 370 | 371 | void operator++() { value++; } 372 | bool operator==(const iterator &it) { return value == it.value; } 373 | bool operator!=(const iterator &it) { return value != it.value; } 374 | }; 375 | 376 | uint32_t blocks() const { 377 | return (uint32_t) ((m_end - m_begin + m_block_size - 1) / m_block_size); 378 | } 379 | 380 | iterator begin() const { return iterator(m_begin); } 381 | iterator end() const { return iterator(m_end); } 382 | Int block_size() const { return m_block_size; } 383 | 384 | private: 385 | Int m_begin; 386 | Int m_end; 387 | Int m_block_size; 388 | }; 389 | 390 | template 391 | void parallel_for(const blocked_range &range, Func &&func, 392 | Pool *pool = nullptr) { 393 | 394 | struct Payload { 395 | Func *f; 396 | Int begin, end, block_size; 397 | }; 398 | 399 | Payload payload{ &func, range.begin(), range.end(), 400 | range.block_size() }; 401 | 402 | auto callback = [](uint32_t index, void *payload) { 403 | Payload *p = (Payload *) payload; 404 | Int begin = p->begin + p->block_size * (Int) index, 405 | end = begin + p->block_size; 406 | 407 | if (end > p->end) 408 | end = p->end; 409 | 410 | (*p->f)(blocked_range(begin, end)); 411 | }; 412 | 413 | task_submit_and_wait(pool, range.blocks(), callback, &payload); 414 | } 415 | 416 | template 417 | Task *parallel_for_async(const blocked_range &range, Func &&func, 418 | const Task * const *parents, 419 | size_t parent_count, 420 | Pool *pool = nullptr) { 421 | using BaseFunc = typename std::decay::type; 422 | 423 | struct Payload { 424 | BaseFunc f; 425 | Int begin, end, block_size; 426 | }; 427 | 428 | auto callback = [](uint32_t index, void *payload) { 429 | Payload *p = (Payload *) payload; 430 | Int begin = p->begin + p->block_size * (Int) index, 431 | end = begin + p->block_size; 432 | 433 | if (end > p->end) 434 | end = p->end; 435 | 436 | p->f(blocked_range(begin, end)); 437 | }; 438 | 439 | if (std::is_trivially_copyable::value && 440 | std::is_trivially_destructible::value) { 441 | Payload payload{ std::forward(func), range.begin(), 442 | 
range.end(), range.block_size() }; 443 | 444 | return task_submit_dep(pool, parents, 445 | (uint32_t) parent_count, range.blocks(), 446 | callback, &payload, sizeof(Payload), nullptr, 1); 447 | } else { 448 | Payload *payload = new Payload{ std::forward(func), range.begin(), 449 | range.end(), range.block_size() }; 450 | 451 | auto deleter = [](void *payload) { 452 | delete (Payload *) payload; 453 | }; 454 | 455 | return task_submit_dep(pool, parents, 456 | (uint32_t) parent_count, range.blocks(), 457 | callback, payload, 0, deleter, 1); 458 | } 459 | } 460 | 461 | template 462 | Task *parallel_for_async(const blocked_range &range, Func &&func, 463 | std::initializer_list parents = { }, 464 | Pool *pool = nullptr) { 465 | return parallel_for_async(range, func, parents.begin(), parents.size(), 466 | pool); 467 | } 468 | 469 | template 470 | Task *do_async(Func &&func, const Task * const *parents, size_t parent_count, 471 | Pool *pool = nullptr) { 472 | using BaseFunc = typename std::decay::type; 473 | 474 | struct Payload { 475 | BaseFunc f; 476 | }; 477 | 478 | auto callback = [](uint32_t /* unused */, void *payload) { 479 | ((Payload *) payload)->f(); 480 | }; 481 | 482 | if (std::is_trivially_copyable::value && 483 | std::is_trivially_destructible::value) { 484 | Payload payload {std::forward(func) }; 485 | 486 | return task_submit_dep(pool, parents, 487 | (uint32_t) parent_count, 1, callback, 488 | &payload, sizeof(Payload), nullptr, 1); 489 | } else { 490 | Payload *payload = new Payload{ std::forward(func) }; 491 | 492 | auto deleter = [](void *payload) { delete (Payload *) payload; }; 493 | 494 | return task_submit_dep(pool, parents, 495 | (uint32_t) parent_count, 1, callback, 496 | payload, 0, deleter, 1); 497 | } 498 | } 499 | 500 | template 501 | Task *do_async(Func &&func, std::initializer_list parents = {}, 502 | Pool *pool = nullptr) { 503 | return do_async(func, parents.begin(), parents.size(), pool); 504 | } 505 | } 506 | #endif 507 | -------------------------------------------------------------------------------- /src/nanothread.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | src/pool.cpp -- Simple thread pool with task-based API 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 8 | */ 9 | 10 | #include 11 | #include "queue.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #if defined(__linux__) 18 | # include 19 | #elif defined(_WIN32) 20 | # include 21 | # include 22 | #endif 23 | 24 | #if defined(_MSC_VER) 25 | # include 26 | #elif defined(__SSE2__) 27 | # include 28 | #endif 29 | 30 | struct Worker; 31 | 32 | /// TLS variable storing an ID of each thread 33 | #if defined(_MSC_VER) 34 | static __declspec(thread) uint32_t thread_id_tls = 0; 35 | #else 36 | static __thread uint32_t thread_id_tls = 0; 37 | #endif 38 | 39 | /// Data structure describing a pool of workers 40 | struct Pool { 41 | /// Queue of scheduled tasks 42 | TaskQueue queue; 43 | 44 | /// List of currently running worker threads 45 | std::vector> workers; 46 | 47 | /// Number of idle workers that have gone to sleep 48 | std::atomic asleep; 49 | 50 | /// Should denormalized floating point numbers be flushed to zero? 
51 | bool ftz = true; 52 | }; 53 | 54 | struct Worker { 55 | Pool *pool; 56 | std::thread thread; 57 | uint32_t id; 58 | bool stop; 59 | bool ftz; 60 | 61 | Worker(Pool *pool, uint32_t id, bool ftz); 62 | ~Worker(); 63 | void run(); 64 | }; 65 | 66 | 67 | static Pool *pool_default_inst = nullptr; 68 | static Lock pool_default_lock; 69 | static uint32_t cached_core_count = 0; 70 | 71 | uint32_t core_count() { 72 | // assumes atomic word size memory access 73 | if (cached_core_count) 74 | return cached_core_count; 75 | 76 | // Determine the number of present cores 77 | uint32_t ncores = std::thread::hardware_concurrency(); 78 | 79 | #if defined(__linux__) 80 | // Don't try to query CPU affinity if running inside Valgrind 81 | if (getenv("VALGRIND_OPTS") == nullptr) { 82 | /* Some of the cores may not be available to the user 83 | (e.g. on certain cluster nodes) -- determine the number 84 | of actual available cores here. */ 85 | uint32_t ncores_logical = ncores; 86 | size_t size = 0; 87 | cpu_set_t *cpuset = nullptr; 88 | int retval = 0; 89 | 90 | /* The kernel may expect a larger cpu_set_t than would 91 | be warranted by the physical core count. Keep querying 92 | with increasingly larger buffers if the 93 | pthread_getaffinity_np operation fails */ 94 | for (uint32_t i = 0; i < 10; ++i) { 95 | size = CPU_ALLOC_SIZE(ncores_logical); 96 | cpuset = CPU_ALLOC(ncores_logical); 97 | if (!cpuset) { 98 | fprintf(stderr, "nanothread: core_count(): Could not allocate cpu_set_t.\n"); 99 | return ncores; 100 | } 101 | CPU_ZERO_S(size, cpuset); 102 | 103 | int retval = pthread_getaffinity_np(pthread_self(), size, cpuset); 104 | if (retval == 0) 105 | break; 106 | CPU_FREE(cpuset); 107 | ncores_logical *= 2; 108 | } 109 | 110 | if (retval) { 111 | fprintf(stderr, "nanothread: core_count(): Could not read thread affinity map.\n"); 112 | return ncores; 113 | } 114 | 115 | uint32_t ncores_avail = 0; 116 | for (uint32_t i = 0; i < ncores_logical; ++i) 117 | ncores_avail += CPU_ISSET_S(i, size, cpuset) ? 
1 : 0; 118 | ncores = ncores_avail; 119 | CPU_FREE(cpuset); 120 | } 121 | #endif 122 | cached_core_count = ncores; 123 | return ncores; 124 | } 125 | 126 | 127 | uint32_t pool_thread_id() { 128 | return thread_id_tls; 129 | } 130 | 131 | Pool *pool_default() { 132 | std::unique_lock guard(pool_default_lock); 133 | 134 | if (!pool_default_inst) 135 | pool_default_inst = pool_create(); 136 | 137 | return pool_default_inst; 138 | } 139 | 140 | Pool *pool_create(uint32_t size, int ftz) { 141 | Pool *pool = new Pool(); 142 | pool->ftz = ftz != 0; 143 | if (size == (uint32_t) -1) 144 | size = core_count(); 145 | NT_TRACE("pool_create(%p)", pool); 146 | pool_set_size(pool, size); 147 | return pool; 148 | } 149 | 150 | 151 | void pool_destroy(Pool *pool) { 152 | if (pool) { 153 | pool_set_size(pool, 0); 154 | delete pool; 155 | } else if (pool_default_inst) { 156 | pool_destroy(pool_default_inst); 157 | pool_default_inst = nullptr; 158 | } 159 | } 160 | 161 | uint32_t pool_size(Pool *pool) { 162 | if (!pool) { 163 | std::unique_lock guard(pool_default_lock); 164 | pool = pool_default_inst; 165 | } 166 | 167 | if (pool) 168 | return (uint32_t) pool->workers.size(); 169 | else 170 | return core_count(); 171 | } 172 | 173 | void pool_set_size(Pool *pool, uint32_t size) { 174 | if (!pool) { 175 | std::unique_lock guard(pool_default_lock); 176 | pool = pool_default_inst; 177 | 178 | if (!pool) { 179 | pool = pool_default_inst = new Pool(); 180 | NT_TRACE("pool_create(%p)", pool); 181 | } 182 | } 183 | 184 | NT_TRACE("pool_set_size(%p, %u)", pool, size); 185 | 186 | int diff = (int) size - (int) pool->workers.size(); 187 | if (diff > 0) { 188 | // Launch extra worker threads 189 | for (int i = 0; i < diff; ++i) 190 | pool->workers.push_back(std::unique_ptr( 191 | new Worker(pool, (uint32_t) pool->workers.size() + 1, pool->ftz))); 192 | } else if (diff < 0) { 193 | // Remove worker threads (destructor calls join()) 194 | for (int i = diff; i != 0; ++i) 195 | pool->workers[pool->workers.size() + i]->stop = true; 196 | pool->queue.wakeup(); 197 | for (int i = diff; i != 0; ++i) 198 | pool->workers.pop_back(); 199 | } 200 | } 201 | 202 | int profile_tasks = false; 203 | 204 | int pool_profile() { 205 | return (int) profile_tasks; 206 | } 207 | 208 | void pool_set_profile(int value) { 209 | profile_tasks = (bool) value; 210 | } 211 | 212 | Task *task_submit_dep(Pool *pool, const Task *const *parent, 213 | uint32_t parent_count, uint32_t size, 214 | void (*func)(uint32_t, void *), void *payload, 215 | uint32_t payload_size, void (*payload_deleter)(void *), 216 | int async) { 217 | 218 | if (size == 0) { 219 | // There is no work, so the payload is irrelevant 220 | func = nullptr; 221 | 222 | // The queue requires task size >= 1 223 | size = 1; 224 | } 225 | 226 | // Does the task have parent tasks 227 | bool has_parent = false; 228 | for (uint32_t i = 0; i < parent_count; ++i) 229 | has_parent |= parent[i] != nullptr; 230 | 231 | // If this is a small work unit, execute it right away 232 | if (size == 1 && !has_parent && async == 0) { 233 | NT_TRACE("task_submit_dep(): task is small, executing right away"); 234 | 235 | if (!profile_tasks) { 236 | if (func) 237 | func(0, payload); 238 | 239 | if (payload_deleter) 240 | payload_deleter(payload); 241 | 242 | // Don't even return a task.. 
243 | return nullptr; 244 | } else { 245 | if (!pool) 246 | pool = pool_default(); 247 | 248 | Task *task = pool->queue.alloc(size); 249 | 250 | if (profile_tasks) { 251 | #if defined(_WIN32) 252 | QueryPerformanceCounter(&task->time_start); 253 | #elif defined(__APPLE__) 254 | task->time_start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 255 | #else 256 | clock_gettime(CLOCK_MONOTONIC, &task->time_start); 257 | #endif 258 | } 259 | 260 | if (func) 261 | func(0, payload); 262 | 263 | if (profile_tasks) { 264 | #if defined(_WIN32) 265 | QueryPerformanceCounter(&task->time_end); 266 | #elif defined(__APPLE__) 267 | task->time_end = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 268 | #else 269 | clock_gettime(CLOCK_MONOTONIC, &task->time_end); 270 | #endif 271 | } 272 | 273 | if (payload_deleter) 274 | payload_deleter(payload); 275 | 276 | task->refcount.store(high_bit, std::memory_order_relaxed); 277 | task->exception_used.store(false, std::memory_order_relaxed); 278 | task->exception = nullptr; 279 | task->size = size; 280 | task->func = func; 281 | task->pool = pool; 282 | task->payload = nullptr; 283 | task->payload_deleter = nullptr; 284 | 285 | return task; 286 | } 287 | } 288 | 289 | // Size 0 is equivalent to size 1, but without the above optimization 290 | if (size == 0) 291 | size = 1; 292 | 293 | if (!pool) 294 | pool = pool_default(); 295 | 296 | Task *task = pool->queue.alloc(size); 297 | task->exception_used.store(false, std::memory_order_relaxed); 298 | task->exception = nullptr; 299 | 300 | if (has_parent) { 301 | // Prevent early job submission due to completion of parents 302 | task->wait_parents.store(1, std::memory_order_release); 303 | 304 | // Register dependencies in queue, will further increase child->wait_parents 305 | for (uint32_t i = 0; i < parent_count; ++i) 306 | pool->queue.add_dependency((Task *) parent[i], task); 307 | } 308 | 309 | task->size = size; 310 | task->func = func; 311 | task->pool = pool; 312 | 313 | if (payload) { 314 | if (payload_deleter || payload_size == 0) { 315 | task->payload = payload; 316 | task->payload_deleter = payload_deleter; 317 | } else if (payload_size <= sizeof(Task::payload_storage)) { 318 | task->payload = task->payload_storage; 319 | memcpy(task->payload_storage, payload, payload_size); 320 | task->payload_deleter = nullptr; 321 | } else { 322 | /* Payload doesn't fit into temporary storage, and no 323 | custom deleter was provided. Make a temporary copy. */ 324 | task->payload = malloc(payload_size); 325 | task->payload_deleter = free; 326 | NT_ASSERT(task->payload != nullptr); 327 | memcpy(task->payload, payload, payload_size); 328 | } 329 | } else { 330 | task->payload = nullptr; 331 | task->payload_deleter = nullptr; 332 | } 333 | 334 | bool push = true; 335 | if (has_parent) { 336 | /* Undo the earlier 'wait' increment. If the value is now zero, all 337 | parent tasks have completed and the job can be pushed. Otherwise, 338 | it's somebody else's job to carry out this step. 
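           Concretely, the store of 1 further above keeps 'wait_parents' nonzero
           while the dependencies are being registered; whichever decrement brings
           the counter to zero -- the fetch_sub below, or a parent's completion
           handler in TaskQueue::release() -- is the one that pushes the task.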
*/ 339 | push = task->wait_parents.fetch_sub(1) == 1; 340 | } 341 | 342 | if (push) 343 | pool->queue.push(task); 344 | 345 | return task; 346 | } 347 | 348 | static void pool_execute_task(Pool *pool, bool (*stopping_criterion)(void *), 349 | void *payload, bool may_sleep) { 350 | Task *task; 351 | uint32_t index; 352 | std::tie(task, index) = 353 | pool->queue.pop_or_sleep(stopping_criterion, payload, may_sleep); 354 | 355 | if (task) { 356 | if (task->func) { 357 | if (task->exception_used.load()) { 358 | NT_TRACE( 359 | "not running callback (task=%p, index=%u) because another " 360 | "work unit of this task generated an exception", 361 | task, index); 362 | } else { 363 | try { 364 | NT_TRACE("running callback (task=%p, index=%u, payload=%p)", task, index, task->payload); 365 | task->func(index, task->payload); 366 | } catch (...) { 367 | bool value = false; 368 | if (task->exception_used.compare_exchange_strong(value, true)) { 369 | NT_TRACE("exception caught, storing.."); 370 | task->exception = std::current_exception(); 371 | } else { 372 | NT_TRACE("exception caught, ignoring (an exception was already stored)"); 373 | } 374 | } 375 | } 376 | } 377 | 378 | pool->queue.release(task); 379 | } 380 | } 381 | 382 | void pool_work_until(Pool *pool, bool (*stopping_criterion)(void *), void *payload) { 383 | if (!pool) 384 | pool = pool_default_inst; 385 | if (!pool) 386 | return; 387 | while (!stopping_criterion(payload)) 388 | pool_execute_task(pool, stopping_criterion, payload, false); 389 | } 390 | 391 | #if defined(__SSE2__) 392 | struct FTZGuard { 393 | FTZGuard(bool enable) : enable(enable) { 394 | if (enable) { 395 | csr = _mm_getcsr(); 396 | _mm_setcsr(csr | (_MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON)); 397 | } 398 | } 399 | 400 | ~FTZGuard() { 401 | if (enable) 402 | _mm_setcsr(csr); 403 | } 404 | 405 | bool enable; 406 | int csr; 407 | }; 408 | #else 409 | struct FTZGuard { FTZGuard(bool) { } }; 410 | #endif 411 | 412 | void task_wait(Task *task) { 413 | if (task) { 414 | Pool *pool = task->pool; 415 | FTZGuard ftz_guard(pool->ftz); 416 | 417 | // Signal that we are waiting for this task 418 | task->wait_count++; 419 | 420 | auto stopping_criterion = [](void *ptr) -> bool { 421 | return (uint32_t)(((Task *) ptr)->refcount.load()) == 0; 422 | }; 423 | 424 | NT_TRACE("task_wait(%p)", task); 425 | 426 | // Help executing work units in the meantime 427 | while (!stopping_criterion(task)) 428 | pool_execute_task(pool, stopping_criterion, task, true); 429 | 430 | task->wait_count--; 431 | 432 | if (task->exception) 433 | std::rethrow_exception(task->exception); 434 | } 435 | } 436 | 437 | void task_retain(Task *task) { 438 | if (task) 439 | task->pool->queue.retain(task); 440 | } 441 | 442 | void task_release(Task *task) { 443 | if (task) 444 | task->pool->queue.release(task, true); 445 | } 446 | 447 | void task_wait_and_release(Task *task) NANOTHREAD_THROW { 448 | try { 449 | task_wait(task); 450 | } catch (...) 
{ 451 | task_release(task); 452 | throw; 453 | } 454 | task_release(task); 455 | } 456 | 457 | #if defined(_WIN32) 458 | static double timer_frequency_scale = 0.0; 459 | #endif 460 | 461 | NANOTHREAD_EXPORT double task_time(Task *task) NANOTHREAD_THROW { 462 | if (!task) 463 | return 0; 464 | 465 | #if defined(__APPLE__) 466 | return (task->time_end - task->time_start) * 1e-6; 467 | #elif !defined(_WIN32) 468 | return (task->time_end.tv_sec - task->time_start.tv_sec) * 1e3 + 469 | (task->time_end.tv_nsec - task->time_start.tv_nsec) * 1e-6; 470 | #else 471 | if (timer_frequency_scale == 0.0) { 472 | LARGE_INTEGER timer_frequency; 473 | QueryPerformanceFrequency(&timer_frequency); 474 | timer_frequency_scale = 1e3 / timer_frequency.QuadPart; 475 | } 476 | 477 | return timer_frequency_scale * 478 | (task->time_end.QuadPart - task->time_start.QuadPart); 479 | #endif 480 | } 481 | 482 | Worker::Worker(Pool *pool, uint32_t id, bool ftz) 483 | : pool(pool), id(id), stop(false), ftz(ftz) { 484 | thread = std::thread(&Worker::run, this); 485 | } 486 | 487 | Worker::~Worker() { thread.join(); } 488 | 489 | void Worker::run() { 490 | thread_id_tls = id; 491 | 492 | NT_TRACE("worker started"); 493 | 494 | #if defined(_WIN32) 495 | wchar_t buf[24]; 496 | _snwprintf(buf, sizeof(buf) / sizeof(wchar_t), L"nanothread worker %u", id); 497 | SetThreadDescription(GetCurrentThread(), buf); 498 | #else 499 | char buf[24]; 500 | snprintf(buf, sizeof(buf), "nanothread worker %u", id); 501 | #if defined(__APPLE__) 502 | pthread_setname_np(buf); 503 | #else 504 | pthread_setname_np(pthread_self(), buf); 505 | #endif 506 | #endif 507 | 508 | FTZGuard ftz_guard(ftz); 509 | while (!stop) 510 | pool_execute_task( 511 | pool, [](void *ptr) -> bool { return *((bool *) ptr); }, &stop, 512 | true); 513 | 514 | NT_TRACE("worker stopped"); 515 | 516 | thread_id_tls = 0; 517 | } 518 | -------------------------------------------------------------------------------- /src/queue.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | src/queue.cpp -- Lock-free task queue implementation used by nanothread 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 
8 | */ 9 | 10 | #include "queue.h" 11 | #include 12 | #include 13 | #include 14 | 15 | #if defined(_WIN32) 16 | # include 17 | #endif 18 | 19 | #if defined(_MSC_VER) 20 | # include 21 | #elif defined(__SSE2__) 22 | # include 23 | #endif 24 | 25 | /// Put worker threads to sleep after 500K attempts to get work 26 | #define NANOTHREAD_MAX_ATTEMPTS 500000 27 | 28 | /// Reduce power usage in busy-wait CAS loops 29 | static void cas_pause() { 30 | #if defined(_M_X64) || defined(__SSE2__) 31 | _mm_pause(); 32 | #endif 33 | } 34 | 35 | /// Atomic 16 byte compare-and-swap & release barrier on ARM 36 | static bool cas(Task::Ptr &ptr, Task::Ptr &expected, Task::Ptr desired) { 37 | #if defined(_MSC_VER) 38 | #if defined(_M_ARM64) 39 | return _InterlockedCompareExchange128_rel( 40 | (__int64 volatile *) &ptr, (__int64) desired.value, 41 | (__int64) desired.task, (__int64 *) &expected); 42 | #else 43 | return _InterlockedCompareExchange128( 44 | (__int64 volatile *) &ptr, (__int64) desired.value, 45 | (__int64) desired.task, (__int64 *) &expected); 46 | #endif 47 | #else 48 | return __atomic_compare_exchange(&ptr, &expected, &desired, true, 49 | __ATOMIC_RELEASE, __ATOMIC_ACQUIRE); 50 | #endif 51 | } 52 | 53 | // *Non-atomic* 16 byte load, acquire barrier on ARM 54 | static Task::Ptr ldar(Task::Ptr &source) { 55 | #if defined(_MSC_VER) 56 | using P = unsigned __int64 volatile *; 57 | #if defined(_M_ARM64) 58 | uint64_t value_1 = __ldar64((P) &source); 59 | uint64_t value_2 = __ldar64(((P) &source) + 1); 60 | #else 61 | uint64_t value_1 = *((P) &source); 62 | uint64_t value_2 = *(((P) &source) + 1); 63 | #endif 64 | return Task::Ptr{ (Task *) value_1, (uint64_t) value_2 }; 65 | #else 66 | uint64_t value_1 = __atomic_load_n((uint64_t *) &source, __ATOMIC_ACQUIRE); 67 | uint64_t value_2 = __atomic_load_n((((uint64_t *) &source) + 1), __ATOMIC_ACQUIRE); 68 | return Task::Ptr{ (Task *) value_1, value_2 }; 69 | #endif 70 | } 71 | 72 | TaskQueue::TaskQueue() : tasks_created(0), sleep_state(0) { 73 | head = Task::Ptr(alloc(0)); 74 | tail = head; 75 | } 76 | 77 | TaskQueue::~TaskQueue() { 78 | uint32_t created = tasks_created.load(), 79 | deleted = 0, incomplete = 0, 80 | incomplete_size = 0; 81 | 82 | // Free jobs that are still in the queue 83 | Task::Ptr ptr = head; 84 | while (ptr.task) { 85 | Task *task = ptr.task; 86 | 87 | if (ptr.remain() != 0) { 88 | incomplete_size += ptr.remain(); 89 | incomplete++; 90 | } 91 | 92 | for (Task *child : task->children) { 93 | uint32_t wait = child->wait_parents.fetch_sub(1); 94 | NT_ASSERT(wait != 0); 95 | if (wait == 1) 96 | push(child); 97 | } 98 | 99 | task->clear(); 100 | deleted++; 101 | ptr = task->next; 102 | delete task; 103 | } 104 | 105 | // Free jobs on the free-job stack 106 | ptr = recycle; 107 | while (ptr.task) { 108 | Task *task = ptr.task; 109 | NT_ASSERT(task->payload == nullptr && task->children.empty()); 110 | deleted++; 111 | ptr = task->next; 112 | delete task; 113 | } 114 | 115 | if (created != deleted) 116 | fprintf(stderr, 117 | "nanothread: %u/%u tasks were leaked! 
Did you forget to call " 118 | "task_release()?\n", created - deleted, created); 119 | 120 | if (incomplete > 0) 121 | fprintf(stderr, "nanothread: %u tasks with %u work units were not " 122 | "completed!\n", incomplete, incomplete_size); 123 | } 124 | 125 | Task *TaskQueue::alloc(uint32_t size) { 126 | Task::Ptr node = ldar(recycle); 127 | 128 | while (true) { 129 | // Stop if stack is empty 130 | if (!node) 131 | break; 132 | 133 | // Load the next node 134 | Task::Ptr next = ldar(node.task->next); 135 | 136 | // Next, try to move it to the stack head 137 | if (cas(recycle, node, node.update_task(next.task))) 138 | break; 139 | 140 | cas_pause(); 141 | } 142 | 143 | Task *task; 144 | 145 | if (node.task) { 146 | task = node.task; 147 | } else { 148 | task = new Task(); 149 | tasks_created++; 150 | } 151 | 152 | task->next = Task::Ptr(); 153 | task->refcount.store(size + (size == 0 ? high_bit : (3 * high_bit)), 154 | std::memory_order_relaxed); 155 | task->wait_parents.store(0, std::memory_order_relaxed); 156 | task->wait_count.store(0, std::memory_order_relaxed); 157 | task->size = size; 158 | memset(&task->time_start, 0, sizeof(task->time_start)); 159 | memset(&task->time_end, 0, sizeof(task->time_end)); 160 | 161 | NT_TRACE("created new task %p with size=%u", task, size); 162 | 163 | return task; 164 | } 165 | 166 | void TaskQueue::release(Task *task, bool high) { 167 | uint64_t result = task->refcount.fetch_sub(high ? high_bit : 1); 168 | uint32_t ref_lo = (uint32_t) result, 169 | ref_hi = (uint32_t) (result >> 32); 170 | 171 | NT_ASSERT((!high || ref_hi > 0) && (high || ref_lo > 0)); 172 | ref_hi -= (uint32_t) high; 173 | ref_lo -= (uint32_t) !high; 174 | 175 | NT_TRACE("dec_ref(%p, (%i, %i)) -> ref = (%u, %u)", task, (int) high, 176 | (int) !high, ref_hi, ref_lo); 177 | 178 | // If all work has completed: schedule children and free payload 179 | if (!high && ref_lo == 0) { 180 | NT_TRACE("all work associated with task %p has completed.", task); 181 | 182 | if (profile_tasks) { 183 | #if defined(_WIN32) 184 | QueryPerformanceCounter(&task->time_end); 185 | #elif defined(__APPLE__) 186 | task->time_end = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 187 | #else 188 | clock_gettime(CLOCK_MONOTONIC, &task->time_end); 189 | #endif 190 | } 191 | 192 | for (Task *child : task->children) { 193 | uint32_t wait = child->wait_parents.fetch_sub(1); 194 | 195 | NT_TRACE("notifying child %p of task %p: wait=%u", child, task, 196 | wait - 1); 197 | 198 | NT_ASSERT(wait > 0); 199 | 200 | if (task->exception_used.load()) { 201 | bool expected = false; 202 | if (child->exception_used.compare_exchange_strong(expected, true)) { 203 | NT_TRACE("propagating exception to child %p of task %p.", 204 | child, task); 205 | child->exception = task->exception; 206 | } else { 207 | NT_TRACE("not propagating exception to child %p of " 208 | "task %p (already stored).", child, task); 209 | } 210 | } 211 | 212 | if (wait == 1) { 213 | NT_TRACE("Child %p of task %p is ready for execution.", child, 214 | task); 215 | push(child); 216 | } 217 | } 218 | 219 | task->clear(); 220 | 221 | // Possible that waiting threads were put to sleep 222 | if (task->wait_count.load() > 0) 223 | wakeup(); 224 | 225 | release(task, true); 226 | } else if (high && ref_hi == 0) { 227 | // Nobody holds any references at this point, recycle task 228 | 229 | NT_ASSERT(ref_lo == 0); 230 | NT_TRACE("all usage of task %p is done, recycling.", task); 231 | 232 | Task::Ptr node = ldar(recycle); 233 | while (true) { 234 | task->next = node; 235 | 236 
| if (cas(recycle, node, node.update_task(task))) 237 | break; 238 | 239 | cas_pause(); 240 | } 241 | } 242 | } 243 | 244 | void TaskQueue::add_dependency(Task *parent, Task *child) { 245 | if (!parent) 246 | return; 247 | 248 | uint64_t refcount = 249 | parent->refcount.load(std::memory_order_relaxed); 250 | 251 | /* Increase the parent task's reference count to prevent the cleanup 252 | handler in release() from starting while the following executes. */ 253 | while (true) { 254 | if ((uint32_t) refcount == 0) { 255 | // Parent task has already completed 256 | if (parent->exception_used.load()) { 257 | bool expected = false; 258 | if (child->exception_used.compare_exchange_strong(expected, true)) { 259 | NT_TRACE("propagating exception to child %p of task %p.", 260 | child, parent); 261 | child->exception = parent->exception; 262 | } else { 263 | NT_TRACE("not propagating exception to child %p of " 264 | "task %p (already stored).", child, parent); 265 | } 266 | } 267 | return; 268 | } 269 | 270 | if (parent->refcount.compare_exchange_weak(refcount, refcount + 1, 271 | std::memory_order_release, 272 | std::memory_order_relaxed)) 273 | break; 274 | 275 | cas_pause(); 276 | } 277 | 278 | // Otherwise, register the child task with the parent 279 | parent->children.push_back(child); 280 | uint32_t wait = ++child->wait_parents; 281 | (void) wait; 282 | 283 | NT_TRACE("registering dependency: parent=%p, child=%p, child->wait=%u", 284 | parent, child, wait); 285 | 286 | /* Undo the parent->refcount change. If the task completed in the 287 | meantime, child->wait_parents will also be decremented by 288 | this call. */ 289 | release(parent); 290 | } 291 | 292 | void TaskQueue::retain(Task *task) { 293 | NT_TRACE("retain(task=%p)", task); 294 | task->refcount.fetch_add(high_bit); 295 | } 296 | 297 | void TaskQueue::push(Task *task) { 298 | uint32_t size = task->size; 299 | 300 | NT_TRACE("push(task=%p, size=%u)", task, size); 301 | 302 | while (true) { 303 | // Lead tail and tail->next, and double-check, in this order 304 | Task::Ptr tail_c = ldar(tail); 305 | Task::Ptr &next = tail_c.task->next; 306 | Task::Ptr next_c = ldar(next); 307 | Task::Ptr tail_c_2 = ldar(tail); 308 | 309 | // Detect inconsistencies due to contention 310 | if (tail_c == tail_c_2) { 311 | if (!next_c.task) { 312 | // Tail was pointing to last node, try to insert here 313 | if (cas(next, next_c, Task::Ptr(task, size))) { 314 | // Best-effort attempt to redirect tail to the added element 315 | cas(tail, tail_c, tail_c.update_task(task)); 316 | break; 317 | } 318 | } else { 319 | // Tail wasn't pointing to the last node, try to update 320 | cas(tail, tail_c, tail_c.update_task(next_c.task)); 321 | } 322 | } 323 | 324 | cas_pause(); 325 | } 326 | 327 | // Wake sleeping threads, if any 328 | if (sleep_state.load(std::memory_order_acquire) & low_mask) 329 | wakeup(); 330 | } 331 | 332 | std::pair TaskQueue::pop() { 333 | uint32_t index; 334 | Task *task; 335 | 336 | while (true) { 337 | // Lead head, tail, and next element, and double-check, in this order 338 | Task::Ptr head_c = ldar(head); 339 | Task::Ptr tail_c = ldar(tail); 340 | Task::Ptr &next = head_c.task->next; 341 | Task::Ptr next_c = ldar(next); 342 | Task::Ptr head_c_2 = ldar(head); 343 | 344 | // Detect inconsistencies due to contention 345 | if (head_c == head_c_2) { 346 | if (head_c.task != tail_c.task) { 347 | uint32_t remain = next_c.remain(); 348 | 349 | if (remain > 1) { 350 | // More than 1 remaining work units, update work counter 351 | if (cas(next, next_c, 
next_c.update_remain(remain - 1))) { 352 | task = next_c.task; 353 | index = task->size - remain; 354 | break; 355 | } 356 | } else { 357 | NT_ASSERT(remain == 1); 358 | // Head node is removed from the queue, reduce refcount 359 | if (cas(head, head_c, head_c.update_task(next_c.task))) { 360 | task = next_c.task; 361 | index = task->size - 1; 362 | release(head_c.task, true); 363 | break; 364 | } 365 | } 366 | } else { 367 | // Task queue was empty 368 | if (!next_c.task) { 369 | task = nullptr; 370 | index = 0; 371 | cas_pause(); 372 | break; 373 | } else { 374 | // Advance the tail, it's falling behind 375 | cas(tail, tail_c, tail_c.update_task(next_c.task)); 376 | } 377 | } 378 | } 379 | 380 | cas_pause(); 381 | } 382 | 383 | if (task) { 384 | NT_TRACE("pop(task=%p, index=%u)", task, index); 385 | 386 | if (index == 0 && profile_tasks) { 387 | #if defined(_WIN32) 388 | QueryPerformanceCounter(&task->time_start); 389 | #elif defined(__APPLE__) 390 | task->time_start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); 391 | #else 392 | clock_gettime(CLOCK_MONOTONIC, &task->time_start); 393 | #endif 394 | } 395 | } 396 | 397 | return { task, index }; 398 | } 399 | 400 | void TaskQueue::wakeup() { 401 | std::unique_lock<std::mutex> guard(sleep_mutex); 402 | uint64_t value = sleep_state.load(); 403 | NT_TRACE("wakeup(): sleep_state := (%u, 0)", (uint32_t) (sleep_state >> 32) + 1); 404 | sleep_state = (value + high_bit) & high_mask; 405 | sleep_cv.notify_all(); 406 | } 407 | 408 | #if defined(NT_DEBUG) 409 | double time_milliseconds() { 410 | #if defined(_WIN32) 411 | LARGE_INTEGER ticks, ticks_per_sec; 412 | QueryPerformanceCounter(&ticks); 413 | QueryPerformanceFrequency(&ticks_per_sec); 414 | return (double) (ticks.QuadPart * 1000) / (double) ticks_per_sec.QuadPart; 415 | #elif defined(__APPLE__) 416 | return clock_gettime_nsec_np(CLOCK_UPTIME_RAW) / 1000000.0; 417 | #else 418 | struct timespec ts; 419 | clock_gettime(CLOCK_MONOTONIC, &ts); 420 | return ts.tv_sec * 1000 + ts.tv_nsec / 1000000.0; 421 | #endif 422 | } 423 | #endif 424 | 425 | std::pair<Task *, uint32_t> 426 | TaskQueue::pop_or_sleep(bool (*stopping_criterion)(void *), void *payload, 427 | bool may_sleep) { 428 | std::pair<Task *, uint32_t> result(nullptr, 0); 429 | uint32_t attempts = 0; 430 | 431 | #if defined(NT_DEBUG) 432 | double start = time_milliseconds(); 433 | #endif 434 | 435 | while (true) { 436 | result = pop(); 437 | 438 | if (result.first || stopping_criterion(payload)) 439 | break; 440 | 441 | attempts++; 442 | 443 | if (may_sleep && attempts >= NANOTHREAD_MAX_ATTEMPTS) { 444 | std::unique_lock<std::mutex> guard(sleep_mutex); 445 | 446 | uint64_t value = ++sleep_state, phase = value & high_mask; 447 | NT_TRACE("pop_or_sleep(): falling asleep after %.2f milliseconds, " 448 | "sleep_state := (%u, %u)!", 449 | time_milliseconds() - start, (uint32_t)(value >> 32), 450 | (uint32_t) value); 451 | 452 | // Try once more to fetch a job 453 | result = pop(); 454 | 455 | /* If the following is true, somebody added work, or the stopping 456 | criterion became active while this thread was about to go to sleep. */ 457 | if (result.first || stopping_criterion(payload)) { 458 | // Reduce sleep_state if we're still in the same phase. 459 | NT_TRACE("sleep aborted."); 460 | while (true) { 461 | if (sleep_state.compare_exchange_strong(value, value - 1)) 462 | break; 463 | if ((value & high_mask) != phase) 464 | break; 465 | cas_pause(); 466 | } 467 | break; 468 | } 469 | 470 | /* The push() code above has the structure 471 | 472 | - A1. Enqueue work 473 | - A2.
Check sleep_state, and wake threads if nonzero 474 | 475 | While the code here has the structure 476 | 477 | - B1. Increase sleep_state 478 | - B2. Try to dequeue work 479 | - B3. Wait for wakeup signal 480 | 481 | This ordering excludes the possibility that the thread sleeps 482 | erroneously while work is available or added later on. 483 | */ 484 | 485 | while ((sleep_state & high_mask) == phase) 486 | sleep_cv.wait(guard); 487 | 488 | value = sleep_state.load(); 489 | NT_TRACE("pop_or_sleep(): woke up -- sleep_state=(%u, %u)", 490 | (uint32_t)(value >> 32), (uint32_t) value); 491 | } 492 | } 493 | 494 | return result; 495 | } 496 | -------------------------------------------------------------------------------- /src/queue.h: -------------------------------------------------------------------------------- 1 | /* 2 | src/queue.h -- Lock-free task queue implementation used by nanothread 3 | 4 | Copyright (c) 2021 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a BSD-style 7 | license that can be found in the LICENSE file. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include <nanothread/nanothread.h> 13 | #include <atomic> 14 | #include <condition_variable> 15 | #include <cstring> 16 | #include <exception> 17 | #include <utility> 18 | #include <vector> 19 | 20 | #if defined(_WIN32) 21 | # include <windows.h> 22 | # include <shared_mutex> 23 | using Lock = std::shared_mutex; // Prefer (more efficient) shared_mutex on Windows 24 | #else 25 | # include <mutex> 26 | using Lock = std::mutex; 27 | #endif 28 | 29 | 30 | struct Pool; 31 | 32 | constexpr uint64_t high_bit = (uint64_t) 0x0000000100000000ull; 33 | constexpr uint64_t high_mask = (uint64_t) 0xFFFFFFFF00000000ull; 34 | constexpr uint64_t low_mask = (uint64_t) 0x00000000FFFFFFFFull; 35 | 36 | inline uint64_t shift(uint32_t value) { return ((uint64_t) value) << 32; } 37 | 38 | struct Task { 39 | /** 40 | * \brief Wide 16 byte pointer to a task in the worker pool. In addition to the 41 | * pointer itself, it encapsulates two more pieces of information: 42 | * 43 | * 1. The lower 32 bit of the \c value field store how many remaining work 44 | * units the task contains 45 | * 46 | * 2. The upper 32 bit of the \c value field contain a counter to prevent 47 | * the ABA problem during atomic updates. 48 | */ 49 | struct alignas(16) Ptr { 50 | Task *task; 51 | uint64_t value; 52 | 53 | Ptr(Task *task = nullptr, uint64_t value = 0) : task(task), value(value) { } 54 | 55 | Task::Ptr update_remain(uint32_t remain = 0) const { 56 | return Ptr{ task, remain | ((value & high_mask) + high_bit) }; 57 | } 58 | 59 | Task::Ptr update_task(Task *new_task) const { 60 | return Ptr{ new_task, (value & high_mask) + high_bit }; 61 | } 62 | 63 | operator bool() const { return task != nullptr; } 64 | 65 | uint32_t remain() const { return (uint32_t) value; } 66 | 67 | bool operator==(const Task::Ptr &other) const { 68 | return task == other.task && value == other.value; 69 | } 70 | }; 71 | 72 | /// Singly linked list, points to the next element 73 | Task::Ptr next; 74 | 75 | /** 76 | * \brief Reference count of this instance 77 | * 78 | * The reference count is arranged as a 2-tuple of 32 bit counters. When 79 | * submitting a work unit, its reference count is initially set to (3, 80 | * size), where \c size is the number of associated work units. The 81 | * number '3' indicates three special references 82 | * 83 | * - 1. A reference by the user code, which may e.g. wait for task completion 84 | * - 2. A reference as part of the queue data structure 85 | * - 3.
A reference because the lower part is nonzero 86 | * 87 | * The function TaskQueue::release(task, high=true/false) can be 88 | * used to reduce the high and low parts separately. 89 | * 90 | * When the low part reaches zero, it is assumed that all associated work 91 | * units have been completed, at which point child tasks are scheduled 92 | * and the task's payload is cleared. When both high and low parts reach 93 | * zero, it is assumed that no part of the system holds a reference to the 94 | * task, and it can be recycled. 95 | */ 96 | std::atomic<uint64_t> refcount; 97 | 98 | /// Number of parent tasks that this task is waiting for 99 | std::atomic<uint32_t> wait_parents; 100 | 101 | /// Number of threads that are waiting for this task in task_wait() 102 | std::atomic<uint32_t> wait_count; 103 | 104 | /// Total number of work units in this task 105 | uint32_t size; 106 | 107 | /// Callback of the work unit 108 | void (*func)(uint32_t, void *); 109 | 110 | /// Pool that this task belongs to 111 | Pool *pool; 112 | 113 | /// Payload to be delivered to 'func' 114 | void *payload; 115 | 116 | /// Custom deleter used to free 'payload' 117 | void (*payload_deleter)(void *); 118 | 119 | /// Successor tasks that depend on this task 120 | std::vector<Task *> children; 121 | 122 | /// Atomic flag stating whether the 'exception' field is already used 123 | std::atomic<bool> exception_used; 124 | 125 | /// Pointer to an exception in case the task failed 126 | std::exception_ptr exception; 127 | 128 | #if defined(__APPLE__) 129 | uint64_t time_start, time_end; 130 | #elif !defined(_WIN32) 131 | timespec time_start, time_end; 132 | #else 133 | LARGE_INTEGER time_start, time_end; 134 | #endif 135 | 136 | /// Fixed-size payload storage region 137 | alignas(8) uint8_t payload_storage[256]; 138 | 139 | void clear() { 140 | if (payload_deleter) 141 | payload_deleter(payload); 142 | payload_deleter = nullptr; 143 | payload = nullptr; 144 | children.clear(); 145 | #if !defined(NDEBUG) 146 | memset(payload_storage, 0xFF, sizeof(payload_storage)); 147 | #endif 148 | } 149 | }; 150 | 151 | /** 152 | * Modified implementation of the lock-free queue presented in the paper 153 | * 154 | * "Simple, fast and practical non-blocking and blocking concurrent queue algorithms" 155 | * by Maged Michael and Michael Scott. 156 | * 157 | * The main difference compared to a Michael-Scott queue is that each queue 158 | * item also has a *size* \c N that effectively creates \c N adjacent copies of 159 | * the item (but using a counter, which is more efficient than naive 160 | * replication). The \ref pop() operation returns a pointer to the item and 161 | * a number in the range [0, N-1] indicating the item's index. 162 | * 163 | * Tasks can also have children. Following termination of a task, the queue 164 | * will push any children that don't depend on other unfinished work. 165 | * 166 | * The implementation here is designed to work on standard weakly ordered 167 | * memory architectures (e.g. AArch64), but would likely not work on a 168 | * completely weakly ordered architecture like the DEC Alpha. 169 | */ 170 | struct TaskQueue { 171 | public: 172 | /// Create an empty task queue 173 | TaskQueue(); 174 | 175 | /// Free the queue and delete any remaining tasks 176 | ~TaskQueue(); 177 | 178 | /** 179 | * \brief Allocate a new task record consisting of \c size work units 180 | * 181 | * The implementation tries to fetch an available task instance from a 182 | * pool of completed tasks, if possible. Otherwise, a new task is created.
183 | * 184 | * It is assumed that the caller will populate the remaining fields of the 185 | * returned task and then invoke \ref push() to submit the task to the 186 | * queue. The reference count of the returned task is initially set to 187 | * (2, size), where \c size is the number of associated work 188 | * units. The number '2' indicates two special references by user code and 189 | * by the queue itself, which don't correspond to outstanding work. 190 | * 191 | * Initializes the task's \c wait, \c size, \c refcount, and \c next fields. 192 | */ 193 | Task *alloc(uint32_t size); 194 | 195 | /** 196 | * \brief Decrease the reference count of a task. 197 | * 198 | * The implementation moves the task into a pool of completed tasks once 199 | * the task is no longer referenced by any thread or data structure. 200 | */ 201 | void release(Task *task, bool high = false); 202 | 203 | /// Increase the reference count of a task. 204 | void retain(Task *task); 205 | 206 | /// Append a task at the end of the queue 207 | void push(Task *task); 208 | 209 | /// Register an inter-task dependency 210 | void add_dependency(Task *parent, Task *child); 211 | 212 | /** 213 | * \brief Pop a task from the queue 214 | * 215 | * When the queue is nonempty, this function returns a task instance and a 216 | * number in the range [0, size - 1], where \c size is the number 217 | * of work units in the task. Otherwise, it returns \c nullptr and 0. 218 | */ 219 | std::pair<Task *, uint32_t> pop(); 220 | 221 | /** 222 | * \brief Fetch a task from the queue, or sleep 223 | * 224 | * This function repeatedly tries to fetch work from the queue and sleeps 225 | * if no work is available for an extended amount of time (~50 ms) and 226 | * the \c may_sleep parameter is set to \c true. 227 | * 228 | * The function stops trying to acquire work and returns (nullptr, 229 | * 0) when the supplied function stopping_criterion(payload) 230 | * evaluates to true. 231 | */ 232 | std::pair<Task *, uint32_t> pop_or_sleep(bool (*stopping_criterion)(void *), 233 | void *payload, bool may_sleep); 234 | 235 | /// Wake sleeping threads 236 | void wakeup(); 237 | 238 | private: 239 | /// Head and tail of a lock-free list data structure 240 | Task::Ptr head, tail; 241 | 242 | /// Head of a lock-free stack storing unused tasks 243 | Task::Ptr recycle; 244 | 245 | /// Number of task instances created (for debugging) 246 | std::atomic<uint32_t> tasks_created; 247 | 248 | /// Upper 32 bit: sleep phase, lower 32 bit: number of sleepers 249 | std::atomic<uint64_t> sleep_state; 250 | 251 | /// Mutex protecting the fields below 252 | std::mutex sleep_mutex; 253 | 254 | /// Condition variable used to manage workers that are asleep 255 | std::condition_variable sleep_cv; 256 | }; 257 | 258 | 259 | extern "C" uint32_t pool_thread_id(); 260 | 261 | extern int profile_tasks; 262 | 263 | #define NT_STR_2(x) #x 264 | #define NT_STR(x) NT_STR_2(x) 265 | 266 | // #define NT_DEBUG 267 | #if defined(NT_DEBUG) 268 | # define NT_TRACE(fmt, ...) \ 269 | fprintf(stderr, "%03u: " fmt "\n", pool_thread_id(), ##__VA_ARGS__) 270 | #else 271 | # define NT_TRACE(fmt, ...)
do { } while (0) 272 | #endif 273 | 274 | #define NT_ASSERT(x) \ 275 | if (!(x)) { \ 276 | fprintf(stderr, "Assertion failed in " __FILE__ \ 277 | ":" NT_STR(__LINE__) ": " #x "\n"); \ 278 | abort(); \ 279 | } 280 | 281 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(test_01 test_01.c) 2 | target_link_libraries(test_01 PRIVATE nanothread) 3 | 4 | add_executable(test_02 test_02.cpp) 5 | target_link_libraries(test_02 PRIVATE nanothread) 6 | target_compile_features(test_02 PRIVATE cxx_std_11) 7 | 8 | add_executable(test_03 test_03.cpp) 9 | target_link_libraries(test_03 PRIVATE nanothread) 10 | target_compile_features(test_03 PRIVATE cxx_std_14) 11 | 12 | add_executable(test_04 test_04.cpp) 13 | target_link_libraries(test_04 PRIVATE nanothread) 14 | target_compile_features(test_04 PRIVATE cxx_std_11) 15 | -------------------------------------------------------------------------------- /tests/test_01.c: -------------------------------------------------------------------------------- 1 | #include <nanothread/nanothread.h> 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <string.h> 5 | 6 | #if defined(_WIN32) 7 | # include <windows.h> 8 | #else 9 | # include <unistd.h> 10 | #endif 11 | 12 | // Task callback function. Will be called with index = 0..999 13 | void my_task(uint32_t index, void *payload) { 14 | printf("Worker thread %u: work unit %u\n", pool_thread_id(), index); 15 | 16 | // Sleep for a bit 17 | #if defined(_WIN32) 18 | Sleep(500); 19 | #else 20 | usleep(500000); 21 | #endif 22 | 23 | // Use payload to communicate some data to the caller 24 | ((uint32_t *) payload)[index] = index; 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | (void) argc; (void) argv; // Command line arguments unused 29 | 30 | uint32_t temp[10000]; 31 | 32 | memset(temp, 0, sizeof(int) * 1000); 33 | 34 | // Create a pool with 100 worker threads 35 | Pool *pool = pool_create(100, 0); 36 | 37 | // Synchronous interface: submit a task and wait for it to complete 38 | task_submit_and_wait( 39 | pool, 40 | 1000, // How many work units does this task contain? 41 | my_task, // Function to be executed 42 | temp // Optional payload, will be passed to function 43 | ); 44 | 45 | // .. contents of 'temp' are now ready .. 46 | for (uint32_t i = 0; i < 1000; ++i) { 47 | if (temp[i] != i) { 48 | fprintf(stderr, "Test failed!\n"); 49 | abort(); 50 | } 51 | } 52 | 53 | // Clean up used resources 54 | pool_destroy(pool); 55 | } 56 | -------------------------------------------------------------------------------- /tests/test_02.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #if defined(_WIN32) 6 | # include <windows.h> 7 | #else 8 | # include <unistd.h> 9 | #endif 10 | 11 | 12 | int main(int, char**) { 13 | uint32_t temp[10000]; 14 | 15 | memset(temp, 0, sizeof(int) * 1000); 16 | 17 | // Create a pool with 100 worker threads 18 | Pool *pool = pool_create(100); 19 | 20 | Task *task = drjit::parallel_for_async( 21 | drjit::blocked_range<uint32_t>(0, 1000, 1), 22 | 23 | // Task callback function.
Will be called with index = 0..999 24 | [&](drjit::blocked_range<uint32_t> range) { 25 | for (uint32_t i = range.begin(); i != range.end(); ++i) { 26 | printf("Worker thread %u: work unit %u\n", pool_thread_id(), i); 27 | 28 | // Sleep for a bit 29 | #if defined(_WIN32) 30 | Sleep(500); 31 | #else 32 | usleep(500000); 33 | #endif 34 | 35 | // Store some data that is communicated back to the caller 36 | temp[i] = i; 37 | } 38 | }, 39 | 40 | {}, pool 41 | ); 42 | 43 | // Wait for the asynchronous task to complete 44 | task_wait(task); 45 | 46 | // .. contents of 'temp' are now ready .. 47 | for (uint32_t i = 0; i < 1000; ++i) { 48 | if (temp[i] != i) { 49 | fprintf(stderr, "Test failed!\n"); 50 | abort(); 51 | } 52 | } 53 | 54 | // Clean up used resources 55 | pool_destroy(pool); 56 | } 57 | -------------------------------------------------------------------------------- /tests/test_03.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | Task *tetranacci(Pool *pool, uint32_t i, uint32_t *out) { 5 | if (i < 4) { 6 | *out = (i == 3) ? 1 : 0; 7 | return nullptr; 8 | } 9 | 10 | uint32_t *tmp = new uint32_t[4]; 11 | 12 | Task *task[4] = { 13 | tetranacci(pool, i - 1, tmp), 14 | tetranacci(pool, i - 2, tmp + 1), 15 | tetranacci(pool, i - 3, tmp + 2), 16 | tetranacci(pool, i - 4, tmp + 3) 17 | }; 18 | 19 | Task *rv = drjit::do_async( 20 | [tmp, out]() { 21 | *out = tmp[0] + tmp[1] + tmp[2] + tmp[3]; 22 | delete[] tmp; 23 | }, { task[0], task[1], task[2], task[3] }, 24 | pool 25 | ); 26 | 27 | task_release(task[0]); 28 | task_release(task[1]); 29 | task_release(task[2]); 30 | task_release(task[3]); 31 | 32 | return rv; 33 | } 34 | 35 | Task * tetranacci_2(Pool *pool, uint32_t i, uint32_t *out) { 36 | if (i < 4) { 37 | *out = (i == 3) ?
1 : 0; 38 | return nullptr; 39 | } 40 | 41 | return drjit::do_async( 42 | [pool, i, out]() { 43 | uint32_t tmp[4]; 44 | Task *task[4]; 45 | 46 | for (int k = 0; k < 4; ++k) 47 | task[k] = tetranacci_2(pool, i - k - 1, tmp + k); 48 | for (int k = 0; k < 4; ++k) 49 | task_wait_and_release(task[k]); 50 | 51 | *out = tmp[0] + tmp[1] + tmp[2] + tmp[3]; 52 | }, {}, pool 53 | ); 54 | } 55 | 56 | int main(int, char**) { 57 | // Test pools with an increasing number of worker threads 58 | for (int i = 0; i < 100; ++i) { 59 | printf("Testing with %i threads..\n", i); 60 | Pool *pool = pool_create(i); 61 | 62 | uint32_t out = 0; 63 | Task *task = tetranacci(pool, 16, &out); 64 | task_wait_and_release(task); 65 | if (out != 2872) 66 | abort(); 67 | 68 | task = tetranacci_2(pool, 16, &out); 69 | task_wait_and_release(task); 70 | if (out != 2872) 71 | abort(); 72 | 73 | // Clean up used resources 74 | pool_destroy(pool); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /tests/test_04.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #if defined(_WIN32) 5 | # include <windows.h> 6 | #else 7 | # include <unistd.h> 8 | #endif 9 | 10 | void my_sleep(uint32_t amt) { 11 | #if defined(_WIN32) 12 | Sleep(amt); 13 | #else 14 | usleep(amt * 1000); 15 | #endif 16 | } 17 | 18 | namespace dr = drjit; 19 | 20 | void test01() { 21 | try { 22 | dr::parallel_for( 23 | dr::blocked_range<uint32_t>(0, 1000, 5), 24 | [](dr::blocked_range<uint32_t> /* range */) { 25 | throw std::runtime_error("Hello world!"); 26 | } 27 | ); 28 | } catch (std::exception &e) { 29 | printf("Test 1: success: %s\n", e.what()); 30 | return; 31 | } 32 | abort(); 33 | } 34 | 35 | void test02(bool wait) { 36 | auto work1 = dr::parallel_for_async( 37 | dr::blocked_range<uint32_t>(0, 10, 1), 38 | [](dr::blocked_range<uint32_t> /* range */) { 39 | my_sleep(10); 40 | throw std::runtime_error("Hello world!"); 41 | } 42 | ); 43 | 44 | if (wait) 45 | my_sleep(100); 46 | 47 | auto work2 = dr::parallel_for_async( 48 | dr::blocked_range<uint32_t>(0, 10, 1), 49 | [](dr::blocked_range<uint32_t> /* range */) { 50 | printf("Should never get here!\n"); 51 | abort(); 52 | }, 53 | { work1 } 54 | ); 55 | 56 | task_release(work1); 57 | 58 | try { 59 | task_wait_and_release(work2); 60 | } catch (std::exception &e) { 61 | printf("Test 2: success: %s\n", e.what()); 62 | return; 63 | } 64 | abort(); 65 | } 66 | 67 | int main(int, char**) { 68 | test01(); 69 | test02(false); 70 | test02(true); 71 | } 72 | --------------------------------------------------------------------------------
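The following standalone sketch combines the pieces exercised by the tests above — pool_create(), drjit::parallel_for_async(), a dependent drjit::do_async(), task_release()/task_wait_and_release(), and pool_destroy() — into one minimal program. It relies only on API calls that already appear in tests/test_01.c through tests/test_04.cpp; the pool size of 4, the block size of 16, and the array contents are arbitrary illustration choices rather than anything prescribed by the repository.

#include <nanothread/nanothread.h>
#include <cstdio>

namespace dr = drjit;

int main(int, char **) {
    // Pool with 4 worker threads
    Pool *pool = pool_create(4);

    uint32_t data[1000];

    // Asynchronously fill 'data' in blocks of 16 work units
    Task *fill = dr::parallel_for_async(
        dr::blocked_range<uint32_t>(0, 1000, 16),
        [&](dr::blocked_range<uint32_t> range) {
            for (uint32_t i = range.begin(); i != range.end(); ++i)
                data[i] = i;
        },
        {}, pool
    );

    // This task is scheduled only once 'fill' has completed
    Task *report = dr::do_async(
        [&]() { printf("data[999] = %u\n", data[999]); },
        { fill }, pool
    );

    // Drop the handle to 'fill', wait for 'report', then shut down
    task_release(fill);
    task_wait_and_release(report);
    pool_destroy(pool);
}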