├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE-APACHEv2 ├── LICENSE-MIT ├── README.md ├── benchmarks ├── bouncing_producer_consumer │ ├── README.md │ └── taskpool_bpc.nim ├── dfs │ └── taskpool_dfs.nim ├── fibonacci │ ├── README.md │ ├── stdnim_fib.nim │ └── taskpool_fib.nim ├── heat │ ├── stdnim_heat.nim │ └── taskpool_heat.nim ├── matmul_cache_oblivious │ ├── README.md │ └── taskpool_matmul_co.nim ├── nqueens │ ├── stdnim_nqueens.nim │ └── taskpool_nqueens.nim ├── resources.nim ├── single_task_producer │ ├── README.md │ └── taskpool_spc.nim ├── wtime.h └── wtime.nim ├── doc └── README.md ├── examples ├── e01_simple_tasks.nim └── e02_parallel_pi.nim ├── papers ├── Chase-Lev - Dynamic Circular Work-Stealing Deque.pdf └── Nhat Minh Le et al - Correct and Efficient Work-Stealing for Weak Memory Models.pdf ├── taskpools.nim ├── taskpools.nimble └── taskpools ├── ast_utils.nim ├── channels_spsc_single.nim ├── chase_lev_deques.nim ├── event_notifiers.nim ├── flowvars.nim ├── instrumentation ├── contracts.nim └── loggers.nim ├── primitives ├── allocs.nim ├── barriers.md ├── barriers.nim ├── barriers_macos.nim ├── barriers_posix.nim └── barriers_windows.nim ├── sparsesets.nim ├── taskpools.nim └── tasks.nim /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - stable 6 | - unstable 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | uses: status-im/nimbus-common-workflow/.github/workflows/common.yml@main 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nimcache/ 2 | 3 | # Executables shall be put in an ignored build/ directory 4 | build/ 5 | nimble.develop 6 | nimble.paths 7 | -------------------------------------------------------------------------------- /LICENSE-APACHEv2: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021-Present Status Research & Development GmbH 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021-Present Status Research & Development GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Taskpools 2 | 3 | This implements a lightweight, energy-efficient, easily auditable multithreaded taskpool. 4 | 5 | This taskpool will be used in a highly security-sensitive blockchain application 6 | targeted at resource-restricted devices hence desirable properties are: 7 | 8 | - Ease of auditing and maintenance. 9 | - Formally verified synchronization primitives are highly-sought after. 10 | - Otherwise primitives are implemented from papers or ported from proven codebases 11 | that can serve as reference for auditors. 12 | - Resource-efficient. Threads spin down to save power, low memory use. 13 | - Decent performance and scalability. The CPU should spend its time processing user workloads 14 | and not dealing with threadpool contention, latencies and overheads. 15 | 16 | ## Example usage 17 | 18 | ```Nim 19 | # Demo of API using a very inefficient π approximation algorithm. 20 | 21 | import 22 | std/[strutils, math, cpuinfo], 23 | taskpools 24 | 25 | # From https://github.com/nim-lang/Nim/blob/v1.6.2/tests/parallel/tpi.nim 26 | # Leibniz Formula https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80 27 | proc term(k: int): float = 28 | if k mod 2 == 1: 29 | -4'f / float(2*k + 1) 30 | else: 31 | 4'f / float(2*k + 1) 32 | 33 | proc piApprox(tp: Taskpool, n: int): float = 34 | var pendingFuts = newSeq[FlowVar[float]](n) 35 | for k in 0 ..< pendingFuts.len: 36 | pendingFuts[k] = tp.spawn term(k) # Schedule a task on the threadpool and return a handle to retrieve the result. 
37 | for k in 0 ..< pendingFuts.len: 38 | result += sync pendingFuts[k] # Block until the result is available. 39 | 40 | proc main() = 41 | var n = 1_000_000 42 | var nthreads = countProcessors() 43 | 44 | var tp = Taskpool.new(num_threads = nthreads) # Default to the number of hardware threads. 45 | 46 | echo formatFloat(tp.piApprox(n)) 47 | 48 | tp.syncAll() # Block until all pending tasks are processed (implied in tp.shutdown()) 49 | tp.shutdown() 50 | 51 | # Compile with nim c -r -d:release --threads:on --outdir:build example.nim 52 | main() 53 | ``` 54 | 55 | ## API 56 | 57 | The API follows the spec proposed here https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api 58 | 59 | The following types and procedures are exposed: 60 | 61 | - Taskpool: 62 | - ```Nim 63 | type Taskpool* = ptr object 64 | ## A taskpool schedules procedures to be executed in parallel 65 | ``` 66 | - ```Nim 67 | proc new(T: type Taskpool, numThreads = countProcessors()): T 68 | ## Initialize a threadpool that manages `numThreads` threads. 69 | ## Default to the number of logical processors available. 70 | ``` 71 | - ```Nim 72 | proc syncAll*(pool: Taskpool) = 73 | ## Blocks until all pending tasks are completed. 74 | ## 75 | ## This MUST only be called from 76 | ## the root thread that created the taskpool 77 | ``` 78 | - ```Nim 79 | proc shutdown*(tp: var TaskPool) = 80 | ## Wait until all tasks are completed and then shut down the taskpool. 81 | ## 82 | ## This MUST only be called from 83 | ## the root scope that created the taskpool. 84 | ``` 85 | - ```Nim 86 | macro spawn*(tp: TaskPool, fnCall: typed): untyped = 87 | ## Spawns the input function call asynchronously, potentially on another thread of execution. 88 | ## 89 | ## If the function call returns a result, spawn will wrap it in a Flowvar. 90 | ## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar. 
91 | ## You can use `isReady` to check if result is available and if subsequent 92 | ## `sync` returns immediately. 93 | ## 94 | ## Tasks are processed approximately in Last-In-First-Out (LIFO) order 95 | ``` 96 | In practice the signature is one of the following 97 | ```Nim 98 | proc spawn*(tp: TaskPool, fnCall(args) -> T): Flowvar[T] 99 | proc spawn*(tp: TaskPool, fnCall(args) -> void): void 100 | ``` 101 | - Flowvar, a handle on an asynchronous computation scheduled on the threadpool 102 | - ```Nim 103 | type Flowvar*[T] = object 104 | ## A Flowvar is a placeholder for a future result that may be computed in parallel 105 | ``` 106 | - ```Nim 107 | func isSpawned*(fv: Flowvar): bool = 108 | ## Returns true if a flowvar is spawned 109 | ## This may be useful for recursive algorithms that 110 | ## may or may not spawn a flowvar depending on a condition. 111 | ## This is similar to Option or Maybe types 112 | ``` 113 | - ```Nim 114 | func isReady*[T](fv: Flowvar[T]): bool = 115 | ## Returns true if the result of a Flowvar is ready. 116 | ## In that case `sync` will not block. 117 | ## Otherwise the current thread will block to help on all the pending tasks 118 | ## until the Flowvar is ready. 119 | ``` 120 | - ```Nim 121 | proc sync*[T](fv: sink Flowvar[T]): T = 122 | ## Blocks the current thread until the flowvar is available 123 | ## and returned. 124 | ## The thread is not idle and will complete pending tasks. 125 | ``` 126 | 127 | ### Non-goals 128 | 129 | The following are non-goals: 130 | 131 | - Supporting GC-ed types with Nim default GC (sequences and strings). Use no GC, or --gc:arc, --gc:orc or --gc:boehm (any GC that doesn't have thread-local heaps), instead. 
132 | - Having async-awaitable tasks 133 | - Running on environments without dynamic memory allocation 134 | - High-Performance Computing specificities (distribution on many machines or GPUs or machines with 200+ cores or multi-sockets) 135 | 136 | ### Comparison with Weave 137 | 138 | Compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs: 139 | - Taskpools only provide spawn/sync (task parallelism).\ 140 | There is no (extremely) optimized parallel for (data parallelism)\ 141 | or precise in/out dependencies (events / dataflow parallelism). 142 | - Weave can handle trillions of small tasks that require only 10µs per task. (Load Balancing overhead) 143 | - Weave maintains an adaptive memory pool to reduce memory allocation overhead, 144 | Taskpools allocations are as-needed. (Scheduler overhead) 145 | 146 | ## License 147 | 148 | Licensed and distributed under either of 149 | 150 | * MIT license: [LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT 151 | * Apache License, Version 2.0, ([LICENSE-APACHEv2](LICENSE-APACHEv2) or http://www.apache.org/licenses/LICENSE-2.0) 152 | 153 | at your option. This file may not be copied, modified, or distributed except according to those terms. 154 | -------------------------------------------------------------------------------- /benchmarks/bouncing_producer_consumer/README.md: -------------------------------------------------------------------------------- 1 | # BPC (Bouncing Producer-Consumer) 2 | 3 | From [tasking-2.0](https://github.com/aprell/tasking-2.0) description 4 | 5 | > **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far 6 | > as I know, first described by [Dinan et al][1]. There are two types of 7 | > tasks, producer and consumer tasks. Each producer task creates another 8 | > producer task followed by *n* consumer tasks, until a certain depth *d* is 9 | > reached. Consumer tasks run for *t* microseconds. 
The smaller the values of 10 | > *n* and *t*, the harder it becomes to exploit the available parallelism. A 11 | > solid contender for the most antagonistic microbenchmark. 12 | -------------------------------------------------------------------------------- /benchmarks/bouncing_producer_consumer/taskpool_bpc.nim: -------------------------------------------------------------------------------- 1 | import 2 | # STD lib 3 | os, strutils, system/ansi_c, cpuinfo, strformat, math, 4 | # Library 5 | ../../taskpools, 6 | # bench 7 | ../wtime, ../resources 8 | 9 | var 10 | Depth: int32 # For example 10000 11 | NumTasksPerDepth: int32 # For example 9 12 | # The total number of tasks in the BPC benchmark is 13 | # (NumTasksPerDepth + 1) * Depth 14 | NumTasksTotal: int32 15 | TaskGranularity: int32 # in microseconds 16 | PollInterval: float64 # in microseconds 17 | 18 | tp: Taskpool 19 | 20 | var global_poll_elapsed {.threadvar.}: float64 21 | 22 | template dummy_cpt(): untyped = 23 | # Dummy computation 24 | # Calculate fib(30) iteratively 25 | var 26 | fib = 0 27 | f2 = 0 28 | f1 = 1 29 | for i in 2 .. 
30: 30 | fib = f1 + f2 31 | f2 = f1 32 | f1 = fib 33 | 34 | proc bpc_consume(usec: int32) = 35 | 36 | var pollElapsed = 0'f64 37 | 38 | let start = wtime_usec() 39 | let stop = usec.float64 40 | global_poll_elapsed = PollInterval 41 | 42 | while true: 43 | var elapsed = wtime_usec() - start 44 | elapsed -= pollElapsed 45 | if elapsed >= stop: 46 | break 47 | 48 | dummy_cpt() 49 | 50 | # if elapsed >= global_poll_elapsed: 51 | # let pollStart = wtime_usec() 52 | # loadBalance(Weave) 53 | # pollElapsed += wtime_usec() - pollStart 54 | # global_poll_elapsed += PollInterval 55 | 56 | proc bpc_consume_nopoll(usec: int32) = 57 | 58 | let start = wtime_usec() 59 | let stop = usec.float64 60 | 61 | while true: 62 | var elapsed = wtime_usec() - start 63 | if elapsed >= stop: 64 | break 65 | 66 | dummy_cpt() 67 | 68 | proc bpc_produce(n, d: int32) {.gcsafe, raises: [].} = 69 | if d > 0: 70 | # Create producer task 71 | tp.spawn bpc_produce(n, d-1) 72 | else: 73 | return 74 | 75 | # Followed by n consumer tasks 76 | for i in 0 ..< n: 77 | tp.spawn bpc_consume(TaskGranularity) 78 | 79 | proc main() = 80 | Depth = 10000 81 | NumTasksPerDepth = 999 82 | TaskGranularity = 1 83 | PollInterval = TaskGranularity.float64 # default, set early so the "default config" message below reports the effective polling interval instead of 0.0 84 | if paramCount() == 0: 85 | let exeName = getAppFilename().extractFilename() 86 | echo &"Usage: {exeName} <depth: {Depth}> " & 87 | &"<# of tasks per depth: {NumTasksPerDepth}> " & 88 | &"[task granularity (us): {TaskGranularity}] " & 89 | &"[polling interval (us): task granularity]" 90 | echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}" 91 | if paramCount() >= 1: 92 | Depth = paramStr(1).parseInt.int32 93 | if paramCount() >= 2: 94 | NumTasksPerDepth = paramStr(2). parseInt.int32 95 | if paramCount() >= 3: 96 | TaskGranularity = paramStr(3). 
parseInt.int32 97 | if paramCount() == 4: 98 | PollInterval = paramStr(4).parseInt.float64 99 | else: 100 | PollInterval = TaskGranularity.float64 101 | if paramCount() > 4: 102 | let exeName = getAppFilename().extractFilename() 103 | echo &"Usage: {exeName} <depth: {Depth}> " & 104 | &"<# of tasks per depth: {NumTasksPerDepth}> " & 105 | &"[task granularity (us): {TaskGranularity}] " & 106 | &"[polling interval (us): task granularity]" 107 | quit 1 108 | 109 | NumTasksTotal = (NumTasksPerDepth + 1) * Depth 110 | 111 | var nthreads: int 112 | if existsEnv"TASKPOOL_NUM_THREADS": 113 | nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt() 114 | else: 115 | nthreads = countProcessors() 116 | 117 | tp = Taskpool.new(numThreads = nthreads) 118 | 119 | # measure overhead during tasking 120 | var ru: Rusage 121 | getrusage(RusageSelf, ru) 122 | var 123 | rss = ru.ru_maxrss 124 | flt = ru.ru_minflt 125 | 126 | let start = wtime_msec() 127 | 128 | bpc_produce(NumTasksPerDepth, Depth) 129 | tp.syncAll() 130 | 131 | let stop = wtime_msec() 132 | 133 | getrusage(RusageSelf, ru) 134 | rss = ru.ru_maxrss - rss 135 | flt = ru.ru_minflt - flt 136 | 137 | tp.shutdown() 138 | 139 | echo "--------------------------------------------------------------------------" 140 | echo "Scheduler: Taskpool" 141 | echo "Benchmark: BPC (Bouncing Producer-Consumer)" 142 | echo "Threads: ", nthreads 143 | echo "Time(ms) ", round(stop - start, 3) 144 | echo "Max RSS (KB): ", ru.ru_maxrss 145 | echo "Runtime RSS (KB): ", rss 146 | echo "# of page faults: ", flt 147 | echo "--------------------------------------------------------------------------" 148 | echo "# of tasks: ", NumTasksTotal 149 | echo "# of tasks/depth: ", NumTasksPerDepth 150 | echo "Depth: ", Depth 151 | echo "Task granularity (us): ", TaskGranularity 152 | echo "Polling / manual load balancing interval (us): ", PollInterval 153 | 154 | quit 0 155 | 156 | main() 157 | -------------------------------------------------------------------------------- 
/benchmarks/dfs/taskpool_dfs.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | import 9 | # Stdlib 10 | system/ansi_c, strformat, os, strutils, cpuinfo, 11 | # Library 12 | ../../taskpools 13 | 14 | when not defined(windows): 15 | # bench 16 | import ../wtime 17 | 18 | var tp: Taskpool 19 | 20 | proc dfs(depth, breadth: int): uint32 {.gcsafe, raises: [].} = 21 | if depth == 0: 22 | return 1 23 | 24 | # We could use alloca to avoid heap allocation here 25 | var sums = newSeq[Flowvar[uint32]](breadth) 26 | 27 | for i in 0 ..< breadth: 28 | sums[i] = tp.spawn dfs(depth - 1, breadth) 29 | 30 | for i in 0 ..< breadth: 31 | result += sync(sums[i]) 32 | 33 | proc test(depth, breadth: int): uint32 = 34 | result = sync tp.spawn dfs(depth, breadth) 35 | 36 | proc main() = 37 | 38 | var 39 | depth = 8 40 | breadth = 8 41 | answer: uint32 42 | nthreads: int 43 | 44 | if existsEnv"TP_NUM_THREADS": 45 | nthreads = getEnv"TP_NUM_THREADS".parseInt() 46 | else: 47 | nthreads = countProcessors() 48 | 49 | if paramCount() == 0: 50 | let exeName = getAppFilename().extractFilename() 51 | echo &"Usage: {exeName} <depth: {depth}> <breadth: {breadth}>" 52 | echo &"Running with default config depth = {depth} and breadth = {breadth}" 53 | 54 | if paramCount() >= 1: 55 | depth = paramStr(1).parseInt() 56 | if paramCount() == 2: 57 | breadth = paramStr(2).parseInt() 58 | if paramCount() > 2: 59 | let exeName = getAppFilename().extractFilename() 60 | echo &"Usage: {exeName} <depth: {depth}> <breadth: {breadth}>" 61 | echo &"Up to 2 parameters are valid. 
Received {paramCount()}" 62 | quit 1 63 | 64 | # Staccato benches runtime init and exit as well 65 | when not defined(windows): 66 | let start = wtime_usec() 67 | 68 | tp = Taskpool.new(numThreads = nthreads) # honor TP_NUM_THREADS so the "Threads:" line below matches the pool that actually ran 69 | answer = test(depth, breadth) 70 | tp.shutdown() 71 | 72 | when not defined(windows): 73 | let stop = wtime_usec() 74 | 75 | echo "Scheduler: Taskpool" 76 | echo "Benchmark: dfs" 77 | echo "Threads: ", nthreads 78 | when not defined(windows): 79 | echo "Time(us) ", stop - start 80 | echo "Output: ", answer 81 | 82 | quit 0 83 | 84 | main() 85 | -------------------------------------------------------------------------------- /benchmarks/fibonacci/README.md: -------------------------------------------------------------------------------- 1 | # Fibonacci benchmarks 2 | 3 | ⚠️ Disclaimer: 4 | Please don't use parallel fibonacci in production! 5 | Use the fast doubling method with memoization instead. 6 | 7 | Fibonacci benchmark has 3 draws: 8 | 9 | 1. It's very simple to implement 10 | 2. It's unbalanced and efficiency requires distributions to avoid idle cores. 11 | 3. It's a very effective scheduler overhead benchmark, because the basic task is very trivial and the task spawning grows at 2^n scale. 12 | 13 | Want to know the difference between low and high overhead? 
14 | 15 | Run the following C code (taken from [Oracle OpenMP example](https://docs.oracle.com/cd/E19205-01/820-7883/girtd/index.html)) 16 | 17 | ```C 18 | #include <stdio.h> 19 | #include <omp.h> 20 | int fib(int n) 21 | { 22 | int i, j; 23 | if (n<2) 24 | return n; 25 | else 26 | { 27 | #pragma omp task shared(i) firstprivate(n) 28 | { 29 | i=fib(n-1); 30 | } 31 | 32 | j=fib(n-2); 33 | #pragma omp taskwait 34 | return i+j; 35 | } 36 | } 37 | 38 | int main() 39 | { 40 | int n = 40; 41 | 42 | #pragma omp parallel shared(n) 43 | { 44 | #pragma omp single 45 | printf ("fib(%d) = %d\n", n, fib(n)); 46 | } 47 | } 48 | ``` 49 | 50 | First compile with Clang and run it 51 | ``` 52 | clang -O3 -fopenmp benchmarks/fibonacci/omp_fib.c 53 | time a.out 54 | ``` 55 | It should be fairly quick 56 | 57 | 58 | Then compile with GCC and run it 59 | ``` 60 | gcc -O3 -fopenmp benchmarks/fibonacci/omp_fib.c 61 | time a.out 62 | ``` 63 | 64 | Notice how some cores get idle as time goes on? 65 | Don't forget to kill the benchmark, you'll be there all day. 66 | 67 | What's happening? 68 | 69 | GCC's OpenMP implementation uses a single queue for all tasks. 70 | That queue gets constantly hammered by all threads and becomes a contention point. 71 | Furthermore, it seems like there is no load balancing or that due to the contention/lock 72 | threads are descheduled. 73 | 74 | However, Clang's implementation uses a work-stealing scheduler with one deque per thread. 75 | The only contention happens when a thread runs out of work and has to look for more work, 76 | in the deques of other threads. And which thread to check is chosen at random so 77 | the potential contention is distributed among all threads instead of a single structure. 
78 | -------------------------------------------------------------------------------- /benchmarks/fibonacci/stdnim_fib.nim: -------------------------------------------------------------------------------- 1 | import 2 | # STD lib 3 | os, strutils, threadpool, strformat, 4 | # bench 5 | ../wtime 6 | 7 | # Using Nim's standard threadpool 8 | # Compile with "nim c --threads:on -d:release -d:danger --outdir:build benchmarks/fibonacci/stdnim_fib.nim" 9 | # 10 | # Note: it breaks at fib 16. 11 | 12 | proc parfib(n: uint64): uint64 = 13 | if n < 2: # Note: be sure to compare n<2 -> return n 14 | return n # instead of n<=2 -> return 1 15 | 16 | let x = spawn parfib(n-1) 17 | let y = parfib(n-2) 18 | 19 | return ^x + y 20 | 21 | proc main() = 22 | if paramCount() != 1: 23 | echo "Usage: fib " 24 | quit 0 25 | 26 | let n = paramStr(1).parseUInt.uint64 27 | 28 | let start = wtime_msec() 29 | let f = parfib(n) 30 | let stop = wtime_msec() 31 | 32 | echo "Result: ", f 33 | echo &"Elapsed wall time: {stop-start:.2} ms" 34 | 35 | main() 36 | -------------------------------------------------------------------------------- /benchmarks/fibonacci/taskpool_fib.nim: -------------------------------------------------------------------------------- 1 | import 2 | # STD lib 3 | os, strutils, cpuinfo, strformat, math, 4 | # Library 5 | ../../taskpools 6 | 7 | when not defined(windows): 8 | # bench 9 | import ../wtime, ../resources 10 | 11 | var tp: Taskpool 12 | 13 | proc fib(n: int): int = 14 | # int64 on x86-64 15 | if n < 2: 16 | return n 17 | 18 | let x = tp.spawn fib(n-1) 19 | let y = fib(n-2) 20 | 21 | result = sync(x) + y 22 | 23 | proc main() = 24 | var n = 40 25 | var nthreads: int 26 | 27 | if paramCount() == 0: 28 | let exeName = getAppFilename().extractFilename() 29 | echo &"Usage: {exeName} " 30 | echo &"Running with default n = {n}" 31 | elif paramCount() == 1: 32 | n = paramStr(1).parseInt 33 | else: 34 | let exeName = getAppFilename().extractFilename() 35 | echo &"Usage: 
{exeName} " 36 | quit 1 37 | 38 | if existsEnv"TP_NUM_THREADS": 39 | nthreads = getEnv"TP_NUM_THREADS".parseInt() 40 | else: 41 | nthreads = countProcessors() 42 | 43 | tp = Taskpool.new(numThreads = nthreads) # size the pool with the thread count that is reported below 44 | 45 | # measure overhead during tasking 46 | when not defined(windows): 47 | var ru: Rusage 48 | getrusage(RusageSelf, ru) 49 | var 50 | rss = ru.ru_maxrss 51 | flt = ru.ru_minflt 52 | 53 | let start = wtime_msec() 54 | let f = fib(n) 55 | 56 | when not defined(windows): 57 | let stop = wtime_msec() 58 | 59 | tp.shutdown() 60 | 61 | when not defined(windows): 62 | getrusage(RusageSelf, ru) 63 | rss = ru.ru_maxrss - rss 64 | flt = ru.ru_minflt - flt 65 | 66 | echo "--------------------------------------------------------------------------" 67 | echo "Scheduler: Taskpool" 68 | echo "Benchmark: Fibonacci" 69 | echo "Threads: ", nthreads 70 | when not defined(windows): 71 | echo "Time(ms) ", round(stop - start, 3) 72 | echo "Max RSS (KB): ", ru.ru_maxrss 73 | echo "Runtime RSS (KB): ", rss 74 | echo "# of page faults: ", flt 75 | echo "--------------------------------------------------------------------------" 76 | echo "n requested: ", n 77 | echo "result: ", f 78 | 79 | main() 80 | -------------------------------------------------------------------------------- /benchmarks/heat/stdnim_heat.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 
7 | 8 | # From fibril 9 | # 10 | # Original license 11 | # 12 | # /* 13 | # * Heat diffusion (Jacobi-type iteration) 14 | # * 15 | # * Volker Strumpen, Boston August 1996 16 | # * 17 | # * Copyright (c) 1996 Massachusetts Institute of Technology 18 | # * 19 | # * This program is free software; you can redistribute it and/or modify 20 | # * it under the terms of the GNU General Public License as published by 21 | # * the Free Software Foundation; either version 2 of the License, or 22 | # * (at your option) any later version. 23 | # * 24 | # * This program is distributed in the hope that it will be useful, 25 | # * but WITHOUT ANY WARRANTY; without even the implied warranty of 26 | # * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 27 | # * GNU General Public License for more details. 28 | # * 29 | # * You should have received a copy of the GNU General Public License 30 | # * along with this program; if not, write to the Free Software 31 | # * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 32 | # */ 33 | 34 | import 35 | # Stdlib 36 | strformat, os, strutils, math, system/ansi_c, 37 | cpuinfo, threadpool, 38 | # bench 39 | ../wtime, ../resources 40 | 41 | # This deadlocks :/ 42 | 43 | # Helpers 44 | # ------------------------------------------------------- 45 | 46 | # We need a thin wrapper around raw pointers for matrices, 47 | # we can't pass "var seq[seq[float64]]" to other threads 48 | # nor "var" for that matter 49 | type 50 | Matrix[T] = object 51 | buffer: ptr UncheckedArray[T] 52 | m, n: int 53 | 54 | Row[T] = object 55 | buffer: ptr UncheckedArray[T] 56 | len: int 57 | 58 | func newMatrix[T](m, n: int): Matrix[T] {.inline.} = 59 | result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T))) 60 | result.m = m 61 | result.n = n 62 | 63 | template `[]`[T](mat: Matrix[T], row, col: Natural): T = 64 | # row-major storage 65 | assert row < mat.m 66 | assert col < mat.n 67 | mat.buffer[row * mat.n + col] 68 | 
69 | template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) = 70 | assert row < mat.m 71 | assert col < mat.n 72 | mat.buffer[row * mat.n + col] = value 73 | 74 | func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} = 75 | # row-major storage: row `rowIdx` starts at offset rowIdx * n and holds n elements (one per column) 76 | assert rowIdx < mat.m 77 | result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr) 78 | result.len = mat.n # row length is the column count, not the row count 79 | 80 | template `[]`[T](row: Row[T], idx: Natural): T = 81 | assert idx < row.len 82 | row.buffer[idx] 83 | 84 | template `[]=`[T](row: Row[T], idx: Natural, value: T) = 85 | assert idx < row.len 86 | row.buffer[idx] = value 87 | 88 | func delete[T](mat: sink Matrix[T]) = 89 | c_free(mat.buffer) 90 | 91 | # And an auto converter for int32 -> float64 so we don't have to convert 92 | # all i, j indices manually 93 | 94 | converter i32toF64(x: int32): float64 {.inline.} = 95 | float64(x) 96 | 97 | # ------------------------------------------------------- 98 | 99 | template f(x, y: SomeFloat): SomeFloat = 100 | sin(x) * sin(y) 101 | 102 | template randa[T: SomeFloat](x, t: T): T = 103 | T(0.0) 104 | 105 | proc randb(x, t: SomeFloat): SomeFloat {.inline.} = 106 | # proc instead of template to avoid Nim constant folding bug: 107 | # https://github.com/nim-lang/Nim/issues/12783 108 | exp(-2 * t) * sin(x) 109 | 110 | template randc[T: SomeFloat](y, t: T): T = 111 | T(0.0) 112 | 113 | proc randd(y, t: SomeFloat): SomeFloat {.inline.} = 114 | # proc instead of template to avoid Nim constant folding bug: 115 | # https://github.com/nim-lang/Nim/issues/12783 116 | exp(-2 * t) * sin(y) 117 | 118 | template solu(x, y, t: SomeFloat): SomeFloat = 119 | exp(-2 * t) * sin(x) * sin(y) 120 | 121 | const n = 4096'i32 122 | 123 | var 124 | nx, ny, nt: int32 125 | xu, xo, yu, yo, tu, to: float64 126 | 127 | dx, dy, dt: float64 128 | dtdxsq, dtdysq: float64 129 | 130 | odd: Matrix[float64] 131 | even: Matrix[float64] 132 | 133 | proc heat(m: 
Matrix[float64], il, iu: int32): bool {.discardable.}= 134 | # TODO to allow awaiting `heat` we return a dummy bool 135 | # The parallel spawns are updating the same matrix cells otherwise 136 | if iu - il > 1: 137 | let im = (il + iu) div 2 138 | 139 | let h = spawn heat(m, il, im) 140 | heat(m, im, iu) 141 | discard ^h 142 | return true 143 | # ------------------------ 144 | 145 | let i = il 146 | let row = m.getRow(i) 147 | 148 | if i == 0: 149 | for j in 0 ..< ny: 150 | row[j] = randc(yu + j*dy, 0) 151 | elif i == nx - 1: 152 | for j in 0 ..< ny: 153 | row[j] = randd(yu + j*dy, 0) 154 | else: 155 | row[0] = randa(xu + i*dx, 0) 156 | for j in 1 ..< ny - 1: 157 | row[j] = f(xu + i*dx, yu + j*dy) 158 | row[ny - 1] = randb(xu + i*dx, 0) 159 | 160 | proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} = 161 | # TODO to allow awaiting `diffuse` we return a dummy bool 162 | # The parallel spawns are updating the same matrix cells otherwise 163 | if iu - il > 1: 164 | let im = (il + iu) div 2 165 | 166 | let d = spawn diffuse(output, input, il, im, t) 167 | diffuse(output, input, im, iu, t) 168 | discard ^d 169 | return true 170 | # ------------------------ 171 | 172 | let i = il 173 | let row = output.getRow(i) 174 | 175 | if i == 0: 176 | for j in 0 ..< ny: 177 | row[j] = randc(yu + j*dy, t) 178 | elif i == nx - 1: 179 | for j in 0 ..< ny: 180 | row[j] = randd(yu + j*dy, t) 181 | else: 182 | row[0] = randa(xu + i*dx, t) 183 | for j in 1 ..< ny - 1: 184 | row[j] = input[i, j] + # The use of nested sequences here is a bad idea ... 
185 | dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) + 186 | dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j]) 187 | row[ny - 1] = randb(xu + i*dx, t) 188 | 189 | proc initTest() = 190 | nx = n 191 | ny = 1024 192 | nt = 100 193 | xu = 0.0 194 | xo = 1.570796326794896558 195 | yu = 0.0 196 | yo = 1.570796326794896558 197 | tu = 0.0 198 | to = 0.0000001 199 | 200 | dx = (xo - xu) / float64(nx - 1) 201 | dy = (yo - yu) / float64(ny - 1) 202 | dt = (to - tu) / float64(nt) 203 | 204 | dtdxsq = dt / (dx * dx) 205 | dtdysq = dt / (dy * dy) 206 | 207 | even = newMatrix[float64](nx, ny) 208 | odd = newMatrix[float64](nx, ny) 209 | 210 | proc prep() = 211 | heat(even, 0, nx) 212 | 213 | proc test() = 214 | var t = tu 215 | 216 | for _ in countup(1, nt.int, 2): 217 | # nt included 218 | t += dt 219 | diffuse(odd, even, 0, nx, t) 220 | t += dt 221 | diffuse(even, odd, 0, nx, t) 222 | 223 | if nt mod 2 != 0: 224 | t += dt 225 | diffuse(odd, even, 0, nx, t) 226 | 227 | proc verify() = 228 | var 229 | mat: Matrix[float64] 230 | mae: float64 231 | mre: float64 232 | me: float64 233 | 234 | mat = if nt mod 2 != 0: odd else: even 235 | 236 | for a in 0 ..< nx: 237 | for b in 0 ..< ny: 238 | var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to)) 239 | if tmp > 1e-3: 240 | echo "nx: ", nx, " - ny: ", ny 241 | echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to) 242 | quit 1 243 | 244 | me += tmp 245 | if tmp > mae: mae = tmp 246 | if mat[a, b] != 0.0: tmp /= mat[a, b] 247 | if tmp > mre: mre = tmp 248 | 249 | me /= nx * ny 250 | 251 | if mae > 1e-12: 252 | echo &"Local maximal absolute error {mae:1.3e}" 253 | quit 1 254 | if mre > 1e-12: 255 | echo &"Local maximal relative error {mre:1.3e}" 256 | quit 1 257 | if me > 1e-12: 258 | echo &"Global mean absolute error {me:1.3e}" 259 | quit 1 260 | 261 | echo "Verification successful" 262 | 263 | proc main() = 264 | var nthreads: int 265 | nthreads = countProcessors() 266 | 
267 | var ru: Rusage 268 | getrusage(RusageSelf, ru) 269 | var 270 | rss = ru.ru_maxrss 271 | flt = ru.ru_minflt 272 | 273 | initTest() 274 | 275 | prep() 276 | let start = wtime_usec() 277 | test() 278 | let stop = wtime_usec() 279 | 280 | getrusage(RusageSelf, ru) 281 | rss = ru.ru_maxrss - rss 282 | flt = ru.ru_minflt - flt 283 | 284 | sync() 285 | 286 | verify() 287 | delete(even) 288 | delete(odd) 289 | 290 | echo "Scheduler: Nim threadpool (standard lib)" 291 | echo "Benchmark: heat" 292 | echo "Threads: ", nthreads 293 | echo "Time(us) ", stop - start 294 | echo "Max RSS (KB): ", ru.ru_maxrss 295 | echo "Runtime RSS (KB): ", rss 296 | echo "# of page faults: ", flt 297 | 298 | quit 0 299 | 300 | main() 301 | -------------------------------------------------------------------------------- /benchmarks/heat/taskpool_heat.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | {.push raises: [].} 9 | 10 | # From fibril 11 | # 12 | # Original license 13 | # 14 | # /* 15 | # * Heat diffusion (Jacobi-type iteration) 16 | # * 17 | # * Volker Strumpen, Boston August 1996 18 | # * 19 | # * Copyright (c) 1996 Massachusetts Institute of Technology 20 | # * 21 | # * This program is free software; you can redistribute it and/or modify 22 | # * it under the terms of the GNU General Public License as published by 23 | # * the Free Software Foundation; either version 2 of the License, or 24 | # * (at your option) any later version. 
25 | # * 26 | # * This program is distributed in the hope that it will be useful, 27 | # * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | # * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | # * GNU General Public License for more details. 30 | # * 31 | # * You should have received a copy of the GNU General Public License 32 | # * along with this program; if not, write to the Free Software 33 | # * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 34 | # */ 35 | 36 | import 37 | # Stdlib 38 | strformat, os, strutils, math, system/ansi_c, 39 | cpuinfo, 40 | # Taskpools 41 | ../../taskpools 42 | when not defined(windows): 43 | # bench 44 | import ../wtime, ../resources 45 | 46 | # Helpers 47 | # ------------------------------------------------------- 48 | 49 | # We need a thin wrapper around raw pointers for matrices, 50 | # we can't pass "var seq[seq[float64]]" to other threads 51 | # nor "var" for that matter 52 | type 53 | Matrix[T] = object 54 | buffer: ptr UncheckedArray[T] 55 | m, n: int 56 | 57 | Row[T] = object 58 | buffer: ptr UncheckedArray[T] 59 | len: int 60 | 61 | var tp: Taskpool 62 | 63 | func newMatrix[T](m, n: int): Matrix[T] {.inline.} = 64 | result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T))) 65 | result.m = m 66 | result.n = n 67 | 68 | template `[]`[T](mat: Matrix[T], row, col: Natural): T = 69 | # row-major storage 70 | assert row < mat.m 71 | assert col < mat.n 72 | mat.buffer[row * mat.n + col] 73 | 74 | template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) = 75 | assert row < mat.m 76 | assert col < mat.n 77 | mat.buffer[row * mat.n + col] = value 78 | 79 | func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} = 80 | # row-major storage: row `rowIdx` starts at offset rowIdx * n and holds n elements (one per column) 81 | assert rowIdx < mat.m 82 | result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr) 83 | result.len = mat.n # row length is the column count, not the row count 84 | 85 | template 
`[]`[T](row: Row[T], idx: Natural): T = 86 | assert idx < row.len 87 | row.buffer[idx] 88 | 89 | template `[]=`[T](row: Row[T], idx: Natural, value: T) = 90 | assert idx < row.len 91 | row.buffer[idx] = value 92 | 93 | func delete[T](mat: sink Matrix[T]) = 94 | c_free(mat.buffer) 95 | 96 | # And an auto converter for int32 -> float64 so we don't have to convert 97 | # all i, j indices manually 98 | 99 | converter i32toF64(x: int32): float64 {.inline.} = 100 | float64(x) 101 | 102 | # ------------------------------------------------------- 103 | 104 | template f(x, y: SomeFloat): SomeFloat = 105 | sin(x) * sin(y) 106 | 107 | template randa[T: SomeFloat](x, t: T): T = 108 | T(0.0) 109 | 110 | proc randb(x, t: SomeFloat): SomeFloat {.inline.} = 111 | # proc instead of template to avoid Nim constant folding bug: 112 | # https://github.com/nim-lang/Nim/issues/12783 113 | exp(-2 * t) * sin(x) 114 | 115 | template randc[T: SomeFloat](y, t: T): T = 116 | T(0.0) 117 | 118 | proc randd(y, t: SomeFloat): SomeFloat {.inline.} = 119 | # proc instead of template to avoid Nim constant folding bug: 120 | # https://github.com/nim-lang/Nim/issues/12783 121 | exp(-2 * t) * sin(y) 122 | 123 | template solu(x, y, t: SomeFloat): SomeFloat = 124 | exp(-2 * t) * sin(x) * sin(y) 125 | 126 | const n = 4096'i32 127 | 128 | var 129 | nx, ny, nt: int32 130 | xu, xo, yu, yo, tu, to: float64 131 | 132 | dx, dy, dt: float64 133 | dtdxsq, dtdysq: float64 134 | 135 | odd: Matrix[float64] 136 | even: Matrix[float64] 137 | 138 | proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable, gcsafe.}= 139 | # TODO to allow awaiting `heat` we return a dummy bool 140 | # The parallel spawns are updating the same matrix cells otherwise 141 | if iu - il > 1: 142 | let im = (il + iu) div 2 143 | 144 | let h = tp.spawn heat(m, il, im) 145 | heat(m, im, iu) 146 | discard sync(h) 147 | return true 148 | # ------------------------ 149 | 150 | let i = il 151 | let row = m.getRow(i) 152 | 153 | if i == 0: 154 
| for j in 0 ..< ny: 155 | row[j] = randc(yu + j*dy, 0) 156 | elif i == nx - 1: 157 | for j in 0 ..< ny: 158 | row[j] = randd(yu + j*dy, 0) 159 | else: 160 | row[0] = randa(xu + i*dx, 0) 161 | for j in 1 ..< ny - 1: 162 | row[j] = f(xu + i*dx, yu + j*dy) 163 | row[ny - 1] = randb(xu + i*dx, 0) 164 | 165 | proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable, gcsafe.} = 166 | # TODO to allow awaiting `diffuse` we return a dummy bool 167 | # The parallel spawns are updating the same matrix cells otherwise 168 | if iu - il > 1: 169 | let im = (il + iu) div 2 170 | 171 | let d = tp.spawn diffuse(output, input, il, im, t) 172 | diffuse(output, input, im, iu, t) 173 | discard sync(d) 174 | return true 175 | # ------------------------ 176 | 177 | let i = il 178 | let row = output.getRow(i) 179 | 180 | if i == 0: 181 | for j in 0 ..< ny: 182 | row[j] = randc(yu + j*dy, t) 183 | elif i == nx - 1: 184 | for j in 0 ..< ny: 185 | row[j] = randd(yu + j*dy, t) 186 | else: 187 | row[0] = randa(xu + i*dx, t) 188 | for j in 1 ..< ny - 1: 189 | row[j] = input[i, j] + # The use of nested sequences here is a bad idea ... 
190 | dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) + 191 | dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j]) 192 | row[ny - 1] = randb(xu + i*dx, t) 193 | 194 | proc initTest() = 195 | nx = n 196 | ny = 1024 197 | nt = 100 198 | xu = 0.0 199 | xo = 1.570796326794896558 200 | yu = 0.0 201 | yo = 1.570796326794896558 202 | tu = 0.0 203 | to = 0.0000001 204 | 205 | dx = (xo - xu) / float64(nx - 1) 206 | dy = (yo - yu) / float64(ny - 1) 207 | dt = (to - tu) / float64(nt) 208 | 209 | dtdxsq = dt / (dx * dx) 210 | dtdysq = dt / (dy * dy) 211 | 212 | even = newMatrix[float64](nx, ny) 213 | odd = newMatrix[float64](nx, ny) 214 | 215 | proc prep() = 216 | heat(even, 0, nx) 217 | 218 | proc test() = 219 | var t = tu 220 | 221 | for _ in countup(1, nt.int, 2): 222 | # nt included 223 | t += dt 224 | diffuse(odd, even, 0, nx, t) 225 | t += dt 226 | diffuse(even, odd, 0, nx, t) 227 | 228 | if nt mod 2 != 0: 229 | t += dt 230 | diffuse(odd, even, 0, nx, t) 231 | 232 | proc verify() = 233 | var 234 | mat: Matrix[float64] 235 | mae: float64 236 | mre: float64 237 | me: float64 238 | 239 | mat = if nt mod 2 != 0: odd else: even 240 | 241 | for a in 0 ..< nx: 242 | for b in 0 ..< ny: 243 | var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to)) 244 | if tmp > 1e-3: 245 | echo "nx: ", nx, " - ny: ", ny 246 | echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to) 247 | quit 1 248 | 249 | me += tmp 250 | if tmp > mae: mae = tmp 251 | if mat[a, b] != 0.0: tmp /= mat[a, b] 252 | if tmp > mre: mre = tmp 253 | 254 | me /= nx * ny 255 | 256 | try: 257 | if mae > 1e-12: 258 | echo &"Local maximal absolute error {mae:1.3e}" 259 | quit 1 260 | if mre > 1e-12: 261 | echo &"Local maximal relative error {mre:1.3e}" 262 | quit 1 263 | if me > 1e-12: 264 | echo &"Global mean absolute error {me:1.3e}" 265 | quit 1 266 | except ValueError: raiseAssert "format strings" 267 | 268 | echo "Verification successful" 269 | 270 | {.pop.} 271 | 
272 | proc main() = 273 | var nthreads: int 274 | if existsEnv"TASKPOOL_NUM_THREADS": 275 | nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt() 276 | else: 277 | nthreads = countProcessors() 278 | 279 | when not defined(windows): 280 | var ru: Rusage 281 | getrusage(RusageSelf, ru) 282 | var 283 | rss = ru.ru_maxrss 284 | flt = ru.ru_minflt 285 | 286 | initTest() 287 | 288 | # Fibril initializes before benching 289 | tp = Taskpool.new(numThreads = nthreads) 290 | 291 | prep() 292 | when not defined(windows): 293 | let start = wtime_usec() 294 | test() 295 | when not defined(windows): 296 | let stop = wtime_usec() 297 | 298 | getrusage(RusageSelf, ru) 299 | rss = ru.ru_maxrss - rss 300 | flt = ru.ru_minflt - flt 301 | 302 | tp.shutdown() 303 | 304 | verify() 305 | delete(even) 306 | delete(odd) 307 | 308 | echo "Scheduler: Taskpools" 309 | echo "Benchmark: heat" 310 | echo "Threads: ", nthreads 311 | when not defined(windows): 312 | echo "Time(us) ", stop - start 313 | echo "Max RSS (KB): ", ru.ru_maxrss 314 | echo "Runtime RSS (KB): ", rss 315 | echo "# of page faults: ", flt 316 | 317 | quit 0 318 | 319 | main() 320 | -------------------------------------------------------------------------------- /benchmarks/matmul_cache_oblivious/README.md: -------------------------------------------------------------------------------- 1 | # Cache-Oblivious Matrix Multiplication 2 | 3 | From Staccato and Cilk 4 | 5 | https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk 6 | See the paper ``Cache-Oblivious Algorithms'', by 7 | Matteo Frigo, Charles E. Leiserson, Harald Prokop, and 8 | Sridhar Ramachandran, FOCS 1999, for an explanation of 9 | why this algorithm is good for caches. 10 | 11 | Note that the benchmarks output incorrect matrix traces 12 | according to the check ... 
13 | -------------------------------------------------------------------------------- /benchmarks/matmul_cache_oblivious/taskpool_matmul_co.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | # Rectangular matrix multiplication. 9 | # 10 | # Adapted from Cilk 5.4.3 example 11 | # 12 | # https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk; 13 | # See the paper ``Cache-Oblivious Algorithms'', by 14 | # Matteo Frigo, Charles E. Leiserson, Harald Prokop, and 15 | # Sridhar Ramachandran, FOCS 1999, for an explanation of 16 | # why this algorithm is good for caches. 
17 | 18 | import 19 | # Stdlib 20 | strformat, os, strutils, math, system/ansi_c, 21 | cpuinfo, 22 | # Taskpool 23 | ../../taskpools, 24 | # bench 25 | ../wtime, ../resources 26 | 27 | # Helpers 28 | # ------------------------------------------------------- 29 | 30 | # We need a thin wrapper around raw pointers for matrices, 31 | # we can't pass "var" to other threads 32 | type 33 | Matrix[T: SomeFloat] = object 34 | buffer: ptr UncheckedArray[T] 35 | ld: int 36 | 37 | var tp: Taskpool 38 | 39 | func newMatrixNxN[T](n: int): Matrix[T] {.inline.} = 40 | result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t n*n*sizeof(T))) 41 | result.ld = n 42 | 43 | template `[]`[T](mat: Matrix[T], row, col: Natural): T = 44 | # row-major storage 45 | assert row < mat.ld, $row & " < " & $mat.ld 46 | assert col < mat.ld, $col & " < " & $mat.ld 47 | mat.buffer[row * mat.ld + col] 48 | 49 | template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) = 50 | assert row < mat.ld, $row & " < " & $mat.ld 51 | assert col < mat.ld, $col & " < " & $mat.ld 52 | mat.buffer[row * mat.ld + col] = value 53 | 54 | func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T]{.inline.}= 55 | ## Returns a view into `mat` whose element (0, 0) is `mat[row, col]`; the view shares the parent's buffer and leading dimension 56 | result.buffer = cast[ptr UncheckedArray[T]]( 57 | addr mat.buffer[row*mat.ld + col] 58 | ) 59 | result.ld = mat.ld # keep the parent's leading dimension, otherwise the view indexes with ld == 0 60 | func delete[T](mat: sink Matrix[T]) = 61 | c_free(mat.buffer) 62 | 63 | # ------------------------------------------------------- 64 | 65 | proc xorshiftRand(): uint32 = 66 | var x {.global.} = uint32(2463534242) 67 | x = x xor (x shr 13) 68 | x = x xor (x shl 17) 69 | x = x xor (x shr 5) 70 | return x 71 | 72 | func zero[T](A: Matrix[T]) = 73 | # zeroing is not timed 74 | zeroMem(A.buffer, A.ld * A.ld * sizeof(T)) 75 | 76 | proc fill[T](A: Matrix[T]) = 77 | for i in 0 ..< A.ld: 78 | for j in 0 ..< A.ld: 79 | A[i, j] = T(xorshiftRand() mod A.ld.uint32) 80 | 81 | func maxError(A, B: Matrix): float64 = 82 | assert A.ld == B.ld 83 | for i in 0 ..< 
A.ld: 84 | for j in 0 ..< A.ld: 85 | var diff = (A[i, j] - B[i, j]) / A[i, j] 86 | if diff < 0: 87 | diff = -diff 88 | if diff > result: 89 | result = diff 90 | 91 | func check[T](A, B, C: Matrix[T], n: int): bool = 92 | var 93 | tr_C = 0.T 94 | tr_AB = 0.T 95 | for i in 0 ..< n: 96 | for j in 0 ..< n: 97 | tr_AB += A[i, j] * B[j, i] 98 | tr_C += C[i, i] 99 | 100 | # Note, all benchmarks return false ‾\_(ツ)_/‾ 101 | return abs(tr_AB - tr_C) < 1e-3 102 | 103 | proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool = 104 | # The original bench passes around a ``ld`` parameter (leading dimension?), 105 | # we store it in the matrices 106 | # We return a dummy bool to allow waiting on the matmul 107 | 108 | # Threshold 109 | if (m + n + p) <= 64: 110 | if add: 111 | for i in 0 ..< m: 112 | for k in 0 ..< p: 113 | var c = 0.T 114 | for j in 0 ..< n: 115 | c += A[i, j] * B[j, k] 116 | C[i, k] += c 117 | else: 118 | for i in 0 ..< m: 119 | for k in 0 ..< p: 120 | var c = 0.T 121 | for j in 0 ..< n: 122 | c += A[i, j] * B[j, k] 123 | C[i, k] = c 124 | 125 | return 126 | 127 | var h0, h1: FlowVar[bool] 128 | ## Each half of the computation 129 | 130 | # matrix is larger than threshold 131 | if m >= n and n >= p: 132 | let m1 = m shr 1 # divide by 2 133 | h0 = tp.spawn matmul(A, B, C, m1, n, p, add) 134 | h1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add) 135 | elif n >= m and n >= p: 136 | let n1 = n shr 1 # divide by 2 137 | h0 = tp.spawn matmul(A, B, C, m, n1, p, add) 138 | h1 = tp.spawn matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true) 139 | else: 140 | let p1 = p shr 1 141 | h0 = tp.spawn matmul(A, B, C, m, n, p1, add) 142 | h1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add) 143 | 144 | discard sync(h0) 145 | discard sync(h1) 146 | 147 | proc main() = 148 | echo "Warning the benchmark seems to not be correct." 
149 | var 150 | n = 3000 151 | nthreads: int 152 | 153 | if existsEnv"TASKPOOL_NUM_THREADS": 154 | nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt() 155 | else: 156 | nthreads = countProcessors() 157 | 158 | if paramCount() == 0: 159 | let exeName = getAppFilename().extractFilename() 160 | echo &"Usage: {exeName} " 161 | echo &"Running with default config n = {n}" 162 | elif paramCount() == 1: 163 | n = paramStr(1).parseInt() 164 | else: 165 | let exeName = getAppFilename().extractFilename() 166 | echo &"Usage: {exeName} " 167 | echo &"Up to 1 parameter is valid. Received {paramCount()}" 168 | quit 1 169 | 170 | var A = newMatrixNxN[float32](n) 171 | var B = newMatrixNxN[float32](n) 172 | var C = newMatrixNxN[float32](n) 173 | 174 | fill(A) 175 | fill(B) 176 | zero(C) 177 | 178 | var ru: Rusage 179 | getrusage(RusageSelf, ru) 180 | var 181 | rss = ru.ru_maxrss 182 | flt = ru.ru_minflt 183 | 184 | # Staccato benches runtime init and exit as well 185 | let start = wtime_msec() 186 | 187 | tp = Taskpool.new(numThreads = nthreads) 188 | discard sync tp.spawn matmul(A, B, C, n, n, n, add = false) 189 | tp.shutdown() 190 | 191 | let stop = wtime_msec() 192 | 193 | getrusage(RusageSelf, ru) 194 | rss = ru.ru_maxrss - rss 195 | flt = ru.ru_minflt - flt 196 | 197 | echo "Scheduler: Taskpool" 198 | echo "Benchmark: Matrix Multiplication (cache oblivious)" 199 | echo "Threads: ", nthreads 200 | echo "Time(ms) ", stop - start 201 | echo "Max RSS (KB): ", ru.ru_maxrss 202 | echo "Runtime RSS (KB): ", rss 203 | echo "# of page faults: ", flt 204 | echo "Input: ", n 205 | echo "Error: ", check(A, B, C, n) 206 | 207 | delete A 208 | delete B 209 | delete C 210 | 211 | quit 0 212 | 213 | main() 214 | -------------------------------------------------------------------------------- /benchmarks/nqueens/stdnim_nqueens.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and 
distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | # 8 | # Original code licenses 9 | # ------------------------------------------------------------------------------------------------ 10 | # 11 | # /**********************************************************************************************/ 12 | # /* This program is part of the Barcelona OpenMP Tasks Suite */ 13 | # /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ 14 | # /* Copyright (C) 2009 Universitat Politecnica de Catalunya */ 15 | # /* */ 16 | # /* This program is free software; you can redistribute it and/or modify */ 17 | # /* it under the terms of the GNU General Public License as published by */ 18 | # /* the Free Software Foundation; either version 2 of the License, or */ 19 | # /* (at your option) any later version. */ 20 | # /* */ 21 | # /* This program is distributed in the hope that it will be useful, */ 22 | # /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ 23 | # /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ 24 | # /* GNU General Public License for more details. 
*/ 25 | # /* */ 26 | # /* You should have received a copy of the GNU General Public License */ 27 | # /* along with this program; if not, write to the Free Software */ 28 | # /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ 29 | # /**********************************************************************************************/ 30 | # 31 | # /* 32 | # * Original code from the Cilk project (by Keith Randall) 33 | # * 34 | # * Copyright (c) 2000 Massachusetts Institute of Technology 35 | # * Copyright (c) 2000 Matteo Frigo 36 | # */ 37 | 38 | import 39 | # Stdlib 40 | system/ansi_c, strformat, os, strutils, 41 | threadpool, 42 | # bench 43 | ../wtime 44 | 45 | # This deadlocks :/ 46 | 47 | # Nim helpers 48 | # ------------------------------------------------- 49 | 50 | when defined(windows): 51 | proc alloca(size: csize): pointer {.header: "<malloc.h>".} 52 | else: 53 | proc alloca(size: csize): pointer {.header: "<alloca.h>".} 54 | 55 | template alloca*(T: typedesc): ptr T = 56 | cast[ptr T](alloca(sizeof(T))) 57 | 58 | template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] = 59 | cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) 60 | 61 | proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} = 62 | cast[type result](c_malloc(csize_t len*sizeof(T))) 63 | 64 | proc tp_free*[T: ptr](p: T) {.inline.} = 65 | c_free(p) 66 | 67 | # We assume that Nim zeroMem vs C memset 68 | # and Nim copyMem vs C memcpy have no difference 69 | # Nim does have extra checks to handle GC-ed types 70 | # but they should be eliminated by the Nim compiler. 71 | 72 | # ------------------------------------------------- 73 | 74 | type CharArray = ptr UncheckedArray[char] 75 | 76 | var example_solution: ptr UncheckedArray[char] 77 | 78 | func isValid(n: int32, a: CharArray): bool = 79 | ## `a` contains an array of `n` queen positions. 80 | ## Returns true if none of the queens conflict and false otherwise. 
81 | 82 | for i in 0'i32 ..< n: 83 | let p = cast[int32](a[i]) 84 | 85 | for j in i+1 ..< n: 86 | let q = cast[int32](a[j]) 87 | if q == p or q == p - (j-i) or q == p + (j-i): 88 | return false 89 | return true 90 | 91 | proc nqueens_ser(n, j: int32, a: CharArray): int32 = 92 | # Serial nqueens 93 | if n == j: 94 | # Good solution count it 95 | if example_solution.isNil: 96 | example_solution = tp_alloc(char, n) 97 | copyMem(example_solution, a, n * sizeof(char)) 98 | return 1 99 | 100 | # Try each possible position for queen `j` 101 | for i in 0 ..< n: 102 | a[j] = cast[char](i) 103 | if isValid(j+1, a): 104 | result += nqueens_ser(n, j+1, a) 105 | 106 | proc nqueens_par(n, j: int32, a: CharArray): int32 = 107 | 108 | if n == j: 109 | # Good solution, count it 110 | return 1 111 | 112 | var localCounts = alloca(Flowvar[int32], n) 113 | zeroMem(localCounts, n * sizeof(Flowvar[int32])) 114 | 115 | # Try each position for queen `j` 116 | for i in 0 ..< n: 117 | var b = alloca(char, j+1) 118 | copyMem(b, a, j * sizeof(char)) 119 | b[j] = cast[char](i) 120 | if isValid(j+1, b): 121 | localCounts[i] = spawn nqueens_par(n, j+1, b) 122 | 123 | for i in 0 ..< n: 124 | if not localCounts[i].isNil(): 125 | result += ^localCounts[i] 126 | 127 | const solutions = [ 128 | 1, 129 | 0, 130 | 0, 131 | 2, 132 | 10, # 5x5 133 | 4, 134 | 10, 135 | 92, # 8x8 136 | 352, 137 | 724, # 10x10 138 | 2680, 139 | 14200, 140 | 73712, 141 | 365596, 142 | 2279184, # 15x15 143 | 14772512 144 | ] 145 | 146 | proc verifyQueens(n, res: int32) = 147 | if n > solutions.len: 148 | echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]" 149 | return 150 | 151 | if res != solutions[n-1]: 152 | echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}" 153 | 154 | proc main() = 155 | if paramCount() != 1: 156 | let exeName = getAppFilename().extractFilename() 157 | echo &"Usage: {exeName} " 158 | quit 0 159 | 160 | let n = paramStr(1).parseInt.int32 161 | 162 | if n notin 1 
.. solutions.len: 163 | echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]" 164 | quit 1 165 | 166 | 167 | let start = wtime_msec() 168 | let count = nqueens_par(n, 0, alloca(char, n)) 169 | let stop = wtime_msec() 170 | 171 | verifyQueens(n, count) 172 | 173 | if not example_solution.isNil: 174 | stdout.write("Example solution: ") 175 | for i in 0 ..< n: 176 | c_printf("%2d ", example_solution[i]) 177 | stdout.write('\n') 178 | 179 | echo &"Elapsed wall time: {stop-start:2.4f} ms" 180 | 181 | main() 182 | -------------------------------------------------------------------------------- /benchmarks/nqueens/taskpool_nqueens.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | # 8 | # Original code licenses 9 | # ------------------------------------------------------------------------------------------------ 10 | # 11 | # /**********************************************************************************************/ 12 | # /* This program is part of the Barcelona OpenMP Tasks Suite */ 13 | # /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ 14 | # /* Copyright (C) 2009 Universitat Politecnica de Catalunya */ 15 | # /* */ 16 | # /* This program is free software; you can redistribute it and/or modify */ 17 | # /* it under the terms of the GNU General Public License as published by */ 18 | # /* the Free Software Foundation; either version 2 of the License, or */ 19 | # /* (at your option) any later version. 
# Stack allocation comes from the C runtime; the header that declares
# `alloca` differs between Windows and the other platforms.
when defined(windows):
  proc alloca(size: int): pointer {.header: "<malloc.h>".}
else:
  proc alloca(size: int): pointer {.header: "<alloca.h>".}

template alloca*(T: typedesc): ptr T =
  ## Stack-allocate uninitialized storage for a single `T`.
  ## This is a template so the allocation happens in the caller's frame.
  cast[ptr T](alloca(sizeof(T)))

template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
  ## Stack-allocate uninitialized storage for `len` items of type `T`
  ## in the caller's frame.
  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))

proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
  ## Heap-allocate uninitialized storage for `len` items of type `T`.
  ## Uses Nim's shared allocator when compiled with `-d:TP_useNimAlloc`,
  ## C `malloc` otherwise. Release the memory with `tp_free`.
  when defined(TP_useNimAlloc):
    cast[type result](createSharedU(T, len))
  else:
    cast[type result](c_malloc(csize_t len*sizeof(T)))

proc tp_free*[T: ptr](p: T) {.inline.} =
  ## Release memory obtained from `tp_alloc`, using the matching allocator.
  when defined(TP_useNimAlloc):
    freeShared(p)
  else:
    c_free(p)
type CharArray = ptr UncheckedArray[char]

var tp: Taskpool
var example_solution: ptr UncheckedArray[char]

func isValid(n: int32, a: CharArray): bool =
  ## `a` contains an array of `n` queen positions.
  ## Returns true if none of the queens conflict and false otherwise.

  for i in 0'i32 ..< n:
    let p = int32(a[i])

    for j in i+1 ..< n:
      let q = int32(a[j])
      # Conflict: same position, or positions differing by exactly the
      # index distance (a shared diagonal).
      if q == p or q == p - (j-i) or q == p + (j-i):
        return false
  return true

proc nqueens_ser(n, j: int32, a: CharArray): int32 =
  ## Serial N-queens: returns the number of complete placements reachable
  ## from the first `j` positions already stored in `a`.
  ## `a[j]` is overwritten while searching. The first complete placement
  ## found is copied into the global `example_solution`.
  if n == j:
    # Good solution, count it
    if example_solution.isNil:
      example_solution = tp_alloc(char, n)
      copyMem(example_solution, a, n * sizeof(char))
    return 1

  # Try each possible position for queen `j`
  for i in 0 ..< n:
    a[j] = cast[char](i)
    if isValid(j+1, a):
      result += nqueens_ser(n, j+1, a)

proc nqueens_par(n, j: int32, a: CharArray): int32 {.gcsafe, raises: [].} =
  ## Parallel N-queens: spawns one task on the global pool `tp` per valid
  ## position of queen `j` and returns the sum of their counts.

  if n == j:
    # Good solution, count it
    return 1

  # One Flowvar slot per candidate position; zeroed so that slots that were
  # never spawned are skipped by the `isSpawned` check below.
  var localCounts = alloca(Flowvar[int32], n)
  zeroMem(localCounts, n * sizeof(Flowvar[int32]))

  # Try each position for queen `j`
  for i in 0 ..< n:
    # Each candidate gets its own stack copy of the prefix plus queen `j`.
    # The buffers live in this frame, and every spawned task is synced
    # below before this proc returns.
    var b = alloca(char, j+1)
    copyMem(b, a, j * sizeof(char))
    b[j] = cast[char](i)
    if isValid(j+1, b):
      localCounts[i] = tp.spawn nqueens_par(n, j+1, b)

  for i in 0 ..< n:
    if localCounts[i].isSpawned():
      result += sync(localCounts[i])

const solutions = [
  # Number of distinct N-queens solutions, indexed by N-1.
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  40,
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]

proc verifyQueens(n, res: int32) =
  ## Compare the computed solution count `res` for an `n`x`n` board with the
  ## reference table and print a diagnostic on mismatch.
  ## Any `n` outside `1 .. solutions.len` is reported as unverifiable instead
  ## of indexing the table out of bounds.
  if n notin 1 .. solutions.len:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return

  let expected = solutions[n-1]
  if res != expected:
    echo &"N-Queens failure: {res} is different from expected {expected}"

proc main() =
  ## Benchmark driver: reads N from the first CLI argument (default 11) and
  ## the worker count from `TASKPOOL_NUM_THREADS` (default: detected CPU
  ## count), runs the parallel solver, verifies the count and prints a report.
  var
    n = 11'i32
    nthreads: int

  if existsEnv"TASKPOOL_NUM_THREADS":
    nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
  else:
    # countProcessors() returns 0 when the CPU count cannot be detected;
    # always ask for at least one worker.
    nthreads = max(1, countProcessors())

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} "
    echo &"Running with default config N = {n}\n"

  if paramCount() >= 1:
    n = paramStr(1).parseInt.int32

  if n notin 1 .. solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1

  when not defined(windows):
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt

  tp = Taskpool.new(numThreads = nthreads)

  when not defined(windows):
    let start = wtime_msec()

  let count = nqueens_par(n, 0, alloca(char, n))

  when not defined(windows):
    let stop = wtime_msec()

  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt

  tp.shutdown()

  verifyQueens(n, count)

  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for i in 0 ..< n:
      c_printf("%2d ", example_solution[i])
    stdout.write('\n')

  echo "Scheduler: Taskpool"
  echo "Benchmark: N-queens"
  echo "Threads: ", nthreads
  when not defined(windows):
    # `start`/`stop` come from wtime_msec(), so the difference is milliseconds.
    echo "Time(ms) ", stop - start
    echo "Max RSS (KB): ", ru.ru_maxrss
    echo "Runtime RSS (KB): ", rss
    echo "# of page faults: ", flt
  echo "Problem size: ", n,"x",n, " board with ",n, " queens"
  echo "Solutions found: ", count

  quit 0

main()
default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}" 86 | if paramCount() >= 1: 87 | NumTasksTotal = paramStr(1).parseInt.int32 88 | if paramCount() >= 2: 89 | TaskGranularity = paramStr(2). parseInt.int32 90 | if paramCount() == 3: 91 | PollInterval = paramStr(3).parseInt.float64 92 | else: 93 | PollInterval = TaskGranularity.float64 94 | if paramCount() > 3: 95 | let exeName = getAppFilename().extractFilename() 96 | echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " & 97 | &" " & 98 | &"[polling interval (us): task granularity]" 99 | quit 1 100 | 101 | var nthreads: int 102 | if existsEnv"TP_NUM_THREADS": 103 | nthreads = getEnv"TP_NUM_THREADS".parseInt() 104 | else: 105 | nthreads = countProcessors() 106 | 107 | tp = Taskpool.new(numThreads = nthreads) 108 | 109 | # measure overhead during tasking 110 | var ru: Rusage 111 | getrusage(RusageSelf, ru) 112 | var 113 | rss = ru.ru_maxrss 114 | flt = ru.ru_minflt 115 | 116 | let start = wtime_msec() 117 | 118 | # spc_produce_seq(NumTasksTotal) 119 | spc_produce(NumTasksTotal) 120 | tp.syncAll() 121 | 122 | let stop = wtime_msec() 123 | 124 | getrusage(RusageSelf, ru) 125 | rss = ru.ru_maxrss - rss 126 | flt = ru.ru_minflt - flt 127 | 128 | tp.shutdown() 129 | 130 | echo "--------------------------------------------------------------------------" 131 | echo "Scheduler: Taskpool" 132 | echo "Benchmark: SPC (Single task Producer - multi Consumer)" 133 | echo "Threads: ", nthreads 134 | echo "Time(ms) ", round(stop - start, 3) 135 | echo "Max RSS (KB): ", ru.ru_maxrss 136 | echo "Runtime RSS (KB): ", rss 137 | echo "# of page faults: ", flt 138 | echo "--------------------------------------------------------------------------" 139 | echo "# of tasks: ", NumTasksTotal 140 | echo "Task granularity (us): ", TaskGranularity 141 | echo "Polling / manual load balancing interval (us): ", PollInterval 142 | 143 | quit 0 144 | 145 | main() 146 | 
#ifndef WTIME_H
#define WTIME_H

#include <sys/time.h> // gettimeofday, struct timeval
#include <stdio.h>    // printf, used by the WTIME macro

// Number of seconds since the Epoch
static inline double Wtime_sec(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1e6;
}

// Number of milliseconds since the Epoch
static inline double Wtime_msec(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1e3 + tv.tv_usec / 1e3;
}

// Number of microseconds since the Epoch
static inline double Wtime_usec(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1e6 + tv.tv_usec;
}

#if 0
// Read time stamp counter on x86
static inline unsigned long long readtsc(void)
{
    unsigned int lo, hi;
    // RDTSC copies contents of 64-bit TSC into EDX:EAX
    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
    return (unsigned long long)hi << 32 | lo;
}
#endif

// Build a variable name that is unique per source line, so that WTIME can be
// used more than once in the same scope.
#define WTIME_unique_var_name_paste(id, n) id ## n
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)

// Convenience macro for time measurement.
// `unit` is pasted into `Wtime_<unit>ec`, so it must be `s`, `ms` or `us`.
// The statement or block that follows the macro runs exactly once, after
// which the elapsed wall time is printed.
#define WTIME(unit) \
    double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \
    int WTIME_unique_var(_i_) = 0; \
    for (; WTIME_unique_var(_i_) == 0 || \
         (printf("Elapsed wall time: %.2lf "#unit"\n", \
                 Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \
         WTIME_unique_var(_i_)++)

#endif // WTIME_H
import strutils, os 3 | 4 | const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0] 5 | const cHeader = cSourcesPath / "wtime.h" 6 | 7 | {.passc: "-I" & cSourcesPath .} 8 | 9 | proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.} 10 | proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.} 11 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Taskpools architecture 2 | 3 | Taskpools is a simple threadpool that uses work-stealing to handle unbalanced workloads. 4 | 5 | ## Architecture 6 | 7 | ### Processing steps 8 | 9 | 1. On a `spawn` expression, thread i packages the function call in a task. 10 | 2. It enqueues the task in its own deque. 11 | 3. It calls `notify_one` on a condition variable that holds all sleeping threads. 12 | 4. The notified thread wakes up. 13 | 5. The notified thread randomly tries to steal a task from a worker. 14 | 6. If no tasks are found, it goes back to sleep. 15 | 7. Otherwise it runs the task. 16 | 8. On a `sync` statement, it runs tasks from its own task deque or steals a task from another worker. 17 | 9. Once the `sync` task is ready, it can run the following statements (continuation).
proc term(k: int): float =
  ## k-th term of the Leibniz series for π: 4/(2k+1), negated when
  ## `k mod 2 == 1`.
  let denominator = float(2*k + 1)
  if k mod 2 == 1: -4'f / denominator
  else: 4'f / denominator
proc main() =
  ## Approximate π with one million Leibniz terms, each scheduled as a task,
  ## and print the result.
  let n = 1_000_000
  # countProcessors() returns 0 when the CPU count cannot be detected;
  # always ask for at least one worker.
  let nthreads = max(1, countProcessors())

  var tp = Taskpool.new(numThreads = nthreads) # Default to the number of hardware threads.

  echo formatFloat(tp.piApprox(n))

  tp.syncAll()                                 # Block until all pending tasks are processed (implied in tp.shutdown())
  tp.shutdown()

# Compile with nim c -r -d:release --threads:on --outdir:build example.nim
main()
proc build(args, path: string) =
  ## Compile `path` with the configured compiler, backend, common config and
  ## user flags, followed by the extra `args`.
  let command = nimc & " " & lang & " " & cfg & " " & flags & " " & args & " " & path
  exec command

proc run(args, path: string) =
  ## Build and run `path` with `--mm:refc`; on Nim versions newer than 1.6
  ## build and run it again with `--mm:orc`.
  var memoryManagers = @["refc"]
  if (NimMajor, NimMinor) > (1, 6):
    memoryManagers.add "orc"

  for mm in memoryManagers:
    build args & " --mm:" & mm & " -r", path
template letsGoDeeper =
  # Rebuild a node of the same kind from the rewritten children and return
  # it from the enclosing `inspect` call.
  var rebuilt = node.kind.newTree()
  for child in node:
    rebuilt.add inspect(child)
  return rebuilt

proc replaceSymsByIdents*(ast: NimNode): NimNode =
  ## Return a copy of `ast` in which every identifier or symbol node is
  ## replaced by a plain identifier of the same name.
  ##
  ## Empty and literal nodes are returned as-is. A hidden standard
  ## conversion is replaced by its operand: the literal itself when it is an
  ## integer literal, otherwise an identifier (the operand must be a symbol).
  proc inspect(node: NimNode): NimNode =
    case node.kind
    of nnkIdent, nnkSym:
      result = ident($node)
    of nnkEmpty:
      result = node
    of nnkLiterals:
      result = node
    of nnkHiddenStdConv:
      let operand = node[1]
      if operand.kind == nnkIntLit:
        result = operand
      else:
        expectKind(operand, nnkSym)
        result = ident($operand)
    else:
      letsGoDeeper()

  inspect(ast)
type
  ChannelSPSCSingle*[T] = object
    ## A single-value SPSC channel
    ##
    ## Wait-free bounded single-producer single-consumer channel
    ## that can only buffer a single item
    ## Properties:
    ## - wait-free
    ## - supports weak memory models
    ## - buffers a single item
    ## - Padded to avoid false sharing in collections
    ## - No extra indirection to access the item, the buffer is inline the channel
    ## - Linearizable
    ##
    ## The channel should be the last field of an object if used in an intrusive manner
    full{.align: 64.}: Atomic[bool]
    value*: T

proc `=copy`[T](
    dest: var ChannelSPSCSingle[T],
    source: ChannelSPSCSingle[T]
  ) {.error: "A channel cannot be copied".}

func isEmpty*(chan: var ChannelSPSCSingle): bool {.inline.} =
  ## True when no item is currently buffered in the channel.
  let occupied = chan.full.load(moAcquire)
  not occupied

func tryRecv*[T](chan: var ChannelSPSCSingle, dst: var T): bool {.inline.} =
  ## Try to receive the item buffered in the channel.
  ## On success the item is moved into `dst`, the channel is marked empty
  ## and true is returned. Returns false, leaving `dst` untouched, when the
  ## channel is empty.
  ##
  ## ⚠ Use only in the consumer thread that reads from the channel.
  static: doAssert supportsCopyMem(T), "Channel is not garbage-collection-safe"

  if not chan.full.load(moAcquire):
    return false

  dst = move(chan.value)
  # Release so the producer observes the slot as free only after the move.
  chan.full.store(false, moRelease)
  true

func trySend*[T](chan: var ChannelSPSCSingle, src: sink T): bool {.inline.} =
  ## Try to send an item into the channel.
  ## On success `src` is moved into the channel, the channel is marked full
  ## and true is returned. Returns false when the channel already holds an
  ## item.
  ##
  ## ⚠ Use only in the producer thread that writes to the channel.
  static: doAssert supportsCopyMem(T), "Channel is not garbage-collection-safe"

  if chan.full.load(moAcquire):
    return false

  chan.value = move(src)
  # Release so the consumer observes the flag only after the value is written.
  chan.full.store(true, moRelease)
  true
echo "-----------------------------------" 138 | 139 | var threads: array[2, Thread[ThreadArgs]] 140 | var chan = tp_allocAligned( 141 | ChannelSPSCSingle[int], sizeof(ChannelSPSCSingle[int]), 64) 142 | zeroMem(chan, sizeof(ChannelSPSCSingle[int])) 143 | 144 | createThread(threads[0], thread_func, ThreadArgs(ID: Receiver, chan: chan)) 145 | createThread(threads[1], thread_func, ThreadArgs(ID: Sender, chan: chan)) 146 | 147 | joinThread(threads[0]) 148 | joinThread(threads[1]) 149 | 150 | tp_freeAligned(chan) 151 | 152 | echo "-----------------------------------" 153 | echo "Success" 154 | 155 | main() 156 | -------------------------------------------------------------------------------- /taskpools/chase_lev_deques.nim: -------------------------------------------------------------------------------- 1 | # taskpools 2 | # Copyright (c) 2021 Status Research & Development GmbH 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | # chase_lev_deques.nim 9 | # -------------------- 10 | # This file implements a Chase-Lev deque 11 | # This is a single-producer multi-consumer concurrent queue 12 | # for work-stealing schedulers. 13 | # 14 | # Papers: 15 | # - Dynamic Circular Work-Stealing Deque 16 | # David Chase, Yossi Lev, 2005 17 | # https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf 18 | # 19 | # - Correct and Efficient Work-Stealing for Weak Memory Models 20 | # Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013 21 | # https://fzn.fr/readings/ppopp13.pdf 22 | # 23 | # We directly translate the second paper, which includes formal proofs of correctness, 24 | # and uses modern C++11 code.
func isPowerOfTwo(n: int): bool {.inline.} =
  ## True when exactly one bit of `n` is set.
  ## Zero also satisfies the bit trick below, so it is rejected explicitly.
  if n == 0:
    return false
  (n and (n - 1)) == 0
proc `[]=`[T](buf: var Buf[T], index: int, item: T) {.inline.} =
  ## Store `item` in the slot selected by `index and buf.mask`.
  ## The store is relaxed; callers provide any ordering they need.
  buf.rawBuffer[index and buf.mask].store(item, moRelaxed)

proc `[]`[T](buf: var Buf[T], index: int): T {.inline.} =
  ## Load the item in the slot selected by `index and buf.mask`.
  ## The load is relaxed; callers provide any ordering they need.
  result = buf.rawBuffer[index and buf.mask].load(moRelaxed)

proc grow[T](deque: var ChaseLevDeque[T], buf: var ptr Buf[T], top, bottom: int) {.inline.} =
  ## Double the buffer size.
  ## Items at indices `top ..< bottom` are copied into the new buffer,
  ## so `bottom` is one past the last item index.
  ##
  ## To handle race-conditions the current "top", "bottom" and "buf"
  ## have to be saved before calling this procedure.
  ## It writes "deque.buf" and reads and writes "deque.garbage".
  ## On return the caller's `buf` refers to the new buffer.

  # Read -> Copy -> Update
  var tmp = newBuf(T, buf.capacity*2)
  for i in top ..< bottom:
    tmp[][i] = buf[][i]

  # The old buffer is not freed here: it is pushed onto the `garbage` list,
  # linked through `prev`.
  buf.prev = deque.garbage
  deque.garbage = buf
  # publish globally
  deque.buf.store(tmp, moRelaxed)
  # publish locally
  swap(buf, tmp)

# Public API
# ---------------------------------------------------

proc init*[T](deque: var ChaseLevDeque[T], initialCapacity: int) =
  ## Initializes a new Chase-lev work-stealing deque.
  ## `initialCapacity` is passed to `newBuf`, whose precondition requires a
  ## power of two.
  deque.reset()
  deque.buf.store(newBuf(T, initialCapacity), moRelaxed)
131 | var node = deque.garbage 132 | while node != nil: 133 | let tmp = node.prev 134 | c_free(node) 135 | node = tmp 136 | c_free(deque.buf.load(moRelaxed)) 137 | 138 | proc push*[T](deque: var ChaseLevDeque[T], item: T) = 139 | ## Enqueue an item at the bottom 140 | ## The item should not be used afterwards. 141 | 142 | let # Handle race conditions 143 | b = deque.bottom.load(moRelaxed) 144 | t = deque.top.load(moAcquire) 145 | var a = deque.buf.load(moRelaxed) 146 | 147 | if b-t > a.capacity - 1: 148 | # Full queue 149 | deque.grow(a, t, b) 150 | 151 | a[][b] = item 152 | fence(moRelease) 153 | deque.bottom.store(b+1, moRelaxed) 154 | 155 | proc pop*[T](deque: var ChaseLevDeque[T]): T = 156 | ## Deque an item at the bottom 157 | 158 | let # Handle race conditions 159 | b = deque.bottom.load(moRelaxed) - 1 160 | a = deque.buf.load(moRelaxed) 161 | 162 | deque.bottom.store(b, moRelaxed) 163 | fence(moSequentiallyConsistent) 164 | var t = deque.top.load(moRelaxed) 165 | 166 | if t <= b: 167 | # Non-empty queue. 168 | result = a[][b] 169 | if t == b: 170 | # Single last element in queue. 171 | if not compareExchange(deque.top, t, t+1, moSequentiallyConsistent, moRelaxed): 172 | # Failed race. 173 | result = default(T) 174 | deque.bottom.store(b+1, moRelaxed) 175 | else: 176 | # Empty queue. 177 | result = default(T) 178 | deque.bottom.store(b+1, moRelaxed) 179 | 180 | proc steal*[T](deque: var ChaseLevDeque[T]): T = 181 | ## Deque an item at the top 182 | var t = deque.top.load(moAcquire) 183 | fence(moSequentiallyConsistent) 184 | let b = deque.bottom.load(moAcquire) 185 | result = default(T) 186 | 187 | if t < b: 188 | # Non-empty queue. 189 | let a = deque.buf.load(moConsume) 190 | result = a[][t] 191 | if not compareExchange(deque.top, t, t+1, moSequentiallyConsistent, moRelaxed): 192 | # Failed race. 
193 | return default(T) 194 | 195 | {.pop.} # overflowChecks 196 | {.pop.} # raises: [] 197 | -------------------------------------------------------------------------------- /taskpools/event_notifiers.nim: -------------------------------------------------------------------------------- 1 | # taskpools 2 | # Copyright (c) 2021-2023 Status Research & Development GmbH 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | # event_notifier.nim 9 | # ------------------ 10 | # This file implements an event notifier. 11 | # It allows putting idle threads to sleep or waking them up. 12 | 13 | # Design 14 | # Currently it is a shared lock + condition variable (a.k.a. a semaphore) 15 | # 16 | # In the future an eventcount might be considered, an event count significantly 17 | # reduces scheduler overhead by removing lock acquisition from critical path. 18 | # See overview and implementations at 19 | # https://gist.github.com/mratsim/04a29bdd98d6295acda4d0677c4d0041 20 | # 21 | # Weave "one event-notifier per thread" further reduces overhead 22 | # but requires the threadpool to be message-passing based. 23 | # https://github.com/mratsim/weave/blob/a230cce98a8524b2680011e496ec17de3c1039f2/weave/cross_thread_com/event_notifiers.nim 24 | 25 | {.push raises: [].} # Ensure no exceptions can happen 26 | 27 | import 28 | std/locks, 29 | ./instrumentation/contracts 30 | 31 | type 32 | EventNotifier* = object 33 | ## This data structure allows threads to be parked when no events are pending 34 | ## and woken up when a new event is. 35 | # Lock must be aligned to a cache-line to avoid false-sharing. 
36 | lock{.align: 64.}: Lock 37 | cond: Cond 38 | parked: int 39 | signals: int 40 | 41 | {.push overflowChecks: off.} # We don't want exceptions (for Defect) in a multithreaded context 42 | # but we don't to deal with underflow of unsigned int either 43 | # say "if a < b - c" with c > b 44 | 45 | func initialize*(en: var EventNotifier) {.inline.} = 46 | ## Initialize the event notifier 47 | en.lock.initLock() 48 | en.cond.initCond() 49 | en.parked = 0 50 | en.signals = 0 51 | 52 | func `=destroy`*(en: var EventNotifier) {.inline.} = 53 | en.cond.deinitCond() 54 | en.lock.deinitLock() 55 | 56 | func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".} 57 | func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".} 58 | 59 | proc park*(en: var EventNotifier) {.inline.} = 60 | ## Wait until we are signaled of an event 61 | ## Thread is parked and does not consume CPU resources 62 | en.lock.acquire() 63 | 64 | if en.signals > 0: 65 | en.signals -= 1 66 | en.lock.release() 67 | return 68 | 69 | en.parked += 1 70 | while en.signals == 0: # handle spurious wakeups 71 | en.cond.wait(en.lock) 72 | en.parked -= 1 73 | en.signals -= 1 74 | 75 | postCondition: en.signals >= 0 76 | en.lock.release() 77 | 78 | proc notify*(en: var EventNotifier) {.inline.} = 79 | ## Unpark a thread if any is available 80 | en.lock.acquire() 81 | 82 | if en.parked > 0: 83 | en.signals += 1 84 | en.cond.signal() 85 | 86 | en.lock.release() 87 | 88 | proc getParked*(en: var EventNotifier): int {.inline.} = 89 | ## Get the number of parked thread 90 | en.lock.acquire() 91 | result = en.parked 92 | en.lock.release() 93 | 94 | {.pop.} # overflowChecks 95 | {.pop.} # raises: [AssertionDefect] 96 | -------------------------------------------------------------------------------- /taskpools/flowvars.nim: -------------------------------------------------------------------------------- 1 | # taskpools 2 | # Copyright 
(c) 2019 Mamy André-Ratsimbazafy 3 | # Copyright (c) 2021 Status Research & Development GmbH 4 | # Licensed and distributed under either of 5 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 6 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 7 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 8 | 9 | {.push raises: [].} 10 | 11 | import 12 | ./instrumentation/contracts, 13 | ./channels_spsc_single, 14 | ./primitives/allocs 15 | 16 | type 17 | Flowvar*[T] = object 18 | ## A Flowvar is a placeholder for a future result that may be computed in parallel 19 | # Flowvar are optimized when containing a ptr type. 20 | # They take less size in memory by testing isNil 21 | # instead of having an extra atomic bool 22 | # They also use type-erasure to avoid having duplicate code 23 | # due to generic monomorphization. 24 | chan: ptr ChannelSPSCSingle[T] 25 | 26 | # proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".} 27 | # 28 | # Unfortunately we cannot prevent this easily as internally 29 | # we need a copy: 30 | # - taskpools level when doing toTask(fnCall(args, fut)) and then returning fut. 
(Can be worked around with copyMem) 31 | # - in std/tasks (need upstream workaround) 32 | 33 | proc newFlowVar*(T: typedesc): Flowvar[T] {.inline.} = 34 | result.chan = tp_allocAligned( 35 | ChannelSPSCSingle[T], sizeof(ChannelSPSCSingle[T]), alignment = 64) 36 | zeroMem(result.chan, sizeof(ChannelSPSCSingle[T])) 37 | 38 | proc cleanup(fv: Flowvar) {.inline.} = 39 | # TODO: Nim v1.4+ can use "sink Flowvar" 40 | if not fv.chan.isNil: 41 | tp_freeAligned(fv.chan) 42 | 43 | func isSpawned*(fv: Flowvar): bool {.inline.} = 44 | ## Returns true if a flowvar is spawned 45 | ## This may be useful for recursive algorithms that 46 | ## may or may not spawn a flowvar depending on a condition. 47 | ## This is similar to Option or Maybe types 48 | return not fv.chan.isNil 49 | 50 | proc readyWith*[T](fv: Flowvar[T], childResult: T) {.inline.} = 51 | ## Send the Flowvar result from the child thread processing the task 52 | ## to its parent thread. 53 | let resultSent {.used.} = fv.chan[].trySend(childResult) 54 | postCondition: resultSent 55 | 56 | template tryComplete*[T](fv: Flowvar, parentResult: var T): bool = 57 | fv.chan[].tryRecv(parentResult) 58 | 59 | func isReady*[T](fv: Flowvar[T]): bool {.inline.} = 60 | ## Returns true if the result of a Flowvar is ready. 61 | ## In that case `sync` will not block. 62 | ## Otherwise the current thread will block to help on all the pending tasks 63 | ## until the Flowvar is ready. 64 | not fv.chan[].isEmpty() 65 | 66 | proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} = 67 | ## Blocks the current thread until the flowvar is available 68 | ## and returned. 69 | ## The thread is not idle and will complete pending tasks.
70 | mixin forceFuture 71 | forceFuture(fv, result) 72 | cleanup(fv) 73 | -------------------------------------------------------------------------------- /taskpools/instrumentation/contracts.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | import macros, os, strutils 9 | 10 | {.used.} 11 | 12 | # A simple design-by-contract API 13 | # ---------------------------------------------------------------------------------- 14 | 15 | # Everything should be a template that doesn't produce any code 16 | # when neither assertions nor TP_Asserts are enabled. 17 | # Those checks are controlled by a custom flag instead of 18 | # "--boundsChecks" or "--nilChecks" to decouple them from user code checks. 19 | # Furthermore, we want them to be very lightweight on performance 20 | 21 | # TODO auto-add documentation 22 | 23 | proc inspectInfix(node: NimNode): NimNode = 24 | ## Inspect an expression, 25 | ## Returns the AST as string with runtime values inlined 26 | ## from infix operators inlined.
27 | # TODO: pointer and custom type need a default repr 28 | # otherwise we can only resolve simple expressions 29 | proc inspect(node: NimNode): NimNode = 30 | case node.kind: 31 | of nnkInfix: 32 | return newCall( 33 | bindSym"&", 34 | newCall( 35 | bindSym"&", 36 | newCall(ident"$", inspect(node[1])), 37 | newLit(" " & $node[0] & " ") 38 | ), 39 | newCall(ident"$", inspect(node[2])) 40 | ) 41 | of {nnkIdent, nnkSym}: 42 | return node 43 | of nnkDotExpr: 44 | return quote do: 45 | when `node` is pointer or 46 | `node` is ptr or 47 | `node` is (proc): 48 | toHex(cast[ByteAddress](`node`) and 0xffff_ffff) 49 | else: 50 | $(`node`) 51 | of nnkPar: 52 | result = nnkPar.newTree() 53 | for sub in node: 54 | result.add inspect(sub) 55 | else: 56 | return node.toStrLit() 57 | return inspect(node) 58 | 59 | macro assertContract( 60 | checkName: static string, 61 | predicate: untyped) = 62 | let lineinfo = lineInfoObj(predicate) 63 | let file = extractFilename(lineinfo.filename) 64 | 65 | var strippedPredicate: NimNode 66 | if predicate.kind == nnkStmtList: 67 | assert predicate.len == 1, "Only one-liner conditions are supported" 68 | strippedPredicate = predicate[0] 69 | else: 70 | strippedPredicate = predicate 71 | 72 | let debug = "\n Contract violated for " & checkName & " at " & file & ":" & $lineinfo.line & 73 | "\n " & $strippedPredicate.toStrLit & 74 | "\n The following values are contrary to expectations:" & 75 | "\n " 76 | let values = inspectInfix(strippedPredicate) 77 | let workerID = quote do: 78 | when declared(workerContext): 79 | $workerContext.id 80 | else: 81 | "N/A" 82 | let taskpoolID = quote do: 83 | when declared(workerContext): 84 | "0x" & cast[uint](workerContext.taskpool).toHex().toLowerAscii() 85 | else: 86 | "N/A" 87 | 88 | result = quote do: 89 | {.noSideEffect.}: 90 | when compileOption("assertions"): 91 | assert(`predicate`, `debug` & $`values` & " [Worker " & `workerID` & " on taskpool " & `taskpoolID` & "]\n") 92 | elif defined(TP_Asserts):
93 | if unlikely(not(`predicate`)): 94 | raiseAssert(`debug` & $`values` & " [Worker " & `workerID` & " on taskpool " & `taskpoolID` & "]\n") 95 | 96 | # A way to get the caller function would be nice. 97 | 98 | template preCondition*(require: untyped) = 99 | ## Optional runtime check at the start of a function 100 | assertContract("pre-condition", require) 101 | 102 | template postCondition*(ensure: untyped) = 103 | ## Optional runtime check before returning from a function 104 | assertContract("post-condition", ensure) 105 | 106 | template ascertain*(check: untyped) = 107 | ## Optional runtime check in the middle of processing 108 | assertContract("transient condition", check) 109 | 110 | # Sanity checks 111 | # ---------------------------------------------------------------------------------- 112 | 113 | when isMainModule: 114 | proc assertGreater(x, y: int) = 115 | postcondition(x > y) 116 | 117 | # We should get a nicely formatted exception 118 | assertGreater(10, 12) 119 | -------------------------------------------------------------------------------- /taskpools/instrumentation/loggers.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
7 | 8 | import system/ansi_c 9 | 10 | {.used.} 11 | 12 | template log*(args: varargs[untyped]): untyped = 13 | c_printf(args) 14 | flushFile(stdout) 15 | 16 | template debugTermination*(body: untyped): untyped = 17 | when defined(TP_DebugTermination) or defined(TP_Debug): 18 | {.noSideEffect, gcsafe.}: body 19 | 20 | template debug*(body: untyped): untyped = 21 | when defined(TP_Debug): 22 | {.noSideEffect, gcsafe.}: body 23 | -------------------------------------------------------------------------------- /taskpools/primitives/allocs.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | import system/ansi_c 9 | 10 | # Helpers 11 | # ---------------------------------------------------------------------------------- 12 | 13 | func isPowerOfTwo(n: int): bool {.inline.} = 14 | (n and (n - 1)) == 0 and (n != 0) 15 | 16 | # TODO: cannot dispatch at compile-time due to https://github.com/nim-lang/Nim/issues/12726 17 | # but all our use-case are for power of 2 18 | 19 | func roundNextMultipleOf*(x: Natural, n: Natural): int {.inline.} = 20 | assert n.isPowerOfTwo() 21 | result = (x + n - 1) and not(n - 1) 22 | 23 | # func roundNextMultipleOf*(x: Natural, n: static Natural): int {.inline.} = 24 | # ## Round the input to the next multiple of "n" 25 | # when n.isPowerOfTwo(): 26 | # # n is a power of 2. 
(If compiler cannot prove that x>0 it does not make the optim) 27 | # result = (x + n - 1) and not(n - 1) 28 | # else: 29 | # result = ((x + n - 1) div n) * n 30 | 31 | # Memory 32 | # ---------------------------------------------------------------------------------- 33 | 34 | # Nim allocShared, createShared, deallocShared 35 | # take a global lock that is absolutely killing performance 36 | # and shows up either: 37 | # - native_queued_spin_lock_slowpath 38 | # - __pthread_mutex_lock and __pthread_mutex_unlock_usercnt 39 | # 40 | # We use system malloc by default, the flag -d:useMalloc is not enough 41 | 42 | template deref*(T: typedesc): typedesc = 43 | ## Return the base object type behind a ptr type 44 | typeof(default(T)[]) 45 | 46 | proc tp_alloc*(T: typedesc, zero: static bool = false): ptr T {.inline.}= 47 | ## Default allocator for the Taskpools library 48 | ## This allocates memory to hold the type T 49 | ## and returns a pointer to it 50 | ## 51 | ## Can use Nim allocator to measure the overhead of its lock 52 | ## Memory is zeroed if requested 53 | result = cast[ptr T](c_malloc(csize_t sizeof(T))) 54 | when zero: 55 | zeroMem(result, sizeof(T)) 56 | 57 | proc tp_allocPtr*(T: typedesc[ptr], zero: static bool = false): T {.inline.}= 58 | ## Default allocator for the Taskpools library 59 | ## This allocates memory to hold the 60 | ## underlying type of the pointer type T. 61 | ## i.e. if T is ptr int, this allocates an int 62 | ## 63 | ## Can use Nim allocator to measure the overhead of its lock 64 | ## Memory is zeroed if requested 65 | result = tp_alloc(deref(T)) 66 | when zero: 67 | zeroMem(result, sizeof(deref(T))) 68 | 69 | proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} = 70 | ## Default allocator for the Taskpools library. 71 | ## This allocates a contiguous chunk of memory 72 | ## to hold ``len`` elements of type T 73 | ## and returns a pointer to it.
74 | ## 75 | ## Can use Nim allocator to measure the overhead of its lock 76 | ## Memory is not zeroed 77 | cast[type result](c_malloc(csize_t len*sizeof(T))) 78 | 79 | proc tp_allocUnchecked*(T: typedesc, size: SomeInteger, zero: static bool = false): ptr T {.inline.} = 80 | ## Default allocator for the Taskpools library. 81 | ## This allocates "size" bytes. 82 | ## This is for data structures which contain an UncheckedArray field 83 | result = cast[type result](c_malloc(csize_t size)) 84 | when zero: 85 | zeroMem(result, size) 86 | 87 | proc tp_free*[T: ptr](p: T) {.inline.} = 88 | when defined(WV_useNimAlloc): 89 | freeShared(p) 90 | else: 91 | c_free(p) 92 | 93 | when defined(windows): 94 | proc alloca(size: int): pointer {.header: "<malloc.h>".} 95 | else: 96 | proc alloca(size: int): pointer {.header: "<alloca.h>".} 97 | 98 | template alloca*(T: typedesc): ptr T = 99 | cast[ptr T](alloca(sizeof(T))) 100 | 101 | template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] = 102 | cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) 103 | 104 | when defined(windows): 105 | proc aligned_alloc_windows(size, alignment: csize_t): pointer {.sideEffect,importc:"_aligned_malloc", header:"<malloc.h>".} 106 | # Beware of the arg order! 107 | proc tp_freeAligned*[T](p: ptr T){.sideEffect,importc:"_aligned_free", header:"<malloc.h>".} 108 | elif defined(osx): 109 | proc posix_memalign(mem: var pointer, alignment, size: csize_t){.sideEffect,importc, header:"<stdlib.h>".} 110 | proc aligned_alloc(alignment, size: csize_t): pointer {.inline.} = 111 | posix_memalign(result, alignment, size) 112 | proc tp_freeAligned*[T](p: ptr T){.inline.} = 113 | c_free(p) 114 | else: 115 | proc aligned_alloc(alignment, size: csize_t): pointer {.sideEffect,importc, header:"<stdlib.h>".} 116 | proc tp_freeAligned*[T](p: ptr T){.inline.} = 117 | c_free(p) 118 | 119 | proc tp_allocAligned*(T: typedesc, alignment: static Natural): ptr T {.inline.} = 120 | ## aligned_alloc requires allocating in multiple of the alignment.
121 | static: 122 | assert alignment.isPowerOfTwo() 123 | let # TODO - cannot use a const due to https://github.com/nim-lang/Nim/issues/12726 124 | size = sizeof(T) 125 | requiredMem = size.roundNextMultipleOf(alignment) 126 | 127 | when defined(windows): 128 | cast[ptr T](aligned_alloc_windows(csize_t requiredMem, csize_t alignment)) 129 | else: 130 | cast[ptr T](aligned_alloc(csize_t alignment, csize_t requiredMem)) 131 | 132 | proc tp_allocAligned*(T: typedesc, size: int, alignment: static Natural): ptr T {.inline.} = 133 | ## aligned_alloc requires allocating in multiple of the alignment. 134 | static: 135 | assert alignment.isPowerOfTwo() 136 | let 137 | requiredMem = size.roundNextMultipleOf(alignment) 138 | 139 | when defined(windows): 140 | cast[ptr T](aligned_alloc_windows(csize_t requiredMem, csize_t alignment)) 141 | else: 142 | cast[ptr T](aligned_alloc(csize_t alignment, csize_t requiredMem)) 143 | 144 | proc tp_allocArrayAligned*(T: typedesc, len: int, alignment: static Natural): ptr UncheckedArray[T] {.inline.} = 145 | ## aligned_alloc requires allocating in multiple of the alignment. 146 | static: 147 | assert alignment.isPowerOfTwo() 148 | let 149 | size = sizeof(T) * len 150 | requiredMem = size.roundNextMultipleOf(alignment) 151 | 152 | when defined(windows): 153 | cast[ptr UncheckedArray[T]](aligned_alloc_windows(csize_t requiredMem, csize_t alignment)) 154 | else: 155 | cast[ptr UncheckedArray[T]](aligned_alloc(csize_t alignment, csize_t requiredMem)) 156 | -------------------------------------------------------------------------------- /taskpools/primitives/barriers.md: -------------------------------------------------------------------------------- 1 | # Synchronization Barriers 2 | 3 | OSX does not implement pthread_barrier as it's an optional part 4 | of the POSIX standard and they probably want to drive people to libdispatch/Grand Central Dispatch. 5 | 6 | So we need to roll our own with a POSIX compatible API.
7 | 8 | ## Glibc barriers, design bug and implementation 9 | 10 | > Note: due to GPL licensing, do not lift the code. 11 | > Not that we could, as it is heavily dependent on futexes 12 | > which are not available on OSX 13 | 14 | We need to make sure that we don't hit the same bug 15 | as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065 16 | which seems to be an issue in some of the barrier implementations 17 | in the wild. 18 | 19 | The design of Glibc barriers is here: 20 | https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD 21 | 22 | And implementation: 23 | - https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD 24 | - https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD 25 | - https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD 26 | 27 | ## Synchronization barrier techniques 28 | 29 | This article goes over the techniques of 30 | "pool barrier" and "ticket barrier" 31 | https://locklessinc.com/articles/barriers/ 32 | to reach 2x to 20x the speed of pthreads barrier 33 | 34 | This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf 35 | goes over 36 | - centralized barrier with sense reversal 37 | - combining tree barrier 38 | - dissemination barrier 39 | - tournament barrier 40 | - scalable tree barrier 41 | More courses: 42 | - http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf 43 | 44 | It however requires lightweight mutexes like Linux futexes 45 | that OSX lacks.
46 | 47 | This post goes over lightweight mutexes like Benaphores (from BeOS) 48 | https://preshing.com/20120226/roll-your-own-lightweight-mutex/ 49 | 50 | This gives a few barrier implementations 51 | http://gallium.inria.fr/~maranget/MPRI/02.pdf 52 | and refers to Cubible paper for formally verifying synchronization barriers 53 | http://cubicle.lri.fr/papers/jfla2014.pdf (in French) 54 | -------------------------------------------------------------------------------- /taskpools/primitives/barriers.nim: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 2 | # Copyright (c) 2024 Status Research & Development GmbH 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | {.push raises: [], gcsafe, inline.} 9 | 10 | import os 11 | 12 | when defined(windows): 13 | import ./barriers_windows 14 | 15 | type SyncBarrier* = SynchronizationBarrier 16 | 17 | proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.raises: [OSError].} = 18 | ## Initialize a synchronization barrier that will block ``threadCount`` threads 19 | ## before release. 20 | if InitializeSynchronizationBarrier(syncBarrier, threadCount, -1) != 1: 21 | raiseOSError(osLastError()) 22 | 23 | proc wait*(syncBarrier: var SyncBarrier): bool = 24 | ## Blocks thread at a synchronization barrier. 25 | ## Returns true for one of the threads (the last one on Windows, undefined on Posix) 26 | ## and false for the others. 
27 | bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE) 28 | 29 | proc delete*(syncBarrier: sink SyncBarrier) = 30 | ## Deletes a synchronization barrier. 31 | ## This assumes no race between waiting at a barrier and deleting it, 32 | ## and reuse of the barrier requires initialization. 33 | DeleteSynchronizationBarrier(syncBarrier.addr) 34 | 35 | else: 36 | import ./barriers_posix 37 | 38 | type SyncBarrier* = PthreadBarrier 39 | 40 | proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.raises: [OSError].} = 41 | ## Initialize a synchronization barrier that will block ``threadCount`` threads 42 | ## before release. 43 | let err = pthread_barrier_init(syncBarrier, nil, cuint threadCount) 44 | if err != 0: 45 | raiseOSError(OSErrorCode(err)) 46 | 47 | proc wait*(syncBarrier: var SyncBarrier): bool = 48 | ## Blocks thread at a synchronization barrier. 49 | ## Returns true for one of the threads (the last one on Windows, undefined on Posix) 50 | ## and false for the others. 51 | ## 52 | # https://pubs.opengroup.org/onlinepubs/009696899/functions/pthread_barrier_wait.html 53 | let res = pthread_barrier_wait(syncBarrier) 54 | assert res == 0 or res == PTHREAD_BARRIER_SERIAL_THREAD, osErrorMsg(OSErrorCode(res)) 55 | res == PTHREAD_BARRIER_SERIAL_THREAD 56 | 57 | proc delete*(syncBarrier: sink SyncBarrier) = 58 | ## Deletes a synchronization barrier. 59 | ## This assumes no race between waiting at a barrier and deleting it, 60 | ## and reuse of the barrier requires initialization. 
61 | let err {.used.} = pthread_barrier_destroy(syncBarrier) 62 | assert err == 0, osErrorMsg(OSErrorCode(err)) 63 | -------------------------------------------------------------------------------- /taskpools/primitives/barriers_macos.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | # OSX doesn't implement pthread_barrier_t 9 | # It's an optional part of the POSIX standard 10 | # 11 | # This is a manual implementation of a sense reversing barrier 12 | 13 | import locks 14 | 15 | type 16 | Errno* = cint 17 | 18 | PthreadBarrierAttr* = object 19 | ## Dummy 20 | PthreadBarrier* = object 21 | ## Implementation of a sense reversing barrier 22 | ## (The Art of Multiprocessor Programming by Maurice Herlihy & Nir Shavit) 23 | 24 | lock: Lock # Alternatively spinlock on Atomic 25 | cond {.guard: lock.}: Cond 26 | sense {.guard: lock.}: bool # Choose int32 to avoid zero-expansion cost in registers? 
27 | left {.guard: lock.}: cuint # Number of threads missing at the barrier before opening 28 | count: cuint # Total number of threads that need to arrive before opening the barrier 29 | 30 | const 31 | PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1) 32 | 33 | func pthread_barrier_init*( 34 | barrier: var PthreadBarrier, 35 | attr: ptr PthreadBarrierAttr, 36 | count: cuint 37 | ): Errno = 38 | barrier.lock.initLock() 39 | {.locks: [barrier.lock].}: 40 | barrier.cond.initCond() 41 | barrier.left = count 42 | barrier.count = count 43 | # barrier.sense = false 44 | 45 | proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno = 46 | ## Wait on `barrier` 47 | ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread 48 | ## Returns 0 for the other 49 | ## Returns Errno if there is an error 50 | barrier.lock.acquire() 51 | {.locks: [barrier.lock].}: 52 | var local_sense = barrier.sense # Thread local sense 53 | dec barrier.left 54 | 55 | if barrier.left == 0: 56 | # Last thread to arrive at the barrier 57 | # Reverse phase and release it 58 | barrier.left = barrier.count 59 | barrier.sense = not barrier.sense 60 | barrier.cond.broadcast() 61 | barrier.lock.release() 62 | return PTHREAD_BARRIER_SERIAL_THREAD 63 | 64 | while barrier.sense == local_sense: 65 | # We are waiting for threads 66 | # Wait for the sense to reverse 67 | # while loop because we might have spurious wakeups 68 | barrier.cond.wait(barrier.lock) 69 | 70 | # Reversed, we can leave the barrier 71 | barrier.lock.release() 72 | return Errno(0) 73 | 74 | proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno = 75 | {.locks: [barrier.lock].}: 76 | barrier.cond.deinitCond() 77 | barrier.lock.deinitLock() 78 | 79 | # TODO: tests 80 | -------------------------------------------------------------------------------- /taskpools/primitives/barriers_posix.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy 
André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | # Abstractions over POSIX barriers (non-)implementations 9 | 10 | when not compileOption("threads"): 11 | {.error: "This requires --threads:on compilation flag".} 12 | 13 | # Types 14 | # ------------------------------------------------------- 15 | 16 | when defined(osx): 17 | import ./barriers_macos 18 | export PthreadBarrierAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD 19 | else: 20 | type 21 | PthreadBarrierAttr* {.importc: "pthread_barrierattr_t", header: "<sys/types.h>", byref.} = object 22 | when (defined(linux) and not defined(android)) and defined(amd64): 23 | abi: array[4 div sizeof(cint), cint] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l45 24 | PthreadBarrier* {.importc: "pthread_barrier_t", header: "<sys/types.h>", byref.} = object 25 | when (defined(linux) and not defined(android)) and defined(amd64): 26 | abi: array[32 div sizeof(clong), clong] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l28 27 | 28 | Errno* = cint 29 | 30 | var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"<pthread.h>".}: Errno 31 | 32 | # Pthread 33 | # ------------------------------------------------------- 34 | when defined(osx): 35 | export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy 36 | else: 37 | # TODO careful, this function mutates `barrier` without it being `var` which 38 | # is allowed as a consequence of `byref` - it is also different from the 39 | # one in barriers_macos 40 | #
see https://github.com/status-im/nim-taskpools/pull/20#discussion_r923843093 41 | proc pthread_barrier_init*( 42 | barrier: PthreadBarrier, 43 | attr: ptr PthreadBarrierAttr, 44 | count: cuint 45 | ): Errno {.header: "".} 46 | ## Initialize `barrier` with the attributes `attr`. 47 | ## The barrier is opened when `count` waiters arrived. 48 | 49 | # TODO the macos signature is var instead of sink 50 | proc pthread_barrier_destroy*( 51 | barrier: sink PthreadBarrier): Errno {.header: "".} 52 | ## Destroy a previously dynamically initialized `barrier`. 53 | 54 | proc pthread_barrier_wait*( 55 | barrier: var PthreadBarrier 56 | ): Errno {.header: "".} 57 | ## Wait on `barrier` 58 | ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread 59 | ## Returns 0 for the other 60 | ## Returns Errno if there is an error 61 | -------------------------------------------------------------------------------- /taskpools/primitives/barriers_windows.nim: -------------------------------------------------------------------------------- 1 | # Weave 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy 3 | # Licensed and distributed under either of 4 | # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 5 | # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms. 7 | 8 | import winlean 9 | 10 | # Technically in but MSVC complains with 11 | # @m..@s..@sweave@sscheduler.nim.cpp 12 | # C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error: "No Target Architecture 13 | 14 | type 15 | SynchronizationBarrier*{.importc:"SYNCHRONIZATION_BARRIER", header:"".} = object 16 | 17 | var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "".}: DWORD 18 | ## Skip expensive checks on barrier enter if a barrier is never deleted. 

# NOTE(review): the `header` pragmas below had been stripped to empty strings;
# <windows.h> is restored (the functions live in <synchapi.h>, which the
# umbrella header pulls in). Confirm against the upstream file.
proc EnterSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, dwFlags: DWORD): WINBOOL {.importc, stdcall, header: "<windows.h>".}
proc DeleteSynchronizationBarrier*(lpBarrier: ptr SynchronizationBarrier) {.importc, stdcall, header: "<windows.h>".}
proc InitializeSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, lTotalThreads: LONG, lSpinCount: LONG): WINBOOL {.importc, stdcall, header: "<windows.h>".}

when isMainModule:
  import os

  var x{.noinit.}: SynchronizationBarrier
  let err = InitializeSynchronizationBarrier(x, 2, -1)
  if err != 1:
    assert err == 0
    raiseOSError(osLastError())

# --------------------------------------------------------------------------------
# /taskpools/sparsesets.nim:
# --------------------------------------------------------------------------------
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  std/random,
  system/ansi_c,
  ./instrumentation/contracts,
  ./primitives/allocs

const TP_MaxWorkers = 255
type Setuint = uint8 # We support at most 255 threads (0xFF is kept as special value to signify absence in the set)

const Empty = high(Setuint)

type
  SparseSet* = object
    ## Stores efficiently a set of integers in the range [0 .. Capacity)
    ## Supports:
    ## - O(1)      inclusion, exclusion and contains
    ## - O(1)      random pick
    ## - O(1)      length
    ## - O(length) iteration
    ##
    ## Space: Capacity * sizeof(words)
    ##
    ## This is contrary to bitsets which requires:
    ## - random picking: multiple random "contains" + a fallback to uncompressing the set
    ## - O(Capacity/sizeof(words)) length (via popcounts)
    ## - O(capacity) iteration
    ##
    ## Layout invariant, for every `v` in the set:
    ##   `values[indices[v]] == v` and `indices[v] < len`
    ## and for every `v` not in the set: `indices[v] == Empty`.
    indices: ptr UncheckedArray[Setuint]   # value    -> position in `values`, or Empty
    values: ptr UncheckedArray[Setuint]    # position -> value, dense in [0, len)
    rawBuffer: ptr UncheckedArray[Setuint] # single allocation backing both arrays
    len*: Setuint
    capacity*: Setuint

func allocate*(s: var SparseSet, capacity: SomeInteger) {.inline.} =
  ## Allocate the backing storage for a set over [0 .. capacity).
  ## The set starts empty; call `refill` to include every integer.
  preCondition: capacity <= TP_MaxWorkers

  s.capacity = Setuint capacity
  s.rawBuffer = tp_alloc(Setuint, 2*capacity)
  s.indices = s.rawBuffer
  s.values = cast[ptr UncheckedArray[Setuint]](s.rawBuffer[capacity].addr)

  # The buffer is not guaranteed to be zeroed: mark every slot as absent
  # so that `contains` is well-defined even before `refill`.
  s.len = 0
  for i in 0 ..< int(capacity):
    s.indices[i] = Empty

func delete*(s: var SparseSet) {.inline.} =
  ## Release the backing storage. The set must be `allocate`d again before reuse.
  s.indices = nil
  s.values = nil
  c_free(s.rawBuffer)
  # Avoid leaving a dangling pointer behind (guards against double free)
  s.rawBuffer = nil
  s.len = 0

func refill*(s: var SparseSet) {.inline.} =
  ## Reset the sparseset by including all integers
  ## in the range [0 .. Capacity)
  preCondition: not s.indices.isNil
  preCondition: not s.values.isNil
  preCondition: not s.rawBuffer.isNil
  preCondition: s.capacity != 0

  s.len = s.capacity

  for i in Setuint(0) ..< s.len:
    s.indices[i] = i
    s.values[i] = i

func isEmpty*(s: SparseSet): bool {.inline.} =
  ## True when no integer is in the set.
  s.len == 0

func contains*(s: SparseSet, n: SomeInteger): bool {.inline.} =
  ## True when `n` is in the set. `n` must be in [0 .. Capacity).
  assert n.int != Empty.int
  s.indices[n] != Empty

func incl*(s: var SparseSet, n: SomeInteger) {.inline.} =
  ## Add `n` to the set. No-op if it is already present.
  preCondition: n.int < Empty.int

  if n in s: return

  preCondition: s.len < s.capacity

  s.indices[n] = s.len
  # Explicit conversion so that `incl` accepts any integer type, like `excl`
  s.values[s.len] = Setuint(n)
  s.len += 1

func peek*(s: SparseSet): int32 {.inline.} =
  ## Returns the last point in the set
  ## Note: if an item is deleted this is not the last inserted point
  preCondition: s.len.int > 0
  int32 s.values[s.len - 1]

func excl*(s: var SparseSet, n: SomeInteger) {.inline.} =
  ## Remove `n` from the set. No-op if it is absent.
  if n notin s: return

  # We do constant time deletion by replacing the deleted
  # integer by the last value in the array of values

  let delIdx = s.indices[n]

  s.len -= 1
  let lastVal = s.values[s.len]

  s.indices[lastVal] = delIdx         # Last value now points to deleted index
  # The freed slot receives the last value itself.
  # (`s.values[lastVal]` would index `values` by a *value* instead of a
  #  position and corrupts the set once `incl` has reordered it.)
  s.values[delIdx] = lastVal

  # Erase the item
  s.indices[n] = Empty

func randomPick*(s: SparseSet, rng: var Rand): int {.inline.} =
  ## Randomly pick from the set.
  ## The set must not be empty.
  # The value is NOT removed from it.
  preCondition: s.len.int > 0 # `s.len-1` would wrap around on an empty set
  let pickIdx = rng.rand(s.len-1)
  result = s.values[pickIdx].int

func `$`*(s: SparseSet): string =
  ## Render the values currently in the set, in storage order.
  $toOpenArray(s.values, 0, s.len.int - 1)

# Sanity checks
# ------------------------------------------------------------------------------

when isMainModule:

  const Size = 10
  const Picked = 5

  var S: SparseSet
  S.allocate(Size)
  S.refill()
  echo S

  var rngState = initRand(123)
  var picked: seq[int]

  for _ in 0 ..< Picked:
    let p = S.randomPick(rngState)
    picked.add p
    S.excl p
    echo "---"
    echo "picked: ", p
    echo "S indices: ", toOpenArray(S.indices, 0, S.capacity.int - 1)

  echo "---"
  echo "picked: ", picked
  echo "S: ", S
  echo "S indices: ", toOpenArray(S.indices, 0, S.capacity.int - 1)

  for x in 0 ..< Size:
    if x notin picked:
      echo x, " notin picked -> in S"
      doAssert x in S
    else:
      echo x, " in picked -> notin S"
      doAssert x notin S

  S.delete()

  # Regression: `excl` after an `incl` must move the last *value*
  # into the freed slot.
  block:
    var R: SparseSet
    R.allocate(3)
    doAssert R.isEmpty()
    R.refill()
    R.excl 0
    R.incl 0
    R.excl 1
    doAssert $R == "[2, 0]"
    doAssert 0 in R
    doAssert 1 notin R
    doAssert 2 in R
    R.delete()

# --------------------------------------------------------------------------------
# /taskpools/taskpools.nim:
# --------------------------------------------------------------------------------
# taskpools
# Copyright (c) 2021 Status Research & Development GmbH
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# Taskpools
#
# This file implements a taskpool
#
# Implementation:
#
# It is a simple shared memory based work-stealing threadpool.
# The primary focus is:
# - Delegate compute intensive tasks to the threadpool.
17 | # - Simple to audit by staying close to foundational papers 18 | # and using simple datastructures otherwise. 19 | # - Low energy consumption: 20 | # threads should be put to sleep ASAP 21 | # instead of polling/spinning (energy vs latency tradeoff) 22 | # - Decent performance: 23 | # Work-stealing has optimal asymptotic parallel speedup. 24 | # Work-stealing has significantly reduced contention 25 | # when many tasks are created, 26 | # for example by divide-and-conquer algorithms, compared to a global task queue 27 | # 28 | # Not a priority: 29 | # - Handling trillions of very short tasks (less than 100µs). 30 | # - Advanced task dependencies or events API. 31 | # - Unbalanced parallel-for loops. 32 | # - Handling services that should run for the lifetime of the program. 33 | # 34 | # Doing IO on a compute threadpool should be avoided 35 | # In case a thread is blocked for IO, other threads can steal pending tasks in that thread. 36 | # If all threads are pending for IO, the threadpool will not make any progress and be soft-locked. 

{.push raises: [], gcsafe.} # Ensure no exceptions can happen

import
  system/ansi_c,
  std/[random, cpuinfo, atomics, macros],
  ./channels_spsc_single,
  ./chase_lev_deques,
  ./event_notifiers,
  ./primitives/[barriers, allocs],
  ./instrumentation/[contracts, loggers],
  ./sparsesets,
  ./flowvars,
  ./ast_utils,
  ./tasks

export
  # flowvars
  Flowvar, isSpawned, isReady, sync, tasks


type
  WorkerID = int32

  TaskNode = ptr object
    # Linked list of tasks
    parent: TaskNode
    task: Task

  Signal = object
    terminate {.align: 64.}: Atomic[bool]

  WorkerContext = object
    ## Thread-local worker context

    # Params
    id: WorkerID
    taskpool: Taskpool

    # Tasks
    taskDeque: ptr ChaseLevDeque[TaskNode]         # owned task deque
    currentTask: TaskNode

    # Synchronization
    eventNotifier: ptr EventNotifier               # shared event notifier
    signal: ptr Signal                             # owned signal

    # Thefts
    rng: Rand                                      # RNG state to select victims
    otherDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
    victims: SparseSet

  Taskpool* = ptr object
    ## A taskpool schedules procedures to be executed in parallel
    barrier: SyncBarrier
      ## Barrier for initialization and teardown
    # --- Align: 64
    eventNotifier: EventNotifier
      ## Puts thread to sleep

    numThreads*{.align: 64.}: int
    workerDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
      ## Direct access for task stealing
    workers: ptr UncheckedArray[Thread[(Taskpool, WorkerID)]]
    workerSignals: ptr UncheckedArray[Signal]
      ## Access signaledTerminate

# Thread-local config
# ---------------------------------------------

var workerContext {.threadvar.}: WorkerContext
  ## Thread-local Worker context

proc setupWorker() =
  ## Initialize the thread-local context of a worker
  ## Requires the ID and taskpool fields to be initialized
  template ctx: untyped = workerContext

  preCondition: not ctx.taskpool.isNil()
  preCondition: 0 <= ctx.id and ctx.id < ctx.taskpool.numThreads
  preCondition: not ctx.taskpool.workerDeques.isNil()
  preCondition: not ctx.taskpool.workerSignals.isNil()

  # Thefts
  # The seed is offset by the worker id so that each worker gets its own seed
  ctx.rng = initRand(0xEFFACED + ctx.id)
  ctx.otherDeques = ctx.taskpool.workerDeques
  ctx.victims.allocate(ctx.taskpool.numThreads)

  # Synchronization
  ctx.eventNotifier = addr ctx.taskpool.eventNotifier
  ctx.signal = addr ctx.taskpool.workerSignals[ctx.id]
  ctx.signal.terminate.store(false, moRelaxed)

  # Tasks
  ctx.taskDeque = addr ctx.taskpool.workerDeques[ctx.id]
  ctx.currentTask = nil

  # Init
  ctx.taskDeque[].init(initialCapacity = 32)

proc teardownWorker() =
  ## Cleanup the thread-local context of a worker
  ## Releases what `setupWorker` initialized: the owned deque and the victim set.
  template ctx: untyped = workerContext
  ctx.taskDeque[].teardown()
  ctx.victims.delete()

proc eventLoop(ctx: var WorkerContext) {.raises:[].}

proc workerEntryFn(params: tuple[taskpool: Taskpool, id: WorkerID]) =
  ## On the start of the threadpool workers will execute this
  ## until they receive a termination signal
  # We assume that thread_local variables start all at their binary zero value
  preCondition: workerContext == default(WorkerContext)

  template ctx: untyped = workerContext

  # If the following crashes, you need --tlsEmulation:off
  ctx.id = params.id
  ctx.taskpool = params.taskpool

  setupWorker()

  # 1 matching barrier in Taskpool.new() for root thread
  discard params.taskpool.barrier.wait()

  {.gcsafe.}: # Not GC-safe when multi-threaded due to thread-local variables
    ctx.eventLoop()

  debugTermination:
    log(">>> Worker %2d shutting down <<<\n", ctx.id)

  # 1 matching barrier in taskpool.shutdown() for root thread
  discard params.taskpool.barrier.wait()

  teardownWorker()

# Tasks
# ---------------------------------------------

proc new(T: type TaskNode, parent: TaskNode, task: sink Task): T =
  ## Allocate a task node linked to `parent` and move `task` into it.
  ## The node is released by `runTask` (or `c_free` for the root sentinel).
  var tn = tp_allocPtr(TaskNode)
  tn.parent = parent
  wasMoved(tn.task) # tn.task is uninitialized, prevent Nim from running the Task destructor
  tn.task = task
  return tn

proc runTask(tn: var TaskNode) {.inline.} =
  ## Run a task and consumes the taskNode
  ## After this call `tn` points to freed memory and must not be used.
  tn.task.invoke()
  {.gcsafe.}: # Upstream missing tagging `=destroy` as gcsafe
    tn.task.`=destroy`()
  tn.c_free()

proc schedule(ctx: WorkerContext, tn: sink TaskNode) {.inline.} =
  ## Schedule a task in the taskpool
  ## The task is pushed on the worker's own deque, then the shared
  ## event notifier is notified.
  debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
  ctx.taskDeque[].push(tn)
  ctx.taskpool.eventNotifier.notify()

# Scheduler
# ---------------------------------------------

proc trySteal(ctx: var WorkerContext): TaskNode =
  ## Try to steal a task.
  ## Victims are picked at random among the other workers, each one at most
  ## once per call. Returns nil when no victim yielded a task.

  ctx.victims.refill()
  ctx.victims.excl(ctx.id) # never steal from ourselves

  while not ctx.victims.isEmpty():
    let target = ctx.victims.randomPick(ctx.rng)

    let stolenTask = ctx.otherDeques[target].steal()
    if not stolenTask.isNil:
      return stolenTask

    # Nothing stolen from this victim, don't try it again in this round
    ctx.victims.excl(target)

  return nil

proc eventLoop(ctx: var WorkerContext) =
  ## Each worker thread executes this loop over and over.
  ## The loop exits once the worker's `terminate` signal is set.
  while not ctx.signal.terminate.load(moRelaxed):
    # 1. Pick from local deque
    debug: log("Worker %2d: eventLoop 1 - searching task from local deque\n", ctx.id)
    while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
      debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
      taskNode.runTask()

    # 2. Run out of tasks, become a thief
    debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
    var stolenTask = ctx.trySteal()
    if not stolenTask.isNil:
      # 2.a Run task
      debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
      stolenTask.runTask()
    else:
      # 2.b Park the thread until a new task enters the taskpool
      debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
      ctx.eventNotifier[].park()
      debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)

# Tasking
# ---------------------------------------------

const RootTask = default(Task) # TODO: sentinel value different from null task

template isRootTask(task: Task): bool =
  task == RootTask

proc forceFuture*[T](fv: Flowvar[T], parentResult: var T) =
  ## Eagerly complete an awaited FlowVar
  ## Returns once the result has been received into `parentResult`;
  ## meanwhile the calling thread runs local, then stolen, tasks.

  template ctx: untyped = workerContext

  template isFutReady(): untyped =
    fv.chan[].tryRecv(parentResult)

  if isFutReady():
    return

  ## 1. Process all the children of the current tasks.
  ##    This ensures that we can give control back ASAP.
  debug: log("Worker %2d: sync 1 - searching task from local deque\n", ctx.id)
  while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
    if taskNode.parent != ctx.currentTask:
      debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
      # Put it back: it was popped only to inspect its parent
      ctx.schedule(taskNode)
      break
    debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
    taskNode.runTask()
    if isFutReady():
      debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id)
      return

  ## 2. We run out-of-tasks or out-of-direct-child of our current awaited task
  ##    So the task is bottlenecked by dependencies in other threads,
  ##    hence we abandon our enqueued work and steal in the others' queues
  ##    in hope it advances our awaited task. This prioritizes latency over throughput.
  debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
  while not isFutReady():
    var taskNode = ctx.trySteal()

    if not taskNode.isNil:
      # We stole some task, we hope we advance our awaited task
      debug: log("Worker %2d: sync 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
      taskNode.runTask()
    # elif (taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
    #   # We advance our own queue, this increases throughput but may impact latency on the awaited task
    #   debug: log("Worker %2d: sync 2.2 - couldn't steal, running own task\n", ctx.id)
    #   taskNode.runTask()
    else:
      # We don't park as there is no notif for task completion
      cpuRelax()

proc syncAll*(tp: Taskpool) =
  ## Blocks until all pending tasks are completed
  ## This MUST only be called from
  ## the root scope that created the taskpool
  template ctx: untyped = workerContext

  debugTermination:
    log(">>> Worker %2d enters barrier <<<\n", ctx.id)

  preCondition: ctx.id == 0
  preCondition: ctx.currentTask.task.isRootTask()

  # Empty all tasks
  var foreignThreadsParked = false
  while not foreignThreadsParked:
    # 1. Empty local tasks
    debug: log("Worker %2d: syncAll 1 - searching task from local deque\n", ctx.id)
    while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
      debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
      taskNode.runTask()

    # With a single thread there is nobody to steal from nor to wait for
    if tp.numThreads == 1 or foreignThreadsParked:
      break

    # 2. Help other threads
    debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
    var taskNode = ctx.trySteal()

    if not taskNode.isNil:
      # 2.1 We stole some task
      debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
      taskNode.runTask()
    else:
      # 2.2 No task to steal
      if tp.eventNotifier.getParked() == tp.numThreads - 1:
        # 2.2.1 all threads besides the current are parked
        debugTermination:
          log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
        foreignThreadsParked = true
      else:
        # 2.2.2 We don't park as there is no notif for task completion
        cpuRelax()

  debugTermination:
    log(">>> Worker %2d leaves barrier <<<\n", ctx.id)

# Runtime
# ---------------------------------------------

proc new*(T: type Taskpool, numThreads = countProcessors()): T {.raises: [CatchableError].} =
  ## Initialize a threadpool that manages `numThreads` threads.
  ## Default to the number of logical processors available.
  ##
  ## The calling thread becomes worker 0; `numThreads - 1` extra threads are created.

  type TpObj = typeof(default(Taskpool)[])
  # Event notifier requires an extra 64 bytes for alignment
  var tp = tp_allocAligned(TpObj, sizeof(TpObj) + 64, 64)

  tp.barrier.init(numThreads.int32)
  tp.eventNotifier.initialize()
  tp.numThreads = numThreads
  tp.workerDeques = tp_allocArrayAligned(ChaseLevDeque[TaskNode], numThreads, alignment = 64)
  tp.workers = tp_allocArrayAligned(Thread[(Taskpool, WorkerID)], numThreads, alignment = 64)
  tp.workerSignals = tp_allocArrayAligned(Signal, numThreads, alignment = 64)

  # Setup master thread
  workerContext.id = 0
  workerContext.taskpool = tp

  # Start worker threads
  # (index 0 is the calling thread, hence the loop starts at 1)
  for i in 1 ..< numThreads:
    createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))

  # Root worker
  setupWorker()

  # Root task, this is a sentinel task that is never called.
  workerContext.currentTask = TaskNode.new(
    parent = nil,
    task = default(Task) # TODO RootTask, somehow this uses `=copy`
  )

  # Wait for the child threads
  discard tp.barrier.wait()
  return tp

proc cleanup(tp: var Taskpool) =
  ## Cleanup all resources allocated by the taskpool
  ## Joins the worker threads first; `tp` is freed and must not be used afterwards.
  preCondition: workerContext.currentTask.task.isRootTask()

  for i in 1 ..< tp.numThreads:
    joinThread(tp.workers[i])

  tp.workerSignals.tp_freeAligned()
  tp.workers.tp_freeAligned()
  tp.workerDeques.tp_freeAligned()
  `=destroy`(tp.eventNotifier)
  tp.barrier.delete()

  tp.tp_freeAligned()

proc shutdown*(tp: var Taskpool) =
  ## Wait until all tasks are processed and then shutdown the taskpool
  preCondition: workerContext.currentTask.task.isRootTask()
  tp.syncAll()

  # Signal termination to all threads
  for i in 0 ..< tp.numThreads:
    tp.workerSignals[i].terminate.store(true, moRelaxed)

  # One notification per parked thread
  let parked = tp.eventNotifier.getParked()
  for i in 0 ..< parked:
    tp.eventNotifier.notify()

  # 1 matching barrier in worker_entry_fn
  discard tp.barrier.wait()

  teardownWorker()
  tp.cleanup()

  # Dealloc dummy task
  workerContext.currentTask.c_free()

# Task parallelism
# ---------------------------------------------
{.pop.} # raises:[]

macro spawn*(tp: Taskpool, fnCall: typed): untyped =
  ## Spawns the input function call asynchronously, potentially on another thread of execution.
  ##
  ## If the function calls returns a result, spawn will wrap it in a Flowvar.
  ## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar.
  ## You can use `isReady` to check if result is available and if subsequent
  ## `spawn` returns immediately.
  ##
  ## Tasks are processed approximately in Last-In-First-Out (LIFO) order
  result = newStmtList()

  let fn = fnCall[0]
  let fnName = $fn

  # Get the return type if any
  let retType = fnCall[0].getImpl[3][0]
  let needFuture = retType.kind != nnkEmpty

  # Package in a task
  let taskNode = ident("taskNode")
  if not needFuture:
    result.add quote do:
      let `taskNode` = TaskNode.new(workerContext.currentTask, toTask(`fnCall`))
      schedule(workerContext, `taskNode`)

  else:
    # tasks have no return value.
    # 1. We create a channel/flowvar to transmit the return value to awaiter/sync
    # 2. We create a wrapper async_fn without return value that send the return value in the channel
    # 3. We package that wrapper function in a task

    # 1. Create the channel
    let fut = ident("fut")
    let futTy = nnkBracketExpr.newTree(
      bindSym"FlowVar",
      retType
    )
    result.add quote do:
      let `fut` = newFlowVar(type `retType`)

    # 2. Create a wrapper function that sends result to the channel
    # TODO, upstream "getImpl" doesn't return the generic params
    let genericParams = fn.getImpl()[2].replaceSymsByIdents()
    let formalParams = fn.getImpl()[3].replaceSymsByIdents()

    var asyncParams = nnkFormalParams.newTree(
      newEmptyNode()
    )
    var fnCallIdents = nnkCall.newTree(
      fnCall[0]
    )
    for i in 1 ..< formalParams.len:
      let ident = formalParams[i].replaceSymsByIdents()
      asyncParams.add ident
      # An IdentDefs node is [name0, name1, ..., type, default]:
      # the last two children are not parameter names
      for j in 0 ..< ident.len - 2:
        # Handle "a, b: int"
        fnCallIdents.add ident[j]

    let futFnParam = ident("fut")
    asyncParams.add newIdentDefs(futFnParam, futTy)

    let asyncBody = quote do:
      # XXX: can't test that when the RootTask is default(Task) instead of a sentinel value
      # preCondition: not isRootTask(workerContext.currentTask.task)

      let res = `fnCallIdents`
      readyWith(`futFnParam`, res)

    let asyncFn = ident("taskpool_" & fnName)
    result.add nnkProcDef.newTree(
      asyncFn,
      newEmptyNode(),
      genericParams,
      asyncParams,
      nnkPragma.newTree(ident("nimcall")),
      newEmptyNode(),
      asyncBody
    )

    # 3. Call the wrapper with the original arguments plus the flowvar
    var asyncCall = newCall(asyncFn)
    for i in 1 ..< fnCall.len:
      asyncCall.add fnCall[i].replaceSymsByIdents()
    asyncCall.add fut

    result.add quote do:
      let `taskNode` = TaskNode.new(workerContext.currentTask, toTask(`asyncCall`))
      schedule(workerContext, `taskNode`)

      # Return the future / flowvar
      `fut`

  # Wrap in a block for namespacing
  result = nnkBlockStmt.newTree(newEmptyNode(), result)
  # echo result.toStrLit()

# --------------------------------------------------------------------------------
# /taskpools/tasks.nim:
# --------------------------------------------------------------------------------
# (c) Copyright 2021 Nim contributors
# Copyright
# (c) 2023- Status Research & Development GmbH

## This module provides basic primitives for creating parallel programs.
## A `Task` should be only owned by a single Thread, it cannot be shared by threads.
##
## The module was forked from std/tasks in Nim 1.6 to add new functionality and
## tune to the taskpools use case.

import std/[macros, isolation, typetraits]
import system/ansi_c

export isolation


when compileOption("threads"):
  from std/effecttraits import isGcSafe


#
# proc hello(a: int, b: string) =
#   echo $a & b
#
# let literal = "Nim"
# let t = toTask(hello(521, literal))
#
#
# is roughly converted to
#
# type
#   ScratchObj_369098780 = object
#     a: int
#     b: string
#
# let scratch_369098762 = cast[ptr ScratchObj_369098780](c_calloc(csize_t 1,
#     csize_t sizeof(ScratchObj_369098780)))
# if scratch_369098762.isNil:
#   raise newException(OutOfMemDefect, "Could not allocate memory")
# block:
#   var isolate_369098776 = isolate(521)
#   scratch_369098762.a = extract(isolate_369098776)
#   var isolate_369098778 = isolate(literal)
#   scratch_369098762.b = extract(isolate_369098778)
# proc hello_369098781(args`gensym3: pointer) {.nimcall.} =
#   let objTemp_369098775 = cast[ptr ScratchObj_369098780](args`gensym3)
#   let :tmp_369098777 = objTemp_369098775.a
#   let :tmp_369098779 = objTemp_369098775.b
#   hello(a = :tmp_369098777, b = :tmp_369098779)
#
# proc destroyScratch_369098782(args`gensym3: pointer) {.nimcall.} =
#   let obj_369098783 = cast[ptr ScratchObj_369098780](args`gensym3)
#   =destroy(obj_369098783[])
# let t = Task(callback: hello_369098781, args: scratch_369098762, destroy: destroyScratch_369098782)
#

{.push raises: [], gcsafe.}

type
  Task* = object ## `Task` contains the callback and its arguments.
    callback: proc (args: pointer) {.nimcall, gcsafe, raises: [].}
      # Wrapper generated by `toTask` that unpacks `args` and performs the call
    args: pointer
      # Scratch object holding the call arguments, nil when there are none
    destroy: proc (args: pointer) {.nimcall, gcsafe, raises: [].}
      # Runs the destructor of the scratch object behind `args`


# Copying a Task is a compile-time error
proc `=copy`*(x: var Task, y: Task) {.error.}

proc `=destroy`*(t: var Task) {.inline.} =
  ## Frees the resources allocated for a `Task`.
  if t.args != nil:
    if t.destroy != nil:
      t.destroy(t.args)
    # The scratch object itself was obtained with c_calloc in `toTask`
    c_free(t.args)

proc invoke*(task: Task) {.inline.} =
  ## Invokes the `task`.
  assert task.callback != nil
  task.callback(task.args)

template checkIsolate(scratchAssignList: seq[NimNode], procParam, scratchDotExpr: NimNode) =
  ## Appends to `scratchAssignList` the statements that isolate `procParam`
  ## and store the extracted value into `scratchDotExpr`.
  # block:
  #   var isoTempA = isolate(521)
  #   scratch.a = extract(isolateA)
  #   var isoTempB = isolate(literal)
  #   scratch.b = extract(isolateB)
  let isolatedTemp = genSym(nskTemp, "isoTemp")
  scratchAssignList.add newVarStmt(isolatedTemp, newCall(newIdentNode("isolate"), procParam))
  scratchAssignList.add newAssignment(scratchDotExpr,
      newCall(newIdentNode("extract"), isolatedTemp))

template addAllNode(assignParam: NimNode, procParam: NimNode) =
  ## Registers parameter `i` as a scratch-object field of type `assignParam`
  ## initialized from `procParam`, and forwards it to the final call.
  ## Relies on `scratchIdent`, `formalParams`, `i`, `scratchAssignList`,
  ## `callNode`, `tempAssignList`, `objTemp` and `scratchRecList`
  ## from the expansion site (see `toTask`).
  let scratchDotExpr = newDotExpr(scratchIdent, formalParams[i][0])

  checkIsolate(scratchAssignList, procParam, scratchDotExpr)

  let tempNode = genSym(kind = nskTemp, ident = formalParams[i][0].strVal)
  callNode.add nnkExprEqExpr.newTree(formalParams[i][0], tempNode)
  tempAssignList.add newLetStmt(tempNode, newDotExpr(objTemp, formalParams[i][0]))
  scratchRecList.add newIdentDefs(newIdentNode(formalParams[i][0].strVal), assignParam)

macro toTask*(e: typed{nkCall | nkInfix | nkPrefix | nkPostfix | nkCommand | nkCallStrLit}): Task =
  ## Converts the call and its arguments to `Task`.
  runnableExamples("--gc:orc"):
    proc hello(a: int) = echo a

    let b = toTask hello(13)
    assert b is Task

  # Only calls without a return value can be turned into a Task
  doAssert getTypeInst(e).typeKind == ntyVoid

  when compileOption("threads"):
    if not isGcSafe(e[0]):
      error("'toTask' takes a GC safe call expression", e)

  if hasClosure(e[0]):
    error("closure call is not allowed", e)

  if e.len > 1:
    # The call has arguments: they are stored in a heap-allocated scratch object
    let scratchIdent = genSym(kind = nskTemp, ident = "scratch")
    let impl = e[0].getTypeInst

    when defined(nimTasksDebug):
      echo impl.treeRepr
      echo e.treeRepr
    let formalParams = impl[0]

    var
      scratchRecList = newNimNode(nnkRecList) # fields of the scratch object
      scratchAssignList: seq[NimNode]         # isolate + store of each argument
      tempAssignList: seq[NimNode]            # loads of each field in the callback
      callNode: seq[NimNode]                  # named arguments of the final call

    let
      objTemp = genSym(nskTemp, ident = "objTemp")

    for i in 1 ..< formalParams.len:
      var param = formalParams[i][1]

      # NOTE(review): for `sink T` this selects child 0 of the bracket
      # expression (the `sink` symbol) - confirm `param[1]` is not intended.
      if param.kind == nnkBracketExpr and param[0].eqIdent("sink"):
        param = param[0]

      if param.typeKind in {ntyExpr, ntyStmt}:
        error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter", e)

      case param.kind
      of nnkVarTy:
        error("'toTask'ed function cannot have a 'var' parameter", e)
      of nnkBracketExpr:
        if param[0].typeKind == ntyTypeDesc:
          callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
        elif param[0].typeKind in {ntyVarargs, ntyOpenArray}:
          if param[1].typeKind in {ntyExpr, ntyStmt}:
            error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter", e)
          # varargs/openArray arguments are stored as a `seq` built with `@`
          let
            seqType = nnkBracketExpr.newTree(newIdentNode("seq"), param[1])
            seqCallNode = newCall("@", e[i])
          addAllNode(seqType, seqCallNode)
        else:
          addAllNode(param, e[i])
      of nnkBracket, nnkObjConstr:
        # passing by static parameters
        # so we pass them directly instead of passing by scratchObj
        callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
      of nnkSym, nnkPtrTy:
        addAllNode(param, e[i])
      of nnkCharLit..nnkNilLit:
        callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
      else:
        error("'toTask'ed function cannot have a parameter of " & $param.kind & " kind", e)

    let scratchObjType = genSym(kind = nskType, ident = "ScratchObj")
    let scratchObj = nnkTypeSection.newTree(
                      nnkTypeDef.newTree(
                        scratchObjType,
                        newEmptyNode(),
                        nnkObjectTy.newTree(
                          newEmptyNode(),
                          newEmptyNode(),
                          scratchRecList
                        )
                      )
                    )


    # Zero-initialized allocation of the scratch object
    let scratchObjPtrType = quote do:
      cast[ptr `scratchObjType`](c_calloc(csize_t 1, csize_t sizeof(`scratchObjType`)))

    let scratchLetSection = newLetStmt(
      scratchIdent,
      scratchObjPtrType
    )

    let scratchCheck = quote do:
      if `scratchIdent`.isNil:
        raise newException(OutOfMemDefect, "Could not allocate memory")

    var stmtList = newStmtList()
    stmtList.add(scratchObj)
    stmtList.add(scratchLetSection)
    stmtList.add(scratchCheck)
    stmtList.add(nnkBlockStmt.newTree(newEmptyNode(), newStmtList(scratchAssignList)))

    var functionStmtList = newStmtList()
    let funcCall = newCall(e[0], callNode)
    functionStmtList.add tempAssignList
    functionStmtList.add funcCall

    let funcName = genSym(nskProc, e[0].strVal)
    let destroyName = genSym(nskProc, "destroyScratch")
    let objTemp2 = genSym(ident = "obj")
    # "@" is used as the interpolation operator because the quoted code
    # itself contains backticks
    let tempNode = quote("@") do:
      `=destroy`(@objTemp2[])

    result = quote do:
      `stmtList`

      proc `funcName`(args: pointer) {.gcsafe, nimcall, raises: [].} =
        let `objTemp` = cast[ptr `scratchObjType`](args)
        `functionStmtList`

      proc `destroyName`(args: pointer) {.gcsafe, nimcall, raises: [].} =
        let `objTemp2` = cast[ptr `scratchObjType`](args)
        `tempNode`

      Task(callback: `funcName`, args: `scratchIdent`, destroy: `destroyName`)
  else:
    # No argument: nothing to store, `args` stays nil
    let funcCall = newCall(e[0])
    let funcName = genSym(nskProc, e[0].strVal)

    result = quote do:
      proc `funcName`(args: pointer) {.gcsafe, nimcall, raises: [].} =
        `funcCall`

      Task(callback: `funcName`, args: nil)

  when defined(nimTasksDebug):
    echo result.repr

when isMainModule:
  block:
    var num = 0
    proc hello(a: int) = inc num, a

    let b = toTask hello(13)
    b.invoke()
    assert num == 13
    # A task can be invoked multiple times
    b.invoke()
    assert num == 26

  block:
    type
      Runnable = ref object
        data: int

    var data: int
    proc hello(a: Runnable) {.nimcall.} =
      a.data += 2
      data = a.data


    when false:
      # the parameters of call must be isolated.
      let x = Runnable(data: 12)
      let b = toTask hello(x) # error ----> expression cannot be isolated: x
      b.invoke()

    let b = toTask(hello(Runnable(data: 12)))
    b.invoke()
    assert data == 14
    b.invoke()
    assert data == 16

# --------------------------------------------------------------------------------