├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── LICENSE-APACHEv2
├── LICENSE-MIT
├── README.md
├── benchmarks
    ├── bouncing_producer_consumer
    │   ├── README.md
    │   └── taskpool_bpc.nim
    ├── dfs
    │   └── taskpool_dfs.nim
    ├── fibonacci
    │   ├── README.md
    │   ├── stdnim_fib.nim
    │   └── taskpool_fib.nim
    ├── heat
    │   ├── stdnim_heat.nim
    │   └── taskpool_heat.nim
    ├── matmul_cache_oblivious
    │   ├── README.md
    │   └── taskpool_matmul_co.nim
    ├── nqueens
    │   ├── stdnim_nqueens.nim
    │   └── taskpool_nqueens.nim
    ├── resources.nim
    ├── single_task_producer
    │   ├── README.md
    │   └── taskpool_spc.nim
    ├── wtime.h
    └── wtime.nim
├── doc
    └── README.md
├── examples
    ├── e01_simple_tasks.nim
    └── e02_parallel_pi.nim
├── papers
    ├── Chase-Lev - Dynamic Circular Work-Stealing Deque.pdf
    └── Nhat Minh Le et al - Correct and Efficient Work-Stealing for Weak Memory Models.pdf
├── taskpools.nim
├── taskpools.nimble
└── taskpools
    ├── ast_utils.nim
    ├── channels_spsc_single.nim
    ├── chase_lev_deques.nim
    ├── event_notifiers.nim
    ├── flowvars.nim
    ├── instrumentation
        ├── contracts.nim
        └── loggers.nim
    ├── primitives
        ├── allocs.nim
        ├── barriers.md
        ├── barriers.nim
        ├── barriers_macos.nim
        ├── barriers_posix.nim
        └── barriers_windows.nim
    ├── sparsesets.nim
    ├── taskpools.nim
    └── tasks.nim


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - stable
 6 |       - unstable
 7 |   pull_request:
 8 |   workflow_dispatch:
 9 | 
10 | jobs:
11 |   build:
12 |     uses: status-im/nimbus-common-workflow/.github/workflows/common.yml@main
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | nimcache/
2 | 
3 | # Executables shall be put in an ignored build/ directory
4 | build/
5 | nimble.develop
6 | nimble.paths
7 | 


--------------------------------------------------------------------------------
/LICENSE-APACHEv2:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2021-Present Status Research & Development GmbH
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2021-Present Status Research & Development GmbH
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Taskpools
  2 | 
  3 | This library implements a lightweight, energy-efficient, easily auditable multithreaded taskpool.
  4 | 
  5 | This taskpool will be used in a highly security-sensitive blockchain application
  6 | targeted at resource-restricted devices, hence the desirable properties are:
  7 | 
  8 | - Ease of auditing and maintenance.
  9 |   - Formally verified synchronization primitives are highly-sought after.
 10 |   - Otherwise primitives are implemented from papers or ported from proven codebases
 11 |     that can serve as reference for auditors.
 12 | - Resource-efficient. Threads spindown to save power, low memory use.
 13 | - Decent performance and scalability. The CPU should spend its time processing user workloads
 14 |   and not dealing with threadpool contention, latencies and overheads.
 15 | 
 16 | ## Example usage
 17 | 
 18 | ```Nim
 19 | # Demo of the API using a very inefficient π approximation algorithm.
 20 | 
 21 | import
 22 |   std/[strutils, math, cpuinfo],
 23 |   taskpools
 24 | 
 25 | # From https://github.com/nim-lang/Nim/blob/v1.6.2/tests/parallel/tpi.nim
 26 | # Leibniz Formula https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
 27 | proc term(k: int): float =
 28 |   if k mod 2 == 1:
 29 |     -4'f / float(2*k + 1)
 30 |   else:
 31 |     4'f / float(2*k + 1)
 32 | 
 33 | proc piApprox(tp: Taskpool, n: int): float =
 34 |   var pendingFuts = newSeq[FlowVar[float]](n)
 35 |   for k in 0 ..< pendingFuts.len:
 36 |     pendingFuts[k] = tp.spawn term(k) # Schedule a task on the threadpool and return a handle to retrieve the result.
 37 |   for k in 0 ..< pendingFuts.len:
 38 |     result += sync pendingFuts[k]     # Block until the result is available.
 39 | 
 40 | proc main() =
 41 |   var n = 1_000_000
 42 |   var nthreads = countProcessors()
 43 | 
 44 |   var tp = Taskpool.new(num_threads = nthreads) # Default to the number of hardware threads.
 45 | 
 46 |   echo formatFloat(tp.piApprox(n))
 47 | 
 48 |   tp.syncAll()                                  # Block until all pending tasks are processed (implied in tp.shutdown())
 49 |   tp.shutdown()
 50 | 
 51 | # Compile with nim c -r -d:release --threads:on --outdir:build example.nim
 52 | main()
 53 | ```
 54 | 
 55 | ## API
 56 | 
 57 | The API follows the spec proposed here https://github.com/nim-lang/RFCs/issues/347#task-parallelism-api
 58 | 
 59 | The following types and procedures are exposed:
 60 | 
 61 | - Taskpool:
 62 |   - ```Nim
 63 |     type Taskpool* = ptr object
 64 |       ## A taskpool schedules procedures to be executed in parallel
 65 |     ```
 66 |   - ```Nim
 67 |     proc new(T: type Taskpool, numThreads = countProcessors()): T
 68 |       ## Initialize a threadpool that manages `numThreads` threads.
 69 |       ## Default to the number of logical processors available.
 70 |     ```
 71 |   - ```Nim
 72 |     proc syncAll*(pool: Taskpool) =
 73 |       ## Blocks until all pending tasks are completed.
 74 |       ##
 75 |       ## This MUST only be called from
 76 |       ## the root thread that created the taskpool
 77 |     ```
 78 |   - ```Nim
 79 |     proc shutdown*(tp: var TaskPool) =
 80 |       ## Wait until all tasks are completed and then shutdown the taskpool.
 81 |       ##
 82 |       ## This MUST only be called from
 83 |       ## the root scope that created the taskpool.
 84 |     ```
 85 |   - ```Nim
 86 |     macro spawn*(tp: TaskPool, fnCall: typed): untyped =
 87 |       ## Spawns the input function call asynchronously, potentially on another thread of execution.
 88 |       ##
 89 |       ## If the function call returns a result, spawn will wrap it in a Flowvar.
 90 |       ## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar.
 91 |       ## You can use `isReady` to check if the result is available, i.e. whether a subsequent
 92 |       ## `sync` returns immediately.
 93 |       ##
 94 |       ## Tasks are processed approximately in Last-In-First-Out (LIFO) order
 95 |     ```
 96 |     In practice the signature is one of the following
 97 |     ```Nim
 98 |     proc spawn*(tp: TaskPool, fnCall(args) -> T): Flowvar[T]
 99 |     proc spawn*(tp: TaskPool, fnCall(args) -> void): void
100 |     ```
101 | - Flowvar, a handle on an asynchronous computation scheduled on the threadpool
102 |   - ```Nim
103 |     type Flowvar*[T] = object
104 |       ## A Flowvar is a placeholder for a future result that may be computed in parallel
105 |     ```
106 |   - ```Nim
107 |     func isSpawned*(fv: Flowvar): bool =
108 |       ## Returns true if a flowvar is spawned
109 |       ## This may be useful for recursive algorithms that
110 |       ## may or may not spawn a flowvar depending on a condition.
111 |       ## This is similar to Option or Maybe types
112 |     ```
113 |   - ```Nim
114 |     func isReady*[T](fv: Flowvar[T]): bool =
115 |       ## Returns true if the result of a Flowvar is ready.
116 |       ## In that case `sync` will not block.
117 |       ## Otherwise the current thread will block to help on all the pending tasks
118 |       ## until the Flowvar is ready.
119 |     ```
120 |   - ```Nim
121 |     proc sync*[T](fv: sink Flowvar[T]): T =
122 |       ## Blocks the current thread until the flowvar is available
123 |       ## and returned.
124 |       ## The thread is not idle and will complete pending tasks.
125 |     ```
126 | 
127 | ### Non-goals
128 | 
129 | The following are non-goals:
130 | 
131 | - Supporting GC-ed types (sequences and strings) with Nim's default GC. Use no GC, or --gc:arc, --gc:orc or --gc:boehm (any GC that doesn't have thread-local heaps) instead.
132 | - Having async-awaitable tasks
133 | - Running on environments without dynamic memory allocation
134 | - High-Performance Computing specificities (distribution on many machines or GPUs or machines with 200+ cores or multi-sockets)
135 | 
136 | ### Comparison with Weave
137 | 
138 | Compared to [Weave](https://github.com/mratsim/weave), here are the tradeoffs:
139 | - Taskpools only provide spawn/sync (task parallelism).\
140 |   There is no (extremely) optimized parallel for (data parallelism)\
141 |   or precise in/out dependencies (events / dataflow parallelism).
142 | - Weave can handle trillions of small tasks that require only 10µs per task. (Load Balancing overhead)
143 | - Weave maintains an adaptive memory pool to reduce memory allocation overhead,
144 |   Taskpools allocations are as-needed. (Scheduler overhead)
145 | 
146 | ## License
147 | 
148 | Licensed and distributed under either of
149 | 
150 | * MIT license: [LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT
151 | * Apache License, Version 2.0, ([LICENSE-APACHEv2](LICENSE-APACHEv2) or http://www.apache.org/licenses/LICENSE-2.0)
152 | 
153 | at your option. This file may not be copied, modified, or distributed except according to those terms.
154 | 


--------------------------------------------------------------------------------
/benchmarks/bouncing_producer_consumer/README.md:
--------------------------------------------------------------------------------
 1 | # BPC (Bouncing Producer-Consumer)
 2 | 
 3 | From [tasking-2.0](https://github.com/aprell/tasking-2.0) description
 4 | 
 5 | > **BPC**, short for **B**ouncing **P**roducer-**C**onsumer benchmark, as far
 6 | > as I know, first described by [Dinan et al][1]. There are two types of
 7 | > tasks, producer and consumer tasks. Each producer task creates another
 8 | > producer task followed by *n* consumer tasks, until a certain depth *d* is
 9 | > reached. Consumer tasks run for *t* microseconds. The smaller the values of
10 | > *n* and *t*, the harder it becomes to exploit the available parallelism. A
11 | > solid contender for the most antagonistic microbenchmark.
12 | 


--------------------------------------------------------------------------------
/benchmarks/bouncing_producer_consumer/taskpool_bpc.nim:
--------------------------------------------------------------------------------
  1 | import
  2 |   # STD lib
  3 |   os, strutils, system/ansi_c, cpuinfo, strformat, math,
  4 |   # Library
  5 |   ../../taskpools,
  6 |   # bench
  7 |   ../wtime, ../resources
  8 | 
  9 | var
 10 |   Depth: int32 # Number of nested producer levels. For example 10000
 11 |   NumTasksPerDepth: int32 # Consumer tasks spawned per producer level. For example 9
 12 |   # The total number of tasks in the BPC benchmark is
 13 |   # (NumTasksPerDepth + 1) * Depth
 14 |   NumTasksTotal: int32
 15 |   TaskGranularity: int32 # in microseconds; passed to every consumer task as its run time
 16 |   PollInterval: float64  # in microseconds
 17 | 
 18 |   tp: Taskpool # Pool used by every spawn in this file; created in main()
 19 | 
 20 | var global_poll_elapsed {.threadvar.}: float64 # Per-thread; reset to PollInterval by each bpc_consume call
 21 | 
 22 | template dummy_cpt(): untyped =
 23 |   # Fixed unit of busy work:
 24 |   # iteratively computes fib(30) and discards the value
 25 |   var
 26 |     prev2 = 0
 27 |     prev1 = 1
 28 |     cur = 0
 29 |   for step in 2 .. 30:
 30 |     cur = prev1 + prev2
 31 |     prev2 = prev1
 32 |     prev1 = cur
 33 | 
 34 | proc bpc_consume(usec: int32) =
 35 |   ## Consumer task: busy-loops on `dummy_cpt` until `usec` microseconds
 36 |   ## have elapsed, not counting time attributed to polling.
 37 |   let
 38 |     t0 = wtime_usec()
 39 |     budget = usec.float64
 40 |   # Never updated while the polling code below stays disabled.
 41 |   var pollElapsed = 0'f64
 42 |   global_poll_elapsed = PollInterval
 43 | 
 44 |   var elapsed = wtime_usec() - t0 - pollElapsed
 45 |   while elapsed < budget:
 46 |     dummy_cpt()
 47 | 
 48 |     # if elapsed >= global_poll_elapsed:
 49 |     #   let pollStart = wtime_usec()
 50 |     #   loadBalance(Weave)
 51 |     #   pollElapsed += wtime_usec() - pollStart
 52 |     #   global_poll_elapsed += PollInterval
 53 | 
 54 |     elapsed = wtime_usec() - t0 - pollElapsed
 55 | 
 56 | proc bpc_consume_nopoll(usec: int32) =
 57 |   ## Consumer task without any polling bookkeeping:
 58 |   ## repeats `dummy_cpt` until `usec` microseconds
 59 |   ## of wall-clock time have passed.
 60 |   let
 61 |     t0 = wtime_usec()
 62 |     budget = usec.float64
 63 | 
 64 |   # The clock is re-read before every unit of dummy work.
 65 |   while wtime_usec() - t0 < budget:
 66 |     dummy_cpt()
 67 | 
 68 | proc bpc_produce(n, d: int32) {.gcsafe, raises: [].} =
 69 |   ## Producer task: while depth `d` remains, spawns the next producer
 70 |   ## (depth `d-1`) followed by `n` consumer tasks. Does nothing at depth 0.
 71 |   if d <= 0:
 72 |     return
 73 | 
 74 |   tp.spawn bpc_produce(n, d-1)
 75 | 
 76 |   for k in 0 ..< n:
 77 |     tp.spawn bpc_consume(TaskGranularity)
 78 | 
 79 | proc main() =
 80 |   ## Parses the command line, runs the BPC benchmark on a fresh Taskpool
 81 |   ## and prints timing and memory statistics.
 82 |   Depth = 10000
 83 |   NumTasksPerDepth = 999
 84 |   TaskGranularity = 1
 85 | 
 86 |   if paramCount() == 0:
 87 |     let exeName = getAppFilename().extractFilename()
 88 |     echo &"Usage: {exeName} <depth: {Depth}> " &
 89 |          &"<# of tasks per depth: {NumTasksPerDepth}> " &
 90 |          &"[task granularity (us): {TaskGranularity}] " &
 91 |          &"[polling interval (us): task granularity]"
 92 |     # PollInterval is not assigned yet: report the granularity it will default to.
 93 |     echo &"Running with default config Depth = {Depth}, NumTasksPerDepth = {NumTasksPerDepth}, granularity (us) = {TaskGranularity}, polling (us) = {TaskGranularity.float64}"
 94 |   if paramCount() >= 1:
 95 |     Depth = paramStr(1).parseInt.int32
 96 |   if paramCount() >= 2:
 97 |     NumTasksPerDepth = paramStr(2).parseInt.int32
 98 |   if paramCount() >= 3:
 99 |     TaskGranularity = paramStr(3).parseInt.int32
100 |   PollInterval =
101 |     if paramCount() == 4: paramStr(4).parseInt.float64
102 |     else: TaskGranularity.float64
103 |   if paramCount() > 4:
104 |     let exeName = getAppFilename().extractFilename()
105 |     echo &"Usage: {exeName} <depth: {Depth}> " &
106 |          &"<# of tasks per depth: {NumTasksPerDepth}> " &
107 |          &"[task granularity (us): {TaskGranularity}] " &
108 |          &"[polling interval (us): task granularity]"
109 |     quit 1
110 | 
111 |   NumTasksTotal = (NumTasksPerDepth + 1) * Depth
112 | 
113 |   let nthreads =
114 |     if existsEnv"TASKPOOL_NUM_THREADS": getEnv"TASKPOOL_NUM_THREADS".parseInt()
115 |     else: countProcessors()
116 | 
117 |   tp = Taskpool.new(numThreads = nthreads)
118 | 
119 |   # measure overhead during tasking
120 |   var ru: Rusage
121 |   getrusage(RusageSelf, ru)
122 |   var
123 |     rss = ru.ru_maxrss
124 |     flt = ru.ru_minflt
125 | 
126 |   let start = wtime_msec()
127 | 
128 |   bpc_produce(NumTasksPerDepth, Depth)
129 |   tp.syncAll()
130 | 
131 |   let stop = wtime_msec()
132 | 
133 |   getrusage(RusageSelf, ru)
134 |   rss = ru.ru_maxrss - rss
135 |   flt = ru.ru_minflt - flt
136 | 
137 |   tp.shutdown()
138 | 
139 |   echo "--------------------------------------------------------------------------"
140 |   echo "Scheduler:                                     Taskpool"
141 |   echo "Benchmark:                                     BPC (Bouncing Producer-Consumer)"
142 |   echo "Threads:                                       ", nthreads
143 |   echo "Time(ms)                                       ", round(stop - start, 3)
144 |   echo "Max RSS (KB):                                  ", ru.ru_maxrss
145 |   echo "Runtime RSS (KB):                              ", rss
146 |   echo "# of page faults:                              ", flt
147 |   echo "--------------------------------------------------------------------------"
148 |   echo "# of tasks:                                    ", NumTasksTotal
149 |   echo "# of tasks/depth:                              ", NumTasksPerDepth
150 |   echo "Depth:                                         ", Depth
151 |   echo "Task granularity (us):                         ", TaskGranularity
152 |   echo "Polling / manual load balancing interval (us): ", PollInterval
153 | 
154 |   quit 0
155 | 
156 | main()
157 | 


--------------------------------------------------------------------------------
/benchmarks/dfs/taskpool_dfs.nim:
--------------------------------------------------------------------------------
 1 | # Weave
 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | import
 9 |   # Stdlib
10 |   system/ansi_c, strformat, os, strutils, cpuinfo,
11 |   # Library
12 |   ../../taskpools
13 | 
14 | when not defined(windows):
15 |   # bench
16 |   import ../wtime
17 | 
18 | var tp: Taskpool
19 | 
20 | proc dfs(depth, breadth: int): uint32 {.gcsafe, raises: [].} =
21 |   ## Counts the leaves of a tree with the given depth and branching factor,
22 |   ## spawning one task per child and summing the children's results.
23 |   if depth == 0:
24 |     result = 1
25 |   else:
26 |     # We could use alloca to avoid heap allocation here
27 |     var pending = newSeq[Flowvar[uint32]](breadth)
28 |     for child in 0 ..< breadth:
29 |       pending[child] = tp.spawn dfs(depth - 1, breadth)
30 |     for child in 0 ..< breadth:
31 |       result += sync(pending[child])
32 | 
33 | proc test(depth, breadth: int): uint32 =
34 |   sync(tp.spawn(dfs(depth, breadth))) # run one DFS as the root task and wait for its count
35 | 
36 | proc main() =
37 |   ## Parses depth/breadth, times one DFS run (pool start-up and shutdown included) and prints the result.
38 |   var
39 |     depth = 8
40 |     breadth = 8
41 |     answer: uint32
42 |     nthreads: int
43 | 
44 |   if existsEnv"TP_NUM_THREADS":
45 |     nthreads = getEnv"TP_NUM_THREADS".parseInt()
46 |   else:
47 |     nthreads = countProcessors()
48 | 
49 |   if paramCount() == 0:
50 |     let exeName = getAppFilename().extractFilename()
51 |     echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
52 |     echo &"Running with default config depth = {depth} and breadth = {breadth}"
53 | 
54 |   if paramCount() >= 1:
55 |     depth = paramStr(1).parseInt()
56 |   if paramCount() == 2:
57 |     breadth = paramStr(2).parseInt()
58 |   if paramCount() > 2:
59 |     let exeName = getAppFilename().extractFilename()
60 |     echo &"Usage: {exeName} <depth:{depth}> <breadth:{breadth}>"
61 |     echo &"Up to 2 parameters are valid. Received {paramCount()}"
62 |     quit 1
63 | 
64 |   # Staccato benches runtime init and exit as well
65 |   when not defined(windows):
66 |     let start = wtime_usec()
67 |   # Size the pool from `nthreads` so TP_NUM_THREADS is honoured and "Threads:" below is accurate.
68 |   tp = Taskpool.new(numThreads = nthreads)
69 |   answer = test(depth, breadth)
70 |   tp.shutdown()
71 | 
72 |   when not defined(windows):
73 |     let stop = wtime_usec()
74 | 
75 |   echo "Scheduler:  Taskpool"
76 |   echo "Benchmark:  dfs"
77 |   echo "Threads:    ", nthreads
78 |   when not defined(windows):
79 |     echo "Time(us)    ", stop - start
80 |   echo "Output:     ", answer
81 | 
82 |   quit 0
83 | 
84 | main()
85 | 


--------------------------------------------------------------------------------
/benchmarks/fibonacci/README.md:
--------------------------------------------------------------------------------
 1 | # Fibonacci benchmarks
 2 | 
 3 | ⚠️ Disclaimer:
 4 |    Please don't use parallel fibonacci in production!
 5 |    Use the fast doubling method with memoization instead.
 6 | 
 7 | Fibonacci benchmark has 3 draws:
 8 | 
 9 | 1. It's very simple to implement
 10 | 2. It's unbalanced, so efficiency requires distributing work across threads to avoid idle cores.
11 | 3. It's a very effective scheduler overhead benchmark, because the basic task is very trivial and the task spawning grows at 2^n scale.
12 | 
13 | Want to know the difference between low and high overhead?
14 | 
15 | Run the following C code (taken from [Oracle OpenMP example](https://docs.oracle.com/cd/E19205-01/820-7883/girtd/index.html))
16 | 
17 | ```C
18 | #include <stdio.h>
19 | #include <omp.h>
20 | int fib(int n)
21 | {
22 |   int i, j;
23 |   if (n<2)
24 |     return n;
25 |   else
26 |     {
27 |        #pragma omp task shared(i) firstprivate(n)
28 |        {
29 |          i=fib(n-1);
30 |        }
31 | 
32 |        j=fib(n-2);
33 |        #pragma omp taskwait
34 |        return i+j;
35 |     }
36 | }
37 | 
38 | int main()
39 | {
40 |   int n = 40;
41 | 
42 |   #pragma omp parallel shared(n)
43 |   {
44 |     #pragma omp single
45 |     printf ("fib(%d) = %d\n", n, fib(n));
46 |   }
47 | }
48 | ```
49 | 
50 | First compile with Clang and run it
51 | ```
52 | clang -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
53 | time a.out
54 | ```
55 | It should be fairly quick
56 | 
57 | 
58 | Then compile with GCC and run it
59 | ```
60 | gcc -O3 -fopenmp benchmarks/fibonacci/omp_fib.c
61 | time a.out
62 | ```
63 | 
64 | Notice how some cores get idle as time goes on?
65 | Don't forget to kill the benchmark, you'll be there all day.
66 | 
67 | What's happening?
68 | 
69 | GCC's OpenMP implementation uses a single queue for all tasks.
70 | That queue gets constantly hammered by all threads and becomes a contention point.
 71 | Furthermore, it seems that either there is no load balancing, or threads get
 72 | descheduled because of the contention on the queue's lock.
73 | 
 74 | However, Clang's implementation uses a work-stealing scheduler with one deque per thread.
 75 | The only contention happens when a thread runs out of work and has to look for more work
 76 | in the deques of other threads. Which thread to steal from is chosen at random, so
 77 | the potential contention is distributed among all threads instead of a single structure.
78 | 


--------------------------------------------------------------------------------
/benchmarks/fibonacci/stdnim_fib.nim:
--------------------------------------------------------------------------------
 1 | import
 2 |   # STD lib
 3 |   os, strutils, threadpool, strformat,
 4 |   # bench
 5 |   ../wtime
 6 | 
 7 | # Using Nim's standard threadpool
 8 | # Compile with "nim c --threads:on -d:release -d:danger --outdir:build benchmarks/fibonacci/stdnim_fib.nim"
 9 | #
10 | # Note: it breaks at fib 16.
11 | 
12 | proc parfib(n: uint64): uint64 =
13 |   if n < 2:   # Note: be sure to compare n<2 -> return n
14 |     return n #       instead of n<=2 -> return 1
15 | 
16 |   let x = spawn parfib(n-1)
17 |   let y = parfib(n-2)
18 | 
19 |   return ^x + y
20 | 
21 | proc main() =
22 |   if paramCount() != 1:
23 |     echo "Usage: fib <n-th fibonacci number requested>"
24 |     quit 0
25 | 
26 |   let n = paramStr(1).parseUInt.uint64
27 | 
28 |   let start = wtime_msec()
29 |   let f = parfib(n)
30 |   let stop = wtime_msec()
31 | 
32 |   echo "Result: ", f
33 |   echo &"Elapsed wall time: {stop-start:.2} ms"
34 | 
35 | main()
36 | 


--------------------------------------------------------------------------------
/benchmarks/fibonacci/taskpool_fib.nim:
--------------------------------------------------------------------------------
 1 | import
 2 |   # STD lib
 3 |   os, strutils, cpuinfo, strformat, math,
 4 |   # Library
 5 |   ../../taskpools
 6 | 
 7 | when not defined(windows):
 8 |   # bench
 9 |   import ../wtime, ../resources
10 | 
11 | var tp: Taskpool
12 | 
13 | proc fib(n: int): int =
14 |   # int64 on x86-64
15 |   if n < 2:
16 |     return n
17 | 
18 |   let x = tp.spawn fib(n-1)
19 |   let y = fib(n-2)
20 | 
21 |   result = sync(x) + y
22 | 
23 | proc main() =
24 |   var n = 40
25 |   var nthreads: int
26 | 
27 |   if paramCount() == 0:
28 |     let exeName = getAppFilename().extractFilename()
29 |     echo &"Usage: {exeName} <n-th fibonacci number requested:{n}> "
30 |     echo &"Running with default n = {n}"
31 |   elif paramCount() == 1:
32 |     n = paramStr(1).parseInt
33 |   else:
34 |     let exeName = getAppFilename().extractFilename()
35 |     echo &"Usage: {exeName} <n-th fibonacci number requested:{n}>"
36 |     quit 1
37 | 
38 |   if existsEnv"TP_NUM_THREADS":
39 |     nthreads = getEnv"TP_NUM_THREADS".parseInt()
40 |   else:
41 |     nthreads = countProcessors()
42 | 
43 |   tp = Taskpool.new()
44 | 
45 |   # measure overhead during tasking
46 |   when not defined(windows):
47 |     var ru: Rusage
48 |     getrusage(RusageSelf, ru)
49 |     var
50 |       rss = ru.ru_maxrss
51 |       flt = ru.ru_minflt
52 | 
53 |     let start = wtime_msec()
54 |   let f = fib(n)
55 | 
56 |   when not defined(windows):
57 |     let stop = wtime_msec()
58 | 
59 |   tp.shutdown()
60 | 
61 |   when not defined(windows):
62 |     getrusage(RusageSelf, ru)
63 |     rss = ru.ru_maxrss - rss
64 |     flt = ru.ru_minflt - flt
65 | 
66 |   echo "--------------------------------------------------------------------------"
67 |   echo "Scheduler:                                    Taskpool"
68 |   echo "Benchmark:                                    Fibonacci"
69 |   echo "Threads:                                      ", nthreads
70 |   when not defined(windows):
71 |     echo "Time(ms)                                      ", round(stop - start, 3)
72 |     echo "Max RSS (KB):                                 ", ru.ru_maxrss
73 |     echo "Runtime RSS (KB):                             ", rss
74 |     echo "# of page faults:                             ", flt
75 |   echo "--------------------------------------------------------------------------"
76 |   echo "n requested:                                  ", n
77 |   echo "result:                                       ", f
78 | 
79 | main()
80 | 


--------------------------------------------------------------------------------
/benchmarks/heat/stdnim_heat.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | # From fibril
  9 | #
 10 | # Original license
 11 | #
 12 | # /*
 13 | #  * Heat diffusion (Jacobi-type iteration)
 14 | #  *
 15 | #  * Volker Strumpen, Boston                                 August 1996
 16 | #  *
 17 | #  * Copyright (c) 1996 Massachusetts Institute of Technology
 18 | #  *
 19 | #  * This program is free software; you can redistribute it and/or modify
 20 | #  * it under the terms of the GNU General Public License as published by
 21 | #  * the Free Software Foundation; either version 2 of the License, or
 22 | #  * (at your option) any later version.
 23 | #  *
 24 | #  * This program is distributed in the hope that it will be useful,
 25 | #  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 26 | #  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 27 | #  * GNU General Public License for more details.
 28 | #  *
 29 | #  * You should have received a copy of the GNU General Public License
 30 | #  * along with this program; if not, write to the Free Software
 31 | #  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 32 | #  */
 33 | 
 34 | import
 35 |   # Stdlib
 36 |   strformat, os, strutils, math, system/ansi_c,
 37 |   cpuinfo, threadpool,
 38 |   # bench
 39 |   ../wtime, ../resources
 40 | 
 41 | # This deadlocks :/
 42 | 
 43 | # Helpers
 44 | # -------------------------------------------------------
 45 | 
 46 | # We need a thin wrapper around raw pointers for matrices,
 47 | # we can't pass "var seq[seq[float64]]" to other threads
 48 | # nor "var" for that matter
  49 | type
  50 |   Matrix[T] = object                 # thin wrapper around a raw buffer of m*n elements
  51 |     buffer: ptr UncheckedArray[T]    # storage obtained from c_malloc in newMatrix
  52 |     m, n: int                        # row count, column count
  53 | 
  54 |   Row[T] = object                    # view of one matrix row (see getRow)
  55 |     buffer: ptr UncheckedArray[T]    # points at the first element of the row
  56 |     len: int                         # bound checked by the Row accessors' asserts
 57 | 
 58 | func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
 59 |   result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
 60 |   result.m = m
 61 |   result.n = n
 62 | 
  63 | template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  64 |   # Reads element (row, col); storage is row-major and bounds are only checked by assert.
  65 |   assert row < mat.m
  66 |   assert col < mat.n
  67 |   mat.buffer[row * mat.n + col]
  68 | 
  69 | template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =  # writes element (row, col)
  70 |   assert row < mat.m
  71 |   assert col < mat.n
  72 |   mat.buffer[row * mat.n + col] = value
 73 | 
 74 | func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
 75 |   # row-major storage, there are n columns in between each rows
 76 |   assert rowIdx < mat.m
 77 |   result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
 78 |   result.len = mat.m
 79 | 
  80 | template `[]`[T](row: Row[T], idx: Natural): T =  # reads element idx of the row view
  81 |   assert idx < row.len
  82 |   row.buffer[idx]
  83 | 
  84 | template `[]=`[T](row: Row[T], idx: Natural, value: T) =  # writes element idx of the row view
  85 |   assert idx < row.len
  86 |   row.buffer[idx] = value
 87 | 
  88 | func delete[T](mat: sink Matrix[T]) =  # frees the C buffer; the matrix must not be used afterwards
  89 |   c_free(mat.buffer)
 90 | 
 91 | # And an auto converter for int32 -> float64 so we don't have to convert
 92 | # all i, j indices manually
 93 | 
  94 | converter i32toF64(x: int32): float64 {.inline.} =  # implicit, so `j*dy` works with an int32 j
  95 |   float64(x)
 96 | 
 97 | # -------------------------------------------------------
 98 | 
  99 | template f(x, y: SomeFloat): SomeFloat =  # initial value of interior cells, used by `heat`
 100 |   sin(x) * sin(y)
 101 | 
 102 | template randa[T: SomeFloat](x, t: T): T =  # boundary value written to column 0
 103 |   T(0.0)
 104 | 
 105 | proc randb(x, t: SomeFloat): SomeFloat {.inline.} =  # boundary value written to column ny - 1
 106 |   # proc instead of template to avoid Nim constant folding bug:
 107 |   # https://github.com/nim-lang/Nim/issues/12783
 108 |   exp(-2 * t) * sin(x)
 109 | 
 110 | template randc[T: SomeFloat](y, t: T): T =  # boundary value written to row 0
 111 |   T(0.0)
 112 | 
 113 | proc randd(y, t: SomeFloat): SomeFloat {.inline.} =  # boundary value written to row nx - 1
 114 |   # proc instead of template to avoid Nim constant folding bug:
 115 |   # https://github.com/nim-lang/Nim/issues/12783
 116 |   exp(-2 * t) * sin(y)
 117 | 
 118 | template solu(x, y, t: SomeFloat): SomeFloat =  # reference solution that `verify` compares against
 119 |   exp(-2 * t) * sin(x) * sin(y)
120 | 
 121 | const n = 4096'i32  # number of grid rows, assigned to nx in initTest
 122 | 
 123 | var
 124 |   nx, ny, nt: int32  # grid rows, grid columns, number of time steps
 125 |   xu, xo, yu, yo, tu, to: float64  # lower/upper bounds of the x, y and time ranges
 126 | 
 127 |   dx, dy, dt: float64  # step sizes derived from the bounds in initTest
 128 |   dtdxsq, dtdysq: float64  # dt/dx^2 and dt/dy^2, the coefficients used by diffuse
 129 | 
 130 |   odd: Matrix[float64]  # odd and even are the two buffers diffuse alternates between
 131 |   even: Matrix[float64]
132 | 
 133 | proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable.}=
 134 |   ## Writes the initial state into rows [il, iu) of `m`, halving the range recursively.
 135 |   ## The bool result is a dummy: it only exists so the spawned half can be awaited.
 136 |   if iu - il > 1:
 137 |     let im = (il + iu) div 2
 138 | 
 139 |     let h = spawn heat(m, il, im)
 140 |     heat(m, im, iu)
 141 |     discard ^h
 142 |     return true
 143 |   # ------------------------ base case: the single row il
 144 | 
 145 |   let i = il
 146 |   let row = m.getRow(i)
 147 | 
 148 |   if i == 0:                           # first row: randc boundary
 149 |     for j in 0 ..< ny:
 150 |       row[j] = randc(yu + j*dy, 0)
 151 |   elif i == nx - 1:                    # last row: randd boundary
 152 |     for j in 0 ..< ny:
 153 |       row[j] = randd(yu + j*dy, 0)
 154 |   else:                                # other rows: boundaries in columns 0 and ny - 1
 155 |     row[0] = randa(xu + i*dx, 0)
 156 |     for j in 1 ..< ny - 1:
 157 |       row[j] = f(xu + i*dx, yu + j*dy)
 158 |     row[ny - 1] = randb(xu + i*dx, 0)
159 | 
 160 | proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable.} =
 161 |   ## Computes rows [il, iu) of `output` at time `t` from the previous state `input`,
 162 |   ## halving the range recursively. The bool result is a dummy so the spawn can be awaited.
 163 |   if iu - il > 1:
 164 |     let im = (il + iu) div 2
 165 | 
 166 |     let d = spawn diffuse(output, input, il, im, t)
 167 |     diffuse(output, input, im, iu, t)
 168 |     discard ^d
 169 |     return true
 170 |   # ------------------------ base case: the single row il
 171 | 
 172 |   let i = il
 173 |   let row = output.getRow(i)
 174 | 
 175 |   if i == 0:                           # first row: randc boundary
 176 |     for j in 0 ..< ny:
 177 |       row[j] = randc(yu + j*dy, t)
 178 |   elif i == nx - 1:                    # last row: randd boundary
 179 |     for j in 0 ..< ny:
 180 |       row[j] = randd(yu + j*dy, t)
 181 |   else:                                # other rows: boundaries in columns 0 and ny - 1
 182 |     row[0] = randa(xu + i*dx, t)
 183 |     for j in 1 ..< ny - 1:
 184 |       row[j] = input[i, j] + # previous value plus the contribution of its 4 direct neighbours
 185 |                dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
 186 |                dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
 187 |     row[ny - 1] = randb(xu + i*dx, t)
188 | 
 189 | proc initTest() =  # sets the problem size, derives the step sizes and allocates both buffers
 190 |   nx = n                      # grid rows
 191 |   ny = 1024                   # grid columns
 192 |   nt = 100                    # time steps
 193 |   xu = 0.0
 194 |   xo = 1.570796326794896558   # ~ pi/2
 195 |   yu = 0.0
 196 |   yo = 1.570796326794896558   # ~ pi/2
 197 |   tu = 0.0
 198 |   to = 0.0000001
 199 | 
 200 |   dx = (xo - xu) / float64(nx - 1)
 201 |   dy = (yo - yu) / float64(ny - 1)
 202 |   dt = (to - tu) / float64(nt)
 203 | 
 204 |   dtdxsq = dt / (dx * dx)
 205 |   dtdysq = dt / (dy * dy)
 206 | 
 207 |   even = newMatrix[float64](nx, ny)   # freed with delete in main
 208 |   odd = newMatrix[float64](nx, ny)
209 | 
 210 | proc prep() =  # untimed setup: writes the initial state into `even`
 211 |   heat(even, 0, nx)
212 | 
213 | proc test() =
214 |   var t = tu
215 | 
216 |   for _ in countup(1, nt.int, 2):
217 |     # nt included
218 |     t += dt
219 |     diffuse(odd, even, 0, nx, t)
220 |     t += dt
221 |     diffuse(even, odd, 0, nx, t)
222 | 
223 |   if nt mod 2 != 0:
224 |     t += dt
225 |     diffuse(odd, even, 0, nx, t)
226 | 
 227 | proc verify() =  # checks the final grid against the reference solution `solu` at time `to`
 228 |   var
 229 |     mat: Matrix[float64]
 230 |     mae: float64  # maximal absolute error
 231 |     mre: float64  # maximal relative error
 232 |     me:  float64  # sum, then mean, of the absolute errors
 233 | 
 234 |   mat = if nt mod 2 != 0: odd else: even  # the buffer written by the last time step
 235 | 
 236 |   for a in 0 ..< nx:
 237 |     for b in 0 ..< ny:
 238 |       var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
 239 |       if tmp > 1e-3:  # gross mismatch: print the offending cell and abort
 240 |         echo "nx: ", nx, " - ny: ", ny
 241 |         echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
 242 |         quit 1
 243 | 
 244 |       me += tmp
 245 |       if tmp > mae: mae = tmp
 246 |       if mat[a, b] != 0.0: tmp /= mat[a, b]  # relative error, where the cell is non-zero
 247 |       if tmp > mre: mre = tmp
 248 | 
 249 |   me /= nx * ny
 250 | 
 251 |   if mae > 1e-12:
 252 |     echo &"Local maximal absolute error {mae:1.3e}"
 253 |     quit 1
 254 |   if mre > 1e-12:
 255 |     echo &"Local maximal relative error {mre:1.3e}"
 256 |     quit 1
 257 |   if me > 1e-12:
 258 |     echo &"Global mean absolute error {me:1.3e}"
 259 |     quit 1
 260 | 
 261 |   echo "Verification successful"
262 | 
 263 | proc main() =  # runs init, prep, the timed test and the verification, then prints the report
 264 |   var nthreads: int
 265 |   nthreads = countProcessors()
 266 | 
 267 |   var ru: Rusage
 268 |   getrusage(RusageSelf, ru)
 269 |   var
 270 |     rss = ru.ru_maxrss  # baselines, subtracted after the run
 271 |     flt = ru.ru_minflt
 272 | 
 273 |   initTest()
 274 | 
 275 |   prep()
 276 |   let start = wtime_usec()  # only test() is timed
 277 |   test()
 278 |   let stop = wtime_usec()
 279 | 
 280 |   getrusage(RusageSelf, ru)
 281 |   rss = ru.ru_maxrss - rss
 282 |   flt = ru.ru_minflt - flt
 283 | 
 284 |   sync()
 285 | 
 286 |   verify()
 287 |   delete(even)
 288 |   delete(odd)
 289 | 
 290 |   echo "Scheduler:  Nim threadpool (standard lib)"
 291 |   echo "Benchmark:        heat"
 292 |   echo "Threads:          ", nthreads
 293 |   echo "Time(us)          ", stop - start
 294 |   echo "Max RSS (KB):     ", ru.ru_maxrss
 295 |   echo "Runtime RSS (KB): ", rss
 296 |   echo "# of page faults: ", flt
 297 | 
 298 |   quit 0
299 | 
300 | main()
301 | 


--------------------------------------------------------------------------------
/benchmarks/heat/taskpool_heat.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | {.push raises: [].}
  9 | 
 10 | # From fibril
 11 | #
 12 | # Original license
 13 | #
 14 | # /*
 15 | #  * Heat diffusion (Jacobi-type iteration)
 16 | #  *
 17 | #  * Volker Strumpen, Boston                                 August 1996
 18 | #  *
 19 | #  * Copyright (c) 1996 Massachusetts Institute of Technology
 20 | #  *
 21 | #  * This program is free software; you can redistribute it and/or modify
 22 | #  * it under the terms of the GNU General Public License as published by
 23 | #  * the Free Software Foundation; either version 2 of the License, or
 24 | #  * (at your option) any later version.
 25 | #  *
 26 | #  * This program is distributed in the hope that it will be useful,
 27 | #  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 28 | #  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 29 | #  * GNU General Public License for more details.
 30 | #  *
 31 | #  * You should have received a copy of the GNU General Public License
 32 | #  * along with this program; if not, write to the Free Software
 33 | #  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 34 | #  */
 35 | 
 36 | import
 37 |   # Stdlib
 38 |   strformat, os, strutils, math, system/ansi_c,
 39 |   cpuinfo,
 40 |   # Taskpools
 41 |   ../../taskpools
 42 | when not defined(windows):
 43 |   # bench
 44 |   import ../wtime, ../resources
 45 | 
 46 | # Helpers
 47 | # -------------------------------------------------------
 48 | 
 49 | # We need a thin wrapper around raw pointers for matrices,
 50 | # we can't pass "var seq[seq[float64]]" to other threads
 51 | # nor "var" for that matter
  52 | type
  53 |   Matrix[T] = object                 # thin wrapper around a raw buffer of m*n elements
  54 |     buffer: ptr UncheckedArray[T]    # storage obtained from c_malloc in newMatrix
  55 |     m, n: int                        # row count, column count
  56 | 
  57 |   Row[T] = object                    # view of one matrix row (see getRow)
  58 |     buffer: ptr UncheckedArray[T]    # points at the first element of the row
  59 |     len: int                         # bound checked by the Row accessors' asserts
  60 | 
  61 | var tp: Taskpool                     # pool that heat and diffuse spawn their sub-ranges on
 62 | 
 63 | func newMatrix[T](m, n: int): Matrix[T] {.inline.} =
 64 |   result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t m*n*sizeof(T)))
 65 |   result.m = m
 66 |   result.n = n
 67 | 
  68 | template `[]`[T](mat: Matrix[T], row, col: Natural): T =
  69 |   # Reads element (row, col); storage is row-major and bounds are only checked by assert.
  70 |   assert row < mat.m
  71 |   assert col < mat.n
  72 |   mat.buffer[row * mat.n + col]
  73 | 
  74 | template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =  # writes element (row, col)
  75 |   assert row < mat.m
  76 |   assert col < mat.n
  77 |   mat.buffer[row * mat.n + col] = value
 78 | 
 79 | func getRow[T](mat: Matrix[T], rowIdx: Natural): Row[T] {.inline.} =
 80 |   # row-major storage, there are n columns in between each rows
 81 |   assert rowIdx < mat.m
 82 |   result.buffer = cast[ptr UncheckedArray[T]](mat.buffer[rowIdx * mat.n].addr)
 83 |   result.len = mat.m
 84 | 
  85 | template `[]`[T](row: Row[T], idx: Natural): T =  # reads element idx of the row view
  86 |   assert idx < row.len
  87 |   row.buffer[idx]
  88 | 
  89 | template `[]=`[T](row: Row[T], idx: Natural, value: T) =  # writes element idx of the row view
  90 |   assert idx < row.len
  91 |   row.buffer[idx] = value
 92 | 
  93 | func delete[T](mat: sink Matrix[T]) =  # frees the C buffer; the matrix must not be used afterwards
  94 |   c_free(mat.buffer)
 95 | 
 96 | # And an auto converter for int32 -> float64 so we don't have to convert
 97 | # all i, j indices manually
 98 | 
  99 | converter i32toF64(x: int32): float64 {.inline.} =  # implicit, so `j*dy` works with an int32 j
 100 |   float64(x)
101 | 
102 | # -------------------------------------------------------
103 | 
 104 | template f(x, y: SomeFloat): SomeFloat =  # initial value of interior cells, used by `heat`
 105 |   sin(x) * sin(y)
 106 | 
 107 | template randa[T: SomeFloat](x, t: T): T =  # boundary value written to column 0
 108 |   T(0.0)
 109 | 
 110 | proc randb(x, t: SomeFloat): SomeFloat {.inline.} =  # boundary value written to column ny - 1
 111 |   # proc instead of template to avoid Nim constant folding bug:
 112 |   # https://github.com/nim-lang/Nim/issues/12783
 113 |   exp(-2 * t) * sin(x)
 114 | 
 115 | template randc[T: SomeFloat](y, t: T): T =  # boundary value written to row 0
 116 |   T(0.0)
 117 | 
 118 | proc randd(y, t: SomeFloat): SomeFloat {.inline.} =  # boundary value written to row nx - 1
 119 |   # proc instead of template to avoid Nim constant folding bug:
 120 |   # https://github.com/nim-lang/Nim/issues/12783
 121 |   exp(-2 * t) * sin(y)
 122 | 
 123 | template solu(x, y, t: SomeFloat): SomeFloat =  # reference solution that `verify` compares against
 124 |   exp(-2 * t) * sin(x) * sin(y)
125 | 
 126 | const n = 4096'i32  # number of grid rows, assigned to nx in initTest
 127 | 
 128 | var
 129 |   nx, ny, nt: int32  # grid rows, grid columns, number of time steps
 130 |   xu, xo, yu, yo, tu, to: float64  # lower/upper bounds of the x, y and time ranges
 131 | 
 132 |   dx, dy, dt: float64  # step sizes derived from the bounds in initTest
 133 |   dtdxsq, dtdysq: float64  # dt/dx^2 and dt/dy^2, the coefficients used by diffuse
 134 | 
 135 |   odd: Matrix[float64]  # odd and even are the two buffers diffuse alternates between
 136 |   even: Matrix[float64]
137 | 
 138 | proc heat(m: Matrix[float64], il, iu: int32): bool {.discardable, gcsafe.}=
 139 |   ## Writes the initial state into rows [il, iu) of `m`, halving the range recursively.
 140 |   ## The bool result is a dummy: it only exists so the spawned half can be awaited.
 141 |   if iu - il > 1:
 142 |     let im = (il + iu) div 2
 143 | 
 144 |     let h = tp.spawn heat(m, il, im)
 145 |     heat(m, im, iu)
 146 |     discard sync(h)
 147 |     return true
 148 |   # ------------------------ base case: the single row il
 149 | 
 150 |   let i = il
 151 |   let row = m.getRow(i)
 152 | 
 153 |   if i == 0:                           # first row: randc boundary
 154 |     for j in 0 ..< ny:
 155 |       row[j] = randc(yu + j*dy, 0)
 156 |   elif i == nx - 1:                    # last row: randd boundary
 157 |     for j in 0 ..< ny:
 158 |       row[j] = randd(yu + j*dy, 0)
 159 |   else:                                # other rows: boundaries in columns 0 and ny - 1
 160 |     row[0] = randa(xu + i*dx, 0)
 161 |     for j in 1 ..< ny - 1:
 162 |       row[j] = f(xu + i*dx, yu + j*dy)
 163 |     row[ny - 1] = randb(xu + i*dx, 0)
164 | 
 165 | proc diffuse(output: Matrix[float64], input: Matrix[float64], il, iu: int32, t: float64): bool {.discardable, gcsafe.} =
 166 |   ## Computes rows [il, iu) of `output` at time `t` from the previous state `input`,
 167 |   ## halving the range recursively. The bool result is a dummy so the spawn can be awaited.
 168 |   if iu - il > 1:
 169 |     let im = (il + iu) div 2
 170 | 
 171 |     let d = tp.spawn diffuse(output, input, il, im, t)
 172 |     diffuse(output, input, im, iu, t)
 173 |     discard sync(d)
 174 |     return true
 175 |   # ------------------------ base case: the single row il
 176 | 
 177 |   let i = il
 178 |   let row = output.getRow(i)
 179 | 
 180 |   if i == 0:                           # first row: randc boundary
 181 |     for j in 0 ..< ny:
 182 |       row[j] = randc(yu + j*dy, t)
 183 |   elif i == nx - 1:                    # last row: randd boundary
 184 |     for j in 0 ..< ny:
 185 |       row[j] = randd(yu + j*dy, t)
 186 |   else:                                # other rows: boundaries in columns 0 and ny - 1
 187 |     row[0] = randa(xu + i*dx, t)
 188 |     for j in 1 ..< ny - 1:
 189 |       row[j] = input[i, j] + # previous value plus the contribution of its 4 direct neighbours
 190 |                dtdysq * (input[i, j+1] - 2 * input[i, j] + input[i, j-1]) +
 191 |                dtdxsq * (input[i+1, j] - 2 * input[i, j] + input[i-1, j])
 192 |     row[ny - 1] = randb(xu + i*dx, t)
193 | 
 194 | proc initTest() =  # sets the problem size, derives the step sizes and allocates both buffers
 195 |   nx = n                      # grid rows
 196 |   ny = 1024                   # grid columns
 197 |   nt = 100                    # time steps
 198 |   xu = 0.0
 199 |   xo = 1.570796326794896558   # ~ pi/2
 200 |   yu = 0.0
 201 |   yo = 1.570796326794896558   # ~ pi/2
 202 |   tu = 0.0
 203 |   to = 0.0000001
 204 | 
 205 |   dx = (xo - xu) / float64(nx - 1)
 206 |   dy = (yo - yu) / float64(ny - 1)
 207 |   dt = (to - tu) / float64(nt)
 208 | 
 209 |   dtdxsq = dt / (dx * dx)
 210 |   dtdysq = dt / (dy * dy)
 211 | 
 212 |   even = newMatrix[float64](nx, ny)   # freed with delete in main
 213 |   odd = newMatrix[float64](nx, ny)
214 | 
 215 | proc prep() =  # untimed setup: writes the initial state into `even`
 216 |   heat(even, 0, nx)
217 | 
218 | proc test() =
219 |   var t = tu
220 | 
221 |   for _ in countup(1, nt.int, 2):
222 |     # nt included
223 |     t += dt
224 |     diffuse(odd, even, 0, nx, t)
225 |     t += dt
226 |     diffuse(even, odd, 0, nx, t)
227 | 
228 |   if nt mod 2 != 0:
229 |     t += dt
230 |     diffuse(odd, even, 0, nx, t)
231 | 
 232 | proc verify() =  # checks the final grid against the reference solution `solu` at time `to`
 233 |   var
 234 |     mat: Matrix[float64]
 235 |     mae: float64  # maximal absolute error
 236 |     mre: float64  # maximal relative error
 237 |     me:  float64  # sum, then mean, of the absolute errors
 238 | 
 239 |   mat = if nt mod 2 != 0: odd else: even  # the buffer written by the last time step
 240 | 
 241 |   for a in 0 ..< nx:
 242 |     for b in 0 ..< ny:
 243 |       var tmp = abs(mat[a, b] - solu(xu + a*dx, yu + b*dy, to))
 244 |       if tmp > 1e-3:  # gross mismatch: print the offending cell and abort
 245 |         echo "nx: ", nx, " - ny: ", ny
 246 |         echo "mat[", a, ", ", b, "] = ", mat[a, b], ", expected sol = ", solu(xu + a*dx, yu + b*dy, to)
 247 |         quit 1
 248 | 
 249 |       me += tmp
 250 |       if tmp > mae: mae = tmp
 251 |       if mat[a, b] != 0.0: tmp /= mat[a, b]  # relative error, where the cell is non-zero
 252 |       if tmp > mre: mre = tmp
 253 | 
 254 |   me /= nx * ny
 255 | 
 256 |   try:  # the & format strings can raise ValueError, which the module's `raises: []` forbids
 257 |     if mae > 1e-12:
 258 |       echo &"Local maximal absolute error {mae:1.3e}"
 259 |       quit 1
 260 |     if mre > 1e-12:
 261 |       echo &"Local maximal relative error {mre:1.3e}"
 262 |       quit 1
 263 |     if me > 1e-12:
 264 |       echo &"Global mean absolute error {me:1.3e}"
 265 |       quit 1
 266 |   except ValueError: raiseAssert "format strings"
 267 | 
 268 |   echo "Verification successful"
269 | 
270 | {.pop.}
271 | 
 272 | proc main() =  # runs init, prep, the timed test and the verification, then prints the report
 273 |   var nthreads: int
 274 |   if existsEnv"TASKPOOL_NUM_THREADS":
 275 |     nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
 276 |   else:
 277 |     nthreads = countProcessors()
 278 | 
 279 |   when not defined(windows):
 280 |     var ru: Rusage
 281 |     getrusage(RusageSelf, ru)
 282 |     var
 283 |       rss = ru.ru_maxrss  # baselines, subtracted after the run
 284 |       flt = ru.ru_minflt
 285 | 
 286 |   initTest()
 287 | 
 288 |   # Fibril initializes before benching
 289 |   tp = Taskpool.new(numThreads = nthreads)
 290 | 
 291 |   prep()
 292 |   when not defined(windows):
 293 |     let start = wtime_usec()  # only test() is timed
 294 |   test()
 295 |   when not defined(windows):
 296 |     let stop = wtime_usec()
 297 | 
 298 |     getrusage(RusageSelf, ru)
 299 |     rss = ru.ru_maxrss - rss
 300 |     flt = ru.ru_minflt - flt
 301 | 
 302 |   tp.shutdown()
 303 | 
 304 |   verify()
 305 |   delete(even)
 306 |   delete(odd)
 307 | 
 308 |   echo "Scheduler:        Taskpools"
 309 |   echo "Benchmark:        heat"
 310 |   echo "Threads:          ", nthreads
 311 |   when not defined(windows):
 312 |     echo "Time(us)          ", stop - start
 313 |     echo "Max RSS (KB):     ", ru.ru_maxrss
 314 |     echo "Runtime RSS (KB): ", rss
 315 |     echo "# of page faults: ", flt
 316 | 
 317 |   quit 0
318 | 
319 | main()
320 | 


--------------------------------------------------------------------------------
/benchmarks/matmul_cache_oblivious/README.md:
--------------------------------------------------------------------------------
 1 | # Cache-Oblivious Matrix Multiplication
 2 | 
 3 | From Staccato and Cilk
 4 | 
 5 | https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk
 6 | See the paper ``Cache-Oblivious Algorithms'', by
 7 | Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
 8 | Sridhar Ramachandran, FOCS 1999, for an explanation of
 9 | why this algorithm is good for caches.
10 | 
11 | Note that the benchmarks output incorrect matrix traces
12 | according to the check ...
13 | 


--------------------------------------------------------------------------------
/benchmarks/matmul_cache_oblivious/taskpool_matmul_co.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | # Rectangular matrix multiplication.
  9 | #
 10 | # Adapted from Cilk 5.4.3 example
 11 | #
 12 | # https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk;
 13 | # See the paper ``Cache-Oblivious Algorithms'', by
 14 | # Matteo Frigo, Charles E. Leiserson, Harald Prokop, and
 15 | # Sridhar Ramachandran, FOCS 1999, for an explanation of
 16 | # why this algorithm is good for caches.
 17 | 
 18 | import
 19 |   # Stdlib
 20 |   strformat, os, strutils, math, system/ansi_c,
 21 |   cpuinfo,
 22 |   # Taskpool
 23 |   ../../taskpools,
 24 |   # bench
 25 |   ../wtime, ../resources
 26 | 
 27 | # Helpers
 28 | # -------------------------------------------------------
 29 | 
 30 | # We need a thin wrapper around raw pointers for matrices,
 31 | # we can't pass "var" to other threads
  32 | type
  33 |   Matrix[T: SomeFloat] = object      # a square matrix, or a view into one (see `stride`)
  34 |     buffer: ptr UncheckedArray[T]    # first element of the matrix or view
  35 |     ld: int                          # leading dimension: elements between consecutive rows
  36 | 
  37 | var tp: Taskpool                     # pool that `matmul` spawns its sub-multiplications on
 38 | 
 39 | func newMatrixNxN[T](n: int): Matrix[T] {.inline.} =
 40 |   result.buffer = cast[ptr UncheckedArray[T]](c_malloc(csize_t n*n*sizeof(T)))
 41 |   result.ld = n
 42 | 
 43 | template `[]`[T](mat: Matrix[T], row, col: Natural): T =
 44 |   # row-major storage
 45 |   assert row < mat.ld, $i & " < " & $mat.ld
 46 |   assert col < mat.ld, $i & " < " & $mat.ld
 47 |   mat.buffer[row * mat.ld + col]
 48 | 
 49 | template `[]=`[T](mat: Matrix[T], row, col: Natural, value: T) =
 50 |   assert row < mat.ld, $i & " < " & $mat.ld
 51 |   assert col < mat.ld, $i & " < " & $mat.ld
 52 |   mat.buffer[row * mat.ld + col] = value
 53 | 
 54 | func stride*[T](mat: Matrix[T], row, col: Natural): Matrix[T]{.inline.}=
 55 |   ## Returns a new view offset by the row and column stride
 56 |   result.buffer = cast[ptr UncheckedArray[T]](
 57 |     addr mat.buffer[row*mat.ld + col]
 58 |   )
 59 | 
  60 | func delete[T](mat: sink Matrix[T]) =  # frees the C buffer; the matrix must not be used afterwards
  61 |   c_free(mat.buffer)
 62 | 
 63 | # -------------------------------------------------------
 64 | 
  65 | proc xorshiftRand(): uint32 =  # xor/shift pseudo-random generator with a fixed seed
  66 |   var x {.global.} = uint32(2463534242)  # state kept across calls, unsynchronized
  67 |   x = x xor (x shr 13)
  68 |   x = x xor (x shl 17)
  69 |   x = x xor (x shr 5)
  70 |   return x
 71 | 
  72 | func zero[T](A: Matrix[T]) =
  73 |   # Clears all ld*ld elements of A to zero bytes; zeroing is not timed
  74 |   zeroMem(A.buffer, A.ld * A.ld * sizeof(T))
 75 | 
  76 | proc fill[T](A: Matrix[T]) =  # fills A with pseudo-random whole numbers in [0, ld)
  77 |   for i in 0 ..< A.ld:
  78 |     for j in 0 ..< A.ld:
  79 |       A[i, j] = T(xorshiftRand() mod A.ld.uint32)
 80 | 
 81 | func maxError(A, B: Matrix): float64 =
 82 |   assert A.ld == B.ld
 83 |   for i in 0 ..< A.ld:
 84 |     for j in 0 ..< A.ld:
 85 |       var diff = (A[i, j] - B[i, j]) / A[i, j]
 86 |       if diff < 0:
 87 |         diff = -diff
 88 |       if diff > result:
 89 |         result = diff
 90 | 
  91 | func check[T](A, B, C: Matrix[T], n: int): bool =  # true when trace(C) is within 1e-3 of trace(A*B)
  92 |   var
  93 |     tr_C = 0.T   # trace of C
  94 |     tr_AB = 0.T  # trace of A*B, i.e. the sum over i, j of A[i, j] * B[j, i]
  95 |   for i in 0 ..< n:
  96 |     for j in 0 ..< n:
  97 |       tr_AB += A[i, j] * B[j, i]
  98 |     tr_C += C[i, i]
  99 | 
 100 |   # Note, all benchmarks return false ‾\_(ツ)_/‾ -- NOTE(review): the tolerance is absolute; confirm it suits large traces
 101 |   return abs(tr_AB - tr_C) < 1e-3
102 | 
proc matmul[T](A, B, C: Matrix[T], m, n, p: int, add: bool): bool =
  ## Recursive divide-and-conquer multiply.
  ## Computes `C = A * B`, or `C += A * B` when `add` is true, where the
  ## loops treat `A` as m x n, `B` as n x p and `C` as m x p.
  ##
  ## The original bench passes around a ``ld`` parameter (leading dimension?),
  ## we store it in the matrices.
  ## The bool result is a dummy (always false) that only exists so callers
  ## can wait on a spawned matmul.

  # Threshold: small problems are multiplied directly.
  if (m + n + p) <= 64:
    if add:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] += c
    else:
      for i in 0 ..< m:
        for k in 0 ..< p:
          var c = 0.T
          for j in 0 ..< n:
            c += A[i, j] * B[j, k]
          C[i, k] = c

    return

  var h0, h1: FlowVar[bool]
  ## Each half of the computation

  # matrix is larger than threshold
  if m >= n and n >= p:
    # Split the rows of A and C: the halves write disjoint rows of C.
    let m1 = m shr 1 # divide by 2
    h0 = tp.spawn matmul(A, B, C, m1, n, p, add)
    h1 = tp.spawn matmul(A.stride(m1, 0), B, C.stride(m1, 0), m - m1, n, p, add)
  elif n >= m and n >= p:
    # Split the shared dimension: both halves target the very same cells of
    # C and the second one accumulates onto the first. Running them as two
    # concurrent tasks is a data race (and loses the first half whenever the
    # second finishes earlier), so they are executed one after the other.
    let n1 = n shr 1 # divide by 2
    discard matmul(A, B, C, m, n1, p, add)
    discard matmul(A.stride(0, n1), B.stride(n1, 0), C, m, n - n1, p, add = true)
    return
  else:
    # Split the columns of B and C: the halves write disjoint columns of C.
    let p1 = p shr 1
    h0 = tp.spawn matmul(A, B, C, m, n, p1, add)
    h1 = tp.spawn matmul(A, B.stride(0, p1), C.stride(0, p1), m, n, p - p1, add)

  discard sync(h0)
  discard sync(h1)
146 | 
proc main() =
  ## Benchmark driver: reads the thread count from the environment and the
  ## matrix size from the command line, multiplies two random square
  ## matrices on the task pool and prints timing and memory figures.
  echo "Warning the benchmark seems to not be correct."
  var n = 3000

  let nthreads =
    if existsEnv"TASKPOOL_NUM_THREADS":
      getEnv"TASKPOOL_NUM_THREADS".parseInt()
    else:
      countProcessors()

  case paramCount()
  of 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Running with default config n = {n}"
  of 1:
    n = paramStr(1).parseInt()
  else:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n (matrix size):{n}>"
    echo &"Up to 1 parameter is valid. Received {paramCount()}"
    quit 1

  var
    A = newMatrixNxN[float32](n)
    B = newMatrixNxN[float32](n)
    C = newMatrixNxN[float32](n)

  fill(A)
  fill(B)
  zero(C)

  # Baseline resource usage, taken before the pool exists.
  var ru: Rusage
  getrusage(RusageSelf, ru)
  let
    rssBefore = ru.ru_maxrss
    fltBefore = ru.ru_minflt

  # Staccato benches runtime init and exit as well
  let start = wtime_msec()

  tp = Taskpool.new(numThreads = nthreads)
  discard sync tp.spawn matmul(A, B, C, n, n, n, add = false)
  tp.shutdown()

  let stop = wtime_msec()

  getrusage(RusageSelf, ru)
  let
    rss = ru.ru_maxrss - rssBefore
    flt = ru.ru_minflt - fltBefore

  echo "Scheduler:        Taskpool"
  echo "Benchmark:        Matrix Multiplication (cache oblivious)"
  echo "Threads:          ", nthreads
  echo "Time(ms)          ", stop - start
  echo "Max RSS (KB):     ", ru.ru_maxrss
  echo "Runtime RSS (KB): ", rss
  echo "# of page faults: ", flt
  echo "Input:            ", n
  echo "Error:           ", check(A, B, C, n)

  delete A
  delete B
  delete C

  quit 0
212 | 
213 | main()
214 | 


--------------------------------------------------------------------------------
/benchmarks/nqueens/stdnim_nqueens.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | #
  8 | # Original code licenses
  9 | # ------------------------------------------------------------------------------------------------
 10 | #
 11 | # /**********************************************************************************************/
 12 | # /*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
 13 | # /*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
 14 | # /*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
 15 | # /*                                                                                            */
 16 | # /*  This program is free software; you can redistribute it and/or modify                      */
 17 | # /*  it under the terms of the GNU General Public License as published by                      */
 18 | # /*  the Free Software Foundation; either version 2 of the License, or                         */
 19 | # /*  (at your option) any later version.                                                       */
 20 | # /*                                                                                            */
 21 | # /*  This program is distributed in the hope that it will be useful,                           */
 22 | # /*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
 23 | # /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
 24 | # /*  GNU General Public License for more details.                                              */
 25 | # /*                                                                                            */
 26 | # /*  You should have received a copy of the GNU General Public License                         */
 27 | # /*  along with this program; if not, write to the Free Software                               */
 28 | # /*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
 29 | # /**********************************************************************************************/
 30 | #
 31 | # /*
 32 | #  * Original code from the Cilk project (by Keith Randall)
 33 | #  *
 34 | #  * Copyright (c) 2000 Massachusetts Institute of Technology
 35 | #  * Copyright (c) 2000 Matteo Frigo
 36 | #  */
 37 | 
 38 | import
 39 |   # Stdlib
 40 |   system/ansi_c, strformat, os, strutils,
 41 |   threadpool,
 42 |   # bench
 43 |   ../wtime
 44 | 
 45 | # This deadlocks :/
 46 | 
 47 | # Nim helpers
 48 | # -------------------------------------------------
 49 | 
 50 | when defined(windows):
 51 |   proc alloca(size: csize): pointer {.header: "<malloc.h>".}
 52 | else:
 53 |   proc alloca(size: csize): pointer {.header: "<alloca.h>".}
 54 | 
 55 | template alloca*(T: typedesc): ptr T =
 56 |   cast[ptr T](alloca(sizeof(T)))
 57 | 
 58 | template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
 59 |   cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
 60 | 
 61 | proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
 62 |   cast[type result](c_malloc(csize_t len*sizeof(T)))
 63 | 
 64 | proc tp_free*[T: ptr](p: T) {.inline.} =
 65 |   c_free(p)
 66 | 
 67 | # We assume that Nim zeroMem vs C memset
 68 | # and Nim copyMem vs C memcpy have no difference
 69 | # Nim does have extra checks to handle GC-ed types
 70 | # but they should be eliminated by the Nim compiler.
 71 | 
 72 | # -------------------------------------------------
 73 | 
# Unchecked view over a run of chars; the solver stores one queen position
# per char.
type CharArray = ptr UncheckedArray[char]

# Copy of the first complete placement recorded by `nqueens_ser`;
# nil while no placement has been recorded.
var example_solution: ptr UncheckedArray[char]
 77 | 
 78 | func isValid(n: int32, a: CharArray): bool =
 79 |   ## `a` contains an array of `n` queen positions.
 80 |   ## Returns true if none of the queens conflict and 0 otherwise.
 81 | 
 82 |   for i in 0'i32 ..< n:
 83 |     let p = cast[int32](a[i])
 84 | 
 85 |     for j in i+1 ..< n:
 86 |       let q = cast[int32](a[j])
 87 |       if q == p or q == p - (j-i) or q == p + (j-i):
 88 |         return false
 89 |   return true
 90 | 
 91 | proc nqueens_ser(n, j: int32, a: CharArray): int32 =
 92 |   # Serial nqueens
 93 |   if n == j:
 94 |     # Good solution count it
 95 |     if example_solution.isNil:
 96 |       example_solution = tp_alloc(char, n)
 97 |       copyMem(example_solution, a, n * sizeof(char))
 98 |       return 1
 99 | 
100 |   # Try each possible position for queen `j`
101 |   for i in 0 ..< n:
102 |     a[j] = cast[char](i)
103 |     if isValid(j+1, a):
104 |       result += nqueens_ser(n, j+1, a)
105 | 
proc nqueens_par(n, j: int32, a: CharArray): int32 =
  ## Parallel nqueens on the stdlib threadpool.
  ## `a` holds the positions of the first `j` queens; every valid position
  ## for queen `j` is explored in its own spawned task and the partial
  ## counts are summed.

  if n == j:
    # Good solution, count it
    return 1

  # One future slot per candidate position, zeroed so that slots which are
  # never spawned stay nil.
  var pending = alloca(Flowvar[int32], n)
  zeroMem(pending, n * sizeof(Flowvar[int32]))

  # Try each position for queen `j`
  for pos in 0 ..< n:
    # Every child works on its own copy of the prefix plus queen `j`.
    var board = alloca(char, j+1)
    copyMem(board, a, j * sizeof(char))
    board[j] = cast[char](pos)
    if isValid(j+1, board):
      pending[pos] = spawn nqueens_par(n, j+1, board)

  for pos in 0 ..< n:
    if not pending[pos].isNil():
      result += ^pending[pos]
126 | 
# Known number of N-queens solutions; entry `n-1` belongs to the n x n board.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  40, # 7x7 (was mistyped as 10)
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]
145 | 
proc verifyQueens(n, res: int32) =
  ## Prints a failure line when `res` differs from the known solution count
  ## for an `n` x `n` board. Prints a notice and returns when `n` has no
  ## entry in `solutions`.
  # Check both bounds: `n < 1` would otherwise index `solutions[n-1]`
  # out of range, and the message already promises the range [1, len].
  if n notin 1 .. solutions.len:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return

  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
153 | 
proc main() =
  ## Driver: expects exactly one argument (the board size), runs the
  ## parallel solver, verifies the count and prints the elapsed wall time.
  if paramCount() != 1:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <n: number of queens on a nxn board>"
    quit 0

  let n = int32(parseInt(paramStr(1)))

  if n < 1 or n > solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1

  # Only the solver call itself is timed.
  let start = wtime_msec()
  let count = nqueens_par(n, 0, alloca(char, n))
  let stop = wtime_msec()

  verifyQueens(n, count)

  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for idx in 0 ..< n:
      c_printf("%2d ", example_solution[idx])
    stdout.write('\n')

  echo &"Elapsed wall time: {stop-start:2.4f} ms"
180 | 
181 | main()
182 | 


--------------------------------------------------------------------------------
/benchmarks/nqueens/taskpool_nqueens.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | #
  8 | # Original code licenses
  9 | # ------------------------------------------------------------------------------------------------
 10 | #
 11 | # /**********************************************************************************************/
 12 | # /*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
 13 | # /*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
 14 | # /*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
 15 | # /*                                                                                            */
 16 | # /*  This program is free software; you can redistribute it and/or modify                      */
 17 | # /*  it under the terms of the GNU General Public License as published by                      */
 18 | # /*  the Free Software Foundation; either version 2 of the License, or                         */
 19 | # /*  (at your option) any later version.                                                       */
 20 | # /*                                                                                            */
 21 | # /*  This program is distributed in the hope that it will be useful,                           */
 22 | # /*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
 23 | # /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
 24 | # /*  GNU General Public License for more details.                                              */
 25 | # /*                                                                                            */
 26 | # /*  You should have received a copy of the GNU General Public License                         */
 27 | # /*  along with this program; if not, write to the Free Software                               */
 28 | # /*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
 29 | # /**********************************************************************************************/
 30 | #
 31 | # /*
 32 | #  * Original code from the Cilk project (by Keith Randall)
 33 | #  *
 34 | #  * Copyright (c) 2000 Massachusetts Institute of Technology
 35 | #  * Copyright (c) 2000 Matteo Frigo
 36 | #  */
 37 | 
 38 | import
 39 |   # Stdlib
 40 |   system/ansi_c, strformat, os, strutils, cpuinfo,
 41 |   # Taskpools
 42 |   ../../taskpools
 43 | 
 44 | when not defined(windows):
 45 |   # bench
 46 |   import ../wtime, ../resources
 47 | 
 48 | # Nim helpers
 49 | # -------------------------------------------------
 50 | 
 51 | when defined(windows):
 52 |   proc alloca(size: int): pointer {.header: "<malloc.h>".}
 53 | else:
 54 |   proc alloca(size: int): pointer {.header: "<alloca.h>".}
 55 | 
 56 | template alloca*(T: typedesc): ptr T =
 57 |   cast[ptr T](alloca(sizeof(T)))
 58 | 
 59 | template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
 60 |   cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
 61 | 
 62 | proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
 63 |   when defined(TP_useNimAlloc):
 64 |     cast[type result](createSharedU(T, len))
 65 |   else:
 66 |     cast[type result](c_malloc(csize_t len*sizeof(T)))
 67 | 
 68 | proc tp_free*[T: ptr](p: T) {.inline.} =
 69 |   when defined(TP_useNimAlloc):
 70 |     freeShared(p)
 71 |   else:
 72 |     c_free(p)
 73 | 
 74 | # We assume that Nim zeroMem vs C memset
 75 | # and Nim copyMem vs C memcpy have no difference
 76 | # Nim does have extra checks to handle GC-ed types
 77 | # but they should be eliminated by the Nim compiler.
 78 | 
 79 | # -------------------------------------------------
 80 | 
# Unchecked view over a run of chars; the solver stores one queen position
# per char.
type CharArray = ptr UncheckedArray[char]

# Task pool used by `nqueens_par`; created and shut down in `main`.
var tp: Taskpool
# Copy of the first complete placement recorded by `nqueens_ser`;
# nil while no placement has been recorded.
var example_solution: ptr UncheckedArray[char]
 85 | 
 86 | func isValid(n: int32, a: CharArray): bool =
 87 |   ## `a` contains an array of `n` queen positions.
 88 |   ## Returns true if none of the queens conflict and 0 otherwise.
 89 | 
 90 |   for i in 0'i32 ..< n:
 91 |     let p = int32(a[i])
 92 | 
 93 |     for j in i+1 ..< n:
 94 |       let q = int32(a[j])
 95 |       if q == p or q == p - (j-i) or q == p + (j-i):
 96 |         return false
 97 |   return true
 98 | 
 99 | proc nqueens_ser(n, j: int32, a: CharArray): int32 =
100 |   # Serial nqueens
101 |   if n == j:
102 |     # Good solution count it
103 |     if example_solution.isNil:
104 |       example_solution = tp_alloc(char, n)
105 |       copyMem(example_solution, a, n * sizeof(char))
106 |     return 1
107 | 
108 |   # Try each possible position for queen `j`
109 |   for i in 0 ..< n:
110 |     a[j] = cast[char](i)
111 |     if isValid(j+1, a):
112 |       result += nqueens_ser(n, j+1, a)
113 | 
proc nqueens_par(n, j: int32, a: CharArray): int32 {.gcsafe, raises: [].} =
  ## Parallel nqueens on the task pool `tp`.
  ## `a` holds the positions of the first `j` queens; every valid position
  ## for queen `j` is explored in its own spawned task and the partial
  ## counts are summed.

  if n == j:
    # Good solution, count it
    return 1

  # One future slot per candidate position, zeroed up front so the
  # `isSpawned` test below can tell used slots from unused ones.
  var pending = alloca(Flowvar[int32], n)
  zeroMem(pending, n * sizeof(Flowvar[int32]))

  # Try each position for queen `j`
  for pos in 0 ..< n:
    # Every child works on its own copy of the prefix plus queen `j`.
    var board = alloca(char, j+1)
    copyMem(board, a, j * sizeof(char))
    board[j] = cast[char](pos)
    if isValid(j+1, board):
      pending[pos] = tp.spawn nqueens_par(n, j+1, board)

  for pos in 0 ..< n:
    if pending[pos].isSpawned():
      result += sync(pending[pos])
134 | 
# Known number of N-queens solutions; entry `n-1` belongs to the n x n board.
const solutions = [
  1,
  0,
  0,
  2,
  10, # 5x5
  4,
  40, # 7x7 (was mistyped as 10)
  92, # 8x8
  352,
  724, # 10x10
  2680,
  14200,
  73712,
  365596,
  2279184, # 15x15
  14772512
]
153 | 
proc verifyQueens(n, res: int32) =
  ## Prints a failure line when `res` differs from the known solution count
  ## for an `n` x `n` board. Prints a notice and returns when `n` has no
  ## entry in `solutions`.
  # Check both bounds: `n < 1` would otherwise index `solutions[n-1]`
  # out of range, and the message already promises the range [1, len].
  if n notin 1 .. solutions.len:
    echo &"Cannot verify result: {n} is out of range [1,{solutions.len}]"
    return

  if res != solutions[n-1]:
    echo &"N-Queens failure: {res} is different from expected {solutions[n-1]}"
161 | 
proc main() =
  ## Benchmark driver: reads the thread count from `TASKPOOL_NUM_THREADS`
  ## (falling back to the processor count) and the board size from the first
  ## command-line argument (default 11), runs the parallel solver on a fresh
  ## task pool and prints the results. Timing and resource figures are only
  ## collected on non-Windows targets.
  var
    n = 11'i32
    nthreads: int

  if existsEnv"TASKPOOL_NUM_THREADS":
    nthreads = getEnv"TASKPOOL_NUM_THREADS".parseInt()
  else:
    nthreads = countProcessors()

  if paramCount() == 0:
    let exeName = getAppFilename().extractFilename()
    echo &"Usage: {exeName} <N:{n}>"
    echo &"Running with default config N = {n}\n"

  if paramCount() >= 1:
    n = paramStr(1).parseInt.int32

  if n notin 1 .. solutions.len:
    echo &"The number of queens N (on a NxN board) must be in the range [1, {solutions.len}]"
    quit 1

  when not defined(windows):
    # Baseline resource usage, taken before the pool exists.
    var ru: Rusage
    getrusage(RusageSelf, ru)
    var
      rss = ru.ru_maxrss
      flt = ru.ru_minflt

  tp = Taskpool.new(numThreads = nthreads)

  when not defined(windows):
    let start = wtime_msec()

  let count = nqueens_par(n, 0, alloca(char, n))

  when not defined(windows):
    let stop = wtime_msec()

  when not defined(windows):
    getrusage(RusageSelf, ru)
    rss = ru.ru_maxrss - rss
    flt = ru.ru_minflt - flt

  tp.shutdown()

  verifyQueens(n, count)

  if not example_solution.isNil:
    stdout.write("Example solution: ")
    for i in 0 ..< n:
      c_printf("%2d ", example_solution[i])
    stdout.write('\n')

  echo "Scheduler:            Taskpool"
  echo "Benchmark:            N-queens"
  echo "Threads:              ", nthreads
  when not defined(windows):
    # `start`/`stop` come from `wtime_msec`, so the figure is in
    # milliseconds; the label used to claim microseconds.
    echo "Time(ms)              ", stop - start
    echo "Max RSS (KB):         ", ru.ru_maxrss
    echo "Runtime RSS (KB):     ", rss
    echo "# of page faults:     ", flt
  echo "Problem size:         ", n,"x",n, " board with ",n, " queens"
  echo "Solutions found:      ", count

  quit 0
228 | 
229 | main()
230 | 


--------------------------------------------------------------------------------
/benchmarks/resources.nim:
--------------------------------------------------------------------------------
 1 | type
 2 |   Timeval {.importc: "timeval", header:"<sys/time.h>", bycopy.} = object
 3 | 
 4 |   Rusage* {.importc: "struct rusage", header:"<sys/resource.h>", bycopy.} = object
 5 |     ru_utime {.importc.}: Timeval
 6 |     ru_stime {.importc.}: Timeval
 7 |     ru_maxrss* {.importc.}: int32  # Maximum resident set size
 8 |     # ...
 9 |     ru_minflt* {.importc.}: int32  # page reclaims (soft page faults)
10 | 
11 |   RusageWho* {.size: sizeof(cint).} = enum
12 |     RusageChildren = -1
13 |     RusageSelf = 0
14 |     RusageThread = 1
15 | 
when defined(debug):
  # Cross-check the hand-written `RusageWho` values against the C macros.
  # The Nim names carry an `H_` prefix because `RUSAGE_SELF` and `RusageSelf`
  # are the same identifier under Nim's style insensitivity, so the C name
  # has to be given explicitly in `importc`.
  var H_RUSAGE_SELF{.importc: "RUSAGE_SELF", header:"<sys/resource.h>".}: cint
  var H_RUSAGE_CHILDREN{.importc: "RUSAGE_CHILDREN", header:"<sys/resource.h>".}: cint
  assert H_RUSAGE_SELF == ord(RusageSelf)
  assert H_RUSAGE_CHILDREN == ord(RusageChildren)
  when defined(linux):
    # RUSAGE_THREAD is a Linux extension.
    # NOTE(review): glibc may only expose it with _GNU_SOURCE defined - confirm.
    var H_RUSAGE_THREAD{.importc: "RUSAGE_THREAD", header:"<sys/resource.h>".}: cint
    assert H_RUSAGE_THREAD == ord(RusageThread)
23 | 
# Binding to the C `getrusage` function from sys/resource.h: `who` selects
# whose usage is queried and the figures are written into `usage`.
# NOTE(review): the C function's int status result is not exposed here.
proc getrusage*(who: RusageWho, usage: var Rusage) {.importc, header: "sys/resource.h".}
25 | 


--------------------------------------------------------------------------------
/benchmarks/single_task_producer/README.md:
--------------------------------------------------------------------------------
1 | # Simple single-producer multiple consumers benchmarks
2 | 
3 | SPC: a Simple Producer-Consumer benchmark.
4 | 
5 | A single worker produces n tasks,
6 | each running for t microseconds. This benchmark allows us to test how many
7 | concurrent consumers a single producer can sustain.
8 | 


--------------------------------------------------------------------------------
/benchmarks/single_task_producer/taskpool_spc.nim:
--------------------------------------------------------------------------------
  1 | import
  2 |   # STD lib
  3 |   os, strutils, system/ansi_c, cpuinfo, strformat, math,
  4 |   # Library
  5 |   ../../taskpools,
  6 |   # bench
  7 |   ../wtime, ../resources
  8 | 
# Benchmark parameters; `main` assigns them before any task is spawned.
var NumTasksTotal: int32   # number of tasks the producer spawns
var TaskGranularity: int32 # microsecond
var PollInterval: float64  # microsecond

# Task pool the producer spawns onto; created and shut down in `main`.
var tp: Taskpool

# Per-thread value; `spc_consume` resets it to `PollInterval` on entry.
var global_poll_elapsed {.threadvar.}: float64
 16 | 
 17 | template dummy_cpt(): untyped =
 18 |   # Dummy computation
 19 |   # Calculate fib(30) iteratively
 20 |   var
 21 |     fib = 0
 22 |     f2 = 0
 23 |     f1 = 1
 24 |   for i in 2 .. 30:
 25 |     fib = f1 + f2
 26 |     f2 = f1
 27 |     f1 = fib
 28 | 
 29 | proc spc_consume(usec: int32) =
 30 | 
 31 |   var pollElapsed = 0'f64
 32 | 
 33 |   let start = wtime_usec()
 34 |   let stop = usec.float64
 35 |   global_poll_elapsed = PollInterval
 36 | 
 37 |   while true:
 38 |     var elapsed = wtime_usec() - start
 39 |     elapsed = elapsed - pollElapsed
 40 |     if elapsed >= stop:
 41 |       break
 42 | 
 43 |     dummy_cpt()
 44 | 
 45 |     # if elapsed >= global_poll_elapsed:
 46 |     #   let pollStart = wtime_usec()
 47 |     #   loadBalance(Weave)
 48 |     #   pollElapsed += wtime_usec() - pollStart
 49 |     #   global_poll_elapsed += PollInterval
 50 | 
 51 |   # c_printf("Elapsed: %.2lfus\n", elapsed)
 52 | 
 53 | proc spc_consume_nopoll(usec: int32) =
 54 | 
 55 |   let start = wtime_usec()
 56 |   let stop = usec.float64
 57 | 
 58 |   while true:
 59 |     var elapsed = wtime_usec() - start
 60 |     if elapsed >= stop:
 61 |       break
 62 | 
 63 |     dummy_cpt()
 64 | 
 65 |   # c_printf("Elapsed: %.2lfus\n", elapsed)
 66 | 
 67 | proc spc_produce(n: int32) =
 68 |   for i in 0 ..< n:
 69 |     tp.spawn spc_consume(TaskGranularity)
 70 | 
 71 | proc spc_produce_seq(n: int32) =
 72 |   for i in 0 ..< n:
 73 |     spc_consume_nopoll(TaskGranularity)
 74 | 
 75 | proc main() =
 76 |   NumTasksTotal = 1000000
 77 |   TaskGranularity = 10
 78 |   PollInterval = 10
 79 | 
 80 |   if paramCount() == 0:
 81 |     let exeName = getAppFilename().extractFilename()
 82 |     echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
 83 |          &"<task granularity (us): {TaskGranularity}> " &
 84 |          &"[polling interval (us): task granularity]"
 85 |     echo &"Running with default config tasks = {NumTasksTotal}, granularity (us) = {TaskGranularity}, polling (us) = {PollInterval}"
 86 |   if paramCount() >= 1:
 87 |     NumTasksTotal = paramStr(1).parseInt.int32
 88 |   if paramCount() >= 2:
 89 |     TaskGranularity = paramStr(2). parseInt.int32
 90 |   if paramCount() == 3:
 91 |     PollInterval = paramStr(3).parseInt.float64
 92 |   else:
 93 |     PollInterval = TaskGranularity.float64
 94 |   if paramCount() > 3:
 95 |     let exeName = getAppFilename().extractFilename()
 96 |     echo &"Usage: {exeName} <# of tasks:{NumTasksTotal}> " &
 97 |          &"<task granularity (us): {TaskGranularity}> " &
 98 |          &"[polling interval (us): task granularity]"
 99 |     quit 1
100 | 
101 |   var nthreads: int
102 |   if existsEnv"TP_NUM_THREADS":
103 |     nthreads = getEnv"TP_NUM_THREADS".parseInt()
104 |   else:
105 |     nthreads = countProcessors()
106 | 
107 |   tp = Taskpool.new(numThreads = nthreads)
108 | 
109 |   # measure overhead during tasking
110 |   var ru: Rusage
111 |   getrusage(RusageSelf, ru)
112 |   var
113 |     rss = ru.ru_maxrss
114 |     flt = ru.ru_minflt
115 | 
116 |   let start = wtime_msec()
117 | 
118 |   # spc_produce_seq(NumTasksTotal)
119 |   spc_produce(NumTasksTotal)
120 |   tp.syncAll()
121 | 
122 |   let stop = wtime_msec()
123 | 
124 |   getrusage(RusageSelf, ru)
125 |   rss = ru.ru_maxrss - rss
126 |   flt = ru.ru_minflt - flt
127 | 
128 |   tp.shutdown()
129 | 
130 |   echo "--------------------------------------------------------------------------"
131 |   echo "Scheduler:                                     Taskpool"
132 |   echo "Benchmark:                                     SPC (Single task Producer - multi Consumer)"
133 |   echo "Threads:                                       ", nthreads
134 |   echo "Time(ms)                                       ", round(stop - start, 3)
135 |   echo "Max RSS (KB):                                  ", ru.ru_maxrss
136 |   echo "Runtime RSS (KB):                              ", rss
137 |   echo "# of page faults:                              ", flt
138 |   echo "--------------------------------------------------------------------------"
139 |   echo "# of tasks:                                    ", NumTasksTotal
140 |   echo "Task granularity (us):                         ", TaskGranularity
141 |   echo "Polling / manual load balancing interval (us): ", PollInterval
142 | 
143 |   quit 0
144 | 
145 | main()
146 | 


--------------------------------------------------------------------------------
/benchmarks/wtime.h:
--------------------------------------------------------------------------------
 1 | #ifndef WTIME_H
 2 | #define WTIME_H
 3 | 
 4 | #include <sys/time.h>
 5 | #include <time.h>
 6 | 
 7 | // Number of seconds since the Epoch
 8 | static inline double Wtime_sec(void)
 9 | {
10 | 	struct timeval tv;
11 | 	gettimeofday(&tv, NULL);
12 | 	return tv.tv_sec + tv.tv_usec / 1e6;
13 | }
14 | 
// Wall-clock time as milliseconds since the Epoch.
static inline double Wtime_msec(void)
{
	struct timeval now;
	gettimeofday(&now, NULL);
	const double from_seconds = now.tv_sec * 1e3;
	const double from_micros = now.tv_usec / 1e3;
	return from_seconds + from_micros;
}
22 | 
// Wall-clock time as microseconds since the Epoch.
static inline double Wtime_usec(void)
{
	struct timeval now;
	gettimeofday(&now, NULL);
	const double from_seconds = now.tv_sec * 1e6;
	return from_seconds + now.tv_usec;
}
30 | 
// Compiled out by the `#if 0` guard; kept for reference only.
#if 0
// Read time stamp counter on x86
// Returns the two 32-bit halves delivered by the instruction combined into
// one 64-bit value (high half from EDX, low half from EAX).
static inline unsigned long long readtsc(void)
{
	unsigned int lo, hi;
	// RDTSC copies contents of 64-bit TSC into EDX:EAX
	asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
 	return (unsigned long long)hi << 32 | lo;
}
#endif
41 | 
// Helpers that paste `__LINE__` onto an identifier, giving each WTIME use a
// variable name unique to the source line it appears on. The extra level of
// indirection makes `__LINE__` expand before it is pasted.
#define WTIME_unique_var_name_paste(id, n) id ## n
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)

// Convenience macro for time measurement
// Usage: `WTIME(unit) statement-or-block`, where `unit` is pasted between
// `Wtime_` and `ec` to pick the clock (s -> Wtime_sec, ms -> Wtime_msec,
// us -> Wtime_usec). The macro records a start time and opens a for loop
// whose body runs exactly once; when the loop condition is evaluated again
// the elapsed time is printed and the loop ends.
// NOTE(review): expands to a printf call, but this header does not include
// <stdio.h>; the including file has to provide it.
#define WTIME(unit) \
	double WTIME_unique_var(_start_##unit##_) = Wtime_##unit##ec(); \
	int WTIME_unique_var(_i_) = 0; \
	for (; WTIME_unique_var(_i_) == 0 || \
		 (printf("Elapsed wall time: %.2lf "#unit"\n", \
			     Wtime_##unit##ec() - WTIME_unique_var(_start_##unit##_)), 0); \
		 WTIME_unique_var(_i_)++)
54 | 
55 | #endif // WTIME_H
56 | 


--------------------------------------------------------------------------------
/benchmarks/wtime.nim:
--------------------------------------------------------------------------------
 1 | 
 2 | import strutils, os
 3 | 
# Directory of this module, obtained at compile time by cutting the file
# name off `currentSourcePath`; the C header is expected next to it.
const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0]
const cHeader = cSourcesPath / "wtime.h"

# Add that directory to the C compiler's include search path.
{.passc: "-I" & cSourcesPath .}

# Binding to C `Wtime_usec`: microseconds since the Epoch.
proc wtime_usec*: float64 {.importc: "Wtime_usec", header: cHeader.}
# Binding to C `Wtime_msec`: milliseconds since the Epoch.
proc wtime_msec*: float64 {.importc: "Wtime_msec", header: cHeader.}
11 | 


--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
 1 | # Taskpools architecture
 2 | 
 3 | Taskpools architecture is a simple threadpool with work-stealing to handle unbalanced workloads.
 4 | 
 5 | ## Architecture
 6 | 
 7 | ### Processing steps
 8 | 
 9 | 1. On a `spawn` expression, thread i packages the function call in a task.
10 | 2. It enqueues the task in its own deque.
11 | 3. It calls `notify_one` on a condition variable on which all sleeping threads wait.
12 | 4. The notified thread wakes up.
13 | 5. The notified thread tries to steal a task from a randomly chosen worker.
14 | 6. If no tasks are found, it goes back to sleep.
15 | 7. Otherwise it runs the task.
16 | 8. On a `sync` statement, the thread runs tasks from its own deque or steals a task from another worker.
17 | 9. Once the `sync` task is ready, it can run the following statements (continuation).
18 | 


--------------------------------------------------------------------------------
/examples/e01_simple_tasks.nim:
--------------------------------------------------------------------------------
 1 | import ../taskpools
 2 | 
 3 | block: # Async without result
 4 | 
 5 |   proc displayInt(x: int) =
 6 |     try:
 7 |       stdout.write(x)
 8 |       stdout.write(" - SUCCESS\n")
 9 |     except IOError:
10 |       quit 1 # can't do anything productive
11 | 
  proc main() =
    ## Sanity check 1: schedules two fire-and-forget tasks on a 4-thread
    ## pool and shuts the pool down.
    echo "\nSanity check 1: Printing 123456 654321 in parallel"

    var tp = Taskpool.new(numThreads = 4)
    # No result handle is kept: these tasks are only run for their output.
    tp.spawn displayInt(123456)
    tp.spawn displayInt(654321)
    tp.shutdown()
19 | 
20 |   main()
21 | 
22 | block: # Async/Await
23 | 
24 |   var tp: Taskpool
25 | 
26 | 
27 |   proc asyncFib(n: int): int {.gcsafe, raises: [].} =
28 |     if n < 2:
29 |       return n
30 | 
31 |     let x = tp.spawn asyncFib(n-1)
32 |     let y = asyncFib(n-2)
33 | 
34 |     result = sync(x) + y
35 | 
  proc main2() =
    ## Sanity check 2: computes fib(20) with `asyncFib` on a default-sized
    ## pool and checks the result after the pool has been shut down.
    echo "\nSanity check 2: fib(20)"

    tp = Taskpool.new()
    let f = asyncFib(20)
    tp.shutdown()

    doAssert f == 6765
44 | 
45 |   main2()
46 | 


--------------------------------------------------------------------------------
/examples/e02_parallel_pi.nim:
--------------------------------------------------------------------------------
 1 | # Demo of the API using a very inefficient π approximation algorithm.
 2 | 
 3 | import
 4 |   std/[strutils, cpuinfo],
 5 |   ../taskpools
 6 | 
 7 | # From https://github.com/nim-lang/Nim/blob/v1.6.2/tests/parallel/tpi.nim
 8 | # Leibniz Formula https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
 9 | proc term(k: int): float =
10 |   if k mod 2 == 1:
11 |     -4'f / float(2*k + 1)
12 |   else:
13 |     4'f / float(2*k + 1)
14 | 
proc piApprox(tp: Taskpool, n: int): float =
  ## Sums the first `n` series terms, computing each term in its own task.
  var handles = newSeq[Flowvar[float]](n)
  # Schedule one task per term on the pool and keep the handle that will
  # deliver its result.
  for idx in 0 ..< handles.len:
    handles[idx] = tp.spawn term(idx)
  # Collect in index order; each `sync` blocks until that result is available.
  for idx in 0 ..< handles.len:
    result += sync handles[idx]
21 | 
22 | proc main() =
23 |   let n = 1_000_000                             # Number of series terms; piApprox spawns one task per term.
24 |   let nthreads = countProcessors()
25 | 
26 |   var tp = Taskpool.new(num_threads = nthreads) # Default to the number of hardware threads.
27 | 
28 |   echo formatFloat(tp.piApprox(n))
29 | 
30 |   tp.syncAll()                                  # Block until all pending tasks are processed (implied in tp.shutdown())
31 |   tp.shutdown()
32 | 
33 | # Compile with nim c -r -d:release --threads:on --outdir:build example.nim
34 | main()
35 | 


--------------------------------------------------------------------------------
/papers/Chase-Lev - Dynamic Circular Work-Stealing Deque.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/status-im/nim-taskpools/9e8ccc754631ac55ac2fd495e167e74e86293edb/papers/Chase-Lev - Dynamic Circular Work-Stealing Deque.pdf


--------------------------------------------------------------------------------
/papers/Nhat Minh Le et al - Correct and Efficient Work-Stealing for Weak Memory Models.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/status-im/nim-taskpools/9e8ccc754631ac55ac2fd495e167e74e86293edb/papers/Nhat Minh Le et al - Correct and Efficient Work-Stealing for Weak Memory Models.pdf


--------------------------------------------------------------------------------
/taskpools.nim:
--------------------------------------------------------------------------------
 1 | # taskpools
 2 | # Copyright (c) 2021- Status Research & Development GmbH
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | import taskpools/taskpools
 9 | export taskpools
10 | 


--------------------------------------------------------------------------------
/taskpools.nimble:
--------------------------------------------------------------------------------
 1 | mode = ScriptMode.Verbose
 2 | 
 3 | packageName   = "taskpools"
 4 | version       = "0.1.0"
 5 | author        = "Status Research & Development GmbH"
 6 | description   = "lightweight, energy-efficient, easily auditable threadpool"
 7 | license       = "MIT"
 8 | skipDirs      = @["tests"]
 9 | 
10 | requires "nim >= 1.6.0"
11 | 
12 | let nimc = getEnv("NIMC", "nim") # Which nim compiler to use
13 | let lang = getEnv("NIMLANG", "c") # Which backend (c/cpp/js)
14 | let flags = getEnv("NIMFLAGS", "") # Extra flags for the compiler
15 | let verbose = getEnv("V", "") notin ["", "0"]
16 | 
17 | let cfg =
18 |   " --styleCheck:usages --styleCheck:error" &
19 |   (if verbose: "" else: " --verbosity:0 --hints:off") &
20 |   " --skipParentCfg --skipUserCfg --outdir:build --nimcache:build/nimcache -f" &
21 |   " --stacktrace:on --linetrace:on" &
22 |   " --threads:on"
23 | 
24 | proc build(args, path: string) =
25 |   exec nimc & " " & lang & " " & cfg & " " & flags & " " & args & " " & path
26 | 
27 | proc run(args, path: string) =
28 |   build args & " --mm:refc -r", path
29 |   if (NimMajor, NimMinor) > (1, 6):
30 |     build args & " --mm:orc -r", path
31 | 
32 | task test, "Run Taskpools tests":
33 |   # Internal data structures
34 |   run "", "taskpools/channels_spsc_single.nim"
35 |   run "", "taskpools/sparsesets.nim"
36 | 
37 |   # Examples
38 |   run "", "examples/e01_simple_tasks.nim"
39 |   run "", "examples/e02_parallel_pi.nim"
40 | 
41 |   # Benchmarks
42 |   run "", "benchmarks/dfs/taskpool_dfs.nim"
43 |   run "", "benchmarks/heat/taskpool_heat.nim"
44 |   run "", "benchmarks/nqueens/taskpool_nqueens.nim"
45 | 
46 |   when not defined(windows):
47 |     run "", "benchmarks/single_task_producer/taskpool_spc.nim"
48 |     run "", "benchmarks/bouncing_producer_consumer/taskpool_bpc.nim"
49 | 
50 |   # TODO - generics in macro issue
51 |   # run "", "benchmarks/matmul_cache_oblivious/taskpool_matmul_co.nim"
52 | 


--------------------------------------------------------------------------------
/taskpools/ast_utils.nim:
--------------------------------------------------------------------------------
 1 | # taskpools
 2 | # Copyright (c) 2021 Status Research & Development GmbH
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | import macros
 9 | 
10 | template letsGoDeeper =
11 |   var rebuilt = newTree(node.kind)   # same kind as `node`; expects `node` and `inspect` at the expansion site
12 |   for sub in node:
13 |     rebuilt.add(inspect(sub))
14 |   return rebuilt
15 | 
16 | proc replaceSymsByIdents*(ast: NimNode): NimNode =
17 |   ## Rebuilds `ast` with every symbol/identifier node replaced by a fresh identifier of the same name.
18 |   proc inspect(node: NimNode): NimNode =
19 |     case node.kind:
20 |     of {nnkIdent, nnkSym}:
21 |       result = ident($node)
22 |     of nnkEmpty:
23 |       result = node
24 |     of nnkLiterals:
25 |       result = node
26 |     of nnkHiddenStdConv:
27 |       if node[1].kind != nnkIntLit:
28 |         expectKind(node[1], nnkSym)
29 |         return ident($node[1])
30 |       result = node[1]       # integer literal: drop the hidden conversion wrapper
31 |     else:
32 |       letsGoDeeper()         # rebuild this node from its processed children
33 |   inspect(ast)
34 | 


--------------------------------------------------------------------------------
/taskpools/channels_spsc_single.nim:
--------------------------------------------------------------------------------
  1 | # taskpools
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Copyright (c) 2021- Status Research & Development GmbH
  4 | # Licensed and distributed under either of
  5 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  6 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  7 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  8 | 
  9 | {.push raises: [].}
 10 | 
 11 | import
 12 |   std/[atomics, typetraits]
 13 | 
 14 | type
 15 |   ChannelSPSCSingle*[T] = object
 16 |     ## A single-value SPSC channel
 17 |     ##
 18 |     ## Wait-free bounded single-producer single-consumer channel
 19 |     ## that can only buffer a single item
 20 |     ## Properties:
 21 |     ##   - wait-free
 22 |     ##   - supports weak memory models
 23 |     ##   - buffers a single item
 24 |     ##   - Padded to avoid false sharing in collections
 25 |     ##   - No extra indirection to access the item, the buffer is inline the channel
 26 |     ##   - Linearizable
 27 |     ##
 28 |     ## The channel should be the last field of an object if used in an intrusive manner
 29 |     full{.align: 64.}: Atomic[bool]
 30 |     value*: T
 31 | 
 32 | proc `=copy`[T](
 33 |     dest: var ChannelSPSCSingle[T],
 34 |     source: ChannelSPSCSingle[T]
 35 |   ) {.error: "A channel cannot be copied".}
 36 | 
 37 | func isEmpty*(chan: var ChannelSPSCSingle): bool {.inline.} =
 38 |   not chan.full.load(moAcquire) # empty means the `full` flag is not set
 39 | 
 40 | func tryRecv*[T](chan: var ChannelSPSCSingle, dst: var T): bool {.inline.} =
 41 |   ## Try receiving the item buffered in the channel into `dst`.
 42 |   ## Returns true if successful (channel was not empty),
 43 |   ## false if the channel was empty, in which case `dst` is not written.
 44 |   ## ⚠ Use only in the consumer thread that reads from the channel.
 45 |   static: doAssert supportsCopyMem(T), "Channel is not garbage-collection-safe"
 46 | 
 47 |   case chan.full.load(moAcquire)
 48 |   of true:
 49 |     dst = move(chan.value)
 50 |     chan.full.store(false, moRelease) # the slot is marked empty only after the value was moved out
 51 |     true
 52 |   of false:
 53 |     false
 54 | 
 55 | func trySend*[T](chan: var ChannelSPSCSingle, src: sink T): bool {.inline.} =
 56 |   ## Try sending an item into the channel.
 57 |   ## Returns true if successful (channel was empty),
 58 |   ## false if the channel already holds an item, in which case nothing is stored.
 59 |   ## ⚠ Use only in the producer thread that writes to the channel.
 60 |   static: doAssert supportsCopyMem(T), "Channel is not garbage-collection-safe"
 61 | 
 62 |   case chan.full.load(moAcquire)
 63 |   of true:
 64 |     false
 65 |   of false:
 66 |     chan.value = move(src)
 67 |     chan.full.store(true, moRelease) # the slot is marked full only after the value was written
 68 |     true
 69 | 
 70 | {.pop.} # raises: []
 71 | 
 72 | # Sanity checks
 73 | # ------------------------------------------------------------------------------
 74 | when isMainModule:
 75 |   when not compileOption("threads"):
 76 |     {.error: "This requires --threads:on compilation flag".}
 77 | 
 78 |   template sendLoop[T](chan: var ChannelSPSCSingle[T],
 79 |                        data: sink T,
 80 |                        body: untyped): untyped =
 81 |     while not chan.trySend(data):
 82 |       body
 83 | 
 84 |   template recvLoop[T](chan: var ChannelSPSCSingle[T],
 85 |                        data: var T,
 86 |                        body: untyped): untyped =
 87 |     while not chan.tryRecv(data):
 88 |       body
 89 | 
 90 |   type
 91 |     ThreadArgs = object
 92 |       ID: WorkerKind
 93 |       chan: ptr ChannelSPSCSingle[int]
 94 | 
 95 |     WorkerKind = enum
 96 |       Sender
 97 |       Receiver
 98 | 
 99 |   template Worker(id: WorkerKind, body: untyped): untyped {.dirty.} =
100 |     if args.ID == id:
101 |       body
102 | 
103 |   proc thread_func(args: ThreadArgs) =
104 | 
105 |     # Worker RECEIVER:
106 |     # ---------
107 |     # <- chan
108 |     # <- chan
109 |     # <- chan
110 |     #
111 |     # Worker SENDER:
112 |     # ---------
113 |     # chan <- 42
114 |     # chan <- 53
115 |     # chan <- 64
116 |     Worker(Receiver):
117 |       var val: int
118 |       for j in 0 ..< 10:
119 |         args.chan[].recvLoop(val):
120 |           # Busy loop, in prod we might want to yield the core/thread timeslice
121 |           discard
122 |         echo "                  Receiver got: ", val
123 |         doAssert val == 42 + j*11
124 | 
125 |     Worker(Sender):
126 |       doAssert args.chan.full.load(moRelaxed) == false
127 |       for j in 0 ..< 10:
128 |         let val = 42 + j*11
129 |         args.chan[].sendLoop(val):
130 |           # Busy loop, in prod we might want to yield the core/thread timeslice
131 |           discard
132 |         echo "Sender sent: ", val
133 | 
134 |   import primitives/allocs
135 |   proc main() =
136 |     echo "Testing if 2 threads can send data"
137 |     echo "-----------------------------------"
138 | 
139 |     var threads: array[2, Thread[ThreadArgs]]
140 |     var chan = tp_allocAligned(
141 |       ChannelSPSCSingle[int], sizeof(ChannelSPSCSingle[int]), 64)
142 |     zeroMem(chan, sizeof(ChannelSPSCSingle[int]))
143 | 
144 |     createThread(threads[0], thread_func, ThreadArgs(ID: Receiver, chan: chan))
145 |     createThread(threads[1], thread_func, ThreadArgs(ID: Sender, chan: chan))
146 | 
147 |     joinThread(threads[0])
148 |     joinThread(threads[1])
149 | 
150 |     tp_freeAligned(chan)
151 | 
152 |     echo "-----------------------------------"
153 |     echo "Success"
154 | 
155 |   main()
156 | 


--------------------------------------------------------------------------------
/taskpools/chase_lev_deques.nim:
--------------------------------------------------------------------------------
  1 | # taskpools
  2 | # Copyright (c) 2021 Status Research & Development GmbH
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | # chase_lev_deques.nim
  9 | # --------------------
 10 | # This file implements a Chase-Lev deque
 11 | # This is a single-producer multi-consumer concurrent queue
 12 | # for work-stealing schedulers.
 13 | #
 14 | # Papers:
 15 | # - Dynamic Circular Work-Stealing Deque
 16 | #   David Chase, Yossi Lev, 2005
 17 | #   https://www.dre.vanderbilt.edu/~schmidt/PDF/work-stealing-dequeue.pdf
 18 | #
 19 | # - Correct and Efficient Work-Stealing for Weak Memory Models
 20 | #   Nhat Minh Lê, Antoniu Pop, Albert Cohen, Francesco Zappa Nardelli, 2013
 21 | #   https://fzn.fr/readings/ppopp13.pdf
 22 | #
 23 | # We straight translate the second paper which includes formal proofs of correctness,
 24 | # and uses modern C++11 code.
 25 | #
 26 | # A Chase-lev dequeue implements the following push, pop, steal.
 27 | #
 28 | #     top                                            bottom
 29 | #               ---------------------------------
 30 | #               |         |          |          | <- push()
 31 | #  steal()   <- | Task 0  |  Task 1  |  Task 2  | -> pop()
 32 | #  any thread   |         |          |          |    owner-only
 33 | #               ---------------------------------
 34 | #
 35 | # To reduce contention, stealing is done on the opposite end from push/pop
 36 | # so that there is a race only for the very last task.
 37 | 
 38 | {.push raises: [].} # Ensure no exceptions can happen
 39 | 
 40 | import
 41 |   system/ansi_c,
 42 |   std/atomics,
 43 |   ./instrumentation/[contracts, loggers],
 44 |   ./primitives/allocs
 45 | 
 46 | type
 47 |   Buf[T] = object
 48 |     ## Backend buffer of a ChaseLevDeque
 49 |     ## `capacity` MUST be a power of 2
 50 | 
 51 |     # Note: update tp_allocUnchecked allocation if any field changes.
 52 |     # Unused. There is no memory reclamation scheme.
 53 |     prev: ptr Buf[T]
 54 | 
 55 |     capacity: int
 56 |     mask: int        # == capacity-1 implies (i and mask) == (i mod capacity)
 57 |     rawBuffer: UncheckedArray[Atomic[T]]
 58 | 
 59 |   ChaseLevDeque*[T] = object
 60 |     ## This implements a lock-free, growable, work-stealing deque.
 61 |     ## The owning thread enqueues and dequeues at the bottom
 62 |     ## Foreign threads steal at the top.
 63 |     ##
 64 |     ## There is no memory reclamation scheme for simplicity.
 65 |     top {.align: 64.}: Atomic[int]
 66 |     bottom: Atomic[int]
 67 |     buf: Atomic[ptr Buf[T]]
 68 |     garbage: ptr Buf[T]
 69 | 
 70 | {.push overflowChecks: off.}       # We don't want exceptions (for Defect) in a multithreaded context
 71 |                                    # but we don't want to deal with underflow of unsigned int either
 72 |                                    # say "if a < b - c" with c > b
 73 | 
 74 | func isPowerOfTwo(n: int): bool {.inline.} =
 75 |   n != 0 and (n and (n - 1)) == 0  # non-zero, and clearing the lowest set bit leaves nothing
 76 | 
 77 | proc newBuf(T: typedesc, capacity: int): ptr Buf[T] =
 78 |   ## Allocates a buffer with room for `capacity` slots and sets `capacity` and `mask`.
 79 |   ## `capacity` must be a power of 2 so that `index and mask` selects a slot.
 80 |   # No `supportsCopyMem(T)` static check here: tasks have a destructor.
 81 | 
 82 |   preCondition: capacity.isPowerOfTwo()
 83 | 
 84 |   result = tp_allocUnchecked(
 85 |     Buf[T],
 86 |     1*sizeof(pointer) + 2*sizeof(int) + sizeof(Atomic[T])*capacity, # slots are Atomic[T], which can be larger than T
 87 |     zero = true
 88 |   )
 89 | 
 90 |   # result.prev = nil
 91 |   result.capacity = capacity
 92 |   result.mask = capacity - 1
 93 |   # result.rawBuffer.addr.zeroMem(sizeof(Atomic[T])*capacity)
 94 | 
 95 | proc `[]=`[T](buf: var Buf[T], index: int, item: T) {.inline.} =
 96 |   buf.rawBuffer[index and buf.mask].store(item, moRelaxed)
 97 | 
 98 | proc `[]`[T](buf: var Buf[T], index: int): T {.inline.} =
 99 |   result = buf.rawBuffer[index and buf.mask].load(moRelaxed)
100 | 
101 | proc grow[T](deque: var ChaseLevDeque[T], buf: var ptr Buf[T], top, bottom: int) {.inline.} =
102 |   ## Double the buffer size
103 |   ## Items at indices `top ..< bottom` are copied to the new buffer.
104 |   ##
105 |   ## To handle race-conditions the current "top", "bottom" and "buf"
106 |   ## have to be saved before calling this procedure.
107 |   ## It reads and writes "deque.garbage" and overwrites "deque.buf".
108 | 
109 |   # Read -> Copy -> Update
110 |   var tmp = newBuf(T, buf.capacity*2)
111 |   for i in top ..< bottom:
112 |     tmp[][i] = buf[][i]
113 | 
114 |   buf.prev = deque.garbage  # the old buffer is not freed here, it is chained into the garbage list
115 |   deque.garbage = buf
116 |   # publish globally
117 |   deque.buf.store(tmp, moRelaxed)
118 |   # publish locally
119 |   swap(buf, tmp)
120 | 
121 | # Public API
122 | # ---------------------------------------------------
123 | 
124 | proc init*[T](deque: var ChaseLevDeque[T], initialCapacity: int) =
125 |   ## Initializes a new Chase-lev work-stealing deque.
126 |   deque.reset()
127 |   deque.buf.store(newBuf(T, initialCapacity), moRelaxed)
128 | 
129 | proc teardown*[T](deque: var ChaseLevDeque[T]) =
130 |   ## Frees every retired buffer chained in `garbage`, then the current buffer.
131 |   var retired = deque.garbage
132 |   while not retired.isNil:
133 |     let older = retired.prev   # read the link before the node is freed
134 |     c_free(retired)
135 |     retired = older
136 |   c_free(deque.buf.load(moRelaxed))
137 | 
138 | proc push*[T](deque: var ChaseLevDeque[T], item: T) =
139 |   ## Enqueue an item at the bottom, doubling the buffer first if it is full.
140 |   ## The item should not be used afterwards.
141 | 
142 |   let # Handle race conditions
143 |     b = deque.bottom.load(moRelaxed)
144 |     t = deque.top.load(moAcquire)
145 |   var a = deque.buf.load(moRelaxed)
146 | 
147 |   if b-t > a.capacity - 1:
148 |     # Full queue
149 |     deque.grow(a, t, b) # `a` now points to the new, larger buffer
150 | 
151 |   a[][b] = item
152 |   fence(moRelease) # the item is written before the new bottom is stored
153 |   deque.bottom.store(b+1, moRelaxed)
154 | 
155 | proc pop*[T](deque: var ChaseLevDeque[T]): T =
156 |   ## Dequeue an item at the bottom.
157 |   ## Returns default(T) if the deque is empty or the race for the last item is lost.
158 |   let # Handle race conditions
159 |     b = deque.bottom.load(moRelaxed) - 1
160 |     a = deque.buf.load(moRelaxed)
161 | 
162 |   deque.bottom.store(b, moRelaxed)
163 |   fence(moSequentiallyConsistent)
164 |   var t = deque.top.load(moRelaxed)
165 | 
166 |   if t <= b:
167 |     # Non-empty queue.
168 |     result = a[][b]
169 |     if t == b:
170 |       # Single last element in queue.
171 |       if not compareExchange(deque.top, t, t+1, moSequentiallyConsistent, moRelaxed):
172 |         # Failed race.
173 |         result = default(T)
174 |       deque.bottom.store(b+1, moRelaxed) # undo the decrement above, whether or not the race was won
175 |   else:
176 |     # Empty queue.
177 |     result = default(T)
178 |     deque.bottom.store(b+1, moRelaxed) # undo the decrement above
179 | 
180 | proc steal*[T](deque: var ChaseLevDeque[T]): T =
181 |   ## Dequeue an item at the top. Returns default(T) if the deque is empty or the race for the item is lost.
182 |   var t = deque.top.load(moAcquire)
183 |   fence(moSequentiallyConsistent)
184 |   let b = deque.bottom.load(moAcquire)
185 |   result = default(T)
186 | 
187 |   if t < b:
188 |     # Non-empty queue.
189 |     let a = deque.buf.load(moConsume)
190 |     result = a[][t]
191 |     if not compareExchange(deque.top, t, t+1, moSequentiallyConsistent, moRelaxed):
192 |       # Failed race.
193 |       return default(T)
194 | 
195 | {.pop.} # overflowChecks
196 | {.pop.} # raises: []
197 | 


--------------------------------------------------------------------------------
/taskpools/event_notifiers.nim:
--------------------------------------------------------------------------------
 1 | # taskpools
 2 | # Copyright (c) 2021-2023 Status Research & Development GmbH
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | # event_notifier.nim
 9 | # ------------------
10 | # This file implements an event notifier.
11 | # It allows putting idle threads to sleep or waking them up.
12 | 
13 | # Design
14 | # Currently it is a shared lock + condition variable (a.k.a. a semaphore)
15 | #
16 | # In the future an eventcount might be considered, an event count significantly
17 | # reduces scheduler overhead by removing lock acquisition from critical path.
18 | # See overview and implementations at
19 | # https://gist.github.com/mratsim/04a29bdd98d6295acda4d0677c4d0041
20 | #
21 | # Weave "one event-notifier per thread" further reduces overhead
22 | # but requires the threadpool to be message-passing based.
23 | # https://github.com/mratsim/weave/blob/a230cce98a8524b2680011e496ec17de3c1039f2/weave/cross_thread_com/event_notifiers.nim
24 | 
25 | {.push raises: [].} # Ensure no exceptions can happen
26 | 
27 | import
28 |   std/locks,
29 |   ./instrumentation/contracts
30 | 
31 | type
32 |   EventNotifier* = object
33 |     ## This data structure allows threads to be parked when no events are pending
34 |     ## and woken up when a new event is.
35 |     # Lock must be aligned to a cache-line to avoid false-sharing.
36 |     lock{.align: 64.}: Lock
37 |     cond: Cond
38 |     parked: int
39 |     signals: int
40 | 
41 | {.push overflowChecks: off.}       # We don't want exceptions (for Defect) in a multithreaded context
42 |                                    # but we don't want to deal with underflow of unsigned int either
43 |                                    # say "if a < b - c" with c > b
44 | 
45 | func initialize*(en: var EventNotifier) {.inline.} =
46 |   ## Initialize the event notifier
47 |   en.lock.initLock()
48 |   en.cond.initCond()
49 |   en.parked = 0
50 |   en.signals = 0
51 | 
52 | func `=destroy`*(en: var EventNotifier) {.inline.} =
53 |   en.cond.deinitCond()
54 |   en.lock.deinitLock()
55 | 
56 | func `=`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be copied".}
57 | func `=sink`*(dst: var EventNotifier, src: EventNotifier) {.error: "An event notifier cannot be moved".}
58 | 
59 | proc park*(en: var EventNotifier) {.inline.} =
60 |   ## Wait until an event is signaled, then consume that signal.
61 |   ## While waiting, the thread is parked and does not consume CPU resources
62 |   en.lock.acquire()
63 |   # Fast path: a signal is already pending, consume it without parking.
64 |   if en.signals > 0:
65 |     en.signals -= 1
66 |     en.lock.release()
67 |     return
68 | 
69 |   en.parked += 1
70 |   while en.signals == 0: # handle spurious wakeups
71 |     en.cond.wait(en.lock)
72 |   en.parked -= 1
73 |   en.signals -= 1
74 | 
75 |   postCondition: en.signals >= 0
76 |   en.lock.release()
77 | 
78 | proc notify*(en: var EventNotifier) {.inline.} =
79 |   ## Wake up one parked thread, if any.
80 |   en.lock.acquire()
81 |   # With no parked thread the notification is dropped: no signal is recorded.
82 |   if en.parked > 0:
83 |     en.signals += 1
84 |     en.cond.signal()
85 | 
86 |   en.lock.release()
87 | 
88 | proc getParked*(en: var EventNotifier): int {.inline.} =
89 |   ## Get the number of parked thread
90 |   en.lock.acquire()
91 |   result = en.parked
92 |   en.lock.release()
93 | 
94 | {.pop.} # overflowChecks
95 | {.pop.} # raises: []
96 | 


--------------------------------------------------------------------------------
/taskpools/flowvars.nim:
--------------------------------------------------------------------------------
 1 | # taskpools
 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
 3 | # Copyright (c) 2021 Status Research & Development GmbH
 4 | # Licensed and distributed under either of
 5 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 6 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 7 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 8 | 
 9 | {.push raises: [].}
10 | 
11 | import
12 |   ./instrumentation/contracts,
13 |   ./channels_spsc_single,
14 |   ./primitives/allocs
15 | 
16 | type
17 |   Flowvar*[T] = object
18 |     ## A Flowvar is a placeholder for a future result that may be computed in parallel
19 |     # Flowvar are optimized when containing a ptr type.
20 |     # They take less size in memory by testing isNil
21 |     # instead of having an extra atomic bool
22 |     # They also use type-erasure to avoid having duplicate code
23 |     # due to generic monomorphization.
24 |     chan: ptr ChannelSPSCSingle[T]
25 | 
26 | # proc `=copy`*[T](dst: var Flowvar[T], src: Flowvar[T]) {.error: "Futures/Flowvars cannot be copied".}
27 | #
28 | # Unfortunately we cannot prevent this easily as internally
29 | # we need a copy:
30 | # - taskpools level when doing toTask(fnCall(args, fut)) and then returning fut. (Can be worked around with copyMem)
31 | # - in std/tasks (need upstream workaround)
32 | 
33 | proc newFlowVar*(T: typedesc): Flowvar[T] {.inline.} =
34 |   result.chan = tp_allocAligned(
35 |     ChannelSPSCSingle[T], sizeof(ChannelSPSCSingle[T]), alignment = 64)
36 |   zeroMem(result.chan, sizeof(ChannelSPSCSingle[T]))
37 | 
38 | proc cleanup(fv: Flowvar) {.inline.} =
39 |   # TODO: Nim v1.4+ can use "sink Flowvar"
40 |   if not fv.chan.isNil:
41 |     tp_freeAligned(fv.chan)
42 | 
43 | func isSpawned*(fv: Flowvar): bool {.inline.} =
44 |   ## True when the flowvar has a channel attached, i.e. it was spawned.
45 |   ## Handy in recursive algorithms that only conditionally spawn:
46 |   ## a never-spawned Flowvar then plays the role of the empty case
47 |   ## of an Option or Maybe type.
48 |   fv.chan != nil
49 | 
50 | proc readyWith*[T](fv: Flowvar[T], childResult: T) {.inline.} =
51 |   ## Send the Flowvar result from the child thread processing the task
52 |   ## to its parent thread.
53 |   let resultSent {.used.} = fv.chan[].trySend(childResult)
54 |   postCondition: resultSent
55 | 
56 | template tryComplete*[T](fv: Flowvar, parentResult: var T): bool =
57 |   fv.chan[].tryRecv(parentResult)
58 | 
59 | func isReady*[T](fv: Flowvar[T]): bool {.inline.} =
60 |   ## Returns true if the result of a Flowvar is ready.
61 |   ## In that case `sync` will not block.
62 |   ## Otherwise the current thread will block to help on all the pending tasks
63 |   ## until the Flowvar is ready.
64 |   not isEmpty(fv.chan[])
65 | 
66 | proc sync*[T](fv: sink Flowvar[T]): T {.inline, gcsafe.} =
67 |   ## Blocks the current thread until the flowvar is available
68 |   ## and returned.
69 |   ## The thread is not idle and will complete pending tasks.
70 |   mixin forceFuture
71 |   forceFuture(fv, result)
72 |   cleanup(fv)
73 | 


--------------------------------------------------------------------------------
/taskpools/instrumentation/contracts.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | import macros, os, strutils
  9 | 
 10 | {.used.}
 11 | 
 12 | # A simple design-by-contract API
 13 | # ----------------------------------------------------------------------------------
 14 | 
 15 | # Everything should be a template that doesn't produce any code
 16 | # when TP_Asserts is not defined.
 17 | # Those checks are controlled by a custom flag instead of
 18 | # "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
 19 | # Furthermore, we want them to be very lightweight on performance
 20 | 
 21 | # TODO auto-add documentation
 22 | 
 23 | proc inspectInfix(node: NimNode): NimNode =
 24 |   ## Inspect an expression.
 25 |   ## Returns an AST that builds a string in which the operands
 26 |   ## of infix operators are replaced by their runtime values.
 27 |   # TODO: pointer and custom type need a default repr
 28 |   #       otherwise we can only resolve simple expressions
 29 |   proc inspect(node: NimNode): NimNode =
 30 |     case node.kind:
 31 |     of nnkInfix:
 32 |       return newCall(
 33 |           bindSym"&",
 34 |           newCall(
 35 |             bindSym"&",
 36 |             newCall(ident"$", inspect(node[1])),
 37 |             newLit(" " & $node[0] & " ")
 38 |           ),
 39 |           newCall(ident"$", inspect(node[2]))
 40 |         )
 41 |     of {nnkIdent, nnkSym}:
 42 |       return node
 43 |     of nnkDotExpr:
 44 |       return quote do:
 45 |         when `node` is pointer or
 46 |              `node` is ptr or
 47 |              `node` is (proc):
 48 |           toHex(cast[ByteAddress](`node`) and 0xffff_ffff)
 49 |         else:
 50 |           $(`node`)
 51 |     of nnkPar:
 52 |       result = nnkPar.newTree()
 53 |       for sub in node:
 54 |         result.add inspect(sub)
 55 |     else:
 56 |       return node.toStrLit()
 57 |   return inspect(node)
 58 | 
 59 | macro assertContract(
 60 |         checkName: static string,
 61 |         predicate: untyped) =
 62 |   let lineinfo = lineInfoObj(predicate)
 63 |   let file = extractFilename(lineinfo.filename)
 64 | 
 65 |   var strippedPredicate: NimNode
 66 |   if predicate.kind == nnkStmtList:
 67 |     assert predicate.len == 1, "Only one-liner conditions are supported"
 68 |     strippedPredicate = predicate[0]
 69 |   else:
 70 |     strippedPredicate = predicate
 71 | 
 72 |   let debug = "\n    Contract violated for " & checkName & " at " & file & ":" & $lineinfo.line &
 73 |               "\n        " & $strippedPredicate.toStrLit &
 74 |               "\n    The following values are contrary to expectations:" &
 75 |               "\n        "
 76 |   let values = inspectInfix(strippedPredicate)
 77 |   let workerID = quote do:
 78 |     when declared(workerContext):
 79 |       $workerContext.id
 80 |     else:
 81 |       "N/A"
 82 |   let taskpoolID = quote do:
 83 |     when declared(workerContext):
 84 |       "0x" & cast[uint](workerContext.taskpool).toHex().toLowerAscii()
 85 |     else:
 86 |       "N/A"
 87 | 
 88 |   result = quote do:
 89 |     {.noSideEffect.}:
 90 |       when compileOption("assertions"):
 91 |         assert(`predicate`, `debug` & $`values` & "  [Worker " & `workerID` & " on taskpool " & `taskpoolID` & "]\n")
 92 |       elif defined(TP_Asserts):
 93 |         if unlikely(not(`predicate`)):
 94 |           raiseAssert(`debug` & $`values` & "  [Worker " & `workerID` & " on taskpool " & `taskpoolID` & "]\n")
 95 | 
 96 | # A way to get the caller function would be nice.
 97 | 
 98 | template preCondition*(require: untyped) =
 99 |   ## Optional runtime check at the start of a function
100 |   assertContract("pre-condition", require)
101 | 
102 | template postCondition*(ensure: untyped) =
103 |   ## Optional runtime check before returning from a function
104 |   assertContract("post-condition", ensure)
105 | 
106 | template ascertain*(check: untyped) =
107 |   ## Optional runtime check in the middle of processing
108 |   assertContract("transient condition", check)
109 | 
110 | # Sanity checks
111 | # ----------------------------------------------------------------------------------
112 | 
113 | when isMainModule:
114 |   proc assertGreater(x, y: int) =
115 |     postcondition(x > y)
116 | 
117 |   # We should get a nicely formatted exception
118 |   assertGreater(10, 12)
119 | 


--------------------------------------------------------------------------------
/taskpools/instrumentation/loggers.nim:
--------------------------------------------------------------------------------
 1 | # Weave
 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | import system/ansi_c
 9 | 
10 | {.used.}
11 | 
12 | template log*(args: varargs[untyped]): untyped =
13 |   c_printf(args)
14 |   flushFile(stdout)
15 | 
16 | template debugTermination*(body: untyped): untyped =
17 |   when defined(TP_DebugTermination) or defined(TP_Debug): # otherwise `body` is compiled out
18 |     {.noSideEffect, gcsafe.}: body
19 | 
20 | template debug*(body: untyped): untyped =
21 |   when defined(TP_Debug): # otherwise `body` is compiled out
22 |     {.noSideEffect, gcsafe.}: body
23 | 


--------------------------------------------------------------------------------
/taskpools/primitives/allocs.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | import system/ansi_c
  9 | 
 10 | # Helpers
 11 | # ----------------------------------------------------------------------------------
 12 | 
 13 | func isPowerOfTwo(n: int): bool {.inline.} =
 14 |   (n != 0) and ((n and (n - 1)) == 0) # clearing the lowest set bit must leave nothing; 0 is excluded
 15 | 
 16 | # TODO: cannot dispatch at compile-time due to https://github.com/nim-lang/Nim/issues/12726
 17 | # but all our use-case are for power of 2
 18 | 
 19 | func roundNextMultipleOf*(x: Natural, n: Natural): int {.inline.} =
 20 |   assert isPowerOfTwo(n)       # the mask below is only valid when n is a power of two
 21 |   (x + n - 1) and not(n - 1)   # step past the next boundary, then clear the low bits
 22 | 
 23 | # func roundNextMultipleOf*(x: Natural, n: static Natural): int {.inline.} =
 24 | #   ## Round the input to the next multiple of "n"
 25 | #   when n.isPowerOfTwo():
 26 | #     # n is a power of 2. (If compiler cannot prove that x>0 it does not make the optim)
 27 | #     result = (x + n - 1) and not(n - 1)
 28 | #   else:
 29 | #     result = ((x + n - 1) div n) * n
 30 | 
 31 | # Memory
 32 | # ----------------------------------------------------------------------------------
 33 | 
 34 | # Nim allocShared, createShared, deallocShared
 35 | # take a global lock that is absolutely killing performance
 36 | # and shows up either:
 37 | # - native_queued_spin_lock_slowpath
 38 | # - __pthread_mutex_lock and __pthread_mutex_unlock_usercnt
 39 | #
 40 | # We use system malloc by default, the flag -d:useMalloc is not enough
 41 | 
 42 | template deref*(T: typedesc): typedesc =
 43 |   ## Return the pointed-to type of a ptr type, e.g. `deref(ptr int)` is `int`
 44 |   typeof(default(T)[])
 45 | 
 46 | proc tp_alloc*(T: typedesc, zero: static bool = false): ptr T {.inline.}=
 47 |   ## Default allocator for the Taskpools library
 48 |   ## This allocates memory to hold the type T
 49 |   ## and returns a pointer to it
 50 |   ##
 51 |   ## Always uses the C allocator (c_malloc), not the Nim allocator
 52 |   ## Memory is zeroed only if `zero` is true
 53 |   result = cast[ptr T](c_malloc(csize_t sizeof(T)))
 54 |   when zero:
 55 |     zeroMem(result, sizeof(T))
 56 | 
 57 | proc tp_allocPtr*(T: typedesc[ptr], zero: static bool = false): T {.inline.}=
 58 |   ## Default allocator for the Taskpools library
 59 |   ## This allocates memory to hold the
 60 |   ## underlying type of the pointer type T.
 61 |   ## i.e. if T is ptr int, this allocates an int
 62 |   ##
 63 |   ## The allocation itself is delegated to tp_alloc (C allocator)
 64 |   ## Memory is zeroed only if `zero` is true
 65 |   result = tp_alloc(deref(T))
 66 |   when zero:
 67 |     zeroMem(result, sizeof(deref(T)))
 68 | 
 69 | proc tp_alloc*(T: typedesc, len: SomeInteger): ptr UncheckedArray[T] {.inline.} =
 70 |   ## Default allocator for the Taskpools library.
 71 |   ## This allocates a contiguous chunk of memory
 72 |   ## to hold ``len`` elements of type T
 73 |   ## and returns a pointer to it.
 74 |   ##
 75 |   ## Always uses the C allocator (c_malloc), not the Nim allocator
 76 |   ## Memory is not zeroed
 77 |   cast[type result](c_malloc(csize_t len*sizeof(T)))
 78 | 
 79 | proc tp_allocUnchecked*(T: typedesc, size: SomeInteger, zero: static bool = false): ptr T {.inline.} =
 80 |   ## Default allocator for the Taskpools library.
 81 |   ## This allocates "size" bytes, zeroed only if `zero` is true.
 82 |   ## This is for data structures that contain an UncheckedArray field
 83 |   result = cast[type result](c_malloc(csize_t size))
 84 |   when zero:
 85 |     zeroMem(result, size)
 86 | 
 87 | proc tp_free*[T: ptr](p: T) {.inline.} =
 88 |   ## Releases memory obtained from tp_alloc, tp_allocPtr or tp_allocUnchecked.
 89 |   # All of them call c_malloc unconditionally, so the block must go back to
 90 |   # c_free: the old -d:WV_useNimAlloc branch gave malloc'ed memory to freeShared.
 91 |   c_free(p)
 92 | 
 93 | when defined(windows):
 94 |   proc alloca(size: int): pointer {.header: "<malloc.h>".}  # on Windows alloca is taken from <malloc.h>
 95 | else:
 96 |   proc alloca(size: int): pointer {.header: "<alloca.h>".}  # elsewhere it is taken from <alloca.h>
 97 | 
 98 | template alloca*(T: typedesc): ptr T =
 99 |   cast[ptr T](alloca(sizeof(T))) # a template, so the C alloca call is expanded inside the caller
100 | 
101 | template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
102 |   cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) # room for `len` elements of T; expanded inside the caller
103 | 
104 | when defined(windows):
105 |   proc aligned_alloc_windows(size, alignment: csize_t): pointer {.sideEffect,importc:"_aligned_malloc", header:"<malloc.h>".}
106 |     # Beware of the arg order! (size first here, alignment first for aligned_alloc below)
107 |   proc tp_freeAligned*[T](p: ptr T){.sideEffect,importc:"_aligned_free", header:"<malloc.h>".}
108 | elif defined(osx):
109 |   proc posix_memalign(mem: var pointer, alignment, size: csize_t){.sideEffect,importc, header:"<stdlib.h>".}
110 |   proc aligned_alloc(alignment, size: csize_t): pointer {.inline.} =
111 |     posix_memalign(result, alignment, size) # NOTE(review): this binding declares no return value, so an allocation error is not reported
112 |   proc tp_freeAligned*[T](p: ptr T){.inline.} =
113 |     c_free(p)
114 | else:
115 |   proc aligned_alloc(alignment, size: csize_t): pointer {.sideEffect,importc, header:"<stdlib.h>".}
116 |   proc tp_freeAligned*[T](p: ptr T){.inline.} =
117 |     c_free(p)
118 | 
119 | proc tp_allocAligned*(T: typedesc, alignment: static Natural): ptr T {.inline.} =
120 |   ## Allocates room for one T aligned on `alignment` bytes; release it with tp_freeAligned.
121 |   static:
122 |     assert alignment.isPowerOfTwo()
123 |   let # TODO - cannot use a const due to https://github.com/nim-lang/Nim/issues/12726
124 |     size = sizeof(T)
125 |     requiredMem = size.roundNextMultipleOf(alignment) # aligned_alloc requires allocating in multiple of the alignment
126 | 
127 |   when defined(windows):
128 |     cast[ptr T](aligned_alloc_windows(csize_t requiredMem, csize_t alignment))
129 |   else:
130 |     cast[ptr T](aligned_alloc(csize_t alignment, csize_t requiredMem))
131 | 
132 | proc tp_allocAligned*(T: typedesc, size: int, alignment: static Natural): ptr T {.inline.} =
133 |   ## Allocates `size` bytes aligned on `alignment` bytes, typed as ptr T; release it with tp_freeAligned.
134 |   static:
135 |     assert alignment.isPowerOfTwo()
136 |   let
137 |     requiredMem = size.roundNextMultipleOf(alignment) # aligned_alloc requires allocating in multiple of the alignment
138 | 
139 |   when defined(windows):
140 |     cast[ptr T](aligned_alloc_windows(csize_t requiredMem, csize_t alignment))
141 |   else:
142 |     cast[ptr T](aligned_alloc(csize_t alignment, csize_t requiredMem))
143 | 
144 | proc tp_allocArrayAligned*(T: typedesc, len: int, alignment: static Natural): ptr UncheckedArray[T] {.inline.} =
145 |   ## Allocates an array of `len` elements of T aligned on `alignment` bytes; release it with tp_freeAligned.
146 |   static:
147 |     assert alignment.isPowerOfTwo()
148 |   let
149 |     size = sizeof(T) * len
150 |     requiredMem = size.roundNextMultipleOf(alignment) # aligned_alloc requires allocating in multiple of the alignment
151 | 
152 |   when defined(windows):
153 |     cast[ptr UncheckedArray[T]](aligned_alloc_windows(csize_t requiredMem, csize_t alignment))
154 |   else:
155 |     cast[ptr UncheckedArray[T]](aligned_alloc(csize_t alignment, csize_t requiredMem))
156 | 


--------------------------------------------------------------------------------
/taskpools/primitives/barriers.md:
--------------------------------------------------------------------------------
 1 | # Synchronization Barriers
 2 | 
 3 | OSX does not implement pthread_barrier as it's an optional part
 4 | of the POSIX standard and they probably want to drive people to libdispatch/Grand Central Dispatch.
 5 | 
 6 | So we need to roll our own with a POSIX compatible API.
 7 | 
 8 | ## Glibc barriers, design bug and implementation
 9 | 
10 | > Note: due to GPL licensing, do not lift the code.
11 | >       Not that we can as it is heavily dependent on futexes
12 | >       which are not available on OSX
13 | 
14 | We need to make sure that we don't hit the same bug
15 | as glibc: https://sourceware.org/bugzilla/show_bug.cgi?id=13065
16 | which seems to be an issue in some of the barrier implementations
17 | in the wild.
18 | 
19 | The design of Glibc barriers is here:
20 | https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/DESIGN-barrier.txt;h=23463c6b7e77231697db3e13933b36ce295365b1;hb=HEAD
21 | 
22 | And implementation:
23 | - https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_destroy.c;h=76957adef3ee751e5b0cfa429fcf4dd3cfd80b2b;hb=HEAD
24 | - https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_init.c;h=c8ebab3a3cb5cbbe469c0d05fb8d9ca0c365b2bb;hb=HEAD
25 | - https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_barrier_wait.c;h=49fcfd370c1c4929fdabdf420f2f19720362e4a0;hb=HEAD
26 | 
27 | ## Synchronization barrier techniques
28 | 
29 | This article goes over the techniques of
30 | "pool barrier" and "ticket barrier"
31 | https://locklessinc.com/articles/barriers/
32 | to reach 2x to 20x the speed of pthreads barrier
33 | 
34 | This course https://cs.anu.edu.au/courses/comp8320/lectures/aux/comp422-Lecture21-Barriers.pdf
35 | goes over
36 | - centralized barrier with sense reversal
37 | - combining tree barrier
38 | - dissemination barrier
39 | - tournament barrier
40 | - scalable tree barrier
41 | More courses:
42 | - http://www.cs.rochester.edu/u/sandhya/csc458/seminars/jb_Barrier_Methods.pdf
43 | 
44 | It however requires lightweight mutexes like Linux futexes
45 | that OSX lacks.
46 | 
47 | This post goes over lightweight mutexes like Benaphores (from BeOS)
48 | https://preshing.com/20120226/roll-your-own-lightweight-mutex/
49 | 
50 | This gives a few barrier implementations
51 | http://gallium.inria.fr/~maranget/MPRI/02.pdf
52 | and refers to the Cubicle paper for formally verifying synchronization barriers
53 | http://cubicle.lri.fr/papers/jfla2014.pdf (in French)
54 | 


--------------------------------------------------------------------------------
/taskpools/primitives/barriers.nim:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
 2 | # Copyright (c) 2024 Status Research & Development GmbH
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | {.push raises: [], gcsafe, inline.}
 9 | 
10 | import os
11 | 
12 | when defined(windows):
13 |   import ./barriers_windows
14 | 
15 |   type SyncBarrier* = SynchronizationBarrier
16 | 
17 |   proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.raises: [OSError].} =
18 |     ## Initialize a synchronization barrier that will block ``threadCount`` threads
19 |     ## before release. Raises OSError if the barrier cannot be initialized.
20 |     if InitializeSynchronizationBarrier(syncBarrier, threadCount, -1) != 1: # -1 is passed as lSpinCount
21 |       raiseOSError(osLastError())
22 | 
23 |   proc wait*(syncBarrier: var SyncBarrier): bool =
24 |     ## Blocks thread at a synchronization barrier.
25 |     ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
26 |     ## and false for the others.
27 |     bool EnterSynchronizationBarrier(syncBarrier, SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE) # NOTE(review): the flag is meant for barriers that are never deleted, yet `delete` exists - confirm
28 | 
29 |   proc delete*(syncBarrier: sink SyncBarrier) =
30 |     ## Deletes a synchronization barrier.
31 |     ## This assumes no race between waiting at a barrier and deleting it,
32 |     ## and reuse of the barrier requires initialization.
33 |     DeleteSynchronizationBarrier(syncBarrier.addr) # the Win32 binding takes a pointer
34 | 
35 | else:
36 |   import ./barriers_posix
37 | 
38 |   type SyncBarrier* = PthreadBarrier
39 | 
40 |   proc init*(syncBarrier: var SyncBarrier, threadCount: range[0'i32..high(int32)]) {.raises: [OSError].} =
41 |     ## Initialize a synchronization barrier that will block ``threadCount`` threads
42 |     ## before release. Raises OSError with the code returned by pthread_barrier_init.
43 |     let err = pthread_barrier_init(syncBarrier, nil, cuint threadCount)
44 |     if err != 0:
45 |       raiseOSError(OSErrorCode(err))
46 | 
47 |   proc wait*(syncBarrier: var SyncBarrier): bool =
48 |     ## Blocks thread at a synchronization barrier.
49 |     ## Returns true for one of the threads (the last one on Windows, undefined on Posix)
50 |     ## and false for the others.
51 |     ## Any other return code is only caught by an `assert`.
52 |     # https://pubs.opengroup.org/onlinepubs/009696899/functions/pthread_barrier_wait.html
53 |     let res = pthread_barrier_wait(syncBarrier)
54 |     assert res == 0 or res == PTHREAD_BARRIER_SERIAL_THREAD, osErrorMsg(OSErrorCode(res))
55 |     res == PTHREAD_BARRIER_SERIAL_THREAD
56 | 
57 |   proc delete*(syncBarrier: sink SyncBarrier) =
58 |     ## Deletes a synchronization barrier.
59 |     ## This assumes no race between waiting at a barrier and deleting it,
60 |     ## and reuse of the barrier requires initialization.
61 |     let err {.used.} = pthread_barrier_destroy(syncBarrier) # {.used.}: `err` is only read by the assert below
62 |     assert err == 0, osErrorMsg(OSErrorCode(err))
63 | 


--------------------------------------------------------------------------------
/taskpools/primitives/barriers_macos.nim:
--------------------------------------------------------------------------------
 1 | # Weave
 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | # OSX doesn't implement pthread_barrier_t
 9 | # It's an optional part of the POSIX standard
10 | #
11 | # This is a manual implementation of a sense reversing barrier
12 | 
13 | import locks
14 | 
15 | type
16 |   Errno* = cint
17 | 
18 |   PthreadBarrierAttr* = object
19 |     ## Dummy, kept for API compatibility: pthread_barrier_init does not read it
20 |   PthreadBarrier* = object
21 |     ## Implementation of a sense reversing barrier
22 |     ## (The Art of Multiprocessor Programming by Maurice Herlihy & Nir Shavit)
23 | 
24 |     lock: Lock                      # Alternatively spinlock on Atomic
25 |     cond {.guard: lock.}: Cond      # Waiting threads block on it until `sense` is reversed
26 |     sense {.guard: lock.}: bool     # Choose int32 to avoid zero-expansion cost in registers?
27 |     left {.guard: lock.}: cuint     # Number of threads still missing at the barrier before it opens
28 |     count: cuint                    # Total number of threads that need to arrive before opening the barrier
29 | 
30 | const
31 |   PTHREAD_BARRIER_SERIAL_THREAD* = Errno(1)
32 | 
33 | func pthread_barrier_init*(
34 |         barrier: var PthreadBarrier,
35 |         attr: ptr PthreadBarrierAttr,
36 |         count: cuint
37 |       ): Errno =
38 |   barrier.lock.initLock()
39 |   {.locks: [barrier.lock].}: # for the `guard` checks on the fields; no acquire() is done here
40 |     barrier.cond.initCond()
41 |     barrier.left = count
42 |   barrier.count = count
43 |   # barrier.sense = false  (not assigned); `attr` is unused and `result` is never set, so 0 is returned
44 | 
45 | proc pthread_barrier_wait*(barrier: var PthreadBarrier): Errno =
46 |   ## Wait on `barrier`
47 |   ## Returns PTHREAD_BARRIER_SERIAL_THREAD for the last thread to arrive
48 |   ## Returns 0 for the others
49 |   ## NOTE(review): no code path in this implementation returns an error code
50 |   barrier.lock.acquire()
51 |   {.locks: [barrier.lock].}:
52 |     var local_sense = barrier.sense # Sense (phase) observed by this thread on arrival
53 |     dec barrier.left
54 | 
55 |     if barrier.left == 0:
56 |       # Last thread to arrive at the barrier
57 |       # Reverse phase and release it
58 |       barrier.left = barrier.count
59 |       barrier.sense = not barrier.sense
60 |       barrier.cond.broadcast()
61 |       barrier.lock.release()
62 |       return PTHREAD_BARRIER_SERIAL_THREAD
63 | 
64 |     while barrier.sense == local_sense:
65 |       # We are waiting for threads
66 |       # Wait for the sense to reverse
67 |       # while loop because we might have spurious wakeups
68 |       barrier.cond.wait(barrier.lock)
69 | 
70 |     # Reversed, we can leave the barrier
71 |     barrier.lock.release()
72 |     return Errno(0)
73 | 
74 | proc pthread_barrier_destroy*(barrier: var PthreadBarrier): Errno =
75 |   {.locks: [barrier.lock].}: # for the `guard` check on `cond`; no acquire() is done here
76 |     barrier.cond.deinitCond()
77 |   barrier.lock.deinitLock()  # `result` is never set, so 0 is returned
78 | 
79 | # TODO: tests
80 | 


--------------------------------------------------------------------------------
/taskpools/primitives/barriers_posix.nim:
--------------------------------------------------------------------------------
 1 | # Weave
 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | # Abstractions over POSIX barriers (non-)implementations
 9 | 
10 | when not compileOption("threads"):
11 |   {.error: "This requires --threads:on compilation flag".}
12 | 
13 | # Types
14 | # -------------------------------------------------------
15 | 
16 | when defined(osx):
17 |   import ./barriers_macos
18 |   export PthreadBarrierAttr, PthreadBarrier, Errno, PTHREAD_BARRIER_SERIAL_THREAD
19 | else:
20 |   type
21 |     PthreadBarrierAttr* {.importc: "pthread_barrierattr_t", header: "<sys/types.h>", byref.} = object
22 |       when (defined(linux) and not defined(android)) and defined(amd64):
23 |         abi: array[4 div sizeof(cint), cint] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l45
24 |     PthreadBarrier* {.importc: "pthread_barrier_t", header: "<sys/types.h>", byref.} = object
25 |       when (defined(linux) and not defined(android)) and defined(amd64):
26 |         abi: array[32 div sizeof(clong), clong] # https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86/nptl/bits/pthreadtypes-arch.h;h=dd06d6753ebc80d94ede6c3c18227a3ad3104570;hb=HEAD#l28
27 | 
28 |     Errno* = cint
29 | 
30 |   var PTHREAD_BARRIER_SERIAL_THREAD* {.importc, header:"<pthread.h>".}: Errno # imported from the C header rather than hard-coded
31 | 
32 | # Pthread
33 | # -------------------------------------------------------
34 | when defined(osx):
35 |   export pthread_barrier_init, pthread_barrier_wait, pthread_barrier_destroy
36 | else:
37 |   # TODO careful, this function mutates `barrier` without it being `var` which
38 |   #      is allowed as a consequence of `byref` - it is also different from the
39 |   #      one in barriers_macos
40 |   #      see https://github.com/status-im/nim-taskpools/pull/20#discussion_r923843093
41 |   proc pthread_barrier_init*(
42 |         barrier: PthreadBarrier,
43 |         attr: ptr PthreadBarrierAttr,
44 |         count: cuint
45 |       ): Errno {.header: "<pthread.h>".}
46 |     ## Initialize `barrier` with the attributes `attr`.
47 |     ## The barrier opens once `count` waiters have arrived.
48 | 
49 |   # TODO the macos signature is var instead of sink
50 |   proc pthread_barrier_destroy*(
51 |         barrier: sink PthreadBarrier): Errno {.header: "<pthread.h>".}
52 |     ## Destroy a previously dynamically initialized `barrier`.
53 | 
54 |   proc pthread_barrier_wait*(
55 |         barrier: var PthreadBarrier
56 |       ): Errno {.header: "<pthread.h>".}
57 |     ## Wait on `barrier`
58 |     ## Returns PTHREAD_BARRIER_SERIAL_THREAD for a single arbitrary thread
59 |     ## Returns 0 for the others
60 |     ## Returns Errno if there is an error
61 | 


--------------------------------------------------------------------------------
/taskpools/primitives/barriers_windows.nim:
--------------------------------------------------------------------------------
 1 | # Weave
 2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
 3 | # Licensed and distributed under either of
 4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
 7 | 
 8 | import winlean
 9 | 
10 | # Technically in <synchapi.h> but MSVC complains with 
11 | # @m..@s..@sweave@sscheduler.nim.cpp
12 | # C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\um\winnt.h(154): fatal error C1189: #error:  "No Target Architecture"
13 | 
14 | type
15 |   SynchronizationBarrier*{.importc:"SYNCHRONIZATION_BARRIER", header:"<windows.h>".} = object
16 | 
17 | var SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE* {.importc, header: "<windows.h>".}: DWORD
18 |   ## Flag for EnterSynchronizationBarrier: skip expensive checks on barrier enter if a barrier is never deleted.
19 | 
20 | proc EnterSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, dwFlags: DWORD): WINBOOL {.importc, stdcall, header: "<windows.h>".}
21 | proc DeleteSynchronizationBarrier*(lpBarrier: ptr SynchronizationBarrier) {.importc, stdcall, header: "<windows.h>".}
22 | proc InitializeSynchronizationBarrier*(lpBarrier: var SynchronizationBarrier, lTotalThreads: LONG, lSpinCount: LONG): WINBOOL {.importc, stdcall, header: "<windows.h>".}
23 | 
24 | when isMainModule:
25 |   import os
26 | 
27 |   var x{.noinit.}: SynchronizationBarrier
28 |   let err = InitializeSynchronizationBarrier(x, 2, -1) # 2 threads, -1 as lSpinCount
29 |   if err != 1:
30 |     assert err == 0 # anything other than 0 or 1 is unexpected here
31 |     raiseOSError(osLastError())


--------------------------------------------------------------------------------
/taskpools/sparsesets.nim:
--------------------------------------------------------------------------------
  1 | # Weave
  2 | # Copyright (c) 2019 Mamy André-Ratsimbazafy
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | import
  9 |   std/random,
 10 |   system/ansi_c,
 11 |   ./instrumentation/contracts,
 12 |   ./primitives/allocs
 13 | 
 14 | const TP_MaxWorkers = 255
 15 | type Setuint = uint8 # We support at most 255 threads (0xFF is kept as special value to signify absence in the set)
 16 | 
 17 | const Empty = high(Setuint)
 18 | 
 19 | type
 20 |   SparseSet* = object
 21 |     ## Stores efficiently a set of integers in the range [0 .. Capacity)
 22 |     ## Supports:
 23 |     ## - O(1)      inclusion, exclusion and contains
 24 |     ## - O(1)      random pick
 25 |     ## - O(1)      length
 26 |     ## - O(length) iteration
 27 |     ##
 28 |     ## Space: 2 * Capacity * sizeof(Setuint) (a single buffer holds `indices` then `values`)
 29 |     ##
 30 |     ## This is contrary to bitsets which requires:
 31 |     ## - random picking: multiple random "contains" + a fallback to uncompressing the set
 32 |     ## - O(Capacity/sizeof(words)) length (via popcounts)
 33 |     ## - O(capacity) iteration
 34 |     indices: ptr UncheckedArray[Setuint]
 35 |     values: ptr UncheckedArray[Setuint]
 36 |     rawBuffer: ptr UncheckedArray[Setuint]
 37 |     len*: Setuint
 38 |     capacity*: Setuint
 39 | 
 40 | func allocate*(s: var SparseSet, capacity: SomeInteger) {.inline.} =
 41 |   preCondition: capacity <= TP_MaxWorkers
 42 | 
 43 |   s.capacity = Setuint capacity
 44 |   s.rawBuffer = tp_alloc(Setuint, 2*capacity) # one buffer backs both arrays; `len` and the contents are not set here
 45 |   s.indices = s.rawBuffer                     # first `capacity` entries
 46 |   s.values = cast[ptr UncheckedArray[Setuint]](s.rawBuffer[capacity].addr) # next `capacity` entries
 47 | 
 48 | func delete*(s: var SparseSet) {.inline.} =
 49 |   c_free(s.rawBuffer)             # `indices` and `values` both point into this single buffer
 50 |   s.indices = nil; s.values = nil
 51 |   s.rawBuffer = nil               # no dangling pointer left behind: a second delete frees nil
 52 | 
 53 | func refill*(s: var SparseSet) {.inline.} =
 54 |   ## Makes the set hold every integer of [0 .. Capacity) again:
 55 |   ## each value is stored in the slot of the same number.
 56 |   preCondition: not s.indices.isNil
 57 |   preCondition: not s.values.isNil
 58 |   preCondition: not s.rawBuffer.isNil
 59 |   preCondition: s.capacity != 0
 60 | 
 61 |   s.len = s.capacity
 62 | 
 63 |   for slot in Setuint(0) ..< s.capacity:
 64 |     s.values[slot] = slot
 65 |     s.indices[slot] = slot
 66 | 
 67 | func isEmpty*(s: SparseSet): bool {.inline.} =
 68 |   result = s.len == 0 # true when the set currently holds no integer
 69 | 
 70 | func contains*(s: SparseSet, n: SomeInteger): bool {.inline.} =
 71 |   assert n.int != Empty.int            # `Empty` is the "absent" marker, never a member
 72 |   result = not (s.indices[n] == Empty) # `n` is not checked against the capacity
 73 | 
 74 | func incl*(s: var SparseSet, n: SomeInteger) {.inline.} =
 75 |   ## Adds `n` to the set; does nothing if `n` is already in it.
 76 |   preCondition: n < Empty # `Empty` is reserved as the "absent" marker
 77 |   if n in s: return
 78 | 
 79 |   preCondition: s.len < s.capacity
 80 | 
 81 |   s.indices[n] = s.len
 82 |   s.values[s.len] = Setuint(n) # explicit conversion: `n` may be of any integer type, the slot is a Setuint
 83 |   s.len += 1
 84 | 
 85 | func peek*(s: SparseSet): int32 {.inline.} =
 86 |   ## Returns the value held in the last occupied slot of the set
 87 |   ## Note: once an item has been deleted, this is not necessarily the last inserted one
 88 |   preCondition: s.len.int > 0
 89 |   s.values[s.len - 1].int32
 90 | 
 91 | func excl*(s: var SparseSet, n: SomeInteger) {.inline.} =
 92 |   ## Removes `n` from the set in constant time; does nothing if `n` is not in it.
 93 |   if n notin s: return
 94 |   # We do constant time deletion by moving the last value
 95 |   # of the array of values into the slot of the deleted integer
 96 | 
 97 |   let delIdx = s.indices[n]
 98 | 
 99 |   s.len -= 1
100 |   let lastVal = s.values[s.len]
101 | 
102 |   s.indices[lastVal] = delIdx # Last value now points to the deleted slot
103 |   s.values[delIdx] = lastVal  # ... and is stored there (`values` is indexed by slot, not by value)
104 | 
105 |   # Erase the item
106 |   s.indices[n] = Empty
107 | 
108 | func randomPick*(s: SparseSet, rng: var Rand): int {.inline.} =
109 |   ## Randomly pick from the set. The value is NOT removed from it.
110 |   preCondition: s.len.int > 0 # on an empty set `s.len-1` wraps around and the pick reads out of bounds
111 |   let pickIdx = rng.rand(s.len-1)
112 |   result = s.values[pickIdx].int
113 | 
114 | func `$`*(s: SparseSet): string =
115 |   result = $s.values.toOpenArray(0, s.len.int - 1) # only the first `len` slots of `values` are shown
116 | 
117 | # Sanity checks
118 | # ------------------------------------------------------------------------------
119 | 
120 | when isMainModule:
121 | 
122 |   const Size = 10
123 |   const Picked = 5
124 | 
125 |   var S: SparseSet
126 |   S.allocate(Size)
127 |   S.refill() # S now holds 0 ..< Size
128 |   echo S
129 | 
130 |   var rngState = initRand(123) # fixed seed
131 |   var picked: seq[int]
132 | 
133 |   for _ in 0 ..< Picked:
134 |     let p = S.randomPick(rngState)
135 |     picked.add p
136 |     S.excl p # removed right away, so the same value cannot be picked again
137 |     echo "---"
138 |     echo "picked: ", p
139 |     echo "S indices: ", toOpenArray(S.indices, 0, S.capacity.int - 1)
140 | 
141 |   echo "---"
142 |   echo "picked: ", picked
143 |   echo "S: ", S
144 |   echo "S indices: ", toOpenArray(S.indices, 0, S.capacity.int - 1)
145 | 
146 |   for x in 0 ..< Size: # every integer must be either in `picked` or still in S, never both
147 |     if x notin picked:
148 |       echo x, " notin picked -> in S"
149 |       doAssert x in S
150 |     else:
151 |       echo x, " in picked -> notin S"
152 |       doAssert x notin S
153 | 


--------------------------------------------------------------------------------
/taskpools/taskpools.nim:
--------------------------------------------------------------------------------
  1 | # taskpools
  2 | # Copyright (c) 2021 Status Research & Development GmbH
  3 | # Licensed and distributed under either of
  4 | #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  5 | #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  6 | # at your option. This file may not be copied, modified, or distributed except according to those terms.
  7 | 
  8 | # Taskpools
  9 | #
 10 | # This file implements a taskpool
 11 | #
 12 | # Implementation:
 13 | #
 14 | # It is a simple shared memory based work-stealing threadpool.
 15 | # The primary focus is:
 16 | # - Delegate compute intensive tasks to the threadpool.
 17 | # - Simple to audit by staying close to foundational papers
 18 | #   and using simple datastructures otherwise.
 19 | # - Low energy consumption:
 20 | #   threads should be put to sleep ASAP
 21 | #   instead of polling/spinning (energy vs latency tradeoff)
 22 | # - Decent performance:
 23 | #   Work-stealing has optimal asymptotic parallel speedup.
 24 | #   Work-stealing has significantly reduced contention
 25 | #   when many tasks are created,
 26 | #   for example by divide-and-conquer algorithms, compared to a global task queue
 27 | #
 28 | # Not a priority:
 29 | # - Handling trillions of very short tasks (less than 100µs).
 30 | # - Advanced task dependencies or events API.
 31 | # - Unbalanced parallel-for loops.
 32 | # - Handling services that should run for the lifetime of the program.
 33 | #
 34 | # Doing IO on a compute threadpool should be avoided
 35 | # In case a thread is blocked for IO, other threads can steal pending tasks in that thread.
 36 | # If all threads are pending for IO, the threadpool will not make any progress and be soft-locked.
 37 | 
 38 | {.push raises: [], gcsafe.} # Ensure no exceptions can happen
 39 | 
 40 | import
 41 |   system/ansi_c,
 42 |   std/[random, cpuinfo, atomics, macros],
 43 |   ./channels_spsc_single,
 44 |   ./chase_lev_deques,
 45 |   ./event_notifiers,
 46 |   ./primitives/[barriers, allocs],
 47 |   ./instrumentation/[contracts, loggers],
 48 |   ./sparsesets,
 49 |   ./flowvars,
 50 |   ./ast_utils,
 51 |   ./tasks
 52 | 
 53 | export
 54 |   # flowvars
 55 |   Flowvar, isSpawned, isReady, sync, tasks
 56 | 
 57 | 
 58 | type
 59 |   WorkerID = int32
 60 | 
 61 |   TaskNode = ptr object
 62 |     # Node of a linked list of tasks (linked through `parent`)
 63 |     parent: TaskNode
 64 |     task: Task
 65 | 
 66 |   Signal = object
 67 |     terminate {.align: 64.}: Atomic[bool]
 68 | 
 69 |   WorkerContext = object
 70 |     ## Thread-local worker context
 71 | 
 72 |     # Params
 73 |     id: WorkerID
 74 |     taskpool: Taskpool
 75 | 
 76 |     # Tasks
 77 |     taskDeque: ptr ChaseLevDeque[TaskNode] # owned task deque
 78 |     currentTask: TaskNode
 79 | 
 80 |     # Synchronization
 81 |     eventNotifier: ptr EventNotifier # shared event notifier
 82 |     signal: ptr Signal               # owned signal
 83 | 
 84 |     # Thefts
 85 |     rng: Rand                        # RNG state to select victims
 86 |     otherDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
 87 |     victims: SparseSet
 88 | 
 89 |   Taskpool* = ptr object
 90 |     ## A taskpool schedules procedures to be executed in parallel
 91 |     barrier: SyncBarrier
 92 |       ## Barrier for initialization and teardown
 93 |     # --- Align: 64
 94 |     eventNotifier: EventNotifier
 95 |       ## Puts thread to sleep
 96 | 
 97 |     numThreads*{.align: 64.}: int
 98 |     workerDeques: ptr UncheckedArray[ChaseLevDeque[TaskNode]]
 99 |       ## Direct access for task stealing
100 |     workers: ptr UncheckedArray[Thread[(Taskpool, WorkerID)]]
101 |     workerSignals: ptr UncheckedArray[Signal]
102 |       ## Per-worker signals, holding the `terminate` flag of each worker
103 | 
104 | # Thread-local config
105 | # ---------------------------------------------
106 | 
107 | var workerContext {.threadvar.}: WorkerContext
108 |   ## Thread-local Worker context
109 | 
110 | proc setupWorker() =
111 |   ## Initialize the thread-local context of a worker
112 |   ## Requires the ID and taskpool fields to be initialized
113 |   template ctx: untyped = workerContext
114 | 
115 |   preCondition: not ctx.taskpool.isNil()
116 |   preCondition: 0 <= ctx.id and ctx.id < ctx.taskpool.numThreads
117 |   preCondition: not ctx.taskpool.workerDeques.isNil()
118 |   preCondition: not ctx.taskpool.workerSignals.isNil()
119 | 
120 |   # Thefts
121 |   ctx.rng = initRand(0xEFFACED + ctx.id) # the seed depends on the worker ID
122 |   ctx.otherDeques = ctx.taskpool.workerDeques
123 |   ctx.victims.allocate(ctx.taskpool.numThreads) # storage only: no `refill` is done here
124 | 
125 |   # Synchronization
126 |   ctx.eventNotifier = addr ctx.taskpool.eventNotifier
127 |   ctx.signal = addr ctx.taskpool.workerSignals[ctx.id]
128 |   ctx.signal.terminate.store(false, moRelaxed)
129 | 
130 |   # Tasks
131 |   ctx.taskDeque = addr ctx.taskpool.workerDeques[ctx.id]
132 |   ctx.currentTask = nil
133 | 
134 |   # Init the deque, now that taskDeque points at this worker's slot
135 |   ctx.taskDeque[].init(initialCapacity = 32)
136 | 
137 | proc teardownWorker() =
138 |   ## Release what setupWorker() acquired for the calling thread:
139 |   ## tears down its task deque, then deletes its victim set.
140 |   workerContext.taskDeque[].teardown()
141 |   workerContext.victims.delete()
142 | 
143 | proc eventLoop(ctx: var WorkerContext) {.raises:[].} # forward declaration, needed by workerEntryFn; the body is in the Scheduler section
144 | 
145 | proc workerEntryFn(params: tuple[taskpool: Taskpool, id: WorkerID]) =
146 |   ## Thread entry point of every non-root worker: sets up the thread-local context,
147 |   ## runs the event loop until the termination signal is received, then tears the context down
148 |   # We assume that all thread-local variables start at their binary zero value
149 |   preCondition: workerContext == default(WorkerContext)
150 | 
151 |   template ctx: untyped = workerContext
152 | 
153 |   # If the following crashes, you need --tlsEmulation:off
154 |   ctx.id = params.id
155 |   ctx.taskpool = params.taskpool
156 | 
157 |   setupWorker()
158 | 
159 |   # Startup rendezvous: 1 matching barrier in Taskpool.new() for root thread
160 |   discard params.taskpool.barrier.wait()
161 | 
162 |   {.gcsafe.}: # Not GC-safe when multi-threaded due to thread-local variables
163 |     ctx.eventLoop()
164 | 
165 |   debugTermination:
166 |     log(">>> Worker %2d shutting down <<<\n", ctx.id)
167 | 
168 |   # Shutdown rendezvous: 1 matching barrier in taskpool.shutdown() for root thread
169 |   discard params.taskpool.barrier.wait()
170 | 
171 |   teardownWorker()
172 | 
173 | # Tasks
174 | # ---------------------------------------------
175 | 
176 | proc new(T: type TaskNode, parent: TaskNode, task: sink Task): T =
177 |   ## Allocate a task node that takes ownership of `task` and records `parent` (nil for the root task).
178 |   result = tp_allocPtr(TaskNode)
179 |   wasMoved(result.task) # the field is uninitialized memory: keep the assignment below from running the Task destructor on it
180 |   result.task = task
181 |   result.parent = parent
182 | 
183 | proc runTask(tn: var TaskNode) {.inline.} =
184 |   ## Runs the task, then destroys it and frees the node: `tn` must not be used afterwards
185 |   invoke(tn.task)
186 |   {.gcsafe.}: # `=destroy` is not tagged gcsafe upstream
187 |     `=destroy`(tn.task)
188 |   c_free(tn)
189 | 
190 | proc schedule(ctx: WorkerContext, tn: sink TaskNode) {.inline.} =
191 |   ## Push `tn` on this worker's own deque, then notify the pool's event notifier
192 |   debug: log("Worker %2d: schedule task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, tn, tn.parent, ctx.currentTask)
193 |   push(ctx.taskDeque[], tn)
194 |   notify(ctx.taskpool.eventNotifier)
195 | 
196 | # Scheduler
197 | # ---------------------------------------------
198 | 
199 | proc trySteal(ctx: var WorkerContext): TaskNode =
200 |   ## Attempt a theft on the other workers, each picked with `randomPick`.
201 |   ## Returns the first task obtained, or nil if every victim came up empty.
202 | 
203 |   # Refill the victim set, then exclude our own ID from it
204 |   ctx.victims.refill()
205 |   ctx.victims.excl(ctx.id)
206 | 
207 |   result = nil
208 |   while not ctx.victims.isEmpty():
209 |     let victimId = ctx.victims.randomPick(ctx.rng)
210 |     result = ctx.otherDeques[victimId].steal()
211 |     if not result.isNil:
212 |       break
213 |     # Nothing to take there, cross this victim off for the rest of this call
214 |     ctx.victims.excl(victimId)
215 | 
216 | proc eventLoop(ctx: var WorkerContext) =
217 |   ## Main loop of a worker thread: drain the local deque, else steal one task, else park. Exits once `terminate` is read as true.
218 |   while not ctx.signal.terminate.load(moRelaxed):
219 |     # 1. Run every task found in the local deque
220 |     debug: log("Worker %2d: eventLoop 1 - searching task from local deque\n", ctx.id)
221 |     while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
222 |       debug: log("Worker %2d: eventLoop 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
223 |       taskNode.runTask()
224 | 
225 |     # 2. Ran out of local tasks, become a thief
226 |     debug: log("Worker %2d: eventLoop 2 - becoming a thief\n", ctx.id)
227 |     var stolenTask = ctx.trySteal()
228 |     if not stolenTask.isNil:
229 |       # 2.a Run the stolen task
230 |       debug: log("Worker %2d: eventLoop 2.a - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, stolenTask, stolenTask.parent, ctx.currentTask)
231 |       stolenTask.runTask()
232 |     else:
233 |       # 2.b Park the thread until a new task enters the taskpool
234 |       debug: log("Worker %2d: eventLoop 2.b - sleeping\n", ctx.id)
235 |       ctx.eventNotifier[].park()
236 |       debug: log("Worker %2d: eventLoop 2.b - waking\n", ctx.id)
237 | 
238 | # Tasking
239 | # ---------------------------------------------
240 | 
241 | const RootTask = default(Task) # TODO: sentinel value different from null task
242 | # As long as RootTask is the default Task, any zero-initialized Task also compares equal to it
243 | template isRootTask(task: Task): bool =
244 |   task == RootTask
245 | 
246 | proc forceFuture*[T](fv: Flowvar[T], parentResult: var T) =
247 |   ## Eagerly complete an awaited FlowVar: run other tasks on this thread until the result is received into `parentResult`
248 | 
249 |   template ctx: untyped = workerContext
250 |   # tryRecv into `parentResult`; a true return is treated as "the future is completed"
251 |   template isFutReady(): untyped =
252 |     fv.chan[].tryRecv(parentResult)
253 | 
254 |   if isFutReady():
255 |     return
256 | 
257 |   ## 1. Process all the children of the current task.
258 |   ##    This ensures that we can give control back ASAP.
259 |   debug: log("Worker %2d: sync 1 - searching task from local deque\n", ctx.id)
260 |   while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
261 |     if taskNode.parent != ctx.currentTask:
262 |       debug: log("Worker %2d: sync 1 - skipping non-direct descendant task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
263 |       ctx.schedule(taskNode)
264 |       break
265 |     debug: log("Worker %2d: sync 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
266 |     taskNode.runTask()
267 |     if isFutReady():
268 |       debug: log("Worker %2d: sync 1 - future ready, exiting\n", ctx.id)
269 |       return
270 | 
271 |   ## 2. We ran out of tasks, or out of direct children of the task we currently await.
272 |   ##    So the task is bottlenecked by dependencies in other threads,
273 |   ##    hence we abandon our enqueued work and steal from the others' queues
274 |   ##    in the hope that it advances our awaited task. This prioritizes latency over throughput.
275 |   debug: log("Worker %2d: sync 2 - future not ready, becoming a thief (currentTask 0x%.08x)\n", ctx.id, ctx.currentTask)
276 |   while not isFutReady():
277 |     var taskNode = ctx.trySteal()
278 | 
279 |     if not taskNode.isNil:
280 |       # We stole some task, we hope it advances our awaited task
281 |       debug: log("Worker %2d: sync 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
282 |       taskNode.runTask()
283 |     # elif (taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
284 |     #   # We advance our own queue, this increases throughput but may impact latency on the awaited task
285 |     #   debug: log("Worker %2d: sync 2.2 - couldn't steal, running own task\n", ctx.id)
286 |     #   taskNode.runTask()
287 |     else:
288 |       # We don't park as there is no notification on task completion: spin and retry
289 |       cpuRelax()
290 | 
291 | proc syncAll*(tp: Taskpool) =
292 |   ## Blocks until all pending tasks are completed: the calling thread runs local and stolen tasks
293 |   ## until it finds none left and every other worker is parked.
294 |   ## This MUST only be called from the root scope that created the taskpool
295 |   template ctx: untyped = workerContext
296 | 
297 |   debugTermination:
298 |     log(">>> Worker %2d enters barrier <<<\n", ctx.id)
299 | 
300 |   preCondition: ctx.id == 0
301 |   preCondition: ctx.currentTask.task.isRootTask()
302 | 
303 |   # Empty all tasks, until every other thread is observed parked
304 |   var foreignThreadsParked = false
305 |   while not foreignThreadsParked:
306 |     # 1. Empty local tasks
307 |     debug: log("Worker %2d: syncAll 1 - searching task from local deque\n", ctx.id)
308 |     while (var taskNode = ctx.taskDeque[].pop(); not taskNode.isNil):
309 |       debug: log("Worker %2d: syncAll 1 - running task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
310 |       taskNode.runTask()
311 |     # With a single thread there is nobody to steal from: an empty local deque means we are done
312 |     if tp.numThreads == 1 or foreignThreadsParked:
313 |       break
314 | 
315 |     # 2. Help other threads
316 |     debug: log("Worker %2d: syncAll 2 - becoming a thief\n", ctx.id)
317 |     var taskNode = ctx.trySteal()
318 | 
319 |     if not taskNode.isNil:
320 |       # 2.1 We stole some task
321 |       debug: log("Worker %2d: syncAll 2.1 - stole task 0x%.08x (parent 0x%.08x, current 0x%.08x)\n", ctx.id, taskNode, taskNode.parent, ctx.currentTask)
322 |       taskNode.runTask()
323 |     else:
324 |       # 2.2 No task to steal
325 |       if tp.eventNotifier.getParked() == tp.numThreads - 1:
326 |         # 2.2.1 all threads besides the current one are parked
327 |         debugTermination:
328 |           log("Worker %2d: syncAll 2.2.1 - termination, all other threads sleeping\n", ctx.id)
329 |         foreignThreadsParked = true
330 |       else:
331 |         # 2.2.2 We don't park as there is no notification on task completion: spin and retry
332 |         cpuRelax()
333 | 
334 |   debugTermination:
335 |     log(">>> Worker %2d leaves barrier <<<\n", ctx.id)
336 | 
337 | # Runtime
338 | # ---------------------------------------------
339 | 
340 | proc new*(T: type Taskpool, numThreads = countProcessors()): T {.raises: [CatchableError].} =
341 |   ## Initialize a threadpool that manages `numThreads` threads, the calling thread included (it becomes worker 0).
342 |   ## Defaults to the number of logical processors. Raises ValueError if `numThreads` < 1, e.g. when `countProcessors()` returns 0 on failed detection.
343 |   if numThreads < 1: raise newException(ValueError, "Taskpool.new: numThreads must be at least 1")
344 |   type TpObj = typeof(default(Taskpool)[])
345 |   # Event notifier requires an extra 64 bytes for alignment
346 |   var tp = tp_allocAligned(TpObj, sizeof(TpObj) + 64, 64)
347 | 
348 |   tp.barrier.init(numThreads.int32)
349 |   tp.eventNotifier.initialize()
350 |   tp.numThreads = numThreads
351 |   tp.workerDeques = tp_allocArrayAligned(ChaseLevDeque[TaskNode], numThreads, alignment = 64)
352 |   tp.workers = tp_allocArrayAligned(Thread[(Taskpool, WorkerID)], numThreads, alignment = 64)
353 |   tp.workerSignals = tp_allocArrayAligned(Signal, numThreads, alignment = 64)
354 | 
355 |   # Setup master thread: the calling thread is the root worker, ID 0
356 |   workerContext.id = 0
357 |   workerContext.taskpool = tp
358 | 
359 |   # Start worker threads (slot 0 of `workers` belongs to the root thread and stays unused)
360 |   for i in 1 ..< numThreads:
361 |     createThread(tp.workers[i], workerEntryFn, (tp, WorkerID(i)))
362 | 
363 |   # Root worker: fill in the rest of its thread-local context
364 |   setupWorker()
365 | 
366 |   # Root task, this is a sentinel task that is never called.
367 |   workerContext.currentTask = TaskNode.new(
368 |     parent = nil,
369 |     task = default(Task) # TODO RootTask, somehow this uses `=copy`
370 |   )
371 | 
372 |   # Wait for the child threads: 1 matching barrier in workerEntryFn, after their setupWorker()
373 |   discard tp.barrier.wait()
374 |   return tp
375 | 
376 | proc cleanup(tp: var Taskpool) =
377 |   ## Joins the worker threads, then releases every resource owned by the taskpool, the pool object included
378 |   preCondition: workerContext.currentTask.task.isRootTask()
379 | 
380 |   for workerId in 1 ..< tp.numThreads: # slot 0 is the root thread, it was never started
381 |     joinThread(tp.workers[workerId])
382 | 
383 |   tp_freeAligned(tp.workerSignals)
384 |   tp_freeAligned(tp.workers)
385 |   tp_freeAligned(tp.workerDeques)
386 |   tp.eventNotifier.`=destroy`()
387 |   delete(tp.barrier)
388 | 
389 |   tp_freeAligned(tp)
390 | 
391 | proc shutdown*(tp: var Taskpool) =
392 |   ## Wait until all tasks are processed, then stop the workers and free the taskpool. On return `tp` is nil.
393 |   preCondition: workerContext.currentTask.task.isRootTask()
394 |   tp.syncAll()
395 | 
396 |   # Signal termination to all threads, then notify once per parked thread so that they wake up and observe it
397 |   for i in 0 ..< tp.numThreads:
398 |     tp.workerSignals[i].terminate.store(true, moRelaxed)
399 |   let parked = tp.eventNotifier.getParked()
400 |   for i in 0 ..< parked:
401 |     tp.eventNotifier.notify()
402 | 
403 |   # 1 matching barrier in worker_entry_fn
404 |   discard tp.barrier.wait()
405 | 
406 |   teardownWorker()
407 |   tp.cleanup()
408 |   tp = nil # cleanup() freed the pool: do not leave the caller with a dangling pointer
409 |   # Dealloc dummy task, and forget it so that the thread-local context does not keep a freed pointer
410 |   workerContext.currentTask.c_free()
411 |   workerContext.currentTask = nil
412 | 
413 | # Task parallelism
414 | # ---------------------------------------------
415 | {.pop.} # raises:[]
416 | 
417 | macro spawn*(tp: Taskpool, fnCall: typed): untyped =
418 |   ## Spawns the input function call asynchronously, potentially on another thread of execution.
419 |   ##
420 |   ## If the function call returns a result, spawn will wrap it in a Flowvar.
421 |   ## You can use `sync` to block the current thread and extract the asynchronous result from the flowvar.
422 |   ## You can use `isReady` to check if the result is available and if a subsequent
423 |   ## `sync` returns immediately.
424 |   ## Note: `tp` is not referenced by the generated code, the task is scheduled through the calling thread's `workerContext`.
425 |   ## Tasks are processed approximately in Last-In-First-Out (LIFO) order
426 |   result = newStmtList()
427 | 
428 |   let fn = fnCall[0]
429 |   let fnName = $fn
430 | 
431 |   # Get the return type if any (first child of the formal params of the implementation)
432 |   let retType = fnCall[0].getImpl[3][0]
433 |   let needFuture = retType.kind != nnkEmpty
434 | 
435 |   # Package in a task
436 |   let taskNode = ident("taskNode")
437 |   if not needFuture:
438 |     result.add quote do:
439 |       let `taskNode` = TaskNode.new(workerContext.currentTask, toTask(`fnCall`))
440 |       schedule(workerContext, `taskNode`)
441 | 
442 |   else:
443 |     # Tasks have no return value.
444 |     # 1. We create a channel/flowvar to transmit the return value to the awaiter/sync
445 |     # 2. We create a wrapper async_fn without return value that sends the return value in the channel
446 |     # 3. We package that wrapper function in a task
447 | 
448 |     # 1. Create the channel
449 |     let fut = ident("fut")
450 |     let futTy = nnkBracketExpr.newTree(
451 |       bindSym"FlowVar",
452 |       retType
453 |     )
454 |     result.add quote do:
455 |       let `fut` = newFlowVar(type `retType`)
456 | 
457 |     # 2. Create a wrapper function that sends the result to the channel
458 |     # TODO, upstream "getImpl" doesn't return the generic params
459 |     let genericParams = fn.getImpl()[2].replaceSymsByIdents()
460 |     let formalParams = fn.getImpl()[3].replaceSymsByIdents()
461 | 
462 |     var asyncParams = nnkFormalParams.newTree(
463 |       newEmptyNode()
464 |     )
465 |     var fnCallIdents = nnkCall.newTree(
466 |       fnCall[0]
467 |     )
468 |     for i in 1 ..< formalParams.len:
469 |       let ident = formalParams[i].replaceSymsByIdents()
470 |       asyncParams.add ident
471 |       for j in 0 ..< ident.len - 2:
472 |         # Handle "a, b: int": the last 2 children of an ident def are the type and the default value
473 |         fnCallIdents.add ident[j]
474 | 
475 |     let futFnParam = ident("fut")
476 |     asyncParams.add newIdentDefs(futFnParam, futTy)
477 | 
478 |     let asyncBody = quote do:
479 |       # XXX: can't test that when the RootTask is default(Task) instead of a sentinel value
480 |       # preCondition: not isRootTask(workerContext.currentTask.task)
481 | 
482 |       let res = `fnCallIdents`
483 |       readyWith(`futFnParam`, res)
484 | 
485 |     let asyncFn = ident("taskpool_" & fnName)
486 |     result.add nnkProcDef.newTree(
487 |       asyncFn,
488 |       newEmptyNode(),
489 |       genericParams,
490 |       asyncParams,
491 |       nnkPragma.newTree(ident("nimcall")),
492 |       newEmptyNode(),
493 |       asyncBody
494 |     )
495 | 
496 |     var asyncCall = newCall(asyncFn)
497 |     for i in 1 ..< fnCall.len:
498 |       asyncCall.add fnCall[i].replaceSymsByIdents()
499 |     asyncCall.add fut
500 | 
501 |     result.add quote do:
502 |       let `taskNode` = TaskNode.new(workerContext.currentTask, toTask(`asyncCall`))
503 |       schedule(workerContext, `taskNode`)
504 | 
505 |       # Return the future / flowvar
506 |       `fut`
507 | 
508 |   # Wrap in a block for namespacing
509 |   result = nnkBlockStmt.newTree(newEmptyNode(), result)
510 |   # echo result.toStrLit()
511 | 


--------------------------------------------------------------------------------
/taskpools/tasks.nim:
--------------------------------------------------------------------------------
  1 | #        (c) Copyright 2021 Nim contributors
  2 | # Copyright (c) 2023- Status Research & Development GmbH
  3 | 
  4 | ## This module provides basic primitives for creating parallel programs.
  5 | ## A `Task` should only be owned by a single thread; it cannot be shared between threads.
  6 | ##
  7 | ## The module was forked from std/tasks in Nim 1.6 to add new functionality and
  8 | ## tune to the taskpools use case.
  9 | 
 10 | import std/[macros, isolation, typetraits]
 11 | import system/ansi_c
 12 | 
 13 | export isolation
 14 | 
 15 | 
 16 | when compileOption("threads"):
 17 |   from std/effecttraits import isGcSafe
 18 | 
 19 | 
 20 | #
 21 | # proc hello(a: int, b: string) =
 22 | #   echo $a & b
 23 | #
 24 | # let literal = "Nim"
 25 | # let t = toTask(hello(521, literal))
 26 | #
 27 | #
 28 | # is roughly converted to
 29 | #
 30 | # type
 31 | #   ScratchObj_369098780 = object
 32 | #     a: int
 33 | #     b: string
 34 | #
 35 | # let scratch_369098762 = cast[ptr ScratchObj_369098780](c_calloc(csize_t 1,
 36 | #     csize_t sizeof(ScratchObj_369098780)))
 37 | # if scratch_369098762.isNil:
 38 | #   raise newException(OutOfMemDefect, "Could not allocate memory")
 39 | # block:
 40 | #   var isolate_369098776 = isolate(521)
 41 | #   scratch_369098762.a = extract(isolate_369098776)
 42 | #   var isolate_369098778 = isolate(literal)
 43 | #   scratch_369098762.b = extract(isolate_369098778)
 44 | # proc hello_369098781(args`gensym3: pointer) {.nimcall.} =
 45 | #   let objTemp_369098775 = cast[ptr ScratchObj_369098780](args`gensym3)
 46 | #   let :tmp_369098777 = objTemp_369098775.a
 47 | #   let :tmp_369098779 = objTemp_369098775.b
 48 | #   hello(a = :tmp_369098777, b = :tmp_369098779)
 49 | #
 50 | # proc destroyScratch_369098782(args`gensym3: pointer) {.nimcall.} =
 51 | #   let obj_369098783 = cast[ptr ScratchObj_369098780](args`gensym3)
 52 | #   =destroy(obj_369098783[])
 53 | # let t = Task(callback: hello_369098781, args: scratch_369098762, destroy: destroyScratch_369098782)
 54 | #
 55 | 
 56 | {.push raises: [], gcsafe.}
 57 | 
 58 | type
 59 |   Task* = object ## `Task` contains the callback and its arguments.
 60 |     callback: proc (args: pointer) {.nimcall, gcsafe, raises: [].} # run by `invoke`, which passes it `args`
 61 |     args: pointer # argument block allocated with c_calloc by `toTask`; nil when the call has no arguments
 62 |     destroy: proc (args: pointer) {.nimcall, gcsafe, raises: [].} # optional hook, run on `args` by `=destroy` before freeing it
 63 | 
 64 | 
 65 | proc `=copy`*(x: var Task, y: Task) {.error.} # copying a Task is rejected at compile time
 66 | 
 67 | proc `=destroy`*(t: var Task) {.inline.} =
 68 |   ## Runs the `destroy` hook (if any) on the argument block, then frees the block. Does nothing when `args` is nil.
 69 |   if t.args.isNil: return
 70 |   if not t.destroy.isNil:
 71 |     t.destroy(t.args)
 72 |   c_free(t.args)
 73 | 
 74 | proc invoke*(task: Task) {.inline.} =
 75 |   ## Calls the callback of `task` on its argument block; `task` itself is left untouched.
 76 |   assert task.callback != nil
 77 |   (task.callback)(task.args)
 78 | 
 79 | template checkIsolate(scratchAssignList: seq[NimNode], procParam, scratchDotExpr: NimNode) =
 80 |   # Appends to `scratchAssignList` the 2 statements that isolate one argument and store it in the scratch object, e.g.
 81 |   #   var isoTempA = isolate(521)
 82 |   #   scratch.a = extract(isoTempA)
 83 |   #   var isoTempB = isolate(literal)
 84 |   #   scratch.b = extract(isoTempB)
 85 |   let isolatedTemp = genSym(nskTemp, "isoTemp")
 86 |   scratchAssignList.add newVarStmt(isolatedTemp, newCall(newIdentNode("isolate"), procParam))
 87 |   scratchAssignList.add newAssignment(scratchDotExpr,
 88 |       newCall(newIdentNode("extract"), isolatedTemp))
 89 | 
 90 | template addAllNode(assignParam: NimNode, procParam: NimNode) =
 91 |   let scratchDotExpr = newDotExpr(scratchIdent, formalParams[i][0])
 92 |   # Relies on names of the expansion scope: scratchIdent, formalParams, i, objTemp and the 4 lists filled below
 93 |   checkIsolate(scratchAssignList, procParam, scratchDotExpr)
 94 |   # In the callback: read the field back into a temporary and pass it as a named argument
 95 |   let tempNode = genSym(kind = nskTemp, ident = formalParams[i][0].strVal)
 96 |   callNode.add nnkExprEqExpr.newTree(formalParams[i][0], tempNode)
 97 |   tempAssignList.add newLetStmt(tempNode, newDotExpr(objTemp, formalParams[i][0]))
 98 |   scratchRecList.add newIdentDefs(newIdentNode(formalParams[i][0].strVal), assignParam)
 99 | 
100 | macro toTask*(e: typed{nkCall | nkInfix | nkPrefix | nkPostfix | nkCommand | nkCallStrLit}): Task =
101 |   ## Converts the call and its arguments to `Task`.
102 |   runnableExamples("--gc:orc"):
103 |     proc hello(a: int) = echo a
104 | 
105 |     let b = toTask hello(13)
106 |     assert b is Task
107 |   # Only calls without a return value can be converted
108 |   doAssert getTypeInst(e).typeKind == ntyVoid
109 | 
110 |   when compileOption("threads"):
111 |     if not isGcSafe(e[0]):
112 |       error("'toTask' takes a GC safe call expression", e)
113 | 
114 |   if hasClosure(e[0]):
115 |     error("closure call is not allowed", e)
116 |   # With arguments: they are stored in a heap-allocated scratch object that the generated callback reads back
117 |   if e.len > 1:
118 |     let scratchIdent = genSym(kind = nskTemp, ident = "scratch")
119 |     let impl = e[0].getTypeInst
120 | 
121 |     when defined(nimTasksDebug):
122 |       echo impl.treeRepr
123 |       echo e.treeRepr
124 |     let formalParams = impl[0]
125 | 
126 |     var
127 |       scratchRecList = newNimNode(nnkRecList)
128 |       scratchAssignList: seq[NimNode]
129 |       tempAssignList: seq[NimNode]
130 |       callNode: seq[NimNode]
131 | 
132 |     let
133 |       objTemp = genSym(nskTemp, ident = "objTemp")
134 | 
135 |     for i in 1 ..< formalParams.len:
136 |       var param = formalParams[i][1]
137 |       # For a `sink[...]` bracket expression, continue with its head node only
138 |       if param.kind == nnkBracketExpr and param[0].eqIdent("sink"):
139 |         param = param[0]
140 | 
141 |       if param.typeKind in {ntyExpr, ntyStmt}:
142 |         error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter", e)
143 | 
144 |       case param.kind
145 |       of nnkVarTy:
146 |         error("'toTask'ed function cannot have a 'var' parameter", e)
147 |       of nnkBracketExpr:
148 |         if param[0].typeKind == ntyTypeDesc:
149 |           callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
150 |         elif param[0].typeKind in {ntyVarargs, ntyOpenArray}:
151 |           if param[1].typeKind in {ntyExpr, ntyStmt}:
152 |             error("'toTask'ed function cannot have a 'typed' or 'untyped' parameter", e)
153 |           let
154 |             seqType = nnkBracketExpr.newTree(newIdentNode("seq"), param[1])
155 |             seqCallNode = newCall("@", e[i])
156 |           addAllNode(seqType, seqCallNode)
157 |         else:
158 |           addAllNode(param, e[i])
159 |       of nnkBracket, nnkObjConstr:
160 |         # passing by static parameters
161 |         # so we pass them directly instead of passing by scratchObj
162 |         callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
163 |       of nnkSym, nnkPtrTy:
164 |         addAllNode(param, e[i])
165 |       of nnkCharLit..nnkNilLit:
166 |         callNode.add nnkExprEqExpr.newTree(formalParams[i][0], e[i])
167 |       else:
168 |         error("'toTask'ed function cannot have a parameter of " & $param.kind & " kind", e)
169 | 
170 |     let scratchObjType = genSym(kind = nskType, ident = "ScratchObj")
171 |     let scratchObj = nnkTypeSection.newTree(
172 |                       nnkTypeDef.newTree(
173 |                         scratchObjType,
174 |                         newEmptyNode(),
175 |                         nnkObjectTy.newTree(
176 |                           newEmptyNode(),
177 |                           newEmptyNode(),
178 |                           scratchRecList
179 |                         )
180 |                       )
181 |                     )
182 | 
183 |     # The scratch object is allocated zeroed with c_calloc, a nil result raises OutOfMemDefect
184 |     let scratchObjPtrType = quote do:
185 |       cast[ptr `scratchObjType`](c_calloc(csize_t 1, csize_t sizeof(`scratchObjType`)))
186 | 
187 |     let scratchLetSection = newLetStmt(
188 |       scratchIdent,
189 |       scratchObjPtrType
190 |     )
191 | 
192 |     let scratchCheck = quote do:
193 |       if `scratchIdent`.isNil:
194 |         raise newException(OutOfMemDefect, "Could not allocate memory")
195 | 
196 |     var stmtList = newStmtList()
197 |     stmtList.add(scratchObj)
198 |     stmtList.add(scratchLetSection)
199 |     stmtList.add(scratchCheck)
200 |     stmtList.add(nnkBlockStmt.newTree(newEmptyNode(), newStmtList(scratchAssignList)))
201 | 
202 |     var functionStmtList = newStmtList()
203 |     let funcCall = newCall(e[0], callNode)
204 |     functionStmtList.add tempAssignList
205 |     functionStmtList.add funcCall
206 | 
207 |     let funcName = genSym(nskProc, e[0].strVal)
208 |     let destroyName = genSym(nskProc, "destroyScratch")
209 |     let objTemp2 = genSym(ident = "obj")
210 |     let tempNode = quote("@") do:
211 |         `=destroy`(@objTemp2[])
212 | 
213 |     result = quote do:
214 |       `stmtList`
215 | 
216 |       proc `funcName`(args: pointer) {.gcsafe, nimcall, raises: [].} =
217 |         let `objTemp` = cast[ptr `scratchObjType`](args)
218 |         `functionStmtList`
219 | 
220 |       proc `destroyName`(args: pointer) {.gcsafe, nimcall, raises: [].} =
221 |         let `objTemp2` = cast[ptr `scratchObjType`](args)
222 |         `tempNode`
223 | 
224 |       Task(callback: `funcName`, args: `scratchIdent`, destroy: `destroyName`)
225 |   else: # no arguments: no scratch object, the callback calls the proc directly and `args` stays nil
226 |     let funcCall = newCall(e[0])
227 |     let funcName = genSym(nskProc, e[0].strVal)
228 | 
229 |     result = quote do:
230 |       proc `funcName`(args: pointer) {.gcsafe, nimcall, raises: [].} =
231 |         `funcCall`
232 | 
233 |       Task(callback: `funcName`, args: nil)
234 | 
235 |   when defined(nimTasksDebug):
236 |     echo result.repr
237 | 
238 | when isMainModule:
239 |   block: # task over a plain value argument
240 |     var num = 0
241 |     proc hello(a: int) = num += a
242 | 
243 |     let addTask = toTask(hello(13))
244 |     invoke(addTask)
245 |     assert num == 13
246 |     # Invoking does not consume the task: it can run again
247 |     invoke(addTask)
248 |     assert num == 26
249 | 
250 |   block: # task over a ref argument, which must be isolated
251 |     type
252 |       Runnable = ref object
253 |         data: int
254 | 
255 |     var data: int
256 |     proc hello(a: Runnable) {.nimcall.} =
257 |       inc a.data, 2
258 |       data = a.data
259 | 
260 |     # The parameters of the call must be isolated: a ref held in a variable is rejected
261 |     when false:
262 |       let shared = Runnable(data: 12)
263 |       let rejected = toTask hello(shared) # error ----> expression cannot be isolated: shared
264 |       rejected.invoke()
265 | 
266 |     # A freshly constructed ref is accepted
267 |     let bumpTask = toTask hello(Runnable(data: 12))
268 |     invoke(bumpTask)
269 |     assert data == 14
270 |     invoke(bumpTask)
271 |     assert data == 16
272 | 


--------------------------------------------------------------------------------