├── .gitignore
├── LICENSE
├── README.md
├── include
│   └── partr.h
├── makefile
├── src
│   ├── congrng.c
│   ├── congrng.h
│   ├── log.h
│   ├── multiq.c
│   ├── multiq.h
│   ├── partr.c
│   ├── perfutil.h
│   ├── profile.h
│   ├── synctreepool.c
│   ├── synctreepool.h
│   ├── task.h
│   ├── taskpools.c
│   └── taskpools.h
└── test
    ├── fib.c
    ├── l3d.c
    ├── makefile
    ├── multiqtest.c
    ├── parfortest.c
    ├── sleeptest.c
    ├── tap.c
    ├── tap.h
    └── taskpoolstest.c

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

# VIM files
.*.swp

# Object files
*.o
*.ko
*.obj
*.elf

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
l3d
fib
parfortest
multiqtest
taskpoolstest
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

MIT License

Copyright (c) 2016

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

## Parallel Tasks Runtime

A parallel task execution runtime that uses parallel depth-first (PDF) scheduling [1].

[1] Shimin Chen, Phillip B. Gibbons, Michael Kozuch, Vasileios Liaskovitis, Anastassia Ailamaki, Guy E. Blelloch, Babak Falsafi, Limor Fix, Nikos Hardavellas, Todd C. Mowry, and Chris Wilkerson. 2007. Scheduling threads for constructive cache sharing on CMPs. In Proceedings of the nineteenth annual ACM symposium on Parallel algorithms and architectures (SPAA '07). ACM, New York, NY, USA, 105-115.
DOI=http://dx.doi.org/10.1145/1248377.1248396
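A minimal usage sketch, mirroring `test/fib.c` (the `run` task body here is illustrative only; error checking omitted). The runtime wraps `f(arg, start, end)` in a root task via `partr_start()`; from inside tasks, use `partr_spawn()`/`partr_sync()` and `partr_parfor()`:

```c
#include <stdio.h>
#include "partr.h"

/* task entry point: invoked as f(arg, start, end) */
void *run(void *arg, int64_t start, int64_t end)
{
    printf("hello from the root task on thread %d\n", tid);
    return NULL;
}

int main(int argc, char **argv)
{
    void *ret;
    partr_init();                       /* start and bind worker threads */
    partr_start(&ret, run, NULL, 0, 0); /* run the root task to completion */
    partr_shutdown();                   /* terminate workers, free pools */
    return 0;
}
```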
--------------------------------------------------------------------------------
/include/partr.h:
--------------------------------------------------------------------------------

/* partr -- parallel tasks runtime
 */

#ifndef PARTR_H
#define PARTR_H

#include <stdint.h>
#include <stdio.h>

#include "log.h"


/* tasks */
#define TASK_STACK_SIZE (1024*4)

/* pools */
#define TASKS_PER_POOL 1024
/* number allocated = TASKS_PER_POOL * nthreads */

/* multiq */
#define MULTIQ_HEAP_C 4
/* number of heaps = MULTIQ_HEAP_C * nthreads */
#define MULTIQ_TASKS_PER_HEAP 129
/* how many in each heap */

/* parfor */
#define GRAIN_K 4
/* tasks = niters / (GRAIN_K * nthreads) */

/* synchronization */
#define ARRIVERS_P 2
/* narrivers = ((GRAIN_K * nthreads) ^ ARRIVERS_P) + 1
   limit for number of recursive parfors */
#define REDUCERS_FRAC 1
/* nreducers = narrivers * REDUCERS_FRAC */

/* logging (debug, info, warn, err, critical, none) */
#define LOG_LEVEL_NAME "PARTR_LOG_LEVEL"
/* environment variable name */
#define DEFAULT_LOG_LEVEL "debug"

/* controls for when threads sleep */
#define THREAD_SLEEP_THRESHOLD_NAME "PARTR_THREAD_SLEEP_THRESHOLD"
/* environment variable name */
#define DEFAULT_THREAD_SLEEP_THRESHOLD 4e9
/* in cycles (1e9 == 1sec@1GHz) */

/* defaults for # threads */
#define NUM_THREADS_NAME "PARTR_NUM_THREADS"
/* environment variable name */
#define DEFAULT_NUM_THREADS 4

/* affinitization behavior */
#define MACHINE_EXCLUSIVE_NAME "PARTR_EXCLUSIVE"
/* environment variable name */
#define DEFAULT_MACHINE_EXCLUSIVE 0
/* don't assume we own the machine */

/* performance profiling */
#define PERF_PROFILE 1
/* comment to disable profiling */


/* externally visible globals */
extern log_t plog;       /* message logger */
extern int16_t nthreads; /* number of threads */


/* externally visible thread-local globals */
extern __thread int16_t tid;      /* 0-based thread ID */
extern __thread uint64_t rngseed; /* per-thread RNG seed */


/* external interface */
typedef void *partr_t;

void partr_init();
void partr_shutdown();
int partr_start(void **ret, void *(*f)(void *, int64_t, int64_t),
        void *arg, int64_t start, int64_t end);
int partr_spawn(partr_t *t, void *(*f)(void *, int64_t, int64_t),
        void *arg, int64_t start, int64_t end, int8_t sticky,
        int8_t detach);
int partr_sync(void **r, partr_t t, int done_with_task);
int partr_parfor(partr_t *t, void *(*f)(void *, int64_t, int64_t),
        void *arg, int64_t count, void *(*rf)(void *, void *));


#endif /* PARTR_H */

--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------

# parallel tasks runtime
#
# makefile
#
# 2016.06.01 kiran.pamnany Initial code
#

CC=gcc

.SUFFIXES: .c .h .o .a
.PHONY: clean test

CFLAGS+=-DPERF_PROFILE
CFLAGS+=-Wall
CFLAGS+=-std=c11
CFLAGS+=-D_GNU_SOURCE

CFLAGS+=-I../hwloc/include
CFLAGS+=-I../libconcurrent/include
CFLAGS+=-I./include
CFLAGS+=-I./src

SRCS=src/partr.c \
src/synctreepool.c src/taskpools.c src/multiq.c src/congrng.c 24 | INCS=include/partr.h src/task.h src/synctreepool.h src/taskpools.h src/multiq.h src/congrng.h src/log.h src/perfutil.h src/profile.h 25 | OBJS=${SRCS:.c=.o} 26 | 27 | ifeq ($(DEBUG),yes) 28 | CFLAGS+=-O0 -g 29 | else 30 | CFLAGS+=-O3 31 | endif 32 | 33 | TARGET=libpartr.a 34 | 35 | all: $(TARGET) 36 | 37 | test: $(TARGET) 38 | $(MAKE) -C test 39 | 40 | $(TARGET): $(OBJS) 41 | $(RM) $(TARGET) 42 | $(AR) qvs $(TARGET) $(OBJS) 43 | 44 | %.o: %.c $(INCS) makefile 45 | $(CC) $(CFLAGS) -c $< -o $@ 46 | 47 | clean: 48 | $(MAKE) -C test clean 49 | $(RM) $(TARGET) $(OBJS) 50 | 51 | -------------------------------------------------------------------------------- /src/congrng.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Simple congruential random number generator (from VAX). No modulo bias. 4 | */ 5 | 6 | #include "congrng.h" 7 | #include "perfutil.h" 8 | 9 | 10 | /* seed_cong() -- each thread needs its own seed! 11 | */ 12 | void seed_cong(uint64_t *seed) 13 | { 14 | *seed = rdtscp(); 15 | } 16 | 17 | 18 | /* unbias_cong() -- sets up state to avoid modulo bias for the given max. 19 | */ 20 | void unbias_cong(uint64_t max, uint64_t *unbias) 21 | { 22 | *unbias = UINT64_MAX - ((UINT64_MAX % max)+1); 23 | } 24 | 25 | 26 | /* cong() -- linear congruential generator (was system RNG on VAXen). 27 | * Loop to avoid modulo bias. 28 | */ 29 | uint64_t cong(uint64_t max, uint64_t unbias, uint64_t *seed) 30 | { 31 | while ((*seed = 69069 * (*seed) + 362437) > unbias) 32 | ; 33 | return *seed % max; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /src/congrng.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Simple random number generator (linear congruential). No modulo bias. 4 | */ 5 | 6 | #ifndef CONGRNG_H 7 | #define CONGRNG_H 8 | 9 | #include 10 | 11 | 12 | void seed_cong(uint64_t *seed); 13 | void unbias_cong(uint64_t max, uint64_t *unbias); 14 | uint64_t cong(uint64_t max, uint64_t unbias, uint64_t *seed); 15 | 16 | 17 | #endif /* CONGRNG_H */ 18 | 19 | -------------------------------------------------------------------------------- /src/log.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | log -- message logging 4 | */ 5 | 6 | #ifndef LOG_H 7 | #define LOG_H 8 | 9 | #include 10 | 11 | enum { 12 | LOG_LEVEL_DEBUG, 13 | LOG_LEVEL_INFO, 14 | LOG_LEVEL_WARN, 15 | LOG_LEVEL_ERR, 16 | LOG_LEVEL_CRITICAL 17 | }; 18 | 19 | typedef struct log_tag { 20 | int level; 21 | FILE *f; 22 | } log_t; 23 | 24 | #define LOG_SETUP(l, lvl, fp) do { \ 25 | (l).level = (lvl); \ 26 | (l).f = (fp); \ 27 | } while(0) 28 | 29 | #define LOG_DEBUG(l, ...) do { \ 30 | if ((l).level <= LOG_LEVEL_DEBUG) { \ 31 | fprintf((l).f, __VA_ARGS__); \ 32 | fflush((l).f); \ 33 | } \ 34 | } while(0) 35 | 36 | #define LOG_INFO(l, ...) do { \ 37 | if ((l).level <= LOG_LEVEL_INFO) \ 38 | fprintf((l).f, __VA_ARGS__); \ 39 | } while(0) 40 | 41 | #define LOG_WARN(l, ...) do { \ 42 | if ((l).level <= LOG_LEVEL_WARN) \ 43 | fprintf((l).f, __VA_ARGS__); \ 44 | } while(0) 45 | 46 | #define LOG_ERR(l, ...) do { \ 47 | if ((l).level <= LOG_LEVEL_ERR) \ 48 | fprintf((l).f, __VA_ARGS__); \ 49 | } while(0) 50 | 51 | #define LOG_CRITICAL(l, ...) 
do { \ 52 | if ((l).level <= LOG_LEVEL_CRITICAL) \ 53 | fprintf((l).f, __VA_ARGS__); \ 54 | } while(0) 55 | 56 | #endif /* LOG_H */ 57 | 58 | -------------------------------------------------------------------------------- /src/multiq.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | MultiQueues (http://arxiv.org/abs/1411.1209) 4 | */ 5 | 6 | 7 | #include 8 | #include 9 | #include "partr.h" 10 | #include "multiq.h" 11 | #include "congrng.h" 12 | #include "perfutil.h" 13 | 14 | 15 | /* individual spin-lock synchronized task heap */ 16 | typedef struct taskheap_tag { 17 | char lock; 18 | ptask_t **tasks; 19 | int16_t ntasks, prio; 20 | } taskheap_t; 21 | 22 | /* heap 'n'ary */ 23 | static const int16_t heap_d = 8; 24 | 25 | /* the multiqueue itself is 'p' task heaps */ 26 | static taskheap_t *heaps; 27 | static int16_t heap_p; 28 | 29 | /* for atomic snapshot */ 30 | static uint64_t snapshot_owner = -1; 31 | 32 | /* unbias state for the RNG */ 33 | static uint64_t cong_unbias; 34 | 35 | /* state for sleep checking */ 36 | static const int16_t not_sleeping = 0; 37 | static const int16_t checking_for_sleeping = 1; 38 | static const int16_t sleeping = 2; 39 | static int16_t sleep_check_state = not_sleeping; 40 | 41 | 42 | /* multiq_init() 43 | */ 44 | void multiq_init() 45 | { 46 | heap_p = MULTIQ_HEAP_C * nthreads; 47 | heaps = (taskheap_t *)calloc(heap_p, sizeof(taskheap_t)); 48 | for (int16_t i = 0; i < heap_p; ++i) { 49 | __atomic_clear(&heaps[i].lock, __ATOMIC_RELAXED); 50 | heaps[i].tasks = (ptask_t **) 51 | calloc(MULTIQ_TASKS_PER_HEAP, sizeof(ptask_t *)); 52 | heaps[i].ntasks = 0; 53 | heaps[i].prio = INT16_MAX; 54 | } 55 | unbias_cong(heap_p, &cong_unbias); 56 | LOG_INFO(plog, " %d %d-ary heaps of %d tasks each\n", 57 | heap_p, heap_d, MULTIQ_TASKS_PER_HEAP); 58 | } 59 | 60 | 61 | /* multiq_destroy() 62 | */ 63 | void multiq_destroy() 64 | { 65 | for (int16_t i = 0; i < heap_p; ++i) 66 | free(heaps[i].tasks); 67 | free(heaps); 68 | } 69 | 70 | 71 | /* sift_up() 72 | */ 73 | static void sift_up(taskheap_t *heap, int16_t idx) 74 | { 75 | if (idx > 0) { 76 | int16_t parent = (idx-1)/heap_d; 77 | if (heap->tasks[idx]->prio <= heap->tasks[parent]->prio) { 78 | ptask_t *t = heap->tasks[parent]; 79 | heap->tasks[parent] = heap->tasks[idx]; 80 | heap->tasks[idx] = t; 81 | sift_up(heap, parent); 82 | } 83 | } 84 | } 85 | 86 | 87 | /* sift_down() 88 | */ 89 | void sift_down(taskheap_t *heap, int16_t idx) 90 | { 91 | if (idx < heap->ntasks) { 92 | for (int16_t child = heap_d*idx + 1; 93 | child < MULTIQ_TASKS_PER_HEAP && child <= heap_d*idx + heap_d; 94 | ++child) { 95 | if (heap->tasks[child] 96 | && heap->tasks[child]->prio <= heap->tasks[idx]->prio) { 97 | ptask_t *t = heap->tasks[idx]; 98 | heap->tasks[idx] = heap->tasks[child]; 99 | heap->tasks[child] = t; 100 | sift_down(heap, child); 101 | } 102 | } 103 | } 104 | } 105 | 106 | 107 | /* multiq_insert() 108 | */ 109 | int multiq_insert(ptask_t *task, int16_t priority) 110 | { 111 | uint64_t rn; 112 | 113 | task->prio = priority; 114 | do { 115 | rn = cong(heap_p, cong_unbias, &rngseed); 116 | } while (__atomic_test_and_set(&heaps[rn].lock, __ATOMIC_ACQUIRE)); 117 | 118 | if (heaps[rn].ntasks >= MULTIQ_TASKS_PER_HEAP) { 119 | LOG_ERR(plog, " heap %llu is full\n", rn); 120 | __atomic_clear(&heaps[rn].lock, __ATOMIC_RELEASE); 121 | return -1; 122 | } 123 | 124 | heaps[rn].tasks[heaps[rn].ntasks++] = task; 125 | sift_up(&heaps[rn], heaps[rn].ntasks-1); 126 | 
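/* release the heap lock, then publish the heap's new minimum priority:
   the CAS below only lowers heaps[rn].prio if the inserted task beats
   the cached minimum, so deletemin()/minprio() can read priorities
   without taking the lock */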
__atomic_clear(&heaps[rn].lock, __ATOMIC_RELEASE); 127 | int16_t prio = __atomic_load_n(&heaps[rn].prio, __ATOMIC_SEQ_CST); 128 | if (task->prio < prio) 129 | __atomic_compare_exchange_n(&heaps[rn].prio, &prio, task->prio, 130 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); 131 | 132 | return 0; 133 | } 134 | 135 | 136 | /* multiq_deletemin() 137 | */ 138 | ptask_t *multiq_deletemin() 139 | { 140 | uint64_t rn1, rn2; 141 | int16_t i, prio1, prio2; 142 | ptask_t *task; 143 | 144 | for (i = 0; i < heap_p; ++i) { 145 | rn1 = cong(heap_p, cong_unbias, &rngseed); 146 | rn2 = cong(heap_p, cong_unbias, &rngseed); 147 | prio1 = __atomic_load_n(&heaps[rn1].prio, __ATOMIC_SEQ_CST); 148 | prio2 = __atomic_load_n(&heaps[rn2].prio, __ATOMIC_SEQ_CST); 149 | if (prio1 > prio2) { 150 | prio1 = prio2; 151 | rn1 = rn2; 152 | } 153 | else if (prio1 == prio2 && prio1 == INT16_MAX) 154 | continue; 155 | if (!__atomic_test_and_set(&heaps[rn1].lock, __ATOMIC_ACQUIRE)) { 156 | if (prio1 == heaps[rn1].prio) 157 | break; 158 | __atomic_clear(&heaps[rn1].lock, __ATOMIC_RELEASE); 159 | } 160 | } 161 | if (i == heap_p) 162 | return NULL; 163 | 164 | task = heaps[rn1].tasks[0]; 165 | heaps[rn1].tasks[0] = heaps[rn1].tasks[--heaps[rn1].ntasks]; 166 | heaps[rn1].tasks[heaps[rn1].ntasks] = NULL; 167 | prio1 = INT16_MAX; 168 | if (heaps[rn1].ntasks > 0) { 169 | sift_down(&heaps[rn1], 0); 170 | prio1 = heaps[rn1].tasks[0]->prio; 171 | } 172 | __atomic_store_n(&heaps[rn1].prio, prio1, __ATOMIC_SEQ_CST); 173 | __atomic_clear(&heaps[rn1].lock, __ATOMIC_RELEASE); 174 | 175 | return task; 176 | } 177 | 178 | 179 | /* multiq_minprio() 180 | */ 181 | int16_t multiq_minprio() 182 | { 183 | uint64_t rn1, rn2; 184 | int16_t prio1, prio2; 185 | 186 | rn1 = cong(heap_p, cong_unbias, &rngseed); 187 | rn2 = cong(heap_p, cong_unbias, &rngseed); 188 | prio1 = __atomic_load_n(&heaps[rn1].prio, __ATOMIC_SEQ_CST); 189 | prio2 = __atomic_load_n(&heaps[rn2].prio, __ATOMIC_SEQ_CST); 190 | if (prio2 < prio1) 191 | return prio2; 192 | return prio1; 193 | } 194 | 195 | 196 | /* just_sleep() 197 | */ 198 | static void just_sleep(pthread_mutex_t *lock, pthread_cond_t *wakeup) 199 | { 200 | pthread_mutex_lock(lock); 201 | if (__atomic_load_n(&sleep_check_state, __ATOMIC_SEQ_CST) == sleeping) 202 | pthread_cond_wait(wakeup, lock); 203 | else 204 | pthread_mutex_unlock(lock); 205 | } 206 | 207 | 208 | /* snapshot_and_sleep() 209 | */ 210 | static void snapshot_and_sleep(pthread_mutex_t *lock, pthread_cond_t *wakeup) 211 | { 212 | uint64_t snapshot_id = cong(UINT64_MAX, UINT64_MAX, &rngseed), previous = -1; 213 | if (!__atomic_compare_exchange_n(&snapshot_owner, &previous, snapshot_id, 0, 214 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { 215 | LOG_ERR(plog, " snapshot has previous owner!\n"); 216 | return; 217 | } 218 | 219 | int16_t i; 220 | for (i = 0; i < heap_p; ++i) { 221 | if (heaps[i].ntasks != 0) 222 | break; 223 | } 224 | if (i != heap_p) { 225 | LOG_INFO(plog, " heap has tasks, snapshot aborted\n"); 226 | return; 227 | } 228 | 229 | if (!__atomic_compare_exchange_n(&snapshot_owner, &snapshot_id, previous, 0, 230 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { 231 | LOG_INFO(plog, " snapshot owner changed, snapshot aborted\n"); 232 | return; 233 | } 234 | if (!__atomic_compare_exchange_n(&sleep_check_state, (int16_t *)&checking_for_sleeping, 235 | sleeping, 0, 236 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { 237 | LOG_ERR(plog, " sleep aborted at snapshot end\n"); 238 | return; 239 | } 240 | just_sleep(lock, wakeup); 241 | } 242 | 243 | 244 | /* 
multiq_sleep_if_empty() 245 | */ 246 | void multiq_sleep_if_empty(pthread_mutex_t *lock, pthread_cond_t *wakeup) 247 | { 248 | int16_t state; 249 | 250 | sleep_start: 251 | state = __atomic_load_n(&sleep_check_state, __ATOMIC_SEQ_CST); 252 | if (state == checking_for_sleeping) { 253 | for (; ;) { 254 | cpu_pause(); 255 | state = __atomic_load_n(&sleep_check_state, __ATOMIC_SEQ_CST); 256 | if (state == not_sleeping) 257 | break; 258 | else if (state == sleeping) { 259 | just_sleep(lock, wakeup); 260 | break; 261 | } 262 | } 263 | } 264 | else if (state == not_sleeping) { 265 | if (!__atomic_compare_exchange_n(&sleep_check_state, (int16_t *)¬_sleeping, 266 | checking_for_sleeping, 0, 267 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) 268 | goto sleep_start; 269 | snapshot_and_sleep(lock, wakeup); 270 | if (!__atomic_compare_exchange_n(&sleep_check_state, (int16_t *)&sleeping, 271 | not_sleeping, 0, 272 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) 273 | LOG_ERR(plog, " sleep check state update failed\n"); 274 | } 275 | else /* state == sleeping */ 276 | just_sleep(lock, wakeup); 277 | } 278 | 279 | -------------------------------------------------------------------------------- /src/multiq.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | MultiQueues (http://arxiv.org/abs/1411.1209) 4 | */ 5 | 6 | #ifndef MULTIQ_H 7 | #define MULTIQ_H 8 | 9 | #include 10 | #include "task.h" 11 | 12 | 13 | void multiq_init(); 14 | void multiq_destroy(); 15 | int multiq_insert(ptask_t *elem, int16_t priority); 16 | ptask_t *multiq_deletemin(); 17 | int16_t multiq_minprio(); 18 | void multiq_sleep_if_empty(pthread_mutex_t *lock, pthread_cond_t *wakeup); 19 | 20 | 21 | #endif /* MULTIQ_H */ 22 | 23 | -------------------------------------------------------------------------------- /src/partr.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | interface -- implementation of spawn/sync/parfor, thread function, etc. 
4 | */ 5 | 6 | #include "partr.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "congrng.h" 17 | #include "synctreepool.h" 18 | #include "taskpools.h" 19 | #include "multiq.h" 20 | 21 | #include "profile.h" 22 | 23 | 24 | /* used for logging by the entire runtime */ 25 | log_t plog; 26 | 27 | /* number of threads created */ 28 | int16_t nthreads; 29 | 30 | /* thread-local 0-based identifier */ 31 | __thread int16_t tid; 32 | 33 | /* the `start` task */ 34 | ptask_t *start_task; 35 | 36 | /* task currently being executed */ 37 | __thread ptask_t *curr_task; 38 | 39 | /* RNG seed */ 40 | __thread uint64_t rngseed; 41 | 42 | /* per-thread task queues, for sticky tasks */ 43 | __thread ptask_t **taskq; 44 | __thread int8_t *taskq_lock; 45 | 46 | /* sticky task queues need to be visible to all threads */ 47 | ptask_t ***all_taskqs; 48 | int8_t **all_taskq_locks; 49 | 50 | /* thread sleep threshold */ 51 | uint64_t sleep_threshold; 52 | 53 | /* per-thread sleep lock/wakeup signal */ 54 | __thread pthread_mutex_t *sleep_lock; 55 | __thread pthread_cond_t *wake_signal; 56 | 57 | /* thread sleep/wakeup signals need to be visible to all threads */ 58 | pthread_mutex_t **all_sleep_locks; 59 | pthread_cond_t **all_wake_signals; 60 | 61 | /* thread IDs */ 62 | pthread_t *all_thread_ids; 63 | 64 | /* forward declare thread function */ 65 | static void *partr_thread(void *arg_); 66 | 67 | /* internally used to indicate a yield occurred in the runtime itself */ 68 | static const int64_t yield_from_sync = 1; 69 | 70 | /* initialization thread barrier */ 71 | static int volatile barcnt; 72 | static int volatile barsense = 1; 73 | 74 | #define BARRIER_INIT() barcnt=nthreads 75 | #define BARRIER_THREAD_DECL int mysense = 1 76 | #define BARRIER() do { \ 77 | mysense = !mysense; \ 78 | if (!__atomic_sub_fetch(&barcnt, 1, __ATOMIC_SEQ_CST)) { \ 79 | barcnt = nthreads; \ 80 | barsense = mysense; \ 81 | } else while (barsense != mysense); \ 82 | } while(0) 83 | 84 | 85 | /* thread function argument */ 86 | typedef struct lthread_arg_tag { 87 | int16_t tid; 88 | int8_t exclusive; 89 | hwloc_topology_t topology; 90 | hwloc_cpuset_t cpuset; 91 | } lthread_arg_t; 92 | 93 | 94 | /* log_init() -- set up runtime logging 95 | */ 96 | static void log_init() 97 | { 98 | int level; 99 | char *cp; 100 | 101 | cp = getenv(LOG_LEVEL_NAME); 102 | if (!cp) 103 | cp = DEFAULT_LOG_LEVEL; 104 | if (strncasecmp(cp, "debug", 5) == 0) 105 | level = LOG_LEVEL_DEBUG; 106 | else if (strncasecmp(cp, "info", 4) == 0) 107 | level = LOG_LEVEL_INFO; 108 | else if (strncasecmp(cp, "err", 3) == 0) 109 | level = LOG_LEVEL_ERR; 110 | else if (strncasecmp(cp, "critical", 8) == 0) 111 | level = LOG_LEVEL_CRITICAL; 112 | else /* if (strncasecmp(cp, "warn", 4) == 0) */ 113 | level = LOG_LEVEL_WARN; 114 | 115 | LOG_SETUP(plog, level, stdout); 116 | LOG_INFO(plog, "partr threading\n"); 117 | } 118 | 119 | 120 | /* show_affinity() 121 | */ 122 | #ifdef __linux__ 123 | static void show_affinity() 124 | { 125 | int i; 126 | cpu_set_t cset; 127 | char buf[2048], num[16]; 128 | 129 | if (plog.level > LOG_LEVEL_DEBUG) return; 130 | 131 | pthread_t pthread_id = pthread_self(); 132 | 133 | CPU_ZERO(&cset); 134 | pthread_getaffinity_np(pthread_id, sizeof(cset), &cset); 135 | buf[0] = '\0'; 136 | for (i = 0; i < CPU_SETSIZE; ++i) { 137 | if (CPU_ISSET(i, &cset)) { 138 | snprintf(num, 15, "%d ", i); 139 | strcat(buf, num); 140 | } 141 | } 142 | LOG_DEBUG(plog, " <%d> bound to %d CPU(s): 
%s\n", 143 | tid, CPU_COUNT(&cset), buf); 144 | } 145 | #else 146 | static void show_affinity() 147 | { 148 | } 149 | #endif 150 | 151 | 152 | /* wake_thread() 153 | */ 154 | static void wake_thread(int16_t wtid) 155 | { 156 | if (wtid != tid) { 157 | pthread_mutex_lock(all_sleep_locks[wtid]); 158 | pthread_cond_signal(all_wake_signals[wtid]); 159 | pthread_mutex_unlock(all_sleep_locks[wtid]); 160 | } 161 | } 162 | 163 | 164 | /* wake_all_threads() 165 | */ 166 | static void wake_all_threads() 167 | { 168 | for (int16_t i = 0; i < nthreads; ++i) 169 | wake_thread(i); 170 | } 171 | 172 | 173 | /* partr_init() -- initialization entry point 174 | */ 175 | void partr_init() 176 | { 177 | log_init(); 178 | 179 | char *cp; 180 | 181 | /* get requested # threads */ 182 | nthreads = DEFAULT_NUM_THREADS; 183 | cp = getenv(NUM_THREADS_NAME); 184 | if (cp) 185 | nthreads = strtol(cp, NULL, 10); 186 | LOG_INFO(plog, " %d threads requested\n", nthreads); 187 | 188 | /* check if we have exclusive use of the machine */ 189 | int exclusive = DEFAULT_MACHINE_EXCLUSIVE; 190 | cp = getenv(MACHINE_EXCLUSIVE_NAME); 191 | if (cp) 192 | exclusive = strtol(cp, NULL, 10); 193 | 194 | /* check machine topology */ 195 | hwloc_topology_t topology; 196 | hwloc_topology_init(&topology); 197 | hwloc_topology_load(topology); 198 | int core_depth = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_CORE); 199 | unsigned ncores = hwloc_get_nbobjs_by_depth(topology, core_depth); 200 | LOG_INFO(plog, " %d cores detected\n", ncores); 201 | int pu_depth = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_PU); 202 | unsigned npus = hwloc_get_nbobjs_by_depth(topology, pu_depth); 203 | LOG_INFO(plog, " %d PUs detected\n", npus); 204 | 205 | /* some sanity checks */ 206 | if (nthreads > npus) { 207 | LOG_WARN(plog, " won't over-subscribe; adjusting number of threads" 208 | " to %d\n", npus); 209 | nthreads = npus; 210 | } 211 | if (nthreads < 1) { 212 | LOG_INFO(plog, " setting number of threads to number of cores (%d)\n", 213 | ncores); 214 | nthreads = ncores; 215 | } 216 | int depth; 217 | if (nthreads <= ncores) { 218 | LOG_INFO(plog, " 1 thread per core\n"); 219 | depth = core_depth; 220 | } 221 | else { 222 | LOG_INFO(plog, " >1 thread per core\n"); 223 | depth = pu_depth; 224 | } 225 | 226 | /* set affinity if we have exclusive use of the machine */ 227 | hwloc_obj_t obj; 228 | hwloc_cpuset_t cpuset; 229 | if (exclusive) { 230 | LOG_INFO(plog, " exclusive machine use\n"); 231 | 232 | /* rebind this thread to the first core/PU */ 233 | obj = hwloc_get_obj_by_depth(topology, depth, 0); 234 | assert(obj != NULL); 235 | cpuset = hwloc_bitmap_dup(obj->cpuset); 236 | /* hwloc_bitmap_singlify(cpuset); */ 237 | hwloc_set_cpubind(topology, cpuset, HWLOC_CPUBIND_THREAD); 238 | hwloc_bitmap_free(cpuset); 239 | } 240 | else 241 | LOG_INFO(plog, " non-exclusive machine use\n"); 242 | 243 | tid = 0; 244 | seed_cong(&rngseed); 245 | show_affinity(); 246 | 247 | /* initialize task pools */ 248 | taskpools_init(); 249 | 250 | /* initialize sync trees */ 251 | synctreepool_init(); 252 | 253 | /* initialize task multiqueue */ 254 | multiq_init(); 255 | 256 | /* initialize libconcurrent */ 257 | concurrent_init(); 258 | 259 | /* set up the sleep threshold */ 260 | sleep_threshold = DEFAULT_THREAD_SLEEP_THRESHOLD; 261 | cp = getenv(THREAD_SLEEP_THRESHOLD_NAME); 262 | if (cp) { 263 | if (!strncasecmp(cp, "infinite", 8)) 264 | sleep_threshold = 0; 265 | else 266 | sleep_threshold = (uint64_t)strtol(cp, NULL, 10); 267 | LOG_INFO(plog, " thread 
sleep threshold is %llu cycles\n", sleep_threshold); 268 | } 269 | 270 | /* allocate per-thread task queues, for sticky tasks */ 271 | posix_memalign((void **)&all_taskqs, 64, nthreads * sizeof(ptask_t **)); 272 | posix_memalign((void **)&all_taskq_locks, 64, nthreads * sizeof(int8_t *)); 273 | 274 | /* allocate per-thread sleep/wakeup locks and signals */ 275 | posix_memalign((void **)&all_sleep_locks, 64, nthreads * sizeof(pthread_mutex_t *)); 276 | posix_memalign((void **)&all_wake_signals, 64, nthreads * sizeof(pthread_cond_t *)); 277 | 278 | /* setup profiling */ 279 | PROFILE_SETUP(); 280 | 281 | /* start threads */ 282 | BARRIER_THREAD_DECL; 283 | BARRIER_INIT(); 284 | 285 | /* allocate space for all thread IDs */ 286 | all_thread_ids = (pthread_t *)calloc(nthreads, sizeof(pthread_t)); 287 | all_thread_ids[0] = pthread_self(); 288 | 289 | for (int16_t i = 1; i < nthreads; ++i) { 290 | lthread_arg_t *targ = (lthread_arg_t *)calloc(1, sizeof(lthread_arg_t)); 291 | targ->tid = i; 292 | targ->exclusive = exclusive; 293 | 294 | if (exclusive) { 295 | /* tell the thread which core to bind to */ 296 | obj = hwloc_get_obj_by_depth(topology, depth, i); 297 | cpuset = hwloc_bitmap_dup(obj->cpuset); 298 | targ->topology = topology; 299 | targ->cpuset = cpuset; 300 | } 301 | pthread_create(&all_thread_ids[i], NULL, partr_thread, targ); 302 | } 303 | 304 | /* allocate this thread's sticky task queue pointer and initialize the lock */ 305 | posix_memalign((void **)&taskq_lock, 64, sizeof(int8_t) + sizeof(ptask_t *)); 306 | taskq = (ptask_t **)(taskq_lock + sizeof(int8_t)); 307 | __atomic_clear(taskq_lock, __ATOMIC_RELAXED); 308 | *taskq = NULL; 309 | all_taskqs[tid] = taskq; 310 | all_taskq_locks[tid] = taskq_lock; 311 | 312 | /* allocate this thread's sleep lock and wakeup signal */ 313 | posix_memalign((void **)&sleep_lock, 64, sizeof(pthread_mutex_t)); 314 | posix_memalign((void **)&wake_signal, 64, sizeof(pthread_cond_t)); 315 | pthread_mutex_init(sleep_lock, NULL); 316 | pthread_cond_init(wake_signal, NULL); 317 | all_sleep_locks[tid] = sleep_lock; 318 | all_wake_signals[tid] = wake_signal; 319 | 320 | /* set up profiling in thread 0 also */ 321 | PROFILE_INIT_THREAD(); 322 | 323 | /* wait for all threads to start up and bind to their CPUs */ 324 | BARRIER(); 325 | hwloc_topology_destroy(topology); 326 | } 327 | 328 | 329 | /* partr_shutdown() -- shutdown all threads and clean up 330 | */ 331 | void partr_shutdown() 332 | { 333 | /* create and add 'nthreads' terminate tasks */ 334 | LOG_INFO(plog, " thread %d adding %d terminate tasks\n", tid, nthreads); 335 | 336 | for (int64_t i = 0; i < nthreads; ++i) { 337 | ptask_t *task = task_alloc(); 338 | if (task == NULL) { 339 | LOG_CRITICAL(plog, " thread %d terminate task allocation failed!\n", 340 | tid); 341 | break; 342 | } 343 | task->settings = TASK_TERMINATE; 344 | if (multiq_insert(task, tid) != 0) { 345 | task_free(task); 346 | LOG_CRITICAL(plog, " thread %d shutdown task insertion failed!\n", tid); 347 | break; 348 | } 349 | } 350 | wake_all_threads(); 351 | 352 | /* wait for all threads to shut down */ 353 | for (int64_t i = 1; i < nthreads; ++i) 354 | pthread_join(all_thread_ids[i], NULL); 355 | 356 | /* show profiling information */ 357 | PROFILE_PRINT(); 358 | 359 | /* free thread IDs array */ 360 | free(all_thread_ids); 361 | 362 | /* free sleep lock and wakeup signal */ 363 | free(wake_signal); 364 | free(sleep_lock); 365 | free(all_wake_signals); 366 | free(all_sleep_locks); 367 | 368 | /* free task queues and their locks */ 369 
| free(taskq_lock); 370 | free(all_taskq_locks); 371 | free(all_taskqs); 372 | 373 | /* shut down the tasking library */ 374 | concurrent_fin(); 375 | 376 | /* destroy the task queues */ 377 | multiq_destroy(); 378 | 379 | /* destroy the sync trees */ 380 | synctreepool_destroy(); 381 | 382 | /* destroy the task pools and free all tasks */ 383 | taskpools_destroy(); 384 | } 385 | 386 | 387 | /* partr_coro() -- coroutine entry point 388 | */ 389 | static void partr_coro(struct concurrent_ctx *ctx) 390 | { 391 | ptask_t *task = ctx_get_user_ptr(ctx); 392 | task->result = task->f(task->arg, task->start, task->end); 393 | 394 | /* grain tasks must synchronize */ 395 | if (task->grain_num >= 0) { 396 | int was_last = 0; 397 | 398 | /* reduce... */ 399 | if (task->red) { 400 | task->result = reduce(task->arr, task->red, task->rf, 401 | task->result, task->grain_num); 402 | /* if this task is last, set the result in the parent task */ 403 | if (task->result) { 404 | task->parent->red_result = task->result; 405 | was_last = 1; 406 | } 407 | } 408 | /* ... or just sync */ 409 | else { 410 | if (last_arriver(task->arr, task->grain_num)) 411 | was_last = 1; 412 | } 413 | 414 | /* the last task to finish needs to finish up the loop */ 415 | if (was_last) { 416 | LOG_DEBUG(plog, " thread %d grain task %d (%p) was last\n", 417 | tid, task->grain_num, task); 418 | 419 | /* a non-parent task must wake up the parent */ 420 | if (task->grain_num > 0) { 421 | LOG_DEBUG(plog, " thread %d waking loop parent task %p\n", 422 | tid, task->parent); 423 | multiq_insert(task->parent, 0); 424 | wake_all_threads(); 425 | } 426 | /* the parent task was last; it can just end */ 427 | } 428 | else { 429 | /* the parent task needs to wait */ 430 | if (task->grain_num == 0) { 431 | LOG_DEBUG(plog, " thread %d loop parent task %p yielding\n", 432 | tid, task); 433 | yield_value(task->ctx, (void *)yield_from_sync); 434 | } 435 | } 436 | 437 | if (task->grain_num == 0) 438 | LOG_DEBUG(plog, " thread %d completed loop task %p\n", tid, task); 439 | } 440 | } 441 | 442 | 443 | /* setup_task() -- allocate and initialize a task 444 | */ 445 | static ptask_t *setup_task(void *(*f)(void *, int64_t, int64_t), void *arg, 446 | int64_t start, int64_t end) 447 | { 448 | ptask_t *task = task_alloc(); 449 | if (task == NULL) 450 | return NULL; 451 | 452 | ctx_construct(task->ctx, task->stack, TASK_STACK_SIZE, partr_coro, task); 453 | task->f = f; 454 | task->arg = arg; 455 | task->start = start; 456 | task->end = end; 457 | task->settings = 0; 458 | task->sticky_tid = -1; 459 | task->grain_num = -1; 460 | 461 | return task; 462 | } 463 | 464 | 465 | /* release_task() -- destroy the coroutine context and free the task 466 | */ 467 | static void *release_task(ptask_t *task) 468 | { 469 | void *result = task->result; 470 | ctx_destruct(task->ctx); 471 | if (task->grain_num == 0 && task->red) 472 | reducer_free(task->red); 473 | if (task->grain_num == 0 && task->arr) 474 | arriver_free(task->arr); 475 | task->f = NULL; 476 | task->arg = task->result = task->red_result = NULL; 477 | task->start = task->end = 0; 478 | task->rf = NULL; 479 | task->parent = task->cq = NULL; 480 | task->arr = NULL; 481 | task->red = NULL; 482 | task_free(task); 483 | return result; 484 | } 485 | 486 | 487 | /* add_to_taskq() -- add the specified task to the sticky task queue 488 | */ 489 | static void add_to_taskq(ptask_t *task) 490 | { 491 | assert(task->sticky_tid != -1); 492 | 493 | ptask_t **q = all_taskqs[task->sticky_tid]; 494 | int8_t *lock = 
all_taskq_locks[task->sticky_tid]; 495 | 496 | while (__atomic_test_and_set(lock, __ATOMIC_ACQUIRE)) 497 | cpu_pause(); 498 | 499 | if (*q == NULL) 500 | *q = task; 501 | else { 502 | ptask_t *pt = *q; 503 | while (pt->next) 504 | pt = pt->next; 505 | pt->next = task; 506 | } 507 | 508 | __atomic_clear(lock, __ATOMIC_RELEASE); 509 | 510 | wake_thread(task->sticky_tid); 511 | } 512 | 513 | 514 | /* get_from_taskq() -- pop the first task off the sticky task queue 515 | */ 516 | static ptask_t *get_from_taskq() 517 | { 518 | /* racy check for quick path */ 519 | if (*taskq == NULL) 520 | return NULL; 521 | 522 | while (__atomic_test_and_set(taskq_lock, __ATOMIC_ACQUIRE)) 523 | cpu_pause(); 524 | 525 | ptask_t *task = *taskq; 526 | if (task) { 527 | *taskq = task->next; 528 | task->next = NULL; 529 | } 530 | 531 | __atomic_clear(taskq_lock, __ATOMIC_RELEASE); 532 | 533 | return task; 534 | } 535 | 536 | 537 | /* sleep_after_threshold() -- if sleep_threshold cycles have passed, sleep the thread 538 | */ 539 | static void sleep_after_threshold(uint64_t *start_cycles) 540 | { 541 | if (sleep_threshold) { 542 | if (!(*start_cycles)) { 543 | *start_cycles = rdtscp(); 544 | return; 545 | } 546 | uint64_t elapsed_cycles = rdtscp() - (*start_cycles); 547 | if (elapsed_cycles >= sleep_threshold) { 548 | multiq_sleep_if_empty(sleep_lock, wake_signal); 549 | *start_cycles = 0; 550 | } 551 | } 552 | } 553 | 554 | 555 | /* run_next() -- run the next available task 556 | */ 557 | static int run_next() 558 | { 559 | ptask_t *task; 560 | 561 | /* first check for sticky tasks */ 562 | task = get_from_taskq(); 563 | if (task == NULL) { 564 | /* no sticky tasks, go to the multiq */ 565 | task = multiq_deletemin(); 566 | if (task != NULL) 567 | assert(!(task->settings & TASK_IS_STICKY)); 568 | } 569 | if (task == NULL) 570 | return 0; 571 | 572 | /* terminate tasks tell the thread to die */ 573 | if (task->settings & TASK_TERMINATE) { 574 | release_task(task); 575 | LOG_INFO(plog, " thread %d got terminate task\n", tid); 576 | return -1; 577 | } 578 | 579 | LOG_DEBUG(plog, " thread %d resuming task %p\n", tid, task); 580 | 581 | /* run/resume the task */ 582 | curr_task = task; 583 | int64_t y = (int64_t)resume(task->ctx); 584 | curr_task = NULL; 585 | 586 | /* if the task isn't done, it is either in a CQ, or must be re-queued */ 587 | if (!ctx_is_done(task->ctx)) { 588 | /* the yield value tells us if the task is in a CQ */ 589 | if (y != yield_from_sync) { 590 | LOG_DEBUG(plog, " thread %d had task %p yield\n", tid, task); 591 | 592 | /* sticky tasks go to the thread's sticky queue */ 593 | if (task->settings & TASK_IS_STICKY) 594 | add_to_taskq(task); 595 | 596 | /* all others go back into the multiq */ 597 | else { 598 | multiq_insert(task, task->prio); 599 | wake_all_threads(); 600 | } 601 | } 602 | return 1; 603 | } 604 | 605 | LOG_DEBUG(plog, " thread %d completed task %p\n", tid, task); 606 | 607 | /* The task completed. As detached tasks cannot be synced, clean 608 | those up here. 
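Sync-able tasks are freed by partr_sync(); for those, we only wake
the tasks waiting in this task's completion queue (below).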
609 | */ 610 | if (task->settings & TASK_IS_DETACHED) { 611 | release_task(task); 612 | return 2; 613 | } 614 | 615 | /* add back all the tasks in this one's completion queue */ 616 | while (__atomic_test_and_set(&task->cq_lock, __ATOMIC_ACQUIRE)) 617 | cpu_pause(); 618 | ptask_t *cqtask, *cqnext; 619 | cqtask = task->cq; 620 | task->cq = NULL; 621 | while (cqtask) { 622 | cqnext = cqtask->next; 623 | cqtask->next = NULL; 624 | LOG_DEBUG(plog, " thread %d adding from task %p's CQ: %p\n", 625 | tid, task, cqtask); 626 | if (cqtask->settings & TASK_IS_STICKY) 627 | add_to_taskq(cqtask); 628 | else { 629 | multiq_insert(cqtask, cqtask->prio); 630 | wake_all_threads(); 631 | } 632 | cqtask = cqnext; 633 | } 634 | __atomic_clear(&task->cq_lock, __ATOMIC_RELEASE); 635 | 636 | return 2; 637 | } 638 | 639 | 640 | /* partr_start() -- the runtime entry point 641 | 642 | To be called from thread 0, before creating any tasks. Wraps into 643 | a task and invokes `f(arg)`; tasks should only be spawned/synced 644 | from within tasks. 645 | */ 646 | int partr_start(void **ret, void *(*f)(void *, int64_t, int64_t), 647 | void *arg, int64_t start, int64_t end) 648 | { 649 | assert(tid == 0); 650 | 651 | start_task = setup_task(f, arg, start, end); 652 | if (start_task == NULL) 653 | return -1; 654 | start_task->settings |= TASK_IS_STICKY; 655 | start_task->sticky_tid = tid; 656 | 657 | LOG_DEBUG(plog, " thread %d invoking start task %p\n", tid, start_task); 658 | curr_task = start_task; 659 | int64_t y = (int64_t)resume(start_task->ctx); 660 | curr_task = NULL; 661 | 662 | if (!ctx_is_done(start_task->ctx)) { 663 | LOG_DEBUG(plog, " thread %d had start task %p yield\n", tid, start_task); 664 | if (y != yield_from_sync) { 665 | LOG_DEBUG(plog, " thread %d re-inserting start task %p\n", 666 | tid, start_task); 667 | add_to_taskq(start_task); 668 | } 669 | 670 | while (run_next() != -1) 671 | if (ctx_is_done(start_task->ctx)) 672 | break; 673 | } 674 | 675 | void *r = release_task(start_task); 676 | if (ret) 677 | *ret = r; 678 | 679 | LOG_DEBUG(plog, " thread %d released start task %p\n", tid, start_task); 680 | return 0; 681 | } 682 | 683 | 684 | /* partr_thread() -- the thread function 685 | 686 | Loops, getting tasks from the multiqueue and executing them. 
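Checks the thread's sticky task queue first, then the multiqueue;
after sleep_threshold cycles without finding work, sleeps until woken.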
687 | */ 688 | static void *partr_thread(void *arg_) 689 | { 690 | BARRIER_THREAD_DECL; 691 | lthread_arg_t *arg = (lthread_arg_t *)arg_; 692 | 693 | tid = arg->tid; 694 | seed_cong(&rngseed); 695 | 696 | /* set affinity if requested */ 697 | if (arg->exclusive) { 698 | hwloc_set_cpubind(arg->topology, arg->cpuset, HWLOC_CPUBIND_THREAD); 699 | hwloc_bitmap_free(arg->cpuset); 700 | } 701 | show_affinity(); 702 | 703 | /* allocate this thread's sticky task queue pointer and initialize the lock */ 704 | posix_memalign((void **)&taskq_lock, 64, sizeof(int8_t) + sizeof(ptask_t *)); 705 | taskq = (ptask_t **)(taskq_lock + sizeof(int8_t)); 706 | __atomic_clear(taskq_lock, __ATOMIC_RELAXED); 707 | *taskq = NULL; 708 | all_taskqs[tid] = taskq; 709 | all_taskq_locks[tid] = taskq_lock; 710 | 711 | /* allocate this thread's sleep lock and wakeup signal */ 712 | posix_memalign((void **)&sleep_lock, 64, sizeof(pthread_mutex_t)); 713 | posix_memalign((void **)&wake_signal, 64, sizeof(pthread_cond_t)); 714 | pthread_mutex_init(sleep_lock, NULL); 715 | pthread_cond_init(wake_signal, NULL); 716 | all_sleep_locks[tid] = sleep_lock; 717 | all_wake_signals[tid] = wake_signal; 718 | 719 | /* set up per-thread profiling */ 720 | PROFILE_INIT_THREAD(); 721 | 722 | BARRIER(); 723 | 724 | /* free the thread function argument */ 725 | free(arg); 726 | 727 | /* run the scheduler */ 728 | uint64_t start_cycles = 0; 729 | int r = 1; 730 | while (r != -1) { 731 | r = run_next(); 732 | if (r == 0) 733 | sleep_after_threshold(&start_cycles); 734 | else if (r > 0) 735 | start_cycles = 0; 736 | } 737 | 738 | /* free the sleep lock and wakeup signal */ 739 | free(wake_signal); 740 | free(sleep_lock); 741 | 742 | /* free the sticky task queue pointer (and its lock) */ 743 | free(taskq_lock); 744 | 745 | LOG_INFO(plog, " thread %d exiting\n", tid); 746 | return NULL; 747 | } 748 | 749 | 750 | /* partr_spawn() -- create a task for `f(arg)` and enqueue it for execution 751 | 752 | Implicitly asserts that `f(arg)` can run concurrently with everything 753 | else that's currently running. If `detach` is set, the spawned task 754 | will not be returned (and cannot be synced). Yields. 755 | */ 756 | int partr_spawn(partr_t *t, void *(*f)(void *, int64_t, int64_t), 757 | void *arg, int64_t start, int64_t end, int8_t sticky, int8_t detach) 758 | { 759 | PROFILE_START(PERF_SPAWN); 760 | 761 | ptask_t *task = setup_task(f, arg, start, end); 762 | if (task == NULL) 763 | return -1; 764 | if (detach) 765 | task->settings |= TASK_IS_DETACHED; 766 | if (sticky) { 767 | task->settings |= TASK_IS_STICKY; 768 | task->sticky_tid = tid; 769 | add_to_taskq(task); 770 | } 771 | else { 772 | if (multiq_insert(task, tid) != 0) { 773 | release_task(task); 774 | return -2; 775 | } 776 | wake_all_threads(); 777 | } 778 | 779 | *t = detach ? NULL : (partr_t)task; 780 | 781 | LOG_DEBUG(plog, " thread %d task %p spawned task %p\n", tid, curr_task, task); 782 | 783 | PROFILE_STAMP(PERF_SPAWN); 784 | 785 | /* only yield if we're running a non-sticky task */ 786 | if (!(curr_task->settings & TASK_IS_STICKY)) 787 | yield(curr_task->ctx); 788 | 789 | return 0; 790 | } 791 | 792 | 793 | /* partr_sync() -- get the return value of task `t` 794 | 795 | Returns only when task `t` has completed. 
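If `done_with_task` is set, the task is released before this returns,
and `t` must not be used again.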
796 | */ 797 | int partr_sync(void **r, partr_t t, int done_with_task) 798 | { 799 | PROFILE_START(PERF_SYNC); 800 | 801 | ptask_t *task = (ptask_t *)t; 802 | 803 | /* if the target task has not finished, add the current task to its 804 | completion queue; the thread that runs the target task will add 805 | this task back to the ready queue 806 | */ 807 | if (!ctx_is_done(task->ctx)) { 808 | curr_task->next = NULL; 809 | while (__atomic_test_and_set(&task->cq_lock, __ATOMIC_ACQUIRE)) 810 | cpu_pause(); 811 | 812 | /* ensure the task didn't finish before we got the lock */ 813 | if (!ctx_is_done(task->ctx)) { 814 | LOG_DEBUG(plog, " thread %d task %p sync on task %p\n", 815 | tid, curr_task, task); 816 | 817 | /* add the current task to the CQ */ 818 | if (task->cq == NULL) 819 | task->cq = curr_task; 820 | else { 821 | ptask_t *pt = task->cq; 822 | while (pt->next) 823 | pt = pt->next; 824 | pt->next = curr_task; 825 | } 826 | 827 | /* unlock the CQ and yield the current task */ 828 | __atomic_clear(&task->cq_lock, __ATOMIC_RELEASE); 829 | PROFILE_STAMP(PERF_SYNC); 830 | yield_value(curr_task->ctx, (void *)yield_from_sync); 831 | PROFILE_START(PERF_SYNC); 832 | } 833 | 834 | /* the task finished before we could add to its CQ */ 835 | else { 836 | __atomic_clear(&task->cq_lock, __ATOMIC_RELEASE); 837 | LOG_DEBUG(plog, " thread %d task %p sync on task %p success\n", 838 | tid, curr_task, task); 839 | } 840 | } 841 | 842 | if (r) 843 | *r = task->grain_num >= 0 && task->red ? 844 | task->red_result : task->result; 845 | 846 | if (done_with_task) 847 | release_task(task); 848 | 849 | PROFILE_STAMP(PERF_SYNC); 850 | 851 | return 0; 852 | } 853 | 854 | 855 | /* partr_parfor() -- spawn multiple tasks for a parallel loop 856 | 857 | Spawn tasks that invoke `f(arg, start, end)` such that the sum of `end-start` 858 | for all tasks is `count`. Uses `rf()`, if provided, to reduce the return 859 | values from the tasks, and returns the result. Yields. 860 | */ 861 | int partr_parfor(partr_t *t, void *(*f)(void *, int64_t, int64_t), 862 | void *arg, int64_t count, void *(*rf)(void *, void *)) 863 | { 864 | PROFILE_START(PERF_PARFOR); 865 | 866 | int64_t n = GRAIN_K * nthreads; 867 | lldiv_t each = lldiv(count, n); 868 | 869 | /* allocate synchronization tree(s) */ 870 | arriver_t *arr = arriver_alloc(); 871 | if (arr == NULL) { 872 | LOG_CRITICAL(plog, " thread %d parfor arriver alloc failed!\n", tid); 873 | return -1; 874 | } 875 | reducer_t *red = NULL; 876 | if (rf != NULL) { 877 | red = reducer_alloc(); 878 | if (red == NULL) { 879 | arriver_free(arr); 880 | LOG_CRITICAL(plog, " thread %d parfor reducer alloc failed!\n", tid); 881 | return -2; 882 | } 883 | } 884 | 885 | /* allocate and enqueue (GRAIN_K * nthreads) tasks */ 886 | *t = NULL; 887 | int64_t start = 0, end; 888 | for (int64_t i = 0; i < n; ++i) { 889 | end = start + each.quot + (i < each.rem ? 1 : 0); 890 | ptask_t *task = setup_task(f, arg, start, end); 891 | if (task == NULL) { 892 | LOG_CRITICAL(plog, " thread %d parfor task setup failed!\n", tid); 893 | return -1; 894 | } 895 | 896 | /* The first task is the parent (root) task of the parfor, thus only 897 | this can be synced. So, we create the remaining tasks detached. 
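All grains, detached ones included, still synchronize through the
arriver tree (and combine results through the reducer tree) allocated
above.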
898 | */ 899 | if (*t == NULL) *t = task; 900 | else task->settings = TASK_IS_DETACHED; 901 | 902 | task->parent = *t; 903 | task->grain_num = i; 904 | task->rf = rf; 905 | task->arr = arr; 906 | task->red = red; 907 | 908 | if (multiq_insert(task, tid) != 0) { 909 | release_task(task); 910 | LOG_CRITICAL(plog, " thread %d parfor multiq insert failed!\n", tid); 911 | return -3; 912 | } 913 | 914 | start = end; 915 | } 916 | wake_all_threads(); 917 | 918 | LOG_DEBUG(plog, " thread %d task %p parfor spawned %lld tasks\n", 919 | tid, curr_task, n); 920 | 921 | PROFILE_STAMP(PERF_PARFOR); 922 | 923 | /* only yield if we're running a non-sticky task */ 924 | if (!(curr_task->settings & TASK_IS_STICKY)) 925 | yield(curr_task->ctx); 926 | 927 | return 0; 928 | } 929 | 930 | -------------------------------------------------------------------------------- /src/perfutil.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | */ 3 | 4 | #ifndef PERFUTIL_H 5 | #define PERFUTIL_H 6 | 7 | #include 8 | 9 | #define cpu_pause() __asm__("pause;"); 10 | 11 | static inline uint64_t rdtscp() 12 | { 13 | uint32_t lo, hi; 14 | __asm__ volatile ("rdtscp" 15 | : /* outputs */ "=a" (lo), "=d" (hi) 16 | : /* no inputs */ 17 | : /* clobbers */ "%rcx"); 18 | return ((uint64_t)hi << 32) + lo; 19 | } 20 | 21 | 22 | #endif /* PERFUTIL_H */ 23 | 24 | -------------------------------------------------------------------------------- /src/profile.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | thread-aware performance profiling 4 | */ 5 | 6 | #ifndef PROFILE_H 7 | #define PROFILE_H 8 | 9 | #include 10 | #include 11 | #include "perfutil.h" 12 | 13 | #ifdef PERF_PROFILE 14 | enum { 15 | PERF_SPAWN, PERF_SYNC, PERF_PARFOR, NTIMES 16 | }; 17 | 18 | char *times_names[] = { 19 | "spawn", "sync", "parfor", "run", "" 20 | }; 21 | 22 | typedef struct thread_timing_tag { 23 | uint64_t last, min, max, total, count; 24 | } thread_timing_t; 25 | 26 | thread_timing_t **thread_times; 27 | 28 | #define PROFILE_SETUP() \ 29 | posix_memalign((void **)&thread_times, \ 30 | 64, nthreads * sizeof(thread_timing_t *)); 31 | 32 | #define PROFILE_INIT_THREAD() \ 33 | posix_memalign((void **)&thread_times[tid], 64, \ 34 | NTIMES * sizeof(thread_timing_t)); \ 35 | for (int i = 0; i < NTIMES; i++) { \ 36 | thread_times[tid][i].last = thread_times[tid][i].max = \ 37 | thread_times[tid][i].total = thread_times[tid][i].count = 0; \ 38 | thread_times[tid][i].min = UINT64_MAX; \ 39 | } 40 | 41 | #define PROFILE_START(w) \ 42 | thread_times[tid][(w)].last = rdtscp() 43 | 44 | #define PROFILE_STAMP(w) \ 45 | { \ 46 | uint64_t l = thread_times[tid][(w)].last = \ 47 | rdtscp() - thread_times[tid][(w)].last; \ 48 | if (l < thread_times[tid][(w)].min) thread_times[tid][(w)].min = l; \ 49 | if (l > thread_times[tid][(w)].max) thread_times[tid][(w)].max = l; \ 50 | thread_times[tid][(w)].total += l; \ 51 | ++thread_times[tid][(w)].count; \ 52 | } 53 | 54 | #define PROFILE_PRINT() \ 55 | { \ 56 | thread_timing_t coll_times[NTIMES]; \ 57 | for (int i = 0; i < NTIMES; i++) { \ 58 | memset(&coll_times[i], 0, sizeof (thread_timing_t)); \ 59 | coll_times[i].min = UINT64_MAX; \ 60 | } \ 61 | for (int tnum = 0; tnum < nthreads; tnum++) { \ 62 | for (int i = 0; i < NTIMES; i++) { \ 63 | coll_times[i].total += thread_times[tnum][i].total; \ 64 | coll_times[i].count += thread_times[tnum][i].count; \ 65 | if 
(thread_times[tnum][i].max > coll_times[i].max) \ 66 | coll_times[i].max = thread_times[tnum][i].max; \ 67 | if (thread_times[tnum][i].min < coll_times[i].min) \ 68 | coll_times[i].min = thread_times[tnum][i].min; \ 69 | } \ 70 | } \ 71 | printf("partr profile: #calls, mean ticks, [min, max]\n"); \ 72 | for (int i = 0; i < NTIMES; i++) { \ 73 | uint64_t m = 0; \ 74 | if (coll_times[i].count > 0) \ 75 | m = coll_times[i].total / (double)coll_times[i].count; \ 76 | printf("%s: %llu, %llu, [%llu, %llu]\n", \ 77 | times_names[i], coll_times[i].count, m, \ 78 | coll_times[i].min, coll_times[i].max); \ 79 | } \ 80 | } 81 | 82 | 83 | #else /* !PERF_PROFILE */ 84 | 85 | #define PROFILE_SETUP() 86 | #define PROFILE_INIT_THREAD() 87 | #define PROFILE_START(w) 88 | #define PROFILE_STAMP(w) 89 | #define PROFILE_PRINT() 90 | 91 | #endif /* PERF_PROFILE */ 92 | 93 | #endif /* PROFILE_H */ 94 | 95 | -------------------------------------------------------------------------------- /src/synctreepool.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Pool of synchronization trees, for synchronizing parfor-generated tasks. 4 | Synchronization and reduction are managed via two binary trees. 5 | */ 6 | 7 | #include 8 | #include "partr.h" 9 | #include "synctreepool.h" 10 | 11 | 12 | /* arrival tree */ 13 | struct arriver_tag { 14 | int16_t index, next_avail; 15 | int16_t **tree; 16 | }; 17 | 18 | 19 | /* reduction tree */ 20 | struct reducer_tag { 21 | int16_t index, next_avail; 22 | void ***tree; 23 | }; 24 | 25 | 26 | 27 | /* pool of arrival trees */ 28 | static arriver_t *arriverpool; 29 | static int16_t num_arrivers, num_arriver_tree_nodes, next_arriver; 30 | 31 | 32 | /* pool of reduction trees */ 33 | static reducer_t *reducerpool; 34 | static int16_t num_reducers, num_reducer_tree_nodes, next_reducer; 35 | 36 | 37 | /* synctreepool_init() 38 | */ 39 | void synctreepool_init() 40 | { 41 | num_arriver_tree_nodes = (GRAIN_K * nthreads) - 1; 42 | num_reducer_tree_nodes = (2 * GRAIN_K * nthreads) - 1; 43 | 44 | /* num_arrivers = ((GRAIN_K * nthreads) ^ ARRIVERS_P) + 1 */ 45 | num_arrivers = GRAIN_K * nthreads; 46 | for (int i = 1; i < ARRIVERS_P; ++i) 47 | num_arrivers = num_arrivers * num_arrivers; 48 | ++num_arrivers; 49 | 50 | num_reducers = num_arrivers * REDUCERS_FRAC; 51 | 52 | /* allocate */ 53 | arriverpool = (arriver_t *)calloc(num_arrivers, sizeof (arriver_t)); 54 | next_arriver = 0; 55 | for (int i = 0; i < num_arrivers; ++i) { 56 | arriverpool[i].index = i; 57 | arriverpool[i].next_avail = i + 1; 58 | posix_memalign((void **)&arriverpool[i].tree, 64, 59 | num_arriver_tree_nodes * sizeof (int16_t *)); 60 | //arriverpool[i].tree = 61 | // aligned_alloc(64, num_arriver_tree_nodes * sizeof (int16_t *)); 62 | for (int j = 0; j < num_arriver_tree_nodes; ++j) 63 | posix_memalign((void **)&arriverpool[i].tree[j], 64, sizeof (int16_t)); 64 | //arriverpool[i].tree[j] = aligned_alloc(64, sizeof (int16_t)); 65 | } 66 | arriverpool[num_arrivers - 1].next_avail = -1; 67 | 68 | reducerpool = (reducer_t *)calloc(num_reducers, sizeof (reducer_t)); 69 | next_reducer = 0; 70 | for (int i = 0; i < num_reducers; ++i) { 71 | reducerpool[i].index = i; 72 | reducerpool[i].next_avail = i + 1; 73 | posix_memalign((void **)&reducerpool[i].tree, 64, 74 | num_reducer_tree_nodes * sizeof (void **)); 75 | //reducerpool[i].tree = 76 | // aligned_alloc(64, num_reducer_tree_nodes * sizeof (void **)); 77 | for (int j = 0; j < num_reducer_tree_nodes; ++j) 
78 | posix_memalign((void **)&reducerpool[i].tree[j], 64, sizeof (void *)); 79 | //reducerpool[i].tree[j] = aligned_alloc(64, sizeof (void *)); 80 | } 81 | if (num_reducers > 0) 82 | reducerpool[num_reducers - 1].next_avail = -1; 83 | else 84 | next_reducer = -1; 85 | 86 | LOG_INFO(plog, " %d arrivers and %d reducers allocated\n", 87 | num_arrivers, num_reducers); 88 | } 89 | 90 | 91 | /* synctreepool_destroy() 92 | */ 93 | void synctreepool_destroy() 94 | { 95 | for (int i = 0; i < num_arrivers; ++i) { 96 | for (int j = 0; j < num_arriver_tree_nodes; ++j) 97 | free(arriverpool[i].tree[j]); 98 | free(arriverpool[i].tree); 99 | } 100 | free(arriverpool); 101 | 102 | arriverpool = NULL; 103 | num_arrivers = 0; 104 | num_arriver_tree_nodes = 0; 105 | next_arriver = -1; 106 | 107 | for (int i = 0; i < num_reducers; ++i) { 108 | for (int j = 0; j < num_reducer_tree_nodes; ++j) 109 | free(reducerpool[i].tree[j]); 110 | free(reducerpool[i].tree); 111 | } 112 | free(reducerpool); 113 | 114 | reducerpool = NULL; 115 | num_reducers = 0; 116 | num_reducer_tree_nodes = 0; 117 | next_reducer = -1; 118 | } 119 | 120 | 121 | /* arriver_alloc() 122 | */ 123 | arriver_t *arriver_alloc() 124 | { 125 | int16_t candidate; 126 | arriver_t *arr; 127 | 128 | do { 129 | candidate = __atomic_load_n(&next_arriver, __ATOMIC_SEQ_CST); 130 | if (candidate == -1) { 131 | LOG_ERR(plog, " <%d> arriver allocation failed\n", tid); 132 | return NULL; 133 | } 134 | arr = &arriverpool[candidate]; 135 | } while (!__atomic_compare_exchange_n(&next_arriver, 136 | &candidate, arr->next_avail, 137 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)); 138 | return arr; 139 | } 140 | 141 | 142 | /* arriver_free() 143 | */ 144 | void arriver_free(arriver_t *arr) 145 | { 146 | for (int i = 0; i < num_arriver_tree_nodes; ++i) 147 | *arr->tree[i] = 0; 148 | 149 | __atomic_exchange(&next_arriver, &arr->index, &arr->next_avail, 150 | __ATOMIC_SEQ_CST); 151 | } 152 | 153 | 154 | /* reducer_alloc() 155 | */ 156 | reducer_t *reducer_alloc() 157 | { 158 | int16_t candidate; 159 | reducer_t *red; 160 | 161 | do { 162 | candidate = __atomic_load_n(&next_reducer, __ATOMIC_SEQ_CST); 163 | if (candidate == -1) { 164 | LOG_ERR(plog, " <%d> reducer allocation failed\n", tid); 165 | return NULL; 166 | } 167 | red = &reducerpool[candidate]; 168 | } while (!__atomic_compare_exchange_n(&next_reducer, 169 | &candidate, red->next_avail, 170 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)); 171 | return red; 172 | } 173 | 174 | 175 | /* reducer_free() 176 | */ 177 | void reducer_free(reducer_t *red) 178 | { 179 | for (int i = 0; i < num_reducer_tree_nodes; ++i) 180 | *red->tree[i] = 0; 181 | 182 | __atomic_exchange(&next_reducer, &red->index, &red->next_avail, 183 | __ATOMIC_SEQ_CST); 184 | } 185 | 186 | 187 | /* last_arriver() 188 | */ 189 | int last_arriver(arriver_t *arr, int idx) 190 | { 191 | int arrived, aidx = idx + (GRAIN_K * nthreads) - 1; 192 | 193 | while (aidx > 0) { 194 | --aidx; 195 | aidx >>= 1; 196 | arrived = __atomic_fetch_add(arr->tree[aidx], 1, __ATOMIC_SEQ_CST); 197 | if (!arrived) return 0; 198 | } 199 | 200 | return 1; 201 | } 202 | 203 | 204 | /* reduce() 205 | */ 206 | void *reduce(arriver_t *arr, reducer_t *red, void *(*rf)(void *, void *), 207 | void *val, int idx) 208 | { 209 | int arrived, aidx = idx + (GRAIN_K * nthreads) - 1, ridx = aidx, nidx; 210 | 211 | *red->tree[ridx] = val; 212 | while (aidx > 0) { 213 | --aidx; 214 | aidx >>= 1; 215 | arrived = __atomic_fetch_add(arr->tree[aidx], 1, __ATOMIC_SEQ_CST); 216 | if (!arrived) return NULL; 
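/* first to arrive at this inner node: our partial value is already
   stored in red->tree[ridx] for the sibling to combine, so stop here */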
217 | 218 | /* neighbor has already arrived, get its value and reduce it */ 219 | nidx = ridx & 0x1 ? ridx + 1 : ridx - 1; 220 | val = rf(val, *red->tree[nidx]); 221 | 222 | /* move up the tree */ 223 | --ridx; 224 | ridx >>= 1; 225 | *red->tree[ridx] = val; 226 | } 227 | 228 | return val; 229 | } 230 | 231 | -------------------------------------------------------------------------------- /src/synctreepool.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Pool of synchronization trees, for synchronizing parfor-generated tasks. 4 | */ 5 | 6 | #ifndef SYNCTREEPOOL_H 7 | #define SYNCTREEPOOL_H 8 | 9 | #include 10 | 11 | typedef struct arriver_tag arriver_t; 12 | typedef struct reducer_tag reducer_t; 13 | 14 | /* interface */ 15 | void synctreepool_init(); 16 | void synctreepool_destroy(); 17 | arriver_t *arriver_alloc(); 18 | void arriver_free(arriver_t *); 19 | reducer_t *reducer_alloc(); 20 | void reducer_free(reducer_t *); 21 | 22 | int last_arriver(arriver_t *, int); 23 | void *reduce(arriver_t *, reducer_t *, void *(*rf)(void *, void *), void *, int); 24 | 25 | 26 | #endif /* SYNCTREEPOOL_H */ 27 | 28 | -------------------------------------------------------------------------------- /src/task.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | task definition 4 | */ 5 | 6 | #ifndef TASK_H 7 | #define TASK_H 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | /* task settings */ 15 | #define TASK_TERMINATE 0x01 16 | /* terminate thread */ 17 | #define TASK_IS_DETACHED 0x02 18 | /* clean up the task on completion */ 19 | #define TASK_IS_STICKY 0x04 20 | /* task is sticky to the thread that first runs it */ 21 | 22 | typedef struct arriver_tag arriver_t; 23 | typedef struct reducer_tag reducer_t; 24 | 25 | typedef struct ptask_tag ptask_t; 26 | 27 | 28 | /* a task */ 29 | struct ptask_tag { 30 | /* to link this task into queues */ 31 | ptask_t *next; 32 | 33 | /* coroutine context and stack */ 34 | struct concurrent_ctx *ctx; 35 | uint8_t *stack; 36 | 37 | /* task entry point, arguments, result, reduction function */ 38 | void *(*f)(void *, int64_t, int64_t); 39 | void *arg, *result; 40 | int64_t start, end; 41 | 42 | /* ----- IA-64 cache line boundary ----- */ 43 | 44 | /* reduction function, for parfors */ 45 | void *(*rf)(void *, void *); 46 | 47 | /* parent (first) task of a parfor set */ 48 | ptask_t *parent; 49 | 50 | /* to synchronize/reduce grains of a parfor */ 51 | arriver_t *arr; 52 | reducer_t *red; 53 | 54 | /* parfor reduction result */ 55 | void *red_result; 56 | 57 | /* completion queue and lock */ 58 | ptask_t *cq; 59 | int8_t cq_lock; 60 | 61 | /* task settings */ 62 | int8_t settings; 63 | 64 | /* tid of the thread to which this task is sticky */ 65 | int16_t sticky_tid; 66 | 67 | /* the index of this task in the set of grains of a parfor */ 68 | int16_t grain_num; 69 | 70 | /* for the multiqueue */ 71 | int16_t prio; 72 | 73 | /* to manage task pools */ 74 | int16_t pool, index, next_avail; 75 | 76 | /* padding to cache line boundary */ 77 | int8_t cl2_padding[2]; 78 | 79 | /* ----- IA-64 cache line boundary ----- */ 80 | }; 81 | 82 | 83 | #endif /* TASK_H */ 84 | 85 | -------------------------------------------------------------------------------- /src/taskpools.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | taskpools for 
fast allocation/freeing 4 | */ 5 | 6 | #include 7 | #include "partr.h" 8 | #include "taskpools.h" 9 | 10 | 11 | /* task pool for quick allocation/freeing of tasks */ 12 | typedef struct ptaskpool_tag { 13 | int16_t num_tasks, next_avail; 14 | ptask_t *tasks; 15 | } ptaskpool_t; 16 | 17 | 18 | /* a task pool for each thread */ 19 | static ptaskpool_t *ptaskpools; 20 | 21 | 22 | /* taskpools_init() 23 | */ 24 | void taskpools_init() 25 | { 26 | ptaskpools = (ptaskpool_t *)calloc(nthreads, sizeof(ptaskpool_t)); 27 | for (int16_t i = 0; i < nthreads; ++i) { 28 | ptaskpools[i].num_tasks = TASKS_PER_POOL; 29 | ptaskpools[i].next_avail = 0; 30 | ptaskpools[i].tasks = (ptask_t *) 31 | calloc(TASKS_PER_POOL, sizeof(ptask_t)); 32 | for (int16_t j = 0; j < TASKS_PER_POOL; ++j) { 33 | ptaskpools[i].tasks[j].ctx = 34 | calloc(ctx_sizeof(), sizeof(uint8_t)); 35 | ptaskpools[i].tasks[j].stack = 36 | calloc(TASK_STACK_SIZE, sizeof(uint8_t)); 37 | ptaskpools[i].tasks[j].pool = i; 38 | ptaskpools[i].tasks[j].index = j; 39 | ptaskpools[i].tasks[j].next_avail = j + 1; 40 | } 41 | ptaskpools[i].tasks[TASKS_PER_POOL-1].next_avail = -1; 42 | } 43 | LOG_INFO(plog, " %d tasks allocated per pool\n", TASKS_PER_POOL); 44 | } 45 | 46 | 47 | /* taskpools_destroy() 48 | */ 49 | void taskpools_destroy() 50 | { 51 | for (int16_t i = 0; i < nthreads; ++i) { 52 | for (int16_t j = 0; j < TASKS_PER_POOL; ++j) { 53 | free(ptaskpools[i].tasks[j].stack); 54 | free(ptaskpools[i].tasks[j].ctx); 55 | } 56 | free(ptaskpools[i].tasks); 57 | } 58 | free(ptaskpools); 59 | } 60 | 61 | 62 | /* task_alloc() 63 | */ 64 | ptask_t *task_alloc() 65 | { 66 | int16_t candidate; 67 | ptask_t *task; 68 | ptaskpool_t *pool = &ptaskpools[tid]; 69 | 70 | do { 71 | candidate = __atomic_load_n(&pool->next_avail, __ATOMIC_SEQ_CST); 72 | if (candidate == -1) { 73 | LOG_ERR(plog, " <%d> task allocation failed\n", tid); 74 | return NULL; 75 | } 76 | task = &pool->tasks[candidate]; 77 | } while (!__atomic_compare_exchange_n(&pool->next_avail, 78 | &candidate, task->next_avail, 79 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)); 80 | return task; 81 | } 82 | 83 | 84 | /* task_free() 85 | */ 86 | void task_free(ptask_t *task) 87 | { 88 | ptaskpool_t *pool = &ptaskpools[task->pool]; 89 | __atomic_exchange(&pool->next_avail, &task->index, &task->next_avail, 90 | __ATOMIC_SEQ_CST); 91 | } 92 | 93 | -------------------------------------------------------------------------------- /src/taskpools.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | taskpools for fast task allocation/freeing 4 | */ 5 | 6 | #ifndef TASKPOOLS_H 7 | #define TASKPOOLS_H 8 | 9 | #include 10 | #include "task.h" 11 | 12 | 13 | /* interface */ 14 | void taskpools_init(); 15 | void taskpools_destroy(); 16 | ptask_t *task_alloc(); 17 | void task_free(ptask_t *); 18 | 19 | 20 | #endif /* TASKPOOLS_H */ 21 | 22 | -------------------------------------------------------------------------------- /test/fib.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "partr.h" 4 | 5 | void *fib(void *arg_, int64_t start, int64_t end) 6 | { 7 | partr_t tx; 8 | int64_t x, y, n = (int64_t)arg_; 9 | if (n < 2) 10 | return (void *)n; 11 | 12 | partr_spawn(&tx, fib, (void *)n-1, 0, 0, 0, 0); 13 | y = (int64_t)fib((void *)n-2, 0, 0); 14 | partr_sync((void *)&x, tx, 1); 15 | 16 | return (void *)x + y; 17 | } 18 | 19 | void *serial_fib(void *arg_) 20 | { 21 | int64_t x, y, n = 
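/* the value n rides in the void* argument, cast back to an integer */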
(int64_t)arg_; 22 | if (n < 2) 23 | return (void *)n; 24 | x = (int64_t)serial_fib((void *)n-1); 25 | y = (int64_t)serial_fib((void *)n-2); 26 | return (void *)x + y; 27 | } 28 | 29 | void *run(void *arg, int64_t start, int64_t end) 30 | { 31 | int64_t v = 10, result, sresult; 32 | result = (int64_t)fib((void *)v, 0, 0); 33 | sresult = (int64_t)serial_fib((void *)v); 34 | printf("fib(%lld)=%lld\nserial_fib(%lld)=%lld\n", v, result, v, sresult); 35 | 36 | return 0; 37 | } 38 | 39 | int main(int argc, char **argv) 40 | { 41 | void *ret; 42 | partr_init(); 43 | partr_start(&ret, run, NULL, 0, 0); 44 | partr_shutdown(); 45 | return 0; 46 | } 47 | 48 | -------------------------------------------------------------------------------- /test/l3d.c: -------------------------------------------------------------------------------- 1 | // This file is a part of Julia. License is MIT: http://julialang.org/license 2 | 3 | // GCC command line: gcc -fopenmp -mavx2 laplace3d.c -o laplace3d 4 | 5 | /* Laplace 3D 6 | 7 | orig: simple serial version 8 | naive: simple parallelized version 9 | auto: some ninja knowledge, using icc directives 10 | sse/avx: ninja-optimized 11 | 12 | Requires Sandy Bridge and up. 13 | 14 | Note that the SSE/AVX versions do not handle boundary conditions 15 | and thus each dimension must be 4n+2/8n+2. Try 258x258x258. 16 | 17 | 2014.08.06 anand.deshpande Initial code. 18 | 2014.08.06 dhiraj.kalamkar Padding and streaming stores. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #if defined(__i386__) 33 | static inline uint64_t rdtsc(void) 34 | { 35 | uint64_t x; 36 | __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); 37 | return x; 38 | } 39 | #elif defined(__x86_64__) 40 | static inline uint64_t rdtsc(void) 41 | { 42 | unsigned hi, lo; 43 | __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); 44 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 45 | } 46 | #elif defined(_COMPILER_MICROSOFT_) 47 | #include 48 | static inline uint64_t rdtsc(void) 49 | { 50 | return __rdtsc(); 51 | } 52 | #endif 53 | 54 | void l3d_naive(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 55 | void l3d_auto(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 56 | void l3d_sse(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 57 | void l3d_avx(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 58 | void l3d_partr(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 59 | void l3d_orig(int nx, int ny, int nz, float *u1, float *u2); 60 | 61 | double cpughz() 62 | { 63 | uint64_t t0 = rdtsc(); 64 | sleep(1); 65 | uint64_t onesec = rdtsc() - t0; 66 | return onesec*1.0/1e9; 67 | } 68 | 69 | int main(int argc, char **argv) 70 | { 71 | int nx, padded_nx, ny, nz, iters, i, j, k, ind, p_ind, verify, 72 | nthreads, pad_size; 73 | float *u1, *u1_p, *u1_orig, *u2, *u2_p, *u2_orig, *foo, 74 | error_tol = 0.00001; 75 | double ghz; 76 | void (*l3d)(int nx, int padded_nx, int ny, int nz, 77 | float *u1, float *u2); 78 | 79 | if (argc != 7) { 80 | fprintf(stderr, "Usage:\n" 81 | " laplace3d <#iters> " 82 | "\n"); 83 | exit(-1); 84 | } 85 | 86 | nx = strtol(argv[1], NULL, 10); 87 | ny = strtol(argv[2], NULL, 10); 88 | nz = strtol(argv[3], NULL, 10); 89 | 90 | padded_nx = ((nx + 0x7) & (~0x7)); 91 | iters = strtol(argv[4], NULL, 10); 92 | 93 | if (strncasecmp(argv[5], "naive", 5) == 0) 94 | l3d = l3d_naive; 95 | else if (strncasecmp(argv[5], "auto", 4) == 0) 96 
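/* select the kernel variant by name; l3d is a function pointer */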
| l3d = l3d_auto; 97 | else if (strncasecmp(argv[5], "sse", 3) == 0) 98 | l3d = l3d_sse; 99 | else if (strncasecmp(argv[5], "avx", 3) == 0) 100 | l3d = l3d_avx; 101 | else if (strncasecmp(argv[5], "partr", 5) == 0) 102 | l3d = l3d_partr; 103 | else { 104 | fprintf(stderr, 105 | "don't recognize %s. naive, auto, sse, avx, or partr?\n", 106 | argv[5]); 107 | exit(-1); 108 | } 109 | 110 | verify = strtol(argv[6], NULL, 10); 111 | 112 | ghz = cpughz(); 113 | nthreads = omp_get_max_threads(); 114 | printf("machine speed is %g GHz, using %d threads\n", ghz, nthreads); 115 | 116 | printf("laplace3d: %d iterations on %dx%dx%d grid, " 117 | "verification is %s\n", 118 | iters, nx, ny, nz, verify ? "on" : "off"); 119 | 120 | /* pad for aligned access; non-naive only */ 121 | if (strncasecmp(argv[5], "naive", 5) != 0) { 122 | pad_size = (((1 + padded_nx + padded_nx * ny) + 0xF) & (~0xF)) - 123 | (1 + padded_nx + padded_nx * ny); 124 | printf("using padded_nx = %d, pad_size = %d\n", 125 | padded_nx, pad_size); 126 | 127 | u1_p = (float *)_mm_malloc(sizeof (float) * 128 | (padded_nx * ny * nz + pad_size), 64); 129 | u2_p = (float *)_mm_malloc(sizeof (float) * 130 | (padded_nx * ny * nz + pad_size), 64); 131 | u1 = u1_p + pad_size; 132 | u2 = u2_p + pad_size; 133 | } 134 | else { 135 | u1_p = (float *)_mm_malloc(sizeof (float) * (nx * ny * nz), 64); 136 | u2_p = (float *)_mm_malloc(sizeof (float) * (nx * ny * nz), 64); 137 | u1 = u1_p; 138 | u2 = u2_p; 139 | padded_nx = nx; 140 | } 141 | u1_orig = (float *)_mm_malloc(sizeof (float) * nx * ny * nz, 64); 142 | u2_orig = (float *)_mm_malloc(sizeof (float) * nx * ny * nz, 64); 143 | 144 | // initialize 145 | #pragma omp parallel for private(k,j,i,ind,p_ind) 146 | for (k = 0; k < nz; ++k) { 147 | for (j = 0; j < ny; ++j) { 148 | for (i = 0; i < nx; ++i) { 149 | ind = i + j*nx + k*nx*ny; 150 | p_ind = i + j*padded_nx + k*padded_nx*ny; 151 | 152 | if (i == 0 || i == nx - 1 153 | || j == 0 || j == ny - 1 154 | || k == 0 || k == nz - 1) { 155 | // Dirichlet b.c.'s 156 | u1[p_ind] = u1_orig[ind] = u2[p_ind] = 1.0f; 157 | } 158 | else { 159 | u1[p_ind] = u1_orig[ind] = u2[p_ind] = 0.0f; 160 | } 161 | } 162 | } 163 | } 164 | 165 | if (strncasecmp(argv[5], "partr", 5) == 0) 166 | partr_init(); 167 | 168 | // run optimized version 169 | uint64_t t0 = rdtsc(); 170 | for (i = 0; i < iters; ++i) { 171 | l3d(nx, padded_nx, ny, nz, u1, u2); 172 | foo = u1; u1 = u2; u2 = foo; 173 | } 174 | uint64_t gold = rdtsc() - t0; 175 | double elapsed = gold / (ghz * 1e9); 176 | 177 | double grid_size = nx * ny * nz; 178 | double gflops = grid_size * iters * 6.0 / 1e9; 179 | double gflops_sec = gflops / elapsed; 180 | 181 | double traffic = grid_size * iters * 4 * 2.0 / 1e9; 182 | double bw_realized = traffic / elapsed; 183 | 184 | printf("laplace3d completed in %.4lf seconds\n", elapsed); 185 | printf("GFLOPs/sec: %.1f\n", gflops_sec); 186 | printf("BW realized: %.1f\n", bw_realized); 187 | 188 | if (verify) { 189 | // run serial version for verification 190 | uint64_t st0 = rdtsc(); 191 | for (i = 0; i < iters; ++i) { 192 | l3d_orig(nx, ny, nz, u1_orig, u2_orig); 193 | foo = u1_orig; u1_orig = u2_orig; u2_orig = foo; 194 | } 195 | uint64_t ser = rdtsc() - st0; 196 | elapsed = ser / (ghz * 1e9); 197 | gflops_sec = gflops / elapsed; 198 | bw_realized = traffic / elapsed; 199 | 200 | printf("laplace3d_orig completed in %.2lf seconds\n", elapsed); 201 | printf("GFLOPs/sec: %.1f\n", gflops_sec); 202 | printf("BW realized: %.1f\n", bw_realized); 203 | 204 | // verify 205 | for (k = 0; k < 
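/* element-wise check of the padded parallel grid against the unpadded serial grid */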
nz; ++k) { 206 | for (j = 0; j < ny; ++j) { 207 | for (i = 0; i < nx; ++i) { 208 | ind = i + j*nx + k*nx*ny; 209 | p_ind = i + j*padded_nx + k*padded_nx*ny; 210 | 211 | if (fabs(u1[p_ind] - u1_orig[ind]) > error_tol) { 212 | printf("ERROR %f - %f [%d, %d, %d]\n", 213 | u1[p_ind], u1_orig[ind], i, j, k); 214 | goto done; 215 | } 216 | } 217 | } 218 | } 219 | printf("verified, no error\n"); 220 | } 221 | 222 | if (strncasecmp(argv[5], "partr", 5) == 0) 223 | partr_shutdown(); 224 | 225 | done: 226 | _mm_free(u1_p); 227 | _mm_free(u2_p); 228 | _mm_free(u1_orig); 229 | _mm_free(u2_orig); 230 | 231 | return 0; 232 | } 233 | 234 | void l3d_naive(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 235 | { 236 | int i, j, k, ind; 237 | const float sixth = 1.0f/6.0f; 238 | 239 | /* compute on the grid */ 240 | #pragma omp parallel for private(i,j,k,ind) 241 | for (k = 1; k < nz-1; ++k) { 242 | for (j = 1; j < ny-1; ++j) { 243 | #pragma ivdep 244 | for (i = 1; i < nx-1; ++i) { 245 | ind = i + j*padded_nx + k*padded_nx*ny; 246 | u2[ind] = 247 | ( u1[ind-1 ] + u1[ind+1 ] 248 | + u1[ind-padded_nx ] + u1[ind+padded_nx ] 249 | + u1[ind-padded_nx*ny] + u1[ind+padded_nx*ny] ) * sixth; 250 | } 251 | } 252 | } 253 | } 254 | 255 | void l3d_auto(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 256 | { 257 | int i, j, k, ind; 258 | 259 | float sixth = 1.0f/6.0f; 260 | 261 | #if defined(__INTEL_COMPILER) 262 | __assume(padded_nx%8==0); 263 | __assume_aligned(&u1[1],32); 264 | __assume_aligned(&u2[1],32); 265 | #elif defined(__GNUC__) 266 | if (!(padded_nx%8==0)) 267 | __builtin_unreachable(); 268 | // third argument is the misalignment 269 | u1 = __builtin_assume_aligned(u1, 32, sizeof(float)); 270 | u2 = __builtin_assume_aligned(u2, 32, sizeof(float)); 271 | #endif 272 | 273 | /* compute on the grid */ 274 | #pragma omp parallel for private(i,j,k,ind) 275 | for (k = 1; k < nz-1; ++k) { 276 | for (j = 1; j < ny-1; ++j) { 277 | #pragma vector nontemporal(u2) 278 | for (i = 1; i < nx-1; ++i) { 279 | ind = i + j*padded_nx + k*padded_nx*ny; 280 | u2[ind] = 281 | ( u1[ind-1 ] + u1[ind+1 ] 282 | + u1[ind-padded_nx ] + u1[ind+padded_nx ] 283 | + u1[ind-padded_nx*ny] + u1[ind+padded_nx*ny] ) * sixth; 284 | } 285 | } 286 | } 287 | } 288 | 289 | void l3d_sse(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 290 | { 291 | int i, j, k, ind; 292 | 293 | float fsixth = 1.0f/6.0f; 294 | __m128 sixth = _mm_set_ps1(fsixth); 295 | 296 | /* compute on the grid */ 297 | #pragma omp parallel for private(i,j,k,ind) 298 | for (k = 1; k < nz-1; ++k) { 299 | for (j = 1; j < ny-1; ++j) { 300 | for (i = 1; i < nx-1; i += 4) { 301 | ind = i + j*padded_nx + k*padded_nx*ny; 302 | 303 | __m128 pSrc1 = _mm_loadu_ps(&u1[ind-1]); 304 | __m128 pSrc2 = _mm_loadu_ps(&u1[ind+1]); 305 | __m128 pSrc3 = _mm_load_ps(&u1[ind-padded_nx]); 306 | __m128 pSrc4 = _mm_load_ps(&u1[ind+padded_nx]); 307 | __m128 pSrc5 = _mm_load_ps(&u1[ind-padded_nx*ny]); 308 | __m128 pSrc6 = _mm_load_ps(&u1[ind+padded_nx*ny]); 309 | 310 | __m128 sum1 = _mm_add_ps(pSrc1, pSrc2); 311 | __m128 sum2 = _mm_add_ps(pSrc3, pSrc4); 312 | __m128 sum3 = _mm_add_ps(pSrc5, pSrc6); 313 | __m128 sum4 = _mm_add_ps(sum1, sum2); 314 | __m128 vsum = _mm_add_ps(sum3, sum4); 315 | 316 | vsum = _mm_mul_ps(vsum, sixth); 317 | 318 | _mm_stream_ps(&u2[ind], vsum); 319 | } 320 | } 321 | } 322 | } 323 | 324 | void l3d_avx(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 325 | { 326 | int i, j, k, ind; 327 | 328 | float fsixth = 1.0f/6.0f; 329 | __m256 sixth = 
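/* broadcast 1/6 into all eight single-precision lanes */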
_mm256_set1_ps(fsixth); 330 | 331 | /* compute on the grid */ 332 | #pragma omp parallel for private(i,j,k,ind) 333 | for (k = 1; k < nz-1; ++k) { 334 | for (j = 1; j < ny-1; ++j) { 335 | for (i = 1; i < nx-1; i += 8) { 336 | ind = i + j*padded_nx + k*padded_nx*ny; 337 | 338 | __m256 pSrc1 = _mm256_loadu_ps(&u1[ind-1]); 339 | __m256 pSrc2 = _mm256_loadu_ps(&u1[ind+1]); 340 | __m256 pSrc3 = _mm256_load_ps(&u1[ind-padded_nx]); 341 | __m256 pSrc4 = _mm256_load_ps(&u1[ind+padded_nx]); 342 | __m256 pSrc5 = _mm256_load_ps(&u1[ind-padded_nx*ny]); 343 | __m256 pSrc6 = _mm256_load_ps(&u1[ind+padded_nx*ny]); 344 | 345 | __m256 sum1 = _mm256_add_ps(pSrc1, pSrc2); 346 | __m256 sum2 = _mm256_add_ps(pSrc3, pSrc4); 347 | __m256 sum3 = _mm256_add_ps(pSrc5, pSrc6); 348 | __m256 sum4 = _mm256_add_ps(sum1, sum2); 349 | __m256 vsum = _mm256_add_ps(sum3, sum4); 350 | 351 | vsum = _mm256_mul_ps(vsum, sixth); 352 | 353 | _mm256_stream_ps(&u2[ind], vsum); 354 | } 355 | } 356 | } 357 | } 358 | 359 | typedef struct task_arg_tag { 360 | int nx, padded_nx, ny, nz; 361 | float *u1, *u2; 362 | } task_arg_t; 363 | 364 | void *l3d_partr_iter(void *arg, int64_t start, int64_t end) 365 | { 366 | int i, j, k, ind; 367 | const float sixth = 1.0f/6.0f; 368 | task_arg_t *ta = (task_arg_t *)arg; 369 | int nx = ta->nx; 370 | int ny = ta->ny; 371 | int nz = ta->nz; 372 | float *u1 = ta->u1; 373 | float *u2 = ta->u2; 374 | 375 | for (k = start; k < end; ++k) { 376 | for (j = 0; j < ny; ++j) { 377 | for (i = 0; i < nx; ++i) { 378 | ind = i + j*nx + k*nx*ny; 379 | 380 | if (i == 0 || i == nx - 1 381 | || j == 0 || j == ny - 1 382 | || k == 0 || k == nz - 1) { 383 | u2[ind] = u1[ind]; // Dirichlet b.c.'s 384 | } 385 | else { 386 | u2[ind] = ( u1[ind-1 ] + u1[ind+1 ] 387 | + u1[ind-nx ] + u1[ind+nx ] 388 | + u1[ind-nx*ny] + u1[ind+nx*ny] ) * sixth; 389 | } 390 | } 391 | } 392 | } 393 | return NULL; 394 | } 395 | 396 | void *l3d_partr_run(void *arg, int64_t start, int64_t end) 397 | { 398 | partr_t t; 399 | partr_parfor(&t, l3d_partr_iter, arg, end - start, NULL); 400 | partr_sync(NULL, t, 1); 401 | return NULL; 402 | } 403 | 404 | void l3d_partr(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 405 | { 406 | task_arg_t task_arg; 407 | task_arg.nx = nx; 408 | task_arg.padded_nx = padded_nx; 409 | task_arg.ny = ny; 410 | task_arg.nz = nz; 411 | task_arg.u1 = u1; 412 | task_arg.u2 = u2; 413 | partr_start(NULL, l3d_partr_run, (void *)&task_arg, 0, nz); 414 | } 415 | 416 | void l3d_orig(int nx, int ny, int nz, float *u1, float *u2) 417 | { 418 | int i, j, k, ind; 419 | const float sixth = 1.0f/6.0f; 420 | 421 | for (k = 0; k < nz; ++k) { 422 | for (j = 0; j < ny; ++j) { 423 | for (i = 0; i < nx; ++i) { 424 | ind = i + j*nx + k*nx*ny; 425 | 426 | if (i == 0 || i == nx - 1 427 | || j == 0 || j == ny - 1 428 | || k == 0 || k == nz - 1) { 429 | u2[ind] = u1[ind]; // Dirichlet b.c.'s 430 | } 431 | else { 432 | u2[ind] = ( u1[ind-1 ] + u1[ind+1 ] 433 | + u1[ind-nx ] + u1[ind+nx ] 434 | + u1[ind-nx*ny] + u1[ind+nx*ny] ) * sixth; 435 | } 436 | } 437 | } 438 | } 439 | } 440 | -------------------------------------------------------------------------------- /test/makefile: -------------------------------------------------------------------------------- 1 | # parallel tasks runtime 2 | # 3 | # makefile for tests 4 | # 5 | # 2016.06.01 kiran.pamnany Initial code 6 | # 7 | 8 | CC=gcc 9 | 10 | .SUFFIXES: .c .h .o 11 | .PHONY: clean 12 | 13 | CFLAGS+=-Wall 14 | CFLAGS+=-std=c11 15 | CFLAGS+=-D_GNU_SOURCE 16 | CFLAGS+=-I../../hwloc/include 17 | 
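# hwloc and libconcurrent are expected as sibling checkouts of partr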
CFLAGS+=-I../../libconcurrent/include 18 | CFLAGS+=-I. 19 | CFLAGS+=-I../include 20 | CFLAGS+=-I../src 21 | 22 | LDOBJS+=-lpthread 23 | LDOBJS+=tap.o 24 | LDOBJS+=../libpartr.a 25 | LDOBJS+=../../hwloc/src/.libs/libhwloc.a 26 | LDOBJS+=../../libconcurrent/libconcurrent.a 27 | 28 | TAPSRC=tap.c 29 | TAPOBJ=tap.o 30 | 31 | SRCS=taskpoolstest.c multiqtest.c parfortest.c sleeptest.c fib.c l3d.c l3d_partr.c 32 | OBJS=${SRCS:.c=.o} 33 | BINS=${SRCS:.c=} 34 | OBJS=$(subst .c,.o, $(SRCS)) 35 | BINS=$(subst .c,, $(SRCS)) 36 | 37 | ifeq ($(DEBUG),yes) 38 | CFLAGS+=-O0 -g 39 | else 40 | CFLAGS+=-O3 41 | endif 42 | 43 | all: $(BINS) 44 | 45 | $(BINS): $(SRCS) $(TAPOBJ) ../libpartr.a 46 | #$(CC) -fopenmp $(CFLAGS) l3d.c -o l3d $(LDOBJS) 47 | $(CC) $(CFLAGS) l3d_partr.c -o l3d_partr $(LDOBJS) 48 | $(CC) $(CFLAGS) taskpoolstest.c -o taskpoolstest $(LDOBJS) 49 | $(CC) $(CFLAGS) multiqtest.c -o multiqtest $(LDOBJS) 50 | $(CC) $(CFLAGS) parfortest.c -o parfortest $(LDOBJS) 51 | $(CC) $(CFLAGS) sleeptest.c -o sleeptest $(LDOBJS) 52 | $(CC) $(CFLAGS) fib.c -o fib $(LDOBJS) 53 | 54 | $(TAPOBJ): $(TAPSRC) 55 | $(CC) $(CFLAGS) -c $(TAPSRC) 56 | 57 | clean: 58 | $(RM) $(BINS) $(OBJS) $(TAPOBJ) 59 | 60 | -------------------------------------------------------------------------------- /test/multiqtest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "tap.h" 6 | #include "perfutil.h" 7 | #include "congrng.h" 8 | #include "partr.h" 9 | #include "taskpools.h" 10 | #include "multiq.h" 11 | 12 | #include 13 | 14 | log_t plog; 15 | int16_t nthreads; 16 | __thread int16_t tid; 17 | __thread uint64_t rngseed; 18 | 19 | #define NTASKS_PER_POOL 512 20 | static ptask_t *tasks[NTASKS_PER_POOL]; 21 | 22 | static int *success; 23 | 24 | /* thread barrier */ 25 | static int volatile barcnt; 26 | static int volatile barsense = 1; 27 | 28 | #define BARRIER_INIT() barcnt=nthreads 29 | 30 | #define BARRIER() do { \ 31 | mysense = !mysense; \ 32 | if (!__atomic_sub_fetch(&barcnt, 1, __ATOMIC_SEQ_CST)) { \ 33 | barcnt = nthreads; \ 34 | barsense = mysense; \ 35 | } else while (barsense != mysense); \ 36 | } while(0) 37 | 38 | 39 | typedef struct lthread_arg_tag { 40 | int16_t tid; 41 | } lthread_arg_t; 42 | 43 | /* used for reducing across threads */ 44 | static int N = 0; 45 | 46 | 47 | static void *threadfun(void *targ) 48 | { 49 | int mysense = 1; /* for the barrier */ 50 | 51 | ptask_t *task; 52 | int16_t curr_prio, ooo; 53 | 54 | lthread_arg_t *arg = (lthread_arg_t *)targ; 55 | tid = arg->tid; 56 | free(targ); 57 | seed_cong(&rngseed); 58 | 59 | if (tid == 0) { 60 | taskpools_init(); 61 | multiq_init(); 62 | 63 | success = (int *)calloc(nthreads, sizeof(int)); 64 | 65 | /* single-thread alloc and insert */ 66 | const int16_t tprios[] = { 1, 0, 3, 2 }; 67 | int16_t t = 0; 68 | for (int16_t j = 0; j < 4; ++j) { 69 | success[tid] = 1; 70 | for (int16_t i = 0; i < NTASKS_PER_POOL/4; ++i) { 71 | tasks[t] = task_alloc(); 72 | if (multiq_insert(tasks[t], tprios[j]) != 0) 73 | success[tid] = 0; 74 | ++t; 75 | } 76 | ok(success[tid], "insert with priority %d", tprios[j]); 77 | } 78 | 79 | /* single-thread deletemin */ 80 | success[tid] = 1; 81 | curr_prio = ooo = 0; 82 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) { 83 | task = multiq_deletemin(); 84 | if (task == NULL) { 85 | success[tid] = 0; 86 | break; 87 | } 88 | if (task->prio > curr_prio) 89 | curr_prio = task->prio; 90 | else if (task->prio < curr_prio) { 91 | diag("(tid %d) curr_prio: %d, 
task->prio: %d\n", tid, curr_prio, task->prio); 92 | ++ooo; 93 | curr_prio = task->prio; 94 | } 95 | } 96 | ok(success[tid], "deletemin (%d out-of-order)", ooo); 97 | } 98 | 99 | BARRIER(); 100 | 101 | int each = NTASKS_PER_POOL/nthreads, start = tid * each, end = start + each; 102 | if (tid == nthreads-1) end = NTASKS_PER_POOL; 103 | 104 | /* parallel insert tests */ 105 | success[tid] = 1; 106 | for (int16_t i = start; i < end; ++i) { 107 | if (multiq_insert(tasks[i], tid) != 0) 108 | success[tid] = 0; 109 | } 110 | 111 | BARRIER(); 112 | 113 | if (tid == 0) { 114 | for (int16_t i = 1; i < nthreads; ++i) 115 | if (!success[i]) 116 | success[0] = 0; 117 | ok(success[0], "parallel insertion, %d tasks\n", NTASKS_PER_POOL); 118 | } 119 | 120 | BARRIER(); 121 | 122 | /* parallel deletemin tests */ 123 | curr_prio = ooo = 0; 124 | int ndeq = 0; 125 | for (int16_t i = 0; i < NTASKS_PER_POOL/nthreads; ++i) { 126 | task = multiq_deletemin(); 127 | if (task == NULL) { 128 | diag("(tid %d) !task\n", tid); 129 | continue; 130 | } 131 | ++ndeq; 132 | if (task->prio > curr_prio) 133 | curr_prio = task->prio; 134 | else if (task->prio < curr_prio) { 135 | diag("(tid %d) curr_prio: %d, task->prio: %d\n", tid, curr_prio, task->prio); 136 | ++ooo; 137 | curr_prio = task->prio; 138 | } 139 | } 140 | __atomic_add_fetch(&N, ndeq, __ATOMIC_SEQ_CST); 141 | 142 | BARRIER(); 143 | 144 | if (tid == 0) { 145 | ok(N == NTASKS_PER_POOL, "parallel deletemin %d tasks (%d out-of-order)", N, ooo); 146 | 147 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 148 | task_free(tasks[i]); 149 | 150 | free(success); 151 | multiq_destroy(); 152 | taskpools_destroy(); 153 | } 154 | 155 | return NULL; 156 | } 157 | 158 | int main(int argc, char **argv) 159 | { 160 | LOG_SETUP(plog, LOG_LEVEL_INFO, stdout); 161 | LOG_INFO(plog, "taskpools test\n"); 162 | 163 | nthreads = DEFAULT_NUM_THREADS; 164 | char *cp = getenv(NUM_THREADS_NAME); 165 | if (cp) nthreads = strtol(cp, NULL, 10); 166 | LOG_INFO(plog, " %d threads\n", nthreads); 167 | 168 | BARRIER_INIT(); 169 | 170 | tid = 0; 171 | 172 | pthread_t pthread_id; 173 | pthread_attr_t pthread_attr; 174 | 175 | pthread_attr_init(&pthread_attr); 176 | pthread_attr_setdetachstate(&pthread_attr, PTHREAD_CREATE_DETACHED); 177 | 178 | for (int16_t i = 1; i < nthreads; ++i) { 179 | lthread_arg_t *targ = (lthread_arg_t *)calloc(1, sizeof(lthread_arg_t)); 180 | targ->tid = i; 181 | pthread_create(&pthread_id, &pthread_attr, threadfun, targ); 182 | } 183 | pthread_attr_destroy(&pthread_attr); 184 | 185 | /* thread 0 enters the thread function too */ 186 | lthread_arg_t *targ = (lthread_arg_t *)calloc(1, sizeof(lthread_arg_t)); 187 | targ->tid = 0; 188 | threadfun(targ); 189 | 190 | done_testing(); 191 | } 192 | 193 | -------------------------------------------------------------------------------- /test/parfortest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tap.h" 3 | #include "partr.h" 4 | 5 | int arr[1024]; 6 | 7 | void *fill_arr(void *arg_, int64_t start, int64_t end) 8 | { 9 | int64_t sum = 0; 10 | for (int64_t i = start; i < end; ++i) { 11 | arr[i] = i; 12 | sum += i; 13 | } 14 | 15 | return (void *)sum; 16 | } 17 | 18 | void *add(void *arg1, void *arg2) 19 | { 20 | int64_t a1 = (int64_t)arg1; 21 | int64_t a2 = (int64_t)arg2; 22 | return (void *)(a1 + a2); 23 | } 24 | 25 | void *run(void *arg, int64_t start, int64_t end) 26 | { 27 | int64_t sum; 28 | partr_t t; 29 | partr_parfor(&t, fill_arr, NULL, 1024, add); 30 | 
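/* block until all grains complete; the add-reduced total arrives through the first argument */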
partr_sync((void *)&sum, t, 1); 31 | 32 | return (void *)sum; 33 | } 34 | 35 | int main(int argc, char **argv) 36 | { 37 | for (int i = 0; i < 1024; ++i) 38 | arr[i] = -1; 39 | 40 | int64_t par_sum; 41 | 42 | partr_init(); 43 | partr_start((void *)&par_sum, run, NULL, 0, 0); 44 | partr_shutdown(); 45 | 46 | printf("sum: %lld\n", par_sum); 47 | 48 | int success = 1, sum = 0; 49 | for (int i = 0; i < 1024; ++i) { 50 | if (arr[i] != i) { 51 | success = 0; 52 | break; 53 | } 54 | sum = sum + arr[i]; 55 | } 56 | 57 | ok(success, "all elements filled"); 58 | ok(sum == par_sum, "%lld == %lld", sum, par_sum); 59 | 60 | return 0; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /test/sleeptest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tap.h" 3 | #include "partr.h" 4 | 5 | int arr[1024]; 6 | 7 | void *fill_arr(void *arg_, int64_t start, int64_t end) 8 | { 9 | int64_t sum = 0; 10 | for (int64_t i = start; i < end; ++i) { 11 | arr[i] = i; 12 | sum += i; 13 | } 14 | 15 | return (void *)sum; 16 | } 17 | 18 | void *add(void *arg1, void *arg2) 19 | { 20 | int64_t a1 = (int64_t)arg1; 21 | int64_t a2 = (int64_t)arg2; 22 | return (void *)(a1 + a2); 23 | } 24 | 25 | void *run(void *arg, int64_t start, int64_t end) 26 | { 27 | int64_t sum; 28 | partr_t t; 29 | partr_parfor(&t, fill_arr, NULL, 1024, add); 30 | partr_sync((void *)&sum, t, 1); 31 | 32 | return (void *)sum; 33 | } 34 | 35 | void start_test() 36 | { 37 | for (int i = 0; i < 1024; ++i) 38 | arr[i] = -1; 39 | 40 | int64_t par_sum; 41 | 42 | partr_start((void *)&par_sum, run, NULL, 0, 0); 43 | printf("sum: %lld\n", par_sum); 44 | 45 | int success = 1, sum = 0; 46 | for (int i = 0; i < 1024; ++i) { 47 | if (arr[i] != i) { 48 | success = 0; 49 | break; 50 | } 51 | sum = sum + arr[i]; 52 | } 53 | 54 | ok(success, "all elements filled"); 55 | ok(sum == par_sum, "%lld == %lld", sum, par_sum); 56 | 57 | } 58 | 59 | int main(int argc, char **argv) 60 | { 61 | partr_init(); 62 | 63 | start_test(); 64 | diag("pausing for 5 seconds to let threads sleep\n"); 65 | sleep(5); 66 | diag("re-running test\n"); 67 | start_test(); 68 | 69 | partr_shutdown(); 70 | return 0; 71 | } 72 | 73 | -------------------------------------------------------------------------------- /test/tap.c: -------------------------------------------------------------------------------- 1 | /* 2 | libtap - Write tests in C 3 | Copyright 2012 Jake Gelbman 4 | This file is licensed under the LGPL 5 | */ 6 | 7 | #define _DEFAULT_SOURCE 1 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "tap.h" 14 | 15 | static int expected_tests = NO_PLAN; 16 | static int failed_tests; 17 | static int current_test; 18 | static char *todo_mesg; 19 | 20 | static char * 21 | vstrdupf (const char *fmt, va_list args) { 22 | char *str; 23 | int size; 24 | va_list args2; 25 | va_copy(args2, args); 26 | if (!fmt) 27 | fmt = ""; 28 | size = vsnprintf(NULL, 0, fmt, args2) + 2; 29 | str = malloc(size); 30 | if (!str) { 31 | perror("malloc error"); 32 | exit(1); 33 | } 34 | vsprintf(str, fmt, args); 35 | va_end(args2); 36 | return str; 37 | } 38 | 39 | void 40 | tap_plan (int tests, const char *fmt, ...) 
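/* emit the TAP plan ("1..N") immediately; SKIP_ALL prints "1..0 # SKIP ..." and exits, while NO_PLAN defers the plan line to exit_status() */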
{ 41 | expected_tests = tests; 42 | if (tests == SKIP_ALL) { 43 | char *why; 44 | va_list args; 45 | va_start(args, fmt); 46 | why = vstrdupf(fmt, args); 47 | va_end(args); 48 | printf("1..0 "); 49 | diag("SKIP %s\n", why); 50 | exit(0); 51 | } 52 | if (tests != NO_PLAN) { 53 | printf("1..%d\n", tests); 54 | } 55 | } 56 | 57 | int 58 | vok_at_loc (const char *file, int line, int test, const char *fmt, 59 | va_list args) 60 | { 61 | char *name = vstrdupf(fmt, args); 62 | if (!test) { 63 | printf("not "); 64 | } 65 | printf("ok %d", ++current_test); 66 | if (*name) 67 | printf(" - %s", name); 68 | if (todo_mesg) { 69 | printf(" # TODO"); 70 | if (*todo_mesg) 71 | printf(" %s", todo_mesg); 72 | } 73 | printf("\n"); 74 | if (!test) { 75 | printf("# Failed "); 76 | if (todo_mesg) 77 | printf("(TODO) "); 78 | printf("test "); 79 | if (*name) 80 | printf("'%s'\n# ", name); 81 | printf("at %s line %d.\n", file, line); 82 | if (!todo_mesg) 83 | failed_tests++; 84 | } 85 | free(name); 86 | return test; 87 | } 88 | 89 | int 90 | ok_at_loc (const char *file, int line, int test, const char *fmt, ...) { 91 | va_list args; 92 | va_start(args, fmt); 93 | vok_at_loc(file, line, test, fmt, args); 94 | va_end(args); 95 | return test; 96 | } 97 | 98 | static int 99 | mystrcmp (const char *a, const char *b) { 100 | return a == b ? 0 : !a ? -1 : !b ? 1 : strcmp(a, b); 101 | } 102 | 103 | #define eq(a, b) (!mystrcmp(a, b)) 104 | #define ne(a, b) (mystrcmp(a, b)) 105 | 106 | int 107 | is_at_loc (const char *file, int line, const char *got, const char *expected, 108 | const char *fmt, ...) 109 | { 110 | int test = eq(got, expected); 111 | va_list args; 112 | va_start(args, fmt); 113 | vok_at_loc(file, line, test, fmt, args); 114 | va_end(args); 115 | if (!test) { 116 | diag(" got: '%s'", got); 117 | diag(" expected: '%s'", expected); 118 | } 119 | return test; 120 | } 121 | 122 | int 123 | isnt_at_loc (const char *file, int line, const char *got, const char *expected, 124 | const char *fmt, ...) 125 | { 126 | int test = ne(got, expected); 127 | va_list args; 128 | va_start(args, fmt); 129 | vok_at_loc(file, line, test, fmt, args); 130 | va_end(args); 131 | if (!test) { 132 | diag(" got: '%s'", got); 133 | diag(" expected: anything else"); 134 | } 135 | return test; 136 | } 137 | 138 | int 139 | cmp_ok_at_loc (const char *file, int line, int a, const char *op, int b, 140 | const char *fmt, ...) 141 | { 142 | int test = eq(op, "||") ? a || b 143 | : eq(op, "&&") ? a && b 144 | : eq(op, "|") ? a | b 145 | : eq(op, "^") ? a ^ b 146 | : eq(op, "&") ? a & b 147 | : eq(op, "==") ? a == b 148 | : eq(op, "!=") ? a != b 149 | : eq(op, "<") ? a < b 150 | : eq(op, ">") ? a > b 151 | : eq(op, "<=") ? a <= b 152 | : eq(op, ">=") ? a >= b 153 | : eq(op, "<<") ? a << b 154 | : eq(op, ">>") ? a >> b 155 | : eq(op, "+") ? a + b 156 | : eq(op, "-") ? a - b 157 | : eq(op, "*") ? a * b 158 | : eq(op, "/") ? a / b 159 | : eq(op, "%") ? 
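/* modulo is the last recognized operator; anything else falls through to diag(), which returns 0 and fails the test */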
a % b 160 | : diag("unrecognized operator '%s'", op); 161 | va_list args; 162 | va_start(args, fmt); 163 | vok_at_loc(file, line, test, fmt, args); 164 | va_end(args); 165 | if (!test) { 166 | diag(" %d", a); 167 | diag(" %s", op); 168 | diag(" %d", b); 169 | } 170 | return test; 171 | } 172 | 173 | static int 174 | find_mem_diff (const char *a, const char *b, size_t n, size_t *offset) { 175 | size_t i; 176 | if (a == b) 177 | return 0; 178 | if (!a || !b) 179 | return 2; 180 | for (i = 0; i < n; i++) { 181 | if (a[i] != b[i]) { 182 | *offset = i; 183 | return 1; 184 | } 185 | } 186 | return 0; 187 | } 188 | 189 | int 190 | cmp_mem_at_loc (const char *file, int line, const void *got, 191 | const void *expected, size_t n, const char *fmt, ...) 192 | { 193 | size_t offset; 194 | int diff = find_mem_diff(got, expected, n, &offset); 195 | va_list args; 196 | va_start(args, fmt); 197 | vok_at_loc(file, line, !diff, fmt, args); 198 | va_end(args); 199 | if (diff == 1) { 200 | diag(" Difference starts at offset %d", offset); 201 | diag(" got: 0x%02x", ((unsigned char *)got)[offset]); 202 | diag(" expected: 0x%02x", ((unsigned char *)expected)[offset]); 203 | } 204 | else if (diff == 2) { 205 | diag(" got: %s", got ? "not NULL" : "NULL"); 206 | diag(" expected: %s", expected ? "not NULL" : "NULL"); 207 | } 208 | return !diff; 209 | } 210 | 211 | int 212 | diag (const char *fmt, ...) { 213 | va_list args; 214 | char *mesg, *line; 215 | int i; 216 | va_start(args, fmt); 217 | if (!fmt) 218 | return 0; 219 | mesg = vstrdupf(fmt, args); 220 | line = mesg; 221 | for (i = 0; *line; i++) { 222 | char c = mesg[i]; 223 | if (!c || c == '\n') { 224 | mesg[i] = '\0'; 225 | printf("# %s\n", line); 226 | if (!c) 227 | break; 228 | mesg[i] = c; 229 | line = mesg + i + 1; 230 | } 231 | } 232 | free(mesg); 233 | va_end(args); 234 | return 0; 235 | } 236 | 237 | int 238 | exit_status () { 239 | int retval = 0; 240 | if (expected_tests == NO_PLAN) { 241 | printf("1..%d\n", current_test); 242 | } 243 | else if (current_test != expected_tests) { 244 | diag("Looks like you planned %d test%s but ran %d.", 245 | expected_tests, expected_tests > 1 ? "s" : "", current_test); 246 | retval = 2; 247 | } 248 | if (failed_tests) { 249 | diag("Looks like you failed %d test%s of %d run.", 250 | failed_tests, failed_tests > 1 ? "s" : "", current_test); 251 | retval = 1; 252 | } 253 | return retval; 254 | } 255 | 256 | int 257 | bail_out (int ignore, const char *fmt, ...) { 258 | va_list args; 259 | va_start(args, fmt); 260 | printf("Bail out! "); 261 | vprintf(fmt, args); 262 | printf("\n"); 263 | va_end(args); 264 | exit(255); 265 | return 0; 266 | } 267 | 268 | void 269 | tap_skip (int n, const char *fmt, ...) { 270 | char *why; 271 | va_list args; 272 | va_start(args, fmt); 273 | why = vstrdupf(fmt, args); 274 | va_end(args); 275 | while (n --> 0) { 276 | printf("ok %d ", ++current_test); 277 | diag("skip %s\n", why); 278 | } 279 | free(why); 280 | } 281 | 282 | void 283 | tap_todo (int ignore, const char *fmt, ...) { 284 | va_list args; 285 | va_start(args, fmt); 286 | todo_mesg = vstrdupf(fmt, args); 287 | va_end(args); 288 | } 289 | 290 | void 291 | tap_end_todo () { 292 | free(todo_mesg); 293 | todo_mesg = NULL; 294 | } 295 | 296 | #ifndef _WIN32 297 | #include 298 | #include 299 | #include 300 | 301 | #if defined __APPLE__ || defined BSD 302 | #define MAP_ANONYMOUS MAP_ANON 303 | #endif 304 | 305 | /* Create a shared memory int to keep track of whether a piece of code executed 306 | dies. 
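The flag is written by the forked child and read back by the parent, which is why it must live in shared memory rather than in a plain global.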
to be used in the dies_ok and lives_ok macros. */ 307 | int 308 | tap_test_died (int status) { 309 | static int *test_died = NULL; 310 | int prev; 311 | if (!test_died) { 312 | test_died = mmap(0, sizeof (int), PROT_READ | PROT_WRITE, 313 | MAP_SHARED | MAP_ANONYMOUS, -1, 0); 314 | *test_died = 0; 315 | } 316 | prev = *test_died; 317 | *test_died = status; 318 | return prev; 319 | } 320 | 321 | int 322 | like_at_loc (int for_match, const char *file, int line, const char *got, 323 | const char *expected, const char *fmt, ...) 324 | { 325 | int test; 326 | regex_t re; 327 | va_list args; 328 | int err = regcomp(&re, expected, REG_EXTENDED); 329 | if (err) { 330 | char errbuf[256]; 331 | regerror(err, &re, errbuf, sizeof errbuf); 332 | fprintf(stderr, "Unable to compile regex '%s': %s at %s line %d\n", 333 | expected, errbuf, file, line); 334 | exit(255); 335 | } 336 | err = regexec(&re, got, 0, NULL, 0); 337 | regfree(&re); 338 | test = for_match ? !err : err; 339 | va_start(args, fmt); 340 | vok_at_loc(file, line, test, fmt, args); 341 | va_end(args); 342 | if (!test) { 343 | if (for_match) { 344 | diag(" '%s'", got); 345 | diag(" doesn't match: '%s'", expected); 346 | } 347 | else { 348 | diag(" '%s'", got); 349 | diag(" matches: '%s'", expected); 350 | } 351 | } 352 | return test; 353 | } 354 | #endif 355 | -------------------------------------------------------------------------------- /test/tap.h: -------------------------------------------------------------------------------- 1 | /* 2 | libtap - Write tests in C 3 | Copyright 2012 Jake Gelbman 4 | This file is licensed under the LGPL 5 | */ 6 | 7 | #ifndef __TAP_H__ 8 | #define __TAP_H__ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #ifndef va_copy 15 | #ifdef __va_copy 16 | #define va_copy __va_copy 17 | #else 18 | #define va_copy(d, s) ((d) = (s)) 19 | #endif 20 | #endif 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | int vok_at_loc (const char *file, int line, int test, const char *fmt, 27 | va_list args); 28 | int ok_at_loc (const char *file, int line, int test, const char *fmt, 29 | ...); 30 | int is_at_loc (const char *file, int line, const char *got, 31 | const char *expected, const char *fmt, ...); 32 | int isnt_at_loc (const char *file, int line, const char *got, 33 | const char *expected, const char *fmt, ...); 34 | int cmp_ok_at_loc (const char *file, int line, int a, const char *op, 35 | int b, const char *fmt, ...); 36 | int cmp_mem_at_loc (const char *file, int line, const void *got, 37 | const void *expected, size_t n, const char *fmt, ...); 38 | int bail_out (int ignore, const char *fmt, ...); 39 | void tap_plan (int tests, const char *fmt, ...); 40 | int diag (const char *fmt, ...); 41 | int exit_status (void); 42 | void tap_skip (int n, const char *fmt, ...); 43 | void tap_todo (int ignore, const char *fmt, ...); 44 | void tap_end_todo (void); 45 | 46 | #define NO_PLAN -1 47 | #define SKIP_ALL -2 48 | #define ok(...) ok_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 49 | #define is(...) is_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 50 | #define isnt(...) isnt_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 51 | #define cmp_ok(...) cmp_ok_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 52 | #define cmp_mem(...) cmp_mem_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL); 53 | #define plan(...) tap_plan(__VA_ARGS__, NULL) 54 | #define done_testing() return exit_status() 55 | #define BAIL_OUT(...) bail_out(0, "" __VA_ARGS__, NULL) 56 | #define pass(...) ok(1, "" __VA_ARGS__) 57 | #define fail(...) 
ok(0, "" __VA_ARGS__) 58 | 59 | #define skip(test, ...) do {if (test) {tap_skip(__VA_ARGS__, NULL); break;} 60 | #define end_skip } while (0) 61 | 62 | #define todo(...) tap_todo(0, "" __VA_ARGS__, NULL) 63 | #define end_todo tap_end_todo() 64 | 65 | #define dies_ok(...) dies_ok_common(1, __VA_ARGS__) 66 | #define lives_ok(...) dies_ok_common(0, __VA_ARGS__) 67 | 68 | #ifdef _WIN32 69 | #define like(...) tap_skip(1, "like is not implemented on Windows") 70 | #define unlike tap_skip(1, "unlike is not implemented on Windows") 71 | #define dies_ok_common(...) \ 72 | tap_skip(1, "Death detection is not supported on Windows") 73 | #else 74 | #define like(...) like_at_loc(1, __FILE__, __LINE__, __VA_ARGS__, NULL) 75 | #define unlike(...) like_at_loc(0, __FILE__, __LINE__, __VA_ARGS__, NULL) 76 | int like_at_loc (int for_match, const char *file, int line, 77 | const char *got, const char *expected, 78 | const char *fmt, ...); 79 | #include 80 | #include 81 | #include 82 | int tap_test_died (int status); 83 | #define dies_ok_common(for_death, code, ...) \ 84 | do { \ 85 | int cpid; \ 86 | int it_died; \ 87 | tap_test_died(1); \ 88 | cpid = fork(); \ 89 | switch (cpid) { \ 90 | case -1: \ 91 | perror("fork error"); \ 92 | exit(1); \ 93 | case 0: \ 94 | close(1); \ 95 | close(2); \ 96 | code \ 97 | tap_test_died(0); \ 98 | exit(0); \ 99 | } \ 100 | if (waitpid(cpid, NULL, 0) < 0) { \ 101 | perror("waitpid error"); \ 102 | exit(1); \ 103 | } \ 104 | it_died = tap_test_died(0); \ 105 | if (!it_died) \ 106 | {code} \ 107 | ok(for_death ? it_died : !it_died, "" __VA_ARGS__); \ 108 | } while (0) 109 | #endif 110 | 111 | #ifdef __cplusplus 112 | } 113 | #endif 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- /test/taskpoolstest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "tap.h" 5 | #include "partr.h" 6 | #include "taskpools.h" 7 | 8 | log_t plog; 9 | int16_t nthreads; 10 | __thread int16_t tid; 11 | 12 | #define NTASKS_PER_POOL 1024 13 | static ptask_t *tasks[NTASKS_PER_POOL]; 14 | 15 | /* thread barrier */ 16 | static int volatile barcnt; 17 | static int volatile barsense = 1; 18 | 19 | #define BARRIER_INIT() barcnt=nthreads 20 | 21 | #define BARRIER() do { \ 22 | mysense = !mysense; \ 23 | if (!__atomic_sub_fetch(&barcnt, 1, __ATOMIC_SEQ_CST)) { \ 24 | barcnt = nthreads; \ 25 | barsense = mysense; \ 26 | } else while (barsense != mysense); \ 27 | } while(0) 28 | 29 | typedef struct pthread_arg_tag { 30 | int16_t tid; 31 | } pthread_arg_t; 32 | 33 | 34 | static void *threadfun(void *targ) 35 | { 36 | int success; 37 | int mysense = 1; /* for the barrier */ 38 | 39 | pthread_arg_t *arg = (pthread_arg_t *)targ; 40 | tid = arg->tid; 41 | free(targ); 42 | 43 | if (tid == 0) { 44 | taskpools_init(); 45 | 46 | /* single-thread tests */ 47 | success = 1; 48 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 49 | if ((tasks[i] = task_alloc()) == NULL) 50 | success = 0; 51 | ok(success, "single-thread allocation"); 52 | ok(task_alloc() == NULL, "expected number of tasks"); 53 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 54 | task_free(tasks[i]); 55 | success = 1; 56 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 57 | if ((tasks[i] = task_alloc()) == NULL) 58 | success = 0; 59 | ok(success, "free and alloc again"); 60 | } 61 | 62 | BARRIER(); 63 | 64 | int each = NTASKS_PER_POOL/nthreads, start = tid * each, end = start + each; 65 | if (tid == nthreads-1) end = 
NTASKS_PER_POOL; 66 | 67 | /* parallel free tests */ 68 | for (int i = start; i < end; ++i) { 69 | task_free(tasks[i]); 70 | __atomic_store_n(&tasks[i], NULL, __ATOMIC_SEQ_CST); 71 | } 72 | 73 | BARRIER(); 74 | 75 | if (tid == 0) { 76 | /* verify that all tasks were freed */ 77 | int success = 1; 78 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) { 79 | if (tasks[i] != NULL) 80 | success = 0; 81 | tasks[i] = task_alloc(); 82 | } 83 | ok(success, "all tasks freed"); 84 | 85 | /* verify that tasks were freed out of order */ 86 | success = 0; 87 | int16_t last_index = tasks[0]->index; 88 | for (int16_t i = 1; i < NTASKS_PER_POOL; ++i) { 89 | if (tasks[i]->index != last_index++) 90 | success = 1; 91 | task_free(tasks[i]); 92 | } 93 | ok(success, "frees happened in parallel"); 94 | } 95 | 96 | BARRIER(); 97 | 98 | /* parallel alloc tests */ 99 | for (int i = start; i < end; ++i) 100 | tasks[i] = task_alloc(); 101 | 102 | BARRIER(); 103 | 104 | if (tid == 0) { 105 | /* verify that tasks were allocated concurrently */ 106 | success = 0; 107 | int16_t last_index = tasks[0]->index; 108 | for (int16_t i = 1; i < NTASKS_PER_POOL; ++i) { 109 | if (tasks[i]->index != last_index++) { 110 | success = 1; 111 | break; 112 | } 113 | } 114 | ok(success, "concurrent allocs"); 115 | } 116 | 117 | BARRIER(); 118 | 119 | /* TODO: parallel alloc and free tests */ 120 | 121 | if (tid == 0) { 122 | todo("parallel allocs/frees"); 123 | ok(0); 124 | end_todo; 125 | } 126 | 127 | BARRIER(); 128 | 129 | if (tid == 0) 130 | taskpools_destroy(); 131 | 132 | return NULL; 133 | } 134 | 135 | int main(int argc, char **argv) 136 | { 137 | LOG_SETUP(plog, LOG_LEVEL_INFO, stdout); 138 | LOG_INFO(plog, "taskpools test\n"); 139 | 140 | nthreads = DEFAULT_NUM_THREADS; 141 | char *cp = getenv(NUM_THREADS_NAME); 142 | if (cp) nthreads = strtol(cp, NULL, 10); 143 | LOG_INFO(plog, " %d threads\n", nthreads); 144 | 145 | BARRIER_INIT(); 146 | 147 | /* create threads */ 148 | pthread_t pthread_id; 149 | pthread_attr_t pthread_attr; 150 | pthread_attr_init(&pthread_attr); 151 | pthread_attr_setdetachstate(&pthread_attr, PTHREAD_CREATE_DETACHED); 152 | for (int16_t i = 1; i < nthreads; ++i) { 153 | pthread_arg_t *targ = (pthread_arg_t *)calloc(1, sizeof(pthread_arg_t)); 154 | targ->tid = i; 155 | pthread_create(&pthread_id, &pthread_attr, threadfun, targ); 156 | } 157 | pthread_attr_destroy(&pthread_attr); 158 | 159 | /* thread 0 enters the thread function too */ 160 | pthread_arg_t *targ = (pthread_arg_t *)calloc(1, sizeof(pthread_arg_t)); 161 | targ->tid = 0; 162 | threadfun(targ); 163 | 164 | done_testing(); 165 | } 166 | 167 | --------------------------------------------------------------------------------
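Taken together, the tests above exercise the runtime's whole external interface: partr_init()/partr_shutdown() bracket the run, partr_start() enters the runtime, partr_spawn()/partr_sync() fork and join individual tasks, and partr_parfor() forks a set of grains with an optional reduction. The program below is a minimal sketch of that usage pattern, condensed from test/parfortest.c; sum_range, add, and run are illustrative names, and it assumes libpartr, hwloc, and libconcurrent are built and linked as in test/makefile.

#include <stdio.h>
#include "partr.h"

/* grain body: sum the integers in [start, end) */
void *sum_range(void *arg, int64_t start, int64_t end)
{
    int64_t sum = 0;
    for (int64_t i = start; i < end; ++i)
        sum += i;
    return (void *)sum;          /* the result rides in the void* return */
}

/* reducer: combine two partial sums */
void *add(void *a, void *b)
{
    return (void *)((int64_t)a + (int64_t)b);
}

/* everything between partr_start() and return runs on the runtime's threads */
void *run(void *arg, int64_t start, int64_t end)
{
    int64_t total;
    partr_t t;
    partr_parfor(&t, sum_range, NULL, 1000, add);   /* fork 1000 iterations */
    partr_sync((void *)&total, t, 1);               /* join and reduce */
    printf("sum = %lld\n", (long long)total);
    return NULL;
}

int main(void)
{
    partr_init();
    partr_start(NULL, run, NULL, 0, 0);
    partr_shutdown();
    return 0;
}

As in the tests, 64-bit values are smuggled through void* arguments and results, which relies on pointers being at least 64 bits wide.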