├── .gitignore
├── LICENSE
├── README.md
├── include
│   └── partr.h
├── makefile
├── src
│   ├── congrng.c
│   ├── congrng.h
│   ├── log.h
│   ├── multiq.c
│   ├── multiq.h
│   ├── partr.c
│   ├── perfutil.h
│   ├── profile.h
│   ├── synctreepool.c
│   ├── synctreepool.h
│   ├── task.h
│   ├── taskpools.c
│   └── taskpools.h
└── test
    ├── fib.c
    ├── l3d.c
    ├── makefile
    ├── multiqtest.c
    ├── parfortest.c
    ├── sleeptest.c
    ├── tap.c
    ├── tap.h
    └── taskpoolstest.c

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

# VIM files
.*.swp

# Object files
*.o
*.ko
*.obj
*.elf

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
l3d
fib
parfortest
multiqtest
taskpoolstest
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

MIT License

Copyright (c) 2016

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

## Parallel Tasks Runtime

A parallel task execution runtime that uses parallel depth-first (PDF) scheduling [1].

[1] Shimin Chen, Phillip B. Gibbons, Michael Kozuch, Vasileios Liaskovitis, Anastassia Ailamaki, Guy E. Blelloch, Babak Falsafi, Limor Fix, Nikos Hardavellas, Todd C. Mowry, and Chris Wilkerson. 2007. Scheduling threads for constructive cache sharing on CMPs. In Proceedings of the nineteenth annual ACM symposium on Parallel algorithms and architectures (SPAA '07). ACM, New York, NY, USA, 105-115.
DOI=http://dx.doi.org/10.1145/1248377.1248396
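A minimal usage sketch, mirroring `test/fib.c` (the `run` task body here is illustrative only; error checking omitted). The runtime wraps `f(arg, start, end)` in a root task via `partr_start()`; from inside tasks, use `partr_spawn()`/`partr_sync()` and `partr_parfor()`:

```c
#include <stdio.h>
#include "partr.h"

/* task entry point: invoked as f(arg, start, end) */
void *run(void *arg, int64_t start, int64_t end)
{
    printf("hello from the root task on thread %d\n", tid);
    return NULL;
}

int main(int argc, char **argv)
{
    void *ret;
    partr_init();                       /* start and bind worker threads */
    partr_start(&ret, run, NULL, 0, 0); /* run the root task to completion */
    partr_shutdown();                   /* terminate workers, free pools */
    return 0;
}
```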
--------------------------------------------------------------------------------
/include/partr.h:
--------------------------------------------------------------------------------

/* partr -- parallel tasks runtime
 */

#ifndef PARTR_H
#define PARTR_H

#include <stdint.h>
#include <stdio.h>

#include "log.h"


/* tasks */
#define TASK_STACK_SIZE (1024*4)

/* pools */
#define TASKS_PER_POOL 1024
/* number allocated = TASKS_PER_POOL * nthreads */

/* multiq */
#define MULTIQ_HEAP_C 4
/* number of heaps = MULTIQ_HEAP_C * nthreads */
#define MULTIQ_TASKS_PER_HEAP 129
/* how many in each heap */

/* parfor */
#define GRAIN_K 4
/* tasks = niters / (GRAIN_K * nthreads) */

/* synchronization */
#define ARRIVERS_P 2
/* narrivers = ((GRAIN_K * nthreads) ^ ARRIVERS_P) + 1
   limit for number of recursive parfors */
#define REDUCERS_FRAC 1
/* nreducers = narrivers * REDUCERS_FRAC */

/* logging (debug, info, warn, err, critical, none) */
#define LOG_LEVEL_NAME "PARTR_LOG_LEVEL"
/* environment variable name */
#define DEFAULT_LOG_LEVEL "debug"

/* controls for when threads sleep */
#define THREAD_SLEEP_THRESHOLD_NAME "PARTR_THREAD_SLEEP_THRESHOLD"
/* environment variable name */
#define DEFAULT_THREAD_SLEEP_THRESHOLD 4e9
/* in cycles (1e9 == 1sec@1GHz) */

/* defaults for # threads */
#define NUM_THREADS_NAME "PARTR_NUM_THREADS"
/* environment variable name */
#define DEFAULT_NUM_THREADS 4

/* affinitization behavior */
#define MACHINE_EXCLUSIVE_NAME "PARTR_EXCLUSIVE"
/* environment variable name */
#define DEFAULT_MACHINE_EXCLUSIVE 0
/* don't assume we own the machine */

/* performance profiling */
#define PERF_PROFILE 1
/* comment to disable profiling */


/* externally visible globals */
extern log_t plog;       /* message logger */
extern int16_t nthreads; /* number of threads */


/* externally visible thread-local globals */
extern __thread int16_t tid;      /* 0-based thread ID */
extern __thread uint64_t rngseed; /* per-thread RNG seed */


/* external interface */
typedef void *partr_t;

void partr_init();
void partr_shutdown();
int partr_start(void **ret, void *(*f)(void *, int64_t, int64_t),
        void *arg, int64_t start, int64_t end);
int partr_spawn(partr_t *t, void *(*f)(void *, int64_t, int64_t),
        void *arg, int64_t start, int64_t end, int8_t sticky,
        int8_t detach);
int partr_sync(void **r, partr_t t, int done_with_task);
int partr_parfor(partr_t *t, void *(*f)(void *, int64_t, int64_t),
        void *arg, int64_t count, void *(*rf)(void *, void *));


#endif /* PARTR_H */

--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------

# parallel tasks runtime
#
# makefile
#
# 2016.06.01 kiran.pamnany Initial code
#

CC=gcc

.SUFFIXES: .c .h .o .a
.PHONY: clean test

CFLAGS+=-DPERF_PROFILE
CFLAGS+=-Wall
CFLAGS+=-std=c11
CFLAGS+=-D_GNU_SOURCE

CFLAGS+=-I../hwloc/include
CFLAGS+=-I../libconcurrent/include
CFLAGS+=-I./include
CFLAGS+=-I./src

SRCS=src/partr.c \
src/synctreepool.c src/taskpools.c src/multiq.c src/congrng.c 24 | INCS=include/partr.h src/task.h src/synctreepool.h src/taskpools.h src/multiq.h src/congrng.h src/log.h src/perfutil.h src/profile.h 25 | OBJS=${SRCS:.c=.o} 26 | 27 | ifeq ($(DEBUG),yes) 28 | CFLAGS+=-O0 -g 29 | else 30 | CFLAGS+=-O3 31 | endif 32 | 33 | TARGET=libpartr.a 34 | 35 | all: $(TARGET) 36 | 37 | test: $(TARGET) 38 | $(MAKE) -C test 39 | 40 | $(TARGET): $(OBJS) 41 | $(RM) $(TARGET) 42 | $(AR) qvs $(TARGET) $(OBJS) 43 | 44 | %.o: %.c $(INCS) makefile 45 | $(CC) $(CFLAGS) -c $< -o $@ 46 | 47 | clean: 48 | $(MAKE) -C test clean 49 | $(RM) $(TARGET) $(OBJS) 50 | 51 | -------------------------------------------------------------------------------- /src/congrng.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Simple congruential random number generator (from VAX). No modulo bias. 4 | */ 5 | 6 | #include "congrng.h" 7 | #include "perfutil.h" 8 | 9 | 10 | /* seed_cong() -- each thread needs its own seed! 11 | */ 12 | void seed_cong(uint64_t *seed) 13 | { 14 | *seed = rdtscp(); 15 | } 16 | 17 | 18 | /* unbias_cong() -- sets up state to avoid modulo bias for the given max. 19 | */ 20 | void unbias_cong(uint64_t max, uint64_t *unbias) 21 | { 22 | *unbias = UINT64_MAX - ((UINT64_MAX % max)+1); 23 | } 24 | 25 | 26 | /* cong() -- linear congruential generator (was system RNG on VAXen). 27 | * Loop to avoid modulo bias. 28 | */ 29 | uint64_t cong(uint64_t max, uint64_t unbias, uint64_t *seed) 30 | { 31 | while ((*seed = 69069 * (*seed) + 362437) > unbias) 32 | ; 33 | return *seed % max; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /src/congrng.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Simple random number generator (linear congruential). No modulo bias. 4 | */ 5 | 6 | #ifndef CONGRNG_H 7 | #define CONGRNG_H 8 | 9 | #include 10 | 11 | 12 | void seed_cong(uint64_t *seed); 13 | void unbias_cong(uint64_t max, uint64_t *unbias); 14 | uint64_t cong(uint64_t max, uint64_t unbias, uint64_t *seed); 15 | 16 | 17 | #endif /* CONGRNG_H */ 18 | 19 | -------------------------------------------------------------------------------- /src/log.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | log -- message logging 4 | */ 5 | 6 | #ifndef LOG_H 7 | #define LOG_H 8 | 9 | #include 10 | 11 | enum { 12 | LOG_LEVEL_DEBUG, 13 | LOG_LEVEL_INFO, 14 | LOG_LEVEL_WARN, 15 | LOG_LEVEL_ERR, 16 | LOG_LEVEL_CRITICAL 17 | }; 18 | 19 | typedef struct log_tag { 20 | int level; 21 | FILE *f; 22 | } log_t; 23 | 24 | #define LOG_SETUP(l, lvl, fp) do { \ 25 | (l).level = (lvl); \ 26 | (l).f = (fp); \ 27 | } while(0) 28 | 29 | #define LOG_DEBUG(l, ...) do { \ 30 | if ((l).level <= LOG_LEVEL_DEBUG) { \ 31 | fprintf((l).f, __VA_ARGS__); \ 32 | fflush((l).f); \ 33 | } \ 34 | } while(0) 35 | 36 | #define LOG_INFO(l, ...) do { \ 37 | if ((l).level <= LOG_LEVEL_INFO) \ 38 | fprintf((l).f, __VA_ARGS__); \ 39 | } while(0) 40 | 41 | #define LOG_WARN(l, ...) do { \ 42 | if ((l).level <= LOG_LEVEL_WARN) \ 43 | fprintf((l).f, __VA_ARGS__); \ 44 | } while(0) 45 | 46 | #define LOG_ERR(l, ...) do { \ 47 | if ((l).level <= LOG_LEVEL_ERR) \ 48 | fprintf((l).f, __VA_ARGS__); \ 49 | } while(0) 50 | 51 | #define LOG_CRITICAL(l, ...) 
do { \ 52 | if ((l).level <= LOG_LEVEL_CRITICAL) \ 53 | fprintf((l).f, __VA_ARGS__); \ 54 | } while(0) 55 | 56 | #endif /* LOG_H */ 57 | 58 | -------------------------------------------------------------------------------- /src/multiq.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | MultiQueues (http://arxiv.org/abs/1411.1209) 4 | */ 5 | 6 | 7 | #include 8 | #include 9 | #include "partr.h" 10 | #include "multiq.h" 11 | #include "congrng.h" 12 | #include "perfutil.h" 13 | 14 | 15 | /* individual spin-lock synchronized task heap */ 16 | typedef struct taskheap_tag { 17 | char lock; 18 | ptask_t **tasks; 19 | int16_t ntasks, prio; 20 | } taskheap_t; 21 | 22 | /* heap 'n'ary */ 23 | static const int16_t heap_d = 8; 24 | 25 | /* the multiqueue itself is 'p' task heaps */ 26 | static taskheap_t *heaps; 27 | static int16_t heap_p; 28 | 29 | /* for atomic snapshot */ 30 | static uint64_t snapshot_owner = -1; 31 | 32 | /* unbias state for the RNG */ 33 | static uint64_t cong_unbias; 34 | 35 | /* state for sleep checking */ 36 | static const int16_t not_sleeping = 0; 37 | static const int16_t checking_for_sleeping = 1; 38 | static const int16_t sleeping = 2; 39 | static int16_t sleep_check_state = not_sleeping; 40 | 41 | 42 | /* multiq_init() 43 | */ 44 | void multiq_init() 45 | { 46 | heap_p = MULTIQ_HEAP_C * nthreads; 47 | heaps = (taskheap_t *)calloc(heap_p, sizeof(taskheap_t)); 48 | for (int16_t i = 0; i < heap_p; ++i) { 49 | __atomic_clear(&heaps[i].lock, __ATOMIC_RELAXED); 50 | heaps[i].tasks = (ptask_t **) 51 | calloc(MULTIQ_TASKS_PER_HEAP, sizeof(ptask_t *)); 52 | heaps[i].ntasks = 0; 53 | heaps[i].prio = INT16_MAX; 54 | } 55 | unbias_cong(heap_p, &cong_unbias); 56 | LOG_INFO(plog, " %d %d-ary heaps of %d tasks each\n", 57 | heap_p, heap_d, MULTIQ_TASKS_PER_HEAP); 58 | } 59 | 60 | 61 | /* multiq_destroy() 62 | */ 63 | void multiq_destroy() 64 | { 65 | for (int16_t i = 0; i < heap_p; ++i) 66 | free(heaps[i].tasks); 67 | free(heaps); 68 | } 69 | 70 | 71 | /* sift_up() 72 | */ 73 | static void sift_up(taskheap_t *heap, int16_t idx) 74 | { 75 | if (idx > 0) { 76 | int16_t parent = (idx-1)/heap_d; 77 | if (heap->tasks[idx]->prio <= heap->tasks[parent]->prio) { 78 | ptask_t *t = heap->tasks[parent]; 79 | heap->tasks[parent] = heap->tasks[idx]; 80 | heap->tasks[idx] = t; 81 | sift_up(heap, parent); 82 | } 83 | } 84 | } 85 | 86 | 87 | /* sift_down() 88 | */ 89 | void sift_down(taskheap_t *heap, int16_t idx) 90 | { 91 | if (idx < heap->ntasks) { 92 | for (int16_t child = heap_d*idx + 1; 93 | child < MULTIQ_TASKS_PER_HEAP && child <= heap_d*idx + heap_d; 94 | ++child) { 95 | if (heap->tasks[child] 96 | && heap->tasks[child]->prio <= heap->tasks[idx]->prio) { 97 | ptask_t *t = heap->tasks[idx]; 98 | heap->tasks[idx] = heap->tasks[child]; 99 | heap->tasks[child] = t; 100 | sift_down(heap, child); 101 | } 102 | } 103 | } 104 | } 105 | 106 | 107 | /* multiq_insert() 108 | */ 109 | int multiq_insert(ptask_t *task, int16_t priority) 110 | { 111 | uint64_t rn; 112 | 113 | task->prio = priority; 114 | do { 115 | rn = cong(heap_p, cong_unbias, &rngseed); 116 | } while (__atomic_test_and_set(&heaps[rn].lock, __ATOMIC_ACQUIRE)); 117 | 118 | if (heaps[rn].ntasks >= MULTIQ_TASKS_PER_HEAP) { 119 | LOG_ERR(plog, " heap %llu is full\n", rn); 120 | __atomic_clear(&heaps[rn].lock, __ATOMIC_RELEASE); 121 | return -1; 122 | } 123 | 124 | heaps[rn].tasks[heaps[rn].ntasks++] = task; 125 | sift_up(&heaps[rn], heaps[rn].ntasks-1); 126 | 
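/* release the heap lock, then publish the heap's new minimum priority:
   the CAS below only lowers heaps[rn].prio if the inserted task beats
   the cached minimum, so deletemin()/minprio() can read priorities
   without taking the lock */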
__atomic_clear(&heaps[rn].lock, __ATOMIC_RELEASE); 127 | int16_t prio = __atomic_load_n(&heaps[rn].prio, __ATOMIC_SEQ_CST); 128 | if (task->prio < prio) 129 | __atomic_compare_exchange_n(&heaps[rn].prio, &prio, task->prio, 130 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); 131 | 132 | return 0; 133 | } 134 | 135 | 136 | /* multiq_deletemin() 137 | */ 138 | ptask_t *multiq_deletemin() 139 | { 140 | uint64_t rn1, rn2; 141 | int16_t i, prio1, prio2; 142 | ptask_t *task; 143 | 144 | for (i = 0; i < heap_p; ++i) { 145 | rn1 = cong(heap_p, cong_unbias, &rngseed); 146 | rn2 = cong(heap_p, cong_unbias, &rngseed); 147 | prio1 = __atomic_load_n(&heaps[rn1].prio, __ATOMIC_SEQ_CST); 148 | prio2 = __atomic_load_n(&heaps[rn2].prio, __ATOMIC_SEQ_CST); 149 | if (prio1 > prio2) { 150 | prio1 = prio2; 151 | rn1 = rn2; 152 | } 153 | else if (prio1 == prio2 && prio1 == INT16_MAX) 154 | continue; 155 | if (!__atomic_test_and_set(&heaps[rn1].lock, __ATOMIC_ACQUIRE)) { 156 | if (prio1 == heaps[rn1].prio) 157 | break; 158 | __atomic_clear(&heaps[rn1].lock, __ATOMIC_RELEASE); 159 | } 160 | } 161 | if (i == heap_p) 162 | return NULL; 163 | 164 | task = heaps[rn1].tasks[0]; 165 | heaps[rn1].tasks[0] = heaps[rn1].tasks[--heaps[rn1].ntasks]; 166 | heaps[rn1].tasks[heaps[rn1].ntasks] = NULL; 167 | prio1 = INT16_MAX; 168 | if (heaps[rn1].ntasks > 0) { 169 | sift_down(&heaps[rn1], 0); 170 | prio1 = heaps[rn1].tasks[0]->prio; 171 | } 172 | __atomic_store_n(&heaps[rn1].prio, prio1, __ATOMIC_SEQ_CST); 173 | __atomic_clear(&heaps[rn1].lock, __ATOMIC_RELEASE); 174 | 175 | return task; 176 | } 177 | 178 | 179 | /* multiq_minprio() 180 | */ 181 | int16_t multiq_minprio() 182 | { 183 | uint64_t rn1, rn2; 184 | int16_t prio1, prio2; 185 | 186 | rn1 = cong(heap_p, cong_unbias, &rngseed); 187 | rn2 = cong(heap_p, cong_unbias, &rngseed); 188 | prio1 = __atomic_load_n(&heaps[rn1].prio, __ATOMIC_SEQ_CST); 189 | prio2 = __atomic_load_n(&heaps[rn2].prio, __ATOMIC_SEQ_CST); 190 | if (prio2 < prio1) 191 | return prio2; 192 | return prio1; 193 | } 194 | 195 | 196 | /* just_sleep() 197 | */ 198 | static void just_sleep(pthread_mutex_t *lock, pthread_cond_t *wakeup) 199 | { 200 | pthread_mutex_lock(lock); 201 | if (__atomic_load_n(&sleep_check_state, __ATOMIC_SEQ_CST) == sleeping) 202 | pthread_cond_wait(wakeup, lock); 203 | else 204 | pthread_mutex_unlock(lock); 205 | } 206 | 207 | 208 | /* snapshot_and_sleep() 209 | */ 210 | static void snapshot_and_sleep(pthread_mutex_t *lock, pthread_cond_t *wakeup) 211 | { 212 | uint64_t snapshot_id = cong(UINT64_MAX, UINT64_MAX, &rngseed), previous = -1; 213 | if (!__atomic_compare_exchange_n(&snapshot_owner, &previous, snapshot_id, 0, 214 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { 215 | LOG_ERR(plog, " snapshot has previous owner!\n"); 216 | return; 217 | } 218 | 219 | int16_t i; 220 | for (i = 0; i < heap_p; ++i) { 221 | if (heaps[i].ntasks != 0) 222 | break; 223 | } 224 | if (i != heap_p) { 225 | LOG_INFO(plog, " heap has tasks, snapshot aborted\n"); 226 | return; 227 | } 228 | 229 | if (!__atomic_compare_exchange_n(&snapshot_owner, &snapshot_id, previous, 0, 230 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { 231 | LOG_INFO(plog, " snapshot owner changed, snapshot aborted\n"); 232 | return; 233 | } 234 | if (!__atomic_compare_exchange_n(&sleep_check_state, (int16_t *)&checking_for_sleeping, 235 | sleeping, 0, 236 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { 237 | LOG_ERR(plog, " sleep aborted at snapshot end\n"); 238 | return; 239 | } 240 | just_sleep(lock, wakeup); 241 | } 242 | 243 | 244 | /* 
multiq_sleep_if_empty() 245 | */ 246 | void multiq_sleep_if_empty(pthread_mutex_t *lock, pthread_cond_t *wakeup) 247 | { 248 | int16_t state; 249 | 250 | sleep_start: 251 | state = __atomic_load_n(&sleep_check_state, __ATOMIC_SEQ_CST); 252 | if (state == checking_for_sleeping) { 253 | for (; ;) { 254 | cpu_pause(); 255 | state = __atomic_load_n(&sleep_check_state, __ATOMIC_SEQ_CST); 256 | if (state == not_sleeping) 257 | break; 258 | else if (state == sleeping) { 259 | just_sleep(lock, wakeup); 260 | break; 261 | } 262 | } 263 | } 264 | else if (state == not_sleeping) { 265 | if (!__atomic_compare_exchange_n(&sleep_check_state, (int16_t *)¬_sleeping, 266 | checking_for_sleeping, 0, 267 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) 268 | goto sleep_start; 269 | snapshot_and_sleep(lock, wakeup); 270 | if (!__atomic_compare_exchange_n(&sleep_check_state, (int16_t *)&sleeping, 271 | not_sleeping, 0, 272 | __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) 273 | LOG_ERR(plog, " sleep check state update failed\n"); 274 | } 275 | else /* state == sleeping */ 276 | just_sleep(lock, wakeup); 277 | } 278 | 279 | -------------------------------------------------------------------------------- /src/multiq.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | MultiQueues (http://arxiv.org/abs/1411.1209) 4 | */ 5 | 6 | #ifndef MULTIQ_H 7 | #define MULTIQ_H 8 | 9 | #include 10 | #include "task.h" 11 | 12 | 13 | void multiq_init(); 14 | void multiq_destroy(); 15 | int multiq_insert(ptask_t *elem, int16_t priority); 16 | ptask_t *multiq_deletemin(); 17 | int16_t multiq_minprio(); 18 | void multiq_sleep_if_empty(pthread_mutex_t *lock, pthread_cond_t *wakeup); 19 | 20 | 21 | #endif /* MULTIQ_H */ 22 | 23 | -------------------------------------------------------------------------------- /src/partr.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | interface -- implementation of spawn/sync/parfor, thread function, etc. 
4 | */ 5 | 6 | #include "partr.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "congrng.h" 17 | #include "synctreepool.h" 18 | #include "taskpools.h" 19 | #include "multiq.h" 20 | 21 | #include "profile.h" 22 | 23 | 24 | /* used for logging by the entire runtime */ 25 | log_t plog; 26 | 27 | /* number of threads created */ 28 | int16_t nthreads; 29 | 30 | /* thread-local 0-based identifier */ 31 | __thread int16_t tid; 32 | 33 | /* the `start` task */ 34 | ptask_t *start_task; 35 | 36 | /* task currently being executed */ 37 | __thread ptask_t *curr_task; 38 | 39 | /* RNG seed */ 40 | __thread uint64_t rngseed; 41 | 42 | /* per-thread task queues, for sticky tasks */ 43 | __thread ptask_t **taskq; 44 | __thread int8_t *taskq_lock; 45 | 46 | /* sticky task queues need to be visible to all threads */ 47 | ptask_t ***all_taskqs; 48 | int8_t **all_taskq_locks; 49 | 50 | /* thread sleep threshold */ 51 | uint64_t sleep_threshold; 52 | 53 | /* per-thread sleep lock/wakeup signal */ 54 | __thread pthread_mutex_t *sleep_lock; 55 | __thread pthread_cond_t *wake_signal; 56 | 57 | /* thread sleep/wakeup signals need to be visible to all threads */ 58 | pthread_mutex_t **all_sleep_locks; 59 | pthread_cond_t **all_wake_signals; 60 | 61 | /* thread IDs */ 62 | pthread_t *all_thread_ids; 63 | 64 | /* forward declare thread function */ 65 | static void *partr_thread(void *arg_); 66 | 67 | /* internally used to indicate a yield occurred in the runtime itself */ 68 | static const int64_t yield_from_sync = 1; 69 | 70 | /* initialization thread barrier */ 71 | static int volatile barcnt; 72 | static int volatile barsense = 1; 73 | 74 | #define BARRIER_INIT() barcnt=nthreads 75 | #define BARRIER_THREAD_DECL int mysense = 1 76 | #define BARRIER() do { \ 77 | mysense = !mysense; \ 78 | if (!__atomic_sub_fetch(&barcnt, 1, __ATOMIC_SEQ_CST)) { \ 79 | barcnt = nthreads; \ 80 | barsense = mysense; \ 81 | } else while (barsense != mysense); \ 82 | } while(0) 83 | 84 | 85 | /* thread function argument */ 86 | typedef struct lthread_arg_tag { 87 | int16_t tid; 88 | int8_t exclusive; 89 | hwloc_topology_t topology; 90 | hwloc_cpuset_t cpuset; 91 | } lthread_arg_t; 92 | 93 | 94 | /* log_init() -- set up runtime logging 95 | */ 96 | static void log_init() 97 | { 98 | int level; 99 | char *cp; 100 | 101 | cp = getenv(LOG_LEVEL_NAME); 102 | if (!cp) 103 | cp = DEFAULT_LOG_LEVEL; 104 | if (strncasecmp(cp, "debug", 5) == 0) 105 | level = LOG_LEVEL_DEBUG; 106 | else if (strncasecmp(cp, "info", 4) == 0) 107 | level = LOG_LEVEL_INFO; 108 | else if (strncasecmp(cp, "err", 3) == 0) 109 | level = LOG_LEVEL_ERR; 110 | else if (strncasecmp(cp, "critical", 8) == 0) 111 | level = LOG_LEVEL_CRITICAL; 112 | else /* if (strncasecmp(cp, "warn", 4) == 0) */ 113 | level = LOG_LEVEL_WARN; 114 | 115 | LOG_SETUP(plog, level, stdout); 116 | LOG_INFO(plog, "partr threading\n"); 117 | } 118 | 119 | 120 | /* show_affinity() 121 | */ 122 | #ifdef __linux__ 123 | static void show_affinity() 124 | { 125 | int i; 126 | cpu_set_t cset; 127 | char buf[2048], num[16]; 128 | 129 | if (plog.level > LOG_LEVEL_DEBUG) return; 130 | 131 | pthread_t pthread_id = pthread_self(); 132 | 133 | CPU_ZERO(&cset); 134 | pthread_getaffinity_np(pthread_id, sizeof(cset), &cset); 135 | buf[0] = '\0'; 136 | for (i = 0; i < CPU_SETSIZE; ++i) { 137 | if (CPU_ISSET(i, &cset)) { 138 | snprintf(num, 15, "%d ", i); 139 | strcat(buf, num); 140 | } 141 | } 142 | LOG_DEBUG(plog, " <%d> bound to %d CPU(s): 
%s\n", 143 | tid, CPU_COUNT(&cset), buf); 144 | } 145 | #else 146 | static void show_affinity() 147 | { 148 | } 149 | #endif 150 | 151 | 152 | /* wake_thread() 153 | */ 154 | static void wake_thread(int16_t wtid) 155 | { 156 | if (wtid != tid) { 157 | pthread_mutex_lock(all_sleep_locks[wtid]); 158 | pthread_cond_signal(all_wake_signals[wtid]); 159 | pthread_mutex_unlock(all_sleep_locks[wtid]); 160 | } 161 | } 162 | 163 | 164 | /* wake_all_threads() 165 | */ 166 | static void wake_all_threads() 167 | { 168 | for (int16_t i = 0; i < nthreads; ++i) 169 | wake_thread(i); 170 | } 171 | 172 | 173 | /* partr_init() -- initialization entry point 174 | */ 175 | void partr_init() 176 | { 177 | log_init(); 178 | 179 | char *cp; 180 | 181 | /* get requested # threads */ 182 | nthreads = DEFAULT_NUM_THREADS; 183 | cp = getenv(NUM_THREADS_NAME); 184 | if (cp) 185 | nthreads = strtol(cp, NULL, 10); 186 | LOG_INFO(plog, " %d threads requested\n", nthreads); 187 | 188 | /* check if we have exclusive use of the machine */ 189 | int exclusive = DEFAULT_MACHINE_EXCLUSIVE; 190 | cp = getenv(MACHINE_EXCLUSIVE_NAME); 191 | if (cp) 192 | exclusive = strtol(cp, NULL, 10); 193 | 194 | /* check machine topology */ 195 | hwloc_topology_t topology; 196 | hwloc_topology_init(&topology); 197 | hwloc_topology_load(topology); 198 | int core_depth = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_CORE); 199 | unsigned ncores = hwloc_get_nbobjs_by_depth(topology, core_depth); 200 | LOG_INFO(plog, " %d cores detected\n", ncores); 201 | int pu_depth = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_PU); 202 | unsigned npus = hwloc_get_nbobjs_by_depth(topology, pu_depth); 203 | LOG_INFO(plog, " %d PUs detected\n", npus); 204 | 205 | /* some sanity checks */ 206 | if (nthreads > npus) { 207 | LOG_WARN(plog, " won't over-subscribe; adjusting number of threads" 208 | " to %d\n", npus); 209 | nthreads = npus; 210 | } 211 | if (nthreads < 1) { 212 | LOG_INFO(plog, " setting number of threads to number of cores (%d)\n", 213 | ncores); 214 | nthreads = ncores; 215 | } 216 | int depth; 217 | if (nthreads <= ncores) { 218 | LOG_INFO(plog, " 1 thread per core\n"); 219 | depth = core_depth; 220 | } 221 | else { 222 | LOG_INFO(plog, " >1 thread per core\n"); 223 | depth = pu_depth; 224 | } 225 | 226 | /* set affinity if we have exclusive use of the machine */ 227 | hwloc_obj_t obj; 228 | hwloc_cpuset_t cpuset; 229 | if (exclusive) { 230 | LOG_INFO(plog, " exclusive machine use\n"); 231 | 232 | /* rebind this thread to the first core/PU */ 233 | obj = hwloc_get_obj_by_depth(topology, depth, 0); 234 | assert(obj != NULL); 235 | cpuset = hwloc_bitmap_dup(obj->cpuset); 236 | /* hwloc_bitmap_singlify(cpuset); */ 237 | hwloc_set_cpubind(topology, cpuset, HWLOC_CPUBIND_THREAD); 238 | hwloc_bitmap_free(cpuset); 239 | } 240 | else 241 | LOG_INFO(plog, " non-exclusive machine use\n"); 242 | 243 | tid = 0; 244 | seed_cong(&rngseed); 245 | show_affinity(); 246 | 247 | /* initialize task pools */ 248 | taskpools_init(); 249 | 250 | /* initialize sync trees */ 251 | synctreepool_init(); 252 | 253 | /* initialize task multiqueue */ 254 | multiq_init(); 255 | 256 | /* initialize libconcurrent */ 257 | concurrent_init(); 258 | 259 | /* set up the sleep threshold */ 260 | sleep_threshold = DEFAULT_THREAD_SLEEP_THRESHOLD; 261 | cp = getenv(THREAD_SLEEP_THRESHOLD_NAME); 262 | if (cp) { 263 | if (!strncasecmp(cp, "infinite", 8)) 264 | sleep_threshold = 0; 265 | else 266 | sleep_threshold = (uint64_t)strtol(cp, NULL, 10); 267 | LOG_INFO(plog, " thread 
sleep threshold is %llu cycles\n", sleep_threshold); 268 | } 269 | 270 | /* allocate per-thread task queues, for sticky tasks */ 271 | posix_memalign((void **)&all_taskqs, 64, nthreads * sizeof(ptask_t **)); 272 | posix_memalign((void **)&all_taskq_locks, 64, nthreads * sizeof(int8_t *)); 273 | 274 | /* allocate per-thread sleep/wakeup locks and signals */ 275 | posix_memalign((void **)&all_sleep_locks, 64, nthreads * sizeof(pthread_mutex_t *)); 276 | posix_memalign((void **)&all_wake_signals, 64, nthreads * sizeof(pthread_cond_t *)); 277 | 278 | /* setup profiling */ 279 | PROFILE_SETUP(); 280 | 281 | /* start threads */ 282 | BARRIER_THREAD_DECL; 283 | BARRIER_INIT(); 284 | 285 | /* allocate space for all thread IDs */ 286 | all_thread_ids = (pthread_t *)calloc(nthreads, sizeof(pthread_t)); 287 | all_thread_ids[0] = pthread_self(); 288 | 289 | for (int16_t i = 1; i < nthreads; ++i) { 290 | lthread_arg_t *targ = (lthread_arg_t *)calloc(1, sizeof(lthread_arg_t)); 291 | targ->tid = i; 292 | targ->exclusive = exclusive; 293 | 294 | if (exclusive) { 295 | /* tell the thread which core to bind to */ 296 | obj = hwloc_get_obj_by_depth(topology, depth, i); 297 | cpuset = hwloc_bitmap_dup(obj->cpuset); 298 | targ->topology = topology; 299 | targ->cpuset = cpuset; 300 | } 301 | pthread_create(&all_thread_ids[i], NULL, partr_thread, targ); 302 | } 303 | 304 | /* allocate this thread's sticky task queue pointer and initialize the lock */ 305 | posix_memalign((void **)&taskq_lock, 64, sizeof(int8_t) + sizeof(ptask_t *)); 306 | taskq = (ptask_t **)(taskq_lock + sizeof(int8_t)); 307 | __atomic_clear(taskq_lock, __ATOMIC_RELAXED); 308 | *taskq = NULL; 309 | all_taskqs[tid] = taskq; 310 | all_taskq_locks[tid] = taskq_lock; 311 | 312 | /* allocate this thread's sleep lock and wakeup signal */ 313 | posix_memalign((void **)&sleep_lock, 64, sizeof(pthread_mutex_t)); 314 | posix_memalign((void **)&wake_signal, 64, sizeof(pthread_cond_t)); 315 | pthread_mutex_init(sleep_lock, NULL); 316 | pthread_cond_init(wake_signal, NULL); 317 | all_sleep_locks[tid] = sleep_lock; 318 | all_wake_signals[tid] = wake_signal; 319 | 320 | /* set up profiling in thread 0 also */ 321 | PROFILE_INIT_THREAD(); 322 | 323 | /* wait for all threads to start up and bind to their CPUs */ 324 | BARRIER(); 325 | hwloc_topology_destroy(topology); 326 | } 327 | 328 | 329 | /* partr_shutdown() -- shutdown all threads and clean up 330 | */ 331 | void partr_shutdown() 332 | { 333 | /* create and add 'nthreads' terminate tasks */ 334 | LOG_INFO(plog, " thread %d adding %d terminate tasks\n", tid, nthreads); 335 | 336 | for (int64_t i = 0; i < nthreads; ++i) { 337 | ptask_t *task = task_alloc(); 338 | if (task == NULL) { 339 | LOG_CRITICAL(plog, " thread %d terminate task allocation failed!\n", 340 | tid); 341 | break; 342 | } 343 | task->settings = TASK_TERMINATE; 344 | if (multiq_insert(task, tid) != 0) { 345 | task_free(task); 346 | LOG_CRITICAL(plog, " thread %d shutdown task insertion failed!\n", tid); 347 | break; 348 | } 349 | } 350 | wake_all_threads(); 351 | 352 | /* wait for all threads to shut down */ 353 | for (int64_t i = 1; i < nthreads; ++i) 354 | pthread_join(all_thread_ids[i], NULL); 355 | 356 | /* show profiling information */ 357 | PROFILE_PRINT(); 358 | 359 | /* free thread IDs array */ 360 | free(all_thread_ids); 361 | 362 | /* free sleep lock and wakeup signal */ 363 | free(wake_signal); 364 | free(sleep_lock); 365 | free(all_wake_signals); 366 | free(all_sleep_locks); 367 | 368 | /* free task queues and their locks */ 369 
| free(taskq_lock); 370 | free(all_taskq_locks); 371 | free(all_taskqs); 372 | 373 | /* shut down the tasking library */ 374 | concurrent_fin(); 375 | 376 | /* destroy the task queues */ 377 | multiq_destroy(); 378 | 379 | /* destroy the sync trees */ 380 | synctreepool_destroy(); 381 | 382 | /* destroy the task pools and free all tasks */ 383 | taskpools_destroy(); 384 | } 385 | 386 | 387 | /* partr_coro() -- coroutine entry point 388 | */ 389 | static void partr_coro(struct concurrent_ctx *ctx) 390 | { 391 | ptask_t *task = ctx_get_user_ptr(ctx); 392 | task->result = task->f(task->arg, task->start, task->end); 393 | 394 | /* grain tasks must synchronize */ 395 | if (task->grain_num >= 0) { 396 | int was_last = 0; 397 | 398 | /* reduce... */ 399 | if (task->red) { 400 | task->result = reduce(task->arr, task->red, task->rf, 401 | task->result, task->grain_num); 402 | /* if this task is last, set the result in the parent task */ 403 | if (task->result) { 404 | task->parent->red_result = task->result; 405 | was_last = 1; 406 | } 407 | } 408 | /* ... or just sync */ 409 | else { 410 | if (last_arriver(task->arr, task->grain_num)) 411 | was_last = 1; 412 | } 413 | 414 | /* the last task to finish needs to finish up the loop */ 415 | if (was_last) { 416 | LOG_DEBUG(plog, " thread %d grain task %d (%p) was last\n", 417 | tid, task->grain_num, task); 418 | 419 | /* a non-parent task must wake up the parent */ 420 | if (task->grain_num > 0) { 421 | LOG_DEBUG(plog, " thread %d waking loop parent task %p\n", 422 | tid, task->parent); 423 | multiq_insert(task->parent, 0); 424 | wake_all_threads(); 425 | } 426 | /* the parent task was last; it can just end */ 427 | } 428 | else { 429 | /* the parent task needs to wait */ 430 | if (task->grain_num == 0) { 431 | LOG_DEBUG(plog, " thread %d loop parent task %p yielding\n", 432 | tid, task); 433 | yield_value(task->ctx, (void *)yield_from_sync); 434 | } 435 | } 436 | 437 | if (task->grain_num == 0) 438 | LOG_DEBUG(plog, " thread %d completed loop task %p\n", tid, task); 439 | } 440 | } 441 | 442 | 443 | /* setup_task() -- allocate and initialize a task 444 | */ 445 | static ptask_t *setup_task(void *(*f)(void *, int64_t, int64_t), void *arg, 446 | int64_t start, int64_t end) 447 | { 448 | ptask_t *task = task_alloc(); 449 | if (task == NULL) 450 | return NULL; 451 | 452 | ctx_construct(task->ctx, task->stack, TASK_STACK_SIZE, partr_coro, task); 453 | task->f = f; 454 | task->arg = arg; 455 | task->start = start; 456 | task->end = end; 457 | task->settings = 0; 458 | task->sticky_tid = -1; 459 | task->grain_num = -1; 460 | 461 | return task; 462 | } 463 | 464 | 465 | /* release_task() -- destroy the coroutine context and free the task 466 | */ 467 | static void *release_task(ptask_t *task) 468 | { 469 | void *result = task->result; 470 | ctx_destruct(task->ctx); 471 | if (task->grain_num == 0 && task->red) 472 | reducer_free(task->red); 473 | if (task->grain_num == 0 && task->arr) 474 | arriver_free(task->arr); 475 | task->f = NULL; 476 | task->arg = task->result = task->red_result = NULL; 477 | task->start = task->end = 0; 478 | task->rf = NULL; 479 | task->parent = task->cq = NULL; 480 | task->arr = NULL; 481 | task->red = NULL; 482 | task_free(task); 483 | return result; 484 | } 485 | 486 | 487 | /* add_to_taskq() -- add the specified task to the sticky task queue 488 | */ 489 | static void add_to_taskq(ptask_t *task) 490 | { 491 | assert(task->sticky_tid != -1); 492 | 493 | ptask_t **q = all_taskqs[task->sticky_tid]; 494 | int8_t *lock = 
all_taskq_locks[task->sticky_tid]; 495 | 496 | while (__atomic_test_and_set(lock, __ATOMIC_ACQUIRE)) 497 | cpu_pause(); 498 | 499 | if (*q == NULL) 500 | *q = task; 501 | else { 502 | ptask_t *pt = *q; 503 | while (pt->next) 504 | pt = pt->next; 505 | pt->next = task; 506 | } 507 | 508 | __atomic_clear(lock, __ATOMIC_RELEASE); 509 | 510 | wake_thread(task->sticky_tid); 511 | } 512 | 513 | 514 | /* get_from_taskq() -- pop the first task off the sticky task queue 515 | */ 516 | static ptask_t *get_from_taskq() 517 | { 518 | /* racy check for quick path */ 519 | if (*taskq == NULL) 520 | return NULL; 521 | 522 | while (__atomic_test_and_set(taskq_lock, __ATOMIC_ACQUIRE)) 523 | cpu_pause(); 524 | 525 | ptask_t *task = *taskq; 526 | if (task) { 527 | *taskq = task->next; 528 | task->next = NULL; 529 | } 530 | 531 | __atomic_clear(taskq_lock, __ATOMIC_RELEASE); 532 | 533 | return task; 534 | } 535 | 536 | 537 | /* sleep_after_threshold() -- if sleep_threshold cycles have passed, sleep the thread 538 | */ 539 | static void sleep_after_threshold(uint64_t *start_cycles) 540 | { 541 | if (sleep_threshold) { 542 | if (!(*start_cycles)) { 543 | *start_cycles = rdtscp(); 544 | return; 545 | } 546 | uint64_t elapsed_cycles = rdtscp() - (*start_cycles); 547 | if (elapsed_cycles >= sleep_threshold) { 548 | multiq_sleep_if_empty(sleep_lock, wake_signal); 549 | *start_cycles = 0; 550 | } 551 | } 552 | } 553 | 554 | 555 | /* run_next() -- run the next available task 556 | */ 557 | static int run_next() 558 | { 559 | ptask_t *task; 560 | 561 | /* first check for sticky tasks */ 562 | task = get_from_taskq(); 563 | if (task == NULL) { 564 | /* no sticky tasks, go to the multiq */ 565 | task = multiq_deletemin(); 566 | if (task != NULL) 567 | assert(!(task->settings & TASK_IS_STICKY)); 568 | } 569 | if (task == NULL) 570 | return 0; 571 | 572 | /* terminate tasks tell the thread to die */ 573 | if (task->settings & TASK_TERMINATE) { 574 | release_task(task); 575 | LOG_INFO(plog, " thread %d got terminate task\n", tid); 576 | return -1; 577 | } 578 | 579 | LOG_DEBUG(plog, " thread %d resuming task %p\n", tid, task); 580 | 581 | /* run/resume the task */ 582 | curr_task = task; 583 | int64_t y = (int64_t)resume(task->ctx); 584 | curr_task = NULL; 585 | 586 | /* if the task isn't done, it is either in a CQ, or must be re-queued */ 587 | if (!ctx_is_done(task->ctx)) { 588 | /* the yield value tells us if the task is in a CQ */ 589 | if (y != yield_from_sync) { 590 | LOG_DEBUG(plog, " thread %d had task %p yield\n", tid, task); 591 | 592 | /* sticky tasks go to the thread's sticky queue */ 593 | if (task->settings & TASK_IS_STICKY) 594 | add_to_taskq(task); 595 | 596 | /* all others go back into the multiq */ 597 | else { 598 | multiq_insert(task, task->prio); 599 | wake_all_threads(); 600 | } 601 | } 602 | return 1; 603 | } 604 | 605 | LOG_DEBUG(plog, " thread %d completed task %p\n", tid, task); 606 | 607 | /* The task completed. As detached tasks cannot be synced, clean 608 | those up here. 
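Sync-able tasks are freed by partr_sync(); for those, we only wake
the tasks waiting in this task's completion queue (below).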
609 | */ 610 | if (task->settings & TASK_IS_DETACHED) { 611 | release_task(task); 612 | return 2; 613 | } 614 | 615 | /* add back all the tasks in this one's completion queue */ 616 | while (__atomic_test_and_set(&task->cq_lock, __ATOMIC_ACQUIRE)) 617 | cpu_pause(); 618 | ptask_t *cqtask, *cqnext; 619 | cqtask = task->cq; 620 | task->cq = NULL; 621 | while (cqtask) { 622 | cqnext = cqtask->next; 623 | cqtask->next = NULL; 624 | LOG_DEBUG(plog, " thread %d adding from task %p's CQ: %p\n", 625 | tid, task, cqtask); 626 | if (cqtask->settings & TASK_IS_STICKY) 627 | add_to_taskq(cqtask); 628 | else { 629 | multiq_insert(cqtask, cqtask->prio); 630 | wake_all_threads(); 631 | } 632 | cqtask = cqnext; 633 | } 634 | __atomic_clear(&task->cq_lock, __ATOMIC_RELEASE); 635 | 636 | return 2; 637 | } 638 | 639 | 640 | /* partr_start() -- the runtime entry point 641 | 642 | To be called from thread 0, before creating any tasks. Wraps into 643 | a task and invokes `f(arg)`; tasks should only be spawned/synced 644 | from within tasks. 645 | */ 646 | int partr_start(void **ret, void *(*f)(void *, int64_t, int64_t), 647 | void *arg, int64_t start, int64_t end) 648 | { 649 | assert(tid == 0); 650 | 651 | start_task = setup_task(f, arg, start, end); 652 | if (start_task == NULL) 653 | return -1; 654 | start_task->settings |= TASK_IS_STICKY; 655 | start_task->sticky_tid = tid; 656 | 657 | LOG_DEBUG(plog, " thread %d invoking start task %p\n", tid, start_task); 658 | curr_task = start_task; 659 | int64_t y = (int64_t)resume(start_task->ctx); 660 | curr_task = NULL; 661 | 662 | if (!ctx_is_done(start_task->ctx)) { 663 | LOG_DEBUG(plog, " thread %d had start task %p yield\n", tid, start_task); 664 | if (y != yield_from_sync) { 665 | LOG_DEBUG(plog, " thread %d re-inserting start task %p\n", 666 | tid, start_task); 667 | add_to_taskq(start_task); 668 | } 669 | 670 | while (run_next() != -1) 671 | if (ctx_is_done(start_task->ctx)) 672 | break; 673 | } 674 | 675 | void *r = release_task(start_task); 676 | if (ret) 677 | *ret = r; 678 | 679 | LOG_DEBUG(plog, " thread %d released start task %p\n", tid, start_task); 680 | return 0; 681 | } 682 | 683 | 684 | /* partr_thread() -- the thread function 685 | 686 | Loops, getting tasks from the multiqueue and executing them. 
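Checks the thread's sticky task queue first, then the multiqueue;
after sleep_threshold cycles without finding work, sleeps until woken.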
687 | */ 688 | static void *partr_thread(void *arg_) 689 | { 690 | BARRIER_THREAD_DECL; 691 | lthread_arg_t *arg = (lthread_arg_t *)arg_; 692 | 693 | tid = arg->tid; 694 | seed_cong(&rngseed); 695 | 696 | /* set affinity if requested */ 697 | if (arg->exclusive) { 698 | hwloc_set_cpubind(arg->topology, arg->cpuset, HWLOC_CPUBIND_THREAD); 699 | hwloc_bitmap_free(arg->cpuset); 700 | } 701 | show_affinity(); 702 | 703 | /* allocate this thread's sticky task queue pointer and initialize the lock */ 704 | posix_memalign((void **)&taskq_lock, 64, sizeof(int8_t) + sizeof(ptask_t *)); 705 | taskq = (ptask_t **)(taskq_lock + sizeof(int8_t)); 706 | __atomic_clear(taskq_lock, __ATOMIC_RELAXED); 707 | *taskq = NULL; 708 | all_taskqs[tid] = taskq; 709 | all_taskq_locks[tid] = taskq_lock; 710 | 711 | /* allocate this thread's sleep lock and wakeup signal */ 712 | posix_memalign((void **)&sleep_lock, 64, sizeof(pthread_mutex_t)); 713 | posix_memalign((void **)&wake_signal, 64, sizeof(pthread_cond_t)); 714 | pthread_mutex_init(sleep_lock, NULL); 715 | pthread_cond_init(wake_signal, NULL); 716 | all_sleep_locks[tid] = sleep_lock; 717 | all_wake_signals[tid] = wake_signal; 718 | 719 | /* set up per-thread profiling */ 720 | PROFILE_INIT_THREAD(); 721 | 722 | BARRIER(); 723 | 724 | /* free the thread function argument */ 725 | free(arg); 726 | 727 | /* run the scheduler */ 728 | uint64_t start_cycles = 0; 729 | int r = 1; 730 | while (r != -1) { 731 | r = run_next(); 732 | if (r == 0) 733 | sleep_after_threshold(&start_cycles); 734 | else if (r > 0) 735 | start_cycles = 0; 736 | } 737 | 738 | /* free the sleep lock and wakeup signal */ 739 | free(wake_signal); 740 | free(sleep_lock); 741 | 742 | /* free the sticky task queue pointer (and its lock) */ 743 | free(taskq_lock); 744 | 745 | LOG_INFO(plog, " thread %d exiting\n", tid); 746 | return NULL; 747 | } 748 | 749 | 750 | /* partr_spawn() -- create a task for `f(arg)` and enqueue it for execution 751 | 752 | Implicitly asserts that `f(arg)` can run concurrently with everything 753 | else that's currently running. If `detach` is set, the spawned task 754 | will not be returned (and cannot be synced). Yields. 755 | */ 756 | int partr_spawn(partr_t *t, void *(*f)(void *, int64_t, int64_t), 757 | void *arg, int64_t start, int64_t end, int8_t sticky, int8_t detach) 758 | { 759 | PROFILE_START(PERF_SPAWN); 760 | 761 | ptask_t *task = setup_task(f, arg, start, end); 762 | if (task == NULL) 763 | return -1; 764 | if (detach) 765 | task->settings |= TASK_IS_DETACHED; 766 | if (sticky) { 767 | task->settings |= TASK_IS_STICKY; 768 | task->sticky_tid = tid; 769 | add_to_taskq(task); 770 | } 771 | else { 772 | if (multiq_insert(task, tid) != 0) { 773 | release_task(task); 774 | return -2; 775 | } 776 | wake_all_threads(); 777 | } 778 | 779 | *t = detach ? NULL : (partr_t)task; 780 | 781 | LOG_DEBUG(plog, " thread %d task %p spawned task %p\n", tid, curr_task, task); 782 | 783 | PROFILE_STAMP(PERF_SPAWN); 784 | 785 | /* only yield if we're running a non-sticky task */ 786 | if (!(curr_task->settings & TASK_IS_STICKY)) 787 | yield(curr_task->ctx); 788 | 789 | return 0; 790 | } 791 | 792 | 793 | /* partr_sync() -- get the return value of task `t` 794 | 795 | Returns only when task `t` has completed. 
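If `done_with_task` is set, the task is released before this returns,
and `t` must not be used again.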
796 | */ 797 | int partr_sync(void **r, partr_t t, int done_with_task) 798 | { 799 | PROFILE_START(PERF_SYNC); 800 | 801 | ptask_t *task = (ptask_t *)t; 802 | 803 | /* if the target task has not finished, add the current task to its 804 | completion queue; the thread that runs the target task will add 805 | this task back to the ready queue 806 | */ 807 | if (!ctx_is_done(task->ctx)) { 808 | curr_task->next = NULL; 809 | while (__atomic_test_and_set(&task->cq_lock, __ATOMIC_ACQUIRE)) 810 | cpu_pause(); 811 | 812 | /* ensure the task didn't finish before we got the lock */ 813 | if (!ctx_is_done(task->ctx)) { 814 | LOG_DEBUG(plog, " thread %d task %p sync on task %p\n", 815 | tid, curr_task, task); 816 | 817 | /* add the current task to the CQ */ 818 | if (task->cq == NULL) 819 | task->cq = curr_task; 820 | else { 821 | ptask_t *pt = task->cq; 822 | while (pt->next) 823 | pt = pt->next; 824 | pt->next = curr_task; 825 | } 826 | 827 | /* unlock the CQ and yield the current task */ 828 | __atomic_clear(&task->cq_lock, __ATOMIC_RELEASE); 829 | PROFILE_STAMP(PERF_SYNC); 830 | yield_value(curr_task->ctx, (void *)yield_from_sync); 831 | PROFILE_START(PERF_SYNC); 832 | } 833 | 834 | /* the task finished before we could add to its CQ */ 835 | else { 836 | __atomic_clear(&task->cq_lock, __ATOMIC_RELEASE); 837 | LOG_DEBUG(plog, " thread %d task %p sync on task %p success\n", 838 | tid, curr_task, task); 839 | } 840 | } 841 | 842 | if (r) 843 | *r = task->grain_num >= 0 && task->red ? 844 | task->red_result : task->result; 845 | 846 | if (done_with_task) 847 | release_task(task); 848 | 849 | PROFILE_STAMP(PERF_SYNC); 850 | 851 | return 0; 852 | } 853 | 854 | 855 | /* partr_parfor() -- spawn multiple tasks for a parallel loop 856 | 857 | Spawn tasks that invoke `f(arg, start, end)` such that the sum of `end-start` 858 | for all tasks is `count`. Uses `rf()`, if provided, to reduce the return 859 | values from the tasks, and returns the result. Yields. 860 | */ 861 | int partr_parfor(partr_t *t, void *(*f)(void *, int64_t, int64_t), 862 | void *arg, int64_t count, void *(*rf)(void *, void *)) 863 | { 864 | PROFILE_START(PERF_PARFOR); 865 | 866 | int64_t n = GRAIN_K * nthreads; 867 | lldiv_t each = lldiv(count, n); 868 | 869 | /* allocate synchronization tree(s) */ 870 | arriver_t *arr = arriver_alloc(); 871 | if (arr == NULL) { 872 | LOG_CRITICAL(plog, " thread %d parfor arriver alloc failed!\n", tid); 873 | return -1; 874 | } 875 | reducer_t *red = NULL; 876 | if (rf != NULL) { 877 | red = reducer_alloc(); 878 | if (red == NULL) { 879 | arriver_free(arr); 880 | LOG_CRITICAL(plog, " thread %d parfor reducer alloc failed!\n", tid); 881 | return -2; 882 | } 883 | } 884 | 885 | /* allocate and enqueue (GRAIN_K * nthreads) tasks */ 886 | *t = NULL; 887 | int64_t start = 0, end; 888 | for (int64_t i = 0; i < n; ++i) { 889 | end = start + each.quot + (i < each.rem ? 1 : 0); 890 | ptask_t *task = setup_task(f, arg, start, end); 891 | if (task == NULL) { 892 | LOG_CRITICAL(plog, " thread %d parfor task setup failed!\n", tid); 893 | return -1; 894 | } 895 | 896 | /* The first task is the parent (root) task of the parfor, thus only 897 | this can be synced. So, we create the remaining tasks detached. 
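All grains, detached ones included, still synchronize through the
arriver tree (and combine results through the reducer tree) allocated
above.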
898 | */ 899 | if (*t == NULL) *t = task; 900 | else task->settings = TASK_IS_DETACHED; 901 | 902 | task->parent = *t; 903 | task->grain_num = i; 904 | task->rf = rf; 905 | task->arr = arr; 906 | task->red = red; 907 | 908 | if (multiq_insert(task, tid) != 0) { 909 | release_task(task); 910 | LOG_CRITICAL(plog, " thread %d parfor multiq insert failed!\n", tid); 911 | return -3; 912 | } 913 | 914 | start = end; 915 | } 916 | wake_all_threads(); 917 | 918 | LOG_DEBUG(plog, " thread %d task %p parfor spawned %lld tasks\n", 919 | tid, curr_task, n); 920 | 921 | PROFILE_STAMP(PERF_PARFOR); 922 | 923 | /* only yield if we're running a non-sticky task */ 924 | if (!(curr_task->settings & TASK_IS_STICKY)) 925 | yield(curr_task->ctx); 926 | 927 | return 0; 928 | } 929 | 930 | -------------------------------------------------------------------------------- /src/perfutil.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | */ 3 | 4 | #ifndef PERFUTIL_H 5 | #define PERFUTIL_H 6 | 7 | #include 8 | 9 | #define cpu_pause() __asm__("pause;"); 10 | 11 | static inline uint64_t rdtscp() 12 | { 13 | uint32_t lo, hi; 14 | __asm__ volatile ("rdtscp" 15 | : /* outputs */ "=a" (lo), "=d" (hi) 16 | : /* no inputs */ 17 | : /* clobbers */ "%rcx"); 18 | return ((uint64_t)hi << 32) + lo; 19 | } 20 | 21 | 22 | #endif /* PERFUTIL_H */ 23 | 24 | -------------------------------------------------------------------------------- /src/profile.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | thread-aware performance profiling 4 | */ 5 | 6 | #ifndef PROFILE_H 7 | #define PROFILE_H 8 | 9 | #include 10 | #include 11 | #include "perfutil.h" 12 | 13 | #ifdef PERF_PROFILE 14 | enum { 15 | PERF_SPAWN, PERF_SYNC, PERF_PARFOR, NTIMES 16 | }; 17 | 18 | char *times_names[] = { 19 | "spawn", "sync", "parfor", "run", "" 20 | }; 21 | 22 | typedef struct thread_timing_tag { 23 | uint64_t last, min, max, total, count; 24 | } thread_timing_t; 25 | 26 | thread_timing_t **thread_times; 27 | 28 | #define PROFILE_SETUP() \ 29 | posix_memalign((void **)&thread_times, \ 30 | 64, nthreads * sizeof(thread_timing_t *)); 31 | 32 | #define PROFILE_INIT_THREAD() \ 33 | posix_memalign((void **)&thread_times[tid], 64, \ 34 | NTIMES * sizeof(thread_timing_t)); \ 35 | for (int i = 0; i < NTIMES; i++) { \ 36 | thread_times[tid][i].last = thread_times[tid][i].max = \ 37 | thread_times[tid][i].total = thread_times[tid][i].count = 0; \ 38 | thread_times[tid][i].min = UINT64_MAX; \ 39 | } 40 | 41 | #define PROFILE_START(w) \ 42 | thread_times[tid][(w)].last = rdtscp() 43 | 44 | #define PROFILE_STAMP(w) \ 45 | { \ 46 | uint64_t l = thread_times[tid][(w)].last = \ 47 | rdtscp() - thread_times[tid][(w)].last; \ 48 | if (l < thread_times[tid][(w)].min) thread_times[tid][(w)].min = l; \ 49 | if (l > thread_times[tid][(w)].max) thread_times[tid][(w)].max = l; \ 50 | thread_times[tid][(w)].total += l; \ 51 | ++thread_times[tid][(w)].count; \ 52 | } 53 | 54 | #define PROFILE_PRINT() \ 55 | { \ 56 | thread_timing_t coll_times[NTIMES]; \ 57 | for (int i = 0; i < NTIMES; i++) { \ 58 | memset(&coll_times[i], 0, sizeof (thread_timing_t)); \ 59 | coll_times[i].min = UINT64_MAX; \ 60 | } \ 61 | for (int tnum = 0; tnum < nthreads; tnum++) { \ 62 | for (int i = 0; i < NTIMES; i++) { \ 63 | coll_times[i].total += thread_times[tnum][i].total; \ 64 | coll_times[i].count += thread_times[tnum][i].count; \ 65 | if 
(thread_times[tnum][i].max > coll_times[i].max) \ 66 | coll_times[i].max = thread_times[tnum][i].max; \ 67 | if (thread_times[tnum][i].min < coll_times[i].min) \ 68 | coll_times[i].min = thread_times[tnum][i].min; \ 69 | } \ 70 | } \ 71 | printf("partr profile: #calls, mean ticks, [min, max]\n"); \ 72 | for (int i = 0; i < NTIMES; i++) { \ 73 | uint64_t m = 0; \ 74 | if (coll_times[i].count > 0) \ 75 | m = coll_times[i].total / (double)coll_times[i].count; \ 76 | printf("%s: %llu, %llu, [%llu, %llu]\n", \ 77 | times_names[i], coll_times[i].count, m, \ 78 | coll_times[i].min, coll_times[i].max); \ 79 | } \ 80 | } 81 | 82 | 83 | #else /* !PERF_PROFILE */ 84 | 85 | #define PROFILE_SETUP() 86 | #define PROFILE_INIT_THREAD() 87 | #define PROFILE_START(w) 88 | #define PROFILE_STAMP(w) 89 | #define PROFILE_PRINT() 90 | 91 | #endif /* PERF_PROFILE */ 92 | 93 | #endif /* PROFILE_H */ 94 | 95 | -------------------------------------------------------------------------------- /src/synctreepool.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Pool of synchronization trees, for synchronizing parfor-generated tasks. 4 | Synchronization and reduction are managed via two binary trees. 5 | */ 6 | 7 | #include 8 | #include "partr.h" 9 | #include "synctreepool.h" 10 | 11 | 12 | /* arrival tree */ 13 | struct arriver_tag { 14 | int16_t index, next_avail; 15 | int16_t **tree; 16 | }; 17 | 18 | 19 | /* reduction tree */ 20 | struct reducer_tag { 21 | int16_t index, next_avail; 22 | void ***tree; 23 | }; 24 | 25 | 26 | 27 | /* pool of arrival trees */ 28 | static arriver_t *arriverpool; 29 | static int16_t num_arrivers, num_arriver_tree_nodes, next_arriver; 30 | 31 | 32 | /* pool of reduction trees */ 33 | static reducer_t *reducerpool; 34 | static int16_t num_reducers, num_reducer_tree_nodes, next_reducer; 35 | 36 | 37 | /* synctreepool_init() 38 | */ 39 | void synctreepool_init() 40 | { 41 | num_arriver_tree_nodes = (GRAIN_K * nthreads) - 1; 42 | num_reducer_tree_nodes = (2 * GRAIN_K * nthreads) - 1; 43 | 44 | /* num_arrivers = ((GRAIN_K * nthreads) ^ ARRIVERS_P) + 1 */ 45 | num_arrivers = GRAIN_K * nthreads; 46 | for (int i = 1; i < ARRIVERS_P; ++i) 47 | num_arrivers = num_arrivers * num_arrivers; 48 | ++num_arrivers; 49 | 50 | num_reducers = num_arrivers * REDUCERS_FRAC; 51 | 52 | /* allocate */ 53 | arriverpool = (arriver_t *)calloc(num_arrivers, sizeof (arriver_t)); 54 | next_arriver = 0; 55 | for (int i = 0; i < num_arrivers; ++i) { 56 | arriverpool[i].index = i; 57 | arriverpool[i].next_avail = i + 1; 58 | posix_memalign((void **)&arriverpool[i].tree, 64, 59 | num_arriver_tree_nodes * sizeof (int16_t *)); 60 | //arriverpool[i].tree = 61 | // aligned_alloc(64, num_arriver_tree_nodes * sizeof (int16_t *)); 62 | for (int j = 0; j < num_arriver_tree_nodes; ++j) 63 | posix_memalign((void **)&arriverpool[i].tree[j], 64, sizeof (int16_t)); 64 | //arriverpool[i].tree[j] = aligned_alloc(64, sizeof (int16_t)); 65 | } 66 | arriverpool[num_arrivers - 1].next_avail = -1; 67 | 68 | reducerpool = (reducer_t *)calloc(num_reducers, sizeof (reducer_t)); 69 | next_reducer = 0; 70 | for (int i = 0; i < num_reducers; ++i) { 71 | reducerpool[i].index = i; 72 | reducerpool[i].next_avail = i + 1; 73 | posix_memalign((void **)&reducerpool[i].tree, 64, 74 | num_reducer_tree_nodes * sizeof (void **)); 75 | //reducerpool[i].tree = 76 | // aligned_alloc(64, num_reducer_tree_nodes * sizeof (void **)); 77 | for (int j = 0; j < num_reducer_tree_nodes; ++j) 
78 | posix_memalign((void **)&reducerpool[i].tree[j], 64, sizeof (void *)); 79 | //reducerpool[i].tree[j] = aligned_alloc(64, sizeof (void *)); 80 | } 81 | if (num_reducers > 0) 82 | reducerpool[num_reducers - 1].next_avail = -1; 83 | else 84 | next_reducer = -1; 85 | 86 | LOG_INFO(plog, " %d arrivers and %d reducers allocated\n", 87 | num_arrivers, num_reducers); 88 | } 89 | 90 | 91 | /* synctreepool_destroy() 92 | */ 93 | void synctreepool_destroy() 94 | { 95 | for (int i = 0; i < num_arrivers; ++i) { 96 | for (int j = 0; j < num_arriver_tree_nodes; ++j) 97 | free(arriverpool[i].tree[j]); 98 | free(arriverpool[i].tree); 99 | } 100 | free(arriverpool); 101 | 102 | arriverpool = NULL; 103 | num_arrivers = 0; 104 | num_arriver_tree_nodes = 0; 105 | next_arriver = -1; 106 | 107 | for (int i = 0; i < num_reducers; ++i) { 108 | for (int j = 0; j < num_reducer_tree_nodes; ++j) 109 | free(reducerpool[i].tree[j]); 110 | free(reducerpool[i].tree); 111 | } 112 | free(reducerpool); 113 | 114 | reducerpool = NULL; 115 | num_reducers = 0; 116 | num_reducer_tree_nodes = 0; 117 | next_reducer = -1; 118 | } 119 | 120 | 121 | /* arriver_alloc() 122 | */ 123 | arriver_t *arriver_alloc() 124 | { 125 | int16_t candidate; 126 | arriver_t *arr; 127 | 128 | do { 129 | candidate = __atomic_load_n(&next_arriver, __ATOMIC_SEQ_CST); 130 | if (candidate == -1) { 131 | LOG_ERR(plog, " <%d> arriver allocation failed\n", tid); 132 | return NULL; 133 | } 134 | arr = &arriverpool[candidate]; 135 | } while (!__atomic_compare_exchange_n(&next_arriver, 136 | &candidate, arr->next_avail, 137 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)); 138 | return arr; 139 | } 140 | 141 | 142 | /* arriver_free() 143 | */ 144 | void arriver_free(arriver_t *arr) 145 | { 146 | for (int i = 0; i < num_arriver_tree_nodes; ++i) 147 | *arr->tree[i] = 0; 148 | 149 | __atomic_exchange(&next_arriver, &arr->index, &arr->next_avail, 150 | __ATOMIC_SEQ_CST); 151 | } 152 | 153 | 154 | /* reducer_alloc() 155 | */ 156 | reducer_t *reducer_alloc() 157 | { 158 | int16_t candidate; 159 | reducer_t *red; 160 | 161 | do { 162 | candidate = __atomic_load_n(&next_reducer, __ATOMIC_SEQ_CST); 163 | if (candidate == -1) { 164 | LOG_ERR(plog, " <%d> reducer allocation failed\n", tid); 165 | return NULL; 166 | } 167 | red = &reducerpool[candidate]; 168 | } while (!__atomic_compare_exchange_n(&next_reducer, 169 | &candidate, red->next_avail, 170 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)); 171 | return red; 172 | } 173 | 174 | 175 | /* reducer_free() 176 | */ 177 | void reducer_free(reducer_t *red) 178 | { 179 | for (int i = 0; i < num_reducer_tree_nodes; ++i) 180 | *red->tree[i] = 0; 181 | 182 | __atomic_exchange(&next_reducer, &red->index, &red->next_avail, 183 | __ATOMIC_SEQ_CST); 184 | } 185 | 186 | 187 | /* last_arriver() 188 | */ 189 | int last_arriver(arriver_t *arr, int idx) 190 | { 191 | int arrived, aidx = idx + (GRAIN_K * nthreads) - 1; 192 | 193 | while (aidx > 0) { 194 | --aidx; 195 | aidx >>= 1; 196 | arrived = __atomic_fetch_add(arr->tree[aidx], 1, __ATOMIC_SEQ_CST); 197 | if (!arrived) return 0; 198 | } 199 | 200 | return 1; 201 | } 202 | 203 | 204 | /* reduce() 205 | */ 206 | void *reduce(arriver_t *arr, reducer_t *red, void *(*rf)(void *, void *), 207 | void *val, int idx) 208 | { 209 | int arrived, aidx = idx + (GRAIN_K * nthreads) - 1, ridx = aidx, nidx; 210 | 211 | *red->tree[ridx] = val; 212 | while (aidx > 0) { 213 | --aidx; 214 | aidx >>= 1; 215 | arrived = __atomic_fetch_add(arr->tree[aidx], 1, __ATOMIC_SEQ_CST); 216 | if (!arrived) return NULL; 
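/* first to arrive at this inner node: our partial value is already
   stored in red->tree[ridx] for the sibling to combine, so stop here */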
217 | 218 | /* neighbor has already arrived, get its value and reduce it */ 219 | nidx = ridx & 0x1 ? ridx + 1 : ridx - 1; 220 | val = rf(val, *red->tree[nidx]); 221 | 222 | /* move up the tree */ 223 | --ridx; 224 | ridx >>= 1; 225 | *red->tree[ridx] = val; 226 | } 227 | 228 | return val; 229 | } 230 | 231 | -------------------------------------------------------------------------------- /src/synctreepool.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | Pool of synchronization trees, for synchronizing parfor-generated tasks. 4 | */ 5 | 6 | #ifndef SYNCTREEPOOL_H 7 | #define SYNCTREEPOOL_H 8 | 9 | #include 10 | 11 | typedef struct arriver_tag arriver_t; 12 | typedef struct reducer_tag reducer_t; 13 | 14 | /* interface */ 15 | void synctreepool_init(); 16 | void synctreepool_destroy(); 17 | arriver_t *arriver_alloc(); 18 | void arriver_free(arriver_t *); 19 | reducer_t *reducer_alloc(); 20 | void reducer_free(reducer_t *); 21 | 22 | int last_arriver(arriver_t *, int); 23 | void *reduce(arriver_t *, reducer_t *, void *(*rf)(void *, void *), void *, int); 24 | 25 | 26 | #endif /* SYNCTREEPOOL_H */ 27 | 28 | -------------------------------------------------------------------------------- /src/task.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | task definition 4 | */ 5 | 6 | #ifndef TASK_H 7 | #define TASK_H 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | /* task settings */ 15 | #define TASK_TERMINATE 0x01 16 | /* terminate thread */ 17 | #define TASK_IS_DETACHED 0x02 18 | /* clean up the task on completion */ 19 | #define TASK_IS_STICKY 0x04 20 | /* task is sticky to the thread that first runs it */ 21 | 22 | typedef struct arriver_tag arriver_t; 23 | typedef struct reducer_tag reducer_t; 24 | 25 | typedef struct ptask_tag ptask_t; 26 | 27 | 28 | /* a task */ 29 | struct ptask_tag { 30 | /* to link this task into queues */ 31 | ptask_t *next; 32 | 33 | /* coroutine context and stack */ 34 | struct concurrent_ctx *ctx; 35 | uint8_t *stack; 36 | 37 | /* task entry point, arguments, result, reduction function */ 38 | void *(*f)(void *, int64_t, int64_t); 39 | void *arg, *result; 40 | int64_t start, end; 41 | 42 | /* ----- IA-64 cache line boundary ----- */ 43 | 44 | /* reduction function, for parfors */ 45 | void *(*rf)(void *, void *); 46 | 47 | /* parent (first) task of a parfor set */ 48 | ptask_t *parent; 49 | 50 | /* to synchronize/reduce grains of a parfor */ 51 | arriver_t *arr; 52 | reducer_t *red; 53 | 54 | /* parfor reduction result */ 55 | void *red_result; 56 | 57 | /* completion queue and lock */ 58 | ptask_t *cq; 59 | int8_t cq_lock; 60 | 61 | /* task settings */ 62 | int8_t settings; 63 | 64 | /* tid of the thread to which this task is sticky */ 65 | int16_t sticky_tid; 66 | 67 | /* the index of this task in the set of grains of a parfor */ 68 | int16_t grain_num; 69 | 70 | /* for the multiqueue */ 71 | int16_t prio; 72 | 73 | /* to manage task pools */ 74 | int16_t pool, index, next_avail; 75 | 76 | /* padding to cache line boundary */ 77 | int8_t cl2_padding[2]; 78 | 79 | /* ----- IA-64 cache line boundary ----- */ 80 | }; 81 | 82 | 83 | #endif /* TASK_H */ 84 | 85 | -------------------------------------------------------------------------------- /src/taskpools.c: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | taskpools for 
fast allocation/freeing 4 | */ 5 | 6 | #include 7 | #include "partr.h" 8 | #include "taskpools.h" 9 | 10 | 11 | /* task pool for quick allocation/freeing of tasks */ 12 | typedef struct ptaskpool_tag { 13 | int16_t num_tasks, next_avail; 14 | ptask_t *tasks; 15 | } ptaskpool_t; 16 | 17 | 18 | /* a task pool for each thread */ 19 | static ptaskpool_t *ptaskpools; 20 | 21 | 22 | /* taskpools_init() 23 | */ 24 | void taskpools_init() 25 | { 26 | ptaskpools = (ptaskpool_t *)calloc(nthreads, sizeof(ptaskpool_t)); 27 | for (int16_t i = 0; i < nthreads; ++i) { 28 | ptaskpools[i].num_tasks = TASKS_PER_POOL; 29 | ptaskpools[i].next_avail = 0; 30 | ptaskpools[i].tasks = (ptask_t *) 31 | calloc(TASKS_PER_POOL, sizeof(ptask_t)); 32 | for (int16_t j = 0; j < TASKS_PER_POOL; ++j) { 33 | ptaskpools[i].tasks[j].ctx = 34 | calloc(ctx_sizeof(), sizeof(uint8_t)); 35 | ptaskpools[i].tasks[j].stack = 36 | calloc(TASK_STACK_SIZE, sizeof(uint8_t)); 37 | ptaskpools[i].tasks[j].pool = i; 38 | ptaskpools[i].tasks[j].index = j; 39 | ptaskpools[i].tasks[j].next_avail = j + 1; 40 | } 41 | ptaskpools[i].tasks[TASKS_PER_POOL-1].next_avail = -1; 42 | } 43 | LOG_INFO(plog, " %d tasks allocated per pool\n", TASKS_PER_POOL); 44 | } 45 | 46 | 47 | /* taskpools_destroy() 48 | */ 49 | void taskpools_destroy() 50 | { 51 | for (int16_t i = 0; i < nthreads; ++i) { 52 | for (int16_t j = 0; j < TASKS_PER_POOL; ++j) { 53 | free(ptaskpools[i].tasks[j].stack); 54 | free(ptaskpools[i].tasks[j].ctx); 55 | } 56 | free(ptaskpools[i].tasks); 57 | } 58 | free(ptaskpools); 59 | } 60 | 61 | 62 | /* task_alloc() 63 | */ 64 | ptask_t *task_alloc() 65 | { 66 | int16_t candidate; 67 | ptask_t *task; 68 | ptaskpool_t *pool = &ptaskpools[tid]; 69 | 70 | do { 71 | candidate = __atomic_load_n(&pool->next_avail, __ATOMIC_SEQ_CST); 72 | if (candidate == -1) { 73 | LOG_ERR(plog, " <%d> task allocation failed\n", tid); 74 | return NULL; 75 | } 76 | task = &pool->tasks[candidate]; 77 | } while (!__atomic_compare_exchange_n(&pool->next_avail, 78 | &candidate, task->next_avail, 79 | 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)); 80 | return task; 81 | } 82 | 83 | 84 | /* task_free() 85 | */ 86 | void task_free(ptask_t *task) 87 | { 88 | ptaskpool_t *pool = &ptaskpools[task->pool]; 89 | __atomic_exchange(&pool->next_avail, &task->index, &task->next_avail, 90 | __ATOMIC_SEQ_CST); 91 | } 92 | 93 | -------------------------------------------------------------------------------- /src/taskpools.h: -------------------------------------------------------------------------------- 1 | /* partr -- parallel tasks runtime 2 | 3 | taskpools for fast task allocation/freeing 4 | */ 5 | 6 | #ifndef TASKPOOLS_H 7 | #define TASKPOOLS_H 8 | 9 | #include 10 | #include "task.h" 11 | 12 | 13 | /* interface */ 14 | void taskpools_init(); 15 | void taskpools_destroy(); 16 | ptask_t *task_alloc(); 17 | void task_free(ptask_t *); 18 | 19 | 20 | #endif /* TASKPOOLS_H */ 21 | 22 | -------------------------------------------------------------------------------- /test/fib.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "partr.h" 4 | 5 | void *fib(void *arg_, int64_t start, int64_t end) 6 | { 7 | partr_t tx; 8 | int64_t x, y, n = (int64_t)arg_; 9 | if (n < 2) 10 | return (void *)n; 11 | 12 | partr_spawn(&tx, fib, (void *)n-1, 0, 0, 0, 0); 13 | y = (int64_t)fib((void *)n-2, 0, 0); 14 | partr_sync((void *)&x, tx, 1); 15 | 16 | return (void *)x + y; 17 | } 18 | 19 | void *serial_fib(void *arg_) 20 | { 21 | int64_t x, y, n = 
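/* the value n rides in the void* argument, cast back to an integer */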
(int64_t)arg_; 22 | if (n < 2) 23 | return (void *)n; 24 | x = (int64_t)serial_fib((void *)n-1); 25 | y = (int64_t)serial_fib((void *)n-2); 26 | return (void *)x + y; 27 | } 28 | 29 | void *run(void *arg, int64_t start, int64_t end) 30 | { 31 | int64_t v = 10, result, sresult; 32 | result = (int64_t)fib((void *)v, 0, 0); 33 | sresult = (int64_t)serial_fib((void *)v); 34 | printf("fib(%lld)=%lld\nserial_fib(%lld)=%lld\n", v, result, v, sresult); 35 | 36 | return 0; 37 | } 38 | 39 | int main(int argc, char **argv) 40 | { 41 | void *ret; 42 | partr_init(); 43 | partr_start(&ret, run, NULL, 0, 0); 44 | partr_shutdown(); 45 | return 0; 46 | } 47 | 48 | -------------------------------------------------------------------------------- /test/l3d.c: -------------------------------------------------------------------------------- 1 | // This file is a part of Julia. License is MIT: http://julialang.org/license 2 | 3 | // GCC command line: gcc -fopenmp -mavx2 laplace3d.c -o laplace3d 4 | 5 | /* Laplace 3D 6 | 7 | orig: simple serial version 8 | naive: simple parallelized version 9 | auto: some ninja knowledge, using icc directives 10 | sse/avx: ninja-optimized 11 | 12 | Requires Sandy Bridge and up. 13 | 14 | Note that the SSE/AVX versions do not handle boundary conditions 15 | and thus each dimension must be 4n+2/8n+2. Try 258x258x258. 16 | 17 | 2014.08.06 anand.deshpande Initial code. 18 | 2014.08.06 dhiraj.kalamkar Padding and streaming stores. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #if defined(__i386__) 33 | static inline uint64_t rdtsc(void) 34 | { 35 | uint64_t x; 36 | __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); 37 | return x; 38 | } 39 | #elif defined(__x86_64__) 40 | static inline uint64_t rdtsc(void) 41 | { 42 | unsigned hi, lo; 43 | __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); 44 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 45 | } 46 | #elif defined(_COMPILER_MICROSOFT_) 47 | #include 48 | static inline uint64_t rdtsc(void) 49 | { 50 | return __rdtsc(); 51 | } 52 | #endif 53 | 54 | void l3d_naive(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 55 | void l3d_auto(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 56 | void l3d_sse(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 57 | void l3d_avx(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 58 | void l3d_partr(int nx, int padded_nx, int ny, int nz, float *u1, float *u2); 59 | void l3d_orig(int nx, int ny, int nz, float *u1, float *u2); 60 | 61 | double cpughz() 62 | { 63 | uint64_t t0 = rdtsc(); 64 | sleep(1); 65 | uint64_t onesec = rdtsc() - t0; 66 | return onesec*1.0/1e9; 67 | } 68 | 69 | int main(int argc, char **argv) 70 | { 71 | int nx, padded_nx, ny, nz, iters, i, j, k, ind, p_ind, verify, 72 | nthreads, pad_size; 73 | float *u1, *u1_p, *u1_orig, *u2, *u2_p, *u2_orig, *foo, 74 | error_tol = 0.00001; 75 | double ghz; 76 | void (*l3d)(int nx, int padded_nx, int ny, int nz, 77 | float *u1, float *u2); 78 | 79 | if (argc != 7) { 80 | fprintf(stderr, "Usage:\n" 81 | " laplace3d <#iters> " 82 | "\n"); 83 | exit(-1); 84 | } 85 | 86 | nx = strtol(argv[1], NULL, 10); 87 | ny = strtol(argv[2], NULL, 10); 88 | nz = strtol(argv[3], NULL, 10); 89 | 90 | padded_nx = ((nx + 0x7) & (~0x7)); 91 | iters = strtol(argv[4], NULL, 10); 92 | 93 | if (strncasecmp(argv[5], "naive", 5) == 0) 94 | l3d = l3d_naive; 95 | else if (strncasecmp(argv[5], "auto", 4) == 0) 96 
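/* select the kernel variant by name; l3d is a function pointer */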
| l3d = l3d_auto; 97 | else if (strncasecmp(argv[5], "sse", 3) == 0) 98 | l3d = l3d_sse; 99 | else if (strncasecmp(argv[5], "avx", 3) == 0) 100 | l3d = l3d_avx; 101 | else if (strncasecmp(argv[5], "partr", 5) == 0) 102 | l3d = l3d_partr; 103 | else { 104 | fprintf(stderr, 105 | "don't recognize %s. naive, auto, sse, avx, or partr?\n", 106 | argv[5]); 107 | exit(-1); 108 | } 109 | 110 | verify = strtol(argv[6], NULL, 10); 111 | 112 | ghz = cpughz(); 113 | nthreads = omp_get_max_threads(); 114 | printf("machine speed is %g GHz, using %d threads\n", ghz, nthreads); 115 | 116 | printf("laplace3d: %d iterations on %dx%dx%d grid, " 117 | "verification is %s\n", 118 | iters, nx, ny, nz, verify ? "on" : "off"); 119 | 120 | /* pad for aligned access; non-naive only */ 121 | if (strncasecmp(argv[5], "naive", 5) != 0) { 122 | pad_size = (((1 + padded_nx + padded_nx * ny) + 0xF) & (~0xF)) - 123 | (1 + padded_nx + padded_nx * ny); 124 | printf("using padded_nx = %d, pad_size = %d\n", 125 | padded_nx, pad_size); 126 | 127 | u1_p = (float *)_mm_malloc(sizeof (float) * 128 | (padded_nx * ny * nz + pad_size), 64); 129 | u2_p = (float *)_mm_malloc(sizeof (float) * 130 | (padded_nx * ny * nz + pad_size), 64); 131 | u1 = u1_p + pad_size; 132 | u2 = u2_p + pad_size; 133 | } 134 | else { 135 | u1_p = (float *)_mm_malloc(sizeof (float) * (nx * ny * nz), 64); 136 | u2_p = (float *)_mm_malloc(sizeof (float) * (nx * ny * nz), 64); 137 | u1 = u1_p; 138 | u2 = u2_p; 139 | padded_nx = nx; 140 | } 141 | u1_orig = (float *)_mm_malloc(sizeof (float) * nx * ny * nz, 64); 142 | u2_orig = (float *)_mm_malloc(sizeof (float) * nx * ny * nz, 64); 143 | 144 | // initialize 145 | #pragma omp parallel for private(k,j,i,ind,p_ind) 146 | for (k = 0; k < nz; ++k) { 147 | for (j = 0; j < ny; ++j) { 148 | for (i = 0; i < nx; ++i) { 149 | ind = i + j*nx + k*nx*ny; 150 | p_ind = i + j*padded_nx + k*padded_nx*ny; 151 | 152 | if (i == 0 || i == nx - 1 153 | || j == 0 || j == ny - 1 154 | || k == 0 || k == nz - 1) { 155 | // Dirichlet b.c.'s 156 | u1[p_ind] = u1_orig[ind] = u2[p_ind] = 1.0f; 157 | } 158 | else { 159 | u1[p_ind] = u1_orig[ind] = u2[p_ind] = 0.0f; 160 | } 161 | } 162 | } 163 | } 164 | 165 | if (strncasecmp(argv[5], "partr", 5) == 0) 166 | partr_init(); 167 | 168 | // run optimized version 169 | uint64_t t0 = rdtsc(); 170 | for (i = 0; i < iters; ++i) { 171 | l3d(nx, padded_nx, ny, nz, u1, u2); 172 | foo = u1; u1 = u2; u2 = foo; 173 | } 174 | uint64_t gold = rdtsc() - t0; 175 | double elapsed = gold / (ghz * 1e9); 176 | 177 | double grid_size = nx * ny * nz; 178 | double gflops = grid_size * iters * 6.0 / 1e9; 179 | double gflops_sec = gflops / elapsed; 180 | 181 | double traffic = grid_size * iters * 4 * 2.0 / 1e9; 182 | double bw_realized = traffic / elapsed; 183 | 184 | printf("laplace3d completed in %.4lf seconds\n", elapsed); 185 | printf("GFLOPs/sec: %.1f\n", gflops_sec); 186 | printf("BW realized: %.1f\n", bw_realized); 187 | 188 | if (verify) { 189 | // run serial version for verification 190 | uint64_t st0 = rdtsc(); 191 | for (i = 0; i < iters; ++i) { 192 | l3d_orig(nx, ny, nz, u1_orig, u2_orig); 193 | foo = u1_orig; u1_orig = u2_orig; u2_orig = foo; 194 | } 195 | uint64_t ser = rdtsc() - st0; 196 | elapsed = ser / (ghz * 1e9); 197 | gflops_sec = gflops / elapsed; 198 | bw_realized = traffic / elapsed; 199 | 200 | printf("laplace3d_orig completed in %.2lf seconds\n", elapsed); 201 | printf("GFLOPs/sec: %.1f\n", gflops_sec); 202 | printf("BW realized: %.1f\n", bw_realized); 203 | 204 | // verify 205 | for (k = 0; k < 
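/* element-wise check of the padded parallel grid against the unpadded serial grid */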
nz; ++k) { 206 | for (j = 0; j < ny; ++j) { 207 | for (i = 0; i < nx; ++i) { 208 | ind = i + j*nx + k*nx*ny; 209 | p_ind = i + j*padded_nx + k*padded_nx*ny; 210 | 211 | if (fabs(u1[p_ind] - u1_orig[ind]) > error_tol) { 212 | printf("ERROR %f - %f [%d, %d, %d]\n", 213 | u1[p_ind], u1_orig[ind], i, j, k); 214 | goto done; 215 | } 216 | } 217 | } 218 | } 219 | printf("verified, no error\n"); 220 | } 221 | 222 | if (strncasecmp(argv[5], "partr", 5) == 0) 223 | partr_shutdown(); 224 | 225 | done: 226 | _mm_free(u1_p); 227 | _mm_free(u2_p); 228 | _mm_free(u1_orig); 229 | _mm_free(u2_orig); 230 | 231 | return 0; 232 | } 233 | 234 | void l3d_naive(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 235 | { 236 | int i, j, k, ind; 237 | const float sixth = 1.0f/6.0f; 238 | 239 | /* compute on the grid */ 240 | #pragma omp parallel for private(i,j,k,ind) 241 | for (k = 1; k < nz-1; ++k) { 242 | for (j = 1; j < ny-1; ++j) { 243 | #pragma ivdep 244 | for (i = 1; i < nx-1; ++i) { 245 | ind = i + j*padded_nx + k*padded_nx*ny; 246 | u2[ind] = 247 | ( u1[ind-1 ] + u1[ind+1 ] 248 | + u1[ind-padded_nx ] + u1[ind+padded_nx ] 249 | + u1[ind-padded_nx*ny] + u1[ind+padded_nx*ny] ) * sixth; 250 | } 251 | } 252 | } 253 | } 254 | 255 | void l3d_auto(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 256 | { 257 | int i, j, k, ind; 258 | 259 | float sixth = 1.0f/6.0f; 260 | 261 | #if defined(__INTEL_COMPILER) 262 | __assume(padded_nx%8==0); 263 | __assume_aligned(&u1[1],32); 264 | __assume_aligned(&u2[1],32); 265 | #elif defined(__GNUC__) 266 | if (!(padded_nx%8==0)) 267 | __builtin_unreachable(); 268 | // third argument is the misalignment 269 | u1 = __builtin_assume_aligned(u1, 32, sizeof(float)); 270 | u2 = __builtin_assume_aligned(u2, 32, sizeof(float)); 271 | #endif 272 | 273 | /* compute on the grid */ 274 | #pragma omp parallel for private(i,j,k,ind) 275 | for (k = 1; k < nz-1; ++k) { 276 | for (j = 1; j < ny-1; ++j) { 277 | #pragma vector nontemporal(u2) 278 | for (i = 1; i < nx-1; ++i) { 279 | ind = i + j*padded_nx + k*padded_nx*ny; 280 | u2[ind] = 281 | ( u1[ind-1 ] + u1[ind+1 ] 282 | + u1[ind-padded_nx ] + u1[ind+padded_nx ] 283 | + u1[ind-padded_nx*ny] + u1[ind+padded_nx*ny] ) * sixth; 284 | } 285 | } 286 | } 287 | } 288 | 289 | void l3d_sse(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 290 | { 291 | int i, j, k, ind; 292 | 293 | float fsixth = 1.0f/6.0f; 294 | __m128 sixth = _mm_set_ps1(fsixth); 295 | 296 | /* compute on the grid */ 297 | #pragma omp parallel for private(i,j,k,ind) 298 | for (k = 1; k < nz-1; ++k) { 299 | for (j = 1; j < ny-1; ++j) { 300 | for (i = 1; i < nx-1; i += 4) { 301 | ind = i + j*padded_nx + k*padded_nx*ny; 302 | 303 | __m128 pSrc1 = _mm_loadu_ps(&u1[ind-1]); 304 | __m128 pSrc2 = _mm_loadu_ps(&u1[ind+1]); 305 | __m128 pSrc3 = _mm_load_ps(&u1[ind-padded_nx]); 306 | __m128 pSrc4 = _mm_load_ps(&u1[ind+padded_nx]); 307 | __m128 pSrc5 = _mm_load_ps(&u1[ind-padded_nx*ny]); 308 | __m128 pSrc6 = _mm_load_ps(&u1[ind+padded_nx*ny]); 309 | 310 | __m128 sum1 = _mm_add_ps(pSrc1, pSrc2); 311 | __m128 sum2 = _mm_add_ps(pSrc3, pSrc4); 312 | __m128 sum3 = _mm_add_ps(pSrc5, pSrc6); 313 | __m128 sum4 = _mm_add_ps(sum1, sum2); 314 | __m128 vsum = _mm_add_ps(sum3, sum4); 315 | 316 | vsum = _mm_mul_ps(vsum, sixth); 317 | 318 | _mm_stream_ps(&u2[ind], vsum); 319 | } 320 | } 321 | } 322 | } 323 | 324 | void l3d_avx(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 325 | { 326 | int i, j, k, ind; 327 | 328 | float fsixth = 1.0f/6.0f; 329 | __m256 sixth = 
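/* broadcast 1/6 into all eight single-precision lanes */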
_mm256_set1_ps(fsixth); 330 | 331 | /* compute on the grid */ 332 | #pragma omp parallel for private(i,j,k,ind) 333 | for (k = 1; k < nz-1; ++k) { 334 | for (j = 1; j < ny-1; ++j) { 335 | for (i = 1; i < nx-1; i += 8) { 336 | ind = i + j*padded_nx + k*padded_nx*ny; 337 | 338 | __m256 pSrc1 = _mm256_loadu_ps(&u1[ind-1]); 339 | __m256 pSrc2 = _mm256_loadu_ps(&u1[ind+1]); 340 | __m256 pSrc3 = _mm256_load_ps(&u1[ind-padded_nx]); 341 | __m256 pSrc4 = _mm256_load_ps(&u1[ind+padded_nx]); 342 | __m256 pSrc5 = _mm256_load_ps(&u1[ind-padded_nx*ny]); 343 | __m256 pSrc6 = _mm256_load_ps(&u1[ind+padded_nx*ny]); 344 | 345 | __m256 sum1 = _mm256_add_ps(pSrc1, pSrc2); 346 | __m256 sum2 = _mm256_add_ps(pSrc3, pSrc4); 347 | __m256 sum3 = _mm256_add_ps(pSrc5, pSrc6); 348 | __m256 sum4 = _mm256_add_ps(sum1, sum2); 349 | __m256 vsum = _mm256_add_ps(sum3, sum4); 350 | 351 | vsum = _mm256_mul_ps(vsum, sixth); 352 | 353 | _mm256_stream_ps(&u2[ind], vsum); 354 | } 355 | } 356 | } 357 | } 358 | 359 | typedef struct task_arg_tag { 360 | int nx, padded_nx, ny, nz; 361 | float *u1, *u2; 362 | } task_arg_t; 363 | 364 | void *l3d_partr_iter(void *arg, int64_t start, int64_t end) 365 | { 366 | int i, j, k, ind; 367 | const float sixth = 1.0f/6.0f; 368 | task_arg_t *ta = (task_arg_t *)arg; 369 | int nx = ta->nx; 370 | int ny = ta->ny; 371 | int nz = ta->nz; 372 | float *u1 = ta->u1; 373 | float *u2 = ta->u2; 374 | 375 | for (k = start; k < end; ++k) { 376 | for (j = 0; j < ny; ++j) { 377 | for (i = 0; i < nx; ++i) { 378 | ind = i + j*nx + k*nx*ny; 379 | 380 | if (i == 0 || i == nx - 1 381 | || j == 0 || j == ny - 1 382 | || k == 0 || k == nz - 1) { 383 | u2[ind] = u1[ind]; // Dirichlet b.c.'s 384 | } 385 | else { 386 | u2[ind] = ( u1[ind-1 ] + u1[ind+1 ] 387 | + u1[ind-nx ] + u1[ind+nx ] 388 | + u1[ind-nx*ny] + u1[ind+nx*ny] ) * sixth; 389 | } 390 | } 391 | } 392 | } 393 | return NULL; 394 | } 395 | 396 | void *l3d_partr_run(void *arg, int64_t start, int64_t end) 397 | { 398 | partr_t t; 399 | partr_parfor(&t, l3d_partr_iter, arg, end - start, NULL); 400 | partr_sync(NULL, t, 1); 401 | return NULL; 402 | } 403 | 404 | void l3d_partr(int nx, int padded_nx, int ny, int nz, float *u1, float *u2) 405 | { 406 | task_arg_t task_arg; 407 | task_arg.nx = nx; 408 | task_arg.padded_nx = padded_nx; 409 | task_arg.ny = ny; 410 | task_arg.nz = nz; 411 | task_arg.u1 = u1; 412 | task_arg.u2 = u2; 413 | partr_start(NULL, l3d_partr_run, (void *)&task_arg, 0, nz); 414 | } 415 | 416 | void l3d_orig(int nx, int ny, int nz, float *u1, float *u2) 417 | { 418 | int i, j, k, ind; 419 | const float sixth = 1.0f/6.0f; 420 | 421 | for (k = 0; k < nz; ++k) { 422 | for (j = 0; j < ny; ++j) { 423 | for (i = 0; i < nx; ++i) { 424 | ind = i + j*nx + k*nx*ny; 425 | 426 | if (i == 0 || i == nx - 1 427 | || j == 0 || j == ny - 1 428 | || k == 0 || k == nz - 1) { 429 | u2[ind] = u1[ind]; // Dirichlet b.c.'s 430 | } 431 | else { 432 | u2[ind] = ( u1[ind-1 ] + u1[ind+1 ] 433 | + u1[ind-nx ] + u1[ind+nx ] 434 | + u1[ind-nx*ny] + u1[ind+nx*ny] ) * sixth; 435 | } 436 | } 437 | } 438 | } 439 | } 440 | -------------------------------------------------------------------------------- /test/makefile: -------------------------------------------------------------------------------- 1 | # parallel tasks runtime 2 | # 3 | # makefile for tests 4 | # 5 | # 2016.06.01 kiran.pamnany Initial code 6 | # 7 | 8 | CC=gcc 9 | 10 | .SUFFIXES: .c .h .o 11 | .PHONY: clean 12 | 13 | CFLAGS+=-Wall 14 | CFLAGS+=-std=c11 15 | CFLAGS+=-D_GNU_SOURCE 16 | CFLAGS+=-I../../hwloc/include 17 | 
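# hwloc and libconcurrent are expected as sibling checkouts of partr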
CFLAGS+=-I../../libconcurrent/include 18 | CFLAGS+=-I. 19 | CFLAGS+=-I../include 20 | CFLAGS+=-I../src 21 | 22 | LDOBJS+=-lpthread 23 | LDOBJS+=tap.o 24 | LDOBJS+=../libpartr.a 25 | LDOBJS+=../../hwloc/src/.libs/libhwloc.a 26 | LDOBJS+=../../libconcurrent/libconcurrent.a 27 | 28 | TAPSRC=tap.c 29 | TAPOBJ=tap.o 30 | 31 | SRCS=taskpoolstest.c multiqtest.c parfortest.c sleeptest.c fib.c l3d.c l3d_partr.c 32 | OBJS=${SRCS:.c=.o} 33 | BINS=${SRCS:.c=} 34 | OBJS=$(subst .c,.o, $(SRCS)) 35 | BINS=$(subst .c,, $(SRCS)) 36 | 37 | ifeq ($(DEBUG),yes) 38 | CFLAGS+=-O0 -g 39 | else 40 | CFLAGS+=-O3 41 | endif 42 | 43 | all: $(BINS) 44 | 45 | $(BINS): $(SRCS) $(TAPOBJ) ../libpartr.a 46 | #$(CC) -fopenmp $(CFLAGS) l3d.c -o l3d $(LDOBJS) 47 | $(CC) $(CFLAGS) l3d_partr.c -o l3d_partr $(LDOBJS) 48 | $(CC) $(CFLAGS) taskpoolstest.c -o taskpoolstest $(LDOBJS) 49 | $(CC) $(CFLAGS) multiqtest.c -o multiqtest $(LDOBJS) 50 | $(CC) $(CFLAGS) parfortest.c -o parfortest $(LDOBJS) 51 | $(CC) $(CFLAGS) sleeptest.c -o sleeptest $(LDOBJS) 52 | $(CC) $(CFLAGS) fib.c -o fib $(LDOBJS) 53 | 54 | $(TAPOBJ): $(TAPSRC) 55 | $(CC) $(CFLAGS) -c $(TAPSRC) 56 | 57 | clean: 58 | $(RM) $(BINS) $(OBJS) $(TAPOBJ) 59 | 60 | -------------------------------------------------------------------------------- /test/multiqtest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "tap.h" 6 | #include "perfutil.h" 7 | #include "congrng.h" 8 | #include "partr.h" 9 | #include "taskpools.h" 10 | #include "multiq.h" 11 | 12 | #include 13 | 14 | log_t plog; 15 | int16_t nthreads; 16 | __thread int16_t tid; 17 | __thread uint64_t rngseed; 18 | 19 | #define NTASKS_PER_POOL 512 20 | static ptask_t *tasks[NTASKS_PER_POOL]; 21 | 22 | static int *success; 23 | 24 | /* thread barrier */ 25 | static int volatile barcnt; 26 | static int volatile barsense = 1; 27 | 28 | #define BARRIER_INIT() barcnt=nthreads 29 | 30 | #define BARRIER() do { \ 31 | mysense = !mysense; \ 32 | if (!__atomic_sub_fetch(&barcnt, 1, __ATOMIC_SEQ_CST)) { \ 33 | barcnt = nthreads; \ 34 | barsense = mysense; \ 35 | } else while (barsense != mysense); \ 36 | } while(0) 37 | 38 | 39 | typedef struct lthread_arg_tag { 40 | int16_t tid; 41 | } lthread_arg_t; 42 | 43 | /* used for reducing across threads */ 44 | static int N = 0; 45 | 46 | 47 | static void *threadfun(void *targ) 48 | { 49 | int mysense = 1; /* for the barrier */ 50 | 51 | ptask_t *task; 52 | int16_t curr_prio, ooo; 53 | 54 | lthread_arg_t *arg = (lthread_arg_t *)targ; 55 | tid = arg->tid; 56 | free(targ); 57 | seed_cong(&rngseed); 58 | 59 | if (tid == 0) { 60 | taskpools_init(); 61 | multiq_init(); 62 | 63 | success = (int *)calloc(nthreads, sizeof(int)); 64 | 65 | /* single-thread alloc and insert */ 66 | const int16_t tprios[] = { 1, 0, 3, 2 }; 67 | int16_t t = 0; 68 | for (int16_t j = 0; j < 4; ++j) { 69 | success[tid] = 1; 70 | for (int16_t i = 0; i < NTASKS_PER_POOL/4; ++i) { 71 | tasks[t] = task_alloc(); 72 | if (multiq_insert(tasks[t], tprios[j]) != 0) 73 | success[tid] = 0; 74 | ++t; 75 | } 76 | ok(success[tid], "insert with priority %d", tprios[j]); 77 | } 78 | 79 | /* single-thread deletemin */ 80 | success[tid] = 1; 81 | curr_prio = ooo = 0; 82 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) { 83 | task = multiq_deletemin(); 84 | if (task == NULL) { 85 | success[tid] = 0; 86 | break; 87 | } 88 | if (task->prio > curr_prio) 89 | curr_prio = task->prio; 90 | else if (task->prio < curr_prio) { 91 | diag("(tid %d) curr_prio: %d, 
task->prio: %d\n", tid, curr_prio, task->prio); 92 | ++ooo; 93 | curr_prio = task->prio; 94 | } 95 | } 96 | ok(success[tid], "deletemin (%d out-of-order)", ooo); 97 | } 98 | 99 | BARRIER(); 100 | 101 | int each = NTASKS_PER_POOL/nthreads, start = tid * each, end = start + each; 102 | if (tid == nthreads-1) end = NTASKS_PER_POOL; 103 | 104 | /* parallel insert tests */ 105 | success[tid] = 1; 106 | for (int16_t i = start; i < end; ++i) { 107 | if (multiq_insert(tasks[i], tid) != 0) 108 | success[tid] = 0; 109 | } 110 | 111 | BARRIER(); 112 | 113 | if (tid == 0) { 114 | for (int16_t i = 1; i < nthreads; ++i) 115 | if (!success[i]) 116 | success[0] = 0; 117 | ok(success[0], "parallel insertion, %d tasks\n", NTASKS_PER_POOL); 118 | } 119 | 120 | BARRIER(); 121 | 122 | /* parallel deletemin tests */ 123 | curr_prio = ooo = 0; 124 | int ndeq = 0; 125 | for (int16_t i = 0; i < NTASKS_PER_POOL/nthreads; ++i) { 126 | task = multiq_deletemin(); 127 | if (task == NULL) { 128 | diag("(tid %d) !task\n", tid); 129 | continue; 130 | } 131 | ++ndeq; 132 | if (task->prio > curr_prio) 133 | curr_prio = task->prio; 134 | else if (task->prio < curr_prio) { 135 | diag("(tid %d) curr_prio: %d, task->prio: %d\n", tid, curr_prio, task->prio); 136 | ++ooo; 137 | curr_prio = task->prio; 138 | } 139 | } 140 | __atomic_add_fetch(&N, ndeq, __ATOMIC_SEQ_CST); 141 | 142 | BARRIER(); 143 | 144 | if (tid == 0) { 145 | ok(N == NTASKS_PER_POOL, "parallel deletemin %d tasks (%d out-of-order)", N, ooo); 146 | 147 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 148 | task_free(tasks[i]); 149 | 150 | free(success); 151 | multiq_destroy(); 152 | taskpools_destroy(); 153 | } 154 | 155 | return NULL; 156 | } 157 | 158 | int main(int argc, char **argv) 159 | { 160 | LOG_SETUP(plog, LOG_LEVEL_INFO, stdout); 161 | LOG_INFO(plog, "taskpools test\n"); 162 | 163 | nthreads = DEFAULT_NUM_THREADS; 164 | char *cp = getenv(NUM_THREADS_NAME); 165 | if (cp) nthreads = strtol(cp, NULL, 10); 166 | LOG_INFO(plog, " %d threads\n", nthreads); 167 | 168 | BARRIER_INIT(); 169 | 170 | tid = 0; 171 | 172 | pthread_t pthread_id; 173 | pthread_attr_t pthread_attr; 174 | 175 | pthread_attr_init(&pthread_attr); 176 | pthread_attr_setdetachstate(&pthread_attr, PTHREAD_CREATE_DETACHED); 177 | 178 | for (int16_t i = 1; i < nthreads; ++i) { 179 | lthread_arg_t *targ = (lthread_arg_t *)calloc(1, sizeof(lthread_arg_t)); 180 | targ->tid = i; 181 | pthread_create(&pthread_id, &pthread_attr, threadfun, targ); 182 | } 183 | pthread_attr_destroy(&pthread_attr); 184 | 185 | /* thread 0 enters the thread function too */ 186 | lthread_arg_t *targ = (lthread_arg_t *)calloc(1, sizeof(lthread_arg_t)); 187 | targ->tid = 0; 188 | threadfun(targ); 189 | 190 | done_testing(); 191 | } 192 | 193 | -------------------------------------------------------------------------------- /test/parfortest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tap.h" 3 | #include "partr.h" 4 | 5 | int arr[1024]; 6 | 7 | void *fill_arr(void *arg_, int64_t start, int64_t end) 8 | { 9 | int64_t sum = 0; 10 | for (int64_t i = start; i < end; ++i) { 11 | arr[i] = i; 12 | sum += i; 13 | } 14 | 15 | return (void *)sum; 16 | } 17 | 18 | void *add(void *arg1, void *arg2) 19 | { 20 | int64_t a1 = (int64_t)arg1; 21 | int64_t a2 = (int64_t)arg2; 22 | return (void *)(a1 + a2); 23 | } 24 | 25 | void *run(void *arg, int64_t start, int64_t end) 26 | { 27 | int64_t sum; 28 | partr_t t; 29 | partr_parfor(&t, fill_arr, NULL, 1024, add); 30 | 
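/* block until all grains complete; the add-reduced total arrives through the first argument */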
partr_sync((void *)&sum, t, 1); 31 | 32 | return (void *)sum; 33 | } 34 | 35 | int main(int argc, char **argv) 36 | { 37 | for (int i = 0; i < 1024; ++i) 38 | arr[i] = -1; 39 | 40 | int64_t par_sum; 41 | 42 | partr_init(); 43 | partr_start((void *)&par_sum, run, NULL, 0, 0); 44 | partr_shutdown(); 45 | 46 | printf("sum: %lld\n", par_sum); 47 | 48 | int success = 1, sum = 0; 49 | for (int i = 0; i < 1024; ++i) { 50 | if (arr[i] != i) { 51 | success = 0; 52 | break; 53 | } 54 | sum = sum + arr[i]; 55 | } 56 | 57 | ok(success, "all elements filled"); 58 | ok(sum == par_sum, "%lld == %lld", sum, par_sum); 59 | 60 | return 0; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /test/sleeptest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tap.h" 3 | #include "partr.h" 4 | 5 | int arr[1024]; 6 | 7 | void *fill_arr(void *arg_, int64_t start, int64_t end) 8 | { 9 | int64_t sum = 0; 10 | for (int64_t i = start; i < end; ++i) { 11 | arr[i] = i; 12 | sum += i; 13 | } 14 | 15 | return (void *)sum; 16 | } 17 | 18 | void *add(void *arg1, void *arg2) 19 | { 20 | int64_t a1 = (int64_t)arg1; 21 | int64_t a2 = (int64_t)arg2; 22 | return (void *)(a1 + a2); 23 | } 24 | 25 | void *run(void *arg, int64_t start, int64_t end) 26 | { 27 | int64_t sum; 28 | partr_t t; 29 | partr_parfor(&t, fill_arr, NULL, 1024, add); 30 | partr_sync((void *)&sum, t, 1); 31 | 32 | return (void *)sum; 33 | } 34 | 35 | void start_test() 36 | { 37 | for (int i = 0; i < 1024; ++i) 38 | arr[i] = -1; 39 | 40 | int64_t par_sum; 41 | 42 | partr_start((void *)&par_sum, run, NULL, 0, 0); 43 | printf("sum: %lld\n", par_sum); 44 | 45 | int success = 1, sum = 0; 46 | for (int i = 0; i < 1024; ++i) { 47 | if (arr[i] != i) { 48 | success = 0; 49 | break; 50 | } 51 | sum = sum + arr[i]; 52 | } 53 | 54 | ok(success, "all elements filled"); 55 | ok(sum == par_sum, "%lld == %lld", sum, par_sum); 56 | 57 | } 58 | 59 | int main(int argc, char **argv) 60 | { 61 | partr_init(); 62 | 63 | start_test(); 64 | diag("pausing for 5 seconds to let threads sleep\n"); 65 | sleep(5); 66 | diag("re-running test\n"); 67 | start_test(); 68 | 69 | partr_shutdown(); 70 | return 0; 71 | } 72 | 73 | -------------------------------------------------------------------------------- /test/tap.c: -------------------------------------------------------------------------------- 1 | /* 2 | libtap - Write tests in C 3 | Copyright 2012 Jake Gelbman 4 | This file is licensed under the LGPL 5 | */ 6 | 7 | #define _DEFAULT_SOURCE 1 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "tap.h" 14 | 15 | static int expected_tests = NO_PLAN; 16 | static int failed_tests; 17 | static int current_test; 18 | static char *todo_mesg; 19 | 20 | static char * 21 | vstrdupf (const char *fmt, va_list args) { 22 | char *str; 23 | int size; 24 | va_list args2; 25 | va_copy(args2, args); 26 | if (!fmt) 27 | fmt = ""; 28 | size = vsnprintf(NULL, 0, fmt, args2) + 2; 29 | str = malloc(size); 30 | if (!str) { 31 | perror("malloc error"); 32 | exit(1); 33 | } 34 | vsprintf(str, fmt, args); 35 | va_end(args2); 36 | return str; 37 | } 38 | 39 | void 40 | tap_plan (int tests, const char *fmt, ...) 
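/* emit the TAP plan ("1..N") immediately; SKIP_ALL prints "1..0 # SKIP ..." and exits, while NO_PLAN defers the plan line to exit_status() */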
{ 41 | expected_tests = tests; 42 | if (tests == SKIP_ALL) { 43 | char *why; 44 | va_list args; 45 | va_start(args, fmt); 46 | why = vstrdupf(fmt, args); 47 | va_end(args); 48 | printf("1..0 "); 49 | diag("SKIP %s\n", why); 50 | exit(0); 51 | } 52 | if (tests != NO_PLAN) { 53 | printf("1..%d\n", tests); 54 | } 55 | } 56 | 57 | int 58 | vok_at_loc (const char *file, int line, int test, const char *fmt, 59 | va_list args) 60 | { 61 | char *name = vstrdupf(fmt, args); 62 | if (!test) { 63 | printf("not "); 64 | } 65 | printf("ok %d", ++current_test); 66 | if (*name) 67 | printf(" - %s", name); 68 | if (todo_mesg) { 69 | printf(" # TODO"); 70 | if (*todo_mesg) 71 | printf(" %s", todo_mesg); 72 | } 73 | printf("\n"); 74 | if (!test) { 75 | printf("# Failed "); 76 | if (todo_mesg) 77 | printf("(TODO) "); 78 | printf("test "); 79 | if (*name) 80 | printf("'%s'\n# ", name); 81 | printf("at %s line %d.\n", file, line); 82 | if (!todo_mesg) 83 | failed_tests++; 84 | } 85 | free(name); 86 | return test; 87 | } 88 | 89 | int 90 | ok_at_loc (const char *file, int line, int test, const char *fmt, ...) { 91 | va_list args; 92 | va_start(args, fmt); 93 | vok_at_loc(file, line, test, fmt, args); 94 | va_end(args); 95 | return test; 96 | } 97 | 98 | static int 99 | mystrcmp (const char *a, const char *b) { 100 | return a == b ? 0 : !a ? -1 : !b ? 1 : strcmp(a, b); 101 | } 102 | 103 | #define eq(a, b) (!mystrcmp(a, b)) 104 | #define ne(a, b) (mystrcmp(a, b)) 105 | 106 | int 107 | is_at_loc (const char *file, int line, const char *got, const char *expected, 108 | const char *fmt, ...) 109 | { 110 | int test = eq(got, expected); 111 | va_list args; 112 | va_start(args, fmt); 113 | vok_at_loc(file, line, test, fmt, args); 114 | va_end(args); 115 | if (!test) { 116 | diag(" got: '%s'", got); 117 | diag(" expected: '%s'", expected); 118 | } 119 | return test; 120 | } 121 | 122 | int 123 | isnt_at_loc (const char *file, int line, const char *got, const char *expected, 124 | const char *fmt, ...) 125 | { 126 | int test = ne(got, expected); 127 | va_list args; 128 | va_start(args, fmt); 129 | vok_at_loc(file, line, test, fmt, args); 130 | va_end(args); 131 | if (!test) { 132 | diag(" got: '%s'", got); 133 | diag(" expected: anything else"); 134 | } 135 | return test; 136 | } 137 | 138 | int 139 | cmp_ok_at_loc (const char *file, int line, int a, const char *op, int b, 140 | const char *fmt, ...) 141 | { 142 | int test = eq(op, "||") ? a || b 143 | : eq(op, "&&") ? a && b 144 | : eq(op, "|") ? a | b 145 | : eq(op, "^") ? a ^ b 146 | : eq(op, "&") ? a & b 147 | : eq(op, "==") ? a == b 148 | : eq(op, "!=") ? a != b 149 | : eq(op, "<") ? a < b 150 | : eq(op, ">") ? a > b 151 | : eq(op, "<=") ? a <= b 152 | : eq(op, ">=") ? a >= b 153 | : eq(op, "<<") ? a << b 154 | : eq(op, ">>") ? a >> b 155 | : eq(op, "+") ? a + b 156 | : eq(op, "-") ? a - b 157 | : eq(op, "*") ? a * b 158 | : eq(op, "/") ? a / b 159 | : eq(op, "%") ? 
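/* modulo is the last recognized operator; anything else falls through to diag(), which returns 0 and fails the test */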
a % b 160 | : diag("unrecognized operator '%s'", op); 161 | va_list args; 162 | va_start(args, fmt); 163 | vok_at_loc(file, line, test, fmt, args); 164 | va_end(args); 165 | if (!test) { 166 | diag(" %d", a); 167 | diag(" %s", op); 168 | diag(" %d", b); 169 | } 170 | return test; 171 | } 172 | 173 | static int 174 | find_mem_diff (const char *a, const char *b, size_t n, size_t *offset) { 175 | size_t i; 176 | if (a == b) 177 | return 0; 178 | if (!a || !b) 179 | return 2; 180 | for (i = 0; i < n; i++) { 181 | if (a[i] != b[i]) { 182 | *offset = i; 183 | return 1; 184 | } 185 | } 186 | return 0; 187 | } 188 | 189 | int 190 | cmp_mem_at_loc (const char *file, int line, const void *got, 191 | const void *expected, size_t n, const char *fmt, ...) 192 | { 193 | size_t offset; 194 | int diff = find_mem_diff(got, expected, n, &offset); 195 | va_list args; 196 | va_start(args, fmt); 197 | vok_at_loc(file, line, !diff, fmt, args); 198 | va_end(args); 199 | if (diff == 1) { 200 | diag(" Difference starts at offset %d", offset); 201 | diag(" got: 0x%02x", ((unsigned char *)got)[offset]); 202 | diag(" expected: 0x%02x", ((unsigned char *)expected)[offset]); 203 | } 204 | else if (diff == 2) { 205 | diag(" got: %s", got ? "not NULL" : "NULL"); 206 | diag(" expected: %s", expected ? "not NULL" : "NULL"); 207 | } 208 | return !diff; 209 | } 210 | 211 | int 212 | diag (const char *fmt, ...) { 213 | va_list args; 214 | char *mesg, *line; 215 | int i; 216 | va_start(args, fmt); 217 | if (!fmt) 218 | return 0; 219 | mesg = vstrdupf(fmt, args); 220 | line = mesg; 221 | for (i = 0; *line; i++) { 222 | char c = mesg[i]; 223 | if (!c || c == '\n') { 224 | mesg[i] = '\0'; 225 | printf("# %s\n", line); 226 | if (!c) 227 | break; 228 | mesg[i] = c; 229 | line = mesg + i + 1; 230 | } 231 | } 232 | free(mesg); 233 | va_end(args); 234 | return 0; 235 | } 236 | 237 | int 238 | exit_status () { 239 | int retval = 0; 240 | if (expected_tests == NO_PLAN) { 241 | printf("1..%d\n", current_test); 242 | } 243 | else if (current_test != expected_tests) { 244 | diag("Looks like you planned %d test%s but ran %d.", 245 | expected_tests, expected_tests > 1 ? "s" : "", current_test); 246 | retval = 2; 247 | } 248 | if (failed_tests) { 249 | diag("Looks like you failed %d test%s of %d run.", 250 | failed_tests, failed_tests > 1 ? "s" : "", current_test); 251 | retval = 1; 252 | } 253 | return retval; 254 | } 255 | 256 | int 257 | bail_out (int ignore, const char *fmt, ...) { 258 | va_list args; 259 | va_start(args, fmt); 260 | printf("Bail out! "); 261 | vprintf(fmt, args); 262 | printf("\n"); 263 | va_end(args); 264 | exit(255); 265 | return 0; 266 | } 267 | 268 | void 269 | tap_skip (int n, const char *fmt, ...) { 270 | char *why; 271 | va_list args; 272 | va_start(args, fmt); 273 | why = vstrdupf(fmt, args); 274 | va_end(args); 275 | while (n --> 0) { 276 | printf("ok %d ", ++current_test); 277 | diag("skip %s\n", why); 278 | } 279 | free(why); 280 | } 281 | 282 | void 283 | tap_todo (int ignore, const char *fmt, ...) { 284 | va_list args; 285 | va_start(args, fmt); 286 | todo_mesg = vstrdupf(fmt, args); 287 | va_end(args); 288 | } 289 | 290 | void 291 | tap_end_todo () { 292 | free(todo_mesg); 293 | todo_mesg = NULL; 294 | } 295 | 296 | #ifndef _WIN32 297 | #include 298 | #include 299 | #include 300 | 301 | #if defined __APPLE__ || defined BSD 302 | #define MAP_ANONYMOUS MAP_ANON 303 | #endif 304 | 305 | /* Create a shared memory int to keep track of whether a piece of code executed 306 | dies. 
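The flag is written by the forked child and read back by the parent, which is why it must live in shared memory rather than in a plain global.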
to be used in the dies_ok and lives_ok macros. */ 307 | int 308 | tap_test_died (int status) { 309 | static int *test_died = NULL; 310 | int prev; 311 | if (!test_died) { 312 | test_died = mmap(0, sizeof (int), PROT_READ | PROT_WRITE, 313 | MAP_SHARED | MAP_ANONYMOUS, -1, 0); 314 | *test_died = 0; 315 | } 316 | prev = *test_died; 317 | *test_died = status; 318 | return prev; 319 | } 320 | 321 | int 322 | like_at_loc (int for_match, const char *file, int line, const char *got, 323 | const char *expected, const char *fmt, ...) 324 | { 325 | int test; 326 | regex_t re; 327 | va_list args; 328 | int err = regcomp(&re, expected, REG_EXTENDED); 329 | if (err) { 330 | char errbuf[256]; 331 | regerror(err, &re, errbuf, sizeof errbuf); 332 | fprintf(stderr, "Unable to compile regex '%s': %s at %s line %d\n", 333 | expected, errbuf, file, line); 334 | exit(255); 335 | } 336 | err = regexec(&re, got, 0, NULL, 0); 337 | regfree(&re); 338 | test = for_match ? !err : err; 339 | va_start(args, fmt); 340 | vok_at_loc(file, line, test, fmt, args); 341 | va_end(args); 342 | if (!test) { 343 | if (for_match) { 344 | diag(" '%s'", got); 345 | diag(" doesn't match: '%s'", expected); 346 | } 347 | else { 348 | diag(" '%s'", got); 349 | diag(" matches: '%s'", expected); 350 | } 351 | } 352 | return test; 353 | } 354 | #endif 355 | -------------------------------------------------------------------------------- /test/tap.h: -------------------------------------------------------------------------------- 1 | /* 2 | libtap - Write tests in C 3 | Copyright 2012 Jake Gelbman 4 | This file is licensed under the LGPL 5 | */ 6 | 7 | #ifndef __TAP_H__ 8 | #define __TAP_H__ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #ifndef va_copy 15 | #ifdef __va_copy 16 | #define va_copy __va_copy 17 | #else 18 | #define va_copy(d, s) ((d) = (s)) 19 | #endif 20 | #endif 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | int vok_at_loc (const char *file, int line, int test, const char *fmt, 27 | va_list args); 28 | int ok_at_loc (const char *file, int line, int test, const char *fmt, 29 | ...); 30 | int is_at_loc (const char *file, int line, const char *got, 31 | const char *expected, const char *fmt, ...); 32 | int isnt_at_loc (const char *file, int line, const char *got, 33 | const char *expected, const char *fmt, ...); 34 | int cmp_ok_at_loc (const char *file, int line, int a, const char *op, 35 | int b, const char *fmt, ...); 36 | int cmp_mem_at_loc (const char *file, int line, const void *got, 37 | const void *expected, size_t n, const char *fmt, ...); 38 | int bail_out (int ignore, const char *fmt, ...); 39 | void tap_plan (int tests, const char *fmt, ...); 40 | int diag (const char *fmt, ...); 41 | int exit_status (void); 42 | void tap_skip (int n, const char *fmt, ...); 43 | void tap_todo (int ignore, const char *fmt, ...); 44 | void tap_end_todo (void); 45 | 46 | #define NO_PLAN -1 47 | #define SKIP_ALL -2 48 | #define ok(...) ok_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 49 | #define is(...) is_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 50 | #define isnt(...) isnt_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 51 | #define cmp_ok(...) cmp_ok_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL) 52 | #define cmp_mem(...) cmp_mem_at_loc(__FILE__, __LINE__, __VA_ARGS__, NULL); 53 | #define plan(...) tap_plan(__VA_ARGS__, NULL) 54 | #define done_testing() return exit_status() 55 | #define BAIL_OUT(...) bail_out(0, "" __VA_ARGS__, NULL) 56 | #define pass(...) ok(1, "" __VA_ARGS__) 57 | #define fail(...) 
ok(0, "" __VA_ARGS__) 58 | 59 | #define skip(test, ...) do {if (test) {tap_skip(__VA_ARGS__, NULL); break;} 60 | #define end_skip } while (0) 61 | 62 | #define todo(...) tap_todo(0, "" __VA_ARGS__, NULL) 63 | #define end_todo tap_end_todo() 64 | 65 | #define dies_ok(...) dies_ok_common(1, __VA_ARGS__) 66 | #define lives_ok(...) dies_ok_common(0, __VA_ARGS__) 67 | 68 | #ifdef _WIN32 69 | #define like(...) tap_skip(1, "like is not implemented on Windows") 70 | #define unlike tap_skip(1, "unlike is not implemented on Windows") 71 | #define dies_ok_common(...) \ 72 | tap_skip(1, "Death detection is not supported on Windows") 73 | #else 74 | #define like(...) like_at_loc(1, __FILE__, __LINE__, __VA_ARGS__, NULL) 75 | #define unlike(...) like_at_loc(0, __FILE__, __LINE__, __VA_ARGS__, NULL) 76 | int like_at_loc (int for_match, const char *file, int line, 77 | const char *got, const char *expected, 78 | const char *fmt, ...); 79 | #include 80 | #include 81 | #include 82 | int tap_test_died (int status); 83 | #define dies_ok_common(for_death, code, ...) \ 84 | do { \ 85 | int cpid; \ 86 | int it_died; \ 87 | tap_test_died(1); \ 88 | cpid = fork(); \ 89 | switch (cpid) { \ 90 | case -1: \ 91 | perror("fork error"); \ 92 | exit(1); \ 93 | case 0: \ 94 | close(1); \ 95 | close(2); \ 96 | code \ 97 | tap_test_died(0); \ 98 | exit(0); \ 99 | } \ 100 | if (waitpid(cpid, NULL, 0) < 0) { \ 101 | perror("waitpid error"); \ 102 | exit(1); \ 103 | } \ 104 | it_died = tap_test_died(0); \ 105 | if (!it_died) \ 106 | {code} \ 107 | ok(for_death ? it_died : !it_died, "" __VA_ARGS__); \ 108 | } while (0) 109 | #endif 110 | 111 | #ifdef __cplusplus 112 | } 113 | #endif 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- /test/taskpoolstest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "tap.h" 5 | #include "partr.h" 6 | #include "taskpools.h" 7 | 8 | log_t plog; 9 | int16_t nthreads; 10 | __thread int16_t tid; 11 | 12 | #define NTASKS_PER_POOL 1024 13 | static ptask_t *tasks[NTASKS_PER_POOL]; 14 | 15 | /* thread barrier */ 16 | static int volatile barcnt; 17 | static int volatile barsense = 1; 18 | 19 | #define BARRIER_INIT() barcnt=nthreads 20 | 21 | #define BARRIER() do { \ 22 | mysense = !mysense; \ 23 | if (!__atomic_sub_fetch(&barcnt, 1, __ATOMIC_SEQ_CST)) { \ 24 | barcnt = nthreads; \ 25 | barsense = mysense; \ 26 | } else while (barsense != mysense); \ 27 | } while(0) 28 | 29 | typedef struct pthread_arg_tag { 30 | int16_t tid; 31 | } pthread_arg_t; 32 | 33 | 34 | static void *threadfun(void *targ) 35 | { 36 | int success; 37 | int mysense = 1; /* for the barrier */ 38 | 39 | pthread_arg_t *arg = (pthread_arg_t *)targ; 40 | tid = arg->tid; 41 | free(targ); 42 | 43 | if (tid == 0) { 44 | taskpools_init(); 45 | 46 | /* single-thread tests */ 47 | success = 1; 48 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 49 | if ((tasks[i] = task_alloc()) == NULL) 50 | success = 0; 51 | ok(success, "single-thread allocation"); 52 | ok(task_alloc() == NULL, "expected number of tasks"); 53 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 54 | task_free(tasks[i]); 55 | success = 1; 56 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) 57 | if ((tasks[i] = task_alloc()) == NULL) 58 | success = 0; 59 | ok(success, "free and alloc again"); 60 | } 61 | 62 | BARRIER(); 63 | 64 | int each = NTASKS_PER_POOL/nthreads, start = tid * each, end = start + each; 65 | if (tid == nthreads-1) end = 
NTASKS_PER_POOL; 66 | 67 | /* parallel free tests */ 68 | for (int i = start; i < end; ++i) { 69 | task_free(tasks[i]); 70 | __atomic_store_n(&tasks[i], NULL, __ATOMIC_SEQ_CST); 71 | } 72 | 73 | BARRIER(); 74 | 75 | if (tid == 0) { 76 | /* verify that all tasks were freed */ 77 | int success = 1; 78 | for (int16_t i = 0; i < NTASKS_PER_POOL; ++i) { 79 | if (tasks[i] != NULL) 80 | success = 0; 81 | tasks[i] = task_alloc(); 82 | } 83 | ok(success, "all tasks freed"); 84 | 85 | /* verify that tasks were freed out of order */ 86 | success = 0; 87 | int16_t last_index = tasks[0]->index; 88 | for (int16_t i = 1; i < NTASKS_PER_POOL; ++i) { 89 | if (tasks[i]->index != last_index++) 90 | success = 1; 91 | task_free(tasks[i]); 92 | } 93 | ok(success, "frees happened in parallel"); 94 | } 95 | 96 | BARRIER(); 97 | 98 | /* parallel alloc tests */ 99 | for (int i = start; i < end; ++i) 100 | tasks[i] = task_alloc(); 101 | 102 | BARRIER(); 103 | 104 | if (tid == 0) { 105 | /* verify that tasks were allocated concurrently */ 106 | success = 0; 107 | int16_t last_index = tasks[0]->index; 108 | for (int16_t i = 1; i < NTASKS_PER_POOL; ++i) { 109 | if (tasks[i]->index != last_index++) { 110 | success = 1; 111 | break; 112 | } 113 | } 114 | ok(success, "concurrent allocs"); 115 | } 116 | 117 | BARRIER(); 118 | 119 | /* TODO: parallel alloc and free tests */ 120 | 121 | if (tid == 0) { 122 | todo("parallel allocs/frees"); 123 | ok(0); 124 | end_todo; 125 | } 126 | 127 | BARRIER(); 128 | 129 | if (tid == 0) 130 | taskpools_destroy(); 131 | 132 | return NULL; 133 | } 134 | 135 | int main(int argc, char **argv) 136 | { 137 | LOG_SETUP(plog, LOG_LEVEL_INFO, stdout); 138 | LOG_INFO(plog, "taskpools test\n"); 139 | 140 | nthreads = DEFAULT_NUM_THREADS; 141 | char *cp = getenv(NUM_THREADS_NAME); 142 | if (cp) nthreads = strtol(cp, NULL, 10); 143 | LOG_INFO(plog, " %d threads\n", nthreads); 144 | 145 | BARRIER_INIT(); 146 | 147 | /* create threads */ 148 | pthread_t pthread_id; 149 | pthread_attr_t pthread_attr; 150 | pthread_attr_init(&pthread_attr); 151 | pthread_attr_setdetachstate(&pthread_attr, PTHREAD_CREATE_DETACHED); 152 | for (int16_t i = 1; i < nthreads; ++i) { 153 | pthread_arg_t *targ = (pthread_arg_t *)calloc(1, sizeof(pthread_arg_t)); 154 | targ->tid = i; 155 | pthread_create(&pthread_id, &pthread_attr, threadfun, targ); 156 | } 157 | pthread_attr_destroy(&pthread_attr); 158 | 159 | /* thread 0 enters the thread function too */ 160 | pthread_arg_t *targ = (pthread_arg_t *)calloc(1, sizeof(pthread_arg_t)); 161 | targ->tid = 0; 162 | threadfun(targ); 163 | 164 | done_testing(); 165 | } 166 | 167 | --------------------------------------------------------------------------------
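Taken together, the tests above exercise the runtime's whole external interface: partr_init()/partr_shutdown() bracket the run, partr_start() enters the runtime, partr_spawn()/partr_sync() fork and join individual tasks, and partr_parfor() forks a set of grains with an optional reduction. The program below is a minimal sketch of that usage pattern, condensed from test/parfortest.c; sum_range, add, and run are illustrative names, and it assumes libpartr, hwloc, and libconcurrent are built and linked as in test/makefile.

#include <stdio.h>
#include "partr.h"

/* grain body: sum the integers in [start, end) */
void *sum_range(void *arg, int64_t start, int64_t end)
{
    int64_t sum = 0;
    for (int64_t i = start; i < end; ++i)
        sum += i;
    return (void *)sum;          /* the result rides in the void* return */
}

/* reducer: combine two partial sums */
void *add(void *a, void *b)
{
    return (void *)((int64_t)a + (int64_t)b);
}

/* everything between partr_start() and return runs on the runtime's threads */
void *run(void *arg, int64_t start, int64_t end)
{
    int64_t total;
    partr_t t;
    partr_parfor(&t, sum_range, NULL, 1000, add);   /* fork 1000 iterations */
    partr_sync((void *)&total, t, 1);               /* join and reduce */
    printf("sum = %lld\n", (long long)total);
    return NULL;
}

int main(void)
{
    partr_init();
    partr_start(NULL, run, NULL, 0, 0);
    partr_shutdown();
    return 0;
}

As in the tests, 64-bit values are smuggled through void* arguments and results, which relies on pointers being at least 64 bits wide.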