├── README.md
├── benchmark.patch
├── benchmark
│   ├── LICENSE
│   ├── Makefile
│   ├── README.md
│   ├── align.h
│   ├── benchmark
│   ├── benchmark.h
│   ├── bits.h
│   ├── cas.c
│   ├── ccqueue.c
│   ├── ccqueue.h
│   ├── ccsynch.h
│   ├── cpumap.h
│   ├── delay.c
│   ├── delay.h
│   ├── driver
│   ├── faa.c
│   ├── halfhalf.c
│   ├── harness.c
│   ├── hzdptr.c
│   ├── hzdptr.h
│   ├── lcrq.c
│   ├── lcrq.h
│   ├── msqueue.c
│   ├── msqueue.h
│   ├── ncq.c
│   ├── ncq.h
│   ├── pairwise.c
│   ├── primitives.h
│   ├── queue.h
│   ├── scq.c
│   ├── scq.h
│   ├── scq2.c
│   ├── scq2.h
│   ├── scqd.c
│   ├── scqd.h
│   ├── wcq.c
│   ├── wcq.h
│   ├── wfqueue.c
│   ├── wfqueue.h
│   ├── xxhash.c
│   └── xxhash.h
├── lf
│   ├── c11.h
│   ├── config.h
│   ├── gcc_x86.h
│   └── lf.h
├── lfring_cas1.h
├── lfring_cas2.h
├── lfring_naive.h
└── wfring_cas2.h
/README.md: --------------------------------------------------------------------------------
1 | # A Scalable, Portable, and Memory-Efficient Lock-Free FIFO Queue 2 | 3 | * Publications 4 | 5 | wCQ: A Fast Wait-Free Queue with Bounded Memory Usage. 6 | In Proceedings of the 34th ACM Symposium on Parallelism in Algorithms 7 | and Architectures (SPAA'22). Philadelphia, PA, USA. 8 | 9 | [Paper](https://dl.acm.org/doi/pdf/10.1145/3490148.3538572) 10 | 11 | A Scalable, Portable, and Memory-Efficient Lock-Free FIFO Queue. 12 | In Proceedings of the 33rd International Symposium on DIStributed 13 | Computing (DISC'19). Budapest, Hungary. 14 | 15 | [Paper](http://drops.dagstuhl.de/opus/volltexte/2019/11335/pdf/LIPIcs-DISC-2019-28.pdf) 16 | 17 | * Source code license 18 | 19 | Copyright (c) 2019, 2021 Ruslan Nikolaev. All Rights Reserved. 20 | 21 | The SCQ/SCQD/SCQ2/NCQ/wCQ code is dual-licensed under 2-Clause BSD and MIT. 22 | 23 | * Description 24 | 25 | The benchmark code is in the "benchmark" directory, 26 | which is forked from the original WFQUEUE's benchmark 27 | available [here](https://github.com/chaoran/fast-wait-free-queue). 28 | For usage, see its original README file in "benchmark". 29 | 30 | Additional queues were implemented. See the description below for 31 | details. 32 | 33 | Both GCC and LLVM should be supported. Older versions may lack 34 | support for lfring\_cas2.h and wfring\_cas2.h and/or have 35 | suboptimal performance. We have tested the code with 36 | GCC 8.3.0+ and LLVM 7.0.1+. 37 | 38 | * CAS 39 | 40 | An implementation of the FAA test using CAS emulation (based on 41 | the original FAA test). 42 | 43 | * NCQ 44 | 45 | A naive implementation of the ring buffer. The implementation is in 46 | lfring\_naive.h. 47 | 48 | * SCQ 49 | 50 | This is a "bare-bones" SCQ which simply implements *enqueue* and *dequeue* 51 | (all platforms). The implementation is in lfring\_cas1.h. 52 | 53 | * SCQD 54 | 55 | This is a version which stores pointers as data entries through indirection 56 | (all platforms). The implementation is in lfring\_cas1.h. 57 | 58 | * SCQ2 59 | 60 | This is a version which stores pointers through double-width CAS 61 | (certain platforms such as x86-64). The implementation is in lfring\_cas2.h. 62 | 63 | * wCQ 64 | 65 | An implementation of a wait-free version of SCQ, which uses double-width 66 | CAS (certain platforms such as x86-64). The implementation is in 67 | wfring\_cas2.h.
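* Example

A minimal, single-threaded sketch of the SCQ ring interface, modeled on the
benchmark wrappers (scq.h/scq.c in "benchmark"). The ORDER value, the
alignment attribute, and the extra standard headers are illustrative
assumptions; check lfring\_cas1.h for the exact signatures.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include "lfring_cas1.h"

#define ORDER 4 /* the ring holds up to 2^ORDER indices */

/* Raw, over-aligned storage for the ring, as in benchmark/scq.h. */
static char ring[LFRING_SIZE(ORDER)] __attribute__((aligned(128)));

int main(void)
{
    lfring_init_empty((struct lfring *) ring, ORDER);

    /* SCQ stores small integer indices; SCQD/SCQ2 build pointer queues
     * on top of this scheme (see benchmark/scqd.c and benchmark/scq2.c). */
    lfring_enqueue((struct lfring *) ring, ORDER, 5, false);
    lfring_enqueue((struct lfring *) ring, ORDER, 7, false);

    size_t idx;
    while ((idx = lfring_dequeue((struct lfring *) ring, ORDER, false))
            != LFRING_EMPTY)
        printf("dequeued index %zu\n", idx);

    return 0;
}
```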
68 | -------------------------------------------------------------------------------- /benchmark.patch: -------------------------------------------------------------------------------- 1 | diff -urN benchmark.orig/align.h benchmark/align.h 2 | --- benchmark.orig/align.h 2020-12-22 17:17:27.685566626 -0500 3 | +++ benchmark/align.h 2020-12-22 17:14:37.945420646 -0500 4 | @@ -16,7 +16,7 @@ 5 | 6 | int ret = posix_memalign(&ptr, align, size); 7 | if (ret != 0) { 8 | - fprintf(stderr, strerror(ret)); 9 | + fprintf(stderr, "error: %s\n", strerror(ret)); 10 | abort(); 11 | } 12 | 13 | diff -urN benchmark.orig/cas.c benchmark/cas.c 14 | --- benchmark.orig/cas.c 1969-12-31 19:00:00.000000000 -0500 15 | +++ benchmark/cas.c 2019-10-22 10:31:31.124409717 -0400 16 | @@ -0,0 +1,26 @@ 17 | +#include "queue.h" 18 | +#include "primitives.h" 19 | + 20 | +void queue_init(queue_t * q, int nprocs) {} 21 | +void queue_register(queue_t * q, handle_t * hd, int id) 22 | +{ 23 | + *hd = id + 1; 24 | +} 25 | + 26 | +void enqueue(queue_t * q, handle_t * th, void * val) 27 | +{ 28 | + long p = q->P; 29 | + while (!CAS(&q->P, &p, p + 1)) 30 | + ; 31 | +} 32 | + 33 | +void * dequeue(queue_t * q, handle_t * th) 34 | +{ 35 | + long c = q->C; 36 | + while (!CAS(&q->C, &c, c + 1)) 37 | + ; 38 | + return (void *) (long) *th; 39 | +} 40 | + 41 | +void queue_free(queue_t * q, handle_t * h) {} 42 | + 43 | diff -urN benchmark.orig/halfhalf.c benchmark/halfhalf.c 44 | --- benchmark.orig/halfhalf.c 2019-10-22 10:44:25.757993930 -0400 45 | +++ benchmark/halfhalf.c 2019-10-22 10:31:31.124409717 -0400 46 | @@ -25,7 +25,8 @@ 47 | 48 | printf(" Number of operations: %ld\n", nops); 49 | 50 | - q = align_malloc(PAGE_SIZE, sizeof(queue_t)); 51 | + // FIXME: sizeof(queue_t) varies, allocate 4MB 52 | + q = align_malloc(PAGE_SIZE, 4194304); 53 | queue_init(q, nprocs); 54 | 55 | hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs])); 56 | @@ -60,7 +61,7 @@ 57 | else 58 | dequeue(q, th); 59 | 60 | - delay_exec(&state); 61 | +// delay_exec(&state); 62 | } 63 | 64 | return val; 65 | diff -urN benchmark.orig/lcrq.c benchmark/lcrq.c 66 | --- benchmark.orig/lcrq.c 2019-10-22 10:44:25.761993916 -0400 67 | +++ benchmark/lcrq.c 2019-10-22 10:31:31.124409717 -0400 68 | @@ -112,6 +112,7 @@ 69 | alloc: 70 | nrq = handle->next; 71 | 72 | + void *org_nrq = nrq; 73 | if (nrq == NULL) { 74 | nrq = align_malloc(PAGE_SIZE, sizeof(RingQueue)); 75 | init_ring(nrq); 76 | @@ -127,6 +128,9 @@ 77 | handle->next = NULL; 78 | return; 79 | } 80 | + 81 | + // Did not succeed, free the buffer 82 | + if (org_nrq == NULL) free(nrq); 83 | continue; 84 | } 85 | 86 | diff -urN benchmark.orig/Makefile benchmark/Makefile 87 | --- benchmark.orig/Makefile 2019-10-22 10:44:25.757993930 -0400 88 | +++ benchmark/Makefile 2019-10-22 10:31:31.124409717 -0400 89 | @@ -1,8 +1,9 @@ 90 | -TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay 91 | +TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay cas scq scq2 scqd ncq 92 | 93 | +# if using clang, please also specify -mcx16 for x86-64 94 | CC = gcc 95 | CFLAGS = -g -Wall -O3 -pthread -D_GNU_SOURCE 96 | -LDLIBS = -lpthread -lm 97 | +LDLIBS = -ljemalloc -lpthread -lm 98 | 99 | ifeq (${VERIFY}, 1) 100 | CFLAGS += -DVERIFY 101 | @@ -39,7 +40,12 @@ 102 | ccqueue: CFLAGS += -DCCQUEUE 103 | msqueue: CFLAGS += -DMSQUEUE 104 | faa: CFLAGS += -DFAAQ 105 | +cas: CFLAGS += -DFAAQ 106 | delay: CFLAGS += -DDELAY 107 | +scq: CFLAGS += -DSCQ 108 | +scqd: CFLAGS += -DSCQD 109 | +scq2: CFLAGS += -DSCQ2 110 | +ncq: CFLAGS += -DNCQ 111 | 112 | 
$(TESTS): harness.o 113 | ifeq (${HALFHALF}, 1) 114 | diff -urN benchmark.orig/ncq.c benchmark/ncq.c 115 | --- benchmark.orig/ncq.c 1969-12-31 19:00:00.000000000 -0500 116 | +++ benchmark/ncq.c 2019-10-22 10:31:31.124409717 -0400 117 | @@ -0,0 +1,29 @@ 118 | +#include 119 | +#include 120 | +#include 121 | +#include "ncq.h" 122 | + 123 | +void queue_init(queue_t * q, int nprocs) 124 | +{ 125 | + lfring_init_empty((struct lfring *) q->ring, NCQ_ORDER); 126 | +} 127 | + 128 | + 129 | +void queue_register(queue_t * q, handle_t * th, int id) 130 | +{ 131 | +} 132 | + 133 | +void enqueue(queue_t * q, handle_t * th, void * val) 134 | +{ 135 | + size_t eidx = (size_t) val; 136 | + lfring_enqueue((struct lfring *) q->ring, NCQ_ORDER, eidx, false); 137 | +} 138 | + 139 | +void * dequeue(queue_t * q, handle_t * th) 140 | +{ 141 | + return (void *) lfring_dequeue((struct lfring *) q->ring, NCQ_ORDER, false); 142 | +} 143 | + 144 | +void queue_free(queue_t * q, handle_t * h) 145 | +{ 146 | +} 147 | diff -urN benchmark.orig/ncq.h benchmark/ncq.h 148 | --- benchmark.orig/ncq.h 1969-12-31 19:00:00.000000000 -0500 149 | +++ benchmark/ncq.h 2019-10-22 10:31:31.124409717 -0400 150 | @@ -0,0 +1,23 @@ 151 | +#ifndef NCQ_H 152 | +#define NCQ_H 153 | + 154 | +#ifdef NCQ 155 | + 156 | +#include 157 | +#include "../lfring_naive.h" 158 | +#include "align.h" 159 | + 160 | +#define NCQ_ORDER 16 161 | +#define EMPTY (void *) LFRING_EMPTY 162 | + 163 | +typedef struct _queue_t { 164 | + char ring[LFRING_SIZE(NCQ_ORDER)]; 165 | +} queue_t DOUBLE_CACHE_ALIGNED; 166 | + 167 | +typedef struct _handle_t { 168 | + int pad; 169 | +} handle_t DOUBLE_CACHE_ALIGNED; 170 | + 171 | +#endif 172 | + 173 | +#endif /* end of include guard: NCQ_H */ 174 | diff -urN benchmark.orig/pairwise.c benchmark/pairwise.c 175 | --- benchmark.orig/pairwise.c 2019-10-22 10:44:25.761993916 -0400 176 | +++ benchmark/pairwise.c 2019-10-22 10:31:31.124409717 -0400 177 | @@ -26,7 +26,8 @@ 178 | 179 | printf(" Number of operations: %ld\n", nops); 180 | 181 | - q = align_malloc(PAGE_SIZE, sizeof(queue_t)); 182 | + // FIXME: sizeof(queue_t) varies, allocate 4MB 183 | + q = align_malloc(PAGE_SIZE, 4194304); 184 | queue_init(q, nprocs); 185 | 186 | hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs])); 187 | @@ -47,10 +48,10 @@ 188 | int i; 189 | for (i = 0; i < nops / nprocs; ++i) { 190 | enqueue(q, th, val); 191 | - delay_exec(&state); 192 | +// delay_exec(&state); 193 | 194 | val = dequeue(q, th); 195 | - delay_exec(&state); 196 | +// delay_exec(&state); 197 | } 198 | 199 | return val; 200 | diff -urN benchmark.orig/queue.h benchmark/queue.h 201 | --- benchmark.orig/queue.h 2019-10-22 10:44:25.761993916 -0400 202 | +++ benchmark/queue.h 2019-10-22 10:31:31.124409717 -0400 203 | @@ -28,6 +28,18 @@ 204 | typedef int queue_t; 205 | typedef int handle_t; 206 | 207 | +#elif NCQ 208 | +#include "ncq.h" 209 | + 210 | +#elif SCQ 211 | +#include "scq.h" 212 | + 213 | +#elif SCQD 214 | +#include "scqd.h" 215 | + 216 | +#elif SCQ2 217 | +#include "scq2.h" 218 | + 219 | #else 220 | #error "Please specify a queue implementation." 221 | 222 | diff -urN benchmark.orig/README.md benchmark/README.md 223 | --- benchmark.orig/README.md 2019-10-22 10:44:25.757993930 -0400 224 | +++ benchmark/README.md 2019-10-22 10:43:39.170156671 -0400 225 | @@ -1,4 +1,9 @@ 226 | -# Fast Wait Free Queue 227 | +# Benchmark 228 | + 229 | +The benchmark is forked from the "Fast Wait Free Queue" paper.
The 230 | +original code is 231 | +available [here](https://github.com/chaoran/fast-wait-free-queue). 232 | +See the original README file below. 233 | 234 | This is a benchmark framework for evaluating the performance of concurrent queues. Currently, it contains four concurrent queue implementations. They are: 235 | 236 | diff -urN benchmark.orig/scq2.c benchmark/scq2.c 237 | --- benchmark.orig/scq2.c 1969-12-31 19:00:00.000000000 -0500 238 | +++ benchmark/scq2.c 2019-10-22 10:31:31.124409717 -0400 239 | @@ -0,0 +1,34 @@ 240 | +#include 241 | +#include 242 | +#include 243 | +#include "scq2.h" 244 | + 245 | +void queue_init(queue_t * q, int nprocs) 246 | +{ 247 | + lfring_ptr_init_empty((struct lfring_ptr *) q->ring, SCQ2_ORDER); 248 | +} 249 | + 250 | + 251 | +void queue_register(queue_t * q, handle_t * th, int id) 252 | +{ 253 | +} 254 | + 255 | +void enqueue(queue_t * q, handle_t * th, void * val) 256 | +{ 257 | + lfring_ptr_enqueue((struct lfring_ptr *) q->ring, SCQ2_ORDER, val + 1, 258 | + false, true); 259 | +} 260 | + 261 | +void * dequeue(queue_t * q, handle_t * th) 262 | +{ 263 | + void *ptr; 264 | + if (!lfring_ptr_dequeue((struct lfring_ptr *) q->ring, SCQ2_ORDER, 265 | + &ptr, false)) 266 | + return EMPTY; 267 | + ptr--; 268 | + return ptr; 269 | +} 270 | + 271 | +void queue_free(queue_t * q, handle_t * h) 272 | +{ 273 | +} 274 | diff -urN benchmark.orig/scq2.h benchmark/scq2.h 275 | --- benchmark.orig/scq2.h 1969-12-31 19:00:00.000000000 -0500 276 | +++ benchmark/scq2.h 2019-10-22 10:31:31.124409717 -0400 277 | @@ -0,0 +1,23 @@ 278 | +#ifndef SCQ2_H 279 | +#define SCQ2_H 280 | + 281 | +#ifdef SCQ2 282 | + 283 | +#include 284 | +#include "../lfring_cas2.h" 285 | +#include "align.h" 286 | + 287 | +#define SCQ2_ORDER 15 288 | +#define EMPTY (void *) -1 289 | + 290 | +typedef struct _queue_t { 291 | + char ring[LFRING_PTR_SIZE(SCQ2_ORDER)]; 292 | +} queue_t DOUBLE_CACHE_ALIGNED; 293 | + 294 | +typedef struct _handle_t { 295 | + int pad; 296 | +} handle_t DOUBLE_CACHE_ALIGNED; 297 | + 298 | +#endif 299 | + 300 | +#endif /* end of include guard: SCQ_H */ 301 | diff -urN benchmark.orig/scq.c benchmark/scq.c 302 | --- benchmark.orig/scq.c 1969-12-31 19:00:00.000000000 -0500 303 | +++ benchmark/scq.c 2019-10-22 10:31:31.124409717 -0400 304 | @@ -0,0 +1,29 @@ 305 | +#include 306 | +#include 307 | +#include 308 | +#include "scq.h" 309 | + 310 | +void queue_init(queue_t * q, int nprocs) 311 | +{ 312 | + lfring_init_empty((struct lfring *) q->ring, SCQ_ORDER); 313 | +} 314 | + 315 | + 316 | +void queue_register(queue_t * q, handle_t * th, int id) 317 | +{ 318 | +} 319 | + 320 | +void enqueue(queue_t * q, handle_t * th, void * val) 321 | +{ 322 | + size_t eidx = (size_t) val; 323 | + lfring_enqueue((struct lfring *) q->ring, SCQ_ORDER, eidx, false); 324 | +} 325 | + 326 | +void * dequeue(queue_t * q, handle_t * th) 327 | +{ 328 | + return (void *) lfring_dequeue((struct lfring *) q->ring, SCQ_ORDER, false); 329 | +} 330 | + 331 | +void queue_free(queue_t * q, handle_t * h) 332 | +{ 333 | +} 334 | diff -urN benchmark.orig/scqd.c benchmark/scqd.c 335 | --- benchmark.orig/scqd.c 1969-12-31 19:00:00.000000000 -0500 336 | +++ benchmark/scqd.c 2019-10-22 10:31:31.128409708 -0400 337 | @@ -0,0 +1,39 @@ 338 | +#include 339 | +#include 340 | +#include 341 | +#include "scqd.h" 342 | + 343 | +void queue_init(queue_t * q, int nprocs) 344 | +{ 345 | + lfring_init_empty((struct lfring *) q->aq, SCQD_ORDER); 346 | + lfring_init_full((struct lfring *) q->fq, SCQD_ORDER); 347 | +} 348 | + 349 | + 350 | +void 
queue_register(queue_t * q, handle_t * th, int id) 351 | +{ 352 | +} 353 | + 354 | +void enqueue(queue_t * q, handle_t * th, void * val) 355 | +{ 356 | + size_t eidx; 357 | + eidx = lfring_dequeue((struct lfring *) q->fq, SCQD_ORDER, true); 358 | + if (eidx == LFRING_EMPTY) return; 359 | + q->val[eidx] = val; 360 | + lfring_enqueue((struct lfring *) q->aq, SCQD_ORDER, eidx, false); 361 | +} 362 | + 363 | +void * dequeue(queue_t * q, handle_t * th) 364 | +{ 365 | + size_t eidx; 366 | + void *val; 367 | + eidx = lfring_dequeue((struct lfring *) q->aq, SCQD_ORDER, false); 368 | + if (eidx == LFRING_EMPTY) return EMPTY; 369 | + val = q->val[eidx]; 370 | + lfring_enqueue((struct lfring *) q->fq, SCQD_ORDER, eidx, true); 371 | + return val; 372 | +} 373 | + 374 | +void queue_free(queue_t * q, handle_t * h) 375 | +{ 376 | +} 377 | diff -urN benchmark.orig/scqd.h benchmark/scqd.h 378 | --- benchmark.orig/scqd.h 1969-12-31 19:00:00.000000000 -0500 379 | +++ benchmark/scqd.h 2019-10-22 10:31:31.128409708 -0400 380 | @@ -0,0 +1,25 @@ 381 | +#ifndef SCQD_H 382 | +#define SCQD_H 383 | + 384 | +#ifdef SCQD 385 | + 386 | +#include 387 | +#include "../lfring_cas1.h" 388 | +#include "align.h" 389 | + 390 | +#define SCQD_ORDER 16 391 | +#define EMPTY (void *) LFRING_EMPTY 392 | + 393 | +typedef struct _queue_t { 394 | + char aq[LFRING_SIZE(SCQD_ORDER)]; 395 | + char fq[LFRING_SIZE(SCQD_ORDER)]; 396 | + void *val[(1U << SCQD_ORDER)]; 397 | +} queue_t DOUBLE_CACHE_ALIGNED; 398 | + 399 | +typedef struct _handle_t { 400 | + int pad; 401 | +} handle_t DOUBLE_CACHE_ALIGNED; 402 | + 403 | +#endif 404 | + 405 | +#endif /* end of include guard: SCQD_H */ 406 | diff -urN benchmark.orig/scq.h benchmark/scq.h 407 | --- benchmark.orig/scq.h 1969-12-31 19:00:00.000000000 -0500 408 | +++ benchmark/scq.h 2019-10-22 10:31:31.124409717 -0400 409 | @@ -0,0 +1,23 @@ 410 | +#ifndef SCQ_H 411 | +#define SCQ_H 412 | + 413 | +#ifdef SCQ 414 | + 415 | +#include 416 | +#include "../lfring_cas1.h" 417 | +#include "align.h" 418 | + 419 | +#define SCQ_ORDER 15 420 | +#define EMPTY (void *) LFRING_EMPTY 421 | + 422 | +typedef struct _queue_t { 423 | + char ring[LFRING_SIZE(SCQ_ORDER)]; 424 | +} queue_t DOUBLE_CACHE_ALIGNED; 425 | + 426 | +typedef struct _handle_t { 427 | + int pad; 428 | +} handle_t DOUBLE_CACHE_ALIGNED; 429 | + 430 | +#endif 431 | + 432 | +#endif /* end of include guard: SCQ_H */ 433 | -------------------------------------------------------------------------------- /benchmark/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Chaoran Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /benchmark/Makefile: -------------------------------------------------------------------------------- 1 | TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay cas scq scq2 scqd ncq wcq 2 | 3 | # if using clang, please also specify -mcx16 for x86-64 4 | CC = gcc 5 | CFLAGS = -g -Wall -O3 -pthread -D_GNU_SOURCE 6 | LDLIBS = -ljemalloc -lpthread -lm 7 | 8 | ifeq (${VERIFY}, 1) 9 | CFLAGS += -DVERIFY 10 | endif 11 | 12 | ifeq (${SANITIZE}, 1) 13 | CFLAGS += -fsanitize=address -fno-omit-frame-pointer 14 | LDLIBS += -lasan 15 | LDFLAGS = -fsanitize=address 16 | endif 17 | 18 | ifdef JEMALLOC_PATH 19 | LDFLAGS += -L${JEMALLOC_PATH}/lib -Wl,-rpath,${JEMALLOC_PATH}/lib 20 | LDLIBS += -ljemalloc 21 | endif 22 | 23 | all: $(TESTS) 24 | 25 | wfqueue0: CFLAGS += -DMAX_PATIENCE=0 26 | wfqueue0.o: wfqueue.c 27 | $(CC) $(CFLAGS) -c -o $@ $^ 28 | 29 | haswell: CFLAGS += -DGUADALUPE_COMPACT 30 | haswell: all 31 | 32 | mic: CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc 33 | mic: CFLAGS += -DGUADALUPE_MIC_COMPACT -DLOGN_OPS=6 34 | mic biou: $(filter-out lcrq,$(TESTS)) 35 | 36 | biou: CFLAGS += -DBIOU_COMPACT 37 | 38 | wfqueue wfqueue0: CFLAGS += -DWFQUEUE 39 | lcrq: CFLAGS += -DLCRQ 40 | ccqueue: CFLAGS += -DCCQUEUE 41 | msqueue: CFLAGS += -DMSQUEUE 42 | faa: CFLAGS += -DFAAQ 43 | cas: CFLAGS += -DFAAQ 44 | delay: CFLAGS += -DDELAY 45 | scq: CFLAGS += -DSCQ 46 | scqd: CFLAGS += -DSCQD 47 | scq2: CFLAGS += -DSCQ2 48 | ncq: CFLAGS += -DNCQ 49 | wcq: CFLAGS += -DWCQ 50 | 51 | $(TESTS): harness.o 52 | ifeq (${HALFHALF}, 1) 53 | $(TESTS): halfhalf.o 54 | else 55 | $(TESTS): pairwise.o 56 | endif 57 | 58 | msqueue lcrq: hzdptr.o xxhash.o 59 | 60 | clean: 61 | rm -f $(TESTS) *.o 62 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | The benchmark is forked from the "Fast Wait Free Queue" paper. The 4 | original code is 5 | available [here](https://github.com/chaoran/fast-wait-free-queue). 6 | See the original README file below. 7 | 8 | This is a benchmark framework for evaluating the performance of concurrent queues. Currently, it contains four concurrent queue implementations. They are: 9 | 10 | - A fast wait-free queue `wfqueue`, 11 | - Morrison and Afek's `lcrq`, 12 | - Fatourou and Kallimanis's `ccqueue`, and 13 | - Michael and Scott's `msqueue` 14 | 15 | The benchmark framework also includes a synthetic queue benchmark, `faa`, which emulates both an enqueue and a dequeue with a `fetch-and-add` primitive to test the performance of `fetch-and-add` on a system. 16 | 17 | The framework currently contains one benchmark, `pairwise`, in which all threads repeatedly execute pairs of enqueue and dequeue operations. Between two operations, `pairwise` uses a delay routine that adds an arbitrary delay (between 50~150ns) to avoid artificial long run scenarios, where a cache line is held by one thread for a long time. 
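For reference, the core of `pairwise` is roughly the following loop (condensed from `pairwise.c` in this directory; note that this fork comments out the `delay_exec` calls via `benchmark.patch`):

```c
/* Condensed from pairwise.c: q, hds, and nops are globals set up in init().
 * Each thread alternates enqueue and dequeue on the shared queue. */
void * benchmark(int id, int nprocs) {
  void * val = (void *) (intptr_t) (id + 1);
  handle_t * th = hds[id];

  delay_t state;
  delay_init(&state, id);

  int i;
  for (i = 0; i < nops / nprocs; ++i) {
    enqueue(q, th, val);
    delay_exec(&state);   /* the 50~150ns delay; disabled in this fork */

    val = dequeue(q, th);
    delay_exec(&state);
  }

  return val;
}
```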
18 | 19 | ## Requirements 20 | 21 | - **GCC 4.1.0 or later (GCC 4.7.3 or later recommended)**: the current implementations use GCC `__atomic` or `__sync` primitives for atomic memory access. 22 | - **Linux kernel 2.5.8 or later** 23 | - **glibc 2.3**: we use `sched_setaffinity` to bind threads to cores. 24 | - **atomic `CAS2`**: `lcrq` requires `CAS2`, a 16-byte-wide `compare-and-swap` primitive. This is available on most recent Intel processors and IBM Power8. 25 | - **jemalloc** (optional): `jemalloc` eliminates the bottleneck of the memory allocator. You can link with `jemalloc` by setting the `JEMALLOC_PATH` environment variable to the path where your `jemalloc` is installed. 26 | 27 | ## How to install 28 | 29 | Download one of the released source code tarballs, then execute the following commands. The filename may differ depending on the tarball you have downloaded. 30 | ``` 31 | $ tar zxf fast-wait-free-queue-1.0.0.tar.gz 32 | $ cd fast-wait-free-queue-1.0.0 33 | $ make 34 | ``` 35 | 36 | This should generate 7 binaries (or 6 if your system does not support `CAS2`, in which case `lcrq` will fail to compile): `wfqueue`, `wfqueue0`, `lcrq`, `ccqueue`, `msqueue`, `faa`, and `delay`. These are the `pairwise` benchmark compiled using different queue implementations. 37 | - `wfqueue0`: the same as `wfqueue` except that its `PATIENCE` is set to `0`. 38 | - `delay`: a synthetic benchmark used to measure the time spent in the delay routine. 39 | 40 | ## How to run 41 | 42 | You can execute a binary directly, using the number of threads as an argument. Without an argument, the execution will use all available cores on the system. 43 | 44 | For example, 45 | ``` 46 | ./wfqueue 8 47 | ``` 48 | runs `wfqueue` with 8 threads. 49 | 50 | If you would like to verify the result, compile the binary with `VERIFY=1 make`. Then executing a binary directly will print either `PASSED` or error messages. 51 | 52 | You can also use the `driver` script, which invokes a binary up to 10 times and measures the **mean of running times**, the **running time of the current run**, the **standard deviation**, and the **margin of error** (both in time and as a percentage) of each run. 53 | The script terminates when the **margin of error** is relatively small (**< 0.02**), or when it has invoked the binary 10 times. 54 | 55 | For example, 56 | ``` 57 | ./driver ./wfqueue 8 58 | ``` 59 | runs `wfqueue` with 8 threads up to 10 times and collects statistical results. 60 | 61 | You can use the `benchmark` script, which invokes `driver` on all combinations of a list of binaries and a list of numbers of threads, and reports the `mean running time` and `margin of error` for each combination. You can specify the list of binaries using the environment variable `TESTS`. You can specify the list of numbers of threads using the environment variable `PROCS`. 62 | 63 | The generated output of `benchmark` can be used as a datafile for gnuplot. The first column of `benchmark`'s output is the number of threads. Then every two columns are the `mean running time` and `margin of error` for each queue implementation. They are in the same order as they are specified in `TESTS`. 64 | 65 | For example, 66 | ``` 67 | TESTS=wfqueue:lcrq:faa:delay PROCS=1:2:4:8 ./benchmark 68 | ``` 69 | runs each of `wfqueue`, `lcrq`, `faa`, and `delay` using 1, 2, 4, and 8 threads.
70 | 71 | Then you can plot them using, 72 | ``` 73 | set logscale x 2 74 | plot "t" using 1:(20000/($2-$8)) t "wfqueue" w lines, \ 75 | "t" using 1:(20000/($4-$8)) t "lcrq" w lines, \ 76 | "t" using 1:(20000/($6-$8)) t "faa" w lines 77 | ``` 78 | 79 | ## How to map threads to cores 80 | 81 | By default, the framework will map a thread with id `i` to the core with id `i % p`, where *p* is the number of available cores on a system; you can check each core's id in `/proc/cpuinfo`. 82 | 83 | To implement a custom mapping, you can add a `cpumap` function in `cpumap.h`. The signature of `cpumap` is 84 | ``` 85 | int cpumap(int id, int nprocs) 86 | ``` 87 | where `id` is the id of the current thread and `nprocs` is the number of threads. `cpumap` should return the corresponding core id for the thread. `cpumap.h` contains several examples of the `cpumap` function. You should guard the definition of the added `cpumap` using a conditional macro, and add the macro to `CFLAGS` in the makefile. 88 | 89 | ## How to add a new queue implementation 90 | 91 | We use a generic pointer `void *` to represent a value that can be stored in the queue. 92 | A queue should implement the queue interface, defined in `queue.h`. 93 | 94 | - `queue_t`: the struct type of the queue, 95 | - `handle_t`: a thread's handle to the queue, used to store thread local state, 96 | - `void queue_init(queue_t * q, int nprocs)`: initialize a queue; this will be called only once, 97 | - `void queue_register(queue_t * q, handle_t * th, int id)`: initialize a thread's handle; this will be called by every thread that uses the queue, 98 | - `void enqueue(queue_t * q, handle_t * th, void * val)`: enqueues a value, 99 | - `void * dequeue(queue_t * q, handle_t * th)`: dequeues a value, 100 | - `void queue_free(queue_t * q, handle_t * h)`: deallocate a queue and clean up all resources associated with it, 101 | - `EMPTY`: a value that will be returned if a `dequeue` fails. This should be a macro that is defined in the header file. 102 | 103 | ## How to add a new benchmark 104 | 105 | A benchmark should implement the benchmark interface, defined in `benchmark.h`, and interact with a queue using the queue interface. 106 | The benchmark interface includes: 107 | 108 | - `void init(int nprocs, int n)`: performs initialization of the benchmark; called only once at the beginning. 109 | - `void thread_init(int id, int nprocs)`: performs thread local initialization of the benchmark; called once per thread, after `init` but before `benchmark`. 110 | - `void * benchmark(int id, int nprocs)`: runs the benchmark once; called by each thread. Each call will be timed and reported as one iteration. It can return a result, which will be passed to `verify` to verify correctness. 111 | - `int verify(int nprocs, void ** results)`: should verify the result of each thread and return `0` on success and a non-zero value on error.
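`benchmark.h` also declares `void thread_exit(int id, int nprocs)`, which is called once per thread after the last `benchmark` call and is the usual place to call `queue_free`. Putting the two interfaces together, a minimal benchmark could look like the sketch below; it is modeled on `pairwise.c` and `halfhalf.c`, and the fixed operation count is an illustrative assumption. Following the Makefile, such a file would be linked with `harness.o` in place of `pairwise.o`/`halfhalf.o`.

```c
/* A minimal benchmark sketch (modeled on pairwise.c/halfhalf.c): every
 * thread enqueues its own id and immediately dequeues a value. */
#include <stdint.h>
#include "align.h"
#include "queue.h"

static queue_t * q;
static handle_t ** hds;
static long nops;

void init(int nprocs, int n) {
  nops = 1000000; /* illustrative fixed operation count */

  /* This fork allocates a fixed 4MB here because sizeof(queue_t) varies
   * between queue implementations (see benchmark.patch). */
  q = align_malloc(PAGE_SIZE, 4194304);
  queue_init(q, nprocs);

  hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
}

void thread_init(int id, int nprocs) {
  hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t));
  queue_register(q, hds[id], id);
}

void * benchmark(int id, int nprocs) {
  void * val = (void *) (intptr_t) (id + 1);
  long i;
  for (i = 0; i < nops / nprocs; ++i) {
    enqueue(q, hds[id], val);
    val = dequeue(q, hds[id]);
  }
  return val;
}

void thread_exit(int id, int nprocs) {
  queue_free(q, hds[id]);
}

int verify(int nprocs, void ** results) {
  return 0; /* no verification in this sketch */
}
```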
112 | -------------------------------------------------------------------------------- /benchmark/align.h: -------------------------------------------------------------------------------- 1 | #ifndef ALIGN_H 2 | #define ALIGN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #define PAGE_SIZE 4096 9 | #define CACHE_LINE_SIZE 64 10 | #define CACHE_ALIGNED __attribute__((aligned(CACHE_LINE_SIZE))) 11 | #define DOUBLE_CACHE_ALIGNED __attribute__((aligned(2 * CACHE_LINE_SIZE))) 12 | 13 | static inline void * align_malloc(size_t align, size_t size) 14 | { 15 | void * ptr; 16 | 17 | int ret = posix_memalign(&ptr, align, size); 18 | if (ret != 0) { 19 | fprintf(stderr, "error: %s\n", strerror(ret)); 20 | abort(); 21 | } 22 | 23 | return ptr; 24 | } 25 | 26 | #endif /* end of include guard: ALIGN_H */ 27 | -------------------------------------------------------------------------------- /benchmark/benchmark: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$TESTS" ]; then 4 | TESTS=(wfqueue wfqueue0 faa lcrq ccqueue msqueue delay) 5 | else 6 | IFS=':' read -r -a TESTS <<< "${TESTS}" 7 | fi 8 | 9 | if [ -z "$PROCS" ]; then 10 | PROCS=(1 2 4 8) 11 | else 12 | IFS=':' read -r -a PROCS <<< "${PROCS}" 13 | fi 14 | 15 | printf '#! Host: %s\n' $( hostname ) 16 | printf '#! Benchmarks: %s\n' "${TESTS[*]}" 17 | printf '#! Threads: %s\n' "${PROCS[*]}" 18 | 19 | for j in ${PROCS[@]}; do 20 | printf '%d' $j 21 | for i in ${TESTS[@]}; do 22 | echo -ne \ 23 | "$(./driver ./$i $j | tail -n 1 | awk '{printf " %.2f %.2f", $3, $5}')" 24 | done 25 | printf '\n' 26 | done 27 | -------------------------------------------------------------------------------- /benchmark/benchmark.h: -------------------------------------------------------------------------------- 1 | #ifndef BENCHMARK_H 2 | #define BENCHMARK_H 3 | 4 | extern void init(int nprocs, int n); 5 | extern void thread_init(int id, int nprocs); 6 | extern void * benchmark(int id, int nprocs); 7 | extern void thread_exit(int id, int nprocs); 8 | extern int verify(int nprocs, void ** results); 9 | 10 | #endif /* end of include guard: BENCHMARK_H */ 11 | -------------------------------------------------------------------------------- /benchmark/bits.h: -------------------------------------------------------------------------------- 1 | #ifndef BITS_H 2 | #define BITS_H 3 | 4 | static void * bits_join(int hi, int lo) 5 | { 6 | intptr_t int64 = hi; 7 | int64 <<= 32; 8 | int64 += lo; 9 | return (void *) int64; 10 | } 11 | 12 | static int bits_lo(void * ptr) 13 | { 14 | intptr_t int64 = (intptr_t) ptr; 15 | int64 &= 0x00000000ffffffff; 16 | return (int) int64; 17 | } 18 | 19 | static int bits_hi(void * ptr) 20 | { 21 | intptr_t int64 = (intptr_t) ptr; 22 | int64 >>= 32; 23 | return (int) int64; 24 | } 25 | 26 | #endif /* end of include guard: BITS_H */ 27 | -------------------------------------------------------------------------------- /benchmark/cas.c: -------------------------------------------------------------------------------- 1 | #include "queue.h" 2 | #include "primitives.h" 3 | 4 | void queue_init(queue_t * q, int nprocs) {} 5 | void queue_register(queue_t * q, handle_t * hd, int id) 6 | { 7 | *hd = id + 1; 8 | } 9 | 10 | void enqueue(queue_t * q, handle_t * th, void * val) 11 | { 12 | long p = q->P; 13 | while (!CAS(&q->P, &p, p + 1)) 14 | ; 15 | } 16 | 17 | void * dequeue(queue_t * q, handle_t * th) 18 | { 19 | long c = q->C; 20 | while (!CAS(&q->C, &c, c + 1)) 21 | ; 22 | return (void *) 
(long) *th; 23 | } 24 | 25 | void queue_free(queue_t * q, handle_t * h) {} 26 | 27 | -------------------------------------------------------------------------------- /benchmark/ccqueue.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "delay.h" 4 | #include "ccqueue.h" 5 | 6 | static inline 7 | void serialEnqueue(void * state, void * data) 8 | { 9 | node_t * volatile * tail = (node_t **) state; 10 | node_t * node = (node_t *) data; 11 | 12 | (*tail)->next = node; 13 | *tail = node; 14 | } 15 | 16 | static inline 17 | void serialDequeue(void * state, void * data) 18 | { 19 | node_t * volatile * head = (node_t **) state; 20 | node_t ** ptr = (node_t **) data; 21 | 22 | node_t * node = *head; 23 | node_t * next = node->next; 24 | 25 | if (next) { 26 | node->data = next->data; 27 | *head = next; 28 | } else { 29 | node = (void *) -1; 30 | } 31 | 32 | *ptr = node; 33 | } 34 | 35 | void queue_init(queue_t * queue, int nprocs) 36 | { 37 | ccsynch_init(&queue->enq); 38 | ccsynch_init(&queue->deq); 39 | 40 | node_t * dummy = align_malloc(CACHE_LINE_SIZE, sizeof(node_t)); 41 | dummy->data = 0; 42 | dummy->next = NULL; 43 | 44 | queue->head = dummy; 45 | queue->tail = dummy; 46 | } 47 | 48 | void queue_register(queue_t * queue, handle_t * handle, int id) 49 | { 50 | ccsynch_handle_init(&handle->enq); 51 | ccsynch_handle_init(&handle->deq); 52 | 53 | handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(node_t)); 54 | } 55 | 56 | void enqueue(queue_t * queue, handle_t * handle, void * data) 57 | { 58 | node_t * node = handle->next; 59 | 60 | if (node) handle->next = NULL; 61 | else node = align_malloc(CACHE_LINE_SIZE, sizeof(node_t)); 62 | 63 | node->data = data; 64 | node->next = NULL; 65 | 66 | ccsynch_apply(&queue->enq, &handle->enq, &serialEnqueue, &queue->tail, node); 67 | } 68 | 69 | void * dequeue(queue_t * queue, handle_t * handle) 70 | { 71 | node_t * node; 72 | ccsynch_apply(&queue->deq, &handle->deq, &serialDequeue, &queue->head, &node); 73 | 74 | void * data; 75 | 76 | if (node == (void *) -1) { 77 | data = (void *) -1; 78 | } else { 79 | data = node->data; 80 | if (handle->next) free(node); 81 | else handle->next = node; 82 | } 83 | 84 | return data; 85 | } 86 | 87 | void queue_free(int id, int nprocs) {} 88 | -------------------------------------------------------------------------------- /benchmark/ccqueue.h: -------------------------------------------------------------------------------- 1 | #ifndef CCQUEUE_H 2 | #define CCQUEUE_H 3 | 4 | #ifdef CCQUEUE 5 | #include "ccsynch.h" 6 | 7 | #define EMPTY (void *) -1 8 | 9 | typedef struct _node_t { 10 | struct _node_t * next CACHE_ALIGNED; 11 | void * volatile data; 12 | } node_t; 13 | 14 | typedef struct _queue_t { 15 | ccsynch_t enq DOUBLE_CACHE_ALIGNED; 16 | ccsynch_t deq DOUBLE_CACHE_ALIGNED; 17 | node_t * head DOUBLE_CACHE_ALIGNED; 18 | node_t * tail DOUBLE_CACHE_ALIGNED; 19 | } queue_t DOUBLE_CACHE_ALIGNED; 20 | 21 | typedef struct _handle_t { 22 | ccsynch_handle_t enq; 23 | ccsynch_handle_t deq; 24 | node_t * next; 25 | } handle_t DOUBLE_CACHE_ALIGNED; 26 | 27 | #endif 28 | 29 | #endif /* end of include guard: CCQUEUE_H */ 30 | -------------------------------------------------------------------------------- /benchmark/ccsynch.h: -------------------------------------------------------------------------------- 1 | #ifndef _CCSYNCH_H_ 2 | #define _CCSYNCH_H_ 3 | 4 | #include 5 | #include "align.h" 6 | #include "primitives.h" 7 | 8 | typedef struct _ccsynch_node_t { 9 
| struct _ccsynch_node_t * volatile next CACHE_ALIGNED; 10 | void * volatile data; 11 | int volatile status CACHE_ALIGNED; 12 | } ccsynch_node_t; 13 | 14 | typedef struct _ccsynch_handle_t { 15 | struct _ccsynch_node_t * next; 16 | } ccsynch_handle_t; 17 | 18 | typedef struct _ccsynch_t { 19 | struct _ccsynch_node_t * volatile tail DOUBLE_CACHE_ALIGNED; 20 | } ccsynch_t; 21 | 22 | #define CCSYNCH_WAIT 0x0 23 | #define CCSYNCH_READY 0x1 24 | #define CCSYNCH_DONE 0x3 25 | 26 | static inline 27 | void ccsynch_apply(ccsynch_t * synch, ccsynch_handle_t * handle, 28 | void (*apply)(void *, void *), void * state, void * data) 29 | { 30 | ccsynch_node_t * next = handle->next; 31 | next->next = NULL; 32 | next->status = CCSYNCH_WAIT; 33 | 34 | ccsynch_node_t * curr = SWAPra(&synch->tail, next); 35 | handle->next = curr; 36 | 37 | int status = ACQUIRE(&curr->status); 38 | 39 | if (status == CCSYNCH_WAIT) { 40 | curr->data = data; 41 | RELEASE(&curr->next, next); 42 | 43 | do { 44 | PAUSE(); 45 | status = ACQUIRE(&curr->status); 46 | } while (status == CCSYNCH_WAIT); 47 | } 48 | 49 | if (status != CCSYNCH_DONE) { 50 | apply(state, data); 51 | 52 | curr = next; 53 | next = ACQUIRE(&curr->next); 54 | 55 | int count = 0; 56 | const int CCSYNCH_HELP_BOUND = 256; 57 | 58 | while (next && count++ < CCSYNCH_HELP_BOUND) { 59 | apply(state, curr->data); 60 | RELEASE(&curr->status, CCSYNCH_DONE); 61 | 62 | curr = next; 63 | next = ACQUIRE(&curr->next); 64 | } 65 | 66 | RELEASE(&curr->status, CCSYNCH_READY); 67 | } 68 | } 69 | 70 | static inline void ccsynch_init(ccsynch_t * synch) 71 | { 72 | ccsynch_node_t * node = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t)); 73 | node->next = NULL; 74 | node->status = CCSYNCH_READY; 75 | 76 | synch->tail = node; 77 | } 78 | 79 | static inline void ccsynch_handle_init(ccsynch_handle_t * handle) 80 | { 81 | handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t)); 82 | } 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /benchmark/cpumap.h: -------------------------------------------------------------------------------- 1 | #ifndef CPUMAP_H 2 | #define CPUMAP_H 3 | 4 | #include 5 | 6 | #ifdef GUADALUPE_SPREAD 7 | int cpumap(int i, int nprocs) 8 | { 9 | return (i / 36) * 36 + (i % 2) * 18 + (i % 36 / 2); 10 | } 11 | 12 | #elif GUADALUPE_OVERSUB 13 | int cpumap(int i, int nprocs) { 14 | return (i % 18); 15 | } 16 | 17 | #elif GUADALUPE_COMPACT 18 | int cpumap(int i, int nprocs) 19 | { 20 | return (i % 2) * 36 + i / 2; 21 | } 22 | 23 | #elif GUADALUPE_MIC_COMPACT 24 | int cpumap(int i, int nprocs) 25 | { 26 | return (i + 1) % 228; 27 | } 28 | 29 | #elif LES_SPREAD 30 | int cpumap(int i, int nprocs) 31 | { 32 | return i % 4 * 12 + i / 4 % 12; 33 | } 34 | 35 | #elif BIOU_COMPACT 36 | int cpumap(int i, int nprocs) 37 | { 38 | return (i % 2) * 32 + i / 2; 39 | } 40 | 41 | #else 42 | int cpumap(int id, int nprocs) 43 | { 44 | return id % nprocs; 45 | } 46 | 47 | #endif 48 | 49 | #endif /* end of include guard: CPUMAP_H */ 50 | -------------------------------------------------------------------------------- /benchmark/delay.c: -------------------------------------------------------------------------------- 1 | #include "queue.h" 2 | #include "primitives.h" 3 | 4 | void queue_init(queue_t * q, int nprocs) {} 5 | void queue_register(queue_t * q, handle_t * hd, int id) 6 | { 7 | *hd = id + 1; 8 | } 9 | 10 | void enqueue(queue_t * q, handle_t * th, void * val) 11 | { 12 | } 13 | 14 | void * dequeue(queue_t * q, 
handle_t * th) 15 | { 16 | return (void *) (long) *th; 17 | } 18 | 19 | void queue_free(queue_t * q, handle_t * h) {} 20 | 21 | -------------------------------------------------------------------------------- /benchmark/delay.h: -------------------------------------------------------------------------------- 1 | #ifndef DELAY_H 2 | #define DELAY_H 3 | 4 | //#include 5 | #include 6 | 7 | typedef struct drand48_data delay_t; 8 | 9 | static inline void delay_init(delay_t * state, int id) 10 | { 11 | srand48_r(id, state); 12 | } 13 | 14 | static inline void delay_exec(delay_t * state) 15 | { 16 | long n; 17 | lrand48_r(state, &n); 18 | 19 | int j; 20 | for (j = 50; j < 50 + n % 100; ++j) { 21 | __asm__ ("nop"); 22 | } 23 | } 24 | 25 | #endif /* end of include guard: DELAY_H */ 26 | -------------------------------------------------------------------------------- /benchmark/driver: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | T90=( \ 4 | 6.314 2.920 2.353 2.132 2.015 1.943 1.895 1.860 1.833 1.812 \ 5 | 1.796 1.782 1.771 1.761 1.753 1.746 1.740 1.734 1.729 1.725 \ 6 | 1.721 1.717 1.714 1.711 1.708 1.706 1.703 1.701 1.699 1.697 \ 7 | ) 8 | 9 | T95=( \ 10 | 12.71 4.303 3.182 2.776 2.571 2.447 2.365 2.306 2.262 2.228 \ 11 | 2.201 2.179 2.160 2.145 2.131 2.120 2.110 2.101 2.093 2.086 \ 12 | 2.080 2.074 2.069 2.064 2.060 2.056 2.052 2.048 2.045 2.042 \ 13 | ) 14 | 15 | TIMES[0]=$($@ | grep Mean | awk '{ print $5 }') 16 | SUM=${TIMES[0]} 17 | printf '#%-2d %.2f\n' 1 ${TIMES[0]} 18 | 19 | i=1 20 | while true; do 21 | TIME=$($@ | grep Mean | awk '{ print $5 }') 22 | TIMES[$i]=$TIME 23 | SUM=$(echo "$SUM + $TIME" | bc) 24 | N=$(($i + 1)) 25 | 26 | MEAN=$(echo "$SUM / $N" | bc -l) 27 | 28 | STD=0 29 | for j in "${TIMES[@]}"; do 30 | STD=$(echo "($j - $MEAN) ^ 2 + $STD" | bc -l) 31 | done 32 | STD=$(echo "sqrt ($STD / $i)" | bc -l) 33 | 34 | ERR=$(echo "${T95[$i]} * $STD / sqrt($N)" | bc -l) 35 | PRECISION=$(echo "$ERR / $MEAN" | bc -l) 36 | 37 | printf '#%-2d %.2f %.2f %.4f %.2f %.3f\n' \ 38 | $N $TIME $MEAN $STD $ERR $PRECISION 39 | 40 | if (($N >= 10 || $N >= 5 && $(echo "$PRECISION < 0.02" | bc) == 1)); then 41 | break 42 | else 43 | i=$N 44 | fi 45 | done 46 | 47 | -------------------------------------------------------------------------------- /benchmark/faa.c: -------------------------------------------------------------------------------- 1 | #include "queue.h" 2 | #include "primitives.h" 3 | 4 | void queue_init(queue_t * q, int nprocs) {} 5 | void queue_register(queue_t * q, handle_t * hd, int id) 6 | { 7 | *hd = id + 1; 8 | } 9 | 10 | void enqueue(queue_t * q, handle_t * th, void * val) 11 | { 12 | FAA(&q->P, 1); 13 | } 14 | 15 | void * dequeue(queue_t * q, handle_t * th) 16 | { 17 | FAA(&q->C, 1); 18 | return (void *) (long) *th; 19 | } 20 | 21 | void queue_free(queue_t * q, handle_t * h) {} 22 | 23 | -------------------------------------------------------------------------------- /benchmark/halfhalf.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "delay.h" 5 | #include "queue.h" 6 | 7 | #ifndef LOGN_OPS 8 | #define LOGN_OPS 7 9 | #endif 10 | 11 | static long nops; 12 | static queue_t * q; 13 | static handle_t ** hds; 14 | 15 | void init(int nprocs, int logn) { 16 | /** Use 10^7 as default input size. */ 17 | if (logn == 0) logn = LOGN_OPS; 18 | 19 | /** Compute the number of ops to perform. 
*/ 20 | nops = 1; 21 | int i; 22 | for (i = 0; i < logn; ++i) { 23 | nops *= 10; 24 | } 25 | 26 | printf(" Number of operations: %ld\n", nops); 27 | 28 | // FIXME: sizeof(queue_t) varies, allocate 4MB 29 | q = align_malloc(PAGE_SIZE, 4194304); 30 | queue_init(q, nprocs); 31 | 32 | hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs])); 33 | } 34 | 35 | void thread_init(int id, int nprocs) { 36 | hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t)); 37 | queue_register(q, hds[id], id); 38 | } 39 | 40 | void thread_exit(int id, int nprocs) { 41 | queue_free(q, hds[id]); 42 | } 43 | 44 | void * benchmark(int id, int nprocs) { 45 | void * val = (void *) (intptr_t) (id + 1); 46 | handle_t * th = hds[id]; 47 | 48 | delay_t state; 49 | delay_init(&state, id); 50 | 51 | struct drand48_data rstate; 52 | srand48_r(id, &rstate); 53 | 54 | int i; 55 | for (i = 0; i < nops / nprocs; ++i) { 56 | long n; 57 | lrand48_r(&rstate, &n); 58 | 59 | if (n % 2 == 0) 60 | enqueue(q, th, val); 61 | else 62 | dequeue(q, th); 63 | 64 | // delay_exec(&state); 65 | } 66 | 67 | return val; 68 | } 69 | 70 | int verify(int nprocs, void ** results) { 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /benchmark/harness.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "bits.h" 10 | #include "cpumap.h" 11 | #include "benchmark.h" 12 | 13 | #ifndef NUM_ITERS 14 | #define NUM_ITERS 5 15 | #endif 16 | 17 | #ifndef MAX_PROCS 18 | #define MAX_PROCS 512 19 | #endif 20 | 21 | #ifndef MAX_ITERS 22 | #define MAX_ITERS 20 23 | #endif 24 | 25 | #ifndef COV_THRESHOLD 26 | #define COV_THRESHOLD 0.02 27 | #endif 28 | 29 | static pthread_barrier_t barrier; 30 | static double times[MAX_ITERS]; 31 | static double means[MAX_ITERS]; 32 | static double covs[MAX_ITERS]; 33 | static volatile int target; 34 | 35 | static size_t elapsed_time(size_t us) 36 | { 37 | struct timeval t; 38 | gettimeofday(&t, NULL); 39 | return t.tv_sec * 1000000 + t.tv_usec - us; 40 | } 41 | 42 | static double compute_mean(const double * times) 43 | { 44 | int i; 45 | double sum = 0; 46 | 47 | for (i = 0; i < NUM_ITERS; ++i) { 48 | sum += times[i]; 49 | } 50 | 51 | return sum / NUM_ITERS; 52 | } 53 | 54 | static double compute_cov(const double * times, double mean) 55 | { 56 | double variance = 0; 57 | 58 | int i; 59 | for (i = 0; i < NUM_ITERS; ++i) { 60 | variance += (times[i] - mean) * (times[i] - mean); 61 | } 62 | 63 | variance /= NUM_ITERS; 64 | 65 | double cov = sqrt(variance);; 66 | cov /= mean; 67 | return cov; 68 | } 69 | 70 | static size_t reduce_min(long val, int id, int nprocs) 71 | { 72 | static long buffer[MAX_PROCS]; 73 | 74 | buffer[id] = val; 75 | pthread_barrier_wait(&barrier); 76 | 77 | long min = LONG_MAX; 78 | int i; 79 | for (i = 0; i < nprocs; ++i) { 80 | if (buffer[i] < min) min = buffer[i]; 81 | } 82 | 83 | return min; 84 | } 85 | 86 | static void report(int id, int nprocs, int i, long us) 87 | { 88 | long ms = reduce_min(us, id, nprocs); 89 | 90 | if (id == 0) { 91 | times[i] = ms / 1000.0; 92 | printf(" #%d elapsed time: %.2f ms\n", i + 1, times[i]); 93 | 94 | if (i + 1 >= NUM_ITERS) { 95 | int n = i + 1 - NUM_ITERS; 96 | 97 | means[i] = compute_mean(times + n); 98 | covs[i] = compute_cov(times + n, means[i]); 99 | 100 | if (covs[i] < COV_THRESHOLD) { 101 | target = i; 102 | } 103 | } 104 | } 105 | 106 | 
pthread_barrier_wait(&barrier); 107 | } 108 | 109 | static void * thread(void * bits) 110 | { 111 | int id = bits_hi(bits); 112 | int nprocs = bits_lo(bits); 113 | 114 | cpu_set_t set; 115 | CPU_ZERO(&set); 116 | 117 | int cpu = cpumap(id, nprocs); 118 | CPU_SET(cpu, &set); 119 | sched_setaffinity(0, sizeof(set), &set); 120 | 121 | thread_init(id, nprocs); 122 | pthread_barrier_wait(&barrier); 123 | 124 | int i; 125 | void * result = NULL; 126 | 127 | for (i = 0; i < MAX_ITERS && target == 0; ++i) { 128 | long us = elapsed_time(0); 129 | result = benchmark(id, nprocs); 130 | pthread_barrier_wait(&barrier); 131 | us = elapsed_time(us); 132 | report(id, nprocs, i, us); 133 | } 134 | 135 | thread_exit(id, nprocs); 136 | return result; 137 | } 138 | 139 | int main(int argc, const char *argv[]) 140 | { 141 | int nprocs = 0; 142 | int n = 0; 143 | 144 | /** The first argument is nprocs. */ 145 | if (argc > 1) { 146 | nprocs = atoi(argv[1]); 147 | } 148 | 149 | /** 150 | * Use the number of processors online as nprocs if it is not 151 | * specified. 152 | */ 153 | if (nprocs == 0) { 154 | nprocs = sysconf(_SC_NPROCESSORS_ONLN); 155 | } 156 | 157 | if (nprocs <= 0) return 1; 158 | else { 159 | /** Set concurrency level. */ 160 | pthread_setconcurrency(nprocs); 161 | } 162 | 163 | /** 164 | * The second argument is input size n. 165 | */ 166 | if (argc > 2) { 167 | n = atoi(argv[2]); 168 | } 169 | 170 | pthread_barrier_init(&barrier, NULL, nprocs); 171 | printf("===========================================\n"); 172 | printf(" Benchmark: %s\n", argv[0]); 173 | printf(" Number of processors: %d\n", nprocs); 174 | 175 | init(nprocs, n); 176 | 177 | pthread_t ths[nprocs]; 178 | void * res[nprocs]; 179 | 180 | int i; 181 | for (i = 1; i < nprocs; i++) { 182 | pthread_create(&ths[i], NULL, thread, bits_join(i, nprocs)); 183 | } 184 | 185 | res[0] = thread(bits_join(0, nprocs)); 186 | 187 | for (i = 1; i < nprocs; i++) { 188 | pthread_join(ths[i], &res[i]); 189 | } 190 | 191 | if (target == 0) { 192 | target = NUM_ITERS - 1; 193 | double minCov = covs[target]; 194 | 195 | /** Pick the result that has the lowest CoV. 
*/ 196 | int i; 197 | for (i = NUM_ITERS; i < MAX_ITERS; ++i) { 198 | if (covs[i] < minCov) { 199 | minCov = covs[i]; 200 | target = i; 201 | } 202 | } 203 | } 204 | 205 | double mean = means[target]; 206 | double cov = covs[target]; 207 | int i1 = target - NUM_ITERS + 2; 208 | int i2 = target + 1; 209 | 210 | printf(" Steady-state iterations: %d~%d\n", i1, i2); 211 | printf(" Coefficient of variation: %.2f\n", cov); 212 | printf(" Number of measurements: %d\n", NUM_ITERS); 213 | printf(" Mean of elapsed time: %.2f ms\n", mean); 214 | printf("===========================================\n"); 215 | 216 | pthread_barrier_destroy(&barrier); 217 | return verify(nprocs, res); 218 | } 219 | 220 | -------------------------------------------------------------------------------- /benchmark/hzdptr.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "hzdptr.h" 4 | #include "xxhash.h" 5 | 6 | #define HZDPTR_HTBL_SIZE(nprocs, nptrs) (4 * nprocs * nptrs) 7 | 8 | typedef struct _node_t { 9 | struct _node_t * next; 10 | } node_t; 11 | 12 | static int htable_insert(void ** tbl, size_t size, void * ptr) 13 | { 14 | int index = XXH32(ptr, 1, 0) % size; 15 | int i; 16 | 17 | for (i = index; i < size; ++i ) { 18 | if (tbl[i] == NULL) { 19 | tbl[i] = ptr; 20 | return 0; 21 | } 22 | } 23 | 24 | for (i = 0; i < index; ++i) { 25 | if (tbl[i] == NULL) { 26 | tbl[i] = ptr; 27 | return 0; 28 | } 29 | } 30 | 31 | return -1; 32 | } 33 | 34 | static int htable_lookup(void ** tbl, size_t size, void * ptr) 35 | { 36 | int index = XXH32(ptr, 1, 0) % size; 37 | int i; 38 | 39 | for (i = index; i < size; ++i) { 40 | if (tbl[i] == ptr) { 41 | return 1; 42 | } else if (tbl[i] == NULL) { 43 | return 0; 44 | } 45 | } 46 | 47 | for (i = 0; i < index; ++i) { 48 | if (tbl[i] == ptr) { 49 | return 1; 50 | } else if (tbl[i] == NULL) { 51 | return 0; 52 | } 53 | } 54 | 55 | return 0; 56 | } 57 | 58 | void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs) 59 | { 60 | hzd->nprocs = nprocs; 61 | hzd->nptrs = nptrs; 62 | hzd->nretired = 0; 63 | hzd->ptrs = calloc(hzdptr_size(nprocs, nptrs), 1); 64 | 65 | _hzdptr_enlist(hzd); 66 | } 67 | 68 | void _hzdptr_retire(hzdptr_t * hzd, void ** rlist) 69 | { 70 | size_t size = HZDPTR_HTBL_SIZE(hzd->nprocs, hzd->nptrs); 71 | void * plist[size]; 72 | memset(plist, 0, sizeof(plist)); 73 | 74 | hzdptr_t * me = hzd; 75 | void * ptr; 76 | 77 | while ((hzd = hzd->next) != me) { 78 | int i; 79 | for (i = 0; i < hzd->nptrs; ++i) { 80 | ptr = hzd->ptrs[i]; 81 | 82 | if (ptr != NULL) { 83 | htable_insert(plist, size, ptr); 84 | } 85 | } 86 | } 87 | 88 | int nretired = 0; 89 | 90 | /** Check pointers in retire list with plist. 
*/ 91 | int i; 92 | for (i = 0; i < hzd->nretired; ++i) { 93 | ptr = rlist[i]; 94 | 95 | if (htable_lookup(plist, size, ptr)) { 96 | rlist[nretired++] = ptr; 97 | } else { 98 | free(ptr); 99 | } 100 | } 101 | 102 | hzd->nretired = nretired; 103 | } 104 | 105 | void hzdptr_exit(hzdptr_t * hzd) 106 | { 107 | int i; 108 | void ** rlist = &hzd->ptrs[hzd->nptrs]; 109 | 110 | for (i = 0; i < hzd->nretired; ++i) { 111 | free(rlist[i]); 112 | } 113 | 114 | hzd->nretired = 0; 115 | hzd->next = hzd; 116 | } 117 | 118 | -------------------------------------------------------------------------------- /benchmark/hzdptr.h: -------------------------------------------------------------------------------- 1 | #ifndef HZDPTR_H 2 | #define HZDPTR_H 3 | 4 | #include "primitives.h" 5 | 6 | typedef struct _hzdptr_t { 7 | struct _hzdptr_t * next; 8 | int nprocs; 9 | int nptrs; 10 | int nretired; 11 | void ** ptrs; 12 | } hzdptr_t; 13 | 14 | #define HZDPTR_THRESHOLD(nprocs) (2 * nprocs) 15 | 16 | extern void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs); 17 | extern void hzdptr_exit(hzdptr_t * hzd); 18 | extern void _hzdptr_retire(hzdptr_t * hzd, void ** rlist); 19 | 20 | static inline 21 | int hzdptr_size(int nprocs, int nptrs) 22 | { 23 | return sizeof(void * [HZDPTR_THRESHOLD(nprocs) + nptrs]); 24 | } 25 | 26 | static inline 27 | void * _hzdptr_set(void volatile * ptr_, void * hzd_) 28 | { 29 | void * volatile * ptr = (void * volatile *) ptr_; 30 | void * volatile * hzd = (void * volatile *) hzd_; 31 | 32 | void * val = *ptr; 33 | *hzd = val; 34 | return val; 35 | } 36 | 37 | static inline 38 | void * hzdptr_set(void volatile * ptr, hzdptr_t * hzd, int idx) 39 | { 40 | return _hzdptr_set(ptr, &hzd->ptrs[idx]); 41 | } 42 | 43 | static inline 44 | void * _hzdptr_setv(void volatile * ptr_, void * hzd_) 45 | { 46 | void * volatile * ptr = (void * volatile *) ptr_; 47 | void * volatile * hzd = (void * volatile *) hzd_; 48 | 49 | void * val = *ptr; 50 | void * tmp; 51 | 52 | do { 53 | *hzd = val; 54 | tmp = val; 55 | FENCE(); 56 | val = *ptr; 57 | } while (val != tmp); 58 | 59 | return val; 60 | } 61 | 62 | static inline 63 | void * hzdptr_setv(void volatile * ptr, hzdptr_t * hzd, int idx) 64 | { 65 | return _hzdptr_setv(ptr, &hzd->ptrs[idx]); 66 | } 67 | 68 | static inline 69 | void hzdptr_clear(hzdptr_t * hzd, int idx) 70 | { 71 | RELEASE(&hzd->ptrs[idx], NULL); 72 | } 73 | 74 | static inline 75 | void hzdptr_retire(hzdptr_t * hzd, void * ptr) 76 | { 77 | void ** rlist = &hzd->ptrs[hzd->nptrs]; 78 | rlist[hzd->nretired++] = ptr; 79 | 80 | if (hzd->nretired == HZDPTR_THRESHOLD(hzd->nprocs)) { 81 | _hzdptr_retire(hzd, rlist); 82 | } 83 | } 84 | 85 | static inline 86 | void _hzdptr_enlist(hzdptr_t * hzd) 87 | { 88 | static hzdptr_t * volatile _tail; 89 | hzdptr_t * tail = _tail; 90 | 91 | if (tail == NULL) { 92 | hzd->next = hzd; 93 | if (CASra(&_tail, &tail, hzd)) return; 94 | } 95 | 96 | hzdptr_t * next = tail->next; 97 | 98 | do hzd->next = next; 99 | while (!CASra(&tail->next, &next, hzd)); 100 | } 101 | 102 | #endif /* end of include guard: HZDPTR_H */ 103 | -------------------------------------------------------------------------------- /benchmark/lcrq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "lcrq.h" 5 | #include "align.h" 6 | #include "delay.h" 7 | #include "hzdptr.h" 8 | #include "primitives.h" 9 | 10 | #define RING_SIZE LCRQ_RING_SIZE 11 | 12 | static inline int is_empty(uint64_t v) __attribute__ ((pure)); 13 
| static inline uint64_t node_index(uint64_t i) __attribute__ ((pure)); 14 | static inline uint64_t set_unsafe(uint64_t i) __attribute__ ((pure)); 15 | static inline uint64_t node_unsafe(uint64_t i) __attribute__ ((pure)); 16 | static inline uint64_t tail_index(uint64_t t) __attribute__ ((pure)); 17 | static inline int crq_is_closed(uint64_t t) __attribute__ ((pure)); 18 | 19 | static inline void init_ring(RingQueue *r) { 20 | int i; 21 | 22 | for (i = 0; i < RING_SIZE; i++) { 23 | r->array[i].val = -1; 24 | r->array[i].idx = i; 25 | } 26 | 27 | r->head = r->tail = 0; 28 | r->next = NULL; 29 | } 30 | 31 | inline int is_empty(uint64_t v) { 32 | return (v == (uint64_t)-1); 33 | } 34 | 35 | 36 | inline uint64_t node_index(uint64_t i) { 37 | return (i & ~(1ull << 63)); 38 | } 39 | 40 | 41 | inline uint64_t set_unsafe(uint64_t i) { 42 | return (i | (1ull << 63)); 43 | } 44 | 45 | 46 | inline uint64_t node_unsafe(uint64_t i) { 47 | return (i & (1ull << 63)); 48 | } 49 | 50 | 51 | inline uint64_t tail_index(uint64_t t) { 52 | return (t & ~(1ull << 63)); 53 | } 54 | 55 | 56 | inline int crq_is_closed(uint64_t t) { 57 | return (t & (1ull << 63)) != 0; 58 | } 59 | 60 | void queue_init(queue_t * q, int nprocs) 61 | { 62 | RingQueue *rq = align_malloc(PAGE_SIZE, sizeof(RingQueue)); 63 | init_ring(rq); 64 | 65 | q->head = rq; 66 | q->tail = rq; 67 | q->nprocs = nprocs; 68 | } 69 | 70 | static inline void fixState(RingQueue *rq) { 71 | 72 | while (1) { 73 | uint64_t t = rq->tail; 74 | uint64_t h = rq->head; 75 | 76 | if (rq->tail != t) 77 | continue; 78 | 79 | if (h > t) { 80 | if (CAS(&rq->tail, &t, h)) break; 81 | continue; 82 | } 83 | break; 84 | } 85 | } 86 | 87 | static inline int close_crq(RingQueue *rq, const uint64_t t, const int tries) { 88 | uint64_t tt = t + 1; 89 | 90 | if (tries < 10) 91 | return CAS(&rq->tail, &tt, tt|(1ull<<63)); 92 | else 93 | return BTAS(&rq->tail, 63); 94 | } 95 | 96 | static void lcrq_put(queue_t * q, handle_t * handle, uint64_t arg) { 97 | int try_close = 0; 98 | 99 | while (1) { 100 | RingQueue *rq = hzdptr_setv(&q->tail, &handle->hzdptr, 0); 101 | RingQueue *next = rq->next; 102 | 103 | if (next != NULL) { 104 | CAS(&q->tail, &rq, next); 105 | continue; 106 | } 107 | 108 | uint64_t t = FAA(&rq->tail, 1); 109 | 110 | if (crq_is_closed(t)) { 111 | RingQueue * nrq; 112 | alloc: 113 | nrq = handle->next; 114 | 115 | void *org_nrq = nrq; 116 | if (nrq == NULL) { 117 | nrq = align_malloc(PAGE_SIZE, sizeof(RingQueue)); 118 | init_ring(nrq); 119 | } 120 | 121 | // Solo enqueue 122 | nrq->tail = 1; 123 | nrq->array[0].val = (uint64_t) arg; 124 | nrq->array[0].idx = 0; 125 | 126 | if (CAS(&rq->next, &next, nrq)) { 127 | CAS(&q->tail, &rq, nrq); 128 | handle->next = NULL; 129 | return; 130 | } 131 | 132 | // Did not succeed, free the buffer 133 | if (org_nrq == NULL) free(nrq); 134 | continue; 135 | } 136 | 137 | RingNode* cell = &rq->array[t & (RING_SIZE-1)]; 138 | 139 | uint64_t idx = cell->idx; 140 | uint64_t val = cell->val; 141 | 142 | if (is_empty(val)) { 143 | if (node_index(idx) <= t) { 144 | if ((!node_unsafe(idx) || rq->head < t) && 145 | CAS2(cell, &val, &idx, arg, t)) { 146 | return; 147 | } 148 | } 149 | } 150 | 151 | uint64_t h = rq->head; 152 | 153 | if ((int64_t)(t - h) >= (int64_t)RING_SIZE && 154 | close_crq(rq, t, ++try_close)) { 155 | goto alloc; 156 | } 157 | } 158 | 159 | hzdptr_clear(&handle->hzdptr, 0); 160 | } 161 | 162 | static uint64_t lcrq_get(queue_t * q, handle_t * handle) { 163 | while (1) { 164 | RingQueue *rq = hzdptr_setv(&q->head, 
&handle->hzdptr, 0); 165 | RingQueue *next; 166 | 167 | uint64_t h = FAA(&rq->head, 1); 168 | 169 | RingNode* cell = &rq->array[h & (RING_SIZE-1)]; 170 | 171 | uint64_t tt = 0; 172 | int r = 0; 173 | 174 | while (1) { 175 | 176 | uint64_t cell_idx = cell->idx; 177 | uint64_t unsafe = node_unsafe(cell_idx); 178 | uint64_t idx = node_index(cell_idx); 179 | uint64_t val = cell->val; 180 | 181 | if (idx > h) break; 182 | 183 | if (!is_empty(val)) { 184 | if (idx == h) { 185 | if (CAS2(cell, &val, &cell_idx, -1, (unsafe | h) + RING_SIZE)) 186 | return val; 187 | } else { 188 | if (CAS2(cell, &val, &cell_idx, val, set_unsafe(idx))) { 189 | break; 190 | } 191 | } 192 | } else { 193 | if ((r & ((1ull << 10) - 1)) == 0) 194 | tt = rq->tail; 195 | 196 | // Optimization: try to bail quickly if queue is closed. 197 | int crq_closed = crq_is_closed(tt); 198 | uint64_t t = tail_index(tt); 199 | 200 | if (unsafe) { // Nothing to do, move along 201 | if (CAS2(cell, &val, &cell_idx, val, (unsafe | h) + RING_SIZE)) 202 | break; 203 | } else if (t < h + 1 || r > 200000 || crq_closed) { 204 | if (CAS2(cell, &val, &idx, val, h + RING_SIZE)) { 205 | if (r > 200000 && tt > RING_SIZE) 206 | BTAS(&rq->tail, 63); 207 | break; 208 | } 209 | } else { 210 | ++r; 211 | } 212 | } 213 | } 214 | 215 | if (tail_index(rq->tail) <= h + 1) { 216 | fixState(rq); 217 | // try to return empty 218 | next = rq->next; 219 | if (next == NULL) 220 | return -1; // EMPTY 221 | if (tail_index(rq->tail) <= h + 1) { 222 | if (CAS(&q->head, &rq, next)) { 223 | hzdptr_retire(&handle->hzdptr, rq); 224 | } 225 | } 226 | } 227 | } 228 | 229 | hzdptr_clear(&handle->hzdptr, 0); 230 | } 231 | 232 | void queue_register(queue_t * q, handle_t * th, int id) 233 | { 234 | hzdptr_init(&th->hzdptr, q->nprocs, 1); 235 | } 236 | 237 | void enqueue(queue_t * q, handle_t * th, void * val) 238 | { 239 | lcrq_put(q, th, (uint64_t) val); 240 | } 241 | 242 | void * dequeue(queue_t * q, handle_t * th) 243 | { 244 | return (void *) lcrq_get(q, th); 245 | } 246 | void queue_free(queue_t * q, handle_t * h){ 247 | RingQueue *rq = q->orignialHead; 248 | while(rq){ 249 | RingQueue *n = rq->next; 250 | free(rq); 251 | rq = n; 252 | }; 253 | } 254 | -------------------------------------------------------------------------------- /benchmark/lcrq.h: -------------------------------------------------------------------------------- 1 | #ifndef LCRQ_H 2 | #define LCRQ_H 3 | 4 | #ifdef LCRQ 5 | 6 | #include "align.h" 7 | #include "hzdptr.h" 8 | 9 | #define EMPTY ((void *) -1) 10 | 11 | #ifndef LCRQ_RING_SIZE 12 | #define LCRQ_RING_SIZE (1ull << 12) 13 | #endif 14 | 15 | typedef struct RingNode { 16 | volatile uint64_t val; 17 | volatile uint64_t idx; 18 | uint64_t pad[14]; 19 | } RingNode DOUBLE_CACHE_ALIGNED; 20 | 21 | typedef struct RingQueue { 22 | volatile int64_t head DOUBLE_CACHE_ALIGNED; 23 | volatile int64_t tail DOUBLE_CACHE_ALIGNED; 24 | struct RingQueue *next DOUBLE_CACHE_ALIGNED; 25 | RingNode array[LCRQ_RING_SIZE]; 26 | } RingQueue DOUBLE_CACHE_ALIGNED; 27 | 28 | typedef struct { 29 | RingQueue * volatile head DOUBLE_CACHE_ALIGNED; 30 | RingQueue * volatile tail DOUBLE_CACHE_ALIGNED; 31 | RingQueue * orignialHead; 32 | int nprocs; 33 | } queue_t; 34 | 35 | typedef struct { 36 | RingQueue * next; 37 | hzdptr_t hzdptr; 38 | } handle_t; 39 | 40 | #endif 41 | 42 | #endif /* end of include guard: LCRQ_H */ 43 | -------------------------------------------------------------------------------- /benchmark/msqueue.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include "delay.h" 3 | #include "msqueue.h" 4 | #include "primitives.h" 5 | 6 | void queue_init(queue_t * q, int nprocs) 7 | { 8 | node_t * node = malloc(sizeof(node_t)); 9 | node->next = NULL; 10 | 11 | q->head = node; 12 | q->tail = node; 13 | q->nprocs = nprocs; 14 | } 15 | 16 | void queue_register(queue_t * q, handle_t * th, int id) 17 | { 18 | hzdptr_init(&th->hzd, q->nprocs, 2); 19 | } 20 | 21 | void enqueue(queue_t * q, handle_t * handle, void * data) 22 | { 23 | node_t * node = malloc(sizeof(node_t)); 24 | 25 | node->data = data; 26 | node->next = NULL; 27 | 28 | node_t * tail; 29 | node_t * next; 30 | 31 | while (1) { 32 | tail = hzdptr_setv(&q->tail, &handle->hzd, 0); 33 | next = tail->next; 34 | 35 | if (tail != q->tail) { 36 | continue; 37 | } 38 | 39 | if (next != NULL) { 40 | CAS(&q->tail, &tail, next); 41 | continue; 42 | } 43 | 44 | if (CAS(&tail->next, &next, node)) break; 45 | } 46 | 47 | CAS(&q->tail, &tail, node); 48 | } 49 | 50 | void * dequeue(queue_t * q, handle_t * handle) 51 | { 52 | void * data; 53 | 54 | node_t * head; 55 | node_t * tail; 56 | node_t * next; 57 | 58 | while (1) { 59 | head = hzdptr_setv(&q->head, &handle->hzd, 0); 60 | tail = q->tail; 61 | next = hzdptr_set(&head->next, &handle->hzd, 1); 62 | 63 | if (head != q->head) { 64 | continue; 65 | } 66 | 67 | if (next == NULL) { 68 | return (void *) -1; 69 | } 70 | 71 | if (head == tail) { 72 | CAS(&q->tail, &tail, next); 73 | continue; 74 | } 75 | 76 | data = next->data; 77 | if (CAS(&q->head, &head, next)) break; 78 | } 79 | 80 | hzdptr_retire(&handle->hzd, head); 81 | return data; 82 | } 83 | 84 | void queue_free(int id, int nprocs) {} 85 | -------------------------------------------------------------------------------- /benchmark/msqueue.h: -------------------------------------------------------------------------------- 1 | #ifndef MSQUEUE_H 2 | #define MSQUEUE_H 3 | 4 | #ifdef MSQUEUE 5 | #include "align.h" 6 | #include "hzdptr.h" 7 | 8 | #define EMPTY (void *) -1 9 | 10 | typedef struct _node_t { 11 | struct _node_t * volatile next DOUBLE_CACHE_ALIGNED; 12 | void * data DOUBLE_CACHE_ALIGNED; 13 | } node_t DOUBLE_CACHE_ALIGNED; 14 | 15 | typedef struct _queue_t { 16 | struct _node_t * volatile head DOUBLE_CACHE_ALIGNED; 17 | struct _node_t * volatile tail DOUBLE_CACHE_ALIGNED; 18 | int nprocs; 19 | } queue_t DOUBLE_CACHE_ALIGNED; 20 | 21 | typedef struct _handle_t { 22 | hzdptr_t hzd; 23 | } handle_t DOUBLE_CACHE_ALIGNED; 24 | 25 | #endif 26 | 27 | #endif /* end of include guard: MSQUEUE_H */ 28 | -------------------------------------------------------------------------------- /benchmark/ncq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "ncq.h" 5 | 6 | void queue_init(queue_t * q, int nprocs) 7 | { 8 | lfring_init_empty((struct lfring *) q->ring, NCQ_ORDER); 9 | } 10 | 11 | 12 | void queue_register(queue_t * q, handle_t * th, int id) 13 | { 14 | } 15 | 16 | void enqueue(queue_t * q, handle_t * th, void * val) 17 | { 18 | size_t eidx = (size_t) val; 19 | lfring_enqueue((struct lfring *) q->ring, NCQ_ORDER, eidx, false); 20 | } 21 | 22 | void * dequeue(queue_t * q, handle_t * th) 23 | { 24 | return (void *) lfring_dequeue((struct lfring *) q->ring, NCQ_ORDER, false); 25 | } 26 | 27 | void queue_free(queue_t * q, handle_t * h) 28 | { 29 | } 30 | 
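The msqueue.c code above (like lcrq.c before it) leans on the update-on-failure convention of the CAS macro defined in primitives.h further down in this listing: when the compare-and-swap loses a race, the macro reloads the current value into the "expected" argument, so retry loops never have to reread the location themselves. Below is a minimal, single-threaded sketch of that convention; it is illustrative only and not part of the benchmark, and the counter/counter_inc names are made up for the example.

/* Illustrative sketch only: a counter built on the primitives.h CAS macro. */
#include <stdio.h>
#include "primitives.h"

static volatile long counter;

static long counter_inc(void)
{
    long old = counter;
    /* On failure, CAS refreshes `old` with the current value, so the loop
     * simply retries with the updated snapshot. */
    while (!CAS(&counter, &old, old + 1))
        ;
    return old;   /* value observed just before the successful increment */
}

int main(void)
{
    for (int i = 0; i < 5; i++)
        counter_inc();
    printf("counter = %ld\n", (long) counter);
    return 0;
}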
-------------------------------------------------------------------------------- /benchmark/ncq.h: -------------------------------------------------------------------------------- 1 | #ifndef NCQ_H 2 | #define NCQ_H 3 | 4 | #ifdef NCQ 5 | 6 | #include 7 | #include "../lfring_naive.h" 8 | #include "align.h" 9 | 10 | #define NCQ_ORDER 16 11 | #define EMPTY (void *) LFRING_EMPTY 12 | 13 | typedef struct _queue_t { 14 | char ring[LFRING_SIZE(NCQ_ORDER)]; 15 | } queue_t DOUBLE_CACHE_ALIGNED; 16 | 17 | typedef struct _handle_t { 18 | int pad; 19 | } handle_t DOUBLE_CACHE_ALIGNED; 20 | 21 | #endif 22 | 23 | #endif /* end of include guard: NCQ_H */ 24 | -------------------------------------------------------------------------------- /benchmark/pairwise.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "delay.h" 5 | #include "queue.h" 6 | 7 | #ifndef LOGN_OPS 8 | #define LOGN_OPS 7 9 | #endif 10 | 11 | static long nops; 12 | static queue_t * q; 13 | static handle_t ** hds; 14 | 15 | void init(int nprocs, int logn) { 16 | 17 | /** Use 10^7 as default input size. */ 18 | if (logn == 0) logn = LOGN_OPS; 19 | 20 | /** Compute the number of ops to perform. */ 21 | nops = 1; 22 | int i; 23 | for (i = 0; i < logn; ++i) { 24 | nops *= 10; 25 | } 26 | 27 | printf(" Number of operations: %ld\n", nops); 28 | 29 | // FIXME: sizeof(queue_t) varies, allocate 4MB 30 | q = align_malloc(PAGE_SIZE, 4194304); 31 | queue_init(q, nprocs); 32 | 33 | hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs])); 34 | } 35 | 36 | void thread_init(int id, int nprocs) { 37 | hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t)); 38 | queue_register(q, hds[id], id); 39 | } 40 | 41 | void * benchmark(int id, int nprocs) { 42 | void * val = (void *) (intptr_t) (id + 1); 43 | handle_t * th = hds[id]; 44 | 45 | delay_t state; 46 | delay_init(&state, id); 47 | 48 | int i; 49 | for (i = 0; i < nops / nprocs; ++i) { 50 | enqueue(q, th, val); 51 | // delay_exec(&state); 52 | 53 | val = dequeue(q, th); 54 | // delay_exec(&state); 55 | } 56 | 57 | return val; 58 | } 59 | 60 | void thread_exit(int id, int nprocs) { 61 | queue_free(q, hds[id]); 62 | } 63 | 64 | #ifdef VERIFY 65 | static int compare(const void * a, const void * b) { 66 | return *(long *) a - *(long *) b; 67 | } 68 | #endif 69 | 70 | int verify(int nprocs, void ** results) { 71 | #ifndef VERIFY 72 | return 0; 73 | #else 74 | qsort(results, nprocs, sizeof(void *), compare); 75 | 76 | int i; 77 | int ret = 0; 78 | 79 | for (i = 0; i < nprocs; ++i) { 80 | int res = (int) (intptr_t) results[i]; 81 | if (res != i + 1) { 82 | fprintf(stderr, "expected %d but received %d\n", i + 1, res); 83 | ret = 1; 84 | } 85 | } 86 | 87 | if (ret != 1) fprintf(stdout, "PASSED\n"); 88 | return ret; 89 | #endif 90 | } 91 | -------------------------------------------------------------------------------- /benchmark/primitives.h: -------------------------------------------------------------------------------- 1 | /** @file */ 2 | 3 | #ifndef PRIMITIVES_H 4 | #define PRIMITIVES_H 5 | 6 | #if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ > 7 7 | /** 8 | * An atomic fetch-and-add. 9 | */ 10 | #define FAA(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED) 11 | /** 12 | * An atomic fetch-and-add that also ensures sequential consistency. 13 | */ 14 | #define FAAcs(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST) 15 | 16 | /** 17 | * An atomic compare-and-swap. 
18 | */ 19 | #define CAS(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \ 20 | __ATOMIC_RELAXED, __ATOMIC_RELAXED) 21 | /** 22 | * An atomic compare-and-swap that also ensures sequential consistency. 23 | */ 24 | #define CAScs(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \ 25 | __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) 26 | /** 27 | * An atomic compare-and-swap that ensures release semantics on success 28 | * and acquire semantics on failure. 29 | */ 30 | #define CASra(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \ 31 | __ATOMIC_RELEASE, __ATOMIC_ACQUIRE) 32 | /** 33 | * An atomic compare-and-swap that ensures acquire semantics on success 34 | * and relaxed semantics on failure. 35 | */ 36 | #define CASa(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \ 37 | __ATOMIC_ACQUIRE, __ATOMIC_RELAXED) 38 | 39 | /** 40 | * An atomic swap. 41 | */ 42 | #define SWAP(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_RELAXED) 43 | 44 | /** 45 | * An atomic swap that ensures acquire-release semantics. 46 | */ 47 | #define SWAPra(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_ACQ_REL) 48 | 49 | /** 50 | * A memory fence to ensure sequential consistency. 51 | */ 52 | #define FENCE() __atomic_thread_fence(__ATOMIC_SEQ_CST) 53 | 54 | /** 55 | * An atomic store. 56 | */ 57 | #define STORE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED) 58 | 59 | /** 60 | * A store with release semantics to ensure all previous loads 61 | * and stores complete before the current store is visible. 62 | */ 63 | #define RELEASE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE) 64 | 65 | /** 66 | * A load with acquire semantics to ensure no following loads or 67 | * stores can start before the current load completes. 68 | */ 69 | #define ACQUIRE(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE) 70 | 71 | #else /** Non-GCC or old GCC.
*/ 72 | #if defined(__x86_64__) || defined(_M_X64_) 73 | 74 | #define FAA __sync_fetch_and_add 75 | #define FAAcs __sync_fetch_and_add 76 | 77 | static inline int 78 | _compare_and_swap(void ** ptr, void ** expected, void * desired) { 79 | void * oldval = *expected; 80 | void * newval = __sync_val_compare_and_swap(ptr, oldval, desired); 81 | 82 | if (newval == oldval) { 83 | return 1; 84 | } else { 85 | *expected = newval; 86 | return 0; 87 | } 88 | } 89 | #define CAS(ptr, expected, desired) \ 90 | _compare_and_swap((void **) (ptr), (void **) (expected), (void *) (desired)) 91 | #define CAScs CAS 92 | #define CASra CAS 93 | #define CASa CAS 94 | 95 | #define SWAP __sync_lock_test_and_set 96 | #define SWAPra SWAP 97 | 98 | #define ACQUIRE(p) ({ \ 99 | __typeof__(*(p)) __ret = *p; \ 100 | __asm__("":::"memory"); \ 101 | __ret; \ 102 | }) 103 | 104 | #define RELEASE(p, v) do {\ 105 | __asm__("":::"memory"); \ 106 | *p = v; \ 107 | } while (0) 108 | #define FENCE() __sync_synchronize() 109 | 110 | #endif 111 | #endif 112 | 113 | #if defined(__x86_64__) || defined(_M_X64_) 114 | #define PAUSE() __asm__ ("pause") 115 | 116 | static inline 117 | int _CAS2(volatile long * ptr, long * cmp1, long * cmp2, long val1, long val2) 118 | { 119 | char success; 120 | long tmp1 = *cmp1; 121 | long tmp2 = *cmp2; 122 | 123 | __asm__ __volatile__( 124 | "lock cmpxchg16b %1\n" 125 | "setz %0" 126 | : "=q" (success), "+m" (*ptr), "+a" (tmp1), "+d" (tmp2) 127 | : "b" (val1), "c" (val2) 128 | : "cc" ); 129 | 130 | *cmp1 = tmp1; 131 | *cmp2 = tmp2; 132 | return success; 133 | } 134 | #define CAS2(p, o1, o2, n1, n2) \ 135 | _CAS2((volatile long *) p, (long *) o1, (long *) o2, (long) n1, (long) n2) 136 | 137 | #define BTAS(ptr, bit) ({ \ 138 | char __ret; \ 139 | __asm__ __volatile__( \ 140 | "lock btsq %2, %0; setnc %1" \ 141 | : "+m" (*ptr), "=r" (__ret) : "ri" (bit) : "cc" ); \ 142 | __ret; \ 143 | }) 144 | 145 | #else 146 | #define PAUSE() 147 | #endif 148 | 149 | #endif /* end of include guard: PRIMITIVES_H */ 150 | -------------------------------------------------------------------------------- /benchmark/queue.h: -------------------------------------------------------------------------------- 1 | #ifndef QUEUE_H 2 | #define QUEUE_H 3 | 4 | #ifdef WFQUEUE 5 | #include "wfqueue.h" 6 | 7 | #elif LCRQ 8 | #include "lcrq.h" 9 | 10 | #elif CCQUEUE 11 | #include "ccqueue.h" 12 | 13 | #elif MSQUEUE 14 | #include "msqueue.h" 15 | 16 | #elif FAAQ 17 | #include "align.h" 18 | 19 | typedef struct { 20 | volatile long P DOUBLE_CACHE_ALIGNED; 21 | volatile long C DOUBLE_CACHE_ALIGNED; 22 | } queue_t DOUBLE_CACHE_ALIGNED; 23 | 24 | typedef int handle_t; 25 | 26 | #elif DELAY 27 | 28 | typedef int queue_t; 29 | typedef int handle_t; 30 | 31 | #elif NCQ 32 | #include "ncq.h" 33 | 34 | #elif SCQ 35 | #include "scq.h" 36 | 37 | #elif SCQD 38 | #include "scqd.h" 39 | 40 | #elif SCQ2 41 | #include "scq2.h" 42 | 43 | #else 44 | #error "Please specify a queue implementation."
45 | 46 | #endif 47 | 48 | void queue_init(queue_t * q, int nprocs); 49 | void queue_register(queue_t * q, handle_t * th, int id); 50 | void enqueue(queue_t * q, handle_t * th, void * v); 51 | void * dequeue(queue_t * q, handle_t * th); 52 | void queue_free(queue_t * q, handle_t * h); 53 | 54 | #endif /* end of include guard: QUEUE_H */ 55 | -------------------------------------------------------------------------------- /benchmark/scq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "scq.h" 5 | 6 | void queue_init(queue_t * q, int nprocs) 7 | { 8 | lfring_init_empty((struct lfring *) q->ring, SCQ_ORDER); 9 | } 10 | 11 | 12 | void queue_register(queue_t * q, handle_t * th, int id) 13 | { 14 | } 15 | 16 | void enqueue(queue_t * q, handle_t * th, void * val) 17 | { 18 | size_t eidx = (size_t) val; 19 | lfring_enqueue((struct lfring *) q->ring, SCQ_ORDER, eidx, false); 20 | } 21 | 22 | void * dequeue(queue_t * q, handle_t * th) 23 | { 24 | return (void *) lfring_dequeue((struct lfring *) q->ring, SCQ_ORDER, false); 25 | } 26 | 27 | void queue_free(queue_t * q, handle_t * h) 28 | { 29 | } 30 | -------------------------------------------------------------------------------- /benchmark/scq.h: -------------------------------------------------------------------------------- 1 | #ifndef SCQ_H 2 | #define SCQ_H 3 | 4 | #ifdef SCQ 5 | 6 | #include 7 | #include "../lfring_cas1.h" 8 | #include "align.h" 9 | 10 | #define SCQ_ORDER 15 11 | #define EMPTY (void *) LFRING_EMPTY 12 | 13 | typedef struct _queue_t { 14 | char ring[LFRING_SIZE(SCQ_ORDER)]; 15 | } queue_t DOUBLE_CACHE_ALIGNED; 16 | 17 | typedef struct _handle_t { 18 | int pad; 19 | } handle_t DOUBLE_CACHE_ALIGNED; 20 | 21 | #endif 22 | 23 | #endif /* end of include guard: SCQ_H */ 24 | -------------------------------------------------------------------------------- /benchmark/scq2.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "scq2.h" 5 | 6 | void queue_init(queue_t * q, int nprocs) 7 | { 8 | lfring_ptr_init_empty((struct lfring_ptr *) q->ring, SCQ2_ORDER); 9 | } 10 | 11 | 12 | void queue_register(queue_t * q, handle_t * th, int id) 13 | { 14 | lfring_ptr_init_lhead(&th->lhead, SCQ2_ORDER); 15 | } 16 | 17 | void enqueue(queue_t * q, handle_t * th, void * val) 18 | { 19 | lfring_ptr_enqueue((struct lfring_ptr *) q->ring, SCQ2_ORDER, val + 1, 20 | false, false, &th->lhead); 21 | } 22 | 23 | void * dequeue(queue_t * q, handle_t * th) 24 | { 25 | void *ptr; 26 | if (!lfring_ptr_dequeue((struct lfring_ptr *) q->ring, SCQ2_ORDER, 27 | &ptr, false)) 28 | return EMPTY; 29 | ptr--; 30 | return ptr; 31 | } 32 | 33 | void queue_free(queue_t * q, handle_t * h) 34 | { 35 | } 36 | -------------------------------------------------------------------------------- /benchmark/scq2.h: -------------------------------------------------------------------------------- 1 | #ifndef SCQ2_H 2 | #define SCQ2_H 3 | 4 | #ifdef SCQ2 5 | 6 | #include 7 | #include "../lfring_cas2.h" 8 | #include "align.h" 9 | 10 | #define SCQ2_ORDER 15 11 | #define EMPTY (void *) -1 12 | 13 | typedef struct _queue_t { 14 | char ring[LFRING_PTR_SIZE(SCQ2_ORDER)]; 15 | } queue_t DOUBLE_CACHE_ALIGNED; 16 | 17 | typedef struct _handle_t { 18 | lfatomic_t lhead; 19 | } handle_t DOUBLE_CACHE_ALIGNED; 20 | 21 | #endif 22 | 23 | #endif /* end of include guard: SCQ_H */ 24 | 
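The ring-buffer adapters in this listing (ncq.c, scq.c and scq2.c above, scqd.c and wcq.c below) all implement the same five-function interface declared in queue.h, so a caller only picks the backend at compile time. The sketch below is a hypothetical standalone driver, not part of the benchmark harness: the -DSCQ build, main(), aligned_alloc() and the oversized 4 MB allocation (mirroring the harness's workaround for the varying sizeof(queue_t)) are all illustrative assumptions.

/* Hypothetical driver: build from the benchmark directory with -DSCQ. */
#include <stdio.h>
#include <stdlib.h>
#include "queue.h"

int main(void)
{
    /* queue_t differs per backend, so allocate generously and page-aligned. */
    queue_t *q = aligned_alloc(4096, 4194304);
    handle_t h;

    queue_init(q, 1);            /* one "processor" */
    queue_register(q, &h, 0);

    for (long i = 1; i <= 4; i++)
        enqueue(q, &h, (void *) i);   /* values double as small ring indices */

    void *v;
    while ((v = dequeue(q, &h)) != EMPTY)
        printf("dequeued %ld\n", (long) v);

    queue_free(q, &h);
    free(q);
    return 0;
}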
-------------------------------------------------------------------------------- /benchmark/scqd.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "scqd.h" 5 | 6 | void queue_init(queue_t * q, int nprocs) 7 | { 8 | lfring_init_empty((struct lfring *) q->aq, SCQD_ORDER); 9 | lfring_init_full((struct lfring *) q->fq, SCQD_ORDER); 10 | } 11 | 12 | 13 | void queue_register(queue_t * q, handle_t * th, int id) 14 | { 15 | } 16 | 17 | void enqueue(queue_t * q, handle_t * th, void * val) 18 | { 19 | size_t eidx; 20 | eidx = lfring_dequeue((struct lfring *) q->fq, SCQD_ORDER, true); 21 | if (eidx == LFRING_EMPTY) return; 22 | q->val[eidx] = val; 23 | lfring_enqueue((struct lfring *) q->aq, SCQD_ORDER, eidx, false); 24 | } 25 | 26 | void * dequeue(queue_t * q, handle_t * th) 27 | { 28 | size_t eidx; 29 | void *val; 30 | eidx = lfring_dequeue((struct lfring *) q->aq, SCQD_ORDER, false); 31 | if (eidx == LFRING_EMPTY) return EMPTY; 32 | val = q->val[eidx]; 33 | lfring_enqueue((struct lfring *) q->fq, SCQD_ORDER, eidx, true); 34 | return val; 35 | } 36 | 37 | void queue_free(queue_t * q, handle_t * h) 38 | { 39 | } 40 | -------------------------------------------------------------------------------- /benchmark/scqd.h: -------------------------------------------------------------------------------- 1 | #ifndef SCQD_H 2 | #define SCQD_H 3 | 4 | #ifdef SCQD 5 | 6 | #include 7 | #include "../lfring_cas1.h" 8 | #include "align.h" 9 | 10 | #define SCQD_ORDER 16 11 | #define EMPTY (void *) LFRING_EMPTY 12 | 13 | typedef struct _queue_t { 14 | char aq[LFRING_SIZE(SCQD_ORDER)]; 15 | char fq[LFRING_SIZE(SCQD_ORDER)]; 16 | void *val[(1U << SCQD_ORDER)]; 17 | } queue_t DOUBLE_CACHE_ALIGNED; 18 | 19 | typedef struct _handle_t { 20 | int pad; 21 | } handle_t DOUBLE_CACHE_ALIGNED; 22 | 23 | #endif 24 | 25 | #endif /* end of include guard: SCQD_H */ 26 | -------------------------------------------------------------------------------- /benchmark/wcq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "wcq.h" 5 | 6 | void queue_init(queue_t * q, int nprocs) 7 | { 8 | wfring_init_empty((struct wfring *) q->ring, WCQ_ORDER); 9 | } 10 | 11 | static _Atomic(struct wfring_state *) handle_tail = ATOMIC_VAR_INIT(NULL); 12 | 13 | void queue_register(queue_t * q, handle_t * th, int id) 14 | { 15 | wfring_init_state((struct wfring *) q->ring, &th->state); 16 | 17 | struct wfring_state *tail = atomic_load(&handle_tail); 18 | if (tail == NULL) { 19 | th->state.next = &th->state; 20 | if (atomic_compare_exchange_strong(&handle_tail, &tail, &th->state)) 21 | return; 22 | } 23 | 24 | struct wfring_state *next = atomic_load(&tail->next); 25 | do { 26 | th->state.next = next; 27 | } while (!atomic_compare_exchange_weak(&tail->next, &next, &th->state)); 28 | } 29 | 30 | void enqueue(queue_t * q, handle_t * th, void * val) 31 | { 32 | size_t eidx = (size_t) val; 33 | wfring_enqueue((struct wfring *) q->ring, WCQ_ORDER, eidx, false, &th->state); 34 | } 35 | 36 | void * dequeue(queue_t * q, handle_t * th) 37 | { 38 | return (void *) wfring_dequeue((struct wfring *) q->ring, WCQ_ORDER, false, &th->state); 39 | } 40 | 41 | void queue_free(queue_t * q, handle_t * h) 42 | { 43 | } 44 | -------------------------------------------------------------------------------- /benchmark/wcq.h: -------------------------------------------------------------------------------- 1 | #ifndef 
WCQ_H 2 | #define WCQ_H 3 | 4 | #ifdef WCQ 5 | 6 | #include 7 | #include "../wfring_cas2.h" 8 | #include "align.h" 9 | 10 | #define WCQ_ORDER 15 11 | #define EMPTY (void *) WFRING_EMPTY 12 | 13 | typedef struct _queue_t { 14 | char ring[WFRING_SIZE(WCQ_ORDER)]; 15 | } queue_t DOUBLE_CACHE_ALIGNED; 16 | 17 | typedef struct _handle_t { 18 | struct wfring_state state; 19 | } handle_t DOUBLE_CACHE_ALIGNED; 20 | 21 | #endif 22 | 23 | #endif /* end of include guard: WCQ_H */ 24 | -------------------------------------------------------------------------------- /benchmark/wfqueue.c: -------------------------------------------------------------------------------- 1 | #include "wfqueue.h" 2 | #include 3 | #include 4 | #include 5 | #include "primitives.h" 6 | 7 | #define N WFQUEUE_NODE_SIZE 8 | #define BOT ((void *)0) 9 | #define TOP ((void *)-1) 10 | 11 | #define MAX_GARBAGE(n) (2 * n) 12 | 13 | #ifndef MAX_SPIN 14 | #define MAX_SPIN 100 15 | #endif 16 | 17 | #ifndef MAX_PATIENCE 18 | #define MAX_PATIENCE 10 19 | #endif 20 | 21 | typedef struct _enq_t enq_t; 22 | typedef struct _deq_t deq_t; 23 | typedef struct _cell_t cell_t; 24 | typedef struct _node_t node_t; 25 | 26 | static inline void *spin(void *volatile *p) { 27 | int patience = MAX_SPIN; 28 | void *v = *p; 29 | 30 | while (!v && patience-- > 0) { 31 | v = *p; 32 | PAUSE(); 33 | } 34 | 35 | return v; 36 | } 37 | 38 | static inline node_t *new_node() { 39 | node_t *n = align_malloc(PAGE_SIZE, sizeof(node_t)); 40 | memset(n, 0, sizeof(node_t)); 41 | return n; 42 | } 43 | 44 | static node_t *check(unsigned long volatile *p_hzd_node_id, node_t *cur, 45 | node_t *old) { 46 | unsigned long hzd_node_id = ACQUIRE(p_hzd_node_id); 47 | 48 | if (hzd_node_id < cur->id) { 49 | node_t *tmp = old; 50 | while (tmp->id < hzd_node_id) { 51 | tmp = tmp->next; 52 | } 53 | cur = tmp; 54 | } 55 | 56 | return cur; 57 | } 58 | 59 | static node_t *update(node_t *volatile *pPn, node_t *cur, 60 | unsigned long volatile *p_hzd_node_id, node_t *old) { 61 | node_t *ptr = ACQUIRE(pPn); 62 | 63 | if (ptr->id < cur->id) { 64 | if (!CAScs(pPn, &ptr, cur)) { 65 | if (ptr->id < cur->id) cur = ptr; 66 | } 67 | 68 | cur = check(p_hzd_node_id, cur, old); 69 | } 70 | 71 | return cur; 72 | } 73 | 74 | static void cleanup(queue_t *q, handle_t *th) { 75 | long oid = ACQUIRE(&q->Hi); 76 | node_t *new = th->Dp; 77 | 78 | if (oid == -1) return; 79 | if (new->id - oid < MAX_GARBAGE(q->nprocs)) return; 80 | if (!CASa(&q->Hi, &oid, -1)) return; 81 | 82 | node_t *old = q->Hp; 83 | handle_t *ph = th; 84 | handle_t *phs[q->nprocs]; 85 | int i = 0; 86 | 87 | do { 88 | new = check(&ph->hzd_node_id, new, old); 89 | new = update(&ph->Ep, new, &ph->hzd_node_id, old); 90 | new = update(&ph->Dp, new, &ph->hzd_node_id, old); 91 | 92 | phs[i++] = ph; 93 | ph = ph->next; 94 | } while (new->id > oid && ph != th); 95 | 96 | while (new->id > oid && --i >= 0) { 97 | new = check(&phs[i]->hzd_node_id, new, old); 98 | } 99 | 100 | long nid = new->id; 101 | 102 | if (nid <= oid) { 103 | RELEASE(&q->Hi, oid); 104 | } else { 105 | q->Hp = new; 106 | RELEASE(&q->Hi, nid); 107 | 108 | while (old != new) { 109 | node_t *tmp = old->next; 110 | free(old); 111 | old = tmp; 112 | } 113 | } 114 | } 115 | 116 | static cell_t *find_cell(node_t *volatile *ptr, long i, handle_t *th) { 117 | node_t *curr = *ptr; 118 | 119 | long j; 120 | for (j = curr->id; j < i / N; ++j) { 121 | node_t *next = curr->next; 122 | 123 | if (next == NULL) { 124 | node_t *temp = th->spare; 125 | 126 | if (!temp) { 127 | temp = new_node(); 128 | 
th->spare = temp; 129 | } 130 | 131 | temp->id = j + 1; 132 | 133 | if (CASra(&curr->next, &next, temp)) { 134 | next = temp; 135 | th->spare = NULL; 136 | } 137 | } 138 | 139 | curr = next; 140 | } 141 | 142 | *ptr = curr; 143 | return &curr->cells[i % N]; 144 | } 145 | 146 | static int enq_fast(queue_t *q, handle_t *th, void *v, long *id) { 147 | long i = FAAcs(&q->Ei, 1); 148 | cell_t *c = find_cell(&th->Ep, i, th); 149 | void *cv = BOT; 150 | 151 | if (CAS(&c->val, &cv, v)) { 152 | #ifdef RECORD 153 | th->fastenq++; 154 | #endif 155 | return 1; 156 | } else { 157 | *id = i; 158 | return 0; 159 | } 160 | } 161 | 162 | static void enq_slow(queue_t *q, handle_t *th, void *v, long id) { 163 | enq_t *enq = &th->Er; 164 | enq->val = v; 165 | RELEASE(&enq->id, id); 166 | 167 | node_t *tail = th->Ep; 168 | long i; 169 | cell_t *c; 170 | 171 | do { 172 | i = FAA(&q->Ei, 1); 173 | c = find_cell(&tail, i, th); 174 | enq_t *ce = BOT; 175 | 176 | if (CAScs(&c->enq, &ce, enq) && c->val != TOP) { 177 | if (CAS(&enq->id, &id, -i)) id = -i; 178 | break; 179 | } 180 | } while (enq->id > 0); 181 | 182 | id = -enq->id; 183 | c = find_cell(&th->Ep, id, th); 184 | if (id > i) { 185 | long Ei = q->Ei; 186 | while (Ei <= id && !CAS(&q->Ei, &Ei, id + 1)) 187 | ; 188 | } 189 | c->val = v; 190 | 191 | #ifdef RECORD 192 | th->slowenq++; 193 | #endif 194 | } 195 | 196 | void enqueue(queue_t *q, handle_t *th, void *v) { 197 | th->hzd_node_id = th->enq_node_id; 198 | 199 | long id; 200 | int p = MAX_PATIENCE; 201 | while (!enq_fast(q, th, v, &id) && p-- > 0) 202 | ; 203 | if (p < 0) enq_slow(q, th, v, id); 204 | 205 | th->enq_node_id = th->Ep->id; 206 | RELEASE(&th->hzd_node_id, -1); 207 | } 208 | 209 | static void *help_enq(queue_t *q, handle_t *th, cell_t *c, long i) { 210 | void *v = spin(&c->val); 211 | 212 | if ((v != TOP && v != BOT) || 213 | (v == BOT && !CAScs(&c->val, &v, TOP) && v != TOP)) { 214 | return v; 215 | } 216 | 217 | enq_t *e = c->enq; 218 | 219 | if (e == BOT) { 220 | handle_t *ph; 221 | enq_t *pe; 222 | long id; 223 | ph = th->Eh, pe = &ph->Er, id = pe->id; 224 | 225 | if (th->Ei != 0 && th->Ei != id) { 226 | th->Ei = 0; 227 | th->Eh = ph->next; 228 | ph = th->Eh, pe = &ph->Er, id = pe->id; 229 | } 230 | 231 | if (id > 0 && id <= i && !CAS(&c->enq, &e, pe)) 232 | th->Ei = id; 233 | else 234 | th->Eh = ph->next; 235 | 236 | if (e == BOT && CAS(&c->enq, &e, TOP)) e = TOP; 237 | } 238 | 239 | if (e == TOP) return (q->Ei <= i ? 
BOT : TOP); 240 | 241 | long ei = ACQUIRE(&e->id); 242 | void *ev = ACQUIRE(&e->val); 243 | 244 | if (ei > i) { 245 | if (c->val == TOP && q->Ei <= i) return BOT; 246 | } else { 247 | if ((ei > 0 && CAS(&e->id, &ei, -i)) || (ei == -i && c->val == TOP)) { 248 | long Ei = q->Ei; 249 | while (Ei <= i && !CAS(&q->Ei, &Ei, i + 1)) 250 | ; 251 | c->val = ev; 252 | } 253 | } 254 | 255 | return c->val; 256 | } 257 | 258 | static void help_deq(queue_t *q, handle_t *th, handle_t *ph) { 259 | deq_t *deq = &ph->Dr; 260 | long idx = ACQUIRE(&deq->idx); 261 | long id = deq->id; 262 | 263 | if (idx < id) return; 264 | 265 | node_t *Dp = ph->Dp; 266 | th->hzd_node_id = ph->hzd_node_id; 267 | FENCE(); 268 | idx = deq->idx; 269 | 270 | long i = id + 1, old = id, new = 0; 271 | while (1) { 272 | node_t *h = Dp; 273 | for (; idx == old && new == 0; ++i) { 274 | cell_t *c = find_cell(&h, i, th); 275 | 276 | long Di = q->Di; 277 | while (Di <= i && !CAS(&q->Di, &Di, i + 1)) 278 | ; 279 | 280 | void *v = help_enq(q, th, c, i); 281 | if (v == BOT || (v != TOP && c->deq == BOT)) 282 | new = i; 283 | else 284 | idx = ACQUIRE(&deq->idx); 285 | } 286 | 287 | if (new != 0) { 288 | if (CASra(&deq->idx, &idx, new)) idx = new; 289 | if (idx >= new) new = 0; 290 | } 291 | 292 | if (idx < 0 || deq->id != id) break; 293 | 294 | cell_t *c = find_cell(&Dp, idx, th); 295 | deq_t *cd = BOT; 296 | if (c->val == TOP || CAS(&c->deq, &cd, deq) || cd == deq) { 297 | CAS(&deq->idx, &idx, -idx); 298 | break; 299 | } 300 | 301 | old = idx; 302 | if (idx >= i) i = idx + 1; 303 | } 304 | } 305 | 306 | static void *deq_fast(queue_t *q, handle_t *th, long *id) { 307 | long i = FAAcs(&q->Di, 1); 308 | cell_t *c = find_cell(&th->Dp, i, th); 309 | void *v = help_enq(q, th, c, i); 310 | deq_t *cd = BOT; 311 | 312 | if (v == BOT) return BOT; 313 | if (v != TOP && CAS(&c->deq, &cd, TOP)) return v; 314 | 315 | *id = i; 316 | return TOP; 317 | } 318 | 319 | static void *deq_slow(queue_t *q, handle_t *th, long id) { 320 | deq_t *deq = &th->Dr; 321 | RELEASE(&deq->id, id); 322 | RELEASE(&deq->idx, id); 323 | 324 | help_deq(q, th, th); 325 | long i = -deq->idx; 326 | cell_t *c = find_cell(&th->Dp, i, th); 327 | void *val = c->val; 328 | 329 | #ifdef RECORD 330 | th->slowdeq++; 331 | #endif 332 | return val == TOP ? 
BOT : val; 333 | } 334 | 335 | void *dequeue(queue_t *q, handle_t *th) { 336 | th->hzd_node_id = th->deq_node_id; 337 | 338 | void *v; 339 | long id = 0; 340 | int p = MAX_PATIENCE; 341 | 342 | do 343 | v = deq_fast(q, th, &id); 344 | while (v == TOP && p-- > 0); 345 | if (v == TOP) 346 | v = deq_slow(q, th, id); 347 | else { 348 | #ifdef RECORD 349 | th->fastdeq++; 350 | #endif 351 | } 352 | 353 | if (v != EMPTY) { 354 | help_deq(q, th, th->Dh); 355 | th->Dh = th->Dh->next; 356 | } 357 | 358 | th->deq_node_id = th->Dp->id; 359 | RELEASE(&th->hzd_node_id, -1); 360 | 361 | if (th->spare == NULL) { 362 | cleanup(q, th); 363 | th->spare = new_node(); 364 | } 365 | 366 | #ifdef RECORD 367 | if (v == EMPTY) th->empty++; 368 | #endif 369 | return v; 370 | } 371 | 372 | static pthread_barrier_t barrier; 373 | 374 | void queue_init(queue_t *q, int nprocs) { 375 | q->Hi = 0; 376 | q->Hp = new_node(); 377 | 378 | q->Ei = 1; 379 | q->Di = 1; 380 | 381 | q->nprocs = nprocs; 382 | 383 | #ifdef RECORD 384 | q->fastenq = 0; 385 | q->slowenq = 0; 386 | q->fastdeq = 0; 387 | q->slowdeq = 0; 388 | q->empty = 0; 389 | #endif 390 | pthread_barrier_init(&barrier, NULL, nprocs); 391 | } 392 | 393 | void queue_free(queue_t *q, handle_t *h) { 394 | #ifdef RECORD 395 | static int lock = 0; 396 | 397 | FAA(&q->fastenq, h->fastenq); 398 | FAA(&q->slowenq, h->slowenq); 399 | FAA(&q->fastdeq, h->fastdeq); 400 | FAA(&q->slowdeq, h->slowdeq); 401 | FAA(&q->empty, h->empty); 402 | 403 | pthread_barrier_wait(&barrier); 404 | 405 | if (FAA(&lock, 1) == 0) 406 | printf("Enq: %f Deq: %f Empty: %f\n", 407 | q->slowenq * 100.0 / (q->fastenq + q->slowenq), 408 | q->slowdeq * 100.0 / (q->fastdeq + q->slowdeq), 409 | q->empty * 100.0 / (q->fastdeq + q->slowdeq)); 410 | #endif 411 | } 412 | 413 | void queue_register(queue_t *q, handle_t *th, int id) { 414 | th->next = NULL; 415 | th->hzd_node_id = -1; 416 | th->Ep = q->Hp; 417 | th->enq_node_id = th->Ep->id; 418 | th->Dp = q->Hp; 419 | th->deq_node_id = th->Dp->id; 420 | 421 | th->Er.id = 0; 422 | th->Er.val = BOT; 423 | th->Dr.id = 0; 424 | th->Dr.idx = -1; 425 | 426 | th->Ei = 0; 427 | th->spare = new_node(); 428 | #ifdef RECORD 429 | th->slowenq = 0; 430 | th->slowdeq = 0; 431 | th->fastenq = 0; 432 | th->fastdeq = 0; 433 | th->empty = 0; 434 | #endif 435 | 436 | static handle_t *volatile _tail; 437 | handle_t *tail = _tail; 438 | 439 | if (tail == NULL) { 440 | th->next = th; 441 | if (CASra(&_tail, &tail, th)) { 442 | th->Eh = th->next; 443 | th->Dh = th->next; 444 | return; 445 | } 446 | } 447 | 448 | handle_t *next = tail->next; 449 | do 450 | th->next = next; 451 | while (!CASra(&tail->next, &next, th)); 452 | 453 | th->Eh = th->next; 454 | th->Dh = th->next; 455 | } 456 | -------------------------------------------------------------------------------- /benchmark/wfqueue.h: -------------------------------------------------------------------------------- 1 | #ifndef WFQUEUE_H 2 | #define WFQUEUE_H 3 | 4 | #ifdef WFQUEUE 5 | 6 | #include "align.h" 7 | #define EMPTY ((void *) 0) 8 | 9 | #ifndef WFQUEUE_NODE_SIZE 10 | #define WFQUEUE_NODE_SIZE ((1 << 10) - 2) 11 | #endif 12 | 13 | struct _enq_t { 14 | long volatile id; 15 | void * volatile val; 16 | } CACHE_ALIGNED; 17 | 18 | struct _deq_t { 19 | long volatile id; 20 | long volatile idx; 21 | } CACHE_ALIGNED; 22 | 23 | struct _cell_t { 24 | void * volatile val; 25 | struct _enq_t * volatile enq; 26 | struct _deq_t * volatile deq; 27 | void * pad[5]; 28 | }; 29 | 30 | struct _node_t { 31 | struct _node_t * volatile next 
CACHE_ALIGNED; 32 | long id CACHE_ALIGNED; 33 | struct _cell_t cells[WFQUEUE_NODE_SIZE] CACHE_ALIGNED; 34 | }; 35 | 36 | typedef struct DOUBLE_CACHE_ALIGNED { 37 | /** 38 | * Index of the next position for enqueue. 39 | */ 40 | volatile long Ei DOUBLE_CACHE_ALIGNED; 41 | 42 | /** 43 | * Index of the next position for dequeue. 44 | */ 45 | volatile long Di DOUBLE_CACHE_ALIGNED; 46 | 47 | /** 48 | * Index of the head of the queue. 49 | */ 50 | volatile long Hi DOUBLE_CACHE_ALIGNED; 51 | 52 | /** 53 | * Pointer to the head node of the queue. 54 | */ 55 | struct _node_t * volatile Hp; 56 | 57 | /** 58 | * Number of processors. 59 | */ 60 | long nprocs; 61 | #ifdef RECORD 62 | long slowenq; 63 | long slowdeq; 64 | long fastenq; 65 | long fastdeq; 66 | long empty; 67 | #endif 68 | } queue_t; 69 | 70 | typedef struct _handle_t { 71 | /** 72 | * Pointer to the next handle. 73 | */ 74 | struct _handle_t * next; 75 | 76 | /** 77 | * Hazard pointer. 78 | */ 79 | //struct _node_t * volatile Hp; 80 | unsigned long volatile hzd_node_id; 81 | 82 | /** 83 | * Pointer to the node for enqueue. 84 | */ 85 | struct _node_t * volatile Ep; 86 | unsigned long enq_node_id; 87 | 88 | /** 89 | * Pointer to the node for dequeue. 90 | */ 91 | struct _node_t * volatile Dp; 92 | unsigned long deq_node_id; 93 | 94 | /** 95 | * Enqueue request. 96 | */ 97 | struct _enq_t Er CACHE_ALIGNED; 98 | 99 | /** 100 | * Dequeue request. 101 | */ 102 | struct _deq_t Dr CACHE_ALIGNED; 103 | 104 | /** 105 | * Handle of the next enqueuer to help. 106 | */ 107 | struct _handle_t * Eh CACHE_ALIGNED; 108 | 109 | long Ei; 110 | 111 | /** 112 | * Handle of the next dequeuer to help. 113 | */ 114 | struct _handle_t * Dh; 115 | 116 | /** 117 | * Pointer to a spare node to use, to speedup adding a new node. 118 | */ 119 | struct _node_t * spare CACHE_ALIGNED; 120 | 121 | /** 122 | * Count the delay rounds of helping another dequeuer. 123 | */ 124 | int delay; 125 | 126 | #ifdef RECORD 127 | long slowenq; 128 | long slowdeq; 129 | long fastenq; 130 | long fastdeq; 131 | long empty; 132 | #endif 133 | } handle_t; 134 | 135 | #endif 136 | 137 | #endif /* end of include guard: WFQUEUE_H */ 138 | -------------------------------------------------------------------------------- /benchmark/xxhash.c: -------------------------------------------------------------------------------- 1 | /* 2 | xxHash - Fast Hash algorithm 3 | Copyright (C) 2012-2014, Yann Collet. 4 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following disclaimer 14 | in the documentation and/or other materials provided with the 15 | distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | You can contact the author at : 30 | - xxHash source repository : http://code.google.com/p/xxhash/ 31 | - public discussion board : https://groups.google.com/forum/#!forum/lz4c 32 | */ 33 | 34 | 35 | //************************************** 36 | // Tuning parameters 37 | //************************************** 38 | // Unaligned memory access is automatically enabled for "common" CPU, such as x86. 39 | // For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. 40 | // If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. 41 | // You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32). 42 | #if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) 43 | # define XXH_USE_UNALIGNED_ACCESS 1 44 | #endif 45 | 46 | // XXH_ACCEPT_NULL_INPUT_POINTER : 47 | // If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. 48 | // When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. 49 | // This option has a very small performance cost (only measurable on small inputs). 50 | // By default, this option is disabled. To enable it, uncomment below define : 51 | // #define XXH_ACCEPT_NULL_INPUT_POINTER 1 52 | 53 | // XXH_FORCE_NATIVE_FORMAT : 54 | // By default, xxHash library provides endian-independant Hash values, based on little-endian convention. 55 | // Results are therefore identical for little-endian and big-endian CPU. 56 | // This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. 57 | // Should endian-independance be of no importance for your application, you may set the #define below to 1. 58 | // It will improve speed for Big-endian CPU. 59 | // This option has no impact on Little_Endian CPU. 
60 | #define XXH_FORCE_NATIVE_FORMAT 0 61 | 62 | //************************************** 63 | // Compiler Specific Options 64 | //************************************** 65 | // Disable some Visual warning messages 66 | #ifdef _MSC_VER // Visual Studio 67 | # pragma warning(disable : 4127) // disable: C4127: conditional expression is constant 68 | #endif 69 | 70 | #ifdef _MSC_VER // Visual Studio 71 | # define FORCE_INLINE static __forceinline 72 | #else 73 | # ifdef __GNUC__ 74 | # define FORCE_INLINE static inline __attribute__((always_inline)) 75 | # else 76 | # define FORCE_INLINE static inline 77 | # endif 78 | #endif 79 | 80 | //************************************** 81 | // Includes & Memory related functions 82 | //************************************** 83 | #include "xxhash.h" 84 | // Modify the local functions below should you wish to use some other memory routines 85 | // for malloc(), free() 86 | #include 87 | static void* XXH_malloc(size_t s) { return malloc(s); } 88 | static void XXH_free (void* p) { free(p); } 89 | // for memcpy() 90 | #include 91 | static void* XXH_memcpy(void* dest, const void* src, size_t size) 92 | { 93 | return memcpy(dest,src,size); 94 | } 95 | 96 | 97 | //************************************** 98 | // Basic Types 99 | //************************************** 100 | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 101 | # include 102 | typedef uint8_t BYTE; 103 | typedef uint16_t U16; 104 | typedef uint32_t U32; 105 | typedef int32_t S32; 106 | typedef uint64_t U64; 107 | #else 108 | typedef unsigned char BYTE; 109 | typedef unsigned short U16; 110 | typedef unsigned int U32; 111 | typedef signed int S32; 112 | typedef unsigned long long U64; 113 | #endif 114 | 115 | #if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS) 116 | # define _PACKED __attribute__ ((packed)) 117 | #else 118 | # define _PACKED 119 | #endif 120 | 121 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) 122 | # ifdef __IBMC__ 123 | # pragma pack(1) 124 | # else 125 | # pragma pack(push, 1) 126 | # endif 127 | #endif 128 | 129 | typedef struct _U32_S 130 | { 131 | U32 v; 132 | } _PACKED U32_S; 133 | typedef struct _U64_S 134 | { 135 | U64 v; 136 | } _PACKED U64_S; 137 | 138 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) 139 | # pragma pack(pop) 140 | #endif 141 | 142 | #define A32(x) (((U32_S *)(x))->v) 143 | #define A64(x) (((U64_S *)(x))->v) 144 | 145 | 146 | //*************************************** 147 | // Compiler-specific Functions and Macros 148 | //*************************************** 149 | #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) 150 | 151 | // Note : although _rotl exists for minGW (GCC under windows), performance seems poor 152 | #if defined(_MSC_VER) 153 | # define XXH_rotl32(x,r) _rotl(x,r) 154 | # define XXH_rotl64(x,r) _rotl64(x,r) 155 | #else 156 | # define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) 157 | # define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) 158 | #endif 159 | 160 | #if defined(_MSC_VER) // Visual Studio 161 | # define XXH_swap32 _byteswap_ulong 162 | # define XXH_swap64 _byteswap_uint64 163 | #elif GCC_VERSION >= 403 164 | # define XXH_swap32 __builtin_bswap32 165 | # define XXH_swap64 __builtin_bswap64 166 | #else 167 | static inline U32 XXH_swap32 (U32 x) 168 | { 169 | return ((x << 24) & 0xff000000 ) | 170 | ((x << 8) & 0x00ff0000 ) | 171 | ((x >> 8) & 0x0000ff00 ) | 172 | ((x >> 24) & 0x000000ff ); 173 | } 174 | static inline U64 XXH_swap64 (U64 x) 175 | { 176 | return ((x 
<< 56) & 0xff00000000000000ULL) | 177 | ((x << 40) & 0x00ff000000000000ULL) | 178 | ((x << 24) & 0x0000ff0000000000ULL) | 179 | ((x << 8) & 0x000000ff00000000ULL) | 180 | ((x >> 8) & 0x00000000ff000000ULL) | 181 | ((x >> 24) & 0x0000000000ff0000ULL) | 182 | ((x >> 40) & 0x000000000000ff00ULL) | 183 | ((x >> 56) & 0x00000000000000ffULL); 184 | } 185 | #endif 186 | 187 | 188 | //************************************** 189 | // Constants 190 | //************************************** 191 | #define PRIME32_1 2654435761U 192 | #define PRIME32_2 2246822519U 193 | #define PRIME32_3 3266489917U 194 | #define PRIME32_4 668265263U 195 | #define PRIME32_5 374761393U 196 | 197 | #define PRIME64_1 11400714785074694791ULL 198 | #define PRIME64_2 14029467366897019727ULL 199 | #define PRIME64_3 1609587929392839161ULL 200 | #define PRIME64_4 9650029242287828579ULL 201 | #define PRIME64_5 2870177450012600261ULL 202 | 203 | //************************************** 204 | // Architecture Macros 205 | //************************************** 206 | typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; 207 | #ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch 208 | static const int one = 1; 209 | # define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) 210 | #endif 211 | 212 | 213 | //************************************** 214 | // Macros 215 | //************************************** 216 | #define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations 217 | 218 | 219 | //**************************** 220 | // Memory reads 221 | //**************************** 222 | typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; 223 | 224 | FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) 225 | { 226 | if (align==XXH_unaligned) 227 | return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); 228 | else 229 | return endian==XXH_littleEndian ? *(U32*)ptr : XXH_swap32(*(U32*)ptr); 230 | } 231 | 232 | FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) 233 | { 234 | return XXH_readLE32_align(ptr, endian, XXH_unaligned); 235 | } 236 | 237 | FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) 238 | { 239 | if (align==XXH_unaligned) 240 | return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr)); 241 | else 242 | return endian==XXH_littleEndian ? 
*(U64*)ptr : XXH_swap64(*(U64*)ptr); 243 | } 244 | 245 | FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) 246 | { 247 | return XXH_readLE64_align(ptr, endian, XXH_unaligned); 248 | } 249 | 250 | 251 | //**************************** 252 | // Simple Hash Functions 253 | //**************************** 254 | FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) 255 | { 256 | const BYTE* p = (const BYTE*)input; 257 | const BYTE* bEnd = p + len; 258 | U32 h32; 259 | #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) 260 | 261 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER 262 | if (p==NULL) 263 | { 264 | len=0; 265 | bEnd=p=(const BYTE*)(size_t)16; 266 | } 267 | #endif 268 | 269 | if (len>=16) 270 | { 271 | const BYTE* const limit = bEnd - 16; 272 | U32 v1 = seed + PRIME32_1 + PRIME32_2; 273 | U32 v2 = seed + PRIME32_2; 274 | U32 v3 = seed + 0; 275 | U32 v4 = seed - PRIME32_1; 276 | 277 | do 278 | { 279 | v1 += XXH_get32bits(p) * PRIME32_2; 280 | v1 = XXH_rotl32(v1, 13); 281 | v1 *= PRIME32_1; 282 | p+=4; 283 | v2 += XXH_get32bits(p) * PRIME32_2; 284 | v2 = XXH_rotl32(v2, 13); 285 | v2 *= PRIME32_1; 286 | p+=4; 287 | v3 += XXH_get32bits(p) * PRIME32_2; 288 | v3 = XXH_rotl32(v3, 13); 289 | v3 *= PRIME32_1; 290 | p+=4; 291 | v4 += XXH_get32bits(p) * PRIME32_2; 292 | v4 = XXH_rotl32(v4, 13); 293 | v4 *= PRIME32_1; 294 | p+=4; 295 | } 296 | while (p<=limit); 297 | 298 | h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); 299 | } 300 | else 301 | { 302 | h32 = seed + PRIME32_5; 303 | } 304 | 305 | h32 += (U32) len; 306 | 307 | while (p+4<=bEnd) 308 | { 309 | h32 += XXH_get32bits(p) * PRIME32_3; 310 | h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; 311 | p+=4; 312 | } 313 | 314 | while (p> 15; 322 | h32 *= PRIME32_2; 323 | h32 ^= h32 >> 13; 324 | h32 *= PRIME32_3; 325 | h32 ^= h32 >> 16; 326 | 327 | return h32; 328 | } 329 | 330 | 331 | unsigned int XXH32 (const void* input, size_t len, unsigned seed) 332 | { 333 | #if 0 334 | // Simple version, good for code maintenance, but unfortunately slow for small inputs 335 | XXH32_state_t state; 336 | XXH32_reset(&state, seed); 337 | XXH32_update(&state, input, len); 338 | return XXH32_digest(&state); 339 | #else 340 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; 341 | 342 | # if !defined(XXH_USE_UNALIGNED_ACCESS) 343 | if ((((size_t)input) & 3) == 0) // Input is aligned, let's leverage the speed advantage 344 | { 345 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) 346 | return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); 347 | else 348 | return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); 349 | } 350 | # endif 351 | 352 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) 353 | return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); 354 | else 355 | return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); 356 | #endif 357 | } 358 | 359 | FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) 360 | { 361 | const BYTE* p = (const BYTE*)input; 362 | const BYTE* bEnd = p + len; 363 | U64 h64; 364 | #define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) 365 | 366 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER 367 | if (p==NULL) 368 | { 369 | len=0; 370 | bEnd=p=(const BYTE*)(size_t)32; 371 | } 372 | #endif 373 | 374 | if (len>=32) 375 | { 
376 | const BYTE* const limit = bEnd - 32; 377 | U64 v1 = seed + PRIME64_1 + PRIME64_2; 378 | U64 v2 = seed + PRIME64_2; 379 | U64 v3 = seed + 0; 380 | U64 v4 = seed - PRIME64_1; 381 | 382 | do 383 | { 384 | v1 += XXH_get64bits(p) * PRIME64_2; 385 | p+=8; 386 | v1 = XXH_rotl64(v1, 31); 387 | v1 *= PRIME64_1; 388 | v2 += XXH_get64bits(p) * PRIME64_2; 389 | p+=8; 390 | v2 = XXH_rotl64(v2, 31); 391 | v2 *= PRIME64_1; 392 | v3 += XXH_get64bits(p) * PRIME64_2; 393 | p+=8; 394 | v3 = XXH_rotl64(v3, 31); 395 | v3 *= PRIME64_1; 396 | v4 += XXH_get64bits(p) * PRIME64_2; 397 | p+=8; 398 | v4 = XXH_rotl64(v4, 31); 399 | v4 *= PRIME64_1; 400 | } 401 | while (p<=limit); 402 | 403 | h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); 404 | 405 | v1 *= PRIME64_2; 406 | v1 = XXH_rotl64(v1, 31); 407 | v1 *= PRIME64_1; 408 | h64 ^= v1; 409 | h64 = h64 * PRIME64_1 + PRIME64_4; 410 | 411 | v2 *= PRIME64_2; 412 | v2 = XXH_rotl64(v2, 31); 413 | v2 *= PRIME64_1; 414 | h64 ^= v2; 415 | h64 = h64 * PRIME64_1 + PRIME64_4; 416 | 417 | v3 *= PRIME64_2; 418 | v3 = XXH_rotl64(v3, 31); 419 | v3 *= PRIME64_1; 420 | h64 ^= v3; 421 | h64 = h64 * PRIME64_1 + PRIME64_4; 422 | 423 | v4 *= PRIME64_2; 424 | v4 = XXH_rotl64(v4, 31); 425 | v4 *= PRIME64_1; 426 | h64 ^= v4; 427 | h64 = h64 * PRIME64_1 + PRIME64_4; 428 | } 429 | else 430 | { 431 | h64 = seed + PRIME64_5; 432 | } 433 | 434 | h64 += (U64) len; 435 | 436 | while (p+8<=bEnd) 437 | { 438 | U64 k1 = XXH_get64bits(p); 439 | k1 *= PRIME64_2; 440 | k1 = XXH_rotl64(k1,31); 441 | k1 *= PRIME64_1; 442 | h64 ^= k1; 443 | h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; 444 | p+=8; 445 | } 446 | 447 | if (p+4<=bEnd) 448 | { 449 | h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; 450 | h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; 451 | p+=4; 452 | } 453 | 454 | while (p> 33; 462 | h64 *= PRIME64_2; 463 | h64 ^= h64 >> 29; 464 | h64 *= PRIME64_3; 465 | h64 ^= h64 >> 32; 466 | 467 | return h64; 468 | } 469 | 470 | 471 | unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) 472 | { 473 | #if 0 474 | // Simple version, good for code maintenance, but unfortunately slow for small inputs 475 | XXH64_state_t state; 476 | XXH64_reset(&state, seed); 477 | XXH64_update(&state, input, len); 478 | return XXH64_digest(&state); 479 | #else 480 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; 481 | 482 | # if !defined(XXH_USE_UNALIGNED_ACCESS) 483 | if ((((size_t)input) & 7)==0) // Input is aligned, let's leverage the speed advantage 484 | { 485 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) 486 | return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); 487 | else 488 | return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); 489 | } 490 | # endif 491 | 492 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) 493 | return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); 494 | else 495 | return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); 496 | #endif 497 | } 498 | 499 | /**************************************************** 500 | * Advanced Hash Functions 501 | ****************************************************/ 502 | 503 | /*** Allocation ***/ 504 | typedef struct 505 | { 506 | U64 total_len; 507 | U32 seed; 508 | U32 v1; 509 | U32 v2; 510 | U32 v3; 511 | U32 v4; 512 | U32 mem32[4]; /* defined as U32 for alignment */ 513 | U32 memsize; 514 | } XXH_istate32_t; 515 | 516 | typedef struct 517 
| { 518 | U64 total_len; 519 | U64 seed; 520 | U64 v1; 521 | U64 v2; 522 | U64 v3; 523 | U64 v4; 524 | U64 mem64[4]; /* defined as U64 for alignment */ 525 | U32 memsize; 526 | } XXH_istate64_t; 527 | 528 | 529 | XXH32_state_t* XXH32_createState(void) 530 | { 531 | XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t)); // A compilation error here means XXH32_state_t is not large enough 532 | return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); 533 | } 534 | XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) 535 | { 536 | XXH_free(statePtr); 537 | return XXH_OK; 538 | }; 539 | 540 | XXH64_state_t* XXH64_createState(void) 541 | { 542 | XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t)); // A compilation error here means XXH64_state_t is not large enough 543 | return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); 544 | } 545 | XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) 546 | { 547 | XXH_free(statePtr); 548 | return XXH_OK; 549 | }; 550 | 551 | 552 | /*** Hash feed ***/ 553 | 554 | XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed) 555 | { 556 | XXH_istate32_t* state = (XXH_istate32_t*) state_in; 557 | state->seed = seed; 558 | state->v1 = seed + PRIME32_1 + PRIME32_2; 559 | state->v2 = seed + PRIME32_2; 560 | state->v3 = seed + 0; 561 | state->v4 = seed - PRIME32_1; 562 | state->total_len = 0; 563 | state->memsize = 0; 564 | return XXH_OK; 565 | } 566 | 567 | XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed) 568 | { 569 | XXH_istate64_t* state = (XXH_istate64_t*) state_in; 570 | state->seed = seed; 571 | state->v1 = seed + PRIME64_1 + PRIME64_2; 572 | state->v2 = seed + PRIME64_2; 573 | state->v3 = seed + 0; 574 | state->v4 = seed - PRIME64_1; 575 | state->total_len = 0; 576 | state->memsize = 0; 577 | return XXH_OK; 578 | } 579 | 580 | 581 | FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian) 582 | { 583 | XXH_istate32_t* state = (XXH_istate32_t *) state_in; 584 | const BYTE* p = (const BYTE*)input; 585 | const BYTE* const bEnd = p + len; 586 | 587 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER 588 | if (input==NULL) return XXH_ERROR; 589 | #endif 590 | 591 | state->total_len += len; 592 | 593 | if (state->memsize + len < 16) // fill in tmp buffer 594 | { 595 | XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); 596 | state->memsize += (U32)len; 597 | return XXH_OK; 598 | } 599 | 600 | if (state->memsize) // some data left from previous update 601 | { 602 | XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); 603 | { 604 | const U32* p32 = state->mem32; 605 | state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; 606 | state->v1 = XXH_rotl32(state->v1, 13); 607 | state->v1 *= PRIME32_1; 608 | p32++; 609 | state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; 610 | state->v2 = XXH_rotl32(state->v2, 13); 611 | state->v2 *= PRIME32_1; 612 | p32++; 613 | state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; 614 | state->v3 = XXH_rotl32(state->v3, 13); 615 | state->v3 *= PRIME32_1; 616 | p32++; 617 | state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; 618 | state->v4 = XXH_rotl32(state->v4, 13); 619 | state->v4 *= PRIME32_1; 620 | p32++; 621 | } 622 | p += 16-state->memsize; 623 | state->memsize = 0; 624 | } 625 | 626 | if (p <= bEnd-16) 627 | { 628 | const BYTE* const limit = bEnd - 16; 629 | U32 v1 = state->v1; 630 | U32 v2 = state->v2; 631 | U32 v3 = state->v3; 632 | U32 v4 = state->v4; 633 | 634 | do 635 | { 
636 | v1 += XXH_readLE32(p, endian) * PRIME32_2;
637 | v1 = XXH_rotl32(v1, 13);
638 | v1 *= PRIME32_1;
639 | p+=4;
640 | v2 += XXH_readLE32(p, endian) * PRIME32_2;
641 | v2 = XXH_rotl32(v2, 13);
642 | v2 *= PRIME32_1;
643 | p+=4;
644 | v3 += XXH_readLE32(p, endian) * PRIME32_2;
645 | v3 = XXH_rotl32(v3, 13);
646 | v3 *= PRIME32_1;
647 | p+=4;
648 | v4 += XXH_readLE32(p, endian) * PRIME32_2;
649 | v4 = XXH_rotl32(v4, 13);
650 | v4 *= PRIME32_1;
651 | p+=4;
652 | }
653 | while (p<=limit);
654 | 
655 | state->v1 = v1;
656 | state->v2 = v2;
657 | state->v3 = v3;
658 | state->v4 = v4;
659 | }
660 | 
661 | if (p < bEnd)
662 | {
663 | XXH_memcpy(state->mem32, p, bEnd-p);
664 | state->memsize = (int)(bEnd-p);
665 | }
666 | 
667 | return XXH_OK;
668 | }
669 | 
670 | XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
671 | {
672 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
673 | 
674 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
675 | return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
676 | else
677 | return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
678 | }
679 | 
680 | 
681 | 
682 | FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
683 | {
684 | XXH_istate32_t* state = (XXH_istate32_t*) state_in;
685 | const BYTE * p = (const BYTE*)state->mem32;
686 | BYTE* bEnd = (BYTE*)(state->mem32) + state->memsize;
687 | U32 h32;
688 | 
689 | if (state->total_len >= 16)
690 | {
691 | h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
692 | }
693 | else
694 | {
695 | h32 = state->seed + PRIME32_5;
696 | }
697 | 
698 | h32 += (U32) state->total_len;
699 | 
700 | while (p+4<=bEnd)
701 | {
702 | h32 += XXH_readLE32(p, endian) * PRIME32_3;
703 | h32 = XXH_rotl32(h32, 17) * PRIME32_4;
704 | p+=4;
705 | }
706 | 
707 | while (p<bEnd)
708 | {
709 | h32 += (*p) * PRIME32_5;
710 | h32 = XXH_rotl32(h32, 11) * PRIME32_1;
711 | p++;
712 | }
713 | 
714 | h32 ^= h32 >> 15;
715 | h32 *= PRIME32_2;
716 | h32 ^= h32 >> 13;
717 | h32 *= PRIME32_3;
718 | h32 ^= h32 >> 16;
719 | 
720 | return h32;
721 | }
722 | 
723 | 
724 | U32 XXH32_digest (const XXH32_state_t* state_in)
725 | {
726 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
727 | 
728 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
729 | return XXH32_digest_endian(state_in, XXH_littleEndian);
730 | else
731 | return XXH32_digest_endian(state_in, XXH_bigEndian);
732 | }
733 | 
734 | 
735 | FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
736 | {
737 | XXH_istate64_t * state = (XXH_istate64_t *) state_in;
738 | const BYTE* p = (const BYTE*)input;
739 | const BYTE* const bEnd = p + len;
740 | 
741 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
742 | if (input==NULL) return XXH_ERROR;
743 | #endif
744 | 
745 | state->total_len += len;
746 | 
747 | if (state->memsize + len < 32) // fill in tmp buffer
748 | {
749 | XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
750 | state->memsize += (U32)len;
751 | return XXH_OK;
752 | }
753 | 
754 | if (state->memsize) // some data left from previous update
755 | {
756 | XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
757 | {
758 | const U64* p64 = state->mem64;
759 | state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
760 | state->v1 = XXH_rotl64(state->v1, 31);
761 | state->v1 *= PRIME64_1;
762 | p64++;
763 | state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
764 | state->v2 = XXH_rotl64(state->v2, 31);
765 | state->v2 *= PRIME64_1;
766 | p64++;
767 | state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
768 | state->v3 = XXH_rotl64(state->v3, 31);
769 | state->v3 *= PRIME64_1;
770 | p64++;
771 | state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
772 | state->v4 = XXH_rotl64(state->v4, 31);
773 | state->v4 *= PRIME64_1;
774 | p64++;
775 | }
776 | p += 32-state->memsize;
777 | state->memsize = 0;
778 | }
779 | 
780 | if (p+32 <= bEnd)
781 | {
782 | const BYTE* const limit = bEnd - 32;
783 | U64 v1 = state->v1;
784 | U64 v2 = state->v2;
785 | U64 v3 = state->v3;
786 | U64 v4 = state->v4;
787 | 
788 | do
789 | {
790 | v1 += XXH_readLE64(p, endian) * PRIME64_2;
791 | v1 = XXH_rotl64(v1, 31);
792 | v1 *= PRIME64_1;
793 | p+=8;
794 | v2 += XXH_readLE64(p, endian) * PRIME64_2;
795 | v2 = XXH_rotl64(v2, 31);
796 | v2 *= PRIME64_1;
797 | p+=8;
798 | v3 += XXH_readLE64(p, endian) * PRIME64_2;
799 | v3 = XXH_rotl64(v3, 31);
800 | v3 *= PRIME64_1;
801 | p+=8;
802 | v4 += XXH_readLE64(p, endian) * PRIME64_2;
803 | v4 = XXH_rotl64(v4, 31);
804 | v4 *= PRIME64_1;
805 | p+=8;
806 | }
807 | while (p<=limit);
808 | 
809 | state->v1 = v1;
810 | state->v2 = v2;
811 | state->v3 = v3;
812 | state->v4 = v4;
813 | }
814 | 
815 | if (p < bEnd)
816 | {
817 | XXH_memcpy(state->mem64, p, bEnd-p);
818 | state->memsize = (int)(bEnd-p);
819 | }
820 | 
821 | return XXH_OK;
822 | }
823 | 
824 | XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
825 | {
826 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
827 | 
828 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
829 | return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
830 | else
831 | return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
832 | }
833 | 
834 | 
835 | 
836 | FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
837 | {
838 | XXH_istate64_t * state = (XXH_istate64_t *) state_in;
839 | const BYTE * p = (const BYTE*)state->mem64;
840 | BYTE* bEnd = (BYTE*)state->mem64 + state->memsize;
841 | U64 h64;
842 | 
843 | if (state->total_len >= 32)
844 | {
845 | U64 v1 = state->v1;
846 | U64 v2 = state->v2;
847 | U64 v3 = state->v3;
848 | U64 v4 = state->v4;
849 | 
850 | h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
851 | 
852 | v1 *= PRIME64_2;
853 | v1 = XXH_rotl64(v1, 31);
854 | v1 *= PRIME64_1;
855 | h64 ^= v1;
856 | h64 = h64*PRIME64_1 + PRIME64_4;
857 | 
858 | v2 *= PRIME64_2;
859 | v2 = XXH_rotl64(v2, 31);
860 | v2 *= PRIME64_1;
861 | h64 ^= v2;
862 | h64 = h64*PRIME64_1 + PRIME64_4;
863 | 
864 | v3 *= PRIME64_2;
865 | v3 = XXH_rotl64(v3, 31);
866 | v3 *= PRIME64_1;
867 | h64 ^= v3;
868 | h64 = h64*PRIME64_1 + PRIME64_4;
869 | 
870 | v4 *= PRIME64_2;
871 | v4 = XXH_rotl64(v4, 31);
872 | v4 *= PRIME64_1;
873 | h64 ^= v4;
874 | h64 = h64*PRIME64_1 + PRIME64_4;
875 | }
876 | else
877 | {
878 | h64 = state->seed + PRIME64_5;
879 | }
880 | 
881 | h64 += (U64) state->total_len;
882 | 
883 | while (p+8<=bEnd)
884 | {
885 | U64 k1 = XXH_readLE64(p, endian);
886 | k1 *= PRIME64_2;
887 | k1 = XXH_rotl64(k1,31);
888 | k1 *= PRIME64_1;
889 | h64 ^= k1;
890 | h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
891 | p+=8;
892 | }
893 | 
894 | if (p+4<=bEnd)
895 | {
896 | h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
897 | h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
898 | p+=4;
899 | }
900 | 
901 | while (p<bEnd)
902 | {
903 | h64 ^= (*p) * PRIME64_5;
904 | h64 = XXH_rotl64(h64, 11) * PRIME64_1;
905 | p++;
906 | }
907 | 
908 | h64 ^= h64 >> 33;
909 | h64 *= PRIME64_2;
910 | h64 ^= h64 >> 29;
911 | h64 *= PRIME64_3;
912 | h64 ^= h64 >> 32;
913
| 914 | return h64; 915 | } 916 | 917 | 918 | unsigned long long XXH64_digest (const XXH64_state_t* state_in) 919 | { 920 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; 921 | 922 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) 923 | return XXH64_digest_endian(state_in, XXH_littleEndian); 924 | else 925 | return XXH64_digest_endian(state_in, XXH_bigEndian); 926 | } 927 | 928 | 929 | -------------------------------------------------------------------------------- /benchmark/xxhash.h: -------------------------------------------------------------------------------- 1 | /* 2 | xxHash - Extremely Fast Hash algorithm 3 | Header File 4 | Copyright (C) 2012-2014, Yann Collet. 5 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following disclaimer 15 | in the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | You can contact the author at : 31 | - xxHash source repository : http://code.google.com/p/xxhash/ 32 | */ 33 | 34 | /* Notice extracted from xxHash homepage : 35 | 36 | xxHash is an extremely fast Hash algorithm, running at RAM speed limits. 37 | It also successfully passes all tests from the SMHasher suite. 38 | 39 | Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) 40 | 41 | Name Speed Q.Score Author 42 | xxHash 5.4 GB/s 10 43 | CrapWow 3.2 GB/s 2 Andrew 44 | MumurHash 3a 2.7 GB/s 10 Austin Appleby 45 | SpookyHash 2.0 GB/s 10 Bob Jenkins 46 | SBox 1.4 GB/s 9 Bret Mulvey 47 | Lookup3 1.2 GB/s 9 Bob Jenkins 48 | SuperFastHash 1.2 GB/s 1 Paul Hsieh 49 | CityHash64 1.05 GB/s 10 Pike & Alakuijala 50 | FNV 0.55 GB/s 5 Fowler, Noll, Vo 51 | CRC32 0.43 GB/s 9 52 | MD5-32 0.33 GB/s 10 Ronald L. Rivest 53 | SHA1-32 0.28 GB/s 10 54 | 55 | Q.Score is a measure of quality of the hash function. 56 | It depends on successfully passing SMHasher test set. 57 | 10 is a perfect score. 
58 | */
59 | 
60 | #pragma once
61 | 
62 | #if defined (__cplusplus)
63 | extern "C" {
64 | #endif
65 | 
66 | 
67 | /*****************************
68 | Includes
69 | *****************************/
70 | #include <stddef.h>   /* size_t */
71 | 
72 | 
73 | /*****************************
74 | Type
75 | *****************************/
76 | typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
77 | 
78 | 
79 | 
80 | /*****************************
81 | Simple Hash Functions
82 | *****************************/
83 | 
84 | unsigned int XXH32 (const void* input, size_t length, unsigned seed);
85 | unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
86 | 
87 | /*
88 | XXH32() :
89 | Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
90 | The memory between input & input+length must be valid (allocated and read-accessible).
91 | "seed" can be used to alter the result predictably.
92 | This function successfully passes all SMHasher tests.
93 | Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
94 | XXH64() :
95 | Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
96 | */
97 | 
98 | 
99 | 
100 | /*****************************
101 | Advanced Hash Functions
102 | *****************************/
103 | typedef struct { long long ll[ 6]; } XXH32_state_t;
104 | typedef struct { long long ll[11]; } XXH64_state_t;
105 | 
106 | /*
107 | These structures allow static allocation of XXH states.
108 | States must then be initialized using XXHnn_reset() before first use.
109 | 
110 | If you prefer dynamic allocation, please refer to functions below.
111 | */
112 | 
113 | XXH32_state_t* XXH32_createState(void);
114 | XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
115 | 
116 | XXH64_state_t* XXH64_createState(void);
117 | XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
118 | 
119 | /*
120 | These functions create and release memory for XXH state.
121 | States must then be initialized using XXHnn_reset() before first use.
122 | */
123 | 
124 | 
125 | XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed);
126 | XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
127 | unsigned int XXH32_digest (const XXH32_state_t* statePtr);
128 | 
129 | XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed);
130 | XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
131 | unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
132 | 
133 | /*
134 | These functions calculate the xxHash of an input provided in multiple smaller packets,
135 | as opposed to an input provided as a single block.
136 | 
137 | XXH state space must first be allocated, using either static or dynamic method provided above.
138 | 
139 | Start a new hash by initializing state with a seed, using XXHnn_reset().
140 | 
141 | Then, feed the hash state by calling XXHnn_update() as many times as necessary.
142 | Obviously, input must be valid, meaning allocated and read accessible.
143 | The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
144 | 
145 | Finally, you can produce a hash anytime, by using XXHnn_digest().
146 | This function returns the final nn-bits hash.
147 | You can nonetheless continue feeding the hash state with more input,
148 | and therefore get some new hashes, by calling again XXHnn_digest().
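A minimal usage sketch (illustrative only; "buf" and "len" below stand for a
caller-supplied input buffer and its size, they are not part of this header):

    XXH64_state_t* state = XXH64_createState();
    XXH64_reset(state, 0);                        // start a new hash, seed = 0
    XXH64_update(state, buf, len);                // feed input, possibly over several calls
    unsigned long long h = XXH64_digest(state);   // hash of everything fed so far
    XXH64_freeState(state);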
149 | 150 | When you are done, don't forget to free XXH state space, using typically XXHnn_freeState(). 151 | */ 152 | 153 | 154 | #if defined (__cplusplus) 155 | } 156 | #endif 157 | -------------------------------------------------------------------------------- /lf/c11.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2019 Ruslan Nikolaev. All Rights Reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2019 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 
51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __BITS_LF_C11_H 56 | #define __BITS_LF_C11_H 1 57 | 58 | #include 59 | #include 60 | 61 | #define LFATOMIC(x) _Atomic(x) 62 | #define LFATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) 63 | 64 | static inline void __lfaba_init(_Atomic(lfatomic_big_t) * obj, 65 | lfatomic_big_t val) 66 | { 67 | atomic_init(obj, val); 68 | } 69 | 70 | static inline lfatomic_big_t __lfaba_load(_Atomic(lfatomic_big_t) * obj, 71 | memory_order order) 72 | { 73 | #if __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 1 74 | lfatomic_big_t res; 75 | _Atomic(lfatomic_t) * hobj = (_Atomic(lfatomic_t) *) ((uintptr_t) obj); 76 | lfatomic_t * hres = (lfatomic_t *) &res; 77 | 78 | hres[0] = atomic_load_explicit(hobj, order); 79 | hres[1] = atomic_load_explicit(hobj + 1, order); 80 | return res; 81 | #elif __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 0 82 | return atomic_load_explicit(obj, order); 83 | #endif 84 | } 85 | 86 | static inline lfatomic_big_t __lfaba_load_atomic(_Atomic(lfatomic_big_t) * obj, 87 | memory_order order) 88 | { 89 | return atomic_load_explicit(obj, order); 90 | } 91 | 92 | static inline bool __lfaba_cmpxchg_weak(_Atomic(lfatomic_big_t) * obj, 93 | lfatomic_big_t * expected, lfatomic_big_t desired, 94 | memory_order succ, memory_order fail) 95 | { 96 | return atomic_compare_exchange_weak_explicit(obj, expected, desired, 97 | succ, fail); 98 | } 99 | 100 | static inline bool __lfaba_cmpxchg_strong(_Atomic(lfatomic_big_t) * obj, 101 | lfatomic_big_t * expected, lfatomic_big_t desired, 102 | memory_order succ, memory_order fail) 103 | { 104 | return atomic_compare_exchange_strong_explicit(obj, expected, desired, 105 | succ, fail); 106 | } 107 | 108 | static inline lfatomic_big_t __lfaba_fetch_and(_Atomic(lfatomic_big_t) * obj, 109 | lfatomic_big_t arg, memory_order order) 110 | { 111 | return atomic_fetch_and_explicit(obj, arg, order); 112 | } 113 | 114 | #endif /* !__BITS_LF_C11_H */ 115 | 116 | /* vi: set tabstop=4: */ 117 | -------------------------------------------------------------------------------- /lf/config.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2018 Ruslan Nikolaev. All Rights Reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2018 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __BITS_LF_CONFIG_H 56 | #define __BITS_LF_CONFIG_H 1 57 | 58 | #include 59 | #include 60 | 61 | /* For the following architectures, it is cheaper to use split (word-atomic) 62 | loads whenever possible. */ 63 | #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \ 64 | defined(__aarch64__) 65 | # define __LFLOAD_SPLIT(dtype_width) (dtype_width > LFATOMIC_WIDTH) 66 | #else 67 | # define __LFLOAD_SPLIT(dtype_width) 0 68 | #endif 69 | 70 | /* IA-64 provides a 128-bit single-compare/double-swap instruction, so 71 | LFCMPXCHG_SPLIT is true for 128-bit types. 
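   (A single-compare/double-swap exchanges all 128 bits but compares only one
   64-bit word, so the comparison itself remains word-atomic.)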
*/ 72 | #if defined(__ia64__) 73 | # define __LFCMPXCHG_SPLIT(dtype_width) (dtype_width > LFATOMIC_WIDTH) 74 | #else 75 | # define __LFCMPXCHG_SPLIT(dtype_width) 0 76 | #endif 77 | 78 | #if defined(__x86_64__) || defined (__aarch64__) || defined(__powerpc64__) \ 79 | || (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64) 80 | typedef int64_t lfsatomic_t; 81 | typedef uint64_t lfatomic_t; 82 | typedef __uint128_t lfatomic_big_t; 83 | # define LFATOMIC_LOG2 3 84 | # define LFATOMIC_WIDTH 64 85 | # define LFATOMIC_BIG_WIDTH 128 86 | #elif defined(__i386__) || defined(__arm__) || defined(__powerpc__) \ 87 | || (defined(__mips__) && \ 88 | (_MIPS_SIM == _MIPS_SIM_ABI32 || _MIPS_SIM == _MIPS_SIM_NABI32)) 89 | typedef int32_t lfsatomic_t; 90 | typedef uint32_t lfatomic_t; 91 | typedef uint64_t lfatomic_big_t; 92 | # define LFATOMIC_LOG2 2 93 | # define LFATOMIC_WIDTH 32 94 | # define LFATOMIC_BIG_WIDTH 64 95 | #else 96 | typedef intptr_t lfsatomic_t; 97 | typedef uintptr_t lfatomic_t; 98 | typedef uintptr_t lfatomic_big_t; 99 | # if UINTPTR_MAX == UINT32_C(0xFFFFFFFF) 100 | # define LFATOMIC_LOG2 2 101 | # define LFATOMIC_WIDTH 32 102 | # define LFATOMIC_BIG_WIDTH 32 103 | # elif UINTPTR_MAX == UINT64_C(0xFFFFFFFFFFFFFFFF) 104 | # define LFATOMIC_LOG2 3 105 | # define LFATOMIC_WIDTH 64 106 | # define LFATOMIC_BIG_WIDTH 64 107 | # endif 108 | #endif 109 | 110 | /* XXX: True for x86/x86-64 but needs to be properly defined for other CPUs. */ 111 | #define LF_CACHE_SHIFT 7U 112 | #define LF_CACHE_BYTES (1U << LF_CACHE_SHIFT) 113 | 114 | /* Allow to use LEA for x86/x86-64. */ 115 | #if defined(__i386__) || defined(__x86_64__) 116 | # define __LFMERGE(x,y) ((x) + (y)) 117 | #else 118 | # define __LFMERGE(x,y) ((x) | (y)) 119 | #endif 120 | 121 | #endif /* !__BITS_LF_CONFIG_H */ 122 | 123 | /* vi: set tabstop=4: */ 124 | -------------------------------------------------------------------------------- /lf/gcc_x86.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2018 Ruslan Nikolaev. All Rights Reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2018 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 
51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __BITS_LF_GCC_X86_H 56 | #define __BITS_LF_GCC_X86_H 1 57 | 58 | #include 59 | #include 60 | 61 | #define LFATOMIC(x) _Atomic(x) 62 | #define LFATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) 63 | 64 | static inline void __lfbig_init(_Atomic(lfatomic_big_t) * obj, 65 | lfatomic_big_t val) 66 | { 67 | *((volatile lfatomic_big_t *) ((uintptr_t) obj)) = val; 68 | } 69 | 70 | static inline lfatomic_big_t __lfbig_load(_Atomic(lfatomic_big_t) * obj, 71 | memory_order order) 72 | { 73 | return *((volatile lfatomic_big_t *) ((uintptr_t) obj)); 74 | } 75 | 76 | static inline bool __lfbig_cmpxchg_strong(_Atomic(lfatomic_big_t) * obj, 77 | lfatomic_big_t * expected, lfatomic_big_t desired, 78 | memory_order succ, memory_order fail) 79 | { 80 | lfatomic_t low = (lfatomic_t) desired; 81 | lfatomic_t high = (lfatomic_t) (desired >> (sizeof(lfatomic_t) * 8)); 82 | bool result; 83 | 84 | #if defined(__x86_64__) 85 | # define __LFX86_CMPXCHG "cmpxchg16b" 86 | #elif defined(__i386__) 87 | # define __LFX86_CMPXCHG "cmpxchg8b" 88 | #endif 89 | __asm__ __volatile__ ("lock " __LFX86_CMPXCHG " %0" 90 | : "+m" (*obj), "=@ccz" (result), "+A" (*expected) 91 | : "b" (low), "c" (high) 92 | ); 93 | #undef __LFX86_CMPXCHG 94 | 95 | return result; 96 | } 97 | 98 | static inline lfatomic_big_t __lfbig_load_atomic(_Atomic(lfatomic_big_t) * obj, 99 | memory_order order) 100 | { 101 | lfatomic_big_t value = 0; 102 | __lfbig_cmpxchg_strong(obj, &value, 0, order, order); 103 | return value; 104 | } 105 | 106 | static inline bool __lfbig_cmpxchg_weak(_Atomic(lfatomic_big_t) * obj, 107 | lfatomic_big_t * expected, lfatomic_big_t desired, 108 | memory_order succ, memory_order fail) 109 | { 110 | return __lfbig_cmpxchg_strong(obj, expected, desired, succ, fail); 111 | } 112 | 113 | static inline lfatomic_big_t __lfbig_fetch_and(_Atomic(lfatomic_big_t) * obj, 114 | lfatomic_big_t arg, memory_order order) 115 | { 116 | lfatomic_big_t new_val, old_val = __lfbig_load(obj, order); 117 | do { 118 | new_val = old_val & arg; 119 | } while (!__lfbig_cmpxchg_weak(obj, &old_val, new_val, order, order)); 120 | __LF_ASSUME(new_val == (old_val & arg)); 121 | return old_val; 122 | } 123 | 124 | #define __lfaba_init __lfbig_init 125 | #define __lfaba_load __lfbig_load 126 | #define __lfaba_load_atomic __lfbig_load_atomic 127 | #define __lfaba_cmpxchg_weak __lfbig_cmpxchg_weak 128 | #define __lfaba_cmpxchg_strong __lfbig_cmpxchg_strong 129 | #define __lfaba_fetch_and __lfbig_fetch_and 130 | 131 | #endif /* !__BITS_LF_GGC_X86_H */ 132 | 133 | /* vi: set tabstop=4: */ 134 | -------------------------------------------------------------------------------- /lf/lf.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2019 Ruslan Nikolaev. All Rights Reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. 
Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2019 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __BITS_LF_H 56 | #define __BITS_LF_H 1 57 | 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | 64 | #include "config.h" 65 | 66 | #ifdef __GNUC__ 67 | # define __LF_ASSUME(c) do { if (!(c)) __builtin_unreachable(); } while (0) 68 | #else 69 | # define __LF_ASSUME(c) 70 | #endif 71 | 72 | /* GCC does not have a sane implementation of wide atomics for x86-64 73 | in recent versions, so use inline assembly workarounds whenever possible. 74 | No aarch64 support in GCC for right now. */ 75 | #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) && \ 76 | !defined(__llvm__) && defined(__GCC_ASM_FLAG_OUTPUTS__) 77 | # include "gcc_x86.h" 78 | #else 79 | # include "c11.h" 80 | #endif 81 | 82 | /* ABA tagging with split (word-atomic) load/cmpxchg operation. 
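   The tag occupies the upper half of lfatomic_big_t and the pointer/index the
   lower half, so each half can still be read or compared with a single
   word-wide operation.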
*/ 83 | #if __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 1 || \ 84 | __LFCMPXCHG_SPLIT(LFATOMIC_BIG_WIDTH) == 1 85 | # define __LFABA_IMPL(w, type_t) \ 86 | static const size_t __lfaba_shift##w = sizeof(lfatomic_big_t) * 4; \ 87 | static const size_t __lfaptr_shift##w = 0; \ 88 | static const lfatomic_big_t __lfaba_mask##w = \ 89 | ~(lfatomic_big_t) 0U << (sizeof(lfatomic_big_t) * 4); \ 90 | static const lfatomic_big_t __lfaba_step##w = \ 91 | (lfatomic_big_t) 1U << (sizeof(lfatomic_big_t) * 4); 92 | #endif 93 | 94 | /* ABA tagging when load/cmpxchg is not split. Note that unlike previous 95 | case, __lfaptr_shift is required to be 0. */ 96 | #if __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 0 && \ 97 | __LFCMPXCHG_SPLIT(LFATOMIC_BIG_WIDTH) == 0 98 | # define __LFABA_IMPL(w, type_t) \ 99 | static const size_t __lfaba_shift##w = sizeof(type_t) * 8; \ 100 | static const size_t __lfaptr_shift##w = 0; \ 101 | static const lfatomic_big_t __lfaba_mask##w = \ 102 | ~(lfatomic_big_t) 0U << (sizeof(type_t) * 8); \ 103 | static const lfatomic_big_t __lfaba_step##w = \ 104 | (lfatomic_big_t) 1U << (sizeof(type_t) * 8); 105 | #endif 106 | 107 | /* Available on CAS2 32/64-bit architectures. */ 108 | #if LFATOMIC_BIG_WIDTH >= 2 * __LFPTR_WIDTH 109 | __LFABA_IMPL(, uintptr_t) 110 | #endif 111 | 112 | #endif /* !__BITS_LF_H */ 113 | 114 | /* vi: set tabstop=4: */ 115 | -------------------------------------------------------------------------------- /lfring_cas1.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2019 Ruslan Nikolaev. All Rights Reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 
29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2019 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __LFRING_H 56 | #define __LFRING_H 1 57 | 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | 64 | #include "lf/lf.h" 65 | 66 | #if LFATOMIC_WIDTH == 32 67 | # define LFRING_MIN (LF_CACHE_SHIFT - 2) 68 | #elif LFATOMIC_WIDTH == 64 69 | # define LFRING_MIN (LF_CACHE_SHIFT - 3) 70 | #elif LFATOMIC_WIDTH == 128 71 | # define LFRING_MIN (LF_CACHE_SHIFT - 4) 72 | #else 73 | # error "Unsupported LFATOMIC_WIDTH." 
74 | #endif 75 | 76 | #define LFRING_ALIGN (_Alignof(struct __lfring)) 77 | #define LFRING_SIZE(o) \ 78 | (offsetof(struct __lfring, array) + (sizeof(lfatomic_t) << ((o) + 1))) 79 | 80 | #define LFRING_EMPTY (~(size_t) 0U) 81 | 82 | #define __lfring_cmp(x, op, y) ((lfsatomic_t) ((x) - (y)) op 0) 83 | 84 | #if LFRING_MIN != 0 85 | static inline size_t __lfring_raw_map(lfatomic_t idx, size_t order, size_t n) 86 | { 87 | return (size_t) (((idx & (n - 1)) >> (order - LFRING_MIN)) | 88 | ((idx << LFRING_MIN) & (n - 1))); 89 | } 90 | #else 91 | static inline size_t __lfring_raw_map(lfatomic_t idx, size_t order, size_t n) 92 | { 93 | return (size_t) (idx & (n - 1)); 94 | } 95 | #endif 96 | 97 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n) 98 | { 99 | return __lfring_raw_map(idx, order + 1, n); 100 | } 101 | 102 | #define __lfring_threshold3(half, n) ((long) ((half) + (n) - 1)) 103 | 104 | static inline size_t lfring_pow2(size_t order) 105 | { 106 | return (size_t) 1U << order; 107 | } 108 | 109 | struct __lfring { 110 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) head; 111 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfsatomic_t) threshold; 112 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) tail; 113 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) array[1]; 114 | }; 115 | 116 | struct lfring; 117 | 118 | static inline void lfring_init_empty(struct lfring * ring, size_t order) 119 | { 120 | struct __lfring * q = (struct __lfring *) ring; 121 | size_t i, n = lfring_pow2(order + 1); 122 | 123 | for (i = 0; i != n; i++) 124 | atomic_init(&q->array[i], (lfsatomic_t) -1); 125 | 126 | atomic_init(&q->head, 0); 127 | atomic_init(&q->threshold, -1); 128 | atomic_init(&q->tail, 0); 129 | } 130 | 131 | static inline void lfring_init_full(struct lfring * ring, size_t order) 132 | { 133 | struct __lfring * q = (struct __lfring *) ring; 134 | size_t i, half = lfring_pow2(order), n = half * 2; 135 | 136 | for (i = 0; i != half; i++) 137 | atomic_init(&q->array[__lfring_map(i, order, n)], n + __lfring_raw_map(i, order, half)); 138 | for (; i != n; i++) 139 | atomic_init(&q->array[__lfring_map(i, order, n)], (lfsatomic_t) -1); 140 | 141 | atomic_init(&q->head, 0); 142 | atomic_init(&q->threshold, __lfring_threshold3(half, n)); 143 | atomic_init(&q->tail, half); 144 | } 145 | 146 | static inline void lfring_init_fill(struct lfring * ring, 147 | size_t s, size_t e, size_t order) 148 | { 149 | struct __lfring * q = (struct __lfring *) ring; 150 | size_t i, half = lfring_pow2(order), n = half * 2; 151 | 152 | for (i = 0; i != s; i++) 153 | atomic_init(&q->array[__lfring_map(i, order, n)], 2 * n - 1); 154 | for (; i != e; i++) 155 | atomic_init(&q->array[__lfring_map(i, order, n)], n + i); 156 | for (; i != n; i++) 157 | atomic_init(&q->array[__lfring_map(i, order, n)], (lfsatomic_t) -1); 158 | 159 | atomic_init(&q->head, s); 160 | atomic_init(&q->threshold, __lfring_threshold3(half, n)); 161 | atomic_init(&q->tail, e); 162 | } 163 | 164 | static inline bool lfring_enqueue(struct lfring * ring, size_t order, 165 | size_t eidx, bool nonempty) 166 | { 167 | struct __lfring * q = (struct __lfring *) ring; 168 | size_t tidx, half = lfring_pow2(order), n = half * 2; 169 | lfatomic_t tail, entry, ecycle, tcycle; 170 | 171 | eidx ^= (n - 1); 172 | 173 | while (1) { 174 | tail = atomic_fetch_add_explicit(&q->tail, 1, memory_order_acq_rel); 175 | tcycle = (tail << 1) | (2 * n - 1); 176 | tidx = __lfring_map(tail, order, n); 177 | entry = 
atomic_load_explicit(&q->array[tidx], memory_order_acquire); 178 | retry: 179 | ecycle = entry | (2 * n - 1); 180 | if (__lfring_cmp(ecycle, <, tcycle) && ((entry == ecycle) || 181 | ((entry == (ecycle ^ n)) && 182 | __lfring_cmp(atomic_load_explicit(&q->head, 183 | memory_order_acquire), <=, tail)))) { 184 | 185 | if (!atomic_compare_exchange_weak_explicit(&q->array[tidx], 186 | &entry, tcycle ^ eidx, 187 | memory_order_acq_rel, memory_order_acquire)) 188 | goto retry; 189 | 190 | if (!nonempty && (atomic_load(&q->threshold) != __lfring_threshold3(half, n))) 191 | atomic_store(&q->threshold, __lfring_threshold3(half, n)); 192 | return true; 193 | } 194 | } 195 | } 196 | 197 | static inline void __lfring_catchup(struct lfring * ring, 198 | lfatomic_t tail, lfatomic_t head) 199 | { 200 | struct __lfring * q = (struct __lfring *) ring; 201 | 202 | while (!atomic_compare_exchange_weak_explicit(&q->tail, &tail, head, 203 | memory_order_acq_rel, memory_order_acquire)) { 204 | head = atomic_load_explicit(&q->head, memory_order_acquire); 205 | tail = atomic_load_explicit(&q->tail, memory_order_acquire); 206 | if (__lfring_cmp(tail, >=, head)) 207 | break; 208 | } 209 | } 210 | 211 | static inline size_t lfring_dequeue(struct lfring * ring, size_t order, 212 | bool nonempty) 213 | { 214 | struct __lfring * q = (struct __lfring *) ring; 215 | size_t hidx, n = lfring_pow2(order + 1); 216 | lfatomic_t head, entry, entry_new, ecycle, hcycle, tail; 217 | size_t attempt; 218 | 219 | if (!nonempty && atomic_load(&q->threshold) < 0) { 220 | return LFRING_EMPTY; 221 | } 222 | 223 | while (1) { 224 | head = atomic_fetch_add_explicit(&q->head, 1, memory_order_acq_rel); 225 | hcycle = (head << 1) | (2 * n - 1); 226 | hidx = __lfring_map(head, order, n); 227 | attempt = 0; 228 | again: 229 | entry = atomic_load_explicit(&q->array[hidx], memory_order_acquire); 230 | 231 | do { 232 | ecycle = entry | (2 * n - 1); 233 | if (ecycle == hcycle) { 234 | atomic_fetch_or_explicit(&q->array[hidx], (n - 1), 235 | memory_order_acq_rel); 236 | return (size_t) (entry & (n - 1)); 237 | } 238 | 239 | if ((entry | n) != ecycle) { 240 | entry_new = entry & ~(lfatomic_t) n; 241 | if (entry == entry_new) 242 | break; 243 | } else { 244 | if (++attempt <= 10000) 245 | goto again; 246 | entry_new = hcycle ^ ((~entry) & n); 247 | } 248 | } while (__lfring_cmp(ecycle, <, hcycle) && 249 | !atomic_compare_exchange_weak_explicit(&q->array[hidx], 250 | &entry, entry_new, 251 | memory_order_acq_rel, memory_order_acquire)); 252 | 253 | if (!nonempty) { 254 | tail = atomic_load_explicit(&q->tail, memory_order_acquire); 255 | if (__lfring_cmp(tail, <=, head + 1)) { 256 | __lfring_catchup(ring, tail, head + 1); 257 | atomic_fetch_sub_explicit(&q->threshold, 1, 258 | memory_order_acq_rel); 259 | return LFRING_EMPTY; 260 | } 261 | 262 | if (atomic_fetch_sub_explicit(&q->threshold, 1, 263 | memory_order_acq_rel) <= 0) 264 | return LFRING_EMPTY; 265 | } 266 | } 267 | } 268 | 269 | #endif /* !__LFRING_H */ 270 | 271 | /* vi: set tabstop=4: */ 272 | -------------------------------------------------------------------------------- /lfring_cas2.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2019 Ruslan Nikolaev. All Rights Reserved. 
8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2019 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __LFRING_H 56 | #define __LFRING_H 1 57 | 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | 64 | #include "lf/lf.h" 65 | 66 | #if LFATOMIC_WIDTH == 32 67 | # define LFRING_PTR_MIN (LF_CACHE_SHIFT - 3) 68 | #elif LFATOMIC_WIDTH == 64 69 | # define LFRING_PTR_MIN (LF_CACHE_SHIFT - 4) 70 | #elif LFATOMIC_WIDTH == 128 71 | # define LFRING_PTR_MIN (LF_CACHE_SHIFT - 5) 72 | #else 73 | # error "Unsupported LFATOMIC_WIDTH." 
74 | #endif 75 | 76 | #define LFRING_PTR_ALIGN (_Alignof(struct __lfring_ptr)) 77 | #define LFRING_PTR_SIZE(o) \ 78 | (offsetof(struct __lfring_ptr, array) + (sizeof(lfatomic_big_t) << ((o) + 1))) 79 | 80 | #define __lfring_cmp(x, op, y) ((lfsatomic_t) ((x) - (y)) op 0) 81 | 82 | #if LFRING_PTR_MIN != 0 83 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n) 84 | { 85 | return (size_t) (((idx & (n - 1)) >> (order + 1 - LFRING_PTR_MIN)) | 86 | ((idx << LFRING_PTR_MIN) & (n - 1))); 87 | } 88 | #else 89 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n) 90 | { 91 | return (size_t) (idx & (n - 1)); 92 | } 93 | #endif 94 | 95 | #define __lfring_threshold4(n) ((long) (2 * (n) - 1)) 96 | 97 | static inline size_t lfring_pow2(size_t order) 98 | { 99 | return (size_t) 1U << order; 100 | } 101 | 102 | struct __lfring_ptr { 103 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) head; 104 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfsatomic_t) threshold; 105 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) tail; 106 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) array[1]; 107 | }; 108 | 109 | struct lfring_ptr; 110 | 111 | #if defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) 112 | # define __lfring_array_pointer(x) ((_Atomic(lfatomic_t) *) (x)) 113 | # define __lfring_array_entry(x) ((_Atomic(lfatomic_t) *) (x) + 1) 114 | #else 115 | # define __lfring_array_pointer(x) ((_Atomic(lfatomic_t) *) (x) + 1) 116 | # define __lfring_array_entry(x) ((_Atomic(lfatomic_t) *) (x)) 117 | #endif 118 | 119 | #define __lfring_entry(x) ((lfatomic_t) (((x) & __lfaba_mask) >> \ 120 | __lfaba_shift)) 121 | #define __lfring_pointer(x) ((lfatomic_t) (((x) & ~__lfaba_mask) >> \ 122 | __lfaptr_shift)) 123 | #define __lfring_pair(e,p) (((lfatomic_big_t) (e) << __lfaba_shift) | \ 124 | ((lfatomic_big_t) (p) << __lfaptr_shift)) 125 | 126 | static inline void lfring_ptr_init_empty(struct lfring_ptr * ring, size_t order) 127 | { 128 | struct __lfring_ptr * q = (struct __lfring_ptr *) ring; 129 | size_t i, n = lfring_pow2(order + 1); 130 | 131 | for (i = 0; i != n; i++) 132 | __lfaba_init(&q->array[i], 0); 133 | 134 | atomic_init(&q->head, n); 135 | atomic_init(&q->threshold, -1); 136 | atomic_init(&q->tail, n); 137 | } 138 | 139 | static inline void lfring_ptr_init_lhead(lfatomic_t *lhead, size_t order) 140 | { 141 | *lhead = lfring_pow2(order + 1); 142 | } 143 | 144 | static inline bool lfring_ptr_enqueue(struct lfring_ptr * ring, size_t order, 145 | void * ptr, bool nonempty, bool nonfull, lfatomic_t *lhead) 146 | { 147 | struct __lfring_ptr * q = (struct __lfring_ptr *) ring; 148 | size_t tidx, n = lfring_pow2(order + 1); 149 | lfatomic_t tail, entry, ecycle, tcycle; 150 | lfatomic_big_t pair; 151 | 152 | if (!nonfull) { 153 | tail = atomic_load(&q->tail); 154 | if (tail >= *lhead + n) { 155 | *lhead = atomic_load(&q->head); 156 | if (tail >= *lhead + n) 157 | return false; 158 | } 159 | } 160 | 161 | while (1) { 162 | tail = atomic_fetch_add_explicit(&q->tail, 1, memory_order_acq_rel); 163 | tcycle = tail & ~(lfatomic_t) (n - 1); 164 | tidx = __lfring_map(tail, order, n); 165 | pair = __lfaba_load(&q->array[tidx], memory_order_acquire); 166 | retry: 167 | entry = __lfring_entry(pair); 168 | ecycle = entry & ~(lfatomic_t) (n - 1); 169 | if (__lfring_cmp(ecycle, <, tcycle) && (entry == ecycle || 170 | (entry == (ecycle | 0x2) && atomic_load_explicit(&q->head, 171 | memory_order_acquire) <= tail))) { 172 | 173 | if 
(!__lfaba_cmpxchg_weak(&q->array[tidx], 174 | &pair, __lfring_pair(tcycle | 0x1, (lfatomic_t) ptr), 175 | memory_order_acq_rel, memory_order_acquire)) 176 | goto retry; 177 | 178 | if (!nonempty && atomic_load(&q->threshold) != __lfring_threshold4(n)) 179 | atomic_store(&q->threshold, __lfring_threshold4(n)); 180 | 181 | return true; 182 | } 183 | 184 | if (!nonfull) { 185 | if (tail + 1 >= *lhead + n) { 186 | *lhead = atomic_load(&q->head); 187 | if (tail + 1 >= *lhead + n) 188 | return false; 189 | } 190 | } 191 | } 192 | } 193 | 194 | static inline void __lfring_ptr_catchup(struct lfring_ptr * ring, 195 | lfatomic_t tail, lfatomic_t head) 196 | { 197 | struct __lfring_ptr * q = (struct __lfring_ptr *) ring; 198 | 199 | while (!atomic_compare_exchange_weak_explicit(&q->tail, &tail, head, 200 | memory_order_acq_rel, memory_order_acquire)) { 201 | head = atomic_load_explicit(&q->head, memory_order_acquire); 202 | tail = atomic_load_explicit(&q->tail, memory_order_acquire); 203 | if (__lfring_cmp(tail, >=, head)) 204 | break; 205 | } 206 | } 207 | 208 | static inline bool lfring_ptr_dequeue(struct lfring_ptr * ring, size_t order, 209 | void ** ptr, bool nonempty) 210 | { 211 | struct __lfring_ptr * q = (struct __lfring_ptr *) ring; 212 | size_t hidx, n = lfring_pow2(order + 1); 213 | lfatomic_t head, entry, entry_new, ecycle, hcycle, tail; 214 | lfatomic_big_t pair; 215 | 216 | if (!nonempty && atomic_load(&q->threshold) < 0) { 217 | return false; 218 | } 219 | 220 | while (1) { 221 | head = atomic_fetch_add_explicit(&q->head, 1, memory_order_acq_rel); 222 | hcycle = head & ~(lfatomic_t) (n - 1); 223 | hidx = __lfring_map(head, order, n); 224 | entry = atomic_load_explicit(__lfring_array_entry(&q->array[hidx]), 225 | memory_order_acquire); 226 | do { 227 | ecycle = entry & ~(lfatomic_t) (n - 1); 228 | if (ecycle == hcycle) { 229 | pair = __lfaba_fetch_and(&q->array[hidx], 230 | __lfring_pair(~(lfatomic_t) 0x1, 0), memory_order_acq_rel); 231 | *ptr = (void *) __lfring_pointer(pair); 232 | return true; 233 | } 234 | if ((entry & (~(lfatomic_t) 0x2)) != ecycle) { 235 | entry_new = entry | 0x2; 236 | if (entry == entry_new) 237 | break; 238 | } else { 239 | entry_new = hcycle | (entry & 0x2); 240 | } 241 | } while (__lfring_cmp(ecycle, <, hcycle) && 242 | !atomic_compare_exchange_weak_explicit( 243 | __lfring_array_entry(&q->array[hidx]), 244 | &entry, entry_new, 245 | memory_order_acq_rel, memory_order_acquire)); 246 | 247 | if (!nonempty) { 248 | tail = atomic_load_explicit(&q->tail, memory_order_acquire); 249 | if (__lfring_cmp(tail, <=, head + 1)) { 250 | __lfring_ptr_catchup(ring, tail, head + 1); 251 | atomic_fetch_sub_explicit(&q->threshold, 1, 252 | memory_order_acq_rel); 253 | return false; 254 | } 255 | if (atomic_fetch_sub_explicit(&q->threshold, 1, 256 | memory_order_acq_rel) <= 0) 257 | return false; 258 | } 259 | } 260 | } 261 | 262 | #endif /* !__LFRING_H */ 263 | 264 | /* vi: set tabstop=4: */ 265 | -------------------------------------------------------------------------------- /lfring_naive.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2019 Ruslan Nikolaev. All Rights Reserved. 
8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2019 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __LFRING_H 56 | #define __LFRING_H 1 57 | 58 | #include 59 | #include 60 | #include 61 | #include 62 | 63 | #include "lf/lf.h" 64 | 65 | #if LFATOMIC_WIDTH == 32 66 | # define LFRING_MIN (LF_CACHE_SHIFT - 2) 67 | #elif LFATOMIC_WIDTH == 64 68 | # define LFRING_MIN (LF_CACHE_SHIFT - 3) 69 | #elif LFATOMIC_WIDTH == 128 70 | # define LFRING_MIN (LF_CACHE_SHIFT - 4) 71 | #else 72 | # error "Unsupported LFATOMIC_WIDTH." 
73 | #endif 74 | 75 | #define LFRING_ALIGN (_Alignof(struct __lfring)) 76 | #define LFRING_SIZE(o) \ 77 | (offsetof(struct __lfring, array) + (sizeof(lfatomic_t) << (o))) 78 | 79 | #define LFRING_EMPTY (~(size_t) 0U) 80 | 81 | #if LFRING_MIN != 0 82 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n) 83 | { 84 | return (size_t) (((idx & (n - 1)) >> (order - LFRING_MIN)) | 85 | ((idx << LFRING_MIN) & (n - 1))); 86 | } 87 | #else 88 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n) 89 | { 90 | return (size_t) (idx & (n - 1)); 91 | } 92 | #endif 93 | 94 | static inline size_t lfring_pow2(size_t order) 95 | { 96 | return (size_t) 1U << order; 97 | } 98 | 99 | struct __lfring { 100 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) head; 101 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) tail; 102 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) array[1]; 103 | }; 104 | 105 | struct lfring; 106 | 107 | static inline void lfring_init_empty(struct lfring * ring, size_t order) 108 | { 109 | struct __lfring * q = (struct __lfring *) ring; 110 | size_t i, n = lfring_pow2(order); 111 | 112 | for (i = 0; i != n; i++) { 113 | q->array[i] = 0; 114 | } 115 | 116 | q->head = n; 117 | q->tail = n; 118 | } 119 | 120 | static inline void lfring_init_full(struct lfring * ring, size_t order) 121 | { 122 | struct __lfring * q = (struct __lfring *) ring; 123 | size_t i, n = lfring_pow2(order); 124 | 125 | for (i = 0; i != n; i++) { 126 | q->array[i] = i; 127 | } 128 | 129 | q->head = 0; 130 | q->tail = n; 131 | } 132 | 133 | static inline void lfring_init_fill(struct lfring * ring, 134 | size_t s, size_t e, size_t order) 135 | { 136 | struct __lfring * q = (struct __lfring *) ring; 137 | size_t i, n = lfring_pow2(order); 138 | 139 | for (i = 0; i != s; i++) { 140 | q->array[__lfring_map(i, order, n)] = 0; 141 | } 142 | for (; i != e; i++) { 143 | q->array[__lfring_map(i, order, n)] = i; 144 | } 145 | for (; i != n; i++) { 146 | q->array[__lfring_map(i, order, n)] = (lfatomic_t) -n; 147 | } 148 | q->head = s; 149 | q->tail = e; 150 | } 151 | 152 | static inline size_t lfring_enqueue(struct lfring * ring, 153 | size_t order, size_t eidx, bool nonempty) 154 | { 155 | struct __lfring * q = (struct __lfring *) ring; 156 | size_t n = lfring_pow2(order); 157 | lfatomic_t tail; 158 | 159 | start_over: 160 | tail = atomic_load_explicit(&q->tail, memory_order_acquire); 161 | 162 | while (1) { 163 | lfatomic_t tcycle = tail & ~(n - 1); 164 | size_t tidx = __lfring_map(tail, order, n); 165 | lfatomic_t entry = atomic_load_explicit(&q->array[tidx], memory_order_acquire); 166 | 167 | while (1) { 168 | lfatomic_t ecycle = entry & ~(n - 1); 169 | 170 | if (ecycle == tcycle) { 171 | /* Advance the tail pointer. */ 172 | if (atomic_compare_exchange_strong_explicit(&q->tail, &tail, 173 | tail + 1, memory_order_acq_rel, memory_order_acquire)) { 174 | tail++; 175 | } 176 | break; 177 | } 178 | 179 | /* Wrapping around. */ 180 | if ((lfatomic_t) (ecycle + n) != tcycle) { 181 | goto start_over; 182 | } 183 | 184 | /* An empty entry. */ 185 | if (atomic_compare_exchange_strong_explicit(&q->array[tidx], 186 | &entry, __LFMERGE(tcycle, eidx), 187 | memory_order_acq_rel, memory_order_acquire)) { 188 | /* Try to advance the tail pointer. 
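   If the CAS fails, the tail has typically been advanced by another thread
   already (or the weak CAS failed spuriously); either way the result can be
   ignored here.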
*/ 189 | atomic_compare_exchange_weak_explicit(&q->tail, &tail, tail + 1, 190 | memory_order_acq_rel, memory_order_acquire); 191 | return entry & (n - 1); 192 | } 193 | } 194 | } 195 | } 196 | 197 | static inline size_t lfring_dequeue(struct lfring * ring, size_t order, 198 | bool nonempty) 199 | { 200 | struct __lfring * q = (struct __lfring *) ring; 201 | size_t n = lfring_pow2(order); 202 | lfatomic_t head, entry; 203 | 204 | start_over: 205 | head = atomic_load_explicit(&q->head, memory_order_acquire); 206 | 207 | do { 208 | lfatomic_t ecycle, hcycle = head & ~(n - 1); 209 | size_t hidx = __lfring_map(head, order, n); 210 | entry = atomic_load_explicit(&q->array[hidx], memory_order_acquire); 211 | ecycle = entry & ~(n - 1); 212 | if (ecycle != hcycle) { 213 | /* Wrapping around. */ 214 | if (!nonempty && (lfatomic_t) (ecycle + n) == hcycle) { 215 | return LFRING_EMPTY; 216 | } 217 | goto start_over; 218 | } 219 | } while (!atomic_compare_exchange_weak_explicit(&q->head, &head, head + 1, 220 | memory_order_acq_rel, memory_order_acquire)); 221 | 222 | return (size_t) (entry & (n - 1)); 223 | } 224 | 225 | #endif /* !__LFRING_H */ 226 | 227 | /* vi: set tabstop=4: */ 228 | -------------------------------------------------------------------------------- /wfring_cas2.h: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------------- 2 | * 3 | * Dual 2-BSD/MIT license. Either or both licenses can be used. 4 | * 5 | * ---------------------------------------------------------------------------- 6 | * 7 | * Copyright (c) 2021 Ruslan Nikolaev. All Rights Reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 1. Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 
29 | * 30 | * ---------------------------------------------------------------------------- 31 | * 32 | * Copyright (c) 2021 Ruslan Nikolaev 33 | * 34 | * Permission is hereby granted, free of charge, to any person obtaining a 35 | * copy of this software and associated documentation files (the "Software"), 36 | * to deal in the Software without restriction, including without limitation 37 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | * and/or sell copies of the Software, and to permit persons to whom the 39 | * Software is furnished to do so, subject to the following conditions: 40 | * 41 | * The above copyright notice and this permission notice shall be included in 42 | * all copies or substantial portions of the Software. 43 | * 44 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 50 | * IN THE SOFTWARE. 51 | * 52 | * ---------------------------------------------------------------------------- 53 | */ 54 | 55 | #ifndef __WFRING_H 56 | #define __WFRING_H 1 57 | 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | 64 | #include "lf/lf.h" 65 | 66 | #if LFATOMIC_WIDTH == 32 67 | # define WFRING_MIN (LF_CACHE_SHIFT - 3) 68 | #elif LFATOMIC_WIDTH == 64 69 | # define WFRING_MIN (LF_CACHE_SHIFT - 4) 70 | #elif LFATOMIC_WIDTH == 128 71 | # define WFRING_MIN (LF_CACHE_SHIFT - 5) 72 | #else 73 | # error "Unsupported LFATOMIC_WIDTH." 
74 | #endif 75 | 76 | #define WFRING_PATIENCE_ENQ 16 77 | #define WFRING_PATIENCE_DEQ 64 78 | #define WFRING_DELAY 16 79 | 80 | #define WFRING_ALIGN (_Alignof(struct __wfring)) 81 | #define WFRING_SIZE(o) \ 82 | (offsetof(struct __wfring, array) + (sizeof(lfatomic_big_t) << ((o) + 1))) 83 | 84 | #define WFRING_EMPTY (~(size_t) 0U) 85 | 86 | #define __wfring_cmp(x, op, y) ((lfsatomic_t) ((x) - (y)) op 0) 87 | 88 | #if WFRING_MIN != 0 89 | static inline size_t __wfring_raw_map(lfatomic_t idx, size_t order, size_t n) 90 | { 91 | return (size_t) (((idx & (n - 1)) >> (order - WFRING_MIN)) | 92 | ((idx << WFRING_MIN) & (n - 1))); 93 | } 94 | #else 95 | static inline size_t __wfring_raw_map(lfatomic_t idx, size_t order, size_t n) 96 | { 97 | return (size_t) (idx & (n - 1)); 98 | } 99 | #endif 100 | 101 | static inline size_t __wfring_map(lfatomic_t idx, size_t order, size_t n) 102 | { 103 | return __wfring_raw_map(idx, order + 1, n); 104 | } 105 | 106 | #define __wfring_threshold3(half, n) ((long) ((half) + (n) - 1)) 107 | 108 | #if defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) 109 | # define __wfring_pair_addon(x) ((_Atomic(lfatomic_t) *) (x)) 110 | # define __wfring_pair_entry(x) ((_Atomic(lfatomic_t) *) (x) + 1) 111 | #else 112 | # define __wfring_pair_addon(x) ((_Atomic(lfatomic_t) *) (x) + 1) 113 | # define __wfring_pair_entry(x) ((_Atomic(lfatomic_t) *) (x)) 114 | #endif 115 | 116 | #define __wfring_entry(x) ((lfatomic_t) (((x) & __lfaba_mask) >> \ 117 | __lfaba_shift)) 118 | #define __wfring_addon(x) ((lfatomic_t) (((x) & ~__lfaba_mask) >> \ 119 | __lfaptr_shift)) 120 | #define __wfring_pair(e,c) (((lfatomic_big_t) (e) << __lfaba_shift) | \ 121 | ((lfatomic_big_t) ((lfatomic_t) (c)) << __lfaptr_shift)) 122 | 123 | #define __WFRING_FIN 0x1 124 | #define __WFRING_INC 0x2 125 | 126 | #define __WFRING_EIDX_TERM 0 127 | #define __WFRING_EIDX_DEQ 1 128 | 129 | static inline size_t wfring_pow2(size_t order) 130 | { 131 | return (size_t) 1U << order; 132 | } 133 | 134 | struct __wfring { 135 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) head; 136 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfsatomic_t) threshold; 137 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) tail; 138 | __attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) array[1]; 139 | }; 140 | 141 | struct wfring; 142 | 143 | struct wfring_phase2 { 144 | _Atomic(lfatomic_t) seq1; 145 | _Atomic(lfatomic_t) *local; 146 | lfatomic_t cnt; 147 | _Atomic(lfatomic_t) seq2; 148 | }; 149 | 150 | struct wfring_state { 151 | __attribute__ ((aligned(LF_CACHE_BYTES))) 152 | _Atomic(struct wfring_state *) next; 153 | size_t nextCheck; 154 | struct wfring_state * currThread; 155 | 156 | struct wfring_phase2 phase2; 157 | 158 | _Atomic(lfatomic_t) seq1; 159 | _Atomic(lfatomic_t) tail; 160 | lfatomic_t initTail; 161 | _Atomic(lfatomic_t) head; 162 | lfatomic_t initHead; 163 | _Atomic(size_t) eidx; 164 | _Atomic(lfatomic_t) seq2; 165 | }; 166 | 167 | static inline void wfring_init_empty(struct wfring * ring, size_t order) 168 | { 169 | struct __wfring * q = (struct __wfring *) ring; 170 | size_t i, n = wfring_pow2(order + 1); 171 | 172 | for (i = 0; i != n; i++) 173 | __lfaba_init(&q->array[i], __wfring_pair((lfsatomic_t) -1, (lfsatomic_t) (-n - 1))); 174 | 175 | __lfaba_init(&q->head, 0); 176 | atomic_init(&q->threshold, -1); 177 | __lfaba_init(&q->tail, 0); 178 | } 179 | 180 | static inline void wfring_init_full(struct wfring * ring, size_t order) 181 | { 182 | struct __wfring * q = 
(struct __wfring *) ring; 183 | size_t i, half = wfring_pow2(order), n = half * 2; 184 | 185 | for (i = 0; i != half; i++) 186 | __lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair(2 * n + n + __wfring_raw_map(i, order, half), (lfsatomic_t) -1)); 187 | for (; i != n; i++) 188 | __lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair((lfsatomic_t) -1, (lfsatomic_t) (-n - 1))); 189 | 190 | __lfaba_init(&q->head, 0); 191 | atomic_init(&q->threshold, __wfring_threshold3(half, n)); 192 | __lfaba_init(&q->tail, __wfring_pair(half << 2, 0)); 193 | } 194 | 195 | static inline void wfring_init_fill(struct wfring * ring, 196 | size_t s, size_t e, size_t order) 197 | { 198 | struct __wfring * q = (struct __wfring *) ring; 199 | size_t i, half = wfring_pow2(order), n = half * 2; 200 | 201 | for (i = 0; i != s; i++) 202 | __lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair(4 * n - 1, (lfsatomic_t) -1)); 203 | for (; i != e; i++) 204 | __lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair(2 * n + n + i, (lfsatomic_t) -1)); 205 | for (; i != n; i++) 206 | __lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair((lfsatomic_t) -1, (lfsatomic_t) (-n - 1))); 207 | 208 | __lfaba_init(&q->head, __wfring_pair(s << 2, 0)); 209 | atomic_init(&q->threshold, __wfring_threshold3(half, n)); 210 | __lfaba_init(&q->tail, __wfring_pair(e << 2, 0)); 211 | } 212 | 213 | static inline void wfring_init_state(struct wfring * ring, 214 | struct wfring_state * state) 215 | { 216 | atomic_init(&state->next, NULL); 217 | 218 | state->currThread = state; 219 | state->nextCheck = WFRING_DELAY; 220 | 221 | atomic_init(&state->seq1, 1); 222 | atomic_init(&state->eidx, __WFRING_EIDX_TERM); 223 | atomic_init(&state->seq2, 0); 224 | 225 | atomic_init(&state->phase2.seq1, 1); 226 | atomic_init(&state->phase2.seq2, 0); 227 | } 228 | 229 | static inline lfatomic_t __wfring_load_global_help_phase2( 230 | _Atomic(lfatomic_big_t) * global, lfatomic_big_t gp) 231 | { 232 | struct wfring_phase2 * phase2; 233 | _Atomic(lfatomic_t) * local; 234 | lfatomic_t seq, cnt; 235 | 236 | do { 237 | phase2 = (struct wfring_phase2 *) __wfring_addon(gp); 238 | if (phase2 == NULL) break; 239 | seq = atomic_load(&phase2->seq2); 240 | local = phase2->local; 241 | cnt = phase2->cnt; 242 | if (atomic_load(&phase2->seq1) == seq) { 243 | lfatomic_t cnt_inc = cnt + __WFRING_INC; 244 | atomic_compare_exchange_strong(local, &cnt_inc, cnt); 245 | } 246 | } while (!__lfaba_cmpxchg_strong(global, &gp, 247 | __wfring_pair(__wfring_entry(gp), 0), 248 | memory_order_acq_rel, memory_order_acquire)); 249 | return __wfring_entry(gp); 250 | } 251 | 252 | static inline bool __wfring_slow_inc(_Atomic(lfatomic_big_t) * global, 253 | _Atomic(lfatomic_t) * local, lfatomic_t * prev, 254 | _Atomic(lfsatomic_t) * threshold, struct wfring_phase2 * phase2) 255 | { 256 | lfatomic_t seq, cnt, cnt_inc; 257 | lfatomic_big_t gp = __lfaba_load_atomic(global, memory_order_acquire); 258 | 259 | do { 260 | if (atomic_load(local) & __WFRING_FIN) 261 | return false; 262 | cnt = __wfring_load_global_help_phase2(global, gp); 263 | if (!atomic_compare_exchange_strong(local, prev, cnt + __WFRING_INC)) { 264 | if (*prev & __WFRING_FIN) return false; 265 | if (!(*prev & __WFRING_INC)) return true; 266 | cnt = *prev - __WFRING_INC; 267 | } else { 268 | *prev = cnt + __WFRING_INC; 269 | } 270 | seq = atomic_load(&phase2->seq1) + 1; 271 | atomic_store(&phase2->seq1, seq); 272 | phase2->local = local; 273 | phase2->cnt = cnt; 274 | 
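/* Publish the (local, cnt) snapshot for helpers, seqlock-style: seq1 was
 * bumped before local/cnt were written, and seq2 is only set to match
 * afterwards, so __wfring_load_global_help_phase2 applies the snapshot
 * only when it observes seq1 == seq2. */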
atomic_store(&phase2->seq2, seq); 275 | gp = __wfring_pair(cnt, 0); 276 | } while (!__lfaba_cmpxchg_strong(global, &gp, 277 | __wfring_pair(cnt + 1, phase2), 278 | memory_order_acq_rel, memory_order_acquire)); 279 | 280 | if (threshold != NULL) 281 | atomic_fetch_sub_explicit(threshold, 1, memory_order_acq_rel); 282 | 283 | cnt_inc = cnt + __WFRING_INC; 284 | atomic_compare_exchange_strong(local, &cnt_inc, cnt); 285 | gp = __wfring_pair(cnt + 1, phase2); 286 | __lfaba_cmpxchg_strong(global, &gp, __wfring_pair(cnt + 1, 0), 287 | memory_order_acq_rel, memory_order_acquire); 288 | *prev = cnt; 289 | 290 | return true; 291 | } 292 | 293 | static inline void __wfring_do_enqueue_slow(struct __wfring * q, size_t order, 294 | size_t eidx, lfatomic_t seq, lfatomic_t tail, bool nonempty, 295 | struct wfring_state * state) 296 | { 297 | size_t tidx, half = wfring_pow2(order), n = half * 2; 298 | lfatomic_t entry, note, ecycle, tcycle; 299 | lfatomic_big_t pair; 300 | 301 | while (__wfring_slow_inc(&q->tail, &state->tail, &tail, NULL, 302 | &state->phase2)) { 303 | if (atomic_load(&state->seq1) != seq) 304 | break; 305 | tcycle = tail | (4 * n - 1); 306 | tidx = __wfring_map(tail >> 2, order, n); 307 | pair = __lfaba_load(&q->array[tidx], memory_order_acquire); 308 | retry: 309 | entry = __wfring_entry(pair); 310 | note = __wfring_addon(pair); 311 | ecycle = entry | (4 * n - 1); 312 | if (__wfring_cmp(ecycle, <, tcycle) && __wfring_cmp(note, <, tcycle)) { 313 | if ((((entry | 0x1) == ecycle) || 314 | (((entry | 0x1) == (ecycle ^ n)) && 315 | __wfring_cmp( 316 | atomic_load_explicit(__wfring_pair_entry(&q->head), 317 | memory_order_acquire), <=, tail)))) { 318 | 319 | if (!__lfaba_cmpxchg_weak(&q->array[tidx], 320 | &pair, __wfring_pair(tcycle ^ eidx ^ n, note), 321 | memory_order_acq_rel, memory_order_acquire)) 322 | goto retry; 323 | 324 | entry = tcycle ^ eidx; 325 | 326 | if (atomic_compare_exchange_strong_explicit(&state->tail, &tail, 327 | tail + 0x1, 328 | memory_order_acq_rel, memory_order_acquire)) { 329 | 330 | /* Finalize the entry. */ 331 | atomic_compare_exchange_strong_explicit( 332 | __wfring_pair_entry(&q->array[tidx]), 333 | &entry, entry ^ n, memory_order_acq_rel, 334 | memory_order_acquire); 335 | } 336 | 337 | if (!nonempty && (atomic_load(&q->threshold) != __wfring_threshold3(half, n))) 338 | atomic_store(&q->threshold, __wfring_threshold3(half, n)); 339 | return; 340 | } else if ((entry | (2 * n + n)) == tcycle) { 341 | /* Already produced. */ 342 | return; 343 | } else { 344 | /* Skip this entry. */ 345 | if (!__lfaba_cmpxchg_weak(&q->array[tidx], 346 | &pair, __wfring_pair(entry, tcycle), 347 | memory_order_acq_rel, memory_order_acquire)) 348 | goto retry; 349 | } 350 | } 351 | } 352 | } 353 | 354 | __attribute__((noinline)) static void __wfring_enqueue_slow( 355 | struct __wfring * q, size_t order, size_t eidx, 356 | lfatomic_t tail, bool nonempty, struct wfring_state * state) 357 | { 358 | lfatomic_t seq = atomic_load(&state->seq1); 359 | 360 | /* Initiate a helping request. */ 361 | atomic_store(&state->tail, tail); 362 | state->initTail = tail; 363 | atomic_store(&state->eidx, eidx); 364 | atomic_store(&state->seq2, seq); 365 | 366 | __wfring_do_enqueue_slow(q, order, eidx, seq, tail, nonempty, state); 367 | 368 | /* Terminate the helping request. 
*/ 369 | atomic_store(&state->seq1, seq + 1); 370 | atomic_store(&state->eidx, __WFRING_EIDX_TERM); 371 | } 372 | 373 | __attribute__((noinline)) static void __wfring_enqueue_help_thread( 374 | struct __wfring * q, size_t order, bool nonempty, 375 | struct wfring_state * state) 376 | { 377 | lfatomic_t seq = atomic_load(&state->seq2); 378 | size_t eidx = atomic_load(&state->eidx); 379 | lfatomic_t tail = state->initTail; 380 | if (eidx <= __WFRING_EIDX_DEQ || atomic_load(&state->seq1) != seq) 381 | return; 382 | 383 | __wfring_do_enqueue_slow(q, order, eidx, seq, tail, nonempty, state); 384 | } 385 | 386 | static inline void __wfring_catchup(struct __wfring * q, 387 | lfatomic_t tail, lfatomic_t head) 388 | { 389 | while (!atomic_compare_exchange_weak_explicit(__wfring_pair_entry(&q->tail), 390 | &tail, head, memory_order_acq_rel, memory_order_acquire)) { 391 | head = atomic_load(__wfring_pair_entry(&q->head)); 392 | tail = atomic_load(__wfring_pair_entry(&q->tail)); 393 | if (__wfring_cmp(tail, >=, head)) 394 | break; 395 | } 396 | } 397 | 398 | static inline void __wfring_lookup(struct wfring_state * state, 399 | lfatomic_t tail, size_t n) 400 | { 401 | struct wfring_state * curr = atomic_load(&state->next); 402 | while (curr != state) { 403 | if ((atomic_load(&curr->tail) & ~((lfatomic_t) 0x3)) == tail) { 404 | atomic_compare_exchange_strong(&curr->tail, &tail, tail ^ 0x1); 405 | return; 406 | } 407 | curr = atomic_load(&curr->next); 408 | } 409 | return; 410 | } 411 | 412 | static inline void __wfring_do_dequeue_slow(struct __wfring * q, size_t order, 413 | lfatomic_t seq, lfatomic_t head, bool nonempty, struct wfring_state * state) 414 | { 415 | size_t hidx, n = wfring_pow2(order + 1); 416 | lfatomic_t entry, note, entry_new, ecycle, hcycle, tail; 417 | lfatomic_big_t pair; 418 | _Atomic(lfsatomic_t) * threshold = nonempty ? NULL : &q->threshold; 419 | 420 | while (__wfring_slow_inc(&q->head, &state->head, &head, threshold, 421 | &state->phase2)) { 422 | hcycle = head | (4 * n - 1); 423 | hidx = __wfring_map(head >> 2, order, n); 424 | pair = __lfaba_load(&q->array[hidx], memory_order_acquire); 425 | retry: 426 | do { 427 | entry = __wfring_entry(pair); 428 | note = __wfring_addon(pair); 429 | ecycle = entry | (4 * n - 1); 430 | if (ecycle == hcycle && (entry & (n - 1)) != (n - 2)) { 431 | lfatomic_t _h = head; 432 | atomic_compare_exchange_strong(&state->head, &_h, head ^ 0x1); 433 | return; 434 | } 435 | 436 | if ((entry | (2 * n) | 0x1) != ecycle) { 437 | if (__wfring_cmp(ecycle, <, hcycle) && 438 | __wfring_cmp(note, <, hcycle)) { 439 | /* Do not enqueue in this entry. 
*/ 440 | if (!__lfaba_cmpxchg_weak(&q->array[hidx], &pair, 441 | __wfring_pair(entry, hcycle), 442 | memory_order_acq_rel, memory_order_acquire)) 443 | goto retry; 444 | } 445 | entry_new = entry & ~(lfatomic_t) (2 * n); 446 | if (entry == entry_new) 447 | break; 448 | } else { 449 | entry_new = hcycle ^ ((~entry) & (2 * n)) ^ 0x1; 450 | } 451 | } while (__wfring_cmp(ecycle, <, hcycle) && 452 | !__lfaba_cmpxchg_weak(&q->array[hidx], &pair, 453 | __wfring_pair(entry_new, note), 454 | memory_order_acq_rel, memory_order_acquire)); 455 | 456 | if (!nonempty) { 457 | tail = atomic_load_explicit(__wfring_pair_entry(&q->tail), 458 | memory_order_acquire); 459 | if (__wfring_cmp(tail, <=, head + 4)) { 460 | __wfring_catchup(q, tail, head + 4); 461 | } 462 | if (atomic_load(&q->threshold) < 0) { 463 | lfatomic_t _h = head; 464 | atomic_compare_exchange_strong(&state->head, &_h, 465 | head + __WFRING_FIN); 466 | } 467 | } 468 | } 469 | } 470 | 471 | __attribute__((noinline)) static size_t __wfring_dequeue_slow(struct __wfring * q, size_t order, 472 | lfatomic_t head, bool nonempty, struct wfring_state * state) 473 | { 474 | size_t hidx, n = wfring_pow2(order + 1); 475 | lfatomic_t entry, hcycle; 476 | lfatomic_t seq = atomic_load(&state->seq1); 477 | 478 | /* Initiate a helping request. */ 479 | atomic_store(&state->head, head); 480 | state->initHead = head; 481 | atomic_store(&state->eidx, __WFRING_EIDX_DEQ); 482 | atomic_store(&state->seq2, seq); 483 | 484 | __wfring_do_dequeue_slow(q, order, seq, head, nonempty, state); 485 | 486 | /* Terminate the helping request. */ 487 | atomic_store(&state->seq1, seq + 1); 488 | atomic_store(&state->eidx, __WFRING_EIDX_TERM); 489 | 490 | /* Consume an element. */ 491 | head = atomic_load(&state->head); 492 | hcycle = head | (4 * n - 1); 493 | hidx = __wfring_map(head >> 2, order, n); 494 | entry = atomic_load_explicit(__wfring_pair_entry(&q->array[hidx]), memory_order_acquire); 495 | if (nonempty || ((entry | (2 * n + n)) == hcycle)) { 496 | if (!(entry & n)) 497 | __wfring_lookup(state, head, n); 498 | atomic_fetch_or_explicit(__wfring_pair_entry(&q->array[hidx]), 499 | (2 * n - 1), memory_order_acq_rel); 500 | return (size_t) (entry & (n - 1)); 501 | } 502 | 503 | return WFRING_EMPTY; 504 | } 505 | 506 | __attribute__((noinline)) static void __wfring_dequeue_help_thread(struct __wfring * q, 507 | size_t order, bool nonempty, struct wfring_state * state) 508 | { 509 | lfatomic_t seq = atomic_load(&state->seq2); 510 | lfatomic_t eidx = atomic_load(&state->eidx); 511 | lfatomic_t head = atomic_load(&state->initHead); 512 | if (eidx != __WFRING_EIDX_DEQ || atomic_load(&state->seq1) != seq) 513 | return; 514 | 515 | __wfring_do_dequeue_slow(q, order, seq, head, nonempty, state); 516 | } 517 | 518 | __attribute__((noinline)) static void __wfring_help(struct __wfring * q, size_t order, 519 | bool nonempty, struct wfring_state * state) 520 | { 521 | struct wfring_state * curr = state->currThread; 522 | if (curr != state) { 523 | size_t eidx = atomic_load(&curr->eidx); 524 | if (eidx != __WFRING_EIDX_TERM) { 525 | if (eidx != __WFRING_EIDX_DEQ) 526 | __wfring_enqueue_help_thread(q, order, nonempty, curr); 527 | else 528 | __wfring_dequeue_help_thread(q, order, nonempty, curr); 529 | } 530 | curr = atomic_load(&curr->next); 531 | } 532 | state->currThread = atomic_load(&curr->next); 533 | state->nextCheck = WFRING_DELAY; 534 | } 535 | 536 | static inline void wfring_enqueue(struct wfring * ring, 537 | size_t order, size_t eidx, bool nonempty, struct wfring_state * 
state) 538 | { 539 | struct __wfring * q = (struct __wfring *) ring; 540 | size_t tidx, half = wfring_pow2(order), n = half * 2; 541 | lfatomic_t tail, entry, ecycle, tcycle; 542 | size_t patience = WFRING_PATIENCE_ENQ; 543 | 544 | eidx ^= (n - 1); 545 | if (--state->nextCheck == 0) 546 | __wfring_help(q, order, nonempty, state); 547 | 548 | do { 549 | tail = atomic_fetch_add_explicit(__wfring_pair_entry(&q->tail), 4, memory_order_acq_rel); 550 | tcycle = tail | (4 * n - 1); 551 | tidx = __wfring_map(tail >> 2, order, n); 552 | entry = atomic_load_explicit(__wfring_pair_entry(&q->array[tidx]), memory_order_acquire); 553 | retry: 554 | ecycle = entry | (4 * n - 1); 555 | if (__wfring_cmp(ecycle, <, tcycle) && (((entry | 0x1) == ecycle) || 556 | (((entry | 0x1) == (ecycle ^ (2 * n))) && 557 | __wfring_cmp(atomic_load_explicit( 558 | __wfring_pair_entry(&q->head), 559 | memory_order_acquire), <=, tail)))) { 560 | 561 | if (!atomic_compare_exchange_weak_explicit( 562 | __wfring_pair_entry(&q->array[tidx]), 563 | &entry, tcycle ^ eidx, 564 | memory_order_acq_rel, memory_order_acquire)) 565 | goto retry; 566 | 567 | if (!nonempty && (atomic_load(&q->threshold) != __wfring_threshold3(half, n))) 568 | atomic_store(&q->threshold, __wfring_threshold3(half, n)); 569 | return; 570 | } 571 | } while (--patience != 0); 572 | 573 | __wfring_enqueue_slow(q, order, eidx, tail, nonempty, state); 574 | } 575 | 576 | static inline size_t wfring_dequeue(struct wfring * ring, size_t order, 577 | bool nonempty, struct wfring_state * state) 578 | { 579 | struct __wfring * q = (struct __wfring *) ring; 580 | size_t hidx, n = wfring_pow2(order + 1); 581 | lfatomic_t head, entry, entry_new, ecycle, hcycle, tail; 582 | // size_t attempt; 583 | size_t patience = WFRING_PATIENCE_DEQ; 584 | 585 | if (!nonempty && atomic_load(&q->threshold) < 0) { 586 | return WFRING_EMPTY; 587 | } 588 | 589 | if (--state->nextCheck == 0) 590 | __wfring_help(q, order, nonempty, state); 591 | 592 | do { 593 | head = atomic_fetch_add_explicit(__wfring_pair_entry(&q->head), 4, memory_order_acq_rel); 594 | hcycle = head | (4 * n - 1); 595 | hidx = __wfring_map(head >> 2, order, n); 596 | //attempt = 0; 597 | //again: 598 | entry = atomic_load_explicit(__wfring_pair_entry(&q->array[hidx]), memory_order_acquire); 599 | 600 | do { 601 | ecycle = entry | (4 * n - 1); 602 | if (ecycle == hcycle) { 603 | /* Need to help finalizing the entry. 
*/ 604 | if (!(entry & n)) 605 | __wfring_lookup(state, head, n); 606 | atomic_fetch_or_explicit(__wfring_pair_entry(&q->array[hidx]), 607 | (2 * n - 1), memory_order_acq_rel); 608 | return (size_t) (entry & (n - 1)); 609 | } 610 | 611 | if ((entry | (2 * n) | 0x1) != ecycle) { 612 | entry_new = entry & ~(lfatomic_t) (2 * n); 613 | if (entry == entry_new) 614 | break; 615 | } else { 616 | // if (++attempt <= 10000) 617 | // goto again; 618 | entry_new = hcycle ^ ((~entry) & (2 * n)) ^ 0x1; 619 | } 620 | } while (__wfring_cmp(ecycle, <, hcycle) && 621 | !atomic_compare_exchange_weak_explicit( 622 | __wfring_pair_entry(&q->array[hidx]), &entry, entry_new, 623 | memory_order_acq_rel, memory_order_acquire)); 624 | 625 | if (!nonempty) { 626 | tail = atomic_load_explicit(__wfring_pair_entry(&q->tail), 627 | memory_order_acquire); 628 | if (__wfring_cmp(tail, <=, head + 4)) { 629 | __wfring_catchup(q, tail, head + 4); 630 | atomic_fetch_sub_explicit(&q->threshold, 1, 631 | memory_order_acq_rel); 632 | return WFRING_EMPTY; 633 | } 634 | 635 | if (atomic_fetch_sub_explicit(&q->threshold, 1, 636 | memory_order_acq_rel) <= 0) 637 | return WFRING_EMPTY; 638 | } 639 | } while (--patience != 0); 640 | 641 | return __wfring_dequeue_slow(q, order, head, nonempty, state); 642 | } 643 | 644 | #endif /* !__WFRING_H */ 645 | 646 | /* vi: set tabstop=4: */ 647 | --------------------------------------------------------------------------------
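Below is a minimal, single-threaded smoke test of the wCQ ring defined in wfring_cas2.h above. It is an illustrative sketch, not part of the repository: the file name, the `ORDER` constant, the allocation via `posix_memalign`, and the self-linking of the per-thread state into a one-element circular helping list are assumptions about how a driver would use this header; only the macro and function names (`WFRING_ALIGN`, `WFRING_SIZE`, `WFRING_EMPTY`, `wfring_init_empty`, `wfring_init_state`, `wfring_enqueue`, `wfring_dequeue`) come from the header itself. It also assumes the header's own includes and `lf/lf.h` are reachable and that the target supports the double-width atomics (`lfatomic_big_t`) the code relies on, e.g. x86-64.

```c
/* wcq_smoke.c -- hypothetical standalone smoke test (not part of the repo). */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#include "wfring_cas2.h"

#define ORDER 4 /* the ring tracks up to 2^ORDER data indices */

int main(void)
{
	void *mem;
	struct wfring *ring;
	struct wfring_state state;

	/* The ring must be WFRING_SIZE(ORDER) bytes, aligned to WFRING_ALIGN. */
	if (posix_memalign(&mem, WFRING_ALIGN, WFRING_SIZE(ORDER)) != 0)
		return 1;
	ring = (struct wfring *) mem;
	wfring_init_empty(ring, ORDER);

	/* One per-thread state; link it to itself so the helping scan
	 * (__wfring_help / __wfring_lookup) walks a valid circular list.
	 * A multi-threaded driver would link all per-thread states together. */
	wfring_init_state(ring, &state);
	atomic_store(&state.next, &state);

	/* Enqueue a few indices: wCQ stores small integer indices (eidx),
	 * normally indices into an external data array, not raw pointers. */
	for (size_t i = 0; i < 3; i++)
		wfring_enqueue(ring, ORDER, i, false, &state);

	/* Drain the ring; an empty ring reports WFRING_EMPTY. */
	for (;;) {
		size_t idx = wfring_dequeue(ring, ORDER, false, &state);
		if (idx == WFRING_EMPTY)
			break;
		printf("dequeued %zu\n", idx);
	}

	free(mem);
	return 0;
}
```

The `nonempty` flag is passed as `false` here so that `wfring_dequeue` uses the threshold to detect emptiness and return `WFRING_EMPTY`; a caller that can guarantee the ring is never empty (as in the SCQD-style indirection scheme) would pass `true` and skip that bookkeeping. How the double-width compare-and-swap is enabled is toolchain-dependent.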