├── README.md
├── benchmark.patch
├── benchmark
    ├── LICENSE
    ├── Makefile
    ├── README.md
    ├── align.h
    ├── benchmark
    ├── benchmark.h
    ├── bits.h
    ├── cas.c
    ├── ccqueue.c
    ├── ccqueue.h
    ├── ccsynch.h
    ├── cpumap.h
    ├── delay.c
    ├── delay.h
    ├── driver
    ├── faa.c
    ├── halfhalf.c
    ├── harness.c
    ├── hzdptr.c
    ├── hzdptr.h
    ├── lcrq.c
    ├── lcrq.h
    ├── msqueue.c
    ├── msqueue.h
    ├── ncq.c
    ├── ncq.h
    ├── pairwise.c
    ├── primitives.h
    ├── queue.h
    ├── scq.c
    ├── scq.h
    ├── scq2.c
    ├── scq2.h
    ├── scqd.c
    ├── scqd.h
    ├── wcq.c
    ├── wcq.h
    ├── wfqueue.c
    ├── wfqueue.h
    ├── xxhash.c
    └── xxhash.h
├── lf
    ├── c11.h
    ├── config.h
    ├── gcc_x86.h
    └── lf.h
├── lfring_cas1.h
├── lfring_cas2.h
├── lfring_naive.h
└── wfring_cas2.h


/README.md:
--------------------------------------------------------------------------------
 1 | # A Scalable, Portable, and Memory-Efficient Lock-Free FIFO Queue
 2 | 
 3 | * Publications
 4 | 
 5 | 	wCQ: A Fast Wait-Free Queue with Bounded Memory Usage.
 6 | 	In Proceedings of the 34th ACM Symposium on Parallelism in Algorithms
 7 | 	and Architectures (SPAA'22). Philadelphia, PA, USA.
 8 | 
 9 | 	[Paper](https://dl.acm.org/doi/pdf/10.1145/3490148.3538572)
10 | 
11 | 	A Scalable, Portable, and Memory-Efficient Lock-Free FIFO Queue.
12 | 	In Proceedings of the 33rd International Symposium on DIStributed
13 | 	Computing (DISC'19). Budapest, Hungary.
14 | 
15 | 	[Paper](http://drops.dagstuhl.de/opus/volltexte/2019/11335/pdf/LIPIcs-DISC-2019-28.pdf)
16 | 
17 | * Source code license
18 | 
19 | 	Copyright (c) 2019, 2021 Ruslan Nikolaev. All Rights Reserved.
20 | 
21 | 	The SCQ/SCQD/SCQ2/NCQ/wCQ code is dual-licensed under 2-Clause BSD and MIT.
22 | 
23 | * Description
24 | 
25 | 	The benchmark code is in the "benchmark" directory,
26 | 	which is forked from the original WFQUEUE's benchmark
27 | 	available [here](https://github.com/chaoran/fast-wait-free-queue).
28 | 	For usage, see its original README file in "benchmark".
29 | 
30 | 	Additional queues were implemented. See the description below for
31 | 	details.
32 | 
33 | 	Both GCC and LLVM should be supported. Older versions may lack
34 | 	support for lfring\_cas2.h and wfring\_cas2.h and/or have
35 | 	suboptimal performance. We have tested the code with
36 | 	GCC 8.3.0+ and LLVM 7.0.1+.
37 | 
38 | * CAS
39 | 
40 | 	An implementation of the FAA test using CAS emulation (based on
41 | 	the original FAA test).
42 | 
43 | * NCQ
44 | 
45 | 	A naive implementation of the ring buffer. The implementation is in
46 | 	lfring\_naive.h.
47 | 
48 | * SCQ
49 | 
50 | 	This is a "bare-bones" SCQ which simply implements *enqueue* and *dequeue*
51 | 	(all platforms). The implementation is in lfring\_cas1.h.
52 | 
53 | * SCQD
54 | 
55 | 	This is a version which stores pointers as data entries through indirection
56 | 	(all platforms). The implementation is in lfring\_cas1.h.
57 | 
58 | * SCQ2
59 | 
60 | 	This is a version which stores pointers through double-width CAS
61 | 	(certain platforms such as x86-64). The implementation is in lfring\_cas2.h.
62 | 
63 | * wCQ
64 | 
65 | 	An implementation of a wait-free version of SCQ, which uses double-width
66 | 	CAS (certain platforms such as x86-64). The implementation is in
67 | 	wfring\_cas2.h.
68 | 


--------------------------------------------------------------------------------
/benchmark.patch:
--------------------------------------------------------------------------------
  1 | diff -urN benchmark.orig/align.h benchmark/align.h
  2 | --- benchmark.orig/align.h	2020-12-22 17:17:27.685566626 -0500
  3 | +++ benchmark/align.h	2020-12-22 17:14:37.945420646 -0500
  4 | @@ -16,7 +16,7 @@
  5 |  
  6 |    int ret = posix_memalign(&ptr, align, size);
  7 |    if (ret != 0) {
  8 | -    fprintf(stderr, strerror(ret));
  9 | +    fprintf(stderr, "error: %s\n", strerror(ret));
 10 |      abort();
 11 |    }
 12 |  
 13 | diff -urN benchmark.orig/cas.c benchmark/cas.c
 14 | --- benchmark.orig/cas.c	1969-12-31 19:00:00.000000000 -0500
 15 | +++ benchmark/cas.c	2019-10-22 10:31:31.124409717 -0400
 16 | @@ -0,0 +1,26 @@
 17 | +#include "queue.h"
 18 | +#include "primitives.h"
 19 | +
 20 | +void queue_init(queue_t * q, int nprocs) {}
 21 | +void queue_register(queue_t * q, handle_t * hd, int id)
 22 | +{
 23 | +  *hd = id + 1;
 24 | +}
 25 | +
 26 | +void enqueue(queue_t * q, handle_t * th, void * val)
 27 | +{
 28 | +  long p = q->P;
 29 | +  while (!CAS(&q->P, &p, p + 1))
 30 | +    ;
 31 | +}
 32 | +
 33 | +void * dequeue(queue_t * q, handle_t * th)
 34 | +{
 35 | +  long c = q->C;
 36 | +  while (!CAS(&q->C, &c, c + 1))
 37 | +    ;
 38 | +  return (void *) (long) *th;
 39 | +}
 40 | +
 41 | +void queue_free(queue_t * q, handle_t * h) {}
 42 | +
 43 | diff -urN benchmark.orig/halfhalf.c benchmark/halfhalf.c
 44 | --- benchmark.orig/halfhalf.c	2019-10-22 10:44:25.757993930 -0400
 45 | +++ benchmark/halfhalf.c	2019-10-22 10:31:31.124409717 -0400
 46 | @@ -25,7 +25,8 @@
 47 |  
 48 |    printf("  Number of operations: %ld\n", nops);
 49 |  
 50 | -  q = align_malloc(PAGE_SIZE, sizeof(queue_t));
 51 | +  // FIXME: sizeof(queue_t) varies, allocate 4MB
 52 | +  q = align_malloc(PAGE_SIZE, 4194304);
 53 |    queue_init(q, nprocs);
 54 |  
 55 |    hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
 56 | @@ -60,7 +61,7 @@
 57 |      else
 58 |        dequeue(q, th);
 59 |  
 60 | -    delay_exec(&state);
 61 | +//    delay_exec(&state);
 62 |    }
 63 |  
 64 |    return val;
 65 | diff -urN benchmark.orig/lcrq.c benchmark/lcrq.c
 66 | --- benchmark.orig/lcrq.c	2019-10-22 10:44:25.761993916 -0400
 67 | +++ benchmark/lcrq.c	2019-10-22 10:31:31.124409717 -0400
 68 | @@ -112,6 +112,7 @@
 69 |  alloc:
 70 |        nrq = handle->next;
 71 |  
 72 | +      void *org_nrq = nrq;
 73 |        if (nrq == NULL) {
 74 |          nrq = align_malloc(PAGE_SIZE, sizeof(RingQueue));
 75 |          init_ring(nrq);
 76 | @@ -127,6 +128,9 @@
 77 |          handle->next = NULL;
 78 |          return;
 79 |        }
 80 | +
 81 | +      // Did not succeed, free the buffer
 82 | +      if (org_nrq == NULL) free(nrq);
 83 |        continue;
 84 |      }
 85 |  
 86 | diff -urN benchmark.orig/Makefile benchmark/Makefile
 87 | --- benchmark.orig/Makefile	2019-10-22 10:44:25.757993930 -0400
 88 | +++ benchmark/Makefile	2019-10-22 10:31:31.124409717 -0400
 89 | @@ -1,8 +1,9 @@
 90 | -TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay
 91 | +TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay cas scq scq2 scqd ncq
 92 |  
 93 | +# if using clang, please also specify -mcx16 for x86-64
 94 |  CC = gcc
 95 |  CFLAGS = -g -Wall -O3 -pthread -D_GNU_SOURCE
 96 | -LDLIBS = -lpthread -lm
 97 | +LDLIBS = -ljemalloc -lpthread -lm
 98 |  
 99 |  ifeq (${VERIFY}, 1)
100 |  	CFLAGS += -DVERIFY
101 | @@ -39,7 +40,12 @@
102 |  ccqueue: CFLAGS += -DCCQUEUE
103 |  msqueue: CFLAGS += -DMSQUEUE
104 |  faa: CFLAGS += -DFAAQ
105 | +cas: CFLAGS += -DFAAQ
106 |  delay: CFLAGS += -DDELAY
107 | +scq: CFLAGS += -DSCQ
108 | +scqd: CFLAGS += -DSCQD
109 | +scq2: CFLAGS += -DSCQ2
110 | +ncq: CFLAGS += -DNCQ
111 |  
112 |  $(TESTS): harness.o
113 |  ifeq (${HALFHALF}, 1)
114 | diff -urN benchmark.orig/ncq.c benchmark/ncq.c
115 | --- benchmark.orig/ncq.c	1969-12-31 19:00:00.000000000 -0500
116 | +++ benchmark/ncq.c	2019-10-22 10:31:31.124409717 -0400
117 | @@ -0,0 +1,29 @@
118 | +#include <stdint.h>
119 | +#include <stdlib.h>
120 | +#include <string.h>
121 | +#include "ncq.h"
122 | +
123 | +void queue_init(queue_t * q, int nprocs)
124 | +{
125 | +  lfring_init_empty((struct lfring *) q->ring, NCQ_ORDER);
126 | +}
127 | +
128 | +
129 | +void queue_register(queue_t * q, handle_t * th, int id)
130 | +{
131 | +}
132 | +
133 | +void enqueue(queue_t * q, handle_t * th, void * val)
134 | +{
135 | +  size_t eidx = (size_t) val;
136 | +  lfring_enqueue((struct lfring *) q->ring, NCQ_ORDER, eidx, false);
137 | +}
138 | +
139 | +void * dequeue(queue_t * q, handle_t * th)
140 | +{
141 | +  return (void *) lfring_dequeue((struct lfring *) q->ring, NCQ_ORDER, false);
142 | +}
143 | +
144 | +void queue_free(queue_t * q, handle_t * h)
145 | +{
146 | +}
147 | diff -urN benchmark.orig/ncq.h benchmark/ncq.h
148 | --- benchmark.orig/ncq.h	1969-12-31 19:00:00.000000000 -0500
149 | +++ benchmark/ncq.h	2019-10-22 10:31:31.124409717 -0400
150 | @@ -0,0 +1,23 @@
151 | +#ifndef NCQ_H
152 | +#define NCQ_H
153 | +
154 | +#ifdef NCQ
155 | +
156 | +#include <stddef.h>
157 | +#include "../lfring_naive.h"
158 | +#include "align.h"
159 | +
160 | +#define NCQ_ORDER 16
161 | +#define EMPTY (void *) LFRING_EMPTY
162 | +
163 | +typedef struct _queue_t {
164 | +  char ring[LFRING_SIZE(NCQ_ORDER)];
165 | +} queue_t DOUBLE_CACHE_ALIGNED;
166 | +
167 | +typedef struct _handle_t {
168 | +  int pad;
169 | +} handle_t DOUBLE_CACHE_ALIGNED;
170 | +
171 | +#endif
172 | +
173 | +#endif /* end of include guard: NCQ_H */
174 | diff -urN benchmark.orig/pairwise.c benchmark/pairwise.c
175 | --- benchmark.orig/pairwise.c	2019-10-22 10:44:25.761993916 -0400
176 | +++ benchmark/pairwise.c	2019-10-22 10:31:31.124409717 -0400
177 | @@ -26,7 +26,8 @@
178 |  
179 |    printf("  Number of operations: %ld\n", nops);
180 |  
181 | -  q = align_malloc(PAGE_SIZE, sizeof(queue_t));
182 | +  // FIXME: sizeof(queue_t) varies, allocate 4MB
183 | +  q = align_malloc(PAGE_SIZE, 4194304);
184 |    queue_init(q, nprocs);
185 |  
186 |    hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
187 | @@ -47,10 +48,10 @@
188 |    int i;
189 |    for (i = 0; i < nops / nprocs; ++i) {
190 |      enqueue(q, th, val);
191 | -    delay_exec(&state);
192 | +//    delay_exec(&state);
193 |  
194 |      val = dequeue(q, th);
195 | -    delay_exec(&state);
196 | +//    delay_exec(&state);
197 |    }
198 |  
199 |    return val;
200 | diff -urN benchmark.orig/queue.h benchmark/queue.h
201 | --- benchmark.orig/queue.h	2019-10-22 10:44:25.761993916 -0400
202 | +++ benchmark/queue.h	2019-10-22 10:31:31.124409717 -0400
203 | @@ -28,6 +28,18 @@
204 |  typedef int queue_t;
205 |  typedef int handle_t;
206 |  
207 | +#elif NCQ
208 | +#include "ncq.h"
209 | +
210 | +#elif SCQ
211 | +#include "scq.h"
212 | +
213 | +#elif SCQ
214 | +#include "scqd.h"
215 | +
216 | +#elif SCQ2
217 | +#include "scq2.h"
218 | +
219 |  #else
220 |  #error "Please specify a queue implementation."
221 |  
222 | diff -urN benchmark.orig/README.md benchmark/README.md
223 | --- benchmark.orig/README.md	2019-10-22 10:44:25.757993930 -0400
224 | +++ benchmark/README.md	2019-10-22 10:43:39.170156671 -0400
225 | @@ -1,4 +1,9 @@
226 | -# Fast Wait Free Queue
227 | +# Benchmark
228 | +
229 | +The benchmark is forked from the "Fast Wait Free Queue" paper. The
230 | +original code is
231 | +available [here](https://github.com/chaoran/fast-wait-free-queue).
232 | +See the original README file below.
233 |  
234 |  This is a benchmark framework for evaluating the performance of concurrent queues. Currently, it contains four concurrent queue implementations. They are:
235 |  
236 | diff -urN benchmark.orig/scq2.c benchmark/scq2.c
237 | --- benchmark.orig/scq2.c	1969-12-31 19:00:00.000000000 -0500
238 | +++ benchmark/scq2.c	2019-10-22 10:31:31.124409717 -0400
239 | @@ -0,0 +1,34 @@
240 | +#include <stdint.h>
241 | +#include <stdlib.h>
242 | +#include <string.h>
243 | +#include "scq2.h"
244 | +
245 | +void queue_init(queue_t * q, int nprocs)
246 | +{
247 | +  lfring_ptr_init_empty((struct lfring_ptr *) q->ring, SCQ2_ORDER);
248 | +}
249 | +
250 | +
251 | +void queue_register(queue_t * q, handle_t * th, int id)
252 | +{
253 | +}
254 | +
255 | +void enqueue(queue_t * q, handle_t * th, void * val)
256 | +{
257 | +  lfring_ptr_enqueue((struct lfring_ptr *) q->ring, SCQ2_ORDER, val + 1,
258 | +    false, true);
259 | +}
260 | +
261 | +void * dequeue(queue_t * q, handle_t * th)
262 | +{
263 | +  void *ptr;
264 | +  if (!lfring_ptr_dequeue((struct lfring_ptr *) q->ring, SCQ2_ORDER,
265 | +      &ptr, false))
266 | +    return EMPTY;
267 | +  ptr--;
268 | +  return ptr;
269 | +}
270 | +
271 | +void queue_free(queue_t * q, handle_t * h)
272 | +{
273 | +}
274 | diff -urN benchmark.orig/scq2.h benchmark/scq2.h
275 | --- benchmark.orig/scq2.h	1969-12-31 19:00:00.000000000 -0500
276 | +++ benchmark/scq2.h	2019-10-22 10:31:31.124409717 -0400
277 | @@ -0,0 +1,23 @@
278 | +#ifndef SCQ2_H
279 | +#define SCQ2_H
280 | +
281 | +#ifdef SCQ2
282 | +
283 | +#include <stddef.h>
284 | +#include "../lfring_cas2.h"
285 | +#include "align.h"
286 | +
287 | +#define SCQ2_ORDER 15
288 | +#define EMPTY (void *) -1
289 | +
290 | +typedef struct _queue_t {
291 | +  char ring[LFRING_PTR_SIZE(SCQ2_ORDER)];
292 | +} queue_t DOUBLE_CACHE_ALIGNED;
293 | +
294 | +typedef struct _handle_t {
295 | +  int pad;
296 | +} handle_t DOUBLE_CACHE_ALIGNED;
297 | +
298 | +#endif
299 | +
300 | +#endif /* end of include guard: SCQ_H */
301 | diff -urN benchmark.orig/scq.c benchmark/scq.c
302 | --- benchmark.orig/scq.c	1969-12-31 19:00:00.000000000 -0500
303 | +++ benchmark/scq.c	2019-10-22 10:31:31.124409717 -0400
304 | @@ -0,0 +1,29 @@
305 | +#include <stdint.h>
306 | +#include <stdlib.h>
307 | +#include <string.h>
308 | +#include "scq.h"
309 | +
310 | +void queue_init(queue_t * q, int nprocs)
311 | +{
312 | +  lfring_init_empty((struct lfring *) q->ring, SCQ_ORDER);
313 | +}
314 | +
315 | +
316 | +void queue_register(queue_t * q, handle_t * th, int id)
317 | +{
318 | +}
319 | +
320 | +void enqueue(queue_t * q, handle_t * th, void * val)
321 | +{
322 | +  size_t eidx = (size_t) val;
323 | +  lfring_enqueue((struct lfring *) q->ring, SCQ_ORDER, eidx, false);
324 | +}
325 | +
326 | +void * dequeue(queue_t * q, handle_t * th)
327 | +{
328 | +  return (void *) lfring_dequeue((struct lfring *) q->ring, SCQ_ORDER, false);
329 | +}
330 | +
331 | +void queue_free(queue_t * q, handle_t * h)
332 | +{
333 | +}
334 | diff -urN benchmark.orig/scqd.c benchmark/scqd.c
335 | --- benchmark.orig/scqd.c	1969-12-31 19:00:00.000000000 -0500
336 | +++ benchmark/scqd.c	2019-10-22 10:31:31.128409708 -0400
337 | @@ -0,0 +1,39 @@
338 | +#include <stdint.h>
339 | +#include <stdlib.h>
340 | +#include <string.h>
341 | +#include "scqd.h"
342 | +
343 | +void queue_init(queue_t * q, int nprocs)
344 | +{
345 | +  lfring_init_empty((struct lfring *) q->aq, SCQD_ORDER);
346 | +  lfring_init_full((struct lfring *) q->fq, SCQD_ORDER);
347 | +}
348 | +
349 | +
350 | +void queue_register(queue_t * q, handle_t * th, int id)
351 | +{
352 | +}
353 | +
354 | +void enqueue(queue_t * q, handle_t * th, void * val)
355 | +{
356 | +  size_t eidx;
357 | +  eidx = lfring_dequeue((struct lfring *) q->fq, SCQD_ORDER, true);
358 | +  if (eidx == LFRING_EMPTY) return;
359 | +  q->val[eidx] = val;
360 | +  lfring_enqueue((struct lfring *) q->aq, SCQD_ORDER, eidx, false);
361 | +}
362 | +
363 | +void * dequeue(queue_t * q, handle_t * th)
364 | +{
365 | +  size_t eidx;
366 | +  void *val;
367 | +  eidx = lfring_dequeue((struct lfring *) q->aq, SCQD_ORDER, false);
368 | +  if (eidx == LFRING_EMPTY) return EMPTY;
369 | +  val = q->val[eidx];
370 | +  lfring_enqueue((struct lfring *) q->fq, SCQD_ORDER, eidx, true);
371 | +  return val;
372 | +}
373 | +
374 | +void queue_free(queue_t * q, handle_t * h)
375 | +{
376 | +}
377 | diff -urN benchmark.orig/scqd.h benchmark/scqd.h
378 | --- benchmark.orig/scqd.h	1969-12-31 19:00:00.000000000 -0500
379 | +++ benchmark/scqd.h	2019-10-22 10:31:31.128409708 -0400
380 | @@ -0,0 +1,25 @@
381 | +#ifndef SCQD_H
382 | +#define SCQD_H
383 | +
384 | +#ifdef SCQD
385 | +
386 | +#include <stddef.h>
387 | +#include "../lfring_cas1.h"
388 | +#include "align.h"
389 | +
390 | +#define SCQD_ORDER 16
391 | +#define EMPTY (void *) LFRING_EMPTY
392 | +
393 | +typedef struct _queue_t {
394 | +  char aq[LFRING_SIZE(SCQD_ORDER)];
395 | +  char fq[LFRING_SIZE(SCQD_ORDER)];
396 | +  void  *val[(1U << SCQD_ORDER)];
397 | +} queue_t DOUBLE_CACHE_ALIGNED;
398 | +
399 | +typedef struct _handle_t {
400 | +  int pad;
401 | +} handle_t DOUBLE_CACHE_ALIGNED;
402 | +
403 | +#endif
404 | +
405 | +#endif /* end of include guard: SCQD_H */
406 | diff -urN benchmark.orig/scq.h benchmark/scq.h
407 | --- benchmark.orig/scq.h	1969-12-31 19:00:00.000000000 -0500
408 | +++ benchmark/scq.h	2019-10-22 10:31:31.124409717 -0400
409 | @@ -0,0 +1,23 @@
410 | +#ifndef SCQ_H
411 | +#define SCQ_H
412 | +
413 | +#ifdef SCQ
414 | +
415 | +#include <stddef.h>
416 | +#include "../lfring_cas1.h"
417 | +#include "align.h"
418 | +
419 | +#define SCQ_ORDER 15
420 | +#define EMPTY (void *) LFRING_EMPTY
421 | +
422 | +typedef struct _queue_t {
423 | +  char ring[LFRING_SIZE(SCQ_ORDER)];
424 | +} queue_t DOUBLE_CACHE_ALIGNED;
425 | +
426 | +typedef struct _handle_t {
427 | +  int pad;
428 | +} handle_t DOUBLE_CACHE_ALIGNED;
429 | +
430 | +#endif
431 | +
432 | +#endif /* end of include guard: SCQ_H */
433 | 


--------------------------------------------------------------------------------
/benchmark/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Chaoran Yang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/benchmark/Makefile:
--------------------------------------------------------------------------------
 1 | TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay cas scq scq2 scqd ncq wcq
 2 | 
 3 | # if using clang, please also specify -mcx16 for x86-64
 4 | CC = gcc
 5 | CFLAGS = -g -Wall -O3 -pthread -D_GNU_SOURCE
 6 | LDLIBS = -ljemalloc -lpthread -lm
 7 | 
 8 | ifeq (${VERIFY}, 1)
 9 | 	CFLAGS += -DVERIFY
10 | endif
11 | 
12 | ifeq (${SANITIZE}, 1)
13 | 	CFLAGS += -fsanitize=address -fno-omit-frame-pointer
14 | 	LDLIBS += -lasan
15 | 	LDFLAGS = -fsanitize=address
16 | endif
17 | 
18 | ifdef JEMALLOC_PATH
19 | 	LDFLAGS += -L${JEMALLOC_PATH}/lib -Wl,-rpath,${JEMALLOC_PATH}/lib
20 | 	LDLIBS += -ljemalloc
21 | endif
22 | 
23 | all: $(TESTS)
24 | 
25 | wfqueue0: CFLAGS += -DMAX_PATIENCE=0
26 | wfqueue0.o: wfqueue.c
27 | 	$(CC) $(CFLAGS) -c -o $@ $^
28 | 
29 | haswell: CFLAGS += -DGUADALUPE_COMPACT
30 | haswell: all
31 | 
32 | mic: CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
33 | mic: CFLAGS += -DGUADALUPE_MIC_COMPACT -DLOGN_OPS=6
34 | mic biou: $(filter-out lcrq,$(TESTS))
35 | 
36 | biou: CFLAGS += -DBIOU_COMPACT
37 | 
38 | wfqueue wfqueue0: CFLAGS += -DWFQUEUE
39 | lcrq: CFLAGS += -DLCRQ
40 | ccqueue: CFLAGS += -DCCQUEUE
41 | msqueue: CFLAGS += -DMSQUEUE
42 | faa: CFLAGS += -DFAAQ
43 | cas: CFLAGS += -DFAAQ
44 | delay: CFLAGS += -DDELAY
45 | scq: CFLAGS += -DSCQ
46 | scqd: CFLAGS += -DSCQD
47 | scq2: CFLAGS += -DSCQ2
48 | ncq: CFLAGS += -DNCQ
49 | wcq: CFLAGS += -DWCQ
50 | 
51 | $(TESTS): harness.o
52 | ifeq (${HALFHALF}, 1)
53 | $(TESTS): halfhalf.o
54 | else
55 | $(TESTS): pairwise.o
56 | endif
57 | 
58 | msqueue lcrq: hzdptr.o xxhash.o
59 | 
60 | clean:
61 | 	rm -f $(TESTS) *.o
62 | 


--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
  1 | # Benchmark
  2 | 
  3 | The benchmark is forked from the "Fast Wait Free Queue" paper. The
  4 | original code is
  5 | available [here](https://github.com/chaoran/fast-wait-free-queue).
  6 | See the original README file below.
  7 | 
  8 | This is a benchmark framework for evaluating the performance of concurrent queues. Currently, it contains four concurrent queue implementations. They are:
  9 | 
 10 | - A fast wait-free queue `wfqueue`,
 11 | - Morrison and Afek's `lcrq`,
 12 | - Fatourou and Kallimanis's `ccqueue`, and
 13 | - Michael and Scott's `msqueue`
 14 | 
 15 | The benchmark framework also includes a synthetic queue benchmark, `faa`, which emulates both an enqueue and a dequeue with a `fetch-and-add` primitive to test the performance of `fetch-and-add` on a system.
 16 | 
 17 | The framework currently contains one benchmark, `pairwise`, in which all threads repeatedly execute pairs of enqueue and dequeue operations. Between two operations, `pairwise` uses a delay routine that adds an arbitrary delay (between 50~150ns) to avoid artificial long run scenarios, where a cache line is held by one thread for a long time.
 18 | 
 19 | ## Requirements
 20 | 
 21 | - **GCC 4.1.0 or later (Recommend GCC 4.7.3 or later)**: current implementations uses GCC `__atomic` or `__sync` primitives for atomic memory access.
 22 | - **Linux kernel 2.5.8 or later**
 23 | - **glibc 2.3**: we use `sched_setaffinity` to bind threads to cores.
 24 | - **atomic `CAS2`**: `lcrq` requires `CAS2`, a 16 Byte wide `compare-and-swap` primitive. This is available on most recent Intel processors and IBM Power8.
 25 | - **jemalloc** (optional): `jemalloc` eliminates the bottleneck of the memory allocator. You can link with `jemalloc` by setting `JEMALLOC_PATH` environment variable to the path where your `jemalloc` is installed.
 26 |  
 27 | ## How to install
 28 | 
 29 | Download one of the released source code tarball, then execute the following commands. The filename used may be different depending on the name of the tarball you have downloaded.
 30 | ```
 31 | $ tar zxf fast-wait-free-queue-1.0.0.tar.gz
 32 | $ cd fast-wait-free-queue-1.0.0
 33 | $ make
 34 | ```
 35 | 
 36 | This should generate 6 binaries (or 5 if your system does not support `CAS2`, `lcrq` will fail to compile): `wfqueue`, `wfqueue0`, `lcrq`, `ccqueue`, `msqueue`, `faa`, and `delay`. These are the `pairwise` benchmark compiled using different queue implementations.
 37 | - `wfqueue0`: the same as `wfqueue` except that its `PATIENCE` is set to `0`.
 38 | - `delay`: a synthetic benchmark used to measure the time spent in the delay routine.
 39 | 
 40 | ## How to run
 41 | 
 42 | You can execute a binary directly, using the number of threads as an argument. Without an argument, the execution will use all available cores on the system. 
 43 | 
 44 | For example,
 45 | ```
 46 | ./wfqueue 8
 47 | ```
 48 | runs `wfqueue` with 8 threads.
 49 | 
 50 | If you would like to verify the result, compile the binary with `VERIFY=1 make`. Then execute a binary directly will print either `PASSED` or error messages.
 51 | 
 52 | You can also use the `driver` script, which invokes a binary up to 10 times and measures the **mean of running times**, the **running time of the current run**, the **standard deviation**, **margin of error** (both in time and percentage) of each run.
 53 | The script terminates when the **margin of error** is relatively small (**< 0.02**), or has invoked the binary 10 times.
 54 | 
 55 | For example, 
 56 | ```
 57 | ./driver ./wfqueue 8
 58 | ```
 59 | runs `wfqueue` with 8 threads up to 10 times and collect statistic results.
 60 | 
 61 | You can use the `benchmark` script, which invokes `driver` on all combinations of a list of binaries and a list of numbers of threads, and report the `mean running time` and `margin of error` for each combination. You can specify the list of binaries using the environment variable `TESTS`. You can specify the list of numbers of threads using the environment variable `PROCS`.
 62 | 
 63 | The generated output of `benchmark` can be used as a datafile for gnuplot. The first column of `benchmark`'s output is the number threads. Then every two columns are the `mean running time` and `margin of error` for each queue implementation. They are in the same order as they are specified in `TESTS`.
 64 | 
 65 | For example,
 66 | ```
 67 | TESTS=wfqueue:lcrq:faa:delay PROCS=1:2:4:8 ./benchmark
 68 | ```
 69 | runs each of `wfqueue`, `lcrq`, `faa`, and `delay` using 1, 2, 4, and 8 threads.
 70 | 
 71 | Then you can plot them using,
 72 | ```
 73 | set logscale x 2
 74 | plot "t" using 1:(20000/($2-$8)) t "wfqueue" w lines, \
 75 |      "t" using 1:(20000/($4-$8)) t "lcrq" w lines, \
 76 |      "t" using 1:(20000/($6-$8)) t "faa" w lines
 77 | ```
 78 | 
 79 | ## How to map threads to cores
 80 | 
 81 | By default, the framework will map a thread with id `i` to the core with id `i % p`, where *p* is the number of available cores on a system; you can check each core's id in `proc/cpuinfo`.
 82 | 
 83 | To implement a custom mapping, you can add a `cpumap` function in `cpumap.h`. The signature of `cpumap` is
 84 | ```
 85 | int cpumap(int id, int nprocs)
 86 | ```
 87 | where `id` is the id of the current thread, `nprocs` is the number of threads. `cpumap` should return the corresponding core id for the thread. `cpumap.h` contains several examples of the cpumap function. You should guard the definition of the added `cpumap` using a conditional macro, and add the macro to `CFLAGS` in the makefile.
 88 | 
 89 | ## How to add a new queue implementation
 90 | 
 91 | We use a generic pointer `void *` to represent a value that can be stored in the queue.
 92 | A queue should implements the queue interface, defined in `queue.h`.
 93 | 
 94 | - `queue_t`: the struct type of the queue,
 95 | - `handle_t`: a thread's handle to the queue, used to store thread local state,
 96 | - `void queue_init(queue_t * q, int nprocs)`: initialize a queue; this will be called only once,
 97 | - `void queue_register(queue_t * q, handle_t * th, int id)`: initialize a thread's handle; this will be called by every thread that uses the queue,
 98 | - `void enqueue(queue_t * q, handle_t * th, void * val)`: enqueues a value,
 99 | - `void * dequeue(queue_t * q, handle_t * th)`: dequeues a value,
100 | - `void queue_free(queue_t * q, handle_t * h)`: deallocate a queue and cleanup all resources associated with it,
101 | - `EMPTY`: a value that will be returned if a `dequeue` fails. This should be a macro that is defined in the header file.
102 | 
103 | ## How to add a new benchmark
104 | 
105 | A benchmark should implement the benchmark interface, defined in `benchmark.h`, and interact with a queue using the queue interface.
106 | The benchmark interface includes:
107 | 
108 | - `void init(int nprocs, int n)`: performs initialization of the benchmark; called only once at the beginning.
109 | - `void thread_init(int id, int nprocs)`: performs thread local initialization of the benchmark; called once per thread, after `init` but before `benchmark`.
110 | - `void * benchmark(int id, int nprocs)`: run the benchmark once, called by each thread to run the benchmark. Each call will be timed and report as one iteration. It can return a result, which will be passed to `verify` to verify correctness.
111 | - `int verify(int nprocs, void * results)`: should verify the result of each thread and return `0` on success and non-zero values on error.
112 | 


--------------------------------------------------------------------------------
/benchmark/align.h:
--------------------------------------------------------------------------------
 1 | #ifndef ALIGN_H
 2 | #define ALIGN_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | 
 8 | #define PAGE_SIZE 4096
 9 | #define CACHE_LINE_SIZE 64
10 | #define CACHE_ALIGNED __attribute__((aligned(CACHE_LINE_SIZE)))
11 | #define DOUBLE_CACHE_ALIGNED __attribute__((aligned(2 * CACHE_LINE_SIZE)))
12 | 
13 | static inline void * align_malloc(size_t align, size_t size)
14 | {
15 |   void * ptr;
16 | 
17 |   int ret = posix_memalign(&ptr, align, size);
18 |   if (ret != 0) {
19 |     fprintf(stderr, "error: %s\n", strerror(ret));
20 |     abort();
21 |   }
22 | 
23 |   return ptr;
24 | }
25 | 
26 | #endif /* end of include guard: ALIGN_H */
27 | 


--------------------------------------------------------------------------------
/benchmark/benchmark:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -z "$TESTS" ]; then
 4 |   TESTS=(wfqueue wfqueue0 faa lcrq ccqueue msqueue delay)
 5 | else
 6 |   IFS=':' read -r -a TESTS <<< "${TESTS}"
 7 | fi
 8 | 
 9 | if [ -z "$PROCS" ]; then
10 |   PROCS=(1 2 4 8)
11 | else
12 |   IFS=':' read -r -a PROCS <<< "${PROCS}"
13 | fi
14 | 
15 | printf '#! Host: %s\n' $( hostname )
16 | printf '#! Benchmarks: %s\n' "${TESTS[*]}"
17 | printf '#! Threads: %s\n' "${PROCS[*]}"
18 | 
19 | for j in ${PROCS[@]}; do
20 |   printf '%d' $j
21 |   for i in ${TESTS[@]}; do
22 |     echo -ne \
23 |       "$(./driver ./$i $j | tail -n 1 | awk '{printf " %.2f %.2f", $3, $5}')"
24 |   done
25 |   printf '\n'
26 | done
27 | 


--------------------------------------------------------------------------------
/benchmark/benchmark.h:
--------------------------------------------------------------------------------
 1 | #ifndef BENCHMARK_H
 2 | #define BENCHMARK_H
 3 | 
 4 | extern void init(int nprocs, int n);
 5 | extern void thread_init(int id, int nprocs);
 6 | extern void * benchmark(int id, int nprocs);
 7 | extern void thread_exit(int id, int nprocs);
 8 | extern int verify(int nprocs, void ** results);
 9 | 
10 | #endif /* end of include guard: BENCHMARK_H */
11 | 


--------------------------------------------------------------------------------
/benchmark/bits.h:
--------------------------------------------------------------------------------
 1 | #ifndef BITS_H
 2 | #define BITS_H
 3 | 
 4 | static void * bits_join(int hi, int lo)
 5 | {
 6 |   intptr_t int64 = hi;
 7 |   int64 <<= 32;
 8 |   int64  += lo;
 9 |   return (void *) int64;
10 | }
11 | 
12 | static int bits_lo(void * ptr)
13 | {
14 |   intptr_t int64 = (intptr_t) ptr;
15 |   int64 &= 0x00000000ffffffff;
16 |   return (int) int64;
17 | }
18 | 
19 | static int bits_hi(void * ptr)
20 | {
21 |   intptr_t int64 = (intptr_t) ptr;
22 |   int64 >>= 32;
23 |   return (int) int64;
24 | }
25 | 
26 | #endif /* end of include guard: BITS_H */
27 | 


--------------------------------------------------------------------------------
/benchmark/cas.c:
--------------------------------------------------------------------------------
 1 | #include "queue.h"
 2 | #include "primitives.h"
 3 | 
 4 | void queue_init(queue_t * q, int nprocs) {}
 5 | void queue_register(queue_t * q, handle_t * hd, int id)
 6 | {
 7 |   *hd = id + 1;
 8 | }
 9 | 
10 | void enqueue(queue_t * q, handle_t * th, void * val)
11 | {
12 |   long p = q->P;
13 |   while (!CAS(&q->P, &p, p + 1))
14 |     ;
15 | }
16 | 
17 | void * dequeue(queue_t * q, handle_t * th)
18 | {
19 |   long c = q->C;
20 |   while (!CAS(&q->C, &c, c + 1))
21 |     ;
22 |   return (void *) (long) *th;
23 | }
24 | 
25 | void queue_free(queue_t * q, handle_t * h) {}
26 | 
27 | 


--------------------------------------------------------------------------------
/benchmark/ccqueue.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <assert.h>
 3 | #include "delay.h"
 4 | #include "ccqueue.h"
 5 | 
 6 | static inline
 7 | void serialEnqueue(void * state, void * data)
 8 | {
 9 |   node_t * volatile * tail = (node_t **) state;
10 |   node_t * node = (node_t *) data;
11 | 
12 |   (*tail)->next = node;
13 |   *tail = node;
14 | }
15 | 
16 | static inline
17 | void serialDequeue(void * state, void * data)
18 | {
19 |   node_t * volatile * head = (node_t **) state;
20 |   node_t ** ptr = (node_t **) data;
21 | 
22 |   node_t * node = *head;
23 |   node_t * next = node->next;
24 | 
25 |   if (next) {
26 |     node->data = next->data;
27 |     *head = next;
28 |   } else {
29 |     node = (void *) -1;
30 |   }
31 | 
32 |   *ptr = node;
33 | }
34 | 
35 | void queue_init(queue_t * queue, int nprocs)
36 | {
37 |   ccsynch_init(&queue->enq);
38 |   ccsynch_init(&queue->deq);
39 | 
40 |   node_t * dummy = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
41 |   dummy->data = 0;
42 |   dummy->next = NULL;
43 | 
44 |   queue->head = dummy;
45 |   queue->tail = dummy;
46 | }
47 | 
48 | void queue_register(queue_t * queue, handle_t * handle, int id)
49 | {
50 |   ccsynch_handle_init(&handle->enq);
51 |   ccsynch_handle_init(&handle->deq);
52 | 
53 |   handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
54 | }
55 | 
56 | void enqueue(queue_t * queue, handle_t * handle, void * data)
57 | {
58 |   node_t * node = handle->next;
59 | 
60 |   if (node) handle->next = NULL;
61 |   else node = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
62 | 
63 |   node->data = data;
64 |   node->next = NULL;
65 | 
66 |   ccsynch_apply(&queue->enq, &handle->enq, &serialEnqueue, &queue->tail, node);
67 | }
68 | 
69 | void * dequeue(queue_t * queue, handle_t * handle)
70 | {
71 |   node_t * node;
72 |   ccsynch_apply(&queue->deq, &handle->deq, &serialDequeue, &queue->head, &node);
73 | 
74 |   void * data;
75 | 
76 |   if (node == (void *) -1) {
77 |     data = (void *) -1;
78 |   } else {
79 |     data = node->data;
80 |     if (handle->next) free(node);
81 |     else handle->next = node;
82 |   }
83 | 
84 |   return data;
85 | }
86 | 
87 | void queue_free(int id, int nprocs) {}
88 | 


--------------------------------------------------------------------------------
/benchmark/ccqueue.h:
--------------------------------------------------------------------------------
 1 | #ifndef CCQUEUE_H
 2 | #define CCQUEUE_H
 3 | 
 4 | #ifdef CCQUEUE
 5 | #include "ccsynch.h"
 6 | 
 7 | #define EMPTY (void *) -1
 8 | 
 9 | typedef struct _node_t {
10 |   struct _node_t * next CACHE_ALIGNED;
11 |   void * volatile data;
12 | } node_t;
13 | 
14 | typedef struct _queue_t {
15 |   ccsynch_t enq DOUBLE_CACHE_ALIGNED;
16 |   ccsynch_t deq DOUBLE_CACHE_ALIGNED;
17 |   node_t * head DOUBLE_CACHE_ALIGNED;
18 |   node_t * tail DOUBLE_CACHE_ALIGNED;
19 | } queue_t DOUBLE_CACHE_ALIGNED;
20 | 
21 | typedef struct _handle_t {
22 |   ccsynch_handle_t enq;
23 |   ccsynch_handle_t deq;
24 |   node_t * next;
25 | } handle_t DOUBLE_CACHE_ALIGNED;
26 | 
27 | #endif
28 | 
29 | #endif /* end of include guard: CCQUEUE_H */
30 | 


--------------------------------------------------------------------------------
/benchmark/ccsynch.h:
--------------------------------------------------------------------------------
 1 | #ifndef _CCSYNCH_H_
 2 | #define _CCSYNCH_H_
 3 | 
 4 | #include <stdlib.h>
 5 | #include "align.h"
 6 | #include "primitives.h"
 7 | 
 8 | typedef struct _ccsynch_node_t {
 9 |   struct _ccsynch_node_t * volatile next CACHE_ALIGNED;
10 |   void * volatile data;
11 |   int volatile status CACHE_ALIGNED;
12 | } ccsynch_node_t;
13 | 
14 | typedef struct _ccsynch_handle_t {
15 |   struct _ccsynch_node_t * next;
16 | } ccsynch_handle_t;
17 | 
18 | typedef struct _ccsynch_t {
19 |   struct _ccsynch_node_t * volatile tail DOUBLE_CACHE_ALIGNED;
20 | } ccsynch_t;
21 | 
22 | #define CCSYNCH_WAIT  0x0
23 | #define CCSYNCH_READY 0x1
24 | #define CCSYNCH_DONE  0x3
25 | 
26 | static inline
27 | void ccsynch_apply(ccsynch_t * synch, ccsynch_handle_t * handle,
28 |     void (*apply)(void *, void *), void * state, void * data)
29 | {
30 |   ccsynch_node_t * next = handle->next;
31 |   next->next = NULL;
32 |   next->status = CCSYNCH_WAIT;
33 | 
34 |   ccsynch_node_t * curr = SWAPra(&synch->tail, next);
35 |   handle->next = curr;
36 | 
37 |   int status = ACQUIRE(&curr->status);
38 | 
39 |   if (status == CCSYNCH_WAIT) {
40 |     curr->data = data;
41 |     RELEASE(&curr->next, next);
42 | 
43 |     do {
44 |       PAUSE();
45 |       status = ACQUIRE(&curr->status);
46 |     } while (status == CCSYNCH_WAIT);
47 |   }
48 | 
49 |   if (status != CCSYNCH_DONE) {
50 |     apply(state, data);
51 | 
52 |     curr = next;
53 |     next = ACQUIRE(&curr->next);
54 | 
55 |     int count = 0;
56 |     const int CCSYNCH_HELP_BOUND = 256;
57 | 
58 |     while (next && count++ < CCSYNCH_HELP_BOUND) {
59 |       apply(state, curr->data);
60 |       RELEASE(&curr->status, CCSYNCH_DONE);
61 | 
62 |       curr = next;
63 |       next = ACQUIRE(&curr->next);
64 |     }
65 | 
66 |     RELEASE(&curr->status, CCSYNCH_READY);
67 |   }
68 | }
69 | 
70 | static inline void ccsynch_init(ccsynch_t * synch)
71 | {
72 |   ccsynch_node_t * node = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t));
73 |   node->next = NULL;
74 |   node->status = CCSYNCH_READY;
75 | 
76 |   synch->tail = node;
77 | }
78 | 
79 | static inline void ccsynch_handle_init(ccsynch_handle_t * handle)
80 | {
81 |   handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t));
82 | }
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/benchmark/cpumap.h:
--------------------------------------------------------------------------------
 1 | #ifndef CPUMAP_H
 2 | #define CPUMAP_H
 3 | 
 4 | #include <sched.h>
 5 | 
 6 | #ifdef GUADALUPE_SPREAD
 7 | int cpumap(int i, int nprocs)
 8 | {
 9 |   return (i / 36) * 36 + (i % 2) * 18 + (i % 36 / 2);
10 | }
11 | 
12 | #elif GUADALUPE_OVERSUB
13 | int cpumap(int i, int nprocs) {
14 |   return (i % 18);
15 | }
16 | 
17 | #elif GUADALUPE_COMPACT
18 | int cpumap(int i, int nprocs)
19 | {
20 |   return (i % 2) * 36 + i / 2;
21 | }
22 | 
23 | #elif GUADALUPE_MIC_COMPACT
24 | int cpumap(int i, int nprocs)
25 | {
26 |   return (i + 1) % 228;
27 | }
28 | 
29 | #elif LES_SPREAD
30 | int cpumap(int i, int nprocs)
31 | {
32 |   return i % 4 * 12 + i / 4 % 12;
33 | }
34 | 
35 | #elif BIOU_COMPACT
36 | int cpumap(int i, int nprocs)
37 | {
38 |   return (i % 2) * 32 + i / 2;
39 | }
40 | 
41 | #else
42 | int cpumap(int id, int nprocs)
43 | {
44 |   return id % nprocs;
45 | }
46 | 
47 | #endif
48 | 
49 | #endif /* end of include guard: CPUMAP_H */
50 | 


--------------------------------------------------------------------------------
/benchmark/delay.c:
--------------------------------------------------------------------------------
 1 | #include "queue.h"
 2 | #include "primitives.h"
 3 | 
 4 | void queue_init(queue_t * q, int nprocs) {}
 5 | void queue_register(queue_t * q, handle_t * hd, int id)
 6 | {
 7 |   *hd = id + 1;
 8 | }
 9 | 
10 | void enqueue(queue_t * q, handle_t * th, void * val)
11 | {
12 | }
13 | 
14 | void * dequeue(queue_t * q, handle_t * th)
15 | {
16 |   return (void *) (long) *th;
17 | }
18 | 
19 | void queue_free(queue_t * q, handle_t * h) {}
20 | 
21 | 


--------------------------------------------------------------------------------
/benchmark/delay.h:
--------------------------------------------------------------------------------
 1 | #ifndef DELAY_H
 2 | #define DELAY_H
 3 | 
 4 | //#include <time.h>
 5 | #include <stdlib.h>
 6 | 
 7 | typedef struct drand48_data delay_t;
 8 | 
 9 | static inline void delay_init(delay_t * state, int id)
10 | {
11 |   srand48_r(id, state);
12 | }
13 | 
14 | static inline void delay_exec(delay_t * state)
15 | {
16 |   long n;
17 |   lrand48_r(state, &n);
18 | 
19 |   int j;
20 |   for (j = 50; j < 50 + n % 100; ++j) {
21 |     __asm__ ("nop");
22 |   }
23 | }
24 | 
25 | #endif /* end of include guard: DELAY_H */
26 | 


--------------------------------------------------------------------------------
/benchmark/driver:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | T90=( \
 4 |   6.314 2.920 2.353 2.132 2.015 1.943 1.895 1.860 1.833 1.812 \
 5 |   1.796 1.782 1.771 1.761 1.753 1.746 1.740 1.734 1.729 1.725 \
 6 |   1.721 1.717 1.714 1.711 1.708 1.706 1.703 1.701 1.699 1.697 \
 7 |   )
 8 | 
 9 | T95=( \
10 |   12.71 4.303 3.182 2.776 2.571 2.447 2.365 2.306 2.262 2.228 \
11 |   2.201 2.179 2.160 2.145 2.131 2.120 2.110 2.101 2.093 2.086 \
12 |   2.080 2.074 2.069 2.064 2.060 2.056 2.052 2.048 2.045 2.042 \
13 |   )
14 | 
15 | TIMES[0]=$($@ | grep Mean | awk '{ print $5 }')
16 | SUM=${TIMES[0]}
17 | printf '#%-2d %.2f\n' 1 ${TIMES[0]}
18 | 
19 | i=1
20 | while true; do
21 |   TIME=$($@ | grep Mean | awk '{ print $5 }')
22 |   TIMES[$i]=$TIME
23 |   SUM=$(echo "$SUM + $TIME" | bc)
24 |   N=$(($i + 1))
25 | 
26 |   MEAN=$(echo "$SUM / $N" | bc -l)
27 | 
28 |   STD=0
29 |   for j in "${TIMES[@]}"; do
30 |     STD=$(echo "($j - $MEAN) ^ 2 + $STD" | bc -l)
31 |   done
32 |   STD=$(echo "sqrt ($STD / $i)" | bc -l)
33 | 
34 |   ERR=$(echo "${T95[$i]} * $STD / sqrt($N)" | bc -l)
35 |   PRECISION=$(echo "$ERR / $MEAN" | bc -l)
36 | 
37 |   printf '#%-2d %.2f %.2f %.4f %.2f %.3f\n' \
38 |     $N $TIME $MEAN $STD $ERR $PRECISION
39 | 
40 |   if (($N >= 10 || $N >= 5 && $(echo "$PRECISION < 0.02" | bc) == 1)); then
41 |     break
42 |   else
43 |     i=$N
44 |   fi
45 | done
46 | 
47 | 


--------------------------------------------------------------------------------
/benchmark/faa.c:
--------------------------------------------------------------------------------
 1 | #include "queue.h"
 2 | #include "primitives.h"
 3 | 
 4 | void queue_init(queue_t * q, int nprocs) {}
 5 | void queue_register(queue_t * q, handle_t * hd, int id)
 6 | {
 7 |   *hd = id + 1;
 8 | }
 9 | 
10 | void enqueue(queue_t * q, handle_t * th, void * val)
11 | {
12 |   FAA(&q->P, 1);
13 | }
14 | 
15 | void * dequeue(queue_t * q, handle_t * th)
16 | {
17 |   FAA(&q->C, 1);
18 |   return (void *) (long) *th;
19 | }
20 | 
21 | void queue_free(queue_t * q, handle_t * h) {}
22 | 
23 | 


--------------------------------------------------------------------------------
/benchmark/halfhalf.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <stdint.h>
 4 | #include "delay.h"
 5 | #include "queue.h"
 6 | 
 7 | #ifndef LOGN_OPS
 8 | #define LOGN_OPS 7
 9 | #endif
10 | 
11 | static long nops;
12 | static queue_t * q;
13 | static handle_t ** hds;
14 | 
15 | void init(int nprocs, int logn) {
16 |   /** Use 10^7 as default input size. */
17 |   if (logn == 0) logn = LOGN_OPS;
18 | 
19 |   /** Compute the number of ops to perform. */
20 |   nops = 1;
21 |   int i;
22 |   for (i = 0; i < logn; ++i) {
23 |     nops *= 10;
24 |   }
25 | 
26 |   printf("  Number of operations: %ld\n", nops);
27 | 
28 |   // FIXME: sizeof(queue_t) varies, allocate 4MB
29 |   q = align_malloc(PAGE_SIZE, 4194304);
30 |   queue_init(q, nprocs);
31 | 
32 |   hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
33 | }
34 | 
35 | void thread_init(int id, int nprocs) {
36 |   hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t));
37 |   queue_register(q, hds[id], id);
38 | }
39 | 
40 | void thread_exit(int id, int nprocs) {
41 |   queue_free(q, hds[id]);
42 | }
43 | 
44 | void * benchmark(int id, int nprocs) {
45 |   void * val = (void *) (intptr_t) (id + 1);
46 |   handle_t * th = hds[id];
47 | 
48 |   delay_t state;
49 |   delay_init(&state, id);
50 | 
51 |   struct drand48_data rstate;
52 |   srand48_r(id, &rstate);
53 | 
54 |   int i;
55 |   for (i = 0; i < nops / nprocs; ++i) {
56 |     long n;
57 |     lrand48_r(&rstate, &n);
58 | 
59 |     if (n % 2 == 0)
60 |       enqueue(q, th, val);
61 |     else
62 |       dequeue(q, th);
63 | 
64 | //    delay_exec(&state);
65 |   }
66 | 
67 |   return val;
68 | }
69 | 
70 | int verify(int nprocs, void ** results) {
71 |   return 0;
72 | }
73 | 


--------------------------------------------------------------------------------
/benchmark/harness.c:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdio.h>
  3 | #include <limits.h>
  4 | #include <stdlib.h>
  5 | #include <stdint.h>
  6 | #include <unistd.h>
  7 | #include <pthread.h>
  8 | #include <sys/time.h>
  9 | #include "bits.h"
 10 | #include "cpumap.h"
 11 | #include "benchmark.h"
 12 | 
 13 | #ifndef NUM_ITERS
 14 | #define NUM_ITERS 5
 15 | #endif
 16 | 
 17 | #ifndef MAX_PROCS
 18 | #define MAX_PROCS 512
 19 | #endif
 20 | 
 21 | #ifndef MAX_ITERS
 22 | #define MAX_ITERS 20
 23 | #endif
 24 | 
 25 | #ifndef COV_THRESHOLD
 26 | #define COV_THRESHOLD 0.02
 27 | #endif
 28 | 
 29 | static pthread_barrier_t barrier;
 30 | static double times[MAX_ITERS];
 31 | static double means[MAX_ITERS];
 32 | static double covs[MAX_ITERS];
 33 | static volatile int target;
 34 | 
 35 | static size_t elapsed_time(size_t us)
 36 | {
 37 |   struct timeval t;
 38 |   gettimeofday(&t, NULL);
 39 |   return t.tv_sec * 1000000 + t.tv_usec - us;
 40 | }
 41 | 
 42 | static double compute_mean(const double * times)
 43 | {
 44 |   int i;
 45 |   double sum = 0;
 46 | 
 47 |   for (i = 0; i < NUM_ITERS; ++i) {
 48 |     sum += times[i];
 49 |   }
 50 | 
 51 |   return sum / NUM_ITERS;
 52 | }
 53 | 
 54 | static double compute_cov(const double * times, double mean)
 55 | {
 56 |   double variance = 0;
 57 | 
 58 |   int i;
 59 |   for (i = 0; i < NUM_ITERS; ++i) {
 60 |     variance += (times[i] - mean) * (times[i] - mean);
 61 |   }
 62 | 
 63 |   variance /= NUM_ITERS;
 64 | 
 65 |   double cov = sqrt(variance);;
 66 |   cov /= mean;
 67 |   return cov;
 68 | }
 69 | 
 70 | static size_t reduce_min(long val, int id, int nprocs)
 71 | {
 72 |   static long buffer[MAX_PROCS];
 73 | 
 74 |   buffer[id] = val;
 75 |   pthread_barrier_wait(&barrier);
 76 | 
 77 |   long min = LONG_MAX;
 78 |   int i;
 79 |   for (i = 0; i < nprocs; ++i) {
 80 |     if (buffer[i] < min) min = buffer[i];
 81 |   }
 82 | 
 83 |   return min;
 84 | }
 85 | 
 86 | static void report(int id, int nprocs, int i, long us)
 87 | {
 88 |   long ms = reduce_min(us, id, nprocs);
 89 | 
 90 |   if (id == 0) {
 91 |     times[i] = ms / 1000.0;
 92 |     printf("  #%d elapsed time: %.2f ms\n", i + 1, times[i]);
 93 | 
 94 |     if (i + 1 >= NUM_ITERS) {
 95 |       int n = i + 1 - NUM_ITERS;
 96 | 
 97 |       means[i] = compute_mean(times + n);
 98 |       covs[i] = compute_cov(times + n, means[i]);
 99 | 
100 |       if (covs[i] < COV_THRESHOLD) {
101 |         target = i;
102 |       }
103 |     }
104 |   }
105 | 
106 |   pthread_barrier_wait(&barrier);
107 | }
108 | 
109 | static void * thread(void * bits)
110 | {
111 |   int id = bits_hi(bits);
112 |   int nprocs = bits_lo(bits);
113 | 
114 |   cpu_set_t set;
115 |   CPU_ZERO(&set);
116 | 
117 |   int cpu = cpumap(id, nprocs);
118 |   CPU_SET(cpu, &set);
119 |   sched_setaffinity(0, sizeof(set), &set);
120 | 
121 |   thread_init(id, nprocs);
122 |   pthread_barrier_wait(&barrier);
123 | 
124 |   int i;
125 |   void * result = NULL;
126 | 
127 |   for (i = 0; i < MAX_ITERS && target == 0; ++i) {
128 |     long us = elapsed_time(0);
129 |     result = benchmark(id, nprocs);
130 |     pthread_barrier_wait(&barrier);
131 |     us = elapsed_time(us);
132 |     report(id, nprocs, i, us);
133 |   }
134 | 
135 |   thread_exit(id, nprocs);
136 |   return result;
137 | }
138 | 
139 | int main(int argc, const char *argv[])
140 | {
141 |   int nprocs = 0;
142 |   int n = 0;
143 | 
144 |   /** The first argument is nprocs. */
145 |   if (argc > 1) {
146 |     nprocs = atoi(argv[1]);
147 |   }
148 | 
149 |   /**
150 |    * Use the number of processors online as nprocs if it is not
151 |    * specified.
152 |    */
153 |   if (nprocs == 0) {
154 |     nprocs = sysconf(_SC_NPROCESSORS_ONLN);
155 |   }
156 | 
157 |   if (nprocs <= 0) return 1;
158 |   else {
159 |     /** Set concurrency level. */
160 |     pthread_setconcurrency(nprocs);
161 |   }
162 | 
163 |   /**
164 |    * The second argument is input size n.
165 |    */
166 |   if (argc > 2) {
167 |     n = atoi(argv[2]);
168 |   }
169 | 
170 |   pthread_barrier_init(&barrier, NULL, nprocs);
171 |   printf("===========================================\n");
172 |   printf("  Benchmark: %s\n", argv[0]);
173 |   printf("  Number of processors: %d\n", nprocs);
174 | 
175 |   init(nprocs, n);
176 | 
177 |   pthread_t ths[nprocs];
178 |   void * res[nprocs];
179 | 
180 |   int i;
181 |   for (i = 1; i < nprocs; i++) {
182 |     pthread_create(&ths[i], NULL, thread, bits_join(i, nprocs));
183 |   }
184 | 
185 |   res[0] = thread(bits_join(0, nprocs));
186 | 
187 |   for (i = 1; i < nprocs; i++) {
188 |     pthread_join(ths[i], &res[i]);
189 |   }
190 | 
191 |   if (target == 0) {
192 |     target = NUM_ITERS - 1;
193 |     double minCov = covs[target];
194 | 
195 |     /** Pick the result that has the lowest CoV. */
196 |     int i;
197 |     for (i = NUM_ITERS; i < MAX_ITERS; ++i) {
198 |       if (covs[i] < minCov) {
199 |         minCov = covs[i];
200 |         target = i;
201 |       }
202 |     }
203 |   }
204 | 
205 |   double mean = means[target];
206 |   double cov = covs[target];
207 |   int i1 = target - NUM_ITERS + 2;
208 |   int i2 = target + 1;
209 | 
210 |   printf("  Steady-state iterations: %d~%d\n", i1, i2);
211 |   printf("  Coefficient of variation: %.2f\n", cov);
212 |   printf("  Number of measurements: %d\n", NUM_ITERS);
213 |   printf("  Mean of elapsed time: %.2f ms\n", mean);
214 |   printf("===========================================\n");
215 | 
216 |   pthread_barrier_destroy(&barrier);
217 |   return verify(nprocs, res);
218 | }
219 | 
220 | 


--------------------------------------------------------------------------------
/benchmark/hzdptr.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <string.h>
  3 | #include "hzdptr.h"
  4 | #include "xxhash.h"
  5 | 
  6 | #define HZDPTR_HTBL_SIZE(nprocs, nptrs) (4 * nprocs * nptrs)
  7 | 
  8 | typedef struct _node_t {
  9 |   struct _node_t * next;
 10 | } node_t;
 11 | 
 12 | static int htable_insert(void ** tbl, size_t size, void * ptr)
 13 | {
 14 |   int index = XXH32(ptr, 1, 0) % size;
 15 |   int i;
 16 | 
 17 |   for (i = index; i < size; ++i ) {
 18 |     if (tbl[i] == NULL) {
 19 |       tbl[i] = ptr;
 20 |       return 0;
 21 |     }
 22 |   }
 23 | 
 24 |   for (i = 0; i < index; ++i) {
 25 |     if (tbl[i] == NULL) {
 26 |       tbl[i] = ptr;
 27 |       return 0;
 28 |     }
 29 |   }
 30 | 
 31 |   return -1;
 32 | }
 33 | 
 34 | static int htable_lookup(void ** tbl, size_t size, void * ptr)
 35 | {
 36 |   int index = XXH32(ptr, 1, 0) % size;
 37 |   int i;
 38 | 
 39 |   for (i = index; i < size; ++i) {
 40 |     if (tbl[i] == ptr) {
 41 |       return 1;
 42 |     } else if (tbl[i] == NULL) {
 43 |       return 0;
 44 |     }
 45 |   }
 46 | 
 47 |   for (i = 0; i < index; ++i) {
 48 |     if (tbl[i] == ptr) {
 49 |       return 1;
 50 |     } else if (tbl[i] == NULL) {
 51 |       return 0;
 52 |     }
 53 |   }
 54 | 
 55 |   return 0;
 56 | }
 57 | 
 58 | void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs)
 59 | {
 60 |   hzd->nprocs = nprocs;
 61 |   hzd->nptrs  = nptrs;
 62 |   hzd->nretired = 0;
 63 |   hzd->ptrs = calloc(hzdptr_size(nprocs, nptrs), 1);
 64 | 
 65 |   _hzdptr_enlist(hzd);
 66 | }
 67 | 
 68 | void _hzdptr_retire(hzdptr_t * hzd, void ** rlist)
 69 | {
 70 |   size_t size = HZDPTR_HTBL_SIZE(hzd->nprocs, hzd->nptrs);
 71 |   void * plist[size];
 72 |   memset(plist, 0, sizeof(plist));
 73 | 
 74 |   hzdptr_t * me = hzd;
 75 |   void * ptr;
 76 | 
 77 |   while ((hzd = hzd->next) != me) {
 78 |     int i;
 79 |     for (i = 0; i < hzd->nptrs; ++i) {
 80 |       ptr = hzd->ptrs[i];
 81 | 
 82 |       if (ptr != NULL) {
 83 |         htable_insert(plist, size, ptr);
 84 |       }
 85 |     }
 86 |   }
 87 | 
 88 |   int nretired = 0;
 89 | 
 90 |   /** Check pointers in retire list with plist. */
 91 |   int i;
 92 |   for (i = 0; i < hzd->nretired; ++i) {
 93 |     ptr = rlist[i];
 94 | 
 95 |     if (htable_lookup(plist, size, ptr)) {
 96 |       rlist[nretired++] = ptr;
 97 |     } else {
 98 |       free(ptr);
 99 |     }
100 |   }
101 | 
102 |   hzd->nretired = nretired;
103 | }
104 | 
105 | void hzdptr_exit(hzdptr_t * hzd)
106 | {
107 |   int i;
108 |   void ** rlist = &hzd->ptrs[hzd->nptrs];
109 | 
110 |   for (i = 0; i < hzd->nretired; ++i) {
111 |     free(rlist[i]);
112 |   }
113 | 
114 |   hzd->nretired = 0;
115 |   hzd->next = hzd;
116 | }
117 | 
118 | 


--------------------------------------------------------------------------------
/benchmark/hzdptr.h:
--------------------------------------------------------------------------------
  1 | #ifndef HZDPTR_H
  2 | #define HZDPTR_H
  3 | 
  4 | #include "primitives.h"
  5 | 
  6 | typedef struct _hzdptr_t {
  7 |   struct _hzdptr_t * next;
  8 |   int nprocs;
  9 |   int nptrs;
 10 |   int nretired;
 11 |   void ** ptrs;
 12 | } hzdptr_t;
 13 | 
 14 | #define HZDPTR_THRESHOLD(nprocs) (2 * nprocs)
 15 | 
 16 | extern void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs);
 17 | extern void hzdptr_exit(hzdptr_t * hzd);
 18 | extern void _hzdptr_retire(hzdptr_t * hzd, void ** rlist);
 19 | 
 20 | static inline
 21 | int hzdptr_size(int nprocs, int nptrs)
 22 | {
 23 |   return sizeof(void * [HZDPTR_THRESHOLD(nprocs) + nptrs]);
 24 | }
 25 | 
 26 | static inline
 27 | void * _hzdptr_set(void volatile * ptr_, void * hzd_)
 28 | {
 29 |   void * volatile * ptr = (void * volatile *) ptr_;
 30 |   void * volatile * hzd = (void * volatile *) hzd_;
 31 | 
 32 |   void * val = *ptr;
 33 |   *hzd = val;
 34 |   return val;
 35 | }
 36 | 
 37 | static inline
 38 | void * hzdptr_set(void volatile * ptr, hzdptr_t * hzd, int idx)
 39 | {
 40 |   return _hzdptr_set(ptr, &hzd->ptrs[idx]);
 41 | }
 42 | 
 43 | static inline
 44 | void * _hzdptr_setv(void volatile * ptr_, void * hzd_)
 45 | {
 46 |   void * volatile * ptr = (void * volatile *) ptr_;
 47 |   void * volatile * hzd = (void * volatile *) hzd_;
 48 | 
 49 |   void * val = *ptr;
 50 |   void * tmp;
 51 | 
 52 |   do {
 53 |     *hzd = val;
 54 |     tmp = val;
 55 |     FENCE();
 56 |     val = *ptr;
 57 |   } while (val != tmp);
 58 | 
 59 |   return val;
 60 | }
 61 | 
 62 | static inline
 63 | void * hzdptr_setv(void volatile * ptr, hzdptr_t * hzd, int idx)
 64 | {
 65 |   return _hzdptr_setv(ptr, &hzd->ptrs[idx]);
 66 | }
 67 | 
 68 | static inline
 69 | void hzdptr_clear(hzdptr_t * hzd, int idx)
 70 | {
 71 |   RELEASE(&hzd->ptrs[idx], NULL);
 72 | }
 73 | 
 74 | static inline
 75 | void hzdptr_retire(hzdptr_t * hzd, void * ptr)
 76 | {
 77 |   void ** rlist = &hzd->ptrs[hzd->nptrs];
 78 |   rlist[hzd->nretired++] = ptr;
 79 | 
 80 |   if (hzd->nretired == HZDPTR_THRESHOLD(hzd->nprocs)) {
 81 |     _hzdptr_retire(hzd, rlist);
 82 |   }
 83 | }
 84 | 
 85 | static inline
 86 | void _hzdptr_enlist(hzdptr_t * hzd)
 87 | {
 88 |   static hzdptr_t * volatile _tail;
 89 |   hzdptr_t * tail = _tail;
 90 | 
 91 |   if (tail == NULL) {
 92 |     hzd->next = hzd;
 93 |     if (CASra(&_tail, &tail, hzd)) return;
 94 |   }
 95 | 
 96 |   hzdptr_t * next = tail->next;
 97 | 
 98 |   do hzd->next = next;
 99 |   while (!CASra(&tail->next, &next, hzd));
100 | }
101 | 
102 | #endif /* end of include guard: HZDPTR_H */
103 | 


--------------------------------------------------------------------------------
/benchmark/lcrq.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include "lcrq.h"
  5 | #include "align.h"
  6 | #include "delay.h"
  7 | #include "hzdptr.h"
  8 | #include "primitives.h"
  9 | 
 10 | #define RING_SIZE LCRQ_RING_SIZE
 11 | 
 12 | static inline int is_empty(uint64_t v) __attribute__ ((pure));
 13 | static inline uint64_t node_index(uint64_t i) __attribute__ ((pure));
 14 | static inline uint64_t set_unsafe(uint64_t i) __attribute__ ((pure));
 15 | static inline uint64_t node_unsafe(uint64_t i) __attribute__ ((pure));
 16 | static inline uint64_t tail_index(uint64_t t) __attribute__ ((pure));
 17 | static inline int crq_is_closed(uint64_t t) __attribute__ ((pure));
 18 | 
 19 | static inline void init_ring(RingQueue *r) {
 20 |   int i;
 21 | 
 22 |   for (i = 0; i < RING_SIZE; i++) {
 23 |     r->array[i].val = -1;
 24 |     r->array[i].idx = i;
 25 |   }
 26 | 
 27 |   r->head = r->tail = 0;
 28 |   r->next = NULL;
 29 | }
 30 | 
 31 | inline int is_empty(uint64_t v)  {
 32 |   return (v == (uint64_t)-1);
 33 | }
 34 | 
 35 | 
 36 | inline uint64_t node_index(uint64_t i) {
 37 |   return (i & ~(1ull << 63));
 38 | }
 39 | 
 40 | 
 41 | inline uint64_t set_unsafe(uint64_t i) {
 42 |   return (i | (1ull << 63));
 43 | }
 44 | 
 45 | 
 46 | inline uint64_t node_unsafe(uint64_t i) {
 47 |   return (i & (1ull << 63));
 48 | }
 49 | 
 50 | 
 51 | inline uint64_t tail_index(uint64_t t) {
 52 |   return (t & ~(1ull << 63));
 53 | }
 54 | 
 55 | 
 56 | inline int crq_is_closed(uint64_t t) {
 57 |   return (t & (1ull << 63)) != 0;
 58 | }
 59 | 
 60 | void queue_init(queue_t * q, int nprocs)
 61 | {
 62 |   RingQueue *rq = align_malloc(PAGE_SIZE, sizeof(RingQueue));
 63 |   init_ring(rq);
 64 | 
 65 |   q->head = rq;
 66 |   q->tail = rq;
 67 |   q->nprocs = nprocs;
 68 | }
 69 | 
 70 | static inline void fixState(RingQueue *rq) {
 71 | 
 72 |   while (1) {
 73 |     uint64_t t = rq->tail;
 74 |     uint64_t h = rq->head;
 75 | 
 76 |     if (rq->tail != t)
 77 |       continue;
 78 | 
 79 |     if (h > t) {
 80 |       if (CAS(&rq->tail, &t, h)) break;
 81 |       continue;
 82 |     }
 83 |     break;
 84 |   }
 85 | }
 86 | 
 87 | static inline int close_crq(RingQueue *rq, const uint64_t t, const int tries) {
 88 |   uint64_t tt = t + 1;
 89 | 
 90 |   if (tries < 10)
 91 |     return CAS(&rq->tail, &tt, tt|(1ull<<63));
 92 |   else
 93 |     return BTAS(&rq->tail, 63);
 94 | }
 95 | 
 96 | static void lcrq_put(queue_t * q, handle_t * handle, uint64_t arg) {
 97 |   int try_close = 0;
 98 | 
 99 |   while (1) {
100 |     RingQueue *rq = hzdptr_setv(&q->tail, &handle->hzdptr, 0);
101 |     RingQueue *next = rq->next;
102 | 
103 |     if (next != NULL) {
104 |       CAS(&q->tail, &rq, next);
105 |       continue;
106 |     }
107 | 
108 |     uint64_t t = FAA(&rq->tail, 1);
109 | 
110 |     if (crq_is_closed(t)) {
111 |       RingQueue * nrq;
112 | alloc:
113 |       nrq = handle->next;
114 | 
115 |       void *org_nrq = nrq;
116 |       if (nrq == NULL) {
117 |         nrq = align_malloc(PAGE_SIZE, sizeof(RingQueue));
118 |         init_ring(nrq);
119 |       }
120 | 
121 |       // Solo enqueue
122 |       nrq->tail = 1;
123 |       nrq->array[0].val = (uint64_t) arg;
124 |       nrq->array[0].idx = 0;
125 | 
126 |       if (CAS(&rq->next, &next, nrq)) {
127 |         CAS(&q->tail, &rq, nrq);
128 |         handle->next = NULL;
129 |         return;
130 |       }
131 | 
132 |       // Did not succeed, free the buffer
133 |       if (org_nrq == NULL) free(nrq);
134 |       continue;
135 |     }
136 | 
137 |     RingNode* cell = &rq->array[t & (RING_SIZE-1)];
138 | 
139 |     uint64_t idx = cell->idx;
140 |     uint64_t val = cell->val;
141 | 
142 |     if (is_empty(val)) {
143 |       if (node_index(idx) <= t) {
144 |         if ((!node_unsafe(idx) || rq->head < t) &&
145 |             CAS2(cell, &val, &idx, arg, t)) {
146 |           return;
147 |         }
148 |       }
149 |     }
150 | 
151 |     uint64_t h = rq->head;
152 | 
153 |     if ((int64_t)(t - h) >= (int64_t)RING_SIZE &&
154 |         close_crq(rq, t, ++try_close)) {
155 |       goto alloc;
156 |     }
157 |   }
158 | 
159 |   hzdptr_clear(&handle->hzdptr, 0);
160 | }
161 | 
162 | static uint64_t lcrq_get(queue_t * q, handle_t * handle) {
163 |   while (1) {
164 |     RingQueue *rq = hzdptr_setv(&q->head, &handle->hzdptr, 0);
165 |     RingQueue *next;
166 | 
167 |     uint64_t h = FAA(&rq->head, 1);
168 | 
169 |     RingNode* cell = &rq->array[h & (RING_SIZE-1)];
170 | 
171 |     uint64_t tt = 0;
172 |     int r = 0;
173 | 
174 |     while (1) {
175 | 
176 |       uint64_t cell_idx = cell->idx;
177 |       uint64_t unsafe = node_unsafe(cell_idx);
178 |       uint64_t idx = node_index(cell_idx);
179 |       uint64_t val = cell->val;
180 | 
181 |       if (idx > h) break;
182 | 
183 |       if (!is_empty(val)) {
184 |         if (idx == h) {
185 |           if (CAS2(cell, &val, &cell_idx, -1, (unsafe | h) + RING_SIZE))
186 |             return val;
187 |         } else {
188 |           if (CAS2(cell, &val, &cell_idx, val, set_unsafe(idx))) {
189 |             break;
190 |           }
191 |         }
192 |       } else {
193 |         if ((r & ((1ull << 10) - 1)) == 0)
194 |           tt = rq->tail;
195 | 
196 |         // Optimization: try to bail quickly if queue is closed.
197 |         int crq_closed = crq_is_closed(tt);
198 |         uint64_t t = tail_index(tt);
199 | 
200 |         if (unsafe) { // Nothing to do, move along
201 |           if (CAS2(cell, &val, &cell_idx, val, (unsafe | h) + RING_SIZE))
202 |             break;
203 |         } else if (t < h + 1 || r > 200000 || crq_closed) {
204 |           if (CAS2(cell, &val, &idx, val, h + RING_SIZE)) {
205 |             if (r > 200000 && tt > RING_SIZE)
206 |               BTAS(&rq->tail, 63);
207 |             break;
208 |           }
209 |         } else {
210 |           ++r;
211 |         }
212 |       }
213 |     }
214 | 
215 |     if (tail_index(rq->tail) <= h + 1) {
216 |       fixState(rq);
217 |       // try to return empty
218 |       next = rq->next;
219 |       if (next == NULL)
220 |         return -1;  // EMPTY
221 |       if (tail_index(rq->tail) <= h + 1) {
222 |         if (CAS(&q->head, &rq, next)) {
223 |           hzdptr_retire(&handle->hzdptr, rq);
224 |         }
225 |       }
226 |     }
227 |   }
228 | 
229 |   hzdptr_clear(&handle->hzdptr, 0);
230 | }
231 | 
232 | void queue_register(queue_t * q, handle_t * th, int id)
233 | {
234 |   hzdptr_init(&th->hzdptr, q->nprocs, 1);
235 | }
236 | 
237 | void enqueue(queue_t * q, handle_t * th, void * val)
238 | {
239 |   lcrq_put(q, th, (uint64_t) val);
240 | }
241 | 
242 | void * dequeue(queue_t * q, handle_t * th)
243 | {
244 |   return (void *) lcrq_get(q, th);
245 | }
246 | void queue_free(queue_t * q, handle_t * h){
247 |   RingQueue *rq = q->orignialHead;
248 |   while(rq){
249 |     RingQueue *n = rq->next;
250 |     free(rq);
251 |     rq = n;
252 |   };
253 | }
254 | 


--------------------------------------------------------------------------------
/benchmark/lcrq.h:
--------------------------------------------------------------------------------
 1 | #ifndef LCRQ_H
 2 | #define LCRQ_H
 3 | 
 4 | #ifdef LCRQ
 5 | 
 6 | #include "align.h"
 7 | #include "hzdptr.h"
 8 | 
 9 | #define EMPTY ((void *) -1)
10 | 
11 | #ifndef LCRQ_RING_SIZE
12 | #define LCRQ_RING_SIZE (1ull << 12)
13 | #endif
14 | 
15 | typedef struct RingNode {
16 |   volatile uint64_t val;
17 |   volatile uint64_t idx;
18 |   uint64_t pad[14];
19 | } RingNode DOUBLE_CACHE_ALIGNED;
20 | 
21 | typedef struct RingQueue {
22 |   volatile int64_t head DOUBLE_CACHE_ALIGNED;
23 |   volatile int64_t tail DOUBLE_CACHE_ALIGNED;
24 |   struct RingQueue *next DOUBLE_CACHE_ALIGNED;
25 |   RingNode array[LCRQ_RING_SIZE];
26 | } RingQueue DOUBLE_CACHE_ALIGNED;
27 | 
28 | typedef struct {
29 |   RingQueue * volatile head DOUBLE_CACHE_ALIGNED;
30 |   RingQueue * volatile tail DOUBLE_CACHE_ALIGNED;
31 |   RingQueue * orignialHead;
32 |   int nprocs;
33 | } queue_t;
34 | 
35 | typedef struct {
36 |   RingQueue * next;
37 |   hzdptr_t hzdptr;
38 | } handle_t;
39 | 
40 | #endif
41 | 
42 | #endif /* end of include guard: LCRQ_H */
43 | 


--------------------------------------------------------------------------------
/benchmark/msqueue.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include "delay.h"
 3 | #include "msqueue.h"
 4 | #include "primitives.h"
 5 | 
 6 | void queue_init(queue_t * q, int nprocs)
 7 | {
 8 |   node_t * node = malloc(sizeof(node_t));
 9 |   node->next = NULL;
10 | 
11 |   q->head = node;
12 |   q->tail = node;
13 |   q->nprocs = nprocs;
14 | }
15 | 
16 | void queue_register(queue_t * q, handle_t * th, int id)
17 | {
18 |   hzdptr_init(&th->hzd, q->nprocs, 2);
19 | }
20 | 
21 | void enqueue(queue_t * q, handle_t * handle, void * data)
22 | {
23 |   node_t * node = malloc(sizeof(node_t));
24 | 
25 |   node->data = data;
26 |   node->next = NULL;
27 | 
28 |   node_t * tail;
29 |   node_t * next;
30 | 
31 |   while (1) {
32 |     tail = hzdptr_setv(&q->tail, &handle->hzd, 0);
33 |     next = tail->next;
34 | 
35 |     if (tail != q->tail) {
36 |       continue;
37 |     }
38 | 
39 |     if (next != NULL) {
40 |       CAS(&q->tail, &tail, next);
41 |       continue;
42 |     }
43 | 
44 |     if (CAS(&tail->next, &next, node)) break;
45 |   }
46 | 
47 |   CAS(&q->tail, &tail, node);
48 | }
49 | 
50 | void * dequeue(queue_t * q, handle_t * handle)
51 | {
52 |   void * data;
53 | 
54 |   node_t * head;
55 |   node_t * tail;
56 |   node_t * next;
57 | 
58 |   while (1) {
59 |     head = hzdptr_setv(&q->head, &handle->hzd, 0);
60 |     tail = q->tail;
61 |     next = hzdptr_set(&head->next, &handle->hzd, 1);
62 | 
63 |     if (head != q->head) {
64 |       continue;
65 |     }
66 | 
67 |     if (next == NULL) {
68 |       return (void *) -1;
69 |     }
70 | 
71 |     if (head == tail) {
72 |       CAS(&q->tail, &tail, next);
73 |       continue;
74 |     }
75 | 
76 |     data = next->data;
77 |     if (CAS(&q->head, &head, next)) break;
78 |   }
79 | 
80 |   hzdptr_retire(&handle->hzd, head);
81 |   return data;
82 | }
83 | 
84 | void queue_free(int id, int nprocs) {}
85 | 


--------------------------------------------------------------------------------
/benchmark/msqueue.h:
--------------------------------------------------------------------------------
 1 | #ifndef MSQUEUE_H
 2 | #define MSQUEUE_H
 3 | 
 4 | #ifdef MSQUEUE
 5 | #include "align.h"
 6 | #include "hzdptr.h"
 7 | 
 8 | #define EMPTY (void *) -1
 9 | 
10 | typedef struct _node_t {
11 |   struct _node_t * volatile next DOUBLE_CACHE_ALIGNED;
12 |   void * data DOUBLE_CACHE_ALIGNED;
13 | } node_t DOUBLE_CACHE_ALIGNED;
14 | 
15 | typedef struct _queue_t {
16 |   struct _node_t * volatile head DOUBLE_CACHE_ALIGNED;
17 |   struct _node_t * volatile tail DOUBLE_CACHE_ALIGNED;
18 |   int nprocs;
19 | } queue_t DOUBLE_CACHE_ALIGNED;
20 | 
21 | typedef struct _handle_t {
22 |   hzdptr_t hzd;
23 | } handle_t DOUBLE_CACHE_ALIGNED;
24 | 
25 | #endif
26 | 
27 | #endif /* end of include guard: MSQUEUE_H */
28 | 


--------------------------------------------------------------------------------
/benchmark/ncq.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "ncq.h"
 5 | 
 6 | void queue_init(queue_t * q, int nprocs)
 7 | {
 8 |   lfring_init_empty((struct lfring *) q->ring, NCQ_ORDER);
 9 | }
10 | 
11 | 
12 | void queue_register(queue_t * q, handle_t * th, int id)
13 | {
14 | }
15 | 
16 | void enqueue(queue_t * q, handle_t * th, void * val)
17 | {
18 |   size_t eidx = (size_t) val;
19 |   lfring_enqueue((struct lfring *) q->ring, NCQ_ORDER, eidx, false);
20 | }
21 | 
22 | void * dequeue(queue_t * q, handle_t * th)
23 | {
24 |   return (void *) lfring_dequeue((struct lfring *) q->ring, NCQ_ORDER, false);
25 | }
26 | 
27 | void queue_free(queue_t * q, handle_t * h)
28 | {
29 | }
30 | 


--------------------------------------------------------------------------------
/benchmark/ncq.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCQ_H
 2 | #define NCQ_H
 3 | 
 4 | #ifdef NCQ
 5 | 
 6 | #include <stddef.h>
 7 | #include "../lfring_naive.h"
 8 | #include "align.h"
 9 | 
10 | #define NCQ_ORDER 16
11 | #define EMPTY (void *) LFRING_EMPTY
12 | 
13 | typedef struct _queue_t {
14 |   char ring[LFRING_SIZE(NCQ_ORDER)];
15 | } queue_t DOUBLE_CACHE_ALIGNED;
16 | 
17 | typedef struct _handle_t {
18 |   int pad;
19 | } handle_t DOUBLE_CACHE_ALIGNED;
20 | 
21 | #endif
22 | 
23 | #endif /* end of include guard: NCQ_H */
24 | 


--------------------------------------------------------------------------------
/benchmark/pairwise.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <stdint.h>
 4 | #include "delay.h"
 5 | #include "queue.h"
 6 | 
 7 | #ifndef LOGN_OPS
 8 | #define LOGN_OPS 7
 9 | #endif
10 | 
11 | static long nops;
12 | static queue_t * q;
13 | static handle_t ** hds;
14 | 
15 | void init(int nprocs, int logn) {
16 | 
17 |   /** Use 10^7 as default input size. */
18 |   if (logn == 0) logn = LOGN_OPS;
19 | 
20 |   /** Compute the number of ops to perform. */
21 |   nops = 1;
22 |   int i;
23 |   for (i = 0; i < logn; ++i) {
24 |     nops *= 10;
25 |   }
26 | 
27 |   printf("  Number of operations: %ld\n", nops);
28 | 
29 |   // FIXME: sizeof(queue_t) varies, allocate 4MB
30 |   q = align_malloc(PAGE_SIZE, 4194304);
31 |   queue_init(q, nprocs);
32 | 
33 |   hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
34 | }
35 | 
36 | void thread_init(int id, int nprocs) {
37 |   hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t));
38 |   queue_register(q, hds[id], id);
39 | }
40 | 
41 | void * benchmark(int id, int nprocs) {
42 |   void * val = (void *) (intptr_t) (id + 1);
43 |   handle_t * th = hds[id];
44 | 
45 |   delay_t state;
46 |   delay_init(&state, id);
47 | 
48 |   int i;
49 |   for (i = 0; i < nops / nprocs; ++i) {
50 |     enqueue(q, th, val);
51 | //    delay_exec(&state);
52 | 
53 |     val = dequeue(q, th);
54 | //    delay_exec(&state);
55 |   }
56 | 
57 |   return val;
58 | }
59 | 
60 | void thread_exit(int id, int nprocs) {
61 |   queue_free(q, hds[id]);
62 | }
63 | 
64 | #ifdef VERIFY
65 | static int compare(const void * a, const void * b) {
66 |   return *(long *) a - *(long *) b;
67 | }
68 | #endif
69 | 
70 | int verify(int nprocs, void ** results) {
71 | #ifndef VERIFY
72 |   return 0;
73 | #else
74 |   qsort(results, nprocs, sizeof(void *), compare);
75 | 
76 |   int i;
77 |   int ret = 0;
78 | 
79 |   for (i = 0; i < nprocs; ++i) {
80 |     int res = (int) (intptr_t) results[i];
81 |     if (res != i + 1) {
82 |       fprintf(stderr, "expected %d but received %d\n", i + 1, res);
83 |       ret = 1;
84 |     }
85 |   }
86 | 
87 |   if (ret != 1) fprintf(stdout, "PASSED\n");
88 |   return ret;
89 | #endif
90 | }
91 | 


--------------------------------------------------------------------------------
/benchmark/primitives.h:
--------------------------------------------------------------------------------
  1 | /** @file */
  2 | 
  3 | #ifndef PRIMITIVES_H
  4 | #define PRIMITIVES_H
  5 | 
  6 | #if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ > 7
  7 | /**
  8 |  * An atomic fetch-and-add.
  9 |  */
 10 | #define FAA(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED)
 11 | /**
 12 |  * An atomic fetch-and-add that also ensures sequential consistency.
 13 |  */
 14 | #define FAAcs(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST)
 15 | 
 16 | /**
 17 |  * An atomic compare-and-swap.
 18 |  */
 19 | #define CAS(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
 20 |     __ATOMIC_RELAXED, __ATOMIC_RELAXED)
 21 | /**
 22 |  * An atomic compare-and-swap that also ensures sequential consistency.
 23 |  */
 24 | #define CAScs(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
 25 |     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
 26 | /**
 27 |  * An atomic compare-and-swap that ensures release semantic when succeed
 28 |  * or acquire semantic when failed.
 29 |  */
 30 | #define CASra(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
 31 |     __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
 32 | /**
 33 |  * An atomic compare-and-swap that ensures acquire semantic when succeed
 34 |  * or relaxed semantic when failed.
 35 |  */
 36 | #define CASa(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
 37 |     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)
 38 | 
 39 | /**
 40 |  * An atomic swap.
 41 |  */
 42 | #define SWAP(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_RELAXED)
 43 | 
 44 | /**
 45 |  * An atomic swap that ensures acquire release semantics.
 46 |  */
 47 | #define SWAPra(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_ACQ_REL)
 48 | 
 49 | /**
 50 |  * A memory fence to ensure sequential consistency.
 51 |  */
 52 | #define FENCE() __atomic_thread_fence(__ATOMIC_SEQ_CST)
 53 | 
 54 | /**
 55 |  * An atomic store.
 56 |  */
 57 | #define STORE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED)
 58 | 
 59 | /**
 60 |  * A store with a preceding release fence to ensure all previous load
 61 |  * and stores completes before the current store is visiable.
 62 |  */
 63 | #define RELEASE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE)
 64 | 
 65 | /**
 66 |  * A load with a following acquire fence to ensure no following load and
 67 |  * stores can start before the current load completes.
 68 |  */
 69 | #define ACQUIRE(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
 70 | 
 71 | #else /** Non-GCC or old GCC. */
 72 | #if defined(__x86_64__) || defined(_M_X64_)
 73 | 
 74 | #define FAA __sync_fetch_and_add
 75 | #define FAAcs __sync_fetch_and_add
 76 | 
 77 | static inline int
 78 | _compare_and_swap(void ** ptr, void ** expected, void * desired) {
 79 |   void * oldval = *expected;
 80 |   void * newval = __sync_val_compare_and_swap(ptr, oldval, desired);
 81 | 
 82 |   if (newval == oldval) {
 83 |     return 1;
 84 |   } else {
 85 |     *expected = newval;
 86 |     return 0;
 87 |   }
 88 | }
 89 | #define CAS(ptr, expected, desired) \
 90 |   _compare_and_swap((void **) (ptr), (void **) (expected), (void *) (desired))
 91 | #define CAScs CAS
 92 | #define CASra CAS
 93 | #define CASa  CAS
 94 | 
 95 | #define SWAP __sync_lock_test_and_set
 96 | #define SWAPra SWAP
 97 | 
 98 | #define ACQUIRE(p) ({ \
 99 |   __typeof__(*(p)) __ret = *p; \
100 |   __asm__("":::"memory"); \
101 |   __ret; \
102 | })
103 | 
104 | #define RELEASE(p, v) do {\
105 |   __asm__("":::"memory"); \
106 |   *p = v; \
107 | } while (0)
108 | #define FENCE() __sync_synchronize()
109 | 
110 | #endif
111 | #endif
112 | 
113 | #if defined(__x86_64__) || defined(_M_X64_)
114 | #define PAUSE() __asm__ ("pause")
115 | 
116 | static inline
117 | int _CAS2(volatile long * ptr, long * cmp1, long * cmp2, long val1, long val2)
118 | {
119 |   char success;
120 |   long tmp1 = *cmp1;
121 |   long tmp2 = *cmp2;
122 | 
123 |   __asm__ __volatile__(
124 |       "lock cmpxchg16b %1\n"
125 |       "setz %0"
126 |       : "=q" (success), "+m" (*ptr), "+a" (tmp1), "+d" (tmp2)
127 |       : "b" (val1), "c" (val2)
128 |       : "cc" );
129 | 
130 |   *cmp1 = tmp1;
131 |   *cmp2 = tmp2;
132 |   return success;
133 | }
134 | #define CAS2(p, o1, o2, n1, n2) \
135 |   _CAS2((volatile long *) p, (long *) o1, (long *) o2, (long) n1, (long) n2)
136 | 
137 | #define BTAS(ptr, bit) ({ \
138 |   char __ret; \
139 |   __asm__ __volatile__( \
140 |       "lock btsq %2, %0; setnc %1" \
141 |       : "+m" (*ptr), "=r" (__ret) : "ri" (bit) : "cc" ); \
142 |   __ret; \
143 | })
144 | 
145 | #else
146 | #define PAUSE()
147 | #endif
148 | 
149 | #endif /* end of include guard: PRIMITIVES_H */
150 | 


--------------------------------------------------------------------------------
/benchmark/queue.h:
--------------------------------------------------------------------------------
 1 | #ifndef QUEUE_H
 2 | #define QUEUE_H
 3 | 
 4 | #ifdef WFQUEUE
 5 | #include "wfqueue.h"
 6 | 
 7 | #elif LCRQ
 8 | #include "lcrq.h"
 9 | 
10 | #elif CCQUEUE
11 | #include "ccqueue.h"
12 | 
13 | #elif MSQUEUE
14 | #include "msqueue.h"
15 | 
16 | #elif FAAQ
17 | #include "align.h"
18 | 
19 | typedef struct {
20 |   volatile long P DOUBLE_CACHE_ALIGNED;
21 |   volatile long C DOUBLE_CACHE_ALIGNED;
22 | } queue_t DOUBLE_CACHE_ALIGNED;
23 | 
24 | typedef int handle_t;
25 | 
26 | #elif DELAY
27 | 
28 | typedef int queue_t;
29 | typedef int handle_t;
30 | 
31 | #elif NCQ
32 | #include "ncq.h"
33 | 
34 | #elif SCQ
35 | #include "scq.h"
36 | 
37 | #elif SCQ
38 | #include "scqd.h"
39 | 
40 | #elif SCQ2
41 | #include "scq2.h"
42 | 
43 | #else
44 | #error "Please specify a queue implementation."
45 | 
46 | #endif
47 | 
48 | void queue_init(queue_t * q, int nprocs);
49 | void queue_register(queue_t * q, handle_t * th, int id);
50 | void enqueue(queue_t * q, handle_t * th, void * v);
51 | void * dequeue(queue_t * q, handle_t * th);
52 | void queue_free(queue_t * q, handle_t * h);
53 | 
54 | #endif /* end of include guard: QUEUE_H */
55 | 


--------------------------------------------------------------------------------
/benchmark/scq.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "scq.h"
 5 | 
 6 | void queue_init(queue_t * q, int nprocs)
 7 | {
 8 |   lfring_init_empty((struct lfring *) q->ring, SCQ_ORDER);
 9 | }
10 | 
11 | 
12 | void queue_register(queue_t * q, handle_t * th, int id)
13 | {
14 | }
15 | 
16 | void enqueue(queue_t * q, handle_t * th, void * val)
17 | {
18 |   size_t eidx = (size_t) val;
19 |   lfring_enqueue((struct lfring *) q->ring, SCQ_ORDER, eidx, false);
20 | }
21 | 
22 | void * dequeue(queue_t * q, handle_t * th)
23 | {
24 |   return (void *) lfring_dequeue((struct lfring *) q->ring, SCQ_ORDER, false);
25 | }
26 | 
27 | void queue_free(queue_t * q, handle_t * h)
28 | {
29 | }
30 | 


--------------------------------------------------------------------------------
/benchmark/scq.h:
--------------------------------------------------------------------------------
 1 | #ifndef SCQ_H
 2 | #define SCQ_H
 3 | 
 4 | #ifdef SCQ
 5 | 
 6 | #include <stddef.h>
 7 | #include "../lfring_cas1.h"
 8 | #include "align.h"
 9 | 
10 | #define SCQ_ORDER 15
11 | #define EMPTY (void *) LFRING_EMPTY
12 | 
13 | typedef struct _queue_t {
14 |   char ring[LFRING_SIZE(SCQ_ORDER)];
15 | } queue_t DOUBLE_CACHE_ALIGNED;
16 | 
17 | typedef struct _handle_t {
18 |   int pad;
19 | } handle_t DOUBLE_CACHE_ALIGNED;
20 | 
21 | #endif
22 | 
23 | #endif /* end of include guard: SCQ_H */
24 | 


--------------------------------------------------------------------------------
/benchmark/scq2.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "scq2.h"
 5 | 
 6 | void queue_init(queue_t * q, int nprocs)
 7 | {
 8 |   lfring_ptr_init_empty((struct lfring_ptr *) q->ring, SCQ2_ORDER);
 9 | }
10 | 
11 | 
12 | void queue_register(queue_t * q, handle_t * th, int id)
13 | {
14 |   lfring_ptr_init_lhead(&th->lhead, SCQ2_ORDER);
15 | }
16 | 
17 | void enqueue(queue_t * q, handle_t * th, void * val)
18 | {
19 |   lfring_ptr_enqueue((struct lfring_ptr *) q->ring, SCQ2_ORDER, val + 1,
20 |     false, false, &th->lhead);
21 | }
22 | 
23 | void * dequeue(queue_t * q, handle_t * th)
24 | {
25 |   void *ptr;
26 |   if (!lfring_ptr_dequeue((struct lfring_ptr *) q->ring, SCQ2_ORDER,
27 |       &ptr, false))
28 |     return EMPTY;
29 |   ptr--;
30 |   return ptr;
31 | }
32 | 
33 | void queue_free(queue_t * q, handle_t * h)
34 | {
35 | }
36 | 


--------------------------------------------------------------------------------
/benchmark/scq2.h:
--------------------------------------------------------------------------------
 1 | #ifndef SCQ2_H
 2 | #define SCQ2_H
 3 | 
 4 | #ifdef SCQ2
 5 | 
 6 | #include <stddef.h>
 7 | #include "../lfring_cas2.h"
 8 | #include "align.h"
 9 | 
10 | #define SCQ2_ORDER 15
11 | #define EMPTY (void *) -1
12 | 
13 | typedef struct _queue_t {
14 |   char ring[LFRING_PTR_SIZE(SCQ2_ORDER)];
15 | } queue_t DOUBLE_CACHE_ALIGNED;
16 | 
17 | typedef struct _handle_t {
18 |   lfatomic_t lhead;
19 | } handle_t DOUBLE_CACHE_ALIGNED;
20 | 
21 | #endif
22 | 
23 | #endif /* end of include guard: SCQ_H */
24 | 


--------------------------------------------------------------------------------
/benchmark/scqd.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "scqd.h"
 5 | 
 6 | void queue_init(queue_t * q, int nprocs)
 7 | {
 8 |   lfring_init_empty((struct lfring *) q->aq, SCQD_ORDER);
 9 |   lfring_init_full((struct lfring *) q->fq, SCQD_ORDER);
10 | }
11 | 
12 | 
13 | void queue_register(queue_t * q, handle_t * th, int id)
14 | {
15 | }
16 | 
17 | void enqueue(queue_t * q, handle_t * th, void * val)
18 | {
19 |   size_t eidx;
20 |   eidx = lfring_dequeue((struct lfring *) q->fq, SCQD_ORDER, true);
21 |   if (eidx == LFRING_EMPTY) return;
22 |   q->val[eidx] = val;
23 |   lfring_enqueue((struct lfring *) q->aq, SCQD_ORDER, eidx, false);
24 | }
25 | 
26 | void * dequeue(queue_t * q, handle_t * th)
27 | {
28 |   size_t eidx;
29 |   void *val;
30 |   eidx = lfring_dequeue((struct lfring *) q->aq, SCQD_ORDER, false);
31 |   if (eidx == LFRING_EMPTY) return EMPTY;
32 |   val = q->val[eidx];
33 |   lfring_enqueue((struct lfring *) q->fq, SCQD_ORDER, eidx, true);
34 |   return val;
35 | }
36 | 
37 | void queue_free(queue_t * q, handle_t * h)
38 | {
39 | }
40 | 


--------------------------------------------------------------------------------
/benchmark/scqd.h:
--------------------------------------------------------------------------------
 1 | #ifndef SCQD_H
 2 | #define SCQD_H
 3 | 
 4 | #ifdef SCQD
 5 | 
 6 | #include <stddef.h>
 7 | #include "../lfring_cas1.h"
 8 | #include "align.h"
 9 | 
10 | #define SCQD_ORDER 16
11 | #define EMPTY (void *) LFRING_EMPTY
12 | 
13 | typedef struct _queue_t {
14 |   char aq[LFRING_SIZE(SCQD_ORDER)];
15 |   char fq[LFRING_SIZE(SCQD_ORDER)];
16 |   void  *val[(1U << SCQD_ORDER)];
17 | } queue_t DOUBLE_CACHE_ALIGNED;
18 | 
19 | typedef struct _handle_t {
20 |   int pad;
21 | } handle_t DOUBLE_CACHE_ALIGNED;
22 | 
23 | #endif
24 | 
25 | #endif /* end of include guard: SCQD_H */
26 | 


--------------------------------------------------------------------------------
/benchmark/wcq.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "wcq.h"
 5 | 
 6 | void queue_init(queue_t * q, int nprocs)
 7 | {
 8 |   wfring_init_empty((struct wfring *) q->ring, WCQ_ORDER);
 9 | }
10 | 
11 | static _Atomic(struct wfring_state *) handle_tail = ATOMIC_VAR_INIT(NULL);
12 | 
13 | void queue_register(queue_t * q, handle_t * th, int id)
14 | {
15 |   wfring_init_state((struct wfring *) q->ring, &th->state);
16 | 
17 |   struct wfring_state *tail = atomic_load(&handle_tail);
18 |   if (tail == NULL) {
19 |     th->state.next = &th->state;
20 |     if (atomic_compare_exchange_strong(&handle_tail, &tail, &th->state))
21 |       return;
22 |   }
23 | 
24 |   struct wfring_state *next = atomic_load(&tail->next);
25 |   do {
26 |     th->state.next = next;
27 |   } while (!atomic_compare_exchange_weak(&tail->next, &next, &th->state));
28 | }
29 | 
30 | void enqueue(queue_t * q, handle_t * th, void * val)
31 | {
32 |   size_t eidx = (size_t) val;
33 |   wfring_enqueue((struct wfring *) q->ring, WCQ_ORDER, eidx, false, &th->state);
34 | }
35 | 
36 | void * dequeue(queue_t * q, handle_t * th)
37 | {
38 |   return (void *) wfring_dequeue((struct wfring *) q->ring, WCQ_ORDER, false, &th->state);
39 | }
40 | 
41 | void queue_free(queue_t * q, handle_t * h)
42 | {
43 | }
44 | 


--------------------------------------------------------------------------------
/benchmark/wcq.h:
--------------------------------------------------------------------------------
 1 | #ifndef WCQ_H
 2 | #define WCQ_H
 3 | 
 4 | #ifdef WCQ
 5 | 
 6 | #include <stddef.h>
 7 | #include "../wfring_cas2.h"
 8 | #include "align.h"
 9 | 
10 | #define WCQ_ORDER 15
11 | #define EMPTY (void *) WFRING_EMPTY
12 | 
13 | typedef struct _queue_t {
14 |   char ring[WFRING_SIZE(WCQ_ORDER)];
15 | } queue_t DOUBLE_CACHE_ALIGNED;
16 | 
17 | typedef struct _handle_t {
18 |   struct wfring_state state;
19 | } handle_t DOUBLE_CACHE_ALIGNED;
20 | 
21 | #endif
22 | 
23 | #endif /* end of include guard: WCQ_H */
24 | 


--------------------------------------------------------------------------------
/benchmark/wfqueue.c:
--------------------------------------------------------------------------------
  1 | #include "wfqueue.h"
  2 | #include <pthread.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | #include "primitives.h"
  6 | 
  7 | #define N WFQUEUE_NODE_SIZE
  8 | #define BOT ((void *)0)
  9 | #define TOP ((void *)-1)
 10 | 
 11 | #define MAX_GARBAGE(n) (2 * n)
 12 | 
 13 | #ifndef MAX_SPIN
 14 | #define MAX_SPIN 100
 15 | #endif
 16 | 
 17 | #ifndef MAX_PATIENCE
 18 | #define MAX_PATIENCE 10
 19 | #endif
 20 | 
 21 | typedef struct _enq_t enq_t;
 22 | typedef struct _deq_t deq_t;
 23 | typedef struct _cell_t cell_t;
 24 | typedef struct _node_t node_t;
 25 | 
 26 | static inline void *spin(void *volatile *p) {
 27 |     int patience = MAX_SPIN;
 28 |     void *v = *p;
 29 | 
 30 |     while (!v && patience-- > 0) {
 31 |         v = *p;
 32 |         PAUSE();
 33 |     }
 34 | 
 35 |     return v;
 36 | }
 37 | 
 38 | static inline node_t *new_node() {
 39 |     node_t *n = align_malloc(PAGE_SIZE, sizeof(node_t));
 40 |     memset(n, 0, sizeof(node_t));
 41 |     return n;
 42 | }
 43 | 
 44 | static node_t *check(unsigned long volatile *p_hzd_node_id, node_t *cur,
 45 |                      node_t *old) {
 46 |     unsigned long hzd_node_id = ACQUIRE(p_hzd_node_id);
 47 | 
 48 |     if (hzd_node_id < cur->id) {
 49 |         node_t *tmp = old;
 50 |         while (tmp->id < hzd_node_id) {
 51 |             tmp = tmp->next;
 52 |         }
 53 |         cur = tmp;
 54 |     }
 55 | 
 56 |     return cur;
 57 | }
 58 | 
 59 | static node_t *update(node_t *volatile *pPn, node_t *cur,
 60 |                       unsigned long volatile *p_hzd_node_id, node_t *old) {
 61 |     node_t *ptr = ACQUIRE(pPn);
 62 | 
 63 |     if (ptr->id < cur->id) {
 64 |         if (!CAScs(pPn, &ptr, cur)) {
 65 |             if (ptr->id < cur->id) cur = ptr;
 66 |         }
 67 | 
 68 |         cur = check(p_hzd_node_id, cur, old);
 69 |     }
 70 | 
 71 |     return cur;
 72 | }
 73 | 
 74 | static void cleanup(queue_t *q, handle_t *th) {
 75 |     long oid = ACQUIRE(&q->Hi);
 76 |     node_t *new = th->Dp;
 77 | 
 78 |     if (oid == -1) return;
 79 |     if (new->id - oid < MAX_GARBAGE(q->nprocs)) return;
 80 |     if (!CASa(&q->Hi, &oid, -1)) return;
 81 | 
 82 |     node_t *old = q->Hp;
 83 |     handle_t *ph = th;
 84 |     handle_t *phs[q->nprocs];
 85 |     int i = 0;
 86 | 
 87 |     do {
 88 |         new = check(&ph->hzd_node_id, new, old);
 89 |         new = update(&ph->Ep, new, &ph->hzd_node_id, old);
 90 |         new = update(&ph->Dp, new, &ph->hzd_node_id, old);
 91 | 
 92 |         phs[i++] = ph;
 93 |         ph = ph->next;
 94 |     } while (new->id > oid && ph != th);
 95 | 
 96 |     while (new->id > oid && --i >= 0) {
 97 |         new = check(&phs[i]->hzd_node_id, new, old);
 98 |     }
 99 | 
100 |     long nid = new->id;
101 | 
102 |     if (nid <= oid) {
103 |         RELEASE(&q->Hi, oid);
104 |     } else {
105 |         q->Hp = new;
106 |         RELEASE(&q->Hi, nid);
107 | 
108 |         while (old != new) {
109 |             node_t *tmp = old->next;
110 |             free(old);
111 |             old = tmp;
112 |         }
113 |     }
114 | }
115 | 
116 | static cell_t *find_cell(node_t *volatile *ptr, long i, handle_t *th) {
117 |     node_t *curr = *ptr;
118 | 
119 |     long j;
120 |     for (j = curr->id; j < i / N; ++j) {
121 |         node_t *next = curr->next;
122 | 
123 |         if (next == NULL) {
124 |             node_t *temp = th->spare;
125 | 
126 |             if (!temp) {
127 |                 temp = new_node();
128 |                 th->spare = temp;
129 |             }
130 | 
131 |             temp->id = j + 1;
132 | 
133 |             if (CASra(&curr->next, &next, temp)) {
134 |                 next = temp;
135 |                 th->spare = NULL;
136 |             }
137 |         }
138 | 
139 |         curr = next;
140 |     }
141 | 
142 |     *ptr = curr;
143 |     return &curr->cells[i % N];
144 | }
145 | 
146 | static int enq_fast(queue_t *q, handle_t *th, void *v, long *id) {
147 |     long i = FAAcs(&q->Ei, 1);
148 |     cell_t *c = find_cell(&th->Ep, i, th);
149 |     void *cv = BOT;
150 | 
151 |     if (CAS(&c->val, &cv, v)) {
152 | #ifdef RECORD
153 |         th->fastenq++;
154 | #endif
155 |         return 1;
156 |     } else {
157 |         *id = i;
158 |         return 0;
159 |     }
160 | }
161 | 
162 | static void enq_slow(queue_t *q, handle_t *th, void *v, long id) {
163 |     enq_t *enq = &th->Er;
164 |     enq->val = v;
165 |     RELEASE(&enq->id, id);
166 | 
167 |     node_t *tail = th->Ep;
168 |     long i;
169 |     cell_t *c;
170 | 
171 |     do {
172 |         i = FAA(&q->Ei, 1);
173 |         c = find_cell(&tail, i, th);
174 |         enq_t *ce = BOT;
175 | 
176 |         if (CAScs(&c->enq, &ce, enq) && c->val != TOP) {
177 |             if (CAS(&enq->id, &id, -i)) id = -i;
178 |             break;
179 |         }
180 |     } while (enq->id > 0);
181 | 
182 |     id = -enq->id;
183 |     c = find_cell(&th->Ep, id, th);
184 |     if (id > i) {
185 |         long Ei = q->Ei;
186 |         while (Ei <= id && !CAS(&q->Ei, &Ei, id + 1))
187 |             ;
188 |     }
189 |     c->val = v;
190 | 
191 | #ifdef RECORD
192 |     th->slowenq++;
193 | #endif
194 | }
195 | 
196 | void enqueue(queue_t *q, handle_t *th, void *v) {
197 |     th->hzd_node_id = th->enq_node_id;
198 | 
199 |     long id;
200 |     int p = MAX_PATIENCE;
201 |     while (!enq_fast(q, th, v, &id) && p-- > 0)
202 |         ;
203 |     if (p < 0) enq_slow(q, th, v, id);
204 | 
205 |     th->enq_node_id = th->Ep->id;
206 |     RELEASE(&th->hzd_node_id, -1);
207 | }
208 | 
209 | static void *help_enq(queue_t *q, handle_t *th, cell_t *c, long i) {
210 |     void *v = spin(&c->val);
211 | 
212 |     if ((v != TOP && v != BOT) ||
213 |         (v == BOT && !CAScs(&c->val, &v, TOP) && v != TOP)) {
214 |         return v;
215 |     }
216 | 
217 |     enq_t *e = c->enq;
218 | 
219 |     if (e == BOT) {
220 |         handle_t *ph;
221 |         enq_t *pe;
222 |         long id;
223 |         ph = th->Eh, pe = &ph->Er, id = pe->id;
224 | 
225 |         if (th->Ei != 0 && th->Ei != id) {
226 |             th->Ei = 0;
227 |             th->Eh = ph->next;
228 |             ph = th->Eh, pe = &ph->Er, id = pe->id;
229 |         }
230 | 
231 |         if (id > 0 && id <= i && !CAS(&c->enq, &e, pe))
232 |             th->Ei = id;
233 |         else
234 |             th->Eh = ph->next;
235 | 
236 |         if (e == BOT && CAS(&c->enq, &e, TOP)) e = TOP;
237 |     }
238 | 
239 |     if (e == TOP) return (q->Ei <= i ? BOT : TOP);
240 | 
241 |     long ei = ACQUIRE(&e->id);
242 |     void *ev = ACQUIRE(&e->val);
243 | 
244 |     if (ei > i) {
245 |         if (c->val == TOP && q->Ei <= i) return BOT;
246 |     } else {
247 |         if ((ei > 0 && CAS(&e->id, &ei, -i)) || (ei == -i && c->val == TOP)) {
248 |             long Ei = q->Ei;
249 |             while (Ei <= i && !CAS(&q->Ei, &Ei, i + 1))
250 |                 ;
251 |             c->val = ev;
252 |         }
253 |     }
254 | 
255 |     return c->val;
256 | }
257 | 
258 | static void help_deq(queue_t *q, handle_t *th, handle_t *ph) {
259 |     deq_t *deq = &ph->Dr;
260 |     long idx = ACQUIRE(&deq->idx);
261 |     long id = deq->id;
262 | 
263 |     if (idx < id) return;
264 | 
265 |     node_t *Dp = ph->Dp;
266 |     th->hzd_node_id = ph->hzd_node_id;
267 |     FENCE();
268 |     idx = deq->idx;
269 | 
270 |     long i = id + 1, old = id, new = 0;
271 |     while (1) {
272 |         node_t *h = Dp;
273 |         for (; idx == old && new == 0; ++i) {
274 |             cell_t *c = find_cell(&h, i, th);
275 | 
276 |             long Di = q->Di;
277 |             while (Di <= i && !CAS(&q->Di, &Di, i + 1))
278 |                 ;
279 | 
280 |             void *v = help_enq(q, th, c, i);
281 |             if (v == BOT || (v != TOP && c->deq == BOT))
282 |                 new = i;
283 |             else
284 |                 idx = ACQUIRE(&deq->idx);
285 |         }
286 | 
287 |         if (new != 0) {
288 |             if (CASra(&deq->idx, &idx, new)) idx = new;
289 |             if (idx >= new) new = 0;
290 |         }
291 | 
292 |         if (idx < 0 || deq->id != id) break;
293 | 
294 |         cell_t *c = find_cell(&Dp, idx, th);
295 |         deq_t *cd = BOT;
296 |         if (c->val == TOP || CAS(&c->deq, &cd, deq) || cd == deq) {
297 |             CAS(&deq->idx, &idx, -idx);
298 |             break;
299 |         }
300 | 
301 |         old = idx;
302 |         if (idx >= i) i = idx + 1;
303 |     }
304 | }
305 | 
306 | static void *deq_fast(queue_t *q, handle_t *th, long *id) {
307 |     long i = FAAcs(&q->Di, 1);
308 |     cell_t *c = find_cell(&th->Dp, i, th);
309 |     void *v = help_enq(q, th, c, i);
310 |     deq_t *cd = BOT;
311 | 
312 |     if (v == BOT) return BOT;
313 |     if (v != TOP && CAS(&c->deq, &cd, TOP)) return v;
314 | 
315 |     *id = i;
316 |     return TOP;
317 | }
318 | 
319 | static void *deq_slow(queue_t *q, handle_t *th, long id) {
320 |     deq_t *deq = &th->Dr;
321 |     RELEASE(&deq->id, id);
322 |     RELEASE(&deq->idx, id);
323 | 
324 |     help_deq(q, th, th);
325 |     long i = -deq->idx;
326 |     cell_t *c = find_cell(&th->Dp, i, th);
327 |     void *val = c->val;
328 | 
329 | #ifdef RECORD
330 |     th->slowdeq++;
331 | #endif
332 |     return val == TOP ? BOT : val;
333 | }
334 | 
335 | void *dequeue(queue_t *q, handle_t *th) {
336 |     th->hzd_node_id = th->deq_node_id;
337 | 
338 |     void *v;
339 |     long id = 0;
340 |     int p = MAX_PATIENCE;
341 | 
342 |     do
343 |         v = deq_fast(q, th, &id);
344 |     while (v == TOP && p-- > 0);
345 |     if (v == TOP)
346 |         v = deq_slow(q, th, id);
347 |     else {
348 | #ifdef RECORD
349 |         th->fastdeq++;
350 | #endif
351 |     }
352 | 
353 |     if (v != EMPTY) {
354 |         help_deq(q, th, th->Dh);
355 |         th->Dh = th->Dh->next;
356 |     }
357 | 
358 |     th->deq_node_id = th->Dp->id;
359 |     RELEASE(&th->hzd_node_id, -1);
360 | 
361 |     if (th->spare == NULL) {
362 |         cleanup(q, th);
363 |         th->spare = new_node();
364 |     }
365 | 
366 | #ifdef RECORD
367 |     if (v == EMPTY) th->empty++;
368 | #endif
369 |     return v;
370 | }
371 | 
372 | static pthread_barrier_t barrier;
373 | 
374 | void queue_init(queue_t *q, int nprocs) {
375 |     q->Hi = 0;
376 |     q->Hp = new_node();
377 | 
378 |     q->Ei = 1;
379 |     q->Di = 1;
380 | 
381 |     q->nprocs = nprocs;
382 | 
383 | #ifdef RECORD
384 |     q->fastenq = 0;
385 |     q->slowenq = 0;
386 |     q->fastdeq = 0;
387 |     q->slowdeq = 0;
388 |     q->empty = 0;
389 | #endif
390 |     pthread_barrier_init(&barrier, NULL, nprocs);
391 | }
392 | 
393 | void queue_free(queue_t *q, handle_t *h) {
394 | #ifdef RECORD
395 |     static int lock = 0;
396 | 
397 |     FAA(&q->fastenq, h->fastenq);
398 |     FAA(&q->slowenq, h->slowenq);
399 |     FAA(&q->fastdeq, h->fastdeq);
400 |     FAA(&q->slowdeq, h->slowdeq);
401 |     FAA(&q->empty, h->empty);
402 | 
403 |     pthread_barrier_wait(&barrier);
404 | 
405 |     if (FAA(&lock, 1) == 0)
406 |         printf("Enq: %f Deq: %f Empty: %f\n",
407 |                q->slowenq * 100.0 / (q->fastenq + q->slowenq),
408 |                q->slowdeq * 100.0 / (q->fastdeq + q->slowdeq),
409 |                q->empty * 100.0 / (q->fastdeq + q->slowdeq));
410 | #endif
411 | }
412 | 
413 | void queue_register(queue_t *q, handle_t *th, int id) {
414 |     th->next = NULL;
415 |     th->hzd_node_id = -1;
416 |     th->Ep = q->Hp;
417 |     th->enq_node_id = th->Ep->id;
418 |     th->Dp = q->Hp;
419 |     th->deq_node_id = th->Dp->id;
420 | 
421 |     th->Er.id = 0;
422 |     th->Er.val = BOT;
423 |     th->Dr.id = 0;
424 |     th->Dr.idx = -1;
425 | 
426 |     th->Ei = 0;
427 |     th->spare = new_node();
428 | #ifdef RECORD
429 |     th->slowenq = 0;
430 |     th->slowdeq = 0;
431 |     th->fastenq = 0;
432 |     th->fastdeq = 0;
433 |     th->empty = 0;
434 | #endif
435 | 
436 |     static handle_t *volatile _tail;
437 |     handle_t *tail = _tail;
438 | 
439 |     if (tail == NULL) {
440 |         th->next = th;
441 |         if (CASra(&_tail, &tail, th)) {
442 |             th->Eh = th->next;
443 |             th->Dh = th->next;
444 |             return;
445 |         }
446 |     }
447 | 
448 |     handle_t *next = tail->next;
449 |     do
450 |         th->next = next;
451 |     while (!CASra(&tail->next, &next, th));
452 | 
453 |     th->Eh = th->next;
454 |     th->Dh = th->next;
455 | }
456 | 


--------------------------------------------------------------------------------
/benchmark/wfqueue.h:
--------------------------------------------------------------------------------
  1 | #ifndef WFQUEUE_H
  2 | #define WFQUEUE_H
  3 | 
  4 | #ifdef WFQUEUE
  5 | 
  6 | #include "align.h"
  7 | #define EMPTY ((void *) 0)
  8 | 
  9 | #ifndef WFQUEUE_NODE_SIZE
 10 | #define WFQUEUE_NODE_SIZE ((1 << 10) - 2)
 11 | #endif
 12 | 
 13 | struct _enq_t {
 14 |   long volatile id;
 15 |   void * volatile val;
 16 | } CACHE_ALIGNED;
 17 | 
 18 | struct _deq_t {
 19 |   long volatile id;
 20 |   long volatile idx;
 21 | } CACHE_ALIGNED;
 22 | 
 23 | struct _cell_t {
 24 |   void * volatile val;
 25 |   struct _enq_t * volatile enq;
 26 |   struct _deq_t * volatile deq;
 27 |   void * pad[5];
 28 | };
 29 | 
 30 | struct _node_t {
 31 |   struct _node_t * volatile next CACHE_ALIGNED;
 32 |   long id CACHE_ALIGNED;
 33 |   struct _cell_t cells[WFQUEUE_NODE_SIZE] CACHE_ALIGNED;
 34 | };
 35 | 
 36 | typedef struct DOUBLE_CACHE_ALIGNED {
 37 |   /**
 38 |    * Index of the next position for enqueue.
 39 |    */
 40 |   volatile long Ei DOUBLE_CACHE_ALIGNED;
 41 | 
 42 |   /**
 43 |    * Index of the next position for dequeue.
 44 |    */
 45 |   volatile long Di DOUBLE_CACHE_ALIGNED;
 46 | 
 47 |   /**
 48 |    * Index of the head of the queue.
 49 |    */
 50 |   volatile long Hi DOUBLE_CACHE_ALIGNED;
 51 | 
 52 |   /**
 53 |    * Pointer to the head node of the queue.
 54 |    */
 55 |   struct _node_t * volatile Hp;
 56 | 
 57 |   /**
 58 |    * Number of processors.
 59 |    */
 60 |   long nprocs;
 61 | #ifdef RECORD
 62 |   long slowenq;
 63 |   long slowdeq;
 64 |   long fastenq;
 65 |   long fastdeq;
 66 |   long empty;
 67 | #endif
 68 | } queue_t;
 69 | 
 70 | typedef struct _handle_t {
 71 |   /**
 72 |    * Pointer to the next handle.
 73 |    */
 74 |   struct _handle_t * next;
 75 | 
 76 |   /**
 77 |    * Hazard pointer.
 78 |    */
 79 |   //struct _node_t * volatile Hp;
 80 |   unsigned long volatile hzd_node_id;
 81 | 
 82 |   /**
 83 |    * Pointer to the node for enqueue.
 84 |    */
 85 |   struct _node_t * volatile Ep;
 86 |   unsigned long enq_node_id;
 87 | 
 88 |   /**
 89 |    * Pointer to the node for dequeue.
 90 |    */
 91 |   struct _node_t * volatile Dp;
 92 |   unsigned long deq_node_id;
 93 | 
 94 |   /**
 95 |    * Enqueue request.
 96 |    */
 97 |   struct _enq_t Er CACHE_ALIGNED;
 98 | 
 99 |   /**
100 |    * Dequeue request.
101 |    */
102 |   struct _deq_t Dr CACHE_ALIGNED;
103 | 
104 |   /**
105 |    * Handle of the next enqueuer to help.
106 |    */
107 |   struct _handle_t * Eh CACHE_ALIGNED;
108 | 
109 |   long Ei;
110 | 
111 |   /**
112 |    * Handle of the next dequeuer to help.
113 |    */
114 |   struct _handle_t * Dh;
115 | 
116 |   /**
117 |    * Pointer to a spare node to use, to speedup adding a new node.
118 |    */
119 |   struct _node_t * spare CACHE_ALIGNED;
120 | 
121 |   /**
122 |    * Count the delay rounds of helping another dequeuer.
123 |    */
124 |   int delay;
125 | 
126 | #ifdef RECORD
127 |   long slowenq;
128 |   long slowdeq;
129 |   long fastenq;
130 |   long fastdeq;
131 |   long empty;
132 | #endif
133 | } handle_t;
134 | 
135 | #endif
136 | 
137 | #endif /* end of include guard: WFQUEUE_H */
138 | 


--------------------------------------------------------------------------------
/benchmark/xxhash.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | xxHash - Fast Hash algorithm
  3 | Copyright (C) 2012-2014, Yann Collet.
  4 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions are
  8 | met:
  9 | 
 10 | * Redistributions of source code must retain the above copyright
 11 | notice, this list of conditions and the following disclaimer.
 12 | * Redistributions in binary form must reproduce the above
 13 | copyright notice, this list of conditions and the following disclaimer
 14 | in the documentation and/or other materials provided with the
 15 | distribution.
 16 | 
 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | You can contact the author at :
 30 | - xxHash source repository : http://code.google.com/p/xxhash/
 31 | - public discussion board : https://groups.google.com/forum/#!forum/lz4c
 32 | */
 33 | 
 34 | 
 35 | //**************************************
 36 | // Tuning parameters
 37 | //**************************************
 38 | // Unaligned memory access is automatically enabled for "common" CPU, such as x86.
 39 | // For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
 40 | // If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
 41 | // You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
 42 | #if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
 43 | #  define XXH_USE_UNALIGNED_ACCESS 1
 44 | #endif
 45 | 
 46 | // XXH_ACCEPT_NULL_INPUT_POINTER :
 47 | // If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
 48 | // When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
 49 | // This option has a very small performance cost (only measurable on small inputs).
 50 | // By default, this option is disabled. To enable it, uncomment below define :
 51 | // #define XXH_ACCEPT_NULL_INPUT_POINTER 1
 52 | 
 53 | // XXH_FORCE_NATIVE_FORMAT :
 54 | // By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
 55 | // Results are therefore identical for little-endian and big-endian CPU.
 56 | // This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
 57 | // Should endian-independance be of no importance for your application, you may set the #define below to 1.
 58 | // It will improve speed for Big-endian CPU.
 59 | // This option has no impact on Little_Endian CPU.
 60 | #define XXH_FORCE_NATIVE_FORMAT 0
 61 | 
 62 | //**************************************
 63 | // Compiler Specific Options
 64 | //**************************************
 65 | // Disable some Visual warning messages
 66 | #ifdef _MSC_VER  // Visual Studio
 67 | #  pragma warning(disable : 4127)      // disable: C4127: conditional expression is constant
 68 | #endif
 69 | 
 70 | #ifdef _MSC_VER    // Visual Studio
 71 | #  define FORCE_INLINE static __forceinline
 72 | #else
 73 | #  ifdef __GNUC__
 74 | #    define FORCE_INLINE static inline __attribute__((always_inline))
 75 | #  else
 76 | #    define FORCE_INLINE static inline
 77 | #  endif
 78 | #endif
 79 | 
 80 | //**************************************
 81 | // Includes & Memory related functions
 82 | //**************************************
 83 | #include "xxhash.h"
 84 | // Modify the local functions below should you wish to use some other memory routines
 85 | // for malloc(), free()
 86 | #include <stdlib.h>
 87 | static void* XXH_malloc(size_t s) { return malloc(s); }
 88 | static void  XXH_free  (void* p)  { free(p); }
 89 | // for memcpy()
 90 | #include <string.h>
 91 | static void* XXH_memcpy(void* dest, const void* src, size_t size)
 92 | {
 93 |     return memcpy(dest,src,size);
 94 | }
 95 | 
 96 | 
 97 | //**************************************
 98 | // Basic Types
 99 | //**************************************
100 | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   // C99
101 | # include <stdint.h>
102 | typedef uint8_t  BYTE;
103 | typedef uint16_t U16;
104 | typedef uint32_t U32;
105 | typedef  int32_t S32;
106 | typedef uint64_t U64;
107 | #else
108 | typedef unsigned char      BYTE;
109 | typedef unsigned short     U16;
110 | typedef unsigned int       U32;
111 | typedef   signed int       S32;
112 | typedef unsigned long long U64;
113 | #endif
114 | 
115 | #if defined(__GNUC__)  && !defined(XXH_USE_UNALIGNED_ACCESS)
116 | #  define _PACKED __attribute__ ((packed))
117 | #else
118 | #  define _PACKED
119 | #endif
120 | 
121 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
122 | #  ifdef __IBMC__
123 | #    pragma pack(1)
124 | #  else
125 | #    pragma pack(push, 1)
126 | #  endif
127 | #endif
128 | 
129 | typedef struct _U32_S
130 | {
131 |     U32 v;
132 | } _PACKED U32_S;
133 | typedef struct _U64_S
134 | {
135 |     U64 v;
136 | } _PACKED U64_S;
137 | 
138 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
139 | #  pragma pack(pop)
140 | #endif
141 | 
142 | #define A32(x) (((U32_S *)(x))->v)
143 | #define A64(x) (((U64_S *)(x))->v)
144 | 
145 | 
146 | //***************************************
147 | // Compiler-specific Functions and Macros
148 | //***************************************
149 | #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
150 | 
151 | // Note : although _rotl exists for minGW (GCC under windows), performance seems poor
152 | #if defined(_MSC_VER)
153 | #  define XXH_rotl32(x,r) _rotl(x,r)
154 | #  define XXH_rotl64(x,r) _rotl64(x,r)
155 | #else
156 | #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
157 | #  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
158 | #endif
159 | 
160 | #if defined(_MSC_VER)     // Visual Studio
161 | #  define XXH_swap32 _byteswap_ulong
162 | #  define XXH_swap64 _byteswap_uint64
163 | #elif GCC_VERSION >= 403
164 | #  define XXH_swap32 __builtin_bswap32
165 | #  define XXH_swap64 __builtin_bswap64
166 | #else
167 | static inline U32 XXH_swap32 (U32 x)
168 | {
169 |     return  ((x << 24) & 0xff000000 ) |
170 |             ((x <<  8) & 0x00ff0000 ) |
171 |             ((x >>  8) & 0x0000ff00 ) |
172 |             ((x >> 24) & 0x000000ff );
173 | }
174 | static inline U64 XXH_swap64 (U64 x)
175 | {
176 |     return  ((x << 56) & 0xff00000000000000ULL) |
177 |             ((x << 40) & 0x00ff000000000000ULL) |
178 |             ((x << 24) & 0x0000ff0000000000ULL) |
179 |             ((x << 8)  & 0x000000ff00000000ULL) |
180 |             ((x >> 8)  & 0x00000000ff000000ULL) |
181 |             ((x >> 24) & 0x0000000000ff0000ULL) |
182 |             ((x >> 40) & 0x000000000000ff00ULL) |
183 |             ((x >> 56) & 0x00000000000000ffULL);
184 | }
185 | #endif
186 | 
187 | 
188 | //**************************************
189 | // Constants
190 | //**************************************
191 | #define PRIME32_1   2654435761U
192 | #define PRIME32_2   2246822519U
193 | #define PRIME32_3   3266489917U
194 | #define PRIME32_4    668265263U
195 | #define PRIME32_5    374761393U
196 | 
197 | #define PRIME64_1 11400714785074694791ULL
198 | #define PRIME64_2 14029467366897019727ULL
199 | #define PRIME64_3  1609587929392839161ULL
200 | #define PRIME64_4  9650029242287828579ULL
201 | #define PRIME64_5  2870177450012600261ULL
202 | 
203 | //**************************************
204 | // Architecture Macros
205 | //**************************************
206 | typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
207 | #ifndef XXH_CPU_LITTLE_ENDIAN   // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch
208 | static const int one = 1;
209 | #   define XXH_CPU_LITTLE_ENDIAN   (*(char*)(&one))
210 | #endif
211 | 
212 | 
213 | //**************************************
214 | // Macros
215 | //**************************************
216 | #define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    // use only *after* variable declarations
217 | 
218 | 
219 | //****************************
220 | // Memory reads
221 | //****************************
222 | typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
223 | 
224 | FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
225 | {
226 |     if (align==XXH_unaligned)
227 |         return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr));
228 |     else
229 |         return endian==XXH_littleEndian ? *(U32*)ptr : XXH_swap32(*(U32*)ptr);
230 | }
231 | 
232 | FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
233 | {
234 |     return XXH_readLE32_align(ptr, endian, XXH_unaligned);
235 | }
236 | 
237 | FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
238 | {
239 |     if (align==XXH_unaligned)
240 |         return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr));
241 |     else
242 |         return endian==XXH_littleEndian ? *(U64*)ptr : XXH_swap64(*(U64*)ptr);
243 | }
244 | 
245 | FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
246 | {
247 |     return XXH_readLE64_align(ptr, endian, XXH_unaligned);
248 | }
249 | 
250 | 
251 | //****************************
252 | // Simple Hash Functions
253 | //****************************
254 | FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
255 | {
256 |     const BYTE* p = (const BYTE*)input;
257 |     const BYTE* bEnd = p + len;
258 |     U32 h32;
259 | #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
260 | 
261 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
262 |     if (p==NULL)
263 |     {
264 |         len=0;
265 |         bEnd=p=(const BYTE*)(size_t)16;
266 |     }
267 | #endif
268 | 
269 |     if (len>=16)
270 |     {
271 |         const BYTE* const limit = bEnd - 16;
272 |         U32 v1 = seed + PRIME32_1 + PRIME32_2;
273 |         U32 v2 = seed + PRIME32_2;
274 |         U32 v3 = seed + 0;
275 |         U32 v4 = seed - PRIME32_1;
276 | 
277 |         do
278 |         {
279 |             v1 += XXH_get32bits(p) * PRIME32_2;
280 |             v1 = XXH_rotl32(v1, 13);
281 |             v1 *= PRIME32_1;
282 |             p+=4;
283 |             v2 += XXH_get32bits(p) * PRIME32_2;
284 |             v2 = XXH_rotl32(v2, 13);
285 |             v2 *= PRIME32_1;
286 |             p+=4;
287 |             v3 += XXH_get32bits(p) * PRIME32_2;
288 |             v3 = XXH_rotl32(v3, 13);
289 |             v3 *= PRIME32_1;
290 |             p+=4;
291 |             v4 += XXH_get32bits(p) * PRIME32_2;
292 |             v4 = XXH_rotl32(v4, 13);
293 |             v4 *= PRIME32_1;
294 |             p+=4;
295 |         }
296 |         while (p<=limit);
297 | 
298 |         h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
299 |     }
300 |     else
301 |     {
302 |         h32  = seed + PRIME32_5;
303 |     }
304 | 
305 |     h32 += (U32) len;
306 | 
307 |     while (p+4<=bEnd)
308 |     {
309 |         h32 += XXH_get32bits(p) * PRIME32_3;
310 |         h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
311 |         p+=4;
312 |     }
313 | 
314 |     while (p<bEnd)
315 |     {
316 |         h32 += (*p) * PRIME32_5;
317 |         h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
318 |         p++;
319 |     }
320 | 
321 |     h32 ^= h32 >> 15;
322 |     h32 *= PRIME32_2;
323 |     h32 ^= h32 >> 13;
324 |     h32 *= PRIME32_3;
325 |     h32 ^= h32 >> 16;
326 | 
327 |     return h32;
328 | }
329 | 
330 | 
331 | unsigned int XXH32 (const void* input, size_t len, unsigned seed)
332 | {
333 | #if 0
334 |     // Simple version, good for code maintenance, but unfortunately slow for small inputs
335 |     XXH32_state_t state;
336 |     XXH32_reset(&state, seed);
337 |     XXH32_update(&state, input, len);
338 |     return XXH32_digest(&state);
339 | #else
340 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
341 | 
342 | #  if !defined(XXH_USE_UNALIGNED_ACCESS)
343 |     if ((((size_t)input) & 3) == 0)   // Input is aligned, let's leverage the speed advantage
344 |     {
345 |         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
346 |             return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
347 |         else
348 |             return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
349 |     }
350 | #  endif
351 | 
352 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
353 |         return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
354 |     else
355 |         return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
356 | #endif
357 | }
358 | 
359 | FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
360 | {
361 |     const BYTE* p = (const BYTE*)input;
362 |     const BYTE* bEnd = p + len;
363 |     U64 h64;
364 | #define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
365 | 
366 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
367 |     if (p==NULL)
368 |     {
369 |         len=0;
370 |         bEnd=p=(const BYTE*)(size_t)32;
371 |     }
372 | #endif
373 | 
374 |     if (len>=32)
375 |     {
376 |         const BYTE* const limit = bEnd - 32;
377 |         U64 v1 = seed + PRIME64_1 + PRIME64_2;
378 |         U64 v2 = seed + PRIME64_2;
379 |         U64 v3 = seed + 0;
380 |         U64 v4 = seed - PRIME64_1;
381 | 
382 |         do
383 |         {
384 |             v1 += XXH_get64bits(p) * PRIME64_2;
385 |             p+=8;
386 |             v1 = XXH_rotl64(v1, 31);
387 |             v1 *= PRIME64_1;
388 |             v2 += XXH_get64bits(p) * PRIME64_2;
389 |             p+=8;
390 |             v2 = XXH_rotl64(v2, 31);
391 |             v2 *= PRIME64_1;
392 |             v3 += XXH_get64bits(p) * PRIME64_2;
393 |             p+=8;
394 |             v3 = XXH_rotl64(v3, 31);
395 |             v3 *= PRIME64_1;
396 |             v4 += XXH_get64bits(p) * PRIME64_2;
397 |             p+=8;
398 |             v4 = XXH_rotl64(v4, 31);
399 |             v4 *= PRIME64_1;
400 |         }
401 |         while (p<=limit);
402 | 
403 |         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
404 | 
405 |         v1 *= PRIME64_2;
406 |         v1 = XXH_rotl64(v1, 31);
407 |         v1 *= PRIME64_1;
408 |         h64 ^= v1;
409 |         h64 = h64 * PRIME64_1 + PRIME64_4;
410 | 
411 |         v2 *= PRIME64_2;
412 |         v2 = XXH_rotl64(v2, 31);
413 |         v2 *= PRIME64_1;
414 |         h64 ^= v2;
415 |         h64 = h64 * PRIME64_1 + PRIME64_4;
416 | 
417 |         v3 *= PRIME64_2;
418 |         v3 = XXH_rotl64(v3, 31);
419 |         v3 *= PRIME64_1;
420 |         h64 ^= v3;
421 |         h64 = h64 * PRIME64_1 + PRIME64_4;
422 | 
423 |         v4 *= PRIME64_2;
424 |         v4 = XXH_rotl64(v4, 31);
425 |         v4 *= PRIME64_1;
426 |         h64 ^= v4;
427 |         h64 = h64 * PRIME64_1 + PRIME64_4;
428 |     }
429 |     else
430 |     {
431 |         h64  = seed + PRIME64_5;
432 |     }
433 | 
434 |     h64 += (U64) len;
435 | 
436 |     while (p+8<=bEnd)
437 |     {
438 |         U64 k1 = XXH_get64bits(p);
439 |         k1 *= PRIME64_2;
440 |         k1 = XXH_rotl64(k1,31);
441 |         k1 *= PRIME64_1;
442 |         h64 ^= k1;
443 |         h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
444 |         p+=8;
445 |     }
446 | 
447 |     if (p+4<=bEnd)
448 |     {
449 |         h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
450 |         h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
451 |         p+=4;
452 |     }
453 | 
454 |     while (p<bEnd)
455 |     {
456 |         h64 ^= (*p) * PRIME64_5;
457 |         h64 = XXH_rotl64(h64, 11) * PRIME64_1;
458 |         p++;
459 |     }
460 | 
461 |     h64 ^= h64 >> 33;
462 |     h64 *= PRIME64_2;
463 |     h64 ^= h64 >> 29;
464 |     h64 *= PRIME64_3;
465 |     h64 ^= h64 >> 32;
466 | 
467 |     return h64;
468 | }
469 | 
470 | 
471 | unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
472 | {
473 | #if 0
474 |     // Simple version, good for code maintenance, but unfortunately slow for small inputs
475 |     XXH64_state_t state;
476 |     XXH64_reset(&state, seed);
477 |     XXH64_update(&state, input, len);
478 |     return XXH64_digest(&state);
479 | #else
480 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
481 | 
482 | #  if !defined(XXH_USE_UNALIGNED_ACCESS)
483 |     if ((((size_t)input) & 7)==0)   // Input is aligned, let's leverage the speed advantage
484 |     {
485 |         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
486 |             return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
487 |         else
488 |             return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
489 |     }
490 | #  endif
491 | 
492 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
493 |         return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
494 |     else
495 |         return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
496 | #endif
497 | }
498 | 
499 | /****************************************************
500 |  *  Advanced Hash Functions
501 | ****************************************************/
502 | 
503 | /*** Allocation ***/
504 | typedef struct
505 | {
506 |     U64 total_len;
507 |     U32 seed;
508 |     U32 v1;
509 |     U32 v2;
510 |     U32 v3;
511 |     U32 v4;
512 |     U32 mem32[4];   /* defined as U32 for alignment */
513 |     U32 memsize;
514 | } XXH_istate32_t;
515 | 
516 | typedef struct
517 | {
518 |     U64 total_len;
519 |     U64 seed;
520 |     U64 v1;
521 |     U64 v2;
522 |     U64 v3;
523 |     U64 v4;
524 |     U64 mem64[4];   /* defined as U64 for alignment */
525 |     U32 memsize;
526 | } XXH_istate64_t;
527 | 
528 | 
529 | XXH32_state_t* XXH32_createState(void)
530 | {
531 |     XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));   // A compilation error here means XXH32_state_t is not large enough
532 |     return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
533 | }
534 | XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
535 | {
536 |     XXH_free(statePtr);
537 |     return XXH_OK;
538 | };
539 | 
540 | XXH64_state_t* XXH64_createState(void)
541 | {
542 |     XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));   // A compilation error here means XXH64_state_t is not large enough
543 |     return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
544 | }
545 | XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
546 | {
547 |     XXH_free(statePtr);
548 |     return XXH_OK;
549 | };
550 | 
551 | 
552 | /*** Hash feed ***/
553 | 
554 | XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed)
555 | {
556 |     XXH_istate32_t* state = (XXH_istate32_t*) state_in;
557 |     state->seed = seed;
558 |     state->v1 = seed + PRIME32_1 + PRIME32_2;
559 |     state->v2 = seed + PRIME32_2;
560 |     state->v3 = seed + 0;
561 |     state->v4 = seed - PRIME32_1;
562 |     state->total_len = 0;
563 |     state->memsize = 0;
564 |     return XXH_OK;
565 | }
566 | 
567 | XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
568 | {
569 |     XXH_istate64_t* state = (XXH_istate64_t*) state_in;
570 |     state->seed = seed;
571 |     state->v1 = seed + PRIME64_1 + PRIME64_2;
572 |     state->v2 = seed + PRIME64_2;
573 |     state->v3 = seed + 0;
574 |     state->v4 = seed - PRIME64_1;
575 |     state->total_len = 0;
576 |     state->memsize = 0;
577 |     return XXH_OK;
578 | }
579 | 
580 | 
581 | FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
582 | {
583 |     XXH_istate32_t* state = (XXH_istate32_t *) state_in;
584 |     const BYTE* p = (const BYTE*)input;
585 |     const BYTE* const bEnd = p + len;
586 | 
587 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
588 |     if (input==NULL) return XXH_ERROR;
589 | #endif
590 | 
591 |     state->total_len += len;
592 | 
593 |     if (state->memsize + len < 16)   // fill in tmp buffer
594 |     {
595 |         XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
596 |         state->memsize += (U32)len;
597 |         return XXH_OK;
598 |     }
599 | 
600 |     if (state->memsize)   // some data left from previous update
601 |     {
602 |         XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
603 |         {
604 |             const U32* p32 = state->mem32;
605 |             state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
606 |             state->v1 = XXH_rotl32(state->v1, 13);
607 |             state->v1 *= PRIME32_1;
608 |             p32++;
609 |             state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
610 |             state->v2 = XXH_rotl32(state->v2, 13);
611 |             state->v2 *= PRIME32_1;
612 |             p32++;
613 |             state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
614 |             state->v3 = XXH_rotl32(state->v3, 13);
615 |             state->v3 *= PRIME32_1;
616 |             p32++;
617 |             state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
618 |             state->v4 = XXH_rotl32(state->v4, 13);
619 |             state->v4 *= PRIME32_1;
620 |             p32++;
621 |         }
622 |         p += 16-state->memsize;
623 |         state->memsize = 0;
624 |     }
625 | 
626 |     if (p <= bEnd-16)
627 |     {
628 |         const BYTE* const limit = bEnd - 16;
629 |         U32 v1 = state->v1;
630 |         U32 v2 = state->v2;
631 |         U32 v3 = state->v3;
632 |         U32 v4 = state->v4;
633 | 
634 |         do
635 |         {
636 |             v1 += XXH_readLE32(p, endian) * PRIME32_2;
637 |             v1 = XXH_rotl32(v1, 13);
638 |             v1 *= PRIME32_1;
639 |             p+=4;
640 |             v2 += XXH_readLE32(p, endian) * PRIME32_2;
641 |             v2 = XXH_rotl32(v2, 13);
642 |             v2 *= PRIME32_1;
643 |             p+=4;
644 |             v3 += XXH_readLE32(p, endian) * PRIME32_2;
645 |             v3 = XXH_rotl32(v3, 13);
646 |             v3 *= PRIME32_1;
647 |             p+=4;
648 |             v4 += XXH_readLE32(p, endian) * PRIME32_2;
649 |             v4 = XXH_rotl32(v4, 13);
650 |             v4 *= PRIME32_1;
651 |             p+=4;
652 |         }
653 |         while (p<=limit);
654 | 
655 |         state->v1 = v1;
656 |         state->v2 = v2;
657 |         state->v3 = v3;
658 |         state->v4 = v4;
659 |     }
660 | 
661 |     if (p < bEnd)
662 |     {
663 |         XXH_memcpy(state->mem32, p, bEnd-p);
664 |         state->memsize = (int)(bEnd-p);
665 |     }
666 | 
667 |     return XXH_OK;
668 | }
669 | 
670 | XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
671 | {
672 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
673 | 
674 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
675 |         return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
676 |     else
677 |         return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
678 | }
679 | 
680 | 
681 | 
682 | FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
683 | {
684 |     XXH_istate32_t* state = (XXH_istate32_t*) state_in;
685 |     const BYTE * p = (const BYTE*)state->mem32;
686 |     BYTE* bEnd = (BYTE*)(state->mem32) + state->memsize;
687 |     U32 h32;
688 | 
689 |     if (state->total_len >= 16)
690 |     {
691 |         h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
692 |     }
693 |     else
694 |     {
695 |         h32  = state->seed + PRIME32_5;
696 |     }
697 | 
698 |     h32 += (U32) state->total_len;
699 | 
700 |     while (p+4<=bEnd)
701 |     {
702 |         h32 += XXH_readLE32(p, endian) * PRIME32_3;
703 |         h32  = XXH_rotl32(h32, 17) * PRIME32_4;
704 |         p+=4;
705 |     }
706 | 
707 |     while (p<bEnd)
708 |     {
709 |         h32 += (*p) * PRIME32_5;
710 |         h32 = XXH_rotl32(h32, 11) * PRIME32_1;
711 |         p++;
712 |     }
713 | 
714 |     h32 ^= h32 >> 15;
715 |     h32 *= PRIME32_2;
716 |     h32 ^= h32 >> 13;
717 |     h32 *= PRIME32_3;
718 |     h32 ^= h32 >> 16;
719 | 
720 |     return h32;
721 | }
722 | 
723 | 
724 | U32 XXH32_digest (const XXH32_state_t* state_in)
725 | {
726 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
727 | 
728 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
729 |         return XXH32_digest_endian(state_in, XXH_littleEndian);
730 |     else
731 |         return XXH32_digest_endian(state_in, XXH_bigEndian);
732 | }
733 | 
734 | 
735 | FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
736 | {
737 |     XXH_istate64_t * state = (XXH_istate64_t *) state_in;
738 |     const BYTE* p = (const BYTE*)input;
739 |     const BYTE* const bEnd = p + len;
740 | 
741 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
742 |     if (input==NULL) return XXH_ERROR;
743 | #endif
744 | 
745 |     state->total_len += len;
746 | 
747 |     if (state->memsize + len < 32)   // fill in tmp buffer
748 |     {
749 |         XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
750 |         state->memsize += (U32)len;
751 |         return XXH_OK;
752 |     }
753 | 
754 |     if (state->memsize)   // some data left from previous update
755 |     {
756 |         XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
757 |         {
758 |             const U64* p64 = state->mem64;
759 |             state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
760 |             state->v1 = XXH_rotl64(state->v1, 31);
761 |             state->v1 *= PRIME64_1;
762 |             p64++;
763 |             state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
764 |             state->v2 = XXH_rotl64(state->v2, 31);
765 |             state->v2 *= PRIME64_1;
766 |             p64++;
767 |             state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
768 |             state->v3 = XXH_rotl64(state->v3, 31);
769 |             state->v3 *= PRIME64_1;
770 |             p64++;
771 |             state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
772 |             state->v4 = XXH_rotl64(state->v4, 31);
773 |             state->v4 *= PRIME64_1;
774 |             p64++;
775 |         }
776 |         p += 32-state->memsize;
777 |         state->memsize = 0;
778 |     }
779 | 
780 |     if (p+32 <= bEnd)
781 |     {
782 |         const BYTE* const limit = bEnd - 32;
783 |         U64 v1 = state->v1;
784 |         U64 v2 = state->v2;
785 |         U64 v3 = state->v3;
786 |         U64 v4 = state->v4;
787 | 
788 |         do
789 |         {
790 |             v1 += XXH_readLE64(p, endian) * PRIME64_2;
791 |             v1 = XXH_rotl64(v1, 31);
792 |             v1 *= PRIME64_1;
793 |             p+=8;
794 |             v2 += XXH_readLE64(p, endian) * PRIME64_2;
795 |             v2 = XXH_rotl64(v2, 31);
796 |             v2 *= PRIME64_1;
797 |             p+=8;
798 |             v3 += XXH_readLE64(p, endian) * PRIME64_2;
799 |             v3 = XXH_rotl64(v3, 31);
800 |             v3 *= PRIME64_1;
801 |             p+=8;
802 |             v4 += XXH_readLE64(p, endian) * PRIME64_2;
803 |             v4 = XXH_rotl64(v4, 31);
804 |             v4 *= PRIME64_1;
805 |             p+=8;
806 |         }
807 |         while (p<=limit);
808 | 
809 |         state->v1 = v1;
810 |         state->v2 = v2;
811 |         state->v3 = v3;
812 |         state->v4 = v4;
813 |     }
814 | 
815 |     if (p < bEnd)
816 |     {
817 |         XXH_memcpy(state->mem64, p, bEnd-p);
818 |         state->memsize = (int)(bEnd-p);
819 |     }
820 | 
821 |     return XXH_OK;
822 | }
823 | 
824 | XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
825 | {
826 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
827 | 
828 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
829 |         return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
830 |     else
831 |         return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
832 | }
833 | 
834 | 
835 | 
836 | FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
837 | {
838 |     XXH_istate64_t * state = (XXH_istate64_t *) state_in;
839 |     const BYTE * p = (const BYTE*)state->mem64;
840 |     BYTE* bEnd = (BYTE*)state->mem64 + state->memsize;
841 |     U64 h64;
842 | 
843 |     if (state->total_len >= 32)
844 |     {
845 |         U64 v1 = state->v1;
846 |         U64 v2 = state->v2;
847 |         U64 v3 = state->v3;
848 |         U64 v4 = state->v4;
849 | 
850 |         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
851 | 
852 |         v1 *= PRIME64_2;
853 |         v1 = XXH_rotl64(v1, 31);
854 |         v1 *= PRIME64_1;
855 |         h64 ^= v1;
856 |         h64 = h64*PRIME64_1 + PRIME64_4;
857 | 
858 |         v2 *= PRIME64_2;
859 |         v2 = XXH_rotl64(v2, 31);
860 |         v2 *= PRIME64_1;
861 |         h64 ^= v2;
862 |         h64 = h64*PRIME64_1 + PRIME64_4;
863 | 
864 |         v3 *= PRIME64_2;
865 |         v3 = XXH_rotl64(v3, 31);
866 |         v3 *= PRIME64_1;
867 |         h64 ^= v3;
868 |         h64 = h64*PRIME64_1 + PRIME64_4;
869 | 
870 |         v4 *= PRIME64_2;
871 |         v4 = XXH_rotl64(v4, 31);
872 |         v4 *= PRIME64_1;
873 |         h64 ^= v4;
874 |         h64 = h64*PRIME64_1 + PRIME64_4;
875 |     }
876 |     else
877 |     {
878 |         h64  = state->seed + PRIME64_5;
879 |     }
880 | 
881 |     h64 += (U64) state->total_len;
882 | 
883 |     while (p+8<=bEnd)
884 |     {
885 |         U64 k1 = XXH_readLE64(p, endian);
886 |         k1 *= PRIME64_2;
887 |         k1 = XXH_rotl64(k1,31);
888 |         k1 *= PRIME64_1;
889 |         h64 ^= k1;
890 |         h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
891 |         p+=8;
892 |     }
893 | 
894 |     if (p+4<=bEnd)
895 |     {
896 |         h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
897 |         h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
898 |         p+=4;
899 |     }
900 | 
901 |     while (p<bEnd)
902 |     {
903 |         h64 ^= (*p) * PRIME64_5;
904 |         h64 = XXH_rotl64(h64, 11) * PRIME64_1;
905 |         p++;
906 |     }
907 | 
908 |     h64 ^= h64 >> 33;
909 |     h64 *= PRIME64_2;
910 |     h64 ^= h64 >> 29;
911 |     h64 *= PRIME64_3;
912 |     h64 ^= h64 >> 32;
913 | 
914 |     return h64;
915 | }
916 | 
917 | 
918 | unsigned long long XXH64_digest (const XXH64_state_t* state_in)
919 | {
920 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
921 | 
922 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
923 |         return XXH64_digest_endian(state_in, XXH_littleEndian);
924 |     else
925 |         return XXH64_digest_endian(state_in, XXH_bigEndian);
926 | }
927 | 
928 | 
929 | 


--------------------------------------------------------------------------------
/benchmark/xxhash.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |    xxHash - Extremely Fast Hash algorithm
  3 |    Header File
  4 |    Copyright (C) 2012-2014, Yann Collet.
  5 |    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
  6 | 
  7 |    Redistribution and use in source and binary forms, with or without
  8 |    modification, are permitted provided that the following conditions are
  9 |    met:
 10 | 
 11 |        * Redistributions of source code must retain the above copyright
 12 |    notice, this list of conditions and the following disclaimer.
 13 |        * Redistributions in binary form must reproduce the above
 14 |    copyright notice, this list of conditions and the following disclaimer
 15 |    in the documentation and/or other materials provided with the
 16 |    distribution.
 17 | 
 18 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 |    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 |    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 |    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 |    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 |    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 |    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 |    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 |    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 |    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 |    You can contact the author at :
 31 |    - xxHash source repository : http://code.google.com/p/xxhash/
 32 | */
 33 | 
 34 | /* Notice extracted from xxHash homepage :
 35 | 
 36 | xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
 37 | It also successfully passes all tests from the SMHasher suite.
 38 | 
 39 | Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
 40 | 
 41 | Name            Speed       Q.Score   Author
 42 | xxHash          5.4 GB/s     10
 43 | CrapWow         3.2 GB/s      2       Andrew
 44 | MumurHash 3a    2.7 GB/s     10       Austin Appleby
 45 | SpookyHash      2.0 GB/s     10       Bob Jenkins
 46 | SBox            1.4 GB/s      9       Bret Mulvey
 47 | Lookup3         1.2 GB/s      9       Bob Jenkins
 48 | SuperFastHash   1.2 GB/s      1       Paul Hsieh
 49 | CityHash64      1.05 GB/s    10       Pike & Alakuijala
 50 | FNV             0.55 GB/s     5       Fowler, Noll, Vo
 51 | CRC32           0.43 GB/s     9
 52 | MD5-32          0.33 GB/s    10       Ronald L. Rivest
 53 | SHA1-32         0.28 GB/s    10
 54 | 
 55 | Q.Score is a measure of quality of the hash function.
 56 | It depends on successfully passing SMHasher test set.
 57 | 10 is a perfect score.
 58 | */
 59 | 
 60 | #pragma once
 61 | 
 62 | #if defined (__cplusplus)
 63 | extern "C" {
 64 | #endif
 65 | 
 66 | 
 67 | /*****************************
 68 |    Includes
 69 | *****************************/
 70 | #include <stddef.h>   /* size_t */
 71 | 
 72 | 
 73 | /*****************************
 74 |    Type
 75 | *****************************/
 76 | typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 77 | 
 78 | 
 79 | 
 80 | /*****************************
 81 |    Simple Hash Functions
 82 | *****************************/
 83 | 
 84 | unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
 85 | unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
 86 | 
 87 | /*
 88 | XXH32() :
 89 |     Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
 90 |     The memory between input & input+length must be valid (allocated and read-accessible).
 91 |     "seed" can be used to alter the result predictably.
 92 |     This function successfully passes all SMHasher tests.
 93 |     Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
 94 | XXH64() :
 95 |     Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
 96 | */
 97 | 
 98 | 
 99 | 
100 | /*****************************
101 |    Advanced Hash Functions
102 | *****************************/
103 | typedef struct { long long ll[ 6]; } XXH32_state_t;
104 | typedef struct { long long ll[11]; } XXH64_state_t;
105 | 
106 | /*
107 | These structures allow static allocation of XXH states.
108 | States must then be initialized using XXHnn_reset() before first use.
109 | 
110 | If you prefer dynamic allocation, please refer to functions below.
111 | */
112 | 
113 | XXH32_state_t* XXH32_createState(void);
114 | XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
115 | 
116 | XXH64_state_t* XXH64_createState(void);
117 | XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
118 | 
119 | /*
120 | These functions create and release memory for XXH state.
121 | States must then be initialized using XXHnn_reset() before first use.
122 | */
123 | 
124 | 
125 | XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
126 | XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
127 | unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
128 | 
129 | XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
130 | XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
131 | unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
132 | 
133 | /*
134 | These functions calculate the xxHash of an input provided in multiple smaller packets,
135 | as opposed to an input provided as a single block.
136 | 
137 | XXH state space must first be allocated, using either static or dynamic method provided above.
138 | 
139 | Start a new hash by initializing state with a seed, using XXHnn_reset().
140 | 
141 | Then, feed the hash state by calling XXHnn_update() as many times as necessary.
142 | Obviously, input must be valid, meaning allocated and read accessible.
143 | The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
144 | 
145 | Finally, you can produce a hash anytime, by using XXHnn_digest().
146 | This function returns the final nn-bits hash.
147 | You can nonetheless continue feeding the hash state with more input,
148 | and therefore get some new hashes, by calling again XXHnn_digest().
149 | 
150 | When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
151 | */
152 | 
153 | 
154 | #if defined (__cplusplus)
155 | }
156 | #endif
157 | 


--------------------------------------------------------------------------------
/lf/c11.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2019 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2019 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __BITS_LF_C11_H
 56 | #define __BITS_LF_C11_H 1
 57 | 
 58 | #include <stdatomic.h>
 59 | #include <stdint.h>
 60 | 
 61 | #define LFATOMIC(x)				_Atomic(x)
 62 | #define LFATOMIC_VAR_INIT(x)	ATOMIC_VAR_INIT(x)
 63 | 
 64 | static inline void __lfaba_init(_Atomic(lfatomic_big_t) * obj,
 65 | 		lfatomic_big_t val)
 66 | {
 67 | 	atomic_init(obj, val);
 68 | }
 69 | 
 70 | static inline lfatomic_big_t __lfaba_load(_Atomic(lfatomic_big_t) * obj,
 71 | 		memory_order order)
 72 | {
 73 | #if __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 1
 74 | 	lfatomic_big_t res;
 75 | 	_Atomic(lfatomic_t) * hobj = (_Atomic(lfatomic_t) *) ((uintptr_t) obj);
 76 | 	lfatomic_t * hres = (lfatomic_t *) &res;
 77 | 
 78 | 	hres[0] = atomic_load_explicit(hobj, order);
 79 | 	hres[1] = atomic_load_explicit(hobj + 1, order);
 80 | 	return res;
 81 | #elif __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 0
 82 | 	return atomic_load_explicit(obj, order);
 83 | #endif
 84 | }
 85 | 
 86 | static inline lfatomic_big_t __lfaba_load_atomic(_Atomic(lfatomic_big_t) * obj,
 87 | 		memory_order order)
 88 | {
 89 | 	return atomic_load_explicit(obj, order);
 90 | }
 91 | 
 92 | static inline bool __lfaba_cmpxchg_weak(_Atomic(lfatomic_big_t) * obj,
 93 | 		lfatomic_big_t * expected, lfatomic_big_t desired,
 94 | 		memory_order succ, memory_order fail)
 95 | {
 96 | 	return atomic_compare_exchange_weak_explicit(obj, expected, desired,
 97 | 					succ, fail);
 98 | }
 99 | 
100 | static inline bool __lfaba_cmpxchg_strong(_Atomic(lfatomic_big_t) * obj,
101 | 		lfatomic_big_t * expected, lfatomic_big_t desired,
102 | 		memory_order succ, memory_order fail)
103 | {
104 | 	return atomic_compare_exchange_strong_explicit(obj, expected, desired,
105 | 					succ, fail);
106 | }
107 | 
108 | static inline lfatomic_big_t __lfaba_fetch_and(_Atomic(lfatomic_big_t) * obj,
109 | 		lfatomic_big_t arg, memory_order order)
110 | {
111 | 	return atomic_fetch_and_explicit(obj, arg, order);
112 | }
113 | 
114 | #endif /* !__BITS_LF_C11_H */
115 | 
116 | /* vi: set tabstop=4: */
117 | 


--------------------------------------------------------------------------------
/lf/config.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2018 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2018 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __BITS_LF_CONFIG_H
 56 | #define __BITS_LF_CONFIG_H	1
 57 | 
 58 | #include <inttypes.h>
 59 | #include <stdint.h>
 60 | 
 61 | /* For the following architectures, it is cheaper to use split (word-atomic)
 62 |    loads whenever possible. */
 63 | #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) ||	\
 64 | 	defined(__aarch64__)
 65 | # define __LFLOAD_SPLIT(dtype_width)	(dtype_width > LFATOMIC_WIDTH)
 66 | #else
 67 | # define __LFLOAD_SPLIT(dtype_width)	0
 68 | #endif
 69 | 
 70 | /* IA-64 provides a 128-bit single-compare/double-swap instruction, so
 71 |    LFCMPXCHG_SPLIT is true for 128-bit types. */
 72 | #if defined(__ia64__)
 73 | # define __LFCMPXCHG_SPLIT(dtype_width)	(dtype_width > LFATOMIC_WIDTH)
 74 | #else
 75 | # define __LFCMPXCHG_SPLIT(dtype_width)	0
 76 | #endif
 77 | 
 78 | #if defined(__x86_64__) || defined (__aarch64__) || defined(__powerpc64__)	\
 79 | 	|| (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64)
 80 | typedef int64_t lfsatomic_t;
 81 | typedef uint64_t lfatomic_t;
 82 | typedef __uint128_t lfatomic_big_t;
 83 | # define LFATOMIC_LOG2			3
 84 | # define LFATOMIC_WIDTH			64
 85 | # define LFATOMIC_BIG_WIDTH		128
 86 | #elif defined(__i386__) || defined(__arm__) || defined(__powerpc__)			\
 87 | 	|| (defined(__mips__) &&												\
 88 | 		(_MIPS_SIM == _MIPS_SIM_ABI32 || _MIPS_SIM == _MIPS_SIM_NABI32))
 89 | typedef int32_t lfsatomic_t;
 90 | typedef uint32_t lfatomic_t;
 91 | typedef uint64_t lfatomic_big_t;
 92 | # define LFATOMIC_LOG2			2
 93 | # define LFATOMIC_WIDTH			32
 94 | # define LFATOMIC_BIG_WIDTH		64
 95 | #else
 96 | typedef intptr_t lfsatomic_t;
 97 | typedef uintptr_t lfatomic_t;
 98 | typedef uintptr_t lfatomic_big_t;
 99 | # if UINTPTR_MAX == UINT32_C(0xFFFFFFFF)
100 | #  define LFATOMIC_LOG2			2
101 | #  define LFATOMIC_WIDTH		32
102 | #  define LFATOMIC_BIG_WIDTH	32
103 | # elif UINTPTR_MAX == UINT64_C(0xFFFFFFFFFFFFFFFF)
104 | #  define LFATOMIC_LOG2			3
105 | #  define LFATOMIC_WIDTH		64
106 | #  define LFATOMIC_BIG_WIDTH	64
107 | # endif
108 | #endif
109 | 
110 | /* XXX: True for x86/x86-64 but needs to be properly defined for other CPUs. */
111 | #define LF_CACHE_SHIFT		7U
112 | #define LF_CACHE_BYTES		(1U << LF_CACHE_SHIFT)
113 | 
114 | /* Allow to use LEA for x86/x86-64. */
115 | #if defined(__i386__) || defined(__x86_64__)
116 | # define __LFMERGE(x,y)	((x) + (y))
117 | #else
118 | # define __LFMERGE(x,y)	((x) | (y))
119 | #endif
120 | 
121 | #endif /* !__BITS_LF_CONFIG_H */
122 | 
123 | /* vi: set tabstop=4: */
124 | 


--------------------------------------------------------------------------------
/lf/gcc_x86.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2018 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2018 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __BITS_LF_GCC_X86_H
 56 | #define __BITS_LF_GCC_X86_H 1
 57 | 
 58 | #include <stdatomic.h>
 59 | #include <stdint.h>
 60 | 
 61 | #define LFATOMIC(x)				_Atomic(x)
 62 | #define LFATOMIC_VAR_INIT(x)	ATOMIC_VAR_INIT(x)
 63 | 
 64 | static inline void __lfbig_init(_Atomic(lfatomic_big_t) * obj,
 65 | 		lfatomic_big_t val)
 66 | {
 67 | 	*((volatile lfatomic_big_t *) ((uintptr_t) obj)) = val;
 68 | }
 69 | 
 70 | static inline lfatomic_big_t __lfbig_load(_Atomic(lfatomic_big_t) * obj,
 71 | 		memory_order order)
 72 | {
 73 | 	return *((volatile lfatomic_big_t *) ((uintptr_t) obj));
 74 | }
 75 | 
 76 | static inline bool __lfbig_cmpxchg_strong(_Atomic(lfatomic_big_t) * obj,
 77 | 		lfatomic_big_t * expected, lfatomic_big_t desired,
 78 | 		memory_order succ, memory_order fail)
 79 | {
 80 | 	lfatomic_t low = (lfatomic_t) desired;
 81 | 	lfatomic_t high = (lfatomic_t) (desired >> (sizeof(lfatomic_t) * 8));
 82 | 	bool result;
 83 | 
 84 | #if defined(__x86_64__)
 85 | # define __LFX86_CMPXCHG "cmpxchg16b"
 86 | #elif defined(__i386__)
 87 | # define __LFX86_CMPXCHG "cmpxchg8b"
 88 | #endif
 89 | 	__asm__ __volatile__ ("lock " __LFX86_CMPXCHG " %0"
 90 | 						  : "+m" (*obj), "=@ccz" (result), "+A" (*expected)
 91 | 						  : "b" (low), "c" (high)
 92 | 	);
 93 | #undef __LFX86_CMPXCHG
 94 | 
 95 | 	return result;
 96 | }
 97 | 
 98 | static inline lfatomic_big_t __lfbig_load_atomic(_Atomic(lfatomic_big_t) * obj,
 99 | 		memory_order order)
100 | {
101 | 	lfatomic_big_t value = 0;
102 | 	__lfbig_cmpxchg_strong(obj, &value, 0, order, order);
103 | 	return value;
104 | }
105 | 
106 | static inline bool __lfbig_cmpxchg_weak(_Atomic(lfatomic_big_t) * obj,
107 | 		lfatomic_big_t * expected, lfatomic_big_t desired,
108 | 		memory_order succ, memory_order fail)
109 | {
110 | 	return __lfbig_cmpxchg_strong(obj, expected, desired, succ, fail);
111 | }
112 | 
113 | static inline lfatomic_big_t __lfbig_fetch_and(_Atomic(lfatomic_big_t) * obj,
114 | 		lfatomic_big_t arg, memory_order order)
115 | {
116 | 	lfatomic_big_t new_val, old_val = __lfbig_load(obj, order);
117 | 	do {
118 | 		new_val = old_val & arg;
119 | 	} while (!__lfbig_cmpxchg_weak(obj, &old_val, new_val, order, order));
120 | 	__LF_ASSUME(new_val == (old_val & arg));
121 | 	return old_val;
122 | }
123 | 
124 | #define __lfaba_init			__lfbig_init
125 | #define __lfaba_load			__lfbig_load
126 | #define __lfaba_load_atomic		__lfbig_load_atomic
127 | #define __lfaba_cmpxchg_weak	__lfbig_cmpxchg_weak
128 | #define __lfaba_cmpxchg_strong	__lfbig_cmpxchg_strong
129 | #define __lfaba_fetch_and		__lfbig_fetch_and
130 | 
131 | #endif /* !__BITS_LF_GGC_X86_H */
132 | 
133 | /* vi: set tabstop=4: */
134 | 


--------------------------------------------------------------------------------
/lf/lf.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2019 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2019 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __BITS_LF_H
 56 | #define __BITS_LF_H	1
 57 | 
 58 | #include <inttypes.h>
 59 | #include <sys/types.h>
 60 | #include <stdbool.h>
 61 | #include <stddef.h>
 62 | #include <stdint.h>
 63 | 
 64 | #include "config.h"
 65 | 
 66 | #ifdef __GNUC__
 67 | # define __LF_ASSUME(c) do { if (!(c)) __builtin_unreachable(); } while (0)
 68 | #else
 69 | # define __LF_ASSUME(c)
 70 | #endif
 71 | 
 72 | /* GCC does not have a sane implementation of wide atomics for x86-64
 73 |    in recent versions, so use inline assembly workarounds whenever possible.
 74 |    No aarch64 support in GCC for right now. */
 75 | #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) &&	\
 76 | 	!defined(__llvm__) && defined(__GCC_ASM_FLAG_OUTPUTS__)
 77 | # include "gcc_x86.h"
 78 | #else
 79 | # include "c11.h"
 80 | #endif
 81 | 
 82 | /* ABA tagging with split (word-atomic) load/cmpxchg operation. */
 83 | #if __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 1 ||								\
 84 | 		__LFCMPXCHG_SPLIT(LFATOMIC_BIG_WIDTH) == 1
 85 | # define __LFABA_IMPL(w, type_t)											\
 86 | static const size_t __lfaba_shift##w = sizeof(lfatomic_big_t) * 4;			\
 87 | static const size_t __lfaptr_shift##w = 0;									\
 88 | static const lfatomic_big_t __lfaba_mask##w =								\
 89 | 				~(lfatomic_big_t) 0U << (sizeof(lfatomic_big_t) * 4);		\
 90 | static const lfatomic_big_t __lfaba_step##w =								\
 91 | 				(lfatomic_big_t) 1U << (sizeof(lfatomic_big_t) * 4);
 92 | #endif
 93 | 
 94 | /* ABA tagging when load/cmpxchg is not split. Note that unlike previous
 95 |    case, __lfaptr_shift is required to be 0. */
 96 | #if __LFLOAD_SPLIT(LFATOMIC_BIG_WIDTH) == 0 &&								\
 97 | 		__LFCMPXCHG_SPLIT(LFATOMIC_BIG_WIDTH) == 0
 98 | # define __LFABA_IMPL(w, type_t)											\
 99 | static const size_t __lfaba_shift##w = sizeof(type_t) * 8;					\
100 | static const size_t __lfaptr_shift##w = 0;									\
101 | static const lfatomic_big_t __lfaba_mask##w =								\
102 | 				~(lfatomic_big_t) 0U << (sizeof(type_t) * 8);				\
103 | static const lfatomic_big_t __lfaba_step##w =								\
104 | 				(lfatomic_big_t) 1U << (sizeof(type_t) * 8);
105 | #endif
106 | 
107 | /* Available on CAS2 32/64-bit architectures. */
108 | #if LFATOMIC_BIG_WIDTH >= 2 * __LFPTR_WIDTH
109 | __LFABA_IMPL(, uintptr_t)
110 | #endif
111 | 
112 | #endif	/* !__BITS_LF_H */
113 | 
114 | /* vi: set tabstop=4: */
115 | 


--------------------------------------------------------------------------------
/lfring_cas1.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2019 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2019 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __LFRING_H
 56 | #define __LFRING_H	1
 57 | 
 58 | #include <stdbool.h>
 59 | #include <inttypes.h>
 60 | #include <sys/types.h>
 61 | #include <stdatomic.h>
 62 | #include <stdio.h>
 63 | 
 64 | #include "lf/lf.h"
 65 | 
 66 | #if LFATOMIC_WIDTH == 32
 67 | # define LFRING_MIN	(LF_CACHE_SHIFT - 2)
 68 | #elif LFATOMIC_WIDTH == 64
 69 | # define LFRING_MIN	(LF_CACHE_SHIFT - 3)
 70 | #elif LFATOMIC_WIDTH == 128
 71 | # define LFRING_MIN	(LF_CACHE_SHIFT - 4)
 72 | #else
 73 | # error "Unsupported LFATOMIC_WIDTH."
 74 | #endif
 75 | 
 76 | #define LFRING_ALIGN	(_Alignof(struct __lfring))
 77 | #define LFRING_SIZE(o)	\
 78 | 	(offsetof(struct __lfring, array) + (sizeof(lfatomic_t) << ((o) + 1)))
 79 | 
 80 | #define LFRING_EMPTY	(~(size_t) 0U)
 81 | 
 82 | #define __lfring_cmp(x, op, y)	((lfsatomic_t) ((x) - (y)) op 0)
 83 | 
 84 | #if LFRING_MIN != 0
 85 | static inline size_t __lfring_raw_map(lfatomic_t idx, size_t order, size_t n)
 86 | {
 87 | 	return (size_t) (((idx & (n - 1)) >> (order - LFRING_MIN)) |
 88 | 			((idx << LFRING_MIN) & (n - 1)));
 89 | }
 90 | #else
 91 | static inline size_t __lfring_raw_map(lfatomic_t idx, size_t order, size_t n)
 92 | {
 93 | 	return (size_t) (idx & (n - 1));
 94 | }
 95 | #endif
 96 | 
 97 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n)
 98 | {
 99 | 	return __lfring_raw_map(idx, order + 1, n);
100 | }
101 | 
102 | #define __lfring_threshold3(half, n) ((long) ((half) + (n) - 1))
103 | 
104 | static inline size_t lfring_pow2(size_t order)
105 | {
106 | 	return (size_t) 1U << order;
107 | }
108 | 
109 | struct __lfring {
110 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) head;
111 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfsatomic_t) threshold;
112 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) tail;
113 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) array[1];
114 | };
115 | 
116 | struct lfring;
117 | 
118 | static inline void lfring_init_empty(struct lfring * ring, size_t order)
119 | {
120 | 	struct __lfring * q = (struct __lfring *) ring;
121 | 	size_t i, n = lfring_pow2(order + 1);
122 | 
123 | 	for (i = 0; i != n; i++)
124 | 		atomic_init(&q->array[i], (lfsatomic_t) -1);
125 | 
126 | 	atomic_init(&q->head, 0);
127 | 	atomic_init(&q->threshold, -1);
128 | 	atomic_init(&q->tail, 0);
129 | }
130 | 
131 | static inline void lfring_init_full(struct lfring * ring, size_t order)
132 | {
133 | 	struct __lfring * q = (struct __lfring *) ring;
134 | 	size_t i, half = lfring_pow2(order), n = half * 2;
135 | 
136 | 	for (i = 0; i != half; i++)
137 | 		atomic_init(&q->array[__lfring_map(i, order, n)], n + __lfring_raw_map(i, order, half));
138 | 	for (; i != n; i++)
139 | 		atomic_init(&q->array[__lfring_map(i, order, n)], (lfsatomic_t) -1);
140 | 
141 | 	atomic_init(&q->head, 0);
142 | 	atomic_init(&q->threshold, __lfring_threshold3(half, n));
143 | 	atomic_init(&q->tail, half);
144 | }
145 | 
146 | static inline void lfring_init_fill(struct lfring * ring,
147 | 		size_t s, size_t e, size_t order)
148 | {
149 | 	struct __lfring * q = (struct __lfring *) ring;
150 | 	size_t i, half = lfring_pow2(order), n = half * 2;
151 | 
152 | 	for (i = 0; i != s; i++)
153 | 		atomic_init(&q->array[__lfring_map(i, order, n)], 2 * n - 1);
154 | 	for (; i != e; i++)
155 | 		atomic_init(&q->array[__lfring_map(i, order, n)], n + i);
156 | 	for (; i != n; i++)
157 | 		atomic_init(&q->array[__lfring_map(i, order, n)], (lfsatomic_t) -1);
158 | 
159 | 	atomic_init(&q->head, s);
160 | 	atomic_init(&q->threshold, __lfring_threshold3(half, n));
161 | 	atomic_init(&q->tail, e);
162 | }
163 | 
164 | static inline bool lfring_enqueue(struct lfring * ring, size_t order,
165 | 		size_t eidx, bool nonempty)
166 | {
167 | 	struct __lfring * q = (struct __lfring *) ring;
168 | 	size_t tidx, half = lfring_pow2(order), n = half * 2;
169 | 	lfatomic_t tail, entry, ecycle, tcycle;
170 | 
171 | 	eidx ^= (n - 1);
172 | 
173 | 	while (1) {
174 | 		tail = atomic_fetch_add_explicit(&q->tail, 1, memory_order_acq_rel);
175 | 		tcycle = (tail << 1) | (2 * n - 1);
176 | 		tidx = __lfring_map(tail, order, n);
177 | 		entry = atomic_load_explicit(&q->array[tidx], memory_order_acquire);
178 | retry:
179 | 		ecycle = entry | (2 * n - 1);
180 | 		if (__lfring_cmp(ecycle, <, tcycle) && ((entry == ecycle) ||
181 | 				((entry == (ecycle ^ n)) &&
182 | 				 __lfring_cmp(atomic_load_explicit(&q->head,
183 | 				  memory_order_acquire), <=, tail)))) {
184 | 
185 | 			if (!atomic_compare_exchange_weak_explicit(&q->array[tidx],
186 | 					&entry, tcycle ^ eidx,
187 | 					memory_order_acq_rel, memory_order_acquire))
188 | 				goto retry;
189 | 
190 | 			if (!nonempty && (atomic_load(&q->threshold) != __lfring_threshold3(half, n)))
191 | 				atomic_store(&q->threshold, __lfring_threshold3(half, n));
192 | 			return true;
193 | 		}
194 | 	}
195 | }
196 | 
197 | static inline void __lfring_catchup(struct lfring * ring,
198 | 	lfatomic_t tail, lfatomic_t head)
199 | {
200 | 	struct __lfring * q = (struct __lfring *) ring;
201 | 
202 | 	while (!atomic_compare_exchange_weak_explicit(&q->tail, &tail, head,
203 | 			memory_order_acq_rel, memory_order_acquire)) {
204 | 		head = atomic_load_explicit(&q->head, memory_order_acquire);
205 | 		tail = atomic_load_explicit(&q->tail, memory_order_acquire);
206 | 		if (__lfring_cmp(tail, >=, head))
207 | 			break;
208 | 	}
209 | }
210 | 
211 | static inline size_t lfring_dequeue(struct lfring * ring, size_t order,
212 | 		bool nonempty)
213 | {
214 | 	struct __lfring * q = (struct __lfring *) ring;
215 | 	size_t hidx, n = lfring_pow2(order + 1);
216 | 	lfatomic_t head, entry, entry_new, ecycle, hcycle, tail;
217 | 	size_t attempt;
218 | 
219 | 	if (!nonempty && atomic_load(&q->threshold) < 0) {
220 | 		return LFRING_EMPTY;
221 | 	}
222 | 
223 | 	while (1) {
224 | 		head = atomic_fetch_add_explicit(&q->head, 1, memory_order_acq_rel);
225 | 		hcycle = (head << 1) | (2 * n - 1);
226 | 		hidx = __lfring_map(head, order, n);
227 | 		attempt = 0;
228 | again:
229 | 		entry = atomic_load_explicit(&q->array[hidx], memory_order_acquire);
230 | 
231 | 		do {
232 | 			ecycle = entry | (2 * n - 1);
233 | 			if (ecycle == hcycle) {
234 | 				atomic_fetch_or_explicit(&q->array[hidx], (n - 1),
235 | 						memory_order_acq_rel);
236 | 				return (size_t) (entry & (n - 1));
237 | 			}
238 | 
239 | 			if ((entry | n) != ecycle) {
240 | 				entry_new = entry & ~(lfatomic_t) n;
241 | 				if (entry == entry_new)
242 | 					break;
243 | 			} else {
244 | 				if (++attempt <= 10000)
245 | 					goto again;
246 | 				entry_new = hcycle ^ ((~entry) & n);
247 | 			}
248 | 		} while (__lfring_cmp(ecycle, <, hcycle) &&
249 | 					!atomic_compare_exchange_weak_explicit(&q->array[hidx],
250 | 					&entry, entry_new,
251 | 					memory_order_acq_rel, memory_order_acquire));
252 | 
253 | 		if (!nonempty) {
254 | 			tail = atomic_load_explicit(&q->tail, memory_order_acquire);
255 | 			if (__lfring_cmp(tail, <=, head + 1)) {
256 | 				__lfring_catchup(ring, tail, head + 1);
257 | 				atomic_fetch_sub_explicit(&q->threshold, 1,
258 | 					memory_order_acq_rel);
259 | 				return LFRING_EMPTY;
260 | 			}
261 | 
262 | 			if (atomic_fetch_sub_explicit(&q->threshold, 1,
263 | 					memory_order_acq_rel) <= 0)
264 | 				return LFRING_EMPTY;
265 | 		}
266 | 	}
267 | }
268 | 
269 | #endif	/* !__LFRING_H */
270 | 
271 | /* vi: set tabstop=4: */
272 | 


--------------------------------------------------------------------------------
/lfring_cas2.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2019 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2019 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __LFRING_H
 56 | #define __LFRING_H	1
 57 | 
 58 | #include <stdbool.h>
 59 | #include <inttypes.h>
 60 | #include <sys/types.h>
 61 | #include <stdatomic.h>
 62 | #include <stdio.h>
 63 | 
 64 | #include "lf/lf.h"
 65 | 
 66 | #if LFATOMIC_WIDTH == 32
 67 | # define LFRING_PTR_MIN	(LF_CACHE_SHIFT - 3)
 68 | #elif LFATOMIC_WIDTH == 64
 69 | # define LFRING_PTR_MIN	(LF_CACHE_SHIFT - 4)
 70 | #elif LFATOMIC_WIDTH == 128
 71 | # define LFRING_PTR_MIN	(LF_CACHE_SHIFT - 5)
 72 | #else
 73 | # error "Unsupported LFATOMIC_WIDTH."
 74 | #endif
 75 | 
 76 | #define LFRING_PTR_ALIGN	(_Alignof(struct __lfring_ptr))
 77 | #define LFRING_PTR_SIZE(o)	\
 78 | 	(offsetof(struct __lfring_ptr, array) + (sizeof(lfatomic_big_t) << ((o) + 1)))
 79 | 
 80 | #define __lfring_cmp(x, op, y)	((lfsatomic_t) ((x) - (y)) op 0)
 81 | 
 82 | #if LFRING_PTR_MIN != 0
 83 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n)
 84 | {
 85 | 	return (size_t) (((idx & (n - 1)) >> (order + 1 - LFRING_PTR_MIN)) |
 86 | 			((idx << LFRING_PTR_MIN) & (n - 1)));
 87 | }
 88 | #else
 89 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n)
 90 | {
 91 | 	return (size_t) (idx & (n - 1));
 92 | }
 93 | #endif
 94 | 
 95 | #define __lfring_threshold4(n) ((long) (2 * (n) - 1))
 96 | 
 97 | static inline size_t lfring_pow2(size_t order)
 98 | {
 99 | 	return (size_t) 1U << order;
100 | }
101 | 
102 | struct __lfring_ptr {
103 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) head;
104 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfsatomic_t) threshold;
105 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) tail;
106 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) array[1];
107 | };
108 | 
109 | struct lfring_ptr;
110 | 
111 | #if defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN)
112 | # define __lfring_array_pointer(x)	((_Atomic(lfatomic_t) *) (x))
113 | # define __lfring_array_entry(x)	((_Atomic(lfatomic_t) *) (x) + 1)
114 | #else
115 | # define __lfring_array_pointer(x)	((_Atomic(lfatomic_t) *) (x) + 1)
116 | # define __lfring_array_entry(x)	((_Atomic(lfatomic_t) *) (x))
117 | #endif
118 | 
119 | #define __lfring_entry(x)	((lfatomic_t) (((x) & __lfaba_mask) >>	\
120 | 								__lfaba_shift))
121 | #define __lfring_pointer(x)	((lfatomic_t) (((x) & ~__lfaba_mask) >>	\
122 | 								__lfaptr_shift))
123 | #define __lfring_pair(e,p)	(((lfatomic_big_t) (e) << __lfaba_shift) |	\
124 | 								((lfatomic_big_t) (p) << __lfaptr_shift))
125 | 
126 | static inline void lfring_ptr_init_empty(struct lfring_ptr * ring, size_t order)
127 | {
128 | 	struct __lfring_ptr * q = (struct __lfring_ptr *) ring;
129 | 	size_t i, n = lfring_pow2(order + 1);
130 | 
131 | 	for (i = 0; i != n; i++)
132 | 		__lfaba_init(&q->array[i], 0);
133 | 
134 | 	atomic_init(&q->head, n);
135 | 	atomic_init(&q->threshold, -1);
136 | 	atomic_init(&q->tail, n);
137 | }
138 | 
139 | static inline void lfring_ptr_init_lhead(lfatomic_t *lhead, size_t order)
140 | {
141 | 	*lhead = lfring_pow2(order + 1);
142 | }
143 | 
144 | static inline bool lfring_ptr_enqueue(struct lfring_ptr * ring, size_t order,
145 | 		void * ptr, bool nonempty, bool nonfull, lfatomic_t *lhead)
146 | {
147 | 	struct __lfring_ptr * q = (struct __lfring_ptr *) ring;
148 | 	size_t tidx, n = lfring_pow2(order + 1);
149 | 	lfatomic_t tail, entry, ecycle, tcycle;
150 | 	lfatomic_big_t pair;
151 | 
152 | 	if (!nonfull) {
153 | 		tail = atomic_load(&q->tail);
154 | 		if (tail >= *lhead + n) {
155 | 			*lhead = atomic_load(&q->head);
156 | 			if (tail >= *lhead + n)
157 | 				return false;
158 | 		}
159 | 	}
160 | 
161 | 	while (1) {
162 | 		tail = atomic_fetch_add_explicit(&q->tail, 1, memory_order_acq_rel);
163 | 		tcycle = tail & ~(lfatomic_t) (n - 1);
164 | 		tidx = __lfring_map(tail, order, n);
165 | 		pair = __lfaba_load(&q->array[tidx], memory_order_acquire);
166 | retry:
167 | 		entry = __lfring_entry(pair);
168 | 		ecycle = entry & ~(lfatomic_t) (n - 1);
169 | 		if (__lfring_cmp(ecycle, <, tcycle) && (entry == ecycle ||
170 | 				(entry == (ecycle | 0x2) && atomic_load_explicit(&q->head,
171 | 				 memory_order_acquire) <= tail))) {
172 | 
173 | 			if (!__lfaba_cmpxchg_weak(&q->array[tidx],
174 | 					&pair, __lfring_pair(tcycle | 0x1, (lfatomic_t) ptr),
175 | 					memory_order_acq_rel, memory_order_acquire))
176 | 				goto retry;
177 | 
178 | 			if (!nonempty && atomic_load(&q->threshold) != __lfring_threshold4(n))
179 | 				atomic_store(&q->threshold, __lfring_threshold4(n));
180 | 
181 | 			return true;
182 | 		}
183 | 
184 | 		if (!nonfull) {
185 | 			if (tail + 1 >= *lhead + n) {
186 | 				*lhead = atomic_load(&q->head);
187 | 				if (tail + 1 >= *lhead + n)
188 | 					return false;
189 | 			}
190 | 		}
191 | 	}
192 | }
193 | 
194 | static inline void __lfring_ptr_catchup(struct lfring_ptr * ring,
195 | 	lfatomic_t tail, lfatomic_t head)
196 | {
197 | 	struct __lfring_ptr * q = (struct __lfring_ptr *) ring;
198 | 
199 | 	while (!atomic_compare_exchange_weak_explicit(&q->tail, &tail, head,
200 | 			memory_order_acq_rel, memory_order_acquire)) {
201 | 		head = atomic_load_explicit(&q->head, memory_order_acquire);
202 | 		tail = atomic_load_explicit(&q->tail, memory_order_acquire);
203 | 		if (__lfring_cmp(tail, >=, head))
204 | 			break;
205 | 	}
206 | }
207 | 
208 | static inline bool lfring_ptr_dequeue(struct lfring_ptr * ring, size_t order,
209 | 		void ** ptr, bool nonempty)
210 | {
211 | 	struct __lfring_ptr * q = (struct __lfring_ptr *) ring;
212 | 	size_t hidx, n = lfring_pow2(order + 1);
213 | 	lfatomic_t head, entry, entry_new, ecycle, hcycle, tail;
214 | 	lfatomic_big_t pair;
215 | 
216 | 	if (!nonempty && atomic_load(&q->threshold) < 0) {
217 | 		return false;
218 | 	}
219 | 
220 | 	while (1) {
221 | 		head = atomic_fetch_add_explicit(&q->head, 1, memory_order_acq_rel);
222 | 		hcycle = head & ~(lfatomic_t) (n - 1);
223 | 		hidx = __lfring_map(head, order, n);
224 | 		entry = atomic_load_explicit(__lfring_array_entry(&q->array[hidx]),
225 | 					memory_order_acquire);
226 | 		do {
227 | 			ecycle = entry & ~(lfatomic_t) (n - 1);
228 | 			if (ecycle == hcycle) {
229 | 				pair = __lfaba_fetch_and(&q->array[hidx],
230 | 					__lfring_pair(~(lfatomic_t) 0x1, 0), memory_order_acq_rel);
231 | 				*ptr = (void *) __lfring_pointer(pair);
232 | 				return true;
233 | 			}
234 | 			if ((entry & (~(lfatomic_t) 0x2)) != ecycle) {
235 | 				entry_new = entry | 0x2;
236 | 				if (entry == entry_new)
237 | 					break;
238 | 			} else {
239 | 				entry_new = hcycle | (entry & 0x2);
240 | 			}
241 | 		} while (__lfring_cmp(ecycle, <, hcycle) &&
242 | 				!atomic_compare_exchange_weak_explicit(
243 | 				__lfring_array_entry(&q->array[hidx]),
244 | 				&entry, entry_new,
245 | 				memory_order_acq_rel, memory_order_acquire));
246 | 
247 | 		if (!nonempty) {
248 | 			tail = atomic_load_explicit(&q->tail, memory_order_acquire);
249 | 			if (__lfring_cmp(tail, <=, head + 1)) {
250 | 				__lfring_ptr_catchup(ring, tail, head + 1);
251 | 				atomic_fetch_sub_explicit(&q->threshold, 1,
252 | 					memory_order_acq_rel);
253 | 				return false;
254 | 			}
255 | 			if (atomic_fetch_sub_explicit(&q->threshold, 1,
256 | 					memory_order_acq_rel) <= 0)
257 | 				return false;
258 | 		}
259 | 	}
260 | }
261 | 
262 | #endif	/* !__LFRING_H */
263 | 
264 | /* vi: set tabstop=4: */
265 | 


--------------------------------------------------------------------------------
/lfring_naive.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2019 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2019 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __LFRING_H
 56 | #define __LFRING_H	1
 57 | 
 58 | #include <stdbool.h>
 59 | #include <stdatomic.h>
 60 | #include <inttypes.h>
 61 | #include <sys/types.h>
 62 | 
 63 | #include "lf/lf.h"
 64 | 
 65 | #if LFATOMIC_WIDTH == 32
 66 | # define LFRING_MIN	(LF_CACHE_SHIFT - 2)
 67 | #elif LFATOMIC_WIDTH == 64
 68 | # define LFRING_MIN	(LF_CACHE_SHIFT - 3)
 69 | #elif LFATOMIC_WIDTH == 128
 70 | # define LFRING_MIN	(LF_CACHE_SHIFT - 4)
 71 | #else
 72 | # error "Unsupported LFATOMIC_WIDTH."
 73 | #endif
 74 | 
 75 | #define LFRING_ALIGN	(_Alignof(struct __lfring))
 76 | #define LFRING_SIZE(o)	\
 77 | 	(offsetof(struct __lfring, array) + (sizeof(lfatomic_t) << (o)))
 78 | 
 79 | #define LFRING_EMPTY	(~(size_t) 0U)
 80 | 
 81 | #if LFRING_MIN != 0
 82 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n)
 83 | {
 84 | 	return (size_t) (((idx & (n - 1)) >> (order - LFRING_MIN)) |
 85 | 			((idx << LFRING_MIN) & (n - 1)));
 86 | }
 87 | #else
 88 | static inline size_t __lfring_map(lfatomic_t idx, size_t order, size_t n)
 89 | {
 90 | 	return (size_t) (idx & (n - 1));
 91 | }
 92 | #endif
 93 | 
 94 | static inline size_t lfring_pow2(size_t order)
 95 | {
 96 | 	return (size_t) 1U << order;
 97 | }
 98 | 
 99 | struct __lfring {
100 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) head;
101 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) tail;
102 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_t) array[1];
103 | };
104 | 
105 | struct lfring;
106 | 
107 | static inline void lfring_init_empty(struct lfring * ring, size_t order)
108 | {
109 | 	struct __lfring * q = (struct __lfring *) ring;
110 | 	size_t i, n = lfring_pow2(order);
111 | 
112 | 	for (i = 0; i != n; i++) {
113 | 		q->array[i] = 0;
114 | 	}
115 | 
116 | 	q->head = n;
117 | 	q->tail = n;
118 | }
119 | 
120 | static inline void lfring_init_full(struct lfring * ring, size_t order)
121 | {
122 | 	struct __lfring * q = (struct __lfring *) ring;
123 | 	size_t i, n = lfring_pow2(order);
124 | 
125 | 	for (i = 0; i != n; i++) {
126 | 		q->array[i] = i;
127 | 	}
128 | 
129 | 	q->head = 0;
130 | 	q->tail = n;
131 | }
132 | 
133 | static inline void lfring_init_fill(struct lfring * ring,
134 | 		size_t s, size_t e, size_t order)
135 | {
136 | 	struct __lfring * q = (struct __lfring *) ring;
137 | 	size_t i, n = lfring_pow2(order);
138 | 
139 | 	for (i = 0; i != s; i++) {
140 | 		q->array[__lfring_map(i, order, n)] = 0;
141 | 	}
142 | 	for (; i != e; i++) {
143 | 		q->array[__lfring_map(i, order, n)] = i;
144 | 	}
145 | 	for (; i != n; i++) {
146 | 		q->array[__lfring_map(i, order, n)] = (lfatomic_t) -n;
147 | 	}
148 | 	q->head = s;
149 | 	q->tail = e;
150 | }
151 | 
152 | static inline size_t lfring_enqueue(struct lfring * ring,
153 | 		size_t order, size_t eidx, bool nonempty)
154 | {
155 | 	struct __lfring * q = (struct __lfring *) ring;
156 | 	size_t n = lfring_pow2(order);
157 | 	lfatomic_t tail;
158 | 
159 | start_over:
160 | 	tail = atomic_load_explicit(&q->tail, memory_order_acquire);
161 | 
162 | 	while (1) {
163 | 		lfatomic_t tcycle = tail & ~(n - 1);
164 | 		size_t tidx = __lfring_map(tail, order, n);
165 | 		lfatomic_t entry = atomic_load_explicit(&q->array[tidx], memory_order_acquire);
166 | 
167 | 		while (1) {
168 | 			lfatomic_t ecycle = entry & ~(n - 1);
169 | 
170 | 			if (ecycle == tcycle) {
171 | 				/* Advance the tail pointer. */
172 | 				if (atomic_compare_exchange_strong_explicit(&q->tail, &tail,
173 | 						tail + 1, memory_order_acq_rel, memory_order_acquire)) {
174 | 					tail++;
175 | 				}
176 | 				break;
177 | 			}
178 | 
179 | 			/* Wrapping around. */
180 | 			if ((lfatomic_t) (ecycle + n) != tcycle) {
181 | 				goto start_over;
182 | 			}
183 | 
184 | 			/* An empty entry. */
185 | 			if (atomic_compare_exchange_strong_explicit(&q->array[tidx],
186 | 					&entry, __LFMERGE(tcycle, eidx),
187 | 					memory_order_acq_rel, memory_order_acquire)) {
188 | 				/* Try to advance the tail pointer. */
189 | 				atomic_compare_exchange_weak_explicit(&q->tail, &tail, tail + 1,
190 | 					memory_order_acq_rel, memory_order_acquire);
191 | 				return entry & (n - 1);
192 | 			}
193 | 		}
194 | 	}
195 | }
196 | 
197 | static inline size_t lfring_dequeue(struct lfring * ring, size_t order,
198 | 		bool nonempty)
199 | {
200 | 	struct __lfring * q = (struct __lfring *) ring;
201 | 	size_t n = lfring_pow2(order);
202 | 	lfatomic_t head, entry;
203 | 
204 | start_over:
205 | 	head = atomic_load_explicit(&q->head, memory_order_acquire);
206 | 
207 | 	do {
208 | 		lfatomic_t ecycle, hcycle = head & ~(n - 1);
209 | 		size_t hidx = __lfring_map(head, order, n);
210 | 		entry = atomic_load_explicit(&q->array[hidx], memory_order_acquire);
211 | 		ecycle = entry & ~(n - 1);
212 | 		if (ecycle != hcycle) {
213 | 			/* Wrapping around. */
214 | 			if (!nonempty && (lfatomic_t) (ecycle + n) == hcycle) {
215 | 				return LFRING_EMPTY;
216 | 			}
217 | 			goto start_over;
218 | 		}
219 | 	} while (!atomic_compare_exchange_weak_explicit(&q->head, &head, head + 1,
220 | 				memory_order_acq_rel, memory_order_acquire));
221 | 
222 | 	return (size_t) (entry & (n - 1));
223 | }
224 | 
225 | #endif	/* !__LFRING_H */
226 | 
227 | /* vi: set tabstop=4: */
228 | 


--------------------------------------------------------------------------------
/wfring_cas2.h:
--------------------------------------------------------------------------------
  1 | /* ----------------------------------------------------------------------------
  2 |  *
  3 |  * Dual 2-BSD/MIT license. Either or both licenses can be used.
  4 |  *
  5 |  * ----------------------------------------------------------------------------
  6 |  *
  7 |  * Copyright (c) 2021 Ruslan Nikolaev.  All Rights Reserved.
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions
 11 |  * are met:
 12 |  * 1. Redistributions of source code must retain the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer.
 14 |  * 2. Redistributions in binary form must reproduce the above copyright
 15 |  *    notice, this list of conditions and the following disclaimer in the
 16 |  *    documentation and/or other materials provided with the distribution.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 19 |  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 |  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 24 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 28 |  * SUCH DAMAGE.
 29 |  *
 30 |  * ----------------------------------------------------------------------------
 31 |  *
 32 |  * Copyright (c) 2021 Ruslan Nikolaev
 33 |  *
 34 |  * Permission is hereby granted, free of charge, to any person obtaining a
 35 |  * copy of this software and associated documentation files (the "Software"),
 36 |  * to deal in the Software without restriction, including without limitation
 37 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 38 |  * and/or sell copies of the Software, and to permit persons to whom the
 39 |  * Software is furnished to do so, subject to the following conditions:
 40 |  *
 41 |  * The above copyright notice and this permission notice shall be included in
 42 |  * all copies or substantial portions of the Software.
 43 |  *
 44 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 45 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 46 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 47 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 48 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 49 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 50 |  * IN THE SOFTWARE.
 51 |  *
 52 |  * ----------------------------------------------------------------------------
 53 |  */
 54 | 
 55 | #ifndef __WFRING_H
 56 | #define __WFRING_H	1
 57 | 
 58 | #include <stdbool.h>
 59 | #include <inttypes.h>
 60 | #include <sys/types.h>
 61 | #include <stdatomic.h>
 62 | #include <stdio.h>
 63 | 
 64 | #include "lf/lf.h"
 65 | 
 66 | #if LFATOMIC_WIDTH == 32
 67 | # define WFRING_MIN	(LF_CACHE_SHIFT - 3)
 68 | #elif LFATOMIC_WIDTH == 64
 69 | # define WFRING_MIN	(LF_CACHE_SHIFT - 4)
 70 | #elif LFATOMIC_WIDTH == 128
 71 | # define WFRING_MIN	(LF_CACHE_SHIFT - 5)
 72 | #else
 73 | # error "Unsupported LFATOMIC_WIDTH."
 74 | #endif
 75 | 
 76 | #define WFRING_PATIENCE_ENQ 16
 77 | #define WFRING_PATIENCE_DEQ 64
 78 | #define WFRING_DELAY		16
 79 | 
 80 | #define WFRING_ALIGN	(_Alignof(struct __wfring))
 81 | #define WFRING_SIZE(o)	\
 82 | 	(offsetof(struct __wfring, array) + (sizeof(lfatomic_big_t) << ((o) + 1)))
 83 | 
 84 | #define WFRING_EMPTY    (~(size_t) 0U)
 85 | 
 86 | #define __wfring_cmp(x, op, y)	((lfsatomic_t) ((x) - (y)) op 0)
 87 | 
 88 | #if WFRING_MIN != 0
 89 | static inline size_t __wfring_raw_map(lfatomic_t idx, size_t order, size_t n)
 90 | {
 91 | 	return (size_t) (((idx & (n - 1)) >> (order - WFRING_MIN)) |
 92 | 			((idx << WFRING_MIN) & (n - 1)));
 93 | }
 94 | #else
 95 | static inline size_t __wfring_raw_map(lfatomic_t idx, size_t order, size_t n)
 96 | {
 97 | 	return (size_t) (idx & (n - 1));
 98 | }
 99 | #endif
100 | 
101 | static inline size_t __wfring_map(lfatomic_t idx, size_t order, size_t n)
102 | {
103 | 	return __wfring_raw_map(idx, order + 1, n);
104 | }
105 | 
106 | #define __wfring_threshold3(half, n) ((long) ((half) + (n) - 1))
107 | 
108 | #if defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN)
109 | # define __wfring_pair_addon(x)	((_Atomic(lfatomic_t) *) (x))
110 | # define __wfring_pair_entry(x)	((_Atomic(lfatomic_t) *) (x) + 1)
111 | #else
112 | # define __wfring_pair_addon(x)	((_Atomic(lfatomic_t) *) (x) + 1)
113 | # define __wfring_pair_entry(x)	((_Atomic(lfatomic_t) *) (x))
114 | #endif
115 | 
116 | #define __wfring_entry(x)	((lfatomic_t) (((x) & __lfaba_mask) >>	\
117 | 								__lfaba_shift))
118 | #define __wfring_addon(x)	((lfatomic_t) (((x) & ~__lfaba_mask) >>	\
119 | 								__lfaptr_shift))
120 | #define __wfring_pair(e,c)	(((lfatomic_big_t) (e) << __lfaba_shift) |	\
121 | 					((lfatomic_big_t) ((lfatomic_t) (c)) << __lfaptr_shift))
122 | 
123 | #define __WFRING_FIN 0x1
124 | #define __WFRING_INC 0x2
125 | 
126 | #define __WFRING_EIDX_TERM 0
127 | #define __WFRING_EIDX_DEQ  1
128 | 
129 | static inline size_t wfring_pow2(size_t order)
130 | {
131 | 	return (size_t) 1U << order;
132 | }
133 | 
134 | struct __wfring {
135 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) head;
136 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfsatomic_t) threshold;
137 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) tail;
138 | 	__attribute__ ((aligned(LF_CACHE_BYTES))) _Atomic(lfatomic_big_t) array[1];
139 | };
140 | 
141 | struct wfring;
142 | 
143 | struct wfring_phase2 {
144 | 	_Atomic(lfatomic_t) seq1;
145 | 	_Atomic(lfatomic_t) *local;
146 | 	lfatomic_t cnt;
147 | 	_Atomic(lfatomic_t) seq2;
148 | };
149 | 
150 | struct wfring_state {
151 | 	__attribute__ ((aligned(LF_CACHE_BYTES)))
152 | 		_Atomic(struct wfring_state *) next;
153 | 	size_t nextCheck;
154 | 	struct wfring_state * currThread;
155 | 
156 | 	struct wfring_phase2 phase2;
157 | 
158 | 	_Atomic(lfatomic_t) seq1;
159 | 	_Atomic(lfatomic_t) tail;
160 | 	lfatomic_t initTail;
161 | 	_Atomic(lfatomic_t) head;
162 | 	lfatomic_t initHead;
163 | 	_Atomic(size_t) eidx;
164 | 	_Atomic(lfatomic_t) seq2;
165 | };
166 | 
167 | static inline void wfring_init_empty(struct wfring * ring, size_t order)
168 | {
169 | 	struct __wfring * q = (struct __wfring *) ring;
170 | 	size_t i, n = wfring_pow2(order + 1);
171 | 
172 | 	for (i = 0; i != n; i++)
173 | 		__lfaba_init(&q->array[i], __wfring_pair((lfsatomic_t) -1, (lfsatomic_t) (-n - 1)));
174 | 
175 | 	__lfaba_init(&q->head, 0);
176 | 	atomic_init(&q->threshold, -1);
177 | 	__lfaba_init(&q->tail, 0);
178 | }
179 | 
180 | static inline void wfring_init_full(struct wfring * ring, size_t order)
181 | {
182 | 	struct __wfring * q = (struct __wfring *) ring;
183 | 	size_t i, half = wfring_pow2(order), n = half * 2;
184 | 
185 | 	for (i = 0; i != half; i++)
186 | 		__lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair(2 * n + n + __wfring_raw_map(i, order, half), (lfsatomic_t) -1));
187 | 	for (; i != n; i++)
188 | 		__lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair((lfsatomic_t) -1, (lfsatomic_t) (-n - 1)));
189 | 
190 | 	__lfaba_init(&q->head, 0);
191 | 	atomic_init(&q->threshold, __wfring_threshold3(half, n));
192 | 	__lfaba_init(&q->tail, __wfring_pair(half << 2, 0));
193 | }
194 | 
195 | static inline void wfring_init_fill(struct wfring * ring,
196 | 		size_t s, size_t e, size_t order)
197 | {
198 | 	struct __wfring * q = (struct __wfring *) ring;
199 | 	size_t i, half = wfring_pow2(order), n = half * 2;
200 | 
201 | 	for (i = 0; i != s; i++)
202 | 		__lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair(4 * n - 1, (lfsatomic_t) -1));
203 | 	for (; i != e; i++)
204 | 		__lfaba_init(&q->array[__wfring_map(i, order, n)], __wfring_pair(2 * n + n + i, (lfsatomic_t) -1));
205 | 	for (; i != n; i++)
206 | 		__lfaba_init(&q->array[__wfring_map(i, order, n)],  __wfring_pair((lfsatomic_t) -1, (lfsatomic_t) (-n - 1)));
207 | 
208 | 	__lfaba_init(&q->head, __wfring_pair(s << 2, 0));
209 | 	atomic_init(&q->threshold, __wfring_threshold3(half, n));
210 | 	__lfaba_init(&q->tail, __wfring_pair(e << 2, 0));
211 | }
212 | 
213 | static inline void wfring_init_state(struct wfring * ring,
214 | 	struct wfring_state * state)
215 | {
216 | 	atomic_init(&state->next, NULL);
217 | 
218 | 	state->currThread = state;
219 | 	state->nextCheck = WFRING_DELAY;
220 | 
221 | 	atomic_init(&state->seq1, 1);
222 | 	atomic_init(&state->eidx, __WFRING_EIDX_TERM);
223 | 	atomic_init(&state->seq2, 0);
224 | 
225 | 	atomic_init(&state->phase2.seq1, 1);
226 | 	atomic_init(&state->phase2.seq2, 0);
227 | }
228 | 
229 | static inline lfatomic_t __wfring_load_global_help_phase2(
230 | 	_Atomic(lfatomic_big_t) * global, lfatomic_big_t gp)
231 | {
232 | 	struct wfring_phase2 * phase2;
233 | 	_Atomic(lfatomic_t) * local;
234 | 	lfatomic_t seq, cnt;
235 | 
236 | 	do {
237 | 		phase2 = (struct wfring_phase2 *) __wfring_addon(gp);
238 | 		if (phase2 == NULL) break;
239 | 		seq = atomic_load(&phase2->seq2);
240 | 		local = phase2->local;
241 | 		cnt = phase2->cnt;
242 | 		if (atomic_load(&phase2->seq1) == seq) {
243 | 			lfatomic_t cnt_inc = cnt + __WFRING_INC;
244 | 			atomic_compare_exchange_strong(local, &cnt_inc, cnt);
245 | 		}
246 | 	} while (!__lfaba_cmpxchg_strong(global, &gp,
247 | 				__wfring_pair(__wfring_entry(gp), 0),
248 | 				memory_order_acq_rel, memory_order_acquire));
249 | 	return __wfring_entry(gp);
250 | }
251 | 
252 | static inline bool __wfring_slow_inc(_Atomic(lfatomic_big_t) * global,
253 | 	_Atomic(lfatomic_t) * local, lfatomic_t * prev,
254 | 	_Atomic(lfsatomic_t) * threshold, struct wfring_phase2 * phase2)
255 | {
256 | 	lfatomic_t seq, cnt, cnt_inc;
257 | 	lfatomic_big_t gp = __lfaba_load_atomic(global, memory_order_acquire);
258 | 
259 | 	do {
260 | 		if (atomic_load(local) & __WFRING_FIN)
261 | 			return false;
262 | 		cnt = __wfring_load_global_help_phase2(global, gp);
263 | 		if (!atomic_compare_exchange_strong(local, prev, cnt + __WFRING_INC)) {
264 | 			if (*prev & __WFRING_FIN) return false;
265 | 			if (!(*prev & __WFRING_INC)) return true;
266 | 			cnt = *prev - __WFRING_INC;
267 | 		} else {
268 | 			*prev = cnt + __WFRING_INC;
269 | 		}
270 | 		seq = atomic_load(&phase2->seq1) + 1;
271 | 		atomic_store(&phase2->seq1, seq);
272 | 		phase2->local = local;
273 | 		phase2->cnt = cnt;
274 | 		atomic_store(&phase2->seq2, seq);
275 | 		gp = __wfring_pair(cnt, 0);
276 | 	} while (!__lfaba_cmpxchg_strong(global, &gp,
277 | 		__wfring_pair(cnt + 1, phase2),
278 | 		memory_order_acq_rel, memory_order_acquire));
279 | 
280 | 	if (threshold != NULL)
281 | 		atomic_fetch_sub_explicit(threshold, 1, memory_order_acq_rel);
282 | 
283 | 	cnt_inc = cnt + __WFRING_INC;
284 | 	atomic_compare_exchange_strong(local, &cnt_inc, cnt);
285 | 	gp = __wfring_pair(cnt + 1, phase2);
286 | 	__lfaba_cmpxchg_strong(global, &gp, __wfring_pair(cnt + 1, 0),
287 | 		memory_order_acq_rel, memory_order_acquire);
288 | 	*prev = cnt;
289 | 
290 | 	return true;
291 | }
292 | 
293 | static inline void __wfring_do_enqueue_slow(struct __wfring * q, size_t order,
294 | 	size_t eidx, lfatomic_t seq, lfatomic_t tail, bool nonempty,
295 | 	struct wfring_state * state)
296 | {
297 | 	size_t tidx, half = wfring_pow2(order), n = half * 2;
298 | 	lfatomic_t entry, note, ecycle, tcycle;
299 | 	lfatomic_big_t pair;
300 | 
301 | 	while (__wfring_slow_inc(&q->tail, &state->tail, &tail, NULL,
302 | 			&state->phase2)) {
303 | 		if (atomic_load(&state->seq1) != seq)
304 | 			break;
305 | 		tcycle = tail | (4 * n - 1);
306 | 		tidx = __wfring_map(tail >> 2, order, n);
307 | 		pair = __lfaba_load(&q->array[tidx], memory_order_acquire);
308 | retry:
309 | 		entry = __wfring_entry(pair);
310 | 		note = __wfring_addon(pair);
311 | 		ecycle = entry | (4 * n - 1);
312 | 		if (__wfring_cmp(ecycle, <, tcycle) && __wfring_cmp(note, <, tcycle)) {
313 | 			if ((((entry | 0x1) == ecycle) ||
314 | 				(((entry | 0x1) == (ecycle ^ n)) &&
315 | 				 __wfring_cmp(
316 | 					atomic_load_explicit(__wfring_pair_entry(&q->head),
317 | 						memory_order_acquire), <=, tail)))) {
318 | 
319 | 				if (!__lfaba_cmpxchg_weak(&q->array[tidx],
320 | 						&pair, __wfring_pair(tcycle ^ eidx ^ n, note),
321 | 						memory_order_acq_rel, memory_order_acquire))
322 | 					goto retry;
323 | 
324 | 				entry = tcycle ^ eidx;
325 | 
326 | 				if (atomic_compare_exchange_strong_explicit(&state->tail, &tail,
327 | 						tail + 0x1,
328 | 						memory_order_acq_rel, memory_order_acquire)) {
329 | 
330 | 					/* Finalize the entry. */
331 | 					atomic_compare_exchange_strong_explicit(
332 | 							__wfring_pair_entry(&q->array[tidx]),
333 | 							&entry, entry ^ n, memory_order_acq_rel,
334 | 							memory_order_acquire);
335 | 				}
336 | 
337 | 				if (!nonempty && (atomic_load(&q->threshold) != __wfring_threshold3(half, n)))
338 | 					atomic_store(&q->threshold, __wfring_threshold3(half, n));
339 | 				return;
340 | 			} else if ((entry | (2 * n + n)) == tcycle) {
341 | 				/* Already produced. */
342 | 				return;
343 | 			} else {
344 | 				/* Skip this entry. */
345 | 				if (!__lfaba_cmpxchg_weak(&q->array[tidx],
346 | 						&pair, __wfring_pair(entry, tcycle),
347 | 						memory_order_acq_rel, memory_order_acquire))
348 | 					goto retry;
349 | 			}
350 | 		}
351 | 	}
352 | }
353 | 
354 | __attribute__((noinline))  static void __wfring_enqueue_slow(
355 | 	struct __wfring * q, size_t order, size_t eidx,
356 | 	lfatomic_t tail, bool nonempty, struct wfring_state * state)
357 | {
358 | 	lfatomic_t seq = atomic_load(&state->seq1);
359 | 
360 | 	/* Initiate a helping request. */
361 | 	atomic_store(&state->tail, tail);
362 | 	state->initTail = tail;
363 | 	atomic_store(&state->eidx, eidx);
364 | 	atomic_store(&state->seq2, seq);
365 | 
366 | 	__wfring_do_enqueue_slow(q, order, eidx, seq, tail, nonempty, state);
367 | 
368 | 	/* Terminate the helping request. */
369 | 	atomic_store(&state->seq1, seq + 1);
370 | 	atomic_store(&state->eidx, __WFRING_EIDX_TERM);
371 | }
372 | 
373 | __attribute__((noinline)) static void __wfring_enqueue_help_thread(
374 | 	struct __wfring * q, size_t order, bool nonempty,
375 | 	struct wfring_state * state)
376 | {
377 | 	lfatomic_t seq = atomic_load(&state->seq2);
378 | 	size_t eidx = atomic_load(&state->eidx);
379 | 	lfatomic_t tail = state->initTail;
380 | 	if (eidx <= __WFRING_EIDX_DEQ || atomic_load(&state->seq1) != seq)
381 | 		return;
382 | 
383 | 	__wfring_do_enqueue_slow(q, order, eidx, seq, tail, nonempty, state);
384 | }
385 | 
386 | static inline void __wfring_catchup(struct __wfring * q,
387 | 	lfatomic_t tail, lfatomic_t head)
388 | {
389 | 	while (!atomic_compare_exchange_weak_explicit(__wfring_pair_entry(&q->tail),
390 | 			&tail, head, memory_order_acq_rel, memory_order_acquire)) {
391 | 		head = atomic_load(__wfring_pair_entry(&q->head));
392 | 		tail = atomic_load(__wfring_pair_entry(&q->tail));
393 | 		if (__wfring_cmp(tail, >=, head))
394 | 			break;
395 | 	}
396 | }
397 | 
398 | static inline void __wfring_lookup(struct wfring_state * state,
399 | 	lfatomic_t tail, size_t n)
400 | {
401 | 	struct wfring_state * curr = atomic_load(&state->next);
402 | 	while (curr != state) {
403 | 		if ((atomic_load(&curr->tail) & ~((lfatomic_t) 0x3)) == tail) {
404 | 			atomic_compare_exchange_strong(&curr->tail, &tail, tail ^ 0x1);
405 | 			return;
406 | 		}
407 | 		curr = atomic_load(&curr->next);
408 | 	}
409 | 	return;
410 | }
411 | 
412 | static inline void __wfring_do_dequeue_slow(struct __wfring * q, size_t order,
413 | 	lfatomic_t seq, lfatomic_t head, bool nonempty, struct wfring_state * state)
414 | {
415 | 	size_t hidx, n = wfring_pow2(order + 1);
416 | 	lfatomic_t entry, note, entry_new, ecycle, hcycle, tail;
417 | 	lfatomic_big_t pair;
418 | 	_Atomic(lfsatomic_t) * threshold = nonempty ? NULL : &q->threshold;
419 | 
420 | 	while (__wfring_slow_inc(&q->head, &state->head, &head, threshold,
421 | 			&state->phase2)) {
422 | 		hcycle = head | (4 * n - 1);
423 | 		hidx = __wfring_map(head >> 2, order, n);
424 | 		pair = __lfaba_load(&q->array[hidx], memory_order_acquire);
425 | retry:
426 | 		do {
427 | 			entry = __wfring_entry(pair);
428 | 			note = __wfring_addon(pair);
429 | 			ecycle = entry | (4 * n - 1);
430 | 			if (ecycle == hcycle && (entry & (n - 1)) != (n - 2)) {
431 | 				lfatomic_t _h = head;
432 | 				atomic_compare_exchange_strong(&state->head, &_h, head ^ 0x1);
433 | 				return;
434 | 			}
435 | 
436 | 			if ((entry | (2 * n) | 0x1) != ecycle) {
437 | 				if (__wfring_cmp(ecycle, <, hcycle) &&
438 | 						__wfring_cmp(note, <, hcycle)) {
439 | 					/* Do not enqueue in this entry. */
440 | 					if (!__lfaba_cmpxchg_weak(&q->array[hidx], &pair,
441 | 							__wfring_pair(entry, hcycle),
442 | 							memory_order_acq_rel, memory_order_acquire))
443 | 						goto retry;
444 | 				}
445 | 				entry_new = entry & ~(lfatomic_t) (2 * n);
446 | 				if (entry == entry_new)
447 | 					break;
448 | 			} else {
449 | 				entry_new = hcycle ^ ((~entry) & (2 * n)) ^ 0x1;
450 | 			}
451 | 		} while (__wfring_cmp(ecycle, <, hcycle) &&
452 | 					!__lfaba_cmpxchg_weak(&q->array[hidx], &pair,
453 | 						__wfring_pair(entry_new, note),
454 | 						memory_order_acq_rel, memory_order_acquire));
455 | 
456 | 		if (!nonempty) {
457 | 			tail = atomic_load_explicit(__wfring_pair_entry(&q->tail),
458 | 				memory_order_acquire);
459 | 			if (__wfring_cmp(tail, <=, head + 4)) {
460 | 				__wfring_catchup(q, tail, head + 4);
461 | 			}
462 | 			if (atomic_load(&q->threshold) < 0) {
463 | 				lfatomic_t _h = head;
464 | 				atomic_compare_exchange_strong(&state->head, &_h,
465 | 					head + __WFRING_FIN);
466 | 			}
467 | 		}
468 | 	}
469 | }
470 | 
471 | __attribute__((noinline))  static size_t __wfring_dequeue_slow(struct __wfring * q, size_t order,
472 | 	lfatomic_t head, bool nonempty, struct wfring_state * state)
473 | {
474 | 	size_t hidx, n = wfring_pow2(order + 1);
475 | 	lfatomic_t entry, hcycle;
476 | 	lfatomic_t seq = atomic_load(&state->seq1);
477 | 
478 | 	/* Initiate a helping request. */
479 | 	atomic_store(&state->head, head);
480 | 	state->initHead = head;
481 | 	atomic_store(&state->eidx, __WFRING_EIDX_DEQ);
482 | 	atomic_store(&state->seq2, seq);
483 | 
484 | 	__wfring_do_dequeue_slow(q, order, seq, head, nonempty, state);
485 | 
486 | 	/* Terminate the helping request. */
487 | 	atomic_store(&state->seq1, seq + 1);
488 | 	atomic_store(&state->eidx, __WFRING_EIDX_TERM);
489 | 
490 | 	/* Consume an element. */
491 | 	head = atomic_load(&state->head);
492 | 	hcycle = head | (4 * n - 1);
493 | 	hidx = __wfring_map(head >> 2, order, n);
494 | 	entry = atomic_load_explicit(__wfring_pair_entry(&q->array[hidx]), memory_order_acquire);
495 | 	if (nonempty || ((entry | (2 * n + n)) == hcycle)) {
496 | 		if (!(entry & n))
497 | 			__wfring_lookup(state, head, n);
498 | 		atomic_fetch_or_explicit(__wfring_pair_entry(&q->array[hidx]),
499 | 			(2 * n - 1), memory_order_acq_rel);
500 | 		return (size_t) (entry & (n - 1));
501 | 	}
502 | 
503 | 	return WFRING_EMPTY;
504 | }
505 | 
506 | __attribute__((noinline)) static void __wfring_dequeue_help_thread(struct __wfring * q,
507 | 	size_t order, bool nonempty, struct wfring_state * state)
508 | {
509 | 	lfatomic_t seq = atomic_load(&state->seq2);
510 | 	lfatomic_t eidx = atomic_load(&state->eidx);
511 | 	lfatomic_t head = atomic_load(&state->initHead);
512 | 	if (eidx != __WFRING_EIDX_DEQ || atomic_load(&state->seq1) != seq)
513 | 		return;
514 | 
515 | 	__wfring_do_dequeue_slow(q, order, seq, head, nonempty, state);
516 | }
517 | 
518 | __attribute__((noinline)) static void __wfring_help(struct __wfring * q, size_t order,
519 | 	bool nonempty, struct wfring_state * state)
520 | {
521 | 	struct wfring_state * curr = state->currThread;
522 | 	if (curr != state) {
523 | 		size_t eidx = atomic_load(&curr->eidx);
524 | 		if (eidx != __WFRING_EIDX_TERM) {
525 | 			if (eidx != __WFRING_EIDX_DEQ)
526 | 				__wfring_enqueue_help_thread(q, order, nonempty, curr);
527 | 			else
528 | 				__wfring_dequeue_help_thread(q, order, nonempty, curr);
529 | 		}
530 | 		curr = atomic_load(&curr->next);
531 | 	}
532 | 	state->currThread = atomic_load(&curr->next);
533 | 	state->nextCheck = WFRING_DELAY;
534 | }
535 | 
536 | static inline void wfring_enqueue(struct wfring * ring,
537 | 	size_t order, size_t eidx, bool nonempty, struct wfring_state * state)
538 | {
539 | 	struct __wfring * q = (struct __wfring *) ring;
540 | 	size_t tidx, half = wfring_pow2(order), n = half * 2;
541 | 	lfatomic_t tail, entry, ecycle, tcycle;
542 | 	size_t patience = WFRING_PATIENCE_ENQ;
543 | 
544 | 	eidx ^= (n - 1);
545 | 	if (--state->nextCheck == 0)
546 | 		__wfring_help(q, order, nonempty, state);
547 | 
548 | 	do {
549 | 		tail = atomic_fetch_add_explicit(__wfring_pair_entry(&q->tail), 4, memory_order_acq_rel);
550 | 		tcycle = tail | (4 * n - 1);
551 | 		tidx = __wfring_map(tail >> 2, order, n);
552 | 		entry = atomic_load_explicit(__wfring_pair_entry(&q->array[tidx]), memory_order_acquire);
553 | retry:
554 | 		ecycle = entry | (4 * n - 1);
555 | 		if (__wfring_cmp(ecycle, <, tcycle) && (((entry | 0x1) == ecycle) ||
556 | 				(((entry | 0x1) == (ecycle ^ (2 * n))) &&
557 | 					__wfring_cmp(atomic_load_explicit(
558 | 						__wfring_pair_entry(&q->head),
559 | 							memory_order_acquire), <=, tail)))) {
560 | 
561 | 			if (!atomic_compare_exchange_weak_explicit(
562 | 					__wfring_pair_entry(&q->array[tidx]),
563 | 					&entry, tcycle ^ eidx,
564 | 					memory_order_acq_rel, memory_order_acquire))
565 | 				goto retry;
566 | 
567 | 			if (!nonempty && (atomic_load(&q->threshold) != __wfring_threshold3(half, n)))
568 | 				atomic_store(&q->threshold, __wfring_threshold3(half, n));
569 | 			return;
570 | 		}
571 | 	} while (--patience != 0);
572 | 
573 | 	__wfring_enqueue_slow(q, order, eidx, tail, nonempty, state);
574 | }
575 | 
576 | static inline size_t wfring_dequeue(struct wfring * ring, size_t order,
577 | 		bool nonempty, struct wfring_state * state)
578 | {
579 | 	struct __wfring * q = (struct __wfring *) ring;
580 | 	size_t hidx, n = wfring_pow2(order + 1);
581 | 	lfatomic_t head, entry, entry_new, ecycle, hcycle, tail;
582 | //	size_t attempt;
583 | 	size_t patience = WFRING_PATIENCE_DEQ;
584 | 
585 | 	if (!nonempty && atomic_load(&q->threshold) < 0) {
586 | 		return WFRING_EMPTY;
587 | 	}
588 | 
589 | 	if (--state->nextCheck == 0)
590 | 		__wfring_help(q, order, nonempty, state);
591 | 
592 | 	do {
593 | 		head = atomic_fetch_add_explicit(__wfring_pair_entry(&q->head), 4, memory_order_acq_rel);
594 | 		hcycle = head | (4 * n - 1);
595 | 		hidx = __wfring_map(head >> 2, order, n);
596 | 		//attempt = 0;
597 | //again:
598 | 		entry = atomic_load_explicit(__wfring_pair_entry(&q->array[hidx]), memory_order_acquire);
599 | 
600 | 		do {
601 | 			ecycle = entry | (4 * n - 1);
602 | 			if (ecycle == hcycle) {
603 | 				/* Need to help finalizing the entry. */
604 | 				if (!(entry & n))
605 | 					__wfring_lookup(state, head, n);
606 | 				atomic_fetch_or_explicit(__wfring_pair_entry(&q->array[hidx]),
607 | 						(2 * n - 1), memory_order_acq_rel);
608 | 				return (size_t) (entry & (n - 1));
609 | 			}
610 | 
611 | 			if ((entry | (2 * n) | 0x1) != ecycle) {
612 | 				entry_new = entry & ~(lfatomic_t) (2 * n);
613 | 				if (entry == entry_new)
614 | 					break;
615 | 			} else {
616 | 	//			if (++attempt <= 10000)
617 | 	//				goto again;
618 | 				entry_new = hcycle ^ ((~entry) & (2 * n)) ^ 0x1;
619 | 			}
620 | 		} while (__wfring_cmp(ecycle, <, hcycle) &&
621 | 					!atomic_compare_exchange_weak_explicit(
622 | 					__wfring_pair_entry(&q->array[hidx]), &entry, entry_new,
623 | 					memory_order_acq_rel, memory_order_acquire));
624 | 
625 | 		if (!nonempty) {
626 | 			tail = atomic_load_explicit(__wfring_pair_entry(&q->tail),
627 | 				memory_order_acquire);
628 | 			if (__wfring_cmp(tail, <=, head + 4)) {
629 | 				__wfring_catchup(q, tail, head + 4);
630 | 				atomic_fetch_sub_explicit(&q->threshold, 1,
631 | 					memory_order_acq_rel);
632 | 				return WFRING_EMPTY;
633 | 			}
634 | 
635 | 			if (atomic_fetch_sub_explicit(&q->threshold, 1,
636 | 					memory_order_acq_rel) <= 0)
637 | 				return WFRING_EMPTY;
638 | 		}
639 | 	} while (--patience != 0);
640 | 
641 | 	return __wfring_dequeue_slow(q, order, head, nonempty, state);
642 | }
643 | 
644 | #endif	/* !__WFRING_H */
645 | 
646 | /* vi: set tabstop=4: */
647 | 


--------------------------------------------------------------------------------