├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── align.h
├── benchmark
├── benchmark.h
├── bits.h
├── ccqueue.c
├── ccqueue.h
├── ccsynch.h
├── cpumap.h
├── delay.c
├── delay.h
├── driver
├── faa.c
├── halfhalf.c
├── harness.c
├── hzdptr.c
├── hzdptr.h
├── lcrq.c
├── lcrq.h
├── msqueue.c
├── msqueue.h
├── pairwise.c
├── primitives.h
├── queue.h
├── wfqueue.c
├── wfqueue.h
├── xxhash.c
└── xxhash.h

/.gitignore:
--------------------------------------------------------------------------------
1 | # Output files
2 | ccqueue
3 | delay
4 | faa
5 | lcrq
6 | msqueue
7 | wfqueue
8 | wfqueue0
9 |
10 | # Object files
11 | *.o
12 | *.ko
13 | *.obj
14 | *.elf
15 |
16 | # Precompiled Headers
17 | *.gch
18 | *.pch
19 |
20 | # Libraries
21 | *.lib
22 | *.a
23 | *.la
24 | *.lo
25 |
26 | # Shared objects (inc. Windows DLLs)
27 | *.dll
28 | *.so
29 | *.so.*
30 | *.dylib
31 |
32 | # Executables
33 | *.exe
34 | *.out
35 | *.app
36 | *.i*86
37 | *.x86_64
38 | *.hex
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Chaoran Yang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay
2 |
3 | CC = gcc
4 | CFLAGS = -g -Wall -O3 -pthread -D_GNU_SOURCE
5 | LDLIBS = -lpthread -lm
6 |
7 | ifeq (${VERIFY}, 1)
8 | CFLAGS += -DVERIFY
9 | endif
10 |
11 | ifeq (${SANITIZE}, 1)
12 | CFLAGS += -fsanitize=address -fno-omit-frame-pointer
13 | LDLIBS += -lasan
14 | LDFLAGS = -fsanitize=address
15 | endif
16 |
17 | ifdef JEMALLOC_PATH
18 | LDFLAGS += -L${JEMALLOC_PATH}/lib -Wl,-rpath,${JEMALLOC_PATH}/lib
19 | LDLIBS += -ljemalloc
20 | endif
21 |
22 | all: $(TESTS)
23 |
24 | wfqueue0: CFLAGS += -DMAX_PATIENCE=0
25 | wfqueue0.o: wfqueue.c
26 | 	$(CC) $(CFLAGS) -c -o $@ $^
27 |
28 | haswell: CFLAGS += -DGUADALUPE_COMPACT
29 | haswell: all
30 |
31 | mic: CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
32 | mic: CFLAGS += -DGUADALUPE_MIC_COMPACT -DLOGN_OPS=6
33 | mic biou: $(filter-out lcrq,$(TESTS))
34 |
35 | biou: CFLAGS += -DBIOU_COMPACT
36 |
37 | wfqueue wfqueue0: CFLAGS += -DWFQUEUE
38 | lcrq: CFLAGS += -DLCRQ
39 | ccqueue: CFLAGS += -DCCQUEUE
40 | msqueue: CFLAGS += -DMSQUEUE
41 | faa: CFLAGS += -DFAAQ
42 | delay: CFLAGS += -DDELAY
43 |
44 | $(TESTS): harness.o
45 | ifeq (${HALFHALF}, 1)
46 | $(TESTS): halfhalf.o
47 | else
48 | $(TESTS): pairwise.o
49 | endif
50 |
51 | msqueue lcrq: hzdptr.o xxhash.o
52 |
53 | clean:
54 | 	rm -f $(TESTS) *.o
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fast Wait Free Queue
2 |
3 | This is a benchmark framework for evaluating the performance of concurrent queues. Currently, it contains four concurrent queue implementations:
4 |
5 | - A fast wait-free queue `wfqueue`,
6 | - Morrison and Afek's `lcrq`,
7 | - Fatourou and Kallimanis's `ccqueue`, and
8 | - Michael and Scott's `msqueue`.
9 |
10 | The benchmark framework also includes a synthetic queue benchmark, `faa`, which emulates both an enqueue and a dequeue with a single `fetch-and-add` primitive, to measure the performance of `fetch-and-add` on a system.
11 |
12 | The framework's default benchmark is `pairwise`, in which all threads repeatedly execute pairs of enqueue and dequeue operations. Between two operations, `pairwise` calls a delay routine that adds a random delay (roughly 50–150 ns) to avoid artificial "long run" scenarios, in which a cache line is held by one thread for a long time. A second benchmark, `halfhalf`, in which each thread performs a random 50/50 mix of enqueues and dequeues, can be selected by building with `HALFHALF=1 make`.
13 |
14 | ## Requirements
15 |
16 | - **GCC 4.1.0 or later (GCC 4.7.3 or later is recommended)**: the current implementations use GCC `__atomic` or `__sync` primitives for atomic memory access.
17 | - **Linux kernel 2.5.8 or later**
18 | - **glibc 2.3 or later**: we use `sched_setaffinity` to bind threads to cores.
19 | - **atomic `CAS2`**: `lcrq` requires `CAS2`, a 16-byte-wide `compare-and-swap` primitive. This is available on most recent Intel processors and IBM Power8.
20 | - **jemalloc** (optional): `jemalloc` eliminates the memory allocator as a bottleneck. You can link with `jemalloc` by setting the `JEMALLOC_PATH` environment variable to the path where your `jemalloc` is installed.
21 |
22 | ## How to install
23 |
24 | Download one of the released source code tarballs, then execute the following commands. The filename may differ depending on the name of the tarball you have downloaded.
25 | ```
26 | $ tar zxf fast-wait-free-queue-1.0.0.tar.gz
27 | $ cd fast-wait-free-queue-1.0.0
28 | $ make
29 | ```
30 |
31 | This should generate 7 binaries (or 6 if your system does not support `CAS2`, in which case `lcrq` will fail to compile): `wfqueue`, `wfqueue0`, `lcrq`, `ccqueue`, `msqueue`, `faa`, and `delay`. These are the `pairwise` benchmark compiled using different queue implementations.
32 | - `wfqueue0`: the same as `wfqueue` except that its `MAX_PATIENCE` is set to `0`.
33 | - `delay`: a synthetic benchmark used to measure the time spent in the delay routine.
34 |
35 | ## How to run
36 |
37 | You can execute a binary directly, using the number of threads as an argument. Without an argument, the execution will use all available cores on the system.
38 |
39 | For example,
40 | ```
41 | ./wfqueue 8
42 | ```
43 | runs `wfqueue` with 8 threads.
44 |
45 | If you would like to verify the results, compile the binaries with `VERIFY=1 make`. Executing a binary directly will then print either `PASSED` or error messages.
46 |
47 | You can also use the `driver` script, which invokes a binary up to 10 times and reports, after each run, the **running time of that run**, the **mean running time** so far, the **standard deviation**, and the **margin of error** (both absolute and relative).
48 | The script terminates when the relative **margin of error** is small enough (**< 0.02**, after at least 5 runs), or after it has invoked the binary 10 times.
49 |
50 | For example,
51 | ```
52 | ./driver ./wfqueue 8
53 | ```
54 | runs `wfqueue` with 8 threads up to 10 times and collects the statistics described above.
55 |
56 | You can use the `benchmark` script, which invokes `driver` on all combinations of a list of binaries and a list of thread counts, and reports the `mean running time` and `margin of error` for each combination. You can specify the list of binaries using the environment variable `TESTS`, and the list of thread counts using the environment variable `PROCS`.
57 |
58 | The generated output of `benchmark` can be used as a datafile for gnuplot. The first column of `benchmark`'s output is the number of threads; then each pair of columns holds the `mean running time` and `margin of error` of one queue implementation, in the same order as specified in `TESTS`.
59 |
60 | For example,
61 | ```
62 | TESTS=wfqueue:lcrq:faa:delay PROCS=1:2:4:8 ./benchmark
63 | ```
64 | runs each of `wfqueue`, `lcrq`, `faa`, and `delay` using 1, 2, 4, and 8 threads.
65 |
66 | Then you can plot them using,
67 | ```
68 | set logscale x 2
69 | plot "t" using 1:(20000/($2-$8)) t "wfqueue" w lines, \
70 |      "t" using 1:(20000/($4-$8)) t "lcrq" w lines, \
71 |      "t" using 1:(20000/($6-$8)) t "faa" w lines
72 | ```
73 |
74 | ## How to map threads to cores
75 |
76 | By default, the framework maps the thread with id `i` to the core with id `i % p`, where *p* is the number of available cores on the system; you can check each core's id in `/proc/cpuinfo`.
77 |
78 | To implement a custom mapping, you can add a `cpumap` function to `cpumap.h`. The signature of `cpumap` is
79 | ```
80 | int cpumap(int id, int nprocs)
81 | ```
82 | where `id` is the id of the current thread and `nprocs` is the number of threads. `cpumap` should return the corresponding core id for the thread. `cpumap.h` contains several examples of `cpumap` functions. You should guard the definition of an added `cpumap` with a conditional macro, and add that macro to `CFLAGS` in the Makefile.
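
For instance, the following hypothetical mapping spreads threads round-robin across two 18-core sockets; the `MY_SPREAD` guard macro and the core layout (cores 0–17 on socket 0, cores 18–35 on socket 1) are made up for illustration:
```
#ifdef MY_SPREAD
int cpumap(int id, int nprocs)
{
  /* even thread ids go to socket 0, odd ids to socket 1 */
  return (id % 2) * 18 + (id / 2) % 18;
}
#endif
```
You would then add `-DMY_SPREAD` to `CFLAGS`, following the pattern of the existing `GUADALUPE_*` and `BIOU_*` examples.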
83 |
84 | ## How to add a new queue implementation
85 |
86 | We use a generic pointer `void *` to represent a value that can be stored in the queue.
87 | A queue should implement the queue interface, defined in `queue.h`:
88 |
89 | - `queue_t`: the struct type of the queue,
90 | - `handle_t`: a thread's handle to the queue, used to store thread-local state,
91 | - `void queue_init(queue_t * q, int nprocs)`: initializes the queue; called only once,
92 | - `void queue_register(queue_t * q, handle_t * th, int id)`: initializes a thread's handle; called by every thread that uses the queue,
93 | - `void enqueue(queue_t * q, handle_t * th, void * val)`: enqueues a value,
94 | - `void * dequeue(queue_t * q, handle_t * th)`: dequeues a value,
95 | - `void queue_free(queue_t * q, handle_t * h)`: deallocates the queue and cleans up all resources associated with it,
96 | - `EMPTY`: a value that is returned when a `dequeue` fails; this should be a macro defined in the header file. A minimal example of the whole interface is sketched below.
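
As a concrete illustration, here is a minimal sketch of the interface implemented with a single global lock (so it is blocking, not wait-free); the file name `lockqueue.h`, the type names, and the lock-based design are ours, for illustration only, and are not part of the framework:
```
/* lockqueue.h -- minimal illustration of the queue interface */
#include <stdlib.h>
#include <pthread.h>

#define EMPTY ((void *) -1)

typedef struct _lnode_t {
  struct _lnode_t * next;
  void * data;
} lnode_t;

typedef struct {
  pthread_mutex_t lock;
  lnode_t * head;
  lnode_t * tail;
} queue_t;

typedef int handle_t; /* this queue keeps no thread-local state */

void queue_init(queue_t * q, int nprocs)
{
  pthread_mutex_init(&q->lock, NULL);
  q->head = q->tail = NULL;
}

void queue_register(queue_t * q, handle_t * th, int id) { *th = id; }

void enqueue(queue_t * q, handle_t * th, void * val)
{
  lnode_t * node = malloc(sizeof(lnode_t));
  node->data = val;
  node->next = NULL;

  pthread_mutex_lock(&q->lock);
  if (q->tail) q->tail->next = node;
  else q->head = node;
  q->tail = node;
  pthread_mutex_unlock(&q->lock);
}

void * dequeue(queue_t * q, handle_t * th)
{
  pthread_mutex_lock(&q->lock);
  lnode_t * node = q->head;
  void * val = node ? node->data : EMPTY;
  if (node) {
    q->head = node->next;
    if (q->head == NULL) q->tail = NULL;
  }
  pthread_mutex_unlock(&q->lock);

  free(node); /* free(NULL) is a no-op */
  return val;
}

void queue_free(queue_t * q, handle_t * h)
{
  /* the benchmarks call this once per thread; nothing to release here */
}
```
To benchmark such a queue you would also add a target for it to the Makefile, as the existing queues do.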
97 |
98 | ## How to add a new benchmark
99 |
100 | A benchmark should implement the benchmark interface, defined in `benchmark.h`, and interact with a queue using the queue interface.
101 | The benchmark interface includes:
102 |
103 | - `void init(int nprocs, int n)`: performs one-time initialization of the benchmark; called once at the beginning.
104 | - `void thread_init(int id, int nprocs)`: performs thread-local initialization of the benchmark; called once per thread, after `init` but before `benchmark`.
105 | - `void * benchmark(int id, int nprocs)`: runs the benchmark once; called by each thread, possibly several times. Each call is timed and reported as one iteration. It can return a result, which will be passed to `verify` to check correctness.
106 | - `void thread_exit(int id, int nprocs)`: performs thread-local cleanup; called once per thread after its last `benchmark` call.
107 | - `int verify(int nprocs, void ** results)`: should verify each thread's result and return `0` on success and a non-zero value on error.
108 |
--------------------------------------------------------------------------------
/align.h:
--------------------------------------------------------------------------------
1 | #ifndef ALIGN_H
2 | #define ALIGN_H
3 |
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string.h>
7 |
8 | #define PAGE_SIZE 4096
9 | #define CACHE_LINE_SIZE 64
10 | #define CACHE_ALIGNED __attribute__((aligned(CACHE_LINE_SIZE)))
11 | #define DOUBLE_CACHE_ALIGNED __attribute__((aligned(2 * CACHE_LINE_SIZE)))
12 |
13 | static inline void * align_malloc(size_t align, size_t size)
14 | {
15 |   void * ptr;
16 |
17 |   int ret = posix_memalign(&ptr, align, size);
18 |   if (ret != 0) {
19 |     fprintf(stderr, "%s\n", strerror(ret));
20 |     abort();
21 |   }
22 |
23 |   return ptr;
24 | }
25 |
26 | #endif /* end of include guard: ALIGN_H */
--------------------------------------------------------------------------------
/benchmark:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ -z "$TESTS" ]; then
4 |   TESTS=(wfqueue wfqueue0 faa lcrq ccqueue msqueue delay)
5 | else
6 |   IFS=':' read -r -a TESTS <<< "${TESTS}"
7 | fi
8 |
9 | if [ -z "$PROCS" ]; then
10 |   PROCS=(1 2 4 8)
11 | else
12 |   IFS=':' read -r -a PROCS <<< "${PROCS}"
13 | fi
14 |
15 | printf '#! Host: %s\n' $( hostname )
16 | printf '#! Benchmarks: %s\n' "${TESTS[*]}"
17 | printf '#! Threads: %s\n' "${PROCS[*]}"
18 |
19 | for j in ${PROCS[@]}; do
20 |   printf '%d' $j
21 |   for i in ${TESTS[@]}; do
22 |     echo -ne \
23 |       "$(./driver ./$i $j | tail -n 1 | awk '{printf " %.2f %.2f", $3, $5}')"
24 |   done
25 |   printf '\n'
26 | done
--------------------------------------------------------------------------------
/benchmark.h:
--------------------------------------------------------------------------------
1 | #ifndef BENCHMARK_H
2 | #define BENCHMARK_H
3 |
4 | extern void init(int nprocs, int n);
5 | extern void thread_init(int id, int nprocs);
6 | extern void * benchmark(int id, int nprocs);
7 | extern void thread_exit(int id, int nprocs);
8 | extern int verify(int nprocs, void ** results);
9 |
10 | #endif /* end of include guard: BENCHMARK_H */
--------------------------------------------------------------------------------
/bits.h:
--------------------------------------------------------------------------------
1 | #ifndef BITS_H
2 | #define BITS_H
3 |
4 | #include <stdint.h>
5 |
6 | static void * bits_join(int hi, int lo)
7 | {
8 |   intptr_t int64 = hi;
9 |   int64 <<= 32;
10 |   int64 += lo;
11 |   return (void *) int64;
12 | }
13 |
14 | static int bits_lo(void * ptr)
15 | {
16 |   intptr_t int64 = (intptr_t) ptr;
17 |   int64 &= 0x00000000ffffffff;
18 |   return (int) int64;
19 | }
20 |
21 | static int bits_hi(void * ptr)
22 | {
23 |   intptr_t int64 = (intptr_t) ptr;
24 |   int64 >>= 32;
25 |   return (int) int64;
26 | }
27 |
28 | #endif /* end of include guard: BITS_H */
--------------------------------------------------------------------------------
/ccqueue.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include "delay.h"
4 | #include "ccqueue.h"
5 |
6 | static inline
7 | void serialEnqueue(void * state, void * data)
8 | {
9 |   node_t * volatile * tail = (node_t **) state;
10 |   node_t * node = (node_t *) data;
11 |
12 |   (*tail)->next = node;
13 |   *tail = node;
14 | }
15 |
16 | static inline
17 | void serialDequeue(void * state, void * data)
18 | {
19 |   node_t * volatile * head = (node_t **) state;
20 |   node_t ** ptr = (node_t **) data;
21 |
22 |   node_t * node = *head;
23 |   node_t * next = node->next;
24 |
25 |   if (next) {
26 |     node->data = next->data;
27 |     *head = next;
28 |   } else {
29 |     node = (void *) -1;
30 |   }
31 |
32 |   *ptr = node;
33 | }
34 |
35 | void queue_init(queue_t * queue, int nprocs)
36 | {
37 |   ccsynch_init(&queue->enq);
38 |   ccsynch_init(&queue->deq);
39 |
40 |   node_t * dummy = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
41 |   dummy->data = 0;
42 |   dummy->next = NULL;
43 |
44 |   queue->head = dummy;
45 |   queue->tail = dummy;
46 | }
47 |
48 | void queue_register(queue_t * queue, handle_t * handle, int id)
49 | {
50 |   ccsynch_handle_init(&handle->enq);
51 |   ccsynch_handle_init(&handle->deq);
52 |
53 |   handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
54 | }
55 |
56 | void enqueue(queue_t * queue, handle_t * handle, void * data)
57 | {
58 |   node_t * node = handle->next;
59 |
60 |   if (node) handle->next = NULL;
61 |   else node = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
62 |
63 |   node->data = data;
64 |   node->next = NULL;
65 |
66 |   ccsynch_apply(&queue->enq, &handle->enq, &serialEnqueue, &queue->tail, node);
67 | }
68 |
69 | void * dequeue(queue_t * queue, handle_t * handle)
70 | {
71 |   node_t * node;
72 |   ccsynch_apply(&queue->deq, &handle->deq, &serialDequeue, &queue->head, &node);
73 |
74 |   void * data;
75 |
76 |   if (node == (void *) -1) {
77 |     data = (void *) -1;
78 |   } else {
79 |     data = node->data;
80 |     if (handle->next) free(node);
81 |     else handle->next = node;
82 |   }
83 |
84 |   return data;
85 | }
86 |
87 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/ccqueue.h:
--------------------------------------------------------------------------------
1 | #ifndef CCQUEUE_H
2 | #define CCQUEUE_H
3 |
4 | #ifdef CCQUEUE
5 | #include "ccsynch.h"
6 |
7 | #define EMPTY (void *) -1
8 |
9 | typedef struct _node_t {
10 |   struct _node_t * next CACHE_ALIGNED;
11 |   void * volatile data;
12 | } node_t;
13 |
14 | typedef struct _queue_t {
15 |   ccsynch_t enq DOUBLE_CACHE_ALIGNED;
16 |   ccsynch_t deq DOUBLE_CACHE_ALIGNED;
17 |   node_t * head DOUBLE_CACHE_ALIGNED;
18 |   node_t * tail DOUBLE_CACHE_ALIGNED;
19 | } queue_t DOUBLE_CACHE_ALIGNED;
20 |
21 | typedef struct _handle_t {
22 |   ccsynch_handle_t enq;
23 |   ccsynch_handle_t deq;
24 |   node_t * next;
25 | } handle_t DOUBLE_CACHE_ALIGNED;
26 |
27 | #endif
28 |
29 | #endif /* end of include guard: CCQUEUE_H */
--------------------------------------------------------------------------------
/ccsynch.h:
--------------------------------------------------------------------------------
1 | #ifndef _CCSYNCH_H_
2 | #define _CCSYNCH_H_
3 |
4 | #include <stdlib.h>
5 | #include "align.h"
6 | #include "primitives.h"
7 |
8 | typedef struct _ccsynch_node_t {
9 |   struct _ccsynch_node_t * volatile next CACHE_ALIGNED;
10 |   void * volatile data;
11 |   int volatile status CACHE_ALIGNED;
12 | } ccsynch_node_t;
13 |
14 | typedef struct _ccsynch_handle_t {
15 |   struct _ccsynch_node_t * next;
16 | } ccsynch_handle_t;
17 |
18 | typedef struct _ccsynch_t {
19 |   struct _ccsynch_node_t * volatile tail DOUBLE_CACHE_ALIGNED;
20 | } ccsynch_t;
21 |
22 | #define CCSYNCH_WAIT  0x0
23 | #define CCSYNCH_READY 0x1
24 | #define CCSYNCH_DONE  0x3
25 |
26 | static inline
27 | void ccsynch_apply(ccsynch_t * synch, ccsynch_handle_t * handle,
28 |     void (*apply)(void *, void *), void * state, void * data)
29 | {
30 |   ccsynch_node_t * next = handle->next;
31 |   next->next = NULL;
32 |   next->status = CCSYNCH_WAIT;
33 |
34 |   ccsynch_node_t * curr = SWAPra(&synch->tail, next);
35 |   handle->next = curr;
36 |
37 |   int status = ACQUIRE(&curr->status);
38 |
39 |   if (status == CCSYNCH_WAIT) {
40 |     curr->data = data;
41 |     RELEASE(&curr->next, next);
42 |
43 |     do {
44 |       PAUSE();
45 |       status = ACQUIRE(&curr->status);
46 |     } while (status == CCSYNCH_WAIT);
47 |   }
48 |
49 |   if (status != CCSYNCH_DONE) {
50 |     apply(state, data);
51 |
52 |     curr = next;
53 |     next = ACQUIRE(&curr->next);
54 |
55 |     int count = 0;
56 |     const int CCSYNCH_HELP_BOUND = 256;
57 |
58 |     while (next && count++ < CCSYNCH_HELP_BOUND) {
59 |       apply(state, curr->data);
60 |       RELEASE(&curr->status, CCSYNCH_DONE);
61 |
62 |       curr = next;
63 |       next = ACQUIRE(&curr->next);
64 |     }
65 |
66 |     RELEASE(&curr->status, CCSYNCH_READY);
67 |   }
68 | }
69 |
70 | static inline void ccsynch_init(ccsynch_t * synch)
71 | {
72 |   ccsynch_node_t * node = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t));
73 |   node->next = NULL;
74 |   node->status = CCSYNCH_READY;
75 |
76 |   synch->tail = node;
77 | }
78 |
79 | static inline void ccsynch_handle_init(ccsynch_handle_t * handle)
80 | {
81 |   handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t));
82 | }
83 |
84 | #endif
--------------------------------------------------------------------------------
/cpumap.h:
--------------------------------------------------------------------------------
1 | #ifndef CPUMAP_H
2 | #define CPUMAP_H
3 |
4 | #include <sched.h>
5 |
6 | #ifdef GUADALUPE_SPREAD
7 | int cpumap(int i, int nprocs)
8 | {
9 |   return (i / 36) * 36 + (i % 2) * 18 + (i % 36 / 2);
10 | }
11 |
12 | #elif GUADALUPE_OVERSUB
13 | int cpumap(int i, int nprocs) {
14 |   return (i % 18);
15 | }
16 |
17 | #elif GUADALUPE_COMPACT
18 | int cpumap(int i, int nprocs)
19 | {
20 |   return (i % 2) * 36 + i / 2;
21 | }
22 |
23 | #elif GUADALUPE_MIC_COMPACT
24 | int cpumap(int i, int nprocs)
25 | {
26 |   return (i + 1) % 228;
27 | }
28 |
29 | #elif LES_SPREAD
30 | int cpumap(int i, int nprocs)
31 | {
32 |   return i % 4 * 12 + i / 4 % 12;
33 | }
34 |
35 | #elif BIOU_COMPACT
36 | int cpumap(int i, int nprocs)
37 | {
38 |   return (i % 2) * 32 + i / 2;
39 | }
40 |
41 | #else
42 | int cpumap(int id, int nprocs)
43 | {
44 |   return id % nprocs;
45 | }
46 |
47 | #endif
48 |
49 | #endif /* end of include guard: CPUMAP_H */
--------------------------------------------------------------------------------
/delay.c:
--------------------------------------------------------------------------------
1 | #include "queue.h"
2 | #include "primitives.h"
3 |
4 | void queue_init(queue_t * q, int nprocs) {}
5 | void queue_register(queue_t * q, handle_t * hd, int id)
6 | {
7 |   *hd = id + 1;
8 | }
9 |
10 | void enqueue(queue_t * q, handle_t * th, void * val)
11 | {
12 | }
13 |
14 | void * dequeue(queue_t * q, handle_t * th)
15 | {
16 |   return (void *) (long) *th;
17 | }
18 |
19 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/delay.h:
--------------------------------------------------------------------------------
1 | #ifndef DELAY_H
2 | #define DELAY_H
3 |
4 | #include <stdlib.h>
5 |
6 | typedef struct drand48_data delay_t;
7 |
8 | static inline void delay_init(delay_t * state, int id)
9 | {
10 |   srand48_r(id, state);
11 | }
12 |
13 | static inline void delay_exec(delay_t * state)
14 | {
15 |   long n;
16 |   lrand48_r(state, &n);
17 |
18 |   int j;
19 |   for (j = 50; j < 50 + n % 100; ++j) {
20 |     __asm__ ("nop");
21 |   }
22 | }
23 |
24 | #endif /* end of include guard: DELAY_H */
--------------------------------------------------------------------------------
/driver:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | T90=( \
4 |   6.314 2.920 2.353 2.132 2.015 1.943 1.895 1.860 1.833 1.812 \
5 |   1.796 1.782 1.771 1.761 1.753 1.746 1.740 1.734 1.729 1.725 \
6 |   1.721 1.717 1.714 1.711 1.708 1.706 1.703 1.701 1.699 1.697 \
7 | )
8 |
9 | T95=( \
10 |   12.71 4.303 3.182 2.776 2.571 2.447 2.365 2.306 2.262 2.228 \
11 |   2.201 2.179 2.160 2.145 2.131 2.120 2.110 2.101 2.093 2.086 \
12 |   2.080 2.074 2.069 2.064 2.060 2.056 2.052 2.048 2.045 2.042 \
13 | )
14 |
15 | TIMES[0]=$($@ | grep Mean | awk '{ print $5 }')
16 | SUM=${TIMES[0]}
17 | printf '#%-2d %.2f\n' 1 ${TIMES[0]}
18 |
19 | i=1
20 | while true; do
21 |   TIME=$($@ | grep Mean | awk '{ print $5 }')
22 |   TIMES[$i]=$TIME
23 |   SUM=$(echo "$SUM + $TIME" | bc)
24 |   N=$(($i + 1))
25 |
26 |   MEAN=$(echo "$SUM / $N" | bc -l)
27 |
28 |   STD=0
29 |   for j in "${TIMES[@]}"; do
30 |     STD=$(echo "($j - $MEAN) ^ 2 + $STD" | bc -l)
31 |   done
32 |   STD=$(echo "sqrt ($STD / $i)" | bc -l)
33 |
34 |   ERR=$(echo "${T95[$i]} * $STD / sqrt($N)" | bc -l)
35 |   PRECISION=$(echo "$ERR / $MEAN" | bc -l)
36 |
37 |   printf '#%-2d %.2f %.2f %.4f %.2f %.3f\n' \
38 |     $N $TIME $MEAN $STD $ERR $PRECISION
39 |
40 |   if (($N >= 10 || $N >= 5 && $(echo "$PRECISION < 0.02" | bc) == 1)); then
41 |     break
42 |   else
43 |     i=$N
44 |   fi
45 | done
46 |
--------------------------------------------------------------------------------
/faa.c:
--------------------------------------------------------------------------------
1 | #include "queue.h"
2 | #include "primitives.h"
3 |
4 | void queue_init(queue_t * q, int nprocs) {}
5 | void queue_register(queue_t * q, handle_t * hd, int id)
6 | {
7 |   *hd = id + 1;
8 | }
9 |
10 | void enqueue(queue_t * q, handle_t * th, void * val)
11 | {
12 |   FAA(&q->P, 1);
13 | }
14 |
15 | void * dequeue(queue_t * q, handle_t * th)
16 | {
17 |   FAA(&q->C, 1);
18 |   return (void *) (long) *th;
19 | }
20 |
21 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/halfhalf.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdint.h>
4 | #include "delay.h"
5 | #include "queue.h"
6 |
7 | #ifndef LOGN_OPS
8 | #define LOGN_OPS 7
9 | #endif
10 |
11 | static long nops;
12 | static queue_t * q;
13 | static handle_t ** hds;
14 |
15 | void init(int nprocs, int logn) {
16 |   /** Use 10^7 as default input size. */
17 |   if (logn == 0) logn = LOGN_OPS;
18 |
19 |   /** Compute the number of ops to perform. */
20 |   nops = 1;
21 |   int i;
22 |   for (i = 0; i < logn; ++i) {
23 |     nops *= 10;
24 |   }
25 |
26 |   printf(" Number of operations: %ld\n", nops);
27 |
28 |   q = align_malloc(PAGE_SIZE, sizeof(queue_t));
29 |   queue_init(q, nprocs);
30 |
31 |   hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
32 | }
33 |
34 | void thread_init(int id, int nprocs) {
35 |   hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t));
36 |   queue_register(q, hds[id], id);
37 | }
38 |
39 | void thread_exit(int id, int nprocs) {
40 |   queue_free(q, hds[id]);
41 | }
42 |
43 | void * benchmark(int id, int nprocs) {
44 |   void * val = (void *) (intptr_t) (id + 1);
45 |   handle_t * th = hds[id];
46 |
47 |   delay_t state;
48 |   delay_init(&state, id);
49 |
50 |   struct drand48_data rstate;
51 |   srand48_r(id, &rstate);
52 |
53 |   int i;
54 |   for (i = 0; i < nops / nprocs; ++i) {
55 |     long n;
56 |     lrand48_r(&rstate, &n);
57 |
58 |     if (n % 2 == 0)
59 |       enqueue(q, th, val);
60 |     else
61 |       dequeue(q, th);
62 |
63 |     delay_exec(&state);
64 |   }
65 |
66 |   return val;
67 | }
68 |
69 | int verify(int nprocs, void ** results) {
70 |   return 0;
71 | }
--------------------------------------------------------------------------------
/harness.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <unistd.h>
4 | #include <pthread.h>
5 | #include <math.h>
6 | #include <limits.h>
7 | #include <sched.h>
8 | #include <sys/time.h>
9 | #include "bits.h"
10 | #include "cpumap.h"
11 | #include "benchmark.h"
12 |
13 | #ifndef NUM_ITERS
14 | #define NUM_ITERS 5
15 | #endif
16 |
17 | #ifndef MAX_PROCS
18 | #define MAX_PROCS 512
19 | #endif
20 |
21 | #ifndef MAX_ITERS
22 | #define MAX_ITERS 20
23 | #endif
24 |
25 | #ifndef COV_THRESHOLD
26 | #define COV_THRESHOLD 0.02
27 | #endif
28 |
29 | static pthread_barrier_t barrier;
30 | static double times[MAX_ITERS];
31 | static double means[MAX_ITERS];
32 | static double covs[MAX_ITERS];
33 | static volatile int target;
34 |
35 | static size_t elapsed_time(size_t us)
36 | {
37 |   struct timeval t;
38 |   gettimeofday(&t, NULL);
39 |   return t.tv_sec * 1000000 + t.tv_usec - us;
40 | }
41 |
42 | static double compute_mean(const double * times)
43 | {
44 |   int i;
45 |   double sum = 0;
46 |
47 |   for (i = 0; i < NUM_ITERS; ++i) {
48 |     sum += times[i];
49 |   }
50 |
51 |   return sum / NUM_ITERS;
52 | }
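/* Measurement protocol (see compute_cov() below and report() further down):
 * the harness runs up to MAX_ITERS timed iterations; once at least NUM_ITERS
 * have completed, each iteration computes the mean and the coefficient of
 * variation (stddev / mean) over the most recent NUM_ITERS times, and the
 * run stops early once the CoV drops below COV_THRESHOLD (0.02 by default). */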
53 |
54 | static double compute_cov(const double * times, double mean)
55 | {
56 |   double variance = 0;
57 |
58 |   int i;
59 |   for (i = 0; i < NUM_ITERS; ++i) {
60 |     variance += (times[i] - mean) * (times[i] - mean);
61 |   }
62 |
63 |   variance /= NUM_ITERS;
64 |
65 |   double cov = sqrt(variance);
66 |   cov /= mean;
67 |   return cov;
68 | }
69 |
70 | static size_t reduce_min(long val, int id, int nprocs)
71 | {
72 |   static long buffer[MAX_PROCS];
73 |
74 |   buffer[id] = val;
75 |   pthread_barrier_wait(&barrier);
76 |
77 |   long min = LONG_MAX;
78 |   int i;
79 |   for (i = 0; i < nprocs; ++i) {
80 |     if (buffer[i] < min) min = buffer[i];
81 |   }
82 |
83 |   return min;
84 | }
85 |
86 | static void report(int id, int nprocs, int i, long us)
87 | {
88 |   long ms = reduce_min(us, id, nprocs);
89 |
90 |   if (id == 0) {
91 |     times[i] = ms / 1000.0;
92 |     printf(" #%d elapsed time: %.2f ms\n", i + 1, times[i]);
93 |
94 |     if (i + 1 >= NUM_ITERS) {
95 |       int n = i + 1 - NUM_ITERS;
96 |
97 |       means[i] = compute_mean(times + n);
98 |       covs[i] = compute_cov(times + n, means[i]);
99 |
100 |       if (covs[i] < COV_THRESHOLD) {
101 |         target = i;
102 |       }
103 |     }
104 |   }
105 |
106 |   pthread_barrier_wait(&barrier);
107 | }
108 |
109 | static void * thread(void * bits)
110 | {
111 |   int id = bits_hi(bits);
112 |   int nprocs = bits_lo(bits);
113 |
114 |   cpu_set_t set;
115 |   CPU_ZERO(&set);
116 |
117 |   int cpu = cpumap(id, nprocs);
118 |   CPU_SET(cpu, &set);
119 |   sched_setaffinity(0, sizeof(set), &set);
120 |
121 |   thread_init(id, nprocs);
122 |   pthread_barrier_wait(&barrier);
123 |
124 |   int i;
125 |   void * result = NULL;
126 |
127 |   for (i = 0; i < MAX_ITERS && target == 0; ++i) {
128 |     long us = elapsed_time(0);
129 |     result = benchmark(id, nprocs);
130 |     pthread_barrier_wait(&barrier);
131 |     us = elapsed_time(us);
132 |     report(id, nprocs, i, us);
133 |   }
134 |
135 |   thread_exit(id, nprocs);
136 |   return result;
137 | }
138 |
139 | int main(int argc, const char *argv[])
140 | {
141 |   int nprocs = 0;
142 |   int n = 0;
143 |
144 |   /** The first argument is nprocs. */
145 |   if (argc > 1) {
146 |     nprocs = atoi(argv[1]);
147 |   }
148 |
149 |   /**
150 |    * Use the number of processors online as nprocs if it is not
151 |    * specified.
152 |    */
153 |   if (nprocs == 0) {
154 |     nprocs = sysconf(_SC_NPROCESSORS_ONLN);
155 |   }
156 |
157 |   if (nprocs <= 0) return 1;
158 |   else {
159 |     /** Set concurrency level. */
160 |     pthread_setconcurrency(nprocs);
161 |   }
162 |
163 |   /**
164 |    * The second argument is input size n.
165 |    */
166 |   if (argc > 2) {
167 |     n = atoi(argv[2]);
168 |   }
169 |
170 |   pthread_barrier_init(&barrier, NULL, nprocs);
171 |   printf("===========================================\n");
172 |   printf(" Benchmark: %s\n", argv[0]);
173 |   printf(" Number of processors: %d\n", nprocs);
174 |
175 |   init(nprocs, n);
176 |
177 |   pthread_t ths[nprocs];
178 |   void * res[nprocs];
179 |
180 |   int i;
181 |   for (i = 1; i < nprocs; i++) {
182 |     pthread_create(&ths[i], NULL, thread, bits_join(i, nprocs));
183 |   }
184 |
185 |   res[0] = thread(bits_join(0, nprocs));
186 |
187 |   for (i = 1; i < nprocs; i++) {
188 |     pthread_join(ths[i], &res[i]);
189 |   }
190 |
191 |   if (target == 0) {
192 |     target = NUM_ITERS - 1;
193 |     double minCov = covs[target];
194 |
195 |     /** Pick the result that has the lowest CoV. */
196 |     int i;
197 |     for (i = NUM_ITERS; i < MAX_ITERS; ++i) {
198 |       if (covs[i] < minCov) {
199 |         minCov = covs[i];
200 |         target = i;
201 |       }
202 |     }
203 |   }
204 |
205 |   double mean = means[target];
206 |   double cov = covs[target];
207 |   int i1 = target - NUM_ITERS + 2;
208 |   int i2 = target + 1;
209 |
210 |   printf(" Steady-state iterations: %d~%d\n", i1, i2);
211 |   printf(" Coefficient of variation: %.2f\n", cov);
212 |   printf(" Number of measurements: %d\n", NUM_ITERS);
213 |   printf(" Mean of elapsed time: %.2f ms\n", mean);
214 |   printf("===========================================\n");
215 |
216 |   pthread_barrier_destroy(&barrier);
217 |   return verify(nprocs, res);
218 | }
--------------------------------------------------------------------------------
/hzdptr.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | #include <string.h>
3 | #include "hzdptr.h"
4 | #include "xxhash.h"
5 |
6 | #define HZDPTR_HTBL_SIZE(nprocs, nptrs) (4 * nprocs * nptrs)
7 |
8 | typedef struct _node_t {
9 |   struct _node_t * next;
10 | } node_t;
11 |
12 | static int htable_insert(void ** tbl, size_t size, void * ptr)
13 | {
14 |   int index = XXH32(ptr, 1, 0) % size;
15 |   int i;
16 |
17 |   for (i = index; i < size; ++i) {
18 |     if (tbl[i] == NULL) {
19 |       tbl[i] = ptr;
20 |       return 0;
21 |     }
22 |   }
23 |
24 |   for (i = 0; i < index; ++i) {
25 |     if (tbl[i] == NULL) {
26 |       tbl[i] = ptr;
27 |       return 0;
28 |     }
29 |   }
30 |
31 |   return -1;
32 | }
33 |
34 | static int htable_lookup(void ** tbl, size_t size, void * ptr)
35 | {
36 |   int index = XXH32(ptr, 1, 0) % size;
37 |   int i;
38 |
39 |   for (i = index; i < size; ++i) {
40 |     if (tbl[i] == ptr) {
41 |       return 1;
42 |     } else if (tbl[i] == NULL) {
43 |       return 0;
44 |     }
45 |   }
46 |
47 |   for (i = 0; i < index; ++i) {
48 |     if (tbl[i] == ptr) {
49 |       return 1;
50 |     } else if (tbl[i] == NULL) {
51 |       return 0;
52 |     }
53 |   }
54 |
55 |   return 0;
56 | }
57 |
58 | void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs)
59 | {
60 |   hzd->nprocs = nprocs;
61 |   hzd->nptrs = nptrs;
62 |   hzd->nretired = 0;
63 |   hzd->ptrs = calloc(hzdptr_size(nprocs, nptrs), 1);
64 |
65 |   _hzdptr_enlist(hzd);
66 | }
67 |
68 | void _hzdptr_retire(hzdptr_t * hzd, void ** rlist)
69 | {
70 |   size_t size = HZDPTR_HTBL_SIZE(hzd->nprocs, hzd->nptrs);
71 |   void * plist[size];
72 |   memset(plist, 0, sizeof(plist));
73 |
74 |   hzdptr_t * me = hzd;
75 |   void * ptr;
76 |
77 |   while ((hzd = hzd->next) != me) {
78 |     int i;
79 |     for (i = 0; i < hzd->nptrs; ++i) {
80 |       ptr = hzd->ptrs[i];
81 |
82 |       if (ptr != NULL) {
83 |         htable_insert(plist, size, ptr);
84 |       }
85 |     }
86 |   }
87 |
88 |   int nretired = 0;
89 |
90 |   /** Check pointers in retire list with plist. */
91 |   int i;
92 |   for (i = 0; i < hzd->nretired; ++i) {
93 |     ptr = rlist[i];
94 |
95 |     if (htable_lookup(plist, size, ptr)) {
96 |       rlist[nretired++] = ptr;
97 |     } else {
98 |       free(ptr);
99 |     }
100 |   }
101 |
102 |   hzd->nretired = nretired;
103 | }
104 |
105 | void hzdptr_exit(hzdptr_t * hzd)
106 | {
107 |   int i;
108 |   void ** rlist = &hzd->ptrs[hzd->nptrs];
109 |
110 |   for (i = 0; i < hzd->nretired; ++i) {
111 |     free(rlist[i]);
112 |   }
113 |
114 |   hzd->nretired = 0;
115 |   hzd->next = hzd;
116 | }
--------------------------------------------------------------------------------
/hzdptr.h:
--------------------------------------------------------------------------------
1 | #ifndef HZDPTR_H
2 | #define HZDPTR_H
3 |
4 | #include "primitives.h"
5 |
6 | typedef struct _hzdptr_t {
7 |   struct _hzdptr_t * next;
8 |   int nprocs;
9 |   int nptrs;
10 |   int nretired;
11 |   void ** ptrs;
12 | } hzdptr_t;
13 |
14 | #define HZDPTR_THRESHOLD(nprocs) (2 * nprocs)
15 |
16 | extern void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs);
17 | extern void hzdptr_exit(hzdptr_t * hzd);
18 | extern void _hzdptr_retire(hzdptr_t * hzd, void ** rlist);
19 |
20 | static inline
21 | int hzdptr_size(int nprocs, int nptrs)
22 | {
23 |   return sizeof(void * [HZDPTR_THRESHOLD(nprocs) + nptrs]);
24 | }
25 |
26 | static inline
27 | void * _hzdptr_set(void volatile * ptr_, void * hzd_)
28 | {
29 |   void * volatile * ptr = (void * volatile *) ptr_;
30 |   void * volatile * hzd = (void * volatile *) hzd_;
31 |
32 |   void * val = *ptr;
33 |   *hzd = val;
34 |   return val;
35 | }
36 |
37 | static inline
38 | void * hzdptr_set(void volatile * ptr, hzdptr_t * hzd, int idx)
39 | {
40 |   return _hzdptr_set(ptr, &hzd->ptrs[idx]);
41 | }
42 |
43 | static inline
44 | void * _hzdptr_setv(void volatile * ptr_, void * hzd_)
45 | {
46 |   void * volatile * ptr = (void * volatile *) ptr_;
47 |   void * volatile * hzd = (void * volatile *) hzd_;
48 |
49 |   void * val = *ptr;
50 |   void * tmp;
51 |
52 |   do {
53 |     *hzd = val;
54 |     tmp = val;
55 |     FENCE();
56 |     val = *ptr;
57 |   } while (val != tmp);
58 |
59 |   return val;
60 | }
61 |
62 | static inline
63 | void * hzdptr_setv(void volatile * ptr, hzdptr_t * hzd, int idx)
64 | {
65 |   return _hzdptr_setv(ptr, &hzd->ptrs[idx]);
66 | }
67 |
68 | static inline
69 | void hzdptr_clear(hzdptr_t * hzd, int idx)
70 | {
71 |   RELEASE(&hzd->ptrs[idx], NULL);
72 | }
73 |
74 | static inline
75 | void hzdptr_retire(hzdptr_t * hzd, void * ptr)
76 | {
77 |   void ** rlist = &hzd->ptrs[hzd->nptrs];
78 |   rlist[hzd->nretired++] = ptr;
79 |
80 |   if (hzd->nretired == HZDPTR_THRESHOLD(hzd->nprocs)) {
81 |     _hzdptr_retire(hzd, rlist);
82 |   }
83 | }
84 |
85 | static inline
86 | void _hzdptr_enlist(hzdptr_t * hzd)
87 | {
88 |   static hzdptr_t * volatile _tail;
89 |   hzdptr_t * tail = _tail;
90 |
91 |   if (tail == NULL) {
92 |     hzd->next = hzd;
93 |     if (CASra(&_tail, &tail, hzd)) return;
94 |   }
95 |
96 |   hzdptr_t * next = tail->next;
97 |
98 |   do hzd->next = next;
99 |   while (!CASra(&tail->next, &next, hzd));
100 | }
101 |
102 | #endif /* end of include guard: HZDPTR_H */
--------------------------------------------------------------------------------
/lcrq.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdint.h>
3 | #include <stdlib.h>
4 | #include "lcrq.h"
5 | #include "align.h"
6 | #include "delay.h"
7 | #include "hzdptr.h"
8 | #include "primitives.h"
9 |
10 | #define RING_SIZE LCRQ_RING_SIZE
11 |
12 | static inline int is_empty(uint64_t v) __attribute__ ((pure));
13 | static inline
uint64_t node_index(uint64_t i) __attribute__ ((pure)); 14 | static inline uint64_t set_unsafe(uint64_t i) __attribute__ ((pure)); 15 | static inline uint64_t node_unsafe(uint64_t i) __attribute__ ((pure)); 16 | static inline uint64_t tail_index(uint64_t t) __attribute__ ((pure)); 17 | static inline int crq_is_closed(uint64_t t) __attribute__ ((pure)); 18 | 19 | static inline void init_ring(RingQueue *r) { 20 | int i; 21 | 22 | for (i = 0; i < RING_SIZE; i++) { 23 | r->array[i].val = -1; 24 | r->array[i].idx = i; 25 | } 26 | 27 | r->head = r->tail = 0; 28 | r->next = NULL; 29 | } 30 | 31 | inline int is_empty(uint64_t v) { 32 | return (v == (uint64_t)-1); 33 | } 34 | 35 | 36 | inline uint64_t node_index(uint64_t i) { 37 | return (i & ~(1ull << 63)); 38 | } 39 | 40 | 41 | inline uint64_t set_unsafe(uint64_t i) { 42 | return (i | (1ull << 63)); 43 | } 44 | 45 | 46 | inline uint64_t node_unsafe(uint64_t i) { 47 | return (i & (1ull << 63)); 48 | } 49 | 50 | 51 | inline uint64_t tail_index(uint64_t t) { 52 | return (t & ~(1ull << 63)); 53 | } 54 | 55 | 56 | inline int crq_is_closed(uint64_t t) { 57 | return (t & (1ull << 63)) != 0; 58 | } 59 | 60 | void queue_init(queue_t * q, int nprocs) 61 | { 62 | RingQueue *rq = align_malloc(PAGE_SIZE, sizeof(RingQueue)); 63 | init_ring(rq); 64 | 65 | q->head = rq; 66 | q->tail = rq; 67 | q->nprocs = nprocs; 68 | } 69 | 70 | static inline void fixState(RingQueue *rq) { 71 | 72 | while (1) { 73 | uint64_t t = rq->tail; 74 | uint64_t h = rq->head; 75 | 76 | if (rq->tail != t) 77 | continue; 78 | 79 | if (h > t) { 80 | if (CAS(&rq->tail, &t, h)) break; 81 | continue; 82 | } 83 | break; 84 | } 85 | } 86 | 87 | static inline int close_crq(RingQueue *rq, const uint64_t t, const int tries) { 88 | uint64_t tt = t + 1; 89 | 90 | if (tries < 10) 91 | return CAS(&rq->tail, &tt, tt|(1ull<<63)); 92 | else 93 | return BTAS(&rq->tail, 63); 94 | } 95 | 96 | static void lcrq_put(queue_t * q, handle_t * handle, uint64_t arg) { 97 | int try_close = 0; 98 | 99 | while (1) { 100 | RingQueue *rq = hzdptr_setv(&q->tail, &handle->hzdptr, 0); 101 | RingQueue *next = rq->next; 102 | 103 | if (next != NULL) { 104 | CAS(&q->tail, &rq, next); 105 | continue; 106 | } 107 | 108 | uint64_t t = FAA(&rq->tail, 1); 109 | 110 | if (crq_is_closed(t)) { 111 | RingQueue * nrq; 112 | alloc: 113 | nrq = handle->next; 114 | 115 | if (nrq == NULL) { 116 | nrq = align_malloc(PAGE_SIZE, sizeof(RingQueue)); 117 | init_ring(nrq); 118 | } 119 | 120 | // Solo enqueue 121 | nrq->tail = 1; 122 | nrq->array[0].val = (uint64_t) arg; 123 | nrq->array[0].idx = 0; 124 | 125 | if (CAS(&rq->next, &next, nrq)) { 126 | CAS(&q->tail, &rq, nrq); 127 | handle->next = NULL; 128 | return; 129 | } 130 | continue; 131 | } 132 | 133 | RingNode* cell = &rq->array[t & (RING_SIZE-1)]; 134 | 135 | uint64_t idx = cell->idx; 136 | uint64_t val = cell->val; 137 | 138 | if (is_empty(val)) { 139 | if (node_index(idx) <= t) { 140 | if ((!node_unsafe(idx) || rq->head < t) && 141 | CAS2(cell, &val, &idx, arg, t)) { 142 | return; 143 | } 144 | } 145 | } 146 | 147 | uint64_t h = rq->head; 148 | 149 | if ((int64_t)(t - h) >= (int64_t)RING_SIZE && 150 | close_crq(rq, t, ++try_close)) { 151 | goto alloc; 152 | } 153 | } 154 | 155 | hzdptr_clear(&handle->hzdptr, 0); 156 | } 157 | 158 | static uint64_t lcrq_get(queue_t * q, handle_t * handle) { 159 | while (1) { 160 | RingQueue *rq = hzdptr_setv(&q->head, &handle->hzdptr, 0); 161 | RingQueue *next; 162 | 163 | uint64_t h = FAA(&rq->head, 1); 164 | 165 | RingNode* cell = &rq->array[h & 
(RING_SIZE-1)]; 166 | 167 | uint64_t tt = 0; 168 | int r = 0; 169 | 170 | while (1) { 171 | 172 | uint64_t cell_idx = cell->idx; 173 | uint64_t unsafe = node_unsafe(cell_idx); 174 | uint64_t idx = node_index(cell_idx); 175 | uint64_t val = cell->val; 176 | 177 | if (idx > h) break; 178 | 179 | if (!is_empty(val)) { 180 | if (idx == h) { 181 | if (CAS2(cell, &val, &cell_idx, -1, (unsafe | h) + RING_SIZE)) 182 | return val; 183 | } else { 184 | if (CAS2(cell, &val, &cell_idx, val, set_unsafe(idx))) { 185 | break; 186 | } 187 | } 188 | } else { 189 | if ((r & ((1ull << 10) - 1)) == 0) 190 | tt = rq->tail; 191 | 192 | // Optimization: try to bail quickly if queue is closed. 193 | int crq_closed = crq_is_closed(tt); 194 | uint64_t t = tail_index(tt); 195 | 196 | if (unsafe) { // Nothing to do, move along 197 | if (CAS2(cell, &val, &cell_idx, val, (unsafe | h) + RING_SIZE)) 198 | break; 199 | } else if (t < h + 1 || r > 200000 || crq_closed) { 200 | if (CAS2(cell, &val, &idx, val, h + RING_SIZE)) { 201 | if (r > 200000 && tt > RING_SIZE) 202 | BTAS(&rq->tail, 63); 203 | break; 204 | } 205 | } else { 206 | ++r; 207 | } 208 | } 209 | } 210 | 211 | if (tail_index(rq->tail) <= h + 1) { 212 | fixState(rq); 213 | // try to return empty 214 | next = rq->next; 215 | if (next == NULL) 216 | return -1; // EMPTY 217 | if (tail_index(rq->tail) <= h + 1) { 218 | if (CAS(&q->head, &rq, next)) { 219 | hzdptr_retire(&handle->hzdptr, rq); 220 | } 221 | } 222 | } 223 | } 224 | 225 | hzdptr_clear(&handle->hzdptr, 0); 226 | } 227 | 228 | void queue_register(queue_t * q, handle_t * th, int id) 229 | { 230 | hzdptr_init(&th->hzdptr, q->nprocs, 1); 231 | } 232 | 233 | void enqueue(queue_t * q, handle_t * th, void * val) 234 | { 235 | lcrq_put(q, th, (uint64_t) val); 236 | } 237 | 238 | void * dequeue(queue_t * q, handle_t * th) 239 | { 240 | return (void *) lcrq_get(q, th); 241 | } 242 | //By K 243 | void handle_free(handle_t *h){ 244 | hzdptr_t *hzd = &h->hzdptr; 245 | void **rlist = &hzd->ptrs[hzd->nptrs]; 246 | for(int i = 0;i < hzd->nretired; i++){ 247 | free(rlist[i]); 248 | } 249 | free(h->hzdptr.ptrs); 250 | } 251 | void queue_free(queue_t * q, handle_t * h){ 252 | RingQueue *rq = q->head; 253 | while(rq){ 254 | RingQueue *n = rq->next; 255 | free(rq); 256 | rq = n; 257 | }; 258 | } 259 | -------------------------------------------------------------------------------- /lcrq.h: -------------------------------------------------------------------------------- 1 | #ifndef LCRQ_H 2 | #define LCRQ_H 3 | 4 | #ifdef LCRQ 5 | 6 | #include "align.h" 7 | #include "hzdptr.h" 8 | 9 | #define EMPTY ((void *) -1) 10 | 11 | #ifndef LCRQ_RING_SIZE 12 | #define LCRQ_RING_SIZE (1ull << 12) 13 | #endif 14 | 15 | typedef struct RingNode { 16 | volatile uint64_t val; 17 | volatile uint64_t idx; 18 | uint64_t pad[14]; 19 | } RingNode DOUBLE_CACHE_ALIGNED; 20 | 21 | typedef struct RingQueue { 22 | volatile int64_t head DOUBLE_CACHE_ALIGNED; 23 | volatile int64_t tail DOUBLE_CACHE_ALIGNED; 24 | struct RingQueue *next DOUBLE_CACHE_ALIGNED; 25 | RingNode array[LCRQ_RING_SIZE]; 26 | } RingQueue DOUBLE_CACHE_ALIGNED; 27 | 28 | typedef struct { 29 | RingQueue * volatile head DOUBLE_CACHE_ALIGNED; 30 | RingQueue * volatile tail DOUBLE_CACHE_ALIGNED; 31 | int nprocs; 32 | } queue_t; 33 | 34 | typedef struct { 35 | RingQueue * next; 36 | hzdptr_t hzdptr; 37 | } handle_t; 38 | 39 | #endif 40 | 41 | #endif /* end of include guard: LCRQ_H */ 42 | -------------------------------------------------------------------------------- /msqueue.c: 
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | #include "delay.h"
3 | #include "msqueue.h"
4 | #include "primitives.h"
5 |
6 | void queue_init(queue_t * q, int nprocs)
7 | {
8 |   node_t * node = malloc(sizeof(node_t));
9 |   node->next = NULL;
10 |
11 |   q->head = node;
12 |   q->tail = node;
13 |   q->nprocs = nprocs;
14 | }
15 |
16 | void queue_register(queue_t * q, handle_t * th, int id)
17 | {
18 |   hzdptr_init(&th->hzd, q->nprocs, 2);
19 | }
20 |
21 | void enqueue(queue_t * q, handle_t * handle, void * data)
22 | {
23 |   node_t * node = malloc(sizeof(node_t));
24 |
25 |   node->data = data;
26 |   node->next = NULL;
27 |
28 |   node_t * tail;
29 |   node_t * next;
30 |
31 |   while (1) {
32 |     tail = hzdptr_setv(&q->tail, &handle->hzd, 0);
33 |     next = tail->next;
34 |
35 |     if (tail != q->tail) {
36 |       continue;
37 |     }
38 |
39 |     if (next != NULL) {
40 |       CAS(&q->tail, &tail, next);
41 |       continue;
42 |     }
43 |
44 |     if (CAS(&tail->next, &next, node)) break;
45 |   }
46 |
47 |   CAS(&q->tail, &tail, node);
48 | }
49 |
50 | void * dequeue(queue_t * q, handle_t * handle)
51 | {
52 |   void * data;
53 |
54 |   node_t * head;
55 |   node_t * tail;
56 |   node_t * next;
57 |
58 |   while (1) {
59 |     head = hzdptr_setv(&q->head, &handle->hzd, 0);
60 |     tail = q->tail;
61 |     next = hzdptr_set(&head->next, &handle->hzd, 1);
62 |
63 |     if (head != q->head) {
64 |       continue;
65 |     }
66 |
67 |     if (next == NULL) {
68 |       return (void *) -1;
69 |     }
70 |
71 |     if (head == tail) {
72 |       CAS(&q->tail, &tail, next);
73 |       continue;
74 |     }
75 |
76 |     data = next->data;
77 |     if (CAS(&q->head, &head, next)) break;
78 |   }
79 |
80 |   hzdptr_retire(&handle->hzd, head);
81 |   return data;
82 | }
83 |
84 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/msqueue.h:
--------------------------------------------------------------------------------
1 | #ifndef MSQUEUE_H
2 | #define MSQUEUE_H
3 |
4 | #ifdef MSQUEUE
5 | #include "align.h"
6 | #include "hzdptr.h"
7 |
8 | #define EMPTY (void *) -1
9 |
10 | typedef struct _node_t {
11 |   struct _node_t * volatile next DOUBLE_CACHE_ALIGNED;
12 |   void * data DOUBLE_CACHE_ALIGNED;
13 | } node_t DOUBLE_CACHE_ALIGNED;
14 |
15 | typedef struct _queue_t {
16 |   struct _node_t * volatile head DOUBLE_CACHE_ALIGNED;
17 |   struct _node_t * volatile tail DOUBLE_CACHE_ALIGNED;
18 |   int nprocs;
19 | } queue_t DOUBLE_CACHE_ALIGNED;
20 |
21 | typedef struct _handle_t {
22 |   hzdptr_t hzd;
23 | } handle_t DOUBLE_CACHE_ALIGNED;
24 |
25 | #endif
26 |
27 | #endif /* end of include guard: MSQUEUE_H */
--------------------------------------------------------------------------------
/pairwise.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdint.h>
4 | #include "delay.h"
5 | #include "queue.h"
6 |
7 | #ifndef LOGN_OPS
8 | #define LOGN_OPS 7
9 | #endif
10 |
11 | static long nops;
12 | static queue_t * q;
13 | static handle_t ** hds;
14 |
15 | void init(int nprocs, int logn) {
16 |
17 |   /** Use 10^7 as default input size. */
18 |   if (logn == 0) logn = LOGN_OPS;
19 |
20 |   /** Compute the number of ops to perform. */
21 |   nops = 1;
22 |   int i;
23 |   for (i = 0; i < logn; ++i) {
24 |     nops *= 10;
25 |   }
26 |
27 |   printf(" Number of operations: %ld\n", nops);
28 |
29 |   q = align_malloc(PAGE_SIZE, sizeof(queue_t));
30 |   queue_init(q, nprocs);
31 |
32 |   hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
33 | }
34 |
35 | void thread_init(int id, int nprocs) {
36 |   hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t));
37 |   queue_register(q, hds[id], id);
38 | }
39 |
40 | void * benchmark(int id, int nprocs) {
41 |   void * val = (void *) (intptr_t) (id + 1);
42 |   handle_t * th = hds[id];
43 |
44 |   delay_t state;
45 |   delay_init(&state, id);
46 |
47 |   int i;
48 |   for (i = 0; i < nops / nprocs; ++i) {
49 |     enqueue(q, th, val);
50 |     delay_exec(&state);
51 |
52 |     val = dequeue(q, th);
53 |     delay_exec(&state);
54 |   }
55 |
56 |   return val;
57 | }
58 |
59 | void thread_exit(int id, int nprocs) {
60 |   queue_free(q, hds[id]);
61 | }
62 |
63 | #ifdef VERIFY
64 | static int compare(const void * a, const void * b) {
65 |   return *(long *) a - *(long *) b;
66 | }
67 | #endif
68 |
69 | int verify(int nprocs, void ** results) {
70 | #ifndef VERIFY
71 |   return 0;
72 | #else
73 |   qsort(results, nprocs, sizeof(void *), compare);
74 |
75 |   int i;
76 |   int ret = 0;
77 |
78 |   for (i = 0; i < nprocs; ++i) {
79 |     int res = (int) (intptr_t) results[i];
80 |     if (res != i + 1) {
81 |       fprintf(stderr, "expected %d but received %d\n", i + 1, res);
82 |       ret = 1;
83 |     }
84 |   }
85 |
86 |   if (ret != 1) fprintf(stdout, "PASSED\n");
87 |   return ret;
88 | #endif
89 | }
--------------------------------------------------------------------------------
/primitives.h:
--------------------------------------------------------------------------------
1 | /** @file */
2 |
3 | #ifndef PRIMITIVES_H
4 | #define PRIMITIVES_H
5 |
6 | #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7))
7 | /**
8 |  * An atomic fetch-and-add.
9 |  */
10 | #define FAA(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED)
11 | /**
12 |  * An atomic fetch-and-add that also ensures sequential consistency.
13 |  */
14 | #define FAAcs(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST)
15 |
16 | /**
17 |  * An atomic compare-and-swap.
18 |  */
19 | #define CAS(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
20 |     __ATOMIC_RELAXED, __ATOMIC_RELAXED)
21 | /**
22 |  * An atomic compare-and-swap that also ensures sequential consistency.
23 |  */
24 | #define CAScs(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
25 |     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
26 | /**
27 |  * An atomic compare-and-swap that ensures release semantics on success
28 |  * and acquire semantics on failure.
29 |  */
30 | #define CASra(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
31 |     __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
32 | /**
33 |  * An atomic compare-and-swap that ensures acquire semantics on success
34 |  * and relaxed semantics on failure.
35 |  */
36 | #define CASa(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
37 |     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)
38 |
39 | /**
40 |  * An atomic swap.
41 |  */
42 | #define SWAP(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_RELAXED)
43 |
44 | /**
45 |  * An atomic swap that ensures acquire-release semantics.
46 |  */
47 | #define SWAPra(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_ACQ_REL)
48 |
49 | /**
50 |  * A memory fence to ensure sequential consistency.
51 |  */
52 | #define FENCE() __atomic_thread_fence(__ATOMIC_SEQ_CST)
53 |
54 | /**
55 |  * An atomic store.
56 |  */
57 | #define STORE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED)
58 |
59 | /**
60 |  * A store with a preceding release fence to ensure all previous loads
61 |  * and stores complete before the current store is visible.
62 |  */
63 | #define RELEASE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE)
64 |
65 | /**
66 |  * A load with a following acquire fence to ensure no following loads and
67 |  * stores can start before the current load completes.
68 |  */
69 | #define ACQUIRE(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
70 |
71 | #else /** Non-GCC or old GCC. */
72 | #if defined(__x86_64__) || defined(_M_X64_)
73 |
74 | #define FAA __sync_fetch_and_add
75 | #define FAAcs __sync_fetch_and_add
76 |
77 | static inline int
78 | _compare_and_swap(void ** ptr, void ** expected, void * desired) {
79 |   void * oldval = *expected;
80 |   void * newval = __sync_val_compare_and_swap(ptr, oldval, desired);
81 |
82 |   if (newval == oldval) {
83 |     return 1;
84 |   } else {
85 |     *expected = newval;
86 |     return 0;
87 |   }
88 | }
89 | #define CAS(ptr, expected, desired) \
90 |   _compare_and_swap((void **) (ptr), (void **) (expected), (void *) (desired))
91 | #define CAScs CAS
92 | #define CASra CAS
93 | #define CASa CAS
94 |
95 | #define SWAP __sync_lock_test_and_set
96 | #define SWAPra SWAP
97 |
98 | #define ACQUIRE(p) ({ \
99 |   __typeof__(*(p)) __ret = *p; \
100 |   __asm__("":::"memory"); \
101 |   __ret; \
102 | })
103 |
104 | #define RELEASE(p, v) do {\
105 |   __asm__("":::"memory"); \
106 |   *p = v; \
107 | } while (0)
108 | #define FENCE() __sync_synchronize()
109 |
110 | #endif
111 | #endif
112 |
113 | #if defined(__x86_64__) || defined(_M_X64_)
114 | #define PAUSE() __asm__ ("pause")
115 |
116 | static inline
117 | int _CAS2(volatile long * ptr, long * cmp1, long * cmp2, long val1, long val2)
118 | {
119 |   char success;
120 |   long tmp1 = *cmp1;
121 |   long tmp2 = *cmp2;
122 |
123 |   __asm__ __volatile__(
124 |       "lock cmpxchg16b %1\n"
125 |       "setz %0"
126 |       : "=q" (success), "+m" (*ptr), "+a" (tmp1), "+d" (tmp2)
127 |       : "b" (val1), "c" (val2)
128 |       : "cc" );
129 |
130 |   *cmp1 = tmp1;
131 |   *cmp2 = tmp2;
132 |   return success;
133 | }
134 | #define CAS2(p, o1, o2, n1, n2) \
135 |     _CAS2((volatile long *) p, (long *) o1, (long *) o2, (long) n1, (long) n2)
136 |
137 | #define BTAS(ptr, bit) ({ \
138 |   char __ret; \
139 |   __asm__ __volatile__( \
140 |       "lock btsq %2, %0; setnc %1" \
141 |       : "+m" (*ptr), "=r" (__ret) : "ri" (bit) : "cc" ); \
142 |   __ret; \
143 | })
144 |
145 | #else
146 | #define PAUSE()
147 | #endif
148 |
149 | #endif /* end of include guard: PRIMITIVES_H */
--------------------------------------------------------------------------------
/queue.h:
--------------------------------------------------------------------------------
1 | #ifndef QUEUE_H
2 | #define QUEUE_H
3 |
4 | #ifdef WFQUEUE
5 | #include "wfqueue.h"
6 |
7 | #elif LCRQ
8 | #include "lcrq.h"
9 |
10 | #elif CCQUEUE
11 | #include "ccqueue.h"
12 |
13 | #elif MSQUEUE
14 | #include "msqueue.h"
15 |
16 | #elif FAAQ
17 | #include "align.h"
18 |
19 | typedef struct {
20 |   volatile long P DOUBLE_CACHE_ALIGNED;
21 |   volatile long C DOUBLE_CACHE_ALIGNED;
22 | } queue_t DOUBLE_CACHE_ALIGNED;
23 |
24 | typedef int handle_t;
25 |
26 | #elif DELAY
27 |
28 | typedef int queue_t;
29 | typedef int handle_t;
30 |
31 | #else
32 | #error "Please specify a queue implementation."
33 |
34 | #endif
35 |
36 | void queue_init(queue_t * q, int nprocs);
37 | void queue_register(queue_t * q, handle_t * th, int id);
38 | void enqueue(queue_t * q, handle_t * th, void * v);
39 | void * dequeue(queue_t * q, handle_t * th);
40 | void queue_free(queue_t * q, handle_t * h);
41 | void handle_free(handle_t *h);
42 |
43 | #endif /* end of include guard: QUEUE_H */
--------------------------------------------------------------------------------
/wfqueue.c:
--------------------------------------------------------------------------------
1 | #include "wfqueue.h"
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include "primitives.h"
6 |
7 | #define N WFQUEUE_NODE_SIZE
8 | #define BOT ((void *)0)
9 | #define TOP ((void *)-1)
10 |
11 | #define MAX_GARBAGE(n) (2 * n)
12 |
13 | #ifndef MAX_SPIN
14 | #define MAX_SPIN 100
15 | #endif
16 |
17 | #ifndef MAX_PATIENCE
18 | #define MAX_PATIENCE 10
19 | #endif
20 |
21 | typedef struct _enq_t enq_t;
22 | typedef struct _deq_t deq_t;
23 | typedef struct _cell_t cell_t;
24 | typedef struct _node_t node_t;
25 |
26 | static inline void *spin(void *volatile *p) {
27 |   int patience = MAX_SPIN;
28 |   void *v = *p;
29 |
30 |   while (!v && patience-- > 0) {
31 |     v = *p;
32 |     PAUSE();
33 |   }
34 |
35 |   return v;
36 | }
37 |
38 | static inline node_t *new_node() {
39 |   node_t *n = align_malloc(PAGE_SIZE, sizeof(node_t));
40 |   memset(n, 0, sizeof(node_t));
41 |   return n;
42 | }
43 |
44 | static node_t *check(unsigned long volatile *p_hzd_node_id, node_t *cur,
45 |                      node_t *old) {
46 |   unsigned long hzd_node_id = ACQUIRE(p_hzd_node_id);
47 |
48 |   if (hzd_node_id < cur->id) {
49 |     node_t *tmp = old;
50 |     while (tmp->id < hzd_node_id) {
51 |       tmp = tmp->next;
52 |     }
53 |     cur = tmp;
54 |   }
55 |
56 |   return cur;
57 | }
58 |
59 | static node_t *update(node_t *volatile *pPn, node_t *cur,
60 |                       unsigned long volatile *p_hzd_node_id, node_t *old) {
61 |   node_t *ptr = ACQUIRE(pPn);
62 |
63 |   if (ptr->id < cur->id) {
64 |     if (!CAScs(pPn, &ptr, cur)) {
65 |       if (ptr->id < cur->id) cur = ptr;
66 |     }
67 |
68 |     cur = check(p_hzd_node_id, cur, old);
69 |   }
70 |
71 |   return cur;
72 | }
73 |
74 | static void cleanup(queue_t *q, handle_t *th) {
75 |   long oid = ACQUIRE(&q->Hi);
76 |   node_t *new = th->Dp;
77 |
78 |   if (oid == -1) return;
79 |   if (new->id - oid < MAX_GARBAGE(q->nprocs)) return;
80 |   if (!CASa(&q->Hi, &oid, -1)) return;
81 |
82 |   long Di = q->Di, Ei = q->Ei;
83 |   while (Ei <= Di && !CAS(&q->Ei, &Ei, Di + 1))
84 |     ;
85 |
86 |   node_t *old = q->Hp;
87 |   handle_t *ph = th;
88 |   handle_t *phs[q->nprocs];
89 |   int i = 0;
90 |
91 |   do {
92 |     new = check(&ph->hzd_node_id, new, old);
93 |     new = update(&ph->Ep, new, &ph->hzd_node_id, old);
94 |     new = update(&ph->Dp, new, &ph->hzd_node_id, old);
95 |
96 |     phs[i++] = ph;
97 |     ph = ph->next;
98 |   } while (new->id > oid && ph != th);
99 |
100 |   while (new->id > oid && --i >= 0) {
101 |     new = check(&phs[i]->hzd_node_id, new, old);
102 |   }
103 |
104 |   long nid = new->id;
105 |
106 |   if (nid <= oid) {
107 |     RELEASE(&q->Hi, oid);
108 |   } else {
109 |     q->Hp = new;
110 |     RELEASE(&q->Hi, nid);
111 |
112 |     while (old != new) {
113 |       node_t *tmp = old->next;
114 |       free(old);
115 |       old = tmp;
116 |     }
117 |   }
118 | }
119 |
120 | static cell_t *find_cell(node_t *volatile *ptr, long i, handle_t *th) {
121 |   node_t *curr = *ptr;
122 |
123 |   long j;
124 |   for (j = curr->id; j < i / N; ++j) {
125 |     node_t *next = curr->next;
126 |
127 |     if (next == NULL) {
128 |       node_t *temp = th->spare;
129 |
130 |       if (!temp) {
131 |         temp = new_node();
132 |         th->spare
= temp; 133 | } 134 | 135 | temp->id = j + 1; 136 | 137 | if (CASra(&curr->next, &next, temp)) { 138 | next = temp; 139 | th->spare = NULL; 140 | } 141 | } 142 | 143 | curr = next; 144 | } 145 | 146 | *ptr = curr; 147 | return &curr->cells[i % N]; 148 | } 149 | 150 | static int enq_fast(queue_t *q, handle_t *th, void *v, long *id) { 151 | long i = FAAcs(&q->Ei, 1); 152 | cell_t *c = find_cell(&th->Ep, i, th); 153 | void *cv = BOT; 154 | 155 | if (CAS(&c->val, &cv, v)) { 156 | #ifdef RECORD 157 | th->fastenq++; 158 | #endif 159 | return 1; 160 | } else { 161 | *id = i; 162 | return 0; 163 | } 164 | } 165 | 166 | static void enq_slow(queue_t *q, handle_t *th, void *v, long id) { 167 | enq_t *enq = &th->Er; 168 | enq->val = v; 169 | RELEASE(&enq->id, id); 170 | 171 | node_t *tail = th->Ep; 172 | long i; 173 | cell_t *c; 174 | 175 | do { 176 | i = FAA(&q->Ei, 1); 177 | c = find_cell(&tail, i, th); 178 | enq_t *ce = BOT; 179 | 180 | if (CAScs(&c->enq, &ce, enq) && c->val != TOP) { 181 | if (CAS(&enq->id, &id, -i)) id = -i; 182 | break; 183 | } 184 | } while (enq->id > 0); 185 | 186 | id = -enq->id; 187 | c = find_cell(&th->Ep, id, th); 188 | if (id > i) { 189 | long Ei = q->Ei; 190 | while (Ei <= id && !CAS(&q->Ei, &Ei, id + 1)) 191 | ; 192 | } 193 | c->val = v; 194 | 195 | #ifdef RECORD 196 | th->slowenq++; 197 | #endif 198 | } 199 | 200 | void enqueue(queue_t *q, handle_t *th, void *v) { 201 | th->hzd_node_id = th->enq_node_id; 202 | 203 | long id; 204 | int p = MAX_PATIENCE; 205 | while (!enq_fast(q, th, v, &id) && p-- > 0) 206 | ; 207 | if (p < 0) enq_slow(q, th, v, id); 208 | 209 | th->enq_node_id = th->Ep->id; 210 | RELEASE(&th->hzd_node_id, -1); 211 | } 212 | 213 | static void *help_enq(queue_t *q, handle_t *th, cell_t *c, long i) { 214 | void *v = spin(&c->val); 215 | 216 | if ((v != TOP && v != BOT) || 217 | (v == BOT && !CAScs(&c->val, &v, TOP) && v != TOP)) { 218 | return v; 219 | } 220 | 221 | enq_t *e = c->enq; 222 | 223 | if (e == BOT) { 224 | handle_t *ph; 225 | enq_t *pe; 226 | long id; 227 | ph = th->Eh, pe = &ph->Er, id = pe->id; 228 | 229 | if (th->Ei != 0 && th->Ei != id) { 230 | th->Ei = 0; 231 | th->Eh = ph->next; 232 | ph = th->Eh, pe = &ph->Er, id = pe->id; 233 | } 234 | 235 | if (id > 0 && id <= i && !CAS(&c->enq, &e, pe) && e != pe) 236 | th->Ei = id; 237 | else { 238 | th->Ei = 0; 239 | th->Eh = ph->next; 240 | } 241 | 242 | if (e == BOT && CAS(&c->enq, &e, TOP)) e = TOP; 243 | } 244 | 245 | if (e == TOP) return (q->Ei <= i ? 
BOT : TOP); 246 | 247 | long ei = ACQUIRE(&e->id); 248 | void *ev = ACQUIRE(&e->val); 249 | 250 | if (ei > i) { 251 | if (c->val == TOP && q->Ei <= i) return BOT; 252 | } else { 253 | if ((ei > 0 && CAS(&e->id, &ei, -i)) || (ei == -i && c->val == TOP)) { 254 | long Ei = q->Ei; 255 | while (Ei <= i && !CAS(&q->Ei, &Ei, i + 1)) 256 | ; 257 | c->val = ev; 258 | } 259 | } 260 | 261 | return c->val; 262 | } 263 | 264 | static void help_deq(queue_t *q, handle_t *th, handle_t *ph) { 265 | deq_t *deq = &ph->Dr; 266 | long idx = ACQUIRE(&deq->idx); 267 | long id = deq->id; 268 | 269 | if (idx < id) return; 270 | 271 | node_t *Dp = ph->Dp; 272 | th->hzd_node_id = ph->hzd_node_id; 273 | FENCE(); 274 | idx = deq->idx; 275 | 276 | long i = id + 1, old = id, new = 0; 277 | while (1) { 278 | node_t *h = Dp; 279 | for (; idx == old && new == 0; ++i) { 280 | cell_t *c = find_cell(&h, i, th); 281 | 282 | long Di = q->Di; 283 | while (Di <= i && !CAS(&q->Di, &Di, i + 1)) 284 | ; 285 | 286 | void *v = help_enq(q, th, c, i); 287 | if (v == BOT || (v != TOP && c->deq == BOT)) 288 | new = i; 289 | else 290 | idx = ACQUIRE(&deq->idx); 291 | } 292 | 293 | if (new != 0) { 294 | if (CASra(&deq->idx, &idx, new)) idx = new; 295 | if (idx >= new) new = 0; 296 | } 297 | 298 | if (idx < 0 || deq->id != id) break; 299 | 300 | cell_t *c = find_cell(&Dp, idx, th); 301 | deq_t *cd = BOT; 302 | if (c->val == TOP || CAS(&c->deq, &cd, deq) || cd == deq) { 303 | CAS(&deq->idx, &idx, -idx); 304 | break; 305 | } 306 | 307 | old = idx; 308 | if (idx >= i) i = idx + 1; 309 | } 310 | } 311 | 312 | static void *deq_fast(queue_t *q, handle_t *th, long *id) { 313 | long i = FAAcs(&q->Di, 1); 314 | cell_t *c = find_cell(&th->Dp, i, th); 315 | void *v = help_enq(q, th, c, i); 316 | deq_t *cd = BOT; 317 | 318 | if (v == BOT) return BOT; 319 | if (v != TOP && CAS(&c->deq, &cd, TOP)) return v; 320 | 321 | *id = i; 322 | return TOP; 323 | } 324 | 325 | static void *deq_slow(queue_t *q, handle_t *th, long id) { 326 | deq_t *deq = &th->Dr; 327 | RELEASE(&deq->id, id); 328 | RELEASE(&deq->idx, id); 329 | 330 | help_deq(q, th, th); 331 | long i = -deq->idx; 332 | cell_t *c = find_cell(&th->Dp, i, th); 333 | void *val = c->val; 334 | 335 | #ifdef RECORD 336 | th->slowdeq++; 337 | #endif 338 | return val == TOP ? 
BOT : val; 339 | } 340 | 341 | void *dequeue(queue_t *q, handle_t *th) { 342 | th->hzd_node_id = th->deq_node_id; 343 | 344 | void *v; 345 | long id = 0; 346 | int p = MAX_PATIENCE; 347 | 348 | do 349 | v = deq_fast(q, th, &id); 350 | while (v == TOP && p-- > 0); 351 | if (v == TOP) 352 | v = deq_slow(q, th, id); 353 | else { 354 | #ifdef RECORD 355 | th->fastdeq++; 356 | #endif 357 | } 358 | 359 | if (v != EMPTY) { 360 | help_deq(q, th, th->Dh); 361 | th->Dh = th->Dh->next; 362 | } 363 | 364 | th->deq_node_id = th->Dp->id; 365 | RELEASE(&th->hzd_node_id, -1); 366 | 367 | if (th->spare == NULL) { 368 | cleanup(q, th); 369 | th->spare = new_node(); 370 | } 371 | 372 | #ifdef RECORD 373 | if (v == EMPTY) th->empty++; 374 | #endif 375 | return v; 376 | } 377 | 378 | static pthread_barrier_t barrier; 379 | 380 | void queue_init(queue_t *q, int nprocs) { 381 | q->Hi = 0; 382 | q->Hp = new_node(); 383 | 384 | q->Ei = 1; 385 | q->Di = 1; 386 | 387 | q->nprocs = nprocs; 388 | 389 | #ifdef RECORD 390 | q->fastenq = 0; 391 | q->slowenq = 0; 392 | q->fastdeq = 0; 393 | q->slowdeq = 0; 394 | q->empty = 0; 395 | #endif 396 | pthread_barrier_init(&barrier, NULL, nprocs); 397 | } 398 | 399 | void queue_free(queue_t *q, handle_t *h) { 400 | #ifdef RECORD 401 | static int lock = 0; 402 | 403 | FAA(&q->fastenq, h->fastenq); 404 | FAA(&q->slowenq, h->slowenq); 405 | FAA(&q->fastdeq, h->fastdeq); 406 | FAA(&q->slowdeq, h->slowdeq); 407 | FAA(&q->empty, h->empty); 408 | 409 | pthread_barrier_wait(&barrier); 410 | 411 | if (FAA(&lock, 1) == 0) 412 | printf("Enq: %f Deq: %f Empty: %f\n", 413 | q->slowenq * 100.0 / (q->fastenq + q->slowenq), 414 | q->slowdeq * 100.0 / (q->fastdeq + q->slowdeq), 415 | q->empty * 100.0 / (q->fastdeq + q->slowdeq)); 416 | #endif 417 | } 418 | 419 | void queue_register(queue_t *q, handle_t *th, int id) { 420 | th->next = NULL; 421 | th->hzd_node_id = -1; 422 | th->Ep = q->Hp; 423 | th->enq_node_id = th->Ep->id; 424 | th->Dp = q->Hp; 425 | th->deq_node_id = th->Dp->id; 426 | 427 | th->Er.id = 0; 428 | th->Er.val = BOT; 429 | th->Dr.id = 0; 430 | th->Dr.idx = -1; 431 | 432 | th->Ei = 0; 433 | th->spare = new_node(); 434 | #ifdef RECORD 435 | th->slowenq = 0; 436 | th->slowdeq = 0; 437 | th->fastenq = 0; 438 | th->fastdeq = 0; 439 | th->empty = 0; 440 | #endif 441 | 442 | static handle_t *volatile _tail; 443 | handle_t *tail = _tail; 444 | 445 | if (tail == NULL) { 446 | th->next = th; 447 | if (CASra(&_tail, &tail, th)) { 448 | th->Eh = th->next; 449 | th->Dh = th->next; 450 | return; 451 | } 452 | } 453 | 454 | handle_t *next = tail->next; 455 | do 456 | th->next = next; 457 | while (!CASra(&tail->next, &next, th)); 458 | 459 | th->Eh = th->next; 460 | th->Dh = th->next; 461 | } 462 | -------------------------------------------------------------------------------- /wfqueue.h: -------------------------------------------------------------------------------- 1 | #ifndef WFQUEUE_H 2 | #define WFQUEUE_H 3 | 4 | #ifdef WFQUEUE 5 | 6 | #include "align.h" 7 | #define EMPTY ((void *) 0) 8 | 9 | #ifndef WFQUEUE_NODE_SIZE 10 | #define WFQUEUE_NODE_SIZE ((1 << 10) - 2) 11 | #endif 12 | 13 | struct _enq_t { 14 | long volatile id; 15 | void * volatile val; 16 | } CACHE_ALIGNED; 17 | 18 | struct _deq_t { 19 | long volatile id; 20 | long volatile idx; 21 | } CACHE_ALIGNED; 22 | 23 | struct _cell_t { 24 | void * volatile val; 25 | struct _enq_t * volatile enq; 26 | struct _deq_t * volatile deq; 27 | void * pad[5]; 28 | }; 29 | 30 | struct _node_t { 31 | struct _node_t * volatile next CACHE_ALIGNED; 32 
| long id CACHE_ALIGNED; 33 | struct _cell_t cells[WFQUEUE_NODE_SIZE] CACHE_ALIGNED; 34 | }; 35 | 36 | typedef struct DOUBLE_CACHE_ALIGNED { 37 | /** 38 | * Index of the next position for enqueue. 39 | */ 40 | volatile long Ei DOUBLE_CACHE_ALIGNED; 41 | 42 | /** 43 | * Index of the next position for dequeue. 44 | */ 45 | volatile long Di DOUBLE_CACHE_ALIGNED; 46 | 47 | /** 48 | * Index of the head of the queue. 49 | */ 50 | volatile long Hi DOUBLE_CACHE_ALIGNED; 51 | 52 | /** 53 | * Pointer to the head node of the queue. 54 | */ 55 | struct _node_t * volatile Hp; 56 | 57 | /** 58 | * Number of processors. 59 | */ 60 | long nprocs; 61 | #ifdef RECORD 62 | long slowenq; 63 | long slowdeq; 64 | long fastenq; 65 | long fastdeq; 66 | long empty; 67 | #endif 68 | } queue_t; 69 | 70 | typedef struct _handle_t { 71 | /** 72 | * Pointer to the next handle. 73 | */ 74 | struct _handle_t * next; 75 | 76 | /** 77 | * Hazard pointer. 78 | */ 79 | //struct _node_t * volatile Hp; 80 | unsigned long volatile hzd_node_id; 81 | 82 | /** 83 | * Pointer to the node for enqueue. 84 | */ 85 | struct _node_t * volatile Ep; 86 | unsigned long enq_node_id; 87 | 88 | /** 89 | * Pointer to the node for dequeue. 90 | */ 91 | struct _node_t * volatile Dp; 92 | unsigned long deq_node_id; 93 | 94 | /** 95 | * Enqueue request. 96 | */ 97 | struct _enq_t Er CACHE_ALIGNED; 98 | 99 | /** 100 | * Dequeue request. 101 | */ 102 | struct _deq_t Dr CACHE_ALIGNED; 103 | 104 | /** 105 | * Handle of the next enqueuer to help. 106 | */ 107 | struct _handle_t * Eh CACHE_ALIGNED; 108 | 109 | long Ei; 110 | 111 | /** 112 | * Handle of the next dequeuer to help. 113 | */ 114 | struct _handle_t * Dh; 115 | 116 | /** 117 | * Pointer to a spare node to use, to speedup adding a new node. 118 | */ 119 | struct _node_t * spare CACHE_ALIGNED; 120 | 121 | /** 122 | * Count the delay rounds of helping another dequeuer. 123 | */ 124 | int delay; 125 | 126 | #ifdef RECORD 127 | long slowenq; 128 | long slowdeq; 129 | long fastenq; 130 | long fastdeq; 131 | long empty; 132 | #endif 133 | } handle_t; 134 | 135 | #endif 136 | 137 | #endif /* end of include guard: WFQUEUE_H */ 138 | -------------------------------------------------------------------------------- /xxhash.c: -------------------------------------------------------------------------------- 1 | /* 2 | xxHash - Fast Hash algorithm 3 | Copyright (C) 2012-2014, Yann Collet. 4 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following disclaimer 14 | in the documentation and/or other materials provided with the 15 | distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | You can contact the author at : 30 | - xxHash source repository : http://code.google.com/p/xxhash/ 31 | - public discussion board : https://groups.google.com/forum/#!forum/lz4c 32 | */ 33 | 34 | 35 | //************************************** 36 | // Tuning parameters 37 | //************************************** 38 | // Unaligned memory access is automatically enabled for "common" CPU, such as x86. 39 | // For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. 40 | // If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. 41 | // You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32). 42 | #if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) 43 | # define XXH_USE_UNALIGNED_ACCESS 1 44 | #endif 45 | 46 | // XXH_ACCEPT_NULL_INPUT_POINTER : 47 | // If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. 48 | // When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. 49 | // This option has a very small performance cost (only measurable on small inputs). 50 | // By default, this option is disabled. To enable it, uncomment below define : 51 | // #define XXH_ACCEPT_NULL_INPUT_POINTER 1 52 | 53 | // XXH_FORCE_NATIVE_FORMAT : 54 | // By default, xxHash library provides endian-independant Hash values, based on little-endian convention. 55 | // Results are therefore identical for little-endian and big-endian CPU. 56 | // This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. 57 | // Should endian-independance be of no importance for your application, you may set the #define below to 1. 58 | // It will improve speed for Big-endian CPU. 59 | // This option has no impact on Little_Endian CPU. 
60 | #define XXH_FORCE_NATIVE_FORMAT 0
61 | 
62 | //**************************************
63 | // Compiler Specific Options
64 | //**************************************
65 | // Disable some Visual warning messages
66 | #ifdef _MSC_VER // Visual Studio
67 | #  pragma warning(disable : 4127) // disable: C4127: conditional expression is constant
68 | #endif
69 | 
70 | #ifdef _MSC_VER // Visual Studio
71 | #  define FORCE_INLINE static __forceinline
72 | #else
73 | #  ifdef __GNUC__
74 | #    define FORCE_INLINE static inline __attribute__((always_inline))
75 | #  else
76 | #    define FORCE_INLINE static inline
77 | #  endif
78 | #endif
79 | 
80 | //**************************************
81 | // Includes & Memory related functions
82 | //**************************************
83 | #include "xxhash.h"
84 | // Modify the local functions below should you wish to use some other memory routines
85 | // for malloc(), free()
86 | #include <stdlib.h>
87 | static void* XXH_malloc(size_t s) { return malloc(s); }
88 | static void  XXH_free (void* p)  { free(p); }
89 | // for memcpy()
90 | #include <string.h>
91 | static void* XXH_memcpy(void* dest, const void* src, size_t size)
92 | {
93 |     return memcpy(dest,src,size);
94 | }
95 | 
96 | 
97 | //**************************************
98 | // Basic Types
99 | //**************************************
100 | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
101 | #  include <stdint.h>
102 |   typedef uint8_t  BYTE;
103 |   typedef uint16_t U16;
104 |   typedef uint32_t U32;
105 |   typedef  int32_t S32;
106 |   typedef uint64_t U64;
107 | #else
108 |   typedef unsigned char      BYTE;
109 |   typedef unsigned short     U16;
110 |   typedef unsigned int       U32;
111 |   typedef   signed int       S32;
112 |   typedef unsigned long long U64;
113 | #endif
114 | 
115 | #if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS)
116 | #  define _PACKED __attribute__ ((packed))
117 | #else
118 | #  define _PACKED
119 | #endif
120 | 
121 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
122 | #  ifdef __IBMC__
123 | #    pragma pack(1)
124 | #  else
125 | #    pragma pack(push, 1)
126 | #  endif
127 | #endif
128 | 
129 | typedef struct _U32_S
130 | {
131 |     U32 v;
132 | } _PACKED U32_S;
133 | typedef struct _U64_S
134 | {
135 |     U64 v;
136 | } _PACKED U64_S;
137 | 
138 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
139 | #  pragma pack(pop)
140 | #endif
141 | 
142 | #define A32(x) (((U32_S *)(x))->v)
143 | #define A64(x) (((U64_S *)(x))->v)
144 | 
145 | 
146 | //***************************************
147 | // Compiler-specific Functions and Macros
148 | //***************************************
149 | #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
150 | 
151 | // Note : although _rotl exists for minGW (GCC under windows), performance seems poor
152 | #if defined(_MSC_VER)
153 | #  define XXH_rotl32(x,r) _rotl(x,r)
154 | #  define XXH_rotl64(x,r) _rotl64(x,r)
155 | #else
156 | #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
157 | #  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
158 | #endif
159 | 
160 | #if defined(_MSC_VER) // Visual Studio
161 | #  define XXH_swap32 _byteswap_ulong
162 | #  define XXH_swap64 _byteswap_uint64
163 | #elif GCC_VERSION >= 403
164 | #  define XXH_swap32 __builtin_bswap32
165 | #  define XXH_swap64 __builtin_bswap64
166 | #else
167 | static inline U32 XXH_swap32 (U32 x)
168 | {
169 |     return ((x << 24) & 0xff000000 ) |
170 |            ((x <<  8) & 0x00ff0000 ) |
171 |            ((x >>  8) & 0x0000ff00 ) |
172 |            ((x >> 24) & 0x000000ff );
173 | }
174 | static inline U64 XXH_swap64 (U64 x)
175 | {
176 |     return ((x
<< 56) & 0xff00000000000000ULL) | 177 | ((x << 40) & 0x00ff000000000000ULL) | 178 | ((x << 24) & 0x0000ff0000000000ULL) | 179 | ((x << 8) & 0x000000ff00000000ULL) | 180 | ((x >> 8) & 0x00000000ff000000ULL) | 181 | ((x >> 24) & 0x0000000000ff0000ULL) | 182 | ((x >> 40) & 0x000000000000ff00ULL) | 183 | ((x >> 56) & 0x00000000000000ffULL); 184 | } 185 | #endif 186 | 187 | 188 | //************************************** 189 | // Constants 190 | //************************************** 191 | #define PRIME32_1 2654435761U 192 | #define PRIME32_2 2246822519U 193 | #define PRIME32_3 3266489917U 194 | #define PRIME32_4 668265263U 195 | #define PRIME32_5 374761393U 196 | 197 | #define PRIME64_1 11400714785074694791ULL 198 | #define PRIME64_2 14029467366897019727ULL 199 | #define PRIME64_3 1609587929392839161ULL 200 | #define PRIME64_4 9650029242287828579ULL 201 | #define PRIME64_5 2870177450012600261ULL 202 | 203 | //************************************** 204 | // Architecture Macros 205 | //************************************** 206 | typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; 207 | #ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch 208 | static const int one = 1; 209 | # define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) 210 | #endif 211 | 212 | 213 | //************************************** 214 | // Macros 215 | //************************************** 216 | #define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations 217 | 218 | 219 | //**************************** 220 | // Memory reads 221 | //**************************** 222 | typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; 223 | 224 | FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) 225 | { 226 | if (align==XXH_unaligned) 227 | return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); 228 | else 229 | return endian==XXH_littleEndian ? *(U32*)ptr : XXH_swap32(*(U32*)ptr); 230 | } 231 | 232 | FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) 233 | { 234 | return XXH_readLE32_align(ptr, endian, XXH_unaligned); 235 | } 236 | 237 | FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) 238 | { 239 | if (align==XXH_unaligned) 240 | return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr)); 241 | else 242 | return endian==XXH_littleEndian ? 
*(U64*)ptr : XXH_swap64(*(U64*)ptr);
243 | }
244 | 
245 | FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
246 | {
247 |     return XXH_readLE64_align(ptr, endian, XXH_unaligned);
248 | }
249 | 
250 | 
251 | //****************************
252 | // Simple Hash Functions
253 | //****************************
254 | FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
255 | {
256 |     const BYTE* p = (const BYTE*)input;
257 |     const BYTE* bEnd = p + len;
258 |     U32 h32;
259 | #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
260 | 
261 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
262 |     if (p==NULL)
263 |     {
264 |         len=0;
265 |         bEnd=p=(const BYTE*)(size_t)16;
266 |     }
267 | #endif
268 | 
269 |     if (len>=16)
270 |     {
271 |         const BYTE* const limit = bEnd - 16;
272 |         U32 v1 = seed + PRIME32_1 + PRIME32_2;
273 |         U32 v2 = seed + PRIME32_2;
274 |         U32 v3 = seed + 0;
275 |         U32 v4 = seed - PRIME32_1;
276 | 
277 |         do
278 |         {
279 |             v1 += XXH_get32bits(p) * PRIME32_2;
280 |             v1 = XXH_rotl32(v1, 13);
281 |             v1 *= PRIME32_1;
282 |             p+=4;
283 |             v2 += XXH_get32bits(p) * PRIME32_2;
284 |             v2 = XXH_rotl32(v2, 13);
285 |             v2 *= PRIME32_1;
286 |             p+=4;
287 |             v3 += XXH_get32bits(p) * PRIME32_2;
288 |             v3 = XXH_rotl32(v3, 13);
289 |             v3 *= PRIME32_1;
290 |             p+=4;
291 |             v4 += XXH_get32bits(p) * PRIME32_2;
292 |             v4 = XXH_rotl32(v4, 13);
293 |             v4 *= PRIME32_1;
294 |             p+=4;
295 |         }
296 |         while (p<=limit);
297 | 
298 |         h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
299 |     }
300 |     else
301 |     {
302 |         h32 = seed + PRIME32_5;
303 |     }
304 | 
305 |     h32 += (U32) len;
306 | 
307 |     while (p+4<=bEnd)
308 |     {
309 |         h32 += XXH_get32bits(p) * PRIME32_3;
310 |         h32 = XXH_rotl32(h32, 17) * PRIME32_4;
311 |         p+=4;
312 |     }
313 | 
314 |     while (p<bEnd)
315 |     {
316 |         h32 += (*p) * PRIME32_5;
317 |         h32 = XXH_rotl32(h32, 11) * PRIME32_1;
318 |         p++;
319 |     }
320 | 
321 |     h32 ^= h32 >> 15;
322 |     h32 *= PRIME32_2;
323 |     h32 ^= h32 >> 13;
324 |     h32 *= PRIME32_3;
325 |     h32 ^= h32 >> 16;
326 | 
327 |     return h32;
328 | }
329 | 
330 | 
331 | unsigned int XXH32 (const void* input, size_t len, unsigned seed)
332 | {
333 | #if 0
334 |     // Simple version, good for code maintenance, but unfortunately slow for small inputs
335 |     XXH32_state_t state;
336 |     XXH32_reset(&state, seed);
337 |     XXH32_update(&state, input, len);
338 |     return XXH32_digest(&state);
339 | #else
340 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
341 | 
342 | #  if !defined(XXH_USE_UNALIGNED_ACCESS)
343 |     if ((((size_t)input) & 3) == 0) // Input is aligned, let's leverage the speed advantage
344 |     {
345 |         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
346 |             return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
347 |         else
348 |             return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
349 |     }
350 | #  endif
351 | 
352 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
353 |         return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
354 |     else
355 |         return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
356 | #endif
357 | }
358 | 
359 | FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
360 | {
361 |     const BYTE* p = (const BYTE*)input;
362 |     const BYTE* bEnd = p + len;
363 |     U64 h64;
364 | #define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
365 | 
366 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
367 |     if (p==NULL)
368 |     {
369 |         len=0;
370 |         bEnd=p=(const BYTE*)(size_t)32;
371 |     }
372 | #endif
373 | 
374 |     if (len>=32)
375 |     {
376 |         const BYTE* const limit = bEnd - 32;
377 |         U64 v1 = seed + PRIME64_1 + PRIME64_2;
378 |         U64 v2 = seed + PRIME64_2;
379 |         U64 v3 = seed + 0;
380 |         U64 v4 = seed - PRIME64_1;
381 | 
382 |         do
383 |         {
384 |             v1 += XXH_get64bits(p) * PRIME64_2;
385 |             p+=8;
386 |             v1 = XXH_rotl64(v1, 31);
387 |             v1 *= PRIME64_1;
388 |             v2 += XXH_get64bits(p) * PRIME64_2;
389 |             p+=8;
390 |             v2 = XXH_rotl64(v2, 31);
391 |             v2 *= PRIME64_1;
392 |             v3 += XXH_get64bits(p) * PRIME64_2;
393 |             p+=8;
394 |             v3 = XXH_rotl64(v3, 31);
395 |             v3 *= PRIME64_1;
396 |             v4 += XXH_get64bits(p) * PRIME64_2;
397 |             p+=8;
398 |             v4 = XXH_rotl64(v4, 31);
399 |             v4 *= PRIME64_1;
400 |         }
401 |         while (p<=limit);
402 | 
403 |         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
404 | 
405 |         v1 *= PRIME64_2;
406 |         v1 = XXH_rotl64(v1, 31);
407 |         v1 *= PRIME64_1;
408 |         h64 ^= v1;
409 |         h64 = h64 * PRIME64_1 + PRIME64_4;
410 | 
411 |         v2 *= PRIME64_2;
412 |         v2 = XXH_rotl64(v2, 31);
413 |         v2 *= PRIME64_1;
414 |         h64 ^= v2;
415 |         h64 = h64 * PRIME64_1 + PRIME64_4;
416 | 
417 |         v3 *= PRIME64_2;
418 |         v3 = XXH_rotl64(v3, 31);
419 |         v3 *= PRIME64_1;
420 |         h64 ^= v3;
421 |         h64 = h64 * PRIME64_1 + PRIME64_4;
422 | 
423 |         v4 *= PRIME64_2;
424 |         v4 = XXH_rotl64(v4, 31);
425 |         v4 *= PRIME64_1;
426 |         h64 ^= v4;
427 |         h64 = h64 * PRIME64_1 + PRIME64_4;
428 |     }
429 |     else
430 |     {
431 |         h64 = seed + PRIME64_5;
432 |     }
433 | 
434 |     h64 += (U64) len;
435 | 
436 |     while (p+8<=bEnd)
437 |     {
438 |         U64 k1 = XXH_get64bits(p);
439 |         k1 *= PRIME64_2;
440 |         k1 = XXH_rotl64(k1,31);
441 |         k1 *= PRIME64_1;
442 |         h64 ^= k1;
443 |         h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
444 |         p+=8;
445 |     }
446 | 
447 |     if (p+4<=bEnd)
448 |     {
449 |         h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
450 |         h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
451 |         p+=4;
452 |     }
453 | 
454 |     while (p<bEnd)
455 |     {
456 |         h64 ^= (*p) * PRIME64_5;
457 |         h64 = XXH_rotl64(h64, 11) * PRIME64_1;
458 |         p++;
459 |     }
460 | 
461 |     h64 ^= h64 >> 33;
462 |     h64 *= PRIME64_2;
463 |     h64 ^= h64 >> 29;
464 |     h64 *= PRIME64_3;
465 |     h64 ^= h64 >> 32;
466 | 
467 |     return h64;
468 | }
469 | 
470 | 
471 | unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
472 | {
473 | #if 0
474 |     // Simple version, good for code maintenance, but unfortunately slow for small inputs
475 |     XXH64_state_t state;
476 |     XXH64_reset(&state, seed);
477 |     XXH64_update(&state, input, len);
478 |     return XXH64_digest(&state);
479 | #else
480 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
481 | 
482 | #  if !defined(XXH_USE_UNALIGNED_ACCESS)
483 |     if ((((size_t)input) & 7)==0) // Input is aligned, let's leverage the speed advantage
484 |     {
485 |         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
486 |             return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
487 |         else
488 |             return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
489 |     }
490 | #  endif
491 | 
492 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
493 |         return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
494 |     else
495 |         return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
496 | #endif
497 | }
498 | 
499 | /****************************************************
500 |  * Advanced Hash Functions
501 |  ****************************************************/
502 | 
503 | /*** Allocation ***/
504 | typedef struct
505 | {
506 |     U64 total_len;
507 |     U32 seed;
508 |     U32 v1;
509 |     U32 v2;
510 |     U32 v3;
511 |     U32 v4;
512 |     U32 mem32[4];   /* defined as U32 for alignment */
513 |     U32 memsize;
514 | } XXH_istate32_t;
515 | 
516 | typedef struct
517
| { 518 | U64 total_len; 519 | U64 seed; 520 | U64 v1; 521 | U64 v2; 522 | U64 v3; 523 | U64 v4; 524 | U64 mem64[4]; /* defined as U64 for alignment */ 525 | U32 memsize; 526 | } XXH_istate64_t; 527 | 528 | 529 | XXH32_state_t* XXH32_createState(void) 530 | { 531 | XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t)); // A compilation error here means XXH32_state_t is not large enough 532 | return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); 533 | } 534 | XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) 535 | { 536 | XXH_free(statePtr); 537 | return XXH_OK; 538 | }; 539 | 540 | XXH64_state_t* XXH64_createState(void) 541 | { 542 | XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t)); // A compilation error here means XXH64_state_t is not large enough 543 | return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); 544 | } 545 | XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) 546 | { 547 | XXH_free(statePtr); 548 | return XXH_OK; 549 | }; 550 | 551 | 552 | /*** Hash feed ***/ 553 | 554 | XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed) 555 | { 556 | XXH_istate32_t* state = (XXH_istate32_t*) state_in; 557 | state->seed = seed; 558 | state->v1 = seed + PRIME32_1 + PRIME32_2; 559 | state->v2 = seed + PRIME32_2; 560 | state->v3 = seed + 0; 561 | state->v4 = seed - PRIME32_1; 562 | state->total_len = 0; 563 | state->memsize = 0; 564 | return XXH_OK; 565 | } 566 | 567 | XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed) 568 | { 569 | XXH_istate64_t* state = (XXH_istate64_t*) state_in; 570 | state->seed = seed; 571 | state->v1 = seed + PRIME64_1 + PRIME64_2; 572 | state->v2 = seed + PRIME64_2; 573 | state->v3 = seed + 0; 574 | state->v4 = seed - PRIME64_1; 575 | state->total_len = 0; 576 | state->memsize = 0; 577 | return XXH_OK; 578 | } 579 | 580 | 581 | FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian) 582 | { 583 | XXH_istate32_t* state = (XXH_istate32_t *) state_in; 584 | const BYTE* p = (const BYTE*)input; 585 | const BYTE* const bEnd = p + len; 586 | 587 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER 588 | if (input==NULL) return XXH_ERROR; 589 | #endif 590 | 591 | state->total_len += len; 592 | 593 | if (state->memsize + len < 16) // fill in tmp buffer 594 | { 595 | XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); 596 | state->memsize += (U32)len; 597 | return XXH_OK; 598 | } 599 | 600 | if (state->memsize) // some data left from previous update 601 | { 602 | XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); 603 | { 604 | const U32* p32 = state->mem32; 605 | state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; 606 | state->v1 = XXH_rotl32(state->v1, 13); 607 | state->v1 *= PRIME32_1; 608 | p32++; 609 | state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; 610 | state->v2 = XXH_rotl32(state->v2, 13); 611 | state->v2 *= PRIME32_1; 612 | p32++; 613 | state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; 614 | state->v3 = XXH_rotl32(state->v3, 13); 615 | state->v3 *= PRIME32_1; 616 | p32++; 617 | state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; 618 | state->v4 = XXH_rotl32(state->v4, 13); 619 | state->v4 *= PRIME32_1; 620 | p32++; 621 | } 622 | p += 16-state->memsize; 623 | state->memsize = 0; 624 | } 625 | 626 | if (p <= bEnd-16) 627 | { 628 | const BYTE* const limit = bEnd - 16; 629 | U32 v1 = state->v1; 630 | U32 v2 = state->v2; 631 | U32 v3 = state->v3; 632 | U32 v4 = state->v4; 633 | 634 | do 635 | { 
636 |             v1 += XXH_readLE32(p, endian) * PRIME32_2;
637 |             v1 = XXH_rotl32(v1, 13);
638 |             v1 *= PRIME32_1;
639 |             p+=4;
640 |             v2 += XXH_readLE32(p, endian) * PRIME32_2;
641 |             v2 = XXH_rotl32(v2, 13);
642 |             v2 *= PRIME32_1;
643 |             p+=4;
644 |             v3 += XXH_readLE32(p, endian) * PRIME32_2;
645 |             v3 = XXH_rotl32(v3, 13);
646 |             v3 *= PRIME32_1;
647 |             p+=4;
648 |             v4 += XXH_readLE32(p, endian) * PRIME32_2;
649 |             v4 = XXH_rotl32(v4, 13);
650 |             v4 *= PRIME32_1;
651 |             p+=4;
652 |         }
653 |         while (p<=limit);
654 | 
655 |         state->v1 = v1;
656 |         state->v2 = v2;
657 |         state->v3 = v3;
658 |         state->v4 = v4;
659 |     }
660 | 
661 |     if (p < bEnd)
662 |     {
663 |         XXH_memcpy(state->mem32, p, bEnd-p);
664 |         state->memsize = (int)(bEnd-p);
665 |     }
666 | 
667 |     return XXH_OK;
668 | }
669 | 
670 | XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
671 | {
672 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
673 | 
674 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
675 |         return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
676 |     else
677 |         return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
678 | }
679 | 
680 | 
681 | 
682 | FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
683 | {
684 |     XXH_istate32_t* state = (XXH_istate32_t*) state_in;
685 |     const BYTE * p = (const BYTE*)state->mem32;
686 |     BYTE* bEnd = (BYTE*)(state->mem32) + state->memsize;
687 |     U32 h32;
688 | 
689 |     if (state->total_len >= 16)
690 |     {
691 |         h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
692 |     }
693 |     else
694 |     {
695 |         h32 = state->seed + PRIME32_5;
696 |     }
697 | 
698 |     h32 += (U32) state->total_len;
699 | 
700 |     while (p+4<=bEnd)
701 |     {
702 |         h32 += XXH_readLE32(p, endian) * PRIME32_3;
703 |         h32 = XXH_rotl32(h32, 17) * PRIME32_4;
704 |         p+=4;
705 |     }
706 | 
707 |     while (p<bEnd)
708 |     {
709 |         h32 += (*p) * PRIME32_5;
710 |         h32 = XXH_rotl32(h32, 11) * PRIME32_1;
711 |         p++;
712 |     }
713 | 
714 |     h32 ^= h32 >> 15;
715 |     h32 *= PRIME32_2;
716 |     h32 ^= h32 >> 13;
717 |     h32 *= PRIME32_3;
718 |     h32 ^= h32 >> 16;
719 | 
720 |     return h32;
721 | }
722 | 
723 | 
724 | U32 XXH32_digest (const XXH32_state_t* state_in)
725 | {
726 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
727 | 
728 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
729 |         return XXH32_digest_endian(state_in, XXH_littleEndian);
730 |     else
731 |         return XXH32_digest_endian(state_in, XXH_bigEndian);
732 | }
733 | 
734 | 
735 | FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
736 | {
737 |     XXH_istate64_t * state = (XXH_istate64_t *) state_in;
738 |     const BYTE* p = (const BYTE*)input;
739 |     const BYTE* const bEnd = p + len;
740 | 
741 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
742 |     if (input==NULL) return XXH_ERROR;
743 | #endif
744 | 
745 |     state->total_len += len;
746 | 
747 |     if (state->memsize + len < 32) // fill in tmp buffer
748 |     {
749 |         XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
750 |         state->memsize += (U32)len;
751 |         return XXH_OK;
752 |     }
753 | 
754 |     if (state->memsize) // some data left from previous update
755 |     {
756 |         XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
757 |         {
758 |             const U64* p64 = state->mem64;
759 |             state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
760 |             state->v1 = XXH_rotl64(state->v1, 31);
761 |             state->v1 *= PRIME64_1;
762 |             p64++;
763 |             state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
764 |             state->v2 = XXH_rotl64(state->v2, 31);
765 |             state->v2 *= PRIME64_1;
766 |             p64++;
767 |             state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
768 |             state->v3 = XXH_rotl64(state->v3, 31);
769 |             state->v3 *= PRIME64_1;
770 |             p64++;
771 |             state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
772 |             state->v4 = XXH_rotl64(state->v4, 31);
773 |             state->v4 *= PRIME64_1;
774 |             p64++;
775 |         }
776 |         p += 32-state->memsize;
777 |         state->memsize = 0;
778 |     }
779 | 
780 |     if (p+32 <= bEnd)
781 |     {
782 |         const BYTE* const limit = bEnd - 32;
783 |         U64 v1 = state->v1;
784 |         U64 v2 = state->v2;
785 |         U64 v3 = state->v3;
786 |         U64 v4 = state->v4;
787 | 
788 |         do
789 |         {
790 |             v1 += XXH_readLE64(p, endian) * PRIME64_2;
791 |             v1 = XXH_rotl64(v1, 31);
792 |             v1 *= PRIME64_1;
793 |             p+=8;
794 |             v2 += XXH_readLE64(p, endian) * PRIME64_2;
795 |             v2 = XXH_rotl64(v2, 31);
796 |             v2 *= PRIME64_1;
797 |             p+=8;
798 |             v3 += XXH_readLE64(p, endian) * PRIME64_2;
799 |             v3 = XXH_rotl64(v3, 31);
800 |             v3 *= PRIME64_1;
801 |             p+=8;
802 |             v4 += XXH_readLE64(p, endian) * PRIME64_2;
803 |             v4 = XXH_rotl64(v4, 31);
804 |             v4 *= PRIME64_1;
805 |             p+=8;
806 |         }
807 |         while (p<=limit);
808 | 
809 |         state->v1 = v1;
810 |         state->v2 = v2;
811 |         state->v3 = v3;
812 |         state->v4 = v4;
813 |     }
814 | 
815 |     if (p < bEnd)
816 |     {
817 |         XXH_memcpy(state->mem64, p, bEnd-p);
818 |         state->memsize = (int)(bEnd-p);
819 |     }
820 | 
821 |     return XXH_OK;
822 | }
823 | 
824 | XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
825 | {
826 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
827 | 
828 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
829 |         return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
830 |     else
831 |         return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
832 | }
833 | 
834 | 
835 | 
836 | FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
837 | {
838 |     XXH_istate64_t * state = (XXH_istate64_t *) state_in;
839 |     const BYTE * p = (const BYTE*)state->mem64;
840 |     BYTE* bEnd = (BYTE*)state->mem64 + state->memsize;
841 |     U64 h64;
842 | 
843 |     if (state->total_len >= 32)
844 |     {
845 |         U64 v1 = state->v1;
846 |         U64 v2 = state->v2;
847 |         U64 v3 = state->v3;
848 |         U64 v4 = state->v4;
849 | 
850 |         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
851 | 
852 |         v1 *= PRIME64_2;
853 |         v1 = XXH_rotl64(v1, 31);
854 |         v1 *= PRIME64_1;
855 |         h64 ^= v1;
856 |         h64 = h64*PRIME64_1 + PRIME64_4;
857 | 
858 |         v2 *= PRIME64_2;
859 |         v2 = XXH_rotl64(v2, 31);
860 |         v2 *= PRIME64_1;
861 |         h64 ^= v2;
862 |         h64 = h64*PRIME64_1 + PRIME64_4;
863 | 
864 |         v3 *= PRIME64_2;
865 |         v3 = XXH_rotl64(v3, 31);
866 |         v3 *= PRIME64_1;
867 |         h64 ^= v3;
868 |         h64 = h64*PRIME64_1 + PRIME64_4;
869 | 
870 |         v4 *= PRIME64_2;
871 |         v4 = XXH_rotl64(v4, 31);
872 |         v4 *= PRIME64_1;
873 |         h64 ^= v4;
874 |         h64 = h64*PRIME64_1 + PRIME64_4;
875 |     }
876 |     else
877 |     {
878 |         h64 = state->seed + PRIME64_5;
879 |     }
880 | 
881 |     h64 += (U64) state->total_len;
882 | 
883 |     while (p+8<=bEnd)
884 |     {
885 |         U64 k1 = XXH_readLE64(p, endian);
886 |         k1 *= PRIME64_2;
887 |         k1 = XXH_rotl64(k1,31);
888 |         k1 *= PRIME64_1;
889 |         h64 ^= k1;
890 |         h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
891 |         p+=8;
892 |     }
893 | 
894 |     if (p+4<=bEnd)
895 |     {
896 |         h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
897 |         h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
898 |         p+=4;
899 |     }
900 | 
901 |     while (p<bEnd)
902 |     {
903 |         h64 ^= (*p) * PRIME64_5;
904 |         h64 = XXH_rotl64(h64, 11) * PRIME64_1;
905 |         p++;
906 |     }
907 | 
908 |     h64 ^= h64 >> 33;
909 |     h64 *= PRIME64_2;
910 |     h64 ^= h64 >> 29;
911 |     h64 *= PRIME64_3;
912 |     h64 ^= h64 >> 32;
913
| 914 | return h64; 915 | } 916 | 917 | 918 | unsigned long long XXH64_digest (const XXH64_state_t* state_in) 919 | { 920 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; 921 | 922 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) 923 | return XXH64_digest_endian(state_in, XXH_littleEndian); 924 | else 925 | return XXH64_digest_endian(state_in, XXH_bigEndian); 926 | } 927 | 928 | 929 | -------------------------------------------------------------------------------- /xxhash.h: -------------------------------------------------------------------------------- 1 | /* 2 | xxHash - Extremely Fast Hash algorithm 3 | Header File 4 | Copyright (C) 2012-2014, Yann Collet. 5 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following disclaimer 15 | in the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | You can contact the author at : 31 | - xxHash source repository : http://code.google.com/p/xxhash/ 32 | */ 33 | 34 | /* Notice extracted from xxHash homepage : 35 | 36 | xxHash is an extremely fast Hash algorithm, running at RAM speed limits. 37 | It also successfully passes all tests from the SMHasher suite. 38 | 39 | Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) 40 | 41 | Name Speed Q.Score Author 42 | xxHash 5.4 GB/s 10 43 | CrapWow 3.2 GB/s 2 Andrew 44 | MumurHash 3a 2.7 GB/s 10 Austin Appleby 45 | SpookyHash 2.0 GB/s 10 Bob Jenkins 46 | SBox 1.4 GB/s 9 Bret Mulvey 47 | Lookup3 1.2 GB/s 9 Bob Jenkins 48 | SuperFastHash 1.2 GB/s 1 Paul Hsieh 49 | CityHash64 1.05 GB/s 10 Pike & Alakuijala 50 | FNV 0.55 GB/s 5 Fowler, Noll, Vo 51 | CRC32 0.43 GB/s 9 52 | MD5-32 0.33 GB/s 10 Ronald L. Rivest 53 | SHA1-32 0.28 GB/s 10 54 | 55 | Q.Score is a measure of quality of the hash function. 56 | It depends on successfully passing SMHasher test set. 57 | 10 is a perfect score. 
58 | */
59 | 
60 | #pragma once
61 | 
62 | #if defined (__cplusplus)
63 | extern "C" {
64 | #endif
65 | 
66 | 
67 | /*****************************
68 |    Includes
69 | *****************************/
70 | #include <stddef.h>   /* size_t */
71 | 
72 | 
73 | /*****************************
74 |    Type
75 | *****************************/
76 | typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
77 | 
78 | 
79 | 
80 | /*****************************
81 |    Simple Hash Functions
82 | *****************************/
83 | 
84 | unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
85 | unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
86 | 
87 | /*
88 | XXH32() :
89 |     Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
90 |     The memory between input & input+length must be valid (allocated and read-accessible).
91 |     "seed" can be used to alter the result predictably.
92 |     This function successfully passes all SMHasher tests.
93 |     Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
94 | XXH64() :
95 |     Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
96 | */
97 | 
98 | 
99 | 
100 | /*****************************
101 |    Advanced Hash Functions
102 | *****************************/
103 | typedef struct { long long ll[ 6]; } XXH32_state_t;
104 | typedef struct { long long ll[11]; } XXH64_state_t;
105 | 
106 | /*
107 | These structures allow static allocation of XXH states.
108 | States must then be initialized using XXHnn_reset() before first use.
109 | 
110 | If you prefer dynamic allocation, please refer to functions below.
111 | */
112 | 
113 | XXH32_state_t* XXH32_createState(void);
114 | XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
115 | 
116 | XXH64_state_t* XXH64_createState(void);
117 | XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
118 | 
119 | /*
120 | These functions create and release memory for XXH state.
121 | States must then be initialized using XXHnn_reset() before first use.
122 | */
123 | 
124 | 
125 | XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
126 | XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
127 | unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
128 | 
129 | XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
130 | XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
131 | unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
132 | 
133 | /*
134 | These functions calculate the xxHash of an input provided in multiple smaller packets,
135 | as opposed to an input provided as a single block.
136 | 
137 | XXH state space must first be allocated, using either static or dynamic method provided above.
138 | 
139 | Start a new hash by initializing state with a seed, using XXHnn_reset().
140 | 
141 | Then, feed the hash state by calling XXHnn_update() as many times as necessary.
142 | Obviously, input must be valid, meaning allocated and read accessible.
143 | The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
144 | 
145 | Finally, you can produce a hash anytime, by using XXHnn_digest().
146 | This function returns the final nn-bits hash.
147 | You can nonetheless continue feeding the hash state with more input,
148 | and therefore get some new hashes, by calling again XXHnn_digest().
149 | 150 | When you are done, don't forget to free XXH state space, using typically XXHnn_freeState(). 151 | */ 152 | 153 | 154 | #if defined (__cplusplus) 155 | } 156 | #endif 157 | --------------------------------------------------------------------------------
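
For orientation, the queue API declared in queue.h above is exercised per thread: one thread initializes the queue, every participating thread registers a handle, and all subsequent operations go through that handle. The following is a minimal single-threaded sketch, not a file from this repository (the real multi-threaded driver is harness.c with pairwise.c or halfhalf.c); it assumes `align_malloc` and `PAGE_SIZE` from align.h, as used by wfqueue.c, and compilation with `-DWFQUEUE` so queue.h resolves to the wait-free queue.

/* usage sketch (editor's illustration, not repository code):
 *   cc -pthread -D_GNU_SOURCE -DWFQUEUE example.c wfqueue.c -o example */
#include <string.h>
#include "queue.h"   /* queue_t, handle_t, queue_init, enqueue, dequeue */
#include "align.h"   /* align_malloc, PAGE_SIZE (assumed from this repo) */

int main(void) {
  /* queue_t and handle_t are cache-aligned structs, so allocate aligned. */
  queue_t  *q = align_malloc(PAGE_SIZE, sizeof(queue_t));
  handle_t *h = align_malloc(PAGE_SIZE, sizeof(handle_t));
  memset(h, 0, sizeof(handle_t));   /* zero fields queue_register leaves untouched */

  queue_init(q, 1);                 /* once, before any thread uses the queue */
  queue_register(q, h, 0);          /* once per thread, with a unique id */

  long x = 42;
  enqueue(q, h, &x);                /* payloads are opaque void * values */
  void *v = dequeue(q, h);          /* returns EMPTY when the queue is empty */

  queue_free(q, h);
  return v == &x ? 0 : 1;
}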