├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── align.h
├── benchmark
├── benchmark.h
├── bits.h
├── ccqueue.c
├── ccqueue.h
├── ccsynch.h
├── cpumap.h
├── delay.c
├── delay.h
├── driver
├── faa.c
├── halfhalf.c
├── harness.c
├── hzdptr.c
├── hzdptr.h
├── lcrq.c
├── lcrq.h
├── msqueue.c
├── msqueue.h
├── pairwise.c
├── primitives.h
├── queue.h
├── wfqueue.c
├── wfqueue.h
├── xxhash.c
└── xxhash.h

/.gitignore:
--------------------------------------------------------------------------------
1 | # Output files
2 | ccqueue
3 | delay
4 | faa
5 | lcrq
6 | msqueue
7 | wfqueue
8 | wfqueue0
9 |
10 | # Object files
11 | *.o
12 | *.ko
13 | *.obj
14 | *.elf
15 |
16 | # Precompiled Headers
17 | *.gch
18 | *.pch
19 |
20 | # Libraries
21 | *.lib
22 | *.a
23 | *.la
24 | *.lo
25 |
26 | # Shared objects (inc. Windows DLLs)
27 | *.dll
28 | *.so
29 | *.so.*
30 | *.dylib
31 |
32 | # Executables
33 | *.exe
34 | *.out
35 | *.app
36 | *.i*86
37 | *.x86_64
38 | *.hex
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Chaoran Yang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | TESTS = wfqueue wfqueue0 lcrq ccqueue msqueue faa delay
2 |
3 | CC = gcc
4 | CFLAGS = -g -Wall -O3 -pthread -D_GNU_SOURCE
5 | LDLIBS = -lpthread -lm
6 |
7 | ifeq (${VERIFY}, 1)
8 | CFLAGS += -DVERIFY
9 | endif
10 |
11 | ifeq (${SANITIZE}, 1)
12 | CFLAGS += -fsanitize=address -fno-omit-frame-pointer
13 | LDLIBS += -lasan
14 | LDFLAGS = -fsanitize=address
15 | endif
16 |
17 | ifdef JEMALLOC_PATH
18 | LDFLAGS += -L${JEMALLOC_PATH}/lib -Wl,-rpath,${JEMALLOC_PATH}/lib
19 | LDLIBS += -ljemalloc
20 | endif
21 |
22 | all: $(TESTS)
23 |
24 | wfqueue0: CFLAGS += -DMAX_PATIENCE=0
25 | wfqueue0.o: wfqueue.c
26 | 	$(CC) $(CFLAGS) -c -o $@ $^
27 |
28 | haswell: CFLAGS += -DGUADALUPE_COMPACT
29 | haswell: all
30 |
31 | mic: CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
32 | mic: CFLAGS += -DGUADALUPE_MIC_COMPACT -DLOGN_OPS=6
33 | mic biou: $(filter-out lcrq,$(TESTS))
34 |
35 | biou: CFLAGS += -DBIOU_COMPACT
36 |
37 | wfqueue wfqueue0: CFLAGS += -DWFQUEUE
38 | lcrq: CFLAGS += -DLCRQ
39 | ccqueue: CFLAGS += -DCCQUEUE
40 | msqueue: CFLAGS += -DMSQUEUE
41 | faa: CFLAGS += -DFAAQ
42 | delay: CFLAGS += -DDELAY
43 |
44 | $(TESTS): harness.o
45 | ifeq (${HALFHALF}, 1)
46 | $(TESTS): halfhalf.o
47 | else
48 | $(TESTS): pairwise.o
49 | endif
50 |
51 | msqueue lcrq: hzdptr.o xxhash.o
52 |
53 | clean:
54 | 	rm -f $(TESTS) *.o
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fast Wait Free Queue
2 |
3 | This is a benchmark framework for evaluating the performance of concurrent queues. Currently, it contains four concurrent queue implementations:
4 |
5 | - A fast wait-free queue `wfqueue`,
6 | - Morrison and Afek's `lcrq`,
7 | - Fatourou and Kallimanis's `ccqueue`, and
8 | - Michael and Scott's `msqueue`.
9 |
10 | The benchmark framework also includes a synthetic queue benchmark, `faa`, which emulates both an enqueue and a dequeue with a single `fetch-and-add` primitive, to measure the performance of `fetch-and-add` on a system.
11 |
12 | The framework's default benchmark is `pairwise`, in which all threads repeatedly execute pairs of enqueue and dequeue operations. Between two operations, `pairwise` calls a delay routine that adds a random delay (roughly 50–150 ns) to avoid artificial "long run" scenarios, in which a cache line is held by one thread for a long time. A second benchmark, `halfhalf`, in which each thread performs a random 50/50 mix of enqueues and dequeues, can be selected by building with `HALFHALF=1 make`.
13 |
14 | ## Requirements
15 |
16 | - **GCC 4.1.0 or later (GCC 4.7.3 or later is recommended)**: the current implementations use GCC `__atomic` or `__sync` primitives for atomic memory access.
17 | - **Linux kernel 2.5.8 or later**
18 | - **glibc 2.3 or later**: we use `sched_setaffinity` to bind threads to cores.
19 | - **atomic `CAS2`**: `lcrq` requires `CAS2`, a 16-byte-wide `compare-and-swap` primitive. This is available on most recent Intel processors and IBM Power8.
20 | - **jemalloc** (optional): `jemalloc` eliminates the memory allocator as a bottleneck. You can link with `jemalloc` by setting the `JEMALLOC_PATH` environment variable to the path where your `jemalloc` is installed.
21 |
22 | ## How to install
23 |
24 | Download one of the released source code tarballs, then execute the following commands. The filename may differ depending on the name of the tarball you have downloaded.
25 | ```
26 | $ tar zxf fast-wait-free-queue-1.0.0.tar.gz
27 | $ cd fast-wait-free-queue-1.0.0
28 | $ make
29 | ```
30 |
31 | This should generate 7 binaries (or 6 if your system does not support `CAS2`, in which case `lcrq` will fail to compile): `wfqueue`, `wfqueue0`, `lcrq`, `ccqueue`, `msqueue`, `faa`, and `delay`. These are the `pairwise` benchmark compiled using different queue implementations.
32 | - `wfqueue0`: the same as `wfqueue` except that its `MAX_PATIENCE` is set to `0`.
33 | - `delay`: a synthetic benchmark used to measure the time spent in the delay routine.
34 |
35 | ## How to run
36 |
37 | You can execute a binary directly, using the number of threads as an argument. Without an argument, the execution will use all available cores on the system.
38 |
39 | For example,
40 | ```
41 | ./wfqueue 8
42 | ```
43 | runs `wfqueue` with 8 threads.
44 |
45 | If you would like to verify the results, compile the binaries with `VERIFY=1 make`. Executing a binary directly will then print either `PASSED` or error messages.
46 |
47 | You can also use the `driver` script, which invokes a binary up to 10 times and reports, after each run, the **running time of that run**, the **mean running time** so far, the **standard deviation**, and the **margin of error** (both absolute and relative).
48 | The script terminates when the relative **margin of error** is small enough (**< 0.02**, after at least 5 runs), or after it has invoked the binary 10 times.
49 |
50 | For example,
51 | ```
52 | ./driver ./wfqueue 8
53 | ```
54 | runs `wfqueue` with 8 threads up to 10 times and collects the statistics described above.
55 |
56 | You can use the `benchmark` script, which invokes `driver` on all combinations of a list of binaries and a list of thread counts, and reports the `mean running time` and `margin of error` for each combination. You can specify the list of binaries using the environment variable `TESTS`, and the list of thread counts using the environment variable `PROCS`.
57 |
58 | The generated output of `benchmark` can be used as a datafile for gnuplot. The first column of `benchmark`'s output is the number of threads; then each pair of columns holds the `mean running time` and `margin of error` of one queue implementation, in the same order as specified in `TESTS`.
59 |
60 | For example,
61 | ```
62 | TESTS=wfqueue:lcrq:faa:delay PROCS=1:2:4:8 ./benchmark
63 | ```
64 | runs each of `wfqueue`, `lcrq`, `faa`, and `delay` using 1, 2, 4, and 8 threads.
65 |
66 | Then you can plot them using,
67 | ```
68 | set logscale x 2
69 | plot "t" using 1:(20000/($2-$8)) t "wfqueue" w lines, \
70 |      "t" using 1:(20000/($4-$8)) t "lcrq" w lines, \
71 |      "t" using 1:(20000/($6-$8)) t "faa" w lines
72 | ```
73 |
74 | ## How to map threads to cores
75 |
76 | By default, the framework maps the thread with id `i` to the core with id `i % p`, where *p* is the number of available cores on the system; you can check each core's id in `/proc/cpuinfo`.
77 |
78 | To implement a custom mapping, you can add a `cpumap` function to `cpumap.h`. The signature of `cpumap` is
79 | ```
80 | int cpumap(int id, int nprocs)
81 | ```
82 | where `id` is the id of the current thread and `nprocs` is the number of threads. `cpumap` should return the corresponding core id for the thread. `cpumap.h` contains several examples of `cpumap` functions. You should guard the definition of an added `cpumap` with a conditional macro, and add that macro to `CFLAGS` in the Makefile.
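
For instance, the following hypothetical mapping spreads threads round-robin across two 18-core sockets; the `MY_SPREAD` guard macro and the core layout (cores 0–17 on socket 0, cores 18–35 on socket 1) are made up for illustration:
```
#ifdef MY_SPREAD
int cpumap(int id, int nprocs)
{
  /* even thread ids go to socket 0, odd ids to socket 1 */
  return (id % 2) * 18 + (id / 2) % 18;
}
#endif
```
You would then add `-DMY_SPREAD` to `CFLAGS`, following the pattern of the existing `GUADALUPE_*` and `BIOU_*` examples.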
83 |
84 | ## How to add a new queue implementation
85 |
86 | We use a generic pointer `void *` to represent a value that can be stored in the queue.
87 | A queue should implement the queue interface, defined in `queue.h`:
88 |
89 | - `queue_t`: the struct type of the queue,
90 | - `handle_t`: a thread's handle to the queue, used to store thread-local state,
91 | - `void queue_init(queue_t * q, int nprocs)`: initializes the queue; called only once,
92 | - `void queue_register(queue_t * q, handle_t * th, int id)`: initializes a thread's handle; called by every thread that uses the queue,
93 | - `void enqueue(queue_t * q, handle_t * th, void * val)`: enqueues a value,
94 | - `void * dequeue(queue_t * q, handle_t * th)`: dequeues a value,
95 | - `void queue_free(queue_t * q, handle_t * h)`: deallocates the queue and cleans up all resources associated with it,
96 | - `EMPTY`: a value that is returned when a `dequeue` fails; this should be a macro defined in the header file. A minimal example of the whole interface is sketched below.
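
As a concrete illustration, here is a minimal sketch of the interface implemented with a single global lock (so it is blocking, not wait-free); the file name `lockqueue.h`, the type names, and the lock-based design are ours, for illustration only, and are not part of the framework:
```
/* lockqueue.h -- minimal illustration of the queue interface */
#include <stdlib.h>
#include <pthread.h>

#define EMPTY ((void *) -1)

typedef struct _lnode_t {
  struct _lnode_t * next;
  void * data;
} lnode_t;

typedef struct {
  pthread_mutex_t lock;
  lnode_t * head;
  lnode_t * tail;
} queue_t;

typedef int handle_t; /* this queue keeps no thread-local state */

void queue_init(queue_t * q, int nprocs)
{
  pthread_mutex_init(&q->lock, NULL);
  q->head = q->tail = NULL;
}

void queue_register(queue_t * q, handle_t * th, int id) { *th = id; }

void enqueue(queue_t * q, handle_t * th, void * val)
{
  lnode_t * node = malloc(sizeof(lnode_t));
  node->data = val;
  node->next = NULL;

  pthread_mutex_lock(&q->lock);
  if (q->tail) q->tail->next = node;
  else q->head = node;
  q->tail = node;
  pthread_mutex_unlock(&q->lock);
}

void * dequeue(queue_t * q, handle_t * th)
{
  pthread_mutex_lock(&q->lock);
  lnode_t * node = q->head;
  void * val = node ? node->data : EMPTY;
  if (node) {
    q->head = node->next;
    if (q->head == NULL) q->tail = NULL;
  }
  pthread_mutex_unlock(&q->lock);

  free(node); /* free(NULL) is a no-op */
  return val;
}

void queue_free(queue_t * q, handle_t * h)
{
  /* the benchmarks call this once per thread; nothing to release here */
}
```
To benchmark such a queue you would also add a target for it to the Makefile, as the existing queues do.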
97 |
98 | ## How to add a new benchmark
99 |
100 | A benchmark should implement the benchmark interface, defined in `benchmark.h`, and interact with a queue using the queue interface.
101 | The benchmark interface includes:
102 |
103 | - `void init(int nprocs, int n)`: performs one-time initialization of the benchmark; called once at the beginning.
104 | - `void thread_init(int id, int nprocs)`: performs thread-local initialization of the benchmark; called once per thread, after `init` but before `benchmark`.
105 | - `void * benchmark(int id, int nprocs)`: runs the benchmark once; called by each thread, possibly several times. Each call is timed and reported as one iteration. It can return a result, which will be passed to `verify` to check correctness.
106 | - `void thread_exit(int id, int nprocs)`: performs thread-local cleanup; called once per thread after its last `benchmark` call.
107 | - `int verify(int nprocs, void ** results)`: should verify each thread's result and return `0` on success and a non-zero value on error.
108 |
--------------------------------------------------------------------------------
/align.h:
--------------------------------------------------------------------------------
1 | #ifndef ALIGN_H
2 | #define ALIGN_H
3 |
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string.h>
7 |
8 | #define PAGE_SIZE 4096
9 | #define CACHE_LINE_SIZE 64
10 | #define CACHE_ALIGNED __attribute__((aligned(CACHE_LINE_SIZE)))
11 | #define DOUBLE_CACHE_ALIGNED __attribute__((aligned(2 * CACHE_LINE_SIZE)))
12 |
13 | static inline void * align_malloc(size_t align, size_t size)
14 | {
15 |   void * ptr;
16 |
17 |   int ret = posix_memalign(&ptr, align, size);
18 |   if (ret != 0) {
19 |     fprintf(stderr, "%s\n", strerror(ret));
20 |     abort();
21 |   }
22 |
23 |   return ptr;
24 | }
25 |
26 | #endif /* end of include guard: ALIGN_H */
--------------------------------------------------------------------------------
/benchmark:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ -z "$TESTS" ]; then
4 |   TESTS=(wfqueue wfqueue0 faa lcrq ccqueue msqueue delay)
5 | else
6 |   IFS=':' read -r -a TESTS <<< "${TESTS}"
7 | fi
8 |
9 | if [ -z "$PROCS" ]; then
10 |   PROCS=(1 2 4 8)
11 | else
12 |   IFS=':' read -r -a PROCS <<< "${PROCS}"
13 | fi
14 |
15 | printf '#! Host: %s\n' $( hostname )
16 | printf '#! Benchmarks: %s\n' "${TESTS[*]}"
17 | printf '#! Threads: %s\n' "${PROCS[*]}"
18 |
19 | for j in ${PROCS[@]}; do
20 |   printf '%d' $j
21 |   for i in ${TESTS[@]}; do
22 |     echo -ne \
23 |       "$(./driver ./$i $j | tail -n 1 | awk '{printf " %.2f %.2f", $3, $5}')"
24 |   done
25 |   printf '\n'
26 | done
--------------------------------------------------------------------------------
/benchmark.h:
--------------------------------------------------------------------------------
1 | #ifndef BENCHMARK_H
2 | #define BENCHMARK_H
3 |
4 | extern void init(int nprocs, int n);
5 | extern void thread_init(int id, int nprocs);
6 | extern void * benchmark(int id, int nprocs);
7 | extern void thread_exit(int id, int nprocs);
8 | extern int verify(int nprocs, void ** results);
9 |
10 | #endif /* end of include guard: BENCHMARK_H */
--------------------------------------------------------------------------------
/bits.h:
--------------------------------------------------------------------------------
1 | #ifndef BITS_H
2 | #define BITS_H
3 |
4 | #include <stdint.h>
5 |
6 | static void * bits_join(int hi, int lo)
7 | {
8 |   intptr_t int64 = hi;
9 |   int64 <<= 32;
10 |   int64 += lo;
11 |   return (void *) int64;
12 | }
13 |
14 | static int bits_lo(void * ptr)
15 | {
16 |   intptr_t int64 = (intptr_t) ptr;
17 |   int64 &= 0x00000000ffffffff;
18 |   return (int) int64;
19 | }
20 |
21 | static int bits_hi(void * ptr)
22 | {
23 |   intptr_t int64 = (intptr_t) ptr;
24 |   int64 >>= 32;
25 |   return (int) int64;
26 | }
27 |
28 | #endif /* end of include guard: BITS_H */
--------------------------------------------------------------------------------
/ccqueue.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include "delay.h"
4 | #include "ccqueue.h"
5 |
6 | static inline
7 | void serialEnqueue(void * state, void * data)
8 | {
9 |   node_t * volatile * tail = (node_t **) state;
10 |   node_t * node = (node_t *) data;
11 |
12 |   (*tail)->next = node;
13 |   *tail = node;
14 | }
15 |
16 | static inline
17 | void serialDequeue(void * state, void * data)
18 | {
19 |   node_t * volatile * head = (node_t **) state;
20 |   node_t ** ptr = (node_t **) data;
21 |
22 |   node_t * node = *head;
23 |   node_t * next = node->next;
24 |
25 |   if (next) {
26 |     node->data = next->data;
27 |     *head = next;
28 |   } else {
29 |     node = (void *) -1;
30 |   }
31 |
32 |   *ptr = node;
33 | }
34 |
35 | void queue_init(queue_t * queue, int nprocs)
36 | {
37 |   ccsynch_init(&queue->enq);
38 |   ccsynch_init(&queue->deq);
39 |
40 |   node_t * dummy = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
41 |   dummy->data = 0;
42 |   dummy->next = NULL;
43 |
44 |   queue->head = dummy;
45 |   queue->tail = dummy;
46 | }
47 |
48 | void queue_register(queue_t * queue, handle_t * handle, int id)
49 | {
50 |   ccsynch_handle_init(&handle->enq);
51 |   ccsynch_handle_init(&handle->deq);
52 |
53 |   handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
54 | }
55 |
56 | void enqueue(queue_t * queue, handle_t * handle, void * data)
57 | {
58 |   node_t * node = handle->next;
59 |
60 |   if (node) handle->next = NULL;
61 |   else node = align_malloc(CACHE_LINE_SIZE, sizeof(node_t));
62 |
63 |   node->data = data;
64 |   node->next = NULL;
65 |
66 |   ccsynch_apply(&queue->enq, &handle->enq, &serialEnqueue, &queue->tail, node);
67 | }
68 |
69 | void * dequeue(queue_t * queue, handle_t * handle)
70 | {
71 |   node_t * node;
72 |   ccsynch_apply(&queue->deq, &handle->deq, &serialDequeue, &queue->head, &node);
73 |
74 |   void * data;
75 |
76 |   if (node == (void *) -1) {
77 |     data = (void *) -1;
78 |   } else {
79 |     data = node->data;
80 |     if (handle->next) free(node);
81 |     else handle->next = node;
82 |   }
83 |
84 |   return data;
85 | }
86 |
87 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/ccqueue.h:
--------------------------------------------------------------------------------
1 | #ifndef CCQUEUE_H
2 | #define CCQUEUE_H
3 |
4 | #ifdef CCQUEUE
5 | #include "ccsynch.h"
6 |
7 | #define EMPTY (void *) -1
8 |
9 | typedef struct _node_t {
10 |   struct _node_t * next CACHE_ALIGNED;
11 |   void * volatile data;
12 | } node_t;
13 |
14 | typedef struct _queue_t {
15 |   ccsynch_t enq DOUBLE_CACHE_ALIGNED;
16 |   ccsynch_t deq DOUBLE_CACHE_ALIGNED;
17 |   node_t * head DOUBLE_CACHE_ALIGNED;
18 |   node_t * tail DOUBLE_CACHE_ALIGNED;
19 | } queue_t DOUBLE_CACHE_ALIGNED;
20 |
21 | typedef struct _handle_t {
22 |   ccsynch_handle_t enq;
23 |   ccsynch_handle_t deq;
24 |   node_t * next;
25 | } handle_t DOUBLE_CACHE_ALIGNED;
26 |
27 | #endif
28 |
29 | #endif /* end of include guard: CCQUEUE_H */
--------------------------------------------------------------------------------
/ccsynch.h:
--------------------------------------------------------------------------------
1 | #ifndef _CCSYNCH_H_
2 | #define _CCSYNCH_H_
3 |
4 | #include <stdlib.h>
5 | #include "align.h"
6 | #include "primitives.h"
7 |
8 | typedef struct _ccsynch_node_t {
9 |   struct _ccsynch_node_t * volatile next CACHE_ALIGNED;
10 |   void * volatile data;
11 |   int volatile status CACHE_ALIGNED;
12 | } ccsynch_node_t;
13 |
14 | typedef struct _ccsynch_handle_t {
15 |   struct _ccsynch_node_t * next;
16 | } ccsynch_handle_t;
17 |
18 | typedef struct _ccsynch_t {
19 |   struct _ccsynch_node_t * volatile tail DOUBLE_CACHE_ALIGNED;
20 | } ccsynch_t;
21 |
22 | #define CCSYNCH_WAIT  0x0
23 | #define CCSYNCH_READY 0x1
24 | #define CCSYNCH_DONE  0x3
25 |
26 | static inline
27 | void ccsynch_apply(ccsynch_t * synch, ccsynch_handle_t * handle,
28 |     void (*apply)(void *, void *), void * state, void * data)
29 | {
30 |   ccsynch_node_t * next = handle->next;
31 |   next->next = NULL;
32 |   next->status = CCSYNCH_WAIT;
33 |
34 |   ccsynch_node_t * curr = SWAPra(&synch->tail, next);
35 |   handle->next = curr;
36 |
37 |   int status = ACQUIRE(&curr->status);
38 |
39 |   if (status == CCSYNCH_WAIT) {
40 |     curr->data = data;
41 |     RELEASE(&curr->next, next);
42 |
43 |     do {
44 |       PAUSE();
45 |       status = ACQUIRE(&curr->status);
46 |     } while (status == CCSYNCH_WAIT);
47 |   }
48 |
49 |   if (status != CCSYNCH_DONE) {
50 |     apply(state, data);
51 |
52 |     curr = next;
53 |     next = ACQUIRE(&curr->next);
54 |
55 |     int count = 0;
56 |     const int CCSYNCH_HELP_BOUND = 256;
57 |
58 |     while (next && count++ < CCSYNCH_HELP_BOUND) {
59 |       apply(state, curr->data);
60 |       RELEASE(&curr->status, CCSYNCH_DONE);
61 |
62 |       curr = next;
63 |       next = ACQUIRE(&curr->next);
64 |     }
65 |
66 |     RELEASE(&curr->status, CCSYNCH_READY);
67 |   }
68 | }
69 |
70 | static inline void ccsynch_init(ccsynch_t * synch)
71 | {
72 |   ccsynch_node_t * node = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t));
73 |   node->next = NULL;
74 |   node->status = CCSYNCH_READY;
75 |
76 |   synch->tail = node;
77 | }
78 |
79 | static inline void ccsynch_handle_init(ccsynch_handle_t * handle)
80 | {
81 |   handle->next = align_malloc(CACHE_LINE_SIZE, sizeof(ccsynch_node_t));
82 | }
83 |
84 | #endif
--------------------------------------------------------------------------------
/cpumap.h:
--------------------------------------------------------------------------------
1 | #ifndef CPUMAP_H
2 | #define CPUMAP_H
3 |
4 | #include <sched.h>
5 |
6 | #ifdef GUADALUPE_SPREAD
7 | int cpumap(int i, int nprocs)
8 | {
9 |   return (i / 36) * 36 + (i % 2) * 18 + (i % 36 / 2);
10 | }
11 |
12 | #elif GUADALUPE_OVERSUB
13 | int cpumap(int i, int nprocs) {
14 |   return (i % 18);
15 | }
16 |
17 | #elif GUADALUPE_COMPACT
18 | int cpumap(int i, int nprocs)
19 | {
20 |   return (i % 2) * 36 + i / 2;
21 | }
22 |
23 | #elif GUADALUPE_MIC_COMPACT
24 | int cpumap(int i, int nprocs)
25 | {
26 |   return (i + 1) % 228;
27 | }
28 |
29 | #elif LES_SPREAD
30 | int cpumap(int i, int nprocs)
31 | {
32 |   return i % 4 * 12 + i / 4 % 12;
33 | }
34 |
35 | #elif BIOU_COMPACT
36 | int cpumap(int i, int nprocs)
37 | {
38 |   return (i % 2) * 32 + i / 2;
39 | }
40 |
41 | #else
42 | int cpumap(int id, int nprocs)
43 | {
44 |   return id % nprocs;
45 | }
46 |
47 | #endif
48 |
49 | #endif /* end of include guard: CPUMAP_H */
--------------------------------------------------------------------------------
/delay.c:
--------------------------------------------------------------------------------
1 | #include "queue.h"
2 | #include "primitives.h"
3 |
4 | void queue_init(queue_t * q, int nprocs) {}
5 | void queue_register(queue_t * q, handle_t * hd, int id)
6 | {
7 |   *hd = id + 1;
8 | }
9 |
10 | void enqueue(queue_t * q, handle_t * th, void * val)
11 | {
12 | }
13 |
14 | void * dequeue(queue_t * q, handle_t * th)
15 | {
16 |   return (void *) (long) *th;
17 | }
18 |
19 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/delay.h:
--------------------------------------------------------------------------------
1 | #ifndef DELAY_H
2 | #define DELAY_H
3 |
4 | #include <stdlib.h>
5 |
6 | typedef struct drand48_data delay_t;
7 |
8 | static inline void delay_init(delay_t * state, int id)
9 | {
10 |   srand48_r(id, state);
11 | }
12 |
13 | static inline void delay_exec(delay_t * state)
14 | {
15 |   long n;
16 |   lrand48_r(state, &n);
17 |
18 |   int j;
19 |   for (j = 50; j < 50 + n % 100; ++j) {
20 |     __asm__ ("nop");
21 |   }
22 | }
23 |
24 | #endif /* end of include guard: DELAY_H */
--------------------------------------------------------------------------------
/driver:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | T90=( \
4 |   6.314 2.920 2.353 2.132 2.015 1.943 1.895 1.860 1.833 1.812 \
5 |   1.796 1.782 1.771 1.761 1.753 1.746 1.740 1.734 1.729 1.725 \
6 |   1.721 1.717 1.714 1.711 1.708 1.706 1.703 1.701 1.699 1.697 \
7 | )
8 |
9 | T95=( \
10 |   12.71 4.303 3.182 2.776 2.571 2.447 2.365 2.306 2.262 2.228 \
11 |   2.201 2.179 2.160 2.145 2.131 2.120 2.110 2.101 2.093 2.086 \
12 |   2.080 2.074 2.069 2.064 2.060 2.056 2.052 2.048 2.045 2.042 \
13 | )
14 |
15 | TIMES[0]=$($@ | grep Mean | awk '{ print $5 }')
16 | SUM=${TIMES[0]}
17 | printf '#%-2d %.2f\n' 1 ${TIMES[0]}
18 |
19 | i=1
20 | while true; do
21 |   TIME=$($@ | grep Mean | awk '{ print $5 }')
22 |   TIMES[$i]=$TIME
23 |   SUM=$(echo "$SUM + $TIME" | bc)
24 |   N=$(($i + 1))
25 |
26 |   MEAN=$(echo "$SUM / $N" | bc -l)
27 |
28 |   STD=0
29 |   for j in "${TIMES[@]}"; do
30 |     STD=$(echo "($j - $MEAN) ^ 2 + $STD" | bc -l)
31 |   done
32 |   STD=$(echo "sqrt ($STD / $i)" | bc -l)
33 |
34 |   ERR=$(echo "${T95[$i]} * $STD / sqrt($N)" | bc -l)
35 |   PRECISION=$(echo "$ERR / $MEAN" | bc -l)
36 |
37 |   printf '#%-2d %.2f %.2f %.4f %.2f %.3f\n' \
38 |     $N $TIME $MEAN $STD $ERR $PRECISION
39 |
40 |   if (($N >= 10 || $N >= 5 && $(echo "$PRECISION < 0.02" | bc) == 1)); then
41 |     break
42 |   else
43 |     i=$N
44 |   fi
45 | done
46 |
--------------------------------------------------------------------------------
/faa.c:
--------------------------------------------------------------------------------
1 | #include "queue.h"
2 | #include "primitives.h"
3 |
4 | void queue_init(queue_t * q, int nprocs) {}
5 | void queue_register(queue_t * q, handle_t * hd, int id)
6 | {
7 |   *hd = id + 1;
8 | }
9 |
10 | void enqueue(queue_t * q, handle_t * th, void * val)
11 | {
12 |   FAA(&q->P, 1);
13 | }
14 |
15 | void * dequeue(queue_t * q, handle_t * th)
16 | {
17 |   FAA(&q->C, 1);
18 |   return (void *) (long) *th;
19 | }
20 |
21 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/halfhalf.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdint.h>
4 | #include "delay.h"
5 | #include "queue.h"
6 |
7 | #ifndef LOGN_OPS
8 | #define LOGN_OPS 7
9 | #endif
10 |
11 | static long nops;
12 | static queue_t * q;
13 | static handle_t ** hds;
14 |
15 | void init(int nprocs, int logn) {
16 |   /** Use 10^7 as default input size. */
17 |   if (logn == 0) logn = LOGN_OPS;
18 |
19 |   /** Compute the number of ops to perform. */
20 |   nops = 1;
21 |   int i;
22 |   for (i = 0; i < logn; ++i) {
23 |     nops *= 10;
24 |   }
25 |
26 |   printf(" Number of operations: %ld\n", nops);
27 |
28 |   q = align_malloc(PAGE_SIZE, sizeof(queue_t));
29 |   queue_init(q, nprocs);
30 |
31 |   hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
32 | }
33 |
34 | void thread_init(int id, int nprocs) {
35 |   hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t));
36 |   queue_register(q, hds[id], id);
37 | }
38 |
39 | void thread_exit(int id, int nprocs) {
40 |   queue_free(q, hds[id]);
41 | }
42 |
43 | void * benchmark(int id, int nprocs) {
44 |   void * val = (void *) (intptr_t) (id + 1);
45 |   handle_t * th = hds[id];
46 |
47 |   delay_t state;
48 |   delay_init(&state, id);
49 |
50 |   struct drand48_data rstate;
51 |   srand48_r(id, &rstate);
52 |
53 |   int i;
54 |   for (i = 0; i < nops / nprocs; ++i) {
55 |     long n;
56 |     lrand48_r(&rstate, &n);
57 |
58 |     if (n % 2 == 0)
59 |       enqueue(q, th, val);
60 |     else
61 |       dequeue(q, th);
62 |
63 |     delay_exec(&state);
64 |   }
65 |
66 |   return val;
67 | }
68 |
69 | int verify(int nprocs, void ** results) {
70 |   return 0;
71 | }
--------------------------------------------------------------------------------
/harness.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <unistd.h>
4 | #include <pthread.h>
5 | #include <math.h>
6 | #include <limits.h>
7 | #include <sched.h>
8 | #include <sys/time.h>
9 | #include "bits.h"
10 | #include "cpumap.h"
11 | #include "benchmark.h"
12 |
13 | #ifndef NUM_ITERS
14 | #define NUM_ITERS 5
15 | #endif
16 |
17 | #ifndef MAX_PROCS
18 | #define MAX_PROCS 512
19 | #endif
20 |
21 | #ifndef MAX_ITERS
22 | #define MAX_ITERS 20
23 | #endif
24 |
25 | #ifndef COV_THRESHOLD
26 | #define COV_THRESHOLD 0.02
27 | #endif
28 |
29 | static pthread_barrier_t barrier;
30 | static double times[MAX_ITERS];
31 | static double means[MAX_ITERS];
32 | static double covs[MAX_ITERS];
33 | static volatile int target;
34 |
35 | static size_t elapsed_time(size_t us)
36 | {
37 |   struct timeval t;
38 |   gettimeofday(&t, NULL);
39 |   return t.tv_sec * 1000000 + t.tv_usec - us;
40 | }
41 |
42 | static double compute_mean(const double * times)
43 | {
44 |   int i;
45 |   double sum = 0;
46 |
47 |   for (i = 0; i < NUM_ITERS; ++i) {
48 |     sum += times[i];
49 |   }
50 |
51 |   return sum / NUM_ITERS;
52 | }
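/* Measurement protocol (see compute_cov() below and report() further down):
 * the harness runs up to MAX_ITERS timed iterations; once at least NUM_ITERS
 * have completed, each iteration computes the mean and the coefficient of
 * variation (stddev / mean) over the most recent NUM_ITERS times, and the
 * run stops early once the CoV drops below COV_THRESHOLD (0.02 by default). */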
53 |
54 | static double compute_cov(const double * times, double mean)
55 | {
56 |   double variance = 0;
57 |
58 |   int i;
59 |   for (i = 0; i < NUM_ITERS; ++i) {
60 |     variance += (times[i] - mean) * (times[i] - mean);
61 |   }
62 |
63 |   variance /= NUM_ITERS;
64 |
65 |   double cov = sqrt(variance);
66 |   cov /= mean;
67 |   return cov;
68 | }
69 |
70 | static size_t reduce_min(long val, int id, int nprocs)
71 | {
72 |   static long buffer[MAX_PROCS];
73 |
74 |   buffer[id] = val;
75 |   pthread_barrier_wait(&barrier);
76 |
77 |   long min = LONG_MAX;
78 |   int i;
79 |   for (i = 0; i < nprocs; ++i) {
80 |     if (buffer[i] < min) min = buffer[i];
81 |   }
82 |
83 |   return min;
84 | }
85 |
86 | static void report(int id, int nprocs, int i, long us)
87 | {
88 |   long ms = reduce_min(us, id, nprocs);
89 |
90 |   if (id == 0) {
91 |     times[i] = ms / 1000.0;
92 |     printf(" #%d elapsed time: %.2f ms\n", i + 1, times[i]);
93 |
94 |     if (i + 1 >= NUM_ITERS) {
95 |       int n = i + 1 - NUM_ITERS;
96 |
97 |       means[i] = compute_mean(times + n);
98 |       covs[i] = compute_cov(times + n, means[i]);
99 |
100 |       if (covs[i] < COV_THRESHOLD) {
101 |         target = i;
102 |       }
103 |     }
104 |   }
105 |
106 |   pthread_barrier_wait(&barrier);
107 | }
108 |
109 | static void * thread(void * bits)
110 | {
111 |   int id = bits_hi(bits);
112 |   int nprocs = bits_lo(bits);
113 |
114 |   cpu_set_t set;
115 |   CPU_ZERO(&set);
116 |
117 |   int cpu = cpumap(id, nprocs);
118 |   CPU_SET(cpu, &set);
119 |   sched_setaffinity(0, sizeof(set), &set);
120 |
121 |   thread_init(id, nprocs);
122 |   pthread_barrier_wait(&barrier);
123 |
124 |   int i;
125 |   void * result = NULL;
126 |
127 |   for (i = 0; i < MAX_ITERS && target == 0; ++i) {
128 |     long us = elapsed_time(0);
129 |     result = benchmark(id, nprocs);
130 |     pthread_barrier_wait(&barrier);
131 |     us = elapsed_time(us);
132 |     report(id, nprocs, i, us);
133 |   }
134 |
135 |   thread_exit(id, nprocs);
136 |   return result;
137 | }
138 |
139 | int main(int argc, const char *argv[])
140 | {
141 |   int nprocs = 0;
142 |   int n = 0;
143 |
144 |   /** The first argument is nprocs. */
145 |   if (argc > 1) {
146 |     nprocs = atoi(argv[1]);
147 |   }
148 |
149 |   /**
150 |    * Use the number of processors online as nprocs if it is not
151 |    * specified.
152 |    */
153 |   if (nprocs == 0) {
154 |     nprocs = sysconf(_SC_NPROCESSORS_ONLN);
155 |   }
156 |
157 |   if (nprocs <= 0) return 1;
158 |   else {
159 |     /** Set concurrency level. */
160 |     pthread_setconcurrency(nprocs);
161 |   }
162 |
163 |   /**
164 |    * The second argument is input size n.
165 |    */
166 |   if (argc > 2) {
167 |     n = atoi(argv[2]);
168 |   }
169 |
170 |   pthread_barrier_init(&barrier, NULL, nprocs);
171 |   printf("===========================================\n");
172 |   printf(" Benchmark: %s\n", argv[0]);
173 |   printf(" Number of processors: %d\n", nprocs);
174 |
175 |   init(nprocs, n);
176 |
177 |   pthread_t ths[nprocs];
178 |   void * res[nprocs];
179 |
180 |   int i;
181 |   for (i = 1; i < nprocs; i++) {
182 |     pthread_create(&ths[i], NULL, thread, bits_join(i, nprocs));
183 |   }
184 |
185 |   res[0] = thread(bits_join(0, nprocs));
186 |
187 |   for (i = 1; i < nprocs; i++) {
188 |     pthread_join(ths[i], &res[i]);
189 |   }
190 |
191 |   if (target == 0) {
192 |     target = NUM_ITERS - 1;
193 |     double minCov = covs[target];
194 |
195 |     /** Pick the result that has the lowest CoV. */
196 |     int i;
197 |     for (i = NUM_ITERS; i < MAX_ITERS; ++i) {
198 |       if (covs[i] < minCov) {
199 |         minCov = covs[i];
200 |         target = i;
201 |       }
202 |     }
203 |   }
204 |
205 |   double mean = means[target];
206 |   double cov = covs[target];
207 |   int i1 = target - NUM_ITERS + 2;
208 |   int i2 = target + 1;
209 |
210 |   printf(" Steady-state iterations: %d~%d\n", i1, i2);
211 |   printf(" Coefficient of variation: %.2f\n", cov);
212 |   printf(" Number of measurements: %d\n", NUM_ITERS);
213 |   printf(" Mean of elapsed time: %.2f ms\n", mean);
214 |   printf("===========================================\n");
215 |
216 |   pthread_barrier_destroy(&barrier);
217 |   return verify(nprocs, res);
218 | }
--------------------------------------------------------------------------------
/hzdptr.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | #include <string.h>
3 | #include "hzdptr.h"
4 | #include "xxhash.h"
5 |
6 | #define HZDPTR_HTBL_SIZE(nprocs, nptrs) (4 * nprocs * nptrs)
7 |
8 | typedef struct _node_t {
9 |   struct _node_t * next;
10 | } node_t;
11 |
12 | static int htable_insert(void ** tbl, size_t size, void * ptr)
13 | {
14 |   int index = XXH32(ptr, 1, 0) % size;
15 |   int i;
16 |
17 |   for (i = index; i < size; ++i) {
18 |     if (tbl[i] == NULL) {
19 |       tbl[i] = ptr;
20 |       return 0;
21 |     }
22 |   }
23 |
24 |   for (i = 0; i < index; ++i) {
25 |     if (tbl[i] == NULL) {
26 |       tbl[i] = ptr;
27 |       return 0;
28 |     }
29 |   }
30 |
31 |   return -1;
32 | }
33 |
34 | static int htable_lookup(void ** tbl, size_t size, void * ptr)
35 | {
36 |   int index = XXH32(ptr, 1, 0) % size;
37 |   int i;
38 |
39 |   for (i = index; i < size; ++i) {
40 |     if (tbl[i] == ptr) {
41 |       return 1;
42 |     } else if (tbl[i] == NULL) {
43 |       return 0;
44 |     }
45 |   }
46 |
47 |   for (i = 0; i < index; ++i) {
48 |     if (tbl[i] == ptr) {
49 |       return 1;
50 |     } else if (tbl[i] == NULL) {
51 |       return 0;
52 |     }
53 |   }
54 |
55 |   return 0;
56 | }
57 |
58 | void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs)
59 | {
60 |   hzd->nprocs = nprocs;
61 |   hzd->nptrs = nptrs;
62 |   hzd->nretired = 0;
63 |   hzd->ptrs = calloc(hzdptr_size(nprocs, nptrs), 1);
64 |
65 |   _hzdptr_enlist(hzd);
66 | }
67 |
68 | void _hzdptr_retire(hzdptr_t * hzd, void ** rlist)
69 | {
70 |   size_t size = HZDPTR_HTBL_SIZE(hzd->nprocs, hzd->nptrs);
71 |   void * plist[size];
72 |   memset(plist, 0, sizeof(plist));
73 |
74 |   hzdptr_t * me = hzd;
75 |   void * ptr;
76 |
77 |   while ((hzd = hzd->next) != me) {
78 |     int i;
79 |     for (i = 0; i < hzd->nptrs; ++i) {
80 |       ptr = hzd->ptrs[i];
81 |
82 |       if (ptr != NULL) {
83 |         htable_insert(plist, size, ptr);
84 |       }
85 |     }
86 |   }
87 |
88 |   int nretired = 0;
89 |
90 |   /** Check pointers in retire list with plist. */
91 |   int i;
92 |   for (i = 0; i < hzd->nretired; ++i) {
93 |     ptr = rlist[i];
94 |
95 |     if (htable_lookup(plist, size, ptr)) {
96 |       rlist[nretired++] = ptr;
97 |     } else {
98 |       free(ptr);
99 |     }
100 |   }
101 |
102 |   hzd->nretired = nretired;
103 | }
104 |
105 | void hzdptr_exit(hzdptr_t * hzd)
106 | {
107 |   int i;
108 |   void ** rlist = &hzd->ptrs[hzd->nptrs];
109 |
110 |   for (i = 0; i < hzd->nretired; ++i) {
111 |     free(rlist[i]);
112 |   }
113 |
114 |   hzd->nretired = 0;
115 |   hzd->next = hzd;
116 | }
--------------------------------------------------------------------------------
/hzdptr.h:
--------------------------------------------------------------------------------
1 | #ifndef HZDPTR_H
2 | #define HZDPTR_H
3 |
4 | #include "primitives.h"
5 |
6 | typedef struct _hzdptr_t {
7 |   struct _hzdptr_t * next;
8 |   int nprocs;
9 |   int nptrs;
10 |   int nretired;
11 |   void ** ptrs;
12 | } hzdptr_t;
13 |
14 | #define HZDPTR_THRESHOLD(nprocs) (2 * nprocs)
15 |
16 | extern void hzdptr_init(hzdptr_t * hzd, int nprocs, int nptrs);
17 | extern void hzdptr_exit(hzdptr_t * hzd);
18 | extern void _hzdptr_retire(hzdptr_t * hzd, void ** rlist);
19 |
20 | static inline
21 | int hzdptr_size(int nprocs, int nptrs)
22 | {
23 |   return sizeof(void * [HZDPTR_THRESHOLD(nprocs) + nptrs]);
24 | }
25 |
26 | static inline
27 | void * _hzdptr_set(void volatile * ptr_, void * hzd_)
28 | {
29 |   void * volatile * ptr = (void * volatile *) ptr_;
30 |   void * volatile * hzd = (void * volatile *) hzd_;
31 |
32 |   void * val = *ptr;
33 |   *hzd = val;
34 |   return val;
35 | }
36 |
37 | static inline
38 | void * hzdptr_set(void volatile * ptr, hzdptr_t * hzd, int idx)
39 | {
40 |   return _hzdptr_set(ptr, &hzd->ptrs[idx]);
41 | }
42 |
43 | static inline
44 | void * _hzdptr_setv(void volatile * ptr_, void * hzd_)
45 | {
46 |   void * volatile * ptr = (void * volatile *) ptr_;
47 |   void * volatile * hzd = (void * volatile *) hzd_;
48 |
49 |   void * val = *ptr;
50 |   void * tmp;
51 |
52 |   do {
53 |     *hzd = val;
54 |     tmp = val;
55 |     FENCE();
56 |     val = *ptr;
57 |   } while (val != tmp);
58 |
59 |   return val;
60 | }
61 |
62 | static inline
63 | void * hzdptr_setv(void volatile * ptr, hzdptr_t * hzd, int idx)
64 | {
65 |   return _hzdptr_setv(ptr, &hzd->ptrs[idx]);
66 | }
67 |
68 | static inline
69 | void hzdptr_clear(hzdptr_t * hzd, int idx)
70 | {
71 |   RELEASE(&hzd->ptrs[idx], NULL);
72 | }
73 |
74 | static inline
75 | void hzdptr_retire(hzdptr_t * hzd, void * ptr)
76 | {
77 |   void ** rlist = &hzd->ptrs[hzd->nptrs];
78 |   rlist[hzd->nretired++] = ptr;
79 |
80 |   if (hzd->nretired == HZDPTR_THRESHOLD(hzd->nprocs)) {
81 |     _hzdptr_retire(hzd, rlist);
82 |   }
83 | }
84 |
85 | static inline
86 | void _hzdptr_enlist(hzdptr_t * hzd)
87 | {
88 |   static hzdptr_t * volatile _tail;
89 |   hzdptr_t * tail = _tail;
90 |
91 |   if (tail == NULL) {
92 |     hzd->next = hzd;
93 |     if (CASra(&_tail, &tail, hzd)) return;
94 |   }
95 |
96 |   hzdptr_t * next = tail->next;
97 |
98 |   do hzd->next = next;
99 |   while (!CASra(&tail->next, &next, hzd));
100 | }
101 |
102 | #endif /* end of include guard: HZDPTR_H */
--------------------------------------------------------------------------------
/lcrq.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdint.h>
3 | #include <stdlib.h>
4 | #include "lcrq.h"
5 | #include "align.h"
6 | #include "delay.h"
7 | #include "hzdptr.h"
8 | #include "primitives.h"
9 |
10 | #define RING_SIZE LCRQ_RING_SIZE
11 |
12 | static inline int is_empty(uint64_t v) __attribute__ ((pure));
13 | static inline
uint64_t node_index(uint64_t i) __attribute__ ((pure)); 14 | static inline uint64_t set_unsafe(uint64_t i) __attribute__ ((pure)); 15 | static inline uint64_t node_unsafe(uint64_t i) __attribute__ ((pure)); 16 | static inline uint64_t tail_index(uint64_t t) __attribute__ ((pure)); 17 | static inline int crq_is_closed(uint64_t t) __attribute__ ((pure)); 18 | 19 | static inline void init_ring(RingQueue *r) { 20 | int i; 21 | 22 | for (i = 0; i < RING_SIZE; i++) { 23 | r->array[i].val = -1; 24 | r->array[i].idx = i; 25 | } 26 | 27 | r->head = r->tail = 0; 28 | r->next = NULL; 29 | } 30 | 31 | inline int is_empty(uint64_t v) { 32 | return (v == (uint64_t)-1); 33 | } 34 | 35 | 36 | inline uint64_t node_index(uint64_t i) { 37 | return (i & ~(1ull << 63)); 38 | } 39 | 40 | 41 | inline uint64_t set_unsafe(uint64_t i) { 42 | return (i | (1ull << 63)); 43 | } 44 | 45 | 46 | inline uint64_t node_unsafe(uint64_t i) { 47 | return (i & (1ull << 63)); 48 | } 49 | 50 | 51 | inline uint64_t tail_index(uint64_t t) { 52 | return (t & ~(1ull << 63)); 53 | } 54 | 55 | 56 | inline int crq_is_closed(uint64_t t) { 57 | return (t & (1ull << 63)) != 0; 58 | } 59 | 60 | void queue_init(queue_t * q, int nprocs) 61 | { 62 | RingQueue *rq = align_malloc(PAGE_SIZE, sizeof(RingQueue)); 63 | init_ring(rq); 64 | 65 | q->head = rq; 66 | q->tail = rq; 67 | q->nprocs = nprocs; 68 | } 69 | 70 | static inline void fixState(RingQueue *rq) { 71 | 72 | while (1) { 73 | uint64_t t = rq->tail; 74 | uint64_t h = rq->head; 75 | 76 | if (rq->tail != t) 77 | continue; 78 | 79 | if (h > t) { 80 | if (CAS(&rq->tail, &t, h)) break; 81 | continue; 82 | } 83 | break; 84 | } 85 | } 86 | 87 | static inline int close_crq(RingQueue *rq, const uint64_t t, const int tries) { 88 | uint64_t tt = t + 1; 89 | 90 | if (tries < 10) 91 | return CAS(&rq->tail, &tt, tt|(1ull<<63)); 92 | else 93 | return BTAS(&rq->tail, 63); 94 | } 95 | 96 | static void lcrq_put(queue_t * q, handle_t * handle, uint64_t arg) { 97 | int try_close = 0; 98 | 99 | while (1) { 100 | RingQueue *rq = hzdptr_setv(&q->tail, &handle->hzdptr, 0); 101 | RingQueue *next = rq->next; 102 | 103 | if (next != NULL) { 104 | CAS(&q->tail, &rq, next); 105 | continue; 106 | } 107 | 108 | uint64_t t = FAA(&rq->tail, 1); 109 | 110 | if (crq_is_closed(t)) { 111 | RingQueue * nrq; 112 | alloc: 113 | nrq = handle->next; 114 | 115 | if (nrq == NULL) { 116 | nrq = align_malloc(PAGE_SIZE, sizeof(RingQueue)); 117 | init_ring(nrq); 118 | } 119 | 120 | // Solo enqueue 121 | nrq->tail = 1; 122 | nrq->array[0].val = (uint64_t) arg; 123 | nrq->array[0].idx = 0; 124 | 125 | if (CAS(&rq->next, &next, nrq)) { 126 | CAS(&q->tail, &rq, nrq); 127 | handle->next = NULL; 128 | return; 129 | } 130 | continue; 131 | } 132 | 133 | RingNode* cell = &rq->array[t & (RING_SIZE-1)]; 134 | 135 | uint64_t idx = cell->idx; 136 | uint64_t val = cell->val; 137 | 138 | if (is_empty(val)) { 139 | if (node_index(idx) <= t) { 140 | if ((!node_unsafe(idx) || rq->head < t) && 141 | CAS2(cell, &val, &idx, arg, t)) { 142 | return; 143 | } 144 | } 145 | } 146 | 147 | uint64_t h = rq->head; 148 | 149 | if ((int64_t)(t - h) >= (int64_t)RING_SIZE && 150 | close_crq(rq, t, ++try_close)) { 151 | goto alloc; 152 | } 153 | } 154 | 155 | hzdptr_clear(&handle->hzdptr, 0); 156 | } 157 | 158 | static uint64_t lcrq_get(queue_t * q, handle_t * handle) { 159 | while (1) { 160 | RingQueue *rq = hzdptr_setv(&q->head, &handle->hzdptr, 0); 161 | RingQueue *next; 162 | 163 | uint64_t h = FAA(&rq->head, 1); 164 | 165 | RingNode* cell = &rq->array[h & 
(RING_SIZE-1)]; 166 | 167 | uint64_t tt = 0; 168 | int r = 0; 169 | 170 | while (1) { 171 | 172 | uint64_t cell_idx = cell->idx; 173 | uint64_t unsafe = node_unsafe(cell_idx); 174 | uint64_t idx = node_index(cell_idx); 175 | uint64_t val = cell->val; 176 | 177 | if (idx > h) break; 178 | 179 | if (!is_empty(val)) { 180 | if (idx == h) { 181 | if (CAS2(cell, &val, &cell_idx, -1, (unsafe | h) + RING_SIZE)) 182 | return val; 183 | } else { 184 | if (CAS2(cell, &val, &cell_idx, val, set_unsafe(idx))) { 185 | break; 186 | } 187 | } 188 | } else { 189 | if ((r & ((1ull << 10) - 1)) == 0) 190 | tt = rq->tail; 191 | 192 | // Optimization: try to bail quickly if queue is closed. 193 | int crq_closed = crq_is_closed(tt); 194 | uint64_t t = tail_index(tt); 195 | 196 | if (unsafe) { // Nothing to do, move along 197 | if (CAS2(cell, &val, &cell_idx, val, (unsafe | h) + RING_SIZE)) 198 | break; 199 | } else if (t < h + 1 || r > 200000 || crq_closed) { 200 | if (CAS2(cell, &val, &idx, val, h + RING_SIZE)) { 201 | if (r > 200000 && tt > RING_SIZE) 202 | BTAS(&rq->tail, 63); 203 | break; 204 | } 205 | } else { 206 | ++r; 207 | } 208 | } 209 | } 210 | 211 | if (tail_index(rq->tail) <= h + 1) { 212 | fixState(rq); 213 | // try to return empty 214 | next = rq->next; 215 | if (next == NULL) 216 | return -1; // EMPTY 217 | if (tail_index(rq->tail) <= h + 1) { 218 | if (CAS(&q->head, &rq, next)) { 219 | hzdptr_retire(&handle->hzdptr, rq); 220 | } 221 | } 222 | } 223 | } 224 | 225 | hzdptr_clear(&handle->hzdptr, 0); 226 | } 227 | 228 | void queue_register(queue_t * q, handle_t * th, int id) 229 | { 230 | hzdptr_init(&th->hzdptr, q->nprocs, 1); 231 | } 232 | 233 | void enqueue(queue_t * q, handle_t * th, void * val) 234 | { 235 | lcrq_put(q, th, (uint64_t) val); 236 | } 237 | 238 | void * dequeue(queue_t * q, handle_t * th) 239 | { 240 | return (void *) lcrq_get(q, th); 241 | } 242 | //By K 243 | void handle_free(handle_t *h){ 244 | hzdptr_t *hzd = &h->hzdptr; 245 | void **rlist = &hzd->ptrs[hzd->nptrs]; 246 | for(int i = 0;i < hzd->nretired; i++){ 247 | free(rlist[i]); 248 | } 249 | free(h->hzdptr.ptrs); 250 | } 251 | void queue_free(queue_t * q, handle_t * h){ 252 | RingQueue *rq = q->head; 253 | while(rq){ 254 | RingQueue *n = rq->next; 255 | free(rq); 256 | rq = n; 257 | }; 258 | } 259 | -------------------------------------------------------------------------------- /lcrq.h: -------------------------------------------------------------------------------- 1 | #ifndef LCRQ_H 2 | #define LCRQ_H 3 | 4 | #ifdef LCRQ 5 | 6 | #include "align.h" 7 | #include "hzdptr.h" 8 | 9 | #define EMPTY ((void *) -1) 10 | 11 | #ifndef LCRQ_RING_SIZE 12 | #define LCRQ_RING_SIZE (1ull << 12) 13 | #endif 14 | 15 | typedef struct RingNode { 16 | volatile uint64_t val; 17 | volatile uint64_t idx; 18 | uint64_t pad[14]; 19 | } RingNode DOUBLE_CACHE_ALIGNED; 20 | 21 | typedef struct RingQueue { 22 | volatile int64_t head DOUBLE_CACHE_ALIGNED; 23 | volatile int64_t tail DOUBLE_CACHE_ALIGNED; 24 | struct RingQueue *next DOUBLE_CACHE_ALIGNED; 25 | RingNode array[LCRQ_RING_SIZE]; 26 | } RingQueue DOUBLE_CACHE_ALIGNED; 27 | 28 | typedef struct { 29 | RingQueue * volatile head DOUBLE_CACHE_ALIGNED; 30 | RingQueue * volatile tail DOUBLE_CACHE_ALIGNED; 31 | int nprocs; 32 | } queue_t; 33 | 34 | typedef struct { 35 | RingQueue * next; 36 | hzdptr_t hzdptr; 37 | } handle_t; 38 | 39 | #endif 40 | 41 | #endif /* end of include guard: LCRQ_H */ 42 | -------------------------------------------------------------------------------- /msqueue.c: 
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | #include "delay.h"
3 | #include "msqueue.h"
4 | #include "primitives.h"
5 |
6 | void queue_init(queue_t * q, int nprocs)
7 | {
8 |   node_t * node = malloc(sizeof(node_t));
9 |   node->next = NULL;
10 |
11 |   q->head = node;
12 |   q->tail = node;
13 |   q->nprocs = nprocs;
14 | }
15 |
16 | void queue_register(queue_t * q, handle_t * th, int id)
17 | {
18 |   hzdptr_init(&th->hzd, q->nprocs, 2);
19 | }
20 |
21 | void enqueue(queue_t * q, handle_t * handle, void * data)
22 | {
23 |   node_t * node = malloc(sizeof(node_t));
24 |
25 |   node->data = data;
26 |   node->next = NULL;
27 |
28 |   node_t * tail;
29 |   node_t * next;
30 |
31 |   while (1) {
32 |     tail = hzdptr_setv(&q->tail, &handle->hzd, 0);
33 |     next = tail->next;
34 |
35 |     if (tail != q->tail) {
36 |       continue;
37 |     }
38 |
39 |     if (next != NULL) {
40 |       CAS(&q->tail, &tail, next);
41 |       continue;
42 |     }
43 |
44 |     if (CAS(&tail->next, &next, node)) break;
45 |   }
46 |
47 |   CAS(&q->tail, &tail, node);
48 | }
49 |
50 | void * dequeue(queue_t * q, handle_t * handle)
51 | {
52 |   void * data;
53 |
54 |   node_t * head;
55 |   node_t * tail;
56 |   node_t * next;
57 |
58 |   while (1) {
59 |     head = hzdptr_setv(&q->head, &handle->hzd, 0);
60 |     tail = q->tail;
61 |     next = hzdptr_set(&head->next, &handle->hzd, 1);
62 |
63 |     if (head != q->head) {
64 |       continue;
65 |     }
66 |
67 |     if (next == NULL) {
68 |       return (void *) -1;
69 |     }
70 |
71 |     if (head == tail) {
72 |       CAS(&q->tail, &tail, next);
73 |       continue;
74 |     }
75 |
76 |     data = next->data;
77 |     if (CAS(&q->head, &head, next)) break;
78 |   }
79 |
80 |   hzdptr_retire(&handle->hzd, head);
81 |   return data;
82 | }
83 |
84 | void queue_free(queue_t * q, handle_t * h) {}
--------------------------------------------------------------------------------
/msqueue.h:
--------------------------------------------------------------------------------
1 | #ifndef MSQUEUE_H
2 | #define MSQUEUE_H
3 |
4 | #ifdef MSQUEUE
5 | #include "align.h"
6 | #include "hzdptr.h"
7 |
8 | #define EMPTY (void *) -1
9 |
10 | typedef struct _node_t {
11 |   struct _node_t * volatile next DOUBLE_CACHE_ALIGNED;
12 |   void * data DOUBLE_CACHE_ALIGNED;
13 | } node_t DOUBLE_CACHE_ALIGNED;
14 |
15 | typedef struct _queue_t {
16 |   struct _node_t * volatile head DOUBLE_CACHE_ALIGNED;
17 |   struct _node_t * volatile tail DOUBLE_CACHE_ALIGNED;
18 |   int nprocs;
19 | } queue_t DOUBLE_CACHE_ALIGNED;
20 |
21 | typedef struct _handle_t {
22 |   hzdptr_t hzd;
23 | } handle_t DOUBLE_CACHE_ALIGNED;
24 |
25 | #endif
26 |
27 | #endif /* end of include guard: MSQUEUE_H */
--------------------------------------------------------------------------------
/pairwise.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdint.h>
4 | #include "delay.h"
5 | #include "queue.h"
6 |
7 | #ifndef LOGN_OPS
8 | #define LOGN_OPS 7
9 | #endif
10 |
11 | static long nops;
12 | static queue_t * q;
13 | static handle_t ** hds;
14 |
15 | void init(int nprocs, int logn) {
16 |
17 |   /** Use 10^7 as default input size. */
18 |   if (logn == 0) logn = LOGN_OPS;
19 |
20 |   /** Compute the number of ops to perform. */
21 |   nops = 1;
22 |   int i;
23 |   for (i = 0; i < logn; ++i) {
24 |     nops *= 10;
25 |   }
26 |
27 |   printf(" Number of operations: %ld\n", nops);
28 |
29 |   q = align_malloc(PAGE_SIZE, sizeof(queue_t));
30 |   queue_init(q, nprocs);
31 |
32 |   hds = align_malloc(PAGE_SIZE, sizeof(handle_t * [nprocs]));
33 | }
34 |
35 | void thread_init(int id, int nprocs) {
36 |   hds[id] = align_malloc(PAGE_SIZE, sizeof(handle_t));
37 |   queue_register(q, hds[id], id);
38 | }
39 |
40 | void * benchmark(int id, int nprocs) {
41 |   void * val = (void *) (intptr_t) (id + 1);
42 |   handle_t * th = hds[id];
43 |
44 |   delay_t state;
45 |   delay_init(&state, id);
46 |
47 |   int i;
48 |   for (i = 0; i < nops / nprocs; ++i) {
49 |     enqueue(q, th, val);
50 |     delay_exec(&state);
51 |
52 |     val = dequeue(q, th);
53 |     delay_exec(&state);
54 |   }
55 |
56 |   return val;
57 | }
58 |
59 | void thread_exit(int id, int nprocs) {
60 |   queue_free(q, hds[id]);
61 | }
62 |
63 | #ifdef VERIFY
64 | static int compare(const void * a, const void * b) {
65 |   return *(long *) a - *(long *) b;
66 | }
67 | #endif
68 |
69 | int verify(int nprocs, void ** results) {
70 | #ifndef VERIFY
71 |   return 0;
72 | #else
73 |   qsort(results, nprocs, sizeof(void *), compare);
74 |
75 |   int i;
76 |   int ret = 0;
77 |
78 |   for (i = 0; i < nprocs; ++i) {
79 |     int res = (int) (intptr_t) results[i];
80 |     if (res != i + 1) {
81 |       fprintf(stderr, "expected %d but received %d\n", i + 1, res);
82 |       ret = 1;
83 |     }
84 |   }
85 |
86 |   if (ret != 1) fprintf(stdout, "PASSED\n");
87 |   return ret;
88 | #endif
89 | }
--------------------------------------------------------------------------------
/primitives.h:
--------------------------------------------------------------------------------
1 | /** @file */
2 |
3 | #ifndef PRIMITIVES_H
4 | #define PRIMITIVES_H
5 |
6 | #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7))
7 | /**
8 |  * An atomic fetch-and-add.
9 |  */
10 | #define FAA(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED)
11 | /**
12 |  * An atomic fetch-and-add that also ensures sequential consistency.
13 |  */
14 | #define FAAcs(ptr, val) __atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST)
15 |
16 | /**
17 |  * An atomic compare-and-swap.
18 |  */
19 | #define CAS(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
20 |     __ATOMIC_RELAXED, __ATOMIC_RELAXED)
21 | /**
22 |  * An atomic compare-and-swap that also ensures sequential consistency.
23 |  */
24 | #define CAScs(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
25 |     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
26 | /**
27 |  * An atomic compare-and-swap that ensures release semantics on success
28 |  * and acquire semantics on failure.
29 |  */
30 | #define CASra(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
31 |     __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
32 | /**
33 |  * An atomic compare-and-swap that ensures acquire semantics on success
34 |  * and relaxed semantics on failure.
35 |  */
36 | #define CASa(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, \
37 |     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)
38 |
39 | /**
40 |  * An atomic swap.
41 |  */
42 | #define SWAP(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_RELAXED)
43 |
44 | /**
45 |  * An atomic swap that ensures acquire-release semantics.
46 |  */
47 | #define SWAPra(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_ACQ_REL)
48 |
49 | /**
50 |  * A memory fence to ensure sequential consistency.
51 |  */
52 | #define FENCE() __atomic_thread_fence(__ATOMIC_SEQ_CST)
53 |
54 | /**
55 |  * An atomic store.
56 |  */
57 | #define STORE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED)
58 |
59 | /**
60 |  * A store with a preceding release fence to ensure all previous loads
61 |  * and stores complete before the current store is visible.
62 |  */
63 | #define RELEASE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE)
64 |
65 | /**
66 |  * A load with a following acquire fence to ensure no following loads and
67 |  * stores can start before the current load completes.
68 |  */
69 | #define ACQUIRE(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
70 |
71 | #else /** Non-GCC or old GCC. */
72 | #if defined(__x86_64__) || defined(_M_X64_)
73 |
74 | #define FAA __sync_fetch_and_add
75 | #define FAAcs __sync_fetch_and_add
76 |
77 | static inline int
78 | _compare_and_swap(void ** ptr, void ** expected, void * desired) {
79 |   void * oldval = *expected;
80 |   void * newval = __sync_val_compare_and_swap(ptr, oldval, desired);
81 |
82 |   if (newval == oldval) {
83 |     return 1;
84 |   } else {
85 |     *expected = newval;
86 |     return 0;
87 |   }
88 | }
89 | #define CAS(ptr, expected, desired) \
90 |   _compare_and_swap((void **) (ptr), (void **) (expected), (void *) (desired))
91 | #define CAScs CAS
92 | #define CASra CAS
93 | #define CASa CAS
94 |
95 | #define SWAP __sync_lock_test_and_set
96 | #define SWAPra SWAP
97 |
98 | #define ACQUIRE(p) ({ \
99 |   __typeof__(*(p)) __ret = *p; \
100 |   __asm__("":::"memory"); \
101 |   __ret; \
102 | })
103 |
104 | #define RELEASE(p, v) do {\
105 |   __asm__("":::"memory"); \
106 |   *p = v; \
107 | } while (0)
108 | #define FENCE() __sync_synchronize()
109 |
110 | #endif
111 | #endif
112 |
113 | #if defined(__x86_64__) || defined(_M_X64_)
114 | #define PAUSE() __asm__ ("pause")
115 |
116 | static inline
117 | int _CAS2(volatile long * ptr, long * cmp1, long * cmp2, long val1, long val2)
118 | {
119 |   char success;
120 |   long tmp1 = *cmp1;
121 |   long tmp2 = *cmp2;
122 |
123 |   __asm__ __volatile__(
124 |       "lock cmpxchg16b %1\n"
125 |       "setz %0"
126 |       : "=q" (success), "+m" (*ptr), "+a" (tmp1), "+d" (tmp2)
127 |       : "b" (val1), "c" (val2)
128 |       : "cc" );
129 |
130 |   *cmp1 = tmp1;
131 |   *cmp2 = tmp2;
132 |   return success;
133 | }
134 | #define CAS2(p, o1, o2, n1, n2) \
135 |     _CAS2((volatile long *) p, (long *) o1, (long *) o2, (long) n1, (long) n2)
136 |
137 | #define BTAS(ptr, bit) ({ \
138 |   char __ret; \
139 |   __asm__ __volatile__( \
140 |       "lock btsq %2, %0; setnc %1" \
141 |       : "+m" (*ptr), "=r" (__ret) : "ri" (bit) : "cc" ); \
142 |   __ret; \
143 | })
144 |
145 | #else
146 | #define PAUSE()
147 | #endif
148 |
149 | #endif /* end of include guard: PRIMITIVES_H */
--------------------------------------------------------------------------------
/queue.h:
--------------------------------------------------------------------------------
1 | #ifndef QUEUE_H
2 | #define QUEUE_H
3 |
4 | #ifdef WFQUEUE
5 | #include "wfqueue.h"
6 |
7 | #elif LCRQ
8 | #include "lcrq.h"
9 |
10 | #elif CCQUEUE
11 | #include "ccqueue.h"
12 |
13 | #elif MSQUEUE
14 | #include "msqueue.h"
15 |
16 | #elif FAAQ
17 | #include "align.h"
18 |
19 | typedef struct {
20 |   volatile long P DOUBLE_CACHE_ALIGNED;
21 |   volatile long C DOUBLE_CACHE_ALIGNED;
22 | } queue_t DOUBLE_CACHE_ALIGNED;
23 |
24 | typedef int handle_t;
25 |
26 | #elif DELAY
27 |
28 | typedef int queue_t;
29 | typedef int handle_t;
30 |
31 | #else
32 | #error "Please specify a queue implementation."
33 |
34 | #endif
35 |
36 | void queue_init(queue_t * q, int nprocs);
37 | void queue_register(queue_t * q, handle_t * th, int id);
38 | void enqueue(queue_t * q, handle_t * th, void * v);
39 | void * dequeue(queue_t * q, handle_t * th);
40 | void queue_free(queue_t * q, handle_t * h);
41 | void handle_free(handle_t *h);
42 |
43 | #endif /* end of include guard: QUEUE_H */
--------------------------------------------------------------------------------
/wfqueue.c:
--------------------------------------------------------------------------------
1 | #include "wfqueue.h"
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include "primitives.h"
6 |
7 | #define N WFQUEUE_NODE_SIZE
8 | #define BOT ((void *)0)
9 | #define TOP ((void *)-1)
10 |
11 | #define MAX_GARBAGE(n) (2 * n)
12 |
13 | #ifndef MAX_SPIN
14 | #define MAX_SPIN 100
15 | #endif
16 |
17 | #ifndef MAX_PATIENCE
18 | #define MAX_PATIENCE 10
19 | #endif
20 |
21 | typedef struct _enq_t enq_t;
22 | typedef struct _deq_t deq_t;
23 | typedef struct _cell_t cell_t;
24 | typedef struct _node_t node_t;
25 |
26 | static inline void *spin(void *volatile *p) {
27 |   int patience = MAX_SPIN;
28 |   void *v = *p;
29 |
30 |   while (!v && patience-- > 0) {
31 |     v = *p;
32 |     PAUSE();
33 |   }
34 |
35 |   return v;
36 | }
37 |
38 | static inline node_t *new_node() {
39 |   node_t *n = align_malloc(PAGE_SIZE, sizeof(node_t));
40 |   memset(n, 0, sizeof(node_t));
41 |   return n;
42 | }
43 |
44 | static node_t *check(unsigned long volatile *p_hzd_node_id, node_t *cur,
45 |                      node_t *old) {
46 |   unsigned long hzd_node_id = ACQUIRE(p_hzd_node_id);
47 |
48 |   if (hzd_node_id < cur->id) {
49 |     node_t *tmp = old;
50 |     while (tmp->id < hzd_node_id) {
51 |       tmp = tmp->next;
52 |     }
53 |     cur = tmp;
54 |   }
55 |
56 |   return cur;
57 | }
58 |
59 | static node_t *update(node_t *volatile *pPn, node_t *cur,
60 |                       unsigned long volatile *p_hzd_node_id, node_t *old) {
61 |   node_t *ptr = ACQUIRE(pPn);
62 |
63 |   if (ptr->id < cur->id) {
64 |     if (!CAScs(pPn, &ptr, cur)) {
65 |       if (ptr->id < cur->id) cur = ptr;
66 |     }
67 |
68 |     cur = check(p_hzd_node_id, cur, old);
69 |   }
70 |
71 |   return cur;
72 | }
73 |
74 | static void cleanup(queue_t *q, handle_t *th) {
75 |   long oid = ACQUIRE(&q->Hi);
76 |   node_t *new = th->Dp;
77 |
78 |   if (oid == -1) return;
79 |   if (new->id - oid < MAX_GARBAGE(q->nprocs)) return;
80 |   if (!CASa(&q->Hi, &oid, -1)) return;
81 |
82 |   long Di = q->Di, Ei = q->Ei;
83 |   while (Ei <= Di && !CAS(&q->Ei, &Ei, Di + 1))
84 |     ;
85 |
86 |   node_t *old = q->Hp;
87 |   handle_t *ph = th;
88 |   handle_t *phs[q->nprocs];
89 |   int i = 0;
90 |
91 |   do {
92 |     new = check(&ph->hzd_node_id, new, old);
93 |     new = update(&ph->Ep, new, &ph->hzd_node_id, old);
94 |     new = update(&ph->Dp, new, &ph->hzd_node_id, old);
95 |
96 |     phs[i++] = ph;
97 |     ph = ph->next;
98 |   } while (new->id > oid && ph != th);
99 |
100 |   while (new->id > oid && --i >= 0) {
101 |     new = check(&phs[i]->hzd_node_id, new, old);
102 |   }
103 |
104 |   long nid = new->id;
105 |
106 |   if (nid <= oid) {
107 |     RELEASE(&q->Hi, oid);
108 |   } else {
109 |     q->Hp = new;
110 |     RELEASE(&q->Hi, nid);
111 |
112 |     while (old != new) {
113 |       node_t *tmp = old->next;
114 |       free(old);
115 |       old = tmp;
116 |     }
117 |   }
118 | }
119 |
120 | static cell_t *find_cell(node_t *volatile *ptr, long i, handle_t *th) {
121 |   node_t *curr = *ptr;
122 |
123 |   long j;
124 |   for (j = curr->id; j < i / N; ++j) {
125 |     node_t *next = curr->next;
126 |
127 |     if (next == NULL) {
128 |       node_t *temp = th->spare;
129 |
130 |       if (!temp) {
131 |         temp = new_node();
132 |         th->spare
= temp; 133 | } 134 | 135 | temp->id = j + 1; 136 | 137 | if (CASra(&curr->next, &next, temp)) { 138 | next = temp; 139 | th->spare = NULL; 140 | } 141 | } 142 | 143 | curr = next; 144 | } 145 | 146 | *ptr = curr; 147 | return &curr->cells[i % N]; 148 | } 149 | 150 | static int enq_fast(queue_t *q, handle_t *th, void *v, long *id) { 151 | long i = FAAcs(&q->Ei, 1); 152 | cell_t *c = find_cell(&th->Ep, i, th); 153 | void *cv = BOT; 154 | 155 | if (CAS(&c->val, &cv, v)) { 156 | #ifdef RECORD 157 | th->fastenq++; 158 | #endif 159 | return 1; 160 | } else { 161 | *id = i; 162 | return 0; 163 | } 164 | } 165 | 166 | static void enq_slow(queue_t *q, handle_t *th, void *v, long id) { 167 | enq_t *enq = &th->Er; 168 | enq->val = v; 169 | RELEASE(&enq->id, id); 170 | 171 | node_t *tail = th->Ep; 172 | long i; 173 | cell_t *c; 174 | 175 | do { 176 | i = FAA(&q->Ei, 1); 177 | c = find_cell(&tail, i, th); 178 | enq_t *ce = BOT; 179 | 180 | if (CAScs(&c->enq, &ce, enq) && c->val != TOP) { 181 | if (CAS(&enq->id, &id, -i)) id = -i; 182 | break; 183 | } 184 | } while (enq->id > 0); 185 | 186 | id = -enq->id; 187 | c = find_cell(&th->Ep, id, th); 188 | if (id > i) { 189 | long Ei = q->Ei; 190 | while (Ei <= id && !CAS(&q->Ei, &Ei, id + 1)) 191 | ; 192 | } 193 | c->val = v; 194 | 195 | #ifdef RECORD 196 | th->slowenq++; 197 | #endif 198 | } 199 | 200 | void enqueue(queue_t *q, handle_t *th, void *v) { 201 | th->hzd_node_id = th->enq_node_id; 202 | 203 | long id; 204 | int p = MAX_PATIENCE; 205 | while (!enq_fast(q, th, v, &id) && p-- > 0) 206 | ; 207 | if (p < 0) enq_slow(q, th, v, id); 208 | 209 | th->enq_node_id = th->Ep->id; 210 | RELEASE(&th->hzd_node_id, -1); 211 | } 212 | 213 | static void *help_enq(queue_t *q, handle_t *th, cell_t *c, long i) { 214 | void *v = spin(&c->val); 215 | 216 | if ((v != TOP && v != BOT) || 217 | (v == BOT && !CAScs(&c->val, &v, TOP) && v != TOP)) { 218 | return v; 219 | } 220 | 221 | enq_t *e = c->enq; 222 | 223 | if (e == BOT) { 224 | handle_t *ph; 225 | enq_t *pe; 226 | long id; 227 | ph = th->Eh, pe = &ph->Er, id = pe->id; 228 | 229 | if (th->Ei != 0 && th->Ei != id) { 230 | th->Ei = 0; 231 | th->Eh = ph->next; 232 | ph = th->Eh, pe = &ph->Er, id = pe->id; 233 | } 234 | 235 | if (id > 0 && id <= i && !CAS(&c->enq, &e, pe) && e != pe) 236 | th->Ei = id; 237 | else { 238 | th->Ei = 0; 239 | th->Eh = ph->next; 240 | } 241 | 242 | if (e == BOT && CAS(&c->enq, &e, TOP)) e = TOP; 243 | } 244 | 245 | if (e == TOP) return (q->Ei <= i ? 
BOT : TOP); 246 | 247 | long ei = ACQUIRE(&e->id); 248 | void *ev = ACQUIRE(&e->val); 249 | 250 | if (ei > i) { 251 | if (c->val == TOP && q->Ei <= i) return BOT; 252 | } else { 253 | if ((ei > 0 && CAS(&e->id, &ei, -i)) || (ei == -i && c->val == TOP)) { 254 | long Ei = q->Ei; 255 | while (Ei <= i && !CAS(&q->Ei, &Ei, i + 1)) 256 | ; 257 | c->val = ev; 258 | } 259 | } 260 | 261 | return c->val; 262 | } 263 | 264 | static void help_deq(queue_t *q, handle_t *th, handle_t *ph) { 265 | deq_t *deq = &ph->Dr; 266 | long idx = ACQUIRE(&deq->idx); 267 | long id = deq->id; 268 | 269 | if (idx < id) return; 270 | 271 | node_t *Dp = ph->Dp; 272 | th->hzd_node_id = ph->hzd_node_id; 273 | FENCE(); 274 | idx = deq->idx; 275 | 276 | long i = id + 1, old = id, new = 0; 277 | while (1) { 278 | node_t *h = Dp; 279 | for (; idx == old && new == 0; ++i) { 280 | cell_t *c = find_cell(&h, i, th); 281 | 282 | long Di = q->Di; 283 | while (Di <= i && !CAS(&q->Di, &Di, i + 1)) 284 | ; 285 | 286 | void *v = help_enq(q, th, c, i); 287 | if (v == BOT || (v != TOP && c->deq == BOT)) 288 | new = i; 289 | else 290 | idx = ACQUIRE(&deq->idx); 291 | } 292 | 293 | if (new != 0) { 294 | if (CASra(&deq->idx, &idx, new)) idx = new; 295 | if (idx >= new) new = 0; 296 | } 297 | 298 | if (idx < 0 || deq->id != id) break; 299 | 300 | cell_t *c = find_cell(&Dp, idx, th); 301 | deq_t *cd = BOT; 302 | if (c->val == TOP || CAS(&c->deq, &cd, deq) || cd == deq) { 303 | CAS(&deq->idx, &idx, -idx); 304 | break; 305 | } 306 | 307 | old = idx; 308 | if (idx >= i) i = idx + 1; 309 | } 310 | } 311 | 312 | static void *deq_fast(queue_t *q, handle_t *th, long *id) { 313 | long i = FAAcs(&q->Di, 1); 314 | cell_t *c = find_cell(&th->Dp, i, th); 315 | void *v = help_enq(q, th, c, i); 316 | deq_t *cd = BOT; 317 | 318 | if (v == BOT) return BOT; 319 | if (v != TOP && CAS(&c->deq, &cd, TOP)) return v; 320 | 321 | *id = i; 322 | return TOP; 323 | } 324 | 325 | static void *deq_slow(queue_t *q, handle_t *th, long id) { 326 | deq_t *deq = &th->Dr; 327 | RELEASE(&deq->id, id); 328 | RELEASE(&deq->idx, id); 329 | 330 | help_deq(q, th, th); 331 | long i = -deq->idx; 332 | cell_t *c = find_cell(&th->Dp, i, th); 333 | void *val = c->val; 334 | 335 | #ifdef RECORD 336 | th->slowdeq++; 337 | #endif 338 | return val == TOP ? 
BOT : val; 339 | } 340 | 341 | void *dequeue(queue_t *q, handle_t *th) { 342 | th->hzd_node_id = th->deq_node_id; 343 | 344 | void *v; 345 | long id = 0; 346 | int p = MAX_PATIENCE; 347 | 348 | do 349 | v = deq_fast(q, th, &id); 350 | while (v == TOP && p-- > 0); 351 | if (v == TOP) 352 | v = deq_slow(q, th, id); 353 | else { 354 | #ifdef RECORD 355 | th->fastdeq++; 356 | #endif 357 | } 358 | 359 | if (v != EMPTY) { 360 | help_deq(q, th, th->Dh); 361 | th->Dh = th->Dh->next; 362 | } 363 | 364 | th->deq_node_id = th->Dp->id; 365 | RELEASE(&th->hzd_node_id, -1); 366 | 367 | if (th->spare == NULL) { 368 | cleanup(q, th); 369 | th->spare = new_node(); 370 | } 371 | 372 | #ifdef RECORD 373 | if (v == EMPTY) th->empty++; 374 | #endif 375 | return v; 376 | } 377 | 378 | static pthread_barrier_t barrier; 379 | 380 | void queue_init(queue_t *q, int nprocs) { 381 | q->Hi = 0; 382 | q->Hp = new_node(); 383 | 384 | q->Ei = 1; 385 | q->Di = 1; 386 | 387 | q->nprocs = nprocs; 388 | 389 | #ifdef RECORD 390 | q->fastenq = 0; 391 | q->slowenq = 0; 392 | q->fastdeq = 0; 393 | q->slowdeq = 0; 394 | q->empty = 0; 395 | #endif 396 | pthread_barrier_init(&barrier, NULL, nprocs); 397 | } 398 | 399 | void queue_free(queue_t *q, handle_t *h) { 400 | #ifdef RECORD 401 | static int lock = 0; 402 | 403 | FAA(&q->fastenq, h->fastenq); 404 | FAA(&q->slowenq, h->slowenq); 405 | FAA(&q->fastdeq, h->fastdeq); 406 | FAA(&q->slowdeq, h->slowdeq); 407 | FAA(&q->empty, h->empty); 408 | 409 | pthread_barrier_wait(&barrier); 410 | 411 | if (FAA(&lock, 1) == 0) 412 | printf("Enq: %f Deq: %f Empty: %f\n", 413 | q->slowenq * 100.0 / (q->fastenq + q->slowenq), 414 | q->slowdeq * 100.0 / (q->fastdeq + q->slowdeq), 415 | q->empty * 100.0 / (q->fastdeq + q->slowdeq)); 416 | #endif 417 | } 418 | 419 | void queue_register(queue_t *q, handle_t *th, int id) { 420 | th->next = NULL; 421 | th->hzd_node_id = -1; 422 | th->Ep = q->Hp; 423 | th->enq_node_id = th->Ep->id; 424 | th->Dp = q->Hp; 425 | th->deq_node_id = th->Dp->id; 426 | 427 | th->Er.id = 0; 428 | th->Er.val = BOT; 429 | th->Dr.id = 0; 430 | th->Dr.idx = -1; 431 | 432 | th->Ei = 0; 433 | th->spare = new_node(); 434 | #ifdef RECORD 435 | th->slowenq = 0; 436 | th->slowdeq = 0; 437 | th->fastenq = 0; 438 | th->fastdeq = 0; 439 | th->empty = 0; 440 | #endif 441 | 442 | static handle_t *volatile _tail; 443 | handle_t *tail = _tail; 444 | 445 | if (tail == NULL) { 446 | th->next = th; 447 | if (CASra(&_tail, &tail, th)) { 448 | th->Eh = th->next; 449 | th->Dh = th->next; 450 | return; 451 | } 452 | } 453 | 454 | handle_t *next = tail->next; 455 | do 456 | th->next = next; 457 | while (!CASra(&tail->next, &next, th)); 458 | 459 | th->Eh = th->next; 460 | th->Dh = th->next; 461 | } 462 | -------------------------------------------------------------------------------- /wfqueue.h: -------------------------------------------------------------------------------- 1 | #ifndef WFQUEUE_H 2 | #define WFQUEUE_H 3 | 4 | #ifdef WFQUEUE 5 | 6 | #include "align.h" 7 | #define EMPTY ((void *) 0) 8 | 9 | #ifndef WFQUEUE_NODE_SIZE 10 | #define WFQUEUE_NODE_SIZE ((1 << 10) - 2) 11 | #endif 12 | 13 | struct _enq_t { 14 | long volatile id; 15 | void * volatile val; 16 | } CACHE_ALIGNED; 17 | 18 | struct _deq_t { 19 | long volatile id; 20 | long volatile idx; 21 | } CACHE_ALIGNED; 22 | 23 | struct _cell_t { 24 | void * volatile val; 25 | struct _enq_t * volatile enq; 26 | struct _deq_t * volatile deq; 27 | void * pad[5]; 28 | }; 29 | 30 | struct _node_t { 31 | struct _node_t * volatile next CACHE_ALIGNED; 32 
| long id CACHE_ALIGNED; 33 | struct _cell_t cells[WFQUEUE_NODE_SIZE] CACHE_ALIGNED; 34 | }; 35 | 36 | typedef struct DOUBLE_CACHE_ALIGNED { 37 | /** 38 | * Index of the next position for enqueue. 39 | */ 40 | volatile long Ei DOUBLE_CACHE_ALIGNED; 41 | 42 | /** 43 | * Index of the next position for dequeue. 44 | */ 45 | volatile long Di DOUBLE_CACHE_ALIGNED; 46 | 47 | /** 48 | * Index of the head of the queue. 49 | */ 50 | volatile long Hi DOUBLE_CACHE_ALIGNED; 51 | 52 | /** 53 | * Pointer to the head node of the queue. 54 | */ 55 | struct _node_t * volatile Hp; 56 | 57 | /** 58 | * Number of processors. 59 | */ 60 | long nprocs; 61 | #ifdef RECORD 62 | long slowenq; 63 | long slowdeq; 64 | long fastenq; 65 | long fastdeq; 66 | long empty; 67 | #endif 68 | } queue_t; 69 | 70 | typedef struct _handle_t { 71 | /** 72 | * Pointer to the next handle. 73 | */ 74 | struct _handle_t * next; 75 | 76 | /** 77 | * Hazard pointer. 78 | */ 79 | //struct _node_t * volatile Hp; 80 | unsigned long volatile hzd_node_id; 81 | 82 | /** 83 | * Pointer to the node for enqueue. 84 | */ 85 | struct _node_t * volatile Ep; 86 | unsigned long enq_node_id; 87 | 88 | /** 89 | * Pointer to the node for dequeue. 90 | */ 91 | struct _node_t * volatile Dp; 92 | unsigned long deq_node_id; 93 | 94 | /** 95 | * Enqueue request. 96 | */ 97 | struct _enq_t Er CACHE_ALIGNED; 98 | 99 | /** 100 | * Dequeue request. 101 | */ 102 | struct _deq_t Dr CACHE_ALIGNED; 103 | 104 | /** 105 | * Handle of the next enqueuer to help. 106 | */ 107 | struct _handle_t * Eh CACHE_ALIGNED; 108 | 109 | long Ei; 110 | 111 | /** 112 | * Handle of the next dequeuer to help. 113 | */ 114 | struct _handle_t * Dh; 115 | 116 | /** 117 | * Pointer to a spare node to use, to speedup adding a new node. 118 | */ 119 | struct _node_t * spare CACHE_ALIGNED; 120 | 121 | /** 122 | * Count the delay rounds of helping another dequeuer. 123 | */ 124 | int delay; 125 | 126 | #ifdef RECORD 127 | long slowenq; 128 | long slowdeq; 129 | long fastenq; 130 | long fastdeq; 131 | long empty; 132 | #endif 133 | } handle_t; 134 | 135 | #endif 136 | 137 | #endif /* end of include guard: WFQUEUE_H */ 138 | -------------------------------------------------------------------------------- /xxhash.c: -------------------------------------------------------------------------------- 1 | /* 2 | xxHash - Fast Hash algorithm 3 | Copyright (C) 2012-2014, Yann Collet. 4 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following disclaimer 14 | in the documentation and/or other materials provided with the 15 | distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | You can contact the author at : 30 | - xxHash source repository : http://code.google.com/p/xxhash/ 31 | - public discussion board : https://groups.google.com/forum/#!forum/lz4c 32 | */ 33 | 34 | 35 | //************************************** 36 | // Tuning parameters 37 | //************************************** 38 | // Unaligned memory access is automatically enabled for "common" CPU, such as x86. 39 | // For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. 40 | // If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. 41 | // You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32). 42 | #if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) 43 | # define XXH_USE_UNALIGNED_ACCESS 1 44 | #endif 45 | 46 | // XXH_ACCEPT_NULL_INPUT_POINTER : 47 | // If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. 48 | // When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. 49 | // This option has a very small performance cost (only measurable on small inputs). 50 | // By default, this option is disabled. To enable it, uncomment below define : 51 | // #define XXH_ACCEPT_NULL_INPUT_POINTER 1 52 | 53 | // XXH_FORCE_NATIVE_FORMAT : 54 | // By default, xxHash library provides endian-independant Hash values, based on little-endian convention. 55 | // Results are therefore identical for little-endian and big-endian CPU. 56 | // This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. 57 | // Should endian-independance be of no importance for your application, you may set the #define below to 1. 58 | // It will improve speed for Big-endian CPU. 59 | // This option has no impact on Little_Endian CPU. 
60 | #define XXH_FORCE_NATIVE_FORMAT 0
61 | 
62 | //**************************************
63 | // Compiler Specific Options
64 | //**************************************
65 | // Disable some Visual warning messages
66 | #ifdef _MSC_VER // Visual Studio
67 | #  pragma warning(disable : 4127) // disable: C4127: conditional expression is constant
68 | #endif
69 | 
70 | #ifdef _MSC_VER // Visual Studio
71 | #  define FORCE_INLINE static __forceinline
72 | #else
73 | #  ifdef __GNUC__
74 | #    define FORCE_INLINE static inline __attribute__((always_inline))
75 | #  else
76 | #    define FORCE_INLINE static inline
77 | #  endif
78 | #endif
79 | 
80 | //**************************************
81 | // Includes & Memory related functions
82 | //**************************************
83 | #include "xxhash.h"
84 | // Modify the local functions below should you wish to use some other memory routines
85 | // for malloc(), free()
86 | #include <stdlib.h>
87 | static void* XXH_malloc(size_t s) { return malloc(s); }
88 | static void  XXH_free (void* p)  { free(p); }
89 | // for memcpy()
90 | #include <string.h>
91 | static void* XXH_memcpy(void* dest, const void* src, size_t size)
92 | {
93 |     return memcpy(dest,src,size);
94 | }
95 | 
96 | 
97 | //**************************************
98 | // Basic Types
99 | //**************************************
100 | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
101 | #  include <stdint.h>
102 |   typedef uint8_t  BYTE;
103 |   typedef uint16_t U16;
104 |   typedef uint32_t U32;
105 |   typedef  int32_t S32;
106 |   typedef uint64_t U64;
107 | #else
108 |   typedef unsigned char      BYTE;
109 |   typedef unsigned short     U16;
110 |   typedef unsigned int       U32;
111 |   typedef   signed int       S32;
112 |   typedef unsigned long long U64;
113 | #endif
114 | 
115 | #if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS)
116 | #  define _PACKED __attribute__ ((packed))
117 | #else
118 | #  define _PACKED
119 | #endif
120 | 
121 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
122 | #  ifdef __IBMC__
123 | #    pragma pack(1)
124 | #  else
125 | #    pragma pack(push, 1)
126 | #  endif
127 | #endif
128 | 
129 | typedef struct _U32_S
130 | {
131 |     U32 v;
132 | } _PACKED U32_S;
133 | typedef struct _U64_S
134 | {
135 |     U64 v;
136 | } _PACKED U64_S;
137 | 
138 | #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
139 | #  pragma pack(pop)
140 | #endif
141 | 
142 | #define A32(x) (((U32_S *)(x))->v)
143 | #define A64(x) (((U64_S *)(x))->v)
144 | 
145 | 
146 | //***************************************
147 | // Compiler-specific Functions and Macros
148 | //***************************************
149 | #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
150 | 
151 | // Note : although _rotl exists for minGW (GCC under windows), performance seems poor
152 | #if defined(_MSC_VER)
153 | #  define XXH_rotl32(x,r) _rotl(x,r)
154 | #  define XXH_rotl64(x,r) _rotl64(x,r)
155 | #else
156 | #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
157 | #  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
158 | #endif
159 | 
160 | #if defined(_MSC_VER) // Visual Studio
161 | #  define XXH_swap32 _byteswap_ulong
162 | #  define XXH_swap64 _byteswap_uint64
163 | #elif GCC_VERSION >= 403
164 | #  define XXH_swap32 __builtin_bswap32
165 | #  define XXH_swap64 __builtin_bswap64
166 | #else
167 | static inline U32 XXH_swap32 (U32 x)
168 | {
169 |     return ((x << 24) & 0xff000000 ) |
170 |            ((x <<  8) & 0x00ff0000 ) |
171 |            ((x >>  8) & 0x0000ff00 ) |
172 |            ((x >> 24) & 0x000000ff );
173 | }
174 | static inline U64 XXH_swap64 (U64 x)
175 | {
176 |     return ((x
<< 56) & 0xff00000000000000ULL) | 177 | ((x << 40) & 0x00ff000000000000ULL) | 178 | ((x << 24) & 0x0000ff0000000000ULL) | 179 | ((x << 8) & 0x000000ff00000000ULL) | 180 | ((x >> 8) & 0x00000000ff000000ULL) | 181 | ((x >> 24) & 0x0000000000ff0000ULL) | 182 | ((x >> 40) & 0x000000000000ff00ULL) | 183 | ((x >> 56) & 0x00000000000000ffULL); 184 | } 185 | #endif 186 | 187 | 188 | //************************************** 189 | // Constants 190 | //************************************** 191 | #define PRIME32_1 2654435761U 192 | #define PRIME32_2 2246822519U 193 | #define PRIME32_3 3266489917U 194 | #define PRIME32_4 668265263U 195 | #define PRIME32_5 374761393U 196 | 197 | #define PRIME64_1 11400714785074694791ULL 198 | #define PRIME64_2 14029467366897019727ULL 199 | #define PRIME64_3 1609587929392839161ULL 200 | #define PRIME64_4 9650029242287828579ULL 201 | #define PRIME64_5 2870177450012600261ULL 202 | 203 | //************************************** 204 | // Architecture Macros 205 | //************************************** 206 | typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; 207 | #ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch 208 | static const int one = 1; 209 | # define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) 210 | #endif 211 | 212 | 213 | //************************************** 214 | // Macros 215 | //************************************** 216 | #define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations 217 | 218 | 219 | //**************************** 220 | // Memory reads 221 | //**************************** 222 | typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; 223 | 224 | FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) 225 | { 226 | if (align==XXH_unaligned) 227 | return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); 228 | else 229 | return endian==XXH_littleEndian ? *(U32*)ptr : XXH_swap32(*(U32*)ptr); 230 | } 231 | 232 | FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) 233 | { 234 | return XXH_readLE32_align(ptr, endian, XXH_unaligned); 235 | } 236 | 237 | FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) 238 | { 239 | if (align==XXH_unaligned) 240 | return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr)); 241 | else 242 | return endian==XXH_littleEndian ? 
*(U64*)ptr : XXH_swap64(*(U64*)ptr);
243 | }
244 | 
245 | FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
246 | {
247 |     return XXH_readLE64_align(ptr, endian, XXH_unaligned);
248 | }
249 | 
250 | 
251 | //****************************
252 | // Simple Hash Functions
253 | //****************************
254 | FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
255 | {
256 |     const BYTE* p = (const BYTE*)input;
257 |     const BYTE* bEnd = p + len;
258 |     U32 h32;
259 | #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
260 | 
261 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
262 |     if (p==NULL)
263 |     {
264 |         len=0;
265 |         bEnd=p=(const BYTE*)(size_t)16;
266 |     }
267 | #endif
268 | 
269 |     if (len>=16)
270 |     {
271 |         const BYTE* const limit = bEnd - 16;
272 |         U32 v1 = seed + PRIME32_1 + PRIME32_2;
273 |         U32 v2 = seed + PRIME32_2;
274 |         U32 v3 = seed + 0;
275 |         U32 v4 = seed - PRIME32_1;
276 | 
277 |         do
278 |         {
279 |             v1 += XXH_get32bits(p) * PRIME32_2;
280 |             v1 = XXH_rotl32(v1, 13);
281 |             v1 *= PRIME32_1;
282 |             p+=4;
283 |             v2 += XXH_get32bits(p) * PRIME32_2;
284 |             v2 = XXH_rotl32(v2, 13);
285 |             v2 *= PRIME32_1;
286 |             p+=4;
287 |             v3 += XXH_get32bits(p) * PRIME32_2;
288 |             v3 = XXH_rotl32(v3, 13);
289 |             v3 *= PRIME32_1;
290 |             p+=4;
291 |             v4 += XXH_get32bits(p) * PRIME32_2;
292 |             v4 = XXH_rotl32(v4, 13);
293 |             v4 *= PRIME32_1;
294 |             p+=4;
295 |         }
296 |         while (p<=limit);
297 | 
298 |         h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
299 |     }
300 |     else
301 |     {
302 |         h32 = seed + PRIME32_5;
303 |     }
304 | 
305 |     h32 += (U32) len;
306 | 
307 |     while (p+4<=bEnd)
308 |     {
309 |         h32 += XXH_get32bits(p) * PRIME32_3;
310 |         h32 = XXH_rotl32(h32, 17) * PRIME32_4;
311 |         p+=4;
312 |     }
313 | 
314 |     while (p<bEnd)
315 |     {
316 |         h32 += (*p) * PRIME32_5;
317 |         h32 = XXH_rotl32(h32, 11) * PRIME32_1;
318 |         p++;
319 |     }
320 | 
321 |     h32 ^= h32 >> 15;
322 |     h32 *= PRIME32_2;
323 |     h32 ^= h32 >> 13;
324 |     h32 *= PRIME32_3;
325 |     h32 ^= h32 >> 16;
326 | 
327 |     return h32;
328 | }
329 | 
330 | 
331 | unsigned int XXH32 (const void* input, size_t len, unsigned seed)
332 | {
333 | #if 0
334 |     // Simple version, good for code maintenance, but unfortunately slow for small inputs
335 |     XXH32_state_t state;
336 |     XXH32_reset(&state, seed);
337 |     XXH32_update(&state, input, len);
338 |     return XXH32_digest(&state);
339 | #else
340 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
341 | 
342 | #  if !defined(XXH_USE_UNALIGNED_ACCESS)
343 |     if ((((size_t)input) & 3) == 0) // Input is aligned, let's leverage the speed advantage
344 |     {
345 |         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
346 |             return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
347 |         else
348 |             return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
349 |     }
350 | #  endif
351 | 
352 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
353 |         return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
354 |     else
355 |         return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
356 | #endif
357 | }
358 | 
359 | FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
360 | {
361 |     const BYTE* p = (const BYTE*)input;
362 |     const BYTE* bEnd = p + len;
363 |     U64 h64;
364 | #define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
365 | 
366 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
367 |     if (p==NULL)
368 |     {
369 |         len=0;
370 |         bEnd=p=(const BYTE*)(size_t)32;
371 |     }
372 | #endif
373 | 
374 |     if (len>=32)
375 |     {
376 |         const BYTE* const limit = bEnd - 32;
377 |         U64 v1 = seed + PRIME64_1 + PRIME64_2;
378 |         U64 v2 = seed + PRIME64_2;
379 |         U64 v3 = seed + 0;
380 |         U64 v4 = seed - PRIME64_1;
381 | 
382 |         do
383 |         {
384 |             v1 += XXH_get64bits(p) * PRIME64_2;
385 |             p+=8;
386 |             v1 = XXH_rotl64(v1, 31);
387 |             v1 *= PRIME64_1;
388 |             v2 += XXH_get64bits(p) * PRIME64_2;
389 |             p+=8;
390 |             v2 = XXH_rotl64(v2, 31);
391 |             v2 *= PRIME64_1;
392 |             v3 += XXH_get64bits(p) * PRIME64_2;
393 |             p+=8;
394 |             v3 = XXH_rotl64(v3, 31);
395 |             v3 *= PRIME64_1;
396 |             v4 += XXH_get64bits(p) * PRIME64_2;
397 |             p+=8;
398 |             v4 = XXH_rotl64(v4, 31);
399 |             v4 *= PRIME64_1;
400 |         }
401 |         while (p<=limit);
402 | 
403 |         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
404 | 
405 |         v1 *= PRIME64_2;
406 |         v1 = XXH_rotl64(v1, 31);
407 |         v1 *= PRIME64_1;
408 |         h64 ^= v1;
409 |         h64 = h64 * PRIME64_1 + PRIME64_4;
410 | 
411 |         v2 *= PRIME64_2;
412 |         v2 = XXH_rotl64(v2, 31);
413 |         v2 *= PRIME64_1;
414 |         h64 ^= v2;
415 |         h64 = h64 * PRIME64_1 + PRIME64_4;
416 | 
417 |         v3 *= PRIME64_2;
418 |         v3 = XXH_rotl64(v3, 31);
419 |         v3 *= PRIME64_1;
420 |         h64 ^= v3;
421 |         h64 = h64 * PRIME64_1 + PRIME64_4;
422 | 
423 |         v4 *= PRIME64_2;
424 |         v4 = XXH_rotl64(v4, 31);
425 |         v4 *= PRIME64_1;
426 |         h64 ^= v4;
427 |         h64 = h64 * PRIME64_1 + PRIME64_4;
428 |     }
429 |     else
430 |     {
431 |         h64 = seed + PRIME64_5;
432 |     }
433 | 
434 |     h64 += (U64) len;
435 | 
436 |     while (p+8<=bEnd)
437 |     {
438 |         U64 k1 = XXH_get64bits(p);
439 |         k1 *= PRIME64_2;
440 |         k1 = XXH_rotl64(k1,31);
441 |         k1 *= PRIME64_1;
442 |         h64 ^= k1;
443 |         h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
444 |         p+=8;
445 |     }
446 | 
447 |     if (p+4<=bEnd)
448 |     {
449 |         h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
450 |         h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
451 |         p+=4;
452 |     }
453 | 
454 |     while (p<bEnd)
455 |     {
456 |         h64 ^= (*p) * PRIME64_5;
457 |         h64 = XXH_rotl64(h64, 11) * PRIME64_1;
458 |         p++;
459 |     }
460 | 
461 |     h64 ^= h64 >> 33;
462 |     h64 *= PRIME64_2;
463 |     h64 ^= h64 >> 29;
464 |     h64 *= PRIME64_3;
465 |     h64 ^= h64 >> 32;
466 | 
467 |     return h64;
468 | }
469 | 
470 | 
471 | unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
472 | {
473 | #if 0
474 |     // Simple version, good for code maintenance, but unfortunately slow for small inputs
475 |     XXH64_state_t state;
476 |     XXH64_reset(&state, seed);
477 |     XXH64_update(&state, input, len);
478 |     return XXH64_digest(&state);
479 | #else
480 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
481 | 
482 | #  if !defined(XXH_USE_UNALIGNED_ACCESS)
483 |     if ((((size_t)input) & 7)==0) // Input is aligned, let's leverage the speed advantage
484 |     {
485 |         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
486 |             return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
487 |         else
488 |             return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
489 |     }
490 | #  endif
491 | 
492 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
493 |         return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
494 |     else
495 |         return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
496 | #endif
497 | }
498 | 
499 | /****************************************************
500 |  * Advanced Hash Functions
501 |  ****************************************************/
502 | 
503 | /*** Allocation ***/
504 | typedef struct
505 | {
506 |     U64 total_len;
507 |     U32 seed;
508 |     U32 v1;
509 |     U32 v2;
510 |     U32 v3;
511 |     U32 v4;
512 |     U32 mem32[4];   /* defined as U32 for alignment */
513 |     U32 memsize;
514 | } XXH_istate32_t;
515 | 
516 | typedef struct
517
| { 518 | U64 total_len; 519 | U64 seed; 520 | U64 v1; 521 | U64 v2; 522 | U64 v3; 523 | U64 v4; 524 | U64 mem64[4]; /* defined as U64 for alignment */ 525 | U32 memsize; 526 | } XXH_istate64_t; 527 | 528 | 529 | XXH32_state_t* XXH32_createState(void) 530 | { 531 | XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t)); // A compilation error here means XXH32_state_t is not large enough 532 | return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); 533 | } 534 | XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) 535 | { 536 | XXH_free(statePtr); 537 | return XXH_OK; 538 | }; 539 | 540 | XXH64_state_t* XXH64_createState(void) 541 | { 542 | XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t)); // A compilation error here means XXH64_state_t is not large enough 543 | return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); 544 | } 545 | XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) 546 | { 547 | XXH_free(statePtr); 548 | return XXH_OK; 549 | }; 550 | 551 | 552 | /*** Hash feed ***/ 553 | 554 | XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed) 555 | { 556 | XXH_istate32_t* state = (XXH_istate32_t*) state_in; 557 | state->seed = seed; 558 | state->v1 = seed + PRIME32_1 + PRIME32_2; 559 | state->v2 = seed + PRIME32_2; 560 | state->v3 = seed + 0; 561 | state->v4 = seed - PRIME32_1; 562 | state->total_len = 0; 563 | state->memsize = 0; 564 | return XXH_OK; 565 | } 566 | 567 | XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed) 568 | { 569 | XXH_istate64_t* state = (XXH_istate64_t*) state_in; 570 | state->seed = seed; 571 | state->v1 = seed + PRIME64_1 + PRIME64_2; 572 | state->v2 = seed + PRIME64_2; 573 | state->v3 = seed + 0; 574 | state->v4 = seed - PRIME64_1; 575 | state->total_len = 0; 576 | state->memsize = 0; 577 | return XXH_OK; 578 | } 579 | 580 | 581 | FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian) 582 | { 583 | XXH_istate32_t* state = (XXH_istate32_t *) state_in; 584 | const BYTE* p = (const BYTE*)input; 585 | const BYTE* const bEnd = p + len; 586 | 587 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER 588 | if (input==NULL) return XXH_ERROR; 589 | #endif 590 | 591 | state->total_len += len; 592 | 593 | if (state->memsize + len < 16) // fill in tmp buffer 594 | { 595 | XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); 596 | state->memsize += (U32)len; 597 | return XXH_OK; 598 | } 599 | 600 | if (state->memsize) // some data left from previous update 601 | { 602 | XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); 603 | { 604 | const U32* p32 = state->mem32; 605 | state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; 606 | state->v1 = XXH_rotl32(state->v1, 13); 607 | state->v1 *= PRIME32_1; 608 | p32++; 609 | state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; 610 | state->v2 = XXH_rotl32(state->v2, 13); 611 | state->v2 *= PRIME32_1; 612 | p32++; 613 | state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; 614 | state->v3 = XXH_rotl32(state->v3, 13); 615 | state->v3 *= PRIME32_1; 616 | p32++; 617 | state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; 618 | state->v4 = XXH_rotl32(state->v4, 13); 619 | state->v4 *= PRIME32_1; 620 | p32++; 621 | } 622 | p += 16-state->memsize; 623 | state->memsize = 0; 624 | } 625 | 626 | if (p <= bEnd-16) 627 | { 628 | const BYTE* const limit = bEnd - 16; 629 | U32 v1 = state->v1; 630 | U32 v2 = state->v2; 631 | U32 v3 = state->v3; 632 | U32 v4 = state->v4; 633 | 634 | do 635 | { 
636 |             v1 += XXH_readLE32(p, endian) * PRIME32_2;
637 |             v1 = XXH_rotl32(v1, 13);
638 |             v1 *= PRIME32_1;
639 |             p+=4;
640 |             v2 += XXH_readLE32(p, endian) * PRIME32_2;
641 |             v2 = XXH_rotl32(v2, 13);
642 |             v2 *= PRIME32_1;
643 |             p+=4;
644 |             v3 += XXH_readLE32(p, endian) * PRIME32_2;
645 |             v3 = XXH_rotl32(v3, 13);
646 |             v3 *= PRIME32_1;
647 |             p+=4;
648 |             v4 += XXH_readLE32(p, endian) * PRIME32_2;
649 |             v4 = XXH_rotl32(v4, 13);
650 |             v4 *= PRIME32_1;
651 |             p+=4;
652 |         }
653 |         while (p<=limit);
654 | 
655 |         state->v1 = v1;
656 |         state->v2 = v2;
657 |         state->v3 = v3;
658 |         state->v4 = v4;
659 |     }
660 | 
661 |     if (p < bEnd)
662 |     {
663 |         XXH_memcpy(state->mem32, p, bEnd-p);
664 |         state->memsize = (int)(bEnd-p);
665 |     }
666 | 
667 |     return XXH_OK;
668 | }
669 | 
670 | XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
671 | {
672 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
673 | 
674 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
675 |         return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
676 |     else
677 |         return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
678 | }
679 | 
680 | 
681 | 
682 | FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
683 | {
684 |     XXH_istate32_t* state = (XXH_istate32_t*) state_in;
685 |     const BYTE * p = (const BYTE*)state->mem32;
686 |     BYTE* bEnd = (BYTE*)(state->mem32) + state->memsize;
687 |     U32 h32;
688 | 
689 |     if (state->total_len >= 16)
690 |     {
691 |         h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
692 |     }
693 |     else
694 |     {
695 |         h32 = state->seed + PRIME32_5;
696 |     }
697 | 
698 |     h32 += (U32) state->total_len;
699 | 
700 |     while (p+4<=bEnd)
701 |     {
702 |         h32 += XXH_readLE32(p, endian) * PRIME32_3;
703 |         h32 = XXH_rotl32(h32, 17) * PRIME32_4;
704 |         p+=4;
705 |     }
706 | 
707 |     while (p<bEnd)
708 |     {
709 |         h32 += (*p) * PRIME32_5;
710 |         h32 = XXH_rotl32(h32, 11) * PRIME32_1;
711 |         p++;
712 |     }
713 | 
714 |     h32 ^= h32 >> 15;
715 |     h32 *= PRIME32_2;
716 |     h32 ^= h32 >> 13;
717 |     h32 *= PRIME32_3;
718 |     h32 ^= h32 >> 16;
719 | 
720 |     return h32;
721 | }
722 | 
723 | 
724 | U32 XXH32_digest (const XXH32_state_t* state_in)
725 | {
726 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
727 | 
728 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
729 |         return XXH32_digest_endian(state_in, XXH_littleEndian);
730 |     else
731 |         return XXH32_digest_endian(state_in, XXH_bigEndian);
732 | }
733 | 
734 | 
735 | FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
736 | {
737 |     XXH_istate64_t * state = (XXH_istate64_t *) state_in;
738 |     const BYTE* p = (const BYTE*)input;
739 |     const BYTE* const bEnd = p + len;
740 | 
741 | #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
742 |     if (input==NULL) return XXH_ERROR;
743 | #endif
744 | 
745 |     state->total_len += len;
746 | 
747 |     if (state->memsize + len < 32) // fill in tmp buffer
748 |     {
749 |         XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
750 |         state->memsize += (U32)len;
751 |         return XXH_OK;
752 |     }
753 | 
754 |     if (state->memsize) // some data left from previous update
755 |     {
756 |         XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
757 |         {
758 |             const U64* p64 = state->mem64;
759 |             state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
760 |             state->v1 = XXH_rotl64(state->v1, 31);
761 |             state->v1 *= PRIME64_1;
762 |             p64++;
763 |             state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
764 |             state->v2 = XXH_rotl64(state->v2, 31);
765 |             state->v2 *= PRIME64_1;
766 |             p64++;
767 |             state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
768 |             state->v3 = XXH_rotl64(state->v3, 31);
769 |             state->v3 *= PRIME64_1;
770 |             p64++;
771 |             state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
772 |             state->v4 = XXH_rotl64(state->v4, 31);
773 |             state->v4 *= PRIME64_1;
774 |             p64++;
775 |         }
776 |         p += 32-state->memsize;
777 |         state->memsize = 0;
778 |     }
779 | 
780 |     if (p+32 <= bEnd)
781 |     {
782 |         const BYTE* const limit = bEnd - 32;
783 |         U64 v1 = state->v1;
784 |         U64 v2 = state->v2;
785 |         U64 v3 = state->v3;
786 |         U64 v4 = state->v4;
787 | 
788 |         do
789 |         {
790 |             v1 += XXH_readLE64(p, endian) * PRIME64_2;
791 |             v1 = XXH_rotl64(v1, 31);
792 |             v1 *= PRIME64_1;
793 |             p+=8;
794 |             v2 += XXH_readLE64(p, endian) * PRIME64_2;
795 |             v2 = XXH_rotl64(v2, 31);
796 |             v2 *= PRIME64_1;
797 |             p+=8;
798 |             v3 += XXH_readLE64(p, endian) * PRIME64_2;
799 |             v3 = XXH_rotl64(v3, 31);
800 |             v3 *= PRIME64_1;
801 |             p+=8;
802 |             v4 += XXH_readLE64(p, endian) * PRIME64_2;
803 |             v4 = XXH_rotl64(v4, 31);
804 |             v4 *= PRIME64_1;
805 |             p+=8;
806 |         }
807 |         while (p<=limit);
808 | 
809 |         state->v1 = v1;
810 |         state->v2 = v2;
811 |         state->v3 = v3;
812 |         state->v4 = v4;
813 |     }
814 | 
815 |     if (p < bEnd)
816 |     {
817 |         XXH_memcpy(state->mem64, p, bEnd-p);
818 |         state->memsize = (int)(bEnd-p);
819 |     }
820 | 
821 |     return XXH_OK;
822 | }
823 | 
824 | XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
825 | {
826 |     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
827 | 
828 |     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
829 |         return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
830 |     else
831 |         return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
832 | }
833 | 
834 | 
835 | 
836 | FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
837 | {
838 |     XXH_istate64_t * state = (XXH_istate64_t *) state_in;
839 |     const BYTE * p = (const BYTE*)state->mem64;
840 |     BYTE* bEnd = (BYTE*)state->mem64 + state->memsize;
841 |     U64 h64;
842 | 
843 |     if (state->total_len >= 32)
844 |     {
845 |         U64 v1 = state->v1;
846 |         U64 v2 = state->v2;
847 |         U64 v3 = state->v3;
848 |         U64 v4 = state->v4;
849 | 
850 |         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
851 | 
852 |         v1 *= PRIME64_2;
853 |         v1 = XXH_rotl64(v1, 31);
854 |         v1 *= PRIME64_1;
855 |         h64 ^= v1;
856 |         h64 = h64*PRIME64_1 + PRIME64_4;
857 | 
858 |         v2 *= PRIME64_2;
859 |         v2 = XXH_rotl64(v2, 31);
860 |         v2 *= PRIME64_1;
861 |         h64 ^= v2;
862 |         h64 = h64*PRIME64_1 + PRIME64_4;
863 | 
864 |         v3 *= PRIME64_2;
865 |         v3 = XXH_rotl64(v3, 31);
866 |         v3 *= PRIME64_1;
867 |         h64 ^= v3;
868 |         h64 = h64*PRIME64_1 + PRIME64_4;
869 | 
870 |         v4 *= PRIME64_2;
871 |         v4 = XXH_rotl64(v4, 31);
872 |         v4 *= PRIME64_1;
873 |         h64 ^= v4;
874 |         h64 = h64*PRIME64_1 + PRIME64_4;
875 |     }
876 |     else
877 |     {
878 |         h64 = state->seed + PRIME64_5;
879 |     }
880 | 
881 |     h64 += (U64) state->total_len;
882 | 
883 |     while (p+8<=bEnd)
884 |     {
885 |         U64 k1 = XXH_readLE64(p, endian);
886 |         k1 *= PRIME64_2;
887 |         k1 = XXH_rotl64(k1,31);
888 |         k1 *= PRIME64_1;
889 |         h64 ^= k1;
890 |         h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
891 |         p+=8;
892 |     }
893 | 
894 |     if (p+4<=bEnd)
895 |     {
896 |         h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
897 |         h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
898 |         p+=4;
899 |     }
900 | 
901 |     while (p<bEnd)
902 |     {
903 |         h64 ^= (*p) * PRIME64_5;
904 |         h64 = XXH_rotl64(h64, 11) * PRIME64_1;
905 |         p++;
906 |     }
907 | 
908 |     h64 ^= h64 >> 33;
909 |     h64 *= PRIME64_2;
910 |     h64 ^= h64 >> 29;
911 |     h64 *= PRIME64_3;
912 |     h64 ^= h64 >> 32;
913
| 914 | return h64; 915 | } 916 | 917 | 918 | unsigned long long XXH64_digest (const XXH64_state_t* state_in) 919 | { 920 | XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; 921 | 922 | if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) 923 | return XXH64_digest_endian(state_in, XXH_littleEndian); 924 | else 925 | return XXH64_digest_endian(state_in, XXH_bigEndian); 926 | } 927 | 928 | 929 | -------------------------------------------------------------------------------- /xxhash.h: -------------------------------------------------------------------------------- 1 | /* 2 | xxHash - Extremely Fast Hash algorithm 3 | Header File 4 | Copyright (C) 2012-2014, Yann Collet. 5 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following disclaimer 15 | in the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | You can contact the author at : 31 | - xxHash source repository : http://code.google.com/p/xxhash/ 32 | */ 33 | 34 | /* Notice extracted from xxHash homepage : 35 | 36 | xxHash is an extremely fast Hash algorithm, running at RAM speed limits. 37 | It also successfully passes all tests from the SMHasher suite. 38 | 39 | Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) 40 | 41 | Name Speed Q.Score Author 42 | xxHash 5.4 GB/s 10 43 | CrapWow 3.2 GB/s 2 Andrew 44 | MumurHash 3a 2.7 GB/s 10 Austin Appleby 45 | SpookyHash 2.0 GB/s 10 Bob Jenkins 46 | SBox 1.4 GB/s 9 Bret Mulvey 47 | Lookup3 1.2 GB/s 9 Bob Jenkins 48 | SuperFastHash 1.2 GB/s 1 Paul Hsieh 49 | CityHash64 1.05 GB/s 10 Pike & Alakuijala 50 | FNV 0.55 GB/s 5 Fowler, Noll, Vo 51 | CRC32 0.43 GB/s 9 52 | MD5-32 0.33 GB/s 10 Ronald L. Rivest 53 | SHA1-32 0.28 GB/s 10 54 | 55 | Q.Score is a measure of quality of the hash function. 56 | It depends on successfully passing SMHasher test set. 57 | 10 is a perfect score. 
58 | */
59 | 
60 | #pragma once
61 | 
62 | #if defined (__cplusplus)
63 | extern "C" {
64 | #endif
65 | 
66 | 
67 | /*****************************
68 |    Includes
69 | *****************************/
70 | #include <stddef.h>   /* size_t */
71 | 
72 | 
73 | /*****************************
74 |    Type
75 | *****************************/
76 | typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
77 | 
78 | 
79 | 
80 | /*****************************
81 |    Simple Hash Functions
82 | *****************************/
83 | 
84 | unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
85 | unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
86 | 
87 | /*
88 | XXH32() :
89 |     Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
90 |     The memory between input & input+length must be valid (allocated and read-accessible).
91 |     "seed" can be used to alter the result predictably.
92 |     This function successfully passes all SMHasher tests.
93 |     Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
94 | XXH64() :
95 |     Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
96 | */
97 | 
98 | 
99 | 
100 | /*****************************
101 |    Advanced Hash Functions
102 | *****************************/
103 | typedef struct { long long ll[ 6]; } XXH32_state_t;
104 | typedef struct { long long ll[11]; } XXH64_state_t;
105 | 
106 | /*
107 | These structures allow static allocation of XXH states.
108 | States must then be initialized using XXHnn_reset() before first use.
109 | 
110 | If you prefer dynamic allocation, please refer to functions below.
111 | */
112 | 
113 | XXH32_state_t* XXH32_createState(void);
114 | XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
115 | 
116 | XXH64_state_t* XXH64_createState(void);
117 | XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
118 | 
119 | /*
120 | These functions create and release memory for XXH state.
121 | States must then be initialized using XXHnn_reset() before first use.
122 | */
123 | 
124 | 
125 | XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
126 | XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
127 | unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
128 | 
129 | XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
130 | XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
131 | unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
132 | 
133 | /*
134 | These functions calculate the xxHash of an input provided in multiple smaller packets,
135 | as opposed to an input provided as a single block.
136 | 
137 | XXH state space must first be allocated, using either static or dynamic method provided above.
138 | 
139 | Start a new hash by initializing state with a seed, using XXHnn_reset().
140 | 
141 | Then, feed the hash state by calling XXHnn_update() as many times as necessary.
142 | Obviously, input must be valid, meaning allocated and read accessible.
143 | The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
144 | 
145 | Finally, you can produce a hash anytime, by using XXHnn_digest().
146 | This function returns the final nn-bits hash.
147 | You can nonetheless continue feeding the hash state with more input,
148 | and therefore get some new hashes, by calling again XXHnn_digest().
149 | 150 | When you are done, don't forget to free XXH state space, using typically XXHnn_freeState(). 151 | */ 152 | 153 | 154 | #if defined (__cplusplus) 155 | } 156 | #endif 157 | --------------------------------------------------------------------------------
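
For orientation, the queue API declared in queue.h above is exercised per thread: one thread initializes the queue, every participating thread registers a handle, and all subsequent operations go through that handle. The following is a minimal single-threaded sketch, not a file from this repository (the real multi-threaded driver is harness.c with pairwise.c or halfhalf.c); it assumes `align_malloc` and `PAGE_SIZE` from align.h, as used by wfqueue.c, and compilation with `-DWFQUEUE` so queue.h resolves to the wait-free queue.

/* usage sketch (editor's illustration, not repository code):
 *   cc -pthread -D_GNU_SOURCE -DWFQUEUE example.c wfqueue.c -o example */
#include <string.h>
#include "queue.h"   /* queue_t, handle_t, queue_init, enqueue, dequeue */
#include "align.h"   /* align_malloc, PAGE_SIZE (assumed from this repo) */

int main(void) {
  /* queue_t and handle_t are cache-aligned structs, so allocate aligned. */
  queue_t  *q = align_malloc(PAGE_SIZE, sizeof(queue_t));
  handle_t *h = align_malloc(PAGE_SIZE, sizeof(handle_t));
  memset(h, 0, sizeof(handle_t));   /* zero fields queue_register leaves untouched */

  queue_init(q, 1);                 /* once, before any thread uses the queue */
  queue_register(q, h, 0);          /* once per thread, with a unique id */

  long x = 42;
  enqueue(q, h, &x);                /* payloads are opaque void * values */
  void *v = dequeue(q, h);          /* returns EMPTY when the queue is empty */

  queue_free(q, h);
  return v == &x ? 0 : 1;
}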