├── README.md
├── spinlock-pthread.h
├── run-test-spinlock.sh
├── spinlock-xchg.h
├── Makefile
├── spinlock-cmpxchg.h
├── spinlock-xchg-hle.h
├── spinlock-ticket.h
├── spinlock-xchg-backoff.h
├── rtm.h
├── spinlock-mcs.h
├── spinlock-k42.h
├── stack.c
└── test-spinlock.c
/README.md:
--------------------------------------------------------------------------------
1 | Various spinlock implementations from the article [Spinlocks and Read-Write Locks](http://locklessinc.com/articles/locks/) by *Lockless Inc*.
2 | 
3 | I made some modifications to make each implementation self-contained and provided a benchmark script. The code relies on GCC's built-in functions for atomic memory access.
4 | 
5 | **Note: Scalability is achieved by avoiding sharing and contention, not by scalable locks.**
6 | 
--------------------------------------------------------------------------------
/spinlock-pthread.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_PTHREAD_H
2 | #define _SPINLOCK_PTHREAD_H
3 | 
4 | #define SPINLOCK_ATTR static __inline __attribute__((always_inline, no_instrument_function))
5 | 
6 | #define spinlock pthread_mutex_t
7 | 
8 | SPINLOCK_ATTR void spin_lock(spinlock *lock)
9 | {
10 | pthread_mutex_lock(lock);
11 | }
12 | 
13 | SPINLOCK_ATTR void spin_unlock(spinlock *lock)
14 | {
15 | pthread_mutex_unlock(lock);
16 | }
17 | 
18 | #define SPINLOCK_INITIALIZER { 0 }
19 | 
20 | #endif /* _SPINLOCK_PTHREAD_H */
21 | 
--------------------------------------------------------------------------------
/run-test-spinlock.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | function run_test() {
4 | for nthr in 1 2 4 8 16 32; do
5 | ./$1 $nthr > /dev/null
6 | for i in `seq 1 3`; do
7 | ./$1 $nthr
8 | done
9 | echo
10 | done
11 | }
12 | 
13 | echo "test spin lock using cmpxchg"
14 | run_test "test-spinlock-cmpxchg"
15 | 
16 | echo "test spin lock using xchg"
17 | run_test "test-spinlock-xchg"
18 | 
19 | echo "test spin lock using k42"
20 | run_test "test-spinlock-k42"
21 | 
22 | echo "test spin lock using mcs"
23 | run_test "test-spinlock-mcs"
24 | 
25 | echo "test spin lock using ticket"
26 | run_test "test-spinlock-ticket"
27 | 
28 | echo "test spin lock using pthread"
29 | run_test "test-spinlock-pthread"
30 | 
31 | echo "test spin lock using xchg-backoff"
32 | run_test "test-spinlock-xchg-backoff"
33 | 
34 | 
--------------------------------------------------------------------------------
/spinlock-xchg.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_XCHG_H
2 | #define _SPINLOCK_XCHG_H
3 | 
4 | /* Spin lock using xchg.
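 * The lock loop below is a test-and-test-and-set: xchg_8 performs the atomic
 * test-and-set, and a waiter that fails then spins on a plain read of the
 * lock byte (keeping the cache line in the shared state) until the lock
 * looks free before issuing another xchg.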
5 | * Copied from http://locklessinc.com/articles/locks/ 6 | */ 7 | 8 | /* Compile read-write barrier */ 9 | #define barrier() asm volatile("": : :"memory") 10 | 11 | /* Pause instruction to prevent excess processor bus usage */ 12 | #define cpu_relax() asm volatile("pause\n": : :"memory") 13 | 14 | static inline unsigned short xchg_8(void *ptr, unsigned char x) 15 | { 16 | __asm__ __volatile__("xchgb %0,%1" 17 | :"=r" (x) 18 | :"m" (*(volatile unsigned char *)ptr), "0" (x) 19 | :"memory"); 20 | 21 | return x; 22 | } 23 | 24 | #define BUSY 1 25 | typedef unsigned char spinlock; 26 | 27 | #define SPINLOCK_INITIALIZER 0 28 | 29 | static inline void spin_lock(spinlock *lock) 30 | { 31 | while (1) { 32 | if (!xchg_8(lock, BUSY)) return; 33 | 34 | while (*lock) cpu_relax(); 35 | } 36 | } 37 | 38 | static inline void spin_unlock(spinlock *lock) 39 | { 40 | barrier(); 41 | *lock = 0; 42 | } 43 | 44 | static inline int spin_trylock(spinlock *lock) 45 | { 46 | return xchg_8(lock, BUSY); 47 | } 48 | 49 | #endif /* _SPINLOCK_XCHG_H */ 50 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS = -O2 -g -std=gnu99 -Wall 2 | LDFLAGS = -lpthread 3 | 4 | programs = test-spinlock-cmpxchg test-spinlock-xchg test-spinlock-k42 \ 5 | test-spinlock-mcs test-spinlock-ticket test-spinlock-pthread \ 6 | test-spinlock-xchg-backoff test-rtm test-spinlock-xchg-hle 7 | 8 | all: $(programs) 9 | 10 | test-spinlock-cmpxchg: test-spinlock.c 11 | $(CC) $(CFLAGS) -DCMPXCHG $^ -o $@ $(LDFLAGS) 12 | 13 | test-spinlock-xchg: test-spinlock.c 14 | $(CC) $(CFLAGS) -DXCHG $^ -o $@ $(LDFLAGS) 15 | 16 | test-spinlock-k42: test-spinlock.c 17 | $(CC) $(CFLAGS) -DK42 $^ -o $@ $(LDFLAGS) 18 | 19 | test-spinlock-mcs: test-spinlock.c 20 | $(CC) $(CFLAGS) -DMCS $^ -o $@ $(LDFLAGS) 21 | 22 | test-spinlock-ticket: test-spinlock.c 23 | $(CC) $(CFLAGS) -DTICKET $^ -o $@ $(LDFLAGS) 24 | 25 | test-spinlock-pthread: test-spinlock.c 26 | $(CC) $(CFLAGS) -DPTHREAD $^ -o $@ $(LDFLAGS) 27 | 28 | test-spinlock-xchg-backoff: test-spinlock.c 29 | $(CC) $(CFLAGS) -DXCHGBACKOFF $^ -o $@ $(LDFLAGS) 30 | 31 | test-spinlock-xchg-hle: test-spinlock.c 32 | $(CC) $(CFLAGS) -DHLE $^ -o $@ $(LDFLAGS) 33 | 34 | test-rtm: test-spinlock.c 35 | $(CC) $(CFLAGS) -DRTM $^ -o $@ $(LDFLAGS) 36 | 37 | %:%.c 38 | $(CC) $(CFLAGS) $< -o $@ 39 | 40 | clean: 41 | -rm -f *.o 42 | -rm -f $(programs) 43 | -------------------------------------------------------------------------------- /spinlock-cmpxchg.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_CMPXCHG_H 2 | #define _SPINLOCK_CMPXCHG_H 3 | 4 | typedef struct { 5 | volatile char lock; 6 | } spinlock; 7 | 8 | #define SPINLOCK_ATTR static __inline __attribute__((always_inline, no_instrument_function)) 9 | 10 | /* Pause instruction to prevent excess processor bus usage */ 11 | #define cpu_relax() asm volatile("pause\n": : :"memory") 12 | 13 | SPINLOCK_ATTR char __testandset(spinlock *p) 14 | { 15 | char readval = 0; 16 | 17 | asm volatile ( 18 | "lock; cmpxchgb %b2, %0" 19 | : "+m" (p->lock), "+a" (readval) 20 | : "r" (1) 21 | : "cc"); 22 | return readval; 23 | } 24 | 25 | SPINLOCK_ATTR void spin_lock(spinlock *lock) 26 | { 27 | while (__testandset(lock)) { 28 | /* Should wait until lock is free before another try. 29 | * cmpxchg is write to cache, competing write for a sinlge cache line 30 | * would generate large amount of cache traffic. 
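 * Each failed cmpxchg is still a locked write that must own the cache line
 * exclusively, so waiters keep invalidating each other's copies.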
That's why this
31 | * implementation does not scale as well as the xchg-based one. Otherwise,
32 | * they should have similar performance. */
33 | cpu_relax();
34 | }
35 | }
36 | 
37 | SPINLOCK_ATTR void spin_unlock(spinlock *s)
38 | {
39 | s->lock = 0;
40 | }
41 | 
42 | #define SPINLOCK_INITIALIZER { 0 }
43 | 
44 | #endif /* _SPINLOCK_CMPXCHG_H */
45 | 
--------------------------------------------------------------------------------
/spinlock-xchg-hle.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_XCHG_HLE_H
2 | #define _SPINLOCK_XCHG_HLE_H
3 | 
4 | /* Spin lock using xchg with HLE (hardware lock elision) prefixes.
5 | * Copied from http://locklessinc.com/articles/locks/
6 | */
7 | 
8 | /* Compile read-write barrier */
9 | #define barrier() asm volatile("": : :"memory")
10 | 
11 | /* Pause instruction to prevent excess processor bus usage */
12 | #define cpu_relax() asm volatile("pause\n": : :"memory")
13 | 
14 | #define __HLE_ACQUIRE ".byte 0xf2 ; "
15 | #define __HLE_RELEASE ".byte 0xf3 ; "
16 | 
17 | static inline unsigned short xchg_8(void *ptr, unsigned char x)
18 | {
19 | __asm__ __volatile__(__HLE_ACQUIRE "xchgb %0,%1"
20 | :"=r" (x)
21 | :"m" (*(volatile unsigned char *)ptr), "0" (x)
22 | :"memory");
23 | 
24 | return x;
25 | }
26 | 
27 | #define BUSY 1
28 | typedef unsigned char spinlock;
29 | 
30 | #define SPINLOCK_INITIALIZER 0
31 | 
32 | static inline void spin_lock(spinlock *lock)
33 | {
34 | while (1) {
35 | if (!xchg_8(lock, BUSY)) return;
36 | 
37 | while (*lock) cpu_relax();
38 | }
39 | }
40 | 
41 | static inline void spin_unlock(spinlock *lock)
42 | {
43 | __asm__ __volatile__(__HLE_RELEASE "movb $0, %0"
44 | :"=m" (*lock)
45 | :
46 | :"memory");
47 | }
48 | 
49 | static inline int spin_trylock(spinlock *lock)
50 | {
51 | return xchg_8(lock, BUSY);
52 | }
53 | 
54 | #endif /* _SPINLOCK_XCHG_HLE_H */
55 | 
--------------------------------------------------------------------------------
/spinlock-ticket.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_TICKET_H
2 | #define _SPINLOCK_TICKET_H
3 | 
4 | /* Code copied from http://locklessinc.com/articles/locks/ */
5 | 
6 | #define cmpxchg(P, O, N) __sync_val_compare_and_swap((P), (O), (N))
7 | #define atomic_xadd(P, V) __sync_fetch_and_add((P), (V))
8 | 
9 | #define barrier() asm volatile("": : :"memory")
10 | #define cpu_relax() asm volatile("pause\n": : :"memory")
11 | 
12 | #define spin_lock ticket_lock
13 | #define spin_unlock ticket_unlock
14 | #define spinlock ticketlock
15 | 
16 | #define SPINLOCK_INITIALIZER { 0, 0 }
17 | 
18 | typedef union ticketlock ticketlock;
19 | 
20 | union ticketlock
21 | {
22 | unsigned u;
23 | struct
24 | {
25 | unsigned short ticket;
26 | unsigned short users;
27 | } s;
28 | };
29 | 
30 | static inline void ticket_lock(ticketlock *t)
31 | {
32 | unsigned short me = atomic_xadd(&t->s.users, 1);
33 | 
34 | while (t->s.ticket != me) cpu_relax();
35 | }
36 | 
37 | static inline void ticket_unlock(ticketlock *t)
38 | {
39 | barrier();
40 | t->s.ticket++;
41 | }
42 | 
43 | static inline int ticket_trylock(ticketlock *t)
44 | {
45 | unsigned short me = t->s.users;
46 | unsigned short menew = me + 1;
47 | unsigned cmp = ((unsigned) me << 16) + me;
48 | unsigned cmpnew = ((unsigned) menew << 16) + me;
49 | 
50 | if (cmpxchg(&t->u, cmp, cmpnew) == cmp) return 0;
51 | 
52 | return 1; // Busy
53 | }
54 | 
55 | static inline int ticket_lockable(ticketlock *t)
56 | {
57 | ticketlock u = *t;
58 | barrier();
59 | return (u.s.ticket == u.s.users);
60 | }
61 | 
62 | #endif /* _SPINLOCK_TICKET_H */
63 | 
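/* How the ticket lock works: atomic_xadd on s.users hands each thread a
 * unique ticket, and a thread enters once s.ticket reaches that number, so
 * acquisition is FIFO and fair; the cost is that every waiter polls the
 * same cache line. */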
-------------------------------------------------------------------------------- /spinlock-xchg-backoff.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_XCHG_BACKOFF_H 2 | #define _SPINLOCK_XCHG_BACKOFF_H 3 | 4 | /* Spin lock using xchg. Added backoff wait to avoid concurrent lock/unlock 5 | * operation. 6 | * Original code copied from http://locklessinc.com/articles/locks/ 7 | */ 8 | 9 | /* Compile read-write barrier */ 10 | #define barrier() asm volatile("": : :"memory") 11 | 12 | /* Pause instruction to prevent excess processor bus usage */ 13 | #define cpu_relax() asm volatile("pause\n": : :"memory") 14 | 15 | static inline unsigned short xchg_8(void *ptr, unsigned char x) 16 | { 17 | __asm__ __volatile__("xchgb %0,%1" 18 | :"=r" (x) 19 | :"m" (*(volatile unsigned char *)ptr), "0" (x) 20 | :"memory"); 21 | 22 | return x; 23 | } 24 | 25 | #define BUSY 1 26 | typedef unsigned char spinlock; 27 | 28 | #define SPINLOCK_INITIALIZER 0 29 | 30 | static inline void spin_lock(spinlock *lock) 31 | { 32 | int wait = 1; 33 | while (1) { 34 | if (!xchg_8(lock, BUSY)) return; 35 | 36 | // wait here is important to performance. 37 | for (int i = 0; i < wait; i++) { 38 | cpu_relax(); 39 | } 40 | while (*lock) { 41 | wait *= 2; // exponential backoff if can't get lock 42 | for (int i = 0; i < wait; i++) { 43 | cpu_relax(); 44 | } 45 | } 46 | } 47 | } 48 | 49 | static inline void spin_unlock(spinlock *lock) 50 | { 51 | barrier(); 52 | *lock = 0; 53 | } 54 | 55 | static inline int spin_trylock(spinlock *lock) 56 | { 57 | return xchg_8(lock, BUSY); 58 | } 59 | 60 | #endif /* _SPINLOCK_XCHG_BACKOFF_H */ 61 | -------------------------------------------------------------------------------- /rtm.h: -------------------------------------------------------------------------------- 1 | #ifndef _RTM_H 2 | #define _RTM_H 1 3 | 4 | /* 5 | * Copyright (c) 2012,2013 Intel Corporation 6 | * Author: Andi Kleen 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that: (1) source code distributions 10 | * retain the above copyright notice and this paragraph in its entirety, (2) 11 | * distributions including binary code include the above copyright notice and 12 | * this paragraph in its entirety in the documentation or other materials 13 | * provided with the distribution 14 | * 15 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED 16 | * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 17 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 18 | */ 19 | 20 | /* Official RTM intrinsics interface matching gcc/icc, but works 21 | on older gcc compatible compilers and binutils. 
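   Typical usage: if (_xbegin() == _XBEGIN_STARTED) { transactional region;
   _xend(); } else { take a fallback lock }. _xtest() reports whether the
   code is currently executing inside a transaction; see the RTM path in
   test-spinlock.c for a lock-elision example.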
*/ 22 | 23 | #define _XBEGIN_STARTED (~0u) 24 | #define _XABORT_EXPLICIT (1 << 0) 25 | #define _XABORT_RETRY (1 << 1) 26 | #define _XABORT_CONFLICT (1 << 2) 27 | #define _XABORT_CAPACITY (1 << 3) 28 | #define _XABORT_DEBUG (1 << 4) 29 | #define _XABORT_NESTED (1 << 5) 30 | #define _XABORT_CODE(x) (((x) >> 24) & 0xff) 31 | 32 | #define __rtm_force_inline __attribute__((__always_inline__)) inline 33 | 34 | static __rtm_force_inline int _xbegin(void) 35 | { 36 | int ret = _XBEGIN_STARTED; 37 | asm volatile(".byte 0xc7,0xf8 ; .long 0" : "+a" (ret) :: "memory"); 38 | return ret; 39 | } 40 | 41 | static __rtm_force_inline void _xend(void) 42 | { 43 | asm volatile(".byte 0x0f,0x01,0xd5" ::: "memory"); 44 | } 45 | 46 | static __rtm_force_inline void _xabort(const unsigned int status) 47 | { 48 | asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory"); 49 | } 50 | 51 | static __rtm_force_inline int _xtest(void) 52 | { 53 | unsigned char out; 54 | asm volatile(".byte 0x0f,0x01,0xd6 ; setnz %0" : "=r" (out) :: "memory"); 55 | return out; 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /spinlock-mcs.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_MCS 2 | #define _SPINLOCK_MCS 3 | 4 | #define cmpxchg(P, O, N) __sync_val_compare_and_swap((P), (O), (N)) 5 | 6 | #define barrier() asm volatile("": : :"memory") 7 | #define cpu_relax() asm volatile("pause\n": : :"memory") 8 | 9 | static inline void *xchg_64(void *ptr, void *x) 10 | { 11 | __asm__ __volatile__("xchgq %0,%1" 12 | :"=r" ((unsigned long long) x) 13 | :"m" (*(volatile long long *)ptr), "0" ((unsigned long long) x) 14 | :"memory"); 15 | 16 | return x; 17 | } 18 | 19 | typedef struct mcs_lock_t mcs_lock_t; 20 | struct mcs_lock_t 21 | { 22 | mcs_lock_t *next; 23 | int spin; 24 | }; 25 | typedef struct mcs_lock_t *mcs_lock; 26 | 27 | static inline void lock_mcs(mcs_lock *m, mcs_lock_t *me) 28 | { 29 | mcs_lock_t *tail; 30 | 31 | me->next = NULL; 32 | me->spin = 0; 33 | 34 | tail = xchg_64(m, me); 35 | 36 | /* No one there? */ 37 | if (!tail) return; 38 | 39 | /* Someone there, need to link in */ 40 | tail->next = me; 41 | 42 | /* Make sure we do the above setting of next. */ 43 | barrier(); 44 | 45 | /* Spin on my spin variable */ 46 | while (!me->spin) cpu_relax(); 47 | 48 | return; 49 | } 50 | 51 | static inline void unlock_mcs(mcs_lock *m, mcs_lock_t *me) 52 | { 53 | /* No successor yet? */ 54 | if (!me->next) 55 | { 56 | /* Try to atomically unlock */ 57 | if (cmpxchg(m, me, NULL) == me) return; 58 | 59 | /* Wait for successor to appear */ 60 | while (!me->next) cpu_relax(); 61 | } 62 | 63 | /* Unlock next one */ 64 | me->next->spin = 1; 65 | } 66 | 67 | static inline int trylock_mcs(mcs_lock *m, mcs_lock_t *me) 68 | { 69 | mcs_lock_t *tail; 70 | 71 | me->next = NULL; 72 | me->spin = 0; 73 | 74 | /* Try to lock */ 75 | tail = cmpxchg(m, NULL, &me); 76 | 77 | /* No one was there - can quickly return */ 78 | if (!tail) return 0; 79 | 80 | return 1; // Busy 81 | } 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /spinlock-k42.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_K42 2 | #define _SPINLOCK_K42 3 | 4 | /* Code copied from http://locklessinc.com/articles/locks/ 5 | * Note this algorithm is patented by IBM. */ 6 | 7 | /* Macro hack to avoid changing the name. 
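 * k42_lock/k42_unlock below are a queue lock in the MCS family: each waiter
 * enqueues a stack-allocated k42lock node and spins on its own field, but
 * unlike plain MCS the unlock path only needs the lock pointer, which is why
 * it fits behind the same spin_lock/spin_unlock interface.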
*/
8 | #define spin_lock k42_lock
9 | #define spin_unlock k42_unlock
10 | #define spinlock k42lock
11 | 
12 | #define SPINLOCK_INITIALIZER { 0, 0 }
13 | 
14 | #define cmpxchg(P, O, N) __sync_val_compare_and_swap((P), (O), (N))
15 | 
16 | #define barrier() asm volatile("": : :"memory")
17 | #define cpu_relax() asm volatile("pause\n": : :"memory")
18 | 
19 | static inline void *xchg_64(void *ptr, void *x)
20 | {
21 | __asm__ __volatile__("xchgq %0,%1"
22 | :"=r" ((unsigned long long) x)
23 | :"m" (*(volatile long long *)ptr), "0" ((unsigned long long) x)
24 | :"memory");
25 | 
26 | return x;
27 | }
28 | 
29 | typedef struct k42lock k42lock;
30 | struct k42lock
31 | {
32 | k42lock *next;
33 | k42lock *tail;
34 | };
35 | 
36 | static inline void k42_lock(k42lock *l)
37 | {
38 | k42lock me;
39 | k42lock *pred, *succ;
40 | me.next = NULL;
41 | 
42 | barrier();
43 | 
44 | pred = xchg_64(&l->tail, &me);
45 | if (pred)
46 | {
47 | me.tail = (void *) 1;
48 | 
49 | barrier();
50 | pred->next = &me;
51 | barrier();
52 | 
53 | while (me.tail) cpu_relax();
54 | }
55 | 
56 | succ = me.next;
57 | 
58 | if (!succ)
59 | {
60 | barrier();
61 | l->next = NULL;
62 | 
63 | if (cmpxchg(&l->tail, &me, &l->next) != &me)
64 | {
65 | while (!me.next) cpu_relax();
66 | 
67 | l->next = me.next;
68 | }
69 | }
70 | else
71 | {
72 | l->next = succ;
73 | }
74 | }
75 | 
76 | static inline void k42_unlock(k42lock *l)
77 | {
78 | k42lock *succ = l->next;
79 | 
80 | barrier();
81 | 
82 | if (!succ)
83 | {
84 | if (cmpxchg(&l->tail, &l->next, NULL) == (void *) &l->next) return;
85 | 
86 | while (!l->next) cpu_relax();
87 | succ = l->next;
88 | }
89 | 
90 | succ->tail = NULL;
91 | }
92 | 
93 | static inline int k42_trylock(k42lock *l)
94 | {
95 | if (!cmpxchg(&l->tail, NULL, &l->next)) return 0;
96 | 
97 | return 1; // Busy
98 | }
99 | 
100 | #endif
101 | 
--------------------------------------------------------------------------------
/stack.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <pthread.h>
4 | #include "spinlock-pthread.h"
5 | 
6 | /*
7 | * Naive implementation of a lock-free stack which does not handle the ABA problem.
8 | * This works if only one thread is doing pop.
9 | *
10 | * For a lock-free stack which handles the ABA problem, see streamflow.
11 | */
12 | 
13 | /* Return 1 if the swap happened. */
14 | static inline unsigned int compare_and_swap(volatile void *address,
15 | void *old_value, void *new_value)
16 | {
17 | void *prev = 0;
18 | 
19 | asm volatile("lock; cmpxchgq %1,%2"
20 | : "=a"(prev)
21 | : "r"(new_value), "m"(*(long *)address), "0"(old_value)
22 | : "memory");
23 | 
24 | return prev == old_value;
25 | }
26 | 
27 | /* Flag to use pthread mutex or spinlock, useful when we want to compare the
28 | * performance of different implementations.
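 * Define MUTEX or SPINLOCK to use the locked push/pop below; with both left
 * commented out, the lock-free compare_and_swap version is compiled.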
*/ 29 | /*#define MUTEX*/ 30 | /*#define SPINLOCK*/ 31 | 32 | typedef struct Node { 33 | struct Node *next; 34 | int val; 35 | } Node; 36 | 37 | typedef struct { 38 | volatile Node *top; 39 | #ifdef MUTEX 40 | pthread_mutex_t mutex; 41 | #elif defined(SPINLOCK) 42 | Spinlock slock; 43 | #endif 44 | } Stack; 45 | 46 | Stack gstack; 47 | 48 | #if defined(MUTEX) || defined(SPINLOCK) 49 | 50 | void push(Stack *stack, Node *n) { 51 | #ifdef MUTEX 52 | pthread_mutex_lock(&stack->mutex); 53 | #elif defined(SPINLOCK) 54 | spin_lock(&stack->slock); 55 | #endif 56 | 57 | n->next = (Node *)stack->top; 58 | stack->top = n; 59 | 60 | #ifdef MUTEX 61 | pthread_mutex_unlock(&stack->mutex); 62 | #elif defined(SPINLOCK) 63 | spin_unlock(&stack->slock); 64 | #endif 65 | } 66 | 67 | Node *pop(Stack *stack) { 68 | Node *oldtop; 69 | 70 | if (stack->top == NULL) 71 | return NULL; 72 | 73 | #ifdef MUTEX 74 | pthread_mutex_lock(&stack->mutex); 75 | #elif defined(SPINLOCK) 76 | spin_lock(&stack->slock); 77 | #endif 78 | oldtop = (Node *)stack->top; 79 | stack->top = oldtop->next; 80 | 81 | #ifdef MUTEX 82 | pthread_mutex_unlock(&stack->mutex); 83 | #elif defined(SPINLOCK) 84 | spin_unlock(&stack->slock); 85 | #endif 86 | 87 | return oldtop; 88 | } 89 | 90 | #else 91 | 92 | /* Lock free version. */ 93 | void push(Stack *stack, Node *n) { 94 | Node *oldtop; 95 | while (1) { 96 | oldtop = (Node *)stack->top; 97 | n->next = oldtop; 98 | if (compare_and_swap(&stack->top, oldtop, n)) 99 | return; 100 | } 101 | } 102 | 103 | Node *pop(Stack *stack) { 104 | Node *oldtop, *next; 105 | 106 | while (1) { 107 | oldtop = (Node *)stack->top; 108 | if (oldtop == NULL) 109 | return NULL; 110 | next = oldtop->next; 111 | if (compare_and_swap(&stack->top, oldtop, next)) 112 | return oldtop; 113 | } 114 | } 115 | 116 | #endif 117 | 118 | /* Testing code. */ 119 | 120 | #define NITERS 2000000 /* Number of pushes for each push thread. */ 121 | #define NTHR 3 /* Number of push threads. */ 122 | 123 | void *pusher(void *dummy) { 124 | long i, tid = (long) dummy; 125 | for (i = 0; i < NITERS; i++) { 126 | Node *n = malloc(sizeof(*n)); 127 | n->val = NTHR * i + tid; 128 | push(&gstack, n); 129 | } 130 | 131 | return NULL; 132 | } 133 | 134 | static inline void atomic_inc64(volatile unsigned long* address) 135 | { 136 | asm volatile( 137 | "lock; incq %0\n\t" 138 | : "=m" (*address) 139 | : "m" (*address)); 140 | } 141 | 142 | volatile unsigned long popcount = 0; 143 | 144 | void *poper(void *dummy) { 145 | Node *n; 146 | 147 | while (popcount < NTHR * NITERS) { 148 | n = pop(&gstack); 149 | if (n) { 150 | printf("%d\n", n->val); 151 | /* Only one pop thread. */ 152 | popcount++; 153 | /*atomic_inc64(&popcount);*/ 154 | } 155 | } 156 | 157 | return NULL; 158 | } 159 | 160 | int main(int argc, const char *argv[]) { 161 | #ifdef MUTEX 162 | pthread_mutex_init(&gstack.mutex, NULL); 163 | #endif 164 | 165 | #ifdef SPINLOCK 166 | SPIN_LOCK_INIT(&gstack.slock); 167 | #endif 168 | 169 | /* TODO We need more threads to test the scalability of the stack 170 | * implementation. 171 | * 172 | * With 3 pusher calling push, 173 | * 1. spinlock give worst performance. 174 | * 2. mutex is faster than spinlock, but it's running time is not stable. 175 | * 3. lock free version is fast, and running time is stable. 
176 | */
177 | 
178 | pthread_t thr_push[NTHR], thr_pop;
179 | long i;
180 | 
181 | for (i = 0; i < NTHR; i++) {
182 | if (pthread_create(&thr_push[i], NULL, pusher, (void *)i) != 0) {
183 | perror("thread creating failed");
184 | }
185 | }
186 | 
187 | if (pthread_create(&thr_pop, NULL, poper, NULL) != 0) {
188 | perror("thread creating failed");
189 | }
190 | 
191 | for (i = 0; i < NTHR; i++) {
192 | pthread_join(thr_push[i], NULL);
193 | }
194 | pthread_join(thr_pop, NULL);
195 | 
196 | return 0;
197 | }
198 | 
199 | 
--------------------------------------------------------------------------------
/test-spinlock.c:
--------------------------------------------------------------------------------
1 | #define _GNU_SOURCE
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <stdint.h>
5 | #include <assert.h>
6 | #include <pthread.h>
7 | #include <sched.h>
8 | #include <unistd.h>
9 | #include <sys/time.h>
10 | 
11 | #ifdef XCHG
12 | #include "spinlock-xchg.h"
13 | #elif defined(XCHGBACKOFF)
14 | #include "spinlock-xchg-backoff.h"
15 | #elif defined(K42)
16 | #include "spinlock-k42.h"
17 | #elif defined(MCS)
18 | #include "spinlock-mcs.h"
19 | #elif defined(TICKET)
20 | #include "spinlock-ticket.h"
21 | #elif defined(PTHREAD)
22 | #include "spinlock-pthread.h"
23 | #elif defined(CMPXCHG)
24 | #include "spinlock-cmpxchg.h"
25 | #elif defined(RTM)
26 | #include "spinlock-xchg.h"
27 | #include "rtm.h"
28 | #elif defined(HLE)
29 | #include "spinlock-xchg-hle.h"
30 | #else
31 | #error "must define a spinlock implementation"
32 | #endif
33 | 
34 | #ifndef cpu_relax
35 | #define cpu_relax() asm volatile("pause\n": : :"memory")
36 | #endif
37 | 
38 | /* It's hard to say which spinlock implementation performs best. I guess the
39 | * performance depends on the CPU topology, which affects the cache coherence
40 | * messages, and maybe on other factors.
41 | *
42 | * The result of binding threads to the same physical CPU shows that when
43 | * threads are on a single physical CPU, contention will not cause severe
44 | * performance degradation. But there's one exception: cmpxchg. Its performance
45 | * degrades no matter which physical CPU the threads reside on.
46 | *
47 | * Here's the result on a Dell R910 server (4 CPUs, 10 cores each), with Intel(R)
48 | * Xeon(R) CPU E7- 4850 @ 2.00GHz:
49 | *
50 | * - pthread_mutex BEATS ALL when there are more than 2 cores running this
51 | * benchmark! Not sure what trick pthread_mutex uses.
52 | *
53 | * - For the spinlock with cmpxchg, performance degrades very quickly as the
54 | * number of threads increases. It does not seem to scale well.
55 | *
56 | * - The spinlock with xchg scales much better than cmpxchg, but it's
57 | * slower with 2 and 4 cores.
58 | *
59 | * - For K42, the 2-thread case is extremely bad; for other thread counts,
60 | * the performance is stable and scales well.
61 | *
62 | * - The MCS spinlock has performance similar to K42, and does not have the
63 | * extremely bad 2-thread case.
64 | *
65 | * - The ticket spinlock actually performs very badly.
66 | */
67 | 
68 | /* Total number of lock/unlock pairs.
69 | * Note we need to ensure the total number of lock/unlock operations is the
70 | * same no matter how many threads are used. */
71 | #define N_PAIR 16000000
72 | 
73 | /* Bind threads to specific cores. The goal is to place the threads on the
74 | * same physical CPU. Modify bind_core before using this.
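 * The mapping in bind_core below assumes the 4-socket, 10-cores-per-socket
 * machine described above, where logical core 4*c + s sits on socket s.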
*/ 75 | //#define BIND_CORE 76 | 77 | static int nthr = 0; 78 | 79 | static volatile uint32_t wflag; 80 | /* Wait on a flag to make all threads start almost at the same time. */ 81 | void wait_flag(volatile uint32_t *flag, uint32_t expect) { 82 | __sync_fetch_and_add((uint32_t *)flag, 1); 83 | while (*flag != expect) { 84 | cpu_relax(); 85 | } 86 | } 87 | 88 | static struct timeval start_time; 89 | static struct timeval end_time; 90 | 91 | static void calc_time(struct timeval *start, struct timeval *end) { 92 | if (end->tv_usec < start->tv_usec) { 93 | end->tv_sec -= 1; 94 | end->tv_usec += 1000000; 95 | } 96 | 97 | assert(end->tv_sec >= start->tv_sec); 98 | assert(end->tv_usec >= start->tv_usec); 99 | struct timeval interval = { 100 | end->tv_sec - start->tv_sec, 101 | end->tv_usec - start->tv_usec 102 | }; 103 | printf("%ld.%06ld\t", (long)interval.tv_sec, (long)interval.tv_usec); 104 | } 105 | 106 | // Use an array of counter to see effect on RTM if touches more cache line. 107 | #define NCOUNTER 1 108 | #define CACHE_LINE 64 109 | 110 | // Use thread local counter to avoid cache contention between cores. 111 | // For TSX, this avoids TX conflicts so the performance overhead/improvement is 112 | // due to TSX mechanism. 113 | static __thread int8_t counter[CACHE_LINE*NCOUNTER]; 114 | 115 | #ifdef MCS 116 | mcs_lock cnt_lock = NULL; 117 | #else 118 | spinlock sl; 119 | #endif 120 | 121 | #ifdef BIND_CORE 122 | void bind_core(int threadid) { 123 | /* cores with logical id 4x is on CPU physical id 0 */ 124 | /* cores with logical id 4x+1 is on CPU physical id 1 */ 125 | int phys_id = threadid / 10; 126 | int core = threadid % 10; 127 | 128 | int logical_id = 4 * core + phys_id; 129 | /*printf("thread %d bind to logical core %d on physical id %d\n", threadid, logical_id, phys_id);*/ 130 | 131 | cpu_set_t set; 132 | CPU_ZERO(&set); 133 | CPU_SET(logical_id, &set); 134 | 135 | if (sched_setaffinity(0, sizeof(set), &set) != 0) { 136 | perror("Set affinity failed"); 137 | exit(EXIT_FAILURE); 138 | } 139 | } 140 | #endif 141 | 142 | void *inc_thread(void *id) { 143 | int n = N_PAIR / nthr; 144 | assert(n * nthr == N_PAIR); 145 | #ifdef MCS 146 | mcs_lock_t local_lock; 147 | #endif 148 | #ifdef BIND_CORE 149 | bind_core((int)(long)(id)); 150 | #endif 151 | wait_flag(&wflag, nthr); 152 | 153 | if (((long) id == 0)) { 154 | /*printf("get start time\n");*/ 155 | gettimeofday(&start_time, NULL); 156 | } 157 | 158 | /* Start lock unlock test. 
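 * Each iteration acquires the lock, bumps the thread-local counters, and
 * releases it. With -DRTM the critical section first runs as a transaction
 * and falls back to the real spin_lock if _xbegin fails or the lock byte is
 * seen BUSY inside the transaction.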
*/ 159 | for (int i = 0; i < n; i++) { 160 | #ifdef MCS 161 | lock_mcs(&cnt_lock, &local_lock); 162 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 163 | unlock_mcs(&cnt_lock, &local_lock); 164 | #elif RTM 165 | int status; 166 | if ((status = _xbegin()) == _XBEGIN_STARTED) { 167 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 168 | if (sl == BUSY) 169 | _xabort(1); 170 | _xend(); 171 | } else { 172 | spin_lock(&sl); 173 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 174 | spin_unlock(&sl); 175 | } 176 | #else 177 | spin_lock(&sl); 178 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 179 | spin_unlock(&sl); 180 | #endif 181 | } 182 | 183 | if (__sync_fetch_and_add((uint32_t *)&wflag, -1) == 1) { 184 | /*printf("get end time\n");*/ 185 | gettimeofday(&end_time, NULL); 186 | } 187 | return NULL; 188 | } 189 | 190 | int main(int argc, const char *argv[]) 191 | { 192 | pthread_t *thr; 193 | int ret = 0; 194 | 195 | if (argc != 2) { 196 | printf("Usage: %s \n", argv[0]); 197 | exit(1); 198 | } 199 | 200 | nthr = atoi(argv[1]); 201 | /*printf("using %d threads\n", nthr);*/ 202 | thr = calloc(sizeof(*thr), nthr); 203 | 204 | // Start thread 205 | for (long i = 0; i < nthr; i++) { 206 | if (pthread_create(&thr[i], NULL, inc_thread, (void *)i) != 0) { 207 | perror("thread creating failed"); 208 | } 209 | } 210 | // join thread 211 | for (long i = 0; i < nthr; i++) 212 | pthread_join(thr[i], NULL); 213 | 214 | calc_time(&start_time, &end_time); 215 | /* 216 | *for (int i = 0; i < NCOUNTER; i++) { 217 | * if (counter[i] == N_PAIR) { 218 | * } else { 219 | * printf("counter %d error\n", i); 220 | * ret = 1; 221 | * } 222 | *} 223 | */ 224 | 225 | return ret; 226 | } 227 | --------------------------------------------------------------------------------
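Build-and-run sketch (not one of the files above; assumes GNU make and GCC with pthreads):

    make
    ./run-test-spinlock.sh

test-rtm and test-spinlock-xchg-hle encode the TSX instructions as raw byte sequences, so they build even with older binutils, but test-rtm only runs on a CPU with RTM support.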