├── README.md
├── spinlock-pthread.h
├── run-test-spinlock.sh
├── spinlock-xchg.h
├── Makefile
├── spinlock-cmpxchg.h
├── spinlock-xchg-hle.h
├── spinlock-ticket.h
├── spinlock-xchg-backoff.h
├── rtm.h
├── spinlock-mcs.h
├── spinlock-k42.h
├── stack.c
└── test-spinlock.c
/README.md:
--------------------------------------------------------------------------------
1 | Various spinlock implementations from the article [Spinlocks and Read-Write Locks](http://locklessinc.com/articles/locks/) by *Lockless Inc*.
2 | 
3 | I made some modifications to make each implementation self-contained and provided a benchmark script. The code relies on GCC's built-in functions for atomic memory access.
4 | 
5 | **Note: Scalability is achieved by avoiding sharing and contention, not by scalable locks.**
6 | 
--------------------------------------------------------------------------------
/spinlock-pthread.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_PTHREAD_H
2 | #define _SPINLOCK_PTHREAD_H
3 | 
4 | #define SPINLOCK_ATTR static __inline __attribute__((always_inline, no_instrument_function))
5 | 
6 | #define spinlock pthread_mutex_t
7 | 
8 | SPINLOCK_ATTR void spin_lock(spinlock *lock)
9 | {
10 | pthread_mutex_lock(lock);
11 | }
12 | 
13 | SPINLOCK_ATTR void spin_unlock(spinlock *lock)
14 | {
15 | pthread_mutex_unlock(lock);
16 | }
17 | 
18 | #define SPINLOCK_INITIALIZER { 0 }
19 | 
20 | #endif /* _SPINLOCK_PTHREAD_H */
21 | 
--------------------------------------------------------------------------------
/run-test-spinlock.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | function run_test() {
4 | for nthr in 1 2 4 8 16 32; do
5 | ./$1 $nthr > /dev/null
6 | for i in `seq 1 3`; do
7 | ./$1 $nthr
8 | done
9 | echo
10 | done
11 | }
12 | 
13 | echo "test spin lock using cmpxchg"
14 | run_test "test-spinlock-cmpxchg"
15 | 
16 | echo "test spin lock using xchg"
17 | run_test "test-spinlock-xchg"
18 | 
19 | echo "test spin lock using k42"
20 | run_test "test-spinlock-k42"
21 | 
22 | echo "test spin lock using mcs"
23 | run_test "test-spinlock-mcs"
24 | 
25 | echo "test spin lock using ticket"
26 | run_test "test-spinlock-ticket"
27 | 
28 | echo "test spin lock using pthread"
29 | run_test "test-spinlock-pthread"
30 | 
31 | echo "test spin lock using xchg-backoff"
32 | run_test "test-spinlock-xchg-backoff"
33 | 
34 | 
--------------------------------------------------------------------------------
/spinlock-xchg.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_XCHG_H
2 | #define _SPINLOCK_XCHG_H
3 | 
4 | /* Spin lock using xchg.
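 * The lock loop below is a test-and-test-and-set: xchg_8 performs the atomic
 * test-and-set, and a waiter that fails then spins on a plain read of the
 * lock byte (keeping the cache line in the shared state) until the lock
 * looks free before issuing another xchg.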
5 | * Copied from http://locklessinc.com/articles/locks/ 6 | */ 7 | 8 | /* Compile read-write barrier */ 9 | #define barrier() asm volatile("": : :"memory") 10 | 11 | /* Pause instruction to prevent excess processor bus usage */ 12 | #define cpu_relax() asm volatile("pause\n": : :"memory") 13 | 14 | static inline unsigned short xchg_8(void *ptr, unsigned char x) 15 | { 16 | __asm__ __volatile__("xchgb %0,%1" 17 | :"=r" (x) 18 | :"m" (*(volatile unsigned char *)ptr), "0" (x) 19 | :"memory"); 20 | 21 | return x; 22 | } 23 | 24 | #define BUSY 1 25 | typedef unsigned char spinlock; 26 | 27 | #define SPINLOCK_INITIALIZER 0 28 | 29 | static inline void spin_lock(spinlock *lock) 30 | { 31 | while (1) { 32 | if (!xchg_8(lock, BUSY)) return; 33 | 34 | while (*lock) cpu_relax(); 35 | } 36 | } 37 | 38 | static inline void spin_unlock(spinlock *lock) 39 | { 40 | barrier(); 41 | *lock = 0; 42 | } 43 | 44 | static inline int spin_trylock(spinlock *lock) 45 | { 46 | return xchg_8(lock, BUSY); 47 | } 48 | 49 | #endif /* _SPINLOCK_XCHG_H */ 50 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS = -O2 -g -std=gnu99 -Wall 2 | LDFLAGS = -lpthread 3 | 4 | programs = test-spinlock-cmpxchg test-spinlock-xchg test-spinlock-k42 \ 5 | test-spinlock-mcs test-spinlock-ticket test-spinlock-pthread \ 6 | test-spinlock-xchg-backoff test-rtm test-spinlock-xchg-hle 7 | 8 | all: $(programs) 9 | 10 | test-spinlock-cmpxchg: test-spinlock.c 11 | $(CC) $(CFLAGS) -DCMPXCHG $^ -o $@ $(LDFLAGS) 12 | 13 | test-spinlock-xchg: test-spinlock.c 14 | $(CC) $(CFLAGS) -DXCHG $^ -o $@ $(LDFLAGS) 15 | 16 | test-spinlock-k42: test-spinlock.c 17 | $(CC) $(CFLAGS) -DK42 $^ -o $@ $(LDFLAGS) 18 | 19 | test-spinlock-mcs: test-spinlock.c 20 | $(CC) $(CFLAGS) -DMCS $^ -o $@ $(LDFLAGS) 21 | 22 | test-spinlock-ticket: test-spinlock.c 23 | $(CC) $(CFLAGS) -DTICKET $^ -o $@ $(LDFLAGS) 24 | 25 | test-spinlock-pthread: test-spinlock.c 26 | $(CC) $(CFLAGS) -DPTHREAD $^ -o $@ $(LDFLAGS) 27 | 28 | test-spinlock-xchg-backoff: test-spinlock.c 29 | $(CC) $(CFLAGS) -DXCHGBACKOFF $^ -o $@ $(LDFLAGS) 30 | 31 | test-spinlock-xchg-hle: test-spinlock.c 32 | $(CC) $(CFLAGS) -DHLE $^ -o $@ $(LDFLAGS) 33 | 34 | test-rtm: test-spinlock.c 35 | $(CC) $(CFLAGS) -DRTM $^ -o $@ $(LDFLAGS) 36 | 37 | %:%.c 38 | $(CC) $(CFLAGS) $< -o $@ 39 | 40 | clean: 41 | -rm -f *.o 42 | -rm -f $(programs) 43 | -------------------------------------------------------------------------------- /spinlock-cmpxchg.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_CMPXCHG_H 2 | #define _SPINLOCK_CMPXCHG_H 3 | 4 | typedef struct { 5 | volatile char lock; 6 | } spinlock; 7 | 8 | #define SPINLOCK_ATTR static __inline __attribute__((always_inline, no_instrument_function)) 9 | 10 | /* Pause instruction to prevent excess processor bus usage */ 11 | #define cpu_relax() asm volatile("pause\n": : :"memory") 12 | 13 | SPINLOCK_ATTR char __testandset(spinlock *p) 14 | { 15 | char readval = 0; 16 | 17 | asm volatile ( 18 | "lock; cmpxchgb %b2, %0" 19 | : "+m" (p->lock), "+a" (readval) 20 | : "r" (1) 21 | : "cc"); 22 | return readval; 23 | } 24 | 25 | SPINLOCK_ATTR void spin_lock(spinlock *lock) 26 | { 27 | while (__testandset(lock)) { 28 | /* Should wait until lock is free before another try. 29 | * cmpxchg is write to cache, competing write for a sinlge cache line 30 | * would generate large amount of cache traffic. 
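 * Each failed cmpxchg is still a locked write that must own the cache line
 * exclusively, so waiters keep invalidating each other's copies.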
That's why this
31 | * implementation does not scale as well as the xchg-based one. Otherwise,
32 | * they should have similar performance. */
33 | cpu_relax();
34 | }
35 | }
36 | 
37 | SPINLOCK_ATTR void spin_unlock(spinlock *s)
38 | {
39 | s->lock = 0;
40 | }
41 | 
42 | #define SPINLOCK_INITIALIZER { 0 }
43 | 
44 | #endif /* _SPINLOCK_CMPXCHG_H */
45 | 
--------------------------------------------------------------------------------
/spinlock-xchg-hle.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_XCHG_HLE_H
2 | #define _SPINLOCK_XCHG_HLE_H
3 | 
4 | /* Spin lock using xchg with HLE (hardware lock elision) prefixes.
5 | * Copied from http://locklessinc.com/articles/locks/
6 | */
7 | 
8 | /* Compile read-write barrier */
9 | #define barrier() asm volatile("": : :"memory")
10 | 
11 | /* Pause instruction to prevent excess processor bus usage */
12 | #define cpu_relax() asm volatile("pause\n": : :"memory")
13 | 
14 | #define __HLE_ACQUIRE ".byte 0xf2 ; "
15 | #define __HLE_RELEASE ".byte 0xf3 ; "
16 | 
17 | static inline unsigned short xchg_8(void *ptr, unsigned char x)
18 | {
19 | __asm__ __volatile__(__HLE_ACQUIRE "xchgb %0,%1"
20 | :"=r" (x)
21 | :"m" (*(volatile unsigned char *)ptr), "0" (x)
22 | :"memory");
23 | 
24 | return x;
25 | }
26 | 
27 | #define BUSY 1
28 | typedef unsigned char spinlock;
29 | 
30 | #define SPINLOCK_INITIALIZER 0
31 | 
32 | static inline void spin_lock(spinlock *lock)
33 | {
34 | while (1) {
35 | if (!xchg_8(lock, BUSY)) return;
36 | 
37 | while (*lock) cpu_relax();
38 | }
39 | }
40 | 
41 | static inline void spin_unlock(spinlock *lock)
42 | {
43 | __asm__ __volatile__(__HLE_RELEASE "movb $0, %0"
44 | :"=m" (*lock)
45 | :
46 | :"memory");
47 | }
48 | 
49 | static inline int spin_trylock(spinlock *lock)
50 | {
51 | return xchg_8(lock, BUSY);
52 | }
53 | 
54 | #endif /* _SPINLOCK_XCHG_HLE_H */
55 | 
--------------------------------------------------------------------------------
/spinlock-ticket.h:
--------------------------------------------------------------------------------
1 | #ifndef _SPINLOCK_TICKET_H
2 | #define _SPINLOCK_TICKET_H
3 | 
4 | /* Code copied from http://locklessinc.com/articles/locks/ */
5 | 
6 | #define cmpxchg(P, O, N) __sync_val_compare_and_swap((P), (O), (N))
7 | #define atomic_xadd(P, V) __sync_fetch_and_add((P), (V))
8 | 
9 | #define barrier() asm volatile("": : :"memory")
10 | #define cpu_relax() asm volatile("pause\n": : :"memory")
11 | 
12 | #define spin_lock ticket_lock
13 | #define spin_unlock ticket_unlock
14 | #define spinlock ticketlock
15 | 
16 | #define SPINLOCK_INITIALIZER { 0, 0 }
17 | 
18 | typedef union ticketlock ticketlock;
19 | 
20 | union ticketlock
21 | {
22 | unsigned u;
23 | struct
24 | {
25 | unsigned short ticket;
26 | unsigned short users;
27 | } s;
28 | };
29 | 
30 | static inline void ticket_lock(ticketlock *t)
31 | {
32 | unsigned short me = atomic_xadd(&t->s.users, 1);
33 | 
34 | while (t->s.ticket != me) cpu_relax();
35 | }
36 | 
37 | static inline void ticket_unlock(ticketlock *t)
38 | {
39 | barrier();
40 | t->s.ticket++;
41 | }
42 | 
43 | static inline int ticket_trylock(ticketlock *t)
44 | {
45 | unsigned short me = t->s.users;
46 | unsigned short menew = me + 1;
47 | unsigned cmp = ((unsigned) me << 16) + me;
48 | unsigned cmpnew = ((unsigned) menew << 16) + me;
49 | 
50 | if (cmpxchg(&t->u, cmp, cmpnew) == cmp) return 0;
51 | 
52 | return 1; // Busy
53 | }
54 | 
55 | static inline int ticket_lockable(ticketlock *t)
56 | {
57 | ticketlock u = *t;
58 | barrier();
59 | return (u.s.ticket == u.s.users);
60 | }
61 | 
62 | #endif /* _SPINLOCK_TICKET_H */
63 | 
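/* How the ticket lock works: atomic_xadd on s.users hands each thread a
 * unique ticket, and a thread enters once s.ticket reaches that number, so
 * acquisition is FIFO and fair; the cost is that every waiter polls the
 * same cache line. */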
-------------------------------------------------------------------------------- /spinlock-xchg-backoff.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_XCHG_BACKOFF_H 2 | #define _SPINLOCK_XCHG_BACKOFF_H 3 | 4 | /* Spin lock using xchg. Added backoff wait to avoid concurrent lock/unlock 5 | * operation. 6 | * Original code copied from http://locklessinc.com/articles/locks/ 7 | */ 8 | 9 | /* Compile read-write barrier */ 10 | #define barrier() asm volatile("": : :"memory") 11 | 12 | /* Pause instruction to prevent excess processor bus usage */ 13 | #define cpu_relax() asm volatile("pause\n": : :"memory") 14 | 15 | static inline unsigned short xchg_8(void *ptr, unsigned char x) 16 | { 17 | __asm__ __volatile__("xchgb %0,%1" 18 | :"=r" (x) 19 | :"m" (*(volatile unsigned char *)ptr), "0" (x) 20 | :"memory"); 21 | 22 | return x; 23 | } 24 | 25 | #define BUSY 1 26 | typedef unsigned char spinlock; 27 | 28 | #define SPINLOCK_INITIALIZER 0 29 | 30 | static inline void spin_lock(spinlock *lock) 31 | { 32 | int wait = 1; 33 | while (1) { 34 | if (!xchg_8(lock, BUSY)) return; 35 | 36 | // wait here is important to performance. 37 | for (int i = 0; i < wait; i++) { 38 | cpu_relax(); 39 | } 40 | while (*lock) { 41 | wait *= 2; // exponential backoff if can't get lock 42 | for (int i = 0; i < wait; i++) { 43 | cpu_relax(); 44 | } 45 | } 46 | } 47 | } 48 | 49 | static inline void spin_unlock(spinlock *lock) 50 | { 51 | barrier(); 52 | *lock = 0; 53 | } 54 | 55 | static inline int spin_trylock(spinlock *lock) 56 | { 57 | return xchg_8(lock, BUSY); 58 | } 59 | 60 | #endif /* _SPINLOCK_XCHG_BACKOFF_H */ 61 | -------------------------------------------------------------------------------- /rtm.h: -------------------------------------------------------------------------------- 1 | #ifndef _RTM_H 2 | #define _RTM_H 1 3 | 4 | /* 5 | * Copyright (c) 2012,2013 Intel Corporation 6 | * Author: Andi Kleen 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that: (1) source code distributions 10 | * retain the above copyright notice and this paragraph in its entirety, (2) 11 | * distributions including binary code include the above copyright notice and 12 | * this paragraph in its entirety in the documentation or other materials 13 | * provided with the distribution 14 | * 15 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED 16 | * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 17 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 18 | */ 19 | 20 | /* Official RTM intrinsics interface matching gcc/icc, but works 21 | on older gcc compatible compilers and binutils. 
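   Typical usage: if (_xbegin() == _XBEGIN_STARTED) { transactional region;
   _xend(); } else { take a fallback lock }. _xtest() reports whether the
   code is currently executing inside a transaction; see the RTM path in
   test-spinlock.c for a lock-elision example.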
*/ 22 | 23 | #define _XBEGIN_STARTED (~0u) 24 | #define _XABORT_EXPLICIT (1 << 0) 25 | #define _XABORT_RETRY (1 << 1) 26 | #define _XABORT_CONFLICT (1 << 2) 27 | #define _XABORT_CAPACITY (1 << 3) 28 | #define _XABORT_DEBUG (1 << 4) 29 | #define _XABORT_NESTED (1 << 5) 30 | #define _XABORT_CODE(x) (((x) >> 24) & 0xff) 31 | 32 | #define __rtm_force_inline __attribute__((__always_inline__)) inline 33 | 34 | static __rtm_force_inline int _xbegin(void) 35 | { 36 | int ret = _XBEGIN_STARTED; 37 | asm volatile(".byte 0xc7,0xf8 ; .long 0" : "+a" (ret) :: "memory"); 38 | return ret; 39 | } 40 | 41 | static __rtm_force_inline void _xend(void) 42 | { 43 | asm volatile(".byte 0x0f,0x01,0xd5" ::: "memory"); 44 | } 45 | 46 | static __rtm_force_inline void _xabort(const unsigned int status) 47 | { 48 | asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory"); 49 | } 50 | 51 | static __rtm_force_inline int _xtest(void) 52 | { 53 | unsigned char out; 54 | asm volatile(".byte 0x0f,0x01,0xd6 ; setnz %0" : "=r" (out) :: "memory"); 55 | return out; 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /spinlock-mcs.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_MCS 2 | #define _SPINLOCK_MCS 3 | 4 | #define cmpxchg(P, O, N) __sync_val_compare_and_swap((P), (O), (N)) 5 | 6 | #define barrier() asm volatile("": : :"memory") 7 | #define cpu_relax() asm volatile("pause\n": : :"memory") 8 | 9 | static inline void *xchg_64(void *ptr, void *x) 10 | { 11 | __asm__ __volatile__("xchgq %0,%1" 12 | :"=r" ((unsigned long long) x) 13 | :"m" (*(volatile long long *)ptr), "0" ((unsigned long long) x) 14 | :"memory"); 15 | 16 | return x; 17 | } 18 | 19 | typedef struct mcs_lock_t mcs_lock_t; 20 | struct mcs_lock_t 21 | { 22 | mcs_lock_t *next; 23 | int spin; 24 | }; 25 | typedef struct mcs_lock_t *mcs_lock; 26 | 27 | static inline void lock_mcs(mcs_lock *m, mcs_lock_t *me) 28 | { 29 | mcs_lock_t *tail; 30 | 31 | me->next = NULL; 32 | me->spin = 0; 33 | 34 | tail = xchg_64(m, me); 35 | 36 | /* No one there? */ 37 | if (!tail) return; 38 | 39 | /* Someone there, need to link in */ 40 | tail->next = me; 41 | 42 | /* Make sure we do the above setting of next. */ 43 | barrier(); 44 | 45 | /* Spin on my spin variable */ 46 | while (!me->spin) cpu_relax(); 47 | 48 | return; 49 | } 50 | 51 | static inline void unlock_mcs(mcs_lock *m, mcs_lock_t *me) 52 | { 53 | /* No successor yet? */ 54 | if (!me->next) 55 | { 56 | /* Try to atomically unlock */ 57 | if (cmpxchg(m, me, NULL) == me) return; 58 | 59 | /* Wait for successor to appear */ 60 | while (!me->next) cpu_relax(); 61 | } 62 | 63 | /* Unlock next one */ 64 | me->next->spin = 1; 65 | } 66 | 67 | static inline int trylock_mcs(mcs_lock *m, mcs_lock_t *me) 68 | { 69 | mcs_lock_t *tail; 70 | 71 | me->next = NULL; 72 | me->spin = 0; 73 | 74 | /* Try to lock */ 75 | tail = cmpxchg(m, NULL, &me); 76 | 77 | /* No one was there - can quickly return */ 78 | if (!tail) return 0; 79 | 80 | return 1; // Busy 81 | } 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /spinlock-k42.h: -------------------------------------------------------------------------------- 1 | #ifndef _SPINLOCK_K42 2 | #define _SPINLOCK_K42 3 | 4 | /* Code copied from http://locklessinc.com/articles/locks/ 5 | * Note this algorithm is patented by IBM. */ 6 | 7 | /* Macro hack to avoid changing the name. 
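 * k42_lock/k42_unlock below are a queue lock in the MCS family: each waiter
 * enqueues a stack-allocated k42lock node and spins on its own field, but
 * unlike plain MCS the unlock path only needs the lock pointer, which is why
 * it fits behind the same spin_lock/spin_unlock interface.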
*/
8 | #define spin_lock k42_lock
9 | #define spin_unlock k42_unlock
10 | #define spinlock k42lock
11 | 
12 | #define SPINLOCK_INITIALIZER { 0, 0 }
13 | 
14 | #define cmpxchg(P, O, N) __sync_val_compare_and_swap((P), (O), (N))
15 | 
16 | #define barrier() asm volatile("": : :"memory")
17 | #define cpu_relax() asm volatile("pause\n": : :"memory")
18 | 
19 | static inline void *xchg_64(void *ptr, void *x)
20 | {
21 | __asm__ __volatile__("xchgq %0,%1"
22 | :"=r" ((unsigned long long) x)
23 | :"m" (*(volatile long long *)ptr), "0" ((unsigned long long) x)
24 | :"memory");
25 | 
26 | return x;
27 | }
28 | 
29 | typedef struct k42lock k42lock;
30 | struct k42lock
31 | {
32 | k42lock *next;
33 | k42lock *tail;
34 | };
35 | 
36 | static inline void k42_lock(k42lock *l)
37 | {
38 | k42lock me;
39 | k42lock *pred, *succ;
40 | me.next = NULL;
41 | 
42 | barrier();
43 | 
44 | pred = xchg_64(&l->tail, &me);
45 | if (pred)
46 | {
47 | me.tail = (void *) 1;
48 | 
49 | barrier();
50 | pred->next = &me;
51 | barrier();
52 | 
53 | while (me.tail) cpu_relax();
54 | }
55 | 
56 | succ = me.next;
57 | 
58 | if (!succ)
59 | {
60 | barrier();
61 | l->next = NULL;
62 | 
63 | if (cmpxchg(&l->tail, &me, &l->next) != &me)
64 | {
65 | while (!me.next) cpu_relax();
66 | 
67 | l->next = me.next;
68 | }
69 | }
70 | else
71 | {
72 | l->next = succ;
73 | }
74 | }
75 | 
76 | static inline void k42_unlock(k42lock *l)
77 | {
78 | k42lock *succ = l->next;
79 | 
80 | barrier();
81 | 
82 | if (!succ)
83 | {
84 | if (cmpxchg(&l->tail, &l->next, NULL) == (void *) &l->next) return;
85 | 
86 | while (!l->next) cpu_relax();
87 | succ = l->next;
88 | }
89 | 
90 | succ->tail = NULL;
91 | }
92 | 
93 | static inline int k42_trylock(k42lock *l)
94 | {
95 | if (!cmpxchg(&l->tail, NULL, &l->next)) return 0;
96 | 
97 | return 1; // Busy
98 | }
99 | 
100 | #endif
101 | 
--------------------------------------------------------------------------------
/stack.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <pthread.h>
4 | #include "spinlock-pthread.h"
5 | 
6 | /*
7 | * Naive implementation of a lock-free stack which does not handle the ABA problem.
8 | * This works if only one thread is doing pop.
9 | *
10 | * For a lock-free stack which handles the ABA problem, see streamflow.
11 | */
12 | 
13 | /* Return 1 if the swap happened. */
14 | static inline unsigned int compare_and_swap(volatile void *address,
15 | void *old_value, void *new_value)
16 | {
17 | void *prev = 0;
18 | 
19 | asm volatile("lock; cmpxchgq %1,%2"
20 | : "=a"(prev)
21 | : "r"(new_value), "m"(*(long *)address), "0"(old_value)
22 | : "memory");
23 | 
24 | return prev == old_value;
25 | }
26 | 
27 | /* Flag to use pthread mutex or spinlock, useful when we want to compare the
28 | * performance of different implementations.
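 * Define MUTEX or SPINLOCK to use the locked push/pop below; with both left
 * commented out, the lock-free compare_and_swap version is compiled.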
*/ 29 | /*#define MUTEX*/ 30 | /*#define SPINLOCK*/ 31 | 32 | typedef struct Node { 33 | struct Node *next; 34 | int val; 35 | } Node; 36 | 37 | typedef struct { 38 | volatile Node *top; 39 | #ifdef MUTEX 40 | pthread_mutex_t mutex; 41 | #elif defined(SPINLOCK) 42 | Spinlock slock; 43 | #endif 44 | } Stack; 45 | 46 | Stack gstack; 47 | 48 | #if defined(MUTEX) || defined(SPINLOCK) 49 | 50 | void push(Stack *stack, Node *n) { 51 | #ifdef MUTEX 52 | pthread_mutex_lock(&stack->mutex); 53 | #elif defined(SPINLOCK) 54 | spin_lock(&stack->slock); 55 | #endif 56 | 57 | n->next = (Node *)stack->top; 58 | stack->top = n; 59 | 60 | #ifdef MUTEX 61 | pthread_mutex_unlock(&stack->mutex); 62 | #elif defined(SPINLOCK) 63 | spin_unlock(&stack->slock); 64 | #endif 65 | } 66 | 67 | Node *pop(Stack *stack) { 68 | Node *oldtop; 69 | 70 | if (stack->top == NULL) 71 | return NULL; 72 | 73 | #ifdef MUTEX 74 | pthread_mutex_lock(&stack->mutex); 75 | #elif defined(SPINLOCK) 76 | spin_lock(&stack->slock); 77 | #endif 78 | oldtop = (Node *)stack->top; 79 | stack->top = oldtop->next; 80 | 81 | #ifdef MUTEX 82 | pthread_mutex_unlock(&stack->mutex); 83 | #elif defined(SPINLOCK) 84 | spin_unlock(&stack->slock); 85 | #endif 86 | 87 | return oldtop; 88 | } 89 | 90 | #else 91 | 92 | /* Lock free version. */ 93 | void push(Stack *stack, Node *n) { 94 | Node *oldtop; 95 | while (1) { 96 | oldtop = (Node *)stack->top; 97 | n->next = oldtop; 98 | if (compare_and_swap(&stack->top, oldtop, n)) 99 | return; 100 | } 101 | } 102 | 103 | Node *pop(Stack *stack) { 104 | Node *oldtop, *next; 105 | 106 | while (1) { 107 | oldtop = (Node *)stack->top; 108 | if (oldtop == NULL) 109 | return NULL; 110 | next = oldtop->next; 111 | if (compare_and_swap(&stack->top, oldtop, next)) 112 | return oldtop; 113 | } 114 | } 115 | 116 | #endif 117 | 118 | /* Testing code. */ 119 | 120 | #define NITERS 2000000 /* Number of pushes for each push thread. */ 121 | #define NTHR 3 /* Number of push threads. */ 122 | 123 | void *pusher(void *dummy) { 124 | long i, tid = (long) dummy; 125 | for (i = 0; i < NITERS; i++) { 126 | Node *n = malloc(sizeof(*n)); 127 | n->val = NTHR * i + tid; 128 | push(&gstack, n); 129 | } 130 | 131 | return NULL; 132 | } 133 | 134 | static inline void atomic_inc64(volatile unsigned long* address) 135 | { 136 | asm volatile( 137 | "lock; incq %0\n\t" 138 | : "=m" (*address) 139 | : "m" (*address)); 140 | } 141 | 142 | volatile unsigned long popcount = 0; 143 | 144 | void *poper(void *dummy) { 145 | Node *n; 146 | 147 | while (popcount < NTHR * NITERS) { 148 | n = pop(&gstack); 149 | if (n) { 150 | printf("%d\n", n->val); 151 | /* Only one pop thread. */ 152 | popcount++; 153 | /*atomic_inc64(&popcount);*/ 154 | } 155 | } 156 | 157 | return NULL; 158 | } 159 | 160 | int main(int argc, const char *argv[]) { 161 | #ifdef MUTEX 162 | pthread_mutex_init(&gstack.mutex, NULL); 163 | #endif 164 | 165 | #ifdef SPINLOCK 166 | SPIN_LOCK_INIT(&gstack.slock); 167 | #endif 168 | 169 | /* TODO We need more threads to test the scalability of the stack 170 | * implementation. 171 | * 172 | * With 3 pusher calling push, 173 | * 1. spinlock give worst performance. 174 | * 2. mutex is faster than spinlock, but it's running time is not stable. 175 | * 3. lock free version is fast, and running time is stable. 
176 | */
177 | 
178 | pthread_t thr_push[NTHR], thr_pop;
179 | long i;
180 | 
181 | for (i = 0; i < NTHR; i++) {
182 | if (pthread_create(&thr_push[i], NULL, pusher, (void *)i) != 0) {
183 | perror("thread creating failed");
184 | }
185 | }
186 | 
187 | if (pthread_create(&thr_pop, NULL, poper, NULL) != 0) {
188 | perror("thread creating failed");
189 | }
190 | 
191 | for (i = 0; i < NTHR; i++) {
192 | pthread_join(thr_push[i], NULL);
193 | }
194 | pthread_join(thr_pop, NULL);
195 | 
196 | return 0;
197 | }
198 | 
199 | 
--------------------------------------------------------------------------------
/test-spinlock.c:
--------------------------------------------------------------------------------
1 | #define _GNU_SOURCE
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <stdint.h>
5 | #include <assert.h>
6 | #include <pthread.h>
7 | #include <sched.h>
8 | #include <unistd.h>
9 | #include <sys/time.h>
10 | 
11 | #ifdef XCHG
12 | #include "spinlock-xchg.h"
13 | #elif defined(XCHGBACKOFF)
14 | #include "spinlock-xchg-backoff.h"
15 | #elif defined(K42)
16 | #include "spinlock-k42.h"
17 | #elif defined(MCS)
18 | #include "spinlock-mcs.h"
19 | #elif defined(TICKET)
20 | #include "spinlock-ticket.h"
21 | #elif defined(PTHREAD)
22 | #include "spinlock-pthread.h"
23 | #elif defined(CMPXCHG)
24 | #include "spinlock-cmpxchg.h"
25 | #elif defined(RTM)
26 | #include "spinlock-xchg.h"
27 | #include "rtm.h"
28 | #elif defined(HLE)
29 | #include "spinlock-xchg-hle.h"
30 | #else
31 | #error "must define a spinlock implementation"
32 | #endif
33 | 
34 | #ifndef cpu_relax
35 | #define cpu_relax() asm volatile("pause\n": : :"memory")
36 | #endif
37 | 
38 | /* It's hard to say which spinlock implementation performs best. I guess the
39 | * performance depends on the CPU topology, which affects the cache coherence
40 | * messages, and maybe on other factors.
41 | *
42 | * The result of binding threads to the same physical CPU shows that when
43 | * threads are on a single physical CPU, contention will not cause severe
44 | * performance degradation. But there's one exception: cmpxchg. Its performance
45 | * degrades no matter which physical CPU the threads reside on.
46 | *
47 | * Here's the result on a Dell R910 server (4 CPUs, 10 cores each), with Intel(R)
48 | * Xeon(R) CPU E7- 4850 @ 2.00GHz:
49 | *
50 | * - pthread_mutex BEATS ALL when there are more than 2 cores running this
51 | * benchmark! Not sure what trick pthread_mutex uses.
52 | *
53 | * - For the spinlock with cmpxchg, performance degrades very quickly as the
54 | * number of threads increases. It does not seem to scale well.
55 | *
56 | * - The spinlock with xchg scales much better than cmpxchg, but it's
57 | * slower with 2 and 4 cores.
58 | *
59 | * - For K42, the 2-thread case is extremely bad; for other thread counts,
60 | * the performance is stable and scales well.
61 | *
62 | * - The MCS spinlock has performance similar to K42, and does not have the
63 | * extremely bad 2-thread case.
64 | *
65 | * - The ticket spinlock actually performs very badly.
66 | */
67 | 
68 | /* Total number of lock/unlock pairs.
69 | * Note we need to ensure the total number of lock/unlock operations is the
70 | * same no matter how many threads are used. */
71 | #define N_PAIR 16000000
72 | 
73 | /* Bind threads to specific cores. The goal is to place the threads on the
74 | * same physical CPU. Modify bind_core before using this.
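 * The mapping in bind_core below assumes the 4-socket, 10-cores-per-socket
 * machine described above, where logical core 4*c + s sits on socket s.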
*/ 75 | //#define BIND_CORE 76 | 77 | static int nthr = 0; 78 | 79 | static volatile uint32_t wflag; 80 | /* Wait on a flag to make all threads start almost at the same time. */ 81 | void wait_flag(volatile uint32_t *flag, uint32_t expect) { 82 | __sync_fetch_and_add((uint32_t *)flag, 1); 83 | while (*flag != expect) { 84 | cpu_relax(); 85 | } 86 | } 87 | 88 | static struct timeval start_time; 89 | static struct timeval end_time; 90 | 91 | static void calc_time(struct timeval *start, struct timeval *end) { 92 | if (end->tv_usec < start->tv_usec) { 93 | end->tv_sec -= 1; 94 | end->tv_usec += 1000000; 95 | } 96 | 97 | assert(end->tv_sec >= start->tv_sec); 98 | assert(end->tv_usec >= start->tv_usec); 99 | struct timeval interval = { 100 | end->tv_sec - start->tv_sec, 101 | end->tv_usec - start->tv_usec 102 | }; 103 | printf("%ld.%06ld\t", (long)interval.tv_sec, (long)interval.tv_usec); 104 | } 105 | 106 | // Use an array of counter to see effect on RTM if touches more cache line. 107 | #define NCOUNTER 1 108 | #define CACHE_LINE 64 109 | 110 | // Use thread local counter to avoid cache contention between cores. 111 | // For TSX, this avoids TX conflicts so the performance overhead/improvement is 112 | // due to TSX mechanism. 113 | static __thread int8_t counter[CACHE_LINE*NCOUNTER]; 114 | 115 | #ifdef MCS 116 | mcs_lock cnt_lock = NULL; 117 | #else 118 | spinlock sl; 119 | #endif 120 | 121 | #ifdef BIND_CORE 122 | void bind_core(int threadid) { 123 | /* cores with logical id 4x is on CPU physical id 0 */ 124 | /* cores with logical id 4x+1 is on CPU physical id 1 */ 125 | int phys_id = threadid / 10; 126 | int core = threadid % 10; 127 | 128 | int logical_id = 4 * core + phys_id; 129 | /*printf("thread %d bind to logical core %d on physical id %d\n", threadid, logical_id, phys_id);*/ 130 | 131 | cpu_set_t set; 132 | CPU_ZERO(&set); 133 | CPU_SET(logical_id, &set); 134 | 135 | if (sched_setaffinity(0, sizeof(set), &set) != 0) { 136 | perror("Set affinity failed"); 137 | exit(EXIT_FAILURE); 138 | } 139 | } 140 | #endif 141 | 142 | void *inc_thread(void *id) { 143 | int n = N_PAIR / nthr; 144 | assert(n * nthr == N_PAIR); 145 | #ifdef MCS 146 | mcs_lock_t local_lock; 147 | #endif 148 | #ifdef BIND_CORE 149 | bind_core((int)(long)(id)); 150 | #endif 151 | wait_flag(&wflag, nthr); 152 | 153 | if (((long) id == 0)) { 154 | /*printf("get start time\n");*/ 155 | gettimeofday(&start_time, NULL); 156 | } 157 | 158 | /* Start lock unlock test. 
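 * Each iteration acquires the lock, bumps the thread-local counters, and
 * releases it. With -DRTM the critical section first runs as a transaction
 * and falls back to the real spin_lock if _xbegin fails or the lock byte is
 * seen BUSY inside the transaction.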
*/ 159 | for (int i = 0; i < n; i++) { 160 | #ifdef MCS 161 | lock_mcs(&cnt_lock, &local_lock); 162 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 163 | unlock_mcs(&cnt_lock, &local_lock); 164 | #elif RTM 165 | int status; 166 | if ((status = _xbegin()) == _XBEGIN_STARTED) { 167 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 168 | if (sl == BUSY) 169 | _xabort(1); 170 | _xend(); 171 | } else { 172 | spin_lock(&sl); 173 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 174 | spin_unlock(&sl); 175 | } 176 | #else 177 | spin_lock(&sl); 178 | for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++; 179 | spin_unlock(&sl); 180 | #endif 181 | } 182 | 183 | if (__sync_fetch_and_add((uint32_t *)&wflag, -1) == 1) { 184 | /*printf("get end time\n");*/ 185 | gettimeofday(&end_time, NULL); 186 | } 187 | return NULL; 188 | } 189 | 190 | int main(int argc, const char *argv[]) 191 | { 192 | pthread_t *thr; 193 | int ret = 0; 194 | 195 | if (argc != 2) { 196 | printf("Usage: %s \n", argv[0]); 197 | exit(1); 198 | } 199 | 200 | nthr = atoi(argv[1]); 201 | /*printf("using %d threads\n", nthr);*/ 202 | thr = calloc(sizeof(*thr), nthr); 203 | 204 | // Start thread 205 | for (long i = 0; i < nthr; i++) { 206 | if (pthread_create(&thr[i], NULL, inc_thread, (void *)i) != 0) { 207 | perror("thread creating failed"); 208 | } 209 | } 210 | // join thread 211 | for (long i = 0; i < nthr; i++) 212 | pthread_join(thr[i], NULL); 213 | 214 | calc_time(&start_time, &end_time); 215 | /* 216 | *for (int i = 0; i < NCOUNTER; i++) { 217 | * if (counter[i] == N_PAIR) { 218 | * } else { 219 | * printf("counter %d error\n", i); 220 | * ret = 1; 221 | * } 222 | *} 223 | */ 224 | 225 | return ret; 226 | } 227 | --------------------------------------------------------------------------------
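Build-and-run sketch (not one of the files above; assumes GNU make and GCC with pthreads):

    make
    ./run-test-spinlock.sh

test-rtm and test-spinlock-xchg-hle encode the TSX instructions as raw byte sequences, so they build even with older binutils, but test-rtm only runs on a CPU with RTM support.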