├── datatype
│   └── nstring.c
├── include
│   ├── common.h
│   ├── datatype.h
│   ├── hashtable.h
│   ├── hazard.h
│   ├── list.h
│   ├── lwt.h
│   ├── map.h
│   ├── mem.h
│   ├── murmur.h
│   ├── nstring.h
│   ├── rcu.h
│   ├── runtime.h
│   ├── skiplist.h
│   ├── tls.h
│   └── txn.h
├── license.txt
├── makefile
├── map
│   ├── hashtable.c
│   ├── list.c
│   ├── map.c
│   ├── skiplist.c
│   └── unsafe_skiplist.c
├── perf.sh
├── runtime
│   ├── hazard.c
│   ├── lwt.c
│   ├── mem.c
│   ├── mem2.c
│   ├── mem_class_calc.c
│   ├── random.c
│   ├── rcu.c
│   ├── rlocal.h
│   └── runtime.c
├── test
│   ├── CuTest-license.txt
│   ├── CuTest.c
│   ├── CuTest.h
│   ├── haz_test.c
│   ├── map_test1.c
│   ├── map_test2.c
│   ├── perf_test.c
│   ├── rcu_test.c
│   └── txn_test.c
├── todo
└── txn
    └── txn.c

/datatype/nstring.c:
--------------------------------------------------------------------------------
#include "common.h"
#include "nstring.h"
#include "murmur.h"
#include "mem.h"

const datatype_t DATATYPE_NSTRING = { (cmp_fun_t)ns_cmp, (hash_fun_t)ns_hash, (clone_fun_t)ns_dup };

nstring_t *ns_alloc (uint32_t len) {
    nstring_t *ns = nbd_malloc(sizeof(nstring_t) + len);
    ns->len = len;
    return ns;
}

int ns_cmp (const nstring_t *ns1, const nstring_t *ns2) {
    int d = memcmp(ns1->data, ns2->data, (ns1->len < ns2->len) ? ns1->len : ns2->len);
    return (d == 0) ? ns1->len - ns2->len : d;
}

uint32_t ns_hash (const nstring_t *ns) {
    return murmur32(ns->data, ns->len);
}

nstring_t *ns_dup (const nstring_t *ns1) {
    nstring_t *ns2 = ns_alloc(ns1->len);
    memcpy(ns2->data, ns1->data, ns1->len);
    return ns2;
}
--------------------------------------------------------------------------------
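nstring_t is the length-prefixed key type behind DATATYPE_NSTRING: ns_cmp orders by the shared prefix and breaks ties by length, ns_hash runs murmur32 over the bytes, and ns_dup is the clone hook the maps use to take ownership of keys. A minimal sketch of wrapping a C string (ns_from_cstr is a hypothetical helper, not part of the library):

#include <string.h>
#include "nstring.h"

// Hypothetical helper: wrap a NUL-terminated C string in an nstring_t key.
static nstring_t *ns_from_cstr (const char *s) {
    uint32_t len = (uint32_t)strlen(s);  // length-prefixed; the NUL is not needed
    nstring_t *ns = ns_alloc(len);       // allocates the header plus <len> data bytes
    memcpy(ns->data, s, len);
    return ns;
}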
/include/common.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 */
#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <assert.h>

#define CACHE_LINE_SIZE  64 // 64 byte cache line on x86 and x86-64
#define CACHE_LINE_SCALE  6 // log base 2 of the cache line size

#define EXPECT_TRUE(x)  __builtin_expect(!!(x), 1)
#define EXPECT_FALSE(x) __builtin_expect(!!(x), 0)

#ifndef NBD_SINGLE_THREADED

#define MAX_NUM_THREADS 32 // make this whatever you want, but make it a power of 2

#define SYNC_SWAP(addr,x)         __sync_lock_test_and_set(addr,x)
#define SYNC_CAS(addr,old,x)      __sync_val_compare_and_swap(addr,old,x)
#define SYNC_ADD(addr,n)          __sync_add_and_fetch(addr,n)
#define SYNC_FETCH_AND_OR(addr,x) __sync_fetch_and_or(addr,x)

#else// NBD_SINGLE_THREADED

#define MAX_NUM_THREADS 1

#define SYNC_SWAP(addr,x)         ({ typeof(*(addr)) _old = *(addr); *(addr) = (x); _old; })
#define SYNC_CAS(addr,old,x)      ({ typeof(*(addr)) _old = *(addr); *(addr) = (x); _old; })
//#define SYNC_CAS(addr,old,x)    ({ typeof(*(addr)) _old = *(addr); if ((old) == _old) { *(addr) = (x); } _old; })
#define SYNC_ADD(addr,n)          ({ typeof(*(addr)) _old = *(addr); *(addr) += (n); _old; })
#define SYNC_FETCH_AND_OR(addr,x) ({ typeof(*(addr)) _old = *(addr); *(addr) |= (x); _old; })

#endif//NBD_SINGLE_THREADED

#define COUNT_TRAILING_ZEROS __builtin_ctz

#define MASK(n) ((1ULL << (n)) - 1)

#define TRUE  1
#define FALSE 0

#ifdef NBD32
#define TAG1 (1U << 31)
#define TAG2 (1U << 30)
#else
#define TAG1 (1ULL << 63)
#define TAG2 (1ULL << 62)
#endif
#define TAG_VALUE(v, tag) ((v) | tag)
#define IS_TAGGED(v, tag) ((v) & tag)
#define STRIP_TAG(v, tag) ((v) & ~tag)

#define DOES_NOT_EXIST 0
#define ERROR_INVALID_OPTION      (-1)
#define ERROR_INVALID_ARGUMENT    (-2)
#define ERROR_UNSUPPORTED_FEATURE (-3)
#define ERROR_TXN_NOT_RUNNING     (-4)

#define VOLATILE_DEREF(x) (*((volatile typeof(x))(x)))

typedef unsigned long long uint64_t;
typedef unsigned int       uint32_t;
typedef unsigned short     uint16_t;
typedef unsigned char      uint8_t;

typedef size_t markable_t;

static inline uint64_t rdtsc (void) {
    unsigned l, u;
    __asm__ __volatile__("rdtsc" : "=a" (l), "=d" (u));
    return ((uint64_t)u << 32) | l;
}

#include "lwt.h"
#endif //COMMON_H
--------------------------------------------------------------------------------
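The SYNC_* macros compile to the gcc __sync builtins (or to plain loads and stores in the single-threaded build), and TAG1/TAG2 reserve the top bits of a word for in-band flags. A small self-contained illustration of how the tag macros compose (the values are hypothetical):

#include <assert.h>
#include "common.h"

// Illustration: steal the top bit of a value word as a flag.
static void tag_demo (void) {
    uint64_t v = 0x1234;
    uint64_t t = TAG_VALUE(v, TAG1);  // set the flag bit
    assert(IS_TAGGED(t, TAG1));       // detect it
    assert(STRIP_TAG(t, TAG1) == v);  // recover the original value
}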
/include/datatype.h:
--------------------------------------------------------------------------------
#ifndef DATATYPE_H
#define DATATYPE_H

typedef int      (*cmp_fun_t)   (void *, void *);
typedef void *   (*clone_fun_t) (void *);
typedef uint32_t (*hash_fun_t)  (void *);

typedef struct datatype {
    cmp_fun_t   cmp;
    hash_fun_t  hash;
    clone_fun_t clone;
} datatype_t;

#endif//DATATYPE_H
--------------------------------------------------------------------------------
/include/hashtable.h:
--------------------------------------------------------------------------------
#ifndef HASHTABLE_H
#define HASHTABLE_H

#include "map.h"

typedef struct ht hashtable_t;
typedef struct ht_iter ht_iter_t;

hashtable_t * ht_alloc      (const datatype_t *key_type);
map_val_t     ht_cas        (hashtable_t *ht, map_key_t key, map_val_t expected_val, map_val_t val);
map_val_t     ht_get        (hashtable_t *ht, map_key_t key);
map_val_t     ht_remove     (hashtable_t *ht, map_key_t key);
size_t        ht_count      (hashtable_t *ht);
void          ht_print      (hashtable_t *ht, int verbose);
void          ht_free       (hashtable_t *ht);
ht_iter_t *   ht_iter_begin (hashtable_t *ht, map_key_t key);
map_val_t     ht_iter_next  (ht_iter_t *iter, map_key_t *key_ptr);
void          ht_iter_free  (ht_iter_t *iter);

static const map_impl_t MAP_IMPL_HT = {
    (map_alloc_t)ht_alloc, (map_cas_t)ht_cas, (map_get_t)ht_get, (map_remove_t)ht_remove,
    (map_count_t)ht_count, (map_print_t)ht_print, (map_free_t)ht_free,
    (map_iter_begin_t)ht_iter_begin, (map_iter_next_t)ht_iter_next, (map_iter_free_t)ht_iter_free
};

#endif//HASHTABLE_H
--------------------------------------------------------------------------------
/include/hazard.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 *
 * hazard pointers
 *
 * www.research.ibm.com/people/m/michael/ieeetpds-2004.pdf
 *
 */
#ifndef HAZARD_H
#define HAZARD_H

#define STATIC_HAZ_PER_THREAD 2

typedef void (*free_t) (void *);
typedef void *haz_t;

//static inline void haz_set (volatile haz_t *haz, void *x) { *haz = x; haz_t y = *haz; y = y; }

static inline void haz_set (volatile haz_t *haz, void *x) { *haz = x; __asm__ __volatile__("mfence"); }

haz_t *haz_get_static         (int n);
void   haz_register_dynamic   (haz_t *haz);
void   haz_unregister_dynamic (haz_t *haz);
void   haz_defer_free         (void *p, free_t f);

#endif
--------------------------------------------------------------------------------
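The hazard-pointer API follows Michael's scheme: a reader publishes a pointer before dereferencing it, re-validates that it is still reachable, and reclaimers hand retired nodes to haz_defer_free() so nothing is freed while any thread has it published. A sketch of the reader-side pattern under those assumptions (the node type and shared_head variable are hypothetical):

#include "common.h"
#include "hazard.h"

typedef struct node { struct node *next; int val; } node_t;  // hypothetical node type
node_t *shared_head;                                          // hypothetical shared pointer

int read_head_val (void) {
    haz_t *hp = haz_get_static(0);
    node_t *n;
    do {
        n = shared_head;
        haz_set(hp, n);              // publish before dereferencing
    } while (n != shared_head);      // re-validate: n cannot be freed while published
    int v = (n != NULL) ? n->val : -1;
    haz_set(hp, NULL);               // clear the hazard when done
    return v;
}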
/include/list.h:
--------------------------------------------------------------------------------
#ifndef LIST_H
#define LIST_H

#include "map.h"

typedef struct ll list_t;
typedef struct ll_iter ll_iter_t;

list_t *   ll_alloc   (const datatype_t *key_type);
map_val_t  ll_cas     (list_t *ll, map_key_t key, map_val_t expected_val, map_val_t new_val);
map_val_t  ll_lookup  (list_t *ll, map_key_t key);
map_val_t  ll_remove  (list_t *ll, map_key_t key);
size_t     ll_count   (list_t *ll);
void       ll_print   (list_t *ll, int verbose);
void       ll_free    (list_t *ll);
map_key_t  ll_min_key (list_t *ll);

ll_iter_t *ll_iter_begin (list_t *ll, map_key_t key);
map_val_t  ll_iter_next  (ll_iter_t *iter, map_key_t *key_ptr);
void       ll_iter_free  (ll_iter_t *iter);

static const map_impl_t MAP_IMPL_LL = {
    (map_alloc_t)ll_alloc, (map_cas_t)ll_cas, (map_get_t)ll_lookup, (map_remove_t)ll_remove,
    (map_count_t)ll_count, (map_print_t)ll_print, (map_free_t)ll_free, (map_iter_begin_t)ll_iter_begin,
    (map_iter_next_t)ll_iter_next, (map_iter_free_t)ll_iter_free
};

#endif//LIST_H
--------------------------------------------------------------------------------
/include/lwt.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 *
 * lightweight tracing
 */
#ifndef LWT_H
#define LWT_H

#ifndef ENABLE_TRACE
#define TRACE(...) do { } while (0)
#else
#define TRACE(flag, format, v1, v2) lwt_trace(flag, format, (size_t)(v1), (size_t)(v2))
#endif

#ifndef NDEBUG
#define ASSERT(x) do { if (!(x)) { lwt_halt(); assert(!#x); } } while (0)
#else
#define ASSERT(x) do { } while (0)
#endif

// Dump trace records to <file_name>. The file should be post-processed with "sort" before viewing.
void lwt_dump (const char *file_name) __attribute__ ((externally_visible));

// <flags> indicates what kind of trace messages should be included in the dump. <flags> is a sequence of letters
// followed by numbers (e.g. "x1c9n2g3"). The letters indicate trace categories and the numbers are trace levels
// for each category. If a category appears in <flags>, then messages from that category will be included in the
// dump if they have a trace level less than or equal to the one specified in <flags>. Categories are case
// sensitive.
void lwt_set_trace_level (const char *flags);

// <flag> is a two character string containing a letter followed by a number (e.g. "f3"). The letter indicates a
// trace category, and the number a trace level. <flag> controls whether or not the trace message gets included in
// the dump. It is only included when its specified category is enabled at a trace level greater than or equal to
// the one in <flag>. Categories are case sensitive.
static inline void lwt_trace (const char *flag, const char *format, size_t value1, size_t value2) {
    extern char TraceLevel[256];
    if (EXPECT_FALSE(TraceLevel[(unsigned)flag[0]] >= flag[1])) {
        // embed <flag> in <format> so we don't have to make the lwt_record_t any bigger than it already is
        uint64_t f = ((uint64_t)(size_t)format | ((uint64_t)flag[0] << 56) | ((uint64_t)flag[1] << 48));
        extern void lwt_trace_i (uint64_t format, size_t value1, size_t value2);
        lwt_trace_i(f, value1, value2);
    }
}

void lwt_halt (void);

#endif//LWT_H
--------------------------------------------------------------------------------
/include/map.h:
--------------------------------------------------------------------------------
#ifndef MAP_H
#define MAP_H

#include "datatype.h"

typedef struct map map_t;
typedef struct map_iter map_iter_t;
typedef struct map_impl map_impl_t;

#ifdef NBD32
typedef uint32_t map_key_t;
typedef uint32_t map_val_t;
#else
typedef uint64_t map_key_t;
typedef uint64_t map_val_t;
#endif

map_t *    map_alloc   (const map_impl_t *map_impl, const datatype_t *key_type);
map_val_t  map_get     (map_t *map, map_key_t key);
map_val_t  map_set     (map_t *map, map_key_t key, map_val_t new_val);
map_val_t  map_add     (map_t *map, map_key_t key, map_val_t new_val);
map_val_t  map_cas     (map_t *map, map_key_t key, map_val_t expected_val, map_val_t new_val);
map_val_t  map_replace (map_t *map, map_key_t key, map_val_t new_val);
map_val_t  map_remove  (map_t *map, map_key_t key);
map_val_t  map_count   (map_t *map);
void       map_print   (map_t *map, int verbose);
void       map_free    (map_t *map);

map_iter_t * map_iter_begin (map_t *map, map_key_t key);
map_val_t    map_iter_next  (map_iter_t *iter, map_key_t *key);
void         map_iter_free  (map_iter_t *iter);

/////////////////////////////////////////////////////////////////////////////////////

#define CAS_EXPECT_DOES_NOT_EXIST ( 0)
#define CAS_EXPECT_EXISTS         (-1)
#define CAS_EXPECT_WHATEVER       (-2)

typedef void *    (*map_alloc_t)  (const datatype_t *);
typedef map_val_t (*map_cas_t)    (void *, map_key_t , map_val_t, map_val_t);
typedef map_val_t (*map_get_t)    (void *, map_key_t );
typedef map_val_t (*map_remove_t) (void *, map_key_t );
typedef size_t    (*map_count_t)  (void *);
typedef void      (*map_print_t)  (void *, int);
typedef void      (*map_free_t)   (void *);

typedef map_iter_t * (*map_iter_begin_t) (void *, map_key_t);
typedef map_val_t    (*map_iter_next_t)  (map_iter_t *, map_key_t *);
typedef void         (*map_iter_free_t)  (map_iter_t *);

struct map_impl {
    map_alloc_t  alloc;
    map_cas_t    cas;
    map_get_t    get;
    map_remove_t remove;
    map_count_t  count;
    map_print_t  print;
    map_free_t   free_;

    map_iter_begin_t iter_begin;
    map_iter_next_t  iter_next;
    map_iter_free_t  iter_free;
};

#endif//MAP_H
--------------------------------------------------------------------------------
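map.h is the polymorphic front end: map_alloc() binds an implementation vtable (MAP_IMPL_HT, MAP_IMPL_SL, or MAP_IMPL_LL) to an optional key type, and the map_* calls dispatch through it. A minimal usage sketch with integer keys (key_type NULL); note that 0 is DOES_NOT_EXIST, so it cannot be used as a key or value:

#include "common.h"
#include "runtime.h"
#include "map.h"
#include "hashtable.h"

int main (void) {
    nbd_thread_init();                         // register this thread with the runtime
    map_t *m = map_alloc(&MAP_IMPL_HT, NULL);  // NULL key_type => word-sized integer keys
    map_set(m, 42, 1001);                      // insert or overwrite
    map_val_t v = map_get(m, 42);              // 1001
    map_remove(m, 42);                         // map_get(m, 42) is now DOES_NOT_EXIST
    map_free(m);
    return v == 1001 ? 0 : 1;
}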
/include/mem.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 */
#ifndef MEM_H
#define MEM_H
void *nbd_malloc (size_t n) __attribute__((malloc, alloc_size(1)));
void  nbd_free   (void *x) __attribute__((nonnull));
#endif//MEM_H
--------------------------------------------------------------------------------
/include/murmur.h:
--------------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby

// Note - This code makes a few assumptions about how your machine behaves -

// 1. We can read a 4-byte value from any address without crashing
// 2. sizeof(int) == 4

// And it has a few limitations -

// 1. It will not work incrementally.
// 2. It will not produce the same results on little-endian and big-endian
//    machines.

static inline uint32_t murmur32 (const char *key, int len)
{
    // 'm' and 'r' are mixing constants generated offline.
    // They're not really 'magic', they just happen to work well.

    const uint32_t m = 0x5bd1e995;
    const int r = 24;

    // Initialize the hash to a 'random' value
    uint32_t h = len;

    // Mix 4 bytes at a time into the hash

    const unsigned char *data = (const unsigned char *)key;

    while(len >= 4)
    {
        uint32_t k = *(uint32_t *)data;

        k *= m;
        k ^= k >> r;
        k *= m;

        h *= m;
        h ^= k;

        data += 4;
        len -= 4;
    }

    // Handle the last few bytes of the input array

    switch(len)
    {
    case 3: h ^= data[2] << 16;
    case 2: h ^= data[1] << 8;
    case 1: h ^= data[0];
            h *= m;
    };

    // Do a few final mixes of the hash to ensure the last few
    // bytes are well-incorporated.

    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}

static inline uint32_t murmur32_8b (uint64_t key)
{
    // 'm' and 'r' are mixing constants generated offline.
    // They're not really 'magic', they just happen to work well.

    const uint32_t m = 0x5bd1e995;
    const int r = 24;

    // Initialize the hash to a 'random' value
    uint32_t h = 8;

    uint32_t k1 = (uint32_t)(key >> 32);
    uint32_t k2 = (uint32_t)key;

    k1 *= m;
    k1 ^= k1 >> r;
    k1 *= m;

    k2 *= m;
    k2 ^= k2 >> r;
    k2 *= m;

    // Mix 4 bytes at a time into the hash

    h *= m;
    h ^= k1;
    h *= m;
    h ^= k2;

    // Do a few final mixes of the hash to ensure the last few
    // bytes are well-incorporated.

    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}

static inline uint32_t murmur32_4b (uint32_t key)
{
    // 'm' and 'r' are mixing constants generated offline.
    // They're not really 'magic', they just happen to work well.

    const uint32_t m = 0x5bd1e995;
    const int r = 24;

    // Initialize the hash to a 'random' value
    uint32_t h = 4;

    uint32_t k = *(uint32_t *)&key;

    k *= m;
    k ^= k >> r;
    k *= m;

    // Mix 4 bytes at a time into the hash

    h *= m;
    h ^= k;

    // Do a few final mixes of the hash to ensure the last few
    // bytes are well-incorporated.

    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}
--------------------------------------------------------------------------------
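murmur32 is the hash used for variable-length string keys, with murmur32_8b/murmur32_4b as fixed-width specializations for word-sized integer keys that skip the byte loop. A usage sketch:

#include "common.h"
#include "murmur.h"

// Hash a string key and a word-sized integer key the way the maps do.
uint32_t hash_examples (void) {
    uint32_t h1 = murmur32("example key", 11); // generic byte-at-a-time path
    uint32_t h2 = murmur32_8b(0x12345678ULL);  // fixed 8-byte integer key
    return h1 ^ h2;
}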
/include/nstring.h:
--------------------------------------------------------------------------------
#ifndef NSTRING_H
#define NSTRING_H

#include "common.h"
#include "datatype.h"

typedef struct nstring {
    uint32_t len;
    char data[];
} nstring_t;

nstring_t * ns_alloc (uint32_t len);
int         ns_cmp   (const nstring_t *ns1, const nstring_t *ns2);
uint32_t    ns_hash  (const nstring_t *ns);
nstring_t * ns_dup   (const nstring_t *ns);

extern const datatype_t DATATYPE_NSTRING;

#endif//NSTRING_H
--------------------------------------------------------------------------------
/include/rcu.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 */
#ifndef RCU_H
#define RCU_H

void rcu_update (void);
void rcu_defer_free (void *x);

#endif//RCU_H
--------------------------------------------------------------------------------
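rcu.h is the default reclamation scheme: rcu_defer_free() retires memory, and it is actually freed only after every thread has announced a quiescent point by calling rcu_update(). A sketch of the intended pattern (unlink_some_node is a hypothetical stand-in for whatever removes a node from a shared structure):

#include "common.h"
#include "runtime.h"
#include "rcu.h"

extern void *unlink_some_node (void);  // hypothetical: detaches a node, returns it

void worker_step (void) {
    void *retired = unlink_some_node();
    if (retired != NULL) {
        rcu_defer_free(retired);  // freed only after all threads pass a quiescent point
    }
    rcu_update();                 // announce this thread's quiescent point
}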
/include/runtime.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 */
#ifndef RUNTIME_H
#define RUNTIME_H

#include <pthread.h>
#include "tls.h"

void nbd_thread_init (void);
uint64_t nbd_rand (void);

#endif//RUNTIME_H
--------------------------------------------------------------------------------
/include/skiplist.h:
--------------------------------------------------------------------------------
#ifndef SKIPLIST_H
#define SKIPLIST_H

#include "map.h"

typedef struct sl skiplist_t;
typedef struct sl_iter sl_iter_t;

skiplist_t * sl_alloc   (const datatype_t *key_type);
map_val_t    sl_cas     (skiplist_t *sl, map_key_t key, map_val_t expected_val, map_val_t new_val);
map_val_t    sl_lookup  (skiplist_t *sl, map_key_t key);
map_val_t    sl_remove  (skiplist_t *sl, map_key_t key);
size_t       sl_count   (skiplist_t *sl);
void         sl_print   (skiplist_t *sl, int verbose);
void         sl_free    (skiplist_t *sl);
map_key_t    sl_min_key (skiplist_t *sl);

sl_iter_t *  sl_iter_begin (skiplist_t *sl, map_key_t key);
map_val_t    sl_iter_next  (sl_iter_t *iter, map_key_t *key_ptr);
void         sl_iter_free  (sl_iter_t *iter);

static const map_impl_t MAP_IMPL_SL = {
    (map_alloc_t)sl_alloc, (map_cas_t)sl_cas, (map_get_t)sl_lookup, (map_remove_t)sl_remove,
    (map_count_t)sl_count, (map_print_t)sl_print, (map_free_t)sl_free, (map_iter_begin_t)sl_iter_begin,
    (map_iter_next_t)sl_iter_next, (map_iter_free_t)sl_iter_free
};

#endif//SKIPLIST_H
--------------------------------------------------------------------------------
/include/tls.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 *
 * A platform-independent wrapper around thread-local storage. On platforms that don't support
 * __thread variables (e.g. Mac OS X), we have to use the pthreads library for thread-local storage.
 */
#ifndef TLS_H
#define TLS_H

#ifdef __ELF__ // use gcc thread-local storage (i.e. __thread variables)
#define DECLARE_THREAD_LOCAL(name, type)  __thread type name
#define INIT_THREAD_LOCAL(name)
#define SET_THREAD_LOCAL(name, value)     name = value
#define LOCALIZE_THREAD_LOCAL(name, type)

#else//!__ELF__

#include <pthread.h>

#define DECLARE_THREAD_LOCAL(name, type) pthread_key_t name##_KEY

#define INIT_THREAD_LOCAL(name) \
    do { \
        if (pthread_key_create(&name##_KEY, NULL) != 0) { \
            assert(!"error initializing thread local variable " #name); \
        } \
    } while (0)

#define SET_THREAD_LOCAL(name, value) \
    do { \
        name = value; \
        pthread_setspecific(name##_KEY, (void *)(size_t)value); \
    } while (0)

#define LOCALIZE_THREAD_LOCAL(name, type) type name = (type)(size_t)pthread_getspecific(name##_KEY)

#endif//__ELF__
#endif//TLS_H
--------------------------------------------------------------------------------
/include/txn.h:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 */
#ifndef TXN_H
#define TXN_H

#include "map.h"

typedef enum { TXN_RUNNING, TXN_VALIDATING, TXN_VALIDATED, TXN_ABORTED } txn_state_e;

typedef struct txn txn_t;

txn_t *     txn_begin  (map_t *map);
void        txn_abort  (txn_t *txn);
txn_state_e txn_commit (txn_t *txn);

map_val_t txn_map_get (txn_t *txn, map_key_t key);
void      txn_map_set (txn_t *txn, map_key_t key, map_val_t value);

#endif//TXN_H
--------------------------------------------------------------------------------
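txn.h layers optimistic transactions over a map_t: reads and writes go through txn_map_get/txn_map_set, and txn_commit() validates them, returning TXN_ABORTED on conflict. A retry-loop sketch under those assumptions (assumes both keys already hold plain integer values):

#include "common.h"
#include "map.h"
#include "txn.h"

// Sketch: atomically move <amount> between two counters stored in <m>.
void transfer (map_t *m, map_key_t from, map_key_t to, map_val_t amount) {
    txn_state_e rc;
    do {
        txn_t *txn = txn_begin(m);
        map_val_t a = txn_map_get(txn, from);
        map_val_t b = txn_map_get(txn, to);
        txn_map_set(txn, from, a - amount);
        txn_map_set(txn, to,   b + amount);
        rc = txn_commit(txn);          // validates reads; publishes writes on success
    } while (rc == TXN_ABORTED);       // conflict: retry the whole transaction
}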
/license.txt:
--------------------------------------------------------------------------------
Code in this distribution that is written by Josh Dybnis is released to the
public domain, as explained at http://creativecommons.org/licenses/publicdomain
which is repeated below:

The person or persons who have associated work with this document (the
"Dedicator" or "Certifier") hereby either (a) certifies that, to the
best of his knowledge, the work of authorship identified is in the
public domain of the country from which the work is published, or (b)
hereby dedicates whatever copyright the dedicators holds in the work of
authorship identified below (the "Work") to the public domain. A
certifier, moreover, dedicates any copyright interest he may have in the
associated work, and for these purposes, is described as a "dedicator"
below.

A certifier has taken reasonable steps to verify the copyright status of
this work. Certifier recognizes that his good faith efforts may not
shield him from liability if in fact the work certified is not in the
public domain.

Dedicator makes this dedication for the benefit of the public at large
and to the detriment of the Dedicator's heirs and successors. Dedicator
intends this dedication to be an overt act of relinquishment in
perpetuity of all present and future rights under copyright law,
whether vested or contingent, in the Work. Dedicator understands that
such relinquishment of all rights includes the relinquishment of all
rights to enforce (by lawsuit or otherwise) those copyrights in the Work.

Dedicator recognizes that, once placed in the public domain, the Work may
be freely reproduced, distributed, transmitted, used, modified, built
upon, or otherwise exploited by anyone for any purpose, commercial or
non-commercial, and in any way, including by methods that have not yet
been invented or conceived.
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
###################################################################################################
# Written by Josh Dybnis and released to the public domain, as explained at
# http://creativecommons.org/licenses/publicdomain
###################################################################################################
# Makefile for building programs with whole-program interfile optimization
###################################################################################################
CFLAGS0 := -Wall -Werror -std=gnu99 -lpthread #-m32 -DNBD32
CFLAGS1 := $(CFLAGS0) -g #-DNDEBUG #-fwhole-program -combine
CFLAGS2 := $(CFLAGS1) #-DENABLE_TRACE
CFLAGS3 := $(CFLAGS2) #-DLIST_USE_HAZARD_POINTER
CFLAGS  := $(CFLAGS3) #-DNBD_SINGLE_THREADED #-DUSE_SYSTEM_MALLOC #-DTEST_STRING_KEYS
INCS    := $(addprefix -I, include)
TESTS   := output/perf_test output/map_test1 output/map_test2 output/rcu_test output/txn_test #output/haz_test
OBJS    := $(TESTS)

RUNTIME_SRCS   := runtime/runtime.c runtime/rcu.c runtime/lwt.c runtime/mem.c runtime/random.c \
                  datatype/nstring.c #runtime/hazard.c
MAP_SRCS       := map/map.c map/list.c map/skiplist.c map/hashtable.c

haz_test_SRCS  := $(RUNTIME_SRCS) test/haz_test.c
rcu_test_SRCS  := $(RUNTIME_SRCS) test/rcu_test.c
txn_test_SRCS  := $(RUNTIME_SRCS) $(MAP_SRCS) test/txn_test.c test/CuTest.c txn/txn.c
map_test1_SRCS := $(RUNTIME_SRCS) $(MAP_SRCS) test/map_test1.c
map_test2_SRCS := $(RUNTIME_SRCS) $(MAP_SRCS) test/map_test2.c test/CuTest.c
perf_test_SRCS := $(RUNTIME_SRCS) $(MAP_SRCS) test/perf_test.c

tests: $(TESTS)

###################################################################################################
# build and run tests
###################################################################################################
test: $(addsuffix .log, $(TESTS))
	@echo > /dev/null

$(addsuffix .log, $(TESTS)) : %.log : %
	@echo "Running $*" && $* | tee $*.log

###################################################################################################
# Rebuild an executable if any of its source files need to be recompiled
#
# Note: Calculating dependencies as a side-effect of compilation is disabled. There is a bug in
#       gcc. Compilation fails when -MM -MF is used and there is more than one source file.
#       Otherwise "-MM -MT $@.d -MF $@.d" should be part of the command line for the compile.
#
#       Also, when calculating dependencies -combine is removed from CFLAGS because of another bug
#       in gcc. It chokes when -MM is used with -combine.
###################################################################################################
$(OBJS): output/% : output/%.d makefile
	gcc $(CFLAGS) $(INCS) -MM -MT $@ $($*_SRCS) > $@.d
	gcc $(CFLAGS) $(INCS) -o $@ $($*_SRCS)

asm: $(addsuffix .s, $(OBJS))

$(addsuffix .s, $(OBJS)): output/%.s : output/%.d makefile
	gcc $(CFLAGS:-combine:) $(INCS) -MM -MT $@ $($*_SRCS) > output/$*.d
	gcc $(CFLAGS) $(INCS) -combine -S -o $@.temp $($*_SRCS)
	grep -v "^L[BFM]\|^LCF" $@.temp > $@
	rm $@.temp

###################################################################################################
# tags file for vi
###################################################################################################
tags:
	ctags -R .

###################################################################################################
#
###################################################################################################
clean:
	rm -rfv output/*

###################################################################################################
# dummy rule for bootstrapping dependency files
###################################################################################################
$(addsuffix .d, $(OBJS)) : output/%.d :

-include $(addsuffix .d, $(OBJS))

.PHONY: clean test tags asm
--------------------------------------------------------------------------------
/map/hashtable.c:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 *
 * C implementation of Cliff Click's lock-free hash table from
 * http://www.azulsystems.com/events/javaone_2008/2008_CodingNonBlock.pdf
 * http://sourceforge.net/projects/high-scale-lib
 *
 * Note: This code uses synchronous atomic operations because that is all that x86 provides.
 * Every atomic operation is also an implicit full memory barrier. The upshot is that it simplifies
 * the code a bit, but it won't be as fast as it could be on platforms that provide weaker
 * operations like unfenced CAS which would still do the job.
 *
 * 11Feb09 - Bug fix in ht_iter_next() from Rui Ueyama
 */

#include <stdio.h>
#include "common.h"
#include "murmur.h"
#include "mem.h"
#include "rcu.h"
#include "hashtable.h"

#ifndef NBD32
#define GET_PTR(x) ((void *)((x) & MASK(48))) // low-order 48 bits is a pointer to a nstring_t
#else
#define GET_PTR(x) ((void *)(x))
#endif

typedef struct entry {
    map_key_t key;
    map_val_t val;
} entry_t;

typedef struct hti {
    volatile entry_t *table;
    hashtable_t *ht; // parent ht;
    struct hti *next;
#ifdef USE_SYSTEM_MALLOC
    void *unaligned_table_ptr; // system malloc doesn't guarantee cache-line alignment
#endif
    size_t count; // TODO: make these counters distributed
    size_t key_count;
    size_t copy_scan;
    size_t num_entries_copied;
    int probe;
    int ref_count;
    uint8_t scale;
} hti_t;

struct ht_iter {
    hti_t * hti;
    int64_t idx;
};

struct ht {
    hti_t *hti;
    const datatype_t *key_type;
    uint32_t hti_copies;
    double density;
    int probe;
};

static const map_val_t COPIED_VALUE = TAG_VALUE(DOES_NOT_EXIST, TAG1);
static const map_val_t TOMBSTONE    = STRIP_TAG(-1, TAG1);

static const unsigned ENTRIES_PER_BUCKET     = CACHE_LINE_SIZE/sizeof(entry_t);
static const unsigned ENTRIES_PER_COPY_CHUNK = CACHE_LINE_SIZE/sizeof(entry_t)*2;
static const unsigned MIN_SCALE              = 4; // min 16 entries (4 buckets)

static int hti_copy_entry (hti_t *ht1, volatile entry_t *ent, uint32_t ent_key_hash, hti_t *ht2);

// Choose the next bucket to probe using the high-order bits of <key_hash>.
static inline int get_next_ndx(int old_ndx, uint32_t key_hash, int ht_scale) {
#if 1
    int incr = (key_hash >> (32 - ht_scale));
    if (incr < ENTRIES_PER_BUCKET) { incr += ENTRIES_PER_BUCKET; }
    return (old_ndx + incr) & MASK(ht_scale);
#else
    return (old_ndx + ENTRIES_PER_BUCKET) & MASK(ht_scale);
#endif
}

// Lookup <key> in <hti>.
//
// Return the entry that <key> is in, or if <key> isn't in <hti> return the entry that it would be
// in if it were inserted into <hti>. If there is no room for <key> in <hti> then return NULL, to
// indicate that the caller should look in <hti->next>.
//
// Record in <is_empty> if the entry being returned is empty. Otherwise the caller will have to
// waste time re-comparing the keys to confirm that it did not lose a race to fill an empty entry.
static volatile entry_t *hti_lookup (hti_t *hti, map_key_t key, uint32_t key_hash, int *is_empty) {
    TRACE("h2", "hti_lookup(key %p in hti %p)", key, hti);
    *is_empty = 0;

    // Probe one cache line at a time
    int ndx = key_hash & MASK(hti->scale); // the first entry to search
    for (int i = 0; i < hti->probe; ++i) {

        // The start of the bucket is the first entry in the cache line.
        volatile entry_t *bucket = hti->table + (ndx & ~(ENTRIES_PER_BUCKET-1));

        // Start searching at the indexed entry. Then loop around to the beginning of the cache line.
        for (int j = 0; j < ENTRIES_PER_BUCKET; ++j) {
            volatile entry_t *ent = bucket + ((ndx + j) & (ENTRIES_PER_BUCKET-1));

            map_key_t ent_key = ent->key;
            if (ent_key == DOES_NOT_EXIST) {
                TRACE("h1", "hti_lookup: entry %p for key %p is empty", ent,
                            (hti->ht->key_type == NULL) ? (void *)key : GET_PTR(key));
                *is_empty = 1; // indicate an empty so the caller avoids an expensive key compare
                return ent;
            }

            // Compare <key> with the key in the entry.
            if (EXPECT_TRUE(hti->ht->key_type == NULL)) {
                // fast path for integer keys
                if (ent_key == key) {
                    TRACE("h1", "hti_lookup: found entry %p with key %p", ent, ent_key);
                    return ent;
                }
            } else {
#ifndef NBD32
                // The key in <ent_key> is made up of two parts. The 48 low-order bits are a pointer. The
                // high-order 16 bits are taken from the hash. The bits from the hash are used as a
                // quick check to rule out non-equal keys without doing a complete compare.
                if ((key_hash >> 16) == (ent_key >> 48)) {
#endif
                    if (hti->ht->key_type->cmp(GET_PTR(ent_key), (void *)key) == 0) {
                        TRACE("h1", "hti_lookup: found entry %p with key %p", ent, GET_PTR(ent_key));
                        return ent;
#ifndef NBD32
                    }
#endif
                }
            }
        }

        ndx = get_next_ndx(ndx, key_hash, hti->scale);
    }

    // maximum number of probes exceeded
    TRACE("h1", "hti_lookup: maximum number of probes exceeded returning 0x0", 0, 0);
    return NULL;
}
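// Worked example of the probe sequence above: with scale = 10 and 16-byte entries,
// ENTRIES_PER_BUCKET is 4, so a bucket is one cache line. A key whose hash is H starts at
// ndx = H & MASK(10), scans the 4 entries sharing that cache line, then jumps by
// incr = H >> (32 - 10) (bumped past the bucket if it is too small) and wraps with MASK(10).
// Keys that collide on a bucket thus follow different probe chains, which reduces clustering
// compared to a fixed stride.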
// Allocate and initialize a hti_t with 2^<scale> entries.
static hti_t *hti_alloc (hashtable_t *parent, int scale) {
    hti_t *hti = (hti_t *)nbd_malloc(sizeof(hti_t));
    memset(hti, 0, sizeof(hti_t));
    hti->scale = scale;

    size_t sz = sizeof(entry_t) * (1ULL << scale);
#ifdef USE_SYSTEM_MALLOC
    hti->unaligned_table_ptr = nbd_malloc(sz + CACHE_LINE_SIZE - 1);
    hti->table = (void *)(((size_t)hti->unaligned_table_ptr + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1));
#else
    hti->table = nbd_malloc(sz);
#endif
    memset((void *)hti->table, 0, sz);

    hti->probe = (int)(hti->scale * 1.5) + 2;
    int quarter = (1ULL << (hti->scale - 2)) / ENTRIES_PER_BUCKET;
    if (hti->probe > quarter && quarter > 4) {
        // When searching for a key, probe at most a quarter of the buckets.
        hti->probe = quarter;
    }
    ASSERT(hti->probe);
    hti->ht = parent;
    hti->ref_count = 1; // one for the parent

    assert(hti->scale >= MIN_SCALE && hti->scale < 63); // size must be a power of 2
    assert(sizeof(entry_t) * ENTRIES_PER_BUCKET % CACHE_LINE_SIZE == 0); // divisible into cache
    assert((size_t)hti->table % CACHE_LINE_SIZE == 0); // cache aligned

    return hti;
}

// Called when <hti> runs out of room for new keys.
//
// Initiates a copy by creating a larger hti_t and installing it in <hti->next>.
static void hti_start_copy (hti_t *hti) {
    TRACE("h0", "hti_start_copy(hti %p scale %llu)", hti, hti->scale);

    // heuristics to determine the size of the new table
    size_t count = ht_count(hti->ht);
    unsigned int new_scale = hti->scale;
    new_scale += (count > (1ULL << (hti->scale - 1))) || (hti->key_count > (1ULL << (hti->scale - 2)) + (1ULL << (hti->scale - 3))); // double size if more than 1/2 full

    // Allocate the new table and attempt to install it.
    hti_t *next = hti_alloc(hti->ht, new_scale);
    hti_t *old_next = SYNC_CAS(&hti->next, NULL, next);
    if (old_next != NULL) {
        // Another thread beat us to it.
        TRACE("h0", "hti_start_copy: lost race to install new hti; found %p", old_next, 0);
#ifdef USE_SYSTEM_MALLOC
        nbd_free(next->unaligned_table_ptr);
#else
        nbd_free((void *)next->table);
#endif
        return;
    }
    TRACE("h0", "hti_start_copy: new hti %p scale %llu", next, next->scale);
    SYNC_ADD(&hti->ht->hti_copies, 1);
    hti->ht->density = (double)hti->key_count / (1ULL << hti->scale) * 100;
    hti->ht->probe = hti->probe;
}

// Copy the key and value stored in <ht1_ent> (which must be an entry in <ht1>) to <ht2>.
//
// Return 1 unless <ht1_ent> is already copied (then return 0), so the caller can account for the
// total number of entries left to copy.
static int hti_copy_entry (hti_t *ht1, volatile entry_t *ht1_ent, uint32_t key_hash, hti_t *ht2) {
    TRACE("h2", "hti_copy_entry: entry %p to table %p", ht1_ent, ht2);
    assert(ht1);
    assert(ht1->next);
    assert(ht2);
    assert(ht1_ent >= ht1->table && ht1_ent < ht1->table + (1ULL << ht1->scale));
#ifndef NBD32
    assert(key_hash == 0 || ht1->ht->key_type == NULL || (key_hash >> 16) == (ht1_ent->key >> 48));
#endif

    map_val_t ht1_ent_val = ht1_ent->val;
    if (EXPECT_FALSE(ht1_ent_val == COPIED_VALUE || ht1_ent_val == TAG_VALUE(TOMBSTONE, TAG1))) {
        TRACE("h1", "hti_copy_entry: entry %p already copied to table %p", ht1_ent, ht2);
        return FALSE; // already copied
    }

    // Kill empty entries.
    if (EXPECT_FALSE(ht1_ent_val == DOES_NOT_EXIST)) {
        map_val_t ht1_ent_val = SYNC_CAS(&ht1_ent->val, DOES_NOT_EXIST, COPIED_VALUE);
        if (ht1_ent_val == DOES_NOT_EXIST) {
            TRACE("h1", "hti_copy_entry: empty entry %p killed", ht1_ent, 0);
            return TRUE;
        }
        TRACE("h0", "hti_copy_entry: lost race to kill empty entry %p; the entry is not empty", ht1_ent, 0);
    }

    // Tag the value in the old entry to indicate a copy is in progress.
    ht1_ent_val = SYNC_FETCH_AND_OR(&ht1_ent->val, TAG_VALUE(0, TAG1));
    TRACE("h2", "hti_copy_entry: tagged the value %p in old entry %p", ht1_ent_val, ht1_ent);
    if (ht1_ent_val == COPIED_VALUE || ht1_ent_val == TAG_VALUE(TOMBSTONE, TAG1)) {
        TRACE("h1", "hti_copy_entry: entry %p already copied to table %p", ht1_ent, ht2);
        return FALSE; // was already copied by another thread.
    }

    // The old table's dead entries don't need to be copied to the new table
    if (ht1_ent_val == TOMBSTONE)
        return TRUE;

    // Install the key in the new table.
    map_key_t ht1_ent_key = ht1_ent->key;
    map_key_t key = (ht1->ht->key_type == NULL) ? (map_key_t)ht1_ent_key : (map_key_t)GET_PTR(ht1_ent_key);

    // We use 0 to indicate that <key_hash> is uninitialized. Occasionally the key's hash will really be 0 and we
    // waste time recomputing it every time. It is rare enough that it won't hurt performance.
    if (key_hash == 0) {
#ifdef NBD32
        key_hash = (ht1->ht->key_type == NULL) ? murmur32_4b(ht1_ent_key) : ht1->ht->key_type->hash((void *)key);
#else
        key_hash = (ht1->ht->key_type == NULL) ? murmur32_8b(ht1_ent_key) : ht1->ht->key_type->hash((void *)key);
#endif
    }

    int ht2_ent_is_empty;
    volatile entry_t *ht2_ent = hti_lookup(ht2, key, key_hash, &ht2_ent_is_empty);
    TRACE("h0", "hti_copy_entry: copy entry %p to entry %p", ht1_ent, ht2_ent);

    // It is possible that there isn't any room in the new table either.
    if (EXPECT_FALSE(ht2_ent == NULL)) {
        TRACE("h0", "hti_copy_entry: no room in table %p copy to next table %p", ht2, ht2->next);
        if (ht2->next == NULL) {
            hti_start_copy(ht2); // initiate nested copy, if not already started
        }
        return hti_copy_entry(ht1, ht1_ent, key_hash, ht2->next); // recursive tail-call
    }

    if (ht2_ent_is_empty) {
        map_key_t old_ht2_ent_key = SYNC_CAS(&ht2_ent->key, DOES_NOT_EXIST, ht1_ent_key);
        if (old_ht2_ent_key != DOES_NOT_EXIST) {
            TRACE("h0", "hti_copy_entry: lost race to CAS key %p into new entry; found %p",
                        ht1_ent_key, old_ht2_ent_key);
            return hti_copy_entry(ht1, ht1_ent, key_hash, ht2); // recursive tail-call
        }
        SYNC_ADD(&ht2->key_count, 1);
    }

    // Copy the value to the entry in the new table.
    ht1_ent_val = STRIP_TAG(ht1_ent_val, TAG1);
    map_val_t old_ht2_ent_val = SYNC_CAS(&ht2_ent->val, DOES_NOT_EXIST, ht1_ent_val);

    // If there is a nested copy in progress, we might have installed the key into a dead entry.
    if (old_ht2_ent_val == COPIED_VALUE) {
        TRACE("h0", "hti_copy_entry: nested copy in progress; copy %p to next table %p", ht2_ent, ht2->next);
        return hti_copy_entry(ht1, ht1_ent, key_hash, ht2->next); // recursive tail-call
    }

    // Mark the old entry as dead.
    ht1_ent->val = COPIED_VALUE;

    // Update the count if we were the one that completed the copy.
    if (old_ht2_ent_val == DOES_NOT_EXIST) {
        TRACE("h0", "hti_copy_entry: key %p value %p copied to new entry", key, ht1_ent_val);
        (void)SYNC_ADD(&ht1->count, -1);
        (void)SYNC_ADD(&ht2->count, 1);
        return TRUE;
    }

    TRACE("h0", "hti_copy_entry: lost race to install value %p in new entry; found value %p",
                ht1_ent_val, old_ht2_ent_val);
    return FALSE; // another thread completed the copy
}

// Compare <expected> with the existing value associated with <key>. If the values match then
// replace the existing value with <new>. If <new> is DOES_NOT_EXIST, delete the value associated
// with the key by replacing it with a TOMBSTONE.
//
// Return the previous value associated with <key>, or DOES_NOT_EXIST if <key> is not in the table
// or associated with a TOMBSTONE. If a copy is in progress and <key> has been copied to the next
// table then return COPIED_VALUE.
//
// NOTE: the returned value matches <expected> iff the set succeeds
//
// Certain values of <expected> have special meaning. If <expected> is CAS_EXPECT_EXISTS then any
// real value matches (i.e. not a TOMBSTONE or DOES_NOT_EXIST) as long as <key> is in the table. If
// <expected> is CAS_EXPECT_WHATEVER then skip the test entirely.
//
static map_val_t hti_cas (hti_t *hti, map_key_t key, uint32_t key_hash, map_val_t expected, map_val_t new) {
    TRACE("h1", "hti_cas: hti %p key %p", hti, key);
    TRACE("h1", "hti_cas: value %p expect %p", new, expected);
    assert(hti);
    assert(!IS_TAGGED(new, TAG1));
    assert(key);

    int is_empty;
    volatile entry_t *ent = hti_lookup(hti, key, key_hash, &is_empty);

    // There is no room for <key>, grow the table and try again.
    if (ent == NULL) {
        if (hti->next == NULL) {
            hti_start_copy(hti);
        }
        return COPIED_VALUE;
    }

    // Install <key> in the table if it doesn't exist.
    if (is_empty) {
        TRACE("h0", "hti_cas: entry %p is empty", ent, 0);
        if (expected != CAS_EXPECT_WHATEVER && expected != CAS_EXPECT_DOES_NOT_EXIST)
            return DOES_NOT_EXIST;

        // No need to do anything, <key> is already deleted.
        if (new == DOES_NOT_EXIST)
            return DOES_NOT_EXIST;

        // Allocate <new_key>.
        map_key_t new_key = (hti->ht->key_type == NULL)
                          ? (map_key_t)key
                          : (map_key_t)hti->ht->key_type->clone((void *)key);
#ifndef NBD32
        if (EXPECT_FALSE(hti->ht->key_type != NULL)) {
            // Combine pointer with bits from its hash
            new_key = ((uint64_t)(key_hash >> 16) << 48) | new_key;
        }
#endif

        // CAS the key into the table.
        map_key_t old_ent_key = SYNC_CAS(&ent->key, DOES_NOT_EXIST, new_key);

        // Retry if another thread stole the entry out from under us.
        if (old_ent_key != DOES_NOT_EXIST) {
            TRACE("h0", "hti_cas: lost race to install key %p in entry %p", new_key, ent);
            TRACE("h0", "hti_cas: found %p instead of NULL",
                        (hti->ht->key_type == NULL) ? (void *)old_ent_key : GET_PTR(old_ent_key), 0);
            if (hti->ht->key_type != NULL) {
                nbd_free(GET_PTR(new_key));
            }
            return hti_cas(hti, key, key_hash, expected, new); // tail-call
        }
        TRACE("h2", "hti_cas: installed key %p in entry %p", new_key, ent);
        SYNC_ADD(&hti->key_count, 1);
    }

    TRACE("h0", "hti_cas: entry for key %p is %p",
                (hti->ht->key_type == NULL) ? (void *)ent->key : GET_PTR(ent->key), ent);

    // If the entry is in the middle of a copy, the copy must be completed first.
    map_val_t ent_val = ent->val;
    if (EXPECT_FALSE(IS_TAGGED(ent_val, TAG1))) {
        if (ent_val != COPIED_VALUE && ent_val != TAG_VALUE(TOMBSTONE, TAG1)) {
            int did_copy = hti_copy_entry(hti, ent, key_hash, VOLATILE_DEREF(hti).next);
            if (did_copy) {
                (void)SYNC_ADD(&hti->num_entries_copied, 1);
            }
            TRACE("h0", "hti_cas: value in the middle of a copy, copy completed by %s",
                        (did_copy ? "self" : "other"), 0);
        }
        TRACE("h0", "hti_cas: value copied to next table, retry on next table", 0, 0);
        return COPIED_VALUE;
    }

    // Fail if the old value is not consistent with the caller's expectation.
    int old_existed = (ent_val != TOMBSTONE && ent_val != DOES_NOT_EXIST);
    if (EXPECT_FALSE(expected != CAS_EXPECT_WHATEVER && expected != ent_val)) {
        if (EXPECT_FALSE(expected != (old_existed ? CAS_EXPECT_EXISTS : CAS_EXPECT_DOES_NOT_EXIST))) {
            TRACE("h1", "hti_cas: value %p expected by caller not found; found value %p",
                        expected, ent_val);
            return ent_val;
        }
    }

    // No need to update if value is unchanged.
    if ((new == DOES_NOT_EXIST && !old_existed) || ent_val == new) {
        TRACE("h1", "hti_cas: old value and new value were the same", 0, 0);
        return ent_val;
    }

    // CAS the value into the entry. Retry if it fails.
    map_val_t v = SYNC_CAS(&ent->val, ent_val, new == DOES_NOT_EXIST ? TOMBSTONE : new);
    if (EXPECT_FALSE(v != ent_val)) {
        TRACE("h0", "hti_cas: value CAS failed; expected %p found %p", ent_val, v);
        return hti_cas(hti, key, key_hash, expected, new); // recursive tail-call
    }

    // The set succeeded. Adjust the value count.
    if (old_existed && new == DOES_NOT_EXIST) {
        (void)SYNC_ADD(&hti->count, -1);
    } else if (!old_existed && new != DOES_NOT_EXIST) {
        (void)SYNC_ADD(&hti->count, 1);
    }

    // Return the previous value.
    TRACE("h0", "hti_cas: CAS succeeded; old value %p new value %p", ent_val, new);
    return ent_val;
}
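// Note on the value lifecycle: an entry's val field moves through
// DOES_NOT_EXIST -> some value v -> ... -> TOMBSTONE on delete, and any of those states may have
// TAG1 OR-ed in while a copy to the next table is in progress. hti_cas() writes TOMBSTONE rather
// than DOES_NOT_EXIST on delete so a deleted entry can be distinguished from a never-filled one;
// the public ht_cas()/ht_remove() translate TOMBSTONE back to DOES_NOT_EXIST before returning, so
// callers never observe it.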
//
static map_val_t hti_get (hti_t *hti, map_key_t key, uint32_t key_hash) {
    int is_empty;
    volatile entry_t *ent = hti_lookup(hti, key, key_hash, &is_empty);

    // When hti_lookup() returns NULL it means we hit the reprobe limit while
    // searching the table. In that case, if a copy is in progress the key
    // might exist in the copy.
    if (EXPECT_FALSE(ent == NULL)) {
        if (VOLATILE_DEREF(hti).next != NULL)
            return hti_get(hti->next, key, key_hash); // recursive tail-call
        return DOES_NOT_EXIST;
    }

    if (is_empty)
        return DOES_NOT_EXIST;

    // If the entry is being copied, finish the copy and retry on the next table.
    map_val_t ent_val = ent->val;
    if (EXPECT_FALSE(IS_TAGGED(ent_val, TAG1))) {
        if (EXPECT_FALSE(ent_val != COPIED_VALUE && ent_val != TAG_VALUE(TOMBSTONE, TAG1))) {
            int did_copy = hti_copy_entry(hti, ent, key_hash, VOLATILE_DEREF(hti).next);
            if (did_copy) {
                (void)SYNC_ADD(&hti->num_entries_copied, 1);
            }
        }
        return hti_get(VOLATILE_DEREF(hti).next, key, key_hash); // tail-call
    }

    return (ent_val == TOMBSTONE) ? DOES_NOT_EXIST : ent_val;
}

//
map_val_t ht_get (hashtable_t *ht, map_key_t key) {
#ifdef NBD32
    uint32_t hash = (ht->key_type == NULL) ? murmur32_4b((uint64_t)key) : ht->key_type->hash((void *)key);
#else
    uint32_t hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash((void *)key);
#endif
    return hti_get(ht->hti, key, hash);
}
// returns TRUE if copy is done
static int hti_help_copy (hti_t *hti) {
    volatile entry_t *ent;
    size_t limit;
    size_t total_copied = hti->num_entries_copied;
    size_t num_copied = 0;
    size_t x = hti->copy_scan;

    TRACE("h1", "ht_cas: help copy. scan is %llu, size is %llu", x, 1<<hti->scale);
    if (total_copied != (1ULL << hti->scale)) {
        // Panic if we've been around the array twice and still haven't finished the copy.
        int panic = (x >= (1ULL << (hti->scale + 1)));
        if (!panic) {
            limit = ENTRIES_PER_COPY_CHUNK;

            // Reserve some entries for this thread to copy. There is a race condition here because the
            // fetch and add isn't atomic, but that is ok.
            hti->copy_scan = x + ENTRIES_PER_COPY_CHUNK;

            // <x> might be larger than the size of the table, if some thread stalls while
            // copying. In that case we just wrap around to the beginning and make another pass
            // through the table.
            ent = hti->table + (x & MASK(hti->scale));
        } else {
            TRACE("h1", "ht_cas: help copy panic", 0, 0);
            // scan the whole table
            ent = hti->table;
            limit = (1ULL << hti->scale);
        }

        // Copy the entries
        for (int i = 0; i < limit; ++i) {
            num_copied += hti_copy_entry(hti, ent++, 0, hti->next);
            assert(ent <= hti->table + (1ULL << hti->scale));
        }
        if (num_copied != 0) {
            total_copied = SYNC_ADD(&hti->num_entries_copied, num_copied);
        }
    }

    return (total_copied == (1ULL << hti->scale));
}

static void hti_defer_free (hti_t *hti) {
    assert(hti->ref_count == 0);

    for (uint32_t i = 0; i < (1ULL << hti->scale); ++i) {
        map_key_t key = hti->table[i].key;
        map_val_t val = hti->table[i].val;
        if (val == COPIED_VALUE)
            continue;
        assert(!IS_TAGGED(val, TAG1) || val == TAG_VALUE(TOMBSTONE, TAG1)); // copy not in progress
        if (hti->ht->key_type != NULL && key != DOES_NOT_EXIST) {
            rcu_defer_free(GET_PTR(key));
        }
    }
#ifdef USE_SYSTEM_MALLOC
    rcu_defer_free(hti->unaligned_table_ptr);
#else
    rcu_defer_free((void *)hti->table);
#endif
    rcu_defer_free(hti);
}

static void hti_release (hti_t *hti) {
    assert(hti->ref_count > 0);
    int ref_count = SYNC_ADD(&hti->ref_count, -1);
    if (ref_count == 0) {
        hti_defer_free(hti);
    }
}

//
map_val_t ht_cas (hashtable_t *ht, map_key_t key, map_val_t expected_val, map_val_t new_val) {

    TRACE("h2", "ht_cas: key %p ht %p", key, ht);
    TRACE("h2", "ht_cas: expected val %p new val %p", expected_val, new_val);
    assert(key != DOES_NOT_EXIST);
    assert(!IS_TAGGED(new_val, TAG1) && new_val != DOES_NOT_EXIST && new_val != TOMBSTONE);

    hti_t *hti = ht->hti;

    // Help with an ongoing copy.
    if (EXPECT_FALSE(hti->next != NULL)) {
        int done = hti_help_copy(hti);

        // Unlink fully copied tables.
        if (done) {
            assert(hti->next);
            if (SYNC_CAS(&ht->hti, hti, hti->next) == hti) {
                hti_release(hti);
            }
        }
    }

    map_val_t old_val;
#ifdef NBD32
    uint32_t key_hash = (ht->key_type == NULL) ? murmur32_4b((uint64_t)key) : ht->key_type->hash((void *)key);
#else
    uint32_t key_hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash((void *)key);
#endif
    while ((old_val = hti_cas(hti, key, key_hash, expected_val, new_val)) == COPIED_VALUE) {
        assert(hti->next);
        hti = hti->next;
    }

    return old_val == TOMBSTONE ? DOES_NOT_EXIST : old_val;
}

// Remove the value in <ht> associated with <key>. Returns the value removed, or DOES_NOT_EXIST if
// there was no value for that key.
map_val_t ht_remove (hashtable_t *ht, map_key_t key) {
    hti_t *hti = ht->hti;
    map_val_t val;
#ifdef NBD32
    uint32_t key_hash = (ht->key_type == NULL) ? murmur32_4b((uint64_t)key) : ht->key_type->hash((void *)key);
#else
    uint32_t key_hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash((void *)key);
#endif
    do {
        val = hti_cas(hti, key, key_hash, CAS_EXPECT_WHATEVER, DOES_NOT_EXIST);
        if (val != COPIED_VALUE)
            return val == TOMBSTONE ? DOES_NOT_EXIST : val;
        assert(hti->next);
        hti = hti->next;
        assert(hti);
    } while (1);
}

// Returns the number of key-value pairs in <ht>
size_t ht_count (hashtable_t *ht) {
    hti_t *hti = ht->hti;
    size_t count = 0;
    while (hti) {
        count += hti->count;
        hti = hti->next;
    }
    return count;
}

// Allocate and initialize a new hash table.
hashtable_t *ht_alloc (const datatype_t *key_type) {
    hashtable_t *ht = nbd_malloc(sizeof(hashtable_t));
    ht->key_type = key_type;
    ht->hti = (hti_t *)hti_alloc(ht, MIN_SCALE);
    ht->hti_copies = 0;
    ht->density = 0.0;
    return ht;
}

// Free <ht> and its internal structures.
void ht_free (hashtable_t *ht) {
    hti_t *hti = ht->hti;
    do {
        hti_t *next = hti->next;
        assert(hti->ref_count == 1);
        hti_release(hti);
        hti = next;
    } while (hti);
    nbd_free(ht);
}

void ht_print (hashtable_t *ht, int verbose) {
    printf("probe:%-2d density:%.1f%% count:%-8lld ", ht->probe, ht->density, (uint64_t)ht_count(ht));
    hti_t *hti = ht->hti;
    while (hti) {
        if (verbose) {
            for (int i = 0; i < (1ULL << hti->scale); ++i) {
                volatile entry_t *ent = hti->table + i;
                printf("[0x%x] 0x%llx:0x%llx\n", i, (uint64_t)ent->key, (uint64_t)ent->val);
                if (i > 30) {
                    printf("...\n");
                    break;
                }
            }
        }
        int scale = hti->scale;
        printf("hti count:%lld scale:%d key density:%.1f%% value density:%.1f%% probe:%d\n",
               (uint64_t)hti->count, scale, (double)hti->key_count / (1ULL << scale) * 100,
               (double)hti->count / (1ULL << scale) * 100, hti->probe);
        hti = hti->next;
    }
}

ht_iter_t *ht_iter_begin (hashtable_t *ht, map_key_t key) {
    hti_t *hti;
    int ref_count;
    do {
        hti = ht->hti;
        while (hti->next != NULL) {
            do { } while (hti_help_copy(hti) != TRUE);
            hti = hti->next;
        }
        do {
            ref_count = hti->ref_count;
            if (ref_count == 0)
                break;
        } while (ref_count != SYNC_CAS(&hti->ref_count, ref_count, ref_count + 1));
    } while (ref_count == 0);

    ht_iter_t *iter = nbd_malloc(sizeof(ht_iter_t));
    iter->hti = hti;
    iter->idx = -1;

    return iter;
}

map_val_t ht_iter_next (ht_iter_t *iter, map_key_t *key_ptr) {
    volatile entry_t *ent;
    map_key_t key;
    map_val_t val;
    size_t table_size = (1ULL << iter->hti->scale);
    do {
        iter->idx++;
        if (iter->idx == table_size) {
            return DOES_NOT_EXIST;
        }
        ent = &iter->hti->table[iter->idx];
        key = (iter->hti->ht->key_type == NULL) ? (map_key_t)ent->key : (map_key_t)GET_PTR(ent->key);
        val = ent->val;

    } while (key == DOES_NOT_EXIST || val == DOES_NOT_EXIST || val == TOMBSTONE);

    if (val == COPIED_VALUE) {
        const datatype_t *key_type = iter->hti->ht->key_type;
#ifdef NBD32
        uint32_t hash = (key_type == NULL) ? murmur32_4b((uint64_t)key) : key_type->hash((void *)key);
#else
        uint32_t hash = (key_type == NULL) ? murmur32_8b((uint64_t)key) : key_type->hash((void *)key);
#endif
        val = hti_get(iter->hti->next, (map_key_t)ent->key, hash);

        // Go to the next entry if key is already deleted.
        if (val == DOES_NOT_EXIST)
            return ht_iter_next(iter, key_ptr); // recursive tail-call
    }

    if (key_ptr) {
        *key_ptr = key;
    }
    return val;
}

void ht_iter_free (ht_iter_t *iter) {
    hti_release(iter->hti);
    nbd_free(iter);
}
--------------------------------------------------------------------------------
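The hashtable can also be used directly through the ht_* calls rather than the map_t wrapper. A sketch with nstring keys (assumes nbd_thread_init() has already been called on this thread; ht_cas clones the key, so the caller keeps ownership of its copy):

#include <string.h>
#include "common.h"
#include "nstring.h"
#include "mem.h"
#include "hashtable.h"

map_val_t ht_demo (void) {
    hashtable_t *ht = ht_alloc(&DATATYPE_NSTRING);
    nstring_t *key = ns_alloc(3);
    memcpy(key->data, "abc", 3);
    ht_cas(ht, (map_key_t)key, CAS_EXPECT_DOES_NOT_EXIST, 7); // insert; the table clones the key
    map_val_t v = ht_get(ht, (map_key_t)key);                 // 7
    nbd_free(key);  // our copy; the table owns its clone
    ht_free(ht);
    return v;
}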
/map/list.c:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 *
 * Harris-Michael lock-free list-based set
 * http://www.research.ibm.com/people/m/michael/spaa-2002.pdf
 */

#include <stdio.h>
#include <string.h>

#include "common.h"
#include "list.h"
#include "mem.h"
#ifdef LIST_USE_HAZARD_POINTER
#include "hazard.h"
#else
#include "rcu.h"
#endif

typedef struct node {
    map_key_t key;
    map_val_t val;
    markable_t next; // next node
} node_t;

struct ll_iter {
    node_t *pred;
};

struct ll {
    node_t *head;
    const datatype_t *key_type;
};

// Marking the <next> field of a node logically removes it from the list
#define  MARK_NODE(x) TAG_VALUE((markable_t)(x), 0x1)
#define   HAS_MARK(x) (IS_TAGGED((x), 0x1) == 0x1)
#define   GET_NODE(x) ((node_t *)(x))
#define STRIP_MARK(x) ((node_t *)STRIP_TAG((x), 0x1))

static node_t *node_alloc (map_key_t key, map_val_t val) {
    node_t *item = (node_t *)nbd_malloc(sizeof(node_t));
    assert(!HAS_MARK((size_t)item));
    item->key = key;
    item->val = val;
    return item;
}

list_t *ll_alloc (const datatype_t *key_type) {
    list_t *ll = (list_t *)nbd_malloc(sizeof(list_t));
    ll->key_type = key_type;
    ll->head = node_alloc(0, 0);
    ll->head->next = DOES_NOT_EXIST;
    return ll;
}

void ll_free (list_t *ll) {
    node_t *item = STRIP_MARK(ll->head->next);
    while (item != NULL) {
        node_t *next = STRIP_MARK(item->next);
        if (ll->key_type != NULL) {
            nbd_free((void *)item->key);
        }
        nbd_free(item);
        item = next;
    }
}

size_t ll_count (list_t *ll) {
    size_t count = 0;
    node_t *item = STRIP_MARK(ll->head->next);
    while (item) {
        if (!HAS_MARK(item->next)) {
            count++;
        }
        item = STRIP_MARK(item->next);
    }
    return count;
}

#ifdef LIST_USE_HAZARD_POINTER
static void nbd_free_node (node_t *x) {
    nbd_free((void *)x->key);
    nbd_free(x);
}
#endif
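// Note on the marking protocol used below: a node is removed in two steps. First its <next>
// field is tagged with the mark bit (logical delete) -- traversals that see HAS_MARK(item->next)
// treat the node as gone even though it is still linked. Second, a traversal CASes pred->next
// from the node to STRIP_MARK(next) (physical unlink), and only the thread whose CAS succeeds
// retires the memory, via haz_defer_free() or rcu_defer_free() depending on the build.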
109 | if (!help_remove) { 110 | item = STRIP_MARK(item->next); 111 | if (EXPECT_FALSE(item == NULL)) 112 | break; 113 | TRACE("l3", "find_pred: skipping marked item %p (next is %p)", item, next); 114 | next = item->next; 115 | continue; 116 | } 117 | 118 | // Unlink logically removed items. 119 | TRACE("l3", "find_pred: unlinking marked item %p next is %p", item, next); 120 | 121 | markable_t other = SYNC_CAS(&pred->next, (markable_t)item, (markable_t)STRIP_MARK(next)); 122 | if (other == (markable_t)item) { 123 | TRACE("l2", "find_pred: unlinked item %p from pred %p", item, pred); 124 | item = STRIP_MARK(next); 125 | next = (item != NULL) ? item->next : DOES_NOT_EXIST; 126 | TRACE("l3", "find_pred: now current item is %p next is %p", item, next); 127 | 128 | // The thread that completes the unlink should free the memory. 129 | #ifdef LIST_USE_HAZARD_POINTER 130 | free_t free_ = (ll->key_type != NULL ? (free_t)nbd_free_node : nbd_free); 131 | haz_defer_free(GET_NODE(other), free_); 132 | #else 133 | if (ll->key_type != NULL) { 134 | rcu_defer_free((void *)GET_NODE(other)->key); 135 | } 136 | rcu_defer_free(GET_NODE(other)); 137 | #endif 138 | } else { 139 | TRACE("l2", "find_pred: lost a race to unlink item %p from pred %p", item, pred); 140 | TRACE("l2", "find_pred: pred's link changed to %p", other, 0); 141 | if (HAS_MARK(other)) 142 | return find_pred(pred_ptr, item_ptr, ll, key, help_remove); // retry 143 | item = GET_NODE(other); 144 | next = (item != NULL) ? item->next : DOES_NOT_EXIST; 145 | } 146 | } 147 | 148 | if (EXPECT_FALSE(item == NULL)) 149 | break; 150 | 151 | TRACE("l3", "find_pred: visiting item %p (next is %p)", item, next); 152 | TRACE("l4", "find_pred: key %p val %p", item->key, item->val); 153 | 154 | int d; 155 | if (EXPECT_TRUE(ll->key_type == NULL)) { 156 | d = item->key - key; 157 | } else { 158 | d = ll->key_type->cmp((void *)item->key, (void *)key); 159 | } 160 | 161 | // If we reached the key (or passed where it should be), we found the right predecessor 162 | if (d >= 0) { 163 | if (pred_ptr != NULL) { 164 | *pred_ptr = pred; 165 | } 166 | if (item_ptr != NULL) { 167 | *item_ptr = item; 168 | } 169 | if (d == 0) { 170 | TRACE("l2", "find_pred: found matching item %p in list, pred is %p", item, pred); 171 | return TRUE; 172 | } 173 | TRACE("l2", "find_pred: found proper place for key %p in list, pred is %p", key, pred); 174 | return FALSE; 175 | } 176 | 177 | pred = item; 178 | #ifdef LIST_USE_HAZARD_POINTER 179 | temp = hp0; hp0 = hp1; hp1 = temp; 180 | #endif 181 | item = GET_NODE(next); 182 | } 183 | 184 | // The key is not in the list. 185 | if (pred_ptr != NULL) { 186 | *pred_ptr = pred; 187 | } 188 | *item_ptr = NULL; 189 | TRACE("l2", "find_pred: reached end of list. last item is %p", pred, 0); 190 | return FALSE; 191 | } 192 | 193 | // Fast find. Do not help unlink partially removed nodes and do not return the found item's predecessor. 194 | map_val_t ll_lookup (list_t *ll, map_key_t key) { 195 | TRACE("l1", "ll_lookup: searching for key %p in list %p", key, ll); 196 | node_t *item; 197 | int found = find_pred(NULL, &item, ll, key, FALSE); 198 | 199 | // If we found an item matching the key, return its value. 200 | if (found) { 201 | map_val_t val = item->val; 202 | if (val != DOES_NOT_EXIST) { 203 | TRACE("l1", "ll_lookup: found item %p. val %p.
returning item", item, item->val); 204 | return val; 205 | } 206 | } 207 | 208 | TRACE("l1", "ll_lookup: no item in the list matched the key", 0, 0); 209 | return DOES_NOT_EXIST; 210 | } 211 | 212 | map_val_t ll_cas (list_t *ll, map_key_t key, map_val_t expectation, map_val_t new_val) { 213 | TRACE("l1", "ll_cas: key %p list %p", key, ll); 214 | TRACE("l1", "ll_cas: expectation %p new value %p", expectation, new_val); 215 | ASSERT((int64_t)new_val > 0); 216 | 217 | do { 218 | node_t *pred, *old_item; 219 | int found = find_pred(&pred, &old_item, ll, key, TRUE); 220 | if (!found) { 221 | 222 | // There was not an item in the list that matches the key. 223 | if (EXPECT_FALSE(expectation != CAS_EXPECT_DOES_NOT_EXIST && expectation != CAS_EXPECT_WHATEVER)) { 224 | TRACE("l1", "ll_cas: the expectation was not met, the list was not changed", 0, 0); 225 | return DOES_NOT_EXIST; // failure 226 | } 227 | 228 | // Create a new item and insert it into the list. 229 | TRACE("l2", "ll_cas: attempting to insert item between %p and %p", pred, pred->next); 230 | map_key_t new_key = ll->key_type == NULL ? key : (map_key_t)ll->key_type->clone((void *)key); 231 | node_t *new_item = node_alloc(new_key, new_val); 232 | markable_t next = new_item->next = (markable_t)old_item; 233 | markable_t other = SYNC_CAS(&pred->next, (markable_t)next, (markable_t)new_item); 234 | if (other == next) { 235 | TRACE("l1", "ll_cas: successfully inserted new item %p", new_item, 0); 236 | return DOES_NOT_EXIST; // success 237 | } 238 | 239 | // Lost a race. Failed to insert the new item into the list. 240 | TRACE("l1", "ll_cas: lost a race. CAS failed. expected pred's link to be %p but found %p", next, other); 241 | if (ll->key_type != NULL) { 242 | nbd_free((void *)new_key); 243 | } 244 | nbd_free(new_item); 245 | continue; // retry 246 | } 247 | 248 | // Found an item in the list that matches the key. 249 | map_val_t old_item_val = old_item->val; 250 | do { 251 | // If the item's value is DOES_NOT_EXIST it means another thread removed the node out from under us. 252 | if (EXPECT_FALSE(old_item_val == DOES_NOT_EXIST)) { 253 | TRACE("l2", "ll_cas: lost a race, found an item but another thread removed it. retry", 0, 0); 254 | break; // retry 255 | } 256 | 257 | if (EXPECT_FALSE(expectation == CAS_EXPECT_DOES_NOT_EXIST)) { 258 | TRACE("l1", "ll_cas: found an item %p in the list that matched the key. the expectation was " 259 | "not met, the list was not changed", old_item, old_item_val); 260 | return old_item_val; // failure 261 | } 262 | 263 | // Use a CAS and not a SWAP. If the node is in the process of being removed and we used a SWAP, we could 264 | // replace DOES_NOT_EXIST with our value. Then another thread that is updating the value could think it 265 | // succeeded and return our value even though we indicated that the node has been removed. If the CAS 266 | // fails it means another thread either removed the node or updated its value. 267 | map_val_t ret_val = SYNC_CAS(&old_item->val, old_item_val, new_val); 268 | if (ret_val == old_item_val) { 269 | TRACE("l1", "ll_cas: the CAS succeeded. updated the value of the item", 0, 0); 270 | return ret_val; // success 271 | } 272 | TRACE("l2", "ll_cas: lost a race. the CAS failed. 
another thread changed the item's value", 0, 0); 273 | 274 | old_item_val = ret_val; 275 | } while (1); 276 | } while (1); 277 | } 278 | 279 | map_val_t ll_remove (list_t *ll, map_key_t key) { 280 | TRACE("l1", "ll_remove: removing item with key %p from list %p", key, ll); 281 | node_t *pred; 282 | node_t *item; 283 | int found = find_pred(&pred, &item, ll, key, TRUE); 284 | if (!found) { 285 | TRACE("l1", "ll_remove: remove failed, an item with a matching key does not exist in the list", 0, 0); 286 | return DOES_NOT_EXIST; 287 | } 288 | 289 | // Mark the item removed. If multiple threads try to remove the same item only one of them should succeed. 290 | markable_t next; 291 | markable_t old_next = item->next; 292 | do { 293 | next = old_next; 294 | old_next = SYNC_CAS(&item->next, next, MARK_NODE(STRIP_MARK(next))); 295 | if (HAS_MARK(old_next)) { 296 | TRACE("l1", "ll_remove: lost a race -- %p is already marked for removal by another thread", item, 0); 297 | return DOES_NOT_EXIST; 298 | } 299 | } while (next != old_next); 300 | TRACE("l2", "ll_remove: logically removed item %p", item, 0); 301 | ASSERT(HAS_MARK(VOLATILE_DEREF(item).next)); 302 | 303 | // Atomically swap out the item's value in case another thread is updating the item while we are 304 | // removing it. This establishes which operation occurs first logically, the update or the remove. 305 | map_val_t val = SYNC_SWAP(&item->val, DOES_NOT_EXIST); 306 | TRACE("l2", "ll_remove: replaced item's val %p with DOES_NOT_EXIST", val, 0); 307 | 308 | // Unlink the item from its pred. If we lose a race to another thread just back off. It is safe to leave the 309 | // item logically removed for a later call (or some other thread) to physically unlink. By marking the 310 | // item earlier, we logically removed it. 311 | TRACE("l2", "ll_remove: unlink the item by linking its pred %p to its successor %p", pred, next); 312 | markable_t other; 313 | if ((other = SYNC_CAS(&pred->next, (markable_t)item, next)) != (markable_t)item) { 314 | TRACE("l1", "ll_remove: unlink failed; pred's link changed from %p to %p", item, other); 315 | return val; 316 | } 317 | 318 | // The thread that completes the unlink should free the memory. 319 | #ifdef LIST_USE_HAZARD_POINTER 320 | free_t free_ = (ll->key_type != NULL ? (free_t)nbd_free_node : nbd_free); 321 | haz_defer_free(GET_NODE(item), free_); 322 | #else 323 | if (ll->key_type != NULL) { 324 | rcu_defer_free((void *)item->key); 325 | } 326 | rcu_defer_free(item); 327 | #endif 328 | TRACE("l1", "ll_remove: successfully unlinked item %p from the list", item, 0); 329 | return val; 330 | } 331 | 332 | void ll_print (list_t *ll, int verbose) { 333 | if (verbose) { 334 | markable_t next = ll->head->next; 335 | int i = 0; 336 | while (next != DOES_NOT_EXIST) { 337 | node_t *item = STRIP_MARK(next); 338 | if (item == NULL) 339 | break; 340 | printf("%s%p:0x%llx ", HAS_MARK(item->next) ?
"*" : "", item, (uint64_t)item->key); 341 | fflush(stdout); 342 | if (i++ > 30) { 343 | printf("..."); 344 | break; 345 | } 346 | next = item->next; 347 | } 348 | printf("\n"); 349 | } 350 | printf("count:%llu\n", (uint64_t)ll_count(ll)); 351 | } 352 | 353 | ll_iter_t *ll_iter_begin (list_t *ll, map_key_t key) { 354 | ll_iter_t *iter = (ll_iter_t *)nbd_malloc(sizeof(ll_iter_t)); 355 | if (key != DOES_NOT_EXIST) { 356 | find_pred(&iter->pred, NULL, ll, key, FALSE); 357 | } else { 358 | iter->pred = ll->head; 359 | } 360 | #ifdef LIST_USE_HAZARD_POINTER 361 | haz_register_dynamic((void **)&iter->pred); 362 | #endif 363 | return iter; 364 | } 365 | 366 | map_val_t ll_iter_next (ll_iter_t *iter, map_key_t *key_ptr) { 367 | assert(iter); 368 | if (iter->pred == NULL) 369 | return DOES_NOT_EXIST; 370 | 371 | // advance iterator to next item; skip items that have been removed 372 | markable_t item; 373 | #ifdef LIST_USE_HAZARD_POINTER 374 | haz_t *hp0 = haz_get_static(0); 375 | #endif 376 | do { 377 | #ifndef LIST_USE_HAZARD_POINTER 378 | item = iter->pred->next; 379 | #else //LIST_USE_HAZARD_POINTER 380 | do { 381 | item = iter->pred->next; 382 | haz_set(hp0, STRIP_MARK(item)); 383 | } while (item != VOLATILE_DEREF(iter->pred).next); 384 | #endif//LIST_USE_HAZARD_POINTER 385 | iter->pred = STRIP_MARK(item); 386 | if (iter->pred == NULL) 387 | return DOES_NOT_EXIST; 388 | } while (HAS_MARK(item)); 389 | 390 | if (key_ptr != NULL) { 391 | *key_ptr = GET_NODE(item)->key; 392 | } 393 | return GET_NODE(item)->val; 394 | } 395 | 396 | void ll_iter_free (ll_iter_t *iter) { 397 | #ifdef LIST_USE_HAZARD_POINTER 398 | haz_unregister_dynamic((void **)&iter->pred); 399 | #endif 400 | nbd_free(iter); 401 | } 402 | -------------------------------------------------------------------------------- /map/map.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Josh Dybnis and released to the public domain, as explained at 3 | * http://creativecommons.org/licenses/publicdomain 4 | * 5 | * generic interface for map-like data structures 6 | */ 7 | 8 | #include "common.h" 9 | #include "map.h" 10 | #include "mem.h" 11 | 12 | struct map { 13 | const map_impl_t *impl; 14 | void *data; 15 | }; 16 | 17 | struct map_iter { 18 | const map_impl_t *impl; 19 | void *state; 20 | }; 21 | 22 | map_t *map_alloc (const map_impl_t *map_impl, const datatype_t *key_type) { 23 | map_t *map = nbd_malloc(sizeof(map_t)); 24 | map->impl = map_impl; 25 | map->data = map->impl->alloc(key_type); 26 | return map; 27 | } 28 | 29 | void map_free (map_t *map) { 30 | map->impl->free_(map->data); 31 | } 32 | 33 | void map_print (map_t *map, int verbose) { 34 | map->impl->print(map->data, verbose); 35 | } 36 | 37 | map_val_t map_count (map_t *map) { 38 | return map->impl->count(map->data); 39 | } 40 | 41 | map_val_t map_get (map_t *map, map_key_t key) { 42 | return map->impl->get(map->data, key); 43 | } 44 | 45 | map_val_t map_set (map_t *map, map_key_t key, map_val_t new_val) { 46 | return map->impl->cas(map->data, key, CAS_EXPECT_WHATEVER, new_val); 47 | } 48 | 49 | map_val_t map_add (map_t *map, map_key_t key, map_val_t new_val) { 50 | return map->impl->cas(map->data, key, CAS_EXPECT_DOES_NOT_EXIST, new_val); 51 | } 52 | 53 | map_val_t map_cas (map_t *map, map_key_t key, map_val_t expected_val, map_val_t new_val) { 54 | return map->impl->cas(map->data, key, expected_val, new_val); 55 | } 56 | 57 | map_val_t map_replace(map_t *map, map_key_t key, map_val_t new_val) { 58 | return 
map->impl->cas(map->data, key, CAS_EXPECT_EXISTS, new_val); 59 | } 60 | 61 | map_val_t map_remove (map_t *map, map_key_t key) { 62 | return map->impl->remove(map->data, key); 63 | } 64 | 65 | map_iter_t * map_iter_begin (map_t *map, map_key_t key) { 66 | map_iter_t *iter = nbd_malloc(sizeof(map_iter_t)); 67 | iter->impl = map->impl; 68 | iter->state = map->impl->iter_begin(map->data, key); 69 | return iter; 70 | } 71 | 72 | map_val_t map_iter_next (map_iter_t *iter, map_key_t *key_ptr) { 73 | return iter->impl->iter_next(iter->state, key_ptr); 74 | } 75 | 76 | void map_iter_free (map_iter_t *iter) { 77 | iter->impl->iter_free(iter->state); 78 | nbd_free(iter); 79 | } 80 | -------------------------------------------------------------------------------- /map/skiplist.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Josh Dybnis and released to the public domain, as explained at 3 | * http://creativecommons.org/licenses/publicdomain 4 | * 5 | * Implementation of the lock-free skiplist data-structure created by Maurice Herlihy, Yossi Lev, 6 | * and Nir Shavit. See Herlihy's and Shavit's book "The Art of Multiprocessor Programming". 7 | * http://www.amazon.com/Art-Multiprocessor-Programming-Maurice-Herlihy/dp/0123705916/ 8 | * 9 | * See also Keir Fraser's dissertation "Practical Lock Freedom". 10 | * www.cl.cam.ac.uk/techreports/UCAM-CL-TR-579.pdf 11 | * 12 | * I've generalized the data structure to support update operations like set() and CAS() in addition to 13 | * the normal add() and remove() operations. 14 | * 15 | * Warning: This code is written for the x86 memory-model. The algorithm depends on certain stores 16 | * and loads being ordered. This code won't work correctly on platforms with weaker memory models if 17 | * you don't add memory barriers in the right places. 18 | */ 19 | 20 | #include 21 | #include 22 | 23 | #include "common.h" 24 | #include "skiplist.h" 25 | #include "runtime.h" 26 | #include "mem.h" 27 | #include "rcu.h" 28 | 29 | // Setting MAX_LEVELS to 1 essentially makes this data structure the Harris-Michael lock-free list (see list.c).
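//
// A minimal usage sketch (a hedged illustration, not part of the original file: it assumes the nbd
// runtime has been initialized for the calling thread; integer keys, so key_type is NULL, and
// values must be non-zero because 0 means DOES_NOT_EXIST):
//
//     skiplist_t *sl = sl_alloc(NULL);
//     sl_cas(sl, (map_key_t)7, CAS_EXPECT_DOES_NOT_EXIST, (map_val_t)100); // insert 7 -> 100
//     map_val_t v = sl_lookup(sl, (map_key_t)7);                           // v == 100
//     sl_remove(sl, (map_key_t)7);                                         // returns 100
//     sl_free(sl);
//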
30 | #define MAX_LEVELS 24 31 | 32 | enum unlink { 33 | FORCE_UNLINK, 34 | ASSIST_UNLINK, 35 | DONT_UNLINK 36 | }; 37 | 38 | typedef struct node { 39 | map_key_t key; 40 | map_val_t val; 41 | unsigned num_levels; 42 | markable_t next[1]; 43 | } node_t; 44 | 45 | struct sl_iter { 46 | node_t *next; 47 | }; 48 | 49 | struct sl { 50 | node_t *head; 51 | const datatype_t *key_type; 52 | int high_water; // max historic number of levels 53 | }; 54 | 55 | // Marking the field of a node logically removes it from the list 56 | #if 0 57 | static inline markable_t MARK_NODE(node_t * x) { return TAG_VALUE((markable_t)x, 0x1); } 58 | static inline int HAS_MARK(markable_t x) { return (IS_TAGGED(x, 0x1) == 0x1); } 59 | static inline node_t * GET_NODE(markable_t x) { assert(!HAS_MARK(x)); return (node_t *)x; } 60 | static inline node_t * STRIP_MARK(markable_t x) { return ((node_t *)STRIP_TAG(x, 0x1)); } 61 | #else 62 | #define MARK_NODE(x) TAG_VALUE((markable_t)(x), 0x1) 63 | #define HAS_MARK(x) (IS_TAGGED((x), 0x1) == 0x1) 64 | #define GET_NODE(x) ((node_t *)(x)) 65 | #define STRIP_MARK(x) ((node_t *)STRIP_TAG((x), 0x1)) 66 | #endif 67 | 68 | static int random_levels (skiplist_t *sl) { 69 | uint64_t r = nbd_rand(); 70 | int z = __builtin_ctz(r); 71 | int levels = (int)(z / 1.5); 72 | if (levels == 0) 73 | return 1; 74 | if (levels > sl->high_water) { 75 | levels = SYNC_ADD(&sl->high_water, 1); 76 | TRACE("s2", "random_levels: increased high water mark to %lld", sl->high_water, 0); 77 | } 78 | if (levels > MAX_LEVELS) { levels = MAX_LEVELS; } 79 | return levels; 80 | } 81 | 82 | static node_t *node_alloc (int num_levels, map_key_t key, map_val_t val) { 83 | assert(num_levels >= 0 && num_levels <= MAX_LEVELS); 84 | size_t sz = sizeof(node_t) + (num_levels - 1) * sizeof(node_t *); 85 | node_t *item = (node_t *)nbd_malloc(sz); 86 | memset(item, 0, sz); 87 | item->key = key; 88 | item->val = val; 89 | item->num_levels = num_levels; 90 | TRACE("s2", "node_alloc: new node %p (%llu levels)", item, num_levels); 91 | return item; 92 | } 93 | 94 | skiplist_t *sl_alloc (const datatype_t *key_type) { 95 | skiplist_t *sl = (skiplist_t *)nbd_malloc(sizeof(skiplist_t)); 96 | sl->key_type = key_type; 97 | sl->high_water = 1; 98 | sl->head = node_alloc(MAX_LEVELS, 0, 0); 99 | memset(sl->head->next, 0, MAX_LEVELS * sizeof(skiplist_t *)); 100 | return sl; 101 | } 102 | 103 | void sl_free (skiplist_t *sl) { 104 | node_t *item = GET_NODE(sl->head->next[0]); 105 | while (item) { 106 | node_t *next = STRIP_MARK(item->next[0]); 107 | if (sl->key_type != NULL) { 108 | nbd_free((void *)item->key); 109 | } 110 | nbd_free(item); 111 | item = next; 112 | } 113 | } 114 | 115 | size_t sl_count (skiplist_t *sl) { 116 | size_t count = 0; 117 | node_t *item = GET_NODE(sl->head->next[0]); 118 | while (item) { 119 | if (!HAS_MARK(item->next[0])) { 120 | count++; 121 | } 122 | item = STRIP_MARK(item->next[0]); 123 | } 124 | return count; 125 | } 126 | 127 | static node_t *find_preds (node_t **preds, node_t **succs, int n, skiplist_t *sl, map_key_t key, enum unlink unlink) { 128 | node_t *pred = sl->head; 129 | node_t *item = NULL; 130 | TRACE("s2", "find_preds: searching for key %p in skiplist (head is %p)", key, pred); 131 | int d = 0; 132 | 133 | // Traverse the levels of from the top level to the bottom 134 | for (int level = sl->high_water - 1; level >= 0; --level) { 135 | markable_t next = pred->next[level]; 136 | if (next == DOES_NOT_EXIST && level >= n) 137 | continue; 138 | TRACE("s3", "find_preds: traversing level %p starting at 
%p", level, pred); 139 | if (EXPECT_FALSE(HAS_MARK(next))) { 140 | TRACE("s2", "find_preds: pred %p is marked for removal (next %p); retry", pred, next); 141 | ASSERT(level == pred->num_levels - 1 || HAS_MARK(pred->next[level+1])); 142 | return find_preds(preds, succs, n, sl, key, unlink); // retry 143 | } 144 | item = GET_NODE(next); 145 | while (item != NULL) { 146 | next = item->next[level]; 147 | 148 | // A tag means an item is logically removed but not physically unlinked yet. 149 | while (EXPECT_FALSE(HAS_MARK(next))) { 150 | TRACE("s3", "find_preds: found marked item %p (next is %p)", item, next); 151 | if (unlink == DONT_UNLINK) { 152 | 153 | // Skip over logically removed items. 154 | item = STRIP_MARK(next); 155 | if (EXPECT_FALSE(item == NULL)) 156 | break; 157 | next = item->next[level]; 158 | } else { 159 | 160 | // Unlink logically removed items. 161 | markable_t other = SYNC_CAS(&pred->next[level], (markable_t)item, (markable_t)STRIP_MARK(next)); 162 | if (other == (markable_t)item) { 163 | TRACE("s3", "find_preds: unlinked item from pred %p", pred, 0); 164 | item = STRIP_MARK(next); 165 | } else { 166 | TRACE("s3", "find_preds: lost race to unlink item pred %p's link changed to %p", pred, other); 167 | if (HAS_MARK(other)) 168 | return find_preds(preds, succs, n, sl, key, unlink); // retry 169 | item = GET_NODE(other); 170 | } 171 | next = (item != NULL) ? item->next[level] : DOES_NOT_EXIST; 172 | } 173 | } 174 | 175 | if (EXPECT_FALSE(item == NULL)) { 176 | TRACE("s3", "find_preds: past the last item in the skiplist", 0, 0); 177 | break; 178 | } 179 | 180 | TRACE("s4", "find_preds: visiting item %p (next is %p)", item, next); 181 | TRACE("s4", "find_preds: key %p val %p", STRIP_MARK(item->key), item->val); 182 | 183 | if (EXPECT_TRUE(sl->key_type == NULL)) { 184 | d = item->key - key; 185 | } else { 186 | d = sl->key_type->cmp((void *)item->key, (void *)key); 187 | } 188 | 189 | if (d > 0) 190 | break; 191 | if (d == 0 && unlink != FORCE_UNLINK) 192 | break; 193 | 194 | pred = item; 195 | item = GET_NODE(next); 196 | } 197 | 198 | TRACE("s3", "find_preds: found pred %p next %p", pred, item); 199 | 200 | if (level < n) { 201 | if (preds != NULL) { 202 | preds[level] = pred; 203 | } 204 | if (succs != NULL) { 205 | succs[level] = item; 206 | } 207 | } 208 | } 209 | 210 | if (d == 0) { 211 | TRACE("s2", "find_preds: found matching item %p in skiplist, pred is %p", item, pred); 212 | return item; 213 | } 214 | TRACE("s2", "find_preds: found proper place for key %p in skiplist, pred is %p. returning null", key, pred); 215 | return NULL; 216 | } 217 | 218 | // Fast find that does not help unlink partially removed nodes and does not return the node's predecessors. 219 | map_val_t sl_lookup (skiplist_t *sl, map_key_t key) { 220 | TRACE("s1", "sl_lookup: searching for key %p in skiplist %p", key, sl); 221 | node_t *item = find_preds(NULL, NULL, 0, sl, key, DONT_UNLINK); 222 | 223 | // If we found an matching the return its value. 224 | if (item != NULL) { 225 | map_val_t val = item->val; 226 | if (val != DOES_NOT_EXIST) { 227 | TRACE("s1", "sl_lookup: found item %p. val %p. 
returning item", item, item->val); 228 | return val; 229 | } 230 | } 231 | 232 | TRACE("s1", "sl_lookup: no item in the skiplist matched the key", 0, 0); 233 | return DOES_NOT_EXIST; 234 | } 235 | 236 | map_key_t sl_min_key (skiplist_t *sl) { 237 | node_t *item = GET_NODE(sl->head->next[0]); 238 | while (item != NULL) { 239 | markable_t next = item->next[0]; 240 | if (!HAS_MARK(next)) 241 | return item->key; 242 | item = STRIP_MARK(next); 243 | } 244 | return DOES_NOT_EXIST; 245 | } 246 | 247 | static map_val_t update_item (node_t *item, map_val_t expectation, map_val_t new_val) { 248 | map_val_t old_val = item->val; 249 | 250 | // If the item's value is DOES_NOT_EXIST it means another thread removed the node out from under us. 251 | if (EXPECT_FALSE(old_val == DOES_NOT_EXIST)) { 252 | TRACE("s2", "update_item: lost a race to another thread removing the item. retry", 0, 0); 253 | return DOES_NOT_EXIST; // retry 254 | } 255 | 256 | if (EXPECT_FALSE(expectation == CAS_EXPECT_DOES_NOT_EXIST)) { 257 | TRACE("s1", "update_item: the expectation was not met; the skiplist was not changed", 0, 0); 258 | return old_val; // failure 259 | } 260 | 261 | // Use a CAS and not a SWAP. If the CAS fails it means another thread removed the node or updated its 262 | // value. If another thread removed the node but it is not unlinked yet and we used a SWAP, we could 263 | // replace DOES_NOT_EXIST with our value. Then another thread that is updating the value could think it 264 | // succeeded and return our value even though it should return DOES_NOT_EXIST. 265 | if (old_val == SYNC_CAS(&item->val, old_val, new_val)) { 266 | TRACE("s1", "update_item: the CAS succeeded. updated the value of the item", 0, 0); 267 | return old_val; // success 268 | } 269 | TRACE("s2", "update_item: lost a race. the CAS failed. another thread changed the item's value", 0, 0); 270 | 271 | // retry 272 | return update_item(item, expectation, new_val); // tail call 273 | } 274 | 275 | map_val_t sl_cas (skiplist_t *sl, map_key_t key, map_val_t expectation, map_val_t new_val) { 276 | TRACE("s1", "sl_cas: key %p skiplist %p", key, sl); 277 | TRACE("s1", "sl_cas: expectation %p new value %p", expectation, new_val); 278 | ASSERT((int64_t)new_val > 0); 279 | 280 | node_t *preds[MAX_LEVELS]; 281 | node_t *nexts[MAX_LEVELS]; 282 | node_t *new_item = NULL; 283 | int n = random_levels(sl); 284 | node_t *old_item = find_preds(preds, nexts, n, sl, key, ASSIST_UNLINK); 285 | 286 | // If there is already an item in the skiplist that matches the key just update its value. 287 | if (old_item != NULL) { 288 | map_val_t ret_val = update_item(old_item, expectation, new_val); 289 | if (ret_val != DOES_NOT_EXIST) 290 | return ret_val; 291 | 292 | // If we lose a race with a thread removing the item we tried to update then we have to retry. 293 | return sl_cas(sl, key, expectation, new_val); // tail call 294 | } 295 | 296 | if (EXPECT_FALSE(expectation != CAS_EXPECT_DOES_NOT_EXIST && expectation != CAS_EXPECT_WHATEVER)) { 297 | TRACE("s1", "sl_cas: the expectation was not met, the skiplist was not changed", 0, 0); 298 | return DOES_NOT_EXIST; // failure, the caller expected an item for the to already exist 299 | } 300 | 301 | // Create a new node and insert it into the skiplist. 302 | TRACE("s3", "sl_cas: attempting to insert a new item between %p and %p", preds[0], nexts[0]); 303 | map_key_t new_key = sl->key_type == NULL ? 
key : (map_key_t)sl->key_type->clone((void *)key); 304 | new_item = node_alloc(n, new_key, new_val); 305 | 306 | // Set the new item's next pointers to their proper values 307 | markable_t next = new_item->next[0] = (markable_t)nexts[0]; 308 | for (int level = 1; level < new_item->num_levels; ++level) { 309 | new_item->next[level] = (markable_t)nexts[level]; 310 | } 311 | 312 | // Link the new item into the skiplist from the bottom level up. After it is inserted into the bottom level 313 | // it is officially part of the skiplist. 314 | node_t *pred = preds[0]; 315 | markable_t other = SYNC_CAS(&pred->next[0], next, (markable_t)new_item); 316 | if (other != next) { 317 | TRACE("s3", "sl_cas: failed to change pred's link: expected %p found %p", next, other); 318 | 319 | // Lost a race to another thread modifying the skiplist. Free the new item we allocated and retry. 320 | if (sl->key_type != NULL) { 321 | nbd_free((void *)new_key); 322 | } 323 | nbd_free(new_item); 324 | return sl_cas(sl, key, expectation, new_val); // tail call 325 | } 326 | 327 | TRACE("s3", "sl_cas: successfully inserted a new item %p at the bottom level", new_item, 0); 328 | 329 | ASSERT(new_item->num_levels <= MAX_LEVELS); 330 | for (int level = 1; level < new_item->num_levels; ++level) { 331 | TRACE("s3", "sl_cas: inserting the new item %p at level %p", new_item, level); 332 | do { 333 | node_t *pred = preds[level]; 334 | ASSERT(new_item->next[level]==(markable_t)nexts[level] || new_item->next[level]==MARK_NODE(nexts[level])); 335 | TRACE("s3", "sl_cas: attempting to insert the new item between %p and %p", pred, nexts[level]); 336 | 337 | markable_t other = SYNC_CAS(&pred->next[level], (markable_t)nexts[level], (markable_t)new_item); 338 | if (other == (markable_t)nexts[level]) 339 | break; // successfully linked into the skiplist at the current level 340 | TRACE("s3", "sl_cas: lost a race. failed to change pred's link. expected %p found %p", nexts[level], other); 341 | 342 | // Find the new item's new preds and nexts. 343 | find_preds(preds, nexts, new_item->num_levels, sl, key, ASSIST_UNLINK); 344 | 345 | for (int i = level; i < new_item->num_levels; ++i) { 346 | markable_t old_next = new_item->next[i]; 347 | if ((markable_t)nexts[i] == old_next) 348 | continue; 349 | 350 | // Update the new item's inconsistent next pointer before trying again. Use a CAS so if another thread 351 | // is trying to remove the new item concurrently we do not stomp on the mark it places on the item. 352 | TRACE("s3", "sl_cas: attempting to update the new item's link from %p to %p", old_next, nexts[i]); 353 | other = SYNC_CAS(&new_item->next[i], old_next, (markable_t)nexts[i]); 354 | ASSERT(other == old_next || other == MARK_NODE(old_next)); 355 | 356 | // If another thread is removing this item we can stop linking it into the skiplist 357 | if (HAS_MARK(other)) { 358 | find_preds(NULL, NULL, 0, sl, key, FORCE_UNLINK); // see comment below 359 | return DOES_NOT_EXIST; 360 | } 361 | } 362 | } while (1); 363 | } 364 | 365 | // In case another thread was in the process of removing the new item while we were adding it, we have to 366 | // make sure it is completely unlinked before we return. We might have lost a race and inserted the new item 367 | // at some level after the other thread thought it was fully removed.
That is a problem because once a thread 368 | // thinks it completely unlinks a node it queues it to be freed 369 | if (HAS_MARK(new_item->next[new_item->num_levels - 1])) { 370 | find_preds(NULL, NULL, 0, sl, key, FORCE_UNLINK); 371 | } 372 | 373 | return DOES_NOT_EXIST; // success, inserted a new item 374 | } 375 | 376 | map_val_t sl_remove (skiplist_t *sl, map_key_t key) { 377 | TRACE("s1", "sl_remove: removing item with key %p from skiplist %p", key, sl); 378 | node_t *preds[MAX_LEVELS]; 379 | node_t *item = find_preds(preds, NULL, sl->high_water, sl, key, ASSIST_UNLINK); 380 | if (item == NULL) { 381 | TRACE("s3", "sl_remove: remove failed, an item with a matching key does not exist in the skiplist", 0, 0); 382 | return DOES_NOT_EXIST; 383 | } 384 | 385 | // Mark at each level of from the top down. If multiple threads try to concurrently remove 386 | // the same item only one of them should succeed. Marking the bottom level establishes which of them succeeds. 387 | markable_t old_next = 0; 388 | for (int level = item->num_levels - 1; level >= 0; --level) { 389 | markable_t next; 390 | old_next = item->next[level]; 391 | do { 392 | TRACE("s3", "sl_remove: marking item at level %p (next %p)", level, old_next); 393 | next = old_next; 394 | old_next = SYNC_CAS(&item->next[level], next, MARK_NODE((node_t *)next)); 395 | if (HAS_MARK(old_next)) { 396 | TRACE("s2", "sl_remove: %p is already marked for removal by another thread (next %p)", item, old_next); 397 | if (level == 0) 398 | return DOES_NOT_EXIST; 399 | break; 400 | } 401 | } while (next != old_next); 402 | } 403 | 404 | // Atomically swap out the item's value in case another thread is updating the item while we are 405 | // removing it. This establishes which operation occurs first logically, the update or the remove. 406 | map_val_t val = SYNC_SWAP(&item->val, DOES_NOT_EXIST); 407 | TRACE("s2", "sl_remove: replaced item %p's value with DOES_NOT_EXIT", item, 0); 408 | 409 | // unlink the item 410 | find_preds(NULL, NULL, 0, sl, key, FORCE_UNLINK); 411 | 412 | // free the node 413 | if (sl->key_type != NULL) { 414 | rcu_defer_free((void *)item->key); 415 | } 416 | rcu_defer_free(item); 417 | 418 | return val; 419 | } 420 | 421 | void sl_print (skiplist_t *sl, int verbose) { 422 | 423 | if (verbose) { 424 | for (int level = MAX_LEVELS - 1; level >= 0; --level) { 425 | node_t *item = sl->head; 426 | if (item->next[level] == DOES_NOT_EXIST) 427 | continue; 428 | printf("(%d) ", level); 429 | int i = 0; 430 | while (item) { 431 | markable_t next = item->next[level]; 432 | printf("%s%p ", HAS_MARK(next) ? "*" : "", item); 433 | item = STRIP_MARK(next); 434 | if (i++ > 30) { 435 | printf("..."); 436 | break; 437 | } 438 | } 439 | printf("\n"); 440 | fflush(stdout); 441 | } 442 | node_t *item = sl->head; 443 | int i = 0; 444 | while (item) { 445 | int is_marked = HAS_MARK(item->next[0]); 446 | printf("%s%p:0x%llx ", is_marked ? "*" : "", item, (uint64_t)item->key); 447 | if (item != sl->head) { 448 | printf("[%d]", item->num_levels); 449 | } else { 450 | printf("[HEAD]"); 451 | } 452 | for (int level = 1; level < item->num_levels; ++level) { 453 | node_t *next = STRIP_MARK(item->next[level]); 454 | is_marked = HAS_MARK(item->next[0]); 455 | printf(" %p%s", next, is_marked ? 
"*" : ""); 456 | if (item == sl->head && item->next[level] == DOES_NOT_EXIST) 457 | break; 458 | } 459 | printf("\n"); 460 | fflush(stdout); 461 | item = STRIP_MARK(item->next[0]); 462 | if (i++ > 30) { 463 | printf("...\n"); 464 | break; 465 | } 466 | } 467 | } 468 | printf("levels:%-2d count:%-6lld \n", sl->high_water, (uint64_t)sl_count(sl)); 469 | } 470 | 471 | sl_iter_t *sl_iter_begin (skiplist_t *sl, map_key_t key) { 472 | sl_iter_t *iter = (sl_iter_t *)nbd_malloc(sizeof(sl_iter_t)); 473 | if (key != DOES_NOT_EXIST) { 474 | find_preds(NULL, &iter->next, 1, sl, key, DONT_UNLINK); 475 | } else { 476 | iter->next = GET_NODE(sl->head->next[0]); 477 | } 478 | return iter; 479 | } 480 | 481 | map_val_t sl_iter_next (sl_iter_t *iter, map_key_t *key_ptr) { 482 | assert(iter); 483 | node_t *item = iter->next; 484 | while (item != NULL && HAS_MARK(item->next[0])) { 485 | item = STRIP_MARK(item->next[0]); 486 | } 487 | if (item == NULL) { 488 | iter->next = NULL; 489 | return DOES_NOT_EXIST; 490 | } 491 | iter->next = STRIP_MARK(item->next[0]); 492 | if (key_ptr != NULL) { 493 | *key_ptr = item->key; 494 | } 495 | return item->val; 496 | } 497 | 498 | void sl_iter_free (sl_iter_t *iter) { 499 | nbd_free(iter); 500 | } 501 | -------------------------------------------------------------------------------- /map/unsafe_skiplist.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Josh Dybnis and released to the public domain, as explained at 3 | * http://creativecommons.org/licenses/publicdomain 4 | * 5 | * non thread safe skiplist 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include "common.h" 12 | #include "skiplist.h" 13 | #include "runtime.h" 14 | #include "mem.h" 15 | 16 | #define MAX_LEVELS 24 17 | 18 | typedef struct node { 19 | map_key_t key; 20 | map_val_t val; 21 | int num_levels; 22 | struct node *next[1]; 23 | } node_t; 24 | 25 | struct sl_iter { 26 | node_t *next; 27 | }; 28 | 29 | struct sl { 30 | node_t *head; 31 | const datatype_t *key_type; 32 | int high_water; // max level of any item in the list 33 | }; 34 | 35 | static int random_levels (skiplist_t *sl) { 36 | uint64_t r = nbd_rand(); 37 | int z = __builtin_ctz(r); 38 | int levels = (int)(z / 1.5); 39 | if (levels == 0) 40 | return 1; 41 | if (levels > sl->high_water) { 42 | levels = SYNC_ADD(&sl->high_water, 1); 43 | TRACE("s2", "random_levels: increased high water mark to %lld", sl->high_water, 0); 44 | } 45 | if (levels > MAX_LEVELS) { levels = MAX_LEVELS; } 46 | return levels; 47 | } 48 | 49 | static node_t *node_alloc (int num_levels, map_key_t key, map_val_t val) { 50 | assert(num_levels > 0 && num_levels <= MAX_LEVELS); 51 | size_t sz = sizeof(node_t) + (num_levels - 1) * sizeof(node_t *); 52 | node_t *item = (node_t *)nbd_malloc(sz); 53 | memset(item, 0, sz); 54 | item->key = key; 55 | item->val = val; 56 | item->num_levels = num_levels; 57 | TRACE("s2", "node_alloc: new node %p (%llu levels)", item, num_levels); 58 | return item; 59 | } 60 | 61 | skiplist_t *sl_alloc (const datatype_t *key_type) { 62 | skiplist_t *sl = (skiplist_t *)nbd_malloc(sizeof(skiplist_t)); 63 | sl->key_type = key_type; 64 | sl->high_water = 1; 65 | sl->head = node_alloc(MAX_LEVELS, 0, 0); 66 | memset(sl->head->next, 0, MAX_LEVELS * sizeof(skiplist_t *)); 67 | return sl; 68 | } 69 | 70 | void sl_free (skiplist_t *sl) { 71 | node_t *item = sl->head->next[0]; 72 | while (item) { 73 | node_t *next = item->next[0]; 74 | if (sl->key_type != NULL) { 75 | nbd_free((void *)item->key); 76 | } 77 | 
nbd_free(item); 78 | item = next; 79 | } 80 | } 81 | 82 | size_t sl_count (skiplist_t *sl) { 83 | size_t count = 0; 84 | node_t *item = sl->head->next[0]; 85 | while (item) { 86 | count++; 87 | item = item->next[0]; 88 | } 89 | return count; 90 | } 91 | 92 | static node_t *find_preds (node_t **preds, node_t **succs, int n, skiplist_t *sl, map_key_t key, int unlink) { 93 | node_t *pred = sl->head; 94 | node_t *item = NULL; 95 | TRACE("s2", "find_preds: searching for key %p in skiplist (head is %p)", key, pred); 96 | int d = 0; 97 | 98 | // Traverse the levels of from the top level to the bottom 99 | for (int level = sl->high_water - 1; level >= 0; --level) { 100 | node_t *next = pred->next[level]; 101 | if (next == DOES_NOT_EXIST && level >= n) 102 | continue; 103 | TRACE("s3", "find_preds: traversing level %p starting at %p", level, pred); 104 | item = next; 105 | while (item != NULL) { 106 | next = item->next[level]; 107 | 108 | if (EXPECT_TRUE(sl->key_type == NULL)) { 109 | d = item->key - key; 110 | } else { 111 | d = sl->key_type->cmp((void *)item->key, (void *)key); 112 | } 113 | 114 | if (d >= 0) { 115 | if (d == 0 && unlink) { 116 | pred->next[level] = next; 117 | TRACE("s3", "find_preds: unlinked item from pred %p", pred, 0); 118 | item = next; 119 | next = (item != NULL) ? item->next[level] : DOES_NOT_EXIST; 120 | } 121 | break; 122 | } 123 | 124 | pred = item; 125 | item = next; 126 | } 127 | 128 | TRACE("s3", "find_preds: found pred %p next %p", pred, item); 129 | 130 | if (level < n) { 131 | if (preds != NULL) { 132 | preds[level] = pred; 133 | } 134 | if (succs != NULL) { 135 | succs[level] = item; 136 | } 137 | } 138 | } 139 | 140 | if (d == 0) { 141 | TRACE("s2", "find_preds: found matching item %p in skiplist, pred is %p", item, pred); 142 | return item; 143 | } 144 | TRACE("s2", "find_preds: found proper place for key %p in skiplist, pred is %p. returning null", key, pred); 145 | return NULL; 146 | } 147 | 148 | // Fast find that does not return the node's predecessors. 149 | map_val_t sl_lookup (skiplist_t *sl, map_key_t key) { 150 | TRACE("s1", "sl_lookup: searching for key %p in skiplist %p", key, sl); 151 | node_t *item = find_preds(NULL, NULL, 0, sl, key, FALSE); 152 | 153 | // If we found an matching the return its value. 154 | if (item != NULL) { 155 | map_val_t val = item->val; 156 | return val; 157 | } 158 | 159 | TRACE("s1", "sl_lookup: no item in the skiplist matched the key", 0, 0); 160 | return DOES_NOT_EXIST; 161 | } 162 | 163 | map_key_t sl_min_key (skiplist_t *sl) { 164 | node_t *item = sl->head->next[0]; 165 | while (item != NULL) 166 | return item->key; 167 | return DOES_NOT_EXIST; 168 | } 169 | 170 | map_val_t sl_cas (skiplist_t *sl, map_key_t key, map_val_t expectation, map_val_t new_val) { 171 | TRACE("s1", "sl_cas: key %p skiplist %p", key, sl); 172 | TRACE("s1", "sl_cas: expectation %p new value %p", expectation, new_val); 173 | ASSERT((int64_t)new_val > 0); 174 | 175 | node_t *preds[MAX_LEVELS]; 176 | node_t *nexts[MAX_LEVELS]; 177 | node_t *new_item = NULL; 178 | int n = random_levels(sl); 179 | node_t *old_item = find_preds(preds, nexts, n, sl, key, FALSE); 180 | 181 | // If there is already an item in the skiplist that matches the key just update its value. 
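// (Unlike the lock-free variant in skiplist.c, the read-check-write sequence below is not atomic;
// that is acceptable here only because this file is documented as not thread safe.)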
182 | if (old_item != NULL) { 183 | map_val_t old_val = old_item->val; 184 | if (expectation == CAS_EXPECT_DOES_NOT_EXIST || 185 | (expectation != CAS_EXPECT_WHATEVER && expectation != CAS_EXPECT_EXISTS && expectation != old_val)) { 186 | TRACE("s1", "sl_cas: the expectation was not met; the skiplist was not changed", 0, 0); 187 | return old_val; 188 | } 189 | old_item->val = new_val; 190 | return old_val; 191 | } 192 | 193 | if (EXPECT_FALSE(expectation != CAS_EXPECT_DOES_NOT_EXIST && expectation != CAS_EXPECT_WHATEVER)) { 194 | TRACE("s1", "sl_cas: the expectation was not met, the skiplist was not changed", 0, 0); 195 | return DOES_NOT_EXIST; // failure, the caller expected an item for the to already exist 196 | } 197 | 198 | TRACE("s3", "sl_cas: inserting a new item between %p and %p", preds[0], nexts[0]); 199 | 200 | // Create a new node and insert it into the skiplist. 201 | map_key_t new_key = sl->key_type == NULL ? key : (map_key_t)sl->key_type->clone((void *)key); 202 | new_item = node_alloc(n, new_key, new_val); 203 | 204 | // Set 's next pointers to their proper values 205 | for (int level = 0; level < new_item->num_levels; ++level) { 206 | new_item->next[level] = nexts[level]; 207 | } 208 | 209 | // Link into 210 | for (int level = 0; level < new_item->num_levels; ++level) { 211 | preds[level]->next[level] = new_item; 212 | } 213 | 214 | return DOES_NOT_EXIST; // success, inserted a new item 215 | } 216 | 217 | map_val_t sl_remove (skiplist_t *sl, map_key_t key) { 218 | TRACE("s1", "sl_remove: removing item with key %p from skiplist %p", key, sl); 219 | node_t *preds[MAX_LEVELS]; 220 | node_t *item = find_preds(preds, NULL, sl->high_water, sl, key, FALSE); 221 | if (item == NULL) { 222 | TRACE("s3", "sl_remove: remove failed, an item with a matching key does not exist in the skiplist", 0, 0); 223 | return DOES_NOT_EXIST; 224 | } 225 | map_val_t val = item->val; 226 | 227 | // unlink the item 228 | find_preds(NULL, NULL, 0, sl, key, TRUE); 229 | 230 | // free the node 231 | if (sl->key_type != NULL) { 232 | nbd_free((void *)item->key); 233 | } 234 | nbd_free(item); 235 | 236 | return val; 237 | } 238 | 239 | void sl_print (skiplist_t *sl) { 240 | 241 | printf("high water: %d levels\n", sl->high_water); 242 | for (int level = MAX_LEVELS - 1; level >= 0; --level) { 243 | node_t *item = sl->head; 244 | if (item->next[level] == DOES_NOT_EXIST) 245 | continue; 246 | printf("(%d) ", level); 247 | int i = 0; 248 | while (item) { 249 | node_t *next = item->next[level]; 250 | printf("%p ", item); 251 | item = next; 252 | if (i++ > 30) { 253 | printf("..."); 254 | break; 255 | } 256 | } 257 | printf("\n"); 258 | fflush(stdout); 259 | } 260 | node_t *item = sl->head; 261 | int i = 0; 262 | while (item) { 263 | printf("%p:0x%llx ", item, (uint64_t)item->key); 264 | if (item != sl->head) { 265 | printf("[%d]", item->num_levels); 266 | } else { 267 | printf("[HEAD]"); 268 | } 269 | for (int level = 1; level < item->num_levels; ++level) { 270 | node_t *next = item->next[level]; 271 | printf(" %p", next); 272 | if (item == sl->head && item->next[level] == DOES_NOT_EXIST) 273 | break; 274 | } 275 | printf("\n"); 276 | fflush(stdout); 277 | item = item->next[0]; 278 | if (i++ > 30) { 279 | printf("...\n"); 280 | break; 281 | } 282 | } 283 | } 284 | 285 | sl_iter_t *sl_iter_begin (skiplist_t *sl, map_key_t key) { 286 | sl_iter_t *iter = (sl_iter_t *)nbd_malloc(sizeof(sl_iter_t)); 287 | if (key != DOES_NOT_EXIST) { 288 | find_preds(NULL, &iter->next, 1, sl, key, FALSE); 289 | } else { 290 | 
iter->next = sl->head->next[0]; 291 | } 292 | return iter; 293 | } 294 | 295 | map_val_t sl_iter_next (sl_iter_t *iter, map_key_t *key_ptr) { 296 | assert(iter); 297 | node_t *item = iter->next; 298 | if (item == NULL) { 299 | iter->next = NULL; 300 | return DOES_NOT_EXIST; 301 | } 302 | iter->next = item->next[0]; 303 | if (key_ptr != NULL) { 304 | *key_ptr = item->key; 305 | } 306 | return item->val; 307 | } 308 | 309 | void sl_iter_free (sl_iter_t *iter) { 310 | nbd_free(iter); 311 | } 312 | -------------------------------------------------------------------------------- /perf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | for ks in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 #26 27 28 29 30 3 | do 4 | for th in 8 5 | do 6 | output/perf_test $th $ks 7 | done 8 | done 9 | 10 | 11 | -------------------------------------------------------------------------------- /runtime/hazard.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Josh Dybnis and released to the public domain, as explained at 3 | * http://creativecommons.org/licenses/publicdomain 4 | * 5 | * hazard pointers 6 | * 7 | * www.research.ibm.com/people/m/michael/ieeetpds-2004.pdf 8 | * 9 | */ 10 | #include "common.h" 11 | #include "lwt.h" 12 | #include "mem.h" 13 | #include "tls.h" 14 | #include "runtime.h" 15 | #include "hazard.h" 16 | #include "lwt.h" 17 | 18 | typedef struct pending { 19 | void * ptr; 20 | free_t free_; 21 | } pending_t; 22 | 23 | typedef struct haz_local { 24 | pending_t *pending; // to be freed 25 | int pending_size; 26 | int pending_count; 27 | 28 | haz_t static_haz[STATIC_HAZ_PER_THREAD]; 29 | 30 | haz_t **dynamic; 31 | int dynamic_size; 32 | int dynamic_count; 33 | 34 | } __attribute__ ((aligned(CACHE_LINE_SIZE))) haz_local_t; 35 | 36 | static haz_local_t haz_local_[MAX_NUM_THREADS] = {}; 37 | 38 | static void sort_hazards (haz_t *hazards, int n) { 39 | TRACE("H3", "sort_hazards: sorting hazard list %p of %p elements", hazards, n); 40 | return; 41 | } 42 | 43 | static int search_hazards (void *p, haz_t *hazards, int n) { 44 | TRACE("H4", "search_hazards: searching list %p for hazard %p", hazards, p); 45 | for (int i = 0; i < n; ++i) { 46 | if (hazards[i] == p) { 47 | TRACE("H2", "haz_search_hazards: found hazard %p", p, 0); 48 | return TRUE; 49 | } 50 | } 51 | return FALSE; 52 | } 53 | 54 | static void resize_pending (void) { 55 | TRACE("H2", "haz_resize_pending", 0, 0); 56 | LOCALIZE_THREAD_LOCAL(ThreadId, int); 57 | haz_local_t *l = haz_local_ + ThreadId; 58 | pending_t *p = nbd_malloc(sizeof(pending_t) * l->pending_size * 2); 59 | memcpy(p, l->pending, sizeof(pending_t) * l->pending_size); // copy the old array; the size is in elements, not bytes 60 | nbd_free(l->pending); 61 | l->pending = p; 62 | l->pending_size *= 2; 63 | } 64 | 65 | void haz_defer_free (void *d, free_t f) { 66 | TRACE("H1", "haz_defer_free: %p (%p)", d, f); 67 | assert(d); 68 | assert(f); 69 | LOCALIZE_THREAD_LOCAL(ThreadId, int); 70 | haz_local_t *l = haz_local_ + ThreadId; 71 | while (l->pending_count == l->pending_size) { 72 | 73 | if (l->pending_size == 0) { 74 | l->pending_size = MAX_NUM_THREADS * STATIC_HAZ_PER_THREAD; 75 | l->pending = nbd_malloc(sizeof(pending_t) * l->pending_size); 76 | break; 77 | } 78 | 79 | // scan for hazard pointers 80 | haz_t *hazards = nbd_malloc(sizeof(haz_t) * l->pending_size); 81 | int hazard_count = 0; 82 | for (int i = 0; i < MAX_NUM_THREADS; ++i) { 83 | haz_local_t *h = haz_local_ + i; 84 | for (int j = 0; j < STATIC_HAZ_PER_THREAD;
++j) { 85 | if (h->static_haz[j] != NULL) { 86 | if (hazard_count == l->pending_size) { 87 | resize_pending(); 88 | nbd_free(hazards); 89 | haz_defer_free(d, f); 90 | return; 91 | } 92 | hazards[hazard_count++] = h->static_haz[j]; 93 | } 94 | } 95 | for (int j = 0; j < h->dynamic_count; ++j) { 96 | if (h->dynamic[j] != NULL && *h->dynamic[j] != NULL) { 97 | if (hazard_count == l->pending_size) { 98 | resize_pending(); 99 | nbd_free(hazards); 100 | haz_defer_free(d, f); 101 | return; 102 | } 103 | hazards[hazard_count++] = *h->dynamic[j]; 104 | } 105 | } 106 | } 107 | sort_hazards(hazards, hazard_count); 108 | 109 | // check for conflicts 110 | int conflicts_count = 0; 111 | for (int i = 0; i < l->pending_count; ++i) { 112 | pending_t *p = l->pending + i; 113 | if (search_hazards(p->ptr, hazards, hazard_count)) { 114 | l->pending[conflicts_count++] = *p; // put conflicts back on the pending list 115 | } else { 116 | assert(p->free_); 117 | assert(p->ptr); 118 | p->free_(p->ptr); // free pending item 119 | } 120 | } 121 | l->pending_count = conflicts_count; 122 | nbd_free(hazards); 123 | } 124 | assert(l->pending_size > l->pending_count); 125 | l->pending[ l->pending_count ].ptr = d; 126 | l->pending[ l->pending_count ].free_ = f; 127 | l->pending_count++; 128 | } 129 | 130 | haz_t *haz_get_static (int i) { 131 | TRACE("H1", "haz_get_static: %p", i, 0); 132 | if (i >= STATIC_HAZ_PER_THREAD) 133 | return NULL; 134 | LOCALIZE_THREAD_LOCAL(ThreadId, int); 135 | assert(i < STATIC_HAZ_PER_THREAD); 136 | haz_t *ret = &haz_local_[ThreadId].static_haz[i]; 137 | TRACE("H1", "haz_get_static: returning %p", ret, 0); 138 | return ret; 139 | } 140 | 141 | void haz_register_dynamic (haz_t *haz) { 142 | TRACE("H1", "haz_register_dynamic: %p", haz, 0); 143 | LOCALIZE_THREAD_LOCAL(ThreadId, int); 144 | haz_local_t *l = haz_local_ + ThreadId; 145 | 146 | if (l->dynamic_size == 0) { 147 | int n = MAX_NUM_THREADS * STATIC_HAZ_PER_THREAD; 148 | l->dynamic = nbd_malloc(sizeof(haz_t *) * n); 149 | l->dynamic_size = n; 150 | } 151 | 152 | if (l->dynamic_count == l->dynamic_size) { 153 | haz_t **d = nbd_malloc(sizeof(haz_t *) * l->dynamic_size * 2); 154 | memcpy(d, l->dynamic, sizeof(haz_t *) * l->dynamic_size); // copy the old array; the size is in elements, not bytes 155 | nbd_free(l->dynamic); 156 | l->dynamic = d; 157 | l->dynamic_size *= 2; 158 | } 159 | 160 | l->dynamic[ l->dynamic_count++ ] = haz; 161 | } 162 | 163 | // assumes the hazard pointer was registered in the same thread 164 | void haz_unregister_dynamic (void **haz) { 165 | TRACE("H1", "haz_unregister_dynamic: %p", haz, 0); 166 | LOCALIZE_THREAD_LOCAL(ThreadId, int); 167 | haz_local_t *l = haz_local_ + ThreadId; 168 | 169 | for (int i = 0; i < l->dynamic_count; ++i) { 170 | if (l->dynamic[i] == haz) { 171 | if (i != l->dynamic_count - 1) { 172 | l->dynamic[i] = l->dynamic[ l->dynamic_count - 1 ]; 173 | } 174 | l->dynamic_count--; 175 | return; 176 | } 177 | } 178 | assert(0); 179 | } 180 | -------------------------------------------------------------------------------- /runtime/lwt.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Josh Dybnis and released to the public domain, as explained at 3 | * http://creativecommons.org/licenses/publicdomain 4 | * 5 | * lightweight tracing 6 | */ 7 | #include 8 | #include "common.h" 9 | #include "rlocal.h" 10 | #include "lwt.h" 11 | #include "mem.h" 12 | 13 | #define LWT_BUFFER_SCALE 20 14 | #define LWT_BUFFER_SIZE (1ULL << LWT_BUFFER_SCALE) 15 | #define LWT_BUFFER_MASK (LWT_BUFFER_SIZE - 1) 16 | 17 | volatile int halt_ = 0; 18 | 19 | typedef
struct lwt_record { 20 | uint64_t timestamp; 21 | uint64_t format; 22 | size_t value1; 23 | size_t value2; 24 | } lwt_record_t; 25 | 26 | typedef struct lwt_buffer { 27 | uint32_t head; 28 | lwt_record_t x[0]; 29 | } lwt_buffer_t; 30 | 31 | lwt_buffer_t *TraceBuffer[MAX_NUM_THREADS] = {}; 32 | char TraceLevel[256] = {}; 33 | static const char *TraceSpec = ""; 34 | 35 | void lwt_thread_init (void) { 36 | int thread_index = GET_THREAD_INDEX(); 37 | 38 | if (TraceBuffer[thread_index] == NULL) { 39 | TraceBuffer[thread_index] = 40 | (lwt_buffer_t *)nbd_malloc(sizeof(lwt_buffer_t) + sizeof(lwt_record_t) * LWT_BUFFER_SIZE); 41 | memset(TraceBuffer[thread_index], 0, sizeof(lwt_buffer_t)); 42 | } 43 | } 44 | 45 | void lwt_set_trace_level (const char *flags) { 46 | assert(strlen(flags) % 2 == 0); // a well formed should be an even number of characters long 47 | TraceSpec = flags; 48 | memset(TraceLevel, 0, sizeof(TraceLevel)); 49 | for (int i = 0; flags[i]; i+=2) { 50 | TraceLevel[(unsigned)flags[i]] = flags[i+1]; 51 | } 52 | } 53 | 54 | static void dump_record (FILE *file, int thread_id, lwt_record_t *r, uint64_t offset) { 55 | // print the record if its trace category is enabled at a high enough level 56 | int flag = r->format >> 56; 57 | int level = (r->format >> 48) & 0xFF; 58 | if (TraceLevel[(unsigned)flag] >= level) { 59 | char s[3] = {flag, level, '\0'}; 60 | fprintf(file, "%09llu %d %s ", ((uint64_t)r->timestamp - offset) >> 5, thread_id, s); 61 | const char *format = (const char *)(size_t)(r->format & MASK(48)); // strip out the embedded flags 62 | fprintf(file, format, r->value1, r->value2); 63 | fprintf(file, "\n"); 64 | } 65 | } 66 | 67 | static void dump_buffer (FILE *file, int thread_index, uint64_t offset) { 68 | lwt_buffer_t *tb = TraceBuffer[thread_index]; 69 | assert(tb); 70 | if (tb->head > LWT_BUFFER_SIZE) { 71 | for (int i = tb->head & LWT_BUFFER_MASK; i < LWT_BUFFER_SIZE; ++i) { 72 | dump_record(file, thread_index + 1, tb->x + i, offset); 73 | } 74 | } 75 | 76 | for (int i = 0; i < (tb->head & LWT_BUFFER_MASK); ++i) { 77 | dump_record(file, thread_index + 1, tb->x + i, offset); 78 | } 79 | } 80 | 81 | void lwt_halt (void) { 82 | halt_ = 1; 83 | } 84 | 85 | void lwt_dump (const char *file_name) { 86 | halt_ = 1; 87 | uint64_t offset = (uint64_t)-1; 88 | 89 | for (int i = 0; i < MAX_NUM_THREADS; ++i) { 90 | if (TraceBuffer[i] != NULL && TraceBuffer[i]->head != 0) { 91 | uint64_t x = TraceBuffer[i]->x[0].timestamp; 92 | if (x < offset) { 93 | offset = x; 94 | } 95 | if (TraceBuffer[i]->head > LWT_BUFFER_SIZE) 96 | { 97 | x = TraceBuffer[i]->x[TraceBuffer[i]->head & LWT_BUFFER_MASK].timestamp; 98 | if (x < offset) { 99 | offset = x; 100 | } 101 | } 102 | } 103 | } 104 | 105 | if (offset != (uint64_t)-1) { 106 | FILE *file = fopen(file_name, "w"); 107 | assert(file); 108 | for (int i = 0; i < MAX_NUM_THREADS; ++i) { 109 | if (TraceBuffer[i] != NULL) { 110 | dump_buffer(file, i, offset); 111 | } 112 | } 113 | fflush(file); 114 | fclose(file); 115 | } 116 | } 117 | 118 | void lwt_trace_i (uint64_t format, size_t value1, size_t value2) { 119 | while (halt_) {} 120 | lwt_buffer_t *tb = TraceBuffer[GET_THREAD_INDEX()]; 121 | if (tb != NULL) { 122 | unsigned int u, l; 123 | __asm__ __volatile__("rdtsc" : "=a" (l), "=d" (u)); 124 | uint64_t timestamp = ((uint64_t)u << 32) | l; 125 | lwt_record_t temp = { timestamp, format, value1, value2 }; 126 | 127 | tb->x[tb->head++ & LWT_BUFFER_MASK] = temp; 128 | } 129 | } 130 | 
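A brief usage sketch of the tracing runtime above (a hedged illustration: the TRACE macro is
declared in lwt.h and is assumed to pack the two-character category/level code into the top 16 bits
of the format-string pointer, which is what dump_record() unpacks):

    lwt_thread_init();            // once per thread: allocate this thread's trace buffer
    lwt_set_trace_level("m3H2");  // enable category 'm' up to level '3', 'H' up to level '2'
    /* ... run the workload; TRACE(...) calls append records to the ring buffer ... */
    lwt_dump("trace.out");        // halt tracing and write every buffer, timestamps rebased to the earliest record
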
-------------------------------------------------------------------------------- /runtime/mem.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Josh Dybnis and released to the public domain, as explained at 3 | * http://creativecommons.org/licenses/publicdomain 4 | * 5 | * Extreamly fast multi-threaded malloc. 6 | */ 7 | #ifndef USE_SYSTEM_MALLOC 8 | #define _BSD_SOURCE // so we get MAP_ANON on linux 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "common.h" 14 | #include "rlocal.h" 15 | #include "lwt.h" 16 | 17 | #ifndef NBD32 18 | #define MAX_SCALE 36 // allocate blocks up to 64GB (arbitrary, could be bigger) 19 | #define MIN_SCALE 3 // smallest allocated block is 8 bytes 20 | #define MAX_POINTER_BITS 48 21 | #define PAGE_SCALE 21 // 2MB pages 22 | #else 23 | #define MAX_SCALE 31 24 | #define MIN_SCALE 2 // smallest allocated block is 4 bytes 25 | #define MAX_POINTER_BITS 32 26 | #define PAGE_SCALE 12 // 4KB pages 27 | #endif 28 | #define PAGE_SIZE (1ULL << PAGE_SCALE) 29 | #define HEADERS_SIZE (((size_t)1ULL << (MAX_POINTER_BITS - PAGE_SCALE)) * sizeof(header_t)) 30 | 31 | typedef struct block { 32 | struct block *next; 33 | } block_t; 34 | 35 | // TODO: Break the page header into two parts. The first part is located in the header region. The 36 | // second part is located on the page and is only used when there are free items. 37 | typedef struct header { 38 | #ifdef RECYCLE_PAGES 39 | struct header *next; 40 | struct header *prev; 41 | block_t *free_list; // list of free blocks 42 | int num_in_use; 43 | #endif//RECYCLE_PAGES 44 | uint8_t owner; // thread id of owner 45 | uint8_t scale; // log2 of the block size 46 | } header_t; 47 | 48 | #ifdef RECYCLE_PAGES 49 | typedef struct size_class { 50 | header_t *active_page; 51 | header_t *oldest_partial; 52 | header_t *newest_partial; 53 | } size_class_t; 54 | #endif//RECYCLE_PAGES 55 | 56 | typedef struct tl { 57 | #ifndef RECYCLE_PAGES 58 | block_t *free_list[MAX_SCALE+1]; 59 | #else 60 | header_t *free_pages; 61 | size_class_t size_class[MAX_SCALE+1]; 62 | #endif//RECYCLE_PAGES 63 | block_t *blocks_from[MAX_NUM_THREADS]; 64 | block_t *blocks_to[MAX_NUM_THREADS]; 65 | } __attribute__((aligned(CACHE_LINE_SIZE))) tl_t; 66 | 67 | static header_t *headers_ = NULL; 68 | 69 | static tl_t tl_[MAX_NUM_THREADS] = {}; 70 | 71 | static inline header_t *get_header (void *r) { 72 | ASSERT(((size_t)r >> PAGE_SCALE) < HEADERS_SIZE); 73 | return headers_ + ((size_t)r >> PAGE_SCALE); 74 | } 75 | 76 | static void *get_new_region (int block_scale) { 77 | int thread_index = GET_THREAD_INDEX(); 78 | #ifdef RECYCLE_PAGES 79 | tl_t *tl = &tl_[thread_index]; // thread-local data 80 | if (block_scale <= PAGE_SCALE && tl->free_pages != NULL) { 81 | void *region = tl->free_pages; 82 | tl->free_pages = tl->free_pages->next; 83 | get_header(region)->scale = block_scale; 84 | return region; 85 | } 86 | #endif//RECYCLE_PAGES 87 | size_t region_size = (1ULL << block_scale); 88 | if (region_size < PAGE_SIZE) { 89 | region_size = PAGE_SIZE; 90 | } 91 | void *region = mmap(NULL, region_size, PROT_READ|PROT_WRITE, MAP_NORESERVE|MAP_ANON|MAP_PRIVATE, -1, 0); 92 | TRACE("m1", "get_new_region: mmapped new region %p (size %p)", region, region_size); 93 | if (region == (void *)-1) { 94 | perror("get_new_region: mmap"); 95 | exit(-1); 96 | } 97 | if ((size_t)region & (region_size - 1)) { 98 | TRACE("m0", "get_new_region: region not aligned", 0, 0); 99 | munmap(region, region_size); 100 | region = 
mmap(NULL, region_size * 2, PROT_READ|PROT_WRITE, MAP_NORESERVE|MAP_ANON|MAP_PRIVATE, -1, 0); 101 | if (region == (void *)-1) { 102 | perror("get_new_region: mmap"); 103 | exit(-1); 104 | } 105 | TRACE("m0", "get_new_region: mmapped new region %p (size %p)", region, region_size * 2); 106 | void *aligned = (void *)(((size_t)region + region_size) & ~(region_size - 1)); 107 | size_t extra = (char *)aligned - (char *)region; 108 | if (extra) { 109 | munmap(region, extra); 110 | TRACE("m0", "get_new_region: unmapped extra memory %p (size %p)", region, extra); 111 | } 112 | extra = ((char *)region + region_size) - (char *)aligned; 113 | if (extra) { 114 | munmap((char *)aligned + region_size, extra); 115 | TRACE("m0", "get_new_region: unmapped extra memory %p (size %p)", (char *)aligned + region_size, extra); 116 | } 117 | region = aligned; 118 | } 119 | assert(region); 120 | 121 | header_t *h = get_header(region); 122 | TRACE("m1", "get_new_region: header %p (%p)", h, h - headers_); 123 | assert(h->scale == 0); 124 | h->scale = block_scale; 125 | h->owner = thread_index; 126 | 127 | return region; 128 | } 129 | 130 | void mem_init (void) { 131 | assert(headers_ == NULL); 132 | // Allocate space for the page headers. This could be a big chunk of memory on 64 bit systems, 133 | // but it just takes up virtual address space. Physical space used by the headers is still 134 | // proportional to the amount of memory the user mallocs. 135 | headers_ = mmap(NULL, HEADERS_SIZE, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); 136 | TRACE("m1", "mem_init: header page %p", headers_, 0); 137 | 138 | // initialize spsc queues 139 | for (int i = 0; i < MAX_NUM_THREADS; ++i) { 140 | for (int j = 0; j < MAX_NUM_THREADS; ++j) { 141 | if (i != j) { 142 | tl_[i].blocks_to[j] = (block_t *)&(tl_[j].blocks_from[i]); 143 | } 144 | } 145 | } 146 | } 147 | 148 | void nbd_free (void *x) { 149 | TRACE("m1", "nbd_free: block %p page %p", x, (size_t)x & ~MASK(PAGE_SCALE)); 150 | ASSERT(x); 151 | block_t *b = (block_t *)x; 152 | header_t *h = get_header(x); 153 | int b_scale = h->scale; 154 | TRACE("m1", "nbd_free: header %p scale %llu", h, b_scale); 155 | ASSERT(b_scale && b_scale <= MAX_SCALE); 156 | #ifdef RECYCLE_PAGES 157 | if (b_scale > PAGE_SCALE) { 158 | int rc = munmap(x, 1ULL << b_scale); 159 | ASSERT(rc == 0); 160 | rc = rc; 161 | } 162 | #endif 163 | #ifndef NDEBUG 164 | memset(b, 0xcd, (1ULL << b_scale)); // bear trap 165 | #endif 166 | int thread_index = GET_THREAD_INDEX(); 167 | tl_t *tl = &tl_[thread_index]; // thread-local data 168 | if (h->owner == thread_index) { 169 | TRACE("m1", "nbd_free: private block, old free list head %p", tl->free_list[b_scale], 0); 170 | 171 | #ifndef RECYCLE_PAGES 172 | b->next = tl->free_list[b_scale]; 173 | tl->free_list[b_scale] = b; 174 | #else //RECYCLE_PAGES 175 | b->next = h->free_list; 176 | h->free_list = b; 177 | h->num_in_use--; 178 | size_class_t *sc = &tl->size_class[b_scale]; 179 | if (sc->active_page != h) { 180 | if (h->num_in_use == 0) { 181 | // remove from the partial-page list 182 | if (h->next != NULL) { h->next->prev = h->prev; } 183 | if (h->prev != NULL) { h->prev->next = h->next; } 184 | // put on the free-page list 185 | h->next = tl->free_pages; 186 | tl->free_pages = h; 187 | } else { 188 | // move to the top of the partial-page list 189 | if (h->next != NULL) { 190 | h->next->prev = h->prev; 191 | if (h->prev != NULL) { h->prev->next = h->next; } 192 | h->prev = sc->newest_partial; 193 | h->next = NULL; 194 | sc->newest_partial = h; 195 | } 196 | 
197 | }
198 | #endif//RECYCLE_PAGES
199 | } else {
200 | // push onto its owner's queue
201 | int b_owner = h->owner;
202 | TRACE("m1", "nbd_free: owner %llu", b_owner, 0);
203 | 
204 | // The assignment statements are volatile to prevent the compiler from reordering them.
205 | VOLATILE_DEREF(b).next = NULL;
206 | VOLATILE_DEREF(tl->blocks_to[b_owner]).next = b;
207 | 
208 | tl->blocks_to[b_owner] = b;
209 | }
210 | }
211 | 
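An illustrative sketch (added commentary, not part of the original source) of the wait-free single-producer single-consumer queue trick used by nbd_free() above and drained by process_incoming_blocks() below: each pair of threads shares a singly linked list, the producer appends after the current tail, and the consumer always leaves the last node in place, so the two threads never write the same link at the same time. The producer side, reduced to its generic form (spsc_push_example is a hypothetical name):

static inline void spsc_push_example (block_t **tail, block_t *node) {
    VOLATILE_DEREF(node).next = NULL;  // 1. terminate the new node
    VOLATILE_DEREF(*tail).next = node; // 2. publish it after the current tail node
    *tail = node;                      // 3. privately advance the producer's tail pointer
}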
212 | static inline void process_incoming_blocks (tl_t *tl) {
213 | for (int p = 0; p < MAX_NUM_THREADS; ++p) {
214 | block_t *b = tl->blocks_from[p];
215 | if (EXPECT_FALSE(b == NULL)) continue; // the queue is completely empty
216 | 
217 | // Leave the last block on the queue. Removing the last block on the queue would create a
218 | // race with the producer thread putting a new block on the queue.
219 | for (block_t *next = b->next; next != NULL; b = next, next = b->next) {
220 | // push onto the appropriate free list
221 | #ifndef RECYCLE_PAGES
222 | int b_scale = get_header(b)->scale;
223 | b->next = tl->free_list[b_scale];
224 | tl->free_list[b_scale] = b;
225 | #else //RECYCLE_PAGES
226 | header_t *h = get_header(b);
227 | b->next = h->free_list;
228 | h->free_list = b;
229 | #endif//RECYCLE_PAGES
230 | }
231 | tl->blocks_from[p] = b;
232 | }
233 | }
234 | 
235 | static inline block_t *pop_free_list (tl_t *tl, int scale) {
236 | #ifndef RECYCLE_PAGES
237 | block_t **free_list = &tl->free_list[scale];
238 | #else //RECYCLE_PAGES
239 | size_class_t *sc = &tl->size_class[scale];
240 | if (EXPECT_FALSE(sc->active_page == NULL))
241 | return NULL;
242 | block_t **free_list = &sc->active_page->free_list;
243 | #endif//RECYCLE_PAGES
244 | block_t *b = *free_list;
245 | if (EXPECT_FALSE(b == NULL))
246 | return NULL;
247 | ASSERT(get_header(b)->scale == scale);
248 | *free_list = b->next;
249 | return b;
250 | }
251 | 
252 | // Allocate a block of memory at least size n. Blocks are binned in powers-of-two. Round up to
253 | // the nearest power of two.
254 | //
255 | // First check the current thread's free list for an available block. If there are no blocks on the
256 | // free list, pull items off of the current thread's incoming block queues and push them onto the
257 | // free list. If we didn't get an appropriate size block off of the block queues then allocate a new
258 | // page, break it up into blocks and push them onto the free list.
259 | void *nbd_malloc (size_t n) {
260 | // the scale is the log base 2 of n, rounded up
261 | int b_scale = (sizeof(void *) * __CHAR_BIT__) - __builtin_clzl((n) - 1);
262 | TRACE("m1", "nbd_malloc: size %llu (scale %llu)", n, b_scale);
263 | 
264 | if (EXPECT_FALSE(b_scale < MIN_SCALE)) { b_scale = MIN_SCALE; }
265 | if (EXPECT_FALSE(b_scale > MAX_SCALE)) { return NULL; }
266 | 
267 | tl_t *tl = &tl_[GET_THREAD_INDEX()]; // thread-local data
268 | 
269 | block_t *b = pop_free_list(tl, b_scale);
270 | if (b != NULL) {
271 | TRACE("m1", "nbd_malloc: returning block %p", b, 0);
272 | assert(b);
273 | return b;
274 | }
275 | 
276 | // The free list is empty so process blocks freed from other threads and then check again.
277 | process_incoming_blocks(tl);
278 | b = pop_free_list(tl, b_scale);
279 | if (b != NULL) {
280 | TRACE("m1", "nbd_malloc: returning block %p", b, 0);
281 | assert(b);
282 | return b;
283 | }
284 | 
285 | #ifdef RECYCLE_PAGES
286 | // The current active page is completely allocated. Make the oldest partially allocated page
287 | // the new active page.
288 | size_class_t *sc = &tl->size_class[b_scale];
289 | if (sc->oldest_partial != NULL) {
290 | sc->active_page = sc->oldest_partial;
291 | sc->oldest_partial = sc->oldest_partial->next;
292 | if (sc->oldest_partial != NULL) { sc->oldest_partial->prev = NULL; }
293 | b = pop_free_list(tl, b_scale);
294 | ASSERT(b != NULL);
295 | TRACE("m1", "nbd_malloc: returning block %p", b, 0);
296 | assert(b);
297 | return b;
298 | }
299 | // There are no partially allocated pages so get a new page.
300 | 
301 | #endif//RECYCLE_PAGES
302 | 
303 | // Get a new page.
304 | char *page = get_new_region(b_scale);
305 | b = (block_t *)page; // grab the first block on the page
306 | 
307 | // Break up the remainder of the page into blocks and put them on the free list. Start at the
308 | // end of the page so that the free list ends up in increasing order, for ease of debugging.
309 | if (b_scale < PAGE_SCALE) {
310 | size_t block_size = (1ULL << b_scale);
311 | block_t *head = NULL;
312 | for (int offset = PAGE_SIZE - block_size; offset > 0; offset -= block_size) {
313 | block_t *x = (block_t *)(page + offset);
314 | x->next = head; head = x;
315 | }
316 | #ifndef RECYCLE_PAGES
317 | tl->free_list[b_scale] = head;
318 | #else //RECYCLE_PAGES
319 | sc->active_page = get_header(page);
320 | sc->active_page->free_list = head;
321 | #endif//RECYCLE_PAGES
322 | }
323 | 
324 | TRACE("m1", "nbd_malloc: returning block %p from new region %p", b, (size_t)b & ~MASK(PAGE_SCALE));
325 | assert(b);
326 | return b;
327 | }
328 | #else//USE_SYSTEM_MALLOC
329 | #include <stdlib.h>
330 | #include "common.h"
331 | #include "rlocal.h"
332 | #include "lwt.h"
333 | 
334 | void mem_init (void) {
335 | return;
336 | }
337 | 
338 | void nbd_free (void *x) {
339 | TRACE("m1", "nbd_free: %p", x, 0);
340 | #ifndef NDEBUG
341 | memset(x, 0xcd, sizeof(void *)); // bear trap
342 | #endif//NDEBUG
343 | free(x);
344 | return;
345 | }
346 | 
347 | void *nbd_malloc (size_t n) {
348 | TRACE("m1", "nbd_malloc: request size %llu", n, 0);
349 | void *x = malloc(n);
350 | TRACE("m1", "nbd_malloc: returning %p", x, 0);
351 | return x;
352 | }
353 | #endif//USE_SYSTEM_MALLOC
354 | 
--------------------------------------------------------------------------------
/runtime/mem2.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Written by Josh Dybnis and released to the public domain, as explained at
3 | * http://creativecommons.org/licenses/publicdomain
4 | *
5 | * fast multi-threaded malloc.
6 | */
7 | #ifndef USE_SYSTEM_MALLOC
8 | #define _BSD_SOURCE // so we get MAP_ANON on linux
9 | #include <sys/mman.h>
10 | #include <stdio.h>
11 | #include <errno.h>
12 | #include <stdlib.h>
13 | #include "common.h"
14 | #include "rlocal.h"
15 | #include "lwt.h"
16 | 
17 | #define CHUNK_SCALE 12 // 4k chunks
18 | #define PAGE_SCALE 21 // 2MB pages
19 | #define PAGE_SIZE (1ULL << PAGE_SCALE)
20 | 
21 | // On both linux and Mac OS X the size of the mmap-able virtual address space is between 2^46 and 2^47. Linux has
22 | // no problem when you grab the whole thing. Mac OS X apparently does some O(n) thing on the first page fault
23 | // that takes over 2 seconds if you mmap 2^46 bytes. So on Mac OS X we only take 2^38 bytes of virtual space, which
24 | // is OK, since you can only buy a Mac with up to 32GB of RAM (as of 2/09).
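A minimal sketch (added for illustration, not in the original file) of the reserve-then-commit pattern this comment describes and that mem_init() below uses for the page map: reserve a huge span with PROT_NONE and MAP_NORESERVE, which costs no physical memory, then commit pieces with mprotect() as they are actually needed. reserve_va_example is a hypothetical name:

static void *reserve_va_example (size_t reserve_size, size_t commit_size) {
    char *base = (char *)mmap(NULL, reserve_size, PROT_NONE,
                              MAP_NORESERVE|MAP_ANON|MAP_PRIVATE, -1, 0);
    if (base == MAP_FAILED)
        return NULL;                                         // reservation failed
    if (mprotect(base, commit_size, PROT_READ|PROT_WRITE) != 0)
        return NULL;                                         // committing the first piece failed
    return base;
}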
25 | #ifndef NBD32
26 | #ifdef __MACOSX__
27 | #define TOTAL_SCALE 38
28 | #else //__MACOSX__
29 | #define TOTAL_SCALE 46
30 | #endif//__MACOSX__
31 | #else// NBD32
32 | #define TOTAL_SCALE 32
33 | #endif//NBD32
34 | #define TOTAL_SIZE (1ULL << TOTAL_SCALE)
35 | 
36 | #define INVALID_SLAB_CLASS 255
37 | #define METASLAB_CLASS_MAX 2
38 | #define NESTED_4K_SLAB_CLASS_MAX 16
39 | #define NESTED_32K_SLAB_CLASS_MAX 39
40 | #define NESTED_256K_SLAB_CLASS_MAX 63
41 | #define NESTED_SLAB_CLASS_MAX NESTED_256K_SLAB_CLASS_MAX
42 | #define LARGE_SLAB_CLASS_MAX 93
43 | #define HUGE_SLAB_CLASS_MAX (sizeof(BlockSize) / sizeof(*BlockSize) - 1)
44 | #define SLAB_CLASS_MAX HUGE_SLAB_CLASS_MAX
45 | 
46 | #define NESTED_SLAB_CASES NESTED_4K_SLAB_CASES: case NESTED_32K_SLAB_CASES: case NESTED_256K_SLAB_CASES
47 | #define NESTED_4K_SLAB_CASES METASLAB_CLASS_MAX+1 ... NESTED_4K_SLAB_CLASS_MAX
48 | #define NESTED_32K_SLAB_CASES NESTED_4K_SLAB_CLASS_MAX+1 ... NESTED_32K_SLAB_CLASS_MAX: case 0
49 | #define NESTED_256K_SLAB_CASES NESTED_32K_SLAB_CLASS_MAX+1 ... NESTED_SLAB_CLASS_MAX: case 1
50 | #define LARGE_SLAB_CASES NESTED_SLAB_CLASS_MAX+1 ... LARGE_SLAB_CLASS_MAX: case 2
51 | #define HUGE_SLAB_CASES LARGE_SLAB_CLASS_MAX+1 ... HUGE_SLAB_CLASS_MAX
52 | 
53 | #define SLAB_CLASS_SCALE(class) ({ \
54 | int _scale = 0; \
55 | switch (class) { \
56 | case NESTED_4K_SLAB_CASES: _scale = 12; break; \
57 | case NESTED_32K_SLAB_CASES: _scale = 15; break; \
58 | case NESTED_256K_SLAB_CASES: _scale = 18; break; \
59 | case LARGE_SLAB_CASES: _scale = 21; break; \
60 | } \
61 | _scale; \
62 | })
63 | 
64 | // indexed by class
65 | static const uint32_t BlockSize[] = {
66 | // meta slab classes (for the nested slabs)
67 | 1 << 12, 1 << 15, 1 << 18,
68 | 
69 | // nested slab classes (4kB, 32kB, and 256kB)
70 | 8, 16, 24, 32, 40, 48, 56, 64, 72, 80,
71 | 88, 96, 112, 120, 128, 144, 160, 176, 192, 224,
72 | 256, 288, 320, 352, 384, 416, 448, 480, 512, 576,
73 | 640, 704, 768, 832, 896, 960, 1024, 1152, 1280, 1408,
74 | 1536, 1664, 1856, 2048, 2240, 2432, 2688, 2944, 3200, 3520,
75 | 3840, 4160, 4544, 4928, 5312, 5696, 6144, 6592, 7040, 7488,
76 | 7936,
77 | 
78 | // large slab classes (full page, 2MB)
79 | 8896, 9984, 11200, 12544, 14016, 15616, 17408, 19328, 21440, 23744,
80 | 26176, 28800, 31616, 34624, 37760, 41024, 44416, 47936, 51584, 55296,
81 | 59008, 62784, 66496, 70208, 73856, 77376, 80832, 84160, 87360, 90368,
82 | 93248, 95936, 98496, 100864,
83 | 
84 | // huge slabs (slabs on huge blocks, 2MB-4MB)
85 | 110912, 121984, 134144, 147520, 162240, 178432, 196224, 215808, 237376, 261056,
86 | 287104, 315776, 347328, 382016, 420160, 462144, 508352, 559168, 615040, 676544,
87 | 744192, 818560, 900416, 990400, 1089408, 1198336, 1318144, 1449920, 1594880, 1754368,
88 | 1929792
89 | };
90 | 
91 | typedef uint8_t class_t;
92 | 
93 | typedef struct block {
94 | struct block *next;
95 | } block_t;
96 | 
97 | typedef struct slab {
98 | unsigned valid:1;
99 | unsigned free_list:15;
100 | unsigned num_in_use:9;
101 | unsigned class:6;
102 | } __attribute__((packed)) slab_t;
103 | 
104 | typedef struct metaslab {
105 | 
106 | char * data;
107 | slab_t slab[1 << (PAGE_SCALE - CHUNK_SCALE)]; // one header per 4kB chunk on the page
108 | struct {
109 | struct metaslab *older;
110 | struct metaslab *newer;
111 | } q[NESTED_SLAB_CLASS_MAX+1];
112 | uint64_t partial_slab_bitmap2[NESTED_32K_SLAB_CLASS_MAX+1];
113 | uint8_t partial_slab_bitmap1[NESTED_SLAB_CLASS_MAX+1];
114 | } metaslab_t;
115 | 
116 | char *MemBase = NULL;
117 | char *MemEnd = NULL;
118 | char *PageBreak = NULL;
119 | size_t *PageMap = NULL;
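An illustrative helper (added, not part of the original file; example_class_for is a hypothetical name) showing how a request size maps into the BlockSize table above: skip the meta classes and take the first class whose block size fits, so a 100-byte request lands in the 112-byte class. get_slab_class() below is the file's real version of this lookup.

static class_t example_class_for (uint32_t size) {
    for (class_t class = METASLAB_CLASS_MAX + 1; class <= SLAB_CLASS_MAX; ++class) {
        if (size <= BlockSize[class])
            return class;          // first class big enough for the request
    }
    return INVALID_SLAB_CLASS;     // larger than the largest huge class
}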
120 | block_t *FreePages = NULL;
121 | struct { slab_t *slab; char *slab_base; int slab_index; metaslab_t *metaslab; } ActiveSlab[SLAB_CLASS_MAX + 1] = {}; // slab_index/metaslab track the active nested slab
122 | 
123 | struct {
124 | size_t slabs_in_use;
125 | size_t bytes_requested;
126 | size_t bytes_allocated;
127 | size_t total_bytes_allocated;
128 | } ClassStats[METASLAB_CLASS_MAX+1];
129 | 
130 | struct {
131 | slab_t *oldest;
132 | slab_t *newest;
133 | } PartialSlabQueue[SLAB_CLASS_MAX+1];
134 | 
135 | struct {
136 | slab_t *oldest;
137 | } FreshPartialSlabQueue[SLAB_CLASS_MAX+1];
138 | 
139 | static block_t *get_block (class_t slab_class);
140 | 
141 | void mem_init (void) {
142 | ASSERT(INVALID_SLAB_CLASS > SLAB_CLASS_MAX);
143 | 
144 | void *buf = mmap(NULL, TOTAL_SIZE, PROT_NONE, MAP_NORESERVE|MAP_ANON|MAP_PRIVATE, -1, 0);
145 | if (buf == (void *)-1) {
146 | perror("mmap");
147 | exit(-1);
148 | }
149 | MemEnd = buf + TOTAL_SIZE;
150 | MemBase = (char *)( ((size_t)buf + PAGE_SIZE-1) & ~(PAGE_SIZE-1) ); // align to a page boundary
151 | 
152 | size_t page_map_size = sizeof(void *) << (TOTAL_SCALE - PAGE_SCALE); // one entry per page
153 | mprotect(MemBase, page_map_size, PROT_READ|PROT_WRITE);
154 | PageBreak = MemBase + page_map_size;
155 | PageMap = (size_t *)MemBase;
156 | }
157 | 
158 | static class_t get_slab_class (size_t size) {
159 | for (int i = METASLAB_CLASS_MAX + 1; i <= SLAB_CLASS_MAX; ++i) {
160 | if (size <= BlockSize[i])
161 | return i;
162 | }
163 | return INVALID_SLAB_CLASS;
164 | }
165 | 
166 | static class_t get_meta_class (class_t class) {
167 | int scale = SLAB_CLASS_SCALE(class);
168 | if (scale == PAGE_SCALE || scale == 0)
169 | return INVALID_SLAB_CLASS;
170 | return (scale - 12) / 3;
171 | }
172 | 
173 | static void *get_page (void) {
174 | block_t *p = FreePages;
175 | if (p == NULL) {
176 | p = (block_t *)PageBreak;
177 | PageBreak += PAGE_SIZE;
178 | return p;
179 | }
180 | FreePages = p->next;
181 | return p;
182 | }
183 | 
184 | static void free_page (void *p) {
185 | ASSERT(p < (void *)PageBreak);
186 | block_t *b = (block_t *)p;
187 | b->next = FreePages;
188 | FreePages = b;
189 | }
190 | 
191 | static void init_slab (void *b, class_t slab_class) {
192 | } // TODO: not implemented yet; should initialize the slab header and its free list
193 | 
194 | static slab_t *new_large_slab (class_t slab_class) {
195 | return NULL; // TODO: not implemented yet
196 | }
197 | 
198 | static int find_partial_slab (metaslab_t *metaslab, class_t target_class, int target_index) {
199 | switch (target_class) {
200 | case NESTED_4K_SLAB_CASES:
201 | {
202 | // search nearby the target first
203 | int base_index = (target_index & ~0x7);
204 | for (int i = 0; i < 8; ++i) {
205 | if (base_index + i == target_index)
206 | continue;
207 | if (metaslab->slab[base_index + i].class == target_class)
208 | return base_index + i;
209 | }
210 | do {
211 | metaslab->partial_slab_bitmap2[target_class] &= ~(1ULL << (base_index >> 3));
212 | uint64_t bitmap = metaslab->partial_slab_bitmap2[target_class];
213 | if (bitmap == 0)
214 | return -1;
215 | int n = base_index >> 3;
216 | if (bitmap & (0xFFULL << (n & ~0x7))) {
217 | bitmap &= 0xFFULL << (n & ~0x7); // search nearby the target first
218 | }
219 | base_index = COUNT_TRAILING_ZEROS(bitmap) << 3;
220 | for (int i = 0; i < 8; ++i) {
221 | if (metaslab->slab[base_index + i].class == target_class)
222 | return base_index + i;
223 | }
224 | } while (1);
225 | }
226 | case NESTED_32K_SLAB_CASES:
227 | {
228 | uint64_t bitmap = metaslab->partial_slab_bitmap2[target_class];
229 | if (bitmap == 0)
230 | return -1;
231 | int n = target_index >> 3;
232 | if (bitmap & (0xFFULL << (n & ~0x7))) {
233 | bitmap &= 0xFFULL << (n & ~0x7); // search nearby the target first
234 | }
235 | return COUNT_TRAILING_ZEROS(bitmap) << 3;
236 | }
237 | case NESTED_256K_SLAB_CASES:
238 | {
239 | uint8_t bitmap = metaslab->partial_slab_bitmap1[target_class];
240 | if (bitmap == 0)
241 | return -1;
242 | return COUNT_TRAILING_ZEROS(bitmap) << 6;
243 | }
244 | default:
245 | ASSERT(FALSE);
246 | return -1;
247 | }
248 | }
249 | 
250 | static void activate_new_slab (class_t slab_class) {
251 | slab_t *new_slab = NULL;
252 | switch (slab_class) {
253 | case NESTED_SLAB_CASES: {
254 | int slab_index = ActiveSlab[slab_class].slab_index;
255 | metaslab_t *metaslab = ActiveSlab[slab_class].metaslab;
256 | 
257 | // First look for a partial slab on the same metaslab as the old active slab.
258 | int new_index = (metaslab == NULL) ? -1 : find_partial_slab(metaslab, slab_class, slab_index);
259 | if (new_index == -1) {
260 | // No partial slab on the same metaslab. Remove a metaslab from the front of the queue.
261 | metaslab = (metaslab_t *)PartialSlabQueue[slab_class].oldest;
262 | if (metaslab != NULL) {
263 | ASSERT(metaslab->q[slab_class].older == NULL);
264 | PartialSlabQueue[slab_class].oldest = (slab_t *)metaslab->q[slab_class].newer;
265 | if (metaslab->q[slab_class].newer != NULL) { metaslab->q[slab_class].newer->q[slab_class].older = NULL; }
266 | new_index = find_partial_slab(metaslab, slab_class, slab_index);
267 | } else {
268 | // Can't find a partial slab; create a new slab.
269 | new_slab = (slab_t *)get_block(get_meta_class(slab_class));
270 | init_slab(new_slab, slab_class);
271 | }
272 | }
273 | if (new_index != -1) { new_slab = &metaslab->slab[new_index]; } break; }
274 | 
275 | case LARGE_SLAB_CASES:
276 | case HUGE_SLAB_CASES:
277 | // large or huge slab class
278 | new_slab = PartialSlabQueue[slab_class].oldest;
279 | if (new_slab != NULL) {
280 | ASSERT(new_slab->older == NULL);
281 | PartialSlabQueue[slab_class].oldest = new_slab->newer;
282 | if (new_slab->newer != NULL) { new_slab->newer->older = NULL; }
283 | }
284 | if (new_slab == NULL) {
285 | if (IS_HUGE_SLAB_CLASS(slab_class)) {
286 | new_slab = new_large_slab(slab_class);
287 | } else {
288 | ASSERT(IS_LARGE_SLAB_CLASS(slab_class));
289 | new_slab = (slab_t *)get_page();
290 | }
291 | init_slab(new_slab, slab_class);
292 | }
293 | break;
294 | 
295 | default:
296 | ASSERT(FALSE);
297 | }
298 | 
299 | ActiveSlab[slab_class].slab = new_slab; // TODO: the nested case should also update .slab_index and .metaslab
300 | }
301 | 
302 | static block_t *get_block (class_t slab_class) {
303 | 
304 | // Look for a free block on the active slab.
305 | switch (slab_class) {
306 | case NESTED_SLAB_CASES: {
307 | int slab_index = ActiveSlab[slab_class].slab_index;
308 | metaslab_t *metaslab = ActiveSlab[slab_class].metaslab;
309 | if (metaslab != NULL) {
310 | slab_t slab = metaslab->slab[slab_index];
311 | if (slab.free_list) {
312 | char *slab_base = metaslab->data + ( ( slab_index - 1 ) << SLAB_CLASS_SCALE(slab_class) );
313 | void *b = (void *)( slab_base + ( ( slab.free_list - 1 ) << 3 ) );
314 | metaslab->slab[slab_index].free_list = *(uint16_t *)b;
315 | return b;
316 | }
317 | }
318 | break; }
319 | 
320 | case LARGE_SLAB_CASES:
321 | //TODO
322 | break;
323 | 
324 | case HUGE_SLAB_CASES:
325 | //TODO
326 | break;
327 | 
328 | default:
329 | ASSERT(FALSE);
330 | }
331 | 
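/*
 * Added note (for illustration; not in the original file): the bitmaps scanned by
 * find_partial_slab() above encode one bit per slab group. A 2MB page holds 512
 * 4kB chunks, tracked as 64 groups of 8, so for the 4kB classes bit k of
 * partial_slab_bitmap2[class] covers chunk indexes (k << 3) .. (k << 3) + 7, and
 * COUNT_TRAILING_ZEROS(bitmap) << 3 recovers the first chunk of the lowest marked
 * group; e.g. a bitmap with only bit 6 set sends the search to chunks 48..55.
 * For the 32kB classes bit k maps directly to the slab at chunk index k << 3, and
 * for the 256kB classes bit k of the 8-bit partial_slab_bitmap1[class] maps to
 * chunk index k << 6.
 */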
332 | // Find another slab, activate it, and try again.
333 | activate_new_slab(slab_class);
334 | return get_block(slab_class); // recursive tail-call
335 | }
336 | 
337 | void *nbd_malloc (size_t n) {
338 | TRACE("m1", "nbd_malloc: size %llu", n, 0);
339 | if (n == 0)
340 | return NULL;
341 | 
342 | block_t *b = get_block( get_slab_class(n) );
343 | 
344 | TRACE("m1", "nbd_malloc: returning block %p", b, 0);
345 | return b;
346 | }
347 | 
348 | void nbd_free (void *x) {
349 | TRACE("m1", "nbd_free: block %p", x, 0);
350 | ASSERT(x);
351 | ASSERT(x >= (void *)MemBase && x < (void *)MemEnd);
352 | 
353 | block_t *b = (block_t *)x;
354 | size_t page_index = (size_t)b >> PAGE_SCALE;
355 | metaslab_t *metaslab = (metaslab_t *)PageMap[page_index];
356 | ASSERT(metaslab);
357 | size_t slab_index = ((size_t)b & MASK(PAGE_SCALE)) >> 12;
358 | slab_t slab = metaslab->slab[slab_index];
359 | 
360 | // if the slab is not valid, the block is on a larger slab.
361 | if (slab.valid) {
362 | *(uint16_t *)b = slab.free_list; // thread the old head through the first two bytes of the block
363 | // the free-list offset of the block is biased by 1 so 0 can represent NULL.
364 | slab.free_list = ( ((size_t)b & MASK(12)) >> 3 ) + 1;
365 | } else {
366 | // the block is not on a 4kB slab.
367 | slab_index &= ~0x7; // Try the 32kB slab.
368 | slab = metaslab->slab[slab_index];
369 | if (slab.valid) {
370 | *(uint16_t *)b = slab.free_list;
371 | slab.free_list = ( ((size_t)b & MASK(15)) >> 3 ) + 1;
372 | } else {
373 | // the block is not on a 32kB slab.
374 | slab_index &= ~0x3F; // must be on the 256kB slab.
375 | slab = metaslab->slab[slab_index];
376 | ASSERT(slab.valid);
377 | *(uint16_t *)b = slab.free_list;
378 | slab.free_list = ( ((size_t)b & MASK(18)) >> 3 ) + 1;
379 | }
380 | }
381 | --slab.num_in_use;
382 | metaslab->slab[slab_index] = slab;
383 | if (slab.num_in_use == 0) {
384 | free_slab(metaslab, slab_index); // TODO: free_slab is not implemented in this file yet
385 | }
386 | }
387 | 
388 | #else//USE_SYSTEM_MALLOC
389 | #include <stdlib.h>
390 | 
391 | void mem_init (void) {
392 | return;
393 | }
394 | 
395 | void nbd_free (void *x) {
396 | TRACE("m1", "nbd_free: %p", x, 0);
397 | #ifndef NDEBUG
398 | memset(x, 0xcd, sizeof(void *)); // bear trap
399 | #endif//NDEBUG
400 | free(x);
401 | return;
402 | }
403 | 
404 | void *nbd_malloc (size_t n) {
405 | TRACE("m1", "nbd_malloc: request size %llu", n, 0);
406 | void *x = malloc(n);
407 | TRACE("m1", "nbd_malloc: returning %p", x, 0);
408 | return x;
409 | }
410 | #endif//USE_SYSTEM_MALLOC
411 | 
--------------------------------------------------------------------------------
/runtime/mem_class_calc.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <assert.h>
4 | 
5 | typedef unsigned char uint8_t;
6 | typedef unsigned short uint16_t;
7 | typedef unsigned int uint32_t;
8 | 
9 | #define CACHE_LINE_SCALE 6
10 | 
11 | // Return the expected fraction of bytes wasted per slab.
12 | //
13 | // The internal fragmentation due to using size classes is biased by including the space required
14 | // for a pointer to each block.
15 | double calc_frag(int slab_size, int block_size, int delta)
16 | {
17 | double quant = (double)delta / 2 / block_size;
18 | assert(quant >= 0.0);
19 | int blocks_per_slab = (int)(slab_size / block_size);
20 | 
21 | // internal fragmentation that comes from tiling non-power-of-2 sized blocks in slabs
22 | int extra_space = slab_size - blocks_per_slab * block_size;
23 | assert(extra_space < block_size);
24 | 
25 | // number of different cache line colors needed to evenly distribute cache line accesses
26 | int num_colors = block_size >> CACHE_LINE_SCALE;
27 | if (num_colors <= 1)
28 | return (double)extra_space/slab_size + quant;
29 | 
30 | int num_overflow = num_colors - 1 - (extra_space >> CACHE_LINE_SCALE);
31 | if (num_overflow <= 0)
32 | return (double)extra_space/slab_size + quant;
33 | 
34 | double coloring = (double)num_overflow * block_size / num_colors;
35 | return ((double)extra_space + coloring)/slab_size + quant;
36 | }
37 | 
38 | // size classes for various alignments, max 6% expected internal fragmentation
39 | 
40 | // 2B-128B blocks, 4k slab
41 | static uint8_t A1_4kB[] = { 2, 3, 5, 7, 9, 11, 14, 17, 20, 24, 28, 33, 39, 46, 53, 62, 70, 80, 91, 105, 120, 128 };
42 | static uint8_t A2_4kB[] = { 2, 4, 6, 8, 10, 14, 18, 22, 28, 34, 40, 48, 56, 66, 74, 84, 94, 104, 120, 128 };
43 | static uint8_t A4_4kB[] = { 4, 8, 12, 16, 20, 24, 32, 40, 48, 56, 68, 80, 92, 104, 120, 128 };
44 | static uint8_t A8_4kB[] = { 8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 120, 128 };
45 | static uint8_t A16_4kB[] = { 16, 32, 48, 64, 80, 96, 112, 128 };
46 | 
47 | // 128B-1kB blocks, 32k slab
48 | static uint16_t A1_32kB[] = { 137, 156, 178, 201, 227, 256, 288, 323, 361, 402, 447, 494, 545, 598, 654, 712, 771, 832, 895, 958, 1022 };
49 | static uint16_t A8_32kB[] = { 144, 168, 192, 224, 256, 296, 336, 376, 424, 472, 528, 584, 640, 704, 768, 832, 896, 960, 1024 };
50 | static uint16_t A16_32kB[] = { 144, 176, 208, 240, 272, 320, 368, 416, 464, 512, 576, 640, 704, 768, 832, 896, 960, 1024 };
51 | 
52 | // 1kB-8kB blocks, 256k slab
53 | static uint16_t A1_256kB[] = { 1152, 1297, 1458, 1636, 1832, 2048, 2284, 2541, 2820, 3124, 3550, 3904, 4280, 4676, 5092, 5525, 5974, 6435, 6906, 7380, 7856 };
54 | static uint16_t A8_256kB[] = { 1152, 1288, 1440, 1608, 1792, 2000, 2224, 2472, 2744, 3032, 3344, 3680, 4040, 4416, 4816, 5232, 5664, 6112, 6568, 7032, 7504, 7976 };
55 | static uint16_t A64_256kB[] = { 1152, 1280, 1408, 1536, 1664, 1856, 2048, 2240, 2432, 2688, 2944, 3200, 3520, 3840, 4160, 4544, 4928, 5312, 5696, 6144, 6592, 7040, 7488, 7936 };
56 | 
57 | // 8kB-100kB blocks, 2MB slab
58 | static uint32_t A64_2MB[] = {
59 | 8896, 9984, 11200, 12544, 14016, 15616, 17408, 19328, 21440, 23744, 26176, 28800, 31616, 34624, 37760, 41024,
60 | 44416, 47936, 51584, 55296, 59008, 62784, 66496, 70208, 73856, 77376, 80832, 84160, 87360, 90368, 93248, 95936,
61 | 98496, 100864
62 | };
63 | 
64 | int main (void) {
65 | 
66 | double x = 100864;
67 | int n;
68 | for (n = 0; n < 40 && x < (1 << 21); ++n) {
69 | x *= 1.1;
70 | x = (uint32_t)x & ~63;
71 | printf("%u, ", (uint32_t)x);
72 | }
73 | printf("\n%d\n", n);
74 | return 0; // NOTE: this early return makes the size-class search below unreachable; remove it to run that code
75 | const int start1 = 120832;
76 | const int start2 = 1408;
77 | const int alignment = 64;
78 | #define ischosen(x) \
79 | (x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
80 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
81 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
82 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
83 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
84 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
85 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
86 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || \
87 | x == 0 || x == 0 || x == 0 || x == 0 || x == 0 || x == 0)
88 | 
89 | const int slab_size = 1 << 21;
90 | const double thresh = .06;
91 | int block_size;
92 | int i = 0;
93 | for (block_size = start1; i < 87 && block_size < (slab_size >> 3); ++i, block_size += alignment) {
94 | printf("%5d ", block_size);
95 | 
96 | int d;
97 | double min = 1;
98 | int ch = block_size + alignment;
99 | for (d = block_size; d >= alignment; d-=alignment) {
100 | int x = block_size - d;
101 | if (ischosen(x)) {
102 | double f = calc_frag(slab_size, block_size, d);
103 | if (f < thresh && f < min) { min = f; ch = d; }
104 | }
105 | }
106 | 
107 | for (d = start2; d > start2 - 1024; d-=alignment) {
108 | if (d <= block_size && d <= ch) {
109 | double f = calc_frag(slab_size, block_size, d);
110 | if (f < thresh) {
111 | if (d == ch) {
112 | printf(" *%3.1f%% ", f*100);
113 | } else {
114 | printf(" %4.1f%% ", f*100);
115 | }
116 | continue;
117 | }
118 | }
119 | if (d-1 <= block_size && d-alignment <= ch && calc_frag(slab_size, block_size, d - alignment) < thresh) {
120 | printf("%6d ", block_size);
121 | continue;
122 | }
123 | printf(" ");
124 | }
125 | 
126 | if (ischosen(block_size)) {
127 | printf("%5d*", block_size);
128 | } else {
129 | printf("%5d", block_size);
130 | }
131 | printf("\n");
132 | }
133 | return 0;
134 | }
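A worked instance (added for illustration, not in the original file) of the estimate calc_frag() above computes. For a 4kB slab of 128-byte blocks whose neighboring size class is 8 bytes smaller: quant = 8/2/128 = 0.03125, the blocks tile the slab exactly (extra_space = 0), num_colors = 128>>6 = 2, num_overflow = 2 - 1 - 0 = 1, and the coloring term is 1*128/2 = 64 bytes, giving 64/4096 + 0.03125, about 4.7% expected waste, under the 6% threshold used for the tables above.

static void calc_frag_example (void) {
    double f = calc_frag(1 << 12, 128, 8);        // slab_size, block_size, delta
    printf("expected waste: %.2f%%\n", f * 100);  // prints "expected waste: 4.69%"
}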
135 | 
--------------------------------------------------------------------------------
/runtime/random.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <errno.h>
5 | #include <fcntl.h>
6 | #include <unistd.h>
7 | 
8 | #include "common.h"
9 | #include "runtime.h"
10 | 
11 | DECLARE_THREAD_LOCAL(rx_, uint32_t);
12 | DECLARE_THREAD_LOCAL(ry_, uint32_t);
13 | DECLARE_THREAD_LOCAL(rz_, uint32_t);
14 | DECLARE_THREAD_LOCAL(rc_, uint32_t);
15 | 
16 | void rnd_init (void) {
17 | INIT_THREAD_LOCAL(rx_);
18 | INIT_THREAD_LOCAL(ry_);
19 | INIT_THREAD_LOCAL(rz_);
20 | INIT_THREAD_LOCAL(rc_);
21 | }
22 | 
23 | // TODO: put a lock around this so that multiple threads initializing concurrently don't read
24 | // the same values from /dev/urandom
25 | void rnd_thread_init (void) {
26 | int fd = open("/dev/urandom", O_RDONLY);
27 | if (fd == -1) {
28 | perror("Error opening /dev/urandom");
29 | exit(1);
30 | }
31 | 
32 | char buf[16];
33 | 
34 | int n = read(fd, buf, sizeof(buf));
35 | if (n != 16) {
36 | if (n == -1) {
37 | perror("Error reading from /dev/urandom");
38 | }
39 | fprintf(stderr, "Could not read enough bytes from /dev/urandom");
40 | exit(1);
41 | }
42 | close(fd);
43 | uint32_t x, y, z, c;
44 | memcpy(&x, buf + 0, 4);
45 | memcpy(&y, buf + 4, 4);
46 | memcpy(&z, buf + 8, 4);
47 | memcpy(&c, buf + 12, 4);
48 | 
49 | SET_THREAD_LOCAL(rx_, x);
50 | SET_THREAD_LOCAL(ry_, y);
51 | SET_THREAD_LOCAL(rz_, z);
52 | SET_THREAD_LOCAL(rc_, c);
53 | }
54 | 
55 | // George Marsaglia's KISS generator
56 | //
57 | // Even though this returns 64 bits, this algorithm was only designed to generate 32 bits.
58 | // The upper 32 bits are going to be highly correlated with the lower 32 bits of the next call.
59 | uint64_t nbd_rand (void) {
60 | LOCALIZE_THREAD_LOCAL(rx_, unsigned);
61 | LOCALIZE_THREAD_LOCAL(ry_, unsigned);
62 | LOCALIZE_THREAD_LOCAL(rz_, unsigned);
63 | LOCALIZE_THREAD_LOCAL(rc_, unsigned);
64 | 
65 | uint32_t rx = 69069 * rx_ + 12345;
66 | uint32_t ry = ry_;
67 | ry ^= (ry << 13);
68 | ry ^= (ry >> 17);
69 | ry ^= (ry << 5);
70 | uint64_t t = rz_ * 698769069LL + rc_;
71 | uint64_t r = rx + ry + t;
72 | 
73 | SET_THREAD_LOCAL(rx_, rx);
74 | SET_THREAD_LOCAL(ry_, ry);
75 | SET_THREAD_LOCAL(rz_, t);
76 | SET_THREAD_LOCAL(rc_, t >> 32);
77 | 
78 | return r;
79 | }
80 | 
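A standalone restatement (added for illustration, not in the original file; kiss32_example is a hypothetical name) of the same KISS recurrence with Marsaglia's reference seeds, to make the three component generators explicit. Per the comment above, treat only the low 32 bits of each nbd_rand() result as random.

static inline uint32_t kiss32_example (void) {
    static uint32_t x = 123456789, y = 362436000, z = 521288629, c = 7654321;
    x = 69069 * x + 12345;                   // linear congruential component
    y ^= y << 13; y ^= y >> 17; y ^= y << 5; // xorshift component
    uint64_t t = z * 698769069ULL + c;       // multiply-with-carry component
    z = (uint32_t)t;
    c = (uint32_t)(t >> 32);
    return x + y + z;
}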
--------------------------------------------------------------------------------
/runtime/rcu.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Written by Josh Dybnis and released to the public domain, as explained at
3 | * http://creativecommons.org/licenses/publicdomain
4 | *
5 | * safe memory reclamation using a simple technique from rcu
6 | *
7 | * WARNING: not robust enough for real-world use
8 | */
9 | #include <string.h>
10 | #include "common.h"
11 | #include "rlocal.h"
12 | #include "lwt.h"
13 | #include "mem.h"
14 | #include "tls.h"
15 | #include "rcu.h"
16 | 
17 | #define RCU_POST_THRESHOLD 10
18 | #define RCU_QUEUE_SCALE 20
19 | 
20 | typedef struct fifo {
21 | uint32_t head;
22 | uint32_t tail;
23 | uint32_t scale;
24 | void *x[0];
25 | } fifo_t;
26 | 
27 | #define MOD_SCALE(x, b) ((x) & MASK(b))
28 | static uint64_t rcu_[MAX_NUM_THREADS][MAX_NUM_THREADS] = {};
29 | static uint64_t rcu_last_posted_[MAX_NUM_THREADS][MAX_NUM_THREADS] = {};
30 | static fifo_t *pending_[MAX_NUM_THREADS] = {};
31 | static int num_threads_ = 0;
32 | 
33 | static fifo_t *fifo_alloc(int scale) {
34 | fifo_t *q = (fifo_t *)nbd_malloc(sizeof(fifo_t) + (1ULL << scale) * sizeof(void *));
35 | memset(q, 0, sizeof(fifo_t));
36 | q->scale = scale;
37 | q->head = 0;
38 | q->tail = 0;
39 | return q;
40 | }
41 | 
42 | void rcu_thread_init (void) {
43 | int thread_index = GET_THREAD_INDEX();
44 | if (pending_[thread_index] == NULL) {
45 | pending_[thread_index] = fifo_alloc(RCU_QUEUE_SCALE);
46 | (void)SYNC_ADD(&num_threads_, 1);
47 | }
48 | }
49 | 
50 | void rcu_update (void) {
51 | int thread_index = GET_THREAD_INDEX();
52 | int next_thread_index = (thread_index + 1) % num_threads_;
53 | TRACE("r1", "rcu_update: updating thread %llu", next_thread_index, 0);
54 | int i;
55 | for (i = 0; i < num_threads_; ++i) {
56 | if (i == thread_index)
57 | continue;
58 | 
59 | // No need to post an update if the value hasn't changed
60 | if (rcu_[thread_index][i] == rcu_last_posted_[thread_index][i])
61 | continue;
62 | 
63 | uint64_t x = rcu_[thread_index][i];
64 | rcu_[next_thread_index][i] = rcu_last_posted_[thread_index][i] = x;
65 | TRACE("r2", "rcu_update: posted updated value (%llu) for thread %llu", x, i);
66 | }
67 | 
68 | // free
69 | fifo_t *q = pending_[thread_index];
70 | while (q->tail != rcu_[thread_index][thread_index]) {
71 | uint32_t i = MOD_SCALE(q->tail, q->scale);
72 | TRACE("r0", "rcu_update: freeing %p from queue at position %llu", q->x[i], q->tail);
73 | nbd_free(q->x[i]);
74 | q->tail++;
75 | }
76 | }
77 | 
78 | void rcu_defer_free (void *x) {
79 | assert(x);
80 | int thread_index = GET_THREAD_INDEX();
81 | fifo_t *q = pending_[thread_index];
82 | assert(MOD_SCALE(q->head + 1, q->scale) != MOD_SCALE(q->tail, q->scale));
83 | uint32_t i = MOD_SCALE(q->head, q->scale);
84 | q->x[i] = x;
85 | TRACE("r0", "rcu_defer_free: put %p on queue at position %llu", x, q->head);
86 | q->head++;
87 | 
88 | if (pending_[thread_index]->head - rcu_last_posted_[thread_index][thread_index] >= RCU_POST_THRESHOLD) {
89 | TRACE("r0", "rcu_defer_free: posting %llu", pending_[thread_index]->head, 0);
90 | int next_thread_index = (thread_index + 1) % num_threads_;
91 | rcu_[next_thread_index][thread_index] = pending_[thread_index]->head;
92 | rcu_last_posted_[thread_index][thread_index] = pending_[thread_index]->head;
93 | }
94 | }
95 | 
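A usage sketch (added for illustration, not in the original file) of the calling pattern the API above expects; remove_from_shared_structure() is a hypothetical lock-free operation, and the real callers are the map tests under test/. Each thread initializes once, retires nodes it has unlinked with rcu_defer_free(), and calls rcu_update() only at points where it holds no references into the shared structure, i.e. a quiescent state.

extern void *remove_from_shared_structure (void); // hypothetical operation

void reader_thread_example (void) {
    rcu_thread_init();                 // once per thread, after nbd_thread_init()
    for (;;) {
        void *node = remove_from_shared_structure();
        if (node != NULL)
            rcu_defer_free(node);      // reclaimed after all threads pass a quiescent point
        rcu_update();                  // this thread holds no shared references here
    }
}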
--------------------------------------------------------------------------------
/runtime/rlocal.h:
--------------------------------------------------------------------------------
1 | #ifndef RLOCAL_H
2 | #define RLOCAL_H
3 | 
4 | #include "runtime.h"
5 | #include "tls.h"
6 | 
7 | extern DECLARE_THREAD_LOCAL(ThreadId, int);
8 | 
9 | #define GET_THREAD_INDEX() ({ LOCALIZE_THREAD_LOCAL(ThreadId, int); assert(ThreadId != 0); ThreadId - 1; })
10 | 
11 | void mem_init (void);
12 | void rnd_init (void);
13 | 
14 | void rnd_thread_init (void);
15 | void rcu_thread_init (void);
16 | void lwt_thread_init (void);
17 | 
18 | #endif//RLOCAL_H
19 | 
--------------------------------------------------------------------------------
/runtime/runtime.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Written by Josh Dybnis and released to the public domain, as explained at
3 | * http://creativecommons.org/licenses/publicdomain
4 | */
5 | #include <stdlib.h>
6 | #include <pthread.h>
7 | #include "common.h"
8 | #include "runtime.h"
9 | #include "rlocal.h"
10 | #include "mem.h"
11 | #include "tls.h"
12 | 
13 | DECLARE_THREAD_LOCAL(ThreadId, int);
14 | 
15 | 
16 | static int MaxThreadId = 0;
17 | 
18 | __attribute__ ((constructor)) void nbd_init (void) {
19 | rnd_init();
20 | mem_init();
21 | }
22 | 
23 | void nbd_thread_init (void) {
24 | LOCALIZE_THREAD_LOCAL(ThreadId, int);
25 | 
26 | if (ThreadId == 0) {
27 | ++MaxThreadId; // TODO: reuse thread ids of threads that have been destroyed
28 | ASSERT(MaxThreadId <= MAX_NUM_THREADS);
29 | SET_THREAD_LOCAL(ThreadId, MaxThreadId);
30 | rnd_thread_init();
31 | }
32 | 
33 | lwt_thread_init();
34 | rcu_thread_init();
35 | }
36 | 
--------------------------------------------------------------------------------
/test/CuTest-license.txt:
--------------------------------------------------------------------------------
1 | NOTE
2 | 
3 | The license is based on the zlib/libpng license. For more details see
4 | http://www.opensource.org/licenses/zlib-license.html. The intent of the
5 | license is to:
6 | 
7 | - keep the license as simple as possible
8 | - encourage the use of CuTest in both free and commercial applications
9 | and libraries
10 | - keep the source code together
11 | - give credit to the CuTest contributors for their work
12 | 
13 | If you ship CuTest in source form with your source distribution, the
14 | following license document must be included with it in unaltered form.
15 | If you find CuTest useful we would like to hear about it.
16 | 
17 | LICENSE
18 | 
19 | Copyright (c) 2003 Asim Jalis
20 | 
21 | This software is provided 'as-is', without any express or implied
22 | warranty. In no event will the authors be held liable for any damages
23 | arising from the use of this software.
24 | 
25 | Permission is granted to anyone to use this software for any purpose,
26 | including commercial applications, and to alter it and redistribute it
27 | freely, subject to the following restrictions:
28 | 
29 | 1. The origin of this software must not be misrepresented; you must not
30 | claim that you wrote the original software. If you use this software in
31 | a product, an acknowledgment in the product documentation would be
32 | appreciated but is not required.
33 | 
34 | 2. Altered source versions must be plainly marked as such, and must not
35 | be misrepresented as being the original software.
36 | 
37 | 3. This notice may not be removed or altered from any source
38 | distribution.
39 | 
--------------------------------------------------------------------------------
/test/CuTest.c:
--------------------------------------------------------------------------------
1 | #include <assert.h>
2 | #include <setjmp.h>
3 | #include <stdio.h>
4 | #include <stdlib.h>
5 | #include <string.h>
6 | #include <math.h>
7 | 
8 | #include "CuTest.h"
9 | 
10 | /*-------------------------------------------------------------------------*
11 | * CuStr
12 | *-------------------------------------------------------------------------*/
13 | 
14 | char* CuStrAlloc(int size)
15 | {
16 | char* newStr = (char*) malloc( sizeof(char) * (size) );
17 | return newStr;
18 | }
19 | 
20 | char* CuStrCopy(const char* old)
21 | {
22 | int len = strlen(old);
23 | char* newStr = CuStrAlloc(len + 1);
24 | strcpy(newStr, old);
25 | return newStr;
26 | }
27 | 
28 | /*-------------------------------------------------------------------------*
29 | * CuString
30 | *-------------------------------------------------------------------------*/
31 | 
32 | void CuStringInit(CuString* str)
33 | {
34 | str->length = 0;
35 | str->size = STRING_MAX;
36 | str->buffer = (char*) malloc(sizeof(char) * str->size);
37 | str->buffer[0] = '\0';
38 | }
39 | 
40 | CuString* CuStringNew(void)
41 | {
42 | CuString* str = (CuString*) malloc(sizeof(CuString));
43 | str->length = 0;
44 | str->size = STRING_MAX;
45 | str->buffer = (char*) malloc(sizeof(char) * str->size);
46 | str->buffer[0] = '\0';
47 | return str;
48 | }
49 | 
50 | void CuStringResize(CuString* str, int newSize)
51 | {
52 | str->buffer = (char*) realloc(str->buffer, sizeof(char) * newSize);
53 | str->size = newSize;
54 | }
55 | 
56 | void CuStringAppend(CuString* str, const char* text)
57 | {
58 | int length;
59 | 
60 | if (text == NULL) {
61 | text = "NULL";
62 | }
63 | 
64 | length = strlen(text);
65 | if (str->length + length + 1 >= str->size)
66 | CuStringResize(str, str->length + length + 1 + STRING_INC);
67 | str->length += length;
68 | strcat(str->buffer, text);
69 | }
70 | 
71 | void CuStringAppendChar(CuString* str, char ch)
72 | {
73 | char text[2];
74 | text[0] = ch;
75 | text[1] = '\0';
76 | CuStringAppend(str, text);
77 | }
78 | 
79 | void CuStringAppendFormat(CuString* str, const char* format, ...)
80 | { 81 | va_list argp; 82 | char buf[HUGE_STRING_LEN]; 83 | va_start(argp, format); 84 | vsprintf(buf, format, argp); 85 | va_end(argp); 86 | CuStringAppend(str, buf); 87 | } 88 | 89 | void CuStringInsert(CuString* str, const char* text, int pos) 90 | { 91 | int length = strlen(text); 92 | if (pos > str->length) 93 | pos = str->length; 94 | if (str->length + length + 1 >= str->size) 95 | CuStringResize(str, str->length + length + 1 + STRING_INC); 96 | memmove(str->buffer + pos + length, str->buffer + pos, (str->length - pos) + 1); 97 | str->length += length; 98 | memcpy(str->buffer + pos, text, length); 99 | } 100 | 101 | /*-------------------------------------------------------------------------* 102 | * CuTest 103 | *-------------------------------------------------------------------------*/ 104 | 105 | void CuTestInit(CuTest* t, const char* name, TestFunction function) 106 | { 107 | t->name = CuStrCopy(name); 108 | t->failed = 0; 109 | t->ran = 0; 110 | t->message = NULL; 111 | t->function = function; 112 | t->jumpBuf = NULL; 113 | } 114 | 115 | CuTest* CuTestNew(const char* name, TestFunction function) 116 | { 117 | CuTest* tc = CU_ALLOC(CuTest); 118 | CuTestInit(tc, name, function); 119 | return tc; 120 | } 121 | 122 | void CuTestRun(CuTest* tc) 123 | { 124 | jmp_buf buf; 125 | tc->jumpBuf = &buf; 126 | if (setjmp(buf) == 0) 127 | { 128 | tc->ran = 1; 129 | (tc->function)(tc); 130 | } 131 | tc->jumpBuf = 0; 132 | } 133 | 134 | static void CuFailInternal(CuTest* tc, const char* file, int line, CuString* string) 135 | { 136 | char buf[HUGE_STRING_LEN]; 137 | 138 | sprintf(buf, "%s:%d: ", file, line); 139 | CuStringInsert(string, buf, 0); 140 | 141 | tc->failed = 1; 142 | tc->message = string->buffer; 143 | extern void lwt_halt(void); 144 | extern void lwt_dump(const char *); 145 | lwt_dump(tc->name); 146 | if (tc->jumpBuf != 0) longjmp(*(tc->jumpBuf), 0); 147 | } 148 | 149 | void CuFail_Line(CuTest* tc, const char* file, int line, const char* message2, const char* message) 150 | { 151 | CuString string; 152 | 153 | CuStringInit(&string); 154 | if (message2 != NULL) 155 | { 156 | CuStringAppend(&string, message2); 157 | CuStringAppend(&string, ": "); 158 | } 159 | CuStringAppend(&string, message); 160 | CuFailInternal(tc, file, line, &string); 161 | } 162 | 163 | void CuAssert_Line(CuTest* tc, const char* file, int line, const char* message, int condition) 164 | { 165 | if (condition) return; 166 | CuFail_Line(tc, file, line, NULL, message); 167 | } 168 | 169 | void CuAssertStrEquals_LineMsg(CuTest* tc, const char* file, int line, const char* message, 170 | const char* expected, const char* actual) 171 | { 172 | CuString string; 173 | if ((expected == NULL && actual == NULL) || 174 | (expected != NULL && actual != NULL && 175 | strcmp(expected, actual) == 0)) 176 | { 177 | return; 178 | } 179 | 180 | CuStringInit(&string); 181 | if (message != NULL) 182 | { 183 | CuStringAppend(&string, message); 184 | CuStringAppend(&string, ": "); 185 | } 186 | CuStringAppend(&string, "expected <"); 187 | CuStringAppend(&string, expected); 188 | CuStringAppend(&string, "> but was <"); 189 | CuStringAppend(&string, actual); 190 | CuStringAppend(&string, ">"); 191 | CuFailInternal(tc, file, line, &string); 192 | } 193 | 194 | void CuAssertIntEquals_LineMsg(CuTest* tc, const char* file, int line, const char* message, 195 | int expected, int actual) 196 | { 197 | char buf[STRING_MAX]; 198 | if (expected == actual) return; 199 | sprintf(buf, "expected <%d> but was <%d>", expected, actual); 200 | 
CuFail_Line(tc, file, line, message, buf); 201 | } 202 | 203 | void CuAssertDblEquals_LineMsg(CuTest* tc, const char* file, int line, const char* message, 204 | double expected, double actual, double delta) 205 | { 206 | char buf[STRING_MAX]; 207 | if (fabs(expected - actual) <= delta) return; 208 | sprintf(buf, "expected <%lf> but was <%lf>", expected, actual); 209 | CuFail_Line(tc, file, line, message, buf); 210 | } 211 | 212 | void CuAssertPtrEquals_LineMsg(CuTest* tc, const char* file, int line, const char* message, 213 | void* expected, void* actual) 214 | { 215 | char buf[STRING_MAX]; 216 | if (expected == actual) return; 217 | sprintf(buf, "expected pointer <0x%p> but was <0x%p>", expected, actual); 218 | CuFail_Line(tc, file, line, message, buf); 219 | } 220 | 221 | 222 | /*-------------------------------------------------------------------------* 223 | * CuSuite 224 | *-------------------------------------------------------------------------*/ 225 | 226 | void CuSuiteInit(CuSuite* testSuite) 227 | { 228 | testSuite->count = 0; 229 | testSuite->failCount = 0; 230 | } 231 | 232 | CuSuite* CuSuiteNew(void) 233 | { 234 | CuSuite* testSuite = CU_ALLOC(CuSuite); 235 | CuSuiteInit(testSuite); 236 | return testSuite; 237 | } 238 | 239 | void CuSuiteAdd(CuSuite* testSuite, CuTest *testCase) 240 | { 241 | assert(testSuite->count < MAX_TEST_CASES); 242 | testSuite->list[testSuite->count] = testCase; 243 | testSuite->count++; 244 | } 245 | 246 | void CuSuiteAddSuite(CuSuite* testSuite, CuSuite* testSuite2) 247 | { 248 | int i; 249 | for (i = 0 ; i < testSuite2->count ; ++i) 250 | { 251 | CuTest* testCase = testSuite2->list[i]; 252 | CuSuiteAdd(testSuite, testCase); 253 | } 254 | } 255 | 256 | void CuSuiteRun(CuSuite* testSuite) 257 | { 258 | int i; 259 | for (i = 0 ; i < testSuite->count ; ++i) 260 | { 261 | CuTest* testCase = testSuite->list[i]; 262 | CuTestRun(testCase); 263 | if (testCase->failed) { testSuite->failCount += 1; } 264 | } 265 | } 266 | 267 | void CuSuiteSummary(CuSuite* testSuite, CuString* summary) 268 | { 269 | int i; 270 | for (i = 0 ; i < testSuite->count ; ++i) 271 | { 272 | CuTest* testCase = testSuite->list[i]; 273 | CuStringAppend(summary, testCase->failed ? "F" : "."); 274 | } 275 | CuStringAppend(summary, "\n\n"); 276 | } 277 | 278 | void CuSuiteDetails(CuSuite* testSuite, CuString* details) 279 | { 280 | int i; 281 | int failCount = 0; 282 | 283 | if (testSuite->failCount == 0) 284 | { 285 | int passCount = testSuite->count - testSuite->failCount; 286 | const char* testWord = passCount == 1 ? 
"test" : "tests"; 287 | CuStringAppendFormat(details, "OK (%d %s)\n", passCount, testWord); 288 | } 289 | else 290 | { 291 | if (testSuite->failCount == 1) 292 | CuStringAppend(details, "There was 1 failure:\n"); 293 | else 294 | CuStringAppendFormat(details, "There were %d failures:\n", testSuite->failCount); 295 | 296 | for (i = 0 ; i < testSuite->count ; ++i) 297 | { 298 | CuTest* testCase = testSuite->list[i]; 299 | if (testCase->failed) 300 | { 301 | failCount++; 302 | CuStringAppendFormat(details, "%d) %s: %s\n", 303 | failCount, testCase->name, testCase->message); 304 | } 305 | } 306 | CuStringAppend(details, "\n!!!FAILURES!!!\n"); 307 | 308 | CuStringAppendFormat(details, "Runs: %d ", testSuite->count); 309 | CuStringAppendFormat(details, "Passes: %d ", testSuite->count - testSuite->failCount); 310 | CuStringAppendFormat(details, "Fails: %d\n", testSuite->failCount); 311 | } 312 | } 313 | -------------------------------------------------------------------------------- /test/CuTest.h: -------------------------------------------------------------------------------- 1 | #ifndef CU_TEST_H 2 | #define CU_TEST_H 3 | 4 | #include 5 | #include 6 | 7 | /* CuString */ 8 | 9 | char* CuStrAlloc(int size); 10 | char* CuStrCopy(const char* old); 11 | 12 | #define CU_ALLOC(TYPE) ((TYPE*) malloc(sizeof(TYPE))) 13 | 14 | #define HUGE_STRING_LEN 8192 15 | #define STRING_MAX 256 16 | #define STRING_INC 256 17 | 18 | typedef struct 19 | { 20 | int length; 21 | int size; 22 | char* buffer; 23 | } CuString; 24 | 25 | void CuStringInit(CuString* str); 26 | CuString* CuStringNew(void); 27 | void CuStringRead(CuString* str, const char* path); 28 | void CuStringAppend(CuString* str, const char* text); 29 | void CuStringAppendChar(CuString* str, char ch); 30 | void CuStringAppendFormat(CuString* str, const char* format, ...); 31 | void CuStringInsert(CuString* str, const char* text, int pos); 32 | void CuStringResize(CuString* str, int newSize); 33 | 34 | /* CuTest */ 35 | 36 | typedef struct CuTest CuTest; 37 | 38 | typedef void (*TestFunction)(CuTest *); 39 | 40 | struct CuTest 41 | { 42 | const char* name; 43 | TestFunction function; 44 | int failed; 45 | int ran; 46 | const char* message; 47 | jmp_buf *jumpBuf; 48 | }; 49 | 50 | void CuTestInit(CuTest* t, const char* name, TestFunction function); 51 | CuTest* CuTestNew(const char* name, TestFunction function); 52 | void CuTestRun(CuTest* tc); 53 | 54 | /* Internal versions of assert functions -- use the public versions */ 55 | void CuFail_Line(CuTest* tc, 56 | const char* file, int line, const char* message2, const char* message); 57 | void CuAssert_Line(CuTest* tc, 58 | const char* file, int line, const char* message, int condition); 59 | void CuAssertStrEquals_LineMsg(CuTest* tc, 60 | const char* file, int line, const char* message, const char* expected, const char* actual); 61 | void CuAssertIntEquals_LineMsg(CuTest* tc, 62 | const char* file, int line, const char* message, int expected, int actual); 63 | void CuAssertDblEquals_LineMsg(CuTest* tc, 64 | const char* file, int line, const char* message, double expected, double actual, double delta); 65 | void CuAssertPtrEquals_LineMsg(CuTest* tc, 66 | const char* file, int line, const char* message, void* expected, void* actual); 67 | 68 | /* public assert functions */ 69 | 70 | #define CuFail(tc, ms) CuFail_Line( (tc), __FILE__, __LINE__, NULL, (ms)) 71 | #define CuAssert(tc, ms, cond) CuAssert_Line((tc), __FILE__, __LINE__, (ms), (cond)) 72 | #define CuAssertTrue(tc, cond) CuAssert_Line((tc), __FILE__, 
__LINE__, "assert failed", (cond))
73 | 
74 | #define CuAssertStrEquals(tc,ex,ac) CuAssertStrEquals_LineMsg((tc),__FILE__,__LINE__,NULL,(ex),(ac))
75 | #define CuAssertStrEquals_Msg(tc,ms,ex,ac) CuAssertStrEquals_LineMsg((tc),__FILE__,__LINE__,(ms),(ex),(ac))
76 | #define CuAssertIntEquals(tc,ex,ac) CuAssertIntEquals_LineMsg((tc),__FILE__,__LINE__,NULL,(ex),(ac))
77 | #define CuAssertIntEquals_Msg(tc,ms,ex,ac) CuAssertIntEquals_LineMsg((tc),__FILE__,__LINE__,(ms),(ex),(ac))
78 | #define CuAssertDblEquals(tc,ex,ac,dl) CuAssertDblEquals_LineMsg((tc),__FILE__,__LINE__,NULL,(ex),(ac),(dl))
79 | #define CuAssertDblEquals_Msg(tc,ms,ex,ac,dl) CuAssertDblEquals_LineMsg((tc),__FILE__,__LINE__,(ms),(ex),(ac),(dl))
80 | #define CuAssertPtrEquals(tc,ex,ac) CuAssertPtrEquals_LineMsg((tc),__FILE__,__LINE__,NULL,(ex),(ac))
81 | #define CuAssertPtrEquals_Msg(tc,ms,ex,ac) CuAssertPtrEquals_LineMsg((tc),__FILE__,__LINE__,(ms),(ex),(ac))
82 | 
83 | #define CuAssertPtrNotNull(tc,p) CuAssert_Line((tc),__FILE__,__LINE__,"null pointer unexpected",(p != NULL))
84 | #define CuAssertPtrNotNullMsg(tc,msg,p) CuAssert_Line((tc),__FILE__,__LINE__,(msg),(p != NULL))
85 | 
86 | /* CuSuite */
87 | 
88 | #define MAX_TEST_CASES 1024
89 | 
90 | #define SUITE_ADD_TEST(SUITE,TEST) CuSuiteAdd(SUITE, CuTestNew(#TEST, TEST))
91 | 
92 | typedef struct
93 | {
94 | int count;
95 | CuTest* list[MAX_TEST_CASES];
96 | int failCount;
97 | 
98 | } CuSuite;
99 | 
100 | 
101 | void CuSuiteInit(CuSuite* testSuite);
102 | CuSuite* CuSuiteNew(void);
103 | void CuSuiteAdd(CuSuite* testSuite, CuTest *testCase);
104 | void CuSuiteAddSuite(CuSuite* testSuite, CuSuite* testSuite2);
105 | void CuSuiteRun(CuSuite* testSuite);
106 | void CuSuiteSummary(CuSuite* testSuite, CuString* summary);
107 | void CuSuiteDetails(CuSuite* testSuite, CuString* details);
108 | 
109 | #endif /* CU_TEST_H */
110 | 
--------------------------------------------------------------------------------
/test/haz_test.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Written by Josh Dybnis and released to the public domain, as explained at
3 | * http://creativecommons.org/licenses/publicdomain
4 | *
5 | * hazard pointer test
6 | *
7 | */
8 | #include <stdio.h>
9 | #include <stdlib.h>
10 | #include <errno.h>
11 | #include <pthread.h>
12 | #include <string.h>
13 | #include <sys/time.h>
14 | #include "common.h"
15 | #include "mem.h"
16 | #include "runtime.h"
17 | #include "hazard.h"
18 | 
19 | #define NUM_ITERATIONS 10000000
20 | 
21 | typedef struct node {
22 | struct node *next;
23 | } node_t;
24 | 
25 | typedef struct lifo {
26 | node_t *head;
27 | } lifo_t;
28 | 
29 | static volatile int wait_;
30 | static lifo_t *stk_;
31 | 
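An illustrative helper (added, not part of the original test; protected_read_example is a hypothetical name) isolating the hazard-pointer read protocol the pop path below follows: publish the pointer, then re-read the shared location, and only trust the pointer once the two agree, so a concurrent haz_defer_free() cannot reclaim it out from under this thread.

static node_t *protected_read_example (haz_t *hp) {
    for (;;) {
        node_t *head = stk_->head;
        if (head == NULL)
            return NULL;
        haz_set(hp, head);                     // publish the hazard before validating
        if (head == VOLATILE_DEREF(stk_).head)
            return head;                       // still the head: safe to dereference
    }
}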
32 | void *worker (void *arg) {
33 | int id = (int)(size_t)arg;
34 | unsigned int r = (unsigned int)(id + 1) * 0x5bd1e995; // seed pseudo-random number generator
35 | haz_t *hp0 = haz_get_static(0);
36 | 
37 | // Wait for all the worker threads to be ready.
38 | (void)SYNC_ADD(&wait_, -1);
39 | do {} while (wait_);
40 | 
41 | int i;
42 | for (i = 0; i < NUM_ITERATIONS; ++ i) {
43 | r ^= r << 6; r ^= r >> 21; r ^= r << 7; // generate next pseudo-random number
44 | if (r & 0x1000) {
45 | // push
46 | node_t *new_head = (node_t *)nbd_malloc(sizeof(node_t));
47 | node_t *old_head = stk_->head;
48 | node_t *temp;
49 | do {
50 | temp = old_head;
51 | new_head->next = temp;
52 | } while ((old_head = SYNC_CAS(&stk_->head, temp, new_head)) != temp);
53 | } else {
54 | // pop
55 | node_t *temp;
56 | node_t *head = stk_->head;
57 | do {
58 | temp = head;
59 | if (temp == NULL)
60 | break;
61 | haz_set(hp0, temp);
62 | head = VOLATILE_DEREF(stk_).head;
63 | if (temp != head)
64 | continue;
65 | } while ((head = SYNC_CAS(&stk_->head, temp, temp->next)) != temp);
66 | 
67 | if (temp != NULL) {
68 | haz_defer_free(temp, nbd_free);
69 | }
70 | }
71 | }
72 | 
73 | return NULL;
74 | }
75 | 
76 | int main (int argc, char **argv) {
77 | //lwt_set_trace_level("m0r0");
78 | 
79 | int num_threads = MAX_NUM_THREADS;
80 | if (argc == 2)
81 | {
82 | errno = 0;
83 | num_threads = strtol(argv[1], NULL, 10);
84 | if (errno) {
85 | fprintf(stderr, "%s: Invalid argument for number of threads\n", argv[0]);
86 | return -1;
87 | }
88 | if (num_threads <= 0) {
89 | fprintf(stderr, "%s: Number of threads must be at least 1\n", argv[0]);
90 | return -1;
91 | }
92 | }
93 | 
94 | stk_ = (lifo_t *)nbd_malloc(sizeof(lifo_t));
95 | memset(stk_, 0, sizeof(lifo_t));
96 | 
97 | struct timeval tv1, tv2;
98 | gettimeofday(&tv1, NULL);
99 | wait_ = num_threads;
100 | 
101 | pthread_t thread[num_threads];
102 | for (int i = 0; i < num_threads; ++i) {
103 | int rc = nbd_thread_create(thread + i, i, worker, (void *)(size_t)i);
104 | if (rc != 0) { perror("pthread_create"); return rc; }
105 | }
106 | for (int i = 0; i < num_threads; ++i) {
107 | pthread_join(thread[i], NULL);
108 | }
109 | 
110 | gettimeofday(&tv2, NULL);
111 | int ms = (int)(1000000*(tv2.tv_sec - tv1.tv_sec) + tv2.tv_usec - tv1.tv_usec) / 1000;
112 | printf("Th:%d Time:%dms\n\n", num_threads, ms);
113 | fflush(stdout);
114 | 
115 | return 0;
116 | }
117 | 
--------------------------------------------------------------------------------
/test/map_test1.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | 
6 | #include "common.h"
7 | #include "nstring.h"
8 | #include "runtime.h"
9 | #include "map.h"
10 | #include "rcu.h"
11 | #include "list.h"
12 | #include "skiplist.h"
13 | #include "hashtable.h"
14 | 
15 | #define NUM_ITERATIONS 10000000
16 | 
17 | //#define TEST_STRING_KEYS
18 | 
19 | static volatile int wait_;
20 | static long num_threads_;
21 | static map_t *map_;
22 | 
23 | void *worker (void *arg) {
24 | nbd_thread_init();
25 | 
26 | // Wait for all the worker threads to be ready.
27 | (void)SYNC_ADD(&wait_, -1); 28 | do {} while (wait_); 29 | 30 | #ifdef TEST_STRING_KEYS 31 | nstring_t *key_str = ns_alloc(10); 32 | #endif 33 | 34 | for (int i = 0; i < NUM_ITERATIONS/num_threads_; ++i) { 35 | unsigned r = nbd_rand(); 36 | int key = r & 0xF; 37 | #ifdef TEST_STRING_KEYS 38 | key_str->len = sprintf(key_str->data, "%X", key) + 1; 39 | assert(key_str->len <= 10); 40 | if (r & (1 << 8)) { 41 | map_set(map_, (map_key_t)key_str, 1); 42 | } else { 43 | map_remove(map_, (map_key_t)key_str); 44 | } 45 | #else 46 | if (r & (1 << 8)) { 47 | map_set(map_, (map_key_t)(key + 1), 1); 48 | } else { 49 | map_remove(map_, (map_key_t)(key + 1)); 50 | } 51 | #endif 52 | 53 | rcu_update(); 54 | } 55 | 56 | return NULL; 57 | } 58 | 59 | int main (int argc, char **argv) { 60 | nbd_thread_init(); 61 | lwt_set_trace_level("r0m3s3"); 62 | 63 | char* program_name = argv[0]; 64 | pthread_t thread[MAX_NUM_THREADS]; 65 | 66 | if (argc > 2) { 67 | fprintf(stderr, "Usage: %s num_threads\n", program_name); 68 | return -1; 69 | } 70 | 71 | num_threads_ = MAX_NUM_THREADS; 72 | if (argc == 2) 73 | { 74 | errno = 0; 75 | num_threads_ = strtol(argv[1], NULL, 10); 76 | if (errno) { 77 | fprintf(stderr, "%s: Invalid argument for number of threads\n", program_name); 78 | return -1; 79 | } 80 | if (num_threads_ <= 0) { 81 | fprintf(stderr, "%s: Number of threads must be at least 1\n", program_name); 82 | return -1; 83 | } 84 | if (num_threads_ > MAX_NUM_THREADS) { 85 | fprintf(stderr, "%s: Number of threads cannot be more than %d\n", program_name, MAX_NUM_THREADS); 86 | return -1; 87 | } 88 | } 89 | 90 | static const map_impl_t *map_types[] = { &MAP_IMPL_LL, &MAP_IMPL_SL, &MAP_IMPL_HT }; 91 | for (int i = 0; i < sizeof(map_types)/sizeof(*map_types); ++i) { 92 | #ifdef TEST_STRING_KEYS 93 | map_ = map_alloc(map_types[i], &DATATYPE_NSTRING); 94 | #else 95 | map_ = map_alloc(map_types[i], NULL); 96 | #endif 97 | 98 | struct timeval tv1, tv2; 99 | gettimeofday(&tv1, NULL); 100 | 101 | wait_ = num_threads_; 102 | 103 | for (int i = 0; i < num_threads_; ++i) { 104 | int rc = pthread_create(thread + i, NULL, worker, (void*)(size_t)i); 105 | if (rc != 0) { perror("pthread_create"); return rc; } 106 | } 107 | 108 | for (int i = 0; i < num_threads_; ++i) { 109 | pthread_join(thread[i], NULL); 110 | } 111 | 112 | gettimeofday(&tv2, NULL); 113 | int ms = (int)(1000000*(tv2.tv_sec - tv1.tv_sec) + tv2.tv_usec - tv1.tv_usec) / 1000; 114 | map_print(map_, FALSE); 115 | printf("Th:%ld Time:%dms\n\n", num_threads_, ms); 116 | fflush(stdout); 117 | } 118 | 119 | return 0; 120 | } 121 | -------------------------------------------------------------------------------- /test/map_test2.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Josh Dybnis and released to the public domain, as explained at 3 | * http://creativecommons.org/licenses/publicdomain 4 | * 5 | * tests ported from high-scale-lib 6 | * http://sourceforge.net/projects/high-scale-lib 7 | */ 8 | #include 9 | #include 10 | #include 11 | 12 | #include "CuTest.h" 13 | 14 | #include "common.h" 15 | #include "runtime.h" 16 | #include "nstring.h" 17 | #include "map.h" 18 | #include "list.h" 19 | #include "skiplist.h" 20 | #include "hashtable.h" 21 | #include "lwt.h" 22 | #include "mem.h" 23 | #include "rcu.h" 24 | 25 | #define ASSERT_EQUAL(x, y) CuAssertIntEquals(tc, x, y) 26 | 27 | //#define TEST_STRING_KEYS 28 | 29 | typedef struct worker_data { 30 | int id; 31 | CuTest *tc; 32 | map_t *map; 33 | volatile int *wait; 
34 | } worker_data_t; 35 | 36 | static const map_impl_t *map_type_; 37 | 38 | static size_t iterator_size (map_t *map) { 39 | map_iter_t *iter = map_iter_begin(map, 0); 40 | size_t count = 0; 41 | while (map_iter_next(iter, NULL) != DOES_NOT_EXIST) { 42 | count++; 43 | } 44 | map_iter_free(iter); 45 | return count; 46 | } 47 | 48 | // Test some basic stuff; add a few keys, remove a few keys 49 | void basic_test (CuTest* tc) { 50 | 51 | #ifdef TEST_STRING_KEYS 52 | map_t *map = map_alloc(map_type_, &DATATYPE_NSTRING); 53 | nstring_t *s1 = ns_alloc(3); strcpy(s1->data, "k1"); 54 | nstring_t *s2 = ns_alloc(3); strcpy(s2->data, "k2"); 55 | nstring_t *s3 = ns_alloc(3); strcpy(s3->data, "k3"); 56 | nstring_t *s4 = ns_alloc(3); strcpy(s4->data, "k4"); 57 | map_key_t k1 = (map_key_t)s1; 58 | map_key_t k2 = (map_key_t)s2; 59 | map_key_t k3 = (map_key_t)s3; 60 | map_key_t k4 = (map_key_t)s4; 61 | #else 62 | map_t *map = map_alloc(map_type_, NULL); 63 | map_key_t k1 = (map_key_t)1; 64 | map_key_t k2 = (map_key_t)2; 65 | map_key_t k3 = (map_key_t)3; 66 | map_key_t k4 = (map_key_t)4; 67 | #endif 68 | 69 | ASSERT_EQUAL( 0, map_count (map) ); 70 | ASSERT_EQUAL( DOES_NOT_EXIST, map_add (map, k1,10) ); 71 | ASSERT_EQUAL( 1, map_count (map) ); 72 | ASSERT_EQUAL( 1, iterator_size(map) ); 73 | ASSERT_EQUAL( DOES_NOT_EXIST, map_add (map, k2,20) ); 74 | ASSERT_EQUAL( 2, map_count (map) ); 75 | ASSERT_EQUAL( 2, iterator_size(map) ); 76 | ASSERT_EQUAL( 20, map_get (map, k2) ); 77 | ASSERT_EQUAL( 10, map_set (map, k1,11) ); 78 | ASSERT_EQUAL( 20, map_set (map, k2,21) ); 79 | ASSERT_EQUAL( 2, map_count (map) ); 80 | ASSERT_EQUAL( 2, iterator_size(map) ); 81 | ASSERT_EQUAL( 21, map_add (map, k2,22) ); 82 | ASSERT_EQUAL( 11, map_remove (map, k1) ); 83 | ASSERT_EQUAL( DOES_NOT_EXIST, map_get (map, k1) ); 84 | ASSERT_EQUAL( 1, map_count (map) ); 85 | ASSERT_EQUAL( 1, iterator_size(map) ); 86 | ASSERT_EQUAL( DOES_NOT_EXIST, map_remove (map, k1) ); 87 | ASSERT_EQUAL( 21, map_remove (map, k2) ); 88 | ASSERT_EQUAL( 0, map_count (map) ); 89 | ASSERT_EQUAL( 0, iterator_size(map) ); 90 | ASSERT_EQUAL( DOES_NOT_EXIST, map_remove (map, k2) ); 91 | ASSERT_EQUAL( DOES_NOT_EXIST, map_remove (map, k3) ); 92 | ASSERT_EQUAL( 0, map_count (map) ); 93 | ASSERT_EQUAL( 0, iterator_size(map) ); 94 | 95 | ASSERT_EQUAL( DOES_NOT_EXIST, map_add (map, k4,40) ); 96 | ASSERT_EQUAL( 40, map_get (map, k4) ); 97 | ASSERT_EQUAL( 1, map_count (map) ); 98 | ASSERT_EQUAL( 1, iterator_size(map) ); 99 | ASSERT_EQUAL( 40, map_remove (map, k4) ); 100 | ASSERT_EQUAL( DOES_NOT_EXIST, map_get (map, k4) ); 101 | ASSERT_EQUAL( 0, map_count (map) ); 102 | ASSERT_EQUAL( 0, iterator_size(map) ); 103 | 104 | ASSERT_EQUAL( DOES_NOT_EXIST, map_replace(map, k4,10) ); 105 | ASSERT_EQUAL( DOES_NOT_EXIST, map_get (map, k4) ); 106 | ASSERT_EQUAL( DOES_NOT_EXIST, map_set (map, k4,40) ); 107 | ASSERT_EQUAL( 40, map_replace(map, k4,41) ); 108 | ASSERT_EQUAL( 41, map_get (map, k4) ); 109 | ASSERT_EQUAL( 41, map_remove (map, k4) ); 110 | ASSERT_EQUAL( DOES_NOT_EXIST, map_get (map, k4) ); 111 | ASSERT_EQUAL( 0, map_count (map) ); 112 | ASSERT_EQUAL( 0, iterator_size(map) ); 113 | 114 | ASSERT_EQUAL( DOES_NOT_EXIST, map_replace(map, k2,20) ); 115 | ASSERT_EQUAL( DOES_NOT_EXIST, map_get (map, k2) ); 116 | 117 | // In the end, all entries should be removed 118 | ASSERT_EQUAL( DOES_NOT_EXIST, map_set (map, k2,20) ); 119 | ASSERT_EQUAL( 20, map_replace(map, k2,21) ); 120 | ASSERT_EQUAL( 21, map_get (map, k2) ); 121 | ASSERT_EQUAL( 21, map_remove (map, k2) ); 122 | ASSERT_EQUAL( 
    ASSERT_EQUAL( 0, map_count (map) );
    ASSERT_EQUAL( 0, iterator_size(map) );

    map_free(map);

    rcu_update(); // In a quiescent state.
#ifdef TEST_STRING_KEYS
    nbd_free(s1); nbd_free(s2); nbd_free(s3); nbd_free(s4);
#endif
}

void *add_remove_worker (void *arg) {
    nbd_thread_init();

    worker_data_t *wd = (worker_data_t *)arg;
    map_t *map = wd->map;
    CuTest* tc = wd->tc;
    int d = wd->id;
    int iters = (map_type_ == &MAP_IMPL_LL ? 10000 : 100000);

    (void)SYNC_ADD(wd->wait, -1);
    do { } while (*wd->wait); // wait for all workers to be ready

    map_key_t key;
#ifdef TEST_STRING_KEYS
    nstring_t *s = ns_alloc(9);
    key = (map_key_t)s;
#endif

    for (int j = 0; j < 10; ++j) {
        for (int i = d+1; i < iters; i+=2) {
#ifdef TEST_STRING_KEYS
            s->len = 1 + snprintf(s->data, 9, "%u", i);
#else
            key = (map_key_t)i;
#endif
            TRACE("t0", "test map_add() iteration (%llu, %llu)", j, i);
            ASSERT_EQUAL(DOES_NOT_EXIST, map_add(map, key, d+1) );
            rcu_update(); // In a quiescent state.
        }
        for (int i = d+1; i < iters; i+=2) {
#ifdef TEST_STRING_KEYS
            s->len = 1 + snprintf(s->data, 9, "%u", i);
#else
            key = (map_key_t)i;
#endif
            TRACE("t0", "test map_remove() iteration (%llu, %llu)", j, i);
            ASSERT_EQUAL(d+1, map_remove(map, key) );
            rcu_update(); // In a quiescent state.
        }
    }
#ifdef TEST_STRING_KEYS
    nbd_free(s);
#endif
    return NULL;
}

// Do some simple concurrent testing
void concurrent_add_remove_test (CuTest* tc) {

    pthread_t thread[2];
    worker_data_t wd[2];
    volatile int wait = 2;
#ifdef TEST_STRING_KEYS
    map_t *map = map_alloc(map_type_, &DATATYPE_NSTRING);
#else
    map_t *map = map_alloc(map_type_, NULL);
#endif

    struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);

    // In 2 threads, add & remove even & odd elements concurrently
    int i;
    for (i = 0; i < 2; ++i) {
        wd[i].id = i;
        wd[i].tc = tc;
        wd[i].map = map;
        wd[i].wait = &wait;
        int rc = pthread_create(thread + i, NULL, add_remove_worker, wd + i);
        if (rc != 0) { perror("pthread_create"); return; }
    }

    for (i = 0; i < 2; ++i) {
        pthread_join(thread[i], NULL);
    }

    gettimeofday(&tv2, NULL);
    int ms = (int)(1000000*(tv2.tv_sec - tv1.tv_sec) + tv2.tv_usec - tv1.tv_usec) / 1000;
    map_print(map, FALSE);
    printf("Time:%dms\n", ms);
    fflush(stdout);

    // In the end, all members should be removed
    ASSERT_EQUAL( 0, map_count(map) );
    ASSERT_EQUAL( 0, iterator_size(map) );

    // In a quiescent state; it is safe to free.
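    // Both workers have been joined, so no other thread can still hold a
    // reference into the map, and the rcu_update() calls made inside the
    // workers let the runtime retire nodes handed to rcu_defer_free().
    // Freeing the map at this point is therefore race-free.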
    map_free(map);
}

void basic_iteration_test (CuTest* tc) {
#ifdef TEST_STRING_KEYS
    map_t *map = map_alloc(map_type_, &DATATYPE_NSTRING);
    nstring_t *s1 = ns_alloc(3); strcpy(s1->data, "k1");
    nstring_t *s2 = ns_alloc(3); strcpy(s2->data, "k2");
    map_key_t k1 = (map_key_t)s1;
    map_key_t k2 = (map_key_t)s2;
    nstring_t *x_k;
    nstring_t *y_k;
#else
    map_t *map = map_alloc(map_type_, NULL);
    map_key_t k1 = (map_key_t)1;
    map_key_t k2 = (map_key_t)2;
    map_key_t x_k;
    map_key_t y_k;
#endif

    ASSERT_EQUAL( DOES_NOT_EXIST, map_add (map, k1,1) );
    ASSERT_EQUAL( DOES_NOT_EXIST, map_add (map, k2,2) );

    map_val_t x_v, y_v;
    map_iter_t *iter = map_iter_begin(map, 0);
    x_v = map_iter_next(iter, (map_key_t *)&x_k);
    y_v = map_iter_next(iter, (map_key_t *)&y_k);
    ASSERT_EQUAL( DOES_NOT_EXIST, map_iter_next(iter, NULL) );
    map_iter_free(iter);
#ifdef TEST_STRING_KEYS
    ASSERT_EQUAL( TRUE, (ns_cmp(x_k, s1) == 0 && x_v == 1) || (ns_cmp(y_k, s1) == 0 && y_v == 1) );
    ASSERT_EQUAL( TRUE, (ns_cmp(x_k, s2) == 0 && x_v == 2) || (ns_cmp(y_k, s2) == 0 && y_v == 2) );
    nbd_free(s1);
    nbd_free(s2);
#else
    ASSERT_EQUAL( TRUE, (x_k == k1 && x_v == 1) || (y_k == k1 && y_v == 1) );
    ASSERT_EQUAL( TRUE, (x_k == k2 && x_v == 2) || (y_k == k2 && y_v == 2) );
#endif

    map_free(map);
}

void big_iteration_test (CuTest* tc) {
    static const int n = 10000;

#ifdef TEST_STRING_KEYS
    map_t *map = map_alloc(map_type_, &DATATYPE_NSTRING);
    nstring_t *s = ns_alloc(9);
    nstring_t *s3 = ns_alloc(3); strcpy(s3->data, "k3");
    nstring_t *s4 = ns_alloc(3); strcpy(s4->data, "k4");
    map_key_t k3 = (map_key_t)s3;
    map_key_t k4 = (map_key_t)s4;
    map_key_t key = (map_key_t)s;
#else
    map_t *map = map_alloc(map_type_, NULL);
    map_key_t k3 = (map_key_t)3;
    map_key_t k4 = (map_key_t)4;
    map_key_t key;
#endif

    for (int i = 1; i <= n; ++i) {
#ifdef TEST_STRING_KEYS
        s->len = 1 + snprintf(s->data, 9, "k%d", i);
#else
        key = (map_key_t)i;
#endif
        ASSERT_EQUAL( DOES_NOT_EXIST, map_get(map, key) );
        ASSERT_EQUAL( DOES_NOT_EXIST, map_set(map, key, i) );
        ASSERT_EQUAL( i, map_get(map, key) );
        rcu_update(); // In a quiescent state.
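        // Announcing a quiescent state once per insertion (the rcu_update()
        // above) lets the RCU runtime process pending deferred frees as the
        // loop runs, instead of letting the deferral backlog grow across all
        // n iterations.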
    }

    ASSERT_EQUAL( n, map_count(map) );
    ASSERT_EQUAL( n, iterator_size(map) );

    uint64_t sum = 0;
    map_val_t val;
    map_iter_t *iter = map_iter_begin(map, 0);
    while ((val = map_iter_next(iter, NULL)) != DOES_NOT_EXIST) {
        sum += val;
    }
    map_iter_free(iter);
    ASSERT_EQUAL(n*(n+1)/2, sum);
    ASSERT_EQUAL(3, map_remove(map, k3));
    ASSERT_EQUAL(4, map_remove(map, k4));
    sum = 0;
    iter = map_iter_begin(map, 0);
    while ((val = map_iter_next(iter, NULL)) != DOES_NOT_EXIST) {
        sum += val;
    }
    map_iter_free(iter);
    ASSERT_EQUAL(n*(n+1)/2 - (3+4), sum);

#ifdef TEST_STRING_KEYS
    nbd_free(s); nbd_free(s3); nbd_free(s4);
#endif
    map_free(map);
}

int main (void) {
    nbd_thread_init();
    lwt_set_trace_level("r0m3l2t0");

    static const map_impl_t *map_types[] = { &MAP_IMPL_LL, &MAP_IMPL_SL, &MAP_IMPL_HT };
    for (int i = 0; i < sizeof(map_types)/sizeof(*map_types); ++i) {
        map_type_ = map_types[i];

        // Create and run test suite
        CuString *output = CuStringNew();
        CuSuite* suite = CuSuiteNew();

        SUITE_ADD_TEST(suite, concurrent_add_remove_test);
//      SUITE_ADD_TEST(suite, basic_test);
//      SUITE_ADD_TEST(suite, basic_iteration_test);
//      SUITE_ADD_TEST(suite, big_iteration_test);

        CuSuiteRun(suite);
        CuSuiteDetails(suite, output);
        printf("%s\n", output->buffer);
    }

    return 0;
}
--------------------------------------------------------------------------------
/test/perf_test.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <pthread.h>
#include <sys/time.h>
#include <unistd.h>

#include "common.h"
#include "nstring.h"
#include "runtime.h"
#include "map.h"
#include "rcu.h"
#include "mem.h"
#include "list.h"
#include "skiplist.h"
#include "hashtable.h"

//#define TEST_STRING_KEYS

static int num_threads_;
static volatile int start_, stop_, load_;
static map_t *map_;
static int get_range_, put_range_;
static size_t num_keys_;
static double load_time_;
static int duration_;

#define OP_SELECT_RANGE (1ULL << 20)

void *worker (void *arg) {
    nbd_thread_init();

    // Wait for all the worker threads to be ready.
    (void)SYNC_ADD(&load_, -1);
    do {} while (load_);

    // Pre-load map
    int n = num_keys_ / 2 / num_threads_;
    for (int i = 0; i < n; ++i) {
        map_key_t key = (nbd_rand() & (num_keys_ - 1)) + 1;
        map_set(map_, key, key);
    }

    // Wait for all the worker threads to be done loading.
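    // Countdown-barrier pattern used throughout these tests: run_test() sets
    // start_ to num_threads_ + 1, each worker decrements it once and spins,
    // and the main thread spins until the count reaches 1 (everyone has
    // checked in) before releasing all workers by writing 0:
    //
    //     worker:      SYNC_ADD(&start_, -1);  do {} while (start_);
    //     main thread: do {} while (start_ != 1);  start_ = 0;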
    (void)SYNC_ADD(&start_, -1);
    do {} while (start_);

    uint64_t ops = 0;
    while (!stop_) {
        ++ops;
        map_key_t key = (nbd_rand() & (num_keys_ - 1)) + 1;
        map_key_t x = nbd_rand() & (OP_SELECT_RANGE - 1);
        if (x < get_range_) {
#ifndef NDEBUG
            map_val_t val =
#endif
            map_get(map_, key);
#ifdef TEST_STRING_KEYS
            ASSERT(val == DOES_NOT_EXIST || ns_cmp((nstring_t *)key, (nstring_t *)val) == 0);
#else
            ASSERT(val == DOES_NOT_EXIST || key == val);
#endif
        } else if (x < put_range_) {
            map_add(map_, key, key);
        } else {
            map_remove(map_, key);
        }
        rcu_update();
    }

    return (void *)ops;
}

uint64_t run_test (void) {
    load_ = num_threads_ + 1;
    start_ = num_threads_ + 1;

    stop_ = 0;

    pthread_t thread[MAX_NUM_THREADS];
    for (int i = 0; i < num_threads_; ++i) {
        int rc = pthread_create(thread + i, NULL, worker, (void*)(size_t)i);
        if (rc != 0) { perror("pthread_create"); exit(rc); }
    }

    do { /* nothing */ } while (load_ != 1);
    load_ = 0;

    struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);

    do { /* nothing */ } while (start_ != 1);

    gettimeofday(&tv2, NULL);
    load_time_ = (double)(1000000*(tv2.tv_sec - tv1.tv_sec) + tv2.tv_usec - tv1.tv_usec) / 1000000;

    start_ = 0;
    sleep(duration_);
    stop_ = 1;

    uint64_t ops = 0;
    for (int i = 0; i < num_threads_; ++i) {
        void *count;
        pthread_join(thread[i], &count);
        ops += (size_t)count;
    }
    return ops;
}

int main (int argc, char **argv) {
    char* program_name = argv[0];

    if (argc > 3) {
        fprintf(stderr, "Usage: %s [num_threads] [table_scale]\n", program_name);
        return -1;
    }

    num_threads_ = 2;
    if (num_threads_ > MAX_NUM_THREADS) { num_threads_ = MAX_NUM_THREADS; }
    if (argc > 1)
    {
        errno = 0;
        num_threads_ = strtol(argv[1], NULL, 10);
        if (errno) {
            fprintf(stderr, "%s: Invalid argument for number of threads\n", program_name);
            return -1;
        }
        if (num_threads_ <= 0) {
            fprintf(stderr, "%s: Number of threads must be at least 1\n", program_name);
            return -1;
        }
    }
    if (num_threads_ > MAX_NUM_THREADS) {
        fprintf(stderr, "%s: Number of threads cannot be more than %d\n", program_name, MAX_NUM_THREADS);
        return -1;
    }

    int table_scale = 12;
    if (argc > 2) {
        errno = 0;
        table_scale = strtol(argv[2], NULL, 10);
        if (errno) {
            fprintf(stderr, "%s: Invalid argument for the scale of the collection\n", program_name);
            return -1;
        }
        if (table_scale < 0 || table_scale > 36) {
            fprintf(stderr, "%s: The scale of the collection must be between 0 and 36\n", program_name);
            return -1;
        }
    }

    int read_ratio = 90;
    int put_ratio = 50;
    get_range_ = (int)((double)OP_SELECT_RANGE / 100 * read_ratio);
    put_range_ = get_range_ + (int)(((double)OP_SELECT_RANGE - get_range_) / 100 * put_ratio);

    nbd_thread_init();
    static const map_impl_t *map_types[] = { &MAP_IMPL_HT };
    for (int i = 0; i < sizeof(map_types)/sizeof(*map_types); ++i) {
#ifdef TEST_STRING_KEYS
        map_ = map_alloc(map_types[i], &DATATYPE_NSTRING);
#else
        map_ = map_alloc(map_types[i], NULL);
#endif

        num_keys_ = 1ULL << table_scale;
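        // num_keys_ must remain a power of two: workers pick keys with
        // (nbd_rand() & (num_keys_ - 1)) + 1, and that mask only yields a
        // uniform range when num_keys_ - 1 is an all-ones bit pattern.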
        duration_ = 1 + table_scale/4;
        double mops_per_sec = (double)run_test() / 1000000.0 / duration_;

        printf("Threads:%-2d Size:2^%-2d load time:%-4.2f Mops/s:%-4.2f per-thread:%-4.2f ",
               num_threads_, table_scale, load_time_, mops_per_sec, mops_per_sec/num_threads_);
        map_print(map_, FALSE);
        fflush(stdout);

        map_free(map_);
    }

    return 0;
}
--------------------------------------------------------------------------------
/test/rcu_test.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
#include <sys/time.h>
#include <unistd.h>
#include "common.h"
#include "runtime.h"
#include "mem.h"
#include "rcu.h"

#define NUM_ITERATIONS 10000000

typedef struct node {
    struct node *next;
} node_t;

typedef struct lifo {
    node_t *head;
} lifo_t;

static volatile int wait_;
static lifo_t *stk_;

static lifo_t *lifo_alloc (void) {
    lifo_t *stk = (lifo_t *)nbd_malloc(sizeof(lifo_t));
    memset(stk, 0, sizeof(lifo_t));
    return stk;
}

static void lifo_aba_push (lifo_t *stk, node_t *x) {
    node_t *head;
    do {
        head = VOLATILE_DEREF(stk).head;
        VOLATILE_DEREF(x).next = head;
    } while (SYNC_CAS(&stk->head, head, x) != head);
}

node_t *lifo_aba_pop (lifo_t *stk) {
    node_t *head;
    do {
        head = VOLATILE_DEREF(stk).head;
        if (head == NULL)
            return NULL;
    } while (SYNC_CAS(&stk->head, head, head->next) != head);
    head->next = NULL;
    return head;
}

node_t *node_alloc (void) {
    node_t *node = (node_t *)nbd_malloc(sizeof(node_t));
    memset(node, 0, sizeof(node_t));
    return node;
}

void *worker (void *arg) {
    nbd_thread_init();

    // Wait for all the worker threads to be ready.
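    // The loop below hammers a Treiber-style CAS stack. Pops are only safe
    // because reclamation is deferred: another thread may still be reading
    // head->next after this thread unlinks a node, so popped nodes go to
    // rcu_defer_free() rather than nbd_free(), and are reclaimed only after
    // every thread has passed through a quiescent state (rcu_update()).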
    (void)__sync_fetch_and_add(&wait_, -1);
    do {} while (wait_);

    int i;
    for (i = 0; i < NUM_ITERATIONS; ++ i) {
        int n = nbd_rand();
        if (n & 0x1) {
            lifo_aba_push(stk_, node_alloc());
        } else {
            node_t *x = lifo_aba_pop(stk_);
            if (x) {
                rcu_defer_free(x);
            }
        }
        rcu_update();
    }

    return NULL;
}

int main (int argc, char **argv) {
    nbd_thread_init();
    lwt_set_trace_level("m3r3");

    int num_threads = sysconf(_SC_NPROCESSORS_CONF);
    if (argc == 2)
    {
        errno = 0;
        num_threads = strtol(argv[1], NULL, 10);
        if (errno) {
            fprintf(stderr, "%s: Invalid argument for number of threads\n", argv[0]);
            return -1;
        }
        if (num_threads <= 0) {
            fprintf(stderr, "%s: Number of threads must be at least 1\n", argv[0]);
            return -1;
        }
    }

    stk_ = lifo_alloc();

    struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);
    wait_ = num_threads;

    pthread_t thread[num_threads];
    for (int i = 0; i < num_threads; ++i) {
        int rc = pthread_create(thread + i, NULL, worker, (void *)(size_t)i);
        if (rc != 0) { perror("pthread_create"); return rc; }
    }
    for (int i = 0; i < num_threads; ++i) {
        pthread_join(thread[i], NULL);
    }

    gettimeofday(&tv2, NULL);
    int ms = (int)(1000000*(tv2.tv_sec - tv1.tv_sec) + tv2.tv_usec - tv1.tv_usec) / 1000;
    printf("Th:%d Time:%dms\n\n", num_threads, ms);
    fflush(stdout);

    return 0;
}
--------------------------------------------------------------------------------
/test/txn_test.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "CuTest.h"

#include "common.h"
#include "runtime.h"
#include "txn.h"
#include "map.h"
#include "hashtable.h"

#define ASSERT_EQUAL(x, y) CuAssertIntEquals(tc, x, y)

void test1 (CuTest* tc) {
    map_t *map = map_alloc(&MAP_IMPL_HT, NULL);
    txn_t *t1 = txn_begin(map);
    txn_t *t2 = txn_begin(map);
    map_key_t k1 = (map_key_t)1;
    txn_map_set(t1, k1, 2);
    txn_map_set(t1, k1, 3);
    ASSERT_EQUAL( DOES_NOT_EXIST, txn_map_get(t2, k1) );
    txn_map_set(t2, k1, 4);
    ASSERT_EQUAL( 3, txn_map_get(t1, k1) );
    ASSERT_EQUAL( 4, txn_map_get(t2, k1) );
    ASSERT_EQUAL( TXN_VALIDATED, txn_commit(t2));
    ASSERT_EQUAL( TXN_ABORTED, txn_commit(t1));
}

int main (void) {
    nbd_thread_init();
    lwt_set_trace_level("x3h3");

    CuString *output = CuStringNew();
    CuSuite* suite = CuSuiteNew();
    SUITE_ADD_TEST(suite, test1);
    CuSuiteRun(suite);
    CuSuiteDetails(suite, output);
    printf("%s\n", output->buffer);

    return 0;
}
--------------------------------------------------------------------------------
/todo:
--------------------------------------------------------------------------------
memory reclamation
------------------
- allow threads to dynamically enter and exit rcu's token passing ring
- augment rcu with heartbeat manager to kill and recover from stalled threads
- make rcu try yielding when its buffer gets full
- use alternate memory reclamation schemes: hazard pointers and/or reference counting

quality
-------
- verify the memory management of keys in list, skiplist, and hashtable
- transaction tests
- validate function arguments in interface functions
- document usage
- document algorithms

optimization
------------
- investigate 16 byte CAS; ht can store GUIDs inline instead of pointers to actual keys
- txn write after write can just update the old update record instead of pushing a new one
- use a shared scan for write-set validation in txn, similar to ht copy logic
- experiment with the performance impact of not passing the hash between functions in ht
- experiment with embedding the nstring keys in the list/skiplist nodes
- lower skiplist's high_water when the actual number of levels in use drops
- non-power-of-2 sized hashtables for improved memory usage
- mem2

features
--------
- allow values of 0 to be inserted into maps (change DOES_NOT_EXIST to something other than 0)
- read-committed type transactions
- recycle free regions across size-classes and between threads
--------------------------------------------------------------------------------
/txn/txn.c:
--------------------------------------------------------------------------------
/*
 * Written by Josh Dybnis and released to the public domain, as explained at
 * http://creativecommons.org/licenses/publicdomain
 */
#include "common.h"
#include "txn.h"
#include "mem.h"
#include "rcu.h"
#include "lwt.h"
#include "skiplist.h"

#define UNDETERMINED_VERSION 0
#define ABORTED_VERSION TAG_VALUE(0, TAG1)
#define INITIAL_WRITES_SIZE 4
#define PTR_TO_VAL(x) ((size_t)(x) >> 2)
#define VAL_TO_PTR(x) ((update_t *)((x) << 2))

typedef struct update_rec update_t;
typedef map_key_t version_t;

struct update_rec {
    version_t version; // tagged versions are txn_t pointers, untagged are actual version numbers
    map_val_t value;
    map_val_t next;    // an earlier update
};

typedef struct write_rec {
    map_key_t key;
    update_t *rec;
} write_rec_t;

struct txn {
    version_t rv;
    version_t wv;
    map_t *map;
    write_rec_t *writes;
    size_t writes_size;
    size_t writes_count;
    size_t validate_scan;
    txn_state_e state;
};

static txn_state_e txn_validate (txn_t *txn);

static skiplist_t *active_ = NULL;

static version_t version_ = 1;

// Validate the updates for <txn>. Validation fails if there is a write-write conflict. That is,
// if after our read version another transaction committed a change to an entry we are also
// trying to change.
//
// If we encounter a potential conflict with a transaction that is in the process of validating,
// we help it complete validating. It must be finished before we can decide to rollback or commit.
//
static txn_state_e validate_key (txn_t *txn, map_key_t key) {
    assert(txn->state != TXN_RUNNING);

    map_val_t val = map_get(txn->map, key);
    update_t *update = NULL;
    for (; val != DOES_NOT_EXIST; val = update->next) {

        // If the update or its version is not tagged it means the update is committed.
        //
        // We can stop at the first committed record we find that is at least as old as our read
        // version. All the other committed records following it will be older. And all the
        // uncommitted records following it will eventually conflict with it and abort.
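        // Reminder of the tag encoding used below: a map value with TAG2 set
        // is a shifted pointer to an update_t (see PTR_TO_VAL/VAL_TO_PTR),
        // and an update's version field with TAG1 set is a shifted pointer to
        // the writing transaction's txn_t; an untagged version is a committed
        // version number.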
        if (!IS_TAGGED(val, TAG2))
            return TXN_VALIDATED;
        update = VAL_TO_PTR(val);
        if (!IS_TAGGED(update->version, TAG1))
            return (update->version <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;

        // If the update's version is tagged then either the update was aborted or the version
        // number is actually a pointer to a running transaction's txn_t.

        // Skip aborted transactions.
        if (EXPECT_FALSE(update->version == ABORTED_VERSION))
            continue;

        // The update's transaction is still in progress. Access its txn_t.
        txn_t *writer = (txn_t *)VAL_TO_PTR(update->version);
        if (writer == txn)
            continue; // Skip our own updates.
        txn_state_e writer_state = writer->state;

        // Any running transaction will only be able to acquire a wv greater than ours. A
        // transaction changes its state to validating before acquiring a wv. We can ignore an
        // unvalidated transaction if its version is greater than ours. See the next comment
        // below for the explanation why.
        if (writer_state == TXN_RUNNING)
            continue;

        // If <writer> has a later version than us we can safely ignore its updates. It will not
        // commit until we have completed validation (in order to remain non-blocking it will
        // help us validate if necessary). This protocol ensures a deterministic resolution to
        // every conflict and avoids infinite ping-ponging between validating two conflicting
        // transactions.
        if (writer_state == TXN_VALIDATING) {
            if (writer->wv > txn->wv)
                continue;
            // Help <writer> commit. We need to know if <writer> aborts or commits before we can
            // decide what to do. But we don't want to block, so we assist.
            writer_state = txn_validate(writer);
        }

        // Skip updates from aborted transactions.
        if (writer_state == TXN_ABORTED)
            continue;

        assert(writer_state == TXN_VALIDATED);
        return (writer->wv <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
    }

    return TXN_VALIDATED;
}

static txn_state_e txn_validate (txn_t *txn) {
    assert(txn->state != TXN_RUNNING);
    switch (txn->state) {

        case TXN_VALIDATING:
            if (txn->wv == UNDETERMINED_VERSION) {
                version_t wv = SYNC_ADD(&version_, 1);
                (void)SYNC_CAS(&txn->wv, UNDETERMINED_VERSION, wv);
            }

            for (int i = 0; i < txn->writes_count; ++i) {
                txn_state_e s = validate_key(txn, txn->writes[i].key);
                if (s == TXN_ABORTED) {
                    txn->state = TXN_ABORTED;
                    break;
                }
                assert(s == TXN_VALIDATED);
            }
            if (txn->state == TXN_VALIDATING) {
                txn->state = TXN_VALIDATED;
            }
            break;

        case TXN_VALIDATED:
        case TXN_ABORTED:
            break;

        default:
            assert(FALSE);
    }

    return txn->state;
}

static update_t *alloc_update_rec (version_t ver, map_val_t val) {
    update_t *u = (update_t *)nbd_malloc(sizeof(update_t));
    u->version = ver;
    u->value = val;
    u->next = DOES_NOT_EXIST;
    return u;
}

txn_t *txn_begin (map_t *map) {
    TRACE("x1", "txn_begin: map %p", map, 0);
    txn_t *txn = (txn_t *)nbd_malloc(sizeof(txn_t));
    memset(txn, 0, sizeof(txn_t));
    txn->wv = UNDETERMINED_VERSION;
    txn->state = TXN_RUNNING;
    txn->map = map;
    txn->writes = nbd_malloc(sizeof(*txn->writes) * INITIAL_WRITES_SIZE);
    txn->writes_size = INITIAL_WRITES_SIZE;
    if (EXPECT_FALSE(active_ == NULL)) {
        skiplist_t *a = sl_alloc(NULL);
        if (SYNC_CAS(&active_, NULL, a) != NULL) {
            sl_free(a);
        }
    }

    // Acquire the read version for the txn. Must be careful to avoid a race: increment the
    // reference count for the read version in <active_>, then re-check that it is still the
    // current version; if another transaction committed in between, back the count out and
    // try again.
    do {
        txn->rv = version_;

        unsigned old_count;
        unsigned temp = 0;
        do {
            old_count = temp;
            temp = sl_cas(active_, (map_key_t)txn->rv, old_count, old_count + 1);
        } while (temp != old_count);

        if (txn->rv == version_)
            break;

        temp = 1;
        do {
            old_count = temp;
            temp = sl_cas(active_, (map_key_t)txn->rv, old_count, old_count - 1);
        } while (temp != old_count);
    } while (1);

    TRACE("x1", "txn_begin: returning new transaction %p (read version %p)", txn, txn->rv);
    return txn;
}

void txn_abort (txn_t *txn) {
    if (txn->state != TXN_RUNNING)
        return;

    int i;
    for (i = 0; i < txn->writes_count; ++i) {
        update_t *update = (update_t *)txn->writes[i].rec;
        update->version = ABORTED_VERSION;
    }

    rcu_defer_free(txn->writes);
    rcu_defer_free(txn);
}

txn_state_e txn_commit (txn_t *txn) {
    if (txn->state != TXN_RUNNING)
        return txn->state;

    assert(txn->state == TXN_RUNNING);
    txn->state = TXN_VALIDATING;
    txn_state_e state = txn_validate(txn);

    // Detach <txn> from its updates.
    version_t wv = (txn->state == TXN_ABORTED) ? ABORTED_VERSION : txn->wv;
    int i;
    for (i = 0; i < txn->writes_count; ++i) {
        update_t *update = txn->writes[i].rec;
        update->version = wv;
    }

    // Lower the reference count for <txn>'s read version
    unsigned temp = 2;
    unsigned old_count;
    do {
        old_count = temp;
        temp = sl_cas(active_, (map_key_t)txn->rv, old_count, old_count - 1);
        if (temp == 1 && txn->rv != version_) {
            sl_remove(active_, (map_key_t)txn->rv);
            break;
        }
    } while (old_count != temp);

    rcu_defer_free(txn->writes);
    rcu_defer_free(txn);

    return state;
}

// Get the most recent committed version prior to our read version.
map_val_t txn_map_get (txn_t *txn, map_key_t key) {
    TRACE("x1", "txn_map_get: txn %p map %p", txn, txn->map);
    TRACE("x1", "txn_map_get: key %p", key, 0);

    if (txn->state != TXN_RUNNING) {
        TRACE("x1", "txn_map_get: error txn not running (state %p)", txn->state, 0);
        return ERROR_TXN_NOT_RUNNING;
    }

    // Iterate through the update records to find the latest committed version prior to our
    // read version.
    map_val_t newest_val = map_get(txn->map, key);
    map_val_t val = newest_val;
    update_t *update;
    for ( ; (update = VAL_TO_PTR(val)) != NULL ; val = update->next) {

        // If TAG2 is set in <val> it indicates that <val> is an update record. Otherwise all of
        // the following are true: <val> is a literal value, it is older than any currently
        // active transaction, and it is the most recently set value for its key. Therefore it
        // is visible to <txn>.
        if (!IS_TAGGED(val, TAG2)) {
            TRACE("x1", "txn_map_get: found untagged value; returning %p", val, 0);
            return val;
        }

        // If the update's version is not tagged it means the update is committed.
        if (!IS_TAGGED(update->version, TAG1)) {
            if (update->version <= txn->rv) {
                TRACE("x2", "txn_map_get: found committed update %p (version %p)", update, update->version);
                break; // success
            }
            TRACE("x2", "txn_map_get: skipping update %p (version %p)", update, update->version);
            continue;
        }

        // If the update's version is tagged then either the update was aborted or the version
        // number is actually a pointer to a running transaction's txn_t.

        // Skip updates from aborted transactions.
        if (EXPECT_FALSE(update->version == ABORTED_VERSION)) {
            TRACE("x2", "txn_map_get: skipping aborted update %p", update, 0);
            continue;
        }

        // The update's transaction is still in progress. Access its txn_t.
        txn_t *writer = (txn_t *)VAL_TO_PTR(update->version);
        if (writer == txn) {
            TRACE("x2", "txn_map_get: found txn's own update %p", update, 0);
            break; // success
        }

        txn_state_e writer_state = writer->state;
        if (writer_state == TXN_RUNNING) {
            TRACE("x2", "txn_map_get: skipping update %p of in-progress transaction %p", update, writer);
            continue;
        }

        if (writer_state == TXN_VALIDATING) {
            TRACE("x2", "txn_map_get: update %p transaction %p validating", update, writer);
            if (writer->wv > txn->rv)
                continue;
            writer_state = txn_validate(writer);
        }

        // Skip updates from aborted transactions.
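        // Note the asymmetry with validate_key(): here a writer's wv is compared against this
        // transaction's read version (rv), because a committed write is visible only if it
        // happened before this transaction began; validation instead compares wv against wv to
        // order conflicting committers deterministically.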
        if (writer_state == TXN_ABORTED) {
            TRACE("x2", "txn_map_get: skipping aborted update %p", update, 0);
            continue;
        }

        assert(writer_state == TXN_VALIDATED);
        if (writer->wv > txn->rv) {
            TRACE("x2", "txn_map_get: skipping update %p (version %p)", update, update->version);
            continue;
        }
        break; // success
    }

    if (update == NULL) {
        TRACE("x1", "txn_map_get: key does not exist in map", key, 0);
        return DOES_NOT_EXIST;
    }

    map_val_t value = update->value;
    TRACE("x1", "txn_map_get: key found returning value %p", value, 0);

    // collect some garbage
    version_t min_active_version = UNDETERMINED_VERSION;
    update_t *next_update = NULL;
    if (IS_TAGGED(update->next, TAG2)) {
        next_update = VAL_TO_PTR(update->next);

        // If <next_update> (and all update records following it [except if it is aborted]) is
        // old enough that it is not visible to any active transaction we can safely free it.
        min_active_version = (version_t)sl_min_key(active_);
        if (next_update->version < min_active_version) {

            // If <next_update> is aborted, skip over it to look for more recent ones that may follow
            update_t *temp = next_update;
            while (temp->version == ABORTED_VERSION) {
                assert(!IS_TAGGED(temp->version, TAG1));
                map_val_t next = temp->next;
                if (!IS_TAGGED(next, TAG2))
                    break;

                // Bail out of garbage collection if we find a record that might still be
                // accessed by an ongoing transaction.
                if (VAL_TO_PTR(next)->version >= min_active_version)
                    return value;

                temp = VAL_TO_PTR(next);
            }

            // free the next update record and all the ones following it
            temp = next_update;
            map_val_t next;
            do {
                next = SYNC_SWAP(&temp->next, DOES_NOT_EXIST);

                // if we find ourselves in a race just back off and let the other thread take
                // care of it
                if (next == DOES_NOT_EXIST)
                    return value;

                nbd_free(temp);

                temp = VAL_TO_PTR(next);

            } while (IS_TAGGED(next, TAG2));
        }
    }

    // If there is one item left and it is visible to all active transactions we can merge it
    // into the map itself. There is no need for an update record.
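    // The merge below is a single CAS from the tagged update-record pointer back to the plain
    // literal value. If the CAS fails, another thread either installed a newer update or already
    // performed the same merge; both outcomes are acceptable, so the failure is simply ignored.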
    if (next_update == NULL && val == newest_val) {
        if (min_active_version == UNDETERMINED_VERSION) {
            min_active_version = (version_t)sl_min_key(active_);
        }
        if (update->version <= min_active_version) {
            if (map_cas(txn->map, key, TAG_VALUE(val, TAG2), value) == TAG_VALUE(val, TAG2)) {
                rcu_defer_free(update);
            }
        }
    }

    return value;
}

void txn_map_set (txn_t *txn, map_key_t key, map_val_t value) {
    TRACE("x1", "txn_map_set: txn %p map %p", txn, txn->map);
    TRACE("x1", "txn_map_set: key %p value %p", key, value);
    assert(!IS_TAGGED(value, TAG1) && !IS_TAGGED(value, TAG2));

    if (txn->state != TXN_RUNNING) {
        TRACE("x1", "txn_map_set: error txn not running (state %p)", txn->state, 0);
        return;
    }

    // create a new update record
    version_t ver = TAG_VALUE(PTR_TO_VAL(txn), TAG1); // tagged versions are txn_t pointers
    update_t *update = alloc_update_rec(ver, value);

    // push the new update record onto <key>'s update list
    map_val_t old_update = map_get(txn->map, key);
    TRACE("x2", "txn_map_set: old update %p new update record %p", old_update, update);
    do {
        update->next = old_update;
        map_val_t temp = map_cas(txn->map, key, old_update, TAG_VALUE(PTR_TO_VAL(update), TAG2));
        if (temp == old_update)
            break;

        TRACE("x1", "txn_map_set: cas failed; found %p expected %p", temp, old_update);
        old_update = temp;
    } while (1);

    // add to the write set for commit-time validation
    if (txn->writes_count == txn->writes_size) {
        write_rec_t *w = nbd_malloc(sizeof(write_rec_t) * txn->writes_size * 2);
        memcpy(w, txn->writes, txn->writes_size * sizeof(write_rec_t));
        txn->writes_size *= 2;
        nbd_free(txn->writes);
        txn->writes = w;
    }
    int i = txn->writes_count++;
    txn->writes[i].key = key;
    txn->writes[i].rec = update;
}
--------------------------------------------------------------------------------
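For orientation, here is a sketch of how the transactional map API above is meant to be composed, pieced together from txn_test.c and the txn.h entry points used in txn.c. The transfer() helper and its retry-on-abort loop are illustrative assumptions by the editor, not code from the repository; note that values of 0 cannot be stored (DOES_NOT_EXIST is 0) and each thread must call nbd_thread_init() before using the library.

#include "common.h"
#include "runtime.h"
#include "txn.h"
#include "map.h"
#include "hashtable.h"

// Hypothetical helper: atomically move one unit from key 1 to key 2,
// retrying if commit-time validation detects a write-write conflict.
static void transfer (map_t *map) {
    txn_state_e state;
    do {
        txn_t *t = txn_begin(map);
        map_val_t a = txn_map_get(t, (map_key_t)1); // latest version <= t's read version
        map_val_t b = txn_map_get(t, (map_key_t)2);
        txn_map_set(t, (map_key_t)1, a - 1); // pushes tagged update records
        txn_map_set(t, (map_key_t)2, b + 1);
        state = txn_commit(t); // may help a conflicting txn validate
    } while (state == TXN_ABORTED);
}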