├── .gitignore
├── DataGuardian.h
├── DataGuardianTestResults.md
├── DataProtector.cpp
├── DataProtector.h
├── DataProtector.md
├── DataProtector.pdf
├── DataProtectorTest.cpp
├── Makefile
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
DataProtectorTest
--------------------------------------------------------------------------------
/DataGuardian.h:
--------------------------------------------------------------------------------
#include <atomic>
#include <mutex>

#include <unistd.h>

template<class T, int maxNrThreads>
class DataGuardian {

  struct TPtr {
    std::atomic<T const*> ptr;
    char padding[64 - sizeof(std::atomic<T const*>)];
  };

 public:
  DataGuardian () {
    _P[0].ptr = nullptr;
    _P[1].ptr = nullptr;
    for (int i = 0; i < maxNrThreads; i++) {
      _H[i].ptr = nullptr;
    }
    _V = 0;
  }

  ~DataGuardian () {
    std::lock_guard<std::mutex> lock(_mutex);
    while (isHazard(_P[_V].ptr)) {
      usleep(250);
    }
    T const* temp = _P[_V].ptr.load();
    delete temp;   // OK, if nullptr
    _P[_V].ptr = nullptr;
  }

  bool isHazard (T const* p) {
    for (int i = 0; i < maxNrThreads; i++) {
      T const* g = _H[i].ptr.load(std::memory_order_relaxed);
      if (g != nullptr && g == p) {
        return true;
      }
    }
    return false;
  }

  T const* lease (int myId) {
    int v;
    T const* p;

    while (true) {
      v = _V.load(std::memory_order_consume);   // (XXX)
      // This memory_order_consume corresponds to the change to _V
      // in exchange() below which uses memory_order_seq_cst, which
      // implies release semantics. This is important to ensure that
      // we see the changes to _P just before the version _V
      // is flipped.
      p = _P[v].ptr.load(std::memory_order_relaxed);
      _H[myId].ptr = p;   // implicit memory_order_seq_cst
      if (_V.load(std::memory_order_relaxed) != v) {   // (YYY)
        _H[myId].ptr = nullptr;   // implicit memory_order_seq_cst
        continue;
      }
      break;
    };
    return p;
  }

  void unlease (int myId) {
    _H[myId].ptr = nullptr;   // implicit memory_order_seq_cst
  }

  void exchange (T const* replacement) {
    std::lock_guard<std::mutex> lock(_mutex);

    int v = _V.load(std::memory_order_relaxed);
    _P[1-v].ptr.store(replacement, std::memory_order_relaxed);
    _V = 1-v;   // implicit memory_order_seq_cst, whoever sees this
                // also sees the two above modifications!
    // Our job is essentially done, we only need to destroy
    // the old value. However, this might be unsafe, because there might
    // be a reader. All readers have indicated their reading activity
    // with a store(std::memory_order_seq_cst) to _H[]. After that
    // indication, they have rechecked the value of _V and have thus
    // confirmed that it was not yet changed. Therefore, we can simply
    // observe _H[*] and wait until none is equal to _P[v]:
    T const* p = _P[v].ptr.load(std::memory_order_relaxed);
    while (isHazard(p)) {
      usleep(250);
    }
    // Now it is safe to destroy _P[v]
    delete p;
    _P[v].ptr = nullptr;
  }

 private:
  TPtr _P[2];
  TPtr _H[maxNrThreads];
  std::atomic<int> _V;
  char padding3[64 - sizeof(std::atomic<int>)];
  std::mutex _mutex;

  // Here is a proof that this is all OK: The mutex only ensures that there is
  // always only at most one mutating thread. All is standard, except that
  // we must ensure that whenever _V is changed the mutating thread knows
  // about all readers that are still using the old version, which is
  // done through _H[myId] where myId is the id of a thread.
  // The critical argument needed is the following: Both the change to
  // _H[myId] in lease() and the change to _V in exchange() use
  // memory_order_seq_cst, therefore they happen in some sequential
  // order and all threads observe the same order. If the reader in line
  // (YYY) above sees the same value as before in line (XXX), then any
  // write to _V must be later in the total order of modifications than
  // the change to _H[myId]. Therefore the mutating thread must see the change
  // to _H[myId]; after all, it sees its own change to _V. Therefore it is
  // ensured that the delete of _P[v] only happens when all reading threads
  // have terminated their lease through unlease().
};

--------------------------------------------------------------------------------
/DataGuardianTestResults.md:
--------------------------------------------------------------------------------
Test results
============

On GCE n1-standard-16 with 16 vCPUs, all in M/s and M/(threads*s):

Compiled with g++ -O3:

|Nr |Guardian |Unprotect.|Mutex |Spinlock |Protector
--------------------------------------------------------------
| 1 | 34.0 34.06| 2259 2259|60.42 60.42|102.1 102.1| 84.3 84.35
| 2 | 64.6 32.31| 4339 2169| 9.33 4.66| 97.5 48.7|160.1 80.05
| 3 | 92.6 30.86| 6162 2054| 7.89 2.63| 93.5 31.1|234.3 78.12
| 4 |120.5 30.13| 8021 2005|13.96 3.49| 89.9 22.4|300.4 75.12
| 5 |146.3 29.26| 9814 1962|11.43 2.28| 88.3 17.6|367.4 73.49
| 6 |168.6 28.10|11302 1883| 9.23 1.53| 83.9 13.9|431.2 71.87
| 7 |208.3 29.76|12790 1827| 8.43 1.20| 81.0 11.5|498.5 71.21
| 8 |220.1 27.51|14075 1759| 8.39 1.04| 81.2 10.1|554.7 69.34
| 9 |258.1 28.68|14040 1560| 8.24 0.91| 81.0 9.0|542.1 60.23
|10 |282.4 28.24|14078 1407| 8.36 0.83| 81.9 8.1|540.9 54.09
|11 |301.0 27.37|14029 1275| 8.10 0.73| 81.0 7.3|524.4 47.67
|12 |321.2 26.76|14060 1171| 7.99 0.66| 82.2 6.8|518.4 43.20
|13 |345.0 26.54|14031 1079| 7.79 0.59| 82.3 6.3|510.9 39.30
|14 |366.7 26.19|14086 1006| 7.63 0.54| 82.8 5.9|502.5 35.89
|15 |385.9 25.73|14092 939| 7.54 0.50| 83.4 5.5|498.5 33.23
|16 |408.6 25.54|14276 892| 7.53 0.47| 82.9 5.1|491.9 30.74
|20 |408.0 20.40|14317 715| 7.98 0.39| 84.5 4.2|488.9 24.44
|24 |402.6 16.77|14196 591| 8.29 0.34| 84.0 3.5|525.3 21.88
|28 |398.6 14.23|14235 508| 8.43 0.30| 83.4 2.9|501.8 17.92
|32 |389.9 12.18|14123 441| 8.53 0.26| 83.4 2.6|528.9 16.52
|48 |385.2 8.02|14202 295| 8.62 0.17| 81.0 1.6|488.4 10.17
|64 |375.1 5.86|14196 221| 8.91 0.13| 79.2 1.2|508.3 7.94

Compiled with clang++ -O3:

|Nr |Guardian |Unprotect.|Mutex |Spinlock |Protector
-----------------------------------------------------------
| 1 | 46.2 46.2| 2066 2066|58.07 58.07|92.9 92.9| 65.0 65.0
| 2 |123.9 61.9| 3933 1966| 8.50 4.25|88.6 44.3|124.0 62.0
| 3 |198.2 66.0| 5614 1871| 7.35 2.45|85.5 28.5|178.8 59.6
| 4 |267.4 66.8| 7287 1821|13.63 3.40|82.6 20.6|226.4 56.6
| 5 |336.1 67.2| 8912 1782|11.97 2.39|79.9 15.9|280.9 56.1
| 6 |392.9 65.4|10223 1703| 8.77 1.46|76.8 12.8|322.1 53.6
| 7 |449.9 64.2|11676 1668| 7.10 1.01|85.4 12.2|365.9 52.2
| 8 |504.1 63.0|12760 1595| 6.76 0.84|84.6 10.5|394.0 49.2
| 9 |501.3 55.7|12887 1431| 6.54 0.72|85.5 9.5|406.7 45.1
|10 |511.5 51.1|12955 1295| 6.45 0.64|84.9 8.4|416.4 41.6
|11 |489.8 44.5|12790 1162| 6.29 0.57|82.7 7.5|406.8 36.9
|12 |491.9 40.9|12891 1074| 6.15 0.51|83.3 6.9|420.5 35.0
|13 |490.5 37.7|12964 997| 6.01 0.46|83.4 6.4|425.5 32.7
|14 |484.2 34.5|13056 932| 5.86 0.41|83.0 5.9|428.4 30.6
|15 |483.2 32.2|13048 869| 5.75 0.38|83.0 5.5|433.4 28.8
|16 |481.5 30.0|13251 828| 5.72 0.35|82.6 5.1|434.8 27.1
|20 |482.1 24.1|13250 662| 6.20 0.31|83.5 4.1|437.1 21.8
|24 |476.5 19.8|13256 552| 6.65 0.27|83.0 3.4|428.2 17.8
|28 |524.4 18.7|13262 473| 6.66 0.23|82.4 2.9|445.2 15.9
|32 |515.6 16.1|13152 411| 6.86 0.21|82.0 2.5|464.5 14.5
|48 |507.7 10.5|13178 274| 7.04 0.14|79.5 1.6|435.1 9.0
|64 |493.4 7.7|13236 206| 7.15 0.11|78.2 1.2|444.9 6.9


On my computer with 4 vCPUs, all in M/s and M/(threads*s):


Compiled with g++ -O3:

|Nr |Guardian |Unprotect.|Mutex | Spinlock |Protector
---------------------------------------------------------------
| 1 | 28.4 28.4|1786 1786|41.9 41.93|106.68 106.68| 67.93 67.93
| 2 | 52.3 26.1|3615 1807|10.1 5.06| 96.52 48.26|132.56 66.28
| 3 | 74.3 24.7|3650 1216| 9.1 3.04| 93.97 31.32|125.26 41.75
| 4 | 96.1 24.0|3672 918|10.5 2.63| 91.84 22.96|126.91 31.73
| 5 | 96.5 19.3|3681 736|10.4 2.08| 89.56 17.91|121.05 24.20
| 6 | 96.6 16.1|3646 607|10.5 1.75| 88.46 14.74|128.69 21.44
| 7 |102.7 14.6|3645 520|10.5 1.50| 87.58 12.51|125.48 17.92
| 8 |105.7 13.2|3668 458|10.5 1.31| 87.00 10.87|126.52 15.81


Compiled with clang++ -O3:

|Nr |Guardian |Unprotect.|Mutex |Spinlock |Protector
--------------------------------------------------------------
| 1 | 36.88 36.88|1743 1743|38.72 38.72|99.53 99.53| 47.1 47.1
| 2 |100.98 50.49|2952 1476| 7.72 3.86|88.97 44.48| 88.4 44.2
| 3 |119.23 39.74|3391 1130| 8.83 2.94|87.63 29.21| 95.9 31.9
| 4 |126.94 31.73|3422 855| 9.54 2.38|85.54 21.38|107.9 26.9
| 5 |122.28 24.45|3452 690| 9.38 1.87|83.46 16.69|100.9 20.1
| 6 |116.19 19.36|3441 573| 9.37 1.56|82.53 13.75|118.2 19.7
| 7 |121.22 17.31|3430 490| 9.37 1.33|81.88 11.69|114.9 16.4
| 8 |122.36 15.29|3432 429| 9.37 1.17|81.47 10.18|113.3 14.1

--------------------------------------------------------------------------------
/DataProtector.cpp:
--------------------------------------------------------------------------------
#include "DataProtector.h"
template<int Nr> thread_local int DataProtector<Nr>::_mySlot = -1;
template<int Nr> std::atomic<int> DataProtector<Nr>::_last(0);
template class DataProtector<64>;
--------------------------------------------------------------------------------
/DataProtector.h:
--------------------------------------------------------------------------------
#include <atomic>
#include <unistd.h>

template<int Nr>
class DataProtector {
  struct alignas(64) Entry {
    std::atomic<int> _count;
  };

  Entry* _list;

  static std::atomic<int> _last;
  static thread_local int _mySlot;

 public:

  // A class to automatically unuse the DataProtector:
  class UnUser {
    DataProtector* _prot;
    int _id;

   public:
    UnUser (DataProtector* p, int i) : _prot(p), _id(i) {
    }

    ~UnUser () {
      if (_prot != nullptr) {
        _prot->unUse(_id);
      }
    }

    // A move constructor
    UnUser (UnUser&& that) : _prot(that._prot), _id(that._id) {
      // Note that return value optimization will usually avoid
      // this move constructor completely. However, it has to be
      // present for the program to compile.
      that._prot = nullptr;
    }

    // Explicitly delete the others:
    UnUser (UnUser const& that) = delete;
    UnUser& operator= (UnUser const& that) = delete;
    UnUser& operator= (UnUser&& that) = delete;
    UnUser () = delete;
  };

  DataProtector () : _list(nullptr) {
    _list = new Entry[Nr];
    // Just to be sure:
    for (size_t i = 0; i < Nr; i++) {
      _list[i]._count = 0;
    }
  }

  ~DataProtector () {
    delete[] _list;
  }

  UnUser use () {
    int id = getMyId();
    _list[id]._count++;   // this is implicitly using memory_order_seq_cst
    return UnUser(this, id);   // return value optimization!
  }

  void scan () {
    for (size_t i = 0; i < Nr; i++) {
      while (_list[i]._count > 0) {
        usleep(250);
      }
    }
  }

 private:

  void unUse (int id) {
    _list[id]._count--;   // this is implicitly using memory_order_seq_cst
  }

  int getMyId () {
    int id = _mySlot;
    if (id >= 0) {
      return id;
    }
    while (true) {
      int newId = _last + 1;
      if (newId >= Nr) {
        newId = 0;
      }
      if (_last.compare_exchange_strong(id, newId)) {
        _mySlot = newId;
        return newId;
      }
    }
  }

};

--------------------------------------------------------------------------------
/DataProtector.md:
--------------------------------------------------------------------------------
Lockfree protection of data structures that are frequently read
===============================================================

by Max Neunhoeffer


Motivation
----------

In multi-threaded applications running on multi-core systems, it often
occurs that there are certain data structures which are frequently read
but relatively seldom changed. An example of this would be a database
server that has a list of databases that changes rarely, but needs to be
consulted for every single query hitting the database. In such situations
one needs to guarantee fast read access as well as protection against
inconsistencies, use-after-free and memory leaks.

Therefore we seek a lock-free protection mechanism that scales to lots
of threads on modern machines and uses only C++11 standard library
methods. The mechanism should be easy to use and easy to understand and
prove correct. This article presents a solution, which is probably not
new, but which we have not found anywhere else.


The concrete challenge at hand
------------------------------

Assume a global data structure on the heap and a single atomic pointer
P to it. If (fast) readers access this completely unprotected, then
a (slow) writer can create a completely new data structure and then
change the pointer to the new structure with an atomic operation. Since
writing is not time critical, one can easily use a mutex to ensure that
there is only a single writer at any given time. The only problem is to
decide when it is safe to destroy the old value, because the writer
cannot easily know that no reader is still accessing it. The challenge
is aggravated by the fact that without thread synchronization it is
unclear when a reader actually sees the new pointer value, in particular
on a multi-core machine with a complex system of caches.

If you want to see our solution directly, scroll down to "Source code
links". We first present a classical good approach and then try to
improve on it.


Hazard pointers and their hazards
---------------------------------

The "classical" lock-free solution to this problem is hazard pointers
(see [this paper][1] and this [article on Dr Dobbs][2]).
The basic idea is that each reading thread first registers the location
of its own "hazard" pointer in some list, and whenever it wants to
access the data structure, it sets its own hazard pointer to the value
of P it uses to access the data, and restores it to `nullptr` when it is
done with the read access.
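
In code, the reader's side of this protocol might look like the following
sketch; the names `Data`, `P`, and `myHazard` are illustrative only and not
taken from any code in this repository, and the hazard slot is assumed to
be visible to the writer:

    #include <atomic>

    struct Data;   // the protected structure, assumed to exist elsewhere

    // Pin the current value of P by publishing it in this thread's hazard
    // pointer; retry if P changed while we were publishing it.
    Data const* acquire (std::atomic<Data const*>& P,
                         std::atomic<Data const*>& myHazard) {
      while (true) {
        Data const* p = P.load();     // memory_order_seq_cst by default
        myHazard.store(p);            // announce "I am reading p"
        if (P.load() == p) {          // still current, so the writer will see us
          return p;
        }
        myHazard.store(nullptr);      // P changed in between, retry
      }
    }

    // Release the hazard pointer when the read access is finished.
    void release (std::atomic<Data const*>& myHazard) {
      myHazard.store(nullptr);
    }

The `lease` and `unlease` methods of `DataGuardian.h` in this repository
implement essentially this publish-and-recheck loop.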

A writer can then replace the old value of P with a pointer to a
completely new value and then scan all registered hazard pointers to see
whether any thread still accesses the old value. If all store operations
to the hazard pointers and the one to P use `memory_order_seq_cst` (see
[this page][3] for an explanation), then it is guaranteed that if a reader
thread sees the old version of P, then it observes the change of its own
hazard pointer earlier; therefore, because of the guaranteed sequential
order of all stores with `memory_order_seq_cst`, the writer thread also
observes the hazard pointer value before its own change of P.

This is a very powerful and neat argument, and it only uses the
guaranteed memory model of the C++11 standard in connection with atomic
operations in the STL. It has very good performance characteristics,
because the readers just have to ensure `memory_order_seq_cst` by means
of memory fence or equivalent instructions, and since one can assume that
the actual hazard pointers reside in different cache lines there is no
unnecessary cache invalidation.

However, this approach is not without its own hazards (pun intended).
The practical problems in my opinion lie in the management of the hazard
pointer allocations and deallocations and in the awkward registration
procedure. A complex multi-threaded application can have various
different types of threads, some dynamically created and joined. At the
same time it can have multiple data structures that need the sort of
protection discussed here. The position of the actual hazard pointer
structure is thread-local information, and one needs a different one
for each instance of a data structure that needs protection.

What makes matters worse is that at the time of thread creation the
main thread function often does not have access to the protected data at
all, due to data encapsulation and object-oriented design. One also does
not want to do the allocation of hazard pointer structures lazily, since
this hurts the fast path for read access.

If one were to design a "DataGuardian" class that does all the
management of hazard pointers itself, then it would have to store the
locations of the hazard pointers in thread-local data, but then it would
have to be static and it would thus not be possible to use different
hazard pointers for different instances of the DataGuardian. We have
actually tried this and failed to deliver a simple and convenient
implementation. This frustration led us to our solution, which we
describe next.


Lock-free reference counting
----------------------------

The fundamental idea is to use a special kind of reference counting in
which a reading thread uses atomic compare-and-exchange operations to
increase a reference counter before it reads P and the corresponding
data and decreases the counter after it is done with the reading.
However, the crucial difference to this standard approach is that
every thread uses a different counter, all residing in pairwise
different cache lines! This is important since it means that the
compare-and-exchange operations are relatively quick since no contention
with corresponding cache invalidations happens.

Before we do any more talking, here is the code for the simple version
of the `DataProtector` class, first `DataProtector.h`:

    #include <atomic>
    #include <unistd.h>

    template<int Nr>
    class DataProtector {
      struct alignas(64) Entry {
        std::atomic<int> _count;
      };

      Entry* _list;

      static std::atomic<int> _last;
      static thread_local int _mySlot;

     public:

      DataProtector () : _list(nullptr) {
        _list = new Entry[Nr];
        // Just to be sure:
        for (size_t i = 0; i < Nr; i++) {
          _list[i]._count = 0;
        }
      }

      ~DataProtector () {
        delete[] _list;
      }

      void use () {
        int id = getMyId();
        _list[id]._count++;   // this is implicitly using memory_order_seq_cst
      }

      void unUse () {
        int id = getMyId();
        _list[id]._count--;   // this is implicitly using memory_order_seq_cst
      }

      void scan () {
        for (size_t i = 0; i < Nr; i++) {
          while (_list[i]._count > 0) {
            usleep(250);
          }
        }
      }

     private:

      int getMyId () {
        int id = _mySlot;
        if (id >= 0) {
          return id;
        }
        while (true) {
          int newId = _last + 1;
          if (newId >= Nr) {
            newId = 0;
          }
          if (_last.compare_exchange_strong(id, newId)) {
            _mySlot = newId;
            return newId;
          }
        }
      }

    };

And a minuscule part in `DataProtector.cpp` for the definition of two
static variables, one of which is thread-local:

    #include "DataProtector.h"
    template<int Nr> thread_local int DataProtector<Nr>::_mySlot = -1;
    template<int Nr> std::atomic<int> DataProtector<Nr>::_last(0);
    template class DataProtector<64>;

In a multi-threaded application one would declare the following, either
global or in some object instance:

    std::atomic<Data*> P;
    DataProtector<64> Prot;

A reader uses this as follows:

    Prot.use();
    Data* Pcopy = P;   // uses memory_order_seq_cst by default
    // continue accessing data via Pcopy
    Prot.unUse();

A writer simply does (protected by some mutex):

    Data* newP = new Data();
    Data* oldP = P;
    P = newP;
    Prot.scan();
    delete oldP;

The code speaks mostly for itself, because this is actually a very
simple approach: We administer multiple slots with reference counters,
making sure that each resides in a different cache line by using
alignment. Each thread chooses once and for all a slot (we store the
number in static thread-local storage), valid for all instances of the
DataProtector class. This leads to a very fast path for reading data.

The writer, of which there is only ever one at a time thanks to the
mutex, first builds up a completely new copy of the protected data
structure and then switches the atomic pointer P to the new value. From
this point on all readers only see the new version. To ensure that no
more readers access the old version, the writer simply scans all
reference counters in the DataProtector class and waits until it has
seen a 0 in each of them. It is not necessary to see zeros in all of
them at the same time, it is enough to have seen a zero in each slot
once. After that it is safe to destroy the old value of the protected
data structure.
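
Put together as one function, with the writer's mutex made explicit, the
update step might look like this sketch; it reuses the declarations `P`
and `Prot` from the snippets above, while `Data` stands for the protected
structure and `writeMutex` is an assumed name for the mutex that
serializes writers:

    #include <mutex>

    std::mutex writeMutex;   // serializes writers; assumed, not shown above

    void replaceData () {
      std::lock_guard<std::mutex> lock(writeMutex);   // at most one writer
      Data* newP = new Data();   // build the complete new version first
      Data* oldP = P;            // remember the old version
      P = newP;                  // publish it, memory_order_seq_cst by default
      Prot.scan();               // wait until every slot has been seen at 0 once
      delete oldP;               // now no reader can still be using oldP
    }

The order of the last three statements is what matters: publish first,
then `scan()`, and only then delete.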

The proof that this works is equally simple as in the hazard pointer
case above: The changes to the reference counters as well as the change
to the global pointer P by the writer are all done with
`memory_order_seq_cst`. That is, the C++ memory model guarantees that
all these changes are observed by all threads in the same sequential
order. A reader that observes the old value of P (and then subsequently
reads the old version of the data structure) has incremented its
reference counter before reading P. Therefore it observes the change to
the counter before it observes the change to P by the writer. Thus the
writer must observe the change to the counter also as happening before
the change to P. Therefore it will always see some positive counter as
long as a reader is still accessing the old value of P and the
corresponding data structure.

We assumed that it is not a problem that the writer is somewhat slow,
because writes are infrequent. Therefore, locking a mutex, reading all
reference counters, whose number is of the order of magnitude of the
number of reader threads, and waiting for each of them to drop to 0 once
is not a performance problem.

The benefits of this approach are as follows: All management happens
encapsulated in the `DataProtector` class, which is extremely simple
to use. We have discussed the performance characteristics above and show
a benchmark and comparison with other methods below.

There is a single convenience improvement, which we describe in the
following section.


Convenience with scope guards
-----------------------------

To make the usage for the readers even more convenient and reduce
possibilities for bugs, we create a facility to use scope guards. We do
this in the form of a small `UnUser` class, which is encapsulated in the
`DataProtector` class. Modern C++11 features like type inference
(`auto`) further help. After this modification, the reader uses the
`DataProtector` as follows:

    {
      auto unuser(Prot.use());
      Data* Pcopy = P;   // uses memory_order_seq_cst by default
      // continue accessing data via Pcopy
    }

The unuser instance will have type `DataProtector::UnUser` and the `use`
method of the DataProtector returns the right instance such that the
destructor of the `UnUser` class automagically calls the `unUse` method
of the `DataProtector` class when the object goes out of scope. This
method can then in turn be `private`. Without further talking, here is
the code of the `UnUser` class:

    // A class to automatically unuse the DataProtector:
    class UnUser {
      DataProtector* _prot;
      int _id;

     public:
      UnUser (DataProtector* p, int i) : _prot(p), _id(i) {
      }

      ~UnUser () {
        if (_prot != nullptr) {
          _prot->unUse(_id);
        }
      }

      // A move constructor
      UnUser (UnUser&& that) : _prot(that._prot), _id(that._id) {
        // Note that return value optimization will usually avoid
        // this move constructor completely. However, it has to be
        // present for the program to compile.
        that._prot = nullptr;
      }

      // Explicitly delete the others:
      UnUser (UnUser const& that) = delete;
      UnUser& operator= (UnUser const& that) = delete;
      UnUser& operator= (UnUser&& that) = delete;
      UnUser () = delete;
    };

There is nothing special to it; note that the rule of five is observed
and that the implemented move constructor allows return value
optimization to kick in, such that the value now returned by the `use`
method of the `DataProtector` is directly constructed in the stack frame
of the reader:

    UnUser use () {
      int id = getMyId();
      _list[id]._count++;   // this is implicitly using memory_order_seq_cst
      return UnUser(this, id);   // return value optimization!
    }

As already mentioned, the `unUse` method is now private; other than
that, the code of the `DataProtector` is unchanged.


Source code links
-----------------

All code is available online in this github repository:

[https://github.com/neunhoef/DataProtector][4]

There, we also publish the test code, which is used in the following
section to measure performance.

Additionally, this is actually being used in published software in the
[source code of ArangoDB][5], see [here][6] and [here][7] for details.


Performance comparison with other methods
------------------------------------------

To assess the performance of our `DataProtector` class, we have done
a comparison with the following methods:

1. `DataGuardian` with hazard pointers

   This is our own implementation of hazard pointers.

2. unprotected access

   This is just unprotected access, which is of course not an option
   at all, but interesting nevertheless. One sees that the readers
   essentially just consult their caches, which are updated eventually.
   There is no guarantee against use-after-free at all.

3. a mutex implementation

   This is a very simple-minded implementation where all readers and the
   writer share a global mutex.

4. a spin-lock implementation using boost atomics

   Again a very simple-minded implementation of spin-locks.

5. `DataProtector`

   This is our new class described in this article.

The test program simply starts a number of reader threads which
constantly read a dummy data structure, thereby detecting
use-after-delete and `nullptr` values. We count reads per second and
reads per second and thread.

Here are the results on an n1-standard-16 instance on Google Compute
Engine (GCE) for various numbers of threads. The code has been compiled
with `g++ -std=c++11 -O3 -Wall`.
Results are in million reads per second
(M/s), and million reads per second and thread (M/s/thread):

|Nr |DataGuardian|unprotected|Mutex |Spinlock |DataProtector
|Thr| M/s M/s/T| M/s M/s/T| M/s M/s/T| M/s M/s/T| M/s M/s/T
-------------------------------------------------------------------
| 1 | 34.0 34.06| 2259 2259|60.42 60.42|102.1 102.1| 84.3 84.35
| 2 | 64.6 32.31| 4339 2169| 9.33 4.66| 97.5 48.7|160.1 80.05
| 3 | 92.6 30.86| 6162 2054| 7.89 2.63| 93.5 31.1|234.3 78.12
| 4 |120.5 30.13| 8021 2005|13.96 3.49| 89.9 22.4|300.4 75.12
| 5 |146.3 29.26| 9814 1962|11.43 2.28| 88.3 17.6|367.4 73.49
| 6 |168.6 28.10|11302 1883| 9.23 1.53| 83.9 13.9|431.2 71.87
| 7 |208.3 29.76|12790 1827| 8.43 1.20| 81.0 11.5|498.5 71.21
| 8 |220.1 27.51|14075 1759| 8.39 1.04| 81.2 10.1|554.7 69.34
| 9 |258.1 28.68|14040 1560| 8.24 0.91| 81.0 9.0|542.1 60.23
|10 |282.4 28.24|14078 1407| 8.36 0.83| 81.9 8.1|540.9 54.09
|11 |301.0 27.37|14029 1275| 8.10 0.73| 81.0 7.3|524.4 47.67
|12 |321.2 26.76|14060 1171| 7.99 0.66| 82.2 6.8|518.4 43.20
|13 |345.0 26.54|14031 1079| 7.79 0.59| 82.3 6.3|510.9 39.30
|14 |366.7 26.19|14086 1006| 7.63 0.54| 82.8 5.9|502.5 35.89
|15 |385.9 25.73|14092 939| 7.54 0.50| 83.4 5.5|498.5 33.23
|16 |408.6 25.54|14276 892| 7.53 0.47| 82.9 5.1|491.9 30.74
|20 |408.0 20.40|14317 715| 7.98 0.39| 84.5 4.2|488.9 24.44
|24 |402.6 16.77|14196 591| 8.29 0.34| 84.0 3.5|525.3 21.88
|28 |398.6 14.23|14235 508| 8.43 0.30| 83.4 2.9|501.8 17.92
|32 |389.9 12.18|14123 441| 8.53 0.26| 83.4 2.6|528.9 16.52
|48 |385.2 8.02|14202 295| 8.62 0.17| 81.0 1.6|488.4 10.17
|64 |375.1 5.86|14196 221| 8.91 0.13| 79.2 1.2|508.3 7.94

The first column is the number of reader threads; in each of the five
following columns there is first the total number of reads in all
threads in millions per second and then the same number divided by the
number of threads, which is the total number of reads per second and
thread. The results have some random variation and are very similar when
using the `clang` compiler.

One can see that both the hazard pointers in the `DataGuardian` class
and the `DataProtector` class scale well, until the number of actual
CPUs (16 vCPUs are 8 cores with hyperthreading) is reached. On such a
machine 554 million reads per second with 8 threads is a good result;
it means that every thread achieves about 70 M reads per second and thus
spends only around 14 nanoseconds on each read. This shows that in this
uncontended situation the atomic compare-and-exchange operations are
quite fast.
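
For the record, these two figures are just the 8-thread `DataProtector`
entry of the table spelled out; nothing beyond the table is assumed:

    \[
      \frac{554.7\ \text{M/s}}{8\ \text{threads}} \approx 69.3\ \text{M/s per thread},
      \qquad
      \frac{1}{69.3 \times 10^{6}\ \text{s}^{-1}} \approx 14.4\ \text{ns per read}.
    \]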

References
----------

- [Hazard pointer article][1]

  `http://www.research.ibm.com/people/m/michael/ieeetpds-2004.pdf`
- [Dr Dobbs][2]

  `http://www.drdobbs.com/lock-free-data-structures-with-hazard-po/184401890`
- [Memory orders in C++][3]

  `http://www.cplusplus.com/reference/atomic/memory_order/`
- [DataProtector implementation and test code][4]

  `https://github.com/neunhoef/DataProtector`
- [The multi-model NoSQL database ArangoDB][5]

  `https://www.arangodb.com`
- [Source code of ArangoDB on github][6]

  `https://github.com/ArangoDB/ArangoDB`
- [DataProtector in the ArangoDB source code][7]

  `https://github.com/ArangoDB/ArangoDB/blob/devel/lib/Basics/DataProtector.h`

[1]: http://www.research.ibm.com/people/m/michael/ieeetpds-2004.pdf
[2]: http://www.drdobbs.com/lock-free-data-structures-with-hazard-po/184401890
[3]: http://www.cplusplus.com/reference/atomic/memory_order/
[4]: https://github.com/neunhoef/DataProtector
[5]: https://www.arangodb.com
[6]: https://github.com/ArangoDB/ArangoDB
[7]: https://github.com/ArangoDB/ArangoDB/blob/devel/lib/Basics/DataProtector.h

--------------------------------------------------------------------------------
/DataProtector.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neunhoef/DataProtector/2cc6f89404cd92d762b73a5c05358d1cb452f89c/DataProtector.pdf
--------------------------------------------------------------------------------
/DataProtectorTest.cpp:
--------------------------------------------------------------------------------
#include "DataGuardian.h"
#include "DataProtector.h"

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#include <unistd.h>

#define T 10
#define maxN 64

using namespace std;

struct DataToBeProtected {
  DataToBeProtected(int i) : nr(i), isValid(true) {
  }
  ~DataToBeProtected() {
    isValid = false;
  }
  int nr;
  bool isValid;
};

DataToBeProtected const* unprotected = nullptr;
DataGuardian<DataToBeProtected, maxN> guardian;

atomic<DataToBeProtected*> pointerToData(nullptr);

DataProtector<64> protector;

mutex mut;

uint64_t total = 0;

atomic<uint64_t> nullptrsSeen;
atomic<uint64_t> alarmsSeen;

shared_ptr<DataToBeProtected> global_shared_ptr;
thread_local shared_ptr<DataToBeProtected> thread_local_shared_ptr;

void reader_guardian (int id) {
  uint64_t count = 0;
  time_t start = time(nullptr);
  while (time(nullptr) < start + T) {
    for (int i = 0; i < 1000; i++) {
      count++;
      DataToBeProtected const* p = guardian.lease(id);
      if (p == nullptr) {
        nullptrsSeen++;
      }
      else {
        if (! p->isValid) {
          alarmsSeen++;
        }
      }
      guardian.unlease(id);
    }
  }
  lock_guard<mutex> locker(mut);
  total += count;
}

void reader_protector (int id) {
  uint64_t count = 0;
  time_t start = time(nullptr);
  while (time(nullptr) < start + T) {
    for (int i = 0; i < 1000; i++) {
      count++;
      auto unuser(protector.use());
      DataToBeProtected const* p = pointerToData;
      if (p == nullptr) {
        nullptrsSeen++;
      }
      else {
        if (! p->isValid) {
          alarmsSeen++;
        }
      }
    }
  }
  lock_guard<mutex> locker(mut);
  total += count;
}

void reader_unprotected (int) {
  uint64_t count = 0;
  time_t start = time(nullptr);
  while (time(nullptr) < start + T) {
    for (int i = 0; i < 1000; i++) {
      count++;
      DataToBeProtected const* p = unprotected;
      if (p == nullptr) {
        nullptrsSeen++;
      }
      else {
        if (! p->isValid) {
          alarmsSeen++;
        }
      }
    }
  }
  lock_guard<mutex> locker(mut);
  total += count;
}

void reader_mutex (int) {
  uint64_t count = 0;
  time_t start = time(nullptr);
  while (time(nullptr) < start + T) {
    for (int i = 0; i < 1000; i++) {
      count++;
      lock_guard<mutex> locker(mut);
      DataToBeProtected const* p = unprotected;
      if (p == nullptr) {
        nullptrsSeen++;
      }
      else {
        if (! p->isValid) {
          alarmsSeen++;
        }
      }
    }
  }
  lock_guard<mutex> locker(mut);
  total += count;
}

void reader_shared_ptr(int) {
  uint64_t count = 0;
  time_t start = time(nullptr);
  while (time(nullptr) < start + T) {
    for (int i = 0; i < 1000; i++) {
      count++;
      atomic_thread_fence(memory_order_consume);
      if (thread_local_shared_ptr != global_shared_ptr) {
        thread_local_shared_ptr = atomic_load(&global_shared_ptr);
      }
      if (thread_local_shared_ptr == nullptr) {
        nullptrsSeen++;
      }
      else {
        if (! thread_local_shared_ptr->isValid) {
          alarmsSeen++;
        }
      }
    }
  }
  lock_guard<mutex> locker(mut);
  total += count;
}


void writer_guardian () {
  DataToBeProtected* p;
  for (int i = 0; i < T+2; i++) {
    p = new DataToBeProtected(i);
    guardian.exchange(p);
    usleep(1000000);
  }
  guardian.exchange(nullptr);
}

void writer_protector () {
  DataToBeProtected* p;
  DataToBeProtected* q;
  for (int i = 0; i < T+2; i++) {
    p = new DataToBeProtected(i);
    q = pointerToData;
    pointerToData = p;
    protector.scan();
    delete q;
    usleep(1000000);
  }
  q = pointerToData;
  pointerToData = nullptr;
  protector.scan();
  delete q;
}

void writer_unprotected () {
  DataToBeProtected* p;
  for (int i = 0; i < T+2; i++) {
    DataToBeProtected const* q = unprotected;
    p = new DataToBeProtected(i);
    unprotected = p;
    usleep(1000000);
    delete q;
  }
  delete unprotected;
  unprotected = nullptr;
}

void writer_mutex () {
  DataToBeProtected* p;
  for (int i = 0; i < T+2; i++) {
    p = new DataToBeProtected(i);
    {
      lock_guard<mutex> locker(mut);
      delete unprotected;
      unprotected = p;
    }
    usleep(1000000);
  }
  delete unprotected;
  unprotected = nullptr;
}

void writer_shared_ptr () {
  for (int i = 0; i < T+2; i++) {
    atomic_store(&global_shared_ptr, make_shared<DataToBeProtected>(i));
    usleep(1000000);
  }
}

char const* modes[] = {"guardian", "unprotected", "std::mutex", "std::shared_ptr",
                       "protector"};

int main (int argc, char* argv[]) {
  std::vector<double> totals;
  std::vector<double> perthread;
  std::vector<int> nrThreads;

  for (int mode = 0; mode < 5; mode++) {
    for (int j = 1; j < argc; j++) {
      nullptrsSeen = 0;
      alarmsSeen = 0;
      total = 0;
      int N = atoi(argv[j]);
      cout << "Mode: " << modes[mode] << endl;
      cout << "Nr of threads: " << N << endl;
      vector<thread> readerThreads;
      readerThreads.reserve(N);
      thread* writerThread;

      switch (mode) {
        case 0: writerThread = new thread(writer_guardian); break;
        case 1: writerThread = new thread(writer_unprotected); break;
        case 2: writerThread = new thread(writer_mutex); break;
        case 3: writerThread = new thread(writer_shared_ptr); break;
        case 4: writerThread = new thread(writer_protector); break;
      }

      usleep(500000);
      for (int i = 0; i < N; i++) {
        switch (mode) {
          case 0: readerThreads.emplace_back(reader_guardian, i); break;
          case 1: readerThreads.emplace_back(reader_unprotected, i); break;
          case 2: readerThreads.emplace_back(reader_mutex, i); break;
          case 3: readerThreads.emplace_back(reader_shared_ptr, i); break;
          case 4: readerThreads.emplace_back(reader_protector, i); break;
        }
      }
      writerThread->join();
      for (int i = 0; i < N; i++) {
        readerThreads[i].join();
      }
      delete writerThread;
      writerThread = nullptr;
      cout << "Total: " << total/1000000.0/T << "M/s, per thread: "
           << total/1000000.0/N/T << "M/(thread*s)" << endl;
      cout << "nullptr values seen: " << nullptrsSeen
           << ", alarms seen: " << alarmsSeen << endl << endl;
      totals.push_back(total/1000000.0/T);
      perthread.push_back(total/1000000.0/N/T);
      nrThreads.push_back(N);
    }
  }
  for (size_t i = 0; i < totals.size(); i++) {
    std::cout << i << "\t" << nrThreads[i] << "\t" << totals[i] << "\t"
              << perthread[i] << std::endl;
  }
  return 0;
}
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
all: DataProtectorTest

DataProtectorTest: DataProtectorTest.cpp DataGuardian.h Makefile DataProtector.h DataProtector.cpp
	g++ DataProtectorTest.cpp DataProtector.cpp -o DataProtectorTest -std=c++11 -Wall -O3 -g -lpthread
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Lockfree protection of data structures that are frequently read
===============================================================

by Max Neunhoeffer and Jan Steemann

In multi-threaded applications running on multi-core systems, it often
occurs that there are certain data structures which are frequently read
but relatively seldom changed. An example of this would be a database
server that has a list of databases that changes rarely, but needs to be
consulted for every single query hitting the database. In such situations
one needs to guarantee fast read access as well as protection against
inconsistencies, use-after-free and memory leaks.

Therefore we seek a lock-free protection mechanism that scales to lots
of threads on modern machines and uses only C++11 standard library
methods. The mechanism should be easy to use and easy to understand and
prove correct. This repository presents a solution, which is probably not
new, but which we have not found anywhere else.

Usage:

    make
    ./DataProtectorTest 1 2 3 4 5 6 7 8

See the file `DataProtector.md` for more details about the code in this
repository.

--------------------------------------------------------------------------------