├── Makefile ├── README.md └── hash_table.hh /Makefile: -------------------------------------------------------------------------------- 1 | PREFIX ?= /usr/local 2 | 3 | INCLUDE_DIR = $(PREFIX)/include/h2co3_hash_table/ 4 | 5 | all: hash_table.hh 6 | 7 | install: all 8 | mkdir -p $(INCLUDE_DIR) 9 | cp hash_table.hh $(INCLUDE_DIR) 10 | 11 | clean: 12 | 13 | .PHONY: all install clean 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A fast, data-oriented, stdlib-flavored hash table 2 | ======= 3 | 4 | This repo contains the implementation of a fast (both for lookup and insertion) hash table, with an interface similar to that of C++ standard library collection types. 5 | 6 | The algorithm is inspired by what's found in Apple's Objective-C runtime: rather than using the classic "tombstone" or "dummy value" approach for deleted key-value pairs, this variant does not stop probing at deleted (or otherwise empty) slots. Instead, the global maximum of collision sequence lengths is stored, and lookup terminates once a sequence of this length has been traversed without finding a match. 7 | 8 | The table also expands its internal storage in chunks of powers of two, so bit twiddling can be used for modular arithmetic rather than the more expensive true integer division. 9 | 10 | Usage 11 | ----- 12 | 13 | #include "hash_table.hh" 14 | 15 | h2co3::hash_table<std::string, std::string> table; 16 | 17 | // an optional capacity may be specified 18 | // in the constructor. It need not be a power of 2. 19 | h2co3::hash_table<std::string, std::string> table_with_capacity(1000); 20 | 21 | // custom hash and equality comparator function objects 22 | // may also be provided if necessary: 23 | h2co3::hash_table<std::string, std::string, MyHash, MyEqual> customized_table; 24 | 25 | // As with std::unordered_map::operator[], 26 | // our implementation of operator[] also 27 | // value-initializes non-existent entries. 
28 | table["foo"] = "bar"; 29 | table["qux"]; // ValueType{} 30 | 31 | // .find() for those who <3 iterators 32 | if (table.find("qux") == table.end()) { 33 | std::printf("key not found\n"); 34 | } 35 | 36 | // and in general, there's an iterator-based API 37 | // begin(), end(), cbegin(), cend(), etc. all work 38 | for (auto &entry : table) { 39 | do_stuff_with(entry.key, entry.value); 40 | } 41 | 42 | auto it = table.find("some key"); 43 | if (it != table.end()) { 44 | table.erase(it); // erase it! 45 | } 46 | 47 | // But my preferred interface is get*() and set(): 48 | if (auto *valptr = table.get("foo")) { 49 | std::printf("found it: %s\n", valptr->c_str()); 50 | } 51 | 52 | auto answer = table.get_or("answer", "42"); 53 | table.remove("answer"); 54 | 55 | // Of course, size() and empty() work too! 56 | std::size_t num_keys = table.size(); 57 | bool is_empty = table.empty(); 58 | 59 | // For debugging purposes only: load factor! 60 | double lf = table.load_factor(); 61 | 62 | License 63 | ------- 64 | 2-clause BSD. [if this is a problem for you, ping me and we'll find a solution. I'm no lawyer `:-)`] 65 | 66 | TODO 67 | ---- 68 | * include some benchmarks (my experience shows that this is generally ≥5 times faster than `std::unordered_map`, depending on the exact use case.) 69 | * tests 70 | * maybe more/better usage examples? 71 | * unicorns! 
//
// hash_table.hh
//
// Created by Arpad Goretity (H2CO3)
// on 02/06/2015
//
// Licensed under the 2-clause BSD License
//

#ifndef H2CO3_HASH_TABLE_HH
#define H2CO3_HASH_TABLE_HH

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <initializer_list>
#include <utility>
#include <vector>


namespace h2co3 {

// A hash table using open addressing with linear probing.
//
// Instead of tombstones, the table remembers the longest probe distance
// that any insertion ever needed ('max_hash_offset'). A lookup inspects
// at most that many slots after the key's home slot, skipping unused
// slots along the way, so removed entries can simply be reset to the
// unused state.
//
// The number of slots is always zero or a power of two, so the home slot
// of a key is computed with a bit mask rather than a division.
//
// Requirements visible from the implementation: Key and Value must be
// default-constructible (unused slots hold value-initialized pairs) and
// movable; Hash and Equal must be default-constructible function objects.
template<
    typename Key,
    typename Value,
    typename Hash = std::hash<Key>,
    typename Equal = std::equal_to<Key>
>
struct hash_table {
public:

    // "pointed-to" type of iterators; a Key + Value pair
    struct KeyValue {
        Key key;     // must not be modified by the user! (const omitted because reasons)
        Value value;
    };

    // stdlib-traits-friendly typedefs
    using key_type        = Key;
    using mapped_type     = Value;
    using value_type      = KeyValue;
    using size_type       = std::size_t;
    using difference_type = std::ptrdiff_t;
    using hasher          = Hash;
    using key_equal       = Equal;
    using reference       = value_type &;
    using const_reference = const value_type &;
    using pointer         = value_type *;
    using const_pointer   = const value_type *;

private:

    // One cell of the slot array: a key-value pair plus an "in use" flag.
    struct Slot {
        KeyValue kv;
        bool used;

        // An unused slot holding a value-initialized pair.
        Slot() noexcept: kv {}, used { false } {}

        // A used slot owning the given key and value.
        Slot(Key key, Value value) :
            kv { std::move(key), std::move(value) },
            used { true }
        {}

        Slot(const Slot &other) = default;

        // Moving out of a slot leaves the source marked as unused.
        Slot(Slot &&other) noexcept:
            kv(std::move(other.kv)),
            used { other.used }
        {
            other.used = false;
        }

        friend void swap(Slot &lhs, Slot &rhs) noexcept {
            using std::swap;
            swap(lhs.kv, rhs.kv);
            swap(lhs.used, rhs.used);
        }

        // Copy-and-swap: the previous contents are destroyed together
        // with the by-value parameter when the assignment returns.
        Slot &operator=(Slot other) {
            swap(*this, other);
            return *this;
        }

        // intentionally not operator==
        // Compares the stored key with 'key'; only valid on used slots.
        bool equals(const Key &key) const {
            assert(used);
            return Equal{}(kv.key, key);
        }
    };

    std::vector<Slot> slots;      // size is 0 or a power of two
    std::size_t count;            // number of used slots
    std::size_t max_hash_offset;  // longest probe distance ever needed by an insertion

    // the sole purpose of this function is that we can
    // explicitly call const member functions on 'this'.
    const hash_table *cthis() const { return this; }

    // Home slot of 'key'. Requires a non-empty slot array.
    std::size_t key_index(const Key &key) const {
        return Hash{}(key) & mask();
    }

    // Bit mask that reduces a hash to a valid slot index.
    std::size_t mask() const {
        assert(
            slots.size() && !(slots.size() & (slots.size() - 1)) &&
            "table size must be a power of two"
        );

        return slots.size() - 1;
    }

    // True when one more insertion requires growing the slot array first.
    bool should_rehash() const {
        // keep load factor below 0.75
        // this ratio is chosen carefully so that it can be optimized well:
        // it is equivalent with ((size << 1) + size) >> 2.
        return slots.empty() || count >= slots.size() * 3 / 4;
    }

    // Returns the slot holding 'key', or nullptr if the key is absent.
    const Slot *get_slot(const Key &key) const {
        // do not try to modulo by 0. An empty table has no values.
        if (slots.empty()) {
            return nullptr;
        }

        std::size_t i = key_index(key);
        std::size_t hash_offset = 0;

        // linear probing using a cached maximal probe sequence length.
        // This avoids the need to mark deleted slots as special: unused
        // slots are skipped, and the search gives up after inspecting
        // max_hash_offset + 1 slots.
        // (max_hash_offset is one less than the length of the longest sequence.)
        do {
            if (slots[i].used && slots[i].equals(key)) {
                return &slots[i];
            }

            i = (i + 1) & mask();
            hash_offset++;
        } while (hash_offset <= max_hash_offset);

        return nullptr;
    }

    Slot *get_slot(const Key &key) {
        // safe: *this is non-const here, so the slot is really mutable
        return const_cast<Slot *>(cthis()->get_slot(key));
    }

    // Inserts a key that is known to be absent into a table that is known
    // to have room for it. Returns a pointer to the stored pair.
    KeyValue *insert_nonexistent_norehash(Key key, Value value) {
        assert(should_rehash() == false);
        assert(size() < slots.size()); // requires empty slots
        assert(cthis()->get_slot(key) == nullptr);

        std::size_t i = key_index(key);
        std::size_t hash_offset = 0;

        // first, find an empty (unused) slot
        while (slots[i].used) {
            i = (i + 1) & mask();
            hash_offset++;
        }

        // then, perform the actual insertion.
        // this also marks the slot as used.
        slots[i] = { std::move(key), std::move(value) };
        assert(slots[i].used);

        // unconditionally increment the size because
        // we know that the key didn't exist before.
        count++;

        // finally, update maximal length of probe sequences (minus one)
        if (hash_offset > max_hash_offset) {
            max_hash_offset = hash_offset;
        }

        return &slots[i].kv;
    }

    // Doubles the slot array (or creates the initial 8 slots) and
    // re-inserts every existing pair.
    void rehash() {
        // compute new size. Must be a power of two.
        const std::size_t new_size = slots.empty() ? 8 : slots.size() * 2;

        // move original slot array out of *this and reset internal state
        auto old_slots = std::move(slots);

        // language lawyer: move() need not clear std::vector.
        // this->clear() takes care of that, however
        // (as well as zeroing out count and max_hash_offset.)
        clear();

        // make room for new slots (need to default-construct
        // in order for them to be in an 'unused'/free state)
        slots.resize(new_size);

        // re-insert each key-value pair
        for (auto &slot : old_slots) {
            if (slot.used) {
                insert_nonexistent_norehash(std::move(slot.kv.key), std::move(slot.kv.value));
            }
        }
    }

public:

    //////////////////
    // Constructors //
    //////////////////

    // Creates an empty table without allocating any slots.
    hash_table() noexcept: slots {}, count { 0 }, max_hash_offset { 0 } {}

    // Creates an empty table with enough slots to hold 'capacity' elements
    // without rehashing. 'capacity' need not be a power of two.
    // Not noexcept: allocating the slot array may throw.
    hash_table(std::size_t capacity): hash_table() {
        // Make sure the real capacity is a power of two >= 8.
        // We should also keep in mind that the number of elements
        // is at most 3/4 of the number of slots!
        std::size_t min_num_slots = (capacity * 4 + 2) / 3; // round up
        std::size_t real_cap = 8;

        while (real_cap < min_num_slots) {
            real_cap *= 2;
        }

        slots.resize(real_cap);
    }

    hash_table(const hash_table &) = default;

    // Steals the contents of 'other' and leaves it empty.
    hash_table(hash_table &&other) noexcept:
        slots { std::move(other.slots) },
        count { other.count },
        max_hash_offset { other.max_hash_offset }
    {
        other.clear();
    }

    // naive implementation, may be improved. not sure if worth the effort.
    // If a key occurs more than once, the last value wins.
    hash_table(std::initializer_list<KeyValue> elems) : hash_table(elems.size()) {
        for (auto &elem : elems) {
            // cannot move from an initializer_list
            set(elem.key, elem.value);
        }
    }

    /////////////////////////
    // Resource management //
    /////////////////////////

    friend void swap(hash_table &lhs, hash_table &rhs) noexcept {
        using std::swap;
        swap(lhs.slots, rhs.slots);
        swap(lhs.count, rhs.count);
        swap(lhs.max_hash_offset, rhs.max_hash_offset);
    }

    // Copy-and-swap assignment; handles both copy and move assignment.
    hash_table &operator=(hash_table other) {
        swap(*this, other);
        return *this;
    }

    // Removes all elements and all slots.
    void clear() noexcept {
        slots.clear();
        count = 0;
        max_hash_offset = 0;
    }

    ///////////////////////////////////////////////////////////////
    // Actual hash table operations: Get, Insert/Replace, Delete //
    ///////////////////////////////////////////////////////////////

    // Returns a pointer to the value stored under 'key', or nullptr.
    const Value *get(const Key &key) const {
        if (const Slot *slot = get_slot(key)) {
            return &slot->kv.value;
        }
        return nullptr;
    }

    Value *get(const Key &key) {
        if (Slot *slot = get_slot(key)) {
            return &slot->kv.value;
        }
        return nullptr;
    }

    // Returns the value stored under 'key', or 'defaultValue' if absent.
    // The returned reference may refer to 'defaultValue' itself.
    const Value &get_or(const Key &key, const Value &defaultValue) const {
        // the const overload of get_slot() returns a pointer to const
        if (const Slot *slot = get_slot(key)) {
            return slot->kv.value;
        }
        return defaultValue;
    }

    // Overload for temporary defaults: returns by value so that no
    // reference to the temporary escapes.
    Value get_or(const Key &key, Value &&defaultValue) const {
        if (const Slot *slot = get_slot(key)) {
            return slot->kv.value;
        }
        return std::move(defaultValue);
    }

    Value &get_or(const Key &key, Value &defaultValue) {
        if (Slot *slot = get_slot(key)) {
            return slot->kv.value;
        }
        return defaultValue;
    }

    // Inserts 'value' under 'key', replacing any existing value.
    // Returns a pointer to the stored value.
    Value *set(const Key &key, Value value) {
        // if the key is already in the table, just replace it and move on
        if (Value *candidate = get(key)) {
            *candidate = std::move(value);
            return candidate;
        }

        // else we need to insert it. First, check if we need to expand.
        if (should_rehash()) {
            rehash();
        }

        // then we actually insert the key.
        auto kv = insert_nonexistent_norehash(key, std::move(value));
        return &kv->value;
    }

    // Same as above, but moves the key into the table when inserting.
    Value *set(Key &&key, Value value) {
        // if the key is already in the table, just replace it and move on
        if (Value *candidate = get(key)) {
            *candidate = std::move(value);
            return candidate;
        }

        // else we need to insert it. First, check if we need to expand.
        if (should_rehash()) {
            rehash();
        }

        // then we actually insert the key.
        auto kv = insert_nonexistent_norehash(std::move(key), std::move(value));
        return &kv->value;
    }

    // Removes 'key' and its value; does nothing if the key is absent.
    void remove(const Key &key) {
        if (Slot *slot = get_slot(key)) {
            // destroy key and value (we don't want to surprise users of RAII)
            // This also marks the slot as unused.
            *slot = {};
            assert(slot->used == false);

            // removing an existing key means we need to decrease the table size.
            count--;
        }
    }

    // Number of key-value pairs currently stored.
    std::size_t size() const {
        return count;
    }

    bool empty() const {
        return size() == 0;
    }

    // Ratio of used slots to all slots; 0 for a table without slots
    // (avoids computing 0 / 0).
    double load_factor() const {
        if (slots.empty()) {
            return 0.0;
        }
        return double(size()) / slots.size();
    }

    // Default-constructing indexing operators
    Value &operator[](const Key &key) {
        // if the value already exists, return a reference to it
        if (Value *value = get(key)) {
            return *value;
        }

        // if it doesn't, then value-initialize and insert it,
        // then return a reference to the newly-added value.
        return *set(key, Value{});
    }

    Value &operator[](Key &&key) {
        // if the value already exists, return a reference to it
        if (Value *value = get(key)) {
            return *value;
        }

        // if it doesn't, then value-initialize and insert it,
        // then return a reference to the newly-added value.
        return *set(std::move(key), Value{});
    }

    // A const table cannot insert, so a missing key is a fatal error:
    // a message is printed to stderr and the program is aborted.
    const Value &operator[](const Key &key) const {
        if (const Value *value = get(key)) {
            return *value;
        }

        std::fprintf(stderr, "hash_table::operator[] failed: key does not exist\n");
        std::fflush(stderr);
        std::abort();
    }

    //////////////////
    // Iterator API //
    //////////////////

    // Read-only iterator: an (owner, slot index) pair that only ever
    // rests on used slots or on the end position (index == slot count).
    struct const_iterator {
    protected:
        friend struct hash_table;

        const hash_table *owner;
        std::size_t slot_index;

        const_iterator(const hash_table *p_owner, std::size_t p_slot_index) :
            owner(p_owner),
            slot_index(p_slot_index)
        {}

        // Advances slot_index until it rests on a used slot or on the end.
        void skip_unused() {
            while (slot_index < owner->slots.size() && !owner->slots[slot_index].used) {
                slot_index++;
            }
        }

    public:

        const KeyValue *operator->() const {
            assert(slot_index < owner->slots.size() && "cannot dereference end iterator");
            return &owner->slots[slot_index].kv;
        }

        const KeyValue &operator*() const {
            return *operator->();
        }

        const_iterator &operator++() {
            assert(slot_index < owner->slots.size() && "cannot increment end iterator");
            slot_index++;
            skip_unused();
            return *this;
        }

        const_iterator operator++(int) {
            auto prev(*this);
            ++*this;
            return prev;
        }

        bool operator==(const const_iterator &other) const {
            return owner == other.owner && slot_index == other.slot_index;
        }

        bool operator!=(const const_iterator &other) const {
            return !operator==(other);
        }
    };

    // Mutable iterator; only obtainable from a non-const table.
    struct iterator : public const_iterator {
    private:
        friend struct hash_table;

        iterator(const const_iterator &other) : const_iterator(other) {}

    public:

        KeyValue &operator*() const {
            return *operator->();
        }

        KeyValue *operator->() const {
            // safe: iterators of this type are only created by the
            // non-const begin()/end()/find() overloads.
            return const_cast<KeyValue *>(
                static_cast<const const_iterator *>(this)->operator->()
            );
        }

        iterator &operator++() {
            assert(this->slot_index < this->owner->slots.size() && "cannot increment end iterator");
            this->slot_index++;
            this->skip_unused();
            return *this;
        }

        iterator operator++(int) {
            auto prev(*this);
            ++*this;
            return prev;
        }
    };

    const_iterator begin() const {
        const_iterator it(this, 0);
        it.skip_unused();
        return it;
    }

    const_iterator end() const {
        return const_iterator(this, slots.size());
    }

    iterator begin() {
        return iterator(cthis()->begin());
    }

    iterator end() {
        return iterator(cthis()->end());
    }

    const_iterator cbegin() const {
        return begin();
    }

    const_iterator cend() const {
        return end();
    }

    // Returns an iterator to the pair stored under 'key', or end().
    const_iterator find(const Key &key) const {
        if (const Slot *slot = get_slot(key)) {
            return const_iterator(this, slot - slots.data());
        }
        return end();
    }

    iterator find(const Key &key) {
        return iterator(cthis()->find(key));
    }

    // Removes the element 'it' points to. 'it' must be a dereferenceable
    // iterator into this table.
    void erase(const const_iterator &it) {
        assert(it.owner == this && "cannot erase an element of another instance");
        remove(it->key);
    }
};

}

#endif // H2CO3_HASH_TABLE_HH