├── Makefile
├── README.md
└── hash_table.hh


/Makefile:
--------------------------------------------------------------------------------
 1 | PREFIX ?= /usr/local
 2 | 
 3 | INCLUDE_DIR = $(PREFIX)/include/h2co3_hash_table/
 4 | 
 5 | all: hash_table.hh
 6 | 
 7 | install: all
 8 | 	mkdir -p $(INCLUDE_DIR)
 9 | 	cp hash_table.hh $(INCLUDE_DIR)
10 | 
11 | clean:
12 | 
13 | .PHONY: all install clean
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | A fast, data-oriented, stdlib-flavored hash table
 2 | =======
 3 | 
 4 | This repo contains the implementation of a fast (both for lookup and insertion) hash table, with an interface similar to that of C++ standard library collection types.
 5 | 
 6 | The algorithm is inspired by what's found in Apple's Objective-C runtime. Rather than using the classic "tombstone" or "dummy value" approach for deleted key-value pairs, a lookup simply skips over unused slots and keeps walking the probe sequence. The table stores the global maximum of all probe sequence lengths, and a lookup terminates once it has traversed that many slots without finding a match.
 7 | 
 8 | The number of slots is always a power of two (the internal storage doubles whenever it grows), so a slot index can be computed with a bit mask rather than the more expensive true integer division.
 9 | 
10 | Usage
11 | -----
12 | 
13 |     #include "hash_table.hh"
14 |     using h2co3::hash_table; // the class template lives in namespace h2co3
15 |     hash_table<KeyType, ValueType> table;
16 |     
17 |     // an optional capacity may be specified
18 |     // in the constructor. It need not be a power of 2.
19 |     hash_table<KeyType, ValueType> table_with_capacity(1000);
20 |     
21 |     // custom hash and equality comparator function objects
22 |     // may also be provided if necessary:
23 |     hash_table<KeyType, ValueType, Hash, EqCmp> customized_table;
24 |     
25 |     // As with std::unordered_map::operator[],
26 |     // our implementation of operator[] also
27 |     // value-initializes non-existent entries.
28 |     table["foo"] = "bar";
29 |     table["qux"]; // ValueType{}
30 |     
31 |     // .find() for those who <3 iterators
32 |     if (table.find("qux") == table.end()) {
33 |         std::printf("key not found\n");
34 |     }
35 |     
36 |     // and in general, there's an iterator-based API
37 |     // begin(), end(), cbegin(), cend(), etc. all work
38 |     for (auto &entry : table) {
39 |         do_stuff_with(entry.key, entry.value);
40 |     }
41 |     
42 |     auto it = table.find("some key");
43 |     if (it != table.end()) {
44 |         table.erase(it); // erase it!
45 |     }
46 |     
47 |     // But my preferred interface is get*() and set():
48 |     if (auto *valptr = table.get("foo")) {
49 |         std::printf("found it: %s\n", valptr->c_str());
50 |     }
51 |     
52 |     auto answer = table.get_or("answer", "42");
53 |     table.remove("answer");
54 |     
55 |     // Of course, size() and empty() work too!
56 |     std::size_t num_keys = table.size();
57 |     bool is_empty = table.empty();
58 |     
59 |     // For debugging purposes only: load factor!
60 |     double lf = table.load_factor();
61 | 
62 | License
63 | -------
64 | 2-clause BSD. [if this is a problem for you, ping me and we'll find a solution. I'm no lawyer `:-)`]
65 | 
66 | TODO
67 | ----
68 | * include some benchmarks (my experience shows that this is generally ≥5 times faster than `std::unordered_map`, depending on the exact use case.)
69 | * tests
70 | * maybe more/better usage examples?
71 | * unicorns!
72 | 


--------------------------------------------------------------------------------
/hash_table.hh:
--------------------------------------------------------------------------------
  1 | //
  2 | // hash_table.hh
  3 | //
  4 | // Created by Arpad Goretity (H2CO3)
  5 | // on 02/06/2015
  6 | //
  7 | // Licensed under the 2-clause BSD License
  8 | //
  9 | 
 10 | #ifndef H2CO3_HASH_TABLE_HH
 11 | #define H2CO3_HASH_TABLE_HH
 12 | 
 13 | #include <vector>
 14 | #include <functional>
 15 | #include <type_traits>
 16 | #include <cstdlib>
 17 | #include <cassert>
 18 | #include <cstdio>
 19 | 
 20 | 
 21 | namespace h2co3 {
 22 | 
 23 | template<
 24 | 	typename Key,
 25 | 	typename Value,
 26 | 	typename Hash = std::hash<Key>,
 27 | 	typename Equal = std::equal_to<Key>
 28 | >
 29 | struct hash_table {
 30 | public:
 31 | 
 32 | 	// "pointed-to" type of iterators; a Key + Value pair
 33 | 	struct KeyValue {
 34 | 		Key key;     // must not be modified by the user! (const omitted because reasons)
 35 | 		Value value;
 36 | 	};
 37 | 
 38 | 	// stlib-traits-friendly typedefs
 39 | 	using key_type        = Key;
 40 | 	using mapped_type     = Value;
 41 | 	using value_type      = KeyValue;
 42 | 	using size_type       = std::size_t;
 43 | 	using difference_type = std::ptrdiff_t;
 44 | 	using hasher          = Hash;
 45 | 	using key_equal       = Equal;
 46 | 	using reference       = value_type &;
 47 | 	using const_reference = const value_type &;
 48 | 	using pointer         = value_type *;
 49 | 	using const_pointer   = const value_type *;
 50 | 
 51 | private:
 52 | 
 53 | 	struct Slot {
 54 | 		KeyValue kv;
 55 | 		bool used;
 56 | 
 57 | 		Slot() noexcept: kv {}, used { false } {}
 58 | 
 59 | 		Slot(Key key, Value value) :
 60 | 			kv { std::move(key), std::move(value) },
 61 | 			used { true }
 62 | 			{}
 63 | 
 64 | 		Slot(const Slot &other) = default;
 65 | 
 66 | 		Slot(Slot &&other) noexcept:
 67 | 			kv { std::move(other.kv) },
 68 | 			used { other.used }
 69 | 		{
 70 | 			other.used = false;
 71 | 		}
 72 | 
 73 | 		friend void swap(Slot &lhs, Slot &rhs) noexcept {
 74 | 			using std::swap;
 75 | 			swap(lhs.kv,   rhs.kv);
 76 | 			swap(lhs.used, rhs.used);
 77 | 		}
 78 | 
 79 | 		Slot &operator=(Slot other) {
 80 | 			swap(*this, other);
 81 | 			return *this;
 82 | 		}
 83 | 
 84 | 		// intentionally not operator==
 85 | 		bool equals(const Key &key) const {
 86 | 			assert(used);
 87 | 			return Equal{}(kv.key, key);
 88 | 		}
 89 | 	};
 90 | 
 91 | 	std::vector<Slot> slots;
 92 | 	std::size_t count;
 93 | 	std::size_t max_hash_offset;
 94 | 
 95 | 	// the sole purpose of this function is that we can
 96 | 	// explicitly call const member functions on 'this'.
 97 | 	auto cthis() const { return this; }
 98 | 
 99 | 	std::size_t key_index(const Key &key) const {
100 | 		return Hash{}(key) & mask();
101 | 	}
102 | 
103 | 	std::size_t mask() const {
104 | 		assert(
105 | 			slots.size() && !(slots.size() & (slots.size() - 1)) &&
106 | 			"table size must be a power of two"
107 | 		);
108 | 
109 | 		return slots.size() - 1;
110 | 	}
111 | 
112 | 	bool should_rehash() const {
113 | 		// keep load factor below 0.75
114 | 		// this ratio is chosen carefully so that it can be optimized well:
115 | 		// it is equivalent with ((size << 1) + size) >> 2.
116 | 		return slots.empty() || count >= slots.size() * 3 / 4;
117 | 	}
118 | 
119 | 	const Slot *get_slot(const Key &key) const {
120 | 		// do not try to modulo by 0. An empty table has no values.
121 | 		if (slots.empty()) {
122 | 			return nullptr;
123 | 		}
124 | 
125 | 		std::size_t i = key_index(key);
126 | 		std::size_t hash_offset = 0;
127 | 
128 | 		// linear probing using a cached maximal probe sequence length.
129 | 		// This avoids the need to mark deleted slots as special and
130 | 		// fixes the performance problem whereby searching for a key after
131 | 		// having performed lots of deletions results in O(n) running time.
132 | 		// (max_hash_offset is one less than the length of the longest sequence.)
133 | 		do {
134 | 			if (slots[i].used && slots[i].equals(key)) {
135 | 				return &slots[i];
136 | 			}
137 | 
138 | 			i = (i + 1) & mask();
139 | 			hash_offset++;
140 | 		} while (hash_offset <= max_hash_offset);
141 | 
142 | 		return nullptr;
143 | 	}
144 | 
145 | 	Slot *get_slot(const Key &key) {
146 | 		return const_cast<Slot *>(cthis()->get_slot(key));
147 | 	}
148 | 
149 | 	KeyValue *insert_nonexistent_norehash(Key key, Value value) {
150 | 		assert(should_rehash() == false);
151 | 		assert(size() < slots.size()); // requires empty slots
152 | 		assert(cthis()->get_slot(key) == nullptr);
153 | 
154 | 		std::size_t i = key_index(key);
155 | 		std::size_t hash_offset = 0;
156 | 
157 | 		// first, find an empty (unused) slot
158 | 		while (slots[i].used) {
159 | 			i = (i + 1) & mask();
160 | 			hash_offset++;
161 | 		}
162 | 
163 | 		// then, perform the actual insertion.
164 | 		// this also marks the slot as used.
165 | 		slots[i] = { std::move(key), std::move(value) };
166 | 		assert(slots[i].used);
167 | 
168 | 		// unconditionally increment the size because
169 | 		// we know that the key didn't exist before.
170 | 		count++;
171 | 
172 | 		// finally, update maximal length of probe sequences (minus one)
173 | 		if (hash_offset > max_hash_offset) {
174 | 			max_hash_offset = hash_offset;
175 | 		}
176 | 
177 | 		return &slots[i].kv;
178 | 	}
179 | 
180 | 	void rehash() {
181 | 		// compute new size. Must be a power of two.
182 | 		const std::size_t new_size = slots.empty() ? 8 : slots.size() * 2;
183 | 
184 | 		// move original slot array out of *this and reset internal state
185 | 		auto old_slots = std::move(slots);
186 | 
187 | 		// language lawyer: move() need not clear std::vector.
188 | 		// this->clear() takes care of that, however
189 | 		// (as well as zeroing out count and max_hash_offset.)
190 | 		clear();
191 | 
192 | 		// make room for new slots (need to default-construct
193 | 		// in order for them to be in an 'unused'/free state)
194 | 		slots.resize(new_size);
195 | 
196 | 		// re-insert each key-value pair
197 | 		for (auto &slot : old_slots) {
198 | 			if (slot.used) {
199 | 				insert_nonexistent_norehash(std::move(slot.kv.key), std::move(slot.kv.value));
200 | 			}
201 | 		}
202 | 	}
203 | 
204 | public:
205 | 
206 | 	//////////////////
207 | 	// Constructors //
208 | 	//////////////////
209 | 	hash_table() noexcept: slots {}, count { 0 }, max_hash_offset { 0 } {}
210 | 
211 | 	hash_table(std::size_t capacity) noexcept: hash_table() {
212 | 		// Make sure the real capacity is a power of two >= 8.
213 | 		// We should also keep in mind that the number of elements
214 | 		// is at most 3/4 of the number of slots!
215 | 		std::size_t min_num_slots = (capacity * 4 + 2) / 3; // round up
216 | 		std::size_t real_cap = 8;
217 | 
218 | 		while (real_cap < min_num_slots) {
219 | 			real_cap *= 2;
220 | 		}
221 | 
222 | 		slots.resize(real_cap);
223 | 	}
224 | 
225 | 	hash_table(const hash_table &) = default;
226 | 
227 | 	hash_table(hash_table &&other) noexcept:
228 | 		slots { std::move(other.slots) },
229 | 		count { other.count },
230 | 		max_hash_offset { other.max_hash_offset }
231 | 	{
232 | 		other.clear();
233 | 	}
234 | 
235 | 	// naive implementation, may be improved. not sure if worth the effort.
236 | 	hash_table(std::initializer_list<KeyValue> elems) : hash_table(elems.size()) {
237 | 		for (auto &elem : elems) {
238 | 			// cannot move from an initializer_list
239 | 			set(elem.key, elem.value);
240 | 		}
241 | 	}
242 | 
243 | 	/////////////////////////
244 | 	// Resource management //
245 | 	/////////////////////////
246 | 
247 | 	friend void swap(hash_table &lhs, hash_table &rhs) noexcept {
248 | 		using std::swap;
249 | 		swap(lhs.slots, rhs.slots);
250 | 		swap(lhs.count, rhs.count);
251 | 		swap(lhs.max_hash_offset, rhs.max_hash_offset);
252 | 	}
253 | 
254 | 	hash_table &operator=(hash_table other) {
255 | 		swap(*this, other);
256 | 		return *this;
257 | 	}
258 | 
259 | 	void clear() noexcept {
260 | 		slots.clear();
261 | 		count = 0;
262 | 		max_hash_offset = 0;
263 | 	}
264 | 
265 | 	///////////////////////////////////////////////////////////////
266 | 	// Actual hash table operations: Get, Insert/Replace, Delete //
267 | 	///////////////////////////////////////////////////////////////
268 | 
269 | 	const Value *get(const Key &key) const {
270 | 		if (const Slot *slot = get_slot(key)) {
271 | 			return &slot->kv.value;
272 | 		}
273 | 		return nullptr;
274 | 	}
275 | 
276 | 	Value *get(const Key &key) {
277 | 		if (Slot *slot = get_slot(key)) {
278 | 			return &slot->kv.value;
279 | 		}
280 | 		return nullptr;
281 | 	}
282 | 
283 | 	const Value &get_or(const Key &key, const Value &defaultValue) const {
284 | 		if (Slot *slot = get_slot(key)) {
285 | 			return slot->kv.value;
286 | 		}
287 | 		return defaultValue;
288 | 	}
289 | 
290 | 	Value get_or(const Key &key, Value &&defaultValue) const {
291 | 		if (const Slot *slot = get_slot(key)) {
292 | 			return slot->kv.value;
293 | 		}
294 | 		return std::move(defaultValue);
295 | 	}
296 | 
297 | 	Value &get_or(const Key &key, Value &defaultValue) {
298 | 		if (Slot *slot = get_slot(key)) {
299 | 			return slot->kv.value;
300 | 		}
301 | 		return defaultValue;
302 | 	}
303 | 
304 | 	Value *set(const Key &key, Value value) {
305 | 		// if the key is already in the table, just replace it and move on
306 | 		if (Value *candidate = get(key)) {
307 | 			*candidate = std::move(value);
308 | 			return candidate;
309 | 		}
310 | 
311 | 		// else we need to insert it. First, check if we need to expand.
312 | 		if (should_rehash()) {
313 | 			rehash();
314 | 		}
315 | 
316 | 		// then we actually insert the key.
317 | 		auto kv = insert_nonexistent_norehash(key, std::move(value));
318 | 		return &kv->value;
319 | 	}
320 | 
321 | 	Value *set(Key &&key, Value value) {
322 | 		// if the key is already in the table, just replace it and move on
323 | 		if (Value *candidate = get(key)) {
324 | 			*candidate = std::move(value);
325 | 			return candidate;
326 | 		}
327 | 
328 | 		// else we need to insert it. First, check if we need to expand.
329 | 		if (should_rehash()) {
330 | 			rehash();
331 | 		}
332 | 
333 | 		// then we actually insert the key.
334 | 		auto kv = insert_nonexistent_norehash(std::move(key), std::move(value));
335 | 		return &kv->value;
336 | 	}
337 | 
338 | 	void remove(const Key &key) {
339 | 		if (Slot *slot = get_slot(key)) {
340 | 			// destroy key and value (we don't want to surprise users of RAII)
341 | 			// This also marks the slot as unused.
342 | 			*slot = {};
343 | 			assert(slot->used == false);
344 | 
345 | 			// removing an existing key means we need to decrease the table size.
346 | 			count--;
347 | 		}
348 | 	}
349 | 
350 | 	std::size_t size() const {
351 | 		return count;
352 | 	}
353 | 
354 | 	bool empty() const {
355 | 		return size() == 0;
356 | 	}
357 | 
358 | 	double load_factor() const {
359 | 		return double(size()) / slots.size();
360 | 	}
361 | 
362 | 	// Default-constructing indexing operators
363 | 	Value &operator[](const Key &key) {
364 | 		// if the value already exists, return a reference to it
365 | 		if (Value *value = get(key)) {
366 | 			return *value;
367 | 		}
368 | 
369 | 		// if it doesn't, then default-construct and insert it,
370 | 		// then return a reference to the newly-added value.
371 | 		return *set(key, {});
372 | 	}
373 | 
374 | 	Value &operator[](Key &&key) {
375 | 		// if the value already exists, return a reference to it
376 | 		if (Value *value = get(key)) {
377 | 			return *value;
378 | 		}
379 | 
380 | 		// if it doesn't, then default-construct and insert it,
381 | 		// then return a reference to the newly-added value.
382 | 		return *set(std::move(key), {});
383 | 	}
384 | 
385 | 	const Value &operator[](const Key &key) const {
386 | 		if (const Value *value = get(key)) {
387 | 			return *value;
388 | 		}
389 | 
390 | 		std::fprintf(stderr, "hash_table::operator[] failed: key does not exist\n");
391 | 		std::fflush(stderr);
392 | 		abort();
393 | 	}
394 | 
395 | 	//////////////////
396 | 	// Iterator API //
397 | 	//////////////////
398 | 
399 | 	struct const_iterator {
400 | 	protected:
401 | 		friend struct hash_table;
402 | 
403 | 		const hash_table *owner;
404 | 		std::size_t slot_index;
405 | 
406 | 		const_iterator(const hash_table *p_owner, std::size_t p_slot_index) :
407 | 			owner(p_owner),
408 | 			slot_index(p_slot_index)
409 | 		{}
410 | 
411 | 	public:
412 | 
413 | 		const_iterator(const const_iterator &other) = default;
414 | 
415 | 		const KeyValue *operator->() const {
416 | 			assert(slot_index < owner->slots.size() && "cannot dereference end iterator");
417 | 			return &owner->slots[slot_index].kv;
418 | 		}
419 | 
420 | 		const KeyValue &operator*() const {
421 | 			return *operator->();
422 | 		}
423 | 
424 | 		const_iterator &operator++() {
425 | 			assert(slot_index < owner->slots.size() && "cannot increment end iterator");
426 | 			do {
427 | 				slot_index++;
428 | 			} while (slot_index < owner->slots.size() && not owner->slots[slot_index].used);
429 | 			return *this;
430 | 		}
431 | 
432 | 		const_iterator operator++(int) {
433 | 			auto prev(*this);
434 | 			++*this;
435 | 			return prev;
436 | 		}
437 | 
438 | 		bool operator==(const const_iterator &other) const {
439 | 			return owner == other.owner && slot_index == other.slot_index;
440 | 		}
441 | 
442 | 		bool operator!=(const const_iterator &other) const {
443 | 			return !operator==(other);
444 | 		}
445 | 	};
446 | 
447 | 	struct iterator : public const_iterator {
448 | 	private:
449 | 		friend struct hash_table;
450 | 
451 | 		iterator(const const_iterator &other) : const_iterator(other) {}
452 | 
453 | 	public:
454 | 		iterator(const iterator &other) : const_iterator(other) {}
455 | 
456 | 		KeyValue &operator*() const {
457 | 			return *operator->();
458 | 		}
459 | 
460 | 		KeyValue *operator->() const {
461 | 			return const_cast<KeyValue *>(
462 | 				static_cast<const const_iterator *>(this)->operator->()
463 | 			);
464 | 		}
465 | 
466 | 		iterator &operator++() {
467 | 			assert(this->slot_index < this->owner->slots.size() && "cannot increment end iterator");
468 | 			do {
469 | 				this->slot_index++;
470 | 			} while (this->slot_index < this->owner->slots.size() && not this->owner->slots[this->slot_index].used);
471 | 			return *this;
472 | 		}
473 | 
474 | 		iterator operator++(int) {
475 | 			auto prev(*this);
476 | 			++*this;
477 | 			return prev;
478 | 		}
479 | 	};
480 | 
481 | 	const_iterator begin() const {
482 | 		auto it = const_iterator(this, 0);
483 | 		while (it.slot_index < slots.size() && not slots[it.slot_index].used) {
484 | 			it.slot_index++;
485 | 		}
486 | 		return it;
487 | 	}
488 | 
489 | 	const_iterator end() const {
490 | 		return const_iterator(this, slots.size());
491 | 	}
492 | 
493 | 	iterator begin() {
494 | 		return iterator(cthis()->begin());
495 | 	}
496 | 
497 | 	iterator end() {
498 | 		return iterator(cthis()->end());
499 | 	}
500 | 
501 | 	const_iterator cbegin() const {
502 | 		return begin();
503 | 	}
504 | 
505 | 	const_iterator cend() const {
506 | 		return end();
507 | 	}
508 | 
509 | 	const_iterator find(const Key &key) const {
510 | 		if (const Slot *slot = get_slot(key)) {
511 | 			return const_iterator(this, slot - slots.data());
512 | 		}
513 | 		return end();
514 | 	}
515 | 
516 | 	iterator find(const Key &key) {
517 | 		return iterator(cthis()->find(key));
518 | 	}
519 | 
520 | 	void erase(const const_iterator &it) {
521 | 		assert(it.owner == this && "cannot erase an element of another instance");
522 | 		remove(it->key);
523 | 	}
524 | };
525 | 
526 | }
527 | 
528 | #endif // H2CO3_HASH_TABLE_HH
529 | 


--------------------------------------------------------------------------------