├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── COPYING ├── LICENSE ├── README.md ├── examples ├── CMakeLists.txt ├── hashmap.cpp ├── hashset.cpp └── serialization_and_memsize.cpp ├── include └── tudocomp │ └── util │ ├── compact_hash │ ├── decomposed_key_t.hpp │ ├── entry_t.hpp │ ├── hash_functions.hpp │ ├── index_structure │ │ ├── cv_bvs_t.hpp │ │ ├── displacement_t.hpp │ │ ├── elias_gamma_displacement_table_t.hpp │ │ ├── layered_displacement_table_t.hpp │ │ └── naive_displacement_table_t.hpp │ ├── map │ │ ├── hashmap_t.hpp │ │ ├── satellite_data_t.hpp │ │ ├── typedefs.hpp │ │ ├── val_quot_bucket_layout_t.hpp │ │ └── val_quot_ptrs_t.hpp │ ├── set │ │ ├── hashset_t.hpp │ │ ├── no_satellite_data_t.hpp │ │ ├── quot_bucket_layout_t.hpp │ │ ├── quot_ptr_t.hpp │ │ └── typedefs.hpp │ ├── size_manager_t.hpp │ ├── storage │ │ ├── bucket_t.hpp │ │ ├── buckets_bv_t.hpp │ │ ├── plain_sentinel_t.hpp │ │ └── sparse_pos_t.hpp │ └── util.hpp │ ├── heap_size.hpp │ ├── object_size_t.hpp │ └── serialization.hpp └── test ├── CMakeLists.txt ├── compact_hash_displacement_tests.cpp ├── compact_hash_elias_displacement_tests.cpp ├── compact_hash_tests.cpp ├── compact_hash_tests.template.hpp ├── compact_hashset_tests.template.hpp ├── compact_sparse_hash_displacement_tests.cpp ├── compact_sparse_hash_elias_displacement_tests.cpp ├── compact_sparse_hash_tests.cpp ├── compact_sparse_hashset_displacement_tests.cpp ├── compact_sparse_hashset_elias_displacement_tests.cpp ├── compact_sparse_hashset_serialization_tests.cpp ├── compact_sparse_hashset_tests.cpp ├── sandbox_test.cpp └── v2_tests.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/bit_span"] 2 | path = submodules/bit_span 3 | url = https://github.com/tudocomp/bit_span.git 4 | [submodule "submodules/build_system"] 5 | path = submodules/build_system 6 | url = https://github.com/tudocomp/build_system 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0.2 FATAL_ERROR) 2 | 3 | project (compact_sparse_hash) 4 | 5 | # Check if this project is build standalone 6 | # 7 | # We do this in case we want to use this repo as a GIT submodule, 8 | # because then we only need the source files themselves 9 | if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR) 10 | set(CSH_STANDALONE 1) 11 | endif() 12 | 13 | if(CSH_STANDALONE) 14 | # init build system 15 | execute_process(COMMAND git submodule update --init -- build_system 16 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/submodules) 17 | 18 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/submodules/build_system/cmakemodules") 19 | include(tdc_init) 20 | 21 | # downloadable dependencies 22 | include(depend_glog) 23 | 24 | # quit if dependencies aren't met 25 | tdc_check_hard_deps() 26 | if(TDC_DEPS_MISSING) 27 | return() 28 | endif() 29 | 30 | # soft dependencies 31 | include(softdepend_gtest) 32 | 33 | # submodules 34 | include(git_submodule_subdirectories) 35 | git_submodule_subdirectory(submodules/build_system) 36 | git_submodule_subdirectory(submodules/bit_span) 37 | endif() 38 | 39 | # Main target 40 | add_library(compact_sparse_hash INTERFACE) 41 | 
target_link_libraries(compact_sparse_hash INTERFACE bit_span) 42 | target_include_directories(compact_sparse_hash INTERFACE include) 43 | 44 | if(CSH_STANDALONE) 45 | # Unit tests 46 | add_subdirectory(test) 47 | 48 | # Examples 49 | add_subdirectory(examples) 50 | 51 | # Disclaimer 52 | MESSAGE(STATUS "Built Type: " ${CMAKE_BUILD_TYPE} ) 53 | endif() 54 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | TuDoComp - TU Dortmund lossless compression framework 2 | Copyright (C) 2016 Patrick Dinklage, Dominik Köppl, Marvin Löbel, Johannes Fischer 3 | Contact found at: https://ls11-www.cs.tu-dortmund.de/staff/koeppl 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Compact Sparse Hash Table 2 | ======== 3 | 4 | The compact sparse hash table is a blend of compact hashing [1] and 5 | [Google's sparse hash table](https://github.com/sparsehash/sparsehash). 6 | Our hash table is more memory efficient than both variants when the hash table is not much filled. 7 | The restriction is that it can only hash integer keys, but of arbitrary bit width. 8 | 9 | # Why? 
10 | The main idea is to use the compact sparse hash table as a dynamic dictionary for
11 | maintaining a set of (key,value)-pairs, or kv-pairs for short, where the keys are integer values.
12 | It is especially useful when memory efficiency is a priority, since the table stores the keys bit-aligned.
13 | Therefore, it is crucial to specify the bit width of a key. The bit width can be updated online.
14 | For instance, compact hash tables and sparse hash tables are already used for computing LZ78 [2].
15 | 
16 | # Usage
17 | 
18 | A minimal example is
19 | ```C++
20 | #include
21 | ...
22 | // creates a hash table with zero entries and sets the bit width of the keys to four
23 | auto map = tdc::compact_hash::map::sparse_cv_hashmap_t(0, 4);
24 | for(int i = 0; i <= 15; ++i) { // the interval [0..15] can be represented by four bits
25 |     map.insert(i, i*i); // insert key i, value i*i
26 |     std::cout << i << " -> " << map[i] << std::endl; // map[i] returns the value i*i stored with key i
27 | }
28 | ```
29 | 
30 | # How it works
31 | The idea of a hash table is to maintain a set of (key,value)-pairs, or kv-pairs for short.
32 | 
33 | Our table applies the approach of Cleary [1], in which a _bijective_ hash function
34 | determines the _initial position_, i.e., the position at which we first try to store a kv-pair
35 | (in case of a collision the pair cannot be stored there).
36 | The bijective hash function allows us to store only a fragment of the key, called the _quotient_, in the hash table.
37 | The complete key of a kv-pair can be restored from the quotient and the additional knowledge of the initial address of the kv-pair.
38 | Unfortunately, due to collisions, it can happen that a kv-pair is misplaced (i.e., it is not
39 | stored at its initial address).
40 | The initial address can be restored by additionally maintaining two bit vectors, and by restricting the
41 | collision resolution to linear probing.
42 | The bit vectors track the misplacements such that we can recompute the initial address of a stored kv-pair.
43 | Each of the two additional bit vectors stores one bit for each position in the hash table.
44 | In summary, this technique saves space by not storing the full keys, but only their quotients.
45 | 
46 | To further slim down the space footprint, we apply the trick of the sparse hash table:
47 | instead of allocating a large hash table, we allocate a vector of pointers to buckets.
48 | Each bucket represents a section of length `B` of the hash table, such that we have `n/B` buckets if the hash table is of size `n`
49 | (we ensure that `n` is divisible by `B` such that all buckets have the same length `B`).
50 | Although a bucket can store up to `B` elements, it only allocates space for the kv-pairs actually stored in it.
51 | For that, it stores a bit vector of length `B` that marks with a one all positions in its section of the hash table that are actually occupied by a
52 | kv-pair.
53 | The kv-pair corresponding to the `i`-th one in the bit vector (i.e., the `i`-th one in the bit vector has rank `i`)
54 | is the `i`-th element stored in the bucket.
55 | Given that we want to access the element at the `j`-th position of the section belonging to a bucket,
56 | we know that the `j`-th position is marked with a one in the bit vector, but not the rank of this one.
57 | To compute the rank of the one at the `j`-th position, we count how many ones up to the `j`-th position are stored in the bit vector of the bucket.
58 | Remember that the rank is the entry number of the element in the bucket that we want to access.
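The following is a minimal, self-contained sketch of this lookup for a single bucket with `B = 64`; it only illustrates the rank-via-`popcount` idea and is not the library's actual `bucket_t` implementation (which additionally stores quotients and values bit-packed):

```C++
#include <bitset>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy bucket covering a section of B = 64 table positions.
// `occupied` marks the used positions; only those entries are physically
// stored, in position order, in the compact `entries` vector.
struct toy_bucket {
    uint64_t occupied = 0;
    std::vector<uint64_t> entries;

    // rank(j): number of occupied positions strictly before position j,
    // i.e., the index of position j's entry inside `entries`.
    size_t rank(size_t j) const {
        assert(j < 64);
        uint64_t before = (j == 0) ? 0 : (occupied & (~uint64_t(0) >> (64 - j)));
        return std::bitset<64>(before).count(); // a single popcount on modern CPUs
    }

    bool is_occupied(size_t j) const { return (occupied >> j) & 1; }

    // Insert at position j: set its bit and splice the entry in at its rank,
    // shifting the following entries to the right (as in a std::vector).
    void insert(size_t j, uint64_t value) {
        assert(!is_occupied(j));
        entries.insert(entries.begin() + rank(j), value);
        occupied |= uint64_t(1) << j;
    }

    uint64_t get(size_t j) const {
        assert(is_occupied(j));
        return entries[rank(j)];
    }
};

int main() {
    toy_bucket b;
    b.insert(5, 500);
    b.insert(2, 200);
    b.insert(60, 6000);
    std::cout << b.get(2) << " " << b.get(5) << " " << b.get(60) << "\n"; // 200 500 6000
}
```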
59 | By keeping `B` small enough, we argue that the entire bucket can be stored in cache, allowing us to work on the bit vector
60 | using modern CPU instructions like `popcount`.
61 | When inserting a new kv-pair into a bucket, we update the bit vector and shift the stored elements accordingly
62 | (as in a standard `std::vector`). However, this is not a performance bottleneck since, again, with a sufficiently small bucket size,
63 | this operation is computed efficiently on modern hardware.
64 | Currently, we have set the bucket size `B` to 64.
65 | 
66 | 
67 | # API
68 | We have a `set` and a `map` interface to the (sparse) compact hash table:
69 | - `tdc::compact_hash::set::hashset_t`
70 | - `tdc::compact_hash::map::hashmap_t`
71 | Each of these hash table classes is templated by the following parameters:
72 | - the hash function,
73 | - how the storage of the hash table is represented (e.g., sparse),
74 | - how entries that are not stored at their initial address are maintained, i.e., how the displacement works:
75 |   - `cv_bvs_t`: the approach of Cleary using two bit vectors that store a virgin and a change bit per position
76 |   - `displacement_t`: using a displacement array represented by `T`, which can be
77 |     - `layered_displacement_table_t`: the recursive m-Bonsai approach of [3], where we implemented the simpler practical variant that uses an integer array with fixed bit width `i` and an auxiliary `std::unordered_map` for storing displacement values that cannot be represented with `i` bits,
78 |     - `elias_gamma_displacement_table_t`: the gamma m-Bonsai approach of [3],
79 |     - `naive_displacement_table_t`: stores the displacement array as a plain array of `size_t` integers (for debugging purposes).
80 | 
81 | The `hashset_t` class has the following helpful methods:
82 | - `lookup(key)` looks up a key and returns an `entry_t`,
83 | - `lookup_insert(key)` additionally inserts `key` if it is not present,
84 | - `lookup_insert_key_width(key, key_width)` works like the above, but additionally increases the bit width of the keys to `key_width`,
85 | - `grow_key_width(key_width)` increases the bit width of the keys to `key_width`.
86 | 
87 | All `lookup*` methods return an `entry_t` object, which contains an _id_ (`uint64_t`)
88 | that is unique and immutable until the hash table needs to be rehashed.
89 | This _id_ is computed based on the displacement setting:
90 | - For `displacement_t` it is the position in the hash table the entry was hashed to. The id needs `log2(table_size)` bits.
91 | - For `cv_bvs_t` it is composed of the initial address and the local position within the entry's group (`cv_bvs_t` clusters all entries with the same initial address into one group):
92 |   `id = initial_address | (local_position << log2(table_size))`. The id needs `log2(table_size) + log2(x)` bits, where `x` is the size of the specific group (which is at most the maximal number of collisions at an initial address).
93 | 
94 | It is possible to let the hash table call an event handler before it rehashes its contents.
95 | For that, methods that can cause a rehashing provide a template parameter `on_resize_t` that can be set to an event handler.
96 | See the class `default_on_resize_t` in `hashset_t` for an example.
97 | 
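As a usage sketch combining the methods above (the header paths and template arguments are assumptions derived from the repository layout and from `examples/hashset.cpp`, so treat this as illustrative rather than authoritative):

```C++
#include <cstdint>
#include <iostream>

#include <tudocomp/util/compact_hash/hash_functions.hpp>
#include <tudocomp/util/compact_hash/index_structure/cv_bvs_t.hpp>
#include <tudocomp/util/compact_hash/set/hashset_t.hpp>

using set_type = tdc::compact_hash::set::hashset_t<
    tdc::compact_hash::poplar_xorshift_t, // bijective hash function
    tdc::compact_hash::cv_bvs_t           // Cleary-style c/v bit vectors
>;

int main() {
    // capacity 0, key bit width 4
    auto set = set_type(0, 4);

    // insert keys that fit into 4 bits
    for (uint64_t key = 0; key < 16; ++key) {
        auto e = set.lookup_insert(key);
        (void) e; // e.id() stays valid until the table has to be rehashed
    }

    // insert a key that needs 8 bits, growing the key width on the fly ...
    set.lookup_insert_key_width(200, 8);
    // ... or grow the width explicitly before inserting:
    // set.grow_key_width(8);

    // look up a key without inserting it
    auto r = set.lookup(200);
    if (r.found()) {
        std::cout << "found, id = " << r.id() << std::endl;
    }
}
```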
98 | # Constraints
99 | 
100 | * keys have to be integers
101 | * linear probing for collision handling
102 | * hash table size is always a power of two
103 | * hash function must be bijective
104 | * API is not STL-conformant
105 | 
106 | # Features
107 | * The bit width of the keys can be updated online.
108 |   Changing the bit width causes a rehashing of the complete hash table.
109 | * Supports multiple hash functions. Currently, a `xorshift` hash function is implemented.
110 | * On resizing the hash table, each bucket of the old hash table is rehashed and subsequently freed,
111 |   so that there is no large memory peak as in traditional hash tables, which need to keep the entire old and new hash table
112 |   in RAM during a resize operation.
113 | 
114 | # Serialization
115 | 
116 | We offer a serialization API for the `set` interface:
117 | 
118 | ```c++
119 | #include
120 | 
121 | using tdc::serialize;
122 | using table_t = tdc::compact_hash::set::hashset_t<...>;
123 | 
124 | table_t a = table_t(...);
125 | 
126 | std::stringstream ss;
127 | 
128 | // serialize to any std::ostream:
129 | serialize<table_t>::write(ss, a);
130 | 
131 | // deserialize from any std::istream:
132 | table_t b = serialize<table_t>::read(ss);
133 | ```
134 | 
135 | # Dependencies
136 | 
137 | The project is written in modern `C++14`.
138 | It uses `cmake` to build the library.
139 | 
140 | The external dependencies are:
141 | 
142 | * [Google Logging (glog)](https://github.com/google/glog) (0.34 or later).
143 | * [Google Test](https://github.com/google/googletest) (1.7.0 or later) __[Just for running the unit tests]__.
144 | 
145 | `cmake` first searches for the external dependencies on the system
146 | and otherwise downloads and builds them automatically from their official repositories.
147 | Hence, a prior installation of the dependencies is not required.
148 | 
149 | # License
150 | 
151 | The code in this repository is published under the
152 | [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
153 | 
154 | # Todo
155 | * When additionally restricting the values to be integers, we can avoid padding:
156 |   we currently byte-align the values to allow the reinterpretation of their contents (just by casting).
157 |   By restricting to integer values, we can write the values bit-compactly in a bit vector.
158 | * Additionally, in the case that the values are integers,
159 |   we want to support setting the bit width of the values online to further slim down memory consumption.
160 | * The hash table currently does not support the deletion of a kv-pair.
161 | * Support variable bucket sizes `B`
162 | 
163 | # Related Work
164 | * [Dynpdt: dynamic path-decomposed trie](https://github.com/kampersanda/dynpdt), a space-efficient dynamic keyword dictionary. It supports strings as values.
165 | * [mame-Bonsai](https://github.com/Poyias/mBonsai), a compact hash table implementation used as a trie data structure
166 | * [Bonsai trie reimplementation](https://github.com/kampersanda/bonsais), a reimplementation of the previous trie data structure
167 | 
168 | # References
169 | * [1] J. G. Cleary. Compact hash tables using bidirectional linear probing. IEEE Trans. Computers, 33(9): 828-834, 1984.
170 | * [2] J. Fischer, D. Köppl: Practical Evaluation of Lempel-Ziv-78 and Lempel-Ziv-Welch Tries. SPIRE 2017: 191-207.
171 | * [3] A. Poyias, R. Raman: Improved Practical Compact Dynamic Tries.
SPIRE 2015: 324-336 172 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_custom_target(examples) 2 | 3 | # Create executable for every *.cpp file 4 | FILE(GLOB children RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp) 5 | FOREACH(child ${children}) 6 | get_filename_component(executable ${child} NAME_WE) 7 | 8 | add_executable( 9 | ${executable} 10 | 11 | ${child} 12 | ) 13 | 14 | target_include_directories(${executable} INTERFACE include) 15 | 16 | target_link_libraries( 17 | ${executable} 18 | 19 | compact_sparse_hash 20 | ) 21 | add_dependencies(examples ${executable}) 22 | 23 | ENDFOREACH() 24 | -------------------------------------------------------------------------------- /examples/hashmap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | template 9 | using map_type = tdc::compact_hash::map::plain_elias_hashmap_t; 10 | 11 | int main() { 12 | // creates a hash table with zero entries, set the bit-width of the keys to four 13 | auto map = map_type(0, 4); 14 | for(int i = 0; i <= 15; ++i) { // interval [0..15] can be represented by four bits 15 | map.insert(i, std::move(i*i)); // insert key i, value i*i 16 | std::cout << i << " -> " << map[i] << std::endl; // map[i] returns value i*i with key i 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/hashset.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using set_type = tdc::compact_hash::set::hashset_t< 12 | tdc::compact_hash::poplar_xorshift_t, 13 | tdc::compact_hash::cv_bvs_t 14 | >; 15 | 16 | int main() { 17 | // creates a set with capacity zero and bit-width five 18 | auto set = set_type(0, 5); 19 | for(int i = 0; i <= 4; ++i) { // can hash keys in the range [0..2**5-1] 20 | set.lookup_insert(i*i); 21 | } 22 | for(int i = 0; i <= 15; ++i) { 23 | auto ret = set.lookup(i); 24 | if(ret.found()) { 25 | std::cout << "Id of node : " << ret.id() << std::endl; // returns the unique ID of the entry. This ID does not change until resizing occurs. 
26 | std::cout << i << " -> " << ret.found() << std::endl; // checks whether set[i] is set 27 | std::cout << std::endl; 28 | } 29 | } 30 | std::stringstream ss; 31 | tdc::serialize::write(ss, set); 32 | std::cout << ss.str() << std::endl; 33 | 34 | } 35 | -------------------------------------------------------------------------------- /examples/serialization_and_memsize.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | using map_type = tdc::compact_hash::map::sparse_elias_hashmap_t; 13 | 14 | int main() { 15 | // creates a hash table with default capacity and initial bit widths 16 | auto map = map_type(); 17 | for(int i = 0; i < 1000; ++i) { 18 | auto key = i; 19 | auto val = i*i + 42; 20 | 21 | map.insert_kv_width(key, std::move(val), tdc::bits_for(key), tdc::bits_for(val)); 22 | } 23 | 24 | std::cout << "elements in map: " << map.size() << std::endl; 25 | std::cout << "key width: " << map.key_width() << " bits" << std::endl; 26 | std::cout << "value width: " << map.value_width() << " bits" << std::endl; 27 | 28 | // this could just be an `ofstream` for outputting to a file. 29 | std::stringstream output_stream; 30 | 31 | // compute size of the datastructure 32 | auto heap_object_size = tdc::heap_size_compute(map); 33 | 34 | // serialize the datastructure 35 | auto written_object_size = tdc::serialize_write(output_stream, map); 36 | 37 | std::cout << "total heap size of initial map: " << heap_object_size.size_in_bytes() << std::endl; 38 | std::cout << "serialized bytes: " << written_object_size.size_in_bytes() << std::endl; 39 | 40 | auto deserialized_map = tdc::serialize_read>(output_stream); 41 | auto heap_object_size2 = tdc::heap_size_compute(deserialized_map); 42 | 43 | std::cout << "total heap size of deserialized map: " << heap_object_size2.size_in_bytes() << std::endl; 44 | } 45 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/decomposed_key_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace tdc {namespace compact_hash { 6 | struct decomposed_key_t { 7 | size_t initial_address; // initial address of key in table 8 | uint64_t stored_quotient; // quotient value stored in table 9 | }; 10 | }} 11 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/entry_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace tdc {namespace compact_hash { 4 | 5 | template 6 | class generic_entry_t { 7 | uint64_t m_id; 8 | bool m_key_already_exist; 9 | bool m_not_found; 10 | entry_ptr m_ptr; 11 | 12 | inline generic_entry_t(uint64_t id, bool key_already_exist, bool not_found, entry_ptr ptr): 13 | m_id(id), 14 | m_key_already_exist(key_already_exist), 15 | m_not_found(not_found), 16 | m_ptr(ptr) {} 17 | public: 18 | /// Creates a `entry_t` for a key that already exists in the table. 19 | /// 20 | /// The _id_ is an integer that uniquely describes the key, 21 | /// while only taking up approximately log2(table_size) bits. 22 | /// It gets invalidated if the underlying table needs to be resized. 
23 | inline static generic_entry_t found_exist(uint64_t id, entry_ptr ptr) { 24 | return generic_entry_t { 25 | id, 26 | true, 27 | false, 28 | ptr, 29 | }; 30 | } 31 | 32 | /// Creates a `entry_t` for a new key in the table. 33 | /// 34 | /// The _id_ is an integer that uniquely describes the key, 35 | /// while only taking up approximately log2(table_size) bits. 36 | /// It gets invalidated if the underlying table needs to be resized. 37 | inline static generic_entry_t found_new(uint64_t id, entry_ptr ptr) { 38 | return generic_entry_t { 39 | id, 40 | false, 41 | false, 42 | ptr, 43 | }; 44 | } 45 | 46 | /// Creates a `entry_t` for a key that could not be found in the table. 47 | inline static generic_entry_t not_found() { 48 | return generic_entry_t { 49 | 0, 50 | false, 51 | true, 52 | entry_ptr(), 53 | }; 54 | } 55 | 56 | /// Returns true if the key exists in the table. 57 | inline bool found() const { 58 | return !m_not_found; 59 | } 60 | 61 | /// Returns the _id_ of the key. 62 | /// 63 | /// The _id_ is an integer that uniquely describes the key, 64 | /// while only taking up approximately log2(table_size) bits. 65 | /// It gets invalidated if the underlying table needs to be resized. 66 | inline uint64_t id() const { 67 | DCHECK(found()); 68 | return m_id; 69 | } 70 | 71 | /// Returns true if the key already exists in the table. 72 | inline bool key_already_exist() const { 73 | DCHECK(found()); 74 | return m_key_already_exist; 75 | } 76 | 77 | /// Return the ptr to the data, if it exists. 78 | inline entry_ptr ptr() const { 79 | DCHECK(found()); 80 | return m_ptr; 81 | } 82 | }; 83 | 84 | }} 85 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/hash_functions.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | // Source: https://github.com/kampersanda/poplar-trie/blob/master/include/poplar/bijective_hash.hpp 7 | namespace poplar{namespace bijective_hash { 8 | 9 | // (p, q): p < 2**w is a prime and q < 2**w is an integer such that pq mod m = 1 10 | constexpr uint64_t PRIME_TABLE[][2][3] = { 11 | {{0ULL, 0ULL, 0ULL}, {0ULL, 0ULL, 0ULL}}, // 0 12 | {{1ULL, 1ULL, 1ULL}, {1ULL, 1ULL, 1ULL}}, // 1 13 | {{3ULL, 1ULL, 3ULL}, {3ULL, 1ULL, 3ULL}}, // 2 14 | {{7ULL, 5ULL, 3ULL}, {7ULL, 5ULL, 3ULL}}, // 3 15 | {{13ULL, 11ULL, 7ULL}, {5ULL, 3ULL, 7ULL}}, // 4 16 | {{31ULL, 29ULL, 23ULL}, {31ULL, 21ULL, 7ULL}}, // 5 17 | {{61ULL, 59ULL, 53ULL}, {21ULL, 51ULL, 29ULL}}, // 6 18 | {{127ULL, 113ULL, 109ULL}, {127ULL, 17ULL, 101ULL}}, // 7 19 | {{251ULL, 241ULL, 239ULL}, {51ULL, 17ULL, 15ULL}}, // 8 20 | {{509ULL, 503ULL, 499ULL}, {341ULL, 455ULL, 315ULL}}, // 9 21 | {{1021ULL, 1019ULL, 1013ULL}, {341ULL, 819ULL, 93ULL}}, // 10 22 | {{2039ULL, 2029ULL, 2027ULL}, {455ULL, 1509ULL, 195ULL}}, // 11 23 | {{4093ULL, 4091ULL, 4079ULL}, {1365ULL, 819ULL, 3855ULL}}, // 12 24 | {{8191ULL, 8179ULL, 8171ULL}, {8191ULL, 4411ULL, 4291ULL}}, // 13 25 | {{16381ULL, 16369ULL, 16363ULL}, {5461ULL, 4369ULL, 12483ULL}}, // 14 26 | {{32749ULL, 32719ULL, 32717ULL}, {13797ULL, 10031ULL, 1285ULL}}, // 15 27 | {{65521ULL, 65519ULL, 65497ULL}, {4369ULL, 3855ULL, 36969ULL}}, // 16 28 | {{131071ULL, 131063ULL, 131059ULL}, {131071ULL, 29127ULL, 110907ULL}}, // 17 29 | {{262139ULL, 262133ULL, 262127ULL}, {209715ULL, 95325ULL, 200463ULL}}, // 18 30 | {{524287ULL, 524269ULL, 524261ULL}, {524287ULL, 275941ULL, 271853ULL}}, // 19 31 | {{1048573ULL, 1048571ULL, 1048559ULL}, 
{349525ULL, 209715ULL, 986895ULL}}, // 20 32 | {{2097143ULL, 2097133ULL, 2097131ULL}, {1864135ULL, 1324517ULL, 798915ULL}}, // 21 33 | {{4194301ULL, 4194287ULL, 4194277ULL}, {1398101ULL, 986895ULL, 3417581ULL}}, // 22 34 | {{8388593ULL, 8388587ULL, 8388581ULL}, {1118481ULL, 798915ULL, 3417581ULL}}, // 23 35 | {{16777213ULL, 16777199ULL, 16777183ULL}, {5592405ULL, 986895ULL, 15760415ULL}}, // 24 36 | {{33554393ULL, 33554383ULL, 33554371ULL}, {17207401ULL, 31500079ULL, 15952107ULL}}, // 25 37 | {{67108859ULL, 67108837ULL, 67108819ULL}, {53687091ULL, 62137837ULL, 50704475ULL}}, // 26 38 | {{134217689ULL, 134217649ULL, 134217617ULL}, {17207401ULL, 113830225ULL, 82223473ULL}}, // 27 39 | {{268435399ULL, 268435367ULL, 268435361ULL}, {131863031ULL, 96516119ULL, 186492001ULL}}, // 28 40 | {{536870909ULL, 536870879ULL, 536870869ULL}, {357913941ULL, 32537631ULL, 274678141ULL}}, // 29 41 | {{1073741789ULL, 1073741783ULL, 1073741741ULL}, {889671797ULL, 1047552999ULL, 349289509ULL}}, // 30 42 | {{2147483647ULL, 2147483629ULL, 2147483587ULL}, {2147483647ULL, 1469330917ULL, 1056139499ULL}}, // 31 43 | {{4294967291ULL, 4294967279ULL, 4294967231ULL}, {858993459ULL, 252645135ULL, 1057222719ULL}}, // 32 44 | {{8589934583ULL, 8589934567ULL, 8589934543ULL}, {7635497415ULL, 1030792151ULL, 3856705327ULL}}, // 33 45 | {{17179869143ULL, 17179869107ULL, 17179869071ULL}, {9637487591ULL, 11825104763ULL, 12618841967ULL}}, // 34 46 | {{34359738337ULL, 34359738319ULL, 34359738307ULL}, {1108378657ULL, 21036574511ULL, 22530975979ULL}}, // 35 47 | {{68719476731ULL, 68719476719ULL, 68719476713ULL}, {13743895347ULL, 64677154575ULL, 8963410009ULL}}, // 36 48 | {{137438953447ULL, 137438953441ULL, 137438953427ULL}, {43980465111ULL, 35468117025ULL, 70246576219ULL}}, // 37 49 | {{274877906899ULL, 274877906857ULL, 274877906837ULL}, {207685529691ULL, 41073710233ULL, 208085144509ULL}}, // 38 50 | {{549755813881ULL, 549755813869ULL, 549755813821ULL}, {78536544841ULL, 347214198245ULL, 369238979477ULL}}, // 39 51 | {{1099511627689ULL, 1099511627609ULL, 1099511627581ULL}, {315951617177ULL, 928330176745ULL, 343949791253ULL}}, // 40 52 | {{2199023255531ULL, 2199023255521ULL, 2199023255497ULL}, {209430786243ULL, 1134979744801ULL, 1119502748281ULL}}, // 41 53 | {{4398046511093ULL, 4398046511087ULL, 4398046511071ULL}, {1199467230301ULL, 3363212037903ULL, 3331853417503ULL}}, // 42 54 | {{8796093022151ULL, 8796093022141ULL, 8796093022091ULL}, {8178823336439ULL, 918994793365ULL, 2405769031715ULL}}, // 43 55 | {{17592186044399ULL, 17592186044299ULL, 17592186044297ULL}, {16557351571215ULL, 2405769031715ULL, 2365335938745ULL}}, // 44 56 | {{35184372088777ULL, 35184372088763ULL, 35184372088751ULL}, {27507781814905ULL, 17847145262451ULL, 11293749065551ULL}}, // 45 57 | {{70368744177643ULL, 70368744177607ULL, 70368744177601ULL}, {13403570319555ULL, 34567102403063ULL, 4467856773185ULL}}, // 46 58 | {{140737488355213ULL, 140737488355201ULL, 140737488355181ULL}, {88113905752901ULL, 4432676798593ULL, 22020151239269ULL}}, // 47 59 | {{281474976710597ULL, 281474976710591ULL, 281474976710567ULL}, {100186008659725ULL, 4330384257087ULL, 123342967322647ULL}}, // 48 60 | {{562949953421231ULL, 562949953421201ULL, 562949953421189ULL}, {222399981598543ULL, 25358106009969ULL, 366146311168333ULL}}, // 49 61 | {{1125899906842597ULL, 1125899906842589ULL, 1125899906842573ULL}, {667199944795629ULL, 289517118902389ULL, 286994093901061ULL}}, // 50 62 | {{2251799813685119ULL, 2251799813685109ULL, 2251799813685083ULL}, {558586000294015ULL, 161999986596061ULL, 
232003617167571ULL}}, // 51 63 | {{4503599627370449ULL, 4503599627370353ULL, 4503599627370323ULL}, {3449565672028465ULL, 3558788516733329ULL, 3514369651416283ULL}}, // 52 64 | {{9007199254740881ULL, 9007199254740847ULL, 9007199254740761ULL}, {2840107873116529ULL, 496948924399503ULL, 4991002184445225ULL}}, // 53 65 | {{18014398509481951ULL, 18014398509481931ULL, 18014398509481853ULL}, {16922616781634591ULL, 13595772459986403ULL, 6600695637062101ULL}}, // 54 66 | {{36028797018963913ULL, 36028797018963901ULL, 36028797018963869ULL}, {20962209174669945ULL, 20434243085382549ULL, 11645671763705525ULL}}, // 55 67 | {{72057594037927931ULL, 72057594037927909ULL, 72057594037927889ULL}, {14411518807585587ULL, 18681598454277613ULL, 21463964181510449ULL}}, // 56 68 | {{144115188075855859ULL, 144115188075855823ULL, 144115188075855811ULL}, {88686269585142075ULL, 44116894308935471ULL, 18900352534538475ULL}}, // 57 69 | {{288230376151711687ULL, 288230376151711681ULL, 288230376151711607ULL}, {126416831645487607ULL, 18300341342965825ULL, 136751638320155207ULL}}, // 58 70 | {{576460752303423263ULL, 576460752303423061ULL, 576460752303422971ULL}, {5124095576030431ULL, 2700050362076925ULL, 198471980483577139ULL}}, // 59 71 | {{1152921504606846883ULL, 1152921504606846803ULL, 1152921504606846697ULL}, {12397005425880075ULL, 566464323072728283ULL, 4132335141960025ULL}}, // 60 72 | {{2305843009213693951ULL, 2305843009213693669ULL, 2305843009213693613ULL}, {2305843009213693951ULL, 1768084568902373101ULL, 360500529464087845ULL}}, // 61 73 | {{4611686018427387733ULL, 4611686018427387421ULL, 4611686018427387271ULL}, {4557748170258646525ULL, 152768066863019061ULL, 1515372340968241207ULL}}, // 62 74 | {{9223372036854775291ULL, 9223372036854775279ULL, 9223372036854775181ULL}, {3657236494304118067ULL, 2545580940228350223ULL, 3339243145719352645ULL}}, // 63 75 | {{9223372036854775291ULL, 9223372036854775279ULL, 9223372036854775181ULL}, {3657236494304118067ULL, 11768952977083126031ULL, 3339243145719352645ULL}}, // 64 76 | }; 77 | 78 | class Xorshift { 79 | public: 80 | /// runtime initilization arguments, if any 81 | struct config_args {}; 82 | 83 | /// get the config of this instance 84 | inline config_args current_config() const { return config_args{}; } 85 | 86 | Xorshift() = default; 87 | 88 | inline Xorshift(uint32_t univ_bits, config_args config): 89 | m_bits(univ_bits), 90 | m_shift(univ_bits / 2 + 1) 91 | { 92 | DCHECK_LT(0U, m_bits); 93 | DCHECK_LE(m_bits, 64U); 94 | DCHECK_LT(0, mask()); 95 | } 96 | 97 | inline uint64_t hash(uint64_t x) const { 98 | DCHECK_LE(x, mask()); 99 | x = hash_<0>(x); 100 | x = hash_<1>(x); 101 | x = hash_<2>(x); 102 | return x; 103 | } 104 | 105 | inline uint64_t hash_inv(uint64_t x) const { 106 | DCHECK_LE(x, mask()); 107 | x = hash_inv_<2>(x); 108 | x = hash_inv_<1>(x); 109 | x = hash_inv_<0>(x); 110 | return x; 111 | } 112 | 113 | /// STL compability 114 | inline uint64_t operator()(uint64_t x) const { 115 | return hash(x); 116 | } 117 | 118 | inline uint64_t bits() const { 119 | return m_bits; 120 | } 121 | 122 | inline uint64_t mask() const { 123 | return (-1ULL >> (64-m_bits)); 124 | } 125 | 126 | void show_stat(std::ostream& os) const { 127 | os << "Statistics of Xorshift\n"; 128 | os << " - mask: " << mask() << "\n"; 129 | os << " - bits: " << bits() << "\n"; 130 | } 131 | 132 | private: 133 | uint32_t m_bits{}; 134 | uint32_t m_shift{}; 135 | 136 | template 137 | friend struct ::tdc::serialize; 138 | 139 | template 140 | friend struct ::tdc::heap_size; 141 | 142 | template 143 | 
uint64_t hash_(uint64_t x) const { 144 | DCHECK_LE(x, mask()); 145 | x = x ^ (x >> (m_shift + N)); 146 | x = (x * PRIME_TABLE[bits()][0][N]) & mask(); 147 | return x; 148 | } 149 | template 150 | uint64_t hash_inv_(uint64_t x) const { 151 | x = (x * PRIME_TABLE[bits()][1][N]) & mask(); 152 | x = x ^ (x >> (m_shift + N)); 153 | return x; 154 | } 155 | }; 156 | 157 | }} //ns - poplar::bijective_hash 158 | 159 | 160 | namespace tdc {namespace compact_hash { 161 | 162 | class xorshift_t { 163 | uint64_t m_j; 164 | uint64_t m_w_mask; 165 | 166 | template 167 | friend struct ::tdc::serialize; 168 | 169 | template 170 | friend struct ::tdc::heap_size; 171 | 172 | xorshift_t() = default; 173 | public: 174 | /// runtime initilization arguments, if any 175 | struct config_args {}; 176 | 177 | /// get the config of this instance 178 | inline config_args current_config() const { return config_args{}; } 179 | 180 | /// Constructs a hash function for values with a width of `w` bits. 181 | xorshift_t(uint32_t w, config_args config): 182 | m_j((w / 2ull) + 1) 183 | { 184 | DCHECK_LT((w / 2ull), m_j); 185 | DCHECK_NE(w, 0U); 186 | 187 | // NB: Two shifts because a single shift with w == 64 is undefined 188 | // behavior for a uint64_t according to the C++ standard. 189 | m_w_mask = (1ull << (w - 1ull) << 1ull) - 1ull; 190 | } 191 | 192 | /// This takes a value `x` with a width of `w` bits, 193 | /// and calculates a hash value with a width of `w` bits. 194 | inline uint64_t hash(uint64_t x) const { 195 | uint64_t j = m_j; 196 | uint64_t w_mask = m_w_mask; 197 | 198 | return (x xor ((x << j) & w_mask)) & w_mask; 199 | } 200 | 201 | /// This takes a hash value `x` with a width of `w` bits, 202 | /// and reverses the hash function to the original value. 203 | inline uint64_t hash_inv(uint64_t x) const { 204 | return hash(x); 205 | } 206 | }; 207 | 208 | using poplar_xorshift_t = poplar::bijective_hash::Xorshift; 209 | 210 | } 211 | 212 | template<> 213 | struct heap_size { 214 | using T = compact_hash::xorshift_t; 215 | 216 | static object_size_t compute(T const& val) { 217 | using namespace compact_hash; 218 | 219 | auto bytes = object_size_t::empty(); 220 | 221 | bytes += heap_size::compute(val.m_j); 222 | bytes += heap_size::compute(val.m_w_mask); 223 | 224 | return bytes; 225 | } 226 | }; 227 | 228 | template<> 229 | struct serialize { 230 | using T = compact_hash::xorshift_t; 231 | 232 | static object_size_t write(std::ostream& out, T const& val) { 233 | using namespace compact_hash; 234 | 235 | auto bytes = object_size_t::empty(); 236 | 237 | bytes += serialize::write(out, val.m_j); 238 | bytes += serialize::write(out, val.m_w_mask); 239 | 240 | return bytes; 241 | } 242 | static T read(std::istream& in) { 243 | using namespace compact_hash; 244 | 245 | T ret; 246 | ret.m_j = serialize::read(in); 247 | ret.m_w_mask = serialize::read(in); 248 | return ret; 249 | } 250 | static bool equal_check(T const& lhs, T const& rhs) { 251 | return gen_equal_check(m_j) 252 | && gen_equal_check(m_w_mask); 253 | } 254 | }; 255 | 256 | template<> 257 | struct heap_size { 258 | using T = poplar::bijective_hash::Xorshift; 259 | 260 | static object_size_t compute(T const& val) { 261 | using namespace compact_hash; 262 | 263 | auto bytes = object_size_t::empty(); 264 | 265 | bytes += heap_size::compute(val.m_shift); 266 | bytes += heap_size::compute(val.m_bits); 267 | 268 | return bytes; 269 | } 270 | }; 271 | 272 | template<> 273 | struct serialize { 274 | using T = poplar::bijective_hash::Xorshift; 275 | 276 | static 
object_size_t write(std::ostream& out, T const& val) { 277 | using namespace compact_hash; 278 | 279 | auto bytes = object_size_t::empty(); 280 | 281 | bytes += serialize::write(out, val.m_shift); 282 | bytes += serialize::write(out, val.m_bits); 283 | 284 | return bytes; 285 | } 286 | static T read(std::istream& in) { 287 | using namespace compact_hash; 288 | 289 | T ret; 290 | ret.m_shift = serialize::read(in); 291 | ret.m_bits = serialize::read(in); 292 | return ret; 293 | } 294 | static bool equal_check(T const& lhs, T const& rhs) { 295 | return gen_equal_check(m_shift) 296 | && gen_equal_check(m_bits); 297 | } 298 | }; 299 | 300 | } 301 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/index_structure/cv_bvs_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include "../entry_t.hpp" 8 | 9 | #include 10 | 11 | namespace tdc {namespace compact_hash { 12 | 13 | class cv_bvs_t { 14 | template 15 | friend struct ::tdc::serialize; 16 | 17 | template 18 | friend struct ::tdc::heap_size; 19 | 20 | IntVector> m_cv; 21 | inline cv_bvs_t(IntVector>&& cv): m_cv(std::move(cv)) {} 22 | 23 | public: 24 | /// runtime initilization arguments, if any 25 | struct config_args {}; 26 | 27 | /// get the config of this instance 28 | inline config_args current_config() const { return config_args{}; } 29 | 30 | inline cv_bvs_t(size_t table_size, config_args config) { 31 | m_cv.reserve(table_size); 32 | m_cv.resize(table_size); 33 | } 34 | 35 | /// A Group is a half-open range [group_start, group_end) 36 | /// that corresponds to a group of elements in the hashtable that 37 | /// belong to the same initial_address. 38 | /// 39 | /// This means that `c[group_start] == 1`, and 40 | /// `c[group_start < x < group_end] == 0`. 41 | /// 42 | /// `groups_terminator` points to the next free location 43 | /// inside the hashtable. 44 | struct Group { 45 | size_t group_start; // Group that belongs to the key. 46 | size_t group_end; // It's a half-open range: [start .. end). 47 | size_t groups_terminator; // Next free location. 48 | }; 49 | 50 | template 51 | struct context_t { 52 | using satellite_t = typename storage_t::satellite_t_export; 53 | using entry_width_t = typename satellite_t::entry_bit_width_t; 54 | using entry_ptr_t = typename satellite_t::entry_ptr_t; 55 | using entry_t = generic_entry_t; 56 | using table_pos_t = typename storage_t::table_pos_t; 57 | 58 | IntVector>& m_cv; 59 | size_t const table_size; 60 | entry_width_t widths; 61 | size_mgr_t const& size_mgr; 62 | storage_t& storage; 63 | 64 | /// Getter for the v bit at table position `pos`. 65 | inline bool get_v(size_t pos) { 66 | return (m_cv[pos] & 0b01) != 0; 67 | } 68 | 69 | /// Getter for the c bit at table position `pos`. 70 | inline bool get_c(size_t pos) { 71 | return (m_cv[pos] & 0b10) != 0; 72 | } 73 | 74 | /// Setter for the v bit at table position `pos`. 75 | inline void set_v(size_t pos, bool v) { 76 | auto x = m_cv[pos] & 0b10; 77 | m_cv[pos] = x | (0b01 * v); 78 | } 79 | 80 | /// Setter for the c bit at table position `pos`. 81 | inline void set_c(size_t pos, bool c) { 82 | auto x = m_cv[pos] & 0b01; 83 | m_cv[pos] = x | (0b10 * c); 84 | } 85 | 86 | /// Setter for the c and v bit at table position `pos`. 87 | inline void set_cv(size_t pos, uint8_t v) { 88 | m_cv[pos] = v; 89 | } 90 | 91 | // Assumption: There exists a group at the initial address of `key`. 
92 | // This group is either the group belonging to key, 93 | // or the one after it in the case that no group for `key` exists yet. 94 | inline Group search_existing_group(uint64_t initial_address) { 95 | auto sctx = storage.context(table_size, widths); 96 | auto ret = Group(); 97 | size_t cursor = initial_address; 98 | 99 | // Walk forward from the initial address until we find a empty location. 100 | // TODO: This search could maybe be accelerated by: 101 | // - checking whole blocks in the bucket bitvector for == or != 0 102 | size_t v_counter = 0; 103 | DCHECK_EQ(get_v(cursor), true); 104 | for(; 105 | !sctx.pos_is_empty(sctx.table_pos(cursor)); 106 | cursor = size_mgr.mod_add(cursor)) 107 | { 108 | v_counter += get_v(cursor); 109 | } 110 | DCHECK_GE(v_counter, 1U); 111 | ret.groups_terminator = cursor; 112 | 113 | // Walk back again to find the end of the group 114 | // belonging to the initial address. 115 | size_t c_counter = v_counter; 116 | for(; c_counter != 1; cursor = size_mgr.mod_sub(cursor)) { 117 | c_counter -= get_c(size_mgr.mod_sub(cursor)); 118 | } 119 | ret.group_end = cursor; 120 | 121 | // Walk further back to find the start of the group 122 | // belonging to the initial address 123 | for(; c_counter != 0; cursor = size_mgr.mod_sub(cursor)) { 124 | c_counter -= get_c(size_mgr.mod_sub(cursor)); 125 | } 126 | ret.group_start = cursor; 127 | 128 | return ret; 129 | } 130 | 131 | /// Search a quotient inside an existing Group. 132 | /// 133 | /// This returns a pointer to the value if its found, or null 134 | /// otherwise. 135 | inline entry_t search_in_group(Group const& group, 136 | uint64_t stored_quotient) { 137 | auto sctx = storage.context(table_size, widths); 138 | for(size_t i = group.group_start; i != group.group_end; i = size_mgr.mod_add(i)) { 139 | auto sparse_entry = sctx.at(sctx.table_pos(i)); 140 | 141 | if (sparse_entry.get_quotient() == stored_quotient) { 142 | uint64_t in_group_offset = size_mgr.mod_sub(i, group.group_start); 143 | return entry_t::found_exist(in_group_offset, sparse_entry); 144 | } 145 | } 146 | return entry_t::not_found(); 147 | } 148 | 149 | /// Inserts a new key-value pair after an existing 150 | /// group, shifting all following entries one to the right as needed. 151 | inline entry_ptr_t insert_value_after_group( 152 | Group const& group, uint64_t stored_quotient) 153 | { 154 | auto sctx = storage.context(table_size, widths); 155 | auto end_pos = sctx.table_pos(group.group_end); 156 | if (sctx.pos_is_empty(end_pos)) { 157 | // if there is no following group, just append the new entry 158 | return sctx.allocate_pos(end_pos); 159 | } else { 160 | // else, shift all following elements one to the right 161 | return shift_groups_and_insert(group.group_end, 162 | group.groups_terminator, 163 | stored_quotient); 164 | } 165 | } 166 | 167 | /// Shifts all values and `c` bits of the half-open range [from, to) 168 | /// inside the table one to the right, and inserts the new value 169 | /// at the now-empty location `from`. 170 | /// 171 | /// The position `to` needs to be empty. 
172 | inline entry_ptr_t shift_groups_and_insert( 173 | size_t from, size_t to, uint64_t stored_quotient) 174 | { 175 | DCHECK_NE(from, to); 176 | 177 | for(size_t i = to; i != from;) { 178 | size_t next_i = size_mgr.mod_sub(i, size_t(1)); 179 | 180 | set_c(i, get_c(next_i)); 181 | 182 | i = next_i; 183 | } 184 | set_c(from, false); 185 | 186 | return shift_elements_and_insert(from, to); 187 | } 188 | 189 | /// Shifts all values of the half-open range [from, to) 190 | /// inside the table one to the right, and inserts the new value 191 | /// at the now-empty location `from`. 192 | /// 193 | /// The position `to` needs to be empty. 194 | inline entry_ptr_t shift_elements_and_insert( 195 | size_t from, size_t to) 196 | { 197 | auto sctx = storage.context(table_size, widths); 198 | // move from...to one to the right, then insert at from 199 | 200 | DCHECK(from != to); 201 | 202 | table_pos_t from_pos; 203 | 204 | if (to < from) { 205 | // if the range wraps around, we decompose into two ranges: 206 | // [ | | ] 207 | // | to^ ^from | 208 | // ^start end^ 209 | // [ 2 ] [ 1 ] 210 | // 211 | // NB: because we require from != to, and insert 1 additional element, 212 | // we are always dealing with a minimum 2 element range, 213 | // and thus can not end up with a split range with length == 0. 214 | 215 | from_pos = sparse_shift(from, table_size); 216 | if (to > 0) { 217 | auto start_pos = sparse_shift(0, to); 218 | sctx.at(from_pos).swap_with(sctx.at(start_pos)); 219 | } 220 | } else { 221 | // [ | | ] 222 | // from^ ^to 223 | 224 | from_pos = sparse_shift(from, to); 225 | } 226 | 227 | // insert the element from the end of the range at the free 228 | // position to the right of it. 229 | auto new_loc = sctx.allocate_pos(sctx.table_pos(to)); 230 | 231 | auto from_ptrs = sctx.at(from_pos); 232 | new_loc.init_from(from_ptrs); 233 | from_ptrs.uninitialize(); 234 | 235 | return from_ptrs; 236 | } 237 | 238 | /// Shifts all elements one to the right, 239 | /// moving the last element to the front position, 240 | /// and returns a ptr pair to it. 
241 | inline table_pos_t sparse_shift(size_t from, size_t to) { 242 | DCHECK_LT(from, to); 243 | auto sctx = storage.context(table_size, widths); 244 | 245 | // initialize iterators like this: 246 | // [ ] 247 | // ^from to^ 248 | // || 249 | // <- src^| 250 | // <- dest^ 251 | 252 | auto from_loc = sctx.table_pos(from); 253 | auto from_iter = sctx.make_iter(from_loc); 254 | 255 | auto last = sctx.table_pos(to - 1); 256 | auto src = sctx.make_iter(last); 257 | auto dst = sctx.make_iter(sctx.table_pos(to)); 258 | 259 | // move the element at the last position to a temporary position 260 | auto tmp_p = sctx.at(last); 261 | auto tmp = tmp_p.move_out(); 262 | 263 | // move all elements one to the right 264 | // TODO: Could be optimized 265 | // to memcpies for different underlying layouts 266 | while(src != from_iter) { 267 | // Decrement first for backward iteration 268 | src.decrement(); 269 | dst.decrement(); 270 | 271 | // Get access to the value/quotient at src and dst 272 | auto src_be = src.get(); 273 | auto dst_be = dst.get(); 274 | 275 | // Copy value/quotient over 276 | dst_be.move_from(src_be); 277 | } 278 | 279 | // move last element to the front 280 | auto from_p = sctx.at(from_loc); 281 | from_p.set(std::move(tmp)); 282 | return from_loc; 283 | } 284 | 285 | inline uint64_t local_id_to_global_id(uint64_t initial_address, uint64_t local_id) { 286 | local_id <<= size_mgr.capacity_log2(); 287 | local_id |= initial_address; 288 | return local_id; 289 | } 290 | 291 | entry_t lookup_id(uint64_t id) { 292 | uint64_t local_id = id >> size_mgr.capacity_log2(); 293 | uint64_t initial_address = id & ((1ull << size_mgr.capacity_log2()) - 1); 294 | 295 | auto group = search_existing_group(initial_address); 296 | auto position = size_mgr.mod_add(group.group_start, local_id); 297 | 298 | auto sctx = storage.context(table_size, widths); 299 | auto sparse_entry = sctx.at(sctx.table_pos(position)); 300 | 301 | return entry_t::found_exist(id, sparse_entry); 302 | } 303 | 304 | entry_t lookup_insert(uint64_t initial_address, 305 | uint64_t stored_quotient) 306 | { 307 | auto sctx = storage.context(table_size, widths); 308 | auto ia_pos = sctx.table_pos(initial_address); 309 | 310 | // cases: 311 | // - initial address empty. 312 | // - initial address occupied, there is an element for this key 313 | // (v[initial address] = 1). 314 | // - initial address occupied, there is no element for this key 315 | // (v[initial address] = 0). 316 | 317 | if (sctx.pos_is_empty(ia_pos)) { 318 | // check if we can insert directly 319 | 320 | auto location = sctx.allocate_pos(ia_pos); 321 | location.set_quotient(stored_quotient); 322 | 323 | // we created a new group, so update the bitflags 324 | set_cv(initial_address, 0b11); 325 | 326 | uint64_t global_id = local_id_to_global_id(initial_address, 0); 327 | return entry_t::found_new(global_id, location); 328 | } else { 329 | // check if there already is a group for this key 330 | bool const group_exists = get_v(initial_address); 331 | 332 | if (group_exists) { 333 | auto const group = search_existing_group(initial_address); 334 | 335 | // check if element already exists 336 | auto r = search_in_group(group, stored_quotient); 337 | 338 | if (r.found()) { 339 | // There is a value for this key already. 
340 | DCHECK_EQ(r.ptr().get_quotient(), stored_quotient); 341 | 342 | uint64_t global_id = local_id_to_global_id( 343 | initial_address, r.id()); 344 | return entry_t::found_exist(global_id, r.ptr()); 345 | } else { 346 | // Insert a new value 347 | auto p = insert_value_after_group(group, stored_quotient); 348 | p.set_quotient(stored_quotient); 349 | 350 | uint64_t in_group_offset = size_mgr.mod_sub( 351 | group.group_end, group.group_start); 352 | uint64_t global_id = local_id_to_global_id( 353 | initial_address, in_group_offset); 354 | return entry_t::found_new(global_id, p); 355 | } 356 | } else { 357 | // insert a new group 358 | 359 | // pretend we already inserted the new group 360 | // this makes table_insert_value_after_group() find the group 361 | // at the location _before_ the new group 362 | set_v(initial_address, true); 363 | auto const group = search_existing_group(initial_address); 364 | 365 | // insert the element after the found group 366 | auto p = insert_value_after_group(group, stored_quotient); 367 | p.set_quotient(stored_quotient); 368 | 369 | // mark the inserted element as the start of a new group, 370 | // thus fixing-up the v <-> c mapping 371 | set_c(group.group_end, true); 372 | 373 | uint64_t global_id = local_id_to_global_id( 374 | initial_address, 0); 375 | return entry_t::found_new(global_id, p); 376 | } 377 | } 378 | } 379 | 380 | template 381 | inline void for_all_allocated(F f) { 382 | auto sctx = storage.context(table_size, widths); 383 | 384 | // first, skip forward to the first empty location 385 | // so that iteration can start at the beginning of the first complete group 386 | 387 | size_t i = 0; 388 | for(;;i++) { 389 | if (sctx.pos_is_empty(sctx.table_pos(i))) { 390 | break; 391 | } 392 | } 393 | 394 | // Remember our startpoint so that we can recognize it when 395 | // we wrapped around back to it 396 | size_t const original_start = i; 397 | 398 | // We proceed to the next position so that we can iterate until 399 | // we reach `original_start` again. 
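            // Note on the invariant this iteration relies on (sketch): the
            // bitvectors maintained via set_c()/set_v() encode
            //   v[p] == 1  iff at least one stored element has initial address p,
            //   c[q] == 1  iff the element in slot q is the first of its group,
            // and groups are stored in the same relative order as their initial
            // addresses. So whenever a c-bit is encountered below, the matching
            // initial address is the next position with a set v-bit.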
400 | uint64_t initial_address = i; 401 | i = size_mgr.mod_add(i); 402 | 403 | while(true) { 404 | auto sctx = storage.context(table_size, widths); 405 | while (sctx.pos_is_empty(sctx.table_pos(i))) { 406 | if (i == original_start) { 407 | return; 408 | } 409 | 410 | initial_address = i; 411 | i = size_mgr.mod_add(i); 412 | } 413 | 414 | // If start of group, find next v bit to find initial address 415 | if (get_c(i)) { 416 | initial_address = size_mgr.mod_add(initial_address); 417 | while(!get_v(initial_address)) { 418 | initial_address = size_mgr.mod_add(initial_address); 419 | } 420 | } 421 | 422 | f(initial_address, i); 423 | 424 | i = size_mgr.mod_add(i); 425 | } 426 | } 427 | 428 | void print_all() { 429 | auto sctx = storage.context(table_size, widths); 430 | std::cout << "/////////////////\n"; 431 | for(size_t i = 0; i < table_size; i++) { 432 | auto p = sctx.table_pos(i); 433 | if(sctx.pos_is_empty(p)) { 434 | std::cout << "-- -\n"; 435 | } else { 436 | std::cout << int(get_c(i)) << int(get_v(i)) << " #\n"; 437 | } 438 | } 439 | std::cout << "/////////////////\n"; 440 | } 441 | 442 | template 443 | inline void drain_all(F f) { 444 | table_pos_t drain_start; 445 | bool first = true; 446 | 447 | for_all_allocated([&](auto initial_address, auto i) { 448 | auto sctx = storage.context(table_size, widths); 449 | auto p = sctx.table_pos(i); 450 | 451 | if (first) { 452 | first = false; 453 | drain_start = p; 454 | } 455 | 456 | sctx.trim_storage(&drain_start, p); 457 | f(initial_address, sctx.at(p)); 458 | }); 459 | } 460 | 461 | inline entry_t search(uint64_t initial_address, uint64_t stored_quotient) { 462 | //std::cout << "search on cv(ia="< 478 | inline auto context(storage_t& storage, 479 | size_t table_size, 480 | typename storage_t::satellite_t_export::entry_bit_width_t const& widths, 481 | size_mgr_t const& size_mgr) { 482 | return context_t { 483 | m_cv, table_size, widths, size_mgr, storage 484 | }; 485 | } 486 | }; 487 | 488 | } 489 | 490 | template<> 491 | struct heap_size { 492 | using T = compact_hash::cv_bvs_t; 493 | 494 | static object_size_t compute(T const& val, size_t table_size) { 495 | DCHECK_EQ(val.m_cv.size(), table_size); 496 | auto size = val.m_cv.stat_allocation_size_in_bytes(); 497 | 498 | return object_size_t::exact(size); 499 | } 500 | }; 501 | 502 | template<> 503 | struct serialize { 504 | using T = compact_hash::cv_bvs_t; 505 | 506 | static object_size_t write(std::ostream& out, T const& val, 507 | size_t table_size) { 508 | DCHECK_EQ(val.m_cv.size(), table_size); 509 | auto data = (char const*) val.m_cv.data(); 510 | auto size = val.m_cv.stat_allocation_size_in_bytes(); 511 | 512 | out.write(data, size); 513 | 514 | return object_size_t::exact(size); 515 | } 516 | 517 | static T read(std::istream& in, 518 | size_t table_size) { 519 | auto cv = IntVector>(); 520 | cv.reserve(table_size); 521 | cv.resize(table_size); 522 | auto data = (char*) cv.data(); 523 | auto size = cv.stat_allocation_size_in_bytes(); 524 | 525 | in.read(data, size); 526 | 527 | return T { 528 | std::move(cv) 529 | }; 530 | } 531 | 532 | static bool equal_check(T const& lhs, T const& rhs, size_t table_size) { 533 | return gen_equal_diagnostic(lhs.m_cv == rhs.m_cv); 534 | } 535 | }; 536 | 537 | } 538 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/index_structure/displacement_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 
#include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "../entry_t.hpp" 13 | 14 | #include 15 | 16 | namespace tdc {namespace compact_hash { 17 | 18 | template 19 | class displacement_t { 20 | template 21 | friend struct ::tdc::serialize; 22 | 23 | template 24 | friend struct ::tdc::heap_size; 25 | 26 | displacement_table_t m_displace; 27 | 28 | displacement_t(displacement_table_t&& table): 29 | m_displace(std::move(table)) {} 30 | 31 | public: 32 | displacement_table_t& displacement_table() { return m_displace; } 33 | /// runtime initilization arguments, if any 34 | struct config_args { 35 | typename displacement_table_t::config_args table_config; 36 | }; 37 | 38 | /// get the config of this instance 39 | inline config_args current_config() const { 40 | return config_args { m_displace.current_config() }; 41 | } 42 | 43 | inline displacement_t(size_t table_size, config_args config): 44 | m_displace(table_size, config.table_config) {} 45 | 46 | template 47 | struct context_t { 48 | using satellite_t = typename storage_t::satellite_t_export; 49 | using entry_width_t = typename satellite_t::entry_bit_width_t; 50 | using entry_t = generic_entry_t; 51 | using table_pos_t = typename storage_t::table_pos_t; 52 | 53 | displacement_table_t& m_displace; 54 | size_t const table_size; 55 | entry_width_t widths; 56 | size_mgr_t const& size_mgr; 57 | storage_t& storage; 58 | 59 | entry_t lookup_id(uint64_t id) { 60 | uint64_t position = id; 61 | 62 | auto sctx = storage.context(table_size, widths); 63 | auto sparse_entry = sctx.at(sctx.table_pos(position)); 64 | 65 | return entry_t::found_exist(id, sparse_entry); 66 | } 67 | 68 | entry_t lookup_insert(uint64_t initial_address, 69 | uint64_t stored_quotient) 70 | { 71 | auto sctx = storage.context(table_size, widths); 72 | 73 | auto cursor = initial_address; 74 | while(true) { 75 | auto pos = sctx.table_pos(cursor); 76 | 77 | if (sctx.pos_is_empty(pos)) { 78 | auto ptrs = sctx.allocate_pos(pos); 79 | m_displace.set(cursor, size_mgr.mod_sub(cursor, initial_address)); 80 | ptrs.set_quotient(stored_quotient); 81 | return entry_t::found_new(cursor, ptrs); 82 | } 83 | 84 | if(m_displace.get(cursor) == size_mgr.mod_sub(cursor, initial_address)) { 85 | auto ptrs = sctx.at(pos); 86 | if (ptrs.get_quotient() == stored_quotient) { 87 | return entry_t::found_exist(cursor, ptrs); 88 | } 89 | } 90 | 91 | cursor = size_mgr.mod_add(cursor); 92 | DCHECK_NE(cursor, initial_address); 93 | } 94 | 95 | DCHECK(false) << "unreachable"; 96 | return entry_t::not_found(); 97 | } 98 | 99 | template 100 | inline void for_all_allocated(F f) { 101 | auto sctx = storage.context(table_size, widths); 102 | 103 | // first, skip forward to the first empty location 104 | // so that iteration can start at the beginning of the first complete group 105 | 106 | size_t i = 0; 107 | for(;;i++) { 108 | if (sctx.pos_is_empty(sctx.table_pos(i))) { 109 | break; 110 | } 111 | } 112 | 113 | // Remember our startpoint so that we can recognize it when 114 | // we wrapped around back to it 115 | size_t const original_start = i; 116 | 117 | // We proceed to the next position so that we can iterate until 118 | // we reach `original_start` again. 
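            // Note (sketch): in this displacement-based variant every occupied
            // slot i stores its distance to the element's initial address in
            // m_displace, so the initial address can be recovered directly as
            //     initial_address = size_mgr.mod_sub(i, m_displace.get(i));
            // which is what the loop body below does, and what lookup_insert()
            // and search() compare against while probing forward from the
            // initial address.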
119 | i = size_mgr.mod_add(i); 120 | 121 | while(true) { 122 | auto sctx = storage.context(table_size, widths); 123 | while (sctx.pos_is_empty(sctx.table_pos(i))) { 124 | if (i == original_start) { 125 | return; 126 | } 127 | 128 | i = size_mgr.mod_add(i); 129 | } 130 | 131 | auto disp = m_displace.get(i); 132 | uint64_t initial_address = size_mgr.mod_sub(i, disp); 133 | 134 | f(initial_address, i); 135 | 136 | i = size_mgr.mod_add(i); 137 | } 138 | } 139 | 140 | template 141 | inline void drain_all(F f) { 142 | table_pos_t drain_start; 143 | bool first = true; 144 | 145 | for_all_allocated([&](auto initial_address, auto i) { 146 | auto sctx = storage.context(table_size, widths); 147 | auto p = sctx.table_pos(i); 148 | 149 | if (first) { 150 | first = false; 151 | drain_start = p; 152 | } 153 | 154 | sctx.trim_storage(&drain_start, p); 155 | f(initial_address, sctx.at(p)); 156 | }); 157 | } 158 | 159 | inline entry_t search(uint64_t const initial_address, 160 | uint64_t stored_quotient) { 161 | auto sctx = storage.context(table_size, widths); 162 | auto cursor = initial_address; 163 | while(true) { 164 | auto pos = sctx.table_pos(cursor); 165 | 166 | if (sctx.pos_is_empty(pos)) { 167 | return entry_t::not_found(); 168 | } 169 | 170 | if(m_displace.get(cursor) == size_mgr.mod_sub(cursor, initial_address)) { 171 | auto ptrs = sctx.at(pos); 172 | if (ptrs.get_quotient() == stored_quotient) { 173 | return entry_t::found_exist(cursor, ptrs); 174 | } 175 | } 176 | 177 | cursor = size_mgr.mod_add(cursor); 178 | DCHECK_NE(cursor, initial_address); 179 | } 180 | 181 | DCHECK(false) << "unreachable"; 182 | return entry_t::not_found(); 183 | } 184 | }; 185 | template 186 | inline auto context(storage_t& storage, 187 | size_t table_size, 188 | typename storage_t::satellite_t_export::entry_bit_width_t const& widths, 189 | size_mgr_t const& size_mgr) { 190 | return context_t { 191 | m_displace, table_size, widths, size_mgr, storage 192 | }; 193 | } 194 | }; 195 | 196 | } 197 | 198 | template 199 | struct heap_size> { 200 | using T = compact_hash::displacement_t; 201 | 202 | static object_size_t compute(T const& val, size_t table_size) { 203 | return heap_size::compute(val.m_displace, table_size); 204 | } 205 | }; 206 | 207 | template 208 | struct serialize> { 209 | using T = compact_hash::displacement_t; 210 | 211 | static object_size_t write(std::ostream& out, T const& val, size_t table_size) { 212 | return serialize::write(out, val.m_displace, table_size); 213 | } 214 | 215 | static T read(std::istream& in, size_t table_size) { 216 | auto displace = 217 | serialize::read(in, table_size); 218 | 219 | return T { 220 | std::move(displace) 221 | }; 222 | } 223 | static bool equal_check(T const& lhs, T const& rhs, size_t table_size) { 224 | return gen_equal_check(m_displace, table_size); 225 | } 226 | }; 227 | 228 | } 229 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/index_structure/layered_displacement_table_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | namespace tdc {namespace compact_hash { 16 | 17 | template 18 | struct static_layered_bit_width_t { 19 | using elem_t = uint_t; 20 | 21 | /// runtime initilization arguments, if any 22 | struct config_args {}; 23 | 24 | /// get the config of this instance 25 | inline config_args 
current_config() const { return config_args{}; } 26 | 27 | static_layered_bit_width_t() = default; 28 | static_layered_bit_width_t(config_args config) {} 29 | 30 | inline void set_width(IntVector& iv) const {} 31 | inline uint64_t max() const { return std::numeric_limits::max(); } 32 | }; 33 | 34 | struct dynamic_layered_bit_width_t { 35 | using elem_t = dynamic_t; 36 | 37 | size_t m_width; 38 | 39 | /// runtime initilization arguments, if any 40 | struct config_args { size_t width = 4; }; 41 | 42 | /// get the config of this instance 43 | inline config_args current_config() const { return config_args{ m_width }; } 44 | 45 | dynamic_layered_bit_width_t() = default; 46 | dynamic_layered_bit_width_t(config_args config): m_width(config.width) {} 47 | 48 | inline void set_width(IntVector& iv) const { 49 | iv.width(m_width); 50 | } 51 | inline uint64_t max() const { return (1ull << m_width) - 1; } 52 | }; 53 | 54 | /// Stores displacement entries as integers with a bit width given by 55 | /// `bit_width_t`. Displacement value larger than that 56 | /// will be spilled into a `std::unordered_map`. 57 | template 58 | class layered_displacement_table_t { 59 | template 60 | friend struct ::tdc::serialize; 61 | 62 | template 63 | friend struct ::tdc::heap_size; 64 | 65 | using elem_t = typename bit_width_t::elem_t; 66 | using elem_val_t = typename IntVector::value_type; 67 | 68 | IntVector m_displace; 69 | std::unordered_map m_spill; 70 | bit_width_t m_bit_width; 71 | 72 | layered_displacement_table_t() = default; 73 | public: 74 | /// runtime initilization arguments, if any 75 | struct config_args { 76 | typename bit_width_t::config_args bit_width_config; 77 | }; 78 | 79 | /// get the config of this instance 80 | inline config_args current_config() const { 81 | return config_args{ m_bit_width.current_config() }; 82 | } 83 | 84 | inline layered_displacement_table_t(size_t table_size, 85 | config_args config): 86 | m_bit_width(config.bit_width_config) 87 | { 88 | m_bit_width.set_width(m_displace); 89 | m_displace.reserve(table_size); 90 | m_displace.resize(table_size); 91 | } 92 | inline size_t get(size_t pos) { 93 | size_t max = m_bit_width.max(); 94 | size_t tmp = elem_val_t(m_displace[pos]); 95 | if (tmp == max) { 96 | return m_spill[pos]; 97 | } else { 98 | return tmp; 99 | } 100 | } 101 | inline void set(size_t pos, size_t val) { 102 | size_t max = m_bit_width.max(); 103 | if (val >= max) { 104 | m_displace[pos] = max; 105 | m_spill[pos] = val; 106 | } else { 107 | m_displace[pos] = val; 108 | } 109 | } 110 | }; 111 | 112 | } 113 | 114 | template 115 | struct heap_size> { 116 | using T = compact_hash::layered_displacement_table_t; 117 | 118 | static object_size_t compute(T const& val, size_t table_size) { 119 | auto bytes = object_size_t::empty(); 120 | 121 | DCHECK_EQ(val.m_displace.size(), table_size); 122 | auto size = val.m_displace.stat_allocation_size_in_bytes(); 123 | bytes += object_size_t::exact(size); 124 | bytes += heap_size_compute(val.m_bit_width); 125 | 126 | size_t unordered_map_size_guess 127 | = sizeof(decltype(val.m_spill)) 128 | + val.m_spill.size() * sizeof(size_t) * 2; 129 | 130 | bytes += object_size_t::unknown_extra_data(unordered_map_size_guess); 131 | 132 | return bytes; 133 | } 134 | }; 135 | 136 | template 137 | struct serialize> { 138 | using T = compact_hash::layered_displacement_table_t; 139 | 140 | static object_size_t write(std::ostream& out, T const& val, size_t table_size) { 141 | auto bytes = object_size_t::empty(); 142 | 143 | DCHECK_EQ(val.m_displace.size(), 
table_size); 144 | 145 | bytes += serialize_write(out, val.m_bit_width); 146 | 147 | auto data = (char const*) val.m_displace.data(); 148 | auto size = val.m_displace.stat_allocation_size_in_bytes(); 149 | out.write(data, size); 150 | bytes += object_size_t::exact(size); 151 | 152 | size_t spill_size = val.m_spill.size(); 153 | out.write((char*) &spill_size, sizeof(size_t)); 154 | bytes += object_size_t::exact(sizeof(size_t)); 155 | 156 | for (auto pair : val.m_spill) { 157 | size_t k = pair.first; 158 | size_t v = pair.second; 159 | out.write((char*) &k, sizeof(size_t)); 160 | out.write((char*) &v, sizeof(size_t)); 161 | bytes += object_size_t::exact(sizeof(size_t) * 2); 162 | spill_size--; 163 | } 164 | 165 | DCHECK_EQ(spill_size, 0U); 166 | 167 | return bytes; 168 | } 169 | 170 | static T read(std::istream& in, size_t table_size) { 171 | T ret; 172 | serialize_read_into(in, ret.m_bit_width); 173 | ret.m_bit_width.set_width(ret.m_displace); 174 | ret.m_displace.reserve(table_size); 175 | ret.m_displace.resize(table_size); 176 | auto data = (char*) ret.m_displace.data(); 177 | auto size = ret.m_displace.stat_allocation_size_in_bytes(); 178 | in.read(data, size); 179 | 180 | auto& spill = ret.m_spill; 181 | size_t spill_size; 182 | in.read((char*) &spill_size, sizeof(size_t)); 183 | 184 | for (size_t i = 0; i < spill_size; i++) { 185 | size_t k; 186 | size_t v; 187 | in.read((char*) &k, sizeof(size_t)); 188 | in.read((char*) &v, sizeof(size_t)); 189 | 190 | spill[k] = v; 191 | } 192 | 193 | return ret; 194 | } 195 | 196 | static bool equal_check(T const& lhs, T const& rhs, size_t table_size) { 197 | return gen_equal_diagnostic(lhs.m_displace == rhs.m_displace) 198 | && gen_equal_diagnostic(lhs.m_spill == rhs.m_spill) 199 | && gen_equal_check(m_bit_width); 200 | } 201 | }; 202 | 203 | template 204 | struct heap_size> { 205 | using T = compact_hash::static_layered_bit_width_t; 206 | 207 | static object_size_t compute(T const& val) { 208 | return object_size_t::empty(); 209 | } 210 | }; 211 | 212 | template 213 | struct serialize> { 214 | using T = compact_hash::static_layered_bit_width_t; 215 | 216 | static object_size_t write(std::ostream& out, T const& val) { 217 | return object_size_t::empty(); 218 | } 219 | 220 | static T read(std::istream& in) { 221 | return T(); 222 | } 223 | 224 | static bool equal_check(T const& lhs, T const& rhs) { 225 | return true; 226 | } 227 | }; 228 | 229 | template<> 230 | struct heap_size { 231 | using T = compact_hash::dynamic_layered_bit_width_t; 232 | 233 | static object_size_t compute(T const& val) { 234 | return object_size_t::exact(sizeof(T)); 235 | } 236 | }; 237 | 238 | template<> 239 | struct serialize { 240 | using T = compact_hash::dynamic_layered_bit_width_t; 241 | 242 | static object_size_t write(std::ostream& out, T const& val) { 243 | auto bytes = object_size_t::empty(); 244 | bytes += serialize_write(out, val.m_width); 245 | return bytes; 246 | } 247 | 248 | static T read(std::istream& in) { 249 | T ret; 250 | serialize_read_into(in, ret.m_width); 251 | return ret; 252 | } 253 | 254 | static bool equal_check(T const& lhs, T const& rhs) { 255 | return gen_equal_check(m_width); 256 | } 257 | }; 258 | 259 | 260 | } 261 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/index_structure/naive_displacement_table_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 
#include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | namespace tdc {namespace compact_hash { 16 | 17 | /// Stores displacement entries as `size_t` integers. 18 | struct naive_displacement_table_t { 19 | template 20 | friend struct ::tdc::serialize; 21 | 22 | /// runtime initilization arguments, if any 23 | struct config_args {}; 24 | 25 | /// get the config of this instance 26 | inline config_args current_config() const { return config_args{}; } 27 | 28 | std::vector m_displace; 29 | inline naive_displacement_table_t(size_t table_size, 30 | config_args config) { 31 | m_displace.reserve(table_size); 32 | m_displace.resize(table_size); 33 | } 34 | inline size_t get(size_t pos) const { 35 | return m_displace[pos]; 36 | } 37 | inline void set(size_t pos, size_t val) { 38 | m_displace[pos] = val; 39 | } 40 | }; 41 | 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/map/satellite_data_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "val_quot_ptrs_t.hpp" 6 | #include "val_quot_bucket_layout_t.hpp" 7 | 8 | namespace tdc {namespace compact_hash{namespace map { 9 | 10 | template 11 | struct satellite_data_t { 12 | private: 13 | using qvd_t = val_quot_bucket_layout_t; 14 | using widths_t = typename qvd_t::QVWidths; 15 | public: 16 | static constexpr bool has_sentinel = true; 17 | using entry_ptr_t = val_quot_ptrs_t; 18 | using entry_bit_width_t = widths_t; 19 | 20 | using bucket_data_layout_t = qvd_t; 21 | 22 | using sentinel_value_type = typename cbp::cbp_repr_t::value_type; 23 | }; 24 | 25 | }}} 26 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/map/typedefs.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace tdc {namespace compact_hash {namespace map { 14 | 15 | template 16 | using plain_cv_hashmap_t 17 | = hashmap_t; 18 | 19 | template 20 | using sparse_cv_hashmap_t 21 | = hashmap_t; 22 | 23 | template 24 | using plain_layered_hashmap_t 25 | = hashmap_t< 26 | val_t, hash_t, plain_sentinel_t, 27 | displacement_t>>; 28 | 29 | template 30 | using sparse_layered_hashmap_t 31 | = hashmap_t< 32 | val_t, hash_t, buckets_bv_t, 33 | displacement_t>>; 34 | 35 | template 36 | using plain_elias_hashmap_t 37 | = hashmap_t< 38 | val_t, hash_t, plain_sentinel_t, 39 | displacement_t>>; 41 | 42 | template 43 | using sparse_elias_hashmap_t 44 | = hashmap_t< 45 | val_t, hash_t, buckets_bv_t, 46 | displacement_t>>; 48 | 49 | }}} 50 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/map/val_quot_bucket_layout_t.hpp: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include "val_quot_ptrs_t.hpp" 12 | 13 | namespace tdc {namespace compact_hash{namespace map { 14 | 15 | template 16 | struct val_quot_bucket_layout_t { 17 | struct QVWidths { 18 | uint8_t quot_width; 19 | uint8_t val_width; 20 | }; 21 | 22 | /// Calculates the offsets of the two different arrays inside the allocation. 
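    ///
    /// Rough sketch (added for illustration): both arrays live bit-packed in a
    /// single uint64_t allocation, values first, quotients second. For example,
    /// with size = 64, val_width = 8 and quot_width = 5 the values occupy
    /// 64 * 8 = 512 bits and the quotients 64 * 5 = 320 bits, i.e. roughly 13
    /// 64-bit words overall; the exact word count and any per-array alignment
    /// are determined by cbp::bit_layout_t in calc_sizes() below.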
23 | struct Layout { 24 | cbp::cbp_layout_element_t vals_layout; 25 | cbp::cbp_layout_element_t quots_layout; 26 | size_t overall_qword_size; 27 | 28 | inline Layout(): vals_layout(), quots_layout(), overall_qword_size(0) { 29 | } 30 | }; 31 | inline static Layout calc_sizes(size_t size, QVWidths widths) { 32 | DCHECK_NE(size, 0U); 33 | DCHECK_LE(alignof(val_t), alignof(uint64_t)); 34 | 35 | auto layout = cbp::bit_layout_t(); 36 | 37 | // The values 38 | auto values = layout.cbp_elements(size, widths.val_width); 39 | 40 | // The quotients 41 | auto quots = layout.cbp_elements(size, widths.quot_width); 42 | 43 | Layout r; 44 | r.vals_layout = values; 45 | r.quots_layout = quots; 46 | r.overall_qword_size = layout.get_size_in_uint64_t_units(); 47 | return r; 48 | } 49 | 50 | /// Creates the pointers to the beginnings of the two arrays inside 51 | /// the allocation. 52 | inline static val_quot_ptrs_t ptr(uint64_t* alloc, size_t size, QVWidths widths) { 53 | DCHECK_NE(size, 0U); 54 | auto layout = calc_sizes(size, widths); 55 | 56 | return val_quot_ptrs_t { 57 | layout.vals_layout.ptr_relative_to(alloc), 58 | layout.quots_layout.ptr_relative_to(alloc), 59 | }; 60 | } 61 | 62 | // Run destructors of each element in the bucket. 63 | inline static void destroy_vals(uint64_t* alloc, size_t size, QVWidths widths) { 64 | if (size != 0) { 65 | auto start = ptr(alloc, size, widths).val_ptr(); 66 | auto end = start + size; 67 | 68 | for(; start != end; start++) { 69 | cbp::cbp_repr_t::call_destructor(start); 70 | } 71 | } 72 | } 73 | 74 | /// Returns a `val_quot_ptrs_t` to position `pos`, 75 | /// or a sentinel value that acts as a one-pass-the-end pointer for the empty case. 76 | inline static val_quot_ptrs_t at(uint64_t* alloc, size_t size, size_t pos, QVWidths widths) { 77 | if(size != 0) { 78 | auto ps = ptr(alloc, size, widths); 79 | return val_quot_ptrs_t(ps.val_ptr() + pos, ps.quot_ptr() + pos); 80 | } else { 81 | DCHECK_EQ(pos, 0U); 82 | return val_quot_ptrs_t(); 83 | } 84 | } 85 | }; 86 | 87 | }}} 88 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/map/val_quot_ptrs_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace tdc {namespace compact_hash{namespace map { 12 | 13 | /// Represents a pair of pointers to value and quotient inside a bucket. 
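///
/// Added note: because buckets deliberately leave their payload uninitialized
/// (see the warning in bucket_t), `init_from()` and the `*_no_drop()` members
/// placement-construct a value into raw storage via cbp::cbp_repr_t, whereas
/// `set()` / `set_val()` / `move_from()` assign over an already constructed
/// value, and `uninitialize()` runs the destructor explicitly.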
14 | template 15 | class val_quot_ptrs_t { 16 | ValPtr m_val_ptr; 17 | mutable QuotPtr m_quot_ptr; 18 | 19 | public: 20 | using value_type = typename cbp::cbp_repr_t::value_type; 21 | 22 | struct my_value_type { 23 | uint64_t quot; 24 | value_type val; 25 | }; 26 | 27 | inline val_quot_ptrs_t(ValPtr val_ptr, 28 | QuotPtr quot_ptr): 29 | m_val_ptr(val_ptr), 30 | m_quot_ptr(quot_ptr) 31 | { 32 | } 33 | 34 | inline val_quot_ptrs_t(): 35 | m_val_ptr(), m_quot_ptr() {} 36 | 37 | inline uint64_t get_quotient() const { 38 | return uint64_t(*m_quot_ptr); 39 | } 40 | 41 | inline void set_quotient(uint64_t v) const { 42 | *m_quot_ptr = v; 43 | } 44 | 45 | inline void swap_quotient(uint64_t& other) const { 46 | uint64_t tmp = uint64_t(*m_quot_ptr); 47 | std::swap(other, tmp); 48 | *m_quot_ptr = tmp; 49 | } 50 | 51 | inline ValPtr val_ptr() const { 52 | return m_val_ptr; 53 | } 54 | 55 | inline QuotPtr quot_ptr() const { 56 | return m_quot_ptr; 57 | } 58 | 59 | inline void increment_ptr() { 60 | m_quot_ptr++; 61 | m_val_ptr++; 62 | } 63 | inline void decrement_ptr() { 64 | m_quot_ptr--; 65 | m_val_ptr--; 66 | } 67 | 68 | inline friend bool operator==(val_quot_ptrs_t const& lhs, 69 | val_quot_ptrs_t const& rhs) 70 | { 71 | return lhs.m_val_ptr == rhs.m_val_ptr; 72 | } 73 | 74 | inline friend bool operator!=(val_quot_ptrs_t const& lhs, 75 | val_quot_ptrs_t const& rhs) 76 | { 77 | return lhs.m_val_ptr != rhs.m_val_ptr; 78 | } 79 | 80 | inline void set(value_type&& val, 81 | uint64_t quot) { 82 | set_quotient(quot); 83 | *val_ptr() = std::move(val); 84 | } 85 | 86 | inline void set_no_drop(value_type&& val, 87 | uint64_t quot) { 88 | set_quotient(quot); 89 | cbp::cbp_repr_t::construct_val_from_rval(val_ptr(), std::move(val)); 90 | } 91 | 92 | inline void set_val(value_type&& val) { 93 | *val_ptr() = std::move(val); 94 | } 95 | 96 | inline void set_val_no_drop(value_type&& val) { 97 | cbp::cbp_repr_t::construct_val_from_rval(val_ptr(), std::move(val)); 98 | } 99 | 100 | inline void move_from(val_quot_ptrs_t other) { 101 | *val_ptr() = std::move(*other.val_ptr()); 102 | set_quotient(other.get_quotient()); 103 | } 104 | 105 | inline void init_from(val_quot_ptrs_t other) { 106 | cbp::cbp_repr_t::construct_val_from_ptr(val_ptr(), other.val_ptr()); 107 | set_quotient(other.get_quotient()); 108 | } 109 | 110 | inline void swap_with(val_quot_ptrs_t other) { 111 | value_type tmp_val = std::move(*val_ptr()); 112 | uint64_t tmp_quot = get_quotient(); 113 | 114 | move_from(other); 115 | other.set(std::move(tmp_val), tmp_quot); 116 | } 117 | 118 | inline void uninitialize() { 119 | cbp::cbp_repr_t::call_destructor(val_ptr()); 120 | } 121 | 122 | inline bool contents_eq(val_quot_ptrs_t rhs) const { 123 | return (get_quotient() == rhs.get_quotient()) && (*val_ptr() == *rhs.val_ptr()); 124 | } 125 | 126 | inline my_value_type move_out() const { 127 | return my_value_type { 128 | get_quotient(), 129 | std::move(*val_ptr()), 130 | }; 131 | } 132 | 133 | inline void set(my_value_type&& val) { 134 | set_quotient(val.quot); 135 | *val_ptr() = std::move(val.val); 136 | } 137 | }; 138 | 139 | }}} 140 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/set/hashset_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | 12 | namespace tdc {namespace compact_hash {namespace set { 13 | 14 | template 15 | class hashset_t { 
16 | using storage_t = buckets_bv_t; 17 | using satellite_t = typename storage_t::satellite_t_export; 18 | public: 19 | /// runtime initilization arguments for the template config parameters 20 | struct config_args { 21 | typename size_manager_t::config_args size_manager_config; 22 | typename hash_t::config_args hash_config; 23 | typename storage_t::config_args storage_config; 24 | typename placement_t::config_args displacement_config; 25 | }; 26 | 27 | /// this is called during a resize to copy over internal config values 28 | inline config_args current_config() const { 29 | auto r = config_args{}; 30 | r.size_manager_config = m_sizing.current_config(); 31 | r.hash_config = m_hash.current_config(); 32 | r.storage_config = m_storage.current_config(); 33 | r.displacement_config = m_placement.current_config(); 34 | return r; 35 | } 36 | 37 | /// Default value of the `key_width` parameter of the constructor. 38 | static constexpr size_t DEFAULT_KEY_WIDTH = 1; 39 | static constexpr size_t DEFAULT_TABLE_SIZE = 0; 40 | 41 | inline hashset_t(hashset_t&& other): 42 | m_sizing(std::move(other.m_sizing)), 43 | m_key_width(std::move(other.m_key_width)), 44 | m_storage(std::move(other.m_storage)), 45 | m_placement(std::move(other.m_placement)), 46 | m_hash(std::move(other.m_hash)) 47 | { 48 | } 49 | inline hashset_t& operator=(hashset_t&& other) { 50 | m_sizing = std::move(other.m_sizing); 51 | m_key_width = std::move(other.m_key_width); 52 | m_storage = std::move(other.m_storage); 53 | m_placement = std::move(other.m_placement); 54 | m_hash = std::move(other.m_hash); 55 | 56 | return *this; 57 | } 58 | // NB: These just exist to catch bugs, and could be removed 59 | inline hashset_t(hashset_t const& other) = delete; 60 | inline hashset_t& operator=(hashset_t const& other) = delete; 61 | 62 | /// Constructs a hashtable with a initial table size `size`, 63 | /// and a initial key bit-width `key_width`. 64 | inline hashset_t(size_t size = DEFAULT_TABLE_SIZE, 65 | size_t key_width = DEFAULT_KEY_WIDTH, 66 | config_args config = config_args{}): 67 | m_sizing(size, config.size_manager_config), 68 | m_key_width(key_width), 69 | m_storage(table_size(), storage_widths(), config.storage_config), 70 | m_placement(table_size(), config.displacement_config), 71 | m_hash(real_width(), config.hash_config) 72 | { 73 | } 74 | 75 | /// Returns the amount of elements inside the datastructure. 76 | inline size_t size() const { 77 | return m_sizing.size(); 78 | } 79 | 80 | /// Returns the current size of the hashtable. 81 | /// This value is greater-or-equal the amount of the elements 82 | /// currently contained in it, which is represented by `size()`. 83 | inline size_t table_size() const { 84 | return m_sizing.capacity(); 85 | } 86 | 87 | /// Current width of the keys stored in this datastructure. 88 | inline size_t key_width() const { 89 | return m_key_width; 90 | } 91 | 92 | /// Amount of bits of the key, that are stored implicitly 93 | /// by its position in the table. 94 | inline size_t initial_address_width() const { 95 | return m_sizing.capacity_log2(); 96 | } 97 | 98 | /// Amount of bits of the key, that are stored explicitly 99 | /// in the buckets. 100 | inline size_t quotient_width() const { 101 | return real_width() - m_sizing.capacity_log2(); 102 | } 103 | 104 | /// Sets the maximum load factor 105 | /// (how full the table can get before re-allocating). 106 | /// 107 | /// Expects a value `0.0 < z < 1.0`. 
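    ///
    /// Example: with the default factor of 0.5, a table of capacity 1024 holds
    /// at most 512 elements; inserting the 513th element triggers a grow to
    /// capacity 2048 (see size_manager_t::needs_to_grow_capacity()).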
108 | inline void max_load_factor(float z) { 109 | m_sizing.max_load_factor(z); 110 | } 111 | 112 | /// Returns the maximum load factor. 113 | inline float max_load_factor() const noexcept { 114 | return m_sizing.max_load_factor(); 115 | } 116 | 117 | struct default_on_resize_t { 118 | /// Will be called in case of an resize. 119 | inline void on_resize(size_t table_size) {} 120 | /// Will be called after `on_resize()` for each element 121 | /// that gets re-inserted into the new set. 122 | inline void on_reinsert(uint64_t key, uint64_t id) {} 123 | }; 124 | 125 | using entry_t = generic_entry_t; 126 | 127 | /// Looks up the key `key` in the set, inserting it if 128 | /// it doesn't already exist. 129 | /// 130 | /// The returned `entry_t` contains both an _id_ that is unique for each 131 | /// element in the set for a given table size, 132 | /// and a boolean indicating if the key already exists. 133 | /// 134 | /// If the set needs to be resized, the observer `on_resize` will be 135 | /// used to notify the code about the changed size and new key-id mappings. 136 | template 137 | inline entry_t lookup_insert(uint64_t key, 138 | on_resize_t&& on_resize = on_resize_t()) { 139 | return lookup_insert_key_width(key, key_width(), on_resize); 140 | } 141 | 142 | /// Looks up the key `key` in the set, inserting it if 143 | /// it doesn't already exist, and grows the key width to `key_width` 144 | /// bits. 145 | /// 146 | /// The returned `entry_t` contains both an _id_ that is unique for each 147 | /// element in the set for a given table size, 148 | /// and a boolean indicating if the key already exists. 149 | /// 150 | /// If the set needs to be resized, the observer `on_resize` will be 151 | /// used to notify the code about the changed size and new key-id mappings. 152 | template 153 | inline entry_t lookup_insert_key_width(uint64_t key, 154 | uint8_t key_width, 155 | on_resize_t&& on_resize = on_resize_t()) { 156 | auto raw_key_width = std::max(key_width, this->key_width()); 157 | return grow_and_insert(key, raw_key_width, on_resize); 158 | } 159 | 160 | /// Grow the key width as needed. 161 | /// 162 | /// Note that it is more efficient to change the width directly during 163 | /// insertion of a new value. 164 | template 165 | inline void grow_key_width(size_t key_width, 166 | on_resize_t&& on_resize = on_resize_t()) { 167 | auto raw_key_width = std::max(key_width, this->key_width()); 168 | grow_if_needed(size(), raw_key_width, on_resize); 169 | } 170 | 171 | /// Search for a key inside the hashset. 172 | /// 173 | /// The returned `entry_t` contains a boolean indicating if the key was found. 174 | /// If it is, then it contains the corresponding _id_ of the entry. 175 | inline entry_t lookup(uint64_t key) { 176 | auto dkey = decompose_key(key); 177 | auto pctx = m_placement.context(m_storage, table_size(), storage_widths(), m_sizing); 178 | return pctx.search(dkey.initial_address, dkey.stored_quotient); 179 | } 180 | 181 | /// Takes an ID as returned by `entry_t::id()`, and returns the corresponding `entry_t`. 182 | /// 183 | /// The bavior is undefined if the id does not exist in the data structure, or after an 184 | /// intermediate rehash. 185 | inline entry_t lookup_id(uint64_t id) { 186 | auto pctx = m_placement.context(m_storage, table_size(), storage_widths(), m_sizing); 187 | auto result = pctx.lookup_id(id); 188 | 189 | return result; 190 | } 191 | 192 | /// Swap this instance of the data structure with another one. 
193 | inline void swap(hashset_t& other) { 194 | std::swap(*this, other); 195 | } 196 | 197 | /// Moves the contents of this hashtable 198 | /// into another table. 199 | /// 200 | /// This method tries to eagerly free memory in 201 | /// order to keep the total consumption low, if possible. 202 | /// 203 | /// The target hashtable will grow as needed. To prevent that, ensure its 204 | /// capacity and bit widths are already large enough. 205 | /// 206 | /// The `on_resize` handler will call `on_reinsert()` for 207 | /// each moved element. It will not be called for growth operations 208 | /// of the target hashtable. 209 | template 210 | inline void move_into(hashset_t& other, 211 | on_resize_t&& on_resize = on_resize_t()) { 212 | auto pctx = m_placement.context(m_storage, table_size(), storage_widths(), m_sizing); 213 | pctx.drain_all([&](auto initial_address, auto kv) { 214 | auto stored_quotient = kv.get_quotient(); 215 | auto key = this->compose_key(initial_address, stored_quotient); 216 | auto r = other.lookup_insert(key); 217 | DCHECK(r.found()); 218 | DCHECK(!r.key_already_exist()); 219 | on_resize.on_reinsert(key, r.id()); 220 | }); 221 | } 222 | 223 | /// Check wether for the `new_size` this hashtable would need 224 | /// to perform a grow of the capacity. 225 | inline bool needs_to_grow_capacity(size_t new_size) const { 226 | return m_sizing.needs_to_grow_capacity(m_sizing.capacity(), new_size); 227 | } 228 | 229 | /// Check wether for the `new_size` and `new_key_width` this 230 | /// hashtable would need to reallocate. 231 | inline bool needs_to_realloc(size_t new_size, 232 | size_t new_key_width) const { 233 | return needs_to_grow_capacity(new_size) 234 | || (new_key_width != key_width()); 235 | } 236 | 237 | /// Compute the new capacity the hashmap would have after a grow 238 | /// operation for `new_size`. 239 | inline size_t grown_capacity(size_t new_size) const { 240 | size_t new_capacity = m_sizing.capacity(); 241 | while (m_sizing.needs_to_grow_capacity(new_capacity, new_size)) { 242 | new_capacity = m_sizing.grown_capacity(new_capacity); 243 | } 244 | return new_capacity; 245 | } 246 | 247 | /// Pseudo-Pointer to a key. 248 | /// 249 | /// Does not actually point at a memory location, and defines equality 250 | /// based on the key value and wether this is in its `null` state. 251 | class pointer_type { 252 | uint64_t m_key; 253 | bool m_empty; 254 | public: 255 | pointer_type(uint64_t key): m_key(key), m_empty(false) {} 256 | pointer_type(): m_key(-1), m_empty(true) {} 257 | 258 | inline uint64_t& operator*() { 259 | return m_key; 260 | } 261 | inline uint64_t* operator->() { 262 | return &m_key; 263 | } 264 | inline bool operator==(pointer_type const& other) const { 265 | return (m_empty == other.m_empty) && (m_key == other.m_key); 266 | } 267 | inline bool operator!=(pointer_type const& other) const { 268 | return !(*this == other); 269 | } 270 | inline bool operator==(uint64_t const* const& other) const { 271 | return (other != nullptr) && (m_key == *other); 272 | } 273 | inline bool operator!=(uint64_t const* const& other) const { 274 | return !(*this == other); 275 | } 276 | }; 277 | 278 | /// Search for a key inside the hashtable. 279 | /// 280 | /// This returns a pseudo-pointer to the key if its found, or null 281 | /// otherwise. This method exists for STL-compability. 
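    ///
    /// Usage sketch (illustrative):
    ///
    ///     auto p = set.find(key);
    ///     if (p != decltype(set)::pointer_type()) { /* key is present */ }
    ///     // equivalently:
    ///     if (set.count(key) == 1) { /* key is present */ }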
282 | inline pointer_type find(uint64_t key) { 283 | if (count(key)) { 284 | return pointer_type(key); 285 | } else { 286 | return pointer_type(); 287 | } 288 | } 289 | 290 | /// Count the number of occurrences of `key`, as defined on STL containers. 291 | /// 292 | /// It will return either 0 or 1. 293 | inline size_t count(uint64_t key) { 294 | return lookup(key).found(); 295 | } 296 | 297 | private: 298 | using quot_width_t = typename satellite_t::entry_bit_width_t; 299 | 300 | /// Size of table, and width of the stored keys and values 301 | size_manager_t m_sizing; 302 | uint8_t m_key_width; 303 | 304 | /// Storage of the table elements 305 | storage_t m_storage; 306 | 307 | /// Placement management structures 308 | placement_t m_placement; 309 | 310 | /// Hash function 311 | hash_t m_hash {1}; 312 | 313 | template 314 | friend struct ::tdc::serialize; 315 | 316 | template 317 | friend struct ::tdc::heap_size; 318 | 319 | /// The actual amount of bits currently usable for 320 | /// storing a key in the hashtable. 321 | /// 322 | /// Due to implementation details, this can be 323 | /// larger than `key_width()`. 324 | /// 325 | /// Specifically, there are currently two cases: 326 | /// - If all bits of the the key fit into the initial-address space, 327 | /// then the quotient bitvector inside the buckets would 328 | /// have to store integers of width 0. This is undefined behavior 329 | /// with the current code, so we add a padding bit. 330 | /// - Otherwise the current maximum key width `m_key_width` 331 | /// determines the real width. 332 | inline size_t real_width() const { 333 | return std::max(m_sizing.capacity_log2() + 1, m_key_width); 334 | } 335 | 336 | inline quot_width_t storage_widths() const { 337 | return uint8_t(quotient_width()); 338 | } 339 | 340 | /// Debug check that a key does not occupy more bits than the 341 | /// hashtable currently allows. 342 | inline bool dcheck_key_width(uint64_t key) { 343 | uint64_t key_mask = (1ull << (key_width() - 1ull) << 1ull) - 1ull; 344 | bool key_is_too_large = key & ~key_mask; 345 | return !key_is_too_large; 346 | } 347 | 348 | /// Decompose a key into its initial address and quotient. 349 | inline decomposed_key_t decompose_key(uint64_t key) { 350 | DCHECK(dcheck_key_width(key)) << "Attempt to decompose key " << key << ", which requires more than the current set maximum of " << key_width() << " bits, but should not."; 351 | 352 | uint64_t hres = m_hash.hash(key); 353 | 354 | DCHECK_EQ(m_hash.hash_inv(hres), key); 355 | 356 | return m_sizing.decompose_hashed_value(hres); 357 | } 358 | 359 | /// Compose a key from its initial address and quotient. 360 | inline uint64_t compose_key(uint64_t initial_address, uint64_t quotient) { 361 | uint64_t harg = m_sizing.compose_hashed_value(initial_address, quotient); 362 | uint64_t key = m_hash.hash_inv(harg); 363 | 364 | DCHECK(dcheck_key_width(key)) << "Composed key " << key << ", which requires more than the current set maximum of " << key_width() << " bits, but should not."; 365 | return key; 366 | } 367 | 368 | /// Access the element represented by `handler` under 369 | /// the key `key` with the, possibly new, width of `key_width` bits. 370 | /// 371 | /// `handler` is a type that allows reacting correctly to different ways 372 | /// to access or create a new or existing value in the hashtable. 373 | /// See `InsertHandler` and `AddressDefaultHandler` below. 
374 | template 375 | inline auto grow_and_insert(uint64_t key, size_t key_width, on_resize_t& onr) { 376 | grow_if_needed(this->size() + 1, key_width, onr); 377 | auto const dkey = this->decompose_key(key); 378 | 379 | DCHECK_EQ(key, this->compose_key(dkey.initial_address, dkey.stored_quotient)); 380 | 381 | auto pctx = m_placement.context(m_storage, table_size(), storage_widths(), m_sizing); 382 | 383 | auto result = pctx.lookup_insert(dkey.initial_address, dkey.stored_quotient); 384 | 385 | if (!result.key_already_exist()) { 386 | m_sizing.set_size(m_sizing.size() + 1); 387 | } 388 | 389 | return result; 390 | } 391 | 392 | /// Check the current key width and table site against the arguments, 393 | /// and grows the table or quotient bitvectors as needed. 394 | template 395 | inline void grow_if_needed(size_t const new_size, 396 | size_t const new_key_width, 397 | on_resize_t& onr) { 398 | /* 399 | std::cout 400 | << "buckets size/cap: " << m_buckets.size() 401 | << ", size: " << m_sizing.size() 402 | << "\n"; 403 | */ 404 | 405 | // TODO: Could reuse the existing table if only m_key_width changes 406 | // TODO: The iterators is inefficient since it does redundant 407 | // memory lookups and address calculations. 408 | 409 | if (needs_to_realloc(new_size, new_key_width)) { 410 | size_t new_capacity = grown_capacity(new_size); 411 | auto config = this->current_config(); 412 | auto new_table = hashset_t( 413 | new_capacity, new_key_width, config); 414 | 415 | /* 416 | std::cout 417 | << "grow to cap " << new_table.table_size() 418 | << ", key_width: " << new_table.key_width() 419 | << ", val_width: " << new_table.value_width() 420 | << ", real_width: " << new_table.real_width() 421 | << ", quot width: " << new_table.quotient_width() 422 | << "\n"; 423 | */ 424 | 425 | onr.on_resize(new_capacity); 426 | 427 | move_into(new_table, onr); 428 | 429 | *this = std::move(new_table); 430 | } 431 | 432 | DCHECK(!needs_to_realloc(new_size, new_key_width)); 433 | } 434 | }; 435 | 436 | }} 437 | 438 | template 439 | struct heap_size> { 440 | using T = compact_hash::set::hashset_t; 441 | using storage_t = typename T::storage_t; 442 | 443 | static object_size_t compute(T const& val) { 444 | using namespace compact_hash::set; 445 | using namespace compact_hash; 446 | 447 | auto bytes = object_size_t::empty(); 448 | 449 | bytes += heap_size::compute(val.m_sizing); 450 | bytes += heap_size::compute(val.m_key_width); 451 | bytes += heap_size::compute(val.m_hash); 452 | bytes += heap_size::compute( 453 | val.m_storage, val.table_size(), val.storage_widths()); 454 | bytes += heap_size::compute( 455 | val.m_placement, val.table_size()); 456 | 457 | return bytes; 458 | } 459 | }; 460 | 461 | template 462 | struct serialize> { 463 | using T = compact_hash::set::hashset_t; 464 | using storage_t = typename T::storage_t; 465 | 466 | static object_size_t write(std::ostream& out, T const& val) { 467 | using namespace compact_hash::set; 468 | using namespace compact_hash; 469 | 470 | auto bytes = object_size_t::empty(); 471 | 472 | bytes += serialize::write(out, val.m_sizing); 473 | bytes += serialize::write(out, val.m_key_width); 474 | bytes += serialize::write(out, val.m_hash); 475 | bytes += serialize::write( 476 | out, val.m_storage, val.table_size(), val.storage_widths()); 477 | bytes += serialize::write( 478 | out, val.m_placement, val.table_size()); 479 | 480 | return bytes; 481 | } 482 | static T read(std::istream& in) { 483 | using namespace compact_hash::set; 484 | using namespace compact_hash; 485 | 486 | T 
ret; 487 | 488 | auto sizing = serialize::read(in); 489 | auto key_width = serialize::read(in); 490 | auto hash = serialize::read(in); 491 | ret.m_sizing = std::move(sizing); 492 | ret.m_key_width = std::move(key_width); 493 | ret.m_hash = std::move(hash); 494 | 495 | auto storage = serialize::read(in, ret.table_size(), ret.storage_widths()); 496 | auto placement = serialize::read(in, ret.table_size()); 497 | 498 | ret.m_storage = std::move(storage); 499 | ret.m_placement = std::move(placement); 500 | 501 | return ret; 502 | } 503 | static bool equal_check(T const& lhs, T const& rhs) { 504 | if (!(gen_equal_check(table_size()) && gen_equal_check(storage_widths()))) { 505 | return false; 506 | } 507 | 508 | auto table_size = lhs.table_size(); 509 | auto storage_widths = lhs.storage_widths(); 510 | 511 | bool deep_eq = gen_equal_check(m_sizing) 512 | && gen_equal_check(m_key_width) 513 | && gen_equal_check(m_hash) 514 | && gen_equal_check(m_storage, table_size, storage_widths) 515 | && gen_equal_check(m_placement, table_size); 516 | 517 | return deep_eq; 518 | } 519 | }; 520 | 521 | } 522 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/set/no_satellite_data_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "quot_ptr_t.hpp" 5 | #include "quot_bucket_layout_t.hpp" 6 | 7 | namespace tdc {namespace compact_hash {namespace set { 8 | 9 | struct no_satellite_data_t { 10 | using entry_ptr_t = quot_ptr_t; 11 | using entry_bit_width_t = uint8_t; 12 | using bucket_data_layout_t = quot_bucket_layout_t; 13 | using sentinel_value_type = void; 14 | }; 15 | 16 | }}} 17 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/set/quot_bucket_layout_t.hpp: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include "quot_ptr_t.hpp" 12 | 13 | namespace tdc {namespace compact_hash {namespace set { 14 | 15 | struct quot_bucket_layout_t { 16 | /// Calculates the offsets of the two different arrays inside the allocation. 17 | struct Layout { 18 | cbp::cbp_layout_element_t quots_layout; 19 | size_t overall_qword_size; 20 | 21 | inline Layout(): quots_layout(), overall_qword_size(0) { 22 | } 23 | }; 24 | inline static Layout calc_sizes(size_t size, uint8_t quot_width) { 25 | DCHECK_NE(size, 0U); 26 | 27 | auto layout = cbp::bit_layout_t(); 28 | 29 | // The quotients 30 | auto quots = layout.cbp_elements(size, quot_width); 31 | 32 | Layout r; 33 | r.quots_layout = quots; 34 | r.overall_qword_size = layout.get_size_in_uint64_t_units(); 35 | return r; 36 | } 37 | 38 | /// Creates the pointers to the beginnings of the two arrays inside 39 | /// the allocation. 40 | inline static quot_ptr_t ptr(uint64_t* alloc, size_t size, uint8_t quot_width) { 41 | DCHECK_NE(size, 0U); 42 | auto layout = calc_sizes(size, quot_width); 43 | 44 | return layout.quots_layout.ptr_relative_to(alloc); 45 | } 46 | 47 | // Run destructors of each element in the bucket. 48 | inline static void destroy_vals(uint64_t*, size_t, uint8_t) { 49 | // NB: this does not contain values 50 | } 51 | 52 | /// Returns a `val_quot_ptr_t` to position `pos`, 53 | /// or a sentinel value that acts as a one-pass-the-end pointer for the empty case. 
54 | inline static quot_ptr_t at(uint64_t* alloc, size_t size, size_t pos, uint8_t quot_width) { 55 | if(size != 0) { 56 | auto ps = ptr(alloc, size, quot_width); 57 | return quot_ptr_t(ps.quot_ptr() + pos); 58 | } else { 59 | DCHECK_EQ(pos, 0U); 60 | return quot_ptr_t(); 61 | } 62 | } 63 | }; 64 | 65 | }}} 66 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/set/quot_ptr_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace tdc {namespace compact_hash {namespace set { 12 | 13 | /// Represents a pair of pointers to value and quotient inside a bucket. 14 | class quot_ptr_t { 15 | mutable QuotPtr m_quot_ptr; 16 | 17 | public: 18 | struct my_value_type { 19 | uint64_t quot; 20 | }; 21 | 22 | inline quot_ptr_t(QuotPtr quot_ptr): 23 | m_quot_ptr(quot_ptr) 24 | { 25 | } 26 | 27 | inline quot_ptr_t(): 28 | m_quot_ptr() {} 29 | 30 | inline uint64_t get_quotient() const { 31 | return uint64_t(*m_quot_ptr); 32 | } 33 | 34 | inline void set_quotient(uint64_t v) const { 35 | *m_quot_ptr = v; 36 | } 37 | 38 | inline void swap_quotient(uint64_t& other) const { 39 | uint64_t tmp = uint64_t(*m_quot_ptr); 40 | std::swap(other, tmp); 41 | *m_quot_ptr = tmp; 42 | } 43 | 44 | inline QuotPtr quot_ptr() const { 45 | return m_quot_ptr; 46 | } 47 | 48 | inline void increment_ptr() { 49 | m_quot_ptr++; 50 | } 51 | inline void decrement_ptr() { 52 | m_quot_ptr--; 53 | } 54 | 55 | inline friend bool operator==(quot_ptr_t const& lhs, 56 | quot_ptr_t const& rhs) 57 | { 58 | return lhs.m_quot_ptr == rhs.m_quot_ptr; 59 | } 60 | 61 | inline friend bool operator!=(quot_ptr_t const& lhs, 62 | quot_ptr_t const& rhs) 63 | { 64 | return lhs.m_quot_ptr != rhs.m_quot_ptr; 65 | } 66 | 67 | inline void set(uint64_t quot) { 68 | set_quotient(quot); 69 | } 70 | 71 | inline void set_no_drop(uint64_t quot) { 72 | set_quotient(quot); 73 | } 74 | 75 | inline void move_from(quot_ptr_t other) { 76 | set_quotient(other.get_quotient()); 77 | } 78 | 79 | inline void init_from(quot_ptr_t other) { 80 | set_quotient(other.get_quotient()); 81 | } 82 | 83 | inline void swap_with(quot_ptr_t other) { 84 | uint64_t tmp_quot = get_quotient(); 85 | move_from(other); 86 | other.set(tmp_quot); 87 | } 88 | 89 | inline void uninitialize() { 90 | } 91 | 92 | inline bool contents_eq(quot_ptr_t rhs) const { 93 | return get_quotient() == rhs.get_quotient(); 94 | } 95 | 96 | inline my_value_type move_out() const { 97 | return my_value_type { 98 | get_quotient(), 99 | }; 100 | } 101 | 102 | inline void set(my_value_type&& val) { 103 | set_quotient(val.quot); 104 | } 105 | }; 106 | 107 | }}} 108 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/set/typedefs.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace tdc {namespace compact_hash {namespace set { 12 | 13 | template 14 | using sparse_cv_hashset_t 15 | = hashset_t; 16 | 17 | template 18 | using sparse_layered_hashset_t 19 | = hashset_t>>; 20 | 21 | template 22 | using sparse_elias_hashset_t 23 | = hashset_t>>; 25 | 26 | }}} 27 | -------------------------------------------------------------------------------- 
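A minimal usage sketch for the set typedefs above (see also examples/hashset.cpp
in this repository). The hash function name poplar_xorshift_t and the exact
template parameter list of sparse_cv_hashset_t are assumptions here, since the
template argument lists were lost in this dump; treat it as an illustration of
the hashset_t interface rather than verbatim library code.

#include <iostream>
#include <sstream>

#include <tudocomp/util/compact_hash/set/typedefs.hpp>
#include <tudocomp/util/serialization.hpp>

int main() {
    using namespace tdc::compact_hash;

    // initial table size 0 (grows on demand), keys up to 16 bits wide
    auto set = set::sparse_cv_hashset_t<poplar_xorshift_t>(0, 16);

    auto a = set.lookup_insert(4711);              // first time: newly inserted
    auto b = set.lookup_insert(4711);              // second time: already present
    std::cout << a.key_already_exist() << "\n";    // 0
    std::cout << b.key_already_exist() << "\n";    // 1
    std::cout << set.count(4711) << "\n";          // 1

    // round-trip through the serialization facility
    std::stringstream ss;
    tdc::serialize<decltype(set)>::write(ss, set);
    auto restored = tdc::serialize<decltype(set)>::read(ss);
    std::cout << restored.count(4711) << "\n";     // 1
}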
/include/tudocomp/util/compact_hash/size_manager_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "decomposed_key_t.hpp" 6 | 7 | #include 8 | #include 9 | 10 | namespace tdc {namespace compact_hash { 11 | 12 | /// This manages the size of the hashtable, and related calculations. 13 | class size_manager_t { 14 | /* 15 | * TODO: This is currently hardcoded to work with power-of-two table sizes. 16 | * Generalize it to allows arbitrary growth functions. 17 | */ 18 | 19 | uint8_t m_capacity_log2; 20 | size_t m_size; 21 | float m_load_factor = 0.5; 22 | 23 | template 24 | friend struct ::tdc::serialize; 25 | 26 | template 27 | friend struct ::tdc::heap_size; 28 | 29 | /// Adjust the user-specified size of the table as needed 30 | /// by the current implementation. 31 | /// 32 | /// In this case, the grow function multiplies the capacity by two, 33 | /// so we need to start at a value != 0. 34 | inline static size_t adjust_size(size_t size) { 35 | return (size < 2) ? 2 : size; 36 | } 37 | 38 | size_manager_t() = default; 39 | 40 | public: 41 | /// runtime initilization arguments, if any 42 | struct config_args { 43 | config_args() {} 44 | config_args(float load_factor): load_factor(load_factor) {} 45 | 46 | float load_factor = 0.5; 47 | }; 48 | 49 | /// get the config of this instance 50 | inline config_args current_config() const { 51 | return config_args { 52 | m_load_factor, 53 | }; 54 | } 55 | 56 | /// Create the size manager with an initial table size `capacity` 57 | inline size_manager_t(size_t capacity, config_args config = config_args{}) { 58 | capacity = adjust_size(capacity); 59 | 60 | m_size = 0; 61 | m_load_factor = config.load_factor; 62 | CHECK(is_pot(capacity)); 63 | m_capacity_log2 = log2_upper(capacity); 64 | } 65 | 66 | /// Returns the amount of elements currently stored in the hashtable. 67 | inline size_t size() const { 68 | return m_size; 69 | } 70 | 71 | /// Update the amount of elements currently stored in the hashtable 72 | inline void set_size(size_t new_size) { 73 | DCHECK_LT(new_size, capacity()); 74 | m_size = new_size; 75 | } 76 | 77 | /// The amount of bits used by the current table size. 78 | // TODO: Remove/make private 79 | inline uint8_t capacity_log2() const { 80 | return m_capacity_log2; 81 | } 82 | 83 | /// The current table size. 84 | inline size_t capacity() const { 85 | return 1ull << m_capacity_log2; 86 | } 87 | 88 | /// Check if the capacity needs to grow for the size given as the 89 | /// argument. 90 | inline bool needs_to_grow_capacity(size_t capacity, size_t new_size) const { 91 | // Capacity, at which a re-allocation is needed 92 | size_t trigger_capacity = size_t(float(capacity) * m_load_factor); 93 | 94 | // Make sure we have always a minimum of 1 free space in the table. 95 | trigger_capacity = std::min(capacity - 1, trigger_capacity); 96 | 97 | bool ret = trigger_capacity < new_size; 98 | return ret; 99 | } 100 | 101 | /// Returns the new capacity after growth. 102 | inline size_t grown_capacity(size_t capacity) const { 103 | DCHECK_GE(capacity, 1U); 104 | return capacity * 2; 105 | } 106 | 107 | /// Decompose the hash value such that `initial_address` 108 | /// covers the entire table, and `quotient` contains 109 | /// the remaining bits. 
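    ///
    /// Worked example: with capacity 16 (capacity_log2() == 4) a hashed value
    /// hres = 0b10110110 (182) decomposes into
    ///     initial_address = hres & 0b1111 == 0b0110 (6)
    ///     quotient        = hres >> 4     == 0b1011 (11)
    /// and compose_hashed_value(6, 11) == (11 << 4) | 6 reproduces 182.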
110 | inline decomposed_key_t decompose_hashed_value(uint64_t hres) { 111 | uint64_t shift = capacity_log2(); 112 | 113 | return decomposed_key_t { 114 | hres & ((1ull << shift) - 1ull), 115 | hres >> shift, 116 | }; 117 | } 118 | 119 | /// Composes a hash value from an `initial_address` and `quotient`. 120 | inline uint64_t compose_hashed_value(uint64_t initial_address, uint64_t quotient) { 121 | uint64_t shift = capacity_log2(); 122 | uint64_t harg = (quotient << shift) | initial_address; 123 | return harg; 124 | } 125 | 126 | /// Adds the `add` value to `v`, and wraps it around the current capacity. 127 | template 128 | inline int_t mod_add(int_t v, int_t add = 1) const { 129 | size_t mask = capacity() - 1; 130 | return (v + add) & mask; 131 | } 132 | 133 | /// Subtracts the `sub` value to `v`, and wraps it around the current capacity. 134 | template 135 | inline int_t mod_sub(int_t v, int_t sub = 1) const { 136 | size_t mask = capacity() - 1; 137 | return (v - sub) & mask; 138 | } 139 | 140 | /// Sets the maximum load factor 141 | /// (how full the table can get before re-allocating). 142 | /// 143 | /// Expects a value `0.0 < z < 1.0`. 144 | inline void max_load_factor(float z) { 145 | DCHECK_GT(z, 0.0); 146 | DCHECK_LE(z, 1.0); 147 | m_load_factor = z; 148 | } 149 | 150 | /// Returns the maximum load factor. 151 | inline float max_load_factor() const noexcept { 152 | return m_load_factor; 153 | } 154 | }; 155 | 156 | } 157 | 158 | template<> 159 | struct heap_size { 160 | using T = compact_hash::size_manager_t; 161 | 162 | static object_size_t compute(T const& val) { 163 | using namespace compact_hash; 164 | 165 | auto bytes = object_size_t::empty(); 166 | 167 | bytes += heap_size::compute(val.m_capacity_log2); 168 | bytes += heap_size::compute(val.m_size); 169 | bytes += heap_size::compute(val.m_load_factor); 170 | 171 | return bytes; 172 | } 173 | }; 174 | 175 | template<> 176 | struct serialize { 177 | using T = compact_hash::size_manager_t; 178 | 179 | static object_size_t write(std::ostream& out, T const& val) { 180 | using namespace compact_hash; 181 | 182 | auto bytes = object_size_t::empty(); 183 | 184 | bytes += serialize::write(out, val.m_capacity_log2); 185 | bytes += serialize::write(out, val.m_size); 186 | bytes += serialize::write(out, val.m_load_factor); 187 | 188 | return bytes; 189 | } 190 | static T read(std::istream& in) { 191 | using namespace compact_hash; 192 | 193 | T ret; 194 | ret.m_capacity_log2 = serialize::read(in); 195 | ret.m_size = serialize::read(in); 196 | ret.m_load_factor = serialize::read(in); 197 | return ret; 198 | } 199 | static bool equal_check(T const& lhs, T const& rhs) { 200 | return gen_equal_check(m_capacity_log2) 201 | && gen_equal_check(m_size) 202 | && gen_equal_check(m_load_factor); 203 | } 204 | }; 205 | 206 | } 207 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/storage/bucket_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace tdc {namespace compact_hash { 13 | using namespace compact_hash; 14 | 15 | /// A bucket of quotient-value pairs in a sparse compact hashtable. 16 | /// 17 | /// It consists of a pointer to a single heap allocation, that contains: 18 | /// - A 64-bit bitvector of currently stored elements. 19 | /// - A dynamic-width array of quotients. 
20 | /// - A potentially dynamic-width array of satellite values. 21 | /// 22 | /// An empty bucket does not allocate any memory. 23 | /// 24 | /// WARNING: 25 | /// To prevent the overhead of unnecessary default-constructions, 26 | /// the bucket does not initialize or destroy the value and quotient parts 27 | /// of the allocation in its constructor/destructor. 28 | /// Instead, it relies on the surrounding container to initialize and destroy 29 | /// the values correctly. 30 | // TODO: Investigate changing this semantic to automatic initialization 31 | // and destruction. 32 | template 33 | class bucket_t { 34 | std::unique_ptr m_data; 35 | 36 | template 37 | friend struct ::tdc::serialize; 38 | 39 | template 40 | friend struct ::tdc::heap_size; 41 | 42 | using entry_ptr_t = typename satellite_t::entry_ptr_t; 43 | using entry_bit_width_t = typename satellite_t::entry_bit_width_t; 44 | public: 45 | /// Maps hashtable position to position of the corresponding bucket, 46 | /// and the position inside of it. 47 | struct bucket_layout_t: satellite_t::bucket_data_layout_t { 48 | static constexpr size_t BVS_WIDTH_SHIFT = 6; 49 | static constexpr size_t BVS_WIDTH_MASK = 0b111111; 50 | 51 | static inline size_t table_pos_to_idx_of_bucket(size_t pos) { 52 | return pos >> BVS_WIDTH_SHIFT; 53 | } 54 | 55 | static inline size_t table_pos_to_idx_inside_bucket(size_t pos) { 56 | return pos & BVS_WIDTH_MASK; 57 | } 58 | 59 | static inline size_t table_size_to_bucket_size(size_t size) { 60 | return (size + BVS_WIDTH_MASK) >> BVS_WIDTH_SHIFT; 61 | } 62 | }; 63 | 64 | inline bucket_t(): m_data() {} 65 | 66 | /// Construct a bucket, reserving space according to the bitvector 67 | /// `bv` and `quot_width`. 68 | inline bucket_t(uint64_t bv, entry_bit_width_t width) { 69 | if (bv != 0) { 70 | auto qvd_size = qvd_data_size(size(bv), width); 71 | 72 | m_data = std::make_unique(qvd_size + 1); 73 | m_data[0] = bv; 74 | 75 | // NB: We call this for its alignment asserts 76 | ptr(width); 77 | } else { 78 | m_data.reset(); 79 | } 80 | } 81 | 82 | inline bucket_t(bucket_t&& other) = default; 83 | inline bucket_t& operator=(bucket_t&& other) = default; 84 | 85 | /// Returns the bitvector of contained elements. 86 | inline uint64_t bv() const { 87 | if (!is_empty()) { 88 | return m_data[0]; 89 | } else { 90 | return 0; 91 | } 92 | } 93 | 94 | /// Returns the amount of elements in the bucket. 95 | inline size_t size() const { 96 | return size(bv()); 97 | } 98 | 99 | // Run destructors of each element in the bucket. 100 | inline void destroy_vals(entry_bit_width_t widths) { 101 | if (is_allocated()) { 102 | bucket_layout_t::destroy_vals(get_qv(), size(), widths); 103 | } 104 | } 105 | 106 | /// Returns a `entry_ptr_t` to position `pos`, 107 | /// or a sentinel value that acts as a one-pass-the-end pointer. 
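/// Usage sketch (hedged; `b` and `width` are placeholder names for a non-empty
/// bucket and the current entry bit widths): positions `0 .. b.size()-1`
/// address the stored entries, and `b.at(b.size(), width)` acts as the end
/// sentinel, so a linear scan can be written as
///
///     auto it  = b.at(0, width);
///     auto end = b.at(b.size(), width);
///     while (it != end) {
///         // ... read or initialize the entry through `it` ...
///         it.increment_ptr();
///     }
///
/// This mirrors the copy loops used by `insert_at()` below.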
108 | inline entry_ptr_t at(size_t pos, entry_bit_width_t width) const { 109 | return bucket_layout_t::at(get_qv(), size(), pos, width); 110 | } 111 | 112 | inline bool is_allocated() const { 113 | return bool(m_data); 114 | } 115 | 116 | inline bool is_empty() const { 117 | return !bool(m_data); 118 | } 119 | 120 | inline size_t stat_allocation_size_in_bytes(entry_bit_width_t width) const { 121 | if (!is_empty()) { 122 | return (qvd_data_size(size(), width) + 1) * sizeof(uint64_t); 123 | } else { 124 | return 0; 125 | } 126 | } 127 | 128 | /// Insert a new element into the bucket, growing it as needed 129 | inline entry_ptr_t insert_at( 130 | size_t new_elem_bucket_pos, 131 | uint64_t new_elem_bv_bit, 132 | entry_bit_width_t width) 133 | { 134 | // Just a sanity check that can not live inside or outside `bucket_t` itself. 135 | static_assert(sizeof(bucket_t) == sizeof(void*), "unique_ptr is more than 1 ptr large!"); 136 | 137 | // TODO: check out different sizing strategies 138 | // eg, the known sparse_hash repo uses overallocation for small buckets 139 | 140 | // create a new bucket with enough size for the new element 141 | // NB: The elements in it are uninitialized 142 | auto new_bucket = bucket_t(bv() | new_elem_bv_bit, width); 143 | 144 | auto new_iter = new_bucket.at(0, width); 145 | auto old_iter = at(0, width); 146 | 147 | auto const new_iter_midpoint = new_bucket.at(new_elem_bucket_pos, width); 148 | auto const new_iter_end = new_bucket.at(new_bucket.size(), width); 149 | 150 | entry_ptr_t ret; 151 | 152 | // move all elements before the new element's location from old bucket into new bucket 153 | while(new_iter != new_iter_midpoint) { 154 | new_iter.init_from(old_iter); 155 | new_iter.increment_ptr(); 156 | old_iter.increment_ptr(); 157 | } 158 | 159 | // move new element into its location in the new bucket 160 | { 161 | ret = new_iter; 162 | new_iter.increment_ptr(); 163 | } 164 | 165 | // move all elements after the new element's location from old bucket into new bucket 166 | while(new_iter != new_iter_end) { 167 | new_iter.init_from(old_iter); 168 | new_iter.increment_ptr(); 169 | old_iter.increment_ptr(); 170 | } 171 | 172 | // destroy old empty elements, and overwrite with new bucket 173 | destroy_vals(width); 174 | *this = std::move(new_bucket); 175 | 176 | return ret; 177 | } 178 | private: 179 | inline static size_t size(uint64_t bv) { 180 | return popcount(bv); 181 | } 182 | 183 | inline uint64_t* get_qv() const { 184 | return static_cast(m_data.get()) + 1; 185 | } 186 | 187 | inline static size_t qvd_data_size(size_t size, entry_bit_width_t width) { 188 | return bucket_layout_t::calc_sizes(size, width).overall_qword_size; 189 | } 190 | 191 | /// Creates the pointers to the beginnings of the two arrays inside 192 | /// the allocation. 
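/// Rough picture of the single allocation behind `m_data` (a sketch; the exact
/// bit packing is delegated to `bucket_layout_t::calc_sizes`):
///
///     m_data[0]       64-bit occupancy bitvector, returned by `bv()`
///     m_data[1 ..]    bit-packed quotients followed by the (possibly
///                     dynamic-width) satellite values, occupying
///                     `qvd_data_size(size, width)` qwords in total
///
/// `get_qv()` therefore returns `m_data.get() + 1`, and `ptr(width)` carves
/// that region into the two typed pointers.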
193 | inline entry_ptr_t ptr(entry_bit_width_t width) const { 194 | return bucket_layout_t::ptr(get_qv(), size(), width); 195 | } 196 | }; 197 | 198 | } 199 | 200 | template 201 | struct heap_size> { 202 | using T = compact_hash::bucket_t; 203 | using entry_bit_width_t = typename T::entry_bit_width_t; 204 | 205 | static object_size_t compute(T const& val, entry_bit_width_t const& widths) { 206 | using namespace compact_hash; 207 | 208 | auto bytes = object_size_t::empty(); 209 | 210 | size_t size = val.size(); 211 | 212 | if (size > 0) { 213 | size_t raw_size = T::qvd_data_size(size, widths) + 1; 214 | bytes += heap_size>::compute(val.m_data, raw_size); 215 | } 216 | 217 | return bytes; 218 | } 219 | }; 220 | 221 | template 222 | struct serialize> { 223 | using T = compact_hash::bucket_t; 224 | using entry_bit_width_t = typename T::entry_bit_width_t; 225 | 226 | static object_size_t write(std::ostream& out, T const& val, entry_bit_width_t const& widths) { 227 | using namespace compact_hash; 228 | 229 | auto bytes = object_size_t::empty(); 230 | 231 | bytes += serialize::write(out, val.bv()); 232 | size_t size = val.size(); 233 | 234 | if (size > 0) { 235 | size_t raw_size = T::qvd_data_size(size, widths) + 1; 236 | for (size_t i = 1; i < raw_size; i++) { 237 | bytes += serialize::write(out, val.m_data[i]); 238 | } 239 | } 240 | 241 | return bytes; 242 | } 243 | static T read(std::istream& in, entry_bit_width_t const& widths) { 244 | using namespace compact_hash; 245 | 246 | T ret; 247 | 248 | uint64_t bv = serialize::read(in); 249 | size_t size = T::size(bv); 250 | 251 | if (size > 0) { 252 | size_t raw_size = T::qvd_data_size(size, widths) + 1; 253 | ret.m_data = std::make_unique(raw_size); 254 | ret.m_data[0] = bv; 255 | for (size_t i = 1; i < raw_size; i++) { 256 | ret.m_data[i] = serialize::read(in); 257 | } 258 | } 259 | 260 | return ret; 261 | } 262 | }; 263 | 264 | } 265 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/storage/buckets_bv_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | // Table for uninitalized elements 12 | 13 | namespace tdc {namespace compact_hash { 14 | template 15 | struct buckets_bv_t { 16 | using satellite_t_export = satellite_t; 17 | using entry_ptr_t = typename satellite_t::entry_ptr_t; 18 | using entry_bit_width_t = typename satellite_t::entry_bit_width_t; 19 | 20 | using my_bucket_t = bucket_t<8, satellite_t>; 21 | using bucket_layout_t = typename my_bucket_t::bucket_layout_t; 22 | using buckets_t = std::unique_ptr; 23 | using qvd_t = typename satellite_t::bucket_data_layout_t; 24 | 25 | buckets_t m_buckets; 26 | 27 | template 28 | friend struct ::tdc::serialize; 29 | 30 | /// runtime initilization arguments, if any 31 | struct config_args {}; 32 | 33 | /// get the config of this instance 34 | inline config_args current_config() const { return config_args{}; } 35 | 36 | inline buckets_bv_t() {} 37 | inline buckets_bv_t(size_t table_size, 38 | entry_bit_width_t widths, 39 | config_args config) { 40 | size_t buckets_size = bucket_layout_t::table_size_to_bucket_size(table_size); 41 | 42 | m_buckets = std::make_unique(buckets_size); 43 | } 44 | using table_pos_t = sparse_pos_t; 45 | 46 | // pseudo-iterator for iterating over bucket elements 47 | // NB: does not wrap around! 
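// Behavioural sketch (hedged; `ctx`, `a` and `b` are hypothetical names):
// `get()` exposes the current end position as an `entry_ptr_t`, and
// `decrement()` steps it one element backwards. When the start of the current
// bucket is reached, `decrement()` skips to the previous bucket whose
// occupancy bitvector is non-zero and continues from its last element:
//
//     auto it  = ctx.make_iter(ctx.table_pos(b));   // end of a probed range
//     auto fin = ctx.make_iter(ctx.table_pos(a));   // start of that range
//     while (it != fin) {
//         it.decrement();
//         entry_ptr_t e = it.get();                 // element just reached
//     }
//
// As noted above, the iterator does not wrap around the end of the table.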
48 | struct iter_t { 49 | my_bucket_t const* m_bucket; 50 | entry_ptr_t m_b_start; 51 | entry_ptr_t m_b_end; 52 | entry_bit_width_t m_widths; 53 | 54 | inline void set_bucket_elem_range(size_t end_offset) { 55 | size_t start_offset = 0; 56 | DCHECK_LE(start_offset, end_offset); 57 | 58 | m_b_start = m_bucket->at(start_offset, m_widths); 59 | m_b_end = m_bucket->at(end_offset, m_widths); 60 | } 61 | 62 | inline iter_t(my_bucket_t const* buckets, 63 | size_t buckets_size, 64 | table_pos_t const& pos, 65 | entry_bit_width_t const& widths): 66 | m_widths(widths) 67 | { 68 | // NB: Using pointer arithmetic here, because 69 | // we can (intentionally) end up with the address 1-past 70 | // the end of the vector, which represents an end-iterator. 71 | m_bucket = buckets + pos.idx_of_bucket; 72 | 73 | if(pos.idx_of_bucket < buckets_size) { 74 | set_bucket_elem_range(pos.offset_in_bucket()); 75 | } else { 76 | // use default constructed nullptr entry_ptr_t 77 | } 78 | } 79 | 80 | inline entry_ptr_t get() { 81 | return m_b_end; 82 | } 83 | 84 | inline void decrement() { 85 | if (m_b_start != m_b_end) { 86 | m_b_end.decrement_ptr(); 87 | } else { 88 | do { 89 | --m_bucket; 90 | } while(m_bucket->bv() == 0); 91 | set_bucket_elem_range(m_bucket->size() - 1); 92 | } 93 | } 94 | 95 | inline bool operator!=(iter_t& other) { 96 | return m_b_end != other.m_b_end; 97 | } 98 | }; 99 | 100 | template 101 | struct context_t { 102 | buckets_t& m_buckets; 103 | size_t const table_size; 104 | entry_bit_width_t widths; 105 | 106 | /// Run the destructors of the elements of the `i`-th bucket, 107 | /// and drop it from the hashtable, replacing it with an empty one. 108 | inline void drop_bucket(size_t i) { 109 | DCHECK_LT(i, bucket_layout_t::table_size_to_bucket_size(table_size)); 110 | m_buckets[i].destroy_vals(widths); 111 | m_buckets[i] = my_bucket_t(); 112 | } 113 | 114 | inline void destroy_vals() { 115 | if(m_buckets == nullptr) return; // stop when this is an instance after std::move 116 | 117 | size_t buckets_size = bucket_layout_t::table_size_to_bucket_size(table_size); 118 | 119 | for(size_t i = 0; i < buckets_size; i++) { 120 | m_buckets[i].destroy_vals(widths); 121 | } 122 | } 123 | inline table_pos_t table_pos(size_t pos) { 124 | return table_pos_t { pos, m_buckets.get() }; 125 | } 126 | inline entry_ptr_t allocate_pos(table_pos_t pos) { 127 | DCHECK(!pos.exists_in_bucket()); 128 | 129 | auto& bucket = pos.bucket(); 130 | auto offset_in_bucket = pos.offset_in_bucket(); 131 | uint64_t new_bucket_bv = bucket.bv() | pos.bit_mask_in_bucket; 132 | 133 | return bucket.insert_at(offset_in_bucket, new_bucket_bv, widths); 134 | } 135 | inline entry_ptr_t at(table_pos_t pos) { 136 | DCHECK(pos.exists_in_bucket()); 137 | 138 | auto& bucket = pos.bucket(); 139 | auto offset_in_bucket = pos.offset_in_bucket(); 140 | 141 | return bucket.at(offset_in_bucket, widths); 142 | } 143 | inline bool pos_is_empty(table_pos_t pos) { 144 | return !pos.exists_in_bucket(); 145 | } 146 | inline iter_t make_iter(table_pos_t const& pos) { 147 | size_t buckets_size = bucket_layout_t::table_size_to_bucket_size(table_size); 148 | return iter_t(m_buckets.get(), buckets_size, pos, widths); 149 | } 150 | inline void trim_storage(table_pos_t* last_start, table_pos_t const& end) { 151 | // Check if end lies on a bucket boundary, then drop all buckets before it 152 | 153 | if (end.offset_in_bucket() == 0) { 154 | 155 | // ignore buckets if we start in the middle of one 156 | if ((*last_start).offset_in_bucket() != 0) { 157 | // TODO: Just 
iterate forward to the first valid one 158 | *last_start = end; 159 | } 160 | 161 | auto bstart = (*last_start).idx_of_bucket; 162 | auto bend = end.idx_of_bucket; 163 | size_t buckets_size = bucket_layout_t::table_size_to_bucket_size(table_size); 164 | 165 | for (size_t i = bstart; i != bend; i = (i + 1) % buckets_size) { 166 | drop_bucket(i); 167 | } 168 | 169 | *last_start = end; 170 | } 171 | } 172 | }; 173 | inline auto context(size_t table_size, entry_bit_width_t const& widths) { 174 | // DCHECK(m_buckets); // this needs to be commented out for swapping two CHTs with std::move. 175 | return context_t { 176 | m_buckets, table_size, widths 177 | }; 178 | } 179 | inline auto context(size_t table_size, entry_bit_width_t const& widths) const { 180 | DCHECK(m_buckets); 181 | return context_t { 182 | m_buckets, table_size, widths 183 | }; 184 | } 185 | }; 186 | } 187 | 188 | template 189 | struct heap_size> { 190 | using T = compact_hash::buckets_bv_t; 191 | using bucket_t = typename T::my_bucket_t; 192 | using entry_bit_width_t = typename T::entry_bit_width_t; 193 | using bucket_layout_t = typename T::bucket_layout_t; 194 | 195 | static object_size_t compute(T const& val, size_t table_size, entry_bit_width_t const& widths) { 196 | using namespace compact_hash; 197 | 198 | auto bytes = object_size_t::empty(); 199 | bytes += object_size_t::exact(sizeof(decltype(val.m_buckets))); 200 | 201 | auto ctx = val.context(table_size, widths); 202 | 203 | size_t buckets_size = bucket_layout_t::table_size_to_bucket_size(table_size); 204 | for(size_t i = 0; i < buckets_size; i++) { 205 | auto& bucket = ctx.m_buckets[i]; 206 | bytes += heap_size::compute(bucket, widths); 207 | } 208 | 209 | return bytes; 210 | } 211 | }; 212 | 213 | template 214 | struct serialize> { 215 | using T = compact_hash::buckets_bv_t; 216 | using bucket_t = typename T::my_bucket_t; 217 | using entry_bit_width_t = typename T::entry_bit_width_t; 218 | using bucket_layout_t = typename T::bucket_layout_t; 219 | 220 | static object_size_t write(std::ostream& out, T const& val, size_t table_size, entry_bit_width_t const& widths) { 221 | using namespace compact_hash; 222 | 223 | auto bytes = object_size_t::empty(); 224 | 225 | auto ctx = val.context(table_size, widths); 226 | 227 | size_t buckets_size = bucket_layout_t::table_size_to_bucket_size(table_size); 228 | for(size_t i = 0; i < buckets_size; i++) { 229 | auto& bucket = ctx.m_buckets[i]; 230 | bytes += serialize::write(out, bucket, widths); 231 | } 232 | 233 | return bytes; 234 | } 235 | static T read(std::istream& in, size_t table_size, entry_bit_width_t const& widths) { 236 | using namespace compact_hash; 237 | 238 | T val { table_size, widths, {} }; 239 | 240 | auto ctx = val.context(table_size, widths); 241 | 242 | size_t buckets_size = bucket_layout_t::table_size_to_bucket_size(table_size); 243 | for(size_t i = 0; i < buckets_size; i++) { 244 | auto& bucket = ctx.m_buckets[i]; 245 | bucket = serialize::read(in, widths); 246 | } 247 | 248 | return val; 249 | } 250 | static bool equal_check(T const& lhs, T const& rhs, size_t table_size, entry_bit_width_t const& widths) { 251 | auto lhsc = lhs.context(table_size, widths); 252 | auto rhsc = rhs.context(table_size, widths); 253 | 254 | for (size_t i = 0; i < table_size; i++) { 255 | auto lhspos = lhsc.table_pos(i); 256 | auto rhspos = rhsc.table_pos(i); 257 | if (!gen_equal_diagnostic(lhsc.pos_is_empty(lhspos) == rhsc.pos_is_empty(rhspos))) { 258 | return false; 259 | } 260 | if (!lhsc.pos_is_empty(lhspos)) { 261 | auto lhsptrs 
= lhsc.at(lhspos); 262 | auto rhsptrs = rhsc.at(rhspos); 263 | 264 | if (!gen_equal_diagnostic(lhsptrs.contents_eq(rhsptrs))) { 265 | return false; 266 | } 267 | } 268 | } 269 | 270 | return true; 271 | } 272 | }; 273 | 274 | } 275 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/storage/plain_sentinel_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | // Table for uninitalized elements 11 | 12 | namespace tdc {namespace compact_hash { 13 | template 14 | struct plain_sentinel_t { 15 | using satellite_t_export = satellite_t; 16 | using entry_ptr_t = typename satellite_t::entry_ptr_t; 17 | using entry_bit_width_t = typename satellite_t::entry_bit_width_t; 18 | using qvd_t = typename satellite_t::bucket_data_layout_t; 19 | using value_type = typename satellite_t::sentinel_value_type; 20 | 21 | template 22 | friend struct ::tdc::serialize; 23 | 24 | std::unique_ptr m_alloc; 25 | value_type m_empty_value; 26 | 27 | /// runtime initilization arguments, if any 28 | struct config_args { 29 | value_type empty_value = value_type(); 30 | }; 31 | 32 | /// get the config of this instance 33 | inline config_args current_config() const { 34 | return config_args{ 35 | m_empty_value, 36 | }; 37 | } 38 | 39 | inline plain_sentinel_t() {} 40 | inline plain_sentinel_t(size_t table_size, 41 | entry_bit_width_t widths, 42 | config_args config): 43 | m_empty_value(config.empty_value) 44 | { 45 | size_t alloc_size = qvd_t::calc_sizes(table_size, widths).overall_qword_size; 46 | m_alloc = std::make_unique(alloc_size); 47 | 48 | auto ctx = context(table_size, widths); 49 | 50 | for(size_t i = 0; i < table_size; i++) { 51 | // NB: Using at because allocate_pos() 52 | // destroys the location first. 53 | auto elem = ctx.at(ctx.table_pos(i)); 54 | elem.set_no_drop(value_type(m_empty_value), 0); 55 | } 56 | } 57 | struct table_pos_t { 58 | size_t offset; 59 | inline table_pos_t(): offset(-1) {} 60 | inline table_pos_t(size_t o): offset(o) {} 61 | inline table_pos_t& operator=(table_pos_t const& other) = default; 62 | inline table_pos_t(table_pos_t const& other) = default; 63 | }; 64 | // pseudo-iterator for iterating over bucket elements 65 | // NB: does not wrap around! 66 | struct iter_t { 67 | entry_ptr_t m_end; 68 | value_type const& m_empty_value; 69 | 70 | inline iter_t(entry_ptr_t endpos, 71 | value_type const& empty_value): 72 | m_end(endpos), 73 | m_empty_value(empty_value) 74 | { 75 | } 76 | 77 | inline entry_ptr_t get() { 78 | return m_end; 79 | } 80 | 81 | inline void decrement() { 82 | do { 83 | m_end.decrement_ptr(); 84 | } while(*m_end.val_ptr() == m_empty_value); 85 | } 86 | 87 | inline bool operator!=(iter_t& other) { 88 | return m_end != other.m_end; 89 | } 90 | }; 91 | 92 | template 93 | struct context_t { 94 | alloc_type& m_alloc; 95 | value_type const& m_empty_value; 96 | size_t const table_size; 97 | entry_bit_width_t widths; 98 | 99 | inline void destroy_vals() { 100 | qvd_t::destroy_vals(m_alloc.get(), table_size, widths); 101 | } 102 | 103 | inline table_pos_t table_pos(size_t pos) { 104 | return table_pos_t { pos }; 105 | } 106 | inline entry_ptr_t allocate_pos(table_pos_t pos) { 107 | DCHECK_LT(pos.offset, table_size); 108 | auto tmp = at(pos); 109 | 110 | // NB: allocate_pos returns a unitialized location, 111 | // but all locations are per default initialized with a empty_value. 
112 | // Therefore we destroy the existing value first. 113 | tmp.uninitialize(); 114 | 115 | return tmp; 116 | } 117 | inline entry_ptr_t at(table_pos_t pos) { 118 | DCHECK_LT(pos.offset, table_size); 119 | return qvd_t::at(m_alloc.get(), table_size, pos.offset, widths); 120 | } 121 | inline bool pos_is_empty(table_pos_t pos) { 122 | DCHECK_LT(pos.offset, table_size); 123 | return *at(pos).val_ptr() == m_empty_value; 124 | } 125 | inline iter_t make_iter(table_pos_t const& pos) { 126 | // NB: One-pass-the-end is acceptable for a end iterator 127 | DCHECK_LE(pos.offset, table_size); 128 | return iter_t { 129 | qvd_t::at(m_alloc.get(), table_size, pos.offset, widths), 130 | m_empty_value, 131 | }; 132 | } 133 | inline void trim_storage(table_pos_t* last_start, table_pos_t const& end) { 134 | // Nothing to be done 135 | } 136 | }; 137 | inline auto context(size_t table_size, entry_bit_width_t const& widths) { 138 | return context_t> { 139 | m_alloc, m_empty_value, table_size, widths, 140 | }; 141 | } 142 | inline auto context(size_t table_size, entry_bit_width_t const& widths) const { 143 | return context_t const> { 144 | m_alloc, m_empty_value, table_size, widths, 145 | }; 146 | } 147 | }; 148 | } 149 | 150 | template 151 | struct heap_size> { 152 | using T = compact_hash::plain_sentinel_t; 153 | using entry_bit_width_t = typename T::entry_bit_width_t; 154 | using value_type = typename T::value_type; 155 | using qvd_t = typename T::qvd_t; 156 | 157 | static object_size_t compute(T const& val, size_t table_size, entry_bit_width_t const& widths) { 158 | using namespace compact_hash; 159 | 160 | auto bytes = object_size_t::empty(); 161 | 162 | auto alloc_size = qvd_t::calc_sizes(table_size, widths).overall_qword_size; 163 | 164 | bytes += heap_size::compute(val.m_empty_value); 165 | bytes += heap_size>::compute(val.m_alloc, alloc_size); 166 | 167 | return bytes; 168 | } 169 | }; 170 | 171 | template 172 | struct serialize> { 173 | using T = compact_hash::plain_sentinel_t; 174 | using entry_bit_width_t = typename T::entry_bit_width_t; 175 | using value_type = typename T::value_type; 176 | using qvd_t = typename T::qvd_t; 177 | 178 | static object_size_t write(std::ostream& out, T const& val, size_t table_size, entry_bit_width_t const& widths) { 179 | using namespace compact_hash; 180 | 181 | auto bytes = object_size_t::empty(); 182 | 183 | auto alloc_size = qvd_t::calc_sizes(table_size, widths).overall_qword_size; 184 | 185 | bytes += serialize::write(out, val.m_empty_value); 186 | for (size_t i = 0; i < alloc_size; i++) { 187 | bytes += serialize::write(out, val.m_alloc[i]); 188 | } 189 | 190 | return bytes; 191 | } 192 | static T read(std::istream& in, size_t table_size, entry_bit_width_t const& widths) { 193 | using namespace compact_hash; 194 | 195 | auto alloc_size = qvd_t::calc_sizes(table_size, widths).overall_qword_size; 196 | 197 | T ret; 198 | ret.m_empty_value = serialize::read(in); 199 | ret.m_alloc = std::make_unique(alloc_size); 200 | 201 | for (size_t i = 0; i < alloc_size; i++) { 202 | ret.m_alloc[i] = serialize::read(in); 203 | } 204 | 205 | return ret; 206 | } 207 | static bool equal_check(T const& lhs, T const& rhs, size_t table_size, entry_bit_width_t const& widths) { 208 | auto lhsc = lhs.context(table_size, widths); 209 | auto rhsc = rhs.context(table_size, widths); 210 | 211 | for (size_t i = 0; i < table_size; i++) { 212 | auto lhspos = lhsc.table_pos(i); 213 | auto rhspos = rhsc.table_pos(i); 214 | if (!gen_equal_diagnostic(lhsc.pos_is_empty(lhspos) == 
rhsc.pos_is_empty(rhspos))) { 215 | return false; 216 | } 217 | if (!lhsc.pos_is_empty(lhspos)) { 218 | auto lhsptrs = lhsc.at(lhspos); 219 | auto rhsptrs = rhsc.at(rhspos); 220 | 221 | if (!gen_equal_diagnostic(lhsptrs.get_quotient() == rhsptrs.get_quotient())) { 222 | return false; 223 | } 224 | if (!gen_equal_diagnostic(*lhsptrs.val_ptr() == *rhsptrs.val_ptr())) { 225 | return false; 226 | } 227 | } 228 | } 229 | 230 | return true; 231 | } 232 | }; 233 | 234 | } 235 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/storage/sparse_pos_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace tdc {namespace compact_hash { 11 | 12 | /// This type represents a position inside the compact sparse hashtable. 13 | /// 14 | /// It is valid to have a sparse_pos_t one-past-the-end of the underlying 15 | /// bucket vector, to act as an end-iterator. 16 | template 17 | class sparse_pos_t { 18 | private: 19 | bucket_t* m_buckets; 20 | 21 | public: 22 | /// Index of bucket inside the hashtable 23 | size_t idx_of_bucket; 24 | 25 | /// Bit mask of the element inside the bucket 26 | uint64_t bit_mask_in_bucket; 27 | 28 | inline sparse_pos_t(size_t pos, bucket_t* buckets): 29 | m_buckets(buckets), 30 | idx_of_bucket(bucket_layout_t::table_pos_to_idx_of_bucket(pos)), 31 | bit_mask_in_bucket(1ull << bucket_layout_t::table_pos_to_idx_inside_bucket(pos)) 32 | {} 33 | 34 | inline sparse_pos_t(): m_buckets(nullptr) {} 35 | inline sparse_pos_t(sparse_pos_t const& other) = default; 36 | inline sparse_pos_t& operator=(sparse_pos_t const& other) = default; 37 | 38 | /// Accesses the bucket at this sparse position. 39 | inline bucket_t& bucket() const { 40 | //DCHECK_LT(idx_of_bucket, m_buckets->size()); 41 | return m_buckets[idx_of_bucket]; 42 | } 43 | 44 | /// Check if the sparse position exists in the corresponding bucket. 45 | inline bool exists_in_bucket() const { 46 | // bitvector of the bucket 47 | uint64_t bv = bucket().bv(); 48 | 49 | return (bv & bit_mask_in_bucket) != 0; 50 | } 51 | 52 | /// Get the idx of the element inside the corresponding bucket. 53 | /// 54 | /// It is legal to call this method even if the element at 55 | /// the sparse position does not exists, to calculate a position 56 | /// at which it should be inserted. 57 | inline size_t offset_in_bucket() const { 58 | // bitvector of the bucket 59 | uint64_t bv = bucket().bv(); 60 | 61 | return popcount(bv & (bit_mask_in_bucket - 1)); 62 | } 63 | }; 64 | 65 | }} 66 | -------------------------------------------------------------------------------- /include/tudocomp/util/compact_hash/util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace tdc {namespace compact_hash { 11 | 12 | inline uint8_t log2_upper(uint64_t v) { // TODO: this is slow. 
Use the highest set bit 13 | uint8_t m = 0; 14 | uint64_t n = v; 15 | while(n) { 16 | n >>= 1; 17 | m++; 18 | } 19 | m--; 20 | return m; 21 | } 22 | 23 | inline bool is_pot(size_t n) { 24 | return (n > 0ull && ((n & (n - 1ull)) == 0ull)); 25 | } 26 | 27 | using QuotPtr = typename cbp::cbp_repr_t::pointer_t; 28 | 29 | template 30 | using ValPtr = typename cbp::cbp_repr_t::pointer_t; 31 | template 32 | using ValRef = typename cbp::cbp_repr_t::reference_t; 33 | 34 | inline size_t popcount(uint64_t value) { 35 | return __builtin_popcountll(value); 36 | } 37 | 38 | }} 39 | -------------------------------------------------------------------------------- /include/tudocomp/util/heap_size.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace tdc { 11 | template 12 | struct heap_size { 13 | static object_size_t compute(T const& val) { 14 | return object_size_t::unknown_extra_data(sizeof(T)); 15 | } 16 | }; 17 | 18 | template 19 | inline object_size_t heap_size_compute(T const& val) { 20 | return heap_size::compute(val); 21 | } 22 | 23 | #define gen_heap_size_without_indirection(...) \ 24 | template<>\ 25 | struct heap_size<__VA_ARGS__> {\ 26 | static object_size_t compute(__VA_ARGS__ const& val) {\ 27 | return object_size_t::exact(sizeof(__VA_ARGS__));\ 28 | }\ 29 | }; 30 | 31 | gen_heap_size_without_indirection(bool) 32 | gen_heap_size_without_indirection(unsigned char) 33 | gen_heap_size_without_indirection(signed char) 34 | gen_heap_size_without_indirection(char) 35 | gen_heap_size_without_indirection(unsigned short int) 36 | gen_heap_size_without_indirection(unsigned int) 37 | gen_heap_size_without_indirection(unsigned long int) 38 | gen_heap_size_without_indirection(unsigned long long int) 39 | gen_heap_size_without_indirection(signed short int) 40 | gen_heap_size_without_indirection(signed int) 41 | gen_heap_size_without_indirection(signed long int) 42 | gen_heap_size_without_indirection(signed long long int) 43 | gen_heap_size_without_indirection(float) 44 | gen_heap_size_without_indirection(double) 45 | 46 | template 47 | struct heap_size> { 48 | static object_size_t compute(std::unique_ptr const& val, size_t size) { 49 | auto bytes = object_size_t::exact(sizeof(val)); 50 | 51 | for (size_t i = 0; i < size; i++) { 52 | bytes += heap_size::compute(val[i]); 53 | } 54 | 55 | return bytes; 56 | } 57 | }; 58 | } 59 | -------------------------------------------------------------------------------- /include/tudocomp/util/object_size_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace tdc { 9 | /// Represents the total size of some object. 10 | /// 11 | /// For example, it can represent the number of bytes written to a file, 12 | /// or the total heap size of a object in memory. 13 | /// 14 | /// It is possible for a datastructure to not have a known size. In that 15 | /// case, this datastructure should contain the closest lower approximation 16 | /// of one, and `is_exact()` returns `false`. 
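/// Usage sketch (relying only on the accessors shown below): sizes compose
/// with `+` / `+=`, and an unknown contribution keeps the byte count as a
/// lower bound while clearing exactness:
///
///     auto total = object_size_t::exact(4 * sizeof(uint64_t))    // 32 bytes, exact
///                + object_size_t::unknown_extra_data(100);       // >= 100 bytes
///
///     total.size_in_bytes();   // 132
///     total.is_exact();        // false; streamed as ">=0.128906 KiB"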
17 | class object_size_t { 18 | size_t m_bytes = 0; 19 | bool m_has_unknown_parts = false; 20 | 21 | object_size_t() = default; 22 | object_size_t(size_t bytes, bool has_unknown_parts): 23 | m_bytes(bytes), m_has_unknown_parts(has_unknown_parts) {} 24 | public: 25 | inline static object_size_t empty() { 26 | return object_size_t(0, false); 27 | } 28 | inline static object_size_t exact(size_t size) { 29 | return object_size_t(size, false); 30 | } 31 | inline static object_size_t unknown_extra_data(size_t size) { 32 | return object_size_t(size, true); 33 | } 34 | 35 | inline object_size_t operator+(object_size_t const& other) const { 36 | return object_size_t( 37 | m_bytes + other.m_bytes, 38 | m_has_unknown_parts || other.m_has_unknown_parts); 39 | } 40 | inline object_size_t& operator+=(object_size_t const& other) { 41 | m_bytes += other.m_bytes; 42 | m_has_unknown_parts |= other.m_has_unknown_parts; 43 | return *this; 44 | } 45 | 46 | inline size_t size_in_bytes() const { 47 | return m_bytes; 48 | } 49 | 50 | inline double size_in_kibibytes() const { 51 | return double(m_bytes) / 1024.0; 52 | } 53 | 54 | inline double size_in_mebibytes() const { 55 | return double(m_bytes) / 1024.0 / 1024.0; 56 | } 57 | 58 | inline bool is_exact() const { 59 | return !m_has_unknown_parts; 60 | } 61 | 62 | inline friend std::ostream& operator<<(std::ostream& out, object_size_t const& v) { 63 | if (!v.is_exact()) { 64 | out << ">="; 65 | } 66 | out << v.size_in_kibibytes(); 67 | out << " KiB"; 68 | return out; 69 | } 70 | }; 71 | } 72 | -------------------------------------------------------------------------------- /include/tudocomp/util/serialization.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace tdc { 10 | inline bool equal_diagnostic(bool v, char const* msg) { 11 | if (!v) { 12 | std::cerr << "not equal: " << msg << "\n"; 13 | } 14 | return v; 15 | } 16 | #define gen_equal_diagnostic(e) \ 17 | equal_diagnostic(e, #e) 18 | 19 | #define gen_equal_check(field, ...) \ 20 | gen_equal_diagnostic( \ 21 | serialize::equal_check(lhs.field, rhs.field, ##__VA_ARGS__)) 22 | 23 | template 24 | struct serialize { 25 | //* 26 | static object_size_t write(std::ostream& out, T const& val) { 27 | CHECK(false) << "Need to specialize `tdc::serialize` for type " << typeid(T).name(); 28 | return object_size_t::unknown_extra_data(0); 29 | } 30 | static T read(std::istream& in) { 31 | CHECK(false) << "Need to specialize `tdc::serialize` for type " << typeid(T).name(); 32 | } 33 | static bool equal_check(T const& lhs, T const& rhs) { 34 | CHECK(false) << "Need to specialize `tdc::serialize` for type " << typeid(T).name(); 35 | return false; 36 | } 37 | //*/ 38 | }; 39 | 40 | template 41 | inline object_size_t serialize_write(std::ostream& out, T const& val) { 42 | return serialize::write(out, val); 43 | } 44 | 45 | template 46 | inline T serialize_read(std::istream& inp) { 47 | return serialize::read(inp); 48 | } 49 | 50 | template 51 | inline void serialize_read_into(std::istream& inp, T& out) { 52 | out = serialize::read(inp); 53 | } 54 | 55 | #define gen_direct_serialization(...) 
\ 56 | template<>\ 57 | struct serialize<__VA_ARGS__> {\ 58 | using T = __VA_ARGS__;\ 59 | static object_size_t write(std::ostream& out, T const& val) {\ 60 | out.write((char const*) &val, sizeof(T));\ 61 | return object_size_t::exact(sizeof(T));\ 62 | }\ 63 | static T read(std::istream& in) {\ 64 | T val;\ 65 | in.read((char*) &val, sizeof(T));\ 66 | return val;\ 67 | }\ 68 | static bool equal_check(T const& lhs, T const& rhs) {\ 69 | return gen_equal_diagnostic(lhs == rhs);\ 70 | }\ 71 | }; 72 | 73 | gen_direct_serialization(bool) 74 | gen_direct_serialization(unsigned char) 75 | gen_direct_serialization(signed char) 76 | gen_direct_serialization(char) 77 | gen_direct_serialization(unsigned short int) 78 | gen_direct_serialization(unsigned int) 79 | gen_direct_serialization(unsigned long int) 80 | gen_direct_serialization(unsigned long long int) 81 | gen_direct_serialization(signed short int) 82 | gen_direct_serialization(signed int) 83 | gen_direct_serialization(signed long int) 84 | gen_direct_serialization(signed long long int) 85 | gen_direct_serialization(float) 86 | gen_direct_serialization(double) 87 | 88 | } 89 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(NOT GTEST_FOUND) 2 | MESSAGE(STATUS "gtest is not available - tests disabled!") 3 | return() 4 | endif() 5 | 6 | include(tdc_testsuite) 7 | 8 | run_test(compact_sparse_hash_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 9 | run_test(compact_hash_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 10 | run_test(compact_sparse_hash_displacement_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 11 | run_test(compact_hash_displacement_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 12 | run_test(compact_sparse_hash_elias_displacement_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 13 | run_test(compact_hash_elias_displacement_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 14 | 15 | run_test(v2_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 16 | run_test(sandbox_test DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 17 | 18 | run_test(compact_sparse_hashset_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 19 | run_test(compact_sparse_hashset_displacement_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 20 | run_test(compact_sparse_hashset_elias_displacement_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 21 | 22 | run_test(compact_sparse_hashset_serialization_tests DEPS ${TDC_TEST_DEPS} compact_sparse_hash) 23 | -------------------------------------------------------------------------------- /test/compact_hash_displacement_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | template 9 | using COMPACT_TABLE = tdc::compact_hash::map::plain_layered_hashmap_t; 10 | 11 | #include "compact_hash_tests.template.hpp" 12 | -------------------------------------------------------------------------------- /test/compact_hash_elias_displacement_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | template 9 | using COMPACT_TABLE = tdc::compact_hash::map::plain_elias_hashmap_t; 10 | 11 | #include "compact_hash_tests.template.hpp" 12 | -------------------------------------------------------------------------------- /test/compact_hash_tests.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | template 9 | using COMPACT_TABLE = tdc::compact_hash::map::plain_cv_hashmap_t; 10 | 11 | #include "compact_hash_tests.template.hpp" 12 | -------------------------------------------------------------------------------- /test/compact_hashset_tests.template.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace tdc; 8 | using namespace tdc::compact_hash; 9 | using namespace tdc::compact_hash::set; 10 | 11 | using compact_hash_type = COMPACT_TABLE; 12 | 13 | struct shadow_sets_t { 14 | std::unordered_set keys; 15 | std::unordered_set ids; 16 | compact_hash_type& table; 17 | 18 | shadow_sets_t(compact_hash_type& t): table(t) {} 19 | 20 | struct on_resize_t { 21 | shadow_sets_t& self; 22 | 23 | inline void on_resize(size_t table_size) { 24 | self.keys.clear(); 25 | self.ids.clear(); 26 | } 27 | inline void on_reinsert(uint64_t key, uint64_t id) { 28 | self.new_key(key, id); 29 | } 30 | }; 31 | auto on_resize() { 32 | return on_resize_t { *this }; 33 | } 34 | 35 | void new_key(uint64_t key, uint64_t id) { 36 | // std::cout << "insert(key=" << key << ", id=" << id << ")\n"; 37 | EXPECT_TRUE(keys.count(key) == 0) << "Key " << key << " already exists"; 38 | EXPECT_TRUE(ids.count(id) == 0) << "Id " << id << " already exists"; 39 | keys.insert(key); 40 | ids.insert(id); 41 | } 42 | 43 | void existing_key(uint64_t key, uint64_t id) { 44 | EXPECT_TRUE(keys.count(key) == 1) << "Key " << key << " does not exists"; 45 | EXPECT_TRUE(ids.count(id) == 1) << "Id " << id << " does not exists"; 46 | } 47 | 48 | auto lookup(uint64_t key) { 49 | auto r = table.lookup(key); 50 | 51 | if (r.found()) { 52 | EXPECT_TRUE(r.key_already_exist()); 53 | existing_key(key, r.id()); 54 | } 55 | 56 | return r; 57 | } 58 | auto lookup_insert(uint64_t key) { 59 | auto r = table.lookup_insert(key, on_resize()); 60 | 61 | if (r.key_already_exist()) { 62 | existing_key(key, r.id()); 63 | } else { 64 | new_key(key, r.id()); 65 | } 66 | 67 | return r; 68 | } 69 | auto lookup_insert_key_width(uint64_t key, uint8_t key_width) { 70 | auto r = table.lookup_insert_key_width(key, key_width, on_resize()); 71 | 72 | if (r.key_already_exist()) { 73 | existing_key(key, r.id()); 74 | } else { 75 | new_key(key, r.id()); 76 | } 77 | 78 | return r; 79 | } 80 | void max_load_factor(double v) { 81 | table.max_load_factor(v); 82 | } 83 | }; 84 | 85 | /// Assert that a element exists in the hashtable 86 | inline void debug_check_single(compact_hash_type& table, uint64_t key) { 87 | auto r = table.lookup(key); 88 | ASSERT_TRUE(r.found()) << "key " << key << " not found!"; 89 | 90 | auto c = table.count(key); 91 | ASSERT_EQ(c, 1U) << "key " << key << " not found!"; 92 | 93 | auto p = table.find(key); 94 | ASSERT_TRUE(p != decltype(p)()) << "key " << key << " not found!"; 95 | ASSERT_TRUE(p != nullptr) << "key " << key << " not found!"; 96 | ASSERT_TRUE(*p == key) << "key " << key << " not found!"; 97 | } 98 | 99 | /// Assert that a element exists in the hashtable 100 | inline void debug_check_single_id(compact_hash_type& table, uint64_t id) { 101 | auto r = table.lookup_id(id); 102 | ASSERT_TRUE(r.found()) << "id " << id << " not found!"; 103 | ASSERT_EQ(r.id(), id) << "lookup id is " << r.id() << " instead of " << id; 104 | } 105 | 106 | /// Assert that a element exists in the hashtable 107 | 
inline void debug_check_single(shadow_sets_t& table, uint64_t key) { 108 | auto r = table.lookup(key); 109 | ASSERT_TRUE(r.found()) << "key " << key << " not found!"; 110 | table.existing_key(key, r.id()); 111 | } 112 | 113 | template 114 | void test_hashfn() { 115 | for(uint32_t w = 1; w < 64; w++) { 116 | hashfn_t fn { w, {} }; 117 | 118 | size_t max_val = std::min(((1 << w) - 1), 1000); 119 | 120 | for (size_t i = 0; i < (max_val + 1); i++) { 121 | auto hi = fn.hash(i); 122 | auto hhi = fn.hash_inv(hi); 123 | /* 124 | std::cout 125 | << w << ", " 126 | << i << ", " 127 | << hi << ", " 128 | << hhi << "\n"; 129 | */ 130 | 131 | ASSERT_EQ(i, hhi); 132 | } 133 | } 134 | } 135 | 136 | TEST(hashfn, xorshift) { 137 | test_hashfn(); 138 | } 139 | 140 | TEST(hashfn, poplar_xorshift) { 141 | test_hashfn(); 142 | } 143 | 144 | TEST(hash, lookup_insert) { 145 | auto chx = compact_hash_type(256, 16); 146 | auto ch = shadow_sets_t(chx); 147 | 148 | ch.lookup_insert(44); 149 | ch.lookup_insert(45); 150 | ch.lookup_insert(45); 151 | ch.lookup_insert(44 + 256); 152 | ch.lookup_insert(45 + 256); 153 | ch.lookup_insert(46); 154 | 155 | ch.lookup_insert(44); 156 | ch.lookup_insert(45); 157 | ch.lookup_insert(44 + 256); 158 | ch.lookup_insert(45 + 256); 159 | ch.lookup_insert(46); 160 | 161 | //ch.lookup_insert(0); 162 | //ch.lookup_insert(4); 163 | //ch.lookup_insert(9); 164 | //ch.lookup_insert(128); 165 | 166 | //std::cout << "=======================\n"; 167 | //std::cout << ch.debug_state() << "\n"; 168 | //std::cout << "=======================\n"; 169 | 170 | } 171 | 172 | TEST(hash, lookup_insert_wrap) { 173 | auto chx = compact_hash_type(4, 16); 174 | auto ch = shadow_sets_t(chx); 175 | ch.max_load_factor(1.0); 176 | 177 | ch.lookup_insert(3); 178 | ch.lookup_insert(7); 179 | ch.lookup_insert(15); 180 | 181 | //std::cout << "=======================\n"; 182 | //std::cout << ch.debug_state() << "\n"; 183 | //std::cout << "=======================\n"; 184 | 185 | } 186 | 187 | TEST(hash, lookup_insert_move_wrap) { 188 | auto chx = compact_hash_type(8, 16); 189 | auto ch = shadow_sets_t(chx); 190 | ch.max_load_factor(1.0); 191 | 192 | ch.lookup_insert(3); 193 | ch.lookup_insert(3 + 8); 194 | 195 | ch.lookup_insert(5); 196 | ch.lookup_insert(5 + 8); 197 | ch.lookup_insert(5 + 16); 198 | ch.lookup_insert(5 + 24); 199 | 200 | ch.lookup_insert(4); 201 | 202 | //std::cout << "=======================\n"; 203 | //std::cout << ch.debug_state() << "\n"; 204 | //std::cout << "=======================\n"; 205 | 206 | debug_check_single(ch, 3); 207 | debug_check_single(ch, 3 + 8); 208 | debug_check_single(ch, 5); 209 | debug_check_single(ch, 5 + 8); 210 | debug_check_single(ch, 5 + 16); 211 | debug_check_single(ch, 5 + 24); 212 | debug_check_single(ch, 4); 213 | } 214 | 215 | TEST(hash, cornercase) { 216 | auto chx = compact_hash_type(8, 16); 217 | auto ch = shadow_sets_t(chx); 218 | 219 | ch.lookup_insert(0); 220 | ch.lookup_insert(0 + 8); 221 | 222 | debug_check_single(ch, 0); 223 | debug_check_single(ch, 0 + 8); 224 | 225 | //std::cout << "=======================\n"; 226 | //std::cout << ch.debug_state() << "\n"; 227 | //std::cout << "=======================\n"; 228 | 229 | } 230 | 231 | TEST(hash, grow) { 232 | std::vector lookup_inserted; 233 | 234 | auto chx = compact_hash_type(0, 10); // check that it grows to minimum 2 235 | auto ch = shadow_sets_t(chx); 236 | 237 | auto add = [&](auto key) { 238 | ch.lookup_insert(key); 239 | //lookup_inserted.clear(); 240 | lookup_inserted.push_back(key); 241 | for (auto& k 
: lookup_inserted) { 242 | debug_check_single(ch, k); 243 | } 244 | }; 245 | 246 | 247 | for(size_t i = 0; i < 1000; i++) { 248 | add(i); 249 | } 250 | 251 | //std::cout << "=======================\n"; 252 | //std::cout << ch.debug_state() << "\n"; 253 | //std::cout << "=======================\n"; 254 | 255 | } 256 | 257 | TEST(hash, grow_bits) { 258 | std::vector lookup_inserted; 259 | 260 | auto chx = compact_hash_type(0, 10); // check that it grows to minimum 2 261 | auto ch = shadow_sets_t(chx); 262 | 263 | uint8_t bits = 1; 264 | 265 | auto add = [&](auto key) { 266 | bits = std::max(bits, bits_for(key)); 267 | 268 | ch.lookup_insert_key_width(key, bits); 269 | //lookup_inserted.clear(); 270 | lookup_inserted.push_back(key); 271 | for (auto& k : lookup_inserted) { 272 | debug_check_single(ch, k); 273 | } 274 | }; 275 | 276 | 277 | for(size_t i = 0; i < 1000; i++) { 278 | add(i); 279 | } 280 | 281 | //std::cout << "=======================\n"; 282 | //std::cout << ch.debug_state() << "\n"; 283 | //std::cout << "=======================\n"; 284 | 285 | } 286 | 287 | TEST(hash, grow_bits_larger) { 288 | std::vector lookup_inserted; 289 | 290 | auto chx = compact_hash_type(0, 0); // check that it grows to minimum 2 291 | auto ch = shadow_sets_t(chx); 292 | 293 | uint8_t bits = 1; 294 | 295 | auto add = [&](auto key) { 296 | bits = std::max(bits, bits_for(key)); 297 | 298 | ch.lookup_insert_key_width(key, bits); 299 | lookup_inserted.clear(); 300 | lookup_inserted.push_back(key); 301 | for (auto& k : lookup_inserted) { 302 | debug_check_single(ch, k); 303 | } 304 | }; 305 | 306 | 307 | for(size_t i = 0; i < 10000; i++) { 308 | add(i*13ull); 309 | } 310 | } 311 | 312 | TEST(hash, grow_bits_larger_address) { 313 | std::vector lookup_inserted; 314 | 315 | auto chx = compact_hash_type(0, 0); // check that it grows to minimum 2 316 | auto ch = shadow_sets_t(chx); 317 | 318 | uint8_t bits = 1; 319 | 320 | auto add = [&](auto key) { 321 | bits = std::max(bits, bits_for(key)); 322 | 323 | auto r = ch.lookup_insert_key_width(key, bits); 324 | ASSERT_FALSE(r.key_already_exist()); 325 | lookup_inserted.clear(); 326 | lookup_inserted.push_back(key); 327 | for (auto& k : lookup_inserted) { 328 | debug_check_single(ch, k); 329 | } 330 | }; 331 | 332 | 333 | for(size_t i = 0; i < 10000; i++) { 334 | add(i*13ull); 335 | } 336 | 337 | //std::cout << "=======================\n"; 338 | //std::cout << ch.debug_state() << "\n"; 339 | //std::cout << "=======================\n"; 340 | } 341 | 342 | constexpr size_t load_max = 100000; 343 | //constexpr size_t load_max = 100; 344 | 345 | void load_factor_test(float z) { 346 | auto tablex = compact_hash_type(0, 1); 347 | auto table = shadow_sets_t(tablex); 348 | // TODO DEBUG 349 | // table.debug_state(); 350 | 351 | table.max_load_factor(z); 352 | for(size_t i = 0; i < load_max; i++) { 353 | table.lookup_insert_key_width(i, bits_for(i)); 354 | } 355 | //std::cout << table.debug_print_storage() << "\n"; 356 | for(size_t i = 0; i < load_max; i++) { 357 | auto r = table.lookup(i); 358 | ASSERT_TRUE(r.found()); 359 | } 360 | auto r = table.lookup(load_max); 361 | ASSERT_FALSE(r.found()); 362 | 363 | // TODO DEBUG 364 | /* 365 | auto stats = table.stat_gather(); 366 | 367 | std::cout << "stats.buckets: " << stats.buckets << "\n"; 368 | std::cout << "stats.allocated_buckets: " << stats.allocated_buckets << "\n"; 369 | std::cout << "stats.buckets_real_allocated_capacity_in_bytes: " << stats.buckets_real_allocated_capacity_in_bytes << "\n"; 370 | std::cout << 
"stats.real_allocated_capacity_in_bytes: " << stats.real_allocated_capacity_in_bytes << "\n"; 371 | std::cout << "stats.theoretical_minimum_size_in_bits: " << stats.theoretical_minimum_size_in_bits << "\n"; 372 | */ 373 | } 374 | 375 | TEST(hash_load, max_load_10) { 376 | load_factor_test(0.1); 377 | } 378 | TEST(hash_load, max_load_20) { 379 | load_factor_test(0.2); 380 | } 381 | TEST(hash_load, max_load_30) { 382 | load_factor_test(0.3); 383 | } 384 | TEST(hash_load, max_load_40) { 385 | load_factor_test(0.4); 386 | } 387 | TEST(hash_load, max_load_50) { 388 | load_factor_test(0.5); 389 | } 390 | TEST(hash_load, max_load_60) { 391 | load_factor_test(0.6); 392 | } 393 | TEST(hash_load, max_load_70) { 394 | load_factor_test(0.7); 395 | } 396 | TEST(hash_load, max_load_80) { 397 | load_factor_test(0.8); 398 | } 399 | TEST(hash_load, max_load_90) { 400 | load_factor_test(0.9); 401 | } 402 | TEST(hash_load, max_load_100) { 403 | load_factor_test(1.0); 404 | } 405 | 406 | TEST(hash, swap) { 407 | auto a = compact_hash_type(8, 16); 408 | { 409 | auto& ch = a; 410 | ch.max_load_factor(1.0); 411 | ch.lookup_insert(3); 412 | ch.lookup_insert(3 + 8); 413 | ch.lookup_insert(5); 414 | ch.lookup_insert(5 + 8); 415 | ch.lookup_insert(5 + 16); 416 | ch.lookup_insert(5 + 24); 417 | ch.lookup_insert(4); 418 | } 419 | auto b = compact_hash_type(8, 16); 420 | { 421 | auto& ch = b; 422 | ch.max_load_factor(1.0); 423 | ch.lookup_insert(3); 424 | ch.lookup_insert(3 + 8); 425 | ch.lookup_insert(5); 426 | ch.lookup_insert(5 + 8); 427 | ch.lookup_insert(5 + 16); 428 | ch.lookup_insert(5 + 24); 429 | ch.lookup_insert(4); 430 | } 431 | 432 | a.swap(b); 433 | std::swap(a, b); 434 | } 435 | 436 | TEST(hash, grow_bits_larger_id_lookup) { 437 | auto ch = compact_hash_type(0, 1); // check that it grows to minimum 2 438 | 439 | uint8_t bits = 1; 440 | 441 | auto add = [&](auto key) { 442 | bits = std::max(bits, bits_for(key)); 443 | 444 | auto entry = ch.lookup_insert_key_width(key, bits); 445 | uint64_t id = entry.id(); 446 | debug_check_single(ch, key); 447 | debug_check_single_id(ch, id); 448 | }; 449 | 450 | 451 | for(size_t i = 0; i < 10000; i++) { 452 | add(i*13ull); 453 | } 454 | } 455 | 456 | -------------------------------------------------------------------------------- /test/compact_sparse_hash_displacement_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | template 9 | using COMPACT_TABLE = tdc::compact_hash::map::sparse_layered_hashmap_t; 10 | 11 | #include "compact_hash_tests.template.hpp" 12 | -------------------------------------------------------------------------------- /test/compact_sparse_hash_elias_displacement_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | template 9 | using COMPACT_TABLE = tdc::compact_hash::map::sparse_elias_hashmap_t; 10 | 11 | #include "compact_hash_tests.template.hpp" 12 | -------------------------------------------------------------------------------- /test/compact_sparse_hash_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | template 9 | using COMPACT_TABLE = tdc::compact_hash::map::sparse_cv_hashmap_t; 10 | 11 | #include "compact_hash_tests.template.hpp" 12 | -------------------------------------------------------------------------------- 
/test/compact_sparse_hashset_displacement_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | using COMPACT_TABLE = tdc::compact_hash::set::sparse_layered_hashset_t<>; 9 | 10 | #include "compact_hashset_tests.template.hpp" 11 | -------------------------------------------------------------------------------- /test/compact_sparse_hashset_elias_displacement_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | using COMPACT_TABLE = tdc::compact_hash::set::sparse_elias_hashset_t<>; 9 | 10 | #include "compact_hashset_tests.template.hpp" 11 | -------------------------------------------------------------------------------- /test/compact_sparse_hashset_serialization_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | using namespace tdc::compact_hash; 20 | using namespace tdc::compact_hash::set; 21 | using namespace tdc::compact_hash::map; 22 | 23 | template 24 | void serialize_test_builder(build_func f) { 25 | using tdc::serialize; 26 | using tdc::heap_size; 27 | auto a = f(); 28 | 29 | std::stringstream ss; 30 | auto bytes = serialize::write(ss, a); 31 | size_t stream_bytes = ss.tellp(); 32 | ASSERT_EQ(bytes.size_in_bytes(), stream_bytes); 33 | 34 | auto b = serialize::read(ss); 35 | 36 | ASSERT_TRUE(serialize::equal_check(a, b)); 37 | 38 | auto c = f(); 39 | 40 | ASSERT_TRUE(serialize::equal_check(a, c)); 41 | ASSERT_TRUE(serialize::equal_check(b, c)); 42 | 43 | std::cout << "heap size: " 44 | << heap_size::compute(a) 45 | << ", written bytes: " 46 | << bytes 47 | << "\n"; 48 | } 49 | 50 | 51 | template 52 | void serialize_test_set() { 53 | serialize_test_builder([] { 54 | auto ch = table_t(8, 16); 55 | ch.max_load_factor(1.0); 56 | ch.lookup_insert(3); 57 | ch.lookup_insert(3 + 8); 58 | ch.lookup_insert(5); 59 | ch.lookup_insert(5 + 8); 60 | ch.lookup_insert(5 + 16); 61 | ch.lookup_insert(5 + 24); 62 | ch.lookup_insert(4); 63 | return ch; 64 | }); 65 | serialize_test_builder([] { 66 | auto ch = table_t(8, 16); 67 | ch.max_load_factor(1.0); 68 | ch.lookup_insert(3); 69 | ch.lookup_insert(3 + 8); 70 | ch.lookup_insert(5); 71 | ch.lookup_insert(5 + 8); 72 | ch.lookup_insert(5 + 16); 73 | ch.lookup_insert(5 + 24); 74 | ch.lookup_insert(4); 75 | return ch; 76 | }); 77 | 78 | serialize_test_builder([] { 79 | auto ch = table_t(0, 10); 80 | 81 | auto add = [&](auto key) { 82 | ch.lookup_insert(key); 83 | }; 84 | 85 | for(size_t i = 0; i < 1000; i++) { 86 | add(i); 87 | } 88 | 89 | return ch; 90 | }); 91 | 92 | serialize_test_builder([] { 93 | auto ch = table_t(0, 10); 94 | 95 | uint8_t bits = 1; 96 | 97 | auto add = [&](auto key) { 98 | bits = std::max(bits, tdc::bits_for(key)); 99 | 100 | ch.lookup_insert_key_width(key, bits); 101 | 102 | }; 103 | 104 | for(size_t i = 0; i < 1000; i++) { 105 | add(i); 106 | } 107 | 108 | return ch; 109 | }); 110 | 111 | serialize_test_builder([] { 112 | auto ch = table_t(0, 0); 113 | 114 | uint8_t bits = 1; 115 | 116 | auto add = [&](auto key) { 117 | bits = std::max(bits, tdc::bits_for(key)); 118 | ch.lookup_insert_key_width(key, bits); 119 | }; 120 | 121 | 122 | for(size_t i = 0; i < 10000; i++) { 123 | 
add(i*13ull); 124 | } 125 | 126 | return ch; 127 | }); 128 | } 129 | 130 | #define gen_test_set(name, ...) \ 131 | TEST(serialize, name) { \ 132 | serialize_test_set<__VA_ARGS__>(); \ 133 | } 134 | 135 | gen_test_set(set_poplar_displacement_compact_fixed_4, 136 | hashset_t< 137 | poplar_xorshift_t, 138 | displacement_t< 139 | layered_displacement_table_t> 140 | > 141 | > 142 | ) 143 | 144 | gen_test_set(set_poplar_displacement_compact_dynamic, 145 | hashset_t< 146 | poplar_xorshift_t, 147 | displacement_t< 148 | layered_displacement_table_t 149 | > 150 | > 151 | ) 152 | 153 | gen_test_set(set_poplar_cv, 154 | hashset_t< 155 | poplar_xorshift_t, 156 | cv_bvs_t 157 | > 158 | ) 159 | 160 | gen_test_set(set_poplar_displacement_elias_fixed_1024, 161 | hashset_t< 162 | poplar_xorshift_t, 163 | displacement_t< 164 | elias_gamma_displacement_table_t< 165 | fixed_elias_gamma_bucket_size_t<1024> 166 | > 167 | > 168 | > 169 | ) 170 | 171 | gen_test_set(set_poplar_displacement_elias_growing, 172 | hashset_t< 173 | poplar_xorshift_t, 174 | displacement_t< 175 | elias_gamma_displacement_table_t< 176 | growing_elias_gamma_bucket_size_t 177 | > 178 | > 179 | > 180 | ) 181 | 182 | gen_test_set(set_poplar_displacement_elias_dynamic, 183 | hashset_t< 184 | poplar_xorshift_t, 185 | displacement_t< 186 | elias_gamma_displacement_table_t< 187 | dynamic_fixed_elias_gamma_bucket_size_t 188 | > 189 | > 190 | > 191 | ) 192 | 193 | template 194 | void serialize_test_map() { 195 | serialize_test_builder([] { 196 | auto ch = table_t(8, 16); 197 | ch.max_load_factor(1.0); 198 | ch.insert(3, 42); 199 | ch.insert(3 + 8, 43); 200 | ch.insert(5, 44); 201 | ch.insert(5 + 8, 45); 202 | ch.insert(5 + 16, 46); 203 | ch.insert(5 + 24, 47); 204 | ch.insert(4, 48); 205 | return ch; 206 | }); 207 | serialize_test_builder([] { 208 | auto ch = table_t(8, 16); 209 | ch.max_load_factor(1.0); 210 | ch.insert(3, 49); 211 | ch.insert(3 + 8, 50); 212 | ch.insert(5, 51); 213 | ch.insert(5 + 8, 52); 214 | ch.insert(5 + 16, 53); 215 | ch.insert(5 + 24, 54); 216 | ch.insert(4, 55); 217 | return ch; 218 | }); 219 | 220 | serialize_test_builder([] { 221 | auto ch = table_t(0, 10); 222 | 223 | auto add = [&](auto key) { 224 | ch.insert(key, key * 3); 225 | }; 226 | 227 | for(size_t i = 0; i < 1000; i++) { 228 | add(i); 229 | } 230 | 231 | return ch; 232 | }); 233 | 234 | serialize_test_builder([] { 235 | auto ch = table_t(0, 10); 236 | 237 | uint8_t bits = 1; 238 | 239 | auto add = [&](auto key) { 240 | bits = std::max(bits, tdc::bits_for(key)); 241 | 242 | ch.insert_key_width(key, key * 4, bits); 243 | 244 | }; 245 | 246 | for(size_t i = 0; i < 1000; i++) { 247 | add(i); 248 | } 249 | 250 | return ch; 251 | }); 252 | 253 | serialize_test_builder([] { 254 | auto ch = table_t(0, 0); 255 | 256 | uint8_t bits = 1; 257 | 258 | auto add = [&](auto key) { 259 | bits = std::max(bits, tdc::bits_for(key)); 260 | ch.insert_key_width(key, key * 5, bits); 261 | }; 262 | 263 | 264 | for(size_t i = 0; i < 10000; i++) { 265 | add(i*13ull); 266 | } 267 | 268 | return ch; 269 | }); 270 | } 271 | 272 | #define gen_test_map(name, ...) 
\ 273 | TEST(serialize, name) { \ 274 | serialize_test_map<__VA_ARGS__>(); \ 275 | } 276 | 277 | using val_t = uint64_t; 278 | 279 | gen_test_map(map_poplar_bbv_displacement_compact_fixed_4, 280 | hashmap_t< 281 | val_t, 282 | poplar_xorshift_t, 283 | buckets_bv_t, 284 | displacement_t< 285 | layered_displacement_table_t> 286 | > 287 | > 288 | ) 289 | 290 | gen_test_map(map_poplar_bbv_displacement_compact_dynamic, 291 | hashmap_t< 292 | val_t, 293 | poplar_xorshift_t, 294 | buckets_bv_t, 295 | displacement_t< 296 | layered_displacement_table_t 297 | > 298 | > 299 | ) 300 | 301 | gen_test_map(map_poplar_bbv_cv, 302 | hashmap_t< 303 | val_t, 304 | poplar_xorshift_t, 305 | buckets_bv_t, 306 | cv_bvs_t 307 | > 308 | ) 309 | 310 | gen_test_map(map_poplar_bbv_displacement_elias_fixed_1024, 311 | hashmap_t< 312 | val_t, 313 | poplar_xorshift_t, 314 | buckets_bv_t, 315 | displacement_t< 316 | elias_gamma_displacement_table_t< 317 | fixed_elias_gamma_bucket_size_t<1024> 318 | > 319 | > 320 | > 321 | ) 322 | 323 | gen_test_map(map_poplar_bbv_displacement_elias_growing, 324 | hashmap_t< 325 | val_t, 326 | poplar_xorshift_t, 327 | buckets_bv_t, 328 | displacement_t< 329 | elias_gamma_displacement_table_t< 330 | growing_elias_gamma_bucket_size_t 331 | > 332 | > 333 | > 334 | ) 335 | 336 | gen_test_map(map_poplar_bbv_displacement_elias_dynamic, 337 | hashmap_t< 338 | val_t, 339 | poplar_xorshift_t, 340 | buckets_bv_t, 341 | displacement_t< 342 | elias_gamma_displacement_table_t< 343 | dynamic_fixed_elias_gamma_bucket_size_t 344 | > 345 | > 346 | > 347 | ) 348 | 349 | gen_test_map(map_poplar_ps_displacement_compact_fixed_4, 350 | hashmap_t< 351 | val_t, 352 | poplar_xorshift_t, 353 | plain_sentinel_t, 354 | displacement_t< 355 | layered_displacement_table_t> 356 | > 357 | > 358 | ) 359 | 360 | gen_test_map(map_poplar_ps_displacement_compact_dynamic, 361 | hashmap_t< 362 | val_t, 363 | poplar_xorshift_t, 364 | plain_sentinel_t, 365 | displacement_t< 366 | layered_displacement_table_t 367 | > 368 | > 369 | ) 370 | 371 | gen_test_map(map_poplar_ps_cv, 372 | hashmap_t< 373 | val_t, 374 | poplar_xorshift_t, 375 | plain_sentinel_t, 376 | cv_bvs_t 377 | > 378 | ) 379 | 380 | gen_test_map(map_poplar_ps_displacement_elias_fixed_1024, 381 | hashmap_t< 382 | val_t, 383 | poplar_xorshift_t, 384 | plain_sentinel_t, 385 | displacement_t< 386 | elias_gamma_displacement_table_t< 387 | fixed_elias_gamma_bucket_size_t<1024> 388 | > 389 | > 390 | > 391 | ) 392 | 393 | gen_test_map(map_poplar_ps_displacement_elias_growing, 394 | hashmap_t< 395 | val_t, 396 | poplar_xorshift_t, 397 | plain_sentinel_t, 398 | displacement_t< 399 | elias_gamma_displacement_table_t< 400 | growing_elias_gamma_bucket_size_t 401 | > 402 | > 403 | > 404 | ) 405 | 406 | gen_test_map(map_poplar_ps_displacement_elias_dynamic, 407 | hashmap_t< 408 | val_t, 409 | poplar_xorshift_t, 410 | plain_sentinel_t, 411 | displacement_t< 412 | elias_gamma_displacement_table_t< 413 | dynamic_fixed_elias_gamma_bucket_size_t 414 | > 415 | > 416 | > 417 | ) 418 | -------------------------------------------------------------------------------- /test/compact_sparse_hashset_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | using COMPACT_TABLE = tdc::compact_hash::set::sparse_cv_hashset_t<>; 9 | 10 | #include "compact_hashset_tests.template.hpp" 11 | -------------------------------------------------------------------------------- /test/sandbox_test.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | using namespace tdc; 10 | using namespace tdc::compact_hash::map; 11 | using namespace tdc::compact_hash; 12 | using namespace std; 13 | 14 | TEST(Sandbox, example) { 15 | auto map = sparse_cv_hashmap_t(0, 4); // creates a hash table with zero entries and sets the bit-width of the keys to four 16 | std::cout << "Key Width: " << map.key_width() << std::endl; 17 | std::cout << "Add i -> i*i from i = 0 up to 15" << std::endl; 18 | for(int i = 0; i <= 15; ++i) { // interval [0..15] can be represented by 4 bits 19 | map.insert(i, std::move(i*i)); // insert key i, value i*i 20 | std::cout << i << " -> " << map[i] << std::endl; // map[i] returns value with key i 21 | } 22 | std::cout << "Size: " << map.size() << std::endl; 23 | std::cout << "Update all values, set to i -> i" << std::endl; 24 | for(int i = 0; i <= 15; ++i) { 25 | std::cout << "Previously: " << i << " -> " << map[i] << std::endl; // map[i] returns value with key i 26 | map[i] = i; 27 | std::cout << "Now: " << i << " -> " << map[i] << std::endl; 28 | } 29 | std::cout << "Size: " << map.size() << std::endl; 30 | std::cout << "Add 10 additional elements with key-width 9" << std::endl; 31 | for(int i = 1; i < 11; ++i) { // keys i<<5 for i in [1..10] can be represented by 9 bits 32 | map.insert_key_width(i<<5, std::move(i+1), 9); // insert key i<<5, value i+1; keys now have a width of 9 33 | std::cout << (i<<5) << " -> " << map[i<<5] << std::endl; // map[i<<5] returns the value with key i<<5 34 | } 35 | std::cout << "Key Width: " << map.key_width() << std::endl; 36 | std::cout << "Size: " << map.size() << std::endl; 37 | std::cout << "Old values are still stored: " << std::endl; 38 | for(int i = 0; i <= 15; ++i) { 39 | std::cout << i << " -> " << map[i] << std::endl; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /test/v2_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace tdc::compact_hash::map; 18 | using namespace tdc::compact_hash; 19 | using namespace tdc; 20 | 21 | template 22 | using map_bucket_t = bucket_t<8, satellite_data_t>; 23 | 24 | template 25 | void BucketTest() { 26 | using widths_t = typename satellite_data_t::entry_bit_width_t; 27 | 28 | auto b = map_bucket_t(); 29 | 30 | widths_t ws { 5, 7 }; 31 | b = map_bucket_t(0b10, ws); 32 | 33 | ASSERT_EQ(b.bv(), 2U); 34 | ASSERT_EQ(b.size(), 1U); 35 | ASSERT_EQ(b.is_empty(), false); 36 | 37 | auto p1 = b.at(0, ws); 38 | p1.set_no_drop(3, 4); 39 | 40 | b.stat_allocation_size_in_bytes(ws); 41 | 42 | auto p2 = b.insert_at(0, 0b11, ws); 43 | p2.set_no_drop(5, 6); 44 | 45 | p2.set(7, 8); 46 | 47 | b.destroy_vals(ws); 48 | } 49 | 50 | #define MakeBucketTest(tname) \ 51 | TEST(Bucket, tname##_test) { \ 52 | BucketTest(); \ 53 | } 54 | 55 | using uint_t40 = uint_t<40>; 56 | MakeBucketTest(uint8_t); 57 | MakeBucketTest(uint64_t); 58 | MakeBucketTest(dynamic_t); 59 | MakeBucketTest(uint_t40); 60 | 61 | template typename table_t, typename val_t> 62 | void TableTest() { 63 | using tab_t = table_t>; 64 | using widths_t = typename satellite_data_t::entry_bit_width_t; 65 | 66 | { 67 | auto t = tab_t(); 68 | 69 | widths_t ws { 5, 7 }; 70 | size_t
table_size = 16; 71 | t = tab_t(table_size, ws, {}); 72 | auto ctx = t.context(table_size, ws); 73 | 74 | for(size_t i = 0; i < table_size; i++) { 75 | auto pos = ctx.table_pos(i); 76 | ASSERT_EQ(ctx.pos_is_empty(pos), true); 77 | 78 | auto elem = ctx.allocate_pos(pos); 79 | elem.set_no_drop(i + 1, i + 2); 80 | } 81 | 82 | for(size_t i = 0; i < table_size; i++) { 83 | auto pos = ctx.table_pos(i); 84 | ASSERT_EQ(ctx.pos_is_empty(pos), false); 85 | 86 | auto elem = ctx.at(pos); 87 | ASSERT_EQ(*elem.val_ptr(), i + 1); 88 | ASSERT_EQ(elem.get_quotient(), i + 2); 89 | } 90 | } 91 | 92 | { 93 | widths_t ws { 5, 7 }; 94 | size_t table_size = 128; 95 | auto t = tab_t(table_size, ws, {}); 96 | auto ctx = t.context(table_size, ws); 97 | 98 | for(size_t i = 60; i < 80; i++) { 99 | auto pos = ctx.table_pos(i); 100 | ASSERT_EQ(ctx.pos_is_empty(pos), true); 101 | 102 | auto elem = ctx.allocate_pos(pos); 103 | elem.set_no_drop(i - 60 + 1, i - 60 + 2); 104 | 105 | ASSERT_EQ(ctx.pos_is_empty(pos), false); 106 | } 107 | 108 | auto iter = ctx.make_iter(ctx.table_pos(80)); 109 | (void) iter; 110 | for(size_t i = 0; i < 20; i++) { 111 | iter.decrement(); 112 | auto elem = iter.get(); 113 | 114 | ASSERT_EQ(*elem.val_ptr(), 20 - i); 115 | ASSERT_EQ(elem.get_quotient(), 20 - i + 1); 116 | } 117 | } 118 | 119 | } 120 | 121 | #define MakeTableTest(tab, tname) \ 122 | TEST(Table, tab##_##tname##_test) { \ 123 | TableTest(); \ 124 | } 125 | 126 | MakeTableTest(plain_sentinel_t, uint8_t); 127 | MakeTableTest(buckets_bv_t, uint8_t); 128 | MakeTableTest(plain_sentinel_t, uint64_t); 129 | MakeTableTest(buckets_bv_t, uint64_t); 130 | MakeTableTest(plain_sentinel_t, dynamic_t); 131 | MakeTableTest(buckets_bv_t, dynamic_t); 132 | MakeTableTest(plain_sentinel_t, uint_t40); 133 | MakeTableTest(buckets_bv_t, uint_t40); 134 | 135 | template typename table_t, typename val_t> 136 | void CVTableTest() { 137 | using tab_t = table_t>; 138 | using widths_t = typename satellite_data_t::entry_bit_width_t; 139 | using value_type = typename cbp::cbp_repr_t::value_type; 140 | 141 | widths_t ws { 5, 7 }; 142 | auto size_mgr = size_manager_t(128); 143 | 144 | auto t = tab_t(size_mgr.capacity(), ws, {}); 145 | auto p = placement_t(size_mgr.capacity(), {}); 146 | 147 | auto tctx = t.context(size_mgr.capacity(), ws); 148 | auto pctx = p.context(t, size_mgr.capacity(), ws, size_mgr); 149 | 150 | auto check_insert = [&](auto ia, auto value, auto sq, bool should_exists) { 151 | auto res = pctx.lookup_insert(ia, sq); 152 | ASSERT_EQ(res.key_already_exist(), should_exists); 153 | ASSERT_EQ(res.ptr().get_quotient(), sq); 154 | *res.ptr().val_ptr() = value; 155 | }; 156 | auto table_state = [&](std::vector> const& should) { 157 | std::vector> r; 158 | for (size_t i = 0; i < size_mgr.capacity(); i++) { 159 | auto tpos = tctx.table_pos(i); 160 | if (!tctx.pos_is_empty(tpos)) { 161 | // TODO: Replace with search() 162 | auto ptr = tctx.at(tpos); 163 | r.push_back(std::array{ 164 | i, value_type(*ptr.val_ptr()), ptr.get_quotient() 165 | }); 166 | } 167 | } 168 | auto is = r; 169 | ASSERT_EQ(is, should); 170 | }; 171 | 172 | check_insert(60, 1, 5U, false); 173 | table_state({ 174 | {60, 1, 5}, 175 | }); 176 | 177 | check_insert(66, 2, 5U, false); 178 | table_state({ 179 | {60, 1, 5}, 180 | {66, 2, 5}, 181 | }); 182 | 183 | check_insert(64, 3, 5U, false); 184 | table_state({ 185 | {60, 1, 5}, 186 | {64, 3, 5}, 187 | {66, 2, 5}, 188 | }); 189 | 190 | check_insert(62, 4, 5U, false); 191 | table_state({ 192 | {60, 1, 5}, 193 | {62, 4, 5}, 194 | {64, 3, 5}, 
195 | {66, 2, 5}, 196 | }); 197 | 198 | check_insert(62, 5, 6U, false); 199 | table_state({ 200 | {60, 1, 5}, 201 | {62, 4, 5}, 202 | {63, 5, 6}, 203 | {64, 3, 5}, 204 | {66, 2, 5}, 205 | }); 206 | 207 | check_insert(62, 10, 6U, true); 208 | table_state({ 209 | {60, 1, 5}, 210 | {62, 4, 5}, 211 | {63, 10, 6}, 212 | {64, 3, 5}, 213 | {66, 2, 5}, 214 | }); 215 | 216 | check_insert(62, 9, 7U, false); 217 | table_state({ 218 | {60, 1, 5}, 219 | {62, 4, 5}, 220 | {63, 10, 6}, 221 | {64, 9, 7}, 222 | {65, 3, 5}, 223 | {66, 2, 5}, 224 | }); 225 | 226 | /* 227 | Test: 228 | - multiple independ inserts 229 | - appends to same group 230 | - appends to displaced group 231 | 232 | */ 233 | } 234 | 235 | #define MakeCVTableTest(place, tab, tname) \ 236 | TEST(CVTable, place##_##tab##_##tname##_test) { \ 237 | CVTableTest(); \ 238 | } 239 | 240 | MakeCVTableTest(cv_bvs_t, plain_sentinel_t, uint8_t); 241 | MakeCVTableTest(cv_bvs_t, plain_sentinel_t, uint64_t); 242 | MakeCVTableTest(cv_bvs_t, plain_sentinel_t, dynamic_t); 243 | MakeCVTableTest(cv_bvs_t, plain_sentinel_t, uint_t40); 244 | MakeCVTableTest(cv_bvs_t, buckets_bv_t, uint8_t); 245 | MakeCVTableTest(cv_bvs_t, buckets_bv_t, uint64_t); 246 | MakeCVTableTest(cv_bvs_t, buckets_bv_t, dynamic_t); 247 | MakeCVTableTest(cv_bvs_t, buckets_bv_t, uint_t40); 248 | 249 | template typename table_t, typename val_t> 250 | void DPTableTest() { 251 | using tab_t = table_t>; 252 | using widths_t = typename satellite_data_t::entry_bit_width_t; 253 | using value_type = typename cbp::cbp_repr_t::value_type; 254 | 255 | struct TestSizeMgr { 256 | size_t table_size; 257 | inline size_t mod_add(size_t i, size_t delta = 1) const { 258 | return (i + delta) % table_size; 259 | } 260 | inline size_t mod_sub(size_t i, size_t delta = 1) const { 261 | return (i + table_size - delta) % table_size; 262 | } 263 | }; 264 | 265 | widths_t ws { 5, 7 }; 266 | auto size_mgr = TestSizeMgr { 128 }; 267 | auto t = tab_t(size_mgr.table_size, ws, {}); 268 | auto p = placement_t(size_mgr.table_size, {}); 269 | 270 | auto tctx = t.context(size_mgr.table_size, ws); 271 | auto pctx = p.context(t, size_mgr.table_size, ws, size_mgr); 272 | 273 | auto check_insert = [&](auto ia, auto value, auto sq, bool should_exists) { 274 | auto res = pctx.lookup_insert(ia, sq); 275 | ASSERT_EQ(res.key_already_exist(), should_exists); 276 | ASSERT_EQ(res.ptr().get_quotient(), sq); 277 | *res.ptr().val_ptr() = value; 278 | }; 279 | auto table_state = [&](std::vector> const& should) { 280 | std::vector> r; 281 | for (size_t i = 0; i < size_mgr.table_size; i++) { 282 | auto tpos = tctx.table_pos(i); 283 | if (!tctx.pos_is_empty(tpos)) { 284 | // TODO: Replace with search() 285 | auto ptr = tctx.at(tpos); 286 | r.push_back(std::array{ 287 | i, value_type(*ptr.val_ptr()), ptr.get_quotient() 288 | }); 289 | } 290 | } 291 | auto is = r; 292 | ASSERT_EQ(is, should); 293 | }; 294 | 295 | check_insert(60, 1, 5U, false); 296 | table_state({ 297 | {60, 1, 5}, 298 | }); 299 | 300 | check_insert(66, 2, 5U, false); 301 | table_state({ 302 | {60, 1, 5}, 303 | {66, 2, 5}, 304 | }); 305 | 306 | check_insert(64, 3, 5U, false); 307 | table_state({ 308 | {60, 1, 5}, 309 | {64, 3, 5}, 310 | {66, 2, 5}, 311 | }); 312 | 313 | check_insert(62, 4, 5U, false); 314 | table_state({ 315 | {60, 1, 5}, 316 | {62, 4, 5}, 317 | {64, 3, 5}, 318 | {66, 2, 5}, 319 | }); 320 | 321 | check_insert(62, 5, 6U, false); 322 | table_state({ 323 | {60, 1, 5}, 324 | {62, 4, 5}, 325 | {63, 5, 6}, 326 | {64, 3, 5}, 327 | {66, 2, 5}, 328 | }); 329 | 330 | 
check_insert(62, 10, 6U, true); 331 | table_state({ 332 | {60, 1, 5}, 333 | {62, 4, 5}, 334 | {63, 10, 6}, 335 | {64, 3, 5}, 336 | {66, 2, 5}, 337 | }); 338 | 339 | check_insert(62, 9, 7U, false); 340 | table_state({ 341 | {60, 1, 5}, 342 | {62, 4, 5}, 343 | {63, 10, 6}, 344 | {64, 3, 5}, 345 | {65, 9, 7}, 346 | {66, 2, 5}, 347 | }); 348 | 349 | /* 350 | Test: 351 | - multiple independ inserts 352 | - appends to same group 353 | - appends to displaced group 354 | 355 | */ 356 | } 357 | 358 | #define MakeDPTableTest(place, tab, tname) \ 359 | TEST(DPTable, place##_##tab##_##tname##_test) { \ 360 | DPTableTest(); \ 361 | } 362 | 363 | using naive_displacement_t = displacement_t; 364 | MakeDPTableTest(naive_displacement_t, plain_sentinel_t, uint8_t); 365 | MakeDPTableTest(naive_displacement_t, plain_sentinel_t, uint64_t); 366 | MakeDPTableTest(naive_displacement_t, plain_sentinel_t, dynamic_t); 367 | MakeDPTableTest(naive_displacement_t, plain_sentinel_t, uint_t40); 368 | MakeDPTableTest(naive_displacement_t, buckets_bv_t, uint8_t); 369 | MakeDPTableTest(naive_displacement_t, buckets_bv_t, uint64_t); 370 | MakeDPTableTest(naive_displacement_t, buckets_bv_t, dynamic_t); 371 | MakeDPTableTest(naive_displacement_t, buckets_bv_t, uint_t40); 372 | 373 | using layered_displacement_t = displacement_t>>; 374 | MakeDPTableTest(layered_displacement_t, plain_sentinel_t, uint8_t); 375 | MakeDPTableTest(layered_displacement_t, plain_sentinel_t, uint64_t); 376 | MakeDPTableTest(layered_displacement_t, plain_sentinel_t, dynamic_t); 377 | MakeDPTableTest(layered_displacement_t, plain_sentinel_t, uint_t40); 378 | MakeDPTableTest(layered_displacement_t, buckets_bv_t, uint8_t); 379 | MakeDPTableTest(layered_displacement_t, buckets_bv_t, uint64_t); 380 | MakeDPTableTest(layered_displacement_t, buckets_bv_t, dynamic_t); 381 | MakeDPTableTest(layered_displacement_t, buckets_bv_t, uint_t40); 382 | 383 | using layered_displacement2_t = displacement_t>; 384 | MakeDPTableTest(layered_displacement2_t, plain_sentinel_t, uint8_t); 385 | MakeDPTableTest(layered_displacement2_t, plain_sentinel_t, uint64_t); 386 | MakeDPTableTest(layered_displacement2_t, plain_sentinel_t, dynamic_t); 387 | MakeDPTableTest(layered_displacement2_t, plain_sentinel_t, uint_t40); 388 | MakeDPTableTest(layered_displacement2_t, buckets_bv_t, uint8_t); 389 | MakeDPTableTest(layered_displacement2_t, buckets_bv_t, uint64_t); 390 | MakeDPTableTest(layered_displacement2_t, buckets_bv_t, dynamic_t); 391 | MakeDPTableTest(layered_displacement2_t, buckets_bv_t, uint_t40); 392 | 393 | using elias_gamma_displacement_t = displacement_t>>; 394 | MakeDPTableTest(elias_gamma_displacement_t, plain_sentinel_t, uint8_t); 395 | MakeDPTableTest(elias_gamma_displacement_t, plain_sentinel_t, uint64_t); 396 | MakeDPTableTest(elias_gamma_displacement_t, plain_sentinel_t, dynamic_t); 397 | MakeDPTableTest(elias_gamma_displacement_t, plain_sentinel_t, uint_t40); 398 | MakeDPTableTest(elias_gamma_displacement_t, buckets_bv_t, uint8_t); 399 | MakeDPTableTest(elias_gamma_displacement_t, buckets_bv_t, uint64_t); 400 | MakeDPTableTest(elias_gamma_displacement_t, buckets_bv_t, dynamic_t); 401 | MakeDPTableTest(elias_gamma_displacement_t, buckets_bv_t, uint_t40); 402 | 403 | using elias_gamma_displacement2_t = displacement_t>; 404 | MakeDPTableTest(elias_gamma_displacement2_t, plain_sentinel_t, uint8_t); 405 | MakeDPTableTest(elias_gamma_displacement2_t, plain_sentinel_t, uint64_t); 406 | MakeDPTableTest(elias_gamma_displacement2_t, plain_sentinel_t, dynamic_t); 407 | 
MakeDPTableTest(elias_gamma_displacement2_t, plain_sentinel_t, uint_t40); 408 | MakeDPTableTest(elias_gamma_displacement2_t, buckets_bv_t, uint8_t); 409 | MakeDPTableTest(elias_gamma_displacement2_t, buckets_bv_t, uint64_t); 410 | MakeDPTableTest(elias_gamma_displacement2_t, buckets_bv_t, dynamic_t); 411 | MakeDPTableTest(elias_gamma_displacement2_t, buckets_bv_t, uint_t40); 412 | 413 | using elias_gamma_displacement3_t = displacement_t>; 414 | MakeDPTableTest(elias_gamma_displacement3_t, plain_sentinel_t, uint8_t); 415 | MakeDPTableTest(elias_gamma_displacement3_t, plain_sentinel_t, uint64_t); 416 | MakeDPTableTest(elias_gamma_displacement3_t, plain_sentinel_t, dynamic_t); 417 | MakeDPTableTest(elias_gamma_displacement3_t, plain_sentinel_t, uint_t40); 418 | MakeDPTableTest(elias_gamma_displacement3_t, buckets_bv_t, uint8_t); 419 | MakeDPTableTest(elias_gamma_displacement3_t, buckets_bv_t, uint64_t); 420 | MakeDPTableTest(elias_gamma_displacement3_t, buckets_bv_t, dynamic_t); 421 | MakeDPTableTest(elias_gamma_displacement3_t, buckets_bv_t, uint_t40); 422 | 423 | template typename table_t, typename val_t> 424 | void FullTableTest() { 425 | { 426 | table_t table; 427 | 428 | table.insert_kv_width(42, 124, 8, 8); 429 | 430 | auto r = table[42]; 431 | ASSERT_EQ(r, 124u); 432 | } 433 | { 434 | table_t table; 435 | 436 | auto tchk = [&](size_t end) { 437 | for (uint64_t w = 1; w < end; w++) { 438 | auto r = table[w]; 439 | ASSERT_EQ(r, w); 440 | } 441 | auto nptr = typename table_t::pointer_type(); 442 | for (uint64_t w = 1; w < end; w++) { 443 | auto r = table.search(w); 444 | ASSERT_NE(r, nptr); 445 | ASSERT_EQ(*r, w); 446 | } 447 | }; 448 | bool quick = true; 449 | 450 | size_t last_bits = 0; 451 | for (uint64_t v = 1; v < 1000; v++) { 452 | size_t bits = bits_for(v); 453 | if (last_bits != bits) { 454 | //std::cout << "bits: " << bits << "\n"; 455 | last_bits = bits; 456 | } 457 | table.insert_kv_width(v, std::move(v), bits, bits); 458 | 459 | if (!quick) { 460 | tchk(v + 1); 461 | } 462 | } 463 | if (quick) { 464 | tchk(1000); 465 | } 466 | } 467 | } 468 | 469 | #define MakeFullTableTest(tab, tname) \ 470 | TEST(FullTable, tab##_##tname##_test) { \ 471 | FullTableTest(); \ 472 | } 473 | 474 | template 475 | using csh_test_t = hashmap_t; 476 | template 477 | using ch_test_t = hashmap_t; 478 | 479 | template 480 | using csh_disp_test_t = hashmap_t; 481 | template 482 | using ch_disp_test_t = hashmap_t; 483 | 484 | MakeFullTableTest(csh_test_t, uint16_t) 485 | MakeFullTableTest(csh_test_t, uint64_t) 486 | MakeFullTableTest(csh_test_t, dynamic_t) 487 | MakeFullTableTest(csh_test_t, uint_t40) 488 | MakeFullTableTest(ch_test_t, uint16_t) 489 | MakeFullTableTest(ch_test_t, uint64_t) 490 | MakeFullTableTest(ch_test_t, dynamic_t) 491 | MakeFullTableTest(ch_test_t, uint_t40) 492 | MakeFullTableTest(csh_disp_test_t, uint16_t) 493 | MakeFullTableTest(csh_disp_test_t, uint64_t) 494 | MakeFullTableTest(csh_disp_test_t, dynamic_t) 495 | MakeFullTableTest(csh_disp_test_t, uint_t40) 496 | MakeFullTableTest(ch_disp_test_t, uint16_t) 497 | MakeFullTableTest(ch_disp_test_t, uint64_t) 498 | MakeFullTableTest(ch_disp_test_t, dynamic_t) 499 | MakeFullTableTest(ch_disp_test_t, uint_t40) 500 | --------------------------------------------------------------------------------
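The serialization tests above all exercise the same round trip: build a table, write it into a stream with serialize::write, read it back with serialize::read, and compare the two structurally with serialize::equal_check, printing heap_size::compute for reference. Because the angle-bracketed template arguments were stripped from this listing, the following is only a minimal sketch of that pattern, assuming that tdc::serialize<T> and tdc::heap_size<T> are specialized for the table types and that the include paths follow the repository layout; the header paths and the explicit <set_t> arguments below are inferred, not copied from the sources.

// Sketch of the write -> read -> equal_check round trip used by the
// serialization tests. Assumptions (not verbatim from the sources): the
// include paths below, and that tdc::serialize<T> / tdc::heap_size<T>
// are specialized for sparse_cv_hashset_t<>.
#include <cstdint>
#include <iostream>
#include <sstream>

#include <tudocomp/util/compact_hash/set/hashset_t.hpp>  // assumed path
#include <tudocomp/util/serialization.hpp>               // assumed path
#include <tudocomp/util/heap_size.hpp>                   // assumed path

int main() {
    using set_t = tdc::compact_hash::set::sparse_cv_hashset_t<>;

    // Build a small set: initial table size 8, key width 16, as in the tests.
    set_t a(8, 16);
    a.max_load_factor(1.0);
    for (uint64_t key : {3u, 11u, 5u, 13u, 21u, 29u, 4u}) {
        a.lookup_insert(key);
    }

    // Serialize into a stream; the tests compare the reported byte count
    // against the resulting stream position.
    std::stringstream ss;
    auto bytes = tdc::serialize<set_t>::write(ss, a);
    bool size_ok = bytes.size_in_bytes() == static_cast<size_t>(ss.tellp());

    // Read the table back and compare it structurally with the original.
    auto b = tdc::serialize<set_t>::read(ss);
    bool equal = tdc::serialize<set_t>::equal_check(a, b);

    std::cout << "size_ok=" << size_ok << ", equal=" << equal
              << ", heap size: " << tdc::heap_size<set_t>::compute(a) << "\n";
    return (size_ok && equal) ? 0 : 1;
}

The map builders in the same test file follow the identical pattern, with insert(key, value) and insert_key_width(key, value, bits) taking the place of lookup_insert and lookup_insert_key_width.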