├── .dockerignore ├── .github └── workflows │ └── build.yml ├── .gitignore ├── .gitmodules ├── CITATION.cff ├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── README.md ├── include ├── PartitionedSicHash.h ├── SicHash.h └── sichash │ ├── IrregularCuckooHashTable.h │ ├── IrregularCuckooHashTableHK.h │ ├── SlowIrregularCuckooHashTable.h │ └── TinyBinaryCuckooHashTable.h ├── plots-construction.png ├── plots-query.png ├── scripts ├── .gitignore ├── boxPlotBinaryCuckoo.sh ├── boxPlotIrregularCuckooSpace.sh ├── bucketSize.sh ├── competitorNames.txt ├── dockerVolume │ └── figure-1.sh └── figure-1.tex └── src ├── BenchmarkData.h ├── constructionSuccess.cpp ├── example.cpp ├── maxLoadFactor.cpp ├── sicHashBenchmark.cpp └── solvers.cpp /.dockerignore: -------------------------------------------------------------------------------- 1 | cmake-build-* 2 | build* 3 | scripts/dockerVolume 4 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | strategy: 10 | matrix: 11 | compiler: 12 | - { name: gcc, version: 11} 13 | - { name: gcc, version: 12} 14 | - { name: gcc, version: 13} 15 | - { name: gcc, version: 14} 16 | - { name: clang, version: 16} 17 | - { name: clang, version: 17} 18 | - { name: clang, version: 18} 19 | name: Build (${{ matrix.compiler.name }} ${{ matrix.compiler.version }}) 20 | runs-on: ubuntu-24.04 21 | steps: 22 | - name: Install dependencies 23 | run: | 24 | sudo add-apt-repository universe 25 | sudo apt-get update 26 | sudo apt-get install --assume-yes --no-install-recommends ca-certificates cmake git libtbb-dev 27 | - name: Install GCC 28 | if: ${{ matrix.compiler.name == 'gcc' }} 29 | run: | 30 | sudo apt-get install --assume-yes --no-install-recommends gcc-${{ matrix.compiler.version }} g++-${{ matrix.compiler.version }} 31 | echo 
"CC=/usr/bin/gcc-${{ matrix.compiler.version }}" >> $GITHUB_ENV 32 | echo "CXX=/usr/bin/g++-${{ matrix.compiler.version }}" >> $GITHUB_ENV 33 | - name: Install Clang 34 | if: ${{ matrix.compiler.name == 'clang' }} 35 | run: | 36 | sudo apt-get install --assume-yes --no-install-recommends clang-${{ matrix.compiler.version }} 37 | echo "CC=/usr/bin/clang-${{ matrix.compiler.version }}" >> $GITHUB_ENV 38 | echo "CXX=/usr/bin/clang++-${{ matrix.compiler.version }}" >> $GITHUB_ENV 39 | - uses: actions/checkout@v4 40 | with: 41 | submodules: recursive 42 | - name: Build 43 | run: | 44 | cmake -B ./build 45 | cmake --build ./build --parallel 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | cmake-build-* 3 | build 4 | .Rhistory 5 | .clion.source.upload.marker 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extlib/util"] 2 | path = extlib/util 3 | url = https://github.com/ByteHamster/Util.git 4 | [submodule "extlib/simpleRibbon"] 5 | path = extlib/simpleRibbon 6 | url = https://github.com/ByteHamster/SimpleRibbon.git 7 | [submodule "extlib/tlx"] 8 | path = extlib/tlx 9 | url = https://github.com/tlx/tlx.git 10 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software in an academic context or publication, please cite it as below." 
3 | authors: 4 | - family-names: "Lehmann" 5 | given-names: "Hans-Peter" 6 | orcid: "https://orcid.org/0000-0002-0474-1805" 7 | - family-names: "Sanders" 8 | given-names: "Peter" 9 | orcid: "https://orcid.org/0000-0003-3330-9349" 10 | - family-names: "Walzer" 11 | given-names: "Stefan" 12 | orcid: "https://orcid.org/0000-0002-6477-0106" 13 | title: "SicHash - Small Irregular Cuckoo Tables for Perfect Hashing" 14 | preferred-citation: 15 | type: conference-paper 16 | title: "SicHash - Small Irregular Cuckoo Tables for Perfect Hashing" 17 | authors: 18 | - family-names: "Lehmann" 19 | given-names: "Hans-Peter" 20 | orcid: "https://orcid.org/0000-0002-0474-1805" 21 | - family-names: "Sanders" 22 | given-names: "Peter" 23 | orcid: "https://orcid.org/0000-0003-3330-9349" 24 | - family-names: "Walzer" 25 | given-names: "Stefan" 26 | orcid: "https://orcid.org/0000-0002-6477-0106" 27 | doi: "10.1137/1.9781611977561.CH15" 28 | journal: "ALENEX" 29 | year: 2023 30 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.25...4.0) 2 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 3 | project(SicHash) 4 | 5 | if(TARGET SicHash) 6 | return() 7 | endif() 8 | 9 | if (NOT CMAKE_BUILD_TYPE) 10 | set(CMAKE_BUILD_TYPE "Release") 11 | endif () 12 | 13 | if(CMAKE_BUILD_TYPE STREQUAL "Release" AND PROJECT_IS_TOP_LEVEL) 14 | add_compile_options(-march=native) 15 | endif() 16 | 17 | set(TLX_INSTALL_INCLUDE_DIR tlx CACHE PATH "Workaround for TLX breaking the first cmake call") 18 | add_subdirectory(extlib/tlx) 19 | 20 | add_library(SicHash INTERFACE) 21 | target_include_directories(SicHash INTERFACE include) 22 | target_compile_features(SicHash INTERFACE cxx_std_20) 23 | 24 | add_subdirectory(extlib/util EXCLUDE_FROM_ALL) 25 | target_link_libraries(SicHash INTERFACE ByteHamster::Util) 26 | 27 | 
add_subdirectory(extlib/simpleRibbon EXCLUDE_FROM_ALL) 28 | target_link_libraries(SicHash INTERFACE SimpleRibbon ips2ra) 29 | 30 | add_library(SicHash::sichash ALIAS SicHash) 31 | 32 | if(PROJECT_IS_TOP_LEVEL) 33 | target_compile_options(SicHash INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -Wpedantic -Werror -frecord-gcc-switches>) 34 | 35 | add_executable(Example src/example.cpp) 36 | target_link_libraries(Example PRIVATE SicHash) 37 | target_compile_features(Example PRIVATE cxx_std_20) 38 | 39 | add_executable(Solvers src/solvers.cpp) 40 | target_link_libraries(Solvers PRIVATE SicHash tlx) 41 | target_compile_features(Solvers PRIVATE cxx_std_20) 42 | 43 | add_executable(ConstructionSuccess src/constructionSuccess.cpp) 44 | target_link_libraries(ConstructionSuccess PRIVATE SicHash tlx) 45 | target_compile_features(ConstructionSuccess PRIVATE cxx_std_20) 46 | 47 | add_executable(SicHashBenchmark src/sicHashBenchmark.cpp) 48 | target_link_libraries(SicHashBenchmark PRIVATE SicHash tlx) 49 | target_compile_features(SicHashBenchmark PRIVATE cxx_std_20) 50 | 51 | add_executable(MaxLoadFactor src/maxLoadFactor.cpp) 52 | target_link_libraries(MaxLoadFactor PRIVATE SicHash tlx) 53 | target_compile_features(MaxLoadFactor PRIVATE cxx_std_20) 54 | endif() 55 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN apt-get update && apt-get -y upgrade 4 | RUN apt-get install --assume-yes --no-install-recommends ca-certificates build-essential cmake git 5 | RUN apt-get install --assume-yes --no-install-recommends libboost-regex-dev libsqlite3-dev 6 | RUN apt-get install --assume-yes --no-install-recommends texlive-latex-extra texlive-fonts-recommended texlive-latex-recommended texlive-fonts-extra 7 | RUN apt-get install --assume-yes --no-install-recommends libtbb-dev libxxhash-dev 8 | 9 | # Build sqlplot-tools 10 | RUN git clone 
https://github.com/bingmann/sqlplot-tools.git /opt/sqlplot-tools 11 | RUN mkdir /opt/sqlplot-tools/build 12 | WORKDIR /opt/sqlplot-tools/build 13 | RUN cmake -DCMAKE_BUILD_TYPE=Release -DWITH_POSTGRESQL=OFF -DWITH_MYSQL=OFF .. 14 | RUN cmake --build . -j 8 15 | 16 | # Build SicHash 17 | COPY . /opt/sichash 18 | RUN mkdir /opt/sichash/build 19 | WORKDIR /opt/sichash/build 20 | RUN cmake -DCMAKE_BUILD_TYPE=Release .. 21 | RUN cmake --build . -j 8 22 | 23 | # Actual benchmark 24 | CMD bash /opt/dockerVolume/figure-1.sh 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SicHash 2 | 3 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 4 | ![Build status](https://github.com/ByteHamster/SicHash/actions/workflows/build.yml/badge.svg) 5 | 6 | A perfect hash function (PHF) maps a set S of n keys to the first m integers without collisions. 7 | It is called _minimal_ perfect (MPHF) if m=n. 8 | Perfect hash functions have applications in databases, bioinformatics, and as a building block of various space-efficient data structures.
9 | 10 | SicHash is a (minimal) perfect hash function based on irregular cuckoo hashing, retrieval, and overloading. 11 | Each input key has a small number of choices for output positions. 12 | Using cuckoo hashing, SicHash determines a mapping from each key to one of its choices, 13 | such that there are no collisions between keys. 14 | It then stores the mapping from keys to their candidate index space-efficiently using 15 | the [BuRR](https://github.com/lorenzhs/BuRR) retrieval data structure. 16 | 17 | SicHash offers a very good trade-off between construction performance, query performance, and space consumption. 18 | 19 | ### Library Usage 20 | 21 | Clone this repo and add the following to your `CMakeLists.txt`. 22 | Note that the repo has submodules, so either use `git clone --recursive` or `git submodule update --init --recursive`. 23 | 24 | ``` 25 | add_subdirectory(path/to/SicHash) 26 | target_link_libraries(YourTarget PRIVATE SicHash) 27 | ``` 28 | 29 | Constructing a SicHash perfect hash function is then straightforward: 30 | 31 | ```cpp 32 | std::vector<std::string> keys = {"abc", "def", "123", "456"}; 33 | sichash::SicHashConfig config; 34 | sichash::SicHash<true> hashFunc(keys, config); 35 | std::cout << hashFunc("abc") << std::endl; 36 | ``` 37 | 38 | ### Construction Performance 39 | 40 | [![Plots preview](https://raw.githubusercontent.com/ByteHamster/SicHash/main/plots-construction.png)](https://arxiv.org/pdf/2210.01560) 41 | 42 | ### Query Performance 43 | 44 | [![Plots preview](https://raw.githubusercontent.com/ByteHamster/SicHash/main/plots-query.png)](https://arxiv.org/pdf/2210.01560) 45 | 46 | ### Reproducing Experiments 47 | 48 | This repository contains the source code and our reproducibility artifacts for the benchmarks specific to SicHash.
49 | Benchmarks that compare SicHash to competitors are available in a different repository: https://github.com/ByteHamster/MPHF-Experiments 50 | 51 | We provide an easy-to-use Docker image to quickly reproduce our results. 52 | Alternatively, you can look at the `Dockerfile` to see all libraries, tools, and commands necessary to compile SicHash. 53 | 54 | #### Building the Docker Image 55 | 56 | Run the following command to build the Docker image. 57 | Building the image takes about 5 minutes, as some packages (including LaTeX for the plots) have to be installed. 58 | 59 | ```bash 60 | docker build -t sichash --no-cache . 61 | ``` 62 | 63 | Some compiler warnings (red) are expected when building competitors and will not prevent building the image or running the experiments. 64 | Please ignore them! 65 | 66 | #### Running the Experiments 67 | Due to the long total running time of all experiments in our paper, we provide run scripts for a slightly simplified version of the experiments. 68 | They run fewer iterations and output fewer data points. 69 | 70 | You can modify the benchmark scripts in `scripts/dockerVolume` if you want to change the number of runs or data points. 71 | This does not require rebuilding the Docker image. 72 | Different experiments can be started by using the following command: 73 | 74 | ```bash 75 | docker run --interactive --tty -v "$(pwd)/scripts/dockerVolume:/opt/dockerVolume" sichash /opt/dockerVolume/figure-1.sh 76 | ``` 77 | 78 | The number in the script name refers to the figure in the paper. 79 | 80 | | Figure in paper | Launch command | Estimated runtime | 81 | | :-------------- | :---------------------------- | :----------------- | 82 | | 1 | /opt/dockerVolume/figure-1.sh | 10 minutes | 83 | 84 | The resulting plots can be found in `scripts/dockerVolume` and are called `figure-<number>.pdf`.
85 | More experiments comparing SicHash with competitors can be found in a different repository: https://github.com/ByteHamster/MPHF-Experiments 86 | 87 | ### License 88 | 89 | This code is licensed under the [GPLv3](/LICENSE). 90 | If you use the project in an academic context or publication, please cite [our paper](https://doi.org/10.1137/1.9781611977561.ch15): 91 | 92 | ``` 93 | @inproceedings{lehmann2023sichash, 94 | author = {Hans{-}Peter Lehmann and 95 | Peter Sanders and 96 | Stefan Walzer}, 97 | title = {SicHash - Small Irregular Cuckoo Tables for Perfect Hashing}, 98 | booktitle = {{ALENEX}}, 99 | pages = {176--189}, 100 | publisher = {{SIAM}}, 101 | year = {2023}, 102 | doi = {10.1137/1.9781611977561.CH15} 103 | } 104 | ``` 105 | -------------------------------------------------------------------------------- /include/PartitionedSicHash.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace sichash { 6 | /** 7 | * Multiple SicHash perfect hash function, one for each construction thread. 8 | * If you use one single thread, this is slower than simply using SicHash directly. 
9 | */ 10 | template 11 | class PartitionedSicHash { 12 | private: 13 | size_t numThreads; 14 | static constexpr size_t HASH_FUNCTION_CHILD_ASSIGNMENT = 43; 15 | std::vector*> children; 16 | std::vector childOffsets; 17 | 18 | public: 19 | PartitionedSicHash(std::span keys, SicHashConfig config, size_t numThreads) 20 | : numThreads(numThreads) { 21 | children.resize(numThreads); 22 | if (numThreads == 1) { 23 | children[0] = new SicHash(keys, config); 24 | childOffsets.push_back(0); 25 | childOffsets.push_back(children[0]->M); 26 | return; 27 | } 28 | std::vector>> childInputByThread; 29 | childInputByThread.resize(numThreads); 30 | const size_t N = keys.size(); 31 | for (auto &singleThread : childInputByThread) { 32 | singleThread.resize(numThreads); 33 | for (auto &singleChildInput : singleThread) { 34 | singleChildInput.reserve(N / (numThreads * numThreads)); 35 | } 36 | } 37 | size_t keysPerThread = (N + numThreads) / numThreads; 38 | std::vector threads; 39 | for (size_t t = 0; t < numThreads; t++) { 40 | threads.emplace_back([&, t]() { 41 | size_t from = t * keysPerThread; 42 | size_t to = std::min(N, (t + 1) * keysPerThread); 43 | for (size_t i = from; i < to; i++) { 44 | HashedKey hash = HashedKey(keys[i]); 45 | size_t child = hash.hash(HASH_FUNCTION_CHILD_ASSIGNMENT, numThreads); 46 | childInputByThread[t][child].push_back(hash); 47 | } 48 | }); 49 | } 50 | for (size_t i = 0; i < numThreads; i++) { 51 | threads[i].join(); 52 | } 53 | threads.clear(); 54 | std::atomic hadException = false; 55 | for (size_t i = 0; i < numThreads; i++) { 56 | threads.emplace_back([&, i]() { 57 | std::vector input; 58 | input.reserve(N / numThreads); 59 | for (size_t t = 0; t < numThreads; t++) { 60 | input.insert(input.end(), childInputByThread[t][i].begin(),childInputByThread[t][i].end()); 61 | childInputByThread[t][i].resize(0); 62 | childInputByThread[t][i].shrink_to_fit(); 63 | } 64 | try { 65 | children[i] = new SicHash(input, config); 66 | } catch (const std::exception& 
e) { 67 | std::cout<<"Error: "<M; 82 | } 83 | } 84 | 85 | ~PartitionedSicHash() { 86 | for (auto &child : children) { 87 | delete child; 88 | } 89 | } 90 | 91 | /** Estimate for the space usage of this structure, in bits */ 92 | [[nodiscard]] size_t spaceUsage() const { 93 | size_t spaceUsage = sizeof(*this) * 8; 94 | for (auto &child : children) { 95 | spaceUsage += child->spaceUsage(); 96 | } 97 | return spaceUsage; 98 | } 99 | 100 | size_t operator() (std::string &key) const { 101 | HashedKey hash = HashedKey(key); 102 | size_t child = hash.hash(HASH_FUNCTION_CHILD_ASSIGNMENT, numThreads); 103 | return children[child]->operator()(hash) + childOffsets[child]; 104 | } 105 | }; 106 | } // Namespace sichash 107 | -------------------------------------------------------------------------------- /include/SicHash.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "sichash/IrregularCuckooHashTable.h" 10 | 11 | namespace sichash { 12 | 13 | constexpr static size_t SEED_BITS = 16; 14 | 15 | struct BucketInfo { 16 | size_t offset : 48; 17 | size_t seed : SEED_BITS; 18 | }; 19 | 20 | struct SicHashConfig { 21 | // Load factor of the PHF. When constructing an MPHF, this is the load factor before compaction. 22 | double loadFactor = 0.9; 23 | 24 | // Expected size of each of the small cuckoo hash tables. 25 | size_t smallTableSize = 5000; 26 | 27 | // Don't print progress to std::cout. The messages are rare, so they do not affect the measurements, 28 | // but they might be annoying when using SicHash as a library. 29 | bool silent = false; 30 | 31 | // Main configuration parameters. Set values using .percentages() or .spaceBudget(). 32 | uint64_t threshold1; 33 | uint64_t threshold2; 34 | 35 | // For convenience, only set when calling .spaceBudget(). Not used during construction. 
36 | float x = -1; 37 | 38 | SicHashConfig() { 39 | percentages(0.5, 0.25); 40 | } 41 | 42 | /** 43 | * Percentages in [0, 1] of items with 2 and 4 choices (1 and 2 bits, respectively). 44 | * The percentage of items with 8 choices (3 bits) is calculated automatically. 45 | * Both values must be >=0 and the sum must be <=1 46 | */ 47 | SicHashConfig &percentages(float percentage1, float percentage2) { 48 | if (percentage1 + percentage2 > 1.0) { 49 | throw std::logic_error("Selected thresholds have >100%"); 50 | } 51 | if (percentage1 < 0.0 || percentage2 < 0.0) { 52 | throw std::logic_error("Selected negative thresholds"); 53 | } 54 | threshold1 = static_cast(UINT64_MAX) * 0.99999 * percentage1; 55 | threshold2 = static_cast(UINT64_MAX) * 0.99999 * (percentage1 + percentage2); 56 | if (threshold2 < threshold1) { 57 | throw std::logic_error("Overflow when determining thresholds"); 58 | } 59 | return *this; 60 | } 61 | 62 | /** 63 | * Try to construct a PHF with a given space budget (in bits per key). 64 | * Because we are using 1,2,3 bit retrieval data structures, the space budget must be in [1, 3]. 65 | * Parameter x in [0, 1] is a tuning parameter for selecting which mix of hash functions to use. 66 | * High x have a higher load threshold, while low x are usually faster to construct. 
67 | */ 68 | SicHashConfig &spaceBudget(float spaceBudget, float _x) { 69 | x = _x; 70 | spaceBudget -= 8.0 * (sizeof(BucketInfo)) / smallTableSize; 71 | if (x < 0.0 || x > 1.0) { 72 | throw std::logic_error("x must be in [0, 1]"); 73 | } 74 | if (spaceBudget < 1.0 || spaceBudget > 3.0) { 75 | throw std::logic_error("space budget must be in [1, 3]"); 76 | } 77 | float p1_min = std::max(0.0, 2.0 - spaceBudget); 78 | float p1_max = (3 - spaceBudget) / 2; 79 | float p1 = p1_min + (p1_max - p1_min) * x; 80 | float p2 = 3 - 2*p1 - spaceBudget; 81 | percentages(p1, p2); 82 | return *this; 83 | } 84 | 85 | [[nodiscard]] double class1Percentage() const { 86 | return (double) threshold1 / (double) UINT64_MAX; 87 | } 88 | 89 | [[nodiscard]] double class2Percentage() const { 90 | return (double) (threshold2 - threshold1) / (double) UINT64_MAX; 91 | } 92 | 93 | [[nodiscard]] double class3Percentage() const { 94 | return (double) (UINT64_MAX - threshold2) / (double) UINT64_MAX; // threshold2 is cumulative (class 1 + class 2), so threshold1 must not be subtracted again 95 | } 96 | }; 97 | 98 | /** 99 | * SicHash perfect hash function. 100 | * @tparam minimal Remap values >=N to empty slots to get a MPHF 101 | * @tparam ribbonWidth Tuning parameter for the ribbon retrieval data structure. Usually 64 or 32. 102 | * @tparam minimalFanoLowerBits Number of lower bits in the EliasFano coding for remapping. 103 | * Only interesting for minimal=true. See paper for details.
104 | * loadFactor < ~0.89 ==> use minimalFanoLowerBits=3 105 | * loadFactor < ~0.94 ==> use minimalFanoLowerBits=4 106 | * loadFactor < ~0.97 ==> use minimalFanoLowerBits=5 107 | */ 108 | template 109 | class SicHash { 110 | public: 111 | static constexpr size_t HASH_FUNCTION_BUCKET_ASSIGNMENT = 42; 112 | SicHashConfig config; 113 | size_t N; 114 | size_t numSmallTables; 115 | std::vector bucketInfo; 116 | SimpleRibbon<1, ribbonWidth> ribbon1; 117 | SimpleRibbon<2, ribbonWidth> ribbon2; 118 | SimpleRibbon<3, ribbonWidth> ribbon3; 119 | bytehamster::util::EliasFano *minimalRemap = nullptr; 120 | size_t unnecessaryConstructions = 0; 121 | size_t M = 0; 122 | 123 | // Keys parameter must be an std::vector or an std::vector. 124 | SicHash(const auto &keys, SicHashConfig _config) 125 | : config(_config), 126 | N(keys.size()), 127 | numSmallTables(N / config.smallTableSize + 1), 128 | bucketInfo(numSmallTables + 1) { 129 | std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); 130 | if (!config.silent) { 131 | std::cout << "Creating MHCs" << std::endl; 132 | } 133 | std::vector> hashedKeys(N); 134 | initialHash(0, N, keys, hashedKeys); 135 | // Note that the order of duplicates in ips2ra is non-deterministic 136 | ips2ra::sort(hashedKeys.begin(), hashedKeys.end(), 137 | [](const std::pair &pair) { return pair.first; }); 138 | 139 | if (!config.silent) { 140 | std::cout << "MHCs took " << std::chrono::duration_cast( 141 | std::chrono::steady_clock::now() - begin).count() << std::endl; 142 | } 143 | 144 | construct(hashedKeys); 145 | } 146 | 147 | explicit SicHash(std::istream &is) { 148 | uint64_t TAG; 149 | is.read(reinterpret_cast(&TAG), sizeof(TAG)); 150 | assert(TAG == 0x51cAa5A); 151 | is.read(reinterpret_cast(&config), sizeof(config)); 152 | is.read(reinterpret_cast(&N), sizeof(N)); 153 | is.read(reinterpret_cast(&numSmallTables), sizeof(numSmallTables)); 154 | is.read(reinterpret_cast(&unnecessaryConstructions), 
sizeof(unnecessaryConstructions)); 155 | bucketInfo.resize(numSmallTables + 1); 156 | is.read(reinterpret_cast(bucketInfo.data()), bucketInfo.size() * sizeof(BucketInfo)); 157 | ribbon1 = SimpleRibbon<1, ribbonWidth>(is); 158 | ribbon2 = SimpleRibbon<2, ribbonWidth>(is); 159 | ribbon3 = SimpleRibbon<3, ribbonWidth>(is); 160 | if constexpr (minimal) { 161 | minimalRemap = new bytehamster::util::EliasFano(is); 162 | } 163 | if (is.bad()) { 164 | throw std::runtime_error("Input stream went bad"); 165 | } 166 | } 167 | 168 | void writeTo(std::ostream &os) { 169 | uint64_t TAG = 0x51cAa5A; 170 | os.write(reinterpret_cast(&TAG), sizeof(TAG)); 171 | os.write(reinterpret_cast(&config), sizeof(config)); 172 | os.write(reinterpret_cast(&N), sizeof(N)); 173 | os.write(reinterpret_cast(&numSmallTables), sizeof(numSmallTables)); 174 | os.write(reinterpret_cast(&unnecessaryConstructions), sizeof(unnecessaryConstructions)); 175 | os.write(reinterpret_cast(bucketInfo.data()), bucketInfo.size() * sizeof(BucketInfo)); 176 | ribbon1.writeTo(os); 177 | ribbon2.writeTo(os); 178 | ribbon3.writeTo(os); 179 | if constexpr (minimal) { 180 | minimalRemap->writeTo(os); 181 | } 182 | if (os.bad()) { 183 | throw std::runtime_error("Output stream went bad"); 184 | } 185 | } 186 | 187 | void construct(std::vector> &hashedKeys) { 188 | std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); 189 | if (!config.silent) { 190 | std::cout<<"Inserting into Cuckoo"<>> maps; // Avoids conditional jumps later 193 | maps.resize(0b111 + 1); 194 | maps[0b001].reserve(N * config.class1Percentage()); 195 | maps[0b011].reserve(N * config.class2Percentage()); 196 | maps[0b111].reserve(N * config.class3Percentage()); 197 | unnecessaryConstructions = 0; 198 | hashedKeys.emplace_back(numSmallTables + 1, 0); // Sentinel 199 | 200 | std::vector emptySlots; 201 | std::vector takenSlotsOutsideRange; 202 | if constexpr (minimal) { 203 | emptySlots.reserve(N / config.loadFactor - N); 204 | 
takenSlotsOutsideRange.reserve(N / config.loadFactor - N); 205 | } 206 | constructSmallTables(0, numSmallTables, hashedKeys, emptySlots, maps, takenSlotsOutsideRange); 207 | M = bucketInfo[0].offset; 208 | bucketInfo[numSmallTables].offset = M; 209 | bucketInfo[0].offset = 0; 210 | 211 | if (!config.silent) { 212 | std::cout << "Buckets took " << std::chrono::duration_cast( 213 | std::chrono::steady_clock::now() - begin).count() << std::endl; 214 | begin = std::chrono::steady_clock::now(); 215 | std::cout<<"On average, the small hash tables needed to be retried " 216 | <<(double)(unnecessaryConstructions+numSmallTables)/(double)numSmallTables<<" times"<(maps[0b001]); 221 | ribbon2 = SimpleRibbon<2, ribbonWidth>(maps[0b011]); 222 | ribbon3 = SimpleRibbon<3, ribbonWidth>(maps[0b111]); 223 | if (!config.silent) { 224 | std::cout << "Ribbon took " << std::chrono::duration_cast( 225 | std::chrono::steady_clock::now() - begin).count() << std::endl; 226 | } 227 | 228 | if constexpr (minimal) { 229 | std::vector emptySlotsWithGaps; 230 | emptySlotsWithGaps.reserve(N / config.loadFactor - N); // Reserve the vector that is filled below (emptySlots is already populated) 231 | size_t emptySlotsIdx = 0; 232 | size_t takenSlotsOutsideIdx = 0; 233 | for (size_t slot = N; slot < M; slot++) { 234 | if (takenSlotsOutsideIdx < takenSlotsOutsideRange.size() && takenSlotsOutsideRange[takenSlotsOutsideIdx] < slot) { // Bounds check: the index may already be past the last taken slot 235 | takenSlotsOutsideIdx++; 236 | } 237 | emptySlotsWithGaps.push_back(emptySlots[emptySlotsIdx]); 238 | if (takenSlotsOutsideIdx < takenSlotsOutsideRange.size() && takenSlotsOutsideRange[takenSlotsOutsideIdx] == slot) { // Bounds check: the list may be empty or exhausted 239 | emptySlotsIdx++; // Consume empty slot 240 | } 241 | } 242 | size_t universeSize = emptySlotsWithGaps.empty() ?
10 : emptySlotsWithGaps.back() + 1; 243 | minimalRemap = new bytehamster::util::EliasFano( 244 | emptySlotsWithGaps.size(), universeSize); 245 | for (size_t slot : emptySlotsWithGaps) { 246 | minimalRemap->push_back(slot); 247 | } 248 | minimalRemap->buildRankSelect(); 249 | M = N; 250 | } 251 | } 252 | 253 | void initialHash(size_t from, size_t to, const auto &keys, 254 | std::vector> &hashedKeys) { 255 | for (size_t i = from; i < to; i++) { 256 | HashedKey hash = HashedKey(keys[i]); 257 | size_t smallTable = hash.hash(HASH_FUNCTION_BUCKET_ASSIGNMENT, numSmallTables); 258 | hashedKeys[i] = std::make_pair(smallTable, hash); 259 | } 260 | } 261 | 262 | void constructSmallTables(size_t from, size_t to, const std::vector> &hashedKeys, 263 | std::vector &emptySlots, 264 | std::vector>> &maps, 265 | std::vector &takenSlotsOutsideRange) { 266 | IrregularCuckooHashTableConfig cuckooConfig; 267 | cuckooConfig.threshold1 = config.threshold1; 268 | cuckooConfig.threshold2 = config.threshold2; 269 | cuckooConfig.maxEntries = config.smallTableSize * 1.2 + 100; 270 | IrregularCuckooHashTable irregularCuckooHashTable(cuckooConfig); 271 | size_t sizePrefix = 0; 272 | 273 | // Find key to start with 274 | size_t keyIdx = (double(from) / double(numSmallTables)) * N; // Rough estimate 275 | while (hashedKeys[keyIdx].first < from) { 276 | keyIdx++; 277 | } 278 | while (hashedKeys[keyIdx].first > from) { 279 | keyIdx--; 280 | } 281 | 282 | for (size_t bucketIdx = from; bucketIdx < to; bucketIdx++) { 283 | irregularCuckooHashTable.clear(); 284 | while (hashedKeys[keyIdx].first == bucketIdx) { 285 | irregularCuckooHashTable.prepare(hashedKeys[keyIdx].second); 286 | keyIdx++; 287 | } 288 | size_t tableM = irregularCuckooHashTable.size() / config.loadFactor; 289 | size_t seed = 0; 290 | while (!irregularCuckooHashTable.construct(tableM, seed)) { 291 | unnecessaryConstructions++; 292 | seed++; 293 | if (seed >= (1ul << SEED_BITS)) { 294 | throw std::logic_error("Selected thresholds that 
cannot be constructed"); 295 | } 296 | } 297 | bucketInfo[bucketIdx] = BucketInfo(sizePrefix, seed); 298 | 299 | for (size_t k = 0; k < irregularCuckooHashTable.size(); k++) { 300 | IrregularCuckooHashTable::TableEntry &entry = irregularCuckooHashTable.heap[k]; 301 | maps[entry.hashFunctionMask].emplace_back(entry.hash.mhc, entry.hashFunctionIndex & entry.hashFunctionMask); 302 | } 303 | if constexpr (minimal) { 304 | for (size_t k = 0; k < tableM; k++) { 305 | size_t position = sizePrefix + k; 306 | if (irregularCuckooHashTable.cells[k] == nullptr) { 307 | emptySlots.push_back(position); 308 | } else if (position >= N) { 309 | takenSlotsOutsideRange.push_back(position); 310 | } 311 | } 312 | } 313 | sizePrefix += tableM; 314 | } 315 | bucketInfo[from].offset = sizePrefix; 316 | } 317 | 318 | ~SicHash() { 319 | if (minimal && minimalRemap != nullptr) { 320 | delete minimalRemap; 321 | } 322 | } 323 | 324 | /** Estimate for the space usage of this structure, in bits */ 325 | [[nodiscard]] size_t spaceUsage() const { 326 | size_t bytes = ribbon1.sizeBytes() + ribbon2.sizeBytes() + ribbon3.sizeBytes() 327 | + bucketInfo.size() * sizeof(bucketInfo.at(0)); 328 | if constexpr (minimal) { 329 | bytes += minimalRemap->space(); 330 | std::cout<<"Remap space: "<<8.0*minimalRemap->space()/N<space(); 340 | } 341 | 342 | size_t efN = bucketInfo.size(); 343 | size_t efBits = 2 * efN; 344 | efBits += efN * std::ceil(std::log2((double) bucketInfo.back().offset / (double)efN)); 345 | 346 | size_t golombBits = 0; 347 | double averageSeed = (double)(unnecessaryConstructions+numSmallTables)/(double)numSmallTables; 348 | size_t b = std::log2(averageSeed); 349 | for (auto [offset, seed] : bucketInfo) { 350 | size_t q = seed >> b; 351 | // size_t r = seed - q; 352 | golombBits += b; // Remainder binary coded 353 | golombBits += q + 1; // Quotient unary coded 354 | } 355 | return bytes * 8 + efBits + golombBits; 356 | } 357 | 358 | // Parameter must be an std::string or a HashedKey. 
359 | size_t operator() (const auto &key) const { 360 | HashedKey hash = HashedKey(key); 361 | size_t smallTable = hash.hash(HASH_FUNCTION_BUCKET_ASSIGNMENT, numSmallTables); 362 | __builtin_prefetch(&bucketInfo[smallTable],0,0); 363 | uint8_t hashFunction; 364 | if (hash.mhc <= config.threshold1) { 365 | hashFunction = ribbon1.retrieve(hash.mhc); 366 | } else if (hash.mhc <= config.threshold2) { 367 | hashFunction = ribbon2.retrieve(hash.mhc); 368 | } else { 369 | hashFunction = ribbon3.retrieve(hash.mhc); 370 | } 371 | size_t smallTableM = bucketInfo[smallTable + 1].offset - bucketInfo[smallTable].offset; 372 | size_t result = hash.hash(hashFunction + bucketInfo[smallTable].seed, smallTableM) 373 | + bucketInfo[smallTable].offset; 374 | if constexpr (minimal) { 375 | if (result >= N) { 376 | return *minimalRemap->at(result - N); 377 | } 378 | } 379 | return result; 380 | } 381 | }; 382 | } // Namespace sichash 383 | -------------------------------------------------------------------------------- /include/sichash/IrregularCuckooHashTable.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace sichash { 12 | struct HashedKey { 13 | uint64_t mhc; 14 | 15 | HashedKey() { 16 | this->mhc = 0; 17 | } 18 | 19 | explicit HashedKey(uint64_t mhc) { 20 | this->mhc = mhc; 21 | } 22 | 23 | explicit HashedKey(const std::string &element, uint32_t seed = 0) { 24 | uint64_t stringHash = bytehamster::util::MurmurHash64(element.data(), element.length()); 25 | uint64_t modified = stringHash + seed; 26 | mhc = bytehamster::util::MurmurHash64(&modified, sizeof(uint64_t)); 27 | } 28 | 29 | [[nodiscard]] inline uint64_t hash(int hashFunctionIndex, size_t range) const { 30 | return bytehamster::util::fastrange64(bytehamster::util::remix(mhc + hashFunctionIndex), range); 31 | } 32 | }; 33 | 34 | struct IrregularCuckooHashTableConfig { 35 | 
uint64_t threshold1 = UINT64_MAX / 100 * 50; // 50% 36 | uint64_t threshold2 = UINT64_MAX / 100 * 20; // 25% 37 | size_t maxEntries = 0; 38 | }; 39 | 40 | //#define PRECALCULATE_HASHES 41 | #define RATTLE_KICKING 42 | //#define GHOST_INSERTIONS 43 | 44 | class IrregularCuckooHashTable { 45 | public: 46 | struct TableEntry { 47 | HashedKey hash; 48 | uint16_t hashFunctionIndex = 0; 49 | uint8_t hashFunctionMask = 0; 50 | #ifdef PRECALCULATE_HASHES 51 | size_t hashes[8]; 52 | #endif 53 | #ifdef GHOST_INSERTIONS 54 | uint8_t ghosts = 0; 55 | #endif 56 | 57 | #ifdef PRECALCULATE_HASHES 58 | inline void precalculateHashes(size_t currSeed, size_t currM) { 59 | for (size_t h = 0; h <= hashFunctionMask; h++) { 60 | hashes[h] = hash.hash(h + currSeed, currM); 61 | } 62 | } 63 | #endif 64 | 65 | inline size_t currentCell(size_t currSeed, size_t currM) { 66 | #ifdef PRECALCULATE_HASHES 67 | (void) currSeed; 68 | (void) currM; 69 | return hashes[hashFunctionIndex & hashFunctionMask]; 70 | #else 71 | return hash.hash((hashFunctionIndex & hashFunctionMask) + currSeed, currM); 72 | #endif 73 | } 74 | }; 75 | TableEntry *heap; 76 | std::vector cells; 77 | size_t M = 0; 78 | private: 79 | size_t numEntries = 0; 80 | size_t seed = 0; 81 | const IrregularCuckooHashTableConfig config; 82 | public: 83 | explicit IrregularCuckooHashTable(IrregularCuckooHashTableConfig config_) 84 | : config(config_) { 85 | heap = new TableEntry[config.maxEntries]; 86 | } 87 | 88 | ~IrregularCuckooHashTable() { 89 | delete[] heap; 90 | } 91 | 92 | static std::string name() { 93 | #ifdef PRECALCULATE_HASHES 94 | return "IrregularCuckooHashTablePre"; 95 | #else 96 | return "IrregularCuckooHashTable"; 97 | #endif 98 | } 99 | 100 | void prepare(HashedKey hash) { 101 | assert(numEntries < config.maxEntries); 102 | heap[numEntries] = TableEntry(); 103 | heap[numEntries].hash = hash; 104 | if (hash.mhc <= config.threshold1) { 105 | heap[numEntries].hashFunctionMask = 0b001; 106 | } else if (hash.mhc <= 
config.threshold2) { 107 | heap[numEntries].hashFunctionMask = 0b011; 108 | } else { 109 | heap[numEntries].hashFunctionMask = 0b111; 110 | } 111 | numEntries++; 112 | } 113 | 114 | void clear() { 115 | numEntries = 0; 116 | } 117 | 118 | bool construct(size_t M_, size_t seed_) { 119 | M = M_; 120 | seed = seed_; 121 | cells.clear(); 122 | cells.resize(M, nullptr); 123 | #ifdef PRECALCULATE_HASHES 124 | for (size_t i = 0; i < numEntries; i++) { 125 | heap[i].precalculateHashes(seed, M); 126 | } 127 | #endif 128 | for (size_t i = 0; i < numEntries; i++) { 129 | if (!insert(&heap[i])) { 130 | return false; 131 | } 132 | } 133 | #ifdef GHOST_INSERTIONS 134 | for (size_t i = 0; i < numEntries; i++) { 135 | for (size_t h = 0; h <= heap[i].hashFunctionMask; h++) { 136 | heap[i].hashFunctionIndex = h; 137 | size_t cell = heap[i].currentCell(seed, M); 138 | if (cells[cell] != &heap[i]) { 139 | continue; // Check next hash function 140 | } 141 | if (heap[i].ghosts == 0) { 142 | break; // Found last position 143 | } 144 | heap[i].ghosts--; 145 | cells[cell] = nullptr; 146 | } 147 | assert(heap[i].ghosts == 0); 148 | assert(cells[heap[i].currentCell(seed, M)] == &heap[i]); 149 | } 150 | #endif 151 | return true; 152 | } 153 | 154 | [[nodiscard]] size_t size() const { 155 | return numEntries; 156 | } 157 | private: 158 | bool insert(TableEntry *entry) { 159 | #ifdef GHOST_INSERTIONS 160 | size_t placed = 0; 161 | for (size_t i = 0; i <= entry->hashFunctionMask; i++) { 162 | entry->hashFunctionIndex = i; 163 | size_t cell = entry->currentCell(seed, M); 164 | if (cells[cell] == nullptr) { 165 | cells[cell] = entry; 166 | placed++; 167 | } 168 | } 169 | if (placed >= 1) { 170 | entry->ghosts = placed - 1; 171 | return true; 172 | } 173 | entry->ghosts = 0; 174 | #endif 175 | 176 | size_t tries = 0; 177 | while (tries < 10000) { 178 | size_t cell = entry->currentCell(seed, M); 179 | #ifdef GHOST_INSERTIONS 180 | if (cells[cell] != nullptr && cells[cell]->ghosts > 0) { 181 | 
cells[cell]->ghosts--; 182 | cells[cell] = entry; 183 | return true; 184 | } 185 | #endif 186 | #ifdef RATTLE_KICKING 187 | if (cells[cell] == nullptr || entry->hashFunctionIndex >= cells[cell]->hashFunctionIndex) { 188 | std::swap(entry, cells[cell]); 189 | } 190 | #else 191 | std::swap(entry, cells[cell]); 192 | #endif 193 | 194 | if (entry == nullptr) { 195 | return true; 196 | } 197 | entry->hashFunctionIndex++; 198 | tries++; 199 | } 200 | return false; 201 | } 202 | }; 203 | } // Namespace sichash 204 | -------------------------------------------------------------------------------- /include/sichash/IrregularCuckooHashTableHK.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "IrregularCuckooHashTable.h" 3 | 4 | namespace sichash { 5 | /** 6 | * Irregular cuckoo hash table with matching based construction. 7 | */ 8 | class HopcroftKarpMatchingCuckooHashTable { 9 | public: 10 | struct TableEntry { 11 | HashedKey hash; 12 | uint8_t hashFunctionIndex = 0; 13 | uint8_t hashFunctionMask = 0; 14 | #ifdef PRECALCULATE_HASHES 15 | size_t hashes[8]; 16 | #endif 17 | }; 18 | size_t M = 0; 19 | size_t numEntries = 0; 20 | TableEntry *heap; 21 | size_t seed = 0; 22 | const IrregularCuckooHashTableConfig config; 23 | 24 | explicit HopcroftKarpMatchingCuckooHashTable(IrregularCuckooHashTableConfig config_) 25 | : config(config_) { 26 | heap = new TableEntry[config.maxEntries]; 27 | } 28 | 29 | ~HopcroftKarpMatchingCuckooHashTable() { 30 | delete[] heap; 31 | } 32 | 33 | static std::string name() { 34 | #ifdef PRECALCULATE_HASHES 35 | return "HopcroftKarpMatchingCuckooHashTablePre"; 36 | #else 37 | return "HopcroftKarpMatchingCuckooHashTable"; 38 | #endif 39 | } 40 | 41 | void prepare(HashedKey hash) { 42 | assert(numEntries < config.maxEntries); 43 | heap[numEntries].hash = hash; 44 | if (hash.mhc <= config.threshold1) { 45 | heap[numEntries].hashFunctionMask = 0b001; 46 | } else if (hash.mhc <= 
config.threshold2) { 47 | heap[numEntries].hashFunctionMask = 0b011; 48 | } else { 49 | heap[numEntries].hashFunctionMask = 0b111; 50 | } 51 | numEntries++; 52 | } 53 | 54 | bool construct(size_t M_, size_t seed_) { 55 | M = M_; 56 | seed = seed_; 57 | 58 | n_left = numEntries; 59 | n_right = M; 60 | match_from_left.clear(); 61 | match_from_left.resize(n_left, -1); 62 | match_from_right.clear(); 63 | match_from_right.resize(n_right, -1); 64 | dist.clear(); 65 | dist.resize(n_left); 66 | #ifdef PRECALCULATE_HASHES 67 | for (size_t i = 0; i < numEntries; i++) { 68 | for (size_t h = 0; h <= heap[i].hashFunctionMask; h++) { 69 | heap[i].hashes[h] = heap[i].hash.hash(h + seed, M); 70 | } 71 | } 72 | #endif 73 | 74 | size_t matchingSize = get_max_matching(); 75 | if (matchingSize != numEntries) { 76 | return false; 77 | } 78 | return true; 79 | } 80 | private: 81 | // https://judge.yosupo.jp/submission/52112 82 | int n_left = 0, n_right = 0, flow = 0; 83 | std::vector match_from_left, match_from_right; 84 | std::vector dist; 85 | 86 | void bfs() { 87 | std::queue q; 88 | for (int u = 0; u < n_left; ++u) { 89 | if (!~match_from_left[u]) { 90 | q.push(u); 91 | dist[u] = 0; 92 | } else { 93 | dist[u] = -1; 94 | } 95 | } 96 | while (!q.empty()) { 97 | int u = q.front(); 98 | q.pop(); 99 | for (size_t i = 0; i <= heap[u].hashFunctionMask; i++) { 100 | #ifdef PRECALCULATE_HASHES 101 | int v = heap[u].hashes[i]; 102 | #else 103 | int v = heap[u].hash.hash(i + seed, M); 104 | #endif 105 | if (~match_from_right[v] && !~dist[match_from_right[v]]) { 106 | dist[match_from_right[v]] = dist[u] + 1; 107 | q.push(match_from_right[v]); 108 | } 109 | } 110 | } 111 | } 112 | 113 | bool dfs(int u) { 114 | for (size_t i = 0; i <= heap[u].hashFunctionMask; i++) { 115 | #ifdef PRECALCULATE_HASHES 116 | int v = heap[u].hashes[i]; 117 | #else 118 | int v = heap[u].hash.hash(i + seed, M); 119 | #endif 120 | if (!~match_from_right[v]) { 121 | match_from_left[u] = v; 122 | heap[u].hashFunctionIndex 
= i; 123 | match_from_right[v] = u; 124 | return true; 125 | } 126 | } 127 | for (size_t i = 0; i <= heap[u].hashFunctionMask; i++) { 128 | #ifdef PRECALCULATE_HASHES 129 | int v = heap[u].hashes[i]; 130 | #else 131 | int v = heap[u].hash.hash(i + seed, M); 132 | #endif 133 | if (dist[match_from_right[v]] == dist[u] + 1 && dfs(match_from_right[v])) { 134 | match_from_left[u] = v; 135 | heap[u].hashFunctionIndex = i; 136 | match_from_right[v] = u; 137 | return true; 138 | } 139 | } 140 | return false; 141 | } 142 | 143 | int get_max_matching() { 144 | flow = 0; 145 | while (true) { 146 | bfs(); 147 | int augment = 0; 148 | for (int u = 0; u < n_left; ++u) { 149 | if (!~match_from_left[u]) { 150 | augment += dfs(u); 151 | } 152 | } 153 | if (!augment) { 154 | break; 155 | } 156 | flow += augment; 157 | } 158 | return flow; 159 | } 160 | }; 161 | } // Namespace sichash 162 | -------------------------------------------------------------------------------- /include/sichash/SlowIrregularCuckooHashTable.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "IrregularCuckooHashTable.h" 4 | 5 | namespace sichash { 6 | /** 7 | * This cuckoo hash table is used for certain types of benchmarks that do NOT measure timing. 8 | * The implementation is more flexible than IrregularCuckooHashTable 9 | * (as it supports a more flexible number of hash functions) 10 | * but is not optimized for performance. 11 | * It also supports incremental insertion. 
12 | */ 13 | class SlowIrregularCuckooHashTable { 14 | public: 15 | struct TableEntry { 16 | HashedKey hash; 17 | uint8_t hashFunctionIndex = 0; 18 | uint8_t numHashFunctions = 0; 19 | }; 20 | TableEntry *heap; 21 | size_t displacements = 0; 22 | private: 23 | std::vector cells; 24 | size_t M = 0; 25 | size_t numEntries = 0; 26 | std::vector> thresholds; 27 | public: 28 | explicit SlowIrregularCuckooHashTable(size_t M, std::vector> &thresholds_, size_t maxEntries) 29 | : M(M), thresholds(thresholds_) { 30 | heap = new TableEntry[maxEntries]; 31 | cells.resize(M, nullptr); 32 | displacements = 0; 33 | } 34 | 35 | ~SlowIrregularCuckooHashTable() { 36 | delete[] heap; 37 | } 38 | 39 | static std::string name() { 40 | return "SlowIrregularCuckooHashTable"; 41 | } 42 | 43 | bool insert(HashedKey hash) { 44 | heap[numEntries].hash = hash; 45 | heap[numEntries].numHashFunctions = 0; 46 | for (auto [thresh, num] : thresholds) { 47 | if (hash.mhc <= thresh) { 48 | heap[numEntries].numHashFunctions = num; 49 | break; 50 | } 51 | } 52 | if (heap[numEntries].numHashFunctions == 0) { 53 | throw std::logic_error("Thresholds invalid. 
No threshold found for mhc " + std::to_string(hash.mhc)); 54 | } 55 | numEntries++; 56 | return insert(&heap[numEntries - 1]); 57 | } 58 | 59 | [[nodiscard]] size_t size() const { 60 | return numEntries; 61 | } 62 | private: 63 | bool insert(TableEntry *entry) { 64 | size_t tries = 0; 65 | while (tries < 20 * M) { 66 | size_t cell = entry->hash.hash(entry->hashFunctionIndex, M); 67 | std::swap(entry, cells[cell]); 68 | if (entry == nullptr) { 69 | return true; 70 | } 71 | entry->hashFunctionIndex = (entry->hashFunctionIndex + 1) % entry->numHashFunctions; 72 | tries++; 73 | displacements++; 74 | } 75 | return false; 76 | } 77 | }; 78 | } // Namespace sichash 79 | -------------------------------------------------------------------------------- /include/sichash/TinyBinaryCuckooHashTable.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "IrregularCuckooHashTable.h" 9 | 10 | namespace sichash { 11 | /** 12 | * Tiny binary cuckoo hash table. Construction needs multiple tries before succeeding. 
13 | */ 14 | class TinyBinaryCuckooHashTable { 15 | public: 16 | struct TableEntry { 17 | HashedKey hash; 18 | uint32_t candidateCellsXor = 0; 19 | }; 20 | TableEntry *heap; 21 | TableEntry** cells; 22 | size_t N; 23 | size_t M; 24 | private: 25 | size_t seed = 0; 26 | size_t numEntries = 0; 27 | public: 28 | explicit TinyBinaryCuckooHashTable(size_t N, size_t M) : N(N), M(M) { 29 | heap = new TableEntry[N]; 30 | cells = new TableEntry*[M]; 31 | } 32 | 33 | ~TinyBinaryCuckooHashTable() { 34 | delete[] heap; 35 | delete[] cells; 36 | } 37 | 38 | void prepare(HashedKey hash) { 39 | assert(numEntries < N); 40 | heap[numEntries].hash = hash; 41 | numEntries++; 42 | } 43 | 44 | bool construct(size_t seed_) { 45 | seed = seed_; 46 | memset(cells, 0, M * sizeof(void*)); // Fill with nullpointers 47 | for (size_t i = 0; i < numEntries; i++) { 48 | if (!insert(&heap[i])) { 49 | return false; 50 | } 51 | } 52 | return true; 53 | } 54 | 55 | [[nodiscard]] size_t size() const { 56 | return numEntries; 57 | } 58 | private: 59 | typedef union { 60 | struct { 61 | uint32_t low; 62 | uint32_t high; 63 | } halves; 64 | uint64_t full; 65 | } Union64; 66 | 67 | bool insert(TableEntry *entry) { 68 | Union64 hash; 69 | hash.full = util::remix(entry->hash.mhc + seed); 70 | uint32_t cell1 = util::fastrange32(hash.halves.high, M); 71 | uint32_t cell2 = util::fastrange32(hash.halves.low, M); 72 | entry->candidateCellsXor = cell1 ^ cell2; 73 | if (cells[cell1] == nullptr) { 74 | cells[cell1] = entry; 75 | return true; 76 | } 77 | if (cells[cell2] == nullptr) { 78 | cells[cell2] = entry; 79 | return true; 80 | } 81 | uint32_t currentCell = cell2; 82 | 83 | size_t tries = 0; 84 | while (tries < M) { 85 | uint32_t alternativeCell = entry->candidateCellsXor ^ currentCell; 86 | std::swap(entry, cells[alternativeCell]); 87 | if (entry == nullptr) { 88 | return true; 89 | } 90 | currentCell = alternativeCell; 91 | tries++; 92 | } 93 | return false; 94 | } 95 | }; 96 | } // Namespace sichash 97 | 
-------------------------------------------------------------------------------- /plots-construction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ByteHamster/SicHash/3f8d9fbcee127e6abff8550b7269c54d6f9d59ee/plots-construction.png -------------------------------------------------------------------------------- /plots-query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ByteHamster/SicHash/3f8d9fbcee127e6abff8550b7269c54d6f9d59ee/plots-query.png -------------------------------------------------------------------------------- /scripts/.gitignore: -------------------------------------------------------------------------------- 1 | dockerVolume/*.pdf 2 | dockerVolume/*.txt 3 | 4 | ## Core latex/pdflatex auxiliary files: 5 | *.vtc 6 | *.aux 7 | *.lof 8 | *.log 9 | *.lot 10 | *.fls 11 | *.out 12 | *.toc 13 | *.fmt 14 | *.fot 15 | *.cb 16 | *.cb2 17 | .*.lb 18 | 19 | ## Intermediate documents: 20 | *.dvi 21 | *.xdv 22 | *-converted-to.* 23 | # these rules might exclude image files for figures etc. 
24 | # *.ps 25 | # *.eps 26 | *.pdf 27 | 28 | ## Generated if empty string is given at "Please type another file name for output:" 29 | .pdf 30 | 31 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 32 | *.bbl 33 | *.bcf 34 | *.blg 35 | *-blx.aux 36 | *-blx.bib 37 | *.run.xml 38 | 39 | ## Build tool auxiliary files: 40 | *.fdb_latexmk 41 | *.synctex 42 | *.synctex(busy) 43 | *.synctex.gz 44 | *.synctex.gz(busy) 45 | *.pdfsync 46 | 47 | ## Build tool directories for auxiliary files 48 | # latexrun 49 | latex.out/ 50 | 51 | ## Auxiliary and intermediate files from other packages: 52 | # algorithms 53 | *.alg 54 | *.loa 55 | 56 | # achemso 57 | acs-*.bib 58 | 59 | # amsthm 60 | *.thm 61 | 62 | # beamer 63 | *.nav 64 | *.pre 65 | *.snm 66 | *.vrb 67 | 68 | # changes 69 | *.soc 70 | 71 | # comment 72 | *.cut 73 | 74 | # cprotect 75 | *.cpt 76 | 77 | # elsarticle (documentclass of Elsevier journals) 78 | *.spl 79 | 80 | # endnotes 81 | *.ent 82 | 83 | # fixme 84 | *.lox 85 | 86 | # feynmf/feynmp 87 | *.mf 88 | *.mp 89 | *.t[1-9] 90 | *.t[1-9][0-9] 91 | *.tfm 92 | 93 | #(r)(e)ledmac/(r)(e)ledpar 94 | *.end 95 | *.?end 96 | *.[1-9] 97 | *.[1-9][0-9] 98 | *.[1-9][0-9][0-9] 99 | *.[1-9]R 100 | *.[1-9][0-9]R 101 | *.[1-9][0-9][0-9]R 102 | *.eledsec[1-9] 103 | *.eledsec[1-9]R 104 | *.eledsec[1-9][0-9] 105 | *.eledsec[1-9][0-9]R 106 | *.eledsec[1-9][0-9][0-9] 107 | *.eledsec[1-9][0-9][0-9]R 108 | 109 | # glossaries 110 | *.acn 111 | *.acr 112 | *.glg 113 | *.glo 114 | *.gls 115 | *.glsdefs 116 | *.lzo 117 | *.lzs 118 | 119 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 
120 | # *.ist 121 | 122 | # gnuplottex 123 | *-gnuplottex-* 124 | 125 | # gregoriotex 126 | *.gaux 127 | *.gtex 128 | 129 | # htlatex 130 | *.4ct 131 | *.4tc 132 | *.idv 133 | *.lg 134 | *.trc 135 | *.xref 136 | 137 | # hyperref 138 | *.brf 139 | 140 | # knitr 141 | *-concordance.tex 142 | # TODO Comment the next line if you want to keep your tikz graphics files 143 | *.tikz 144 | *-tikzDictionary 145 | 146 | # listings 147 | *.lol 148 | 149 | # luatexja-ruby 150 | *.ltjruby 151 | 152 | # makeidx 153 | *.idx 154 | *.ilg 155 | *.ind 156 | 157 | # minitoc 158 | *.maf 159 | *.mlf 160 | *.mlt 161 | *.mtc[0-9]* 162 | *.slf[0-9]* 163 | *.slt[0-9]* 164 | *.stc[0-9]* 165 | 166 | # minted 167 | _minted* 168 | *.pyg 169 | 170 | # morewrites 171 | *.mw 172 | 173 | # nomencl 174 | *.nlg 175 | *.nlo 176 | *.nls 177 | 178 | # pax 179 | *.pax 180 | 181 | # pdfpcnotes 182 | *.pdfpc 183 | 184 | # sagetex 185 | *.sagetex.sage 186 | *.sagetex.py 187 | *.sagetex.scmd 188 | 189 | # scrwfile 190 | *.wrt 191 | 192 | # sympy 193 | *.sout 194 | *.sympy 195 | sympy-plots-for-*.tex/ 196 | 197 | # pdfcomment 198 | *.upa 199 | *.upb 200 | 201 | # pythontex 202 | *.pytxcode 203 | pythontex-files-*/ 204 | 205 | # tcolorbox 206 | *.listing 207 | 208 | # thmtools 209 | *.loe 210 | 211 | # TikZ & PGF 212 | *.dpth 213 | *.md5 214 | *.auxlock 215 | 216 | # todonotes 217 | *.tdo 218 | 219 | # vhistory 220 | *.hst 221 | *.ver 222 | 223 | # easy-todo 224 | *.lod 225 | 226 | # xcolor 227 | *.xcp 228 | 229 | # xmpincl 230 | *.xmpi 231 | 232 | # xindy 233 | *.xdy 234 | 235 | # xypic precompiled matrices and outlines 236 | *.xyc 237 | *.xyd 238 | 239 | # endfloat 240 | *.ttt 241 | *.fff 242 | 243 | # Latexian 244 | TSWLatexianTemp* 245 | 246 | ## Editors: 247 | # WinEdt 248 | *.bak 249 | *.sav 250 | 251 | # Texpad 252 | .texpadtmp 253 | 254 | # LyX 255 | *.lyx~ 256 | 257 | # Kile 258 | *.backup 259 | 260 | # gummi 261 | .*.swp 262 | 263 | # KBibTeX 264 | *~[0-9]* 265 | 266 | # TeXnicCenter 267 | *.tps 268 | 
269 | # auto folder when using emacs and auctex 270 | ./auto/* 271 | *.el 272 | 273 | # expex forward references with \gathertags 274 | *-tags.tex 275 | 276 | # standalone packages 277 | *.sta 278 | 279 | # Makeindex log files 280 | *.lpz 281 | -------------------------------------------------------------------------------- /scripts/boxPlotBinaryCuckoo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hostname 3 | strings MaxLoadFactor | grep fPIC 4 | 5 | function repeat() { 6 | repetitions=$1 7 | shift 8 | # shellcheck disable=SC2034 9 | for i in $(seq "$repetitions"); do 10 | # shellcheck disable=SC2068 11 | $@ 12 | done 13 | } 14 | 15 | repeat 2000 ./MaxLoadFactor -m 500 --percentage2 100 16 | repeat 2000 ./MaxLoadFactor -m 1k --percentage2 100 17 | repeat 2000 ./MaxLoadFactor -m 5k --percentage2 100 18 | repeat 2000 ./MaxLoadFactor -m 10k --percentage2 100 19 | repeat 2000 ./MaxLoadFactor -m 50k --percentage2 100 20 | repeat 2000 ./MaxLoadFactor -m 100k --percentage2 100 21 | repeat 2000 ./MaxLoadFactor -m 1M --percentage2 100 22 | -------------------------------------------------------------------------------- /scripts/boxPlotIrregularCuckooSpace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hostname 3 | strings MaxLoadFactor | grep fPIC 4 | 5 | function repeat() { 6 | repetitions=$1 7 | shift 8 | # shellcheck disable=SC2034 9 | for i in $(seq "$repetitions"); do 10 | # shellcheck disable=SC2068 11 | $@ 12 | done 13 | } 14 | 15 | # shellcheck disable=SC2068 16 | function runMultipleM () { 17 | repeat 2000 ./MaxLoadFactor -m 500 $@ 18 | repeat 2000 ./MaxLoadFactor -m 5k $@ 19 | # These are slower and have less variance. Only run half of the samples. 
20 | repeat 1000 ./MaxLoadFactor -m 50k $@ 21 | repeat 1000 ./MaxLoadFactor -m 500k $@ 22 | } 23 | 24 | runMultipleM --percentage4 100 25 | # x*1 + y*2 + (1-x-y)*3 = 2 26 | runMultipleM --percentage2 50 --percentage4 0 --percentage8 50 27 | runMultipleM --percentage2 33 --percentage4 34 --percentage8 33 28 | runMultipleM --percentage2 10 --percentage4 80 --percentage8 10 29 | -------------------------------------------------------------------------------- /scripts/bucketSize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hostname 3 | strings SicHashBenchmark | grep fPIC 4 | 5 | function runBenchmark() { 6 | export averageTries 7 | exec 5>&1 8 | R=$(./SicHashBenchmark --minimal --numKeys 10M --bucketSize "$1" --loadFactor 0.9 --percentage2 "0.$2" --percentage4 "0.$3" | tee >(cat - >&5)) 9 | averageTries=$(echo "$R" | sed -n 's/\(.*\)averageTries=\([0-9.]*\) \(.*\)/\2/p') 10 | if [[ "$averageTries" == "" ]]; then 11 | averageTries="1000" 12 | fi 13 | } 14 | 15 | for i in $(seq 40 1 85); do 16 | for j in $(seq 15 1 60); do 17 | if [[ $((i + j)) -gt '100' ]]; then 18 | continue 19 | fi 20 | echo "Trying $i $j" 21 | # The larger the hash table for a single configuration, the more tries are needed. 22 | # When a small bucket size already times out, a larger one with the same configuration will probably time out as well. 23 | # Therefore, continue with the next loop iteration as soon as one of the methods times out. 
24 | timeoutTries=50 25 | 26 | runBenchmark 100 "$i" "$j" 27 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 28 | runBenchmark 200 "$i" "$j" 29 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 30 | runBenchmark 500 "$i" "$j" 31 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 32 | runBenchmark 1000 "$i" "$j" 33 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 34 | runBenchmark 2000 "$i" "$j" 35 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 36 | runBenchmark 5000 "$i" "$j" 37 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 38 | runBenchmark 10000 "$i" "$j" 39 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 40 | runBenchmark 20000 "$i" "$j" 41 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 42 | runBenchmark 50000 "$i" "$j" 43 | if (( $(echo "$averageTries > $timeoutTries" | bc -l) )); then continue; fi 44 | runBenchmark 100000 "$i" "$j" 45 | done 46 | done 47 | -------------------------------------------------------------------------------- /scripts/competitorNames.txt: -------------------------------------------------------------------------------- 1 | RESULT store_code=cmph-BMZ store_name=BMZ showIn=legend store_attr=mark=asterisk,color=colorBmz,solid 2 | RESULT store_code=cmph-BDZ store_name=BDZ showIn=legend store_attr=mark=triangle,color=colorBdz,solid 3 | RESULT store_code=cmph-FCH store_name=FCH showIn=legend store_attr=mark=oplus,color=colorFch,solid 4 | RESULT store_code=cmph-CHD store_name=CHD showIn=legend store_attr=mark=x,color=colorChd,solid 5 | RESULT store_code=cmph-CHM store_name=CHM showIn= store_attr=mark=|,color=colorChm,solid 6 | RESULT store_code=SicHash store_name=SicHash showIn=legend store_attr=mark=o,color=colorHeterogeneous,solid 7 | RESULT store_code=SicHashMinimal store_name=SicHash Minimal showIn= 
#!/bin/bash
# Runs the MaxLoadFactor benchmark for four hash function configurations,
# collects the output in figure-1.txt and builds figure-1.pdf from it.

# Run benchmark
cd /opt/sichash/build || exit 1

# Runs a command line repeatedly: repeat <repetitions> <command> [args...]
function repeat() {
    local repetitions=$1
    shift
    # shellcheck disable=SC2034
    for i in $(seq "$repetitions"); do
        "$@"
    done
}

# Runs the given configuration for several table sizes.
# Larger tables are slower, so they get fewer repetitions.
function runMultipleM () {
    repeat 2000 ./MaxLoadFactor -m 500 "$@"
    repeat 500 ./MaxLoadFactor -m 5k "$@"
    repeat 100 ./MaxLoadFactor -m 50k "$@"
    repeat 20 ./MaxLoadFactor -m 500k "$@"
}

# All percentages are integer percent values. They must match the
# color-<p2>-<p4>-<p8> names that figure-1.tex derives from the output.
runMultipleM --percentage4 100 | tee figure-1.txt
runMultipleM --percentage2 50 --percentage4 0 --percentage8 50 | tee --append figure-1.txt
runMultipleM --percentage2 33 --percentage4 34 --percentage8 33 | tee --append figure-1.txt
runMultipleM --percentage2 10 --percentage4 80 --percentage8 10 | tee --append figure-1.txt

# Build plot
cd /opt/sichash/scripts || exit 1
cp /opt/sichash/build/figure-1.txt figure-1.txt
/opt/sqlplot-tools/build/src/sqlplot-tools figure-1.tex
# Run twice so that references are resolved
pdflatex figure-1.tex
pdflatex figure-1.tex
cp figure-1.pdf /opt/dockerVolume
cp figure-1.txt /opt/dockerVolume
-------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage[a4paper, margin=2cm]{geometry} 3 | \usepackage{xcolor} 4 | \usepackage{xspace} 5 | \usepackage{booktabs} 6 | \usepackage{dsfont} 7 | \usepackage{footmisc} 8 | \usepackage{marvosym} 9 | \usepackage{amsmath} 10 | \usepackage{hyperref} 11 | \usepackage[capitalise,noabbrev]{cleveref} 12 | \usepackage{tabularx} 13 | \usepackage{listings} 14 | \usepackage{multirow} 15 | \usepackage{pgfplots} 16 | \usepackage{subcaption} 17 | \usetikzlibrary{pgfplots.statistics} 18 | \pgfplotsset{compat=newest} 19 | 20 | \usepgfplotslibrary{groupplots} 21 | \pgfplotsset{every axis/.style={scale only axis}} 22 | 23 | \pgfplotsset{ 24 | major grid style={thin,dotted}, 25 | minor grid style={thin,dotted}, 26 | ymajorgrids, 27 | yminorgrids, 28 | every axis/.append style={ 29 | line width=0.7pt, 30 | tick style={ 31 | line cap=round, 32 | thin, 33 | major tick length=4pt, 34 | minor tick length=2pt, 35 | }, 36 | }, 37 | legend cell align=left, 38 | legend style={ 39 | line width=0.7pt, 40 | /tikz/every even column/.append style={column sep=3mm,black}, 41 | /tikz/every odd column/.append style={black}, 42 | }, 43 | % move title closer 44 | legend style={font=\small}, 45 | title style={yshift=-2pt}, 46 | % less space on left and right 47 | enlarge x limits=0.04, 48 | every tick label/.append style={font=\footnotesize}, 49 | every axis label/.append style={font=\small}, 50 | every axis y label/.append style={yshift=-1ex}, 51 | /pgf/number format/1000 sep={}, 52 | axis lines*=left, 53 | xlabel near ticks, 54 | ylabel near ticks, 55 | axis lines*=left, 56 | label style={font=\footnotesize}, 57 | tick label style={font=\footnotesize}, 58 | } 59 | 60 | \title{SicHash plot} 61 | \date{} 62 | \begin{document} 63 | 64 | \definecolor{separatorColor}{HTML}{BBBBBB} 65 | \definecolor{color-0-100-0}{HTML}{000000} 66 | \definecolor{color-50-0-50}{HTML}{4DAF4A} 67 | 
\definecolor{color-33-34-33}{HTML}{984EA3} 68 | \definecolor{color-10-80-10}{HTML}{A65628} 69 | 70 | \begin{figure}[p] 71 | \begin{subfigure}[c]{0.75\textwidth} 72 | \begin{tikzpicture} 73 | \begin{axis}[ 74 | boxplot/draw direction=y, 75 | xtick={0, 1, 2, 3, 1.5, 76 | 5, 6, 7, 8, 6.5, 77 | 10, 11, 12, 13, 11.5, 78 | 15, 16, 17, 18, 16.5}, 79 | xticklabels={A,B,C,D,\\\\$M=500$, 80 | A,B,C,D,\\\\$M=5\,000$, 81 | A,B,C,D,\\\\$M=50\,000$, 82 | A,B,C,D,\\\\$M=500\,000$}, 83 | xticklabel style={align=center}, 84 | xtick style={draw=none}, 85 | yticklabel={\pgfmathprintnumber\tick\%}, 86 | width=12cm, 87 | height=5cm, 88 | ymajorgrids=false, 89 | ymin=97, 90 | no marks, 91 | ] 92 | \addplot[no marks,separatorColor,dashed,thin] coordinates { ( 4,97.01) ( 4,100) }; 93 | \addplot[no marks,separatorColor,dashed,thin] coordinates { ( 9,97.01) ( 9,100) }; 94 | \addplot[no marks,separatorColor,dashed,thin] coordinates { (14,97.01) (14,100) }; 95 | 96 | \addplot[color=color-0-100-0,dashed,thin] coordinates { (-0.8,97.677016) (18.8,97.677016) }; 97 | \addplot[color=color-10-80-10,dashed,thin] coordinates { (-0.8,98.111705) (18.8,98.111705) }; 98 | \addplot[color=color-33-34-33,dashed,thin] coordinates { (-0.8,98.8525565) (18.8,98.8525565) }; 99 | \addplot[color=color-50-0-50,dashed,thin] coordinates { (-0.8,99.21047014) (18.8,99.21047014) }; 100 | 101 | % IMPORT-DATA boxPlotIrregularCuckooSpace figure-1.txt 102 | %% MULTIPLOT(M,ps|attr) 103 | %% SELECT 104 | %% 0 AS x, ROUND(100.0*loadFactor,2) AS y, M, 105 | %% printf("%d/%d/%d", percentage2, percentage4, percentage8) AS ps, 106 | %% printf("boxplot,color=color-%d-%d-%d,boxplot/draw position=%d,fill=white", percentage2, percentage4, percentage8, 107 | %% (SELECT COUNT(DISTINCT percentage2) FROM boxPlotIrregularCuckooSpace o WHERE boxPlotIrregularCuckooSpace.percentage2 > o.percentage2) 108 | %% + 5*(SELECT COUNT(DISTINCT M) FROM boxPlotIrregularCuckooSpace o WHERE boxPlotIrregularCuckooSpace.M > o.M)) AS attr 109 | %% FROM 
boxPlotIrregularCuckooSpace 110 | %% ORDER BY M,ps,y 111 | 112 | \legend{}; 113 | \end{axis} 114 | \end{tikzpicture} 115 | \end{subfigure} 116 | \begin{subfigure}[c]{0.24\textwidth} 117 | \centering 118 | \begin{tikzpicture} 119 | \begin{axis}[ 120 | width=2cm, 121 | height=2cm, 122 | legend columns=1, 123 | hide axis, 124 | xmin=10, 125 | xmax=50, 126 | ymin=0, 127 | ymax=0.4, 128 | ] 129 | %% MULTIPLOT(title|title|attr) 130 | %% SELECT 131 | %% 0 AS x, 1 AS y, 132 | %% printf("%c: %d/%d/%d", CHAR(65 + (SELECT COUNT(DISTINCT percentage2) FROM boxPlotIrregularCuckooSpace o WHERE boxPlotIrregularCuckooSpace.percentage2 > o.percentage2)), percentage2, percentage4, percentage8) AS title, 133 | %% printf("color=color-%d-%d-%d", percentage2, percentage4, percentage8) AS attr 134 | %% FROM boxPlotIrregularCuckooSpace 135 | %% GROUP BY percentage2 136 | %% ORDER BY title 137 | 138 | \end{axis} 139 | \end{tikzpicture} 140 | \end{subfigure} 141 | \caption{Achieved load factors when running different irregular cuckoo hashing configurations, which all need the same storage space (2 bits). The configurations are described by the percentages of objects with 2/4/8 choices, having a space usage of 1/2/3 bits, respectively. The configuration 0/100/0 refers to ordinary 4-ary cuckoo hashing. 
Horizontal lines indicate the theoretical maximum load factor for $M\rightarrow\infty$.} 142 | \end{figure} 143 | 144 | \end{document} 145 | 146 | -------------------------------------------------------------------------------- /src/BenchmarkData.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | std::vector generateInputData(size_t N) { 10 | std::vector inputData; 11 | inputData.reserve(N); 12 | auto time = std::chrono::system_clock::now(); 13 | long seed = std::chrono::duration_cast(time.time_since_epoch()).count(); 14 | bytehamster::util::XorShift64 prng(seed); 15 | std::cout<<"Generating input (seed "< 2 | #include 3 | #include "BenchmarkData.h" 4 | 5 | /** 6 | * Uses the implementation that is focused on flexibility and statistics. 7 | * The implementation is NOT meant for performance statistics. 8 | */ 9 | int main(int argc, char** argv) { 10 | size_t iterations = 10; 11 | size_t N = 1e5; 12 | size_t M = 3e5; 13 | std::string name = ""; 14 | tlx::CmdlineParser cmd; 15 | cmd.add_string('l', "name", name, "Name for identifying the output"); 16 | cmd.add_bytes('i', "iterations", iterations, "Number of times to try construction"); 17 | cmd.add_bytes('n', "numKeys", N, "Number of keys to store"); 18 | cmd.add_bytes('m', "numLocations", M, "Size of the hash table"); 19 | size_t thresholds_[9] = {0}; 20 | for (size_t i = 2; i <= 8; i++) { 21 | cmd.add_size_t('0' + i, "percentage" + std::to_string(i), thresholds_[i], "Percentage of items to have this number of hash functions"); 22 | } 23 | if (!cmd.process(argc, argv)) { 24 | return 1; 25 | } 26 | std::vector> thresholds; 27 | size_t thresholdSum = 0; 28 | for (size_t i = 2; i <= 8; i++) { 29 | thresholdSum += thresholds_[i]; 30 | if (thresholdSum > 100) { 31 | std::cerr<<"Thresholds are more than 100%"< keys = generateInputData(N); 42 | std::cout.clear(); 43 | sichash::SlowIrregularCuckooHashTable 
hashTable(M, thresholds, N); 44 | bool success = true; 45 | for (size_t i = 0; i < N; i++) { 46 | success = hashTable.insert(sichash::HashedKey(keys[i])); 47 | if (!success) { 48 | break; 49 | } 50 | } 51 | if (success) { 52 | displacementSum += hashTable.displacements; 53 | successfulSeeds++; 54 | } 55 | } 56 | std::cout << "RESULT"; 57 | if (!name.empty()) { 58 | std::cout << " name=" << name; 59 | } 60 | std::cout << " N=" << N 61 | << " M=" << M 62 | << " iterations=" << iterations 63 | << " success=" << successfulSeeds 64 | << " displacements=" << (successfulSeeds == 0 ? 0 : (double)displacementSum / (double)successfulSeeds); 65 | for (size_t i = 2; i <= 8; i++) { 66 | std::cout << " percentage" << i << "=" << thresholds_[i]; 67 | } 68 | std::cout << std::endl; 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /src/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | int main() { 7 | std::vector keys = {"abc", "def", "123", "456"}; 8 | sichash::SicHashConfig config; 9 | config.silent = false; 10 | sichash::SicHash hashFunc(keys, config); 11 | std::cout << hashFunc("abc") << std::endl; 12 | } 13 | -------------------------------------------------------------------------------- /src/maxLoadFactor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "BenchmarkData.h" 4 | 5 | /** 6 | * Uses the implementation that is focused on flexibility and statistics. 7 | * The implementation is NOT meant for performance statistics. 
8 | */ 9 | int main(int argc, char** argv) { 10 | size_t M = 3e5; 11 | std::string name = ""; 12 | tlx::CmdlineParser cmd; 13 | cmd.add_string('l', "name", name, "Name for identifying the output"); 14 | cmd.add_bytes('m', "numLocations", M, "Size of the hash table"); 15 | size_t thresholds_[9] = {0}; 16 | for (size_t i = 2; i <= 8; i++) { 17 | cmd.add_size_t('0' + i, "percentage" + std::to_string(i), thresholds_[i], "Percentage of items to have this number of hash functions"); 18 | } 19 | if (!cmd.process(argc, argv)) { 20 | return 1; 21 | } 22 | std::vector> thresholds; 23 | size_t thresholdSum = 0; 24 | for (size_t i = 2; i <= 8; i++) { 25 | thresholdSum += thresholds_[i]; 26 | if (thresholdSum > 100) { 27 | std::cerr<<"Thresholds are more than 100%"< keys = generateInputData(M); 33 | sichash::SlowIrregularCuckooHashTable hashTable(M, thresholds, M); 34 | size_t N = 0; 35 | while (N < M && hashTable.insert(sichash::HashedKey(keys[N]))) { // test N < M first: keys holds exactly M entries, so keys[M] must never be read 36 | N++; 37 | if ((M < 42 || (N % (M/42)) == 0) && N >= 0.7 * M) { // 0.3*42=12 steps displayed; the M < 42 guard avoids a modulo by zero 38 | std::cout<<"\rInserting: "<<100l*N/M<<"%"< 18 | void run() { 19 | config.percentages(t1, t2); 20 | std::vector keys = generateInputData(N); 21 | std::cout << "Cooldown" << std::endl; 22 | std::this_thread::sleep_for(std::chrono::seconds(1)); 23 | 24 | std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); 25 | SicHashInstance sicHashTable(keys, config); 26 | std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); 27 | long constructionTime = std::chrono::duration_cast(end - begin).count(); 28 | 29 | long queryTime = 0; 30 | if (numQueries > 0) { 31 | std::cout << "Checking" << std::endl; 32 | std::vector taken(minimal ? 
keys.size() : (keys.size() / sicHashTable.config.loadFactor + 100), false); // +100 for rounding 33 | for (std::string &key : keys) { 34 | size_t retrieved = sicHashTable(key); 35 | if (retrieved > taken.size()) { 36 | std::cerr << "Error: out of range" << std::endl; 37 | exit(1); 38 | } else if (taken[retrieved]) { 39 | std::cerr << "Error: collision" << std::endl; 40 | exit(1); 41 | } 42 | taken[retrieved] = true; 43 | } 44 | 45 | std::cout<<"Preparing query plan"< queryPlan; 47 | queryPlan.reserve(numQueries); 48 | bytehamster::util::XorShift64 prng(time(nullptr)); 49 | for (size_t i = 0; i < numQueries; i++) { 50 | queryPlan.push_back(keys[prng(N)]); 51 | } 52 | std::cout << "Cooldown" << std::endl; 53 | std::this_thread::sleep_for(std::chrono::seconds(1)); 54 | std::cout << "Querying" << std::endl; 55 | begin = std::chrono::steady_clock::now(); 56 | for (std::string &key : queryPlan) { 57 | size_t retrieved = sicHashTable(key); 58 | DO_NOT_OPTIMIZE(retrieved); 59 | } 60 | end = std::chrono::steady_clock::now(); 61 | queryTime = std::chrono::duration_cast(end - begin).count(); 62 | } 63 | 64 | size_t spaceUsage = sicHashTable.spaceUsage(); 65 | std::cout << "RESULT" 66 | << " loadFactor=" << config.loadFactor 67 | << " N=" << N 68 | << " t1=" << config.class1Percentage() 69 | << " t2=" << config.class2Percentage() 70 | << " spaceUsage=" << (double) spaceUsage / keys.size() 71 | << " bucketSize=" << config.smallTableSize 72 | << " constructionTimeMillis=" << constructionTime 73 | << " queryTimeMillis=" << queryTime 74 | << " numQueries=" << numQueries 75 | << " minimal=" << minimal 76 | << std::endl; 77 | } 78 | 79 | int main(int argc, char** argv) { 80 | tlx::CmdlineParser cmd; 81 | cmd.add_bytes('n', "numKeys", N, "Total number of keys to use"); 82 | cmd.add_double('l', "loadFactor", config.loadFactor, "Load factor of the table, usually between 0.8 and 0.99"); 83 | cmd.add_float('1', "percentage2", t1, "Threshold for objects with 2 choices"); 84 | 
cmd.add_float('2', "percentage4", t2, "Threshold for objects with 4 choices"); 85 | cmd.add_bytes('b', "bucketSize", config.smallTableSize, "Size of the small buckets (cuckoo hash tables)"); 86 | cmd.add_bytes('q', "numQueries", numQueries, "Number of queries"); 87 | cmd.add_bool('m', "minimal", minimal, "Construct minimal perfect hash function"); 88 | if (!cmd.process(argc, argv)) { 89 | return 1; 90 | } 91 | 92 | if (minimal) { 93 | run>(); 94 | } else { 95 | run>(); 96 | } 97 | 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /src/solvers.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "BenchmarkData.h" 5 | 6 | template 7 | void plotConstructionPerformanceByLoadFactor() { 8 | size_t M = 5000; 9 | std::vector keys = generateInputData(M); 10 | for (double loadFactor = 0.8; loadFactor <= 0.98; loadFactor += 0.002) { 11 | size_t N = loadFactor * M; 12 | sichash::IrregularCuckooHashTableConfig config; 13 | config.maxEntries = N; 14 | config.threshold1 = UINT64_MAX / 100 * 50; 15 | config.threshold2 = UINT64_MAX / 100 * 75; 16 | HashTable hashTable(config); 17 | for (size_t i = 0; i < N; i++) { 18 | hashTable.prepare(sichash::HashedKey(keys[i])); 19 | } 20 | // Rough estimate to balance time needed for each test iteration 21 | const size_t requiredNumberOfSuccessfulConstructions = 100000 * (1.05 - loadFactor) * (1.05 - loadFactor); 22 | size_t iterations = 0; 23 | std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); 24 | for (size_t seed = 0;; seed++) { 25 | if (hashTable.construct(M, seed)) { 26 | iterations++; 27 | } 28 | if (iterations == requiredNumberOfSuccessfulConstructions) { 29 | break; 30 | } 31 | if (seed >= requiredNumberOfSuccessfulConstructions * 10) { 32 | std::cout<<"Unable to construct at this load factor."<(end - begin).count(); 38 | std::cout << "RESULT" 39 | << " method=" << 
HashTable::name() 40 | << " N=" << N 41 | << " M=" << M 42 | << " constructionTimeMicros=" << 0.001 * constructionTime / requiredNumberOfSuccessfulConstructions 43 | << " totalTimeMillis=" << constructionTime / 1000000 44 | << std::endl; 45 | } 46 | } 47 | 48 | int main() { 49 | plotConstructionPerformanceByLoadFactor(); 50 | plotConstructionPerformanceByLoadFactor(); 51 | return 0; 52 | } 53 | --------------------------------------------------------------------------------