├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bench ├── CMakeLists.txt ├── batched_data_gen.cpp ├── batched_data_gen.h ├── bencher.py ├── experiments.cuh └── main_benchmarks.cu ├── src ├── CMakeLists.txt ├── CommandLine.h ├── concurrent_map │ ├── cmap_class.cuh │ ├── cmap_implementation.cuh │ ├── device │ │ ├── build.cuh │ │ ├── concurrent_kernel.cuh │ │ ├── count_kernel.cuh │ │ ├── delete_kernel.cuh │ │ ├── misc_kernels.cuh │ │ └── search_kernel.cuh │ └── warp │ │ ├── count.cuh │ │ ├── delete.cuh │ │ ├── insert.cuh │ │ └── search.cuh ├── concurrent_set │ ├── cset_class.cuh │ ├── cset_helper_kernels.cuh │ ├── cset_implementation.cuh │ └── cset_warp_operations.cuh ├── gpu_hash_table.cuh ├── slab_hash.cuh ├── slab_hash_global.cuh ├── slab_hash_helper_methods.cuh └── slab_iterator.cuh └── test ├── CMakeLists.txt ├── cmap_test.cu ├── concurrent_map.cu ├── concurrent_set.cu ├── iterator_test.cu └── test_slab_hash.cu /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Chromium 2 | BinPackArguments: false 3 | BinPackParameters: false 4 | ColumnLimit: 90 5 | IndentWidth: 2 6 | BreakConstructorInitializers: BeforeComma 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # CMake files 2 | CMakeCache.txt 3 | CMakeFiles 4 | Makefile 5 | cmake_install.cmake 6 | install_manifest.txt 7 | _build/ 8 | build/ 9 | .vscode 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "SlabAlloc"] 2 | path = SlabAlloc 3 | url = https://github.com/owensgroup/SlabAlloc 4 | [submodule "ThirdParty/rapidjson"] 5 | path = ThirdParty/rapidjson 6 | url = https://github.com/Tencent/rapidjson 7 | [submodule "ThirdParty/googletest"] 8 | path = ThirdParty/googletest 9 | url = https://github.com/google/googletest 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.8 FATAL_ERROR) 2 | project (SlabHash) 3 | 4 | find_package(CUDA 8.0 REQUIRED) 5 | 6 | option(CMAKE_VERBOSE_MAKEFILE ON) 7 | option(DGTEST, "DGTEST" ON) 8 | 9 | set(CUDA_NVCC_FLAGS -std=c++11) 10 | set (CMAKE_CXX_STANDARD 11) 11 | 12 | if (CUDA_VERBOSE_PTXAS) 13 | set(VERBOSE_PTXAS --ptxas-options=-v) 14 | endif (CUDA_VERBOSE_PTXAS) 15 | 16 | set(CMAKE_BUILD_TYPE "Release") 17 | 18 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 19 | 20 | set(GENCODE_SM30 21 | -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_30,code=compute_30) 22 | set(GENCODE_SM35 23 | -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_35,code=compute_35) 24 | set(GENCODE_SM37 25 | -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_37,code=compute_37) 26 | set(GENCODE_SM50 27 | -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_50,code=compute_50) 28 | set(GENCODE_SM60 29 | -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60) 30 | set(GENCODE_SM61 31 | -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61) 32 | set(GENCODE_SM70 33 | -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70) 34 | set(GENCODE_SM71 35 | 
-gencode=arch=compute_71,code=sm_71 -gencode=arch=compute_71,code=compute_71) 36 | set(GENCODE_SM75 37 | -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_75,code=compute_75) 38 | 39 | option(SLABHASH_GENCODE_SM30 "GENCODE_SM30" OFF) 40 | option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" ON) 41 | option(SLABHASH_GENCODE_SM37 "GENCODE_SM37" OFF) 42 | option(SLABHASH_GENCODE_SM50 "GENCODE_SM50" OFF) 43 | option(SLABHASH_GENCODE_SM60 "GENCODE_SM60" OFF) 44 | option(SLABHASH_GENCODE_SM61 "GENCODE_SM61" OFF) 45 | option(SLABHASH_GENCODE_SM70 "GENCODE_SM70" OFF) 46 | option(SLABHASH_GENCODE_SM71 "GENCODE_SM71" OFF) 47 | option(SLABHASH_GENCODE_SM75 "GENCODE_SM75" OFF) 48 | 49 | if (SLABHASH_GENCODE_SM30) 50 | set(GENCODE ${GENCODE} ${GENCODE_SM30}) 51 | endif(SLABHASH_GENCODE_SM30) 52 | 53 | if (SLABHASH_GENCODE_SM35) 54 | set(GENCODE ${GENCODE} ${GENCODE_SM35}) 55 | endif(SLABHASH_GENCODE_SM35) 56 | 57 | if (SLABHASH_GENCODE_SM37) 58 | set(GENCODE ${GENCODE} ${GENCODE_SM37}) 59 | endif(SLABHASH_GENCODE_SM37) 60 | 61 | if (SLABHASH_GENCODE_SM50) 62 | set(GENCODE ${GENCODE} ${GENCODE_SM50}) 63 | endif(SLABHASH_GENCODE_SM50) 64 | 65 | if (SLABHASH_GENCODE_SM60) 66 | set(GENCODE ${GENCODE} ${GENCODE_SM60}) 67 | endif(SLABHASH_GENCODE_SM60) 68 | 69 | if (SLABHASH_GENCODE_SM61) 70 | set(GENCODE ${GENCODE} ${GENCODE_SM61}) 71 | endif(SLABHASH_GENCODE_SM61) 72 | 73 | if (SLABHASH_GENCODE_SM70) 74 | set(GENCODE ${GENCODE} ${GENCODE_SM70}) 75 | endif(SLABHASH_GENCODE_SM70) 76 | 77 | if(SLABHASH_GENCODE_SM71) 78 | set(GENCODE ${GENCODE} ${GENCODE_SM71}) 79 | endif(SLABHASH_GENCODE_SM71) 80 | 81 | if(SLABHASH_GENCODE_SM75) 82 | set(GENCODE ${GENCODE} ${GENCODE_SM75}) 83 | endif(SLABHASH_GENCODE_SM75) 84 | 85 | include_directories(SlabAlloc/src) 86 | include_directories(src src/concurrent) 87 | include_directories(ThirdParty/rapidjson/include) 88 | include_directories(ThirdParty/googletest/googletest) 89 | include_directories(ThirdParty/googletest/googletest/include) 90 | add_subdirectory(ThirdParty/googletest/googletest) 91 | add_subdirectory(test) 92 | add_subdirectory(bench) 93 | 94 | if (DGTEST) 95 | enable_testing() 96 | endif() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SlabHash 2 | A warp-oriented dynamic hash table for GPUs 3 | 4 | ## Publication: 5 | This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper: 6 | * [Saman Ashkiani, Martin Farach-Colton, John Owens, *A Dynamic Hash Table for the GPU*, 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)](https://ieeexplore.ieee.org/abstract/document/8425196) 7 | 8 | This library is a refactored and slightly redesigned version of the original code, so that it can be extended and used in other research projects as well. It is still under continuous development. If you find any problems with the code, or have suggestions for potential additions to the library, please raise an issue on GitHub; we will address it as soon as possible. 9 | 10 | ## Compilation 11 | 1. `git submodule init` 12 | 2. `git submodule update` 13 | 3. Make sure to edit `CMakeLists.txt` so that it reflects your GPU's compute capability. For example, to include compute capability 3.5 you should have `option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" ON)`. Alternatively, these flags can be updated through the `ccmake ..` interface from the build directory. 14 | 4. `mkdir build && cd build` 15 | 5. `cmake ..` 16 | 6. `make` 17 | 18 | ## High level API 19 | To use this library, include [`src/slab_hash.cuh`](https://github.com/owensgroup/SlabHash/blob/master/src/slab_hash.cuh), which itself includes all required variations of the GpuSlabHash main class. 20 | We have provided a simple application class, [`gpu_hash_table`](https://github.com/owensgroup/SlabHash/blob/master/src/gpu_hash_table.cuh), in which the right instance of `GpuSlabHash` is initialized. 21 | This class is just an example of how to use GpuSlabHash in various contexts. 22 | Any similar application-level API should also own the dynamic memory allocator that is used by all instances of the GpuSlabHash class (here just one); each GpuSlabHash is then constructed with a pointer to that allocator. 23 | 24 | There are a few variations of the GpuSlabHash class. The most complete one at the moment is [`GpuSlabHash`](https://github.com/owensgroup/SlabHash/blob/master/src/concurrent_map/cmap_class.cuh), which is based on the initial idea of the slab hash proposed in the paper above. 25 | This class owns part of the memory allocated on the GPU to store the table's contents, side by side with the slabs allocated by the dynamic memory allocator. 26 | There is another class, [`GpuSlabHashContext`](https://github.com/owensgroup/SlabHash/blob/master/src/concurrent_map/cmap_class.cuh#L26), which does not own any memory but provides all the member functions needed to operate on the data structure itself. The context class is the one used by GPU threads on the device.
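As a rough host-side illustration of that ownership pattern, a driver using the `gpu_hash_table` application class might look like the sketch below. This is only a sketch under assumptions: the constructor arguments and the `hash_build`/`hash_search` method names are written from memory and should be checked against `src/gpu_hash_table.cuh` rather than taken as the definitive API.

```
// Hedged host-side sketch; verify names and signatures against src/gpu_hash_table.cuh.
#include "gpu_hash_table.cuh"

void build_and_search(uint32_t* h_keys, uint32_t* h_values, uint32_t num_keys,
                      uint32_t* h_queries, uint32_t* h_results, uint32_t num_queries) {
  const uint32_t num_buckets = num_keys / 2;  // number of base buckets (sets the expected chain length)
  const uint32_t device_idx = 0;
  const int64_t seed = 1;

  // The application-level object owns the SlabAlloc dynamic allocator and the
  // GpuSlabHash instance that is constructed with a pointer to it:
  gpu_hash_table<uint32_t, uint32_t, SlabHashTypeT::ConcurrentMap> hash_table(
      num_keys, num_buckets, device_idx, seed);

  hash_table.hash_build(h_keys, h_values, num_keys);          // bulk insertion from host arrays
  hash_table.hash_search(h_queries, h_results, num_queries);  // bulk search; results land in h_results
}
```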
Here's an example of how the context class is used inside a [search kernel](https://github.com/owensgroup/SlabHash/blob/master/src/concurrent_map/device/search_kernel.cuh): 27 | 28 | ``` 29 | template <typename KeyT, typename ValueT> 30 | __global__ void search_table( 31 | KeyT* d_queries, 32 | ValueT* d_results, 33 | uint32_t num_queries, 34 | GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) { 35 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 36 | uint32_t laneId = threadIdx.x & 0x1F; 37 | 38 | if ((tid - laneId) >= num_queries) { 39 | return; 40 | } 41 | 42 | // initializing the memory allocator on each warp: 43 | slab_hash.getAllocatorContext().initAllocator(tid, laneId); 44 | 45 | KeyT myQuery = 0; 46 | ValueT myResult = static_cast<ValueT>(SEARCH_NOT_FOUND); 47 | uint32_t myBucket = 0; 48 | bool to_search = false; 49 | if (tid < num_queries) { 50 | myQuery = d_queries[tid]; 51 | myBucket = slab_hash.computeBucket(myQuery); 52 | to_search = true; 53 | } 54 | 55 | slab_hash.searchKey(to_search, laneId, myQuery, myResult, myBucket); 56 | 57 | // writing back the results: 58 | if (tid < num_queries) { 59 | d_results[tid] = myResult; 60 | } 61 | } 62 | ``` 63 | 64 | ## Simple benchmarking 65 | A simplified set of benchmark scenarios is available through a Python script. Once the code is successfully compiled, run the following from the `build` directory: `python3 ../bench/bencher.py -m <mode> -d <device>`, where the experiment mode and the device to be used are chosen. So far, the following experiments are available: 66 | 67 | * mode 0: singleton experiment, where the hash table is built for a single fixed load factor (set by a parameter for the expected chain length, or equivalently the total number of initial buckets). 68 | * mode 1: load factor experiment, where a series of scenarios is simulated. In each case, the total number of elements to be inserted into the hash table is constant, but the load factor (number of buckets) varies from case to case. 69 | * mode 2: variable-sized table experiment, where the load factor (number of buckets) is fixed, but the total number of elements to be inserted into the table varies. 70 | * mode 3: concurrent experiment, where a series of batches of operations is run against the data structure. Each batch's operation distribution is given by `(insert_ratio, delete_ratio, search_exist_ratio, search_not_exist_ratio)`. For example, a tuple of (0.1, 0.1, 0.4, 0.4) means that 10% of each batch's operations are insertions of new elements, 10% are deletions of elements inserted in previous batches, 40% are search queries for elements that were previously inserted, and the final 40% are search queries for elements that are not stored in the data structure at all. The simulation starts with a few initial batches consisting entirely of insertions, followed by the remaining batches drawn from the given operation distribution. 71 | 72 | In the following, these benchmarks are run on a few GPU architectures. Note that the majority of input parameters for these scenarios are not exposed as command-line arguments in the Python script. To try a different set of settings, either use the corresponding C++ API directly (through `build/bin/benchmark`, with the parameters listed in [`bench/main_benchmarks.cu`](https://github.com/owensgroup/SlabHash/blob/master/bench/main_benchmarks.cu)), or change these parameters in [`bench/bencher.py`](https://github.com/owensgroup/SlabHash/blob/master/bench/bencher.py#L166).
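For reference, the benchmark binary can also be invoked directly; the sketch below shows a plausible mode-0 (singleton) run from the `build` directory, with flag names taken from how `bench/bencher.py` assembles its command line for that mode. Treat it as an assumption-laden starting point and consult `bench/main_benchmarks.cu` for the authoritative parameter list.

```
# Hypothetical direct invocation of the singleton experiment (mode 0);
# these flags mirror the ones bencher.py passes for this mode.
./bin/benchmark -mode 0 -num_key 4194304 -expected_chain 0.6 \
                -device 0 -filename bench_result/out.json -verbose 1
```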
73 | 74 | ### NVIDIA GeForce RTX 2080: 75 | GeForce RTX 2080 has a Turing architecture with compute capability 7.5 and 8GB of DRAM memory. In our setting, we have NVIDIA driver 430.14, and CUDA 10.1. 76 | 77 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 78 | 79 | #### Mode 0: 80 | ``` 81 | python3 ../bench/bencher.py -m 0 -d 0 82 | 83 | GPU hardware: GeForce RTX 2080 84 | =============================================================================================== 85 | Singleton experiment: 86 | Number of elements to be inserted: 4194304 87 | Number of buckets: 466034 88 | Expected chain length: 0.60 89 | =============================================================================================== 90 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 91 | =============================================================================================== 92 | 0.55 912.650 1930.254 1973.352 93 | ``` 94 | 95 | #### Mode 1: 96 | ``` 97 | python3 ../bench/bencher.py -m 1 -d 0 98 | 99 | GPU hardware: GeForce RTX 2080 100 | =============================================================================================== 101 | Load factor experiment: 102 | Total number of elements is fixed, load factor (number of buckets) is a variable 103 | Number of elements to be inserted: 4194304 104 | 1.00 of 4194304 queries exist in the data structure 105 | =============================================================================================== 106 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 107 | =============================================================================================== 108 | 0.06 4194304 861.149 1860.127 1897.779 109 | 0.19 1398102 868.142 1889.353 1917.126 110 | 0.25 1048576 865.396 1897.587 1935.070 111 | 0.37 699051 894.140 1925.491 1951.696 112 | 0.44 599187 888.786 1924.727 1971.126 113 | 0.55 466034 897.348 1945.381 1982.515 114 | 0.60 419431 905.537 1943.449 1969.260 115 | 0.65 349526 909.736 1896.900 1936.958 116 | 0.65 262144 865.819 1742.237 1785.819 117 | 0.65 279621 882.153 1794.917 1825.312 118 | 0.66 233017 840.275 1656.958 1696.176 119 | 0.66 322639 893.878 1871.789 1915.809 120 | 0.66 220753 831.960 1619.813 1653.572 121 | 0.69 199729 821.923 1542.169 1571.814 122 | 0.70 190651 812.457 1509.976 1536.384 123 | 0.73 174763 797.804 1444.304 1472.074 124 | 0.74 167773 788.925 1409.498 1451.453 125 | 0.75 155345 771.897 1361.815 1397.073 126 | 0.76 149797 764.415 1337.688 1364.367 127 | 0.76 139811 749.947 1282.041 1312.374 128 | ``` 129 | 130 | #### Mode 2: 131 | ``` 132 | python3 ../bench/bencher.py -m 2 -d 0 133 | 134 | GPU hardware: GeForce RTX 2080 135 | =============================================================================================== 136 | Table size experiment: 137 | Table's expected chain length is fixed, and total number of elements is variable 138 | Expected chain length = 0.60 139 | 140 | 1.00 of 262144 queries exist in the data structure 141 | =============================================================================================== 142 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 143 | =============================================================================================== 144 | (262144, 29128, 0.55) 1346.040 2577.722 2785.447 145 | (524288, 58255, 0.55) 1271.655 2319.366 2461.538 146 | (1048576, 116509, 0.55) 1116.761 2139.322 2209.873 147 | (2097152, 233017, 0.55) 
984.349 2076.750 2117.411 148 | (4194304, 466034, 0.55) 916.741 1988.169 2020.658 149 | (8388608, 932068, 0.55) 871.570 1898.617 1926.835 150 | ``` 151 | 152 | #### Mode 3: 153 | ``` 154 | python3 ../bench/bencher.py -m 3 -d 0 155 | 156 | GPU hardware: GeForce RTX 2080 157 | =============================================================================================== 158 | Concurrent experiment: 159 | variable load factor, fixed number of elements 160 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 161 | =============================================================================================== 162 | batch_size = 262144, init num batches = 3, final num batches = 4 163 | =============================================================================================== 164 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 165 | =============================================================================================== 166 | 0.05 0.05 1048576 855.979 1406.593 167 | 0.14 0.14 349526 902.501 1467.049 168 | 0.19 0.19 262144 937.121 1488.642 169 | 0.28 0.28 174763 995.060 1560.678 170 | 0.33 0.33 149797 1047.526 1552.986 171 | 0.42 0.42 116509 1070.523 1618.972 172 | 0.47 0.47 104858 1110.027 1635.456 173 | 0.55 0.55 87382 1138.991 1626.042 174 | 0.59 0.58 80660 1140.100 1615.779 175 | 0.63 0.62 69906 1115.924 1561.273 176 | ``` 177 | 178 | ### NVIDIA Titan V: 179 | 180 | Titan V has Volta architecture with compute capability 7.0 and 12GB of DRAM memory. In our setting, we have NVIDIA driver 410.104, and CUDA 10.0 running. 181 | 182 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 183 | 184 | #### Mode 0: 185 | ``` 186 | python3 ../bench/bencher.py -m 0 -d 0 187 | 188 | 189 | GPU hardware: TITAN V 190 | =============================================================================================== 191 | Singleton experiment: 192 | Number of elements to be inserted: 4194304 193 | Number of buckets: 466034 194 | Expected chain length: 0.60 195 | =============================================================================================== 196 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 197 | =============================================================================================== 198 | 0.55 1525.352 4137.374 3241.468 199 | ``` 200 | 201 | #### Mode 1: 202 | ``` 203 | python3 ../bench/bencher.py -m 1 -d 0 204 | 205 | GPU hardware: TITAN V 206 | =============================================================================================== 207 | Load factor experiment: 208 | Total number of elements is fixed, load factor (number of buckets) is a variable 209 | Number of elements to be inserted: 4194304 210 | 1.00 of 4194304 queries exist in the data structure 211 | =============================================================================================== 212 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 213 | =============================================================================================== 214 | 0.06 4194304 1416.107 3851.094 3454.809 215 | 0.19 1398102 1454.223 3934.442 3575.244 216 | 0.25 1048576 1466.819 3978.993 3603.156 217 | 0.37 699051 1491.658 4053.439 3629.898 218 | 0.44 599187 1508.881 4084.385 3512.300 219 | 0.55 466034 1527.094 4138.811 3239.865 220 | 0.60 419431 1528.536 4146.405 2877.604 221 | 0.65 349526 1522.836 4095.360 2125.584 222 | 0.65 262144 1476.884 3785.364 1318.751 223 | 0.65 
279621 1481.709 3886.148 1436.972 224 | 0.66 233017 1451.372 3599.791 1164.226 225 | 0.66 322639 1512.172 4044.683 1811.162 226 | 0.66 220753 1431.386 3508.069 1110.930 227 | 0.69 199729 1408.241 3352.397 1024.753 228 | 0.70 190651 1413.983 3278.603 991.955 229 | 0.73 174763 1403.611 3149.785 934.420 230 | 0.74 167773 1381.567 3085.426 903.303 231 | 0.75 155345 1367.470 2973.300 850.200 232 | 0.76 149797 1363.288 2914.719 823.777 233 | 0.76 139811 1349.699 2808.064 777.419 234 | ``` 235 | 236 | #### Mode 2: 237 | ``` 238 | python3 ../bench/bencher.py -m 2 -d 0 239 | 240 | GPU hardware: TITAN V 241 | =============================================================================================== 242 | Table size experiment: 243 | Table's expected chain length is fixed, and total number of elements is variable 244 | Expected chain length = 0.60 245 | 246 | 1.00 of 262144 queries exist in the data structure 247 | =============================================================================================== 248 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 249 | =============================================================================================== 250 | (262144, 29128, 0.55) 2640.026 4571.429 3529.513 251 | (524288, 58255, 0.55) 2473.430 4701.291 3207.518 252 | (1048576, 116509, 0.55) 2011.170 4821.660 3431.563 253 | (2097152, 233017, 0.55) 1673.630 4426.912 3475.236 254 | (4194304, 466034, 0.55) 1530.160 4154.290 3431.204 255 | (8388608, 932068, 0.55) 1464.140 3996.341 3214.361 256 | ``` 257 | 258 | #### Mode 3: 259 | ``` 260 | python3 ../bench/bencher.py -m 3 -d 0 261 | 262 | GPU hardware: TITAN V 263 | =============================================================================================== 264 | Concurrent experiment: 265 | variable load factor, fixed number of elements 266 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 267 | =============================================================================================== 268 | batch_size = 262144, init num batches = 3, final num batches = 4 269 | =============================================================================================== 270 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 271 | =============================================================================================== 272 | 0.05 0.05 1048576 1427.426 2669.273 273 | 0.14 0.14 349526 1526.934 2826.777 274 | 0.19 0.19 262144 1590.783 2801.642 275 | 0.28 0.28 174763 1714.166 2952.072 276 | 0.33 0.33 149797 1781.644 3000.733 277 | 0.42 0.42 116509 1937.406 3119.574 278 | 0.47 0.47 104858 1992.379 3088.990 279 | 0.55 0.55 87382 2099.257 3144.722 280 | 0.59 0.58 80660 2137.415 3166.602 281 | 0.64 0.62 69906 2160.717 2986.511 282 | ``` 283 | 284 | ### Titan Xp 285 | 286 | Titan Xp has Pascal architecture with compute capability 6.1 and 12GB of DRAM memory. In our setting, we have NVIDIA driver 410.104, and CUDA 10.0 running. 287 | 288 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 
289 | #### Mode 0: 290 | ``` 291 | python3 ../bench/bencher.py -m 0 -d 1 292 | 293 | GPU hardware: TITAN Xp 294 | =============================================================================================== 295 | Singleton experiment: 296 | Number of elements to be inserted: 4194304 297 | Number of buckets: 466034 298 | Expected chain length: 0.60 299 | =============================================================================================== 300 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 301 | =============================================================================================== 302 | 0.55 1007.340 2162.619 2199.785 303 | ``` 304 | 305 | #### Mode 1: 306 | ``` 307 | python3 ../bench/bencher.py -m 1 -d 1 308 | 309 | GPU hardware: TITAN Xp 310 | =============================================================================================== 311 | Load factor experiment: 312 | Total number of elements is fixed, load factor (number of buckets) is a variable 313 | Number of elements to be inserted: 4194304 314 | 1.00 of 4194304 queries exist in the data structure 315 | =============================================================================================== 316 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 317 | =============================================================================================== 318 | 0.06 4194304 964.644 2090.863 2121.181 319 | 0.19 1398102 985.215 2185.699 2202.151 320 | 0.25 1048576 991.760 2200.967 2216.450 321 | 0.37 699051 1004.214 2224.878 2244.384 322 | 0.44 599187 1011.303 2238.251 2257.993 323 | 0.55 466034 1016.487 2250.549 2267.996 324 | 0.60 419431 1009.784 2158.061 2192.719 325 | 0.65 349526 997.443 2122.280 2142.259 326 | 0.65 262144 972.467 1947.694 1925.717 327 | 0.65 279621 965.888 1998.049 1986.421 328 | 0.66 233017 439.267 1827.755 1790.210 329 | 0.66 322639 987.784 2089.796 2098.361 330 | 0.66 220753 907.927 1778.646 1735.593 331 | 0.69 199729 889.975 1693.262 1646.302 332 | 0.70 190651 881.868 1655.618 1608.166 333 | 0.73 174763 868.159 1587.597 1536.384 334 | 0.74 167773 861.239 1555.640 1503.119 335 | 0.75 155345 847.666 1493.902 1437.697 336 | 0.76 149797 837.248 1464.475 1408.044 337 | 0.76 139811 828.725 1409.983 1348.255 338 | 339 | ``` 340 | 341 | #### Mode 2: 342 | ``` 343 | python3 ../bench/bencher.py -m 2 -d 1 344 | 345 | GPU hardware: TITAN Xp 346 | =============================================================================================== 347 | Table size experiment: 348 | Table's expected chain length is fixed, and total number of elements is variable 349 | Expected chain length = 0.60 350 | 351 | 1.00 of 262144 queries exist in the data structure 352 | =============================================================================================== 353 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 354 | =============================================================================================== 355 | (262144, 29128, 0.55) 1409.983 2331.910 2694.737 356 | (524288, 58255, 0.55) 1423.829 2392.523 2598.985 357 | (1048576, 116509, 0.55) 1191.867 2560.000 2612.245 358 | (2097152, 233017, 0.55) 1070.482 2375.870 2400.938 359 | (4194304, 466034, 0.55) 1012.616 2275.556 2289.547 360 | (8388608, 932068, 0.55) 992.530 2147.313 2177.692 361 | 362 | ``` 363 | 364 | #### Mode 3: 365 | ``` 366 | python3 ../bench/bencher.py -m 3 -d 1 367 | 368 | GPU hardware: TITAN Xp 369 | 
=============================================================================================== 370 | Concurrent experiment: 371 | variable load factor, fixed number of elements 372 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 373 | =============================================================================================== 374 | batch_size = 262144, init num batches = 3, final num batches = 4 375 | =============================================================================================== 376 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 377 | =============================================================================================== 378 | 0.05 0.05 1048576 968.856 1651.613 379 | 0.14 0.14 349526 1017.219 1706.667 380 | 0.19 0.19 262144 1043.478 1753.425 381 | 0.28 0.28 174763 1097.339 1815.603 382 | 0.33 0.33 149797 1123.064 1855.072 383 | 0.42 0.42 116509 1174.593 1909.112 384 | 0.47 0.47 104858 1149.701 1741.867 385 | 0.55 0.55 87382 1193.010 1753.425 386 | 0.59 0.58 80660 1215.190 1753.425 387 | 0.63 0.62 69906 1238.710 1673.545 388 | ``` 389 | ### Tesla K40c 390 | 391 | Tesla K40c has Kepler architecture with compute capability 3.5 and 12GB of DRAM. In our setting, we have NVIDIA driver 410.72 and CUDA 10.0. 392 | 393 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 394 | 395 | #### Mode 0: 396 | ``` 397 | python3 ../bench/bencher.py -m 0 -d 2 398 | 399 | GPU hardware: Tesla K40c 400 | =============================================================================================== 401 | Singleton experiment: 402 | Number of elements to be inserted: 4194304 403 | Number of buckets: 466034 404 | Expected chain length: 0.60 405 | =============================================================================================== 406 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 407 | =============================================================================================== 408 | 0.55 545.779 764.014 831.575 409 | ``` 410 | 411 | #### Mode 1: 412 | ``` 413 | python3 ../bench/bencher.py -m 1 -d 2 414 | 415 | GPU hardware: Tesla K40c 416 | =============================================================================================== 417 | Load factor experiment: 418 | Total number of elements is fixed, load factor (number of buckets) is a variable 419 | Number of elements to be inserted: 4194304 420 | 1.00 of 4194304 queries exist in the data structure 421 | =============================================================================================== 422 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 423 | =============================================================================================== 424 | 0.06 4194304 427.761 737.781 797.139 425 | 0.19 1398102 539.284 758.641 828.134 426 | 0.25 1048576 548.825 769.378 841.300 427 | 0.37 699051 551.950 769.572 841.694 428 | 0.44 599187 551.411 769.604 841.559 429 | 0.55 466034 546.190 764.509 831.907 430 | 0.60 419431 540.693 758.150 819.574 431 | 0.65 349526 521.354 734.935 777.110 432 | 0.65 262144 467.077 660.569 675.041 433 | 0.65 279621 480.977 679.845 701.025 434 | 0.66 233017 443.047 621.548 630.487 435 | 0.66 322639 508.520 719.334 753.231 436 | 0.66 220753 432.415 603.049 610.993 437 | 0.69 199729 414.232 571.586 578.291 438 | 0.70 190651 406.401 557.613 564.020 439 | 0.73 174763 391.686 532.063 538.003 440 | 0.74 167773 384.449 520.422 525.573 441 | 
0.75 155345 371.302 498.036 504.311 442 | 0.76 149797 364.787 487.541 492.959 443 | 0.76 139811 352.283 467.503 472.981 444 | ``` 445 | 446 | #### Mode 2: 447 | ``` 448 | python3 ../bench/bencher.py -m 2 -d 2 449 | 450 | GPU hardware: Tesla K40c 451 | =============================================================================================== 452 | Table size experiment: 453 | Table's expected chain length is fixed, and total number of elements is variable 454 | Expected chain length = 0.60 455 | 456 | 1.00 of 262144 queries exist in the data structure 457 | =============================================================================================== 458 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 459 | =============================================================================================== 460 | (262144, 29128, 0.55) 538.062 742.231 823.234 461 | (524288, 58255, 0.55) 547.301 755.789 829.696 462 | (1048576, 116509, 0.55) 550.168 761.621 832.457 463 | (2097152, 233017, 0.55) 547.768 763.422 831.348 464 | (4194304, 466034, 0.55) 546.646 764.558 832.098 465 | (8388608, 932068, 0.55) 544.300 764.801 832.008 466 | ``` 467 | #### Mode 3: 468 | ``` 469 | python3 ../bench/bencher.py -m 3 -d 2 470 | 471 | GPU hardware: Tesla K40c 472 | =============================================================================================== 473 | Concurrent experiment: 474 | variable load factor, fixed number of elements 475 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 476 | =============================================================================================== 477 | batch_size = 262144, init num batches = 3, final num batches = 4 478 | =============================================================================================== 479 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 480 | =============================================================================================== 481 | 0.05 0.05 1048576 502.381 649.592 482 | 0.14 0.14 349526 507.926 656.305 483 | 0.19 0.19 262144 509.950 660.272 484 | 0.28 0.28 174763 511.659 663.212 485 | 0.33 0.33 149797 512.075 662.354 486 | 0.42 0.42 116509 513.390 664.073 487 | 0.47 0.47 104858 511.723 657.781 488 | 0.55 0.55 87382 509.052 649.026 489 | 0.59 0.58 80660 501.725 639.850 490 | 0.64 0.62 69906 493.702 601.822 491 | 492 | ``` -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(random_gen batched_data_gen.cpp) 2 | cuda_add_executable(benchmark main_benchmarks.cu 3 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 4 | 5 | target_link_libraries(benchmark random_gen) 6 | -------------------------------------------------------------------------------- /bench/batched_data_gen.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. 
See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "batched_data_gen.h" 18 | 19 | BatchedDataGen::BatchedDataGen(uint32_t num_ref, uint32_t batch_size) 20 | : num_insert_(0), 21 | num_delete_(0), 22 | num_search_exist_(0), 23 | num_search_non_exist_(0), 24 | edge_index_(0), 25 | batch_counter_(0) { 26 | num_ref_ = num_ref; 27 | batch_size_ = batch_size; 28 | h_key_ref_ = new uint32_t[num_ref_]; 29 | h_index_ref_ = new uint32_t[num_ref_]; 30 | std::iota(h_index_ref_, h_index_ref_ + num_ref_, 0); 31 | h_batch_buffer_ = new uint32_t[batch_size_]; 32 | temp_buffer_ = new uint32_t[batch_size_]; 33 | } 34 | 35 | BatchedDataGen::~BatchedDataGen() { 36 | if (h_key_ref_) 37 | delete[] h_key_ref_; 38 | if (h_index_ref_) 39 | delete[] h_index_ref_; 40 | if (h_batch_buffer_) 41 | delete[] h_batch_buffer_; 42 | if (temp_buffer_) 43 | delete[] temp_buffer_; 44 | } 45 | 46 | void BatchedDataGen::shuffle(uint32_t* input, uint32_t size) { 47 | std::mt19937 rng(std::time(nullptr)); 48 | for (int i = 0; i < size; i++) { 49 | unsigned int rand1 = rng(); 50 | unsigned int rand2 = (rng() << 15) + rand1; 51 | unsigned int swap = i + (rand2 % (size - i)); 52 | 53 | unsigned int temp = input[i]; 54 | input[i] = input[swap]; 55 | input[swap] = temp; 56 | } 57 | } 58 | 59 | void BatchedDataGen::shuffle_pairs(uint32_t* input, 60 | uint32_t* values, 61 | uint32_t size) { 62 | std::mt19937 rng(std::time(nullptr)); 63 | for (int i = 0; i < size; i++) { 64 | unsigned int rand1 = rng(); 65 | unsigned int rand2 = (rng() << 15) + rand1; 66 | unsigned int swap = i + (rand2 % (size - i)); 67 | 68 | unsigned int temp = input[i]; 69 | input[i] = input[swap]; 70 | input[swap] = temp; 71 | 72 | temp = values[i]; 73 | values[i] = values[swap]; 74 | values[swap] = temp; 75 | } 76 | } 77 | 78 | void BatchedDataGen::generate_random_keys() { 79 | std::iota(h_key_ref_, h_key_ref_ + num_ref_, 0); 80 | std::random_shuffle(h_key_ref_, h_key_ref_ + num_ref_); 81 | } 82 | 83 | void BatchedDataGen::generate_random_keys(int seed, 84 | int num_msb = 0, 85 | bool ensure_uniqueness = false) { 86 | std::mt19937 rng(seed); 87 | std::unordered_set key_dict; 88 | for (int i = 0; i < num_ref_; i++) { 89 | if (!ensure_uniqueness) { 90 | h_key_ref_[i] = 91 | (rng() & (0xFFFFFFFF >> 92 | num_msb)); // except for the most significant two bits 93 | } else { 94 | uint32_t key = rng() & (0xFFFFFFFF >> num_msb); 95 | while (key_dict.find(key) != key_dict.end()) { 96 | key = rng(); 97 | } 98 | key_dict.insert(key); 99 | h_key_ref_[i] = key; 100 | } 101 | } 102 | } 103 | 104 | uint32_t* BatchedDataGen::getSingleBatchPointer( 105 | uint32_t num_keys, 106 | uint32_t num_queries, 107 | uint32_t num_existing) { 108 | assert(num_keys + num_queries <= batch_size_); 109 | assert(batch_size_ <= num_ref_); 110 | assert(num_existing <= num_queries); 111 | std::copy(h_key_ref_, h_key_ref_ + num_keys, h_batch_buffer_); 112 | auto begin_index = (num_keys > num_existing) ? 
(num_keys - num_existing) : 0; 113 | std::copy(h_key_ref_ + begin_index, h_key_ref_ + begin_index + num_queries, 114 | h_batch_buffer_ + num_keys); 115 | std::mt19937 rng(std::time(nullptr)); 116 | std::shuffle(h_batch_buffer_, h_batch_buffer_ + num_keys, rng); 117 | std::shuffle(h_batch_buffer_ + num_keys, h_batch_buffer_ + num_keys + num_queries, rng); 118 | return h_batch_buffer_; 119 | } 120 | 121 | uint32_t BatchedDataGen::get_edge_index() { 122 | return edge_index_; 123 | } 124 | 125 | void BatchedDataGen::set_edge_index(uint32_t new_edge_index) { 126 | if (new_edge_index < num_ref_) 127 | edge_index_ = new_edge_index; 128 | } 129 | 130 | void BatchedDataGen::compute_batch_contents(float a_insert, 131 | float b_delete, 132 | float c_search_exist) { 133 | assert(a_insert + b_delete + c_search_exist <= 1.0f); 134 | num_insert_ = static_cast(a_insert * batch_size_); 135 | num_delete_ = static_cast(b_delete * batch_size_); 136 | num_search_exist_ = static_cast(c_search_exist * batch_size_); 137 | num_search_non_exist_ = 138 | batch_size_ - (num_insert_ + num_delete_ + num_search_exist_); 139 | } 140 | 141 | uint32_t* BatchedDataGen::next_batch(float a_insert, 142 | float b_delete, 143 | float c_search_exist) { 144 | compute_batch_contents(a_insert, b_delete, c_search_exist); 145 | 146 | std::random_shuffle(h_index_ref_, h_index_ref_ + edge_index_); 147 | std::random_shuffle(h_index_ref_ + edge_index_, h_index_ref_ + num_ref_); 148 | 149 | uint32_t output_offset = 0; 150 | 151 | // search queries that actually exist in the data structure 152 | // choosing the first num_search_exist_ from the beginning of the references: 153 | // code 3 for search queries 154 | for (int i = 0; i < num_search_exist_; i++) { 155 | h_batch_buffer_[output_offset + i] = 156 | (0xC0000000 | h_key_ref_[h_index_ref_[i]]); 157 | } 158 | output_offset += num_search_exist_; 159 | 160 | // search queries that do not exist in the data structure 161 | // choose the last num_search_non_exist_ from the end of the references: 162 | // code 3 for search queries 163 | for (int i = 0; i < num_search_non_exist_; i++) { 164 | h_batch_buffer_[output_offset + i] = 165 | (0xC0000000 | h_key_ref_[h_index_ref_[num_ref_ - i - 1]]); 166 | } 167 | output_offset += num_search_non_exist_; 168 | 169 | // inserting new items: 170 | // code 1: 171 | // the first num_isnert_ elements after the edge: 172 | for (int i = 0; i < num_insert_; i++) { 173 | temp_buffer_[i] = h_index_ref_[edge_index_ + i]; 174 | h_batch_buffer_[output_offset + i] = 175 | (0x40000000 | h_key_ref_[temp_buffer_[i]]); 176 | } 177 | output_offset += num_insert_; 178 | 179 | // deleting previously inserted elements: 180 | // code 2: 181 | for (int i = 0; i < num_delete_; i++) { 182 | temp_buffer_[num_insert_ + i] = h_index_ref_[edge_index_ - i - 1]; 183 | h_batch_buffer_[output_offset + i] = 184 | (0x80000000 | h_key_ref_[temp_buffer_[num_insert_ + i]]); 185 | } 186 | 187 | // shuffling the output buffer: 188 | std::random_shuffle(h_batch_buffer_, h_batch_buffer_ + batch_size_); 189 | 190 | // updating the edge index: 191 | std::copy(temp_buffer_, temp_buffer_ + batch_size_, 192 | h_index_ref_ + edge_index_ - num_delete_); 193 | edge_index_ += (num_insert_ - num_delete_); 194 | 195 | batch_counter_++; 196 | return h_batch_buffer_; 197 | } 198 | 199 | void BatchedDataGen::print_batch() { 200 | printf("Batch %d:\n", batch_counter_); 201 | for (int i = 0; i < batch_size_; i++) { 202 | printf("(%d, %d), ", h_batch_buffer_[i] >> 30, 203 | h_batch_buffer_[i] & 0x3FFFFFFF); 204 
| if (i % 10 == 9) 205 | printf("\n"); 206 | } 207 | printf("\n"); 208 | } 209 | 210 | void BatchedDataGen::print_reference() { 211 | printf("Reference keys:"); 212 | for (int i = 0; i < num_ref_; i++) { 213 | printf("%d, ", h_key_ref_[i]); 214 | if (i % 16 == 31) 215 | printf("\n"); 216 | } 217 | printf("\n"); 218 | } -------------------------------------------------------------------------------- /bench/batched_data_gen.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | class BatchedDataGen { 30 | private: 31 | uint32_t* h_key_ref_; 32 | uint32_t* h_index_ref_; 33 | uint32_t num_ref_; 34 | uint32_t edge_index_; 35 | uint32_t* temp_buffer_; 36 | 37 | uint32_t batch_counter_; 38 | uint32_t num_insert_; 39 | uint32_t num_delete_; 40 | uint32_t num_search_exist_; 41 | uint32_t num_search_non_exist_; 42 | 43 | public: 44 | uint32_t batch_size_; 45 | uint32_t* h_batch_buffer_; 46 | 47 | BatchedDataGen(uint32_t num_ref_, uint32_t batch_size); 48 | ~BatchedDataGen(); 49 | void shuffle(uint32_t* input, uint32_t size); 50 | void shuffle_pairs(uint32_t* input, uint32_t* values, uint32_t size); 51 | void generate_random_keys(); 52 | void generate_random_keys(int seed, int num_msb, bool ensure_uniqueness); 53 | uint32_t* getSingleBatchPointer(uint32_t num_keys, 54 | uint32_t num_queries, 55 | uint32_t num_existing); 56 | uint32_t* getKeyRefPointer() { return h_key_ref_; } 57 | uint32_t get_edge_index(); 58 | void set_edge_index(uint32_t new_edge_index); 59 | uint32_t* next_batch(float a_insert, float b_delete, float c_search_exist); 60 | uint32_t getBatchCounter() { return batch_counter_; } 61 | void print_batch(); 62 | void print_reference(); 63 | void compute_batch_contents(float a_insert, 64 | float b_delete, 65 | float c_search_exist); 66 | }; -------------------------------------------------------------------------------- /bench/bencher.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import datetime 3 | import os 4 | import json 5 | import sys 6 | import getopt 7 | 8 | def analyze_singleton_experiment(input_file): 9 | with open(input_file) as json_file: 10 | data = json.load(json_file) 11 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 12 | trials = data["slab_hash"]["trial"] 13 | 14 | for trial in trials: 15 | data_q0 = (trial["load_factor"], trial["build_rate_mps"], trial["search_rate_mps"], trial["search_rate_bulk_mps"]) 16 | 17 | print("===============================================================================================") 18 | print("Singleton experiment:") 19 | print("\tNumber of elements to be inserted: %d" % (trials[0]['num_keys'])) 20 | print("\tNumber of buckets: %d" % 
(trials[0]['num_buckets'])) 21 | print("\tExpected chain length: %.2f" % (trials[0]['exp_chain_length'])) 22 | print("===============================================================================================") 23 | print("load factor\tbuild rate(M/s)\t\tsearch rate(M/s)\tsearch rate bulk(M/s)") 24 | print("===============================================================================================") 25 | print("%.2f\t\t%.3f\t\t%.3f\t\t%.3f" % (data_q0[0], data_q0[1], data_q0[2], data_q0[3])) 26 | 27 | def analyze_load_factor_experiment(input_file): 28 | with open(input_file) as json_file: 29 | data = json.load(json_file) 30 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 31 | trials = data["slab_hash"]["trial"] 32 | 33 | tabular_data = [] 34 | 35 | for trial in trials: 36 | tabular_data.append((trial["load_factor"], 37 | trial["build_rate_mps"], 38 | trial["search_rate_mps"], 39 | trial["search_rate_bulk_mps"], 40 | trial['num_buckets'])) 41 | 42 | tabular_data.sort() 43 | print("===============================================================================================") 44 | print("Load factor experiment:") 45 | print("\tTotal number of elements is fixed, load factor (number of buckets) is a variable") 46 | print("\tNumber of elements to be inserted: %d" % (trials[0]['num_keys'])) 47 | print("\t %.2f of %d queries exist in the data structure" % (trials[0]['query_ratio'], trials[0]['num_queries'])) 48 | print("===============================================================================================") 49 | print("load factor\tnum buckets\tbuild rate(M/s)\t\tsearch rate(M/s)\tsearch rate bulk(M/s)") 50 | print("===============================================================================================") 51 | for pair in tabular_data: 52 | print("%.2f\t\t%d\t\t%.3f\t\t%.3f\t\t%.3f" % (pair[0], pair[4], pair[1], pair[2], pair[3])) 53 | 54 | def analyze_table_size_experiment(input_file): 55 | with open(input_file) as json_file: 56 | data = json.load(json_file) 57 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 58 | trials = data["slab_hash"]["trial"] 59 | 60 | tabular_data = [] 61 | 62 | for trial in trials: 63 | tabular_data.append((trial["num_keys"], 64 | trial['num_buckets'], 65 | trial['load_factor'], 66 | trial["build_rate_mps"], 67 | trial["search_rate_mps"], 68 | trial["search_rate_bulk_mps"])) 69 | 70 | tabular_data.sort() 71 | print("===============================================================================================") 72 | print("Table size experiment:") 73 | print("\tTable's expected chain length is fixed, and total number of elements is variable") 74 | print("\tExpected chain length = %.2f\n" % trials[0]['exp_chain_length']) 75 | print("\t%.2f of %d queries exist in the data structure" % (trials[0]['query_ratio'], trials[0]['num_queries'])) 76 | print("===============================================================================================") 77 | print("(num keys, num buckets, load factor)\tbuild rate(M/s)\t\tsearch rate(M/s)\tsearch rate bulk(M/s)") 78 | print("===============================================================================================") 79 | for pair in tabular_data: 80 | print("(%d, %d, %.2f)\t\t\t%10.3f\t\t%.3f\t\t%.3f" % (pair[0], pair[1], pair[2], pair[3], pair[4], pair[5])) 81 | 82 | def analyze_concurrent_experiment(input_file): 83 | with open(input_file) as json_file: 84 | data = json.load(json_file) 85 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 86 | trials 
= data["slab_hash"]["trial"] 87 | 88 | tabular_data = [] 89 | 90 | for trial in trials: 91 | tabular_data.append((trial["init_load_factor"], 92 | trial['final_load_factor'], 93 | trial['num_buckets'], 94 | trial["initial_rate_mps"], 95 | trial["concurrent_rate_mps"])) 96 | 97 | tabular_data.sort() 98 | print("===============================================================================================") 99 | print("Concurrent experiment:") 100 | print("\tvariable load factor, fixed number of elements") 101 | print("\tOperation ratio: (insert, delete, search) = (%.2f, %.2f, [%.2f, %.2f])" % (trials[0]['insert_ratio'], trials[0]['delete_ratio'], trials[0]['search_exist_ratio'], trials[0]['search_non_exist_ratio'])) 102 | print("===============================================================================================") 103 | print("batch_size = %d, init num batches = %d, final num batches = %d" % (trials[0]['batch_size'], trials[0]['num_init_batches'], trials[0]['num_batches'])) 104 | print("===============================================================================================") 105 | print("init lf\t\tfinal lf\tnum buckets\tinit build rate(M/s)\tconcurrent rate(Mop/s)") 106 | print("===============================================================================================") 107 | for pair in tabular_data: 108 | print("%.2f\t\t%.2f\t\t%d\t\t%.3f\t\t%.3f" % (pair[0], pair[1], pair[2], pair[3], pair[4])) 109 | 110 | def main(argv): 111 | input_file = '' 112 | try: 113 | opts, args = getopt.getopt(argv, "hvi:m:d:", ["help", "verbose", "ifile=", "mode=", "device="]) 114 | except getopt.GetOptError: 115 | print("bencher.py -i -m -d -v") 116 | sys.exit(2) 117 | 118 | for opt, arg in opts: 119 | if opt == '-h': 120 | print("===============================================================================================") 121 | print("-i/--ifile: \t\t Input file (optional)") 122 | print("-m/--mode: \t\t Experiment mode:") 123 | print("\t\t\t\t\t 0: singleton experiment") 124 | print("\t\t\t\t\t 1: load factor experiment") 125 | print("\t\t\t\t\t 2: variable sized table experiment") 126 | print("\t\t\t\t\t 3: concurrent experiment") 127 | print("-v/--verbose") 128 | print("===============================================================================================") 129 | sys.exit() 130 | else: 131 | if opt in ("-i", "--ifile"): 132 | input_file = arg 133 | print("input file: " + input_file) 134 | if opt in ("-m", "--mode"): 135 | mode = int(arg) 136 | if opt in ("-d", "--device"): 137 | device_idx = int(arg) 138 | if opt in ("-v", "--verbose"): 139 | verbose = True 140 | else: 141 | verbose = False 142 | 143 | # if the input file is not given, proper experiments should be run first 144 | if not input_file: 145 | # == creating a folder to store results 146 | out_directory = "../build/bench_result/" 147 | if (not os.path.isdir(out_directory)): 148 | os.mkdir(out_directory) 149 | 150 | # == running benchmark files 151 | bin_file = "../build/bin/benchmark" 152 | if(not os.path.exists(bin_file)): 153 | raise Exception("binary file " + bin_file + " not found!") 154 | 155 | # creating a unique name for the file 156 | cur_time_list = str(datetime.datetime.now()).split() 157 | out_file_name = "out" 158 | for s in cur_time_list: 159 | out_file_name += ("_" + s) 160 | 161 | out_file_dest = out_directory + out_file_name + ".json" 162 | input_file = out_file_dest # input file for the next step 163 | print("intermediate results stored at: " + out_file_dest) 164 | 165 | print("mode = %d" % 
mode) 166 | if mode == 0: 167 | args = (bin_file, "-mode", str(mode), 168 | "-num_key", str(2**22), 169 | "-expected_chain", str(0.6), 170 | "-device", str(device_idx), 171 | "-filename", out_file_dest, 172 | "-verbose", "1" if verbose else "0") 173 | elif mode == 1: 174 | args = (bin_file, 175 | "-mode", str(mode), 176 | "-num_keys", str(2**22), 177 | "-quary_ratio", str(1.0), 178 | "-device", str(device_idx), 179 | "-lf_bulk_step", str(0.1), 180 | "-lf_bulk_num_sample", str(20), 181 | "-filename", out_file_dest, 182 | "-verbose", "1" if verbose else "0") 183 | elif mode == 2: 184 | args = (bin_file, "-mode", str(mode), 185 | "-nStart", str(18), 186 | "-nEnd", str(23), 187 | "-expected_chain", str(0.6), 188 | "-query_ratio", str(1.0), 189 | "-device", str(device_idx), 190 | "-filename", out_file_dest, 191 | "-verbose", "1" if verbose else "0") 192 | elif mode == 3: 193 | args = (bin_file, "-mode", str(mode), 194 | "-nStart", str(18), 195 | "-nEnd", str(21), 196 | "-num_batch", str(4), 197 | "-init_batch", str(3), 198 | "-lf_conc_step", str(0.1), 199 | "-lf_conc_num_sample", str(10), 200 | "-device", str(device_idx), 201 | "-filename", out_file_dest, 202 | "-verbose", "1" if verbose else "0") 203 | 204 | print(" === Started benchmarking ... ") 205 | 206 | popen = subprocess.Popen(args, stdout = subprocess.PIPE) 207 | popen.wait() 208 | 209 | if verbose: 210 | output = popen.stdout.read() 211 | print(output) 212 | print(" === Done!") 213 | elif not os.path.exists(input_file): 214 | raise Exception("Input file " + input_file + " does not exist!") 215 | 216 | # reading the json files: 217 | if mode == 0: 218 | analyze_singleton_experiment(input_file) 219 | elif mode == 1: 220 | analyze_load_factor_experiment(input_file) 221 | elif mode == 2: 222 | analyze_table_size_experiment(input_file) 223 | elif mode == 3: 224 | analyze_concurrent_experiment(input_file) 225 | else: 226 | print("Invalid mode entered") 227 | sys.exit(2) 228 | 229 | if __name__ == "__main__": 230 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /bench/main_benchmarks.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "CommandLine.h" 28 | #include "experiments.cuh" 29 | 30 | int main(int argc, char** argv) { 31 | int mode = 0; // type of experiment 32 | uint32_t num_iter = 1; 33 | bool verbose = false; 34 | int device_idx = 0; 35 | uint32_t num_keys = (1 << 22); 36 | uint32_t n_start = 20; // num_keys = 1 << n_start; 37 | uint32_t n_end = 20; 38 | uint32_t num_queries = num_keys; 39 | float expected_chain = 0.6f; 40 | float existing_ratio = 1.0f; 41 | 42 | // mode 1 parameters: 43 | float lf_bulk_step = 0.1f; 44 | uint32_t lf_bulk_num_sample = 10; 45 | 46 | // mode 3 parameters: 47 | int num_batch = 2; 48 | int init_batch = 1; 49 | float insert_ratio = 0.1f; 50 | float delete_ratio = 0.1f; 51 | float search_exist_ratio = 0.4f; 52 | float lf_conc_step = 0.1f; 53 | int lf_conc_num_sample = 10; 54 | 55 | if (cmdOptionExists(argv, argc + argv, "-mode")) 56 | mode = atoi(getCmdOption(argv, argv + argc, "-mode")); 57 | if (cmdOptionExists(argv, argc + argv, "-num_key")) 58 | num_keys = atoi(getCmdOption(argv, argv + argc, "-num_key")); 59 | if (cmdOptionExists(argv, argc + argv, "-num_query")) 60 | num_queries = atoi(getCmdOption(argv, argv + argc, "-num_query")); 61 | else { 62 | num_queries = num_keys; 63 | } 64 | 65 | if (cmdOptionExists(argv, argc + argv, "-expected_chain")) 66 | expected_chain = atof(getCmdOption(argv, argv + argc, "-expected_chain")); 67 | assert(expected_chain > 0); 68 | if (cmdOptionExists(argv, argc + argv, "-query_ratio")) 69 | existing_ratio = atof(getCmdOption(argv, argv + argc, "-query_ratio")); 70 | if (cmdOptionExists(argv, argc + argv, "-verbose")) { 71 | verbose = (atoi(getCmdOption(argv, argv + argc, "-verbose")) != 0) ? 
true : false; 72 | } 73 | 74 | if (cmdOptionExists(argv, argc + argv, "-device")) 75 | device_idx = atoi(getCmdOption(argv, argv + argc, "-device")); 76 | if (cmdOptionExists(argv, argc + argv, "-iter")) { 77 | num_iter = atoi(getCmdOption(argv, argv + argc, "-iter")); 78 | } 79 | if (cmdOptionExists(argv, argc + argv, "-nStart")) { 80 | n_start = atoi(getCmdOption(argv, argv + argc, "-nStart")); 81 | // for mode 0: 82 | num_keys = (1 << n_start); 83 | num_queries = num_keys; 84 | } 85 | if (cmdOptionExists(argv, argc + argv, "-nEnd")) { 86 | n_end = atoi(getCmdOption(argv, argv + argc, "-nEnd")); 87 | } 88 | if (cmdOptionExists(argv, argc + argv, "-num_batch")) { 89 | num_batch = atoi(getCmdOption(argv, argv + argc, "-num_batch")); 90 | } 91 | if (cmdOptionExists(argv, argc + argv, "-init_batch")) { 92 | init_batch = atoi(getCmdOption(argv, argv + argc, "-init_batch")); 93 | } 94 | if (cmdOptionExists(argv, argc + argv, "-insert_ratio")) 95 | insert_ratio = atof(getCmdOption(argv, argv + argc, "-insert_ratio")); 96 | if (cmdOptionExists(argv, argc + argv, "-delete_ratio")) 97 | delete_ratio = atof(getCmdOption(argv, argv + argc, "-delete_ratio")); 98 | if (cmdOptionExists(argv, argc + argv, "-search_exist_ratio")) 99 | search_exist_ratio = 100 | atof(getCmdOption(argv, argv + argc, "-search_exist_ratio")); 101 | if (cmdOptionExists(argv, argc + argv, "-lf_conc_step")) 102 | lf_conc_step = atof(getCmdOption(argv, argv + argc, "-lf_conc_step")); 103 | if (cmdOptionExists(argv, argc + argv, "-lf_conc_num_sample")) 104 | lf_conc_num_sample = 105 | atoi(getCmdOption(argv, argv + argc, "-lf_conc_num_sample")); 106 | if (cmdOptionExists(argv, argc + argv, "-lf_bulk_step")) 107 | lf_bulk_step = atof(getCmdOption(argv, argv + argc, "-lf_bulk_step")); 108 | if (cmdOptionExists(argv, argc + argv, "-lf_bulk_num_sample")) 109 | lf_bulk_num_sample = 110 | atoi(getCmdOption(argv, argv + argc, "-lf_bulk_num_sample")); 111 | 112 | // input argument for the file to be used for storing the results 113 | std::string filename(""); 114 | if (cmdOptionExists(argv, argc + argv, "-filename")) { 115 | filename.append(getCmdOption(argv, argv + argc, "-filename")); 116 | std::cout << filename << std::endl; 117 | } else { 118 | // setting the filename to be the current time: 119 | filename += "bench/"; 120 | auto time = std::time(nullptr); 121 | auto tm = *std::localtime(&time); 122 | std::ostringstream temp; 123 | temp << std::put_time(&tm, "%d-%m-%Y_%H-%M-%S"); 124 | filename += ("out_" + temp.str() + ".json"); 125 | } 126 | 127 | //========= 128 | int devCount; 129 | cudaGetDeviceCount(&devCount); 130 | cudaDeviceProp devProp; 131 | if (devCount) { 132 | cudaSetDevice(device_idx); // be changed later 133 | cudaGetDeviceProperties(&devProp, device_idx); 134 | } 135 | printf("Device: %s\n", devProp.name); 136 | printf("Experiment mode = %d\n", mode); 137 | 138 | using KeyT = uint32_t; 139 | using ValueT = uint32_t; 140 | 141 | // running the actual experiment 142 | switch (mode) { 143 | case 0: // singleton experiment 144 | singleton_experiment(num_keys, num_queries, expected_chain, 145 | filename, device_idx, existing_ratio, 146 | num_iter, 147 | /*run_cudpp = */ false, verbose); 148 | break; 149 | case 1: // bulk build, num elements fixed, load factor changing 150 | load_factor_bulk_experiment( 151 | num_keys, num_queries, filename, device_idx, existing_ratio, num_iter, 152 | false, lf_bulk_num_sample, lf_bulk_step); 153 | break; 154 | case 2: // bulk build, load factor fixed, num elements changing 155 | 
build_search_bulk_experiment( 156 | 1 << n_start, 1 << n_end, filename, expected_chain, existing_ratio, 157 | device_idx, num_iter, 158 | /* run_cudpp = */ false, 159 | /* verbose = */ verbose); 160 | break; 161 | case 3: // concurrent experiment: 162 | concurrent_batched_op_load_factor_experiment( 163 | /*max_num_keys = */ 1 << n_end, /*batch_size = */ 1 << n_start, 164 | num_batch, init_batch, insert_ratio, delete_ratio, search_exist_ratio, 165 | filename, device_idx, lf_conc_step, lf_conc_num_sample, num_iter, 166 | verbose); 167 | break; 168 | default: 169 | std::cout << "Error: invalid mode." << std::endl; 170 | break; 171 | } 172 | } -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CUHFILES 2 | slab_hash_global.cuh 3 | slab_hash.cuh) 4 | 5 | cuda_add_library(slab_hash STATIC 6 | ${CUHFILES} 7 | ${CUFILES} 8 | OPTIONS ${GENCODE} ${VERBOSE_PTXAS}) 9 | -------------------------------------------------------------------------------- /src/CommandLine.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | inline char* getCmdOption(char** begin, char** end, const std::string& option) { 20 | char** itr = std::find(begin, end, option); 21 | if (itr != end && ++itr != end) { 22 | return *itr; 23 | } 24 | return 0; 25 | } 26 | 27 | inline bool cmdOptionExists(char** begin, char** end, const std::string& option) { 28 | return std::find(begin, end, option) != end; 29 | } -------------------------------------------------------------------------------- /src/concurrent_map/cmap_class.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include 19 | 20 | /* 21 | * This is the main class that will be shallowly copied into the device to be 22 | * used at runtime. 
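A minimal stand-alone sketch (hypothetical, not from the repository) of how the two CommandLine.h helpers above are typically combined; note that the header itself assumes <algorithm> and <string> are already included by its user:

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <string>
#include "CommandLine.h"

int main(int argc, char** argv) {
  // e.g., ./example -mode 2 -filename out.json
  int mode = 0;
  std::string filename("out.json");
  if (cmdOptionExists(argv, argv + argc, "-mode"))
    mode = std::atoi(getCmdOption(argv, argv + argc, "-mode"));
  if (cmdOptionExists(argv, argv + argc, "-filename"))
    filename = getCmdOption(argv, argv + argc, "-filename");
  std::printf("mode = %d, filename = %s\n", mode, filename.c_str());
  return 0;
}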
This class does not own the allocated memory on the gpu 23 | * (i.e., d_table_) 24 | */ 25 | template 26 | class GpuSlabHashContext { 27 | public: 28 | // fixed known parameters: 29 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 30 | static constexpr uint32_t WARP_WIDTH_ = 32; 31 | 32 | #pragma hd_warning_disable 33 | __host__ __device__ GpuSlabHashContext() 34 | : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) {} 35 | 36 | #pragma hd_warning_disable 37 | __host__ __device__ GpuSlabHashContext( 38 | GpuSlabHashContext& rhs) { 39 | num_buckets_ = rhs.getNumBuckets(); 40 | hash_x_ = rhs.getHashX(); 41 | hash_y_ = rhs.getHashY(); 42 | d_table_ = rhs.getDeviceTablePointer(); 43 | global_allocator_ctx_ = rhs.getAllocatorContext(); 44 | } 45 | 46 | #pragma hd_warning_disable 47 | __host__ __device__ ~GpuSlabHashContext() {} 48 | 49 | static size_t getSlabUnitSize() { 50 | return sizeof(typename ConcurrentMapT::SlabTypeT); 51 | } 52 | 53 | static std::string getSlabHashTypeName() { 54 | return ConcurrentMapT::getTypeName(); 55 | } 56 | 57 | __host__ void initParameters(const uint32_t num_buckets, 58 | const uint32_t hash_x, 59 | const uint32_t hash_y, 60 | int8_t* d_table, 61 | AllocatorContextT* allocator_ctx) { 62 | num_buckets_ = num_buckets; 63 | hash_x_ = hash_x; 64 | hash_y_ = hash_y; 65 | d_table_ = 66 | reinterpret_cast::SlabTypeT*>(d_table); 67 | global_allocator_ctx_ = *allocator_ctx; 68 | } 69 | 70 | __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { 71 | return global_allocator_ctx_; 72 | } 73 | 74 | __device__ __host__ __forceinline__ typename ConcurrentMapT::SlabTypeT* 75 | getDeviceTablePointer() { 76 | return d_table_; 77 | } 78 | 79 | __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; } 80 | __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; } 81 | __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; } 82 | 83 | __device__ __host__ __forceinline__ uint32_t computeBucket(const KeyT& key) const { 84 | return (((hash_x_ ^ key) + hash_y_) % PRIME_DIVISOR_) % num_buckets_; 85 | } 86 | 87 | // threads in a warp cooperate with each other to insert key-value pairs 88 | // into the slab hash 89 | __device__ __forceinline__ void insertPair(bool& to_be_inserted, 90 | const uint32_t& laneId, 91 | const KeyT& myKey, 92 | const ValueT& myValue, 93 | const uint32_t bucket_id, 94 | AllocatorContextT& local_allocator_context); 95 | 96 | // threads in a warp cooperate with each other to insert a unique key (and its value) 97 | // into the slab hash 98 | __device__ __forceinline__ bool insertPairUnique( 99 | bool& to_be_inserted, 100 | const uint32_t& laneId, 101 | const KeyT& myKey, 102 | const ValueT& myValue, 103 | const uint32_t bucket_id, 104 | AllocatorContextT& local_allocator_context); 105 | 106 | // threads in a warp cooperate with each other to search for keys 107 | // if found, it returns the corresponding value, else SEARCH_NOT_FOUND 108 | // is returned 109 | __device__ __forceinline__ void searchKey(bool& to_be_searched, 110 | const uint32_t& laneId, 111 | const KeyT& myKey, 112 | ValueT& myValue, 113 | const uint32_t bucket_id); 114 | 115 | // threads in a warp cooperate with each other to search for keys. 
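A quick note on computeBucket() above: hash_x_ and hash_y_ define a simple universal hash h(k) = (((x ^ k) + y) mod p) mod num_buckets, where p = 4294967291 is the largest prime below 2^32. A host-side sketch with arbitrary stand-in constants (the real values are drawn from std::mt19937 in the GpuSlabHash constructor):

#include <cstdint>
#include <cstdio>
int main() {
  const uint32_t p = 4294967291u;   // PRIME_DIVISOR_, largest prime below 2^32
  const uint32_t x = 0x9E3779B1u;   // stand-in for hash_x_ (normally random)
  const uint32_t y = 0x85EBCA77u;   // stand-in for hash_y_
  const uint32_t num_buckets = 1u << 20;
  const uint32_t key = 42u;
  const uint32_t bucket = (((x ^ key) + y) % p) % num_buckets;  // same expression as computeBucket()
  std::printf("key %u -> bucket %u\n", key, bucket);
  return 0;
}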
116 | // the main difference with above function is that it is assumed all 117 | // threads have something to search for 118 | __device__ __forceinline__ void searchKeyBulk(const uint32_t& laneId, 119 | const KeyT& myKey, 120 | ValueT& myValue, 121 | const uint32_t bucket_id); 122 | 123 | // threads in a warp cooperate with each other to count keys 124 | __device__ __forceinline__ void countKey(bool& to_be_searched, 125 | const uint32_t& laneId, 126 | const KeyT& myKey, 127 | uint32_t& myCount, 128 | const uint32_t bucket_id); 129 | 130 | // all threads within a warp cooperate with each other to delete 131 | // keys 132 | __device__ __forceinline__ bool deleteKey(bool& to_be_deleted, 133 | const uint32_t& laneId, 134 | const KeyT& myKey, 135 | const uint32_t bucket_id); 136 | 137 | __device__ __forceinline__ uint32_t* getPointerFromSlab( 138 | const SlabAddressT& slab_address, 139 | const uint32_t laneId) { 140 | return global_allocator_ctx_.getPointerFromSlab(slab_address, laneId); 141 | } 142 | 143 | __device__ __forceinline__ uint32_t* getPointerFromBucket(const uint32_t bucket_id, 144 | const uint32_t laneId) { 145 | return reinterpret_cast(d_table_) + 146 | bucket_id * ConcurrentMapT::BASE_UNIT_SIZE + laneId; 147 | } 148 | 149 | private: 150 | // this function should be operated in a warp-wide fashion 151 | // TODO: add required asserts to make sure this is true in tests/debugs 152 | __device__ __forceinline__ SlabAllocAddressT allocateSlab(const uint32_t& laneId) { 153 | return global_allocator_ctx_.warpAllocate(laneId); 154 | } 155 | 156 | __device__ __forceinline__ SlabAllocAddressT 157 | allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { 158 | return local_allocator_ctx.warpAllocate(laneId); 159 | } 160 | 161 | // a thread-wide function to free the slab that was just allocated 162 | __device__ __forceinline__ void freeSlab(const SlabAllocAddressT slab_ptr) { 163 | global_allocator_ctx_.freeUntouched(slab_ptr); 164 | } 165 | 166 | // === members: 167 | uint32_t num_buckets_; 168 | uint32_t hash_x_; 169 | uint32_t hash_y_; 170 | typename ConcurrentMapT::SlabTypeT* d_table_; 171 | // a copy of dynamic allocator's context to be used on the GPU 172 | AllocatorContextT global_allocator_ctx_; 173 | }; 174 | 175 | /* 176 | * This class owns the allocated memory for the hash table 177 | */ 178 | template 179 | class GpuSlabHash { 180 | private: 181 | // fixed known parameters: 182 | static constexpr uint32_t BLOCKSIZE_ = 128; 183 | static constexpr uint32_t WARP_WIDTH_ = 32; 184 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 185 | 186 | struct hash_function { 187 | uint32_t x; 188 | uint32_t y; 189 | } hf_; 190 | 191 | // total number of buckets (slabs) for this hash table 192 | uint32_t num_buckets_; 193 | 194 | // a raw pointer to the initial allocated memory for all buckets 195 | int8_t* d_table_; 196 | size_t slab_unit_size_; // size of each slab unit in bytes (might differ 197 | // based on the type) 198 | 199 | // slab hash context, contains everything that a GPU application needs to be 200 | // able to use this data structure 201 | GpuSlabHashContext gpu_context_; 202 | 203 | // const pointer to an allocator that all instances of slab hash are going to 204 | // use. 
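As an aside on getPointerFromBucket() above: the base slabs form one contiguous array of 32-word (128-byte) units, so lane i of a probing warp reads word i of its bucket's base slab. A small sketch of the pointer arithmetic, assuming BASE_UNIT_SIZE is 32 words, consistent with the 128-byte slab assertion in the GpuSlabHash constructor:

#include <cstdint>
#include <cstdio>
int main() {
  const uint32_t BASE_UNIT_SIZE = 32;          // assumed: 32 words = 128 bytes per slab
  const uint32_t bucket_id = 7, laneId = 31;   // lane 31 holds the next-slab pointer
  const uint32_t word_index = bucket_id * BASE_UNIT_SIZE + laneId;
  std::printf("bucket %u, lane %u -> word %u (byte offset %u)\n",
              bucket_id, laneId, word_index, word_index * 4u);
  return 0;
}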
The allocator itself is not owned by this class 205 | DynamicAllocatorT* dynamic_allocator_; 206 | uint32_t device_idx_; 207 | 208 | public: 209 | GpuSlabHash(const uint32_t num_buckets, 210 | DynamicAllocatorT* dynamic_allocator, 211 | uint32_t device_idx, 212 | const time_t seed = 0, 213 | const bool identity_hash = false) 214 | : num_buckets_(num_buckets) 215 | , d_table_(nullptr) 216 | , slab_unit_size_(0) 217 | , dynamic_allocator_(dynamic_allocator) 218 | , device_idx_(device_idx) { 219 | assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash."); 220 | assert(sizeof(typename ConcurrentMapT::SlabTypeT) == 221 | (WARP_WIDTH_ * sizeof(uint32_t)) && 222 | "A single slab on a ConcurrentMap should be 128 bytes"); 223 | int32_t devCount = 0; 224 | CHECK_CUDA_ERROR(cudaGetDeviceCount(&devCount)); 225 | assert(device_idx_ < devCount); 226 | 227 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 228 | 229 | slab_unit_size_ = 230 | GpuSlabHashContext::getSlabUnitSize(); 231 | 232 | // allocating initial buckets: 233 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); 234 | 235 | CHECK_CUDA_ERROR(cudaMemset(d_table_, 0xFF, slab_unit_size_ * num_buckets_)); 236 | 237 | // creating a random number generator: 238 | if (!identity_hash) { 239 | std::mt19937 rng(seed ? seed : time(0)); 240 | hf_.x = rng() % PRIME_DIVISOR_; 241 | if (hf_.x < 1) 242 | hf_.x = 1; 243 | hf_.y = rng() % PRIME_DIVISOR_; 244 | } else { 245 | hf_ = {0u, 0u}; 246 | } 247 | 248 | // initializing the gpu_context_: 249 | gpu_context_.initParameters( 250 | num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr()); 251 | } 252 | 253 | ~GpuSlabHash() { 254 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 255 | CHECK_CUDA_ERROR(cudaFree(d_table_)); 256 | } 257 | 258 | // returns some debug information about the slab hash 259 | std::string to_string(); 260 | double computeLoadFactor(int flag); 261 | 262 | void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys); 263 | void buildBulkWithUniqueKeys(KeyT* d_key, ValueT* d_value, uint32_t num_keys); 264 | void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries); 265 | void searchBulk(KeyT* d_query, ValueT* d_result, uint32_t num_queries); 266 | void deleteIndividual(KeyT* d_key, uint32_t num_keys); 267 | void batchedOperation(KeyT* d_key, ValueT* d_result, uint32_t num_ops); 268 | void countIndividual(KeyT* d_query, uint32_t* d_count, uint32_t num_queries); 269 | }; -------------------------------------------------------------------------------- /src/concurrent_map/cmap_implementation.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
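A minimal host-side sketch of driving the GpuSlabHash interface declared above. The template argument list and the default-constructed DynamicAllocatorT are assumptions (the real aliases live in slab_hash.cuh and slab_hash_global.cuh), and the device buffers are taken as given:

#include <cstdint>
#include "slab_hash.cuh"

void example_build_and_search(uint32_t* d_keys, uint32_t* d_values,
                              uint32_t* d_queries, uint32_t* d_results,
                              uint32_t num_keys, uint32_t num_queries) {
  const uint32_t num_buckets = (num_keys + 14) / 15;  // roughly 15 pairs fit in one 128-byte slab
  DynamicAllocatorT allocator;                        // assumed default-constructible
  GpuSlabHash<uint32_t, uint32_t, SlabHashTypeT::ConcurrentMap> hash_table(
      num_buckets, &allocator, /*device_idx =*/0);
  hash_table.buildBulk(d_keys, d_values, num_keys);                // bulk insert
  hash_table.searchIndividual(d_queries, d_results, num_queries);  // warp-cooperative search
}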
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | void GpuSlabHash::buildBulk( 21 | KeyT* d_key, 22 | ValueT* d_value, 23 | uint32_t num_keys) { 24 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 25 | // calling the kernel for bulk build: 26 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 27 | build_table_kernel 28 | <<>>(d_key, d_value, num_keys, gpu_context_); 29 | } 30 | template 31 | void GpuSlabHash::buildBulkWithUniqueKeys( 32 | KeyT* d_key, 33 | ValueT* d_value, 34 | uint32_t num_keys) { 35 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 36 | // calling the kernel for bulk build: 37 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 38 | build_table_with_unique_keys_kernel 39 | <<>>(d_key, d_value, num_keys, gpu_context_); 40 | } 41 | template 42 | void GpuSlabHash::searchIndividual( 43 | KeyT* d_query, 44 | ValueT* d_result, 45 | uint32_t num_queries) { 46 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 47 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 48 | search_table 49 | <<>>(d_query, d_result, num_queries, gpu_context_); 50 | } 51 | 52 | template 53 | void GpuSlabHash::searchBulk( 54 | KeyT* d_query, 55 | ValueT* d_result, 56 | uint32_t num_queries) { 57 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 58 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 59 | search_table_bulk 60 | <<>>(d_query, d_result, num_queries, gpu_context_); 61 | } 62 | 63 | template 64 | void GpuSlabHash::countIndividual( 65 | KeyT* d_query, 66 | uint32_t* d_count, 67 | uint32_t num_queries) { 68 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 69 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 70 | count_key 71 | <<>>(d_query, d_count, num_queries, gpu_context_); 72 | } 73 | 74 | template 75 | void GpuSlabHash::deleteIndividual( 76 | KeyT* d_key, 77 | uint32_t num_keys) { 78 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 79 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 80 | delete_table_keys 81 | <<>>(d_key, num_keys, gpu_context_); 82 | } 83 | 84 | // perform a batch of (a mixture of) updates/searches 85 | template 86 | void GpuSlabHash::batchedOperation( 87 | KeyT* d_key, 88 | ValueT* d_result, 89 | uint32_t num_ops) { 90 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 91 | const uint32_t num_blocks = (num_ops + BLOCKSIZE_ - 1) / BLOCKSIZE_; 92 | batched_operations 93 | <<>>(d_key, d_result, num_ops, gpu_context_); 94 | } 95 | 96 | template 97 | std::string GpuSlabHash::to_string() { 98 | std::string result; 99 | result += " ==== GpuSlabHash: \n"; 100 | result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; 101 | result += "\t SlabHashType: \t\t " + gpu_context_.getSlabHashTypeName() + "\n"; 102 | result += "\t Number of buckets:\t\t " + std::to_string(num_buckets_) + "\n"; 103 | result += "\t d_table_ address: \t\t " + 104 | std::to_string(reinterpret_cast(static_cast(d_table_))) + 105 | "\n"; 106 | result += "\t hash function = \t\t (" + std::to_string(hf_.x) + ", " + 107 | std::to_string(hf_.y) + ")\n"; 108 | return result; 109 | } 110 | 111 | template 112 | double GpuSlabHash::computeLoadFactor( 113 | int flag = 0) { 114 | uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_]; 115 | uint32_t* d_bucket_pairs_count; 116 | CHECK_CUDA_ERROR( 117 | cudaMalloc((void**)&d_bucket_pairs_count, sizeof(uint32_t) * num_buckets_)); 118 | CHECK_CUDA_ERROR(cudaMemset(d_bucket_pairs_count, 0, sizeof(uint32_t) * num_buckets_)); 119 
| 120 | uint32_t* h_bucket_slabs_count = new uint32_t[num_buckets_]; 121 | uint32_t* d_bucket_slabs_count; 122 | CHECK_CUDA_ERROR( 123 | cudaMalloc((void**)&d_bucket_slabs_count, sizeof(uint32_t) * num_buckets_)); 124 | CHECK_CUDA_ERROR(cudaMemset(d_bucket_slabs_count, 0, sizeof(uint32_t) * num_buckets_)); 125 | 126 | //--------------------------------- 127 | // counting the number of inserted elements: 128 | const uint32_t blocksize = 128; 129 | const uint32_t num_blocks = (num_buckets_ * 32 + blocksize - 1) / blocksize; 130 | bucket_count_kernel<<>>( 131 | gpu_context_, d_bucket_pairs_count, d_bucket_slabs_count, num_buckets_); 132 | CHECK_CUDA_ERROR(cudaMemcpy(h_bucket_pairs_count, 133 | d_bucket_pairs_count, 134 | sizeof(uint32_t) * num_buckets_, 135 | cudaMemcpyDeviceToHost)); 136 | CHECK_CUDA_ERROR(cudaMemcpy(h_bucket_slabs_count, 137 | d_bucket_slabs_count, 138 | sizeof(uint32_t) * num_buckets_, 139 | cudaMemcpyDeviceToHost)); 140 | int total_elements_stored = 0; 141 | int total_slabs_used = 0; 142 | for (int i = 0; i < num_buckets_; i++) { 143 | total_elements_stored += h_bucket_pairs_count[i]; 144 | total_slabs_used += h_bucket_slabs_count[i]; 145 | } 146 | if (flag) { 147 | printf("## Total elements stored: %d (%lu bytes).\n", 148 | total_elements_stored, 149 | total_elements_stored * (sizeof(KeyT) + sizeof(ValueT))); 150 | printf("## Total number of slabs used: %d.\n", total_slabs_used); 151 | } 152 | 153 | // computing load factor 154 | double load_factor = double(total_elements_stored * (sizeof(KeyT) + sizeof(ValueT))) / 155 | double(total_slabs_used * WARP_WIDTH_ * sizeof(uint32_t)); 156 | 157 | if (d_bucket_pairs_count) 158 | CHECK_ERROR(cudaFree(d_bucket_pairs_count)); 159 | if (d_bucket_slabs_count) 160 | CHECK_ERROR(cudaFree(d_bucket_slabs_count)); 161 | delete[] h_bucket_pairs_count; 162 | delete[] h_bucket_slabs_count; 163 | 164 | return load_factor; 165 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/build.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
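To make the load-factor formula in computeLoadFactor() above concrete, a worked example with made-up counts: load factor = stored bytes / slab bytes, where each slab occupies WARP_WIDTH_ * sizeof(uint32_t) = 128 bytes:

#include <cstdio>
int main() {
  const double total_pairs = 4194304.0;  // e.g., 2^22 inserted (uint32_t, uint32_t) pairs
  const double total_slabs = 310000.0;   // slabs counted by bucket_count_kernel (hypothetical)
  const double load_factor = (total_pairs * 8.0) / (total_slabs * 128.0);
  std::printf("load factor = %.3f\n", load_factor);  // ~0.846
  return 0;
}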
15 | */ 16 | 17 | #pragma once 18 | /* 19 | * 20 | */ 21 | template 22 | __global__ void build_table_kernel( 23 | KeyT* d_key, 24 | ValueT* d_value, 25 | uint32_t num_keys, 26 | GpuSlabHashContext slab_hash) { 27 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 28 | uint32_t laneId = threadIdx.x & 0x1F; 29 | 30 | if ((tid - laneId) >= num_keys) { 31 | return; 32 | } 33 | 34 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 35 | local_allocator_ctx.initAllocator(tid, laneId); 36 | 37 | KeyT myKey = 0; 38 | ValueT myValue = 0; 39 | uint32_t myBucket = 0; 40 | bool to_insert = false; 41 | 42 | if (tid < num_keys) { 43 | myKey = d_key[tid]; 44 | myValue = d_value[tid]; 45 | myBucket = slab_hash.computeBucket(myKey); 46 | to_insert = true; 47 | } 48 | 49 | slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); 50 | } 51 | 52 | template 53 | __global__ void build_table_with_unique_keys_kernel( 54 | KeyT* d_key, 55 | ValueT* d_value, 56 | uint32_t num_keys, 57 | GpuSlabHashContext slab_hash) { 58 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 59 | uint32_t laneId = threadIdx.x & 0x1F; 60 | 61 | if ((tid - laneId) >= num_keys) { 62 | return; 63 | } 64 | 65 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 66 | local_allocator_ctx.initAllocator(tid, laneId); 67 | 68 | KeyT myKey = 0; 69 | ValueT myValue = 0; 70 | uint32_t myBucket = 0; 71 | bool to_insert = false; 72 | 73 | if (tid < num_keys) { 74 | myKey = d_key[tid]; 75 | myValue = d_value[tid]; 76 | myBucket = slab_hash.computeBucket(myKey); 77 | to_insert = true; 78 | } 79 | 80 | slab_hash.insertPairUnique( 81 | to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); 82 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/concurrent_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
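The (tid - laneId) >= num_keys test in the build kernels above retires whole warps only: a warp stays resident as long as its first lane still maps below num_keys, so tail lanes with no key of their own keep serving the warp-cooperative insertPair(). A small sketch of the arithmetic (sizes are made up):

#include <cstdio>
int main() {
  const unsigned num_keys = 1000;   // 8 blocks of 128 threads are launched, tids 0..1023
  const unsigned tid = 1010;        // a tail thread in the last warp (tids 992..1023)
  const unsigned laneId = tid & 0x1F;                    // 18 (block size is a multiple of 32)
  const bool warp_returns = (tid - laneId) >= num_keys;  // 992 >= 1000 is false: warp stays
  const bool has_own_key = tid < num_keys;               // false: this lane only helps its warp
  std::printf("warp_returns=%d has_own_key=%d\n", warp_returns, has_own_key);
  return 0;
}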
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __global__ void batched_operations( 21 | uint32_t* d_operations, 22 | uint32_t* d_results, 23 | uint32_t num_operations, 24 | GpuSlabHashContext slab_hash) { 25 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 26 | uint32_t laneId = threadIdx.x & 0x1F; 27 | 28 | if ((tid - laneId) >= num_operations) 29 | return; 30 | 31 | // initializing the memory allocator on each warp: 32 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 33 | local_allocator_ctx.initAllocator(tid, laneId); 34 | 35 | uint32_t myOperation = 0; 36 | uint32_t myKey = 0; 37 | uint32_t myValue = 0; 38 | uint32_t myBucket = 0; 39 | 40 | if (tid < num_operations) { 41 | myOperation = d_operations[tid]; 42 | myKey = myOperation & 0x3FFFFFFF; 43 | myBucket = slab_hash.computeBucket(myKey); 44 | myOperation = myOperation >> 30; 45 | // todo: should be changed to a more general case 46 | myValue = myKey; // for the sake of this benchmark 47 | } 48 | 49 | bool to_insert = (myOperation == 1) ? true : false; 50 | bool to_delete = (myOperation == 2) ? true : false; 51 | bool to_search = (myOperation == 3) ? true : false; 52 | 53 | // first insertions: 54 | slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); 55 | 56 | // second deletions: 57 | slab_hash.deleteKey(to_delete, laneId, myKey, myBucket); 58 | 59 | // finally search queries: 60 | slab_hash.searchKey(to_search, laneId, myKey, myValue, myBucket); 61 | 62 | if (myOperation == 3 && myValue != SEARCH_NOT_FOUND) { 63 | d_results[tid] = myValue; 64 | } 65 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/count_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of California, Davis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
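batched_operations() above decodes each element of d_operations as a 2-bit opcode in bits 31:30 and a key in bits 29:0 (1 = insert, 2 = delete, 3 = search). A hypothetical host-side encoder matching that layout (the enum names are illustrative, not from the repository):

#include <cstdint>
enum : uint32_t { OP_INSERT = 1u, OP_DELETE = 2u, OP_SEARCH = 3u };  // illustrative names

// the kernel recovers myKey = op & 0x3FFFFFFF and myOperation = op >> 30, so:
inline uint32_t encode_operation(uint32_t op, uint32_t key) {
  return (op << 30) | (key & 0x3FFFFFFFu);  // keys must fit in 30 bits for this benchmark
}
// e.g., encode_operation(OP_SEARCH, 12345u) == 0xC0003039u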
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __global__ void count_key( 21 | KeyT* d_queries, 22 | uint32_t* d_counts, 23 | uint32_t num_queries, 24 | GpuSlabHashContext slab_hash) { 25 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 26 | uint32_t laneId = threadIdx.x & 0x1F; 27 | 28 | if ((tid - laneId) >= num_queries) { 29 | return; 30 | } 31 | 32 | KeyT myKey = 0; 33 | uint32_t myCount = 0; 34 | uint32_t myBucket = 0; 35 | bool to_count = false; 36 | 37 | if (tid < num_queries) { 38 | myKey = d_queries[tid]; 39 | myBucket = slab_hash.computeBucket(myKey); 40 | to_count = true; 41 | } 42 | 43 | // count the keys: 44 | slab_hash.countKey(to_count, laneId, myKey, myCount, myBucket); 45 | 46 | // writing back the results: 47 | if (tid < num_queries) { 48 | d_counts[tid] = myCount; 49 | } 50 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/delete_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __global__ void delete_table_keys( 21 | KeyT* d_key_deleted, 22 | uint32_t num_keys, 23 | GpuSlabHashContext slab_hash) { 24 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 25 | uint32_t laneId = threadIdx.x & 0x1F; 26 | 27 | if ((tid - laneId) >= num_keys) { 28 | return; 29 | } 30 | 31 | KeyT myKey = 0; 32 | uint32_t myBucket = 0; 33 | bool to_delete = false; 34 | 35 | if (tid < num_keys) { 36 | myKey = d_key_deleted[tid]; 37 | myBucket = slab_hash.computeBucket(myKey); 38 | to_delete = true; 39 | } 40 | 41 | // delete the keys: 42 | slab_hash.deleteKey(to_delete, laneId, myKey, myBucket); 43 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/misc_kernels.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | /* 20 | * This kernel can be used to compute the total number of elements and the total number of 21 | * slabs per bucket. 
The final results per bucket is stored in d_pairs_count_result and 22 | * d_slabs_count_result arrays respectively 23 | */ 24 | template 25 | __global__ void bucket_count_kernel( 26 | GpuSlabHashContext slab_hash, 27 | uint32_t* d_pairs_count_result, 28 | uint32_t* d_slabs_count_result, 29 | uint32_t num_buckets) { 30 | using SlabHashT = ConcurrentMapT; 31 | // global warp ID 32 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 33 | uint32_t wid = tid >> 5; 34 | // assigning a warp per bucket 35 | if (wid >= num_buckets) { 36 | return; 37 | } 38 | 39 | uint32_t laneId = threadIdx.x & 0x1F; 40 | 41 | // initializing the memory allocator on each warp: 42 | slab_hash.getAllocatorContext().initAllocator(tid, laneId); 43 | 44 | uint32_t pairs_count = 0; 45 | uint32_t slabs_count = 1; 46 | 47 | uint32_t src_unit_data = *slab_hash.getPointerFromBucket(wid, laneId); 48 | 49 | pairs_count += __popc(__ballot_sync(0xFFFFFFFF, src_unit_data != EMPTY_KEY) & 50 | SlabHashT::REGULAR_NODE_KEY_MASK); 51 | uint32_t next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 52 | 53 | while (next != SlabHashT::EMPTY_INDEX_POINTER) { 54 | // counting pairs 55 | src_unit_data = *slab_hash.getPointerFromSlab(next, laneId); 56 | pairs_count += __popc(__ballot_sync(0xFFFFFFFF, src_unit_data != EMPTY_KEY) & 57 | SlabHashT::REGULAR_NODE_KEY_MASK); 58 | next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 59 | // counting slabs 60 | slabs_count++; 61 | } 62 | // writing back the results: 63 | if (laneId == 0) { 64 | d_pairs_count_result[wid] = pairs_count; 65 | d_slabs_count_result[wid] = slabs_count; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/concurrent_map/device/search_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
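The two per-bucket arrays filled by bucket_count_kernel() above can be reduced on the host into the chain statistics the benchmarks report; a sketch, assuming both arrays were copied back with cudaMemcpy as in computeLoadFactor():

#include <cstdint>
#include <cstdio>

void report_chain_stats(const uint32_t* h_pairs_per_bucket,   // copied back from the GPU
                        const uint32_t* h_slabs_per_bucket,
                        uint32_t num_buckets) {
  uint64_t total_pairs = 0, total_slabs = 0;
  for (uint32_t b = 0; b < num_buckets; b++) {
    total_pairs += h_pairs_per_bucket[b];
    total_slabs += h_slabs_per_bucket[b];
  }
  // every bucket owns at least its base slab, so slabs/bucket is the average chain length:
  std::printf("avg pairs/bucket = %.2f, avg chain length = %.2f\n",
              double(total_pairs) / num_buckets, double(total_slabs) / num_buckets);
}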
15 | */ 16 | 17 | #pragma once 18 | 19 | //=== Individual search kernel: 20 | template 21 | __global__ void search_table( 22 | KeyT* d_queries, 23 | ValueT* d_results, 24 | uint32_t num_queries, 25 | GpuSlabHashContext slab_hash) { 26 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 27 | uint32_t laneId = threadIdx.x & 0x1F; 28 | 29 | if ((tid - laneId) >= num_queries) { 30 | return; 31 | } 32 | 33 | KeyT myQuery = 0; 34 | ValueT myResult = static_cast(SEARCH_NOT_FOUND); 35 | uint32_t myBucket = 0; 36 | bool to_search = false; 37 | if (tid < num_queries) { 38 | myQuery = d_queries[tid]; 39 | myBucket = slab_hash.computeBucket(myQuery); 40 | to_search = true; 41 | } 42 | 43 | slab_hash.searchKey(to_search, laneId, myQuery, myResult, myBucket); 44 | 45 | // writing back the results: 46 | if (tid < num_queries) { 47 | d_results[tid] = myResult; 48 | } 49 | } 50 | 51 | //=== Bulk search kernel: 52 | template 53 | __global__ void search_table_bulk( 54 | KeyT* d_queries, 55 | ValueT* d_results, 56 | uint32_t num_queries, 57 | GpuSlabHashContext slab_hash) { 58 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 59 | uint32_t laneId = threadIdx.x & 0x1F; 60 | 61 | if ((tid - laneId) >= num_queries) { 62 | return; 63 | } 64 | 65 | KeyT myQuery = 0; 66 | ValueT myResult = static_cast(SEARCH_NOT_FOUND); 67 | uint32_t myBucket = 0; 68 | if (tid < num_queries) { 69 | myQuery = d_queries[tid]; 70 | myBucket = slab_hash.computeBucket(myQuery); 71 | } 72 | 73 | slab_hash.searchKeyBulk(laneId, myQuery, myResult, myBucket); 74 | 75 | // writing back the results: 76 | if (tid < num_queries) { 77 | d_results[tid] = myResult; 78 | } 79 | } -------------------------------------------------------------------------------- /src/concurrent_map/warp/count.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of California, Davis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | //================================================ 20 | // Individual Count Unit: 21 | //================================================ 22 | template 23 | __device__ __forceinline__ void 24 | GpuSlabHashContext::countKey( 25 | bool& to_be_searched, 26 | const uint32_t& laneId, 27 | const KeyT& myKey, 28 | uint32_t& myCount, 29 | const uint32_t bucket_id) { 30 | using SlabHashT = ConcurrentMapT; 31 | uint32_t work_queue = 0; 32 | uint32_t last_work_queue = work_queue; 33 | uint32_t next = SlabHashT::A_INDEX_POINTER; 34 | 35 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_searched))) { 36 | next = (last_work_queue != work_queue) ? 
SlabHashT::A_INDEX_POINTER 37 | : next; 38 | uint32_t src_lane = __ffs(work_queue) - 1; 39 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 40 | uint32_t wanted_key = __shfl_sync(0xFFFFFFFF, 41 | *reinterpret_cast( 42 | reinterpret_cast(&myKey)), 43 | src_lane, 44 | 32); 45 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 46 | ? *(getPointerFromBucket(src_bucket, laneId)) 47 | : *(getPointerFromSlab(next, laneId)); 48 | const int wanted_key_count = __popc(__ballot_sync(0xFFFFFFFF, src_unit_data == wanted_key) & 49 | SlabHashT::REGULAR_NODE_KEY_MASK); 50 | 51 | if(laneId == src_lane) //count 52 | myCount += wanted_key_count; 53 | 54 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); //iterate 55 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER){ 56 | if(laneId == src_lane){ 57 | to_be_searched = false; 58 | } 59 | } 60 | else{ 61 | next = next_ptr; 62 | } 63 | 64 | last_work_queue = work_queue; 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /src/concurrent_map/warp/delete.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __device__ __forceinline__ bool 21 | GpuSlabHashContext::deleteKey( 22 | bool& to_be_deleted, 23 | const uint32_t& laneId, 24 | const KeyT& myKey, 25 | const uint32_t bucket_id) { 26 | // delete the first instance of key 27 | 28 | using SlabHashT = ConcurrentMapT; 29 | uint32_t work_queue = 0; 30 | uint32_t last_work_queue = 0; 31 | uint32_t next = SlabHashT::A_INDEX_POINTER; 32 | bool successful_deletion = false; 33 | 34 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_deleted))) { 35 | // to know whether it is a base node, or a regular node 36 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 37 | : next; // a successfull insertion in the warp 38 | uint32_t src_lane = __ffs(work_queue) - 1; 39 | uint32_t src_key = __shfl_sync(0xFFFFFFFF, 40 | *reinterpret_cast( 41 | reinterpret_cast(&myKey)), 42 | src_lane, 43 | 32); 44 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 45 | // starting with a base node OR regular node: 46 | // need to define different masks to extract super block index, memory block 47 | // index, and the memory unit index 48 | 49 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 50 | ? 
*(getPointerFromBucket(src_bucket, laneId)) 51 | : *(getPointerFromSlab(next, laneId)); 52 | 53 | // looking for the item to be deleted: 54 | uint32_t isFound = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) & 55 | SlabHashT::REGULAR_NODE_KEY_MASK; 56 | 57 | if (isFound == 0) { // no matching slot found: 58 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 59 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 60 | // not found: 61 | if (laneId == src_lane) 62 | to_be_deleted = false; 63 | } else { 64 | next = next_ptr; 65 | } 66 | } else { // The wanted key found: 67 | int dest_lane = __ffs(isFound & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 68 | if (laneId == src_lane) { 69 | uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 70 | ? getPointerFromBucket(src_bucket, dest_lane) 71 | : getPointerFromSlab(next, dest_lane); 72 | 73 | uint64_t old_pair = atomicExch((unsigned long long int*)p, EMPTY_PAIR_64); 74 | uint32_t deleted_key = static_cast(old_pair); 75 | successful_deletion = deleted_key == src_key; 76 | to_be_deleted = false; 77 | } 78 | } 79 | last_work_queue = work_queue; 80 | } 81 | return successful_deletion; 82 | } -------------------------------------------------------------------------------- /src/concurrent_map/warp/insert.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | /* 20 | * each thread inserts a key-value pair into the hash table 21 | * it is assumed all threads within a warp are present and collaborating with 22 | * each other with a warp-cooperative work sharing (WCWS) strategy. 23 | */ 24 | template 25 | __device__ __forceinline__ void 26 | GpuSlabHashContext::insertPair( 27 | bool& to_be_inserted, 28 | const uint32_t& laneId, 29 | const KeyT& myKey, 30 | const ValueT& myValue, 31 | const uint32_t bucket_id, 32 | AllocatorContextT& local_allocator_ctx) { 33 | using SlabHashT = ConcurrentMapT; 34 | uint32_t work_queue = 0; 35 | uint32_t last_work_queue = 0; 36 | uint32_t next = SlabHashT::A_INDEX_POINTER; 37 | 38 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_inserted))) { 39 | // to know whether it is a base node, or a regular node 40 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 41 | : next; // a successfull insertion in the warp 42 | uint32_t src_lane = __ffs(work_queue) - 1; 43 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 44 | 45 | uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 46 | ? 
*(getPointerFromBucket(src_bucket, laneId)) 47 | : *(getPointerFromSlab(next, laneId)); 48 | uint64_t old_key_value_pair = 0; 49 | 50 | uint32_t isEmpty = (__ballot_sync(0xFFFFFFFF, src_unit_data == EMPTY_KEY)) & 51 | SlabHashT::REGULAR_NODE_KEY_MASK; 52 | if (isEmpty == 0) { // no empty slot available: 53 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 54 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 55 | // allocate a new node: 56 | uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); 57 | 58 | // TODO: experiment if it's better to use lane 0 instead 59 | if (laneId == 31) { 60 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 61 | ? getPointerFromBucket(src_bucket, 31) 62 | : getPointerFromSlab(next, 31); 63 | 64 | uint32_t temp = 65 | atomicCAS((unsigned int*)p, SlabHashT::EMPTY_INDEX_POINTER, new_node_ptr); 66 | // check whether it was successful, and 67 | // free the allocated memory otherwise 68 | if (temp != SlabHashT::EMPTY_INDEX_POINTER) { 69 | freeSlab(new_node_ptr); 70 | } 71 | } 72 | } else { 73 | next = next_ptr; 74 | } 75 | } else { // there is an empty slot available 76 | int dest_lane = __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 77 | if (laneId == src_lane) { 78 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 79 | ? getPointerFromBucket(src_bucket, dest_lane) 80 | : getPointerFromSlab(next, dest_lane); 81 | 82 | old_key_value_pair = 83 | atomicCAS((unsigned long long int*)p, 84 | EMPTY_PAIR_64, 85 | ((uint64_t)(*reinterpret_cast( 86 | reinterpret_cast(&myValue))) 87 | << 32) | 88 | *reinterpret_cast( 89 | reinterpret_cast(&myKey))); 90 | if (old_key_value_pair == EMPTY_PAIR_64) 91 | to_be_inserted = false; // succesfful insertion 92 | } 93 | } 94 | last_work_queue = work_queue; 95 | } 96 | } 97 | 98 | /* 99 | * each thread inserts a unique key (and its value) into the hash table 100 | * if the key already exist in the hash table, it only keeps the first instance 101 | * it is assumed all threads within a warp are present and collaborating with 102 | * each other with a warp-cooperative work sharing (WCWS) strategy. 103 | * returns true only if a new key was inserted into the hash table 104 | */ 105 | template 106 | __device__ __forceinline__ bool 107 | GpuSlabHashContext::insertPairUnique( 108 | bool& to_be_inserted, 109 | const uint32_t& laneId, 110 | const KeyT& myKey, 111 | const ValueT& myValue, 112 | const uint32_t bucket_id, 113 | AllocatorContextT& local_allocator_ctx) { 114 | using SlabHashT = ConcurrentMapT; 115 | uint32_t work_queue = 0; 116 | uint32_t last_work_queue = 0; 117 | uint32_t next = SlabHashT::A_INDEX_POINTER; 118 | bool new_insertion = false; 119 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_inserted))) { 120 | // to know whether it is a base node, or a regular node 121 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 122 | : next; // a successful insertion in the warp 123 | uint32_t src_lane = __ffs(work_queue) - 1; 124 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 125 | 126 | uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 127 | ? 
*(getPointerFromBucket(src_bucket, laneId)) 128 | : *(getPointerFromSlab(next, laneId)); 129 | uint64_t old_key_value_pair = 0; 130 | 131 | uint32_t isEmpty = (__ballot_sync(0xFFFFFFFF, src_unit_data == EMPTY_KEY)) & 132 | SlabHashT::REGULAR_NODE_KEY_MASK; 133 | 134 | uint32_t src_key = __shfl_sync(0xFFFFFFFF, myKey, src_lane, 32); 135 | uint32_t isExisting = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) & 136 | SlabHashT::REGULAR_NODE_KEY_MASK; 137 | if (isExisting) { // key exist in the hash table 138 | if (laneId == src_lane) 139 | to_be_inserted = false; 140 | } else { 141 | if (isEmpty == 0) { // no empty slot available: 142 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 143 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 144 | // allocate a new node: 145 | uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); 146 | 147 | if (laneId == 31) { 148 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 149 | ? getPointerFromBucket(src_bucket, 31) 150 | : getPointerFromSlab(next, 31); 151 | 152 | uint32_t temp = 153 | atomicCAS((unsigned int*)p, SlabHashT::EMPTY_INDEX_POINTER, new_node_ptr); 154 | // check whether it was successful, and 155 | // free the allocated memory otherwise 156 | if (temp != SlabHashT::EMPTY_INDEX_POINTER) { 157 | freeSlab(new_node_ptr); 158 | } 159 | } 160 | } else { 161 | next = next_ptr; 162 | } 163 | } else { // there is an empty slot available 164 | int dest_lane = __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 165 | if (laneId == src_lane) { 166 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 167 | ? getPointerFromBucket(src_bucket, dest_lane) 168 | : getPointerFromSlab(next, dest_lane); 169 | 170 | old_key_value_pair = 171 | atomicCAS((unsigned long long int*)p, 172 | EMPTY_PAIR_64, 173 | ((uint64_t)(*reinterpret_cast( 174 | reinterpret_cast(&myValue))) 175 | << 32) | 176 | *reinterpret_cast( 177 | reinterpret_cast(&myKey))); 178 | if (old_key_value_pair == EMPTY_PAIR_64) { 179 | to_be_inserted = false; // successful insertion 180 | new_insertion = true; 181 | } 182 | } 183 | } 184 | } 185 | last_work_queue = work_queue; 186 | } 187 | return new_insertion; 188 | } -------------------------------------------------------------------------------- /src/concurrent_map/warp/search.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
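Both insert paths above commit a pair with a single 64-bit atomicCAS on a packed word: value in the upper 32 bits, key in the lower 32 bits (for 32-bit KeyT/ValueT). A host-side sketch of the packing and unpacking:

#include <cstdint>
#include <cstdio>
int main() {
  const uint32_t key = 255u, value = 2748u;
  const uint64_t packed = (uint64_t(value) << 32) | key;             // what the atomicCAS installs
  std::printf("packed = 0x%016llx\n", (unsigned long long)packed);   // 0x00000abc000000ff
  std::printf("key = %u, value = %u\n",
              uint32_t(packed), uint32_t(packed >> 32));             // key = 255, value = 2748
  return 0;
}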
15 | */ 16 | 17 | #pragma once 18 | 19 | //================================================ 20 | // Individual Search Unit: 21 | //================================================ 22 | template 23 | __device__ __forceinline__ void 24 | GpuSlabHashContext::searchKey( 25 | bool& to_be_searched, 26 | const uint32_t& laneId, 27 | const KeyT& myKey, 28 | ValueT& myValue, 29 | const uint32_t bucket_id) { 30 | using SlabHashT = ConcurrentMapT; 31 | uint32_t work_queue = 0; 32 | uint32_t last_work_queue = work_queue; 33 | uint32_t next = SlabHashT::A_INDEX_POINTER; 34 | 35 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_searched))) { 36 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 37 | : next; // a successfull insertion in the warp 38 | uint32_t src_lane = __ffs(work_queue) - 1; 39 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 40 | uint32_t wanted_key = __shfl_sync(0xFFFFFFFF, 41 | *reinterpret_cast( 42 | reinterpret_cast(&myKey)), 43 | src_lane, 44 | 32); 45 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 46 | ? *(getPointerFromBucket(src_bucket, laneId)) 47 | : *(getPointerFromSlab(next, laneId)); 48 | int found_lane = __ffs(__ballot_sync(0xFFFFFFFF, src_unit_data == wanted_key) & 49 | SlabHashT::REGULAR_NODE_KEY_MASK) - 50 | 1; 51 | if (found_lane < 0) { // not found 52 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 53 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // not found 54 | if (laneId == src_lane) { 55 | myValue = static_cast(SEARCH_NOT_FOUND); 56 | to_be_searched = false; 57 | } 58 | } else { 59 | next = next_ptr; 60 | } 61 | } else { // found the key: 62 | uint32_t found_value = __shfl_sync(0xFFFFFFFF, src_unit_data, found_lane + 1, 32); 63 | if (laneId == src_lane) { 64 | myValue = *reinterpret_cast( 65 | reinterpret_cast(&found_value)); 66 | to_be_searched = false; 67 | } 68 | } 69 | last_work_queue = work_queue; 70 | } 71 | } 72 | 73 | //================================================ 74 | // Bulk Search Unit: 75 | //================================================ 76 | template 77 | __device__ __forceinline__ void 78 | GpuSlabHashContext::searchKeyBulk( 79 | const uint32_t& laneId, 80 | const KeyT& myKey, 81 | ValueT& myValue, 82 | const uint32_t bucket_id) { 83 | using SlabHashT = ConcurrentMapT; 84 | #pragma unroll 85 | for (int src_lane = 0; src_lane < WARP_WIDTH; src_lane++) { 86 | bool is_top_of_list = true; 87 | uint32_t next_ptr = SlabHashT::EMPTY_INDEX_POINTER; 88 | uint32_t found_lane_plus_1 = 0; 89 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 90 | uint32_t wanted_key = __shfl_sync(0xFFFFFFFF, 91 | *reinterpret_cast( 92 | reinterpret_cast(&myKey)), 93 | src_lane, 94 | 32); 95 | 96 | do { 97 | const uint32_t src_unit_data = (is_top_of_list) 98 | ? *(getPointerFromBucket(src_bucket, laneId)) 99 | : *(getPointerFromSlab(next_ptr, laneId)); 100 | 101 | next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 102 | // if found_lane_plus_1 == 0, then the query is not found 103 | found_lane_plus_1 = __ffs(__ballot_sync(0xFFFFFFFF, src_unit_data == wanted_key) & 104 | SlabHashT::REGULAR_NODE_KEY_MASK); 105 | // values are stored at (found_value + 1) 106 | uint32_t found_value = 107 | __shfl_sync(0xFFFFFFFF, src_unit_data, found_lane_plus_1, 32); 108 | // The responsible thread stores the result if it is found correctly 109 | myValue = ((found_lane_plus_1 != 0) && (src_lane == laneId)) 110 | ? 
*reinterpret_cast( 111 | reinterpret_cast(&found_value)) 112 | : myValue; 113 | is_top_of_list = false; 114 | } while ((next_ptr != SlabHashT::EMPTY_INDEX_POINTER) && (found_lane_plus_1 == 0)); 115 | } 116 | } -------------------------------------------------------------------------------- /src/concurrent_set/cset_class.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | /* 20 | * This is the main class that will be shallowly copied into the device to be 21 | * used at runtime. This class does not own the allocated memory on the gpu 22 | * (i.e., d_table_) 23 | */ 24 | template 25 | class GpuSlabHashContext { 26 | public: 27 | // fixed known parameters: 28 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 29 | static constexpr uint32_t WARP_WIDTH_ = 32; 30 | 31 | #pragma hd_warning_disable 32 | __device__ __host__ GpuSlabHashContext() 33 | : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) { 34 | // a single slab on a ConcurrentSet should be 128 bytes 35 | } 36 | 37 | #pragma hd_warning_disable 38 | __host__ __device__ GpuSlabHashContext( 39 | GpuSlabHashContext& rhs) { 40 | num_buckets_ = rhs.getNumBuckets(); 41 | hash_x_ = rhs.getHashX(); 42 | hash_y_ = rhs.getHashY(); 43 | d_table_ = rhs.getDeviceTablePointer(); 44 | global_allocator_ctx_ = rhs.getAllocatorContext(); 45 | } 46 | 47 | #pragma hd_warning_disable 48 | __host__ __device__ ~GpuSlabHashContext() {} 49 | 50 | static size_t getSlabUnitSize() { 51 | return sizeof(typename ConcurrentSetT::SlabTypeT); 52 | } 53 | 54 | static std::string getSlabHashTypeName() { return ConcurrentSetT::getTypeName(); } 55 | 56 | __host__ void initParameters(const uint32_t num_buckets, 57 | const uint32_t hash_x, 58 | const uint32_t hash_y, 59 | int8_t* d_table, 60 | AllocatorContextT* allocator_ctx) { 61 | num_buckets_ = num_buckets; 62 | hash_x_ = hash_x; 63 | hash_y_ = hash_y; 64 | d_table_ = reinterpret_cast::SlabTypeT*>(d_table); 65 | global_allocator_ctx_ = *allocator_ctx; 66 | } 67 | 68 | __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { 69 | return global_allocator_ctx_; 70 | } 71 | 72 | __device__ __host__ __forceinline__ typename ConcurrentSetT::SlabTypeT* 73 | getDeviceTablePointer() { 74 | return d_table_; 75 | } 76 | 77 | __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; } 78 | __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; } 79 | __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; } 80 | 81 | __device__ __host__ __forceinline__ uint32_t computeBucket(const KeyT& key) const { 82 | return (((hash_x_ ^ key) + hash_y_) % PRIME_DIVISOR_) % num_buckets_; 83 | } 84 | 85 | // threads in a warp cooperate with each other to insert keys 86 | // into the slab hash set 87 | __device__ __forceinline__ bool insertKey(bool& 
to_be_inserted, 88 | const uint32_t& laneId, 89 | const KeyT& myKey, 90 | const uint32_t bucket_id, 91 | AllocatorContextT& local_allocator_context); 92 | 93 | // threads in a warp cooeparte with each other to search for keys 94 | // if found, it returns the true, else false 95 | __device__ __forceinline__ bool searchKey(bool& to_be_searched, 96 | const uint32_t& laneId, 97 | const KeyT& myKey, 98 | const uint32_t bucket_id); 99 | 100 | // threads in a warp cooperate with each other to search for keys. 101 | // the main difference with above function is that it is assumed all 102 | // threads have something to search for (no to_be_searched argument) 103 | __device__ __forceinline__ bool searchKeyBulk(const uint32_t& laneId, 104 | const KeyT& myKey, 105 | const uint32_t bucket_id); 106 | 107 | __device__ __forceinline__ uint32_t* getPointerFromSlab( 108 | const SlabAddressT& slab_address, 109 | const uint32_t laneId) { 110 | return global_allocator_ctx_.getPointerFromSlab(slab_address, laneId); 111 | } 112 | 113 | __device__ __forceinline__ uint32_t* getPointerFromBucket(const uint32_t bucket_id, 114 | const uint32_t laneId) { 115 | return reinterpret_cast(d_table_) + 116 | bucket_id * ConcurrentSetT::BASE_UNIT_SIZE + laneId; 117 | } 118 | 119 | private: 120 | // this function should be operated in a warp-wide fashion 121 | // TODO: add required asserts to make sure this is true in tests/debugs 122 | __device__ __forceinline__ SlabAllocAddressT allocateSlab(const uint32_t& laneId) { 123 | return global_allocator_ctx_.warpAllocate(laneId); 124 | } 125 | 126 | __device__ __forceinline__ SlabAllocAddressT 127 | allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { 128 | return local_allocator_ctx.warpAllocate(laneId); 129 | } 130 | 131 | // a thread-wide function to free the slab that was just allocated 132 | __device__ __forceinline__ void freeSlab(const SlabAllocAddressT slab_ptr) { 133 | global_allocator_ctx_.freeUntouched(slab_ptr); 134 | } 135 | 136 | // === members: 137 | uint32_t num_buckets_; 138 | uint32_t hash_x_; 139 | uint32_t hash_y_; 140 | typename ConcurrentSetT::SlabTypeT* d_table_; 141 | // a copy of dynamic allocator's context to be used on the GPU 142 | AllocatorContextT global_allocator_ctx_; 143 | }; 144 | 145 | /* 146 | * This class owns the allocated memory for the hash table 147 | */ 148 | template 149 | class GpuSlabHash { 150 | private: 151 | // fixed known parameters: 152 | static constexpr uint32_t BLOCKSIZE_ = 128; 153 | static constexpr uint32_t WARP_WIDTH_ = 32; 154 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 155 | 156 | struct hash_function { 157 | uint32_t x; 158 | uint32_t y; 159 | } hf_; 160 | 161 | // total number of buckets (slabs) for this hash table 162 | uint32_t num_buckets_; 163 | 164 | // a raw pointer to the initial allocated memory for all buckets 165 | int8_t* d_table_; 166 | size_t slab_unit_size_; // size of each slab unit in bytes (might differ 167 | // based on the type) 168 | 169 | // slab hash context, contains everything that a GPU application needs to be 170 | // able to use this data structure 171 | GpuSlabHashContext gpu_context_; 172 | 173 | // const pointer to an allocator that all instances of slab hash are going to 174 | // use. 
The allocator itself is not owned by this class 175 | DynamicAllocatorT* dynamic_allocator_; 176 | uint32_t device_idx_; 177 | 178 | public: 179 | GpuSlabHash(const uint32_t num_buckets, 180 | DynamicAllocatorT* dynamic_allocator, 181 | uint32_t device_idx, 182 | const time_t seed = 0, 183 | const bool identity_hash = false) 184 | : num_buckets_(num_buckets) 185 | , d_table_(nullptr) 186 | , slab_unit_size_(0) 187 | , dynamic_allocator_(dynamic_allocator) 188 | , device_idx_(device_idx) { 189 | assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash."); 190 | assert(sizeof(typename ConcurrentSetT::SlabTypeT) == 191 | (WARP_WIDTH_ * sizeof(uint32_t)) && 192 | "A single slab on a ConcurrentMap should be 128 bytes"); 193 | int32_t devCount = 0; 194 | CHECK_CUDA_ERROR(cudaGetDeviceCount(&devCount)); 195 | assert(device_idx_ < devCount); 196 | 197 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 198 | 199 | slab_unit_size_ = 200 | GpuSlabHashContext::getSlabUnitSize(); 201 | 202 | // allocating initial buckets: 203 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); 204 | 205 | CHECK_CUDA_ERROR(cudaMemset(d_table_, 0xFF, slab_unit_size_ * num_buckets_)); 206 | 207 | // creating a random number generator: 208 | if (!identity_hash) { 209 | std::mt19937 rng(seed ? seed : time(0)); 210 | hf_.x = rng() % PRIME_DIVISOR_; 211 | if (hf_.x < 1) 212 | hf_.x = 1; 213 | hf_.y = rng() % PRIME_DIVISOR_; 214 | } else { 215 | hf_ = {0u, 0u}; 216 | } 217 | 218 | // initializing the gpu_context_: 219 | gpu_context_.initParameters( 220 | num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr()); 221 | } 222 | 223 | ~GpuSlabHash() { 224 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 225 | CHECK_CUDA_ERROR(cudaFree(d_table_)); 226 | } 227 | 228 | // returns some debug information about the slab hash 229 | std::string to_string(); 230 | double computeLoadFactor(int flag) {} 231 | GpuSlabHashContext& getSlabHashContext() { 232 | return gpu_context_; 233 | } 234 | 235 | void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys); 236 | void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries); 237 | void searchBulk(KeyT* d_query, ValueT* d_result, uint32_t num_queries) {} 238 | void deleteIndividual(KeyT* d_key, uint32_t num_keys) {} 239 | }; -------------------------------------------------------------------------------- /src/concurrent_set/cset_helper_kernels.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
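 *
 * Note on the kernels below: they follow the warp-cooperative convention used
 * throughout SlabHash. A warp only exits as a whole, via
 * "if ((tid - laneId) >= num_keys) return;", so lanes whose tid is past the end
 * stay resident (with to_insert / to_search left false) and can still take part
 * in the ballots and shuffles issued on behalf of their warp-mates. Before any
 * insertion, each thread also clones the allocator context and calls
 * initAllocator(tid, laneId), so the memory allocator is set up on each warp
 * before any slab is requested.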
15 | */ 16 | 17 | #pragma once 18 | namespace cset { 19 | template 20 | __global__ void build_table_kernel( 21 | KeyT* d_key, 22 | uint32_t num_keys, 23 | GpuSlabHashContext slab_hash) { 24 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 25 | uint32_t laneId = threadIdx.x & 0x1F; 26 | 27 | if ((tid - laneId) >= num_keys) { 28 | return; 29 | } 30 | 31 | // initializing the memory allocator on each warp: 32 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 33 | local_allocator_ctx.initAllocator(tid, laneId); 34 | 35 | KeyT myKey = 0; 36 | uint32_t myBucket = 0; 37 | bool to_insert = false; 38 | 39 | if (tid < num_keys) { 40 | myKey = d_key[tid]; 41 | myBucket = slab_hash.computeBucket(myKey); 42 | to_insert = true; 43 | } 44 | 45 | slab_hash.insertKey(to_insert, laneId, myKey, myBucket, local_allocator_ctx); 46 | } 47 | 48 | //=== Individual search kernel: 49 | template 50 | __global__ void search_table( 51 | KeyT* d_queries, 52 | KeyT* d_results, 53 | uint32_t num_queries, 54 | GpuSlabHashContext slab_hash) { 55 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 56 | uint32_t laneId = threadIdx.x & 0x1F; 57 | 58 | if ((tid - laneId) >= num_queries) { 59 | return; 60 | } 61 | 62 | KeyT myQuery = 0; 63 | uint32_t myBucket = 0; 64 | bool to_search = false; 65 | if (tid < num_queries) { 66 | myQuery = d_queries[tid]; 67 | myBucket = slab_hash.computeBucket(myQuery); 68 | to_search = true; 69 | } 70 | 71 | bool myResult = slab_hash.searchKey(to_search, laneId, myQuery, myBucket); 72 | 73 | // writing back the results: 74 | if (tid < num_queries) { 75 | d_results[tid] = myResult ? myQuery : SEARCH_NOT_FOUND; 76 | } 77 | } 78 | }; // namespace cset -------------------------------------------------------------------------------- /src/concurrent_set/cset_implementation.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
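 *
 * Note on the host-side wrappers below: each one selects the device, computes
 * num_blocks = (n + BLOCKSIZE_ - 1) / BLOCKSIZE_, and launches the matching
 * kernel with num_blocks blocks of BLOCKSIZE_ threads, passing the shallow
 * gpu_context_ by value. A minimal host-side usage sketch follows; the exact
 * template arguments were inferred, so treat it as illustrative rather than
 * exact:
 *
 *   DynamicAllocatorT allocator;
 *   GpuSlabHash<uint32_t, uint32_t, SlabHashTypeT::ConcurrentSet>
 *       cset(num_buckets, &allocator, 0, 1);     // device_idx = 0, seed = 1
 *   cset.buildBulk(d_keys, nullptr, num_keys);   // values are unused for a set
 *   cset.searchIndividual(d_queries, d_results, num_queries);
 *   // d_results[i] holds the query key if present, SEARCH_NOT_FOUND otherwise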
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | void GpuSlabHash::buildBulk( 21 | KeyT* d_key, 22 | ValueT* d_value, 23 | uint32_t num_keys) { 24 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 25 | // calling the kernel for bulk build: 26 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 27 | cset::build_table_kernel 28 | <<>>(d_key, num_keys, gpu_context_); 29 | } 30 | 31 | template 32 | void GpuSlabHash::searchIndividual( 33 | KeyT* d_query, 34 | ValueT* d_result, 35 | uint32_t num_queries) { 36 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 37 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 38 | cset::search_table 39 | <<>>(d_query, d_result, num_queries, gpu_context_); 40 | } 41 | 42 | template 43 | std::string GpuSlabHash::to_string() { 44 | std::string result; 45 | result += " ==== GpuSlabHash: \n"; 46 | result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; 47 | result += "\t SlabHashType: \t\t " + gpu_context_.getSlabHashTypeName() + "\n"; 48 | result += "\t Number of buckets:\t\t " + std::to_string(num_buckets_) + "\n"; 49 | result += "\t d_table_ address: \t\t " + 50 | std::to_string(reinterpret_cast(static_cast(d_table_))) + 51 | "\n"; 52 | result += "\t hash function = \t\t (" + std::to_string(hf_.x) + ", " + 53 | std::to_string(hf_.y) + ")\n"; 54 | return result; 55 | } 56 | -------------------------------------------------------------------------------- /src/concurrent_set/cset_warp_operations.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __device__ __forceinline__ bool 21 | GpuSlabHashContext::insertKey( 22 | bool& to_be_inserted, 23 | const uint32_t& laneId, 24 | const KeyT& myKey, 25 | const uint32_t bucket_id, 26 | AllocatorContextT& local_allocator_ctx) { 27 | using SlabHashT = ConcurrentSetT; 28 | uint32_t work_queue = 0; 29 | uint32_t last_work_queue = 0; 30 | uint32_t next = SlabHashT::A_INDEX_POINTER; 31 | bool new_insertion = false; 32 | 33 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_inserted))) { 34 | // to know whether it is a base node, or a regular node 35 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 36 | : next; // a successfull insertion in the warp 37 | uint32_t src_lane = __ffs(work_queue) - 1; 38 | KeyT src_key = __shfl_sync(0xFFFFFFFF, myKey, src_lane, 32); 39 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 40 | 41 | uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 42 | ? 
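    // (Added note) For the concurrent set, each lane again reads one 32-bit unit of
    // the slab; findKeyOrEmptyPerWarp() returns the first lane holding either the
    // query key or EMPTY_KEY. The elected lane then commits with a 32-bit
    // atomicCAS(p, EMPTY_KEY, key): the insertion counts as new only when the CAS
    // actually observed EMPTY_KEY, finding the slot already holding this same key
    // simply retires the work item, and any other outcome causes a retry.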
*getPointerFromBucket(src_bucket, laneId) 43 | : *getPointerFromSlab(next, laneId); 44 | 45 | uint32_t old_key = 0; 46 | 47 | // looking for the same key (if it exists), or an empty spot: 48 | int32_t dest_lane = SlabHash_NS::findKeyOrEmptyPerWarp>( 49 | src_key, src_unit_data); 50 | 51 | if (dest_lane == -1) { // key not found and/or no empty slot available: 52 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 53 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 54 | // allocate a new node: 55 | uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); 56 | 57 | if (laneId == 31) { 58 | uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 59 | ? getPointerFromBucket(src_bucket, 31) 60 | : getPointerFromSlab(next, 31); 61 | 62 | uint32_t temp = 63 | atomicCAS((unsigned int*)p, SlabHashT::EMPTY_INDEX_POINTER, new_node_ptr); 64 | // check whether it was successful, and 65 | // free the allocated memory otherwise 66 | if (temp != SlabHashT::EMPTY_INDEX_POINTER) 67 | freeSlab(new_node_ptr); 68 | } 69 | } else { 70 | next = next_ptr; 71 | } 72 | } else { // either the key is found, or there is an empty slot available 73 | if (laneId == src_lane) { 74 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 75 | ? getPointerFromBucket(src_bucket, dest_lane) 76 | : getPointerFromSlab(next, dest_lane); 77 | 78 | old_key = atomicCAS((unsigned int*)p, 79 | EMPTY_KEY, 80 | *reinterpret_cast( 81 | reinterpret_cast(&myKey))); 82 | new_insertion = (old_key == EMPTY_KEY); 83 | if (new_insertion || (old_key == src_key)) { 84 | to_be_inserted = false; // succesful insertion 85 | } 86 | } 87 | } 88 | last_work_queue = work_queue; 89 | } 90 | return new_insertion; 91 | } 92 | 93 | // ======== 94 | template 95 | __device__ __forceinline__ bool 96 | GpuSlabHashContext::searchKey( 97 | bool& to_be_searched, 98 | const uint32_t& laneId, 99 | const KeyT& myKey, 100 | const uint32_t bucket_id) { 101 | bool myResult = false; 102 | using SlabHashT = ConcurrentSetT; 103 | uint32_t work_queue = 0; 104 | uint32_t last_work_queue = work_queue; 105 | uint32_t next = SlabHashT::A_INDEX_POINTER; 106 | 107 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_searched))) { 108 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 109 | : next; // a successfull insertion in the warp 110 | uint32_t src_lane = __ffs(work_queue) - 1; 111 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 112 | KeyT wanted_key = __shfl_sync(0xFFFFFFFF, myKey, src_lane, 32); 113 | 114 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 115 | ? 
*getPointerFromBucket(src_bucket, laneId) 116 | : *getPointerFromSlab(next, laneId); 117 | 118 | int32_t found_lane = SlabHash_NS::findKeyPerWarp>( 119 | wanted_key, src_unit_data); 120 | 121 | if (found_lane < 0) { // not found 122 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 123 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // not found 124 | if (laneId == src_lane) { 125 | to_be_searched = false; 126 | } 127 | } else { 128 | next = next_ptr; 129 | } 130 | } else { // found the key: 131 | if (laneId == src_lane) { 132 | to_be_searched = false; 133 | myResult = true; 134 | } 135 | } 136 | last_work_queue = work_queue; 137 | } 138 | return myResult; 139 | } 140 | 141 | template 142 | __device__ __forceinline__ bool 143 | GpuSlabHashContext::searchKeyBulk( 144 | const uint32_t& laneId, 145 | const KeyT& myKey, 146 | const uint32_t bucket_id) {} -------------------------------------------------------------------------------- /src/gpu_hash_table.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "slab_hash.cuh" 19 | 20 | /* 21 | * This class acts as a helper class to simplify simulations around different 22 | * kinds of slab hash implementations 23 | */ 24 | template 25 | class gpu_hash_table { 26 | private: 27 | uint32_t max_keys_; 28 | uint32_t num_buckets_; 29 | int64_t seed_; 30 | bool req_values_; 31 | bool identity_hash_; 32 | 33 | public: 34 | // Slab hash invariant 35 | GpuSlabHash* slab_hash_; 36 | 37 | // the dynamic allocator that is being used for slab hash 38 | DynamicAllocatorT* dynamic_allocator_; 39 | 40 | uint32_t device_idx_; 41 | 42 | // main arrays to hold keys, values, queries, results, etc. 
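  // (Added note) gpu_hash_table owns these device-side staging buffers (sized
  // for max_keys_), the dynamic slab allocator, and the GpuSlabHash instance;
  // each hash_* method copies host data in, times the corresponding slab-hash
  // operation with CUDA events, and returns the elapsed GPU time in
  // milliseconds. A minimal usage sketch (types chosen for illustration):
  //
  //   gpu_hash_table<uint32_t, uint32_t, SlabHashTypeT::ConcurrentMap>
  //       table(max_keys, num_buckets, 0, 1);   // device_idx = 0, seed = 1
  //   float build_ms  = table.hash_build(h_keys.data(), h_vals.data(), n);
  //   float search_ms = table.hash_search(h_queries.data(), h_results.data(), q);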
43 | KeyT* d_key_; 44 | ValueT* d_value_; 45 | KeyT* d_query_; 46 | ValueT* d_result_; 47 | uint32_t* d_count_; 48 | 49 | gpu_hash_table(uint32_t max_keys, 50 | uint32_t num_buckets, 51 | const uint32_t device_idx, 52 | const int64_t seed, 53 | const bool req_values = true, 54 | const bool identity_hash = false, 55 | const bool verbose = false) 56 | : max_keys_(max_keys) 57 | , num_buckets_(num_buckets) 58 | , seed_(seed) 59 | , req_values_(req_values) 60 | , slab_hash_(nullptr) 61 | , identity_hash_(identity_hash) 62 | , dynamic_allocator_(nullptr) 63 | , device_idx_(device_idx) { 64 | int32_t devCount = 0; 65 | CHECK_CUDA_ERROR(cudaGetDeviceCount(&devCount)); 66 | assert(device_idx_ < devCount); 67 | 68 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 69 | 70 | // allocating key, value arrays: 71 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_key_, sizeof(KeyT) * max_keys_)); 72 | if (req_values_) { 73 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_value_, sizeof(ValueT) * max_keys_)); 74 | } 75 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); 76 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); 77 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); 78 | 79 | // allocate an initialize the allocator: 80 | dynamic_allocator_ = new DynamicAllocatorT(); 81 | 82 | // slab hash: 83 | slab_hash_ = new GpuSlabHash( 84 | num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_); 85 | if (verbose) { 86 | std::cout << slab_hash_->to_string() << std::endl; 87 | } 88 | } 89 | 90 | ~gpu_hash_table() { 91 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 92 | CHECK_CUDA_ERROR(cudaFree(d_key_)); 93 | if (req_values_) { 94 | CHECK_CUDA_ERROR(cudaFree(d_value_)); 95 | } 96 | CHECK_CUDA_ERROR(cudaFree(d_query_)); 97 | CHECK_CUDA_ERROR(cudaFree(d_result_)); 98 | CHECK_CUDA_ERROR(cudaFree(d_count_)); 99 | 100 | // delete the dynamic allocator: 101 | delete dynamic_allocator_; 102 | 103 | // slab hash: 104 | delete (slab_hash_); 105 | } 106 | 107 | std::string to_string() { return slab_hash_->to_string(); } 108 | float hash_build(KeyT* h_key, ValueT* h_value, uint32_t num_keys) { 109 | // moving key-values to the device: 110 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 111 | CHECK_CUDA_ERROR( 112 | cudaMemcpy(d_key_, h_key, sizeof(KeyT) * num_keys, cudaMemcpyHostToDevice)); 113 | if (req_values_) { 114 | CHECK_CUDA_ERROR(cudaMemcpy( 115 | d_value_, h_value, sizeof(ValueT) * num_keys, cudaMemcpyHostToDevice)); 116 | } 117 | 118 | float temp_time = 0.0f; 119 | 120 | cudaEvent_t start, stop; 121 | cudaEventCreate(&start); 122 | cudaEventCreate(&stop); 123 | 124 | cudaEventRecord(start, 0); 125 | 126 | // calling slab-hash's bulk build procedure: 127 | slab_hash_->buildBulk(d_key_, d_value_, num_keys); 128 | 129 | cudaEventRecord(stop, 0); 130 | cudaEventSynchronize(stop); 131 | cudaEventElapsedTime(&temp_time, start, stop); 132 | 133 | cudaEventDestroy(start); 134 | cudaEventDestroy(stop); 135 | return temp_time; 136 | } 137 | float hash_build_with_unique_keys(KeyT* h_key, ValueT* h_value, uint32_t num_keys) { 138 | // moving key-values to the device: 139 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 140 | CHECK_CUDA_ERROR( 141 | cudaMemcpy(d_key_, h_key, sizeof(KeyT) * num_keys, cudaMemcpyHostToDevice)); 142 | if (req_values_) { 143 | CHECK_CUDA_ERROR(cudaMemcpy( 144 | d_value_, h_value, sizeof(ValueT) * num_keys, cudaMemcpyHostToDevice)); 145 | } 146 | 147 | float temp_time = 0.0f; 148 | 149 | cudaEvent_t start, stop; 150 
| cudaEventCreate(&start); 151 | cudaEventCreate(&stop); 152 | 153 | cudaEventRecord(start, 0); 154 | 155 | // calling slab-hash's bulk build procedure: 156 | slab_hash_->buildBulkWithUniqueKeys(d_key_, d_value_, num_keys); 157 | 158 | cudaEventRecord(stop, 0); 159 | cudaEventSynchronize(stop); 160 | cudaEventElapsedTime(&temp_time, start, stop); 161 | 162 | cudaEventDestroy(start); 163 | cudaEventDestroy(stop); 164 | return temp_time; 165 | } 166 | float hash_search(KeyT* h_query, ValueT* h_result, uint32_t num_queries) { 167 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 168 | CHECK_CUDA_ERROR(cudaMemcpy( 169 | d_query_, h_query, sizeof(KeyT) * num_queries, cudaMemcpyHostToDevice)); 170 | CHECK_CUDA_ERROR(cudaMemset(d_result_, 0xFF, sizeof(ValueT) * num_queries)); 171 | 172 | float temp_time = 0.0f; 173 | 174 | cudaEvent_t start, stop; 175 | cudaEventCreate(&start); 176 | cudaEventCreate(&stop); 177 | cudaEventRecord(start, 0); 178 | 179 | // == calling slab hash's individual search: 180 | slab_hash_->searchIndividual(d_query_, d_result_, num_queries); 181 | //== 182 | 183 | cudaEventRecord(stop, 0); 184 | cudaEventSynchronize(stop); 185 | cudaEventElapsedTime(&temp_time, start, stop); 186 | 187 | cudaEventDestroy(start); 188 | cudaEventDestroy(stop); 189 | 190 | CHECK_CUDA_ERROR(cudaMemcpy( 191 | h_result, d_result_, sizeof(ValueT) * num_queries, cudaMemcpyDeviceToHost)); 192 | cudaDeviceSynchronize(); 193 | return temp_time; 194 | } 195 | float hash_search_bulk(KeyT* h_query, ValueT* h_result, uint32_t num_queries) { 196 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 197 | CHECK_CUDA_ERROR(cudaMemcpy( 198 | d_query_, h_query, sizeof(KeyT) * num_queries, cudaMemcpyHostToDevice)); 199 | CHECK_CUDA_ERROR(cudaMemset(d_result_, 0xFF, sizeof(ValueT) * num_queries)); 200 | 201 | float temp_time = 0.0f; 202 | 203 | cudaEvent_t start, stop; 204 | cudaEventCreate(&start); 205 | cudaEventCreate(&stop); 206 | cudaEventRecord(start, 0); 207 | 208 | //== slab hash's bulk search: 209 | slab_hash_->searchBulk(d_query_, d_result_, num_queries); 210 | //== 211 | 212 | cudaEventRecord(stop, 0); 213 | cudaEventSynchronize(stop); 214 | cudaEventElapsedTime(&temp_time, start, stop); 215 | 216 | cudaEventDestroy(start); 217 | cudaEventDestroy(stop); 218 | 219 | CHECK_CUDA_ERROR(cudaMemcpy( 220 | h_result, d_result_, sizeof(ValueT) * num_queries, cudaMemcpyDeviceToHost)); 221 | cudaDeviceSynchronize(); 222 | return temp_time; 223 | } 224 | 225 | float hash_count(KeyT* h_query, uint32_t* h_count, uint32_t num_queries) { 226 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 227 | CHECK_CUDA_ERROR(cudaMemcpy( 228 | d_query_, h_query, sizeof(KeyT) * num_queries, cudaMemcpyHostToDevice)); 229 | CHECK_CUDA_ERROR(cudaMemset(d_result_, 0x00, sizeof(uint32_t) * num_queries)); 230 | 231 | float temp_time = 0.0f; 232 | 233 | cudaEvent_t start, stop; 234 | cudaEventCreate(&start); 235 | cudaEventCreate(&stop); 236 | cudaEventRecord(start, 0); 237 | 238 | // == calling slab hash's individual count: 239 | slab_hash_->countIndividual(d_query_, d_count_, num_queries); 240 | //== 241 | 242 | cudaEventRecord(stop, 0); 243 | cudaEventSynchronize(stop); 244 | cudaEventElapsedTime(&temp_time, start, stop); 245 | 246 | cudaEventDestroy(start); 247 | cudaEventDestroy(stop); 248 | 249 | CHECK_CUDA_ERROR(cudaMemcpy( 250 | h_count, d_count_, sizeof(uint32_t) * num_queries, cudaMemcpyDeviceToHost)); 251 | cudaDeviceSynchronize(); 252 | return temp_time; 253 | } 254 | 255 | float hash_delete(KeyT* h_key, uint32_t num_keys) { 256 | 
CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 257 | CHECK_CUDA_ERROR( 258 | cudaMemcpy(d_key_, h_key, sizeof(KeyT) * num_keys, cudaMemcpyHostToDevice)); 259 | 260 | float temp_time = 0.0f; 261 | 262 | cudaEvent_t start, stop; 263 | cudaEventCreate(&start); 264 | cudaEventCreate(&stop); 265 | cudaEventRecord(start, 0); 266 | 267 | //=== slab hash's deletion: 268 | slab_hash_->deleteIndividual(d_key_, num_keys); 269 | 270 | cudaEventRecord(stop, 0); 271 | cudaEventSynchronize(stop); 272 | cudaEventElapsedTime(&temp_time, start, stop); 273 | 274 | cudaEventDestroy(start); 275 | cudaEventDestroy(stop); 276 | return temp_time; 277 | } 278 | 279 | float batched_operations(uint32_t* h_batch_op, 280 | uint32_t* h_results, 281 | uint32_t batch_size, 282 | uint32_t batch_id) { 283 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 284 | CHECK_CUDA_ERROR(cudaMemcpy(d_key_ + batch_id * batch_size, 285 | h_batch_op, 286 | sizeof(uint32_t) * batch_size, 287 | cudaMemcpyHostToDevice)); 288 | CHECK_CUDA_ERROR(cudaMemset( 289 | d_result_ + batch_id * batch_size, 0xFF, sizeof(uint32_t) * batch_size)); 290 | 291 | float temp_time = 0.0f; 292 | 293 | cudaEvent_t start, stop; 294 | cudaEventCreate(&start); 295 | cudaEventCreate(&stop); 296 | 297 | cudaEventRecord(start, 0); 298 | slab_hash_->batchedOperation(d_key_ + batch_id * batch_size, d_result_, batch_size); 299 | cudaEventRecord(stop, 0); 300 | cudaEventSynchronize(stop); 301 | cudaEventElapsedTime(&temp_time, start, stop); 302 | 303 | cudaEventDestroy(start); 304 | cudaEventDestroy(stop); 305 | 306 | CHECK_ERROR(cudaMemcpy(h_results + batch_id * batch_size, 307 | d_result_ + batch_id * batch_size, 308 | sizeof(uint32_t) * batch_size, 309 | cudaMemcpyDeviceToHost)); 310 | cudaDeviceSynchronize(); 311 | return temp_time; 312 | } 313 | 314 | float measureLoadFactor(int flag = 0) { return slab_hash_->computeLoadFactor(flag); } 315 | }; -------------------------------------------------------------------------------- /src/slab_hash.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
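 *
 * Note: this is the umbrella header. It pulls in the global definitions, the
 * class declarations, the warp-level member-function implementations, the
 * device kernels, and finally the host-side implementations, in that order, so
 * including this single header (directly or via gpu_hash_table.cuh) is enough
 * to use the slab hash.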
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | // global declarations 25 | #include "slab_hash_global.cuh" 26 | 27 | // global helper methods: 28 | #include "slab_hash_helper_methods.cuh" 29 | 30 | // class declaration: 31 | #include "concurrent_map/cmap_class.cuh" 32 | #include "concurrent_set/cset_class.cuh" 33 | #include "slab_iterator.cuh" 34 | 35 | // warp implementations of member functions: 36 | #include "concurrent_map/warp/delete.cuh" 37 | #include "concurrent_map/warp/insert.cuh" 38 | #include "concurrent_map/warp/search.cuh" 39 | #include "concurrent_map/warp/count.cuh" 40 | 41 | #include "concurrent_set/cset_warp_operations.cuh" 42 | 43 | // helper kernels: 44 | #include "concurrent_map/device/build.cuh" 45 | #include "concurrent_map/device/concurrent_kernel.cuh" 46 | #include "concurrent_map/device/delete_kernel.cuh" 47 | #include "concurrent_map/device/misc_kernels.cuh" 48 | #include "concurrent_map/device/search_kernel.cuh" 49 | #include "concurrent_map/device/count_kernel.cuh" 50 | #include "concurrent_set/cset_helper_kernels.cuh" 51 | 52 | // implementations: 53 | #include "concurrent_map/cmap_implementation.cuh" 54 | #include "concurrent_set/cset_implementation.cuh" -------------------------------------------------------------------------------- /src/slab_hash_global.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "slab_alloc.cuh" 20 | 21 | #define CHECK_CUDA_ERROR(call) \ 22 | do { \ 23 | cudaError_t err = call; \ 24 | if (err != cudaSuccess) { \ 25 | printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 26 | exit(EXIT_FAILURE); \ 27 | } \ 28 | } while (0) 29 | 30 | // internal parameters for slab hash device functions: 31 | static constexpr uint32_t EMPTY_KEY = 0xFFFFFFFF; 32 | static constexpr uint32_t EMPTY_VALUE = 0xFFFFFFFF; 33 | static constexpr uint64_t EMPTY_PAIR_64 = 0xFFFFFFFFFFFFFFFFLL; 34 | static constexpr uint32_t WARP_WIDTH = 32; 35 | static constexpr uint32_t SEARCH_NOT_FOUND = 0xFFFFFFFF; 36 | 37 | // only works with up to 32-bit key/values 38 | template 39 | struct key_value_pair { 40 | KeyT key; 41 | ValueT value; 42 | }; 43 | 44 | template 45 | struct __align__(32) concurrent_slab { 46 | static constexpr uint32_t NUM_ELEMENTS_PER_SLAB = 15u; 47 | key_value_pair data[NUM_ELEMENTS_PER_SLAB]; 48 | uint32_t ptr_index[2]; 49 | }; 50 | 51 | // this slab structure is meant to be used in either concurrent sets, 52 | // or phase-concurrent maps. 53 | // | key 0 | key 1 | key 2 | ... 
| key 30 | next_ptr | 54 | template 55 | struct __align__(32) key_only_slab { 56 | static constexpr uint32_t NUM_ELEMENTS_PER_SLAB = 31u; 57 | KeyT keys[NUM_ELEMENTS_PER_SLAB]; 58 | uint32_t next_ptr_index[1]; 59 | }; 60 | 61 | template 62 | struct __align__(32) phase_concurrent_slab { 63 | static constexpr uint32_t NUM_ELEMENTS_PER_SLAB = 31u; 64 | // main slab (128 bytes), contain keys 65 | key_only_slab keys; 66 | 67 | // value storage: 68 | ValueT values[NUM_ELEMENTS_PER_SLAB]; 69 | }; 70 | 71 | /* 72 | * Different types of slab hash: 73 | * 1. Concurrent map: it assumes that all operations can be performed 74 | * concurrently 75 | * 2. phase-concurrent map: supports concurrent updates, and concurrent 76 | * searches, but not a mixture of both 77 | */ 78 | enum class SlabHashTypeT { ConcurrentMap, ConcurrentSet, PhaseConcurrentMap }; 79 | 80 | template 81 | class ConcurrentMapT { 82 | public: 83 | // fixed parameters for the data structure 84 | static constexpr uint32_t A_INDEX_POINTER = 0xFFFFFFFE; 85 | static constexpr uint32_t EMPTY_INDEX_POINTER = 0xFFFFFFFF; 86 | static constexpr uint32_t BASE_UNIT_SIZE = 32; 87 | static constexpr uint32_t REGULAR_NODE_ADDRESS_MASK = 0x30000000; 88 | static constexpr uint32_t REGULAR_NODE_DATA_MASK = 0x3FFFFFFF; 89 | static constexpr uint32_t REGULAR_NODE_KEY_MASK = 0x15555555; 90 | 91 | using SlabTypeT = concurrent_slab; 92 | 93 | static std::string getTypeName() { return std::string("ConcurrentMap"); } 94 | }; 95 | 96 | template 97 | class ConcurrentSetT { 98 | public: 99 | // fixed parameters for the data structure 100 | static constexpr uint32_t A_INDEX_POINTER = 0xFFFFFFFE; 101 | static constexpr uint32_t EMPTY_INDEX_POINTER = 0xFFFFFFFF; 102 | static constexpr uint32_t BASE_UNIT_SIZE = 32; 103 | static constexpr uint32_t REGULAR_NODE_ADDRESS_MASK = 0x80000000; 104 | static constexpr uint32_t REGULAR_NODE_DATA_MASK = 0x7FFFFFFF; 105 | static constexpr uint32_t REGULAR_NODE_KEY_MASK = 0x7FFFFFFF; 106 | static constexpr uint32_t NEXT_PTR_LANE = 31u; 107 | 108 | using SlabTypeT = key_only_slab; 109 | 110 | static std::string getTypeName() { return std::string("ConcurrentSet"); } 111 | }; 112 | 113 | template 114 | class PhaseConcurrentMapT { 115 | public: 116 | using SlabTypeT = phase_concurrent_slab; 117 | static std::string getTypeName() { return std::string("PhaseConcurrentMap"); } 118 | }; 119 | 120 | // the main class to be specialized for different types of hash tables 121 | template 122 | class GpuSlabHash; 123 | 124 | template 125 | class GpuSlabHashContext; 126 | 127 | // The custom allocator that is being used for this code: 128 | // this might need to be a template paramater itself 129 | namespace slab_alloc_par { 130 | constexpr uint32_t log_num_mem_blocks = 8; 131 | constexpr uint32_t num_super_blocks = 32; 132 | constexpr uint32_t num_replicas = 1; 133 | } // namespace slab_alloc_par 134 | 135 | using DynamicAllocatorT = SlabAllocLight; 138 | 139 | using AllocatorContextT = SlabAllocLightContext; 142 | 143 | using SlabAddressT = uint32_t; 144 | using BucketAddressT = SlabAddressT; -------------------------------------------------------------------------------- /src/slab_hash_helper_methods.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | namespace SlabHash_NS { 20 | /* 21 | * search for a key (and/or an empty spot) in a single slab, returns the laneId 22 | * if found, otherwise returns -1 23 | */ 24 | template 25 | __device__ __forceinline__ int32_t findKeyOrEmptyPerWarp(const KeyT& src_key, 26 | const uint32_t read_data_chunk) { 27 | uint32_t isEmpty = (__ballot_sync( 28 | 0xFFFFFFFF, (read_data_chunk == EMPTY_KEY) || (read_data_chunk == src_key))); 29 | return __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 30 | } 31 | 32 | // search for just the key 33 | template 34 | __device__ __forceinline__ int32_t findKeyPerWarp(const KeyT& src_key, 35 | const uint32_t read_data_chunk) { 36 | uint32_t isEmpty = __ballot_sync(0xFFFFFFFF, (read_data_chunk == src_key)); 37 | return __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 38 | } 39 | 40 | // search for an empty spot 41 | template 42 | __device__ __forceinline__ int32_t findEmptyPerWarp(const uint32_t read_data_chunk) { 43 | uint32_t isEmpty = __ballot_sync(0xFFFFFFFF, (read_data_chunk == EMPTY_KEY)); 44 | return __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 45 | } 46 | }; // namespace SlabHash_NS -------------------------------------------------------------------------------- /src/slab_iterator.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
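 *
 * Note on SlabIterator below: the iterator is meant to be advanced by a single
 * thread. On construction it exposes the whole array of base buckets as one
 * chunk (num_buckets * BASE_UNIT_SIZE units); each subsequent next() walks the
 * per-bucket chains and exposes one 32-unit slab at a time. A minimal
 * single-thread device sketch (names are illustrative):
 *
 *   SlabIterator<uint32_t> it(cset_context);
 *   do {
 *     const uint32_t* chunk = it.getPointer();  // current chunk of key words
 *     uint32_t n = it.getSize();                // number of 32-bit units in it
 *     // scan chunk[0..n), skipping EMPTY_KEY and the next-pointer lanes
 *   } while (it.next());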
15 | */ 16 | 17 | #pragma once 18 | 19 | // a forward iterator for the slab hash data structure: 20 | // currently just specialized for concurrent set 21 | // TODO implement for other types 22 | template 23 | class SlabIterator { 24 | public: 25 | using SlabHashT = ConcurrentSetT; 26 | 27 | GpuSlabHashContext& slab_hash_; 28 | 29 | // current position of the iterator 30 | KeyT* cur_ptr_; 31 | uint32_t cur_size_; // keep track of current level's size (in units of 32 | // sizeof(KeyT)) 33 | uint32_t cur_bucket_; // keeping track of the current bucket 34 | SlabAddressT cur_slab_address_; 35 | // initialize the iterator with the first bucket's pointer address of the slab 36 | // hash 37 | __host__ __device__ 38 | SlabIterator(GpuSlabHashContext& slab_hash) 39 | : slab_hash_(slab_hash) 40 | , cur_ptr_(reinterpret_cast(slab_hash_.getDeviceTablePointer())) 41 | , cur_size_(slab_hash_.getNumBuckets() * SlabHashT::BASE_UNIT_SIZE) 42 | , cur_bucket_(0) 43 | , cur_slab_address_(*slab_hash.getPointerFromBucket(0, SlabHashT::NEXT_PTR_LANE)) {} 44 | 45 | __device__ __forceinline__ KeyT* getPointer() const { return cur_ptr_; } 46 | __device__ __forceinline__ uint32_t getSize() const { return cur_size_; } 47 | 48 | // returns true, if there's a valid next element, else returns false 49 | // this function is being run by only one thread, so it is wrong to assume all 50 | // threads within a warp have access to the caller's iterator state 51 | __device__ __forceinline__ bool next() { 52 | if (cur_bucket_ == slab_hash_.getNumBuckets()) { 53 | return false; 54 | } 55 | 56 | while (cur_slab_address_ == SlabHashT::EMPTY_INDEX_POINTER) { 57 | cur_bucket_++; 58 | if (cur_bucket_ == slab_hash_.getNumBuckets()) { 59 | return false; 60 | } 61 | cur_slab_address_ = 62 | *slab_hash_.getPointerFromBucket(cur_bucket_, SlabHashT::NEXT_PTR_LANE); 63 | } 64 | 65 | cur_ptr_ = slab_hash_.getPointerFromSlab(cur_slab_address_, 0); 66 | cur_slab_address_ = 67 | *slab_hash_.getPointerFromSlab(cur_slab_address_, SlabHashT::NEXT_PTR_LANE); 68 | cur_size_ = SlabHashT::BASE_UNIT_SIZE; 69 | return true; 70 | } 71 | }; -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cuda_add_executable(cmap_test cmap_test.cu 2 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 3 | target_link_libraries(cmap_test gtest) 4 | 5 | cuda_add_executable(test_slab_hash test_slab_hash.cu 6 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 7 | 8 | cuda_add_executable(concurrent_map concurrent_map.cu 9 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 10 | 11 | cuda_add_executable(concurrent_set concurrent_set.cu 12 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 13 | 14 | cuda_add_executable(iterator_test iterator_test.cu 15 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 16 | -------------------------------------------------------------------------------- /test/cmap_test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "CommandLine.h" 27 | #include "gpu_hash_table.cuh" 28 | #include "slab_alloc.cuh" 29 | #include "slab_hash.cuh" 30 | 31 | size_t g_gpu_device_idx{0}; // the gpu device to run tests on 32 | 33 | TEST(ConcurrentMap, Construction) { 34 | gpu_hash_table cmap( 35 | 100, 10, g_gpu_device_idx, /*seed = */ 1); 36 | 37 | std::vector h_key{10, 5, 1}; 38 | std::vector h_value{100, 50, 10}; 39 | 40 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 41 | } 42 | 43 | TEST(BulkBuild, IndividualSearch) { 44 | using KeyT = uint32_t; 45 | using ValueT = uint32_t; 46 | const uint32_t num_keys = 137; 47 | const uint32_t num_buckets = 2; 48 | // creating the data structures: 49 | gpu_hash_table cmap( 50 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 51 | 52 | // creating key-value pairs: 53 | std::vector h_key; 54 | h_key.reserve(num_keys); 55 | std::vector h_value; 56 | h_value.reserve(num_keys); 57 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 58 | h_key.push_back(13 + i_key); 59 | h_value.push_back(1000 + h_key.back()); 60 | } 61 | 62 | // building the slab hash, and the host's data structure: 63 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 64 | 65 | // generating random queries 66 | const auto num_queries = num_keys; 67 | std::random_device rd; 68 | std::mt19937 rng(rd()); 69 | std::vector h_query(h_key); 70 | std::shuffle(h_query.begin(), h_query.end(), rng); 71 | std::vector cmap_results(num_queries); 72 | 73 | // searching for the queries: 74 | cmap.hash_search(h_query.data(), cmap_results.data(), num_queries); 75 | 76 | // validating the results: 77 | std::unordered_map hash_map; 78 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 79 | hash_map.insert(std::make_pair(h_key[i_key], h_value[i_key])); 80 | } 81 | 82 | for (uint32_t i = 0; i < num_queries; i++) { 83 | auto cmap_result = cmap_results[i]; 84 | auto expected_result = hash_map[h_query[i]]; 85 | ASSERT_EQ(expected_result, cmap_result); 86 | } 87 | } 88 | 89 | TEST(BulkBuild, BulkSearch) { 90 | using KeyT = uint32_t; 91 | using ValueT = uint32_t; 92 | const uint32_t num_keys = 137; 93 | const uint32_t num_buckets = 2; 94 | // creating the data structures: 95 | gpu_hash_table cmap( 96 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 97 | 98 | // creating key-value pairs: 99 | std::vector h_key; 100 | h_key.reserve(num_keys); 101 | std::vector h_value; 102 | h_value.reserve(num_keys); 103 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 104 | h_key.push_back(13 + i_key); 105 | h_value.push_back(1000 + h_key.back()); 106 | } 107 | 108 | // building the slab hash, and the host's data structure: 109 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 110 | 111 | // generating random queries 112 | const auto num_queries = num_keys; 113 | std::random_device rd; 114 | std::mt19937 rng(rd()); 115 | std::vector h_query(h_key); 116 | std::shuffle(h_query.begin(), h_query.end(), rng); 117 | 
std::vector cmap_results(num_queries); 118 | 119 | // searching for the queries: 120 | cmap.hash_search_bulk(h_query.data(), cmap_results.data(), num_queries); 121 | 122 | // validating the results: 123 | std::unordered_map hash_map; 124 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 125 | hash_map.insert(std::make_pair(h_key[i_key], h_value[i_key])); 126 | } 127 | 128 | for (uint32_t i = 0; i < num_queries; i++) { 129 | auto cmap_result = cmap_results[i]; 130 | auto expected_result = hash_map[h_query[i]]; 131 | ASSERT_EQ(expected_result, cmap_result); 132 | } 133 | } 134 | 135 | TEST(BulkBuild, IndividualCount) { 136 | using KeyT = uint32_t; 137 | using ValueT = uint32_t; 138 | const uint32_t num_unique = 2014; 139 | const uint32_t num_buckets = 12; 140 | const uint32_t max_count = 32; 141 | 142 | // rng 143 | std::random_device rd; 144 | std::mt19937 rng(rd()); 145 | 146 | // random key counts 147 | uint32_t num_keys = 0; 148 | std::vector h_count; 149 | h_count.reserve(num_unique); 150 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 151 | uint32_t key_count = rng() % max_count; 152 | h_count.push_back(key_count); 153 | num_keys += key_count; 154 | } 155 | 156 | // creating key-value pairs: 157 | std::vector h_key; 158 | h_key.reserve(num_keys); 159 | std::vector h_value; 160 | h_value.reserve(num_keys); 161 | std::vector h_key_unique; 162 | h_key_unique.reserve(num_unique); 163 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 164 | KeyT myKey = 13 + i_key; 165 | ValueT myValue = 1000 + myKey; 166 | h_key_unique.push_back(myKey); 167 | for (uint32_t i_count = 0; i_count < h_count[i_key]; i_count++) { 168 | h_key.push_back(myKey); 169 | h_value.push_back(myValue); 170 | } 171 | } 172 | 173 | // creating the data structures: 174 | gpu_hash_table cmap( 175 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 176 | 177 | // building the slab hash, and the host's data structure: 178 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 179 | 180 | // generating random queries 181 | const auto num_queries = num_unique; 182 | std::vector h_query(h_key_unique); 183 | std::shuffle(h_query.begin(), h_query.end(), rng); 184 | std::vector cmap_results(num_queries); 185 | 186 | // getting count per query: 187 | cmap.hash_count(h_query.data(), cmap_results.data(), num_queries); 188 | 189 | // validating the results: 190 | std::unordered_map count_map; 191 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 192 | count_map.insert(std::make_pair(h_key_unique[i_key], h_count[i_key])); 193 | } 194 | 195 | for (uint32_t i = 0; i < num_queries; i++) { 196 | auto cmap_result = cmap_results[i]; 197 | auto expected_result = count_map[h_query[i]]; 198 | ASSERT_EQ(expected_result, cmap_result); 199 | } 200 | } 201 | 202 | TEST(UniqueBulkBuild, IndividualCount) { 203 | using KeyT = uint32_t; 204 | using ValueT = uint32_t; 205 | const uint32_t num_unique = 2014; 206 | const uint32_t num_buckets = 12; 207 | const uint32_t max_count = 32; 208 | 209 | // rng 210 | std::random_device rd; 211 | std::mt19937 rng(rd()); 212 | 213 | // random key counts 214 | uint32_t num_keys = 0; 215 | std::vector h_count; 216 | h_count.reserve(num_unique); 217 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 218 | uint32_t key_count = rng() % max_count; 219 | h_count.push_back(key_count); 220 | num_keys += key_count; 221 | } 222 | 223 | // creating key-value pairs: 224 | std::vector h_key; 225 | h_key.reserve(num_keys); 226 | std::vector h_value; 227 | h_value.reserve(num_keys); 
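  // (Added note) Keys are deliberately replicated h_count[i] times below; after
  // building with hash_build_with_unique_keys(), the table should keep only the
  // first instance of each key, so the count queried later is expected to be 1
  // whenever a key was inserted at least once, and 0 otherwise.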
228 | std::vector h_key_unique; 229 | h_key_unique.reserve(num_unique); 230 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 231 | KeyT myKey = 13 + i_key; 232 | ValueT myValue = 1000 + myKey; 233 | h_key_unique.push_back(myKey); 234 | for (uint32_t i_count = 0; i_count < h_count[i_key]; i_count++) { 235 | h_key.push_back(myKey); 236 | h_value.push_back(myValue); 237 | } 238 | } 239 | 240 | // creating the data structures: 241 | gpu_hash_table cmap( 242 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 243 | 244 | // building the unique-keys slab hash, and the host's data structure: 245 | cmap.hash_build_with_unique_keys(h_key.data(), h_value.data(), h_key.size()); 246 | 247 | // generating random queries 248 | const auto num_queries = num_unique; 249 | std::vector h_query(h_key_unique); 250 | std::shuffle(h_query.begin(), h_query.end(), rng); 251 | std::vector cmap_results(num_queries); 252 | 253 | // getting count per query: 254 | cmap.hash_count(h_query.data(), cmap_results.data(), num_queries); 255 | 256 | // validating the results: 257 | std::unordered_map count_map; 258 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 259 | count_map.insert(std::make_pair(h_key_unique[i_key], h_count[i_key])); 260 | } 261 | 262 | for (uint32_t i = 0; i < num_queries; i++) { 263 | auto cmap_result = cmap_results[i]; 264 | auto expected_result = (count_map[h_query[i]] != 0) ? 1 : 0; 265 | ASSERT_EQ(expected_result, cmap_result); 266 | } 267 | } 268 | 269 | TEST(BulkBuild, IndividualDelete) { 270 | using KeyT = uint32_t; 271 | using ValueT = uint32_t; 272 | const uint32_t num_keys = 137; 273 | const uint32_t num_buckets = 2; 274 | // creating the data structures: 275 | gpu_hash_table cmap( 276 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 277 | 278 | // creating key-value pairs: 279 | std::vector h_key; 280 | h_key.reserve(num_keys); 281 | std::vector h_value; 282 | h_value.reserve(num_keys); 283 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 284 | h_key.push_back(13 + i_key); 285 | h_value.push_back(1000 + h_key.back()); 286 | } 287 | 288 | // building the slab hash: 289 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 290 | 291 | // generating random keys to delete: 292 | const auto num_deletion = num_keys; 293 | const auto extend_fact = 4; 294 | std::random_device rd; 295 | std::mt19937 rng(rd()); 296 | std::vector h_deleted_keys; 297 | h_deleted_keys.reserve(num_deletion * extend_fact); 298 | for (uint32_t i_key = 0; i_key < num_deletion * extend_fact; i_key++) { 299 | h_deleted_keys.push_back(13 + i_key); 300 | } 301 | std::shuffle(h_deleted_keys.begin(), h_deleted_keys.end(), rng); 302 | 303 | // delete the keys: 304 | cmap.hash_delete(h_deleted_keys.data(), num_deletion); 305 | 306 | // query all keys: 307 | const auto num_queries = num_keys; 308 | std::vector h_query(h_key); 309 | std::vector cmap_results(num_queries); 310 | 311 | // searching for the queries: 312 | cmap.hash_search_bulk(h_query.data(), cmap_results.data(), num_queries); 313 | 314 | // validating the results: 315 | std::unordered_map hash_map; 316 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 317 | hash_map.insert(std::make_pair(h_key[i_key], h_value[i_key])); 318 | } 319 | for (uint32_t i_key = 0; i_key < num_deletion; i_key++) { 320 | hash_map.erase(h_deleted_keys[i_key]); 321 | } 322 | 323 | for (uint32_t i = 0; i < num_queries; i++) { 324 | auto cmap_result = cmap_results[i]; 325 | auto expected_result_it = hash_map.find(h_query[i]); 326 | auto 
326 |     auto expected_result = expected_result_it == hash_map.end()
327 |                                ? SEARCH_NOT_FOUND
328 |                                : expected_result_it->second;
329 |     ASSERT_EQ(expected_result, cmap_result);
330 |   }
331 | }
332 |
333 | int main(int argc, char** argv) {
334 |   if (cmdOptionExists(argv, argc + argv, "-device")) {
335 |     g_gpu_device_idx = atoi(getCmdOption(argv, argv + argc, "-device"));
336 |   }
337 |
338 |   ::testing::InitGoogleTest(&argc, argv);
339 |   return RUN_ALL_TESTS();
340 | }
--------------------------------------------------------------------------------
/test/concurrent_map.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2019 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <algorithm>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdint>
19 | #include <cstdio>
20 | #include <cstdlib>
21 | #include <iostream>
22 | #include <numeric>
23 | #include <random>
24 | #include <vector>
25 | #include "gpu_hash_table.cuh"
26 | //=======================================
27 | #define DEVICE_ID 0
28 |
29 | int main(int argc, char** argv) {
30 |   //=========
31 |   int devCount;
32 |   cudaGetDeviceCount(&devCount);
33 |   cudaDeviceProp devProp;
34 |   if (devCount) {
35 |     cudaSetDevice(DEVICE_ID);  // to be changed later
36 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
37 |   }
38 |   printf("Device: %s\n", devProp.name);
39 |
40 |   //======================================
41 |   // Building my hash table:
42 |   //======================================
43 |   uint32_t num_keys = 1 << 20;
44 |
45 |   float expected_chain = 0.6f;
46 |   uint32_t num_elements_per_unit = 15;
47 |   uint32_t expected_elements_per_bucket =
48 |       expected_chain * num_elements_per_unit;
49 |   uint32_t num_buckets = (num_keys + expected_elements_per_bucket - 1) /
50 |                          expected_elements_per_bucket;
51 |
52 |   // ==== generating key-values and queries on the host:
53 |   float existing_ratio = 1.0f;  // ratio of queries within the table
54 |   uint32_t num_queries = num_keys;
55 |
56 |   using KeyT = uint32_t;
57 |   using ValueT = uint32_t;
58 |   auto num_elements = 2 * num_keys;
59 |
60 |   std::vector<KeyT> h_key(num_elements);
61 |   std::vector<ValueT> h_value(num_elements);
62 |   std::vector<KeyT> h_query(num_queries);
63 |   std::vector<ValueT> h_correct_result(num_queries);
64 |   std::vector<ValueT> h_result(num_queries);
65 |
66 |   // std::iota(h_key.begin(), h_key.end(), 0);
67 |   const auto f = [](const KeyT& key) { return key * 10; };
68 |
69 |   std::random_device rd;
70 |   const int64_t seed = 1;
71 |   std::mt19937 rng(seed);
72 |   std::vector<uint32_t> index(num_elements);
73 |   std::iota(index.begin(), index.end(), 0);
74 |   std::shuffle(index.begin(), index.end(), rng);
75 |
76 |   for (int32_t i = 0; i < index.size(); i++) {
77 |     h_key[i] = index[i];
78 |     h_value[i] = f(h_key[i]);
79 |   }
80 |
81 |   //=== generating random queries with a fixed ratio existing in keys
82 |   uint32_t num_existing = static_cast<uint32_t>(existing_ratio * num_queries);
83 |
84 |   for (int i = 0; i < num_existing; i++) {
85 |     h_query[i] = h_key[num_keys - 1 - i];
86 |     h_correct_result[i] = f(h_query[i]);
87 |   }
88 |
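// (annotation) The remaining queries are taken from h_key[num_keys + i], i.e.
// generated keys that are never inserted (only the first num_keys entries are
// built below), so their expected lookup result is SEARCH_NOT_FOUND.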
89 |   for (int i = 0; i < (num_queries - num_existing); i++) {
90 |     h_query[num_existing + i] = h_key[num_keys + i];
91 |     h_correct_result[num_existing + i] = SEARCH_NOT_FOUND;
92 |   }
93 |   // permuting the queries:
94 |   std::vector<uint32_t> q_index(num_queries);
95 |   std::iota(q_index.begin(), q_index.end(), 0);
96 |   std::shuffle(q_index.begin(), q_index.end(), rng);
97 |   for (int i = 0; i < num_queries; i++) {
98 |     std::swap(h_query[i], h_query[q_index[i]]);
99 |     std::swap(h_correct_result[i], h_correct_result[q_index[i]]);
100 |   }
101 |   gpu_hash_table<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>  // NOTE: template args reconstructed (assumed)
102 |       hash_table(num_keys, num_buckets, DEVICE_ID, seed);
103 |
104 |   float build_time =
105 |       hash_table.hash_build(h_key.data(), h_value.data(), num_keys);
106 |   float search_time =
107 |       hash_table.hash_search(h_query.data(), h_result.data(), num_queries);
108 |   float search_time_bulk =
109 |       hash_table.hash_search_bulk(h_query.data(), h_result.data(), num_queries);
110 |   // // hash_table.print_bucket(0);
111 |   printf("Hash table: \n");
112 |   printf("num_keys = %d, num_buckets = %d\n", num_keys, num_buckets);
113 |   printf("\t2) Hash table built in %.3f ms (%.3f M elements/s)\n", build_time,
114 |          double(num_keys) / build_time / 1000.0);
115 |   printf("\t3) Hash table search (%.2f) in %.3f ms (%.3f M queries/s)\n",
116 |          existing_ratio, search_time,
117 |          double(num_queries) / search_time / 1000.0);
118 |   printf("\t4) Hash table bulk search (%.2f) in %.3f ms (%.3f Mqueries/s)\n",
119 |          existing_ratio, search_time_bulk,
120 |          double(num_queries) / search_time_bulk / 1000.0);
121 |
122 |   double load_factor = hash_table.measureLoadFactor();
123 |
124 |   printf("The load factor is %.2f, number of buckets %d\n", load_factor,
125 |          num_buckets);
126 |
127 |   // ==== validation:
128 |   for (int i = 0; i < num_queries; i++) {
129 |     if (h_correct_result[i] != h_result[i]) {
130 |       printf("### wrong result at index %d: [%d] -> %d, but should be %d\n", i,
131 |              h_query[i], h_result[i], h_correct_result[i]);
132 |       break;
133 |     }
134 |     if (i == (num_queries - 1))
135 |       printf("Validation done successfully\n");
136 |   }
137 |
138 |   // === building cudpp for comparison
139 |   // float load_factor_cudpp = 0.8f;
140 |   // cudpp_hash_table cudpp_hash(h_key, h_value, num_keys, num_queries,
141 |   // load_factor_cudpp, false, false); float cudpp_build_time =
142 |   // cudpp_hash.hash_build(); float cudpp_search_time =
143 |   // cudpp_hash.lookup_hash_table(h_query, num_queries); printf(" CUDPP Hash
144 |   // table: \n"); printf("\t1) Hash table built in %.3f ms (%.3f M
145 |   // elements/s)\n", cudpp_build_time,
146 |   // double(num_keys)/cudpp_build_time/1000.0); printf("\t2) Hash table search
147 |   // (%.2f) in %.3f ms (%.3f M elements/s)\n", existing_ratio,
148 |   // cudpp_search_time, double(num_queries)/cudpp_search_time/1000.0);
149 |   // ===
150 | }
--------------------------------------------------------------------------------
/test/concurrent_set.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2019 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <algorithm>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdint>
19 | #include <cstdio>
20 | #include <cstdlib>
21 | #include <iostream>
22 | #include <numeric>
23 | #include <random>
24 | #include <vector>
25 | #include "gpu_hash_table.cuh"
26 | #include "slab_hash.cuh"
27 | //=======================================
28 | #define DEVICE_ID 0
29 |
30 | int main(int argc, char** argv) {
31 |   //=========
32 |   int devCount;
33 |   cudaGetDeviceCount(&devCount);
34 |   cudaDeviceProp devProp;
35 |   if (devCount) {
36 |     cudaSetDevice(DEVICE_ID);  // to be changed later
37 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
38 |   }
39 |   printf("Device: %s\n", devProp.name);
40 |
41 |   //======================================
42 |   // Building my hash table:
43 |   //======================================
44 |   uint32_t num_keys = 1 << 20;
45 |
46 |   float expected_chain = 0.6f;
47 |   uint32_t num_elements_per_unit = 31;
48 |   uint32_t expected_elements_per_bucket =
49 |       expected_chain * num_elements_per_unit;
50 |   uint32_t num_buckets = (num_keys + expected_elements_per_bucket - 1) /
51 |                          expected_elements_per_bucket;
52 |
53 |   // ==== generating key-values and queries on the host:
54 |   float existing_ratio = 1.0f;  // ratio of queries within the table
55 |   uint32_t num_queries = num_keys;
56 |
57 |   using KeyT = uint32_t;
58 |   auto num_elements = 2 * num_keys;
59 |
60 |   std::vector<KeyT> h_key(num_elements);
61 |   std::vector<KeyT> h_query(num_queries);
62 |   std::vector<KeyT> h_correct_result(num_queries);
63 |   std::vector<KeyT> h_result(num_queries);
64 |
65 |   std::random_device rd;
66 |   const int64_t seed = 1;
67 |   std::mt19937 rng(seed);
68 |   std::vector<uint32_t> index(num_elements);
69 |   std::iota(index.begin(), index.end(), 0);
70 |   std::shuffle(index.begin(), index.end(), rng);
71 |
72 |   for (int32_t i = 0; i < index.size(); i++) {
73 |     h_key[i] = index[i];
74 |   }
75 |
76 |   //=== generating random queries with a fixed ratio existing in keys
77 |   uint32_t num_existing = static_cast<uint32_t>(existing_ratio * num_queries);
78 |
79 |   for (int i = 0; i < num_existing; i++) {
80 |     h_query[i] = h_key[num_keys - 1 - i];
81 |     h_correct_result[i] = h_query[i];
82 |   }
83 |
84 |   for (int i = 0; i < (num_queries - num_existing); i++) {
85 |     h_query[num_existing + i] = h_key[num_keys + i];
86 |     h_correct_result[num_existing + i] = SEARCH_NOT_FOUND;
87 |   }
88 |   // permuting the queries:
89 |   std::vector<uint32_t> q_index(num_queries);
90 |   std::iota(q_index.begin(), q_index.end(), 0);
91 |   std::shuffle(q_index.begin(), q_index.end(), rng);
92 |   for (int i = 0; i < num_queries; i++) {
93 |     std::swap(h_query[i], h_query[q_index[i]]);
94 |     std::swap(h_correct_result[i], h_correct_result[q_index[i]]);
95 |   }
96 |   gpu_hash_table<KeyT, KeyT, SlabHashTypeT::ConcurrentSet>  // NOTE: template args reconstructed (assumed)
97 |       hash_table(num_keys, num_buckets, DEVICE_ID, seed, false);
98 |
99 |   float build_time =
100 |       hash_table.hash_build(h_key.data(), nullptr, num_keys);
101 |   float search_time =
102 |       hash_table.hash_search(h_query.data(), h_result.data(), num_queries);
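// (annotation) In the concurrent-set configuration only keys are stored: the
// build call above passes nullptr for the value array, and a successful search
// is expected to echo the queried key itself (h_correct_result[i] = h_query[i]).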
103 |   // float search_time_bulk =
104 |   //     hash_table.hash_search_bulk(h_query.data(), h_result.data(), num_queries);
105 |   // // // hash_table.print_bucket(0);
106 |   // printf("Hash table: \n");
107 |   // printf("num_keys = %d, num_buckets = %d\n", num_keys, num_buckets);
108 |   // // printf("\t1) Hash table init in %.3f ms\n", init_time);
109 |   // printf("\t2) Hash table built in %.3f ms (%.3f M elements/s)\n", build_time,
110 |   //        double(num_keys) / build_time / 1000.0);
111 |   // printf("\t3) Hash table search (%.2f) in %.3f ms (%.3f M queries/s)\n",
112 |   //        existing_ratio, search_time,
113 |   //        double(num_queries) / search_time / 1000.0);
114 |   // printf("\t4) Hash table bulk search (%.2f) in %.3f ms (%.3f Mqueries/s)\n",
115 |   //        existing_ratio, search_time_bulk,
116 |   //        double(num_queries) / search_time_bulk / 1000.0);
117 |
118 |   // double load_factor = hash_table.measureLoadFactor();
119 |
120 |   // printf("The load factor is %.2f, number of buckets %d\n", load_factor,
121 |   //        num_buckets);
122 |
123 |   // ==== validation:
124 |   for (int i = 0; i < num_queries; i++) {
125 |     if (h_correct_result[i] != h_result[i]) {
126 |       printf("### wrong result at index %d: [%d] -> %d, but should be %d\n", i,
127 |              h_query[i], h_result[i], h_correct_result[i]);
128 |       break;
129 |     }
130 |     if (i == (num_queries - 1))
131 |       printf("Validation done successfully\n");
132 |   }
133 | }
--------------------------------------------------------------------------------
/test/iterator_test.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2019 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <algorithm>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdint>
19 | #include <cstdio>
20 | #include <cstdlib>
21 | #include <iostream>
22 | #include <numeric>
23 | #include <random>
24 | #include <vector>
25 | #include "gpu_hash_table.cuh"
26 | #include "slab_hash.cuh"
27 | //=======================================
28 | #define DEVICE_ID 0
29 | //=======================================
30 |
31 | template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT>  // NOTE: template parameters reconstructed (assumed)
32 | __global__ void print_table(
33 |     GpuSlabHashContext<KeyT, ValueT, SlabHashT> slab_hash) {
34 |   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
35 |   uint32_t wid = tid >> 5;
36 |   uint32_t laneId = threadIdx.x & 0x1F;
37 |
38 |   if (wid >= slab_hash.getNumBuckets()) {
39 |     return;
40 |   }
41 |
42 |   // initializing the memory allocator on each warp:
43 |   slab_hash.getAllocatorContext().initAllocator(tid, laneId);
44 |
45 |   if (tid == 0) {
46 |     printf(" == Printing the base array\n");
47 |     SlabIterator iter(slab_hash);
48 |     for (int i = 0; i < iter.cur_size_; i++) {
49 |       if ((i & 0x1F) == 0)
50 |         printf(" == bucket %d:\n", i >> 5);
51 |       printf("%8x, ", *(iter.cur_ptr_ + i));
52 |       if ((i & 0x7) == 0x7)
53 |         printf("\n");
54 |     }
55 |     printf("\n");
56 |
57 |     printf(" == Printing the rest of slabs:\n");
58 |     while (iter.next()) {
59 |       for (int i = 0; i < iter.cur_size_; i++) {
60 |         if ((i & 0x1F) == 0)
61 |           printf(" == bucket %d:\n", iter.cur_bucket_);
62 |         printf("%8x, ", *(iter.cur_ptr_ + i));
63 |         if ((i & 0x7) == 0x7)
64 |           printf("\n");
65 |       }
66 |       printf("\n");
67 |     }
68 |   }
69 | }
70 |
71 | //=======================================
72 | int main(int argc, char** argv) {
73 |   //=========
74 |   int devCount;
75 |   cudaGetDeviceCount(&devCount);
76 |   cudaDeviceProp devProp;
77 |   if (devCount) {
78 |     cudaSetDevice(DEVICE_ID);  // to be changed later
79 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
80 |   }
81 |   printf("Device: %s\n", devProp.name);
82 |
83 |   //======================================
84 |   // Building my hash table:
85 |   //======================================
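// (annotation) Two buckets are used deliberately: with /*identity_hash*/ true
// the bucket is presumably key % num_buckets, so the 34 odd keys below crowd
// into a single bucket, more than one slab can hold, and the print_table kernel
// above gets chained slabs for SlabIterator::next() to walk.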
86 |   uint32_t num_buckets = 2;
87 |
88 |   using KeyT = uint32_t;
89 |
90 |   std::vector<KeyT> h_key = {2,  4,  6,  8,  10, 1,  3,  5,  7,  9,
91 |                              11, 13, 15, 17, 19, 21, 23, 25, 27, 29,
92 |                              31, 33, 35, 37, 39, 41, 43, 45, 47, 49,
93 |                              51, 53, 55, 57, 59, 61, 63, 65, 67};
94 |   uint32_t num_keys = h_key.size();
95 |
96 |   const int64_t seed = 1;
97 |   std::mt19937 rng(seed);
98 |   std::shuffle(h_key.begin(), h_key.end(), rng);
99 |
100 |   gpu_hash_table<KeyT, KeyT, SlabHashTypeT::ConcurrentSet>  // NOTE: template args reconstructed (assumed)
101 |       hash_table(num_keys, num_buckets, DEVICE_ID, seed, false, /*identity_hash*/ true);
102 |
103 |   float build_time = hash_table.hash_build(h_key.data(), nullptr, num_keys);
104 |
105 |   const uint32_t num_blocks = 1;
106 |   const uint32_t num_threads = 128;
107 |   print_table<<<num_blocks, num_threads>>>(
108 |       hash_table.slab_hash_->getSlabHashContext());
109 |
110 |   return 0;
111 | }
--------------------------------------------------------------------------------
/test/test_slab_hash.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2018 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <cstdio>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdlib>
19 | #include <iostream>
20 | #include <vector>
21 | #include "gpu_hash_table.cuh"
22 | #include "slab_alloc.cuh"
23 | #include "slab_hash.cuh"
24 | #define DEVICE_ID 0
25 |
26 | int main(int argc, char** argv) {
27 |   //=========
28 |   int devCount;
29 |   cudaGetDeviceCount(&devCount);
30 |   cudaDeviceProp devProp;
31 |   if (devCount) {
32 |     cudaSetDevice(DEVICE_ID);  // to be changed later
33 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
34 |   }
35 |   printf("Device: %s\n", devProp.name);
36 |
37 |   auto my_hash_table =
38 |       new gpu_hash_table<uint32_t, uint32_t,  // NOTE: template args reconstructed (assumed)
39 |                          SlabHashTypeT::ConcurrentMap>(100, 10, DEVICE_ID, /*seed = */ 1);
40 |
41 |   std::vector<uint32_t> h_key{10, 5, 1};
42 |   std::vector<uint32_t> h_value{100, 50, 10};
43 |
44 |   my_hash_table->hash_build(h_key.data(), h_value.data(), h_key.size());
45 |   // auto slab_alloc = new SlabAllocLight<8, 32, 1>();
46 |   // printf("slab alloc constructed\n");
47 |
48 |   // delete slab_alloc;
49 |
50 |   // auto slab_hash =
51 |   //     new GpuSlabHash();
52 |   // std::cout << slab_hash->to_string() << std::endl;
53 |   delete my_hash_table;
54 |   return 0;
55 | }
--------------------------------------------------------------------------------