├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bench ├── CMakeLists.txt ├── batched_data_gen.cpp ├── batched_data_gen.h ├── bencher.py ├── experiments.cuh └── main_benchmarks.cu ├── src ├── CMakeLists.txt ├── CommandLine.h ├── concurrent_map │ ├── cmap_class.cuh │ ├── cmap_implementation.cuh │ ├── device │ │ ├── build.cuh │ │ ├── concurrent_kernel.cuh │ │ ├── count_kernel.cuh │ │ ├── delete_kernel.cuh │ │ ├── misc_kernels.cuh │ │ └── search_kernel.cuh │ └── warp │ │ ├── count.cuh │ │ ├── delete.cuh │ │ ├── insert.cuh │ │ └── search.cuh ├── concurrent_set │ ├── cset_class.cuh │ ├── cset_helper_kernels.cuh │ ├── cset_implementation.cuh │ └── cset_warp_operations.cuh ├── gpu_hash_table.cuh ├── slab_hash.cuh ├── slab_hash_global.cuh ├── slab_hash_helper_methods.cuh └── slab_iterator.cuh └── test ├── CMakeLists.txt ├── cmap_test.cu ├── concurrent_map.cu ├── concurrent_set.cu ├── iterator_test.cu └── test_slab_hash.cu /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Chromium 2 | BinPackArguments: false 3 | BinPackParameters: false 4 | ColumnLimit: 90 5 | IndentWidth: 2 6 | BreakConstructorInitializers: BeforeComma 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # CMake files 2 | CMakeCache.txt 3 | CMakeFiles 4 | Makefile 5 | cmake_install.cmake 6 | install_manifest.txt 7 | _build/ 8 | build/ 9 | .vscode 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "SlabAlloc"] 2 | path = SlabAlloc 3 | url = https://github.com/owensgroup/SlabAlloc 4 | [submodule "ThirdParty/rapidjson"] 5 | path = ThirdParty/rapidjson 6 | url = https://github.com/Tencent/rapidjson 7 | [submodule "ThirdParty/googletest"] 8 | path = ThirdParty/googletest 9 | url = https://github.com/google/googletest 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.8 FATAL_ERROR) 2 | project (SlabHash) 3 | 4 | find_package(CUDA 8.0 REQUIRED) 5 | 6 | option(CMAKE_VERBOSE_MAKEFILE ON) 7 | option(DGTEST, "DGTEST" ON) 8 | 9 | set(CUDA_NVCC_FLAGS -std=c++11) 10 | set (CMAKE_CXX_STANDARD 11) 11 | 12 | if (CUDA_VERBOSE_PTXAS) 13 | set(VERBOSE_PTXAS --ptxas-options=-v) 14 | endif (CUDA_VERBOSE_PTXAS) 15 | 16 | set(CMAKE_BUILD_TYPE "Release") 17 | 18 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 19 | 20 | set(GENCODE_SM30 21 | -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_30,code=compute_30) 22 | set(GENCODE_SM35 23 | -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_35,code=compute_35) 24 | set(GENCODE_SM37 25 | -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_37,code=compute_37) 26 | set(GENCODE_SM50 27 | -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_50,code=compute_50) 28 | set(GENCODE_SM60 29 | -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60) 30 | set(GENCODE_SM61 31 | -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61) 32 | set(GENCODE_SM70 33 | -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70) 34 | set(GENCODE_SM71 35 | 
-gencode=arch=compute_71,code=sm_71 -gencode=arch=compute_71,code=compute_71) 36 | set(GENCODE_SM75 37 | -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_75,code=compute_75) 38 | 39 | option(SLABHASH_GENCODE_SM30 "GENCODE_SM30" OFF) 40 | option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" ON) 41 | option(SLABHASH_GENCODE_SM37 "GENCODE_SM37" OFF) 42 | option(SLABHASH_GENCODE_SM50 "GENCODE_SM50" OFF) 43 | option(SLABHASH_GENCODE_SM60 "GENCODE_SM60" OFF) 44 | option(SLABHASH_GENCODE_SM61 "GENCODE_SM61" OFF) 45 | option(SLABHASH_GENCODE_SM70 "GENCODE_SM70" OFF) 46 | option(SLABHASH_GENCODE_SM71 "GENCODE_SM71" OFF) 47 | option(SLABHASH_GENCODE_SM75 "GENCODE_SM75" OFF) 48 | 49 | if (SLABHASH_GENCODE_SM30) 50 | set(GENCODE ${GENCODE} ${GENCODE_SM30}) 51 | endif(SLABHASH_GENCODE_SM30) 52 | 53 | if (SLABHASH_GENCODE_SM35) 54 | set(GENCODE ${GENCODE} ${GENCODE_SM35}) 55 | endif(SLABHASH_GENCODE_SM35) 56 | 57 | if (SLABHASH_GENCODE_SM37) 58 | set(GENCODE ${GENCODE} ${GENCODE_SM37}) 59 | endif(SLABHASH_GENCODE_SM37) 60 | 61 | if (SLABHASH_GENCODE_SM50) 62 | set(GENCODE ${GENCODE} ${GENCODE_SM50}) 63 | endif(SLABHASH_GENCODE_SM50) 64 | 65 | if (SLABHASH_GENCODE_SM60) 66 | set(GENCODE ${GENCODE} ${GENCODE_SM60}) 67 | endif(SLABHASH_GENCODE_SM60) 68 | 69 | if (SLABHASH_GENCODE_SM61) 70 | set(GENCODE ${GENCODE} ${GENCODE_SM61}) 71 | endif(SLABHASH_GENCODE_SM61) 72 | 73 | if (SLABHASH_GENCODE_SM70) 74 | set(GENCODE ${GENCODE} ${GENCODE_SM70}) 75 | endif(SLABHASH_GENCODE_SM70) 76 | 77 | if(SLABHASH_GENCODE_SM71) 78 | set(GENCODE ${GENCODE} ${GENCODE_SM71}) 79 | endif(SLABHASH_GENCODE_SM71) 80 | 81 | if(SLABHASH_GENCODE_SM75) 82 | set(GENCODE ${GENCODE} ${GENCODE_SM75}) 83 | endif(SLABHASH_GENCODE_SM75) 84 | 85 | include_directories(SlabAlloc/src) 86 | include_directories(src src/concurrent) 87 | include_directories(ThirdParty/rapidjson/include) 88 | include_directories(ThirdParty/googletest/googletest) 89 | include_directories(ThirdParty/googletest/googletest/include) 90 | add_subdirectory(ThirdParty/googletest/googletest) 91 | add_subdirectory(test) 92 | add_subdirectory(bench) 93 | 94 | if (DGTEST) 95 | enable_testing() 96 | endif() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SlabHash 2 | A warp-oriented dynamic hash table for GPUs 3 | 4 | ## Publication: 5 | This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper: 6 | * [Saman Ashkiani, Martin Farach-Colton, John Owens, *A Dynamic Hash Table for the GPU*, 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)](https://ieeexplore.ieee.org/abstract/document/8425196) 7 | 8 | This library is a refactored and slightly redesigned version of the original code, so that it can be extended and used in other research projects as well. It is still under continuous development. If you find any problems with the code, or have suggestions for potential additions to the library, please raise an issue on GitHub; we will address it as soon as possible. 9 | 10 | ## Compilation 11 | 1. `git submodule init` 12 | 2. `git submodule update` 13 | 3. Make sure to edit `CMakeLists.txt` so that it reflects your GPU's compute capability. For example, to include compute capability 3.5 you should have `option(SLABHASH_GENCODE_SM35 "GENCODE_SM35" ON)`. Alternatively, these flags can be updated through the `ccmake ..` interface from the build directory. 14 | 4. `mkdir build && cd build` 15 | 5. `cmake ..` 16 | 6. `make` 17 | 18 | ## High level API 19 | To use this library, include [`src/slab_hash.cuh`](https://github.com/owensgroup/SlabHash/blob/master/src/slab_hash.cuh), which itself includes all required variations of the GpuSlabHash main class. 20 | We have provided a simple application class, [`gpu_hash_table`](https://github.com/owensgroup/SlabHash/blob/master/src/gpu_hash_table.cuh), in which the right instance of `GpuSlabHash` is initialized. 21 | This class is just an example of how to use GpuSlabHash in various contexts. 22 | Any similar application-level API should also own the dynamic memory allocator that is used by all instances of the GpuSlabHash class (here just one); each GpuSlabHash is then constructed with a pointer to that allocator. 23 | 24 | There are a few variations of the GpuSlabHash class. The most complete one at the moment is [`GpuSlabHash`](https://github.com/owensgroup/SlabHash/blob/master/src/concurrent_map/cmap_class.cuh), which is based on the initial idea of the slab hash proposed in the paper above. 25 | This class owns part of the memory allocated on the GPU to store the table's contents, side by side with the slabs allocated by the dynamic memory allocator. 26 | There is another class, [`GpuSlabHashContext`](https://github.com/owensgroup/SlabHash/blob/master/src/concurrent_map/cmap_class.cuh#L26), which does not own any memory but provides all the member functions needed to operate on the data structure itself. The context class is the one used by GPU threads on the device.
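As a rough host-side illustration of that ownership pattern, a driver using the `gpu_hash_table` application class might look like the sketch below. This is only a sketch under assumptions: the constructor arguments and the `hash_build`/`hash_search` method names are written from memory and should be checked against `src/gpu_hash_table.cuh` rather than taken as the definitive API.

```
// Hedged host-side sketch; verify names and signatures against src/gpu_hash_table.cuh.
#include "gpu_hash_table.cuh"

void build_and_search(uint32_t* h_keys, uint32_t* h_values, uint32_t num_keys,
                      uint32_t* h_queries, uint32_t* h_results, uint32_t num_queries) {
  const uint32_t num_buckets = num_keys / 2;  // number of base buckets (sets the expected chain length)
  const uint32_t device_idx = 0;
  const int64_t seed = 1;

  // The application-level object owns the SlabAlloc dynamic allocator and the
  // GpuSlabHash instance that is constructed with a pointer to it:
  gpu_hash_table<uint32_t, uint32_t, SlabHashTypeT::ConcurrentMap> hash_table(
      num_keys, num_buckets, device_idx, seed);

  hash_table.hash_build(h_keys, h_values, num_keys);          // bulk insertion from host arrays
  hash_table.hash_search(h_queries, h_results, num_queries);  // bulk search; results land in h_results
}
```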
Here's an example of how the context class is used inside a [search kernel](https://github.com/owensgroup/SlabHash/blob/master/src/concurrent_map/device/search_kernel.cuh): 27 | 28 | ``` 29 | template <typename KeyT, typename ValueT> 30 | __global__ void search_table( 31 | KeyT* d_queries, 32 | ValueT* d_results, 33 | uint32_t num_queries, 34 | GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) { 35 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 36 | uint32_t laneId = threadIdx.x & 0x1F; 37 | 38 | if ((tid - laneId) >= num_queries) { 39 | return; 40 | } 41 | 42 | // initializing the memory allocator on each warp: 43 | slab_hash.getAllocatorContext().initAllocator(tid, laneId); 44 | 45 | KeyT myQuery = 0; 46 | ValueT myResult = static_cast<ValueT>(SEARCH_NOT_FOUND); 47 | uint32_t myBucket = 0; 48 | bool to_search = false; 49 | if (tid < num_queries) { 50 | myQuery = d_queries[tid]; 51 | myBucket = slab_hash.computeBucket(myQuery); 52 | to_search = true; 53 | } 54 | 55 | slab_hash.searchKey(to_search, laneId, myQuery, myResult, myBucket); 56 | 57 | // writing back the results: 58 | if (tid < num_queries) { 59 | d_results[tid] = myResult; 60 | } 61 | } 62 | ``` 63 | 64 | ## Simple benchmarking 65 | A simplified set of benchmark scenarios is available through a Python script. Once the code is successfully compiled, run the following from the `build` directory: `python3 ../bench/bencher.py -m <mode> -d <device>`, where the experiment mode and the device to be used are chosen. So far, the following experiments are available: 66 | 67 | * mode 0: singleton experiment, where the hash table is built for a single fixed load factor (set by a parameter for the expected chain length, or equivalently the total number of initial buckets). 68 | * mode 1: load factor experiment, where a series of scenarios is simulated. In each case, the total number of elements to be inserted into the hash table is constant, but the load factor (number of buckets) varies from case to case. 69 | * mode 2: variable-sized table experiment, where the load factor (number of buckets) is fixed, but the total number of elements to be inserted into the table varies. 70 | * mode 3: concurrent experiment, where a series of batches of operations is run against the data structure. Each batch's operation distribution is given by `(insert_ratio, delete_ratio, search_exist_ratio, search_not_exist_ratio)`. For example, a tuple of (0.1, 0.1, 0.4, 0.4) means that 10% of each batch's operations are insertions of new elements, 10% are deletions of elements inserted in previous batches, 40% are search queries for elements that were previously inserted, and the final 40% are search queries for elements that are not stored in the data structure at all. The simulation starts with a few initial batches consisting entirely of insertions, followed by the remaining batches drawn from the given operation distribution. 71 | 72 | In the following, these benchmarks are run on a few GPU architectures. Note that the majority of input parameters for these scenarios are not exposed as command-line arguments in the Python script. To try a different set of settings, either use the corresponding C++ API directly (through `build/bin/benchmark`, with the parameters listed in [`bench/main_benchmarks.cu`](https://github.com/owensgroup/SlabHash/blob/master/bench/main_benchmarks.cu)), or change these parameters in [`bench/bencher.py`](https://github.com/owensgroup/SlabHash/blob/master/bench/bencher.py#L166).
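For reference, the benchmark binary can also be invoked directly; the sketch below shows a plausible mode-0 (singleton) run from the `build` directory, with flag names taken from how `bench/bencher.py` assembles its command line for that mode. Treat it as an assumption-laden starting point and consult `bench/main_benchmarks.cu` for the authoritative parameter list.

```
# Hypothetical direct invocation of the singleton experiment (mode 0);
# these flags mirror the ones bencher.py passes for this mode.
./bin/benchmark -mode 0 -num_key 4194304 -expected_chain 0.6 \
                -device 0 -filename bench_result/out.json -verbose 1
```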
73 | 74 | ### NVIDIA GeForce RTX 2080: 75 | GeForce RTX 2080 has a Turing architecture with compute capability 7.5 and 8GB of DRAM memory. In our setting, we have NVIDIA driver 430.14, and CUDA 10.1. 76 | 77 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 78 | 79 | #### Mode 0: 80 | ``` 81 | python3 ../bench/bencher.py -m 0 -d 0 82 | 83 | GPU hardware: GeForce RTX 2080 84 | =============================================================================================== 85 | Singleton experiment: 86 | Number of elements to be inserted: 4194304 87 | Number of buckets: 466034 88 | Expected chain length: 0.60 89 | =============================================================================================== 90 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 91 | =============================================================================================== 92 | 0.55 912.650 1930.254 1973.352 93 | ``` 94 | 95 | #### Mode 1: 96 | ``` 97 | python3 ../bench/bencher.py -m 1 -d 0 98 | 99 | GPU hardware: GeForce RTX 2080 100 | =============================================================================================== 101 | Load factor experiment: 102 | Total number of elements is fixed, load factor (number of buckets) is a variable 103 | Number of elements to be inserted: 4194304 104 | 1.00 of 4194304 queries exist in the data structure 105 | =============================================================================================== 106 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 107 | =============================================================================================== 108 | 0.06 4194304 861.149 1860.127 1897.779 109 | 0.19 1398102 868.142 1889.353 1917.126 110 | 0.25 1048576 865.396 1897.587 1935.070 111 | 0.37 699051 894.140 1925.491 1951.696 112 | 0.44 599187 888.786 1924.727 1971.126 113 | 0.55 466034 897.348 1945.381 1982.515 114 | 0.60 419431 905.537 1943.449 1969.260 115 | 0.65 349526 909.736 1896.900 1936.958 116 | 0.65 262144 865.819 1742.237 1785.819 117 | 0.65 279621 882.153 1794.917 1825.312 118 | 0.66 233017 840.275 1656.958 1696.176 119 | 0.66 322639 893.878 1871.789 1915.809 120 | 0.66 220753 831.960 1619.813 1653.572 121 | 0.69 199729 821.923 1542.169 1571.814 122 | 0.70 190651 812.457 1509.976 1536.384 123 | 0.73 174763 797.804 1444.304 1472.074 124 | 0.74 167773 788.925 1409.498 1451.453 125 | 0.75 155345 771.897 1361.815 1397.073 126 | 0.76 149797 764.415 1337.688 1364.367 127 | 0.76 139811 749.947 1282.041 1312.374 128 | ``` 129 | 130 | #### Mode 2: 131 | ``` 132 | python3 ../bench/bencher.py -m 2 -d 0 133 | 134 | GPU hardware: GeForce RTX 2080 135 | =============================================================================================== 136 | Table size experiment: 137 | Table's expected chain length is fixed, and total number of elements is variable 138 | Expected chain length = 0.60 139 | 140 | 1.00 of 262144 queries exist in the data structure 141 | =============================================================================================== 142 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 143 | =============================================================================================== 144 | (262144, 29128, 0.55) 1346.040 2577.722 2785.447 145 | (524288, 58255, 0.55) 1271.655 2319.366 2461.538 146 | (1048576, 116509, 0.55) 1116.761 2139.322 2209.873 147 | (2097152, 233017, 0.55) 
984.349 2076.750 2117.411 148 | (4194304, 466034, 0.55) 916.741 1988.169 2020.658 149 | (8388608, 932068, 0.55) 871.570 1898.617 1926.835 150 | ``` 151 | 152 | #### Mode 3: 153 | ``` 154 | python3 ../bench/bencher.py -m 3 -d 0 155 | 156 | GPU hardware: GeForce RTX 2080 157 | =============================================================================================== 158 | Concurrent experiment: 159 | variable load factor, fixed number of elements 160 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 161 | =============================================================================================== 162 | batch_size = 262144, init num batches = 3, final num batches = 4 163 | =============================================================================================== 164 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 165 | =============================================================================================== 166 | 0.05 0.05 1048576 855.979 1406.593 167 | 0.14 0.14 349526 902.501 1467.049 168 | 0.19 0.19 262144 937.121 1488.642 169 | 0.28 0.28 174763 995.060 1560.678 170 | 0.33 0.33 149797 1047.526 1552.986 171 | 0.42 0.42 116509 1070.523 1618.972 172 | 0.47 0.47 104858 1110.027 1635.456 173 | 0.55 0.55 87382 1138.991 1626.042 174 | 0.59 0.58 80660 1140.100 1615.779 175 | 0.63 0.62 69906 1115.924 1561.273 176 | ``` 177 | 178 | ### NVIDIA Titan V: 179 | 180 | Titan V has Volta architecture with compute capability 7.0 and 12GB of DRAM memory. In our setting, we have NVIDIA driver 410.104, and CUDA 10.0 running. 181 | 182 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 183 | 184 | #### Mode 0: 185 | ``` 186 | python3 ../bench/bencher.py -m 0 -d 0 187 | 188 | 189 | GPU hardware: TITAN V 190 | =============================================================================================== 191 | Singleton experiment: 192 | Number of elements to be inserted: 4194304 193 | Number of buckets: 466034 194 | Expected chain length: 0.60 195 | =============================================================================================== 196 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 197 | =============================================================================================== 198 | 0.55 1525.352 4137.374 3241.468 199 | ``` 200 | 201 | #### Mode 1: 202 | ``` 203 | python3 ../bench/bencher.py -m 1 -d 0 204 | 205 | GPU hardware: TITAN V 206 | =============================================================================================== 207 | Load factor experiment: 208 | Total number of elements is fixed, load factor (number of buckets) is a variable 209 | Number of elements to be inserted: 4194304 210 | 1.00 of 4194304 queries exist in the data structure 211 | =============================================================================================== 212 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 213 | =============================================================================================== 214 | 0.06 4194304 1416.107 3851.094 3454.809 215 | 0.19 1398102 1454.223 3934.442 3575.244 216 | 0.25 1048576 1466.819 3978.993 3603.156 217 | 0.37 699051 1491.658 4053.439 3629.898 218 | 0.44 599187 1508.881 4084.385 3512.300 219 | 0.55 466034 1527.094 4138.811 3239.865 220 | 0.60 419431 1528.536 4146.405 2877.604 221 | 0.65 349526 1522.836 4095.360 2125.584 222 | 0.65 262144 1476.884 3785.364 1318.751 223 | 0.65 
279621 1481.709 3886.148 1436.972 224 | 0.66 233017 1451.372 3599.791 1164.226 225 | 0.66 322639 1512.172 4044.683 1811.162 226 | 0.66 220753 1431.386 3508.069 1110.930 227 | 0.69 199729 1408.241 3352.397 1024.753 228 | 0.70 190651 1413.983 3278.603 991.955 229 | 0.73 174763 1403.611 3149.785 934.420 230 | 0.74 167773 1381.567 3085.426 903.303 231 | 0.75 155345 1367.470 2973.300 850.200 232 | 0.76 149797 1363.288 2914.719 823.777 233 | 0.76 139811 1349.699 2808.064 777.419 234 | ``` 235 | 236 | #### Mode 2: 237 | ``` 238 | python3 ../bench/bencher.py -m 2 -d 0 239 | 240 | GPU hardware: TITAN V 241 | =============================================================================================== 242 | Table size experiment: 243 | Table's expected chain length is fixed, and total number of elements is variable 244 | Expected chain length = 0.60 245 | 246 | 1.00 of 262144 queries exist in the data structure 247 | =============================================================================================== 248 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 249 | =============================================================================================== 250 | (262144, 29128, 0.55) 2640.026 4571.429 3529.513 251 | (524288, 58255, 0.55) 2473.430 4701.291 3207.518 252 | (1048576, 116509, 0.55) 2011.170 4821.660 3431.563 253 | (2097152, 233017, 0.55) 1673.630 4426.912 3475.236 254 | (4194304, 466034, 0.55) 1530.160 4154.290 3431.204 255 | (8388608, 932068, 0.55) 1464.140 3996.341 3214.361 256 | ``` 257 | 258 | #### Mode 3: 259 | ``` 260 | python3 ../bench/bencher.py -m 3 -d 0 261 | 262 | GPU hardware: TITAN V 263 | =============================================================================================== 264 | Concurrent experiment: 265 | variable load factor, fixed number of elements 266 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 267 | =============================================================================================== 268 | batch_size = 262144, init num batches = 3, final num batches = 4 269 | =============================================================================================== 270 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 271 | =============================================================================================== 272 | 0.05 0.05 1048576 1427.426 2669.273 273 | 0.14 0.14 349526 1526.934 2826.777 274 | 0.19 0.19 262144 1590.783 2801.642 275 | 0.28 0.28 174763 1714.166 2952.072 276 | 0.33 0.33 149797 1781.644 3000.733 277 | 0.42 0.42 116509 1937.406 3119.574 278 | 0.47 0.47 104858 1992.379 3088.990 279 | 0.55 0.55 87382 2099.257 3144.722 280 | 0.59 0.58 80660 2137.415 3166.602 281 | 0.64 0.62 69906 2160.717 2986.511 282 | ``` 283 | 284 | ### Titan Xp 285 | 286 | Titan Xp has Pascal architecture with compute capability 6.1 and 12GB of DRAM memory. In our setting, we have NVIDIA driver 410.104, and CUDA 10.0 running. 287 | 288 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 
289 | #### Mode 0: 290 | ``` 291 | python3 ../bench/bencher.py -m 0 -d 1 292 | 293 | GPU hardware: TITAN Xp 294 | =============================================================================================== 295 | Singleton experiment: 296 | Number of elements to be inserted: 4194304 297 | Number of buckets: 466034 298 | Expected chain length: 0.60 299 | =============================================================================================== 300 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 301 | =============================================================================================== 302 | 0.55 1007.340 2162.619 2199.785 303 | ``` 304 | 305 | #### Mode 1: 306 | ``` 307 | python3 ../bench/bencher.py -m 1 -d 1 308 | 309 | GPU hardware: TITAN Xp 310 | =============================================================================================== 311 | Load factor experiment: 312 | Total number of elements is fixed, load factor (number of buckets) is a variable 313 | Number of elements to be inserted: 4194304 314 | 1.00 of 4194304 queries exist in the data structure 315 | =============================================================================================== 316 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 317 | =============================================================================================== 318 | 0.06 4194304 964.644 2090.863 2121.181 319 | 0.19 1398102 985.215 2185.699 2202.151 320 | 0.25 1048576 991.760 2200.967 2216.450 321 | 0.37 699051 1004.214 2224.878 2244.384 322 | 0.44 599187 1011.303 2238.251 2257.993 323 | 0.55 466034 1016.487 2250.549 2267.996 324 | 0.60 419431 1009.784 2158.061 2192.719 325 | 0.65 349526 997.443 2122.280 2142.259 326 | 0.65 262144 972.467 1947.694 1925.717 327 | 0.65 279621 965.888 1998.049 1986.421 328 | 0.66 233017 439.267 1827.755 1790.210 329 | 0.66 322639 987.784 2089.796 2098.361 330 | 0.66 220753 907.927 1778.646 1735.593 331 | 0.69 199729 889.975 1693.262 1646.302 332 | 0.70 190651 881.868 1655.618 1608.166 333 | 0.73 174763 868.159 1587.597 1536.384 334 | 0.74 167773 861.239 1555.640 1503.119 335 | 0.75 155345 847.666 1493.902 1437.697 336 | 0.76 149797 837.248 1464.475 1408.044 337 | 0.76 139811 828.725 1409.983 1348.255 338 | 339 | ``` 340 | 341 | #### Mode 2: 342 | ``` 343 | python3 ../bench/bencher.py -m 2 -d 1 344 | 345 | GPU hardware: TITAN Xp 346 | =============================================================================================== 347 | Table size experiment: 348 | Table's expected chain length is fixed, and total number of elements is variable 349 | Expected chain length = 0.60 350 | 351 | 1.00 of 262144 queries exist in the data structure 352 | =============================================================================================== 353 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 354 | =============================================================================================== 355 | (262144, 29128, 0.55) 1409.983 2331.910 2694.737 356 | (524288, 58255, 0.55) 1423.829 2392.523 2598.985 357 | (1048576, 116509, 0.55) 1191.867 2560.000 2612.245 358 | (2097152, 233017, 0.55) 1070.482 2375.870 2400.938 359 | (4194304, 466034, 0.55) 1012.616 2275.556 2289.547 360 | (8388608, 932068, 0.55) 992.530 2147.313 2177.692 361 | 362 | ``` 363 | 364 | #### Mode 3: 365 | ``` 366 | python3 ../bench/bencher.py -m 3 -d 1 367 | 368 | GPU hardware: TITAN Xp 369 | 
=============================================================================================== 370 | Concurrent experiment: 371 | variable load factor, fixed number of elements 372 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 373 | =============================================================================================== 374 | batch_size = 262144, init num batches = 3, final num batches = 4 375 | =============================================================================================== 376 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 377 | =============================================================================================== 378 | 0.05 0.05 1048576 968.856 1651.613 379 | 0.14 0.14 349526 1017.219 1706.667 380 | 0.19 0.19 262144 1043.478 1753.425 381 | 0.28 0.28 174763 1097.339 1815.603 382 | 0.33 0.33 149797 1123.064 1855.072 383 | 0.42 0.42 116509 1174.593 1909.112 384 | 0.47 0.47 104858 1149.701 1741.867 385 | 0.55 0.55 87382 1193.010 1753.425 386 | 0.59 0.58 80660 1215.190 1753.425 387 | 0.63 0.62 69906 1238.710 1673.545 388 | ``` 389 | ### Tesla K40c 390 | 391 | Tesla K40c has Kepler architecture with compute capability 3.5 and 12GB of DRAM. In our setting, we have NVIDIA driver 410.72 and CUDA 10.0. 392 | 393 | The following results are for master branch with commit hash cb1734ee02a22aebdecb22c0279c7a15da332ff6. 394 | 395 | #### Mode 0: 396 | ``` 397 | python3 ../bench/bencher.py -m 0 -d 2 398 | 399 | GPU hardware: Tesla K40c 400 | =============================================================================================== 401 | Singleton experiment: 402 | Number of elements to be inserted: 4194304 403 | Number of buckets: 466034 404 | Expected chain length: 0.60 405 | =============================================================================================== 406 | load factor build rate(M/s) search rate(M/s) search rate bulk(M/s) 407 | =============================================================================================== 408 | 0.55 545.779 764.014 831.575 409 | ``` 410 | 411 | #### Mode 1: 412 | ``` 413 | python3 ../bench/bencher.py -m 1 -d 2 414 | 415 | GPU hardware: Tesla K40c 416 | =============================================================================================== 417 | Load factor experiment: 418 | Total number of elements is fixed, load factor (number of buckets) is a variable 419 | Number of elements to be inserted: 4194304 420 | 1.00 of 4194304 queries exist in the data structure 421 | =============================================================================================== 422 | load factor num buckets build rate(M/s) search rate(M/s) search rate bulk(M/s) 423 | =============================================================================================== 424 | 0.06 4194304 427.761 737.781 797.139 425 | 0.19 1398102 539.284 758.641 828.134 426 | 0.25 1048576 548.825 769.378 841.300 427 | 0.37 699051 551.950 769.572 841.694 428 | 0.44 599187 551.411 769.604 841.559 429 | 0.55 466034 546.190 764.509 831.907 430 | 0.60 419431 540.693 758.150 819.574 431 | 0.65 349526 521.354 734.935 777.110 432 | 0.65 262144 467.077 660.569 675.041 433 | 0.65 279621 480.977 679.845 701.025 434 | 0.66 233017 443.047 621.548 630.487 435 | 0.66 322639 508.520 719.334 753.231 436 | 0.66 220753 432.415 603.049 610.993 437 | 0.69 199729 414.232 571.586 578.291 438 | 0.70 190651 406.401 557.613 564.020 439 | 0.73 174763 391.686 532.063 538.003 440 | 0.74 167773 384.449 520.422 525.573 441 | 
0.75 155345 371.302 498.036 504.311 442 | 0.76 149797 364.787 487.541 492.959 443 | 0.76 139811 352.283 467.503 472.981 444 | ``` 445 | 446 | #### Mode 2: 447 | ``` 448 | python3 ../bench/bencher.py -m 2 -d 2 449 | 450 | GPU hardware: Tesla K40c 451 | =============================================================================================== 452 | Table size experiment: 453 | Table's expected chain length is fixed, and total number of elements is variable 454 | Expected chain length = 0.60 455 | 456 | 1.00 of 262144 queries exist in the data structure 457 | =============================================================================================== 458 | (num keys, num buckets, load factor) build rate(M/s) search rate(M/s) search rate bulk(M/s) 459 | =============================================================================================== 460 | (262144, 29128, 0.55) 538.062 742.231 823.234 461 | (524288, 58255, 0.55) 547.301 755.789 829.696 462 | (1048576, 116509, 0.55) 550.168 761.621 832.457 463 | (2097152, 233017, 0.55) 547.768 763.422 831.348 464 | (4194304, 466034, 0.55) 546.646 764.558 832.098 465 | (8388608, 932068, 0.55) 544.300 764.801 832.008 466 | ``` 467 | #### Mode 3: 468 | ``` 469 | python3 ../bench/bencher.py -m 3 -d 2 470 | 471 | GPU hardware: Tesla K40c 472 | =============================================================================================== 473 | Concurrent experiment: 474 | variable load factor, fixed number of elements 475 | Operation ratio: (insert, delete, search) = (0.10, 0.10, [0.40, 0.40]) 476 | =============================================================================================== 477 | batch_size = 262144, init num batches = 3, final num batches = 4 478 | =============================================================================================== 479 | init lf final lf num buckets init build rate(M/s) concurrent rate(Mop/s) 480 | =============================================================================================== 481 | 0.05 0.05 1048576 502.381 649.592 482 | 0.14 0.14 349526 507.926 656.305 483 | 0.19 0.19 262144 509.950 660.272 484 | 0.28 0.28 174763 511.659 663.212 485 | 0.33 0.33 149797 512.075 662.354 486 | 0.42 0.42 116509 513.390 664.073 487 | 0.47 0.47 104858 511.723 657.781 488 | 0.55 0.55 87382 509.052 649.026 489 | 0.59 0.58 80660 501.725 639.850 490 | 0.64 0.62 69906 493.702 601.822 491 | 492 | ``` -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(random_gen batched_data_gen.cpp) 2 | cuda_add_executable(benchmark main_benchmarks.cu 3 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 4 | 5 | target_link_libraries(benchmark random_gen) 6 | -------------------------------------------------------------------------------- /bench/batched_data_gen.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. 
See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "batched_data_gen.h" 18 | 19 | BatchedDataGen::BatchedDataGen(uint32_t num_ref, uint32_t batch_size) 20 | : num_insert_(0), 21 | num_delete_(0), 22 | num_search_exist_(0), 23 | num_search_non_exist_(0), 24 | edge_index_(0), 25 | batch_counter_(0) { 26 | num_ref_ = num_ref; 27 | batch_size_ = batch_size; 28 | h_key_ref_ = new uint32_t[num_ref_]; 29 | h_index_ref_ = new uint32_t[num_ref_]; 30 | std::iota(h_index_ref_, h_index_ref_ + num_ref_, 0); 31 | h_batch_buffer_ = new uint32_t[batch_size_]; 32 | temp_buffer_ = new uint32_t[batch_size_]; 33 | } 34 | 35 | BatchedDataGen::~BatchedDataGen() { 36 | if (h_key_ref_) 37 | delete[] h_key_ref_; 38 | if (h_index_ref_) 39 | delete[] h_index_ref_; 40 | if (h_batch_buffer_) 41 | delete[] h_batch_buffer_; 42 | if (temp_buffer_) 43 | delete[] temp_buffer_; 44 | } 45 | 46 | void BatchedDataGen::shuffle(uint32_t* input, uint32_t size) { 47 | std::mt19937 rng(std::time(nullptr)); 48 | for (int i = 0; i < size; i++) { 49 | unsigned int rand1 = rng(); 50 | unsigned int rand2 = (rng() << 15) + rand1; 51 | unsigned int swap = i + (rand2 % (size - i)); 52 | 53 | unsigned int temp = input[i]; 54 | input[i] = input[swap]; 55 | input[swap] = temp; 56 | } 57 | } 58 | 59 | void BatchedDataGen::shuffle_pairs(uint32_t* input, 60 | uint32_t* values, 61 | uint32_t size) { 62 | std::mt19937 rng(std::time(nullptr)); 63 | for (int i = 0; i < size; i++) { 64 | unsigned int rand1 = rng(); 65 | unsigned int rand2 = (rng() << 15) + rand1; 66 | unsigned int swap = i + (rand2 % (size - i)); 67 | 68 | unsigned int temp = input[i]; 69 | input[i] = input[swap]; 70 | input[swap] = temp; 71 | 72 | temp = values[i]; 73 | values[i] = values[swap]; 74 | values[swap] = temp; 75 | } 76 | } 77 | 78 | void BatchedDataGen::generate_random_keys() { 79 | std::iota(h_key_ref_, h_key_ref_ + num_ref_, 0); 80 | std::random_shuffle(h_key_ref_, h_key_ref_ + num_ref_); 81 | } 82 | 83 | void BatchedDataGen::generate_random_keys(int seed, 84 | int num_msb = 0, 85 | bool ensure_uniqueness = false) { 86 | std::mt19937 rng(seed); 87 | std::unordered_set key_dict; 88 | for (int i = 0; i < num_ref_; i++) { 89 | if (!ensure_uniqueness) { 90 | h_key_ref_[i] = 91 | (rng() & (0xFFFFFFFF >> 92 | num_msb)); // except for the most significant two bits 93 | } else { 94 | uint32_t key = rng() & (0xFFFFFFFF >> num_msb); 95 | while (key_dict.find(key) != key_dict.end()) { 96 | key = rng(); 97 | } 98 | key_dict.insert(key); 99 | h_key_ref_[i] = key; 100 | } 101 | } 102 | } 103 | 104 | uint32_t* BatchedDataGen::getSingleBatchPointer( 105 | uint32_t num_keys, 106 | uint32_t num_queries, 107 | uint32_t num_existing) { 108 | assert(num_keys + num_queries <= batch_size_); 109 | assert(batch_size_ <= num_ref_); 110 | assert(num_existing <= num_queries); 111 | std::copy(h_key_ref_, h_key_ref_ + num_keys, h_batch_buffer_); 112 | auto begin_index = (num_keys > num_existing) ? 
(num_keys - num_existing) : 0; 113 | std::copy(h_key_ref_ + begin_index, h_key_ref_ + begin_index + num_queries, 114 | h_batch_buffer_ + num_keys); 115 | std::mt19937 rng(std::time(nullptr)); 116 | std::shuffle(h_batch_buffer_, h_batch_buffer_ + num_keys, rng); 117 | std::shuffle(h_batch_buffer_ + num_keys, h_batch_buffer_ + num_keys + num_queries, rng); 118 | return h_batch_buffer_; 119 | } 120 | 121 | uint32_t BatchedDataGen::get_edge_index() { 122 | return edge_index_; 123 | } 124 | 125 | void BatchedDataGen::set_edge_index(uint32_t new_edge_index) { 126 | if (new_edge_index < num_ref_) 127 | edge_index_ = new_edge_index; 128 | } 129 | 130 | void BatchedDataGen::compute_batch_contents(float a_insert, 131 | float b_delete, 132 | float c_search_exist) { 133 | assert(a_insert + b_delete + c_search_exist <= 1.0f); 134 | num_insert_ = static_cast(a_insert * batch_size_); 135 | num_delete_ = static_cast(b_delete * batch_size_); 136 | num_search_exist_ = static_cast(c_search_exist * batch_size_); 137 | num_search_non_exist_ = 138 | batch_size_ - (num_insert_ + num_delete_ + num_search_exist_); 139 | } 140 | 141 | uint32_t* BatchedDataGen::next_batch(float a_insert, 142 | float b_delete, 143 | float c_search_exist) { 144 | compute_batch_contents(a_insert, b_delete, c_search_exist); 145 | 146 | std::random_shuffle(h_index_ref_, h_index_ref_ + edge_index_); 147 | std::random_shuffle(h_index_ref_ + edge_index_, h_index_ref_ + num_ref_); 148 | 149 | uint32_t output_offset = 0; 150 | 151 | // search queries that actually exist in the data structure 152 | // choosing the first num_search_exist_ from the beginning of the references: 153 | // code 3 for search queries 154 | for (int i = 0; i < num_search_exist_; i++) { 155 | h_batch_buffer_[output_offset + i] = 156 | (0xC0000000 | h_key_ref_[h_index_ref_[i]]); 157 | } 158 | output_offset += num_search_exist_; 159 | 160 | // search queries that do not exist in the data structure 161 | // choose the last num_search_non_exist_ from the end of the references: 162 | // code 3 for search queries 163 | for (int i = 0; i < num_search_non_exist_; i++) { 164 | h_batch_buffer_[output_offset + i] = 165 | (0xC0000000 | h_key_ref_[h_index_ref_[num_ref_ - i - 1]]); 166 | } 167 | output_offset += num_search_non_exist_; 168 | 169 | // inserting new items: 170 | // code 1: 171 | // the first num_isnert_ elements after the edge: 172 | for (int i = 0; i < num_insert_; i++) { 173 | temp_buffer_[i] = h_index_ref_[edge_index_ + i]; 174 | h_batch_buffer_[output_offset + i] = 175 | (0x40000000 | h_key_ref_[temp_buffer_[i]]); 176 | } 177 | output_offset += num_insert_; 178 | 179 | // deleting previously inserted elements: 180 | // code 2: 181 | for (int i = 0; i < num_delete_; i++) { 182 | temp_buffer_[num_insert_ + i] = h_index_ref_[edge_index_ - i - 1]; 183 | h_batch_buffer_[output_offset + i] = 184 | (0x80000000 | h_key_ref_[temp_buffer_[num_insert_ + i]]); 185 | } 186 | 187 | // shuffling the output buffer: 188 | std::random_shuffle(h_batch_buffer_, h_batch_buffer_ + batch_size_); 189 | 190 | // updating the edge index: 191 | std::copy(temp_buffer_, temp_buffer_ + batch_size_, 192 | h_index_ref_ + edge_index_ - num_delete_); 193 | edge_index_ += (num_insert_ - num_delete_); 194 | 195 | batch_counter_++; 196 | return h_batch_buffer_; 197 | } 198 | 199 | void BatchedDataGen::print_batch() { 200 | printf("Batch %d:\n", batch_counter_); 201 | for (int i = 0; i < batch_size_; i++) { 202 | printf("(%d, %d), ", h_batch_buffer_[i] >> 30, 203 | h_batch_buffer_[i] & 0x3FFFFFFF); 204 
| if (i % 10 == 9) 205 | printf("\n"); 206 | } 207 | printf("\n"); 208 | } 209 | 210 | void BatchedDataGen::print_reference() { 211 | printf("Reference keys:"); 212 | for (int i = 0; i < num_ref_; i++) { 213 | printf("%d, ", h_key_ref_[i]); 214 | if (i % 16 == 31) 215 | printf("\n"); 216 | } 217 | printf("\n"); 218 | } -------------------------------------------------------------------------------- /bench/batched_data_gen.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | class BatchedDataGen { 30 | private: 31 | uint32_t* h_key_ref_; 32 | uint32_t* h_index_ref_; 33 | uint32_t num_ref_; 34 | uint32_t edge_index_; 35 | uint32_t* temp_buffer_; 36 | 37 | uint32_t batch_counter_; 38 | uint32_t num_insert_; 39 | uint32_t num_delete_; 40 | uint32_t num_search_exist_; 41 | uint32_t num_search_non_exist_; 42 | 43 | public: 44 | uint32_t batch_size_; 45 | uint32_t* h_batch_buffer_; 46 | 47 | BatchedDataGen(uint32_t num_ref_, uint32_t batch_size); 48 | ~BatchedDataGen(); 49 | void shuffle(uint32_t* input, uint32_t size); 50 | void shuffle_pairs(uint32_t* input, uint32_t* values, uint32_t size); 51 | void generate_random_keys(); 52 | void generate_random_keys(int seed, int num_msb, bool ensure_uniqueness); 53 | uint32_t* getSingleBatchPointer(uint32_t num_keys, 54 | uint32_t num_queries, 55 | uint32_t num_existing); 56 | uint32_t* getKeyRefPointer() { return h_key_ref_; } 57 | uint32_t get_edge_index(); 58 | void set_edge_index(uint32_t new_edge_index); 59 | uint32_t* next_batch(float a_insert, float b_delete, float c_search_exist); 60 | uint32_t getBatchCounter() { return batch_counter_; } 61 | void print_batch(); 62 | void print_reference(); 63 | void compute_batch_contents(float a_insert, 64 | float b_delete, 65 | float c_search_exist); 66 | }; -------------------------------------------------------------------------------- /bench/bencher.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import datetime 3 | import os 4 | import json 5 | import sys 6 | import getopt 7 | 8 | def analyze_singleton_experiment(input_file): 9 | with open(input_file) as json_file: 10 | data = json.load(json_file) 11 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 12 | trials = data["slab_hash"]["trial"] 13 | 14 | for trial in trials: 15 | data_q0 = (trial["load_factor"], trial["build_rate_mps"], trial["search_rate_mps"], trial["search_rate_bulk_mps"]) 16 | 17 | print("===============================================================================================") 18 | print("Singleton experiment:") 19 | print("\tNumber of elements to be inserted: %d" % (trials[0]['num_keys'])) 20 | print("\tNumber of buckets: %d" % 
(trials[0]['num_buckets'])) 21 | print("\tExpected chain length: %.2f" % (trials[0]['exp_chain_length'])) 22 | print("===============================================================================================") 23 | print("load factor\tbuild rate(M/s)\t\tsearch rate(M/s)\tsearch rate bulk(M/s)") 24 | print("===============================================================================================") 25 | print("%.2f\t\t%.3f\t\t%.3f\t\t%.3f" % (data_q0[0], data_q0[1], data_q0[2], data_q0[3])) 26 | 27 | def analyze_load_factor_experiment(input_file): 28 | with open(input_file) as json_file: 29 | data = json.load(json_file) 30 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 31 | trials = data["slab_hash"]["trial"] 32 | 33 | tabular_data = [] 34 | 35 | for trial in trials: 36 | tabular_data.append((trial["load_factor"], 37 | trial["build_rate_mps"], 38 | trial["search_rate_mps"], 39 | trial["search_rate_bulk_mps"], 40 | trial['num_buckets'])) 41 | 42 | tabular_data.sort() 43 | print("===============================================================================================") 44 | print("Load factor experiment:") 45 | print("\tTotal number of elements is fixed, load factor (number of buckets) is a variable") 46 | print("\tNumber of elements to be inserted: %d" % (trials[0]['num_keys'])) 47 | print("\t %.2f of %d queries exist in the data structure" % (trials[0]['query_ratio'], trials[0]['num_queries'])) 48 | print("===============================================================================================") 49 | print("load factor\tnum buckets\tbuild rate(M/s)\t\tsearch rate(M/s)\tsearch rate bulk(M/s)") 50 | print("===============================================================================================") 51 | for pair in tabular_data: 52 | print("%.2f\t\t%d\t\t%.3f\t\t%.3f\t\t%.3f" % (pair[0], pair[4], pair[1], pair[2], pair[3])) 53 | 54 | def analyze_table_size_experiment(input_file): 55 | with open(input_file) as json_file: 56 | data = json.load(json_file) 57 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 58 | trials = data["slab_hash"]["trial"] 59 | 60 | tabular_data = [] 61 | 62 | for trial in trials: 63 | tabular_data.append((trial["num_keys"], 64 | trial['num_buckets'], 65 | trial['load_factor'], 66 | trial["build_rate_mps"], 67 | trial["search_rate_mps"], 68 | trial["search_rate_bulk_mps"])) 69 | 70 | tabular_data.sort() 71 | print("===============================================================================================") 72 | print("Table size experiment:") 73 | print("\tTable's expected chain length is fixed, and total number of elements is variable") 74 | print("\tExpected chain length = %.2f\n" % trials[0]['exp_chain_length']) 75 | print("\t%.2f of %d queries exist in the data structure" % (trials[0]['query_ratio'], trials[0]['num_queries'])) 76 | print("===============================================================================================") 77 | print("(num keys, num buckets, load factor)\tbuild rate(M/s)\t\tsearch rate(M/s)\tsearch rate bulk(M/s)") 78 | print("===============================================================================================") 79 | for pair in tabular_data: 80 | print("(%d, %d, %.2f)\t\t\t%10.3f\t\t%.3f\t\t%.3f" % (pair[0], pair[1], pair[2], pair[3], pair[4], pair[5])) 81 | 82 | def analyze_concurrent_experiment(input_file): 83 | with open(input_file) as json_file: 84 | data = json.load(json_file) 85 | print("GPU hardware: %s" % (data["slab_hash"]['device_name'])) 86 | trials 
= data["slab_hash"]["trial"] 87 | 88 | tabular_data = [] 89 | 90 | for trial in trials: 91 | tabular_data.append((trial["init_load_factor"], 92 | trial['final_load_factor'], 93 | trial['num_buckets'], 94 | trial["initial_rate_mps"], 95 | trial["concurrent_rate_mps"])) 96 | 97 | tabular_data.sort() 98 | print("===============================================================================================") 99 | print("Concurrent experiment:") 100 | print("\tvariable load factor, fixed number of elements") 101 | print("\tOperation ratio: (insert, delete, search) = (%.2f, %.2f, [%.2f, %.2f])" % (trials[0]['insert_ratio'], trials[0]['delete_ratio'], trials[0]['search_exist_ratio'], trials[0]['search_non_exist_ratio'])) 102 | print("===============================================================================================") 103 | print("batch_size = %d, init num batches = %d, final num batches = %d" % (trials[0]['batch_size'], trials[0]['num_init_batches'], trials[0]['num_batches'])) 104 | print("===============================================================================================") 105 | print("init lf\t\tfinal lf\tnum buckets\tinit build rate(M/s)\tconcurrent rate(Mop/s)") 106 | print("===============================================================================================") 107 | for pair in tabular_data: 108 | print("%.2f\t\t%.2f\t\t%d\t\t%.3f\t\t%.3f" % (pair[0], pair[1], pair[2], pair[3], pair[4])) 109 | 110 | def main(argv): 111 | input_file = '' 112 | try: 113 | opts, args = getopt.getopt(argv, "hvi:m:d:", ["help", "verbose", "ifile=", "mode=", "device="]) 114 | except getopt.GetOptError: 115 | print("bencher.py -i -m -d -v") 116 | sys.exit(2) 117 | 118 | for opt, arg in opts: 119 | if opt == '-h': 120 | print("===============================================================================================") 121 | print("-i/--ifile: \t\t Input file (optional)") 122 | print("-m/--mode: \t\t Experiment mode:") 123 | print("\t\t\t\t\t 0: singleton experiment") 124 | print("\t\t\t\t\t 1: load factor experiment") 125 | print("\t\t\t\t\t 2: variable sized table experiment") 126 | print("\t\t\t\t\t 3: concurrent experiment") 127 | print("-v/--verbose") 128 | print("===============================================================================================") 129 | sys.exit() 130 | else: 131 | if opt in ("-i", "--ifile"): 132 | input_file = arg 133 | print("input file: " + input_file) 134 | if opt in ("-m", "--mode"): 135 | mode = int(arg) 136 | if opt in ("-d", "--device"): 137 | device_idx = int(arg) 138 | if opt in ("-v", "--verbose"): 139 | verbose = True 140 | else: 141 | verbose = False 142 | 143 | # if the input file is not given, proper experiments should be run first 144 | if not input_file: 145 | # == creating a folder to store results 146 | out_directory = "../build/bench_result/" 147 | if (not os.path.isdir(out_directory)): 148 | os.mkdir(out_directory) 149 | 150 | # == running benchmark files 151 | bin_file = "../build/bin/benchmark" 152 | if(not os.path.exists(bin_file)): 153 | raise Exception("binary file " + bin_file + " not found!") 154 | 155 | # creating a unique name for the file 156 | cur_time_list = str(datetime.datetime.now()).split() 157 | out_file_name = "out" 158 | for s in cur_time_list: 159 | out_file_name += ("_" + s) 160 | 161 | out_file_dest = out_directory + out_file_name + ".json" 162 | input_file = out_file_dest # input file for the next step 163 | print("intermediate results stored at: " + out_file_dest) 164 | 165 | print("mode = %d" % 
mode) 166 | if mode == 0: 167 | args = (bin_file, "-mode", str(mode), 168 | "-num_key", str(2**22), 169 | "-expected_chain", str(0.6), 170 | "-device", str(device_idx), 171 | "-filename", out_file_dest, 172 | "-verbose", "1" if verbose else "0") 173 | elif mode == 1: 174 | args = (bin_file, 175 | "-mode", str(mode), 176 | "-num_keys", str(2**22), 177 | "-quary_ratio", str(1.0), 178 | "-device", str(device_idx), 179 | "-lf_bulk_step", str(0.1), 180 | "-lf_bulk_num_sample", str(20), 181 | "-filename", out_file_dest, 182 | "-verbose", "1" if verbose else "0") 183 | elif mode == 2: 184 | args = (bin_file, "-mode", str(mode), 185 | "-nStart", str(18), 186 | "-nEnd", str(23), 187 | "-expected_chain", str(0.6), 188 | "-query_ratio", str(1.0), 189 | "-device", str(device_idx), 190 | "-filename", out_file_dest, 191 | "-verbose", "1" if verbose else "0") 192 | elif mode == 3: 193 | args = (bin_file, "-mode", str(mode), 194 | "-nStart", str(18), 195 | "-nEnd", str(21), 196 | "-num_batch", str(4), 197 | "-init_batch", str(3), 198 | "-lf_conc_step", str(0.1), 199 | "-lf_conc_num_sample", str(10), 200 | "-device", str(device_idx), 201 | "-filename", out_file_dest, 202 | "-verbose", "1" if verbose else "0") 203 | 204 | print(" === Started benchmarking ... ") 205 | 206 | popen = subprocess.Popen(args, stdout = subprocess.PIPE) 207 | popen.wait() 208 | 209 | if verbose: 210 | output = popen.stdout.read() 211 | print(output) 212 | print(" === Done!") 213 | elif not os.path.exists(input_file): 214 | raise Exception("Input file " + input_file + " does not exist!") 215 | 216 | # reading the json files: 217 | if mode == 0: 218 | analyze_singleton_experiment(input_file) 219 | elif mode == 1: 220 | analyze_load_factor_experiment(input_file) 221 | elif mode == 2: 222 | analyze_table_size_experiment(input_file) 223 | elif mode == 3: 224 | analyze_concurrent_experiment(input_file) 225 | else: 226 | print("Invalid mode entered") 227 | sys.exit(2) 228 | 229 | if __name__ == "__main__": 230 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /bench/main_benchmarks.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "CommandLine.h" 28 | #include "experiments.cuh" 29 | 30 | int main(int argc, char** argv) { 31 | int mode = 0; // type of experiment 32 | uint32_t num_iter = 1; 33 | bool verbose = false; 34 | int device_idx = 0; 35 | uint32_t num_keys = (1 << 22); 36 | uint32_t n_start = 20; // num_keys = 1 << n_start; 37 | uint32_t n_end = 20; 38 | uint32_t num_queries = num_keys; 39 | float expected_chain = 0.6f; 40 | float existing_ratio = 1.0f; 41 | 42 | // mode 1 parameters: 43 | float lf_bulk_step = 0.1f; 44 | uint32_t lf_bulk_num_sample = 10; 45 | 46 | // mode 3 parameters: 47 | int num_batch = 2; 48 | int init_batch = 1; 49 | float insert_ratio = 0.1f; 50 | float delete_ratio = 0.1f; 51 | float search_exist_ratio = 0.4f; 52 | float lf_conc_step = 0.1f; 53 | int lf_conc_num_sample = 10; 54 | 55 | if (cmdOptionExists(argv, argc + argv, "-mode")) 56 | mode = atoi(getCmdOption(argv, argv + argc, "-mode")); 57 | if (cmdOptionExists(argv, argc + argv, "-num_key")) 58 | num_keys = atoi(getCmdOption(argv, argv + argc, "-num_key")); 59 | if (cmdOptionExists(argv, argc + argv, "-num_query")) 60 | num_queries = atoi(getCmdOption(argv, argv + argc, "-num_query")); 61 | else { 62 | num_queries = num_keys; 63 | } 64 | 65 | if (cmdOptionExists(argv, argc + argv, "-expected_chain")) 66 | expected_chain = atof(getCmdOption(argv, argv + argc, "-expected_chain")); 67 | assert(expected_chain > 0); 68 | if (cmdOptionExists(argv, argc + argv, "-query_ratio")) 69 | existing_ratio = atof(getCmdOption(argv, argv + argc, "-query_ratio")); 70 | if (cmdOptionExists(argv, argc + argv, "-verbose")) { 71 | verbose = (atoi(getCmdOption(argv, argv + argc, "-verbose")) != 0) ? 
true : false; 72 | } 73 | 74 | if (cmdOptionExists(argv, argc + argv, "-device")) 75 | device_idx = atoi(getCmdOption(argv, argv + argc, "-device")); 76 | if (cmdOptionExists(argv, argc + argv, "-iter")) { 77 | num_iter = atoi(getCmdOption(argv, argv + argc, "-iter")); 78 | } 79 | if (cmdOptionExists(argv, argc + argv, "-nStart")) { 80 | n_start = atoi(getCmdOption(argv, argv + argc, "-nStart")); 81 | // for mode 0: 82 | num_keys = (1 << n_start); 83 | num_queries = num_keys; 84 | } 85 | if (cmdOptionExists(argv, argc + argv, "-nEnd")) { 86 | n_end = atoi(getCmdOption(argv, argv + argc, "-nEnd")); 87 | } 88 | if (cmdOptionExists(argv, argc + argv, "-num_batch")) { 89 | num_batch = atoi(getCmdOption(argv, argv + argc, "-num_batch")); 90 | } 91 | if (cmdOptionExists(argv, argc + argv, "-init_batch")) { 92 | init_batch = atoi(getCmdOption(argv, argv + argc, "-init_batch")); 93 | } 94 | if (cmdOptionExists(argv, argc + argv, "-insert_ratio")) 95 | insert_ratio = atof(getCmdOption(argv, argv + argc, "-insert_ratio")); 96 | if (cmdOptionExists(argv, argc + argv, "-delete_ratio")) 97 | delete_ratio = atof(getCmdOption(argv, argv + argc, "-delete_ratio")); 98 | if (cmdOptionExists(argv, argc + argv, "-search_exist_ratio")) 99 | search_exist_ratio = 100 | atof(getCmdOption(argv, argv + argc, "-search_exist_ratio")); 101 | if (cmdOptionExists(argv, argc + argv, "-lf_conc_step")) 102 | lf_conc_step = atof(getCmdOption(argv, argv + argc, "-lf_conc_step")); 103 | if (cmdOptionExists(argv, argc + argv, "-lf_conc_num_sample")) 104 | lf_conc_num_sample = 105 | atoi(getCmdOption(argv, argv + argc, "-lf_conc_num_sample")); 106 | if (cmdOptionExists(argv, argc + argv, "-lf_bulk_step")) 107 | lf_bulk_step = atof(getCmdOption(argv, argv + argc, "-lf_bulk_step")); 108 | if (cmdOptionExists(argv, argc + argv, "-lf_bulk_num_sample")) 109 | lf_bulk_num_sample = 110 | atoi(getCmdOption(argv, argv + argc, "-lf_bulk_num_sample")); 111 | 112 | // input argument for the file to be used for storing the results 113 | std::string filename(""); 114 | if (cmdOptionExists(argv, argc + argv, "-filename")) { 115 | filename.append(getCmdOption(argv, argv + argc, "-filename")); 116 | std::cout << filename << std::endl; 117 | } else { 118 | // setting the filename to be the current time: 119 | filename += "bench/"; 120 | auto time = std::time(nullptr); 121 | auto tm = *std::localtime(&time); 122 | std::ostringstream temp; 123 | temp << std::put_time(&tm, "%d-%m-%Y_%H-%M-%S"); 124 | filename += ("out_" + temp.str() + ".json"); 125 | } 126 | 127 | //========= 128 | int devCount; 129 | cudaGetDeviceCount(&devCount); 130 | cudaDeviceProp devProp; 131 | if (devCount) { 132 | cudaSetDevice(device_idx); // be changed later 133 | cudaGetDeviceProperties(&devProp, device_idx); 134 | } 135 | printf("Device: %s\n", devProp.name); 136 | printf("Experiment mode = %d\n", mode); 137 | 138 | using KeyT = uint32_t; 139 | using ValueT = uint32_t; 140 | 141 | // running the actual experiment 142 | switch (mode) { 143 | case 0: // singleton experiment 144 | singleton_experiment(num_keys, num_queries, expected_chain, 145 | filename, device_idx, existing_ratio, 146 | num_iter, 147 | /*run_cudpp = */ false, verbose); 148 | break; 149 | case 1: // bulk build, num elements fixed, load factor changing 150 | load_factor_bulk_experiment( 151 | num_keys, num_queries, filename, device_idx, existing_ratio, num_iter, 152 | false, lf_bulk_num_sample, lf_bulk_step); 153 | break; 154 | case 2: // bulk build, load factor fixed, num elements changing 155 | 
build_search_bulk_experiment( 156 | 1 << n_start, 1 << n_end, filename, expected_chain, existing_ratio, 157 | device_idx, num_iter, 158 | /* run_cudpp = */ false, 159 | /* verbose = */ verbose); 160 | break; 161 | case 3: // concurrent experiment: 162 | concurrent_batched_op_load_factor_experiment( 163 | /*max_num_keys = */ 1 << n_end, /*batch_size = */ 1 << n_start, 164 | num_batch, init_batch, insert_ratio, delete_ratio, search_exist_ratio, 165 | filename, device_idx, lf_conc_step, lf_conc_num_sample, num_iter, 166 | verbose); 167 | break; 168 | default: 169 | std::cout << "Error: invalid mode." << std::endl; 170 | break; 171 | } 172 | } -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CUHFILES 2 | slab_hash_global.cuh 3 | slab_hash.cuh) 4 | 5 | cuda_add_library(slab_hash STATIC 6 | ${CUHFILES} 7 | ${CUFILES} 8 | OPTIONS ${GENCODE} ${VERBOSE_PTXAS}) 9 | -------------------------------------------------------------------------------- /src/CommandLine.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | inline char* getCmdOption(char** begin, char** end, const std::string& option) { 20 | char** itr = std::find(begin, end, option); 21 | if (itr != end && ++itr != end) { 22 | return *itr; 23 | } 24 | return 0; 25 | } 26 | 27 | inline bool cmdOptionExists(char** begin, char** end, const std::string& option) { 28 | return std::find(begin, end, option) != end; 29 | } -------------------------------------------------------------------------------- /src/concurrent_map/cmap_class.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include 19 | 20 | /* 21 | * This is the main class that will be shallowly copied into the device to be 22 | * used at runtime. 
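A minimal stand-alone sketch (hypothetical, not from the repository) of how the two CommandLine.h helpers above are typically combined; note that the header itself assumes <algorithm> and <string> are already included by its user:

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <string>
#include "CommandLine.h"

int main(int argc, char** argv) {
  // e.g., ./example -mode 2 -filename out.json
  int mode = 0;
  std::string filename("out.json");
  if (cmdOptionExists(argv, argv + argc, "-mode"))
    mode = std::atoi(getCmdOption(argv, argv + argc, "-mode"));
  if (cmdOptionExists(argv, argv + argc, "-filename"))
    filename = getCmdOption(argv, argv + argc, "-filename");
  std::printf("mode = %d, filename = %s\n", mode, filename.c_str());
  return 0;
}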
This class does not own the allocated memory on the gpu 23 | * (i.e., d_table_) 24 | */ 25 | template 26 | class GpuSlabHashContext { 27 | public: 28 | // fixed known parameters: 29 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 30 | static constexpr uint32_t WARP_WIDTH_ = 32; 31 | 32 | #pragma hd_warning_disable 33 | __host__ __device__ GpuSlabHashContext() 34 | : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) {} 35 | 36 | #pragma hd_warning_disable 37 | __host__ __device__ GpuSlabHashContext( 38 | GpuSlabHashContext& rhs) { 39 | num_buckets_ = rhs.getNumBuckets(); 40 | hash_x_ = rhs.getHashX(); 41 | hash_y_ = rhs.getHashY(); 42 | d_table_ = rhs.getDeviceTablePointer(); 43 | global_allocator_ctx_ = rhs.getAllocatorContext(); 44 | } 45 | 46 | #pragma hd_warning_disable 47 | __host__ __device__ ~GpuSlabHashContext() {} 48 | 49 | static size_t getSlabUnitSize() { 50 | return sizeof(typename ConcurrentMapT::SlabTypeT); 51 | } 52 | 53 | static std::string getSlabHashTypeName() { 54 | return ConcurrentMapT::getTypeName(); 55 | } 56 | 57 | __host__ void initParameters(const uint32_t num_buckets, 58 | const uint32_t hash_x, 59 | const uint32_t hash_y, 60 | int8_t* d_table, 61 | AllocatorContextT* allocator_ctx) { 62 | num_buckets_ = num_buckets; 63 | hash_x_ = hash_x; 64 | hash_y_ = hash_y; 65 | d_table_ = 66 | reinterpret_cast::SlabTypeT*>(d_table); 67 | global_allocator_ctx_ = *allocator_ctx; 68 | } 69 | 70 | __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { 71 | return global_allocator_ctx_; 72 | } 73 | 74 | __device__ __host__ __forceinline__ typename ConcurrentMapT::SlabTypeT* 75 | getDeviceTablePointer() { 76 | return d_table_; 77 | } 78 | 79 | __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; } 80 | __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; } 81 | __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; } 82 | 83 | __device__ __host__ __forceinline__ uint32_t computeBucket(const KeyT& key) const { 84 | return (((hash_x_ ^ key) + hash_y_) % PRIME_DIVISOR_) % num_buckets_; 85 | } 86 | 87 | // threads in a warp cooperate with each other to insert key-value pairs 88 | // into the slab hash 89 | __device__ __forceinline__ void insertPair(bool& to_be_inserted, 90 | const uint32_t& laneId, 91 | const KeyT& myKey, 92 | const ValueT& myValue, 93 | const uint32_t bucket_id, 94 | AllocatorContextT& local_allocator_context); 95 | 96 | // threads in a warp cooperate with each other to insert a unique key (and its value) 97 | // into the slab hash 98 | __device__ __forceinline__ bool insertPairUnique( 99 | bool& to_be_inserted, 100 | const uint32_t& laneId, 101 | const KeyT& myKey, 102 | const ValueT& myValue, 103 | const uint32_t bucket_id, 104 | AllocatorContextT& local_allocator_context); 105 | 106 | // threads in a warp cooperate with each other to search for keys 107 | // if found, it returns the corresponding value, else SEARCH_NOT_FOUND 108 | // is returned 109 | __device__ __forceinline__ void searchKey(bool& to_be_searched, 110 | const uint32_t& laneId, 111 | const KeyT& myKey, 112 | ValueT& myValue, 113 | const uint32_t bucket_id); 114 | 115 | // threads in a warp cooperate with each other to search for keys. 
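A quick note on computeBucket() above: hash_x_ and hash_y_ define a simple universal hash h(k) = (((x ^ k) + y) mod p) mod num_buckets, where p = 4294967291 is the largest prime below 2^32. A host-side sketch with arbitrary stand-in constants (the real values are drawn from std::mt19937 in the GpuSlabHash constructor):

#include <cstdint>
#include <cstdio>
int main() {
  const uint32_t p = 4294967291u;   // PRIME_DIVISOR_, largest prime below 2^32
  const uint32_t x = 0x9E3779B1u;   // stand-in for hash_x_ (normally random)
  const uint32_t y = 0x85EBCA77u;   // stand-in for hash_y_
  const uint32_t num_buckets = 1u << 20;
  const uint32_t key = 42u;
  const uint32_t bucket = (((x ^ key) + y) % p) % num_buckets;  // same expression as computeBucket()
  std::printf("key %u -> bucket %u\n", key, bucket);
  return 0;
}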
116 | // the main difference with above function is that it is assumed all 117 | // threads have something to search for 118 | __device__ __forceinline__ void searchKeyBulk(const uint32_t& laneId, 119 | const KeyT& myKey, 120 | ValueT& myValue, 121 | const uint32_t bucket_id); 122 | 123 | // threads in a warp cooperate with each other to count keys 124 | __device__ __forceinline__ void countKey(bool& to_be_searched, 125 | const uint32_t& laneId, 126 | const KeyT& myKey, 127 | uint32_t& myCount, 128 | const uint32_t bucket_id); 129 | 130 | // all threads within a warp cooperate with each other to delete 131 | // keys 132 | __device__ __forceinline__ bool deleteKey(bool& to_be_deleted, 133 | const uint32_t& laneId, 134 | const KeyT& myKey, 135 | const uint32_t bucket_id); 136 | 137 | __device__ __forceinline__ uint32_t* getPointerFromSlab( 138 | const SlabAddressT& slab_address, 139 | const uint32_t laneId) { 140 | return global_allocator_ctx_.getPointerFromSlab(slab_address, laneId); 141 | } 142 | 143 | __device__ __forceinline__ uint32_t* getPointerFromBucket(const uint32_t bucket_id, 144 | const uint32_t laneId) { 145 | return reinterpret_cast(d_table_) + 146 | bucket_id * ConcurrentMapT::BASE_UNIT_SIZE + laneId; 147 | } 148 | 149 | private: 150 | // this function should be operated in a warp-wide fashion 151 | // TODO: add required asserts to make sure this is true in tests/debugs 152 | __device__ __forceinline__ SlabAllocAddressT allocateSlab(const uint32_t& laneId) { 153 | return global_allocator_ctx_.warpAllocate(laneId); 154 | } 155 | 156 | __device__ __forceinline__ SlabAllocAddressT 157 | allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { 158 | return local_allocator_ctx.warpAllocate(laneId); 159 | } 160 | 161 | // a thread-wide function to free the slab that was just allocated 162 | __device__ __forceinline__ void freeSlab(const SlabAllocAddressT slab_ptr) { 163 | global_allocator_ctx_.freeUntouched(slab_ptr); 164 | } 165 | 166 | // === members: 167 | uint32_t num_buckets_; 168 | uint32_t hash_x_; 169 | uint32_t hash_y_; 170 | typename ConcurrentMapT::SlabTypeT* d_table_; 171 | // a copy of dynamic allocator's context to be used on the GPU 172 | AllocatorContextT global_allocator_ctx_; 173 | }; 174 | 175 | /* 176 | * This class owns the allocated memory for the hash table 177 | */ 178 | template 179 | class GpuSlabHash { 180 | private: 181 | // fixed known parameters: 182 | static constexpr uint32_t BLOCKSIZE_ = 128; 183 | static constexpr uint32_t WARP_WIDTH_ = 32; 184 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 185 | 186 | struct hash_function { 187 | uint32_t x; 188 | uint32_t y; 189 | } hf_; 190 | 191 | // total number of buckets (slabs) for this hash table 192 | uint32_t num_buckets_; 193 | 194 | // a raw pointer to the initial allocated memory for all buckets 195 | int8_t* d_table_; 196 | size_t slab_unit_size_; // size of each slab unit in bytes (might differ 197 | // based on the type) 198 | 199 | // slab hash context, contains everything that a GPU application needs to be 200 | // able to use this data structure 201 | GpuSlabHashContext gpu_context_; 202 | 203 | // const pointer to an allocator that all instances of slab hash are going to 204 | // use. 
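As an aside on getPointerFromBucket() above: the base slabs form one contiguous array of 32-word (128-byte) units, so lane i of a probing warp reads word i of its bucket's base slab. A small sketch of the pointer arithmetic, assuming BASE_UNIT_SIZE is 32 words, consistent with the 128-byte slab assertion in the GpuSlabHash constructor:

#include <cstdint>
#include <cstdio>
int main() {
  const uint32_t BASE_UNIT_SIZE = 32;          // assumed: 32 words = 128 bytes per slab
  const uint32_t bucket_id = 7, laneId = 31;   // lane 31 holds the next-slab pointer
  const uint32_t word_index = bucket_id * BASE_UNIT_SIZE + laneId;
  std::printf("bucket %u, lane %u -> word %u (byte offset %u)\n",
              bucket_id, laneId, word_index, word_index * 4u);
  return 0;
}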
The allocator itself is not owned by this class 205 | DynamicAllocatorT* dynamic_allocator_; 206 | uint32_t device_idx_; 207 | 208 | public: 209 | GpuSlabHash(const uint32_t num_buckets, 210 | DynamicAllocatorT* dynamic_allocator, 211 | uint32_t device_idx, 212 | const time_t seed = 0, 213 | const bool identity_hash = false) 214 | : num_buckets_(num_buckets) 215 | , d_table_(nullptr) 216 | , slab_unit_size_(0) 217 | , dynamic_allocator_(dynamic_allocator) 218 | , device_idx_(device_idx) { 219 | assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash."); 220 | assert(sizeof(typename ConcurrentMapT::SlabTypeT) == 221 | (WARP_WIDTH_ * sizeof(uint32_t)) && 222 | "A single slab on a ConcurrentMap should be 128 bytes"); 223 | int32_t devCount = 0; 224 | CHECK_CUDA_ERROR(cudaGetDeviceCount(&devCount)); 225 | assert(device_idx_ < devCount); 226 | 227 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 228 | 229 | slab_unit_size_ = 230 | GpuSlabHashContext::getSlabUnitSize(); 231 | 232 | // allocating initial buckets: 233 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); 234 | 235 | CHECK_CUDA_ERROR(cudaMemset(d_table_, 0xFF, slab_unit_size_ * num_buckets_)); 236 | 237 | // creating a random number generator: 238 | if (!identity_hash) { 239 | std::mt19937 rng(seed ? seed : time(0)); 240 | hf_.x = rng() % PRIME_DIVISOR_; 241 | if (hf_.x < 1) 242 | hf_.x = 1; 243 | hf_.y = rng() % PRIME_DIVISOR_; 244 | } else { 245 | hf_ = {0u, 0u}; 246 | } 247 | 248 | // initializing the gpu_context_: 249 | gpu_context_.initParameters( 250 | num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr()); 251 | } 252 | 253 | ~GpuSlabHash() { 254 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 255 | CHECK_CUDA_ERROR(cudaFree(d_table_)); 256 | } 257 | 258 | // returns some debug information about the slab hash 259 | std::string to_string(); 260 | double computeLoadFactor(int flag); 261 | 262 | void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys); 263 | void buildBulkWithUniqueKeys(KeyT* d_key, ValueT* d_value, uint32_t num_keys); 264 | void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries); 265 | void searchBulk(KeyT* d_query, ValueT* d_result, uint32_t num_queries); 266 | void deleteIndividual(KeyT* d_key, uint32_t num_keys); 267 | void batchedOperation(KeyT* d_key, ValueT* d_result, uint32_t num_ops); 268 | void countIndividual(KeyT* d_query, uint32_t* d_count, uint32_t num_queries); 269 | }; -------------------------------------------------------------------------------- /src/concurrent_map/cmap_implementation.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
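A minimal host-side sketch of driving the GpuSlabHash interface declared above. The template argument list and the default-constructed DynamicAllocatorT are assumptions (the real aliases live in slab_hash.cuh and slab_hash_global.cuh), and the device buffers are taken as given:

#include <cstdint>
#include "slab_hash.cuh"

void example_build_and_search(uint32_t* d_keys, uint32_t* d_values,
                              uint32_t* d_queries, uint32_t* d_results,
                              uint32_t num_keys, uint32_t num_queries) {
  const uint32_t num_buckets = (num_keys + 14) / 15;  // roughly 15 pairs fit in one 128-byte slab
  DynamicAllocatorT allocator;                        // assumed default-constructible
  GpuSlabHash<uint32_t, uint32_t, SlabHashTypeT::ConcurrentMap> hash_table(
      num_buckets, &allocator, /*device_idx =*/0);
  hash_table.buildBulk(d_keys, d_values, num_keys);                // bulk insert
  hash_table.searchIndividual(d_queries, d_results, num_queries);  // warp-cooperative search
}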
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | void GpuSlabHash::buildBulk( 21 | KeyT* d_key, 22 | ValueT* d_value, 23 | uint32_t num_keys) { 24 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 25 | // calling the kernel for bulk build: 26 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 27 | build_table_kernel 28 | <<>>(d_key, d_value, num_keys, gpu_context_); 29 | } 30 | template 31 | void GpuSlabHash::buildBulkWithUniqueKeys( 32 | KeyT* d_key, 33 | ValueT* d_value, 34 | uint32_t num_keys) { 35 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 36 | // calling the kernel for bulk build: 37 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 38 | build_table_with_unique_keys_kernel 39 | <<>>(d_key, d_value, num_keys, gpu_context_); 40 | } 41 | template 42 | void GpuSlabHash::searchIndividual( 43 | KeyT* d_query, 44 | ValueT* d_result, 45 | uint32_t num_queries) { 46 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 47 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 48 | search_table 49 | <<>>(d_query, d_result, num_queries, gpu_context_); 50 | } 51 | 52 | template 53 | void GpuSlabHash::searchBulk( 54 | KeyT* d_query, 55 | ValueT* d_result, 56 | uint32_t num_queries) { 57 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 58 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 59 | search_table_bulk 60 | <<>>(d_query, d_result, num_queries, gpu_context_); 61 | } 62 | 63 | template 64 | void GpuSlabHash::countIndividual( 65 | KeyT* d_query, 66 | uint32_t* d_count, 67 | uint32_t num_queries) { 68 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 69 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 70 | count_key 71 | <<>>(d_query, d_count, num_queries, gpu_context_); 72 | } 73 | 74 | template 75 | void GpuSlabHash::deleteIndividual( 76 | KeyT* d_key, 77 | uint32_t num_keys) { 78 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 79 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 80 | delete_table_keys 81 | <<>>(d_key, num_keys, gpu_context_); 82 | } 83 | 84 | // perform a batch of (a mixture of) updates/searches 85 | template 86 | void GpuSlabHash::batchedOperation( 87 | KeyT* d_key, 88 | ValueT* d_result, 89 | uint32_t num_ops) { 90 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 91 | const uint32_t num_blocks = (num_ops + BLOCKSIZE_ - 1) / BLOCKSIZE_; 92 | batched_operations 93 | <<>>(d_key, d_result, num_ops, gpu_context_); 94 | } 95 | 96 | template 97 | std::string GpuSlabHash::to_string() { 98 | std::string result; 99 | result += " ==== GpuSlabHash: \n"; 100 | result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; 101 | result += "\t SlabHashType: \t\t " + gpu_context_.getSlabHashTypeName() + "\n"; 102 | result += "\t Number of buckets:\t\t " + std::to_string(num_buckets_) + "\n"; 103 | result += "\t d_table_ address: \t\t " + 104 | std::to_string(reinterpret_cast(static_cast(d_table_))) + 105 | "\n"; 106 | result += "\t hash function = \t\t (" + std::to_string(hf_.x) + ", " + 107 | std::to_string(hf_.y) + ")\n"; 108 | return result; 109 | } 110 | 111 | template 112 | double GpuSlabHash::computeLoadFactor( 113 | int flag = 0) { 114 | uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_]; 115 | uint32_t* d_bucket_pairs_count; 116 | CHECK_CUDA_ERROR( 117 | cudaMalloc((void**)&d_bucket_pairs_count, sizeof(uint32_t) * num_buckets_)); 118 | CHECK_CUDA_ERROR(cudaMemset(d_bucket_pairs_count, 0, sizeof(uint32_t) * num_buckets_)); 119 
| 120 | uint32_t* h_bucket_slabs_count = new uint32_t[num_buckets_]; 121 | uint32_t* d_bucket_slabs_count; 122 | CHECK_CUDA_ERROR( 123 | cudaMalloc((void**)&d_bucket_slabs_count, sizeof(uint32_t) * num_buckets_)); 124 | CHECK_CUDA_ERROR(cudaMemset(d_bucket_slabs_count, 0, sizeof(uint32_t) * num_buckets_)); 125 | 126 | //--------------------------------- 127 | // counting the number of inserted elements: 128 | const uint32_t blocksize = 128; 129 | const uint32_t num_blocks = (num_buckets_ * 32 + blocksize - 1) / blocksize; 130 | bucket_count_kernel<<>>( 131 | gpu_context_, d_bucket_pairs_count, d_bucket_slabs_count, num_buckets_); 132 | CHECK_CUDA_ERROR(cudaMemcpy(h_bucket_pairs_count, 133 | d_bucket_pairs_count, 134 | sizeof(uint32_t) * num_buckets_, 135 | cudaMemcpyDeviceToHost)); 136 | CHECK_CUDA_ERROR(cudaMemcpy(h_bucket_slabs_count, 137 | d_bucket_slabs_count, 138 | sizeof(uint32_t) * num_buckets_, 139 | cudaMemcpyDeviceToHost)); 140 | int total_elements_stored = 0; 141 | int total_slabs_used = 0; 142 | for (int i = 0; i < num_buckets_; i++) { 143 | total_elements_stored += h_bucket_pairs_count[i]; 144 | total_slabs_used += h_bucket_slabs_count[i]; 145 | } 146 | if (flag) { 147 | printf("## Total elements stored: %d (%lu bytes).\n", 148 | total_elements_stored, 149 | total_elements_stored * (sizeof(KeyT) + sizeof(ValueT))); 150 | printf("## Total number of slabs used: %d.\n", total_slabs_used); 151 | } 152 | 153 | // computing load factor 154 | double load_factor = double(total_elements_stored * (sizeof(KeyT) + sizeof(ValueT))) / 155 | double(total_slabs_used * WARP_WIDTH_ * sizeof(uint32_t)); 156 | 157 | if (d_bucket_pairs_count) 158 | CHECK_ERROR(cudaFree(d_bucket_pairs_count)); 159 | if (d_bucket_slabs_count) 160 | CHECK_ERROR(cudaFree(d_bucket_slabs_count)); 161 | delete[] h_bucket_pairs_count; 162 | delete[] h_bucket_slabs_count; 163 | 164 | return load_factor; 165 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/build.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
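To make the load-factor formula in computeLoadFactor() above concrete, a worked example with made-up counts: load factor = stored bytes / slab bytes, where each slab occupies WARP_WIDTH_ * sizeof(uint32_t) = 128 bytes:

#include <cstdio>
int main() {
  const double total_pairs = 4194304.0;  // e.g., 2^22 inserted (uint32_t, uint32_t) pairs
  const double total_slabs = 310000.0;   // slabs counted by bucket_count_kernel (hypothetical)
  const double load_factor = (total_pairs * 8.0) / (total_slabs * 128.0);
  std::printf("load factor = %.3f\n", load_factor);  // ~0.846
  return 0;
}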
15 | */ 16 | 17 | #pragma once 18 | /* 19 | * 20 | */ 21 | template 22 | __global__ void build_table_kernel( 23 | KeyT* d_key, 24 | ValueT* d_value, 25 | uint32_t num_keys, 26 | GpuSlabHashContext slab_hash) { 27 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 28 | uint32_t laneId = threadIdx.x & 0x1F; 29 | 30 | if ((tid - laneId) >= num_keys) { 31 | return; 32 | } 33 | 34 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 35 | local_allocator_ctx.initAllocator(tid, laneId); 36 | 37 | KeyT myKey = 0; 38 | ValueT myValue = 0; 39 | uint32_t myBucket = 0; 40 | bool to_insert = false; 41 | 42 | if (tid < num_keys) { 43 | myKey = d_key[tid]; 44 | myValue = d_value[tid]; 45 | myBucket = slab_hash.computeBucket(myKey); 46 | to_insert = true; 47 | } 48 | 49 | slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); 50 | } 51 | 52 | template 53 | __global__ void build_table_with_unique_keys_kernel( 54 | KeyT* d_key, 55 | ValueT* d_value, 56 | uint32_t num_keys, 57 | GpuSlabHashContext slab_hash) { 58 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 59 | uint32_t laneId = threadIdx.x & 0x1F; 60 | 61 | if ((tid - laneId) >= num_keys) { 62 | return; 63 | } 64 | 65 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 66 | local_allocator_ctx.initAllocator(tid, laneId); 67 | 68 | KeyT myKey = 0; 69 | ValueT myValue = 0; 70 | uint32_t myBucket = 0; 71 | bool to_insert = false; 72 | 73 | if (tid < num_keys) { 74 | myKey = d_key[tid]; 75 | myValue = d_value[tid]; 76 | myBucket = slab_hash.computeBucket(myKey); 77 | to_insert = true; 78 | } 79 | 80 | slab_hash.insertPairUnique( 81 | to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); 82 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/concurrent_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
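The (tid - laneId) >= num_keys test in the build kernels above retires whole warps only: a warp stays resident as long as its first lane still maps below num_keys, so tail lanes with no key of their own keep serving the warp-cooperative insertPair(). A small sketch of the arithmetic (sizes are made up):

#include <cstdio>
int main() {
  const unsigned num_keys = 1000;   // 8 blocks of 128 threads are launched, tids 0..1023
  const unsigned tid = 1010;        // a tail thread in the last warp (tids 992..1023)
  const unsigned laneId = tid & 0x1F;                    // 18 (block size is a multiple of 32)
  const bool warp_returns = (tid - laneId) >= num_keys;  // 992 >= 1000 is false: warp stays
  const bool has_own_key = tid < num_keys;               // false: this lane only helps its warp
  std::printf("warp_returns=%d has_own_key=%d\n", warp_returns, has_own_key);
  return 0;
}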
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __global__ void batched_operations( 21 | uint32_t* d_operations, 22 | uint32_t* d_results, 23 | uint32_t num_operations, 24 | GpuSlabHashContext slab_hash) { 25 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 26 | uint32_t laneId = threadIdx.x & 0x1F; 27 | 28 | if ((tid - laneId) >= num_operations) 29 | return; 30 | 31 | // initializing the memory allocator on each warp: 32 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 33 | local_allocator_ctx.initAllocator(tid, laneId); 34 | 35 | uint32_t myOperation = 0; 36 | uint32_t myKey = 0; 37 | uint32_t myValue = 0; 38 | uint32_t myBucket = 0; 39 | 40 | if (tid < num_operations) { 41 | myOperation = d_operations[tid]; 42 | myKey = myOperation & 0x3FFFFFFF; 43 | myBucket = slab_hash.computeBucket(myKey); 44 | myOperation = myOperation >> 30; 45 | // todo: should be changed to a more general case 46 | myValue = myKey; // for the sake of this benchmark 47 | } 48 | 49 | bool to_insert = (myOperation == 1) ? true : false; 50 | bool to_delete = (myOperation == 2) ? true : false; 51 | bool to_search = (myOperation == 3) ? true : false; 52 | 53 | // first insertions: 54 | slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); 55 | 56 | // second deletions: 57 | slab_hash.deleteKey(to_delete, laneId, myKey, myBucket); 58 | 59 | // finally search queries: 60 | slab_hash.searchKey(to_search, laneId, myKey, myValue, myBucket); 61 | 62 | if (myOperation == 3 && myValue != SEARCH_NOT_FOUND) { 63 | d_results[tid] = myValue; 64 | } 65 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/count_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of California, Davis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
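batched_operations() above decodes each element of d_operations as a 2-bit opcode in bits 31:30 and a key in bits 29:0 (1 = insert, 2 = delete, 3 = search). A hypothetical host-side encoder matching that layout (the enum names are illustrative, not from the repository):

#include <cstdint>
enum : uint32_t { OP_INSERT = 1u, OP_DELETE = 2u, OP_SEARCH = 3u };  // illustrative names

// the kernel recovers myKey = op & 0x3FFFFFFF and myOperation = op >> 30, so:
inline uint32_t encode_operation(uint32_t op, uint32_t key) {
  return (op << 30) | (key & 0x3FFFFFFFu);  // keys must fit in 30 bits for this benchmark
}
// e.g., encode_operation(OP_SEARCH, 12345u) == 0xC0003039u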
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __global__ void count_key( 21 | KeyT* d_queries, 22 | uint32_t* d_counts, 23 | uint32_t num_queries, 24 | GpuSlabHashContext slab_hash) { 25 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 26 | uint32_t laneId = threadIdx.x & 0x1F; 27 | 28 | if ((tid - laneId) >= num_queries) { 29 | return; 30 | } 31 | 32 | KeyT myKey = 0; 33 | uint32_t myCount = 0; 34 | uint32_t myBucket = 0; 35 | bool to_count = false; 36 | 37 | if (tid < num_queries) { 38 | myKey = d_queries[tid]; 39 | myBucket = slab_hash.computeBucket(myKey); 40 | to_count = true; 41 | } 42 | 43 | // count the keys: 44 | slab_hash.countKey(to_count, laneId, myKey, myCount, myBucket); 45 | 46 | // writing back the results: 47 | if (tid < num_queries) { 48 | d_counts[tid] = myCount; 49 | } 50 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/delete_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __global__ void delete_table_keys( 21 | KeyT* d_key_deleted, 22 | uint32_t num_keys, 23 | GpuSlabHashContext slab_hash) { 24 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 25 | uint32_t laneId = threadIdx.x & 0x1F; 26 | 27 | if ((tid - laneId) >= num_keys) { 28 | return; 29 | } 30 | 31 | KeyT myKey = 0; 32 | uint32_t myBucket = 0; 33 | bool to_delete = false; 34 | 35 | if (tid < num_keys) { 36 | myKey = d_key_deleted[tid]; 37 | myBucket = slab_hash.computeBucket(myKey); 38 | to_delete = true; 39 | } 40 | 41 | // delete the keys: 42 | slab_hash.deleteKey(to_delete, laneId, myKey, myBucket); 43 | } -------------------------------------------------------------------------------- /src/concurrent_map/device/misc_kernels.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | /* 20 | * This kernel can be used to compute the total number of elements and the total number of 21 | * slabs per bucket. 
The final results per bucket is stored in d_pairs_count_result and 22 | * d_slabs_count_result arrays respectively 23 | */ 24 | template 25 | __global__ void bucket_count_kernel( 26 | GpuSlabHashContext slab_hash, 27 | uint32_t* d_pairs_count_result, 28 | uint32_t* d_slabs_count_result, 29 | uint32_t num_buckets) { 30 | using SlabHashT = ConcurrentMapT; 31 | // global warp ID 32 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 33 | uint32_t wid = tid >> 5; 34 | // assigning a warp per bucket 35 | if (wid >= num_buckets) { 36 | return; 37 | } 38 | 39 | uint32_t laneId = threadIdx.x & 0x1F; 40 | 41 | // initializing the memory allocator on each warp: 42 | slab_hash.getAllocatorContext().initAllocator(tid, laneId); 43 | 44 | uint32_t pairs_count = 0; 45 | uint32_t slabs_count = 1; 46 | 47 | uint32_t src_unit_data = *slab_hash.getPointerFromBucket(wid, laneId); 48 | 49 | pairs_count += __popc(__ballot_sync(0xFFFFFFFF, src_unit_data != EMPTY_KEY) & 50 | SlabHashT::REGULAR_NODE_KEY_MASK); 51 | uint32_t next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 52 | 53 | while (next != SlabHashT::EMPTY_INDEX_POINTER) { 54 | // counting pairs 55 | src_unit_data = *slab_hash.getPointerFromSlab(next, laneId); 56 | pairs_count += __popc(__ballot_sync(0xFFFFFFFF, src_unit_data != EMPTY_KEY) & 57 | SlabHashT::REGULAR_NODE_KEY_MASK); 58 | next = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 59 | // counting slabs 60 | slabs_count++; 61 | } 62 | // writing back the results: 63 | if (laneId == 0) { 64 | d_pairs_count_result[wid] = pairs_count; 65 | d_slabs_count_result[wid] = slabs_count; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/concurrent_map/device/search_kernel.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
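The two per-bucket arrays filled by bucket_count_kernel() above can be reduced on the host into the chain statistics the benchmarks report; a sketch, assuming both arrays were copied back with cudaMemcpy as in computeLoadFactor():

#include <cstdint>
#include <cstdio>

void report_chain_stats(const uint32_t* h_pairs_per_bucket,   // copied back from the GPU
                        const uint32_t* h_slabs_per_bucket,
                        uint32_t num_buckets) {
  uint64_t total_pairs = 0, total_slabs = 0;
  for (uint32_t b = 0; b < num_buckets; b++) {
    total_pairs += h_pairs_per_bucket[b];
    total_slabs += h_slabs_per_bucket[b];
  }
  // every bucket owns at least its base slab, so slabs/bucket is the average chain length:
  std::printf("avg pairs/bucket = %.2f, avg chain length = %.2f\n",
              double(total_pairs) / num_buckets, double(total_slabs) / num_buckets);
}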
15 | */ 16 | 17 | #pragma once 18 | 19 | //=== Individual search kernel: 20 | template 21 | __global__ void search_table( 22 | KeyT* d_queries, 23 | ValueT* d_results, 24 | uint32_t num_queries, 25 | GpuSlabHashContext slab_hash) { 26 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 27 | uint32_t laneId = threadIdx.x & 0x1F; 28 | 29 | if ((tid - laneId) >= num_queries) { 30 | return; 31 | } 32 | 33 | KeyT myQuery = 0; 34 | ValueT myResult = static_cast(SEARCH_NOT_FOUND); 35 | uint32_t myBucket = 0; 36 | bool to_search = false; 37 | if (tid < num_queries) { 38 | myQuery = d_queries[tid]; 39 | myBucket = slab_hash.computeBucket(myQuery); 40 | to_search = true; 41 | } 42 | 43 | slab_hash.searchKey(to_search, laneId, myQuery, myResult, myBucket); 44 | 45 | // writing back the results: 46 | if (tid < num_queries) { 47 | d_results[tid] = myResult; 48 | } 49 | } 50 | 51 | //=== Bulk search kernel: 52 | template 53 | __global__ void search_table_bulk( 54 | KeyT* d_queries, 55 | ValueT* d_results, 56 | uint32_t num_queries, 57 | GpuSlabHashContext slab_hash) { 58 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 59 | uint32_t laneId = threadIdx.x & 0x1F; 60 | 61 | if ((tid - laneId) >= num_queries) { 62 | return; 63 | } 64 | 65 | KeyT myQuery = 0; 66 | ValueT myResult = static_cast(SEARCH_NOT_FOUND); 67 | uint32_t myBucket = 0; 68 | if (tid < num_queries) { 69 | myQuery = d_queries[tid]; 70 | myBucket = slab_hash.computeBucket(myQuery); 71 | } 72 | 73 | slab_hash.searchKeyBulk(laneId, myQuery, myResult, myBucket); 74 | 75 | // writing back the results: 76 | if (tid < num_queries) { 77 | d_results[tid] = myResult; 78 | } 79 | } -------------------------------------------------------------------------------- /src/concurrent_map/warp/count.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of California, Davis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | //================================================ 20 | // Individual Count Unit: 21 | //================================================ 22 | template 23 | __device__ __forceinline__ void 24 | GpuSlabHashContext::countKey( 25 | bool& to_be_searched, 26 | const uint32_t& laneId, 27 | const KeyT& myKey, 28 | uint32_t& myCount, 29 | const uint32_t bucket_id) { 30 | using SlabHashT = ConcurrentMapT; 31 | uint32_t work_queue = 0; 32 | uint32_t last_work_queue = work_queue; 33 | uint32_t next = SlabHashT::A_INDEX_POINTER; 34 | 35 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_searched))) { 36 | next = (last_work_queue != work_queue) ? 
SlabHashT::A_INDEX_POINTER 37 | : next; 38 | uint32_t src_lane = __ffs(work_queue) - 1; 39 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 40 | uint32_t wanted_key = __shfl_sync(0xFFFFFFFF, 41 | *reinterpret_cast( 42 | reinterpret_cast(&myKey)), 43 | src_lane, 44 | 32); 45 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 46 | ? *(getPointerFromBucket(src_bucket, laneId)) 47 | : *(getPointerFromSlab(next, laneId)); 48 | const int wanted_key_count = __popc(__ballot_sync(0xFFFFFFFF, src_unit_data == wanted_key) & 49 | SlabHashT::REGULAR_NODE_KEY_MASK); 50 | 51 | if(laneId == src_lane) //count 52 | myCount += wanted_key_count; 53 | 54 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); //iterate 55 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER){ 56 | if(laneId == src_lane){ 57 | to_be_searched = false; 58 | } 59 | } 60 | else{ 61 | next = next_ptr; 62 | } 63 | 64 | last_work_queue = work_queue; 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /src/concurrent_map/warp/delete.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __device__ __forceinline__ bool 21 | GpuSlabHashContext::deleteKey( 22 | bool& to_be_deleted, 23 | const uint32_t& laneId, 24 | const KeyT& myKey, 25 | const uint32_t bucket_id) { 26 | // delete the first instance of key 27 | 28 | using SlabHashT = ConcurrentMapT; 29 | uint32_t work_queue = 0; 30 | uint32_t last_work_queue = 0; 31 | uint32_t next = SlabHashT::A_INDEX_POINTER; 32 | bool successful_deletion = false; 33 | 34 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_deleted))) { 35 | // to know whether it is a base node, or a regular node 36 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 37 | : next; // a successfull insertion in the warp 38 | uint32_t src_lane = __ffs(work_queue) - 1; 39 | uint32_t src_key = __shfl_sync(0xFFFFFFFF, 40 | *reinterpret_cast( 41 | reinterpret_cast(&myKey)), 42 | src_lane, 43 | 32); 44 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 45 | // starting with a base node OR regular node: 46 | // need to define different masks to extract super block index, memory block 47 | // index, and the memory unit index 48 | 49 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 50 | ? 
*(getPointerFromBucket(src_bucket, laneId)) 51 | : *(getPointerFromSlab(next, laneId)); 52 | 53 | // looking for the item to be deleted: 54 | uint32_t isFound = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) & 55 | SlabHashT::REGULAR_NODE_KEY_MASK; 56 | 57 | if (isFound == 0) { // no matching slot found: 58 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 59 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 60 | // not found: 61 | if (laneId == src_lane) 62 | to_be_deleted = false; 63 | } else { 64 | next = next_ptr; 65 | } 66 | } else { // The wanted key found: 67 | int dest_lane = __ffs(isFound & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 68 | if (laneId == src_lane) { 69 | uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 70 | ? getPointerFromBucket(src_bucket, dest_lane) 71 | : getPointerFromSlab(next, dest_lane); 72 | 73 | uint64_t old_pair = atomicExch((unsigned long long int*)p, EMPTY_PAIR_64); 74 | uint32_t deleted_key = static_cast(old_pair); 75 | successful_deletion = deleted_key == src_key; 76 | to_be_deleted = false; 77 | } 78 | } 79 | last_work_queue = work_queue; 80 | } 81 | return successful_deletion; 82 | } -------------------------------------------------------------------------------- /src/concurrent_map/warp/insert.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | /* 20 | * each thread inserts a key-value pair into the hash table 21 | * it is assumed all threads within a warp are present and collaborating with 22 | * each other with a warp-cooperative work sharing (WCWS) strategy. 23 | */ 24 | template 25 | __device__ __forceinline__ void 26 | GpuSlabHashContext::insertPair( 27 | bool& to_be_inserted, 28 | const uint32_t& laneId, 29 | const KeyT& myKey, 30 | const ValueT& myValue, 31 | const uint32_t bucket_id, 32 | AllocatorContextT& local_allocator_ctx) { 33 | using SlabHashT = ConcurrentMapT; 34 | uint32_t work_queue = 0; 35 | uint32_t last_work_queue = 0; 36 | uint32_t next = SlabHashT::A_INDEX_POINTER; 37 | 38 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_inserted))) { 39 | // to know whether it is a base node, or a regular node 40 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 41 | : next; // a successfull insertion in the warp 42 | uint32_t src_lane = __ffs(work_queue) - 1; 43 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 44 | 45 | uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 46 | ? 
*(getPointerFromBucket(src_bucket, laneId)) 47 | : *(getPointerFromSlab(next, laneId)); 48 | uint64_t old_key_value_pair = 0; 49 | 50 | uint32_t isEmpty = (__ballot_sync(0xFFFFFFFF, src_unit_data == EMPTY_KEY)) & 51 | SlabHashT::REGULAR_NODE_KEY_MASK; 52 | if (isEmpty == 0) { // no empty slot available: 53 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 54 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 55 | // allocate a new node: 56 | uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); 57 | 58 | // TODO: experiment if it's better to use lane 0 instead 59 | if (laneId == 31) { 60 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 61 | ? getPointerFromBucket(src_bucket, 31) 62 | : getPointerFromSlab(next, 31); 63 | 64 | uint32_t temp = 65 | atomicCAS((unsigned int*)p, SlabHashT::EMPTY_INDEX_POINTER, new_node_ptr); 66 | // check whether it was successful, and 67 | // free the allocated memory otherwise 68 | if (temp != SlabHashT::EMPTY_INDEX_POINTER) { 69 | freeSlab(new_node_ptr); 70 | } 71 | } 72 | } else { 73 | next = next_ptr; 74 | } 75 | } else { // there is an empty slot available 76 | int dest_lane = __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 77 | if (laneId == src_lane) { 78 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 79 | ? getPointerFromBucket(src_bucket, dest_lane) 80 | : getPointerFromSlab(next, dest_lane); 81 | 82 | old_key_value_pair = 83 | atomicCAS((unsigned long long int*)p, 84 | EMPTY_PAIR_64, 85 | ((uint64_t)(*reinterpret_cast( 86 | reinterpret_cast(&myValue))) 87 | << 32) | 88 | *reinterpret_cast( 89 | reinterpret_cast(&myKey))); 90 | if (old_key_value_pair == EMPTY_PAIR_64) 91 | to_be_inserted = false; // succesfful insertion 92 | } 93 | } 94 | last_work_queue = work_queue; 95 | } 96 | } 97 | 98 | /* 99 | * each thread inserts a unique key (and its value) into the hash table 100 | * if the key already exist in the hash table, it only keeps the first instance 101 | * it is assumed all threads within a warp are present and collaborating with 102 | * each other with a warp-cooperative work sharing (WCWS) strategy. 103 | * returns true only if a new key was inserted into the hash table 104 | */ 105 | template 106 | __device__ __forceinline__ bool 107 | GpuSlabHashContext::insertPairUnique( 108 | bool& to_be_inserted, 109 | const uint32_t& laneId, 110 | const KeyT& myKey, 111 | const ValueT& myValue, 112 | const uint32_t bucket_id, 113 | AllocatorContextT& local_allocator_ctx) { 114 | using SlabHashT = ConcurrentMapT; 115 | uint32_t work_queue = 0; 116 | uint32_t last_work_queue = 0; 117 | uint32_t next = SlabHashT::A_INDEX_POINTER; 118 | bool new_insertion = false; 119 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_inserted))) { 120 | // to know whether it is a base node, or a regular node 121 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 122 | : next; // a successful insertion in the warp 123 | uint32_t src_lane = __ffs(work_queue) - 1; 124 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 125 | 126 | uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 127 | ? 
*(getPointerFromBucket(src_bucket, laneId)) 128 | : *(getPointerFromSlab(next, laneId)); 129 | uint64_t old_key_value_pair = 0; 130 | 131 | uint32_t isEmpty = (__ballot_sync(0xFFFFFFFF, src_unit_data == EMPTY_KEY)) & 132 | SlabHashT::REGULAR_NODE_KEY_MASK; 133 | 134 | uint32_t src_key = __shfl_sync(0xFFFFFFFF, myKey, src_lane, 32); 135 | uint32_t isExisting = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) & 136 | SlabHashT::REGULAR_NODE_KEY_MASK; 137 | if (isExisting) { // key exist in the hash table 138 | if (laneId == src_lane) 139 | to_be_inserted = false; 140 | } else { 141 | if (isEmpty == 0) { // no empty slot available: 142 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 143 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 144 | // allocate a new node: 145 | uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); 146 | 147 | if (laneId == 31) { 148 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 149 | ? getPointerFromBucket(src_bucket, 31) 150 | : getPointerFromSlab(next, 31); 151 | 152 | uint32_t temp = 153 | atomicCAS((unsigned int*)p, SlabHashT::EMPTY_INDEX_POINTER, new_node_ptr); 154 | // check whether it was successful, and 155 | // free the allocated memory otherwise 156 | if (temp != SlabHashT::EMPTY_INDEX_POINTER) { 157 | freeSlab(new_node_ptr); 158 | } 159 | } 160 | } else { 161 | next = next_ptr; 162 | } 163 | } else { // there is an empty slot available 164 | int dest_lane = __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 165 | if (laneId == src_lane) { 166 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 167 | ? getPointerFromBucket(src_bucket, dest_lane) 168 | : getPointerFromSlab(next, dest_lane); 169 | 170 | old_key_value_pair = 171 | atomicCAS((unsigned long long int*)p, 172 | EMPTY_PAIR_64, 173 | ((uint64_t)(*reinterpret_cast( 174 | reinterpret_cast(&myValue))) 175 | << 32) | 176 | *reinterpret_cast( 177 | reinterpret_cast(&myKey))); 178 | if (old_key_value_pair == EMPTY_PAIR_64) { 179 | to_be_inserted = false; // successful insertion 180 | new_insertion = true; 181 | } 182 | } 183 | } 184 | } 185 | last_work_queue = work_queue; 186 | } 187 | return new_insertion; 188 | } -------------------------------------------------------------------------------- /src/concurrent_map/warp/search.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
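Both insert paths above commit a pair with a single 64-bit atomicCAS on a packed word: value in the upper 32 bits, key in the lower 32 bits (for 32-bit KeyT/ValueT). A host-side sketch of the packing and unpacking:

#include <cstdint>
#include <cstdio>
int main() {
  const uint32_t key = 255u, value = 2748u;
  const uint64_t packed = (uint64_t(value) << 32) | key;             // what the atomicCAS installs
  std::printf("packed = 0x%016llx\n", (unsigned long long)packed);   // 0x00000abc000000ff
  std::printf("key = %u, value = %u\n",
              uint32_t(packed), uint32_t(packed >> 32));             // key = 255, value = 2748
  return 0;
}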
15 | */ 16 | 17 | #pragma once 18 | 19 | //================================================ 20 | // Individual Search Unit: 21 | //================================================ 22 | template 23 | __device__ __forceinline__ void 24 | GpuSlabHashContext::searchKey( 25 | bool& to_be_searched, 26 | const uint32_t& laneId, 27 | const KeyT& myKey, 28 | ValueT& myValue, 29 | const uint32_t bucket_id) { 30 | using SlabHashT = ConcurrentMapT; 31 | uint32_t work_queue = 0; 32 | uint32_t last_work_queue = work_queue; 33 | uint32_t next = SlabHashT::A_INDEX_POINTER; 34 | 35 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_searched))) { 36 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 37 | : next; // a successfull insertion in the warp 38 | uint32_t src_lane = __ffs(work_queue) - 1; 39 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 40 | uint32_t wanted_key = __shfl_sync(0xFFFFFFFF, 41 | *reinterpret_cast( 42 | reinterpret_cast(&myKey)), 43 | src_lane, 44 | 32); 45 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 46 | ? *(getPointerFromBucket(src_bucket, laneId)) 47 | : *(getPointerFromSlab(next, laneId)); 48 | int found_lane = __ffs(__ballot_sync(0xFFFFFFFF, src_unit_data == wanted_key) & 49 | SlabHashT::REGULAR_NODE_KEY_MASK) - 50 | 1; 51 | if (found_lane < 0) { // not found 52 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 53 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // not found 54 | if (laneId == src_lane) { 55 | myValue = static_cast(SEARCH_NOT_FOUND); 56 | to_be_searched = false; 57 | } 58 | } else { 59 | next = next_ptr; 60 | } 61 | } else { // found the key: 62 | uint32_t found_value = __shfl_sync(0xFFFFFFFF, src_unit_data, found_lane + 1, 32); 63 | if (laneId == src_lane) { 64 | myValue = *reinterpret_cast( 65 | reinterpret_cast(&found_value)); 66 | to_be_searched = false; 67 | } 68 | } 69 | last_work_queue = work_queue; 70 | } 71 | } 72 | 73 | //================================================ 74 | // Bulk Search Unit: 75 | //================================================ 76 | template 77 | __device__ __forceinline__ void 78 | GpuSlabHashContext::searchKeyBulk( 79 | const uint32_t& laneId, 80 | const KeyT& myKey, 81 | ValueT& myValue, 82 | const uint32_t bucket_id) { 83 | using SlabHashT = ConcurrentMapT; 84 | #pragma unroll 85 | for (int src_lane = 0; src_lane < WARP_WIDTH; src_lane++) { 86 | bool is_top_of_list = true; 87 | uint32_t next_ptr = SlabHashT::EMPTY_INDEX_POINTER; 88 | uint32_t found_lane_plus_1 = 0; 89 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 90 | uint32_t wanted_key = __shfl_sync(0xFFFFFFFF, 91 | *reinterpret_cast( 92 | reinterpret_cast(&myKey)), 93 | src_lane, 94 | 32); 95 | 96 | do { 97 | const uint32_t src_unit_data = (is_top_of_list) 98 | ? *(getPointerFromBucket(src_bucket, laneId)) 99 | : *(getPointerFromSlab(next_ptr, laneId)); 100 | 101 | next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 102 | // if found_lane_plus_1 == 0, then the query is not found 103 | found_lane_plus_1 = __ffs(__ballot_sync(0xFFFFFFFF, src_unit_data == wanted_key) & 104 | SlabHashT::REGULAR_NODE_KEY_MASK); 105 | // values are stored at (found_value + 1) 106 | uint32_t found_value = 107 | __shfl_sync(0xFFFFFFFF, src_unit_data, found_lane_plus_1, 32); 108 | // The responsible thread stores the result if it is found correctly 109 | myValue = ((found_lane_plus_1 != 0) && (src_lane == laneId)) 110 | ? 
*reinterpret_cast( 111 | reinterpret_cast(&found_value)) 112 | : myValue; 113 | is_top_of_list = false; 114 | } while ((next_ptr != SlabHashT::EMPTY_INDEX_POINTER) && (found_lane_plus_1 == 0)); 115 | } 116 | } -------------------------------------------------------------------------------- /src/concurrent_set/cset_class.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | /* 20 | * This is the main class that will be shallowly copied into the device to be 21 | * used at runtime. This class does not own the allocated memory on the gpu 22 | * (i.e., d_table_) 23 | */ 24 | template 25 | class GpuSlabHashContext { 26 | public: 27 | // fixed known parameters: 28 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 29 | static constexpr uint32_t WARP_WIDTH_ = 32; 30 | 31 | #pragma hd_warning_disable 32 | __device__ __host__ GpuSlabHashContext() 33 | : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) { 34 | // a single slab on a ConcurrentSet should be 128 bytes 35 | } 36 | 37 | #pragma hd_warning_disable 38 | __host__ __device__ GpuSlabHashContext( 39 | GpuSlabHashContext& rhs) { 40 | num_buckets_ = rhs.getNumBuckets(); 41 | hash_x_ = rhs.getHashX(); 42 | hash_y_ = rhs.getHashY(); 43 | d_table_ = rhs.getDeviceTablePointer(); 44 | global_allocator_ctx_ = rhs.getAllocatorContext(); 45 | } 46 | 47 | #pragma hd_warning_disable 48 | __host__ __device__ ~GpuSlabHashContext() {} 49 | 50 | static size_t getSlabUnitSize() { 51 | return sizeof(typename ConcurrentSetT::SlabTypeT); 52 | } 53 | 54 | static std::string getSlabHashTypeName() { return ConcurrentSetT::getTypeName(); } 55 | 56 | __host__ void initParameters(const uint32_t num_buckets, 57 | const uint32_t hash_x, 58 | const uint32_t hash_y, 59 | int8_t* d_table, 60 | AllocatorContextT* allocator_ctx) { 61 | num_buckets_ = num_buckets; 62 | hash_x_ = hash_x; 63 | hash_y_ = hash_y; 64 | d_table_ = reinterpret_cast::SlabTypeT*>(d_table); 65 | global_allocator_ctx_ = *allocator_ctx; 66 | } 67 | 68 | __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { 69 | return global_allocator_ctx_; 70 | } 71 | 72 | __device__ __host__ __forceinline__ typename ConcurrentSetT::SlabTypeT* 73 | getDeviceTablePointer() { 74 | return d_table_; 75 | } 76 | 77 | __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; } 78 | __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; } 79 | __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; } 80 | 81 | __device__ __host__ __forceinline__ uint32_t computeBucket(const KeyT& key) const { 82 | return (((hash_x_ ^ key) + hash_y_) % PRIME_DIVISOR_) % num_buckets_; 83 | } 84 | 85 | // threads in a warp cooperate with each other to insert keys 86 | // into the slab hash set 87 | __device__ __forceinline__ bool insertKey(bool& 
to_be_inserted, 88 | const uint32_t& laneId, 89 | const KeyT& myKey, 90 | const uint32_t bucket_id, 91 | AllocatorContextT& local_allocator_context); 92 | 93 | // threads in a warp cooeparte with each other to search for keys 94 | // if found, it returns the true, else false 95 | __device__ __forceinline__ bool searchKey(bool& to_be_searched, 96 | const uint32_t& laneId, 97 | const KeyT& myKey, 98 | const uint32_t bucket_id); 99 | 100 | // threads in a warp cooperate with each other to search for keys. 101 | // the main difference with above function is that it is assumed all 102 | // threads have something to search for (no to_be_searched argument) 103 | __device__ __forceinline__ bool searchKeyBulk(const uint32_t& laneId, 104 | const KeyT& myKey, 105 | const uint32_t bucket_id); 106 | 107 | __device__ __forceinline__ uint32_t* getPointerFromSlab( 108 | const SlabAddressT& slab_address, 109 | const uint32_t laneId) { 110 | return global_allocator_ctx_.getPointerFromSlab(slab_address, laneId); 111 | } 112 | 113 | __device__ __forceinline__ uint32_t* getPointerFromBucket(const uint32_t bucket_id, 114 | const uint32_t laneId) { 115 | return reinterpret_cast(d_table_) + 116 | bucket_id * ConcurrentSetT::BASE_UNIT_SIZE + laneId; 117 | } 118 | 119 | private: 120 | // this function should be operated in a warp-wide fashion 121 | // TODO: add required asserts to make sure this is true in tests/debugs 122 | __device__ __forceinline__ SlabAllocAddressT allocateSlab(const uint32_t& laneId) { 123 | return global_allocator_ctx_.warpAllocate(laneId); 124 | } 125 | 126 | __device__ __forceinline__ SlabAllocAddressT 127 | allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { 128 | return local_allocator_ctx.warpAllocate(laneId); 129 | } 130 | 131 | // a thread-wide function to free the slab that was just allocated 132 | __device__ __forceinline__ void freeSlab(const SlabAllocAddressT slab_ptr) { 133 | global_allocator_ctx_.freeUntouched(slab_ptr); 134 | } 135 | 136 | // === members: 137 | uint32_t num_buckets_; 138 | uint32_t hash_x_; 139 | uint32_t hash_y_; 140 | typename ConcurrentSetT::SlabTypeT* d_table_; 141 | // a copy of dynamic allocator's context to be used on the GPU 142 | AllocatorContextT global_allocator_ctx_; 143 | }; 144 | 145 | /* 146 | * This class owns the allocated memory for the hash table 147 | */ 148 | template 149 | class GpuSlabHash { 150 | private: 151 | // fixed known parameters: 152 | static constexpr uint32_t BLOCKSIZE_ = 128; 153 | static constexpr uint32_t WARP_WIDTH_ = 32; 154 | static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; 155 | 156 | struct hash_function { 157 | uint32_t x; 158 | uint32_t y; 159 | } hf_; 160 | 161 | // total number of buckets (slabs) for this hash table 162 | uint32_t num_buckets_; 163 | 164 | // a raw pointer to the initial allocated memory for all buckets 165 | int8_t* d_table_; 166 | size_t slab_unit_size_; // size of each slab unit in bytes (might differ 167 | // based on the type) 168 | 169 | // slab hash context, contains everything that a GPU application needs to be 170 | // able to use this data structure 171 | GpuSlabHashContext gpu_context_; 172 | 173 | // const pointer to an allocator that all instances of slab hash are going to 174 | // use. 
The allocator itself is not owned by this class 175 | DynamicAllocatorT* dynamic_allocator_; 176 | uint32_t device_idx_; 177 | 178 | public: 179 | GpuSlabHash(const uint32_t num_buckets, 180 | DynamicAllocatorT* dynamic_allocator, 181 | uint32_t device_idx, 182 | const time_t seed = 0, 183 | const bool identity_hash = false) 184 | : num_buckets_(num_buckets) 185 | , d_table_(nullptr) 186 | , slab_unit_size_(0) 187 | , dynamic_allocator_(dynamic_allocator) 188 | , device_idx_(device_idx) { 189 | assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash."); 190 | assert(sizeof(typename ConcurrentSetT::SlabTypeT) == 191 | (WARP_WIDTH_ * sizeof(uint32_t)) && 192 | "A single slab on a ConcurrentMap should be 128 bytes"); 193 | int32_t devCount = 0; 194 | CHECK_CUDA_ERROR(cudaGetDeviceCount(&devCount)); 195 | assert(device_idx_ < devCount); 196 | 197 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 198 | 199 | slab_unit_size_ = 200 | GpuSlabHashContext::getSlabUnitSize(); 201 | 202 | // allocating initial buckets: 203 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); 204 | 205 | CHECK_CUDA_ERROR(cudaMemset(d_table_, 0xFF, slab_unit_size_ * num_buckets_)); 206 | 207 | // creating a random number generator: 208 | if (!identity_hash) { 209 | std::mt19937 rng(seed ? seed : time(0)); 210 | hf_.x = rng() % PRIME_DIVISOR_; 211 | if (hf_.x < 1) 212 | hf_.x = 1; 213 | hf_.y = rng() % PRIME_DIVISOR_; 214 | } else { 215 | hf_ = {0u, 0u}; 216 | } 217 | 218 | // initializing the gpu_context_: 219 | gpu_context_.initParameters( 220 | num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr()); 221 | } 222 | 223 | ~GpuSlabHash() { 224 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 225 | CHECK_CUDA_ERROR(cudaFree(d_table_)); 226 | } 227 | 228 | // returns some debug information about the slab hash 229 | std::string to_string(); 230 | double computeLoadFactor(int flag) {} 231 | GpuSlabHashContext& getSlabHashContext() { 232 | return gpu_context_; 233 | } 234 | 235 | void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys); 236 | void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries); 237 | void searchBulk(KeyT* d_query, ValueT* d_result, uint32_t num_queries) {} 238 | void deleteIndividual(KeyT* d_key, uint32_t num_keys) {} 239 | }; -------------------------------------------------------------------------------- /src/concurrent_set/cset_helper_kernels.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
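 *
 * Note on the kernels below: they follow the warp-cooperative convention used
 * throughout SlabHash. A warp only exits as a whole, via
 * "if ((tid - laneId) >= num_keys) return;", so lanes whose tid is past the end
 * stay resident (with to_insert / to_search left false) and can still take part
 * in the ballots and shuffles issued on behalf of their warp-mates. Before any
 * insertion, each thread also clones the allocator context and calls
 * initAllocator(tid, laneId), so the memory allocator is set up on each warp
 * before any slab is requested.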
15 | */ 16 | 17 | #pragma once 18 | namespace cset { 19 | template 20 | __global__ void build_table_kernel( 21 | KeyT* d_key, 22 | uint32_t num_keys, 23 | GpuSlabHashContext slab_hash) { 24 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 25 | uint32_t laneId = threadIdx.x & 0x1F; 26 | 27 | if ((tid - laneId) >= num_keys) { 28 | return; 29 | } 30 | 31 | // initializing the memory allocator on each warp: 32 | AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); 33 | local_allocator_ctx.initAllocator(tid, laneId); 34 | 35 | KeyT myKey = 0; 36 | uint32_t myBucket = 0; 37 | bool to_insert = false; 38 | 39 | if (tid < num_keys) { 40 | myKey = d_key[tid]; 41 | myBucket = slab_hash.computeBucket(myKey); 42 | to_insert = true; 43 | } 44 | 45 | slab_hash.insertKey(to_insert, laneId, myKey, myBucket, local_allocator_ctx); 46 | } 47 | 48 | //=== Individual search kernel: 49 | template 50 | __global__ void search_table( 51 | KeyT* d_queries, 52 | KeyT* d_results, 53 | uint32_t num_queries, 54 | GpuSlabHashContext slab_hash) { 55 | uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; 56 | uint32_t laneId = threadIdx.x & 0x1F; 57 | 58 | if ((tid - laneId) >= num_queries) { 59 | return; 60 | } 61 | 62 | KeyT myQuery = 0; 63 | uint32_t myBucket = 0; 64 | bool to_search = false; 65 | if (tid < num_queries) { 66 | myQuery = d_queries[tid]; 67 | myBucket = slab_hash.computeBucket(myQuery); 68 | to_search = true; 69 | } 70 | 71 | bool myResult = slab_hash.searchKey(to_search, laneId, myQuery, myBucket); 72 | 73 | // writing back the results: 74 | if (tid < num_queries) { 75 | d_results[tid] = myResult ? myQuery : SEARCH_NOT_FOUND; 76 | } 77 | } 78 | }; // namespace cset -------------------------------------------------------------------------------- /src/concurrent_set/cset_implementation.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
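 *
 * Note on the host-side wrappers below: each one selects the device, computes
 * num_blocks = (n + BLOCKSIZE_ - 1) / BLOCKSIZE_, and launches the matching
 * kernel with num_blocks blocks of BLOCKSIZE_ threads, passing the shallow
 * gpu_context_ by value. A minimal host-side usage sketch follows; the exact
 * template arguments were inferred, so treat it as illustrative rather than
 * exact:
 *
 *   DynamicAllocatorT allocator;
 *   GpuSlabHash<uint32_t, uint32_t, SlabHashTypeT::ConcurrentSet>
 *       cset(num_buckets, &allocator, 0, 1);     // device_idx = 0, seed = 1
 *   cset.buildBulk(d_keys, nullptr, num_keys);   // values are unused for a set
 *   cset.searchIndividual(d_queries, d_results, num_queries);
 *   // d_results[i] holds the query key if present, SEARCH_NOT_FOUND otherwise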
15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | void GpuSlabHash::buildBulk( 21 | KeyT* d_key, 22 | ValueT* d_value, 23 | uint32_t num_keys) { 24 | const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; 25 | // calling the kernel for bulk build: 26 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 27 | cset::build_table_kernel 28 | <<>>(d_key, num_keys, gpu_context_); 29 | } 30 | 31 | template 32 | void GpuSlabHash::searchIndividual( 33 | KeyT* d_query, 34 | ValueT* d_result, 35 | uint32_t num_queries) { 36 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 37 | const uint32_t num_blocks = (num_queries + BLOCKSIZE_ - 1) / BLOCKSIZE_; 38 | cset::search_table 39 | <<>>(d_query, d_result, num_queries, gpu_context_); 40 | } 41 | 42 | template 43 | std::string GpuSlabHash::to_string() { 44 | std::string result; 45 | result += " ==== GpuSlabHash: \n"; 46 | result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; 47 | result += "\t SlabHashType: \t\t " + gpu_context_.getSlabHashTypeName() + "\n"; 48 | result += "\t Number of buckets:\t\t " + std::to_string(num_buckets_) + "\n"; 49 | result += "\t d_table_ address: \t\t " + 50 | std::to_string(reinterpret_cast(static_cast(d_table_))) + 51 | "\n"; 52 | result += "\t hash function = \t\t (" + std::to_string(hf_.x) + ", " + 53 | std::to_string(hf_.y) + ")\n"; 54 | return result; 55 | } 56 | -------------------------------------------------------------------------------- /src/concurrent_set/cset_warp_operations.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | template 20 | __device__ __forceinline__ bool 21 | GpuSlabHashContext::insertKey( 22 | bool& to_be_inserted, 23 | const uint32_t& laneId, 24 | const KeyT& myKey, 25 | const uint32_t bucket_id, 26 | AllocatorContextT& local_allocator_ctx) { 27 | using SlabHashT = ConcurrentSetT; 28 | uint32_t work_queue = 0; 29 | uint32_t last_work_queue = 0; 30 | uint32_t next = SlabHashT::A_INDEX_POINTER; 31 | bool new_insertion = false; 32 | 33 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_inserted))) { 34 | // to know whether it is a base node, or a regular node 35 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 36 | : next; // a successfull insertion in the warp 37 | uint32_t src_lane = __ffs(work_queue) - 1; 38 | KeyT src_key = __shfl_sync(0xFFFFFFFF, myKey, src_lane, 32); 39 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 40 | 41 | uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 42 | ? 
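    // (Added note) For the concurrent set, each lane again reads one 32-bit unit of
    // the slab; findKeyOrEmptyPerWarp() returns the first lane holding either the
    // query key or EMPTY_KEY. The elected lane then commits with a 32-bit
    // atomicCAS(p, EMPTY_KEY, key): the insertion counts as new only when the CAS
    // actually observed EMPTY_KEY, finding the slot already holding this same key
    // simply retires the work item, and any other outcome causes a retry.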
*getPointerFromBucket(src_bucket, laneId) 43 | : *getPointerFromSlab(next, laneId); 44 | 45 | uint32_t old_key = 0; 46 | 47 | // looking for the same key (if it exists), or an empty spot: 48 | int32_t dest_lane = SlabHash_NS::findKeyOrEmptyPerWarp>( 49 | src_key, src_unit_data); 50 | 51 | if (dest_lane == -1) { // key not found and/or no empty slot available: 52 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 53 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { 54 | // allocate a new node: 55 | uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); 56 | 57 | if (laneId == 31) { 58 | uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 59 | ? getPointerFromBucket(src_bucket, 31) 60 | : getPointerFromSlab(next, 31); 61 | 62 | uint32_t temp = 63 | atomicCAS((unsigned int*)p, SlabHashT::EMPTY_INDEX_POINTER, new_node_ptr); 64 | // check whether it was successful, and 65 | // free the allocated memory otherwise 66 | if (temp != SlabHashT::EMPTY_INDEX_POINTER) 67 | freeSlab(new_node_ptr); 68 | } 69 | } else { 70 | next = next_ptr; 71 | } 72 | } else { // either the key is found, or there is an empty slot available 73 | if (laneId == src_lane) { 74 | const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) 75 | ? getPointerFromBucket(src_bucket, dest_lane) 76 | : getPointerFromSlab(next, dest_lane); 77 | 78 | old_key = atomicCAS((unsigned int*)p, 79 | EMPTY_KEY, 80 | *reinterpret_cast( 81 | reinterpret_cast(&myKey))); 82 | new_insertion = (old_key == EMPTY_KEY); 83 | if (new_insertion || (old_key == src_key)) { 84 | to_be_inserted = false; // succesful insertion 85 | } 86 | } 87 | } 88 | last_work_queue = work_queue; 89 | } 90 | return new_insertion; 91 | } 92 | 93 | // ======== 94 | template 95 | __device__ __forceinline__ bool 96 | GpuSlabHashContext::searchKey( 97 | bool& to_be_searched, 98 | const uint32_t& laneId, 99 | const KeyT& myKey, 100 | const uint32_t bucket_id) { 101 | bool myResult = false; 102 | using SlabHashT = ConcurrentSetT; 103 | uint32_t work_queue = 0; 104 | uint32_t last_work_queue = work_queue; 105 | uint32_t next = SlabHashT::A_INDEX_POINTER; 106 | 107 | while ((work_queue = __ballot_sync(0xFFFFFFFF, to_be_searched))) { 108 | next = (last_work_queue != work_queue) ? SlabHashT::A_INDEX_POINTER 109 | : next; // a successfull insertion in the warp 110 | uint32_t src_lane = __ffs(work_queue) - 1; 111 | uint32_t src_bucket = __shfl_sync(0xFFFFFFFF, bucket_id, src_lane, 32); 112 | KeyT wanted_key = __shfl_sync(0xFFFFFFFF, myKey, src_lane, 32); 113 | 114 | const uint32_t src_unit_data = (next == SlabHashT::A_INDEX_POINTER) 115 | ? 
*getPointerFromBucket(src_bucket, laneId) 116 | : *getPointerFromSlab(next, laneId); 117 | 118 | int32_t found_lane = SlabHash_NS::findKeyPerWarp>( 119 | wanted_key, src_unit_data); 120 | 121 | if (found_lane < 0) { // not found 122 | uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); 123 | if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // not found 124 | if (laneId == src_lane) { 125 | to_be_searched = false; 126 | } 127 | } else { 128 | next = next_ptr; 129 | } 130 | } else { // found the key: 131 | if (laneId == src_lane) { 132 | to_be_searched = false; 133 | myResult = true; 134 | } 135 | } 136 | last_work_queue = work_queue; 137 | } 138 | return myResult; 139 | } 140 | 141 | template 142 | __device__ __forceinline__ bool 143 | GpuSlabHashContext::searchKeyBulk( 144 | const uint32_t& laneId, 145 | const KeyT& myKey, 146 | const uint32_t bucket_id) {} -------------------------------------------------------------------------------- /src/gpu_hash_table.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "slab_hash.cuh" 19 | 20 | /* 21 | * This class acts as a helper class to simplify simulations around different 22 | * kinds of slab hash implementations 23 | */ 24 | template 25 | class gpu_hash_table { 26 | private: 27 | uint32_t max_keys_; 28 | uint32_t num_buckets_; 29 | int64_t seed_; 30 | bool req_values_; 31 | bool identity_hash_; 32 | 33 | public: 34 | // Slab hash invariant 35 | GpuSlabHash* slab_hash_; 36 | 37 | // the dynamic allocator that is being used for slab hash 38 | DynamicAllocatorT* dynamic_allocator_; 39 | 40 | uint32_t device_idx_; 41 | 42 | // main arrays to hold keys, values, queries, results, etc. 
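  // (Added note) gpu_hash_table owns these device-side staging buffers (sized
  // for max_keys_), the dynamic slab allocator, and the GpuSlabHash instance;
  // each hash_* method copies host data in, times the corresponding slab-hash
  // operation with CUDA events, and returns the elapsed GPU time in
  // milliseconds. A minimal usage sketch (types chosen for illustration):
  //
  //   gpu_hash_table<uint32_t, uint32_t, SlabHashTypeT::ConcurrentMap>
  //       table(max_keys, num_buckets, 0, 1);   // device_idx = 0, seed = 1
  //   float build_ms  = table.hash_build(h_keys.data(), h_vals.data(), n);
  //   float search_ms = table.hash_search(h_queries.data(), h_results.data(), q);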
43 | KeyT* d_key_; 44 | ValueT* d_value_; 45 | KeyT* d_query_; 46 | ValueT* d_result_; 47 | uint32_t* d_count_; 48 | 49 | gpu_hash_table(uint32_t max_keys, 50 | uint32_t num_buckets, 51 | const uint32_t device_idx, 52 | const int64_t seed, 53 | const bool req_values = true, 54 | const bool identity_hash = false, 55 | const bool verbose = false) 56 | : max_keys_(max_keys) 57 | , num_buckets_(num_buckets) 58 | , seed_(seed) 59 | , req_values_(req_values) 60 | , slab_hash_(nullptr) 61 | , identity_hash_(identity_hash) 62 | , dynamic_allocator_(nullptr) 63 | , device_idx_(device_idx) { 64 | int32_t devCount = 0; 65 | CHECK_CUDA_ERROR(cudaGetDeviceCount(&devCount)); 66 | assert(device_idx_ < devCount); 67 | 68 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 69 | 70 | // allocating key, value arrays: 71 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_key_, sizeof(KeyT) * max_keys_)); 72 | if (req_values_) { 73 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_value_, sizeof(ValueT) * max_keys_)); 74 | } 75 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); 76 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); 77 | CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); 78 | 79 | // allocate an initialize the allocator: 80 | dynamic_allocator_ = new DynamicAllocatorT(); 81 | 82 | // slab hash: 83 | slab_hash_ = new GpuSlabHash( 84 | num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_); 85 | if (verbose) { 86 | std::cout << slab_hash_->to_string() << std::endl; 87 | } 88 | } 89 | 90 | ~gpu_hash_table() { 91 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 92 | CHECK_CUDA_ERROR(cudaFree(d_key_)); 93 | if (req_values_) { 94 | CHECK_CUDA_ERROR(cudaFree(d_value_)); 95 | } 96 | CHECK_CUDA_ERROR(cudaFree(d_query_)); 97 | CHECK_CUDA_ERROR(cudaFree(d_result_)); 98 | CHECK_CUDA_ERROR(cudaFree(d_count_)); 99 | 100 | // delete the dynamic allocator: 101 | delete dynamic_allocator_; 102 | 103 | // slab hash: 104 | delete (slab_hash_); 105 | } 106 | 107 | std::string to_string() { return slab_hash_->to_string(); } 108 | float hash_build(KeyT* h_key, ValueT* h_value, uint32_t num_keys) { 109 | // moving key-values to the device: 110 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 111 | CHECK_CUDA_ERROR( 112 | cudaMemcpy(d_key_, h_key, sizeof(KeyT) * num_keys, cudaMemcpyHostToDevice)); 113 | if (req_values_) { 114 | CHECK_CUDA_ERROR(cudaMemcpy( 115 | d_value_, h_value, sizeof(ValueT) * num_keys, cudaMemcpyHostToDevice)); 116 | } 117 | 118 | float temp_time = 0.0f; 119 | 120 | cudaEvent_t start, stop; 121 | cudaEventCreate(&start); 122 | cudaEventCreate(&stop); 123 | 124 | cudaEventRecord(start, 0); 125 | 126 | // calling slab-hash's bulk build procedure: 127 | slab_hash_->buildBulk(d_key_, d_value_, num_keys); 128 | 129 | cudaEventRecord(stop, 0); 130 | cudaEventSynchronize(stop); 131 | cudaEventElapsedTime(&temp_time, start, stop); 132 | 133 | cudaEventDestroy(start); 134 | cudaEventDestroy(stop); 135 | return temp_time; 136 | } 137 | float hash_build_with_unique_keys(KeyT* h_key, ValueT* h_value, uint32_t num_keys) { 138 | // moving key-values to the device: 139 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 140 | CHECK_CUDA_ERROR( 141 | cudaMemcpy(d_key_, h_key, sizeof(KeyT) * num_keys, cudaMemcpyHostToDevice)); 142 | if (req_values_) { 143 | CHECK_CUDA_ERROR(cudaMemcpy( 144 | d_value_, h_value, sizeof(ValueT) * num_keys, cudaMemcpyHostToDevice)); 145 | } 146 | 147 | float temp_time = 0.0f; 148 | 149 | cudaEvent_t start, stop; 150 
| cudaEventCreate(&start); 151 | cudaEventCreate(&stop); 152 | 153 | cudaEventRecord(start, 0); 154 | 155 | // calling slab-hash's bulk build procedure: 156 | slab_hash_->buildBulkWithUniqueKeys(d_key_, d_value_, num_keys); 157 | 158 | cudaEventRecord(stop, 0); 159 | cudaEventSynchronize(stop); 160 | cudaEventElapsedTime(&temp_time, start, stop); 161 | 162 | cudaEventDestroy(start); 163 | cudaEventDestroy(stop); 164 | return temp_time; 165 | } 166 | float hash_search(KeyT* h_query, ValueT* h_result, uint32_t num_queries) { 167 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 168 | CHECK_CUDA_ERROR(cudaMemcpy( 169 | d_query_, h_query, sizeof(KeyT) * num_queries, cudaMemcpyHostToDevice)); 170 | CHECK_CUDA_ERROR(cudaMemset(d_result_, 0xFF, sizeof(ValueT) * num_queries)); 171 | 172 | float temp_time = 0.0f; 173 | 174 | cudaEvent_t start, stop; 175 | cudaEventCreate(&start); 176 | cudaEventCreate(&stop); 177 | cudaEventRecord(start, 0); 178 | 179 | // == calling slab hash's individual search: 180 | slab_hash_->searchIndividual(d_query_, d_result_, num_queries); 181 | //== 182 | 183 | cudaEventRecord(stop, 0); 184 | cudaEventSynchronize(stop); 185 | cudaEventElapsedTime(&temp_time, start, stop); 186 | 187 | cudaEventDestroy(start); 188 | cudaEventDestroy(stop); 189 | 190 | CHECK_CUDA_ERROR(cudaMemcpy( 191 | h_result, d_result_, sizeof(ValueT) * num_queries, cudaMemcpyDeviceToHost)); 192 | cudaDeviceSynchronize(); 193 | return temp_time; 194 | } 195 | float hash_search_bulk(KeyT* h_query, ValueT* h_result, uint32_t num_queries) { 196 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 197 | CHECK_CUDA_ERROR(cudaMemcpy( 198 | d_query_, h_query, sizeof(KeyT) * num_queries, cudaMemcpyHostToDevice)); 199 | CHECK_CUDA_ERROR(cudaMemset(d_result_, 0xFF, sizeof(ValueT) * num_queries)); 200 | 201 | float temp_time = 0.0f; 202 | 203 | cudaEvent_t start, stop; 204 | cudaEventCreate(&start); 205 | cudaEventCreate(&stop); 206 | cudaEventRecord(start, 0); 207 | 208 | //== slab hash's bulk search: 209 | slab_hash_->searchBulk(d_query_, d_result_, num_queries); 210 | //== 211 | 212 | cudaEventRecord(stop, 0); 213 | cudaEventSynchronize(stop); 214 | cudaEventElapsedTime(&temp_time, start, stop); 215 | 216 | cudaEventDestroy(start); 217 | cudaEventDestroy(stop); 218 | 219 | CHECK_CUDA_ERROR(cudaMemcpy( 220 | h_result, d_result_, sizeof(ValueT) * num_queries, cudaMemcpyDeviceToHost)); 221 | cudaDeviceSynchronize(); 222 | return temp_time; 223 | } 224 | 225 | float hash_count(KeyT* h_query, uint32_t* h_count, uint32_t num_queries) { 226 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 227 | CHECK_CUDA_ERROR(cudaMemcpy( 228 | d_query_, h_query, sizeof(KeyT) * num_queries, cudaMemcpyHostToDevice)); 229 | CHECK_CUDA_ERROR(cudaMemset(d_result_, 0x00, sizeof(uint32_t) * num_queries)); 230 | 231 | float temp_time = 0.0f; 232 | 233 | cudaEvent_t start, stop; 234 | cudaEventCreate(&start); 235 | cudaEventCreate(&stop); 236 | cudaEventRecord(start, 0); 237 | 238 | // == calling slab hash's individual count: 239 | slab_hash_->countIndividual(d_query_, d_count_, num_queries); 240 | //== 241 | 242 | cudaEventRecord(stop, 0); 243 | cudaEventSynchronize(stop); 244 | cudaEventElapsedTime(&temp_time, start, stop); 245 | 246 | cudaEventDestroy(start); 247 | cudaEventDestroy(stop); 248 | 249 | CHECK_CUDA_ERROR(cudaMemcpy( 250 | h_count, d_count_, sizeof(uint32_t) * num_queries, cudaMemcpyDeviceToHost)); 251 | cudaDeviceSynchronize(); 252 | return temp_time; 253 | } 254 | 255 | float hash_delete(KeyT* h_key, uint32_t num_keys) { 256 | 
CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 257 | CHECK_CUDA_ERROR( 258 | cudaMemcpy(d_key_, h_key, sizeof(KeyT) * num_keys, cudaMemcpyHostToDevice)); 259 | 260 | float temp_time = 0.0f; 261 | 262 | cudaEvent_t start, stop; 263 | cudaEventCreate(&start); 264 | cudaEventCreate(&stop); 265 | cudaEventRecord(start, 0); 266 | 267 | //=== slab hash's deletion: 268 | slab_hash_->deleteIndividual(d_key_, num_keys); 269 | 270 | cudaEventRecord(stop, 0); 271 | cudaEventSynchronize(stop); 272 | cudaEventElapsedTime(&temp_time, start, stop); 273 | 274 | cudaEventDestroy(start); 275 | cudaEventDestroy(stop); 276 | return temp_time; 277 | } 278 | 279 | float batched_operations(uint32_t* h_batch_op, 280 | uint32_t* h_results, 281 | uint32_t batch_size, 282 | uint32_t batch_id) { 283 | CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); 284 | CHECK_CUDA_ERROR(cudaMemcpy(d_key_ + batch_id * batch_size, 285 | h_batch_op, 286 | sizeof(uint32_t) * batch_size, 287 | cudaMemcpyHostToDevice)); 288 | CHECK_CUDA_ERROR(cudaMemset( 289 | d_result_ + batch_id * batch_size, 0xFF, sizeof(uint32_t) * batch_size)); 290 | 291 | float temp_time = 0.0f; 292 | 293 | cudaEvent_t start, stop; 294 | cudaEventCreate(&start); 295 | cudaEventCreate(&stop); 296 | 297 | cudaEventRecord(start, 0); 298 | slab_hash_->batchedOperation(d_key_ + batch_id * batch_size, d_result_, batch_size); 299 | cudaEventRecord(stop, 0); 300 | cudaEventSynchronize(stop); 301 | cudaEventElapsedTime(&temp_time, start, stop); 302 | 303 | cudaEventDestroy(start); 304 | cudaEventDestroy(stop); 305 | 306 | CHECK_ERROR(cudaMemcpy(h_results + batch_id * batch_size, 307 | d_result_ + batch_id * batch_size, 308 | sizeof(uint32_t) * batch_size, 309 | cudaMemcpyDeviceToHost)); 310 | cudaDeviceSynchronize(); 311 | return temp_time; 312 | } 313 | 314 | float measureLoadFactor(int flag = 0) { return slab_hash_->computeLoadFactor(flag); } 315 | }; -------------------------------------------------------------------------------- /src/slab_hash.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
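 *
 * Note: this is the umbrella header. It pulls in the global definitions, the
 * class declarations, the warp-level member-function implementations, the
 * device kernels, and finally the host-side implementations, in that order, so
 * including this single header (directly or via gpu_hash_table.cuh) is enough
 * to use the slab hash.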
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | // global declarations 25 | #include "slab_hash_global.cuh" 26 | 27 | // global helper methods: 28 | #include "slab_hash_helper_methods.cuh" 29 | 30 | // class declaration: 31 | #include "concurrent_map/cmap_class.cuh" 32 | #include "concurrent_set/cset_class.cuh" 33 | #include "slab_iterator.cuh" 34 | 35 | // warp implementations of member functions: 36 | #include "concurrent_map/warp/delete.cuh" 37 | #include "concurrent_map/warp/insert.cuh" 38 | #include "concurrent_map/warp/search.cuh" 39 | #include "concurrent_map/warp/count.cuh" 40 | 41 | #include "concurrent_set/cset_warp_operations.cuh" 42 | 43 | // helper kernels: 44 | #include "concurrent_map/device/build.cuh" 45 | #include "concurrent_map/device/concurrent_kernel.cuh" 46 | #include "concurrent_map/device/delete_kernel.cuh" 47 | #include "concurrent_map/device/misc_kernels.cuh" 48 | #include "concurrent_map/device/search_kernel.cuh" 49 | #include "concurrent_map/device/count_kernel.cuh" 50 | #include "concurrent_set/cset_helper_kernels.cuh" 51 | 52 | // implementations: 53 | #include "concurrent_map/cmap_implementation.cuh" 54 | #include "concurrent_set/cset_implementation.cuh" -------------------------------------------------------------------------------- /src/slab_hash_global.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "slab_alloc.cuh" 20 | 21 | #define CHECK_CUDA_ERROR(call) \ 22 | do { \ 23 | cudaError_t err = call; \ 24 | if (err != cudaSuccess) { \ 25 | printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 26 | exit(EXIT_FAILURE); \ 27 | } \ 28 | } while (0) 29 | 30 | // internal parameters for slab hash device functions: 31 | static constexpr uint32_t EMPTY_KEY = 0xFFFFFFFF; 32 | static constexpr uint32_t EMPTY_VALUE = 0xFFFFFFFF; 33 | static constexpr uint64_t EMPTY_PAIR_64 = 0xFFFFFFFFFFFFFFFFLL; 34 | static constexpr uint32_t WARP_WIDTH = 32; 35 | static constexpr uint32_t SEARCH_NOT_FOUND = 0xFFFFFFFF; 36 | 37 | // only works with up to 32-bit key/values 38 | template 39 | struct key_value_pair { 40 | KeyT key; 41 | ValueT value; 42 | }; 43 | 44 | template 45 | struct __align__(32) concurrent_slab { 46 | static constexpr uint32_t NUM_ELEMENTS_PER_SLAB = 15u; 47 | key_value_pair data[NUM_ELEMENTS_PER_SLAB]; 48 | uint32_t ptr_index[2]; 49 | }; 50 | 51 | // this slab structure is meant to be used in either concurrent sets, 52 | // or phase-concurrent maps. 53 | // | key 0 | key 1 | key 2 | ... 
| key 30 | next_ptr | 54 | template 55 | struct __align__(32) key_only_slab { 56 | static constexpr uint32_t NUM_ELEMENTS_PER_SLAB = 31u; 57 | KeyT keys[NUM_ELEMENTS_PER_SLAB]; 58 | uint32_t next_ptr_index[1]; 59 | }; 60 | 61 | template 62 | struct __align__(32) phase_concurrent_slab { 63 | static constexpr uint32_t NUM_ELEMENTS_PER_SLAB = 31u; 64 | // main slab (128 bytes), contain keys 65 | key_only_slab keys; 66 | 67 | // value storage: 68 | ValueT values[NUM_ELEMENTS_PER_SLAB]; 69 | }; 70 | 71 | /* 72 | * Different types of slab hash: 73 | * 1. Concurrent map: it assumes that all operations can be performed 74 | * concurrently 75 | * 2. phase-concurrent map: supports concurrent updates, and concurrent 76 | * searches, but not a mixture of both 77 | */ 78 | enum class SlabHashTypeT { ConcurrentMap, ConcurrentSet, PhaseConcurrentMap }; 79 | 80 | template 81 | class ConcurrentMapT { 82 | public: 83 | // fixed parameters for the data structure 84 | static constexpr uint32_t A_INDEX_POINTER = 0xFFFFFFFE; 85 | static constexpr uint32_t EMPTY_INDEX_POINTER = 0xFFFFFFFF; 86 | static constexpr uint32_t BASE_UNIT_SIZE = 32; 87 | static constexpr uint32_t REGULAR_NODE_ADDRESS_MASK = 0x30000000; 88 | static constexpr uint32_t REGULAR_NODE_DATA_MASK = 0x3FFFFFFF; 89 | static constexpr uint32_t REGULAR_NODE_KEY_MASK = 0x15555555; 90 | 91 | using SlabTypeT = concurrent_slab; 92 | 93 | static std::string getTypeName() { return std::string("ConcurrentMap"); } 94 | }; 95 | 96 | template 97 | class ConcurrentSetT { 98 | public: 99 | // fixed parameters for the data structure 100 | static constexpr uint32_t A_INDEX_POINTER = 0xFFFFFFFE; 101 | static constexpr uint32_t EMPTY_INDEX_POINTER = 0xFFFFFFFF; 102 | static constexpr uint32_t BASE_UNIT_SIZE = 32; 103 | static constexpr uint32_t REGULAR_NODE_ADDRESS_MASK = 0x80000000; 104 | static constexpr uint32_t REGULAR_NODE_DATA_MASK = 0x7FFFFFFF; 105 | static constexpr uint32_t REGULAR_NODE_KEY_MASK = 0x7FFFFFFF; 106 | static constexpr uint32_t NEXT_PTR_LANE = 31u; 107 | 108 | using SlabTypeT = key_only_slab; 109 | 110 | static std::string getTypeName() { return std::string("ConcurrentSet"); } 111 | }; 112 | 113 | template 114 | class PhaseConcurrentMapT { 115 | public: 116 | using SlabTypeT = phase_concurrent_slab; 117 | static std::string getTypeName() { return std::string("PhaseConcurrentMap"); } 118 | }; 119 | 120 | // the main class to be specialized for different types of hash tables 121 | template 122 | class GpuSlabHash; 123 | 124 | template 125 | class GpuSlabHashContext; 126 | 127 | // The custom allocator that is being used for this code: 128 | // this might need to be a template paramater itself 129 | namespace slab_alloc_par { 130 | constexpr uint32_t log_num_mem_blocks = 8; 131 | constexpr uint32_t num_super_blocks = 32; 132 | constexpr uint32_t num_replicas = 1; 133 | } // namespace slab_alloc_par 134 | 135 | using DynamicAllocatorT = SlabAllocLight; 138 | 139 | using AllocatorContextT = SlabAllocLightContext; 142 | 143 | using SlabAddressT = uint32_t; 144 | using BucketAddressT = SlabAddressT; -------------------------------------------------------------------------------- /src/slab_hash_helper_methods.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | namespace SlabHash_NS { 20 | /* 21 | * search for a key (and/or an empty spot) in a single slab, returns the laneId 22 | * if found, otherwise returns -1 23 | */ 24 | template 25 | __device__ __forceinline__ int32_t findKeyOrEmptyPerWarp(const KeyT& src_key, 26 | const uint32_t read_data_chunk) { 27 | uint32_t isEmpty = (__ballot_sync( 28 | 0xFFFFFFFF, (read_data_chunk == EMPTY_KEY) || (read_data_chunk == src_key))); 29 | return __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 30 | } 31 | 32 | // search for just the key 33 | template 34 | __device__ __forceinline__ int32_t findKeyPerWarp(const KeyT& src_key, 35 | const uint32_t read_data_chunk) { 36 | uint32_t isEmpty = __ballot_sync(0xFFFFFFFF, (read_data_chunk == src_key)); 37 | return __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 38 | } 39 | 40 | // search for an empty spot 41 | template 42 | __device__ __forceinline__ int32_t findEmptyPerWarp(const uint32_t read_data_chunk) { 43 | uint32_t isEmpty = __ballot_sync(0xFFFFFFFF, (read_data_chunk == EMPTY_KEY)); 44 | return __ffs(isEmpty & SlabHashT::REGULAR_NODE_KEY_MASK) - 1; 45 | } 46 | }; // namespace SlabHash_NS -------------------------------------------------------------------------------- /src/slab_iterator.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 
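 *
 * Note on SlabIterator below: the iterator is meant to be advanced by a single
 * thread. On construction it exposes the whole array of base buckets as one
 * chunk (num_buckets * BASE_UNIT_SIZE units); each subsequent next() walks the
 * per-bucket chains and exposes one 32-unit slab at a time. A minimal
 * single-thread device sketch (names are illustrative):
 *
 *   SlabIterator<uint32_t> it(cset_context);
 *   do {
 *     const uint32_t* chunk = it.getPointer();  // current chunk of key words
 *     uint32_t n = it.getSize();                // number of 32-bit units in it
 *     // scan chunk[0..n), skipping EMPTY_KEY and the next-pointer lanes
 *   } while (it.next());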
15 | */ 16 | 17 | #pragma once 18 | 19 | // a forward iterator for the slab hash data structure: 20 | // currently just specialized for concurrent set 21 | // TODO implement for other types 22 | template 23 | class SlabIterator { 24 | public: 25 | using SlabHashT = ConcurrentSetT; 26 | 27 | GpuSlabHashContext& slab_hash_; 28 | 29 | // current position of the iterator 30 | KeyT* cur_ptr_; 31 | uint32_t cur_size_; // keep track of current level's size (in units of 32 | // sizeof(KeyT)) 33 | uint32_t cur_bucket_; // keeping track of the current bucket 34 | SlabAddressT cur_slab_address_; 35 | // initialize the iterator with the first bucket's pointer address of the slab 36 | // hash 37 | __host__ __device__ 38 | SlabIterator(GpuSlabHashContext& slab_hash) 39 | : slab_hash_(slab_hash) 40 | , cur_ptr_(reinterpret_cast(slab_hash_.getDeviceTablePointer())) 41 | , cur_size_(slab_hash_.getNumBuckets() * SlabHashT::BASE_UNIT_SIZE) 42 | , cur_bucket_(0) 43 | , cur_slab_address_(*slab_hash.getPointerFromBucket(0, SlabHashT::NEXT_PTR_LANE)) {} 44 | 45 | __device__ __forceinline__ KeyT* getPointer() const { return cur_ptr_; } 46 | __device__ __forceinline__ uint32_t getSize() const { return cur_size_; } 47 | 48 | // returns true, if there's a valid next element, else returns false 49 | // this function is being run by only one thread, so it is wrong to assume all 50 | // threads within a warp have access to the caller's iterator state 51 | __device__ __forceinline__ bool next() { 52 | if (cur_bucket_ == slab_hash_.getNumBuckets()) { 53 | return false; 54 | } 55 | 56 | while (cur_slab_address_ == SlabHashT::EMPTY_INDEX_POINTER) { 57 | cur_bucket_++; 58 | if (cur_bucket_ == slab_hash_.getNumBuckets()) { 59 | return false; 60 | } 61 | cur_slab_address_ = 62 | *slab_hash_.getPointerFromBucket(cur_bucket_, SlabHashT::NEXT_PTR_LANE); 63 | } 64 | 65 | cur_ptr_ = slab_hash_.getPointerFromSlab(cur_slab_address_, 0); 66 | cur_slab_address_ = 67 | *slab_hash_.getPointerFromSlab(cur_slab_address_, SlabHashT::NEXT_PTR_LANE); 68 | cur_size_ = SlabHashT::BASE_UNIT_SIZE; 69 | return true; 70 | } 71 | }; -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cuda_add_executable(cmap_test cmap_test.cu 2 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 3 | target_link_libraries(cmap_test gtest) 4 | 5 | cuda_add_executable(test_slab_hash test_slab_hash.cu 6 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 7 | 8 | cuda_add_executable(concurrent_map concurrent_map.cu 9 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 10 | 11 | cuda_add_executable(concurrent_set concurrent_set.cu 12 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 13 | 14 | cuda_add_executable(iterator_test iterator_test.cu 15 | OPTIONS ${GENCODE} ${CUDA_VERBOSE_PTXAS}) 16 | -------------------------------------------------------------------------------- /test/cmap_test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Saman Ashkiani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "CommandLine.h" 27 | #include "gpu_hash_table.cuh" 28 | #include "slab_alloc.cuh" 29 | #include "slab_hash.cuh" 30 | 31 | size_t g_gpu_device_idx{0}; // the gpu device to run tests on 32 | 33 | TEST(ConcurrentMap, Construction) { 34 | gpu_hash_table cmap( 35 | 100, 10, g_gpu_device_idx, /*seed = */ 1); 36 | 37 | std::vector h_key{10, 5, 1}; 38 | std::vector h_value{100, 50, 10}; 39 | 40 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 41 | } 42 | 43 | TEST(BulkBuild, IndividualSearch) { 44 | using KeyT = uint32_t; 45 | using ValueT = uint32_t; 46 | const uint32_t num_keys = 137; 47 | const uint32_t num_buckets = 2; 48 | // creating the data structures: 49 | gpu_hash_table cmap( 50 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 51 | 52 | // creating key-value pairs: 53 | std::vector h_key; 54 | h_key.reserve(num_keys); 55 | std::vector h_value; 56 | h_value.reserve(num_keys); 57 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 58 | h_key.push_back(13 + i_key); 59 | h_value.push_back(1000 + h_key.back()); 60 | } 61 | 62 | // building the slab hash, and the host's data structure: 63 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 64 | 65 | // generating random queries 66 | const auto num_queries = num_keys; 67 | std::random_device rd; 68 | std::mt19937 rng(rd()); 69 | std::vector h_query(h_key); 70 | std::shuffle(h_query.begin(), h_query.end(), rng); 71 | std::vector cmap_results(num_queries); 72 | 73 | // searching for the queries: 74 | cmap.hash_search(h_query.data(), cmap_results.data(), num_queries); 75 | 76 | // validating the results: 77 | std::unordered_map hash_map; 78 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 79 | hash_map.insert(std::make_pair(h_key[i_key], h_value[i_key])); 80 | } 81 | 82 | for (uint32_t i = 0; i < num_queries; i++) { 83 | auto cmap_result = cmap_results[i]; 84 | auto expected_result = hash_map[h_query[i]]; 85 | ASSERT_EQ(expected_result, cmap_result); 86 | } 87 | } 88 | 89 | TEST(BulkBuild, BulkSearch) { 90 | using KeyT = uint32_t; 91 | using ValueT = uint32_t; 92 | const uint32_t num_keys = 137; 93 | const uint32_t num_buckets = 2; 94 | // creating the data structures: 95 | gpu_hash_table cmap( 96 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 97 | 98 | // creating key-value pairs: 99 | std::vector h_key; 100 | h_key.reserve(num_keys); 101 | std::vector h_value; 102 | h_value.reserve(num_keys); 103 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 104 | h_key.push_back(13 + i_key); 105 | h_value.push_back(1000 + h_key.back()); 106 | } 107 | 108 | // building the slab hash, and the host's data structure: 109 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 110 | 111 | // generating random queries 112 | const auto num_queries = num_keys; 113 | std::random_device rd; 114 | std::mt19937 rng(rd()); 115 | std::vector h_query(h_key); 116 | std::shuffle(h_query.begin(), h_query.end(), rng); 117 | 
std::vector cmap_results(num_queries); 118 | 119 | // searching for the queries: 120 | cmap.hash_search_bulk(h_query.data(), cmap_results.data(), num_queries); 121 | 122 | // validating the results: 123 | std::unordered_map hash_map; 124 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 125 | hash_map.insert(std::make_pair(h_key[i_key], h_value[i_key])); 126 | } 127 | 128 | for (uint32_t i = 0; i < num_queries; i++) { 129 | auto cmap_result = cmap_results[i]; 130 | auto expected_result = hash_map[h_query[i]]; 131 | ASSERT_EQ(expected_result, cmap_result); 132 | } 133 | } 134 | 135 | TEST(BulkBuild, IndividualCount) { 136 | using KeyT = uint32_t; 137 | using ValueT = uint32_t; 138 | const uint32_t num_unique = 2014; 139 | const uint32_t num_buckets = 12; 140 | const uint32_t max_count = 32; 141 | 142 | // rng 143 | std::random_device rd; 144 | std::mt19937 rng(rd()); 145 | 146 | // random key counts 147 | uint32_t num_keys = 0; 148 | std::vector h_count; 149 | h_count.reserve(num_unique); 150 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 151 | uint32_t key_count = rng() % max_count; 152 | h_count.push_back(key_count); 153 | num_keys += key_count; 154 | } 155 | 156 | // creating key-value pairs: 157 | std::vector h_key; 158 | h_key.reserve(num_keys); 159 | std::vector h_value; 160 | h_value.reserve(num_keys); 161 | std::vector h_key_unique; 162 | h_key_unique.reserve(num_unique); 163 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 164 | KeyT myKey = 13 + i_key; 165 | ValueT myValue = 1000 + myKey; 166 | h_key_unique.push_back(myKey); 167 | for (uint32_t i_count = 0; i_count < h_count[i_key]; i_count++) { 168 | h_key.push_back(myKey); 169 | h_value.push_back(myValue); 170 | } 171 | } 172 | 173 | // creating the data structures: 174 | gpu_hash_table cmap( 175 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 176 | 177 | // building the slab hash, and the host's data structure: 178 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 179 | 180 | // generating random queries 181 | const auto num_queries = num_unique; 182 | std::vector h_query(h_key_unique); 183 | std::shuffle(h_query.begin(), h_query.end(), rng); 184 | std::vector cmap_results(num_queries); 185 | 186 | // getting count per query: 187 | cmap.hash_count(h_query.data(), cmap_results.data(), num_queries); 188 | 189 | // validating the results: 190 | std::unordered_map count_map; 191 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 192 | count_map.insert(std::make_pair(h_key_unique[i_key], h_count[i_key])); 193 | } 194 | 195 | for (uint32_t i = 0; i < num_queries; i++) { 196 | auto cmap_result = cmap_results[i]; 197 | auto expected_result = count_map[h_query[i]]; 198 | ASSERT_EQ(expected_result, cmap_result); 199 | } 200 | } 201 | 202 | TEST(UniqueBulkBuild, IndividualCount) { 203 | using KeyT = uint32_t; 204 | using ValueT = uint32_t; 205 | const uint32_t num_unique = 2014; 206 | const uint32_t num_buckets = 12; 207 | const uint32_t max_count = 32; 208 | 209 | // rng 210 | std::random_device rd; 211 | std::mt19937 rng(rd()); 212 | 213 | // random key counts 214 | uint32_t num_keys = 0; 215 | std::vector h_count; 216 | h_count.reserve(num_unique); 217 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 218 | uint32_t key_count = rng() % max_count; 219 | h_count.push_back(key_count); 220 | num_keys += key_count; 221 | } 222 | 223 | // creating key-value pairs: 224 | std::vector h_key; 225 | h_key.reserve(num_keys); 226 | std::vector h_value; 227 | h_value.reserve(num_keys); 
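  // (Added note) Keys are deliberately replicated h_count[i] times below; after
  // building with hash_build_with_unique_keys(), the table should keep only the
  // first instance of each key, so the count queried later is expected to be 1
  // whenever a key was inserted at least once, and 0 otherwise.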
228 | std::vector h_key_unique; 229 | h_key_unique.reserve(num_unique); 230 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 231 | KeyT myKey = 13 + i_key; 232 | ValueT myValue = 1000 + myKey; 233 | h_key_unique.push_back(myKey); 234 | for (uint32_t i_count = 0; i_count < h_count[i_key]; i_count++) { 235 | h_key.push_back(myKey); 236 | h_value.push_back(myValue); 237 | } 238 | } 239 | 240 | // creating the data structures: 241 | gpu_hash_table cmap( 242 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 243 | 244 | // building the unique-keys slab hash, and the host's data structure: 245 | cmap.hash_build_with_unique_keys(h_key.data(), h_value.data(), h_key.size()); 246 | 247 | // generating random queries 248 | const auto num_queries = num_unique; 249 | std::vector h_query(h_key_unique); 250 | std::shuffle(h_query.begin(), h_query.end(), rng); 251 | std::vector cmap_results(num_queries); 252 | 253 | // getting count per query: 254 | cmap.hash_count(h_query.data(), cmap_results.data(), num_queries); 255 | 256 | // validating the results: 257 | std::unordered_map count_map; 258 | for (uint32_t i_key = 0; i_key < num_unique; i_key++) { 259 | count_map.insert(std::make_pair(h_key_unique[i_key], h_count[i_key])); 260 | } 261 | 262 | for (uint32_t i = 0; i < num_queries; i++) { 263 | auto cmap_result = cmap_results[i]; 264 | auto expected_result = (count_map[h_query[i]] != 0) ? 1 : 0; 265 | ASSERT_EQ(expected_result, cmap_result); 266 | } 267 | } 268 | 269 | TEST(BulkBuild, IndividualDelete) { 270 | using KeyT = uint32_t; 271 | using ValueT = uint32_t; 272 | const uint32_t num_keys = 137; 273 | const uint32_t num_buckets = 2; 274 | // creating the data structures: 275 | gpu_hash_table cmap( 276 | num_keys, num_buckets, g_gpu_device_idx, /*seed = */ 1); 277 | 278 | // creating key-value pairs: 279 | std::vector h_key; 280 | h_key.reserve(num_keys); 281 | std::vector h_value; 282 | h_value.reserve(num_keys); 283 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 284 | h_key.push_back(13 + i_key); 285 | h_value.push_back(1000 + h_key.back()); 286 | } 287 | 288 | // building the slab hash: 289 | cmap.hash_build(h_key.data(), h_value.data(), h_key.size()); 290 | 291 | // generating random keys to delete: 292 | const auto num_deletion = num_keys; 293 | const auto extend_fact = 4; 294 | std::random_device rd; 295 | std::mt19937 rng(rd()); 296 | std::vector h_deleted_keys; 297 | h_deleted_keys.reserve(num_deletion * extend_fact); 298 | for (uint32_t i_key = 0; i_key < num_deletion * extend_fact; i_key++) { 299 | h_deleted_keys.push_back(13 + i_key); 300 | } 301 | std::shuffle(h_deleted_keys.begin(), h_deleted_keys.end(), rng); 302 | 303 | // delete the keys: 304 | cmap.hash_delete(h_deleted_keys.data(), num_deletion); 305 | 306 | // query all keys: 307 | const auto num_queries = num_keys; 308 | std::vector h_query(h_key); 309 | std::vector cmap_results(num_queries); 310 | 311 | // searching for the queries: 312 | cmap.hash_search_bulk(h_query.data(), cmap_results.data(), num_queries); 313 | 314 | // validating the results: 315 | std::unordered_map hash_map; 316 | for (uint32_t i_key = 0; i_key < num_keys; i_key++) { 317 | hash_map.insert(std::make_pair(h_key[i_key], h_value[i_key])); 318 | } 319 | for (uint32_t i_key = 0; i_key < num_deletion; i_key++) { 320 | hash_map.erase(h_deleted_keys[i_key]); 321 | } 322 | 323 | for (uint32_t i = 0; i < num_queries; i++) { 324 | auto cmap_result = cmap_results[i]; 325 | auto expected_result_it = hash_map.find(h_query[i]); 326 | auto 
326 |     auto expected_result = expected_result_it == hash_map.end()
327 |                                ? SEARCH_NOT_FOUND
328 |                                : expected_result_it->second;
329 |     ASSERT_EQ(expected_result, cmap_result);
330 |   }
331 | }
332 |
333 | int main(int argc, char** argv) {
334 |   if (cmdOptionExists(argv, argc + argv, "-device")) {
335 |     g_gpu_device_idx = atoi(getCmdOption(argv, argv + argc, "-device"));
336 |   }
337 |
338 |   ::testing::InitGoogleTest(&argc, argv);
339 |   return RUN_ALL_TESTS();
340 | }
--------------------------------------------------------------------------------
/test/concurrent_map.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2019 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <algorithm>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdint>
19 | #include <cstdio>
20 | #include <cstdlib>
21 | #include <iostream>
22 | #include <numeric>
23 | #include <random>
24 | #include <vector>
25 | #include "gpu_hash_table.cuh"
26 | //=======================================
27 | #define DEVICE_ID 0
28 |
29 | int main(int argc, char** argv) {
30 |   //=========
31 |   int devCount;
32 |   cudaGetDeviceCount(&devCount);
33 |   cudaDeviceProp devProp;
34 |   if (devCount) {
35 |     cudaSetDevice(DEVICE_ID);  // to be changed later
36 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
37 |   }
38 |   printf("Device: %s\n", devProp.name);
39 |
40 |   //======================================
41 |   // Building my hash table:
42 |   //======================================
43 |   uint32_t num_keys = 1 << 20;
44 |
45 |   float expected_chain = 0.6f;
46 |   uint32_t num_elements_per_unit = 15;
47 |   uint32_t expected_elements_per_bucket =
48 |       expected_chain * num_elements_per_unit;
49 |   uint32_t num_buckets = (num_keys + expected_elements_per_bucket - 1) /
50 |                          expected_elements_per_bucket;
51 |
52 |   // ==== generating key-values and queries on the host:
53 |   float existing_ratio = 1.0f;  // ratio of queries within the table
54 |   uint32_t num_queries = num_keys;
55 |
56 |   using KeyT = uint32_t;
57 |   using ValueT = uint32_t;
58 |   auto num_elements = 2 * num_keys;
59 |
60 |   std::vector<KeyT> h_key(num_elements);
61 |   std::vector<ValueT> h_value(num_elements);
62 |   std::vector<KeyT> h_query(num_queries);
63 |   std::vector<ValueT> h_correct_result(num_queries);
64 |   std::vector<ValueT> h_result(num_queries);
65 |
66 |   // std::iota(h_key.begin(), h_key.end(), 0);
67 |   const auto f = [](const KeyT& key) { return key * 10; };
68 |
69 |   std::random_device rd;
70 |   const int64_t seed = 1;
71 |   std::mt19937 rng(seed);
72 |   std::vector<uint32_t> index(num_elements);
73 |   std::iota(index.begin(), index.end(), 0);
74 |   std::shuffle(index.begin(), index.end(), rng);
75 |
76 |   for (int32_t i = 0; i < index.size(); i++) {
77 |     h_key[i] = index[i];
78 |     h_value[i] = f(h_key[i]);
79 |   }
80 |
81 |   //=== generating random queries with a fixed ratio existing in keys
82 |   uint32_t num_existing = static_cast<uint32_t>(existing_ratio * num_queries);
83 |
84 |   for (int i = 0; i < num_existing; i++) {
85 |     h_query[i] = h_key[num_keys - 1 - i];
86 |     h_correct_result[i] = f(h_query[i]);
87 |   }
88 |
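// (annotation) The remaining queries are taken from h_key[num_keys + i], i.e.
// generated keys that are never inserted (only the first num_keys entries are
// built below), so their expected lookup result is SEARCH_NOT_FOUND.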
89 |   for (int i = 0; i < (num_queries - num_existing); i++) {
90 |     h_query[num_existing + i] = h_key[num_keys + i];
91 |     h_correct_result[num_existing + i] = SEARCH_NOT_FOUND;
92 |   }
93 |   // permuting the queries:
94 |   std::vector<uint32_t> q_index(num_queries);
95 |   std::iota(q_index.begin(), q_index.end(), 0);
96 |   std::shuffle(q_index.begin(), q_index.end(), rng);
97 |   for (int i = 0; i < num_queries; i++) {
98 |     std::swap(h_query[i], h_query[q_index[i]]);
99 |     std::swap(h_correct_result[i], h_correct_result[q_index[i]]);
100 |   }
101 |   gpu_hash_table<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>  // NOTE: template args reconstructed (assumed)
102 |       hash_table(num_keys, num_buckets, DEVICE_ID, seed);
103 |
104 |   float build_time =
105 |       hash_table.hash_build(h_key.data(), h_value.data(), num_keys);
106 |   float search_time =
107 |       hash_table.hash_search(h_query.data(), h_result.data(), num_queries);
108 |   float search_time_bulk =
109 |       hash_table.hash_search_bulk(h_query.data(), h_result.data(), num_queries);
110 |   // // hash_table.print_bucket(0);
111 |   printf("Hash table: \n");
112 |   printf("num_keys = %d, num_buckets = %d\n", num_keys, num_buckets);
113 |   printf("\t2) Hash table built in %.3f ms (%.3f M elements/s)\n", build_time,
114 |          double(num_keys) / build_time / 1000.0);
115 |   printf("\t3) Hash table search (%.2f) in %.3f ms (%.3f M queries/s)\n",
116 |          existing_ratio, search_time,
117 |          double(num_queries) / search_time / 1000.0);
118 |   printf("\t4) Hash table bulk search (%.2f) in %.3f ms (%.3f Mqueries/s)\n",
119 |          existing_ratio, search_time_bulk,
120 |          double(num_queries) / search_time_bulk / 1000.0);
121 |
122 |   double load_factor = hash_table.measureLoadFactor();
123 |
124 |   printf("The load factor is %.2f, number of buckets %d\n", load_factor,
125 |          num_buckets);
126 |
127 |   // ==== validation:
128 |   for (int i = 0; i < num_queries; i++) {
129 |     if (h_correct_result[i] != h_result[i]) {
130 |       printf("### wrong result at index %d: [%d] -> %d, but should be %d\n", i,
131 |              h_query[i], h_result[i], h_correct_result[i]);
132 |       break;
133 |     }
134 |     if (i == (num_queries - 1))
135 |       printf("Validation done successfully\n");
136 |   }
137 |
138 |   // === building cudpp for comparison
139 |   // float load_factor_cudpp = 0.8f;
140 |   // cudpp_hash_table cudpp_hash(h_key, h_value, num_keys, num_queries,
141 |   // load_factor_cudpp, false, false); float cudpp_build_time =
142 |   // cudpp_hash.hash_build(); float cudpp_search_time =
143 |   // cudpp_hash.lookup_hash_table(h_query, num_queries); printf(" CUDPP Hash
144 |   // table: \n"); printf("\t1) Hash table built in %.3f ms (%.3f M
145 |   // elements/s)\n", cudpp_build_time,
146 |   // double(num_keys)/cudpp_build_time/1000.0); printf("\t2) Hash table search
147 |   // (%.2f) in %.3f ms (%.3f M elements/s)\n", existing_ratio,
148 |   // cudpp_search_time, double(num_queries)/cudpp_search_time/1000.0);
149 |   // ===
150 | }
--------------------------------------------------------------------------------
/test/concurrent_set.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2019 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <algorithm>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdint>
19 | #include <cstdio>
20 | #include <cstdlib>
21 | #include <iostream>
22 | #include <numeric>
23 | #include <random>
24 | #include <vector>
25 | #include "gpu_hash_table.cuh"
26 | #include "slab_hash.cuh"
27 | //=======================================
28 | #define DEVICE_ID 0
29 |
30 | int main(int argc, char** argv) {
31 |   //=========
32 |   int devCount;
33 |   cudaGetDeviceCount(&devCount);
34 |   cudaDeviceProp devProp;
35 |   if (devCount) {
36 |     cudaSetDevice(DEVICE_ID);  // to be changed later
37 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
38 |   }
39 |   printf("Device: %s\n", devProp.name);
40 |
41 |   //======================================
42 |   // Building my hash table:
43 |   //======================================
44 |   uint32_t num_keys = 1 << 20;
45 |
46 |   float expected_chain = 0.6f;
47 |   uint32_t num_elements_per_unit = 31;
48 |   uint32_t expected_elements_per_bucket =
49 |       expected_chain * num_elements_per_unit;
50 |   uint32_t num_buckets = (num_keys + expected_elements_per_bucket - 1) /
51 |                          expected_elements_per_bucket;
52 |
53 |   // ==== generating key-values and queries on the host:
54 |   float existing_ratio = 1.0f;  // ratio of queries within the table
55 |   uint32_t num_queries = num_keys;
56 |
57 |   using KeyT = uint32_t;
58 |   auto num_elements = 2 * num_keys;
59 |
60 |   std::vector<KeyT> h_key(num_elements);
61 |   std::vector<KeyT> h_query(num_queries);
62 |   std::vector<KeyT> h_correct_result(num_queries);
63 |   std::vector<KeyT> h_result(num_queries);
64 |
65 |   std::random_device rd;
66 |   const int64_t seed = 1;
67 |   std::mt19937 rng(seed);
68 |   std::vector<uint32_t> index(num_elements);
69 |   std::iota(index.begin(), index.end(), 0);
70 |   std::shuffle(index.begin(), index.end(), rng);
71 |
72 |   for (int32_t i = 0; i < index.size(); i++) {
73 |     h_key[i] = index[i];
74 |   }
75 |
76 |   //=== generating random queries with a fixed ratio existing in keys
77 |   uint32_t num_existing = static_cast<uint32_t>(existing_ratio * num_queries);
78 |
79 |   for (int i = 0; i < num_existing; i++) {
80 |     h_query[i] = h_key[num_keys - 1 - i];
81 |     h_correct_result[i] = h_query[i];
82 |   }
83 |
84 |   for (int i = 0; i < (num_queries - num_existing); i++) {
85 |     h_query[num_existing + i] = h_key[num_keys + i];
86 |     h_correct_result[num_existing + i] = SEARCH_NOT_FOUND;
87 |   }
88 |   // permuting the queries:
89 |   std::vector<uint32_t> q_index(num_queries);
90 |   std::iota(q_index.begin(), q_index.end(), 0);
91 |   std::shuffle(q_index.begin(), q_index.end(), rng);
92 |   for (int i = 0; i < num_queries; i++) {
93 |     std::swap(h_query[i], h_query[q_index[i]]);
94 |     std::swap(h_correct_result[i], h_correct_result[q_index[i]]);
95 |   }
96 |   gpu_hash_table<KeyT, KeyT, SlabHashTypeT::ConcurrentSet>  // NOTE: template args reconstructed (assumed)
97 |       hash_table(num_keys, num_buckets, DEVICE_ID, seed, false);
98 |
99 |   float build_time =
100 |       hash_table.hash_build(h_key.data(), nullptr, num_keys);
101 |   float search_time =
102 |       hash_table.hash_search(h_query.data(), h_result.data(), num_queries);
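// (annotation) In the concurrent-set configuration only keys are stored: the
// build call above passes nullptr for the value array, and a successful search
// is expected to echo the queried key itself (h_correct_result[i] = h_query[i]).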
103 |   // float search_time_bulk =
104 |   //     hash_table.hash_search_bulk(h_query.data(), h_result.data(), num_queries);
105 |   // // // hash_table.print_bucket(0);
106 |   // printf("Hash table: \n");
107 |   // printf("num_keys = %d, num_buckets = %d\n", num_keys, num_buckets);
108 |   // // printf("\t1) Hash table init in %.3f ms\n", init_time);
109 |   // printf("\t2) Hash table built in %.3f ms (%.3f M elements/s)\n", build_time,
110 |   //        double(num_keys) / build_time / 1000.0);
111 |   // printf("\t3) Hash table search (%.2f) in %.3f ms (%.3f M queries/s)\n",
112 |   //        existing_ratio, search_time,
113 |   //        double(num_queries) / search_time / 1000.0);
114 |   // printf("\t4) Hash table bulk search (%.2f) in %.3f ms (%.3f Mqueries/s)\n",
115 |   //        existing_ratio, search_time_bulk,
116 |   //        double(num_queries) / search_time_bulk / 1000.0);
117 |
118 |   // double load_factor = hash_table.measureLoadFactor();
119 |
120 |   // printf("The load factor is %.2f, number of buckets %d\n", load_factor,
121 |   //        num_buckets);
122 |
123 |   // ==== validation:
124 |   for (int i = 0; i < num_queries; i++) {
125 |     if (h_correct_result[i] != h_result[i]) {
126 |       printf("### wrong result at index %d: [%d] -> %d, but should be %d\n", i,
127 |              h_query[i], h_result[i], h_correct_result[i]);
128 |       break;
129 |     }
130 |     if (i == (num_queries - 1))
131 |       printf("Validation done successfully\n");
132 |   }
133 | }
--------------------------------------------------------------------------------
/test/iterator_test.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2019 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <algorithm>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdint>
19 | #include <cstdio>
20 | #include <cstdlib>
21 | #include <iostream>
22 | #include <numeric>
23 | #include <random>
24 | #include <vector>
25 | #include "gpu_hash_table.cuh"
26 | #include "slab_hash.cuh"
27 | //=======================================
28 | #define DEVICE_ID 0
29 | //=======================================
30 |
31 | template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT>  // NOTE: template parameters reconstructed (assumed)
32 | __global__ void print_table(
33 |     GpuSlabHashContext<KeyT, ValueT, SlabHashT> slab_hash) {
34 |   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
35 |   uint32_t wid = tid >> 5;
36 |   uint32_t laneId = threadIdx.x & 0x1F;
37 |
38 |   if (wid >= slab_hash.getNumBuckets()) {
39 |     return;
40 |   }
41 |
42 |   // initializing the memory allocator on each warp:
43 |   slab_hash.getAllocatorContext().initAllocator(tid, laneId);
44 |
45 |   if (tid == 0) {
46 |     printf(" == Printing the base array\n");
47 |     SlabIterator iter(slab_hash);
48 |     for (int i = 0; i < iter.cur_size_; i++) {
49 |       if ((i & 0x1F) == 0)
50 |         printf(" == bucket %d:\n", i >> 5);
51 |       printf("%8x, ", *(iter.cur_ptr_ + i));
52 |       if ((i & 0x7) == 0x7)
53 |         printf("\n");
54 |     }
55 |     printf("\n");
56 |
57 |     printf(" == Printing the rest of slabs:\n");
58 |     while (iter.next()) {
59 |       for (int i = 0; i < iter.cur_size_; i++) {
60 |         if ((i & 0x1F) == 0)
61 |           printf(" == bucket %d:\n", iter.cur_bucket_);
62 |         printf("%8x, ", *(iter.cur_ptr_ + i));
63 |         if ((i & 0x7) == 0x7)
64 |           printf("\n");
65 |       }
66 |       printf("\n");
67 |     }
68 |   }
69 | }
70 |
71 | //=======================================
72 | int main(int argc, char** argv) {
73 |   //=========
74 |   int devCount;
75 |   cudaGetDeviceCount(&devCount);
76 |   cudaDeviceProp devProp;
77 |   if (devCount) {
78 |     cudaSetDevice(DEVICE_ID);  // to be changed later
79 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
80 |   }
81 |   printf("Device: %s\n", devProp.name);
82 |
83 |   //======================================
84 |   // Building my hash table:
85 |   //======================================
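// (annotation) Two buckets are used deliberately: with /*identity_hash*/ true
// the bucket is presumably key % num_buckets, so the 34 odd keys below crowd
// into a single bucket, more than one slab can hold, and the print_table kernel
// above gets chained slabs for SlabIterator::next() to walk.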
86 |   uint32_t num_buckets = 2;
87 |
88 |   using KeyT = uint32_t;
89 |
90 |   std::vector<KeyT> h_key = {2,  4,  6,  8,  10, 1,  3,  5,  7,  9,
91 |                              11, 13, 15, 17, 19, 21, 23, 25, 27, 29,
92 |                              31, 33, 35, 37, 39, 41, 43, 45, 47, 49,
93 |                              51, 53, 55, 57, 59, 61, 63, 65, 67};
94 |   uint32_t num_keys = h_key.size();
95 |
96 |   const int64_t seed = 1;
97 |   std::mt19937 rng(seed);
98 |   std::shuffle(h_key.begin(), h_key.end(), rng);
99 |
100 |   gpu_hash_table<KeyT, KeyT, SlabHashTypeT::ConcurrentSet>  // NOTE: template args reconstructed (assumed)
101 |       hash_table(num_keys, num_buckets, DEVICE_ID, seed, false, /*identity_hash*/ true);
102 |
103 |   float build_time = hash_table.hash_build(h_key.data(), nullptr, num_keys);
104 |
105 |   const uint32_t num_blocks = 1;
106 |   const uint32_t num_threads = 128;
107 |   print_table<<<num_blocks, num_threads>>>(
108 |       hash_table.slab_hash_->getSlabHashContext());
109 |
110 |   return 0;
111 | }
--------------------------------------------------------------------------------
/test/test_slab_hash.cu:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2018 Saman Ashkiani
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 |  * implied. See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <cstdio>  // NOTE: include names were lost in this dump; reconstructed from usage
18 | #include <cstdlib>
19 | #include <iostream>
20 | #include <vector>
21 | #include "gpu_hash_table.cuh"
22 | #include "slab_alloc.cuh"
23 | #include "slab_hash.cuh"
24 | #define DEVICE_ID 0
25 |
26 | int main(int argc, char** argv) {
27 |   //=========
28 |   int devCount;
29 |   cudaGetDeviceCount(&devCount);
30 |   cudaDeviceProp devProp;
31 |   if (devCount) {
32 |     cudaSetDevice(DEVICE_ID);  // to be changed later
33 |     cudaGetDeviceProperties(&devProp, DEVICE_ID);
34 |   }
35 |   printf("Device: %s\n", devProp.name);
36 |
37 |   auto my_hash_table =
38 |       new gpu_hash_table<uint32_t, uint32_t,  // NOTE: template args reconstructed (assumed)
39 |                          SlabHashTypeT::ConcurrentMap>(100, 10, DEVICE_ID, /*seed = */ 1);
40 |
41 |   std::vector<uint32_t> h_key{10, 5, 1};
42 |   std::vector<uint32_t> h_value{100, 50, 10};
43 |
44 |   my_hash_table->hash_build(h_key.data(), h_value.data(), h_key.size());
45 |   // auto slab_alloc = new SlabAllocLight<8, 32, 1>();
46 |   // printf("slab alloc constructed\n");
47 |
48 |   // delete slab_alloc;
49 |
50 |   // auto slab_hash =
51 |   //     new GpuSlabHash();
52 |   // std::cout << slab_hash->to_string() << std::endl;
53 |   delete my_hash_table;
54 |   return 0;
55 | }
--------------------------------------------------------------------------------