├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── SboostConfig.cmake.in ├── bitmap_writer.cc ├── bitmap_writer.h ├── bitmap_writer_test.cc ├── byteutils.cc ├── byteutils.h ├── byteutils_test.cc ├── cmake_modules └── Thirdparty.cmake ├── encoding ├── deltabp.cc ├── deltabp.h ├── deltabp_test.cc ├── encoding_utils.cc ├── encoding_utils.h ├── rlehybrid.cc ├── rlehybrid.h └── rlehybrid_test.cc ├── loader.cc ├── loader.h ├── loader_benchmark.cc ├── resources └── testres │ ├── encoding │ ├── comment │ ├── comment.binary │ ├── deltabp │ └── deltabp.txt │ ├── lineitem │ ├── lineitem2 │ ├── multipage_delta │ ├── nation │ ├── orders │ └── sboost │ ├── deltabpcontent │ ├── deltabpval │ └── rlecontent ├── sboost.cc ├── sboost.h ├── sboost_benchmark.cc ├── sboost_runner.cc ├── sboost_test.cc ├── simd.cc ├── simd.h ├── tool.cc ├── unpacker.cc ├── unpacker.h ├── unpacker_benchmark.cc └── unpacker_test.cc /.gitignore: -------------------------------------------------------------------------------- 1 | /third_party/googletest/ 2 | -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
# Build script for SBoost.
#
# Produces:
#   sboost_objlib  - object library holding the compiled sources
#   sboost_static  - static library built from the object library
#   sboost_shared  - shared library built from the object library
#   sboost_tool    - command line tool
# and, when requested with -DSBOOST_TEST=ON / -DSBOOST_BENCHMARK=ON,
# the unit-test and benchmark executables.
cmake_minimum_required(VERSION 3.15)
project(sboost)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(SBOOST_VERSION 0.1)
# Instruction-set flags applied to every target that compiles SBoost code.
set(SBOOST_SIMD_FLAGS -msse4.1 -mavx -mavx2 -mavx512f -mavx512bw -mavx512dq -mavx512vl -mbmi2)
set(RUNTIME_INSTALL_DIR bin)
set(CMAKE_INSTALL_LIBDIR lib)
set(CMAKE_INSTALL_HEADERDIR include)
set(CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")

# string(TOUPPER) fails with "requires two arguments" when the build type is
# empty, so fall back to a release build instead of aborting the configure.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "RELEASE")
endif()
string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "${UPPERCASE_BUILD_TYPE}")
string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_SUBDIR_NAME)
set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}/")


set(C_FLAGS_DEBUG "-ggdb -O0")
set(C_FLAGS_FASTDEBUG "-ggdb -O1")
set(CXX_FLAGS_DEBUG "-ggdb -O0")
set(CXX_FLAGS_FASTDEBUG "-ggdb -O1")
set(C_FLAGS_RELEASE "-O3 -DNDEBUG")
set(CXX_FLAGS_RELEASE "-O3 -DNDEBUG")


# Set compile flags based on the build type.
message(
  "Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})"
)
if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_DEBUG}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_DEBUG}")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "RELWITHDEBINFO")
  # Accepted, but no project-specific flags are added for this build type.
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_FASTDEBUG}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_FASTDEBUG}")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_RELEASE}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_RELEASE}")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_GEN")
  # NOTE(review): C_FLAGS_PROFILE_GEN / CXX_FLAGS_PROFILE_GEN are not defined
  # in this file; unless the caller supplies them this branch adds nothing.
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_PROFILE_GEN}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_PROFILE_GEN}")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_BUILD")
  # NOTE(review): same as above for the *_PROFILE_BUILD variables.
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_PROFILE_BUILD}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_PROFILE_BUILD}")
else()
  message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}")
endif()

if("${CMAKE_CXX_FLAGS}" MATCHES "-DNDEBUG")
  set(ARROW_DEFINITION_FLAGS "-DNDEBUG")
else()
  set(ARROW_DEFINITION_FLAGS "")
endif()

message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}")


# Use the directory of this file (not the top of the whole build) so the
# module lookup keeps working when SBoost is added as a subdirectory.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules")
include(CMakePackageConfigHelpers)

set(SBOOST_SRC
    bitmap_writer.cc
    byteutils.cc
    sboost.cc
    unpacker.cc
    encoding/deltabp.cc
    encoding/encoding_utils.cc
    encoding/rlehybrid.cc
    simd.cc
    loader.cc)

set(SBOOST_HEADER "unpacker.h;sboost.h;byteutils.h")

# Compile the sources once; both library flavours reuse the same objects.
add_library(sboost_objlib OBJECT ${SBOOST_SRC})
set_property(TARGET sboost_objlib PROPERTY POSITION_INDEPENDENT_CODE 1)
target_compile_options(sboost_objlib PUBLIC -pthread -fvisibility=default ${SBOOST_SIMD_FLAGS})

# Debug-flavoured builds get a "d" suffix on the library file name.
if(CMAKE_BUILD_TYPE MATCHES DEBUG)
  set(CMAKE_SBOOST_DEBUG_EXTENSION "d")
else()
  set(CMAKE_SBOOST_DEBUG_EXTENSION "")
endif()

add_library(sboost_static STATIC $<TARGET_OBJECTS:sboost_objlib>)
set_target_properties(sboost_static
                      PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}"
                                 OUTPUT_NAME sboost${CMAKE_SBOOST_DEBUG_EXTENSION}
                                 PUBLIC_HEADER "${SBOOST_HEADER}")
install(TARGETS sboost_static
        EXPORT sboost_targets
        RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_HEADERDIR}
        INCLUDES DESTINATION ${CMAKE_INSTALL_HEADERDIR})

add_library(sboost_shared SHARED $<TARGET_OBJECTS:sboost_objlib>)
set_target_properties(sboost_shared
                      PROPERTIES LIBRARY_OUTPUT_DIRECTORY
                                 "${BUILD_OUTPUT_ROOT_DIRECTORY}"
                                 RUNTIME_OUTPUT_DIRECTORY
                                 "${BUILD_OUTPUT_ROOT_DIRECTORY}"
                                 OUTPUT_NAME sboost${CMAKE_SBOOST_DEBUG_EXTENSION}
                                 PUBLIC_HEADER "${SBOOST_HEADER}"
                                 VERSION
                                 1.0.0
                                 SOVERSION
                                 1.0.0
                      )

# CMAKE_INSTALL_INCLUDEDIR is never defined in this file (GNUInstallDirs is
# not included); the header directory variable defined above is the one meant.
install(TARGETS sboost_shared
        EXPORT sboost_targets
        RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_HEADERDIR}
        INCLUDES DESTINATION ${CMAKE_INSTALL_HEADERDIR})

set(TARGETS_CMAKE "SboostTargets.cmake")
install(EXPORT sboost_targets
        FILE "${TARGETS_CMAKE}"
        DESTINATION "${CMAKE_INSTALL_DIR}")

set(CONFIG_CMAKE "SboostConfig.cmake")
set(BUILT_CONFIG_CMAKE "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_CMAKE}")
configure_package_config_file("${CONFIG_CMAKE}.in" "${BUILT_CONFIG_CMAKE}"
                              INSTALL_DESTINATION "${CMAKE_INSTALL_DIR}")
install(FILES "${BUILT_CONFIG_CMAKE}" DESTINATION "${CMAKE_INSTALL_DIR}")

set(CONFIG_VERSION_CMAKE "SboostConfigVersion.cmake")
set(BUILT_CONFIG_VERSION_CMAKE "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_VERSION_CMAKE}")
write_basic_package_version_file("${BUILT_CONFIG_VERSION_CMAKE}"
                                 VERSION ${SBOOST_VERSION}
                                 COMPATIBILITY AnyNewerVersion)
install(FILES "${BUILT_CONFIG_VERSION_CMAKE}"
        DESTINATION "${CMAKE_INSTALL_DIR}")

include(Thirdparty)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

# TEST
if(SBOOST_TEST)
  file(COPY "resources/testres" DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY})

  # Setup googletest (macro comes from cmake_modules/Thirdparty.cmake)
  build_gtest()

  set(SBOOST_TEST_SRC
      bitmap_writer_test.cc
      byteutils_test.cc
      sboost_test.cc
      unpacker_test.cc
      encoding/rlehybrid_test.cc
      encoding/deltabp_test.cc)

  add_executable(sboost_all_test ${SBOOST_TEST_SRC})
  # Link the imported Threads target instead of a bare "pthread" string.
  target_link_libraries(sboost_all_test PRIVATE sboost_static GTest::GTest GTest::Main Threads::Threads)
  target_compile_options(sboost_all_test PUBLIC ${SBOOST_SIMD_FLAGS})
endif()

# Benchmark

if(SBOOST_BENCHMARK)
  build_gbenchmark()
  set(SBOOST_BENCHMARK_LINK_OPTION benchmark::benchmark benchmark::benchmark_main Threads::Threads
      sboost_static)

  add_executable(sboost_unpacker_benchmark unpacker_benchmark.cc)
  target_link_libraries(sboost_unpacker_benchmark PRIVATE ${SBOOST_BENCHMARK_LINK_OPTION})
  target_compile_options(sboost_unpacker_benchmark PUBLIC ${SBOOST_SIMD_FLAGS})

  add_executable(sboost_sboost_benchmark sboost_benchmark.cc)
  target_link_libraries(sboost_sboost_benchmark PRIVATE ${SBOOST_BENCHMARK_LINK_OPTION})
  target_compile_options(sboost_sboost_benchmark PUBLIC ${SBOOST_SIMD_FLAGS})

  add_executable(sboost_loader_benchmark loader_benchmark.cc)
  target_link_libraries(sboost_loader_benchmark PRIVATE ${SBOOST_BENCHMARK_LINK_OPTION})
  target_compile_options(sboost_loader_benchmark PUBLIC ${SBOOST_SIMD_FLAGS})
endif()

add_executable(sboost_tool tool.cc)
target_link_libraries(sboost_tool PRIVATE sboost_static)
target_compile_options(sboost_tool PUBLIC ${SBOOST_SIMD_FLAGS})
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SBoost 2 | 3 | SBoost is a SIMD based C++ library for quick scan and decode bit-packed and delta encoded integer. 4 | 5 | See http://people.cs.uchicago.edu/~hajiang/paper/damon2018_sboost.pdf for more technical detail 6 | 7 | ## System Requirement 8 | 9 | SBoost uses CMake for building. It requires C++14 and CMake >= 3.15. We developed/tested the library with GCC-9 and CMake 3.16. 10 | 11 | SBoost requires AVX512 support and is tested on Skylake CPU platforms. 
For other CPU platforms, please make sure the following compilation flags are supported 12 | 13 | `-msse4.1 -mavx -mavx2 -mavx512f -mavx512bw -mavx512dq -mavx512vl -mbmi2` 14 | 15 | ## Use SBoost 16 | 17 | ### Filter Bit-Packed Data 18 | The following code uses SBoost to filter bit-packed integers 19 | ``` 20 | ``` 21 | ### Unpack Bit-Packed Data 22 | 23 | ### Unpack Delta Encoded Data 24 | 25 | ## Import SBoost as a Thirdparty Library 26 | 27 | The following CMake snippet supports importing SBoost as a thirdparty library in a CMake project 28 | ``` 29 | macro(build_sboost) 30 | 31 | set(SBOOST_REPO_URL "https://github.com/UCHI-DB/sboost.git") 32 | 33 | if(CMAKE_BUILD_TYPE MATCHES DEBUG) 34 | set(CMAKE_SBOOST_DEBUG_EXTENSION "d") 35 | else() 36 | set(CMAKE_SBOOST_DEBUG_EXTENSION "") 37 | endif() 38 | 39 | set(SBOOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/sboost_ep-prefix/src/sboost_ep") 40 | set(_SBOOST_LIBRARY_SUFFIX "${CMAKE_SBOOST_DEBUG_EXTENSION}${CMAKE_STATIC_LIBRARY_SUFFIX}") 41 | 42 | 43 | set(SBOOST_STATIC_LIB ${SBOOST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}sboost${_SBOOST_LIBRARY_SUFFIX}) 44 | set(SBOOST_INCLUDE_DIR ${SBOOST_PREFIX}/include) 45 | 46 | 47 | set(SBOOST_CMAKE_ARGS 48 | ${EP_COMMON_TOOLCHAIN} 49 | -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} 50 | "-DCMAKE_INSTALL_PREFIX=${SBOOST_PREFIX}" 51 | -DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS} 52 | -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS}) 53 | 54 | 55 | externalproject_add(sboost_ep 56 | GIT_REPOSITORY ${SBOOST_REPO_URL} 57 | GIT_TAG main 58 | BUILD_BYPRODUCTS ${SBOOST_STATIC_LIB} 59 | CMAKE_ARGS ${SBOOST_CMAKE_ARGS} ${EP_LOG_OPTIONS}) 60 | 61 | add_library(sboost_static STATIC IMPORTED) 62 | set_target_properties(sboost_static 63 | PROPERTIES IMPORTED_LOCATION "${SBOOST_STATIC_LIB}" 64 | INTERFACE_INCLUDE_DIRECTORIES "${SBOOST_INCLUDE_DIR}") 65 | add_dependencies(sboost_static sboost_ep) 66 | file(MAKE_DIRECTORY ${SBOOST_INCLUDE_DIR}) 67 | include_directories(${SBOOST_INCLUDE_DIR}) 68 | 
set(SBOOST_SIMD_FLAGS -msse4.1 -mavx -mavx2 -mavx512f -mavx512bw -mavx512dq -mavx512vl -mbmi2) 69 | endmacro() 70 | ``` 71 | -------------------------------------------------------------------------------- /SboostConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | # This config sets the following variables in your project:: 19 | # 20 | # Sboost_FOUND - true if Sboost found on the system 21 | # SBOOST_FULL_SO_VERSION - full shared library version of the found Sboost 22 | # SBOOST_SO_VERSION - shared library version of the found Sboost 23 | # SBOOST_VERSION - version of the found Sboost 24 | # 25 | # This config sets the following targets in your project:: 26 | # 27 | # sboost_shared - for linked as shared library if shared library is built 28 | # sboost_static - for linked as static library if static library is built 29 | 30 | @PACKAGE_INIT@ 31 | 32 | include(CMakeFindDependencyMacro) 33 | 34 | set(SBOOST_VERSION "@SBOOST_VERSION@") 35 | set(SBOOST_SO_VERSION "@SBOOST_VERSION@") 36 | set(SBOOST_FULL_SO_VERSION "@SBOOST_VERSION@") 37 | 38 | # Load targets only once. 
If we load targets multiple times, CMake reports 39 | # already existent target error. 40 | if(NOT (TARGET sboost_shared OR TARGET sboost_static)) 41 | include("${CMAKE_CURRENT_LIST_DIR}/SboostTargets.cmake") 42 | endif() 43 | -------------------------------------------------------------------------------- /bitmap_writer.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 9/14/19. 3 | // 4 | #include "bitmap_writer.h" 5 | #include 6 | 7 | namespace sboost { 8 | BitmapWriter::BitmapWriter(uint64_t *data, uint32_t offset) { 9 | this->data = data; 10 | this->index = offset >> 6; 11 | this->bitOffset = offset & 0x3F; 12 | } 13 | 14 | void BitmapWriter::appendBits(uint8_t bit, uint32_t repetition) { 15 | if (bit != 0) { 16 | // We simply assume this is an empty bitmap, and do not put zero inside 17 | if (repetition + bitOffset < 64) { 18 | data[index] |= ((1L << repetition) - 1) << bitOffset; 19 | } else { 20 | uint32_t remain = repetition; 21 | data[index++] |= -1L << bitOffset; 22 | remain -= 64 - bitOffset; 23 | while (remain >= 64) { 24 | data[index++] = -1L; 25 | remain -= 64; 26 | } 27 | if (remain > 0) { 28 | data[index] = (1L << remain) - 1; 29 | } 30 | } 31 | } else { 32 | index += (bitOffset + repetition) >> 6; 33 | } 34 | bitOffset = (bitOffset + repetition) & 0x3F; 35 | } 36 | 37 | void BitmapWriter::appendWord(uint64_t *word, uint32_t count) { 38 | uint32_t remain = count; 39 | if (bitOffset == 0) { 40 | // while (remain >= 64) { 41 | // data[index++] = word[destIndex++]; 42 | // remain -= 64; 43 | // } 44 | int destIndex = remain >> 6; 45 | memcpy((void *) (data + index), (void *) word, sizeof(uint64_t) * destIndex); 46 | index += destIndex; 47 | remain &= 0x3F; 48 | if (remain > 0) { 49 | data[index] = word[destIndex] & ((1L << remain) - 1); 50 | } 51 | } else { 52 | int destIndex = 0; 53 | while (remain >= 64) { 54 | data[index] |= word[destIndex] << bitOffset; 55 | data[++index] = 
word[destIndex++] >> (64 - bitOffset); 56 | remain -= 64; 57 | } 58 | if (remain > 0) { 59 | auto mask = (1L << remain) - 1; 60 | auto masked = word[destIndex] & mask; 61 | if (remain < 64 - bitOffset) { 62 | data[index] |= masked << bitOffset; 63 | } else if (remain == 64 - bitOffset) { 64 | data[index++] |= masked << bitOffset; 65 | } else { 66 | data[index++] |= masked << bitOffset; 67 | data[index] = masked >> (64 - bitOffset); 68 | } 69 | } 70 | } 71 | bitOffset = (bitOffset + count) & 0x3F; 72 | } 73 | 74 | void BitmapWriter::appendByte(uint8_t byte) { 75 | uint64_t value = byte; 76 | 77 | data[index] |= value << bitOffset; 78 | bitOffset += 8; 79 | if (bitOffset >= 64) { 80 | data[++index] = value >> (72 - bitOffset); 81 | bitOffset -= 64; 82 | } 83 | } 84 | 85 | void BitmapWriter::moveForward(uint32_t count) { 86 | bitOffset += count; 87 | index += bitOffset >> 6; 88 | bitOffset &= 0x3f; 89 | } 90 | 91 | } -------------------------------------------------------------------------------- /bitmap_writer.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 9/14/19. 
3 | // 4 | 5 | #ifndef SBOOST_BITMAP_H 6 | #define SBOOST_BITMAP_H 7 | 8 | #include 9 | 10 | namespace sboost { 11 | class BitmapWriter { 12 | protected: 13 | uint64_t *data; 14 | uint32_t index; 15 | uint32_t bitOffset; 16 | public: 17 | BitmapWriter(uint64_t *data, uint32_t offset); 18 | 19 | void appendWord(uint64_t *word, uint32_t count); 20 | 21 | void appendBits(uint8_t bit, uint32_t rep); 22 | 23 | void appendByte(uint8_t byte); 24 | 25 | inline uint64_t *base() { 26 | return data; 27 | } 28 | 29 | inline uint32_t offset() { 30 | return (index << 6) + bitOffset; 31 | } 32 | 33 | void moveForward(uint32_t count); 34 | }; 35 | } 36 | 37 | #endif //SBOOST_BITMAP_H 38 | -------------------------------------------------------------------------------- /bitmap_writer_test.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 9/14/19. 3 | // 4 | #include 5 | #include "bitmap_writer.h" 6 | 7 | using namespace sboost; 8 | 9 | TEST(Bitmap, appendWord) { 10 | uint64_t content[100] = {0}; 11 | BitmapWriter bitmap(content, 137); 12 | 13 | uint64_t another[5] = {0x3f323daeeb004231, 0x33214, 0x4243234, 0x42342343, 38429}; 14 | 15 | bitmap.appendWord(another, 100); 16 | 17 | for (int i = 0; i < 100; i++) { 18 | int idxa = (i + 137) >> 6; 19 | int offa = (i + 137) & 0x3f; 20 | int idxb = i >> 6; 21 | int offb = i & 0x3f; 22 | 23 | EXPECT_EQ((content[idxa] >> offa) & 1, (another[idxb] >> offb) & 1) << i; 24 | } 25 | for (int i = 100; i < 128; i++) { 26 | int idxa = (i + 137) >> 6; 27 | int offa = (i + 137) & 0x3f; 28 | 29 | EXPECT_EQ((content[idxa] >> offa) & 1, 0) << i; 30 | } 31 | } 32 | 33 | TEST(Bitmap, appendWordZeroOffset) { 34 | uint64_t content[100] = {0}; 35 | BitmapWriter bitmap(content, 0); 36 | 37 | uint64_t another[5] = {0x3f323daeeb004231, 0x3424ff1220183214, 0x4200325411013234, 0x423133134234dd23, 38 | 0x3134143138420009}; 39 | 40 | bitmap.appendWord(another, 179); 41 | 42 | for (int i = 0; i < 
179; i++) { 43 | int idxa = (i) >> 6; 44 | int offa = (i) & 0x3f; 45 | EXPECT_EQ((content[idxa] >> offa) & 1, (another[idxa] >> offa) & 1) << i; 46 | } 47 | } 48 | 49 | 50 | TEST(Bitmap, appendBits) { 51 | uint64_t content[100] = {0}; 52 | BitmapWriter bitmap(content, 139); 53 | 54 | bitmap.appendBits(1, 415); 55 | 56 | for (int i = 0; i < 415; i++) { 57 | int idxa = (i + 139) >> 6; 58 | int offa = (i + 139) & 0x3f; 59 | 60 | EXPECT_EQ((content[idxa] >> offa) & 1, 1) << i; 61 | } 62 | } 63 | 64 | TEST(Bitmap, appendByte) { 65 | uint64_t content[100] = {0}; 66 | BitmapWriter bitmap(content, 1); 67 | 68 | bitmap.appendByte(0x42); 69 | bitmap.appendByte(0x37); 70 | bitmap.appendByte(0x51); 71 | bitmap.appendByte(0x85); 72 | bitmap.appendByte(0x21); 73 | bitmap.appendByte(0x13); 74 | bitmap.appendByte(0xaa); 75 | bitmap.appendByte(0x4c); 76 | 77 | uint64_t num = 0x4caa132185513742; 78 | ASSERT_EQ(num << 1, content[0]); 79 | ASSERT_EQ(num >> 63, content[1]); 80 | } -------------------------------------------------------------------------------- /byteutils.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 8/18/19. 3 | // 4 | 5 | #include 6 | #include "byteutils.h" 7 | 8 | namespace sboost { 9 | namespace byteutils { 10 | uint64_t spanTo64(uint8_t bitwidth, uint32_t value) { 11 | uint64_t mask = (1L << bitwidth) - 1; 12 | uint64_t base = value & mask; 13 | uint64_t result = base; 14 | uint32_t round = 64 / bitwidth; 15 | uint32_t i = 1; 16 | while (i <= (round >> 1)) { 17 | result = result | (result << (i * bitwidth)); 18 | i *= 2; 19 | } 20 | for (; i < round; i++) { 21 | result |= base << (i * bitwidth); 22 | } 23 | return result; 24 | } 25 | 26 | void bitpack(uint32_t *values, uint64_t entryCount, uint32_t bitwidth, uint8_t *output) { 27 | // uint32_t mask = (1 << bitwidth) - 1; 28 | 29 | // uint64_t bitsize = bitwidth * entryCount; 30 | // uint64_t rescount = (bitsize >> 3) + ((bitsize & 7) ? 
1 : 0); 31 | uint64_t *res = (uint64_t *) output; 32 | 33 | uint32_t resindex = 0; 34 | uint32_t resoffset = 0; 35 | 36 | for (uint64_t i = 0; i < entryCount; i++) { 37 | res[resindex] |= ((uint64_t) values[i]) << resoffset; 38 | resoffset += bitwidth; 39 | if (resoffset >= 64) { 40 | resindex += 1; 41 | resoffset &= 0x3F; 42 | res[resindex] |= ((uint64_t) values[i]) >> (bitwidth - resoffset); 43 | } 44 | } 45 | } 46 | 47 | uint32_t readIntLittleEndian(const uint8_t *input, uint32_t *offset) { 48 | uint32_t result = ((uint32_t *) input)[*offset]; 49 | *offset += 4; 50 | return result; 51 | } 52 | 53 | uint32_t readIntLittleEndianPaddedOnBitWidth(const uint8_t *input, uint32_t *offset, uint8_t bitWidth) { 54 | uint8_t byteWidth = (bitWidth + 7) >> 3; 55 | uint32_t result; 56 | switch (byteWidth) { 57 | case 1: 58 | result = input[*offset]; 59 | (*offset)++; 60 | break; 61 | case 2: 62 | result = ((uint16_t *) input)[*offset]; 63 | *offset += 2; 64 | break; 65 | case 3: 66 | result = ((uint32_t *) input)[*offset] & 0xFFFFFF; 67 | *offset += 3; 68 | break; 69 | case 4: 70 | result = ((uint32_t *) input)[*offset]; 71 | *offset += 4; 72 | break; 73 | default: 74 | return 0; 75 | } 76 | return result; 77 | } 78 | 79 | uint32_t readUnsignedVarInt(const uint8_t *input, uint32_t *offset) { 80 | uint32_t value = 0; 81 | uint32_t i = 0; 82 | uint32_t b; 83 | while (((b = input[(*offset)++]) & 0x80) != 0) { 84 | value |= (b & 0x7F) << i; 85 | i += 7; 86 | } 87 | return value | (b << i); 88 | } 89 | 90 | uint64_t readUnsignedVarLong(const uint8_t *input, uint32_t *offset) { 91 | uint64_t value = 0; 92 | uint32_t i = 0; 93 | uint32_t b; 94 | while (((b = input[(*offset)++]) & 0x80) != 0) { 95 | value |= (b & 0x7F) << i; 96 | i += 7; 97 | } 98 | return value | (b << i); 99 | } 100 | 101 | int32_t readZigZagVarInt(const uint8_t *input, uint32_t *offset) { 102 | uint32_t raw = readUnsignedVarInt(input, offset); 103 | int32_t result; 104 | *reinterpret_cast(&result) = (raw >> 1) ^ 
-(static_cast(raw & 1)); 105 | return result; 106 | } 107 | 108 | int64_t readZigZagVarLong(const uint8_t *input, uint32_t *offset) { 109 | uint64_t raw = readUnsignedVarLong(input, offset); 110 | return (int64_t)((raw >> 1) ^ -(static_cast(raw & 1))); 111 | } 112 | 113 | } 114 | } -------------------------------------------------------------------------------- /byteutils.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 8/18/19. 3 | // 4 | 5 | #ifndef SBOOST_BYTESUTILS_H 6 | #define SBOOST_BYTESUTILS_H 7 | 8 | #include 9 | 10 | namespace sboost { 11 | namespace byteutils { 12 | uint64_t spanTo64(uint8_t bitwidth, uint32_t value); 13 | 14 | void bitpack(uint32_t *values, uint64_t entryCount, uint32_t bitwidth, uint8_t *output); 15 | 16 | uint32_t readIntLittleEndian(const uint8_t *input, uint32_t *); 17 | 18 | uint32_t readUnsignedVarInt(const uint8_t *input, uint32_t *); 19 | 20 | uint64_t readUnsignedVarLong(const uint8_t *input, uint32_t *); 21 | 22 | uint32_t readIntLittleEndianPaddedOnBitWidth(const uint8_t *input, uint32_t *, uint8_t); 23 | 24 | int32_t readZigZagVarInt(const uint8_t *input, uint32_t *offset); 25 | 26 | int64_t readZigZagVarLong(const uint8_t *input, uint32_t *offset); 27 | } 28 | 29 | } 30 | #endif //SBOOST_BYTESUTILS_H 31 | -------------------------------------------------------------------------------- /byteutils_test.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 8/18/19. 
//


#include  // NOTE(review): the two header names here are missing in this copy of the file; restore from upstream
#include
#include "byteutils.h"

using namespace sboost;

TEST(BytesUtils, spanTo64) {
    int bitwidth = 9; // 0x16
    uint64_t res = byteutils::spanTo64(bitwidth, 22);

    // 7 copies
    EXPECT_EQ(res, 22L | (22L << 9) | (22L << 18) | (22L << 27) | (22L << 36) | (22L << 45) | (22L << 54));

    // for (int i = 2; i < 32; i++) {
    //     uint32_t value = 1 << (i - 1);
    //     printf("0x%llXL,", common::spanTo64(i, value));
    // }
}

TEST(BytesUtils, bitpack) {
    uint32_t input[] = {13, 22, 1, 9, 25, 17, 6, 22, 12, 31, 12, 21, 0, 5};

    // bitpack() writes through 64-bit words: 14 values * 5 bits = 70 bits touch two words,
    // so the buffer must be 16 bytes and 8-byte aligned even though only 9 bytes carry data.
    alignas(8) uint8_t result[16] = {0};

    byteutils::bitpack(input, 14, 5, result);

    uint8_t expected[] = {0xCD, 0x86, 0x94, 0xA3, 0xB1, 0xEC, 0xB3, 0x0A, 0x0A};

    for (int i = 0; i < 9; i++) {
        EXPECT_EQ(expected[i], result[i]);
    }
}

TEST(BytesUtils, readZigzag) {
    uint8_t data[] = {187, 37, 0};
    uint32_t offset = 0;
    int64_t zigzag = byteutils::readZigZagVarLong(data, &offset);
    ASSERT_EQ(-2398, zigzag);

}

--------------------------------------------------------------------------------
/cmake_modules/Thirdparty.cmake:
--------------------------------------------------------------------------------
# build_gtest()
#
# Downloads and builds googletest as an external project and defines the imported
# static targets GTest::GTest, GTest::Main and GTest::GMock.
# Kept as a macro because it publishes GTEST_* variables into the caller's scope.
# Reads EP_CXX_FLAGS, EP_COMMON_TOOLCHAIN, EP_LOG_OPTIONS and UPPERCASE_BUILD_TYPE.
macro(build_gtest)
    message(STATUS "Building gtest from source")
    set(GTEST_BUILD_VERSION 1.10.0)
    set(GTEST_SOURCE_URL
            "https://github.com/google/googletest/archive/release-${GTEST_BUILD_VERSION}.tar.gz")

    set(GTEST_VENDORED TRUE)
    set(GTEST_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS}")

    # googletest appends "d" to library names for the Debug configuration only.
    # An exact comparison is required: MATCHES would also select FASTDEBUG, whose
    # libraries carry no postfix, and the imported locations would then be wrong.
    if ("${UPPERCASE_BUILD_TYPE}" STREQUAL "DEBUG")
        set(CMAKE_GTEST_DEBUG_EXTENSION "d")
    else ()
        set(CMAKE_GTEST_DEBUG_EXTENSION "")
    endif ()

    set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep")

    set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include")
    set(GMOCK_INCLUDE_DIR "${GTEST_PREFIX}/include")
    set(GTEST_LIBRARY_DIR ${GTEST_PREFIX}/lib)

    set(_GTEST_RUNTIME_DIR ${BUILD_OUTPUT_ROOT_DIRECTORY})

    set(_GTEST_LIBRARY_SUFFIX "${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_STATIC_LIBRARY_SUFFIX}")


    set(GTEST_STATIC_LIB
            "${GTEST_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${_GTEST_LIBRARY_SUFFIX}")
    set(GMOCK_STATIC_LIB
            "${GTEST_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}gmock${_GTEST_LIBRARY_SUFFIX}")
    set(GTEST_MAIN_STATIC_LIB
            "${GTEST_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${_GTEST_LIBRARY_SUFFIX}")
    # CMAKE_INSTALL_LIBDIR is pinned to "lib" (as build_gbenchmark does) so the install
    # layout matches GTEST_LIBRARY_DIR on platforms that default to lib64.
    set(GTEST_CMAKE_ARGS
            ${EP_COMMON_TOOLCHAIN}
            -DCMAKE_BUILD_TYPE=${UPPERCASE_BUILD_TYPE}
            "-DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX}"
            -DCMAKE_INSTALL_LIBDIR=lib
            -DBUILD_SHARED_LIBS=OFF
            -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}
            -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${GTEST_CMAKE_CXX_FLAGS})


    #list(APPEND GTEST_CMAKE_ARGS "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=${_GTEST_RUNTIME_DIR}"
    #        "-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_${UPPERCASE_BUILD_TYPE}=${_GTEST_RUNTIME_DIR}")

    #    add_definitions(-DGTEST_LINKED_AS_SHARED_LIBRARY=1)

    externalproject_add(googletest_ep
            URL ${GTEST_SOURCE_URL}
            BUILD_BYPRODUCTS ${GTEST_STATIC_LIB} ${GTEST_MAIN_STATIC_LIB}
            ${GMOCK_STATIC_LIB}
            CMAKE_ARGS ${GTEST_CMAKE_ARGS} ${EP_LOG_OPTIONS})

    # The include directory must exist before it is referenced by a target.
    file(MAKE_DIRECTORY "${GTEST_INCLUDE_DIR}")

    add_library(GTest::GTest STATIC IMPORTED)

    set_target_properties(GTest::GTest
            PROPERTIES IMPORTED_LOCATION "${GTEST_STATIC_LIB}"
            INTERFACE_INCLUDE_DIRECTORIES "${GTEST_INCLUDE_DIR}")

    add_library(GTest::Main STATIC IMPORTED)
    set_target_properties(GTest::Main
            PROPERTIES IMPORTED_LOCATION "${GTEST_MAIN_STATIC_LIB}"
            INTERFACE_INCLUDE_DIRECTORIES "${GTEST_INCLUDE_DIR}")

    add_library(GTest::GMock STATIC IMPORTED)
    set_target_properties(GTest::GMock
            PROPERTIES IMPORTED_LOCATION "${GMOCK_STATIC_LIB}"
            INTERFACE_INCLUDE_DIRECTORIES "${GTEST_INCLUDE_DIR}")
    # The imported targets only name files; these edges make the external build run first.
    add_dependencies(GTest::GTest googletest_ep)
    add_dependencies(GTest::Main googletest_ep)
    add_dependencies(GTest::GMock googletest_ep)
endmacro()

# build_gbenchmark()
#
# Downloads and builds Google Benchmark as an external project and defines the imported
# static targets benchmark::benchmark and benchmark::benchmark_main.
# Reads EP_CXX_FLAGS, EP_COMMON_CMAKE_ARGS and EP_LOG_OPTIONS.
macro(build_gbenchmark)
    message(STATUS "Building benchmark from source")
    set(GBENCHMARK_BUILD_VERSION 1.5.2)
    set(GBENCHMARK_SOURCE_URL "https://github.com/google/benchmark/archive/v${GBENCHMARK_BUILD_VERSION}.tar.gz")

    set(GBENCHMARK_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} -std=c++11")

    set(GBENCHMARK_PREFIX
            "${CMAKE_CURRENT_BINARY_DIR}/gbenchmark_ep/src/gbenchmark_ep-install")
    set(GBENCHMARK_INCLUDE_DIR "${GBENCHMARK_PREFIX}/include")
    set(
            GBENCHMARK_STATIC_LIB
            "${GBENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX}"
    )
    set(
            GBENCHMARK_MAIN_STATIC_LIB
            "${GBENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark_main${CMAKE_STATIC_LIBRARY_SUFFIX}"
    )
    set(GBENCHMARK_CMAKE_ARGS
            ${EP_COMMON_CMAKE_ARGS}
            "-DCMAKE_INSTALL_PREFIX=${GBENCHMARK_PREFIX}"
            -DCMAKE_INSTALL_LIBDIR=lib
            -DBENCHMARK_ENABLE_TESTING=OFF
            -DCMAKE_CXX_FLAGS=${GBENCHMARK_CMAKE_CXX_FLAGS})

    externalproject_add(gbenchmark_ep
            URL ${GBENCHMARK_SOURCE_URL}
            BUILD_BYPRODUCTS "${GBENCHMARK_STATIC_LIB}"
            "${GBENCHMARK_MAIN_STATIC_LIB}"
            CMAKE_ARGS ${GBENCHMARK_CMAKE_ARGS} ${EP_LOG_OPTIONS})

    # The include directory must exist before it is referenced by a target.
    file(MAKE_DIRECTORY "${GBENCHMARK_INCLUDE_DIR}")

    add_library(benchmark::benchmark STATIC IMPORTED)
    set_target_properties(benchmark::benchmark
            PROPERTIES IMPORTED_LOCATION "${GBENCHMARK_STATIC_LIB}"
            INTERFACE_INCLUDE_DIRECTORIES
            "${GBENCHMARK_INCLUDE_DIR}")

    add_library(benchmark::benchmark_main STATIC IMPORTED)
    set_target_properties(benchmark::benchmark_main
            PROPERTIES IMPORTED_LOCATION "${GBENCHMARK_MAIN_STATIC_LIB}"
            INTERFACE_INCLUDE_DIRECTORIES
            "${GBENCHMARK_INCLUDE_DIR}")

    add_dependencies(benchmark::benchmark gbenchmark_ep)
    add_dependencies(benchmark::benchmark_main gbenchmark_ep)
    # NOTE(review): directory-scoped and redundant for targets that link
    # benchmark::benchmark; kept because callers outside this file may rely on it.
    include_directories(${GBENCHMARK_INCLUDE_DIR})
endmacro()

# Flags handed to every external project: the host flags plus position-independent code.
set(EP_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}")
set(EP_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}")
set(EP_CXX_FLAGS "${EP_CXX_FLAGS} -fPIC")
set(EP_C_FLAGS "${EP_C_FLAGS} -fPIC")

# Build external projects with the same compilers and archive tools as the host project.
set(EP_COMMON_TOOLCHAIN -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})

if (CMAKE_AR)
    set(EP_COMMON_TOOLCHAIN ${EP_COMMON_TOOLCHAIN} -DCMAKE_AR=${CMAKE_AR})
endif ()

if (CMAKE_RANLIB)
    set(EP_COMMON_TOOLCHAIN ${EP_COMMON_TOOLCHAIN} -DCMAKE_RANLIB=${CMAKE_RANLIB})
endif ()

# External projects are still able to override the following declarations.
# cmake command line will favor the last defined variable when a duplicate is
# encountered. This requires that `EP_COMMON_CMAKE_ARGS` is always the first
# argument.
147 | set(EP_COMMON_CMAKE_ARGS 148 | ${EP_COMMON_TOOLCHAIN} 149 | -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} 150 | -DCMAKE_C_FLAGS=${EP_C_FLAGS} 151 | -DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS} 152 | -DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS} 153 | -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS}) 154 | 155 | set(EP_LOG_OPTIONS 156 | LOG_CONFIGURE 157 | 1 158 | LOG_BUILD 159 | 1 160 | LOG_INSTALL 161 | 1 162 | LOG_DOWNLOAD 163 | 1) 164 | 165 | 166 | include(ExternalProject) -------------------------------------------------------------------------------- /encoding/deltabp.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 2/18/20. 3 | // 4 | #include "deltabp.h" 5 | #include "../byteutils.h" 6 | #include "../unpacker.h" 7 | #include "../sboost.h" 8 | #include "../bitmap_writer.h" 9 | 10 | namespace sboost { 11 | namespace encoding { 12 | namespace deltabp { 13 | 14 | using namespace std; 15 | using namespace std::placeholders; 16 | 17 | /** 18 | * Process a Delta BitPacking Input using SBoost 19 | * 20 | * @param input 21 | * @param offset 22 | * @param output 23 | * @param output_offset 24 | * @param bit_width 25 | * @param num_entry 26 | * @param processor 27 | */ 28 | template 29 | uint32_t process(const uint8_t *input, uint64_t *output, 30 | uint32_t output_offset, uint32_t num_entry, 31 | PRED &pred) { 32 | BitmapWriter bitmap(output, output_offset); 33 | 34 | // Read Block Size and Num of mini Blocks 35 | uint32_t read_pos = 0; 36 | uint32_t block_size = byteutils::readUnsignedVarInt(input, &read_pos); 37 | uint32_t num_miniblock = byteutils::readUnsignedVarInt(input, &read_pos); 38 | byteutils::readUnsignedVarInt(input, &read_pos); 39 | uint32_t miniblock_size = block_size / num_miniblock; 40 | 41 | uint32_t bit_widths[num_miniblock]; 42 | 43 | int64_t last_value = byteutils::readZigZagVarLong(input, &read_pos); 44 | 45 | // Process the first value 46 | uint8_t res = 
pred.apply(last_value); 47 | bitmap.appendBits(res, 1); 48 | 49 | uint32_t processed = 1; 50 | while (processed < num_entry) { 51 | // Load and process block 52 | int64_t min_delta = byteutils::readZigZagVarLong(input, &read_pos); 53 | __m256i mdelta = _mm256_set1_epi32(min_delta); 54 | for (uint32_t i = 0; i < num_miniblock; ++i) { 55 | bit_widths[i] = input[read_pos++]; 56 | } 57 | // mini block is atomic for reading, we read a mini block when there are more values left 58 | for (uint32_t i = 0; i < num_miniblock && processed < num_entry; ++i) { 59 | auto unpacker = sboost::unpackers[(bit_widths[i])]; 60 | for (uint32_t j = 0; j < miniblock_size; j += 8) { 61 | __m256i unpacked = unpacker->unpack(input + read_pos); 62 | __m256i delta = _mm256_add_epi32(unpacked, mdelta); 63 | __m256i csum = cumsum32(delta); 64 | __m256i final = _mm256_add_epi32(csum, _mm256_set1_epi32(last_value)); 65 | 66 | uint8_t result = pred.apply(final); 67 | bitmap.appendByte(result); 68 | last_value = (final[3] >> 32) & 0xFFFFFFFF; 69 | read_pos += bit_widths[i]; 70 | } 71 | } 72 | processed += block_size; 73 | } 74 | return read_pos; 75 | } 76 | 77 | class EqualPred { 78 | private: 79 | int32_t target_; 80 | __m256i target256_; 81 | public: 82 | EqualPred(int32_t target) : target_(target), target256_(_mm256_set1_epi32(target)) {} 83 | 84 | inline bool apply(int32_t val) { 85 | return val == target_; 86 | } 87 | 88 | inline uint8_t apply(__m256i val) { 89 | return _mm256_cmp_epi32_mask(val, target256_, 0); 90 | } 91 | }; 92 | 93 | uint32_t equal(const uint8_t *input, uint64_t *output, uint32_t output_offset, 94 | uint32_t num_entry, int32_t value) { 95 | EqualPred pred(value); 96 | return process(input, output, output_offset, num_entry, pred); 97 | } 98 | 99 | class LessPred { 100 | private: 101 | int32_t target_; 102 | __m256i target256_; 103 | public: 104 | LessPred(int32_t target) : target_(target), target256_(_mm256_set1_epi32(target)) {} 105 | 106 | inline bool apply(int32_t val) { 
107 | return val < target_; 108 | } 109 | 110 | inline uint8_t apply(__m256i val) { 111 | return _mm256_cmp_epi32_mask(val, target256_, 1); 112 | } 113 | }; 114 | 115 | uint32_t less(const uint8_t *input, uint64_t *output, uint32_t output_offset, 116 | uint32_t num_entry, int32_t value) { 117 | LessPred pred(value); 118 | return process(input, output, output_offset, num_entry, pred); 119 | } 120 | 121 | class GreaterPred { 122 | private: 123 | int32_t target_; 124 | __m256i target256_; 125 | public: 126 | GreaterPred(int32_t target) : target_(target), target256_(_mm256_set1_epi32(target)) {} 127 | 128 | inline bool apply(int32_t val) { 129 | return val > target_; 130 | } 131 | 132 | inline uint8_t apply(__m256i val) { 133 | return _mm256_cmp_epi32_mask(val, target256_, 6); 134 | } 135 | }; 136 | 137 | uint32_t greater(const uint8_t *input, uint64_t *output, uint32_t output_offset, 138 | uint32_t num_entry, int32_t value) { 139 | GreaterPred pred(value); 140 | return process(input, output, output_offset, num_entry, pred); 141 | } 142 | 143 | class BetweenPred { 144 | private: 145 | int32_t lb_; 146 | int32_t ub_; 147 | __m256i lb256_; 148 | __m256i ub256_; 149 | public: 150 | BetweenPred(int32_t lb, int32_t ub) : lb_(lb), ub_(ub), 151 | lb256_(_mm256_set1_epi32(lb)), 152 | ub256_(_mm256_set1_epi32(ub)) {} 153 | 154 | inline bool apply(int32_t val) { 155 | return val >= lb_ && val <= ub_; 156 | } 157 | 158 | inline uint8_t apply(__m256i val) { 159 | return _mm256_cmp_epi32_mask(val, lb256_, 5) 160 | & _mm256_cmp_epi32_mask(val, ub256_, 2); 161 | } 162 | }; 163 | 164 | uint32_t between(const uint8_t *input, uint64_t *output, uint32_t output_offset, 165 | uint32_t num_entry, int32_t lb, int32_t ub) { 166 | BetweenPred pred(lb, ub); 167 | return process(input, output, output_offset, num_entry, pred); 168 | } 169 | 170 | class RangelePred { 171 | private: 172 | int32_t lb_; 173 | int32_t ub_; 174 | __m256i lb256_; 175 | __m256i ub256_; 176 | public: 177 | 
RangelePred(int32_t lb, int32_t ub) : lb_(lb), ub_(ub), 178 | lb256_(_mm256_set1_epi32(lb)), 179 | ub256_(_mm256_set1_epi32(ub)) {} 180 | 181 | inline bool apply(int32_t val) { 182 | return val >= lb_ && val < ub_; 183 | } 184 | 185 | inline uint8_t apply(__m256i val) { 186 | return _mm256_cmp_epi32_mask(val, lb256_, 5) 187 | & _mm256_cmp_epi32_mask(val, ub256_, 1); 188 | } 189 | }; 190 | 191 | uint32_t rangele(const uint8_t *input, uint64_t *output, uint32_t output_offset, 192 | uint32_t num_entry, int32_t lb, int32_t ub) { 193 | RangelePred pred(lb, ub); 194 | return process(input, output, output_offset, num_entry, pred); 195 | } 196 | } 197 | } 198 | } -------------------------------------------------------------------------------- /encoding/deltabp.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 2/18/20. 3 | // 4 | 5 | #ifndef SBOOST_ENCODING_DELTABP_H 6 | #define SBOOST_ENCODING_DELTABP_H 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sboost { 13 | namespace encoding { 14 | namespace deltabp { 15 | 16 | uint32_t equal(const uint8_t *, uint64_t *, uint32_t, uint32_t, int32_t); 17 | 18 | uint32_t less(const uint8_t *, uint64_t *, uint32_t, uint32_t, int32_t); 19 | 20 | uint32_t greater(const uint8_t *, uint64_t *, uint32_t, uint32_t, int32_t); 21 | 22 | uint32_t between(const uint8_t *, uint64_t *, uint32_t, uint32_t, int32_t, int32_t); 23 | 24 | uint32_t rangele(const uint8_t *, uint64_t *, uint32_t, uint32_t, int32_t, int32_t); 25 | } 26 | } 27 | } 28 | 29 | #endif //SBOOST_ENCODING_DELTABP_H 30 | -------------------------------------------------------------------------------- /encoding/deltabp_test.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 2/19/20. 
3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "deltabp.h" 11 | #include "../bitmap_writer.h" 12 | 13 | using namespace sboost; 14 | using namespace sboost::encoding; 15 | 16 | class DeltaBP : public ::testing::Test { 17 | protected: 18 | uint8_t *content; 19 | uint32_t file_size; 20 | int fd; 21 | 22 | 23 | virtual void SetUp() override { 24 | struct stat st; 25 | stat("testres/sboost/deltabpcontent", &st); 26 | file_size = st.st_size; 27 | fd = open("testres/sboost/deltabpcontent", O_RDONLY, 0); 28 | assert(fd != -1); 29 | //Execute mmap 30 | void *mmappedData = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0); 31 | assert(mmappedData != MAP_FAILED); 32 | content = (uint8_t *) mmappedData; 33 | } 34 | 35 | virtual void TearDown() override { 36 | munmap(content, file_size); 37 | close(fd); 38 | } 39 | }; 40 | 41 | 42 | TEST_F(DeltaBP, Equal) { 43 | uint64_t output[10000] = {0}; 44 | 45 | int pred = 1020; 46 | 47 | uint64_t expect[10000] = {0}; 48 | BitmapWriter expectedBitmap(expect, 0); 49 | std::ifstream infile("testres/sboost/deltabpval"); 50 | int lineval; 51 | while (infile >> lineval) { 52 | expectedBitmap.appendBits(lineval == pred, 1); 53 | } 54 | 55 | deltabp::equal(content, output, 0, 50000, pred); 56 | 57 | 58 | for (int i = 0; i < 781; ++i) { 59 | ASSERT_EQ(output[i], expect[i]) << i; 60 | } 61 | ASSERT_EQ(output[781] & 0xFFFF, expect[781] & 0xffff); 62 | } 63 | 64 | TEST_F(DeltaBP, Less) { 65 | uint64_t output[10000] = {0}; 66 | 67 | int pred = 1249; 68 | 69 | uint64_t expect[10000] = {0}; 70 | BitmapWriter expectedBitmap(expect, 0); 71 | std::ifstream infile("testres/sboost/deltabpval"); 72 | int lineval; 73 | while (infile >> lineval) { 74 | expectedBitmap.appendBits(lineval < pred, 1); 75 | } 76 | 77 | deltabp::less(content, output, 0, 50000, pred); 78 | 79 | for (int i = 0; i < 781; ++i) { 80 | ASSERT_EQ(output[i], expect[i]) << i; 81 | } 82 | ASSERT_EQ(output[781] & 0xFFFF, expect[781] 
& 0xffff); 83 | } 84 | 85 | TEST_F(DeltaBP, Greater) { 86 | uint64_t output[10000] = {0}; 87 | 88 | int pred = 1020; 89 | 90 | uint64_t expect[10000] = {0}; 91 | BitmapWriter expectedBitmap(expect, 0); 92 | std::ifstream infile("testres/sboost/deltabpval"); 93 | int lineval; 94 | while (infile >> lineval) { 95 | expectedBitmap.appendBits(lineval > pred, 1); 96 | } 97 | 98 | deltabp::greater(content, output, 0, 50000, pred); 99 | 100 | for (int i = 0; i < 781; ++i) { 101 | ASSERT_EQ(output[i], expect[i]) << i; 102 | } 103 | ASSERT_EQ(output[781] & 0xFFFF, expect[781] & 0xffff); 104 | } 105 | 106 | TEST_F(DeltaBP, Between) { 107 | uint64_t output[10000] = {0}; 108 | 109 | int ub = 2010; 110 | int lb = 424; 111 | 112 | uint64_t expect[10000] = {0}; 113 | BitmapWriter expectedBitmap(expect, 0); 114 | std::ifstream infile("testres/sboost/deltabpval"); 115 | int lineval; 116 | while (infile >> lineval) { 117 | expectedBitmap.appendBits(lineval >= lb && lineval <= ub, 1); 118 | } 119 | 120 | deltabp::between(content, output, 0, 50000, lb, ub); 121 | 122 | for (int i = 0; i < 781; ++i) { 123 | ASSERT_EQ(output[i], expect[i]) << i; 124 | } 125 | ASSERT_EQ(output[781] & 0xFFFF, expect[781] & 0xffff); 126 | } 127 | 128 | TEST_F(DeltaBP, Rangele) { 129 | uint64_t output[10000] = {0}; 130 | 131 | int ub = 2322; 132 | int lb = 1421; 133 | 134 | uint64_t expect[10000] = {0}; 135 | BitmapWriter expectedBitmap(expect, 0); 136 | std::ifstream infile("testres/sboost/deltabpval"); 137 | int lineval; 138 | while (infile >> lineval) { 139 | expectedBitmap.appendBits(lineval >= lb && lineval < ub, 1); 140 | } 141 | 142 | deltabp::rangele(content, output, 0, 50000, lb, ub); 143 | 144 | for (int i = 0; i < 781; ++i) { 145 | ASSERT_EQ(output[i], expect[i]) << i; 146 | } 147 | ASSERT_EQ(output[781] & 0xFFFF, expect[781] & 0xffff); 148 | } 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /encoding/encoding_utils.cc: 
-------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 10/21/19. 3 | // 4 | 5 | #include "encoding_utils.h" 6 | 7 | namespace sboost { 8 | namespace encoding { 9 | 10 | void cleanup(uint32_t counter, uint32_t numEntry, uint64_t *output, uint32_t outputOffset) { 11 | if (counter > numEntry) { 12 | // Erase off extra entries 13 | uint32_t eraseOffset = outputOffset + numEntry; 14 | uint32_t eraseIndex = eraseOffset >> 6; 15 | eraseOffset &= 0x3F; 16 | output[eraseIndex] &= (1L << eraseOffset) - 1; 17 | uint32_t moreToErase = eraseIndex + 1; 18 | while ((moreToErase << 6) < outputOffset + counter) { 19 | output[moreToErase++] = 0; 20 | } 21 | } 22 | } 23 | 24 | } 25 | } -------------------------------------------------------------------------------- /encoding/encoding_utils.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 10/21/19. 3 | // 4 | 5 | #ifndef SBOOST_ENCODING_ENCODINGUTILS_H 6 | #define SBOOST_ENCODING_ENCODINGUTILS_H 7 | 8 | #include 9 | 10 | namespace sboost { 11 | namespace encoding { 12 | /** 13 | * Clean up the bitmap that is overflown 14 | * 15 | * @param counter 16 | * @param numEntry 17 | * @param output 18 | * @param outputOffset 19 | */ 20 | void cleanup(uint32_t counter, uint32_t numEntry, uint64_t *output, uint32_t outputOffset); 21 | } 22 | } 23 | #endif //SBOOST_ENCODING_ENCODINGUTILS_H 24 | -------------------------------------------------------------------------------- /encoding/rlehybrid.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 9/14/19. 
3 | // 4 | 5 | #include 6 | #include "rlehybrid.h" 7 | #include "../bitmap_writer.h" 8 | #include "../byteutils.h" 9 | #include "../sboost.h" 10 | #include "encoding_utils.h" 11 | 12 | namespace sboost { 13 | namespace encoding { 14 | namespace rlehybrid { 15 | 16 | SegmentReader::SegmentReader(const uint8_t *data, uint32_t bit_width, uint32_t num_entry) 17 | : data_(data), bit_width_(bit_width), num_entry_(num_entry), 18 | pointer_(0), counter_(0) {} 19 | 20 | bool SegmentReader::hasNext() { 21 | return counter_ < num_entry_; 22 | } 23 | 24 | Segment SegmentReader::next() { 25 | uint32_t header = byteutils::readUnsignedVarInt(data_, &pointer_); 26 | Segment segment; 27 | segment.mode_ = (header & 1) == 0 ? RLE : PACKED; 28 | switch (segment.mode_) { 29 | case RLE: { 30 | segment.num_entry_ = header >> 1; 31 | segment.value_ = byteutils::readIntLittleEndianPaddedOnBitWidth(data_, &pointer_, bit_width_); 32 | break; 33 | } 34 | case PACKED: { 35 | int numGroups = header >> 1; 36 | segment.num_entry_ = numGroups << 3; 37 | segment.data_ = data_ + pointer_; 38 | segment.data_length_ = numGroups * bit_width_; 39 | pointer_ += numGroups * bit_width_; 40 | break; 41 | } 42 | } 43 | counter_ += segment.num_entry_; 44 | return segment; 45 | } 46 | 47 | template 48 | void process(const uint8_t *input, 49 | uint64_t *output, uint32_t outputOffset, 50 | uint32_t bitWidth, uint32_t numEntry, PRED pred) { 51 | uint32_t pointer = 0; 52 | uint32_t counter = 0; 53 | uint32_t currentCount = 0; 54 | int currentValue = 0; 55 | MODE mode; 56 | 57 | BitmapWriter bitmap(output, outputOffset); 58 | 59 | while (counter < numEntry) { 60 | uint32_t header = byteutils::readUnsignedVarInt(input, &pointer); 61 | mode = (header & 1) == 0 ? 
RLE : PACKED; 62 | switch (mode) { 63 | case RLE: { 64 | currentCount = header >> 1; 65 | currentValue = byteutils::readIntLittleEndianPaddedOnBitWidth(input, &pointer, bitWidth); 66 | uint8_t bit = pred.test(currentValue); 67 | bitmap.appendBits(bit, currentCount); 68 | break; 69 | } 70 | case PACKED: { 71 | int numGroups = header >> 1; 72 | currentCount = numGroups << 3; 73 | pred.test(input + pointer, currentCount, output, bitmap.offset()); 74 | pointer += numGroups * bitWidth; 75 | bitmap.moveForward(currentCount); 76 | break; 77 | } 78 | } 79 | counter += currentCount; 80 | } 81 | encoding::cleanup(counter, numEntry, output, outputOffset); 82 | } 83 | 84 | class EqualPred { 85 | protected: 86 | uint32_t target_; 87 | sboost::Bitpack sboost_; 88 | public: 89 | EqualPred(uint32_t bitWidth, uint32_t target) : target_(target), sboost_(bitWidth, target) {} 90 | 91 | inline bool test(int32_t value) { 92 | return (int32_t) target_ == value; 93 | } 94 | 95 | inline void test(const uint8_t *data, uint32_t numEntry, uint64_t *bitmap, uint32_t bitmap_offset) { 96 | sboost_.equal(data, numEntry, bitmap, bitmap_offset); 97 | } 98 | }; 99 | 100 | void equal(const uint8_t *input, 101 | uint64_t *output, uint32_t outputOffset, 102 | uint32_t bitWidth, uint32_t numEntry, uint32_t value) { 103 | EqualPred pred(bitWidth, value); 104 | process(input, output, outputOffset, bitWidth, numEntry, pred); 105 | } 106 | 107 | class LessPred { 108 | protected: 109 | uint32_t target_; 110 | sboost::Bitpack sboost_; 111 | public: 112 | LessPred(uint32_t bitWidth, uint32_t target) : target_(target), sboost_(bitWidth, target) {} 113 | 114 | inline bool test(int32_t value) { 115 | return value < (int32_t) target_; 116 | } 117 | 118 | inline void test(const uint8_t *data, uint32_t numEntry, uint64_t *bitmap, uint32_t bitmap_offset) { 119 | sboost_.less(data, numEntry, bitmap, bitmap_offset); 120 | } 121 | }; 122 | 123 | void less(const uint8_t *input, 124 | uint64_t *output, uint32_t 
outputOffset, 125 | uint32_t bitWidth, uint32_t numEntry, uint32_t value) { 126 | LessPred pred(bitWidth, value); 127 | process(input, output, outputOffset, bitWidth, numEntry, pred); 128 | } 129 | 130 | class GreaterPred { 131 | protected: 132 | uint32_t target_; 133 | sboost::Bitpack sboost_; 134 | public: 135 | GreaterPred(uint32_t bitWidth, uint32_t target) : target_(target), sboost_(bitWidth, target) {} 136 | 137 | inline bool test(int32_t value) { 138 | return value > (int32_t) target_; 139 | } 140 | 141 | inline void test(const uint8_t *data, uint32_t numEntry, uint64_t *bitmap, uint32_t bitmap_offset) { 142 | sboost_.greater(data, numEntry, bitmap, bitmap_offset); 143 | } 144 | }; 145 | 146 | void greater(const uint8_t *input, 147 | uint64_t *output, uint32_t outputOffset, 148 | uint32_t bitWidth, uint32_t numEntry, uint32_t value) { 149 | GreaterPred pred(bitWidth, value); 150 | process(input, output, outputOffset, bitWidth, numEntry, pred); 151 | } 152 | 153 | class RangelePred { 154 | protected: 155 | uint32_t lower_; 156 | uint32_t upper_; 157 | sboost::Bitpack sboost_; 158 | public: 159 | RangelePred(uint32_t bitWidth, uint32_t lower, uint32_t upper) 160 | : lower_(lower), upper_(upper), sboost_(bitWidth, lower, upper) {} 161 | 162 | inline bool test(int32_t value) { 163 | uint32_t uvalue = static_cast(value); 164 | return uvalue >= lower_ && uvalue < upper_; 165 | } 166 | 167 | inline void test(const uint8_t *data, uint32_t numEntry, uint64_t *bitmap, uint32_t bitmap_offset) { 168 | sboost_.rangele(data, numEntry, bitmap, bitmap_offset); 169 | } 170 | }; 171 | 172 | void rangele(const uint8_t *input, 173 | uint64_t *output, uint32_t outputOffset, 174 | uint32_t bitWidth, uint32_t numEntry, uint32_t lower, uint32_t upper) { 175 | RangelePred pred(bitWidth, lower, upper); 176 | process(input, output, outputOffset, bitWidth, numEntry, pred); 177 | } 178 | 179 | class BetweenPred { 180 | protected: 181 | uint32_t lower_; 182 | uint32_t upper_; 183 | 
sboost::Bitpack sboost_; 184 | public: 185 | BetweenPred(uint32_t bitWidth, uint32_t lower, uint32_t upper) 186 | : lower_(lower), upper_(upper), sboost_(bitWidth, lower, upper) {} 187 | 188 | inline bool test(int32_t value) { 189 | uint32_t uvalue = static_cast(value); 190 | return uvalue >= lower_ && uvalue <= upper_; 191 | } 192 | 193 | inline void test(const uint8_t *data, uint32_t numEntry, uint64_t *bitmap, uint32_t bitmap_offset) { 194 | sboost_.between(data, numEntry, bitmap, bitmap_offset); 195 | } 196 | }; 197 | 198 | 199 | void between(const uint8_t *input, 200 | uint64_t *output, uint32_t outputOffset, 201 | uint32_t bitWidth, uint32_t numEntry, uint32_t lower, uint32_t upper) { 202 | BetweenPred pred(bitWidth, lower, upper); 203 | process(input, output, outputOffset, bitWidth, numEntry, pred); 204 | } 205 | } 206 | } 207 | } -------------------------------------------------------------------------------- /encoding/rlehybrid.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 9/14/19. 
//

#ifndef SBOOST_ENCODING_RLEHYBRID_H
#define SBOOST_ENCODING_RLEHYBRID_H

#include  // NOTE(review): header name missing in this copy of the file; restore from upstream

namespace sboost {
    namespace encoding {
        namespace rlehybrid {

            // Segment kind: a repeated value (RLE) or a sequence of bit-packed groups (PACKED).
            typedef enum {
                RLE, PACKED
            } MODE;

            // Predicate scans. Arguments, in order: input, output bitmap, output bit offset,
            // bit width, number of entries, then the operand (or the lower and upper bound).
            void equal(const uint8_t *, uint64_t *, uint32_t, uint32_t, uint32_t, uint32_t);

            void less(const uint8_t *, uint64_t *, uint32_t, uint32_t, uint32_t, uint32_t);

            void greater(const uint8_t *, uint64_t *, uint32_t, uint32_t, uint32_t, uint32_t);

            void between(const uint8_t *, uint64_t *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);

            void rangele(const uint8_t *, uint64_t *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);

            // Representing one segment
            struct Segment {
                // Which of the fields below apply.
                MODE mode_;
                // Number of values covered by the segment.
                uint32_t num_entry_;
                // PACKED only: start of the packed bytes.
                const uint8_t *data_;
                // PACKED only: length of the packed bytes.
                int32_t data_length_;
                // RLE only: the repeated value.
                int32_t value_;
            };

            // Iterates over the segments of an encoded buffer until num_entry values were seen.
            class SegmentReader {
            protected:
                const uint8_t *data_;
                uint32_t bit_width_;
                uint32_t num_entry_;
                // Byte position of the next segment header in data_.
                uint32_t pointer_;
                // Number of values covered by the segments returned so far.
                uint32_t counter_;
            public:
                // Arguments: data, bit width, number of entries.
                SegmentReader(const uint8_t *, uint32_t, uint32_t);

                virtual ~SegmentReader() = default;

                bool hasNext();

                Segment next();
            };
        }
    }
}
#endif //SBOOST_ENCODING_RLEHYBRID_H

--------------------------------------------------------------------------------
/encoding/rlehybrid_test.cc:
--------------------------------------------------------------------------------
//
// Created by harper on 9/24/19.
3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "rlehybrid.h" 11 | 12 | using namespace sboost::encoding::rlehybrid; 13 | 14 | TEST(RLEHybrid, equal) { 15 | struct stat st; 16 | stat("testres/sboost/rlecontent", &st); 17 | uint32_t filesize = st.st_size; 18 | int fd = open("testres/sboost/rlecontent", O_RDONLY, 0); 19 | assert(fd != -1); 20 | //Execute mmap 21 | void *mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0); 22 | assert(mmappedData != MAP_FAILED); 23 | 24 | uint64_t output[10000] = {0}; 25 | 26 | equal(((uint8_t *) mmappedData) + 1, output, 0, 3, 15000, 3); 27 | 28 | //Cleanup 29 | int rc = munmap(mmappedData, filesize); 30 | assert(rc == 0); 31 | close(fd); 32 | 33 | uint64_t exp[10] = { 34 | 0x218012640444A10, 0xA20400042020005A, 0x24000184149010F, 0x8448308000120002, 35 | 0x3108154CA2000204, 0x80405C0088444832, 0x490043A00404246C, 0x2324000201206016, 36 | 0x9400220004146881, 0xB120130C18C60040 37 | }; 38 | 39 | for (int i = 0; i < 10; i++) { 40 | EXPECT_EQ(output[i], exp[i]) << i; 41 | } 42 | return; 43 | } 44 | 45 | TEST(SegmentReaderTest, Next) { 46 | struct stat st; 47 | stat("testres/sboost/rlecontent", &st); 48 | uint32_t filesize = st.st_size; 49 | int fd = open("testres/sboost/rlecontent", O_RDONLY, 0); 50 | assert(fd != -1); 51 | //Execute mmap 52 | void *mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0); 53 | assert(mmappedData != MAP_FAILED); 54 | const uint8_t *data = (const uint8_t *) mmappedData; 55 | auto bitwidth = data[0]; 56 | SegmentReader reader(data + 1, bitwidth, 15000); 57 | 58 | int counter = 0; 59 | int num_entry = 0; 60 | while (reader.hasNext()) { 61 | auto segment = reader.next(); 62 | num_entry += segment.num_entry_; 63 | ++counter; 64 | } 65 | 66 | EXPECT_EQ(30, counter); 67 | EXPECT_EQ(15000, num_entry); 68 | } -------------------------------------------------------------------------------- /loader.h: 
-------------------------------------------------------------------------------- 1 | // 2 | // Created by Hao Jiang on 12/1/20. 3 | // 4 | 5 | #ifndef SBOOST_LOADER_H 6 | #define SBOOST_LOADER_H 7 | 8 | #include 9 | #include 10 | 11 | namespace sboost { 12 | namespace loader { 13 | 14 | extern void (*loaders[32])(const uint8_t *, uint64_t *); 15 | 16 | extern void (*writers[32])(__m512i, uint64_t *, uint64_t); 17 | 18 | extern void (*writeinvs[32])(__m512i, uint64_t *, uint64_t); 19 | 20 | extern uint32_t entryInBlocks[32][8]; 21 | } 22 | } 23 | 24 | #endif //SBOOST_LOADER_H 25 | -------------------------------------------------------------------------------- /loader_benchmark.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Hao Jiang on 12/1/20. 3 | // This benchmark is to evaluate whether it is necessary to create dedicated loader functions 4 | // 5 | #include 6 | #include "byteutils.h" 7 | #include "sboost.h" 8 | #include "loader.h" 9 | 10 | 11 | class LoaderBenchmark : public benchmark::Fixture { 12 | protected: 13 | uint32_t *data_; 14 | uint8_t *output_; 15 | uint32_t numEntry_; 16 | uint8_t bitWidth_; 17 | uint64_t *res_; 18 | public: 19 | // add members as needed 20 | 21 | LoaderBenchmark() { 22 | numEntry_ = 100000; 23 | bitWidth_ = 11; 24 | output_ = (uint8_t *) malloc(sizeof(uint32_t) * numEntry_); 25 | data_ = (uint32_t *) malloc(sizeof(uint32_t) * numEntry_); 26 | res_ = (uint64_t *) malloc(sizeof(uint64_t) * (numEntry_ + 1) / 64); 27 | 28 | for (uint32_t i = 0; i < numEntry_; ++i) { 29 | data_[i] = i; 30 | } 31 | ::sboost::byteutils::bitpack(data_, numEntry_, bitWidth_, output_); 32 | } 33 | 34 | virtual ~LoaderBenchmark() { 35 | free(data_); 36 | free(output_); 37 | free(res_); 38 | } 39 | }; 40 | 41 | BENCHMARK_F(LoaderBenchmark, Dynamic)(benchmark::State &state) { 42 | uint64_t buffer_[8]; 43 | 44 | for (auto _ : state) { 45 | sboost::SortedBitpack sbp(bitWidth_, 134); 46 | // 
sbp.geqGroup2(output_, buffer_); 47 | } 48 | } 49 | 50 | BENCHMARK_F(LoaderBenchmark, Template)(benchmark::State &state) { 51 | uint64_t buffer_[8]; 52 | 53 | for (auto _ : state) { 54 | sboost::SortedBitpack sbp(bitWidth_, 134); 55 | // sbp.geqGroup(output_, buffer_); 56 | } 57 | } 58 | 59 | 60 | -------------------------------------------------------------------------------- /resources/testres/encoding/comment.binary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/encoding/comment.binary -------------------------------------------------------------------------------- /resources/testres/encoding/deltabp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/encoding/deltabp -------------------------------------------------------------------------------- /resources/testres/lineitem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/lineitem -------------------------------------------------------------------------------- /resources/testres/lineitem2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/lineitem2 -------------------------------------------------------------------------------- /resources/testres/multipage_delta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/multipage_delta -------------------------------------------------------------------------------- 
/resources/testres/nation:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/nation
--------------------------------------------------------------------------------
/resources/testres/orders:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/orders
--------------------------------------------------------------------------------
/resources/testres/sboost/deltabpcontent:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/sboost/deltabpcontent
--------------------------------------------------------------------------------
/resources/testres/sboost/rlecontent:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCHI-DB/sboost/317323a84c63788d87ff316a2a0a28c3d45cf36b/resources/testres/sboost/rlecontent
--------------------------------------------------------------------------------
/sboost.cc:
--------------------------------------------------------------------------------
//
// Created by harper on 9/30/19.
3 | // 4 | 5 | #include 6 | #include "sboost.h" 7 | #include "loader.h" 8 | #include "byteutils.h" 9 | 10 | namespace sboost { 11 | 12 | uint64_t 13 | loadNext(const uint8_t *data, uint32_t bitwidth, uint32_t *byteindex, uint32_t *bitoffset, uint32_t *entryInBlock) { 14 | // Load next block and align it 15 | uint64_t loaded = (((uint64_t *) (data + *byteindex))[0] >> *bitoffset); 16 | 17 | // Update index and offset 18 | *entryInBlock = (64 - *bitoffset) / bitwidth; 19 | uint32_t bitadvance = (*entryInBlock) * bitwidth + (*bitoffset); 20 | *bitoffset = bitadvance & 0x7; 21 | *byteindex += bitadvance >> 3; 22 | return loaded; 23 | } 24 | 25 | uint8_t 26 | writeNext(uint64_t *res, uint64_t bits, uint32_t entryInBlock, uint32_t *resindex, uint32_t *resoffset) { 27 | res[*resindex] |= bits << *resoffset; 28 | *resoffset += entryInBlock; 29 | if (*resoffset >= 64) { 30 | // Be aware that we do not check resindex against rescount here so 31 | // resindex may be 1 larger than rescount if rescount is a multiple of 64 32 | // and the res[resindex] actually overflow the array, but as it did not write 33 | // anything it should be fine. 
But be aware of this in case it causes any bugs 34 | *resindex += 1; 35 | *resoffset &= 0x3F; 36 | if (*resoffset != 0) 37 | res[*resindex] |= bits >> (entryInBlock - *resoffset); 38 | } 39 | return _mm_popcnt_u64(bits); 40 | } 41 | 42 | 43 | static uint32_t lowestBit(uint64_t input) { 44 | return _mm_popcnt_u64((input & (input - 1) ^ input) - 1); 45 | } 46 | 47 | const uint64_t MASKS_64[] = {0, 0, 0x5555555555555555L, 0x36DB6DB6DB6DB6DBL, 0x7777777777777777L, 48 | 0x7BDEF7BDEF7BDEFL, 49 | 0x7DF7DF7DF7DF7DFL, 0x3F7EFDFBF7EFDFBFL, 0x7F7F7F7F7F7F7F7FL, 0x3FDFEFF7FBFDFEFFL, 50 | 0x7FDFF7FDFF7FDFFL, 0x3FF7FEFFDFFBFFL, 0x7FF7FF7FF7FF7FFL, 0x7FFBFFDFFEFFFL, 51 | 0x7FFDFFF7FFDFFFL, 0x7FFEFFFDFFFBFFFL, 0x7FFF7FFF7FFF7FFFL, 0x3FFFDFFFEFFFFL, 52 | 0x1FFFF7FFFDFFFFL, 0xFFFFDFFFFBFFFFL, 0x7FFFF7FFFF7FFFFL, 0x3FFFFDFFFFEFFFFFL, 53 | 0x7FFFFDFFFFFL, 0x1FFFFFBFFFFFL, 0x7FFFFF7FFFFFL, 0x1FFFFFEFFFFFFL, 0x7FFFFFDFFFFFFL, 54 | 0x1FFFFFFBFFFFFFL, 0x7FFFFFF7FFFFFFL, 0x1FFFFFFEFFFFFFFL, 0x7FFFFFFDFFFFFFFL, 55 | 0x1FFFFFFFBFFFFFFFL, 56 | }; 57 | 58 | extern const uint64_t EXTRACT_64[] = {0, 0, 59 | 0xAAAAAAAAAAAAAAAAL, 0x4924924924924924L, 0x8888888888888888L, 60 | 0x842108421084210L, 61 | 0x820820820820820L, 0x4081020408102040L, 0x8080808080808080L, 62 | 0x4020100804020100L, 63 | 0x802008020080200L, 0x40080100200400L, 0x800800800800800L, 0x8004002001000L, 64 | 0x80020008002000L, 0x800100020004000L, 0x8000800080008000L, 0x4000200010000L, 65 | 0x20000800020000L, 0x100002000040000L, 0x800008000080000L, 66 | 0x4000020000100000L, 67 | 0x80000200000L, 0x200000400000L, 0x800000800000L, 0x2000001000000L, 68 | 0x8000002000000L, 69 | 0x20000004000000L, 0x80000008000000L, 0x200000010000000L, 0x800000020000000L, 70 | 0x2000000040000000L, 71 | }; 72 | 73 | Bitpack::Bitpack(uint32_t bitWidth, uint32_t target) { 74 | this->bitWidth = bitWidth; 75 | this->target = target; 76 | this->extract = EXTRACT_64[bitWidth]; 77 | this->mask = _mm512_set1_epi64(MASKS_64[bitWidth]); 78 | this->msbmask = 
_mm512_set1_epi64(EXTRACT_64[bitWidth]); 79 | uint64_t singleSpan = byteutils::spanTo64(this->bitWidth, this->target); 80 | this->spanned = _mm512_set1_epi64(singleSpan); 81 | this->nspanned = _mm512_set1_epi64(~singleSpan); 82 | this->l2 = _mm512_and_si512(spanned, mask); 83 | this->g2 = _mm512_or_si512(spanned, msbmask); 84 | } 85 | 86 | Bitpack::Bitpack(uint32_t bitWidth, uint32_t t1, uint32_t t2) : Bitpack(bitWidth, t1) { 87 | this->bitWidth = bitWidth; 88 | this->target = t1; 89 | this->target2 = t2; 90 | this->extract = EXTRACT_64[bitWidth]; 91 | this->target2 = t2; 92 | uint64_t singleSpan2 = byteutils::spanTo64(this->bitWidth, this->target2); 93 | this->spanned2 = _mm512_set1_epi64(singleSpan2); 94 | this->nspanned2 = _mm512_set1_epi64(~singleSpan2); 95 | this->l22 = _mm512_and_si512(spanned2, mask); 96 | this->g22 = _mm512_or_si512(spanned2, msbmask); 97 | } 98 | 99 | Bitpack::~Bitpack() {} 100 | 101 | void Bitpack::equal(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 102 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 103 | 104 | uint32_t counter = 0; 105 | uint32_t byteindex = 0; 106 | uint32_t bitoffset = 0; 107 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 108 | 109 | uint32_t resindex = resoffset >> 6; 110 | resoffset &= 0x3F; 111 | 112 | while (counter < numEntry) { 113 | // Load next block and align it 114 | for (int i = 0; i < 8; i++) { 115 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 116 | counter += entryInBlock[i]; 117 | if (counter > numEntry) { 118 | entryInBlock[i] -= counter - numEntry; 119 | counter = numEntry; 120 | } 121 | } 122 | // Use SBoost_old algorithm to compare the loaded block with spanned 123 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 124 | buffer[4], buffer[5], buffer[6], buffer[7]); 125 | __m512i d = _mm512_xor_si512(loaded, spanned); 126 | __m512i r = _mm512_or_si512(d, _mm512_add_epi64(_mm512_and_si512(d, mask), 
mask)); 127 | 128 | // Use PEXT to collect result, use popcnt to count data 129 | for (int i = 0; i < 8; i++) { 130 | writeNext(res, ~_pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1), 131 | entryInBlock[i], &resindex, &resoffset); 132 | } 133 | } 134 | } 135 | 136 | void Bitpack::less(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 137 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 138 | 139 | uint32_t counter = 0; 140 | uint32_t byteindex = 0; 141 | uint32_t bitoffset = 0; 142 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 143 | 144 | uint32_t resindex = resoffset >> 6; 145 | resoffset &= 0x3F; 146 | 147 | while (counter < numEntry) { 148 | // Load next block and align it 149 | for (int i = 0; i < 8; i++) { 150 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 151 | counter += entryInBlock[i]; 152 | if (counter > numEntry) { 153 | entryInBlock[i] -= counter - numEntry; 154 | counter = numEntry; 155 | } 156 | } 157 | // Use SBoost_old algorithm to compare the loaded block with spanned 158 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 159 | buffer[4], buffer[5], buffer[6], buffer[7]); 160 | __m512i l = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 161 | __m512i r = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 162 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), l)); 163 | 164 | // Use PEXT to collect result, use popcnt to count data 165 | for (int i = 0; i < 8; i++) { 166 | writeNext(res, ~_pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1), 167 | entryInBlock[i], &resindex, &resoffset); 168 | } 169 | } 170 | } 171 | 172 | void Bitpack::geq(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 173 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 174 | 175 | uint32_t counter = 0; 176 | uint32_t byteindex = 0; 177 | uint32_t bitoffset = 0; 178 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 179 | 180 | 
uint32_t resindex = resoffset >> 6; 181 | resoffset &= 0x3F; 182 | 183 | while (counter < numEntry) { 184 | // Load next block and align it 185 | for (int i = 0; i < 8; i++) { 186 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 187 | counter += entryInBlock[i]; 188 | if (counter > numEntry) { 189 | entryInBlock[i] -= counter - numEntry; 190 | counter = numEntry; 191 | } 192 | } 193 | // Use SBoost_old algorithm to compare the loaded block with spanned 194 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 195 | buffer[4], buffer[5], buffer[6], buffer[7]); 196 | __m512i l = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 197 | __m512i r = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 198 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), l)); 199 | 200 | // Use PEXT to collect result, use popcnt to count data 201 | for (int i = 0; i < 8; i++) { 202 | writeNext(res, _pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1), 203 | entryInBlock[i], &resindex, &resoffset); 204 | } 205 | } 206 | } 207 | 208 | uint32_t Bitpack::geq(const uint8_t *data, uint32_t numEntry) { 209 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 210 | 211 | uint32_t counter = 0; 212 | uint32_t byteindex = 0; 213 | uint32_t bitoffset = 0; 214 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 215 | 216 | while (counter < numEntry) { 217 | // Load next block and align it 218 | for (int i = 0; i < 8; i++) { 219 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 220 | } 221 | // Use SBoost algorithm to compare the loaded block with spanned 222 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 223 | buffer[4], buffer[5], buffer[6], buffer[7]); 224 | __m512i l = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 225 | __m512i r = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 226 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), l)); 227 | 228 
| // Use PEXT to collect result, use popcnt to count data 229 | 230 | for (int i = 0; i < 8; ++i) { 231 | auto res = _pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1); 232 | if (res != 0) { 233 | return counter + lowestBit(res); 234 | } else { 235 | counter += entryInBlock[i]; 236 | } 237 | } 238 | } 239 | return -1; 240 | } 241 | 242 | void Bitpack::greater(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 243 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 244 | 245 | uint32_t counter = 0; 246 | uint32_t byteindex = 0; 247 | uint32_t bitoffset = 0; 248 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 249 | 250 | uint32_t resindex = resoffset >> 6; 251 | resoffset &= 0x3F; 252 | 253 | while (counter < numEntry) { 254 | // Load next block and align it 255 | for (int i = 0; i < 8; i++) { 256 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 257 | counter += entryInBlock[i]; 258 | if (counter > numEntry) { 259 | entryInBlock[i] -= counter - numEntry; 260 | counter = numEntry; 261 | } 262 | } 263 | // Use SBoost_old algorithm to compare the loaded block with spanned 264 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 265 | buffer[4], buffer[5], buffer[6], buffer[7]); 266 | __m512i nloaded = _mm512_xor_si512(loaded, _mm512_set1_epi32(-1)); 267 | __m512i l = _mm512_sub_epi64(g2, _mm512_and_si512(loaded, mask)); 268 | __m512i r = _mm512_and_si512(_mm512_or_si512(nloaded, spanned), 269 | _mm512_or_si512(_mm512_and_si512(nloaded, spanned), l)); 270 | 271 | // Use PEXT to collect result, use popcnt to count data 272 | for (int i = 0; i < 8; i++) { 273 | writeNext(res, ~_pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1), 274 | entryInBlock[i], &resindex, &resoffset); 275 | } 276 | } 277 | } 278 | 279 | void Bitpack::leq(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 280 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 281 | 282 | uint32_t 
counter = 0; 283 | uint32_t byteindex = 0; 284 | uint32_t bitoffset = 0; 285 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 286 | 287 | uint32_t resindex = resoffset >> 6; 288 | resoffset &= 0x3F; 289 | 290 | while (counter < numEntry) { 291 | // Load next block and align it 292 | for (int i = 0; i < 8; i++) { 293 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 294 | counter += entryInBlock[i]; 295 | if (counter > numEntry) { 296 | entryInBlock[i] -= counter - numEntry; 297 | counter = numEntry; 298 | } 299 | } 300 | // Use SBoost_old algorithm to compare the loaded block with spanned 301 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 302 | buffer[4], buffer[5], buffer[6], buffer[7]); 303 | __m512i nloaded = _mm512_xor_si512(loaded, _mm512_set1_epi32(-1)); 304 | __m512i l = _mm512_sub_epi64(g2, _mm512_and_si512(loaded, mask)); 305 | __m512i r = _mm512_and_si512(_mm512_or_si512(nloaded, spanned), 306 | _mm512_or_si512(_mm512_and_si512(nloaded, spanned), l)); 307 | 308 | // Use PEXT to collect result, use popcnt to count data 309 | for (int i = 0; i < 8; i++) { 310 | writeNext(res, _pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1), 311 | entryInBlock[i], &resindex, &resoffset); 312 | } 313 | } 314 | } 315 | 316 | void Bitpack::rangele(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 317 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 318 | 319 | uint32_t counter = 0; 320 | uint32_t byteindex = 0; 321 | uint32_t bitoffset = 0; 322 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 323 | 324 | uint32_t resindex = resoffset >> 6; 325 | resoffset &= 0x3F; 326 | 327 | while (counter < numEntry) { 328 | // Load next block and align it 329 | for (int i = 0; i < 8; i++) { 330 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 331 | counter += entryInBlock[i]; 332 | if (counter > numEntry) { 333 | entryInBlock[i] -= counter - 
numEntry; 334 | counter = numEntry; 335 | } 336 | } 337 | // Use SBoost_old algorithm to compare the loaded block with spanned 338 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 339 | buffer[4], buffer[5], buffer[6], buffer[7]); 340 | __m512i ll = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 341 | __m512i rl = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 342 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), ll)); 343 | __m512i lu = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l22); 344 | __m512i ru = _mm512_and_si512(_mm512_or_si512(loaded, nspanned2), 345 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned2), lu)); 346 | 347 | __m512i r = _mm512_xor_si512(rl, ru); 348 | 349 | // Use PEXT to collect result, use popcnt to count data 350 | for (int i = 0; i < 8; i++) { 351 | writeNext(res, _pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1), 352 | entryInBlock[i], &resindex, &resoffset); 353 | } 354 | } 355 | } 356 | 357 | void Bitpack::between(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 358 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 359 | 360 | uint32_t counter = 0; 361 | uint32_t byteindex = 0; 362 | uint32_t bitoffset = 0; 363 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 364 | 365 | uint32_t resindex = resoffset >> 6; 366 | resoffset &= 0x3F; 367 | 368 | while (counter < numEntry) { 369 | // Load next block and align it 370 | for (int i = 0; i < 8; i++) { 371 | buffer[i] = loadNext(data, bitWidth, &byteindex, &bitoffset, entryInBlock + i); 372 | counter += entryInBlock[i]; 373 | if (counter > numEntry) { 374 | entryInBlock[i] -= counter - numEntry; 375 | counter = numEntry; 376 | } 377 | } 378 | // Use SBoost_old algorithm to compare the loaded block with spanned 379 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 380 | buffer[4], buffer[5], buffer[6], buffer[7]); 381 | __m512i ll = 
_mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 382 | __m512i rl = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 383 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), ll)); 384 | __m512i nloaded = _mm512_xor_si512(loaded, _mm512_set1_epi32(-1)); 385 | __m512i lu = _mm512_sub_epi64(g22, _mm512_and_si512(loaded, mask)); 386 | __m512i ru = _mm512_and_si512(_mm512_or_si512(nloaded, spanned2), 387 | _mm512_or_si512(_mm512_and_si512(nloaded, spanned2), lu)); 388 | 389 | __m512i r = _mm512_and_si512(rl, ru); 390 | 391 | // Use PEXT to collect result, use popcnt to count data 392 | for (int i = 0; i < 8; i++) { 393 | writeNext(res, _pext_u64(r[i], extract) & ((1L << entryInBlock[i]) - 1), 394 | entryInBlock[i], &resindex, &resoffset); 395 | } 396 | } 397 | } 398 | 399 | uint32_t estimateGroupSize(uint32_t bitWidth) { 400 | // Estimate group size 401 | uint32_t byteindex = 0; 402 | uint32_t bitoffset = 0; 403 | uint32_t entries = 0; 404 | for (int i = 0; i < 8; i++) { 405 | // Update index and offset 406 | auto entryInBlock = (64 - bitoffset) / bitWidth; 407 | uint32_t bitadvance = entryInBlock * bitWidth + (bitoffset); 408 | bitoffset = bitadvance & 0x7; 409 | byteindex += bitadvance >> 3; 410 | 411 | entries += entryInBlock; 412 | } 413 | return ((entries >> 3) << 3); 414 | } 415 | 416 | TemplateBitpack::TemplateBitpack(uint32_t bitWidth, uint32_t target) : Bitpack(bitWidth, target) { 417 | entry_in_blocks_ = loader::entryInBlocks[bitWidth]; 418 | group_size_ = estimateGroupSize(bitWidth); 419 | group_bytes_ = (group_size_ * bitWidth) >> 3; 420 | } 421 | 422 | void TemplateBitpack::tgeq(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 423 | uint64_t buffer[] = {0, 0, 0, 0, 0, 0, 0, 0}; 424 | 425 | uint32_t counter = 0; 426 | 427 | uint32_t resindex = resoffset >> 6; 428 | resoffset &= 0x3F; 429 | 430 | auto loader = loader::loaders[bitWidth]; 431 | 432 | while (counter < numEntry) { 433 | // Load next block and align it 434 | 
loader(data, buffer); 435 | data += group_bytes_; 436 | counter += group_size_; 437 | if (counter > numEntry) { 438 | counter = numEntry; 439 | } 440 | 441 | // Use SBoost algorithm to compare the loaded block with spanned 442 | __m512i loaded = _mm512_setr_epi64(buffer[0], buffer[1], buffer[2], buffer[3], 443 | buffer[4], buffer[5], buffer[6], buffer[7]); 444 | __m512i l = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 445 | __m512i r = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 446 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), l)); 447 | 448 | // Use PEXT to collect result, use popcnt to count data 449 | for (int i = 0; i < 8; i++) { 450 | writeNext(res, _pext_u64(r[i], extract) & ((1L << entry_in_blocks_[i]) - 1), 451 | entry_in_blocks_[i], &resindex, &resoffset); 452 | } 453 | } 454 | } 455 | 456 | SortedBitpack::SortedBitpack(uint32_t bitWidth, uint32_t target) : Bitpack(bitWidth, target) { 457 | loader_ = loader::loaders[bitWidth]; 458 | writer_ = loader::writers[bitWidth]; 459 | writerinv_ = loader::writeinvs[bitWidth]; 460 | group_size_ = estimateGroupSize(bitWidth); 461 | 462 | entry_in_block_ = loader::entryInBlocks[bitWidth]; 463 | 464 | group_bytes_ = (group_size_ * bitWidth) >> 3; 465 | } 466 | 467 | SortedBitpack::~SortedBitpack() noexcept {} 468 | 469 | int8_t SortedBitpack::eqGroup(const uint8_t *group_start, uint64_t *res) { 470 | loader_(group_start, buffer_); 471 | 472 | // Use SBoost algorithm to compare the loaded block with spanned 473 | __m512i loaded = _mm512_setr_epi64(buffer_[0], buffer_[1], buffer_[2], buffer_[3], 474 | buffer_[4], buffer_[5], buffer_[6], buffer_[7]); 475 | __m512i d = _mm512_xor_si512(loaded, spanned); 476 | __m512i r = _mm512_or_si512(d, _mm512_add_epi64(_mm512_and_si512(d, mask), mask)); 477 | 478 | memset(res, 0, 64); 479 | writerinv_(r, res, extract); 480 | uint8_t index = 0; 481 | for (uint32_t i = 0; i <= last_index_; ++i) { 482 | if (res[i] != 0) { 483 | return index + 
_mm_popcnt_u64(res[i] - 1); 484 | } else { 485 | index += 64; 486 | } 487 | } 488 | return -1; 489 | } 490 | 491 | int8_t SortedBitpack::greaterGroup(const uint8_t *group_start, uint64_t *res) { 492 | loader_(group_start, buffer_); 493 | 494 | // Use SBoost algorithm to compare the loaded block with spanned 495 | __m512i loaded = _mm512_setr_epi64(buffer_[0], buffer_[1], buffer_[2], buffer_[3], 496 | buffer_[4], buffer_[5], buffer_[6], buffer_[7]); 497 | __m512i nloaded = _mm512_xor_si512(loaded, _mm512_set1_epi32(-1)); 498 | __m512i l = _mm512_sub_epi64(g2, _mm512_and_si512(loaded, mask)); 499 | __m512i r = _mm512_and_si512(_mm512_or_si512(nloaded, spanned), 500 | _mm512_or_si512(_mm512_and_si512(nloaded, spanned), l)); 501 | 502 | memset(res, 0, 64); 503 | writerinv_(r, res, extract); 504 | 505 | auto first = res[0] & 1; 506 | auto last = (res[last_index_] >> last_offset_) & 1; 507 | if (first) { 508 | return 0; 509 | } else if (!last) { 510 | return -1; 511 | } else { 512 | uint32_t offset = 0; 513 | // Found between 514 | for (uint32_t i = 0; i <= last_index_; ++i) { 515 | if (res[i] == 0) { 516 | offset += 64; 517 | } else { 518 | offset += lowestBit(res[i]); 519 | break; 520 | } 521 | } 522 | return offset; 523 | } 524 | } 525 | 526 | int8_t SortedBitpack::geqGroup(const uint8_t *group_start, uint64_t *res) { 527 | loader_(group_start, buffer_); 528 | 529 | // Use SBoost algorithm to compare the loaded block with spanned 530 | __m512i loaded = _mm512_setr_epi64(buffer_[0], buffer_[1], buffer_[2], buffer_[3], 531 | buffer_[4], buffer_[5], buffer_[6], buffer_[7]); 532 | __m512i l = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 533 | __m512i r = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 534 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), l)); 535 | 536 | memset(res, 0, 64); 537 | writer_(r, res, extract); 538 | 539 | auto first = res[0] & 1; 540 | auto last = (res[last_index_] >> last_offset_) & 1; 541 | if (first) { 542 | return 0; 543 
| } else if (!last) { 544 | return -1; 545 | } else { 546 | uint32_t offset = 0; 547 | // Found between 548 | for (uint32_t i = 0; i <= last_index_; ++i) { 549 | if (res[i] == 0) { 550 | offset += 64; 551 | } else { 552 | offset += lowestBit(res[i]); 553 | break; 554 | } 555 | } 556 | return offset; 557 | } 558 | } 559 | 560 | void SortedBitpack::geqGroup2(const uint8_t *group_start, uint64_t *res) { 561 | loader_(group_start, buffer_); 562 | 563 | // Use SBoost algorithm to compare the loaded block with spanned 564 | __m512i loaded = _mm512_setr_epi64(buffer_[0], buffer_[1], buffer_[2], buffer_[3], 565 | buffer_[4], buffer_[5], buffer_[6], buffer_[7]); 566 | __m512i l = _mm512_sub_epi64(_mm512_or_si512(loaded, msbmask), l2); 567 | __m512i r = _mm512_and_si512(_mm512_or_si512(loaded, nspanned), 568 | _mm512_or_si512(_mm512_and_si512(loaded, nspanned), l)); 569 | 570 | uint32_t resindex = 0; 571 | uint32_t resoffset = 0; 572 | memset(res, 0, 64); 573 | for (int i = 0; i < 8; i++) { 574 | writeNext(res, _pext_u64(r[i], extract) & ((1L << entry_in_block_[i]) - 1), 575 | entry_in_block_[i], &resindex, &resoffset); 576 | } 577 | } 578 | 579 | uint32_t SortedBitpack::equal(const uint8_t *data, uint32_t numEntry) { 580 | // Init these here as they will be changed by the last call of geqGroup 581 | last_index_ = (group_size_ - 1) >> 6; 582 | last_offset_ = ((group_size_ - 1) & 0x3F); 583 | auto num_groups = (numEntry + group_size_ - 1) / group_size_; 584 | 585 | uint64_t bitmap_result[8]; 586 | 587 | uint32_t begin = 0; 588 | uint32_t end = num_groups - 1; 589 | while (begin < end) { 590 | auto current = (begin + end + 1) / 2; 591 | const uint8_t *current_buffer = data + current * group_bytes_; 592 | // Make comparison 593 | auto index = geqGroup(current_buffer, bitmap_result); 594 | if (index == -1) { 595 | // all small 596 | begin = current + 1; 597 | } else { 598 | auto eqindex = eqGroup(current_buffer, bitmap_result); 599 | if (eqindex != -1) { 600 | return current * 
group_size_ + eqindex; 601 | } 602 | if (index != 0) { 603 | return -1; 604 | } else { 605 | end = current - 1; 606 | } 607 | } 608 | } 609 | // Still need to search left group 610 | if (begin == num_groups - 1) { 611 | auto remain = numEntry % group_size_; 612 | last_index_ = (remain - 1) >> 6; 613 | last_offset_ = ((remain - 1) & 0x3F); 614 | } 615 | auto index = eqGroup(data + begin * group_bytes_, bitmap_result); 616 | return (index == -1) ? -1 : (begin * group_size_ + index); 617 | } 618 | 619 | uint32_t SortedBitpack::geq(const uint8_t *data, uint32_t numEntry) { 620 | // Init these here as they will be changed by the last call of geqGroup 621 | last_index_ = (group_size_ - 1) >> 6; 622 | last_offset_ = ((group_size_ - 1) & 0x3F); 623 | auto num_groups = (numEntry + group_size_ - 1) / group_size_; 624 | 625 | uint64_t bitmap_result[8]; 626 | 627 | uint32_t begin = 0; 628 | uint32_t end = num_groups - 1; 629 | while (begin < end) { 630 | auto current = (begin + end + 1) / 2; 631 | const uint8_t *current_buffer = data + current * group_bytes_; 632 | // Make comparison 633 | auto index = geqGroup(current_buffer, bitmap_result); 634 | if (index == -1) { 635 | // all small 636 | begin = current + 1; 637 | } else if (index != 0) { 638 | return current * group_size_ + index; 639 | } else { 640 | // This early stop does not bring obvious performance improvement 641 | // Comment it out for code clarity 642 | // auto eqindex = eqGroup(current_buffer, bitmap_result); 643 | // if (eqindex != -1) { 644 | // return current * group_size_ + eqindex; 645 | // } 646 | end = current - 1; 647 | } 648 | } 649 | if (begin > end) { 650 | // 651 | return begin * group_size_; 652 | } 653 | // Still need to search left group 654 | if (begin == num_groups - 1) { 655 | auto remain = numEntry % group_size_; 656 | last_index_ = (remain - 1) >> 6; 657 | last_offset_ = ((remain - 1) & 0x3F); 658 | } 659 | auto index = geqGroup(data + begin * group_bytes_, bitmap_result); 660 | return (index 
== -1) ? ((begin + 1) * group_size_) : (begin * group_size_ + index); 661 | } 662 | 663 | uint32_t SortedBitpack::greater(const uint8_t *data, uint32_t numEntry) { 664 | // Init these here as they will be changed by the last call of geqGroup 665 | last_index_ = (group_size_ - 1) >> 6; 666 | last_offset_ = ((group_size_ - 1) & 0x3F); 667 | auto num_groups = (numEntry + group_size_ - 1) / group_size_; 668 | 669 | uint64_t bitmap_result[8]; 670 | 671 | uint32_t begin = 0; 672 | uint32_t end = num_groups - 1; 673 | while (begin < end) { 674 | auto current = (begin + end + 1) / 2; 675 | const uint8_t *current_buffer = data + current * group_bytes_; 676 | // Make comparison 677 | auto index = greaterGroup(current_buffer, bitmap_result); 678 | if (index == -1) { 679 | // all leq 680 | begin = current + 1; 681 | } else if (index != 0) { 682 | return current * group_size_ + index; 683 | } else { 684 | // This early stop does not bring obvious performance improvement 685 | // Comment it out for code clarity 686 | // auto eqindex = eqGroup(current_buffer, bitmap_result); 687 | // if (eqindex != -1) { 688 | // return current * group_size_ + eqindex; 689 | // } 690 | end = current - 1; 691 | } 692 | } 693 | if (begin > end) { 694 | // 695 | return begin * group_size_; 696 | } 697 | // Still need to search left group 698 | if (begin == num_groups - 1) { 699 | auto remain = numEntry % group_size_; 700 | last_index_ = (remain - 1) >> 6; 701 | last_offset_ = ((remain - 1) & 0x3F); 702 | } 703 | auto index = greaterGroup(data + begin * group_bytes_, bitmap_result); 704 | return (index == -1) ? 
((begin + 1) * group_size_) : (begin * group_size_ + index); 705 | } 706 | 707 | BitpackCompare::BitpackCompare(uint32_t bitWidth) { 708 | this->bit_width_ = bitWidth; 709 | this->extract_ = EXTRACT_64[bitWidth]; 710 | this->mask_ = _mm512_set1_epi64(MASKS_64[bitWidth]); 711 | this->msbmask_ = _mm512_set1_epi64(EXTRACT_64[bitWidth]); 712 | } 713 | 714 | void BitpackCompare::less(const uint8_t *data1, const uint8_t *data2, 715 | uint32_t numEntry, uint64_t *res, uint32_t resoffset) { 716 | uint64_t buffer1[] = {0, 0, 0, 0, 0, 0, 0, 0}; 717 | uint64_t buffer2[] = {0, 0, 0, 0, 0, 0, 0, 0}; 718 | 719 | uint32_t counter = 0; 720 | uint32_t byteindex = 0; 721 | uint32_t bitoffset = 0; 722 | uint32_t entryInBlock[] = {0, 0, 0, 0, 0, 0, 0, 0}; 723 | 724 | uint32_t resindex = resoffset >> 6; 725 | resoffset &= 0x3F; 726 | 727 | while (counter < numEntry) { 728 | // Load next block and align it 729 | for (int i = 0; i < 8; i++) { 730 | auto bytidx = byteindex; 731 | auto bitoff = bitoffset; 732 | buffer1[i] = loadNext(data1, bit_width_, &byteindex, &bitoffset, entryInBlock + i); 733 | buffer2[i] = loadNext(data2, bit_width_, &bytidx, &bitoff, entryInBlock + i); 734 | counter += entryInBlock[i]; 735 | if (counter > numEntry) { 736 | entryInBlock[i] -= counter - numEntry; 737 | counter = numEntry; 738 | } 739 | } 740 | // Use SBoost algorithm to compare the loaded block 741 | __m512i loaded1 = _mm512_setr_epi64(buffer1[0], buffer1[1], buffer1[2], buffer1[3], 742 | buffer1[4], buffer1[5], buffer1[6], buffer1[7]); 743 | __m512i loaded2 = _mm512_setr_epi64(buffer2[0], buffer2[1], buffer2[2], buffer2[3], 744 | buffer2[4], buffer2[5], buffer2[6], buffer2[7]); 745 | 746 | __m512i spanned = loaded2; 747 | __m512i nspanned = _mm512_xor_si512(spanned, _mm512_set1_epi64(-1)); 748 | __m512i l2 = _mm512_and_si512(spanned, mask_); 749 | __m512i g2 = _mm512_or_si512(spanned, msbmask_); 750 | 751 | __m512i l = _mm512_sub_epi64(_mm512_or_si512(loaded1, msbmask_), l2); 752 | __m512i r = 
_mm512_and_si512(_mm512_or_si512(loaded1, nspanned), 753 | _mm512_or_si512(_mm512_and_si512(loaded1, nspanned), l)); 754 | 755 | // Use PEXT to collect result, use popcnt to count data 756 | for (int i = 0; i < 8; i++) { 757 | writeNext(res, ~_pext_u64(r[i], extract_) & ((1L << entryInBlock[i]) - 1), 758 | entryInBlock[i], &resindex, &resoffset); 759 | } 760 | } 761 | } 762 | } -------------------------------------------------------------------------------- /sboost.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 9/30/19. 3 | // 4 | 5 | #ifndef SBOOST_SBOOST_H 6 | #define SBOOST_SBOOST_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include "unpacker.h" 12 | 13 | namespace sboost { 14 | 15 | class Bitpack { 16 | protected: 17 | uint32_t bitWidth; 18 | uint32_t target; 19 | uint32_t target2; 20 | uint64_t extract; 21 | 22 | __m512i mask; 23 | __m512i msbmask; 24 | __m512i spanned; 25 | __m512i nspanned; 26 | __m512i l2; 27 | __m512i g2; 28 | 29 | __m512i spanned2; 30 | __m512i nspanned2; 31 | __m512i l22; 32 | __m512i g22; 33 | public: 34 | Bitpack(uint32_t bitWidth, uint32_t target); 35 | 36 | Bitpack(uint32_t bitWidth, uint32_t t1, uint32_t t2); 37 | 38 | virtual ~Bitpack(); 39 | 40 | void equal(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset); 41 | 42 | void less(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset); 43 | 44 | void leq(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset); 45 | 46 | void greater(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset); 47 | 48 | void geq(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset); 49 | 50 | uint32_t geq(const uint8_t *data, uint32_t numEntry); 51 | 52 | void rangele(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset); 53 | 54 | void between(const uint8_t *data, uint32_t numEntry, uint64_t *res, 
/**
 * Deprecated
 *
 * Benchmark shows using template does not make it faster
 *
 * Extends Bitpack with group bookkeeping fields and a `tgeq` entry point that
 * takes the same argument list as the bitmap-producing Bitpack::geq overload.
 */
class TemplateBitpack : public Bitpack {
protected:
    // NOTE(review): presumably the number of entries per group and the number
    // of bytes one group occupies -- confirm against the constructor.
    uint32_t group_size_;
    uint32_t group_bytes_;
    // NOTE(review): looks like a per-word entry-count table; ownership is not
    // visible from this declaration -- confirm who allocates/frees it.
    uint32_t *entry_in_blocks_;
public:
    // Same parameters as the single-target Bitpack constructor.
    TemplateBitpack(uint32_t bitWidth, uint32_t target);

    // data: packed input, numEntry: entry count, res: output bitmap,
    // resoffset: starting bit position inside res.
    void tgeq(const uint8_t *data, uint32_t numEntry, uint64_t *res, uint32_t resoffset);
};
127 | * @param group_start 128 | */ 129 | int8_t geqGroup(const uint8_t *group_start, uint64_t *); 130 | // Benchmark (loader_benchmark) shows this is not much faster 131 | /** 132 | * Same as geqGroup, used to determine whether generated code is 133 | * faster than dynamic code. 134 | * 135 | * Benchmark (loader_benchmark) shows this is not much faster 136 | * @param group_start 137 | */ 138 | void geqGroup2(const uint8_t *group_start, uint64_t *); 139 | }; 140 | 141 | class BitpackCompare { 142 | protected: 143 | uint32_t bit_width_; 144 | uint64_t extract_; 145 | 146 | __m512i mask_; 147 | __m512i msbmask_; 148 | public: 149 | BitpackCompare(uint32_t bitWidth); 150 | 151 | virtual ~BitpackCompare() = default; 152 | 153 | void less(const uint8_t *left, const uint8_t *right, uint32_t numEntry, uint64_t *res, uint32_t resoffset); 154 | }; 155 | 156 | using namespace std; 157 | 158 | const auto ZERO = _mm256_set1_epi64x(0); 159 | const auto IDX = _mm256_setr_epi32(8, 0, 1, 2, 3, 4, 5, 6); 160 | const auto IDX2 = _mm256_setr_epi32(0, 8, 2, 8, 1, 4, 3, 6); 161 | // const auto IDX3 = _mm256_setr_epi32(8,8,8,8,0,1,2,3); 162 | const auto INV = _mm256_setr_epi32(3, 2, 1, 0, 7, 6, 5, 4); 163 | 164 | inline __m256i cumsum32(__m256i b) { 165 | __m256i bp = _mm256_permutex2var_epi32(b, IDX, ZERO); 166 | __m256i s1 = _mm256_hadd_epi32(b, bp); 167 | __m256i s2 = _mm256_permutex2var_epi32(s1, IDX2, ZERO); 168 | __m256i s3 = _mm256_hadd_epi32(s1, s2); 169 | __m256i s4 = _mm256_permute2x128_si256(s3, ZERO, 0x2); 170 | __m256i result = _mm256_add_epi32(s3, s4); 171 | return _mm256_permutevar8x32_epi32(result, INV); 172 | } 173 | } 174 | #endif //SBOOST_SBOOST_H 175 | -------------------------------------------------------------------------------- /sboost_benchmark.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Hao Jiang on 11/30/20. 
3 | // 4 | // This benchmark is to study if we should template the loadNext function 5 | // 6 | 7 | #include 8 | #include "byteutils.h" 9 | #include "sboost.h" 10 | 11 | 12 | class SboostBenchmark : public benchmark::Fixture { 13 | protected: 14 | uint32_t *data_; 15 | uint8_t *output_; 16 | uint32_t numEntry_; 17 | uint8_t bitWidth_; 18 | uint64_t *res_; 19 | public: 20 | // add members as needed 21 | 22 | SboostBenchmark() { 23 | numEntry_ = 100000; 24 | bitWidth_ = 11; 25 | output_ = (uint8_t *) malloc(sizeof(uint32_t) * numEntry_); 26 | data_ = (uint32_t *) malloc(sizeof(uint32_t) * numEntry_); 27 | res_ = (uint64_t *) malloc(sizeof(uint64_t) * (numEntry_ + 1) / 64); 28 | 29 | for (uint32_t i = 0; i < numEntry_; ++i) { 30 | data_[i] = i; 31 | } 32 | ::sboost::byteutils::bitpack(data_, numEntry_, bitWidth_, output_); 33 | } 34 | 35 | virtual ~SboostBenchmark() { 36 | free(data_); 37 | free(output_); 38 | free(res_); 39 | } 40 | }; 41 | 42 | //BENCHMARK_F(SboostBenchmark, Dynamic)(benchmark::State &state) { 43 | // for (auto _ : state) { 44 | // //run your benchmark 45 | // sboost::Bitpack bp(11, 131); 46 | // bp.geq(output_, numEntry_, res_, 0); 47 | // } 48 | //} 49 | // 50 | //BENCHMARK_F(SboostBenchmark, Template)(benchmark::State &state) { 51 | // for (auto _ : state) { 52 | // sboost::TemplateBitpack tbp(11, 131); 53 | // tbp.geq(output_, numEntry_, res_, 0); 54 | // } 55 | //} 56 | 57 | int32_t binary_search(vector &data, int target) { 58 | auto begin = 0; 59 | auto end = data.size() - 1; 60 | while (begin < end) { 61 | auto current = (begin + end + 1) / 2; 62 | if (data[current] < target) { 63 | begin = current; 64 | } else if (data[current] == target) { 65 | return current; 66 | } else { 67 | end = current - 1; 68 | } 69 | } 70 | return begin; 71 | } 72 | 73 | int32_t binary_search_packed(uint8_t* data, int target, uint8_t offset) { 74 | 75 | } 76 | 77 | class SearchBenchmark : public benchmark::Fixture { 78 | protected: 79 | vector data_; 80 | uint8_t 
*output_; 81 | uint32_t numEntry_; 82 | uint8_t bitWidth_; 83 | uint64_t *res_; 84 | int32_t result_; 85 | public: 86 | 87 | SearchBenchmark() { 88 | numEntry_ = 1024; 89 | bitWidth_ = 17; 90 | output_ = (uint8_t *) malloc(sizeof(uint32_t) * numEntry_); 91 | 92 | for (uint32_t i = 0; i < numEntry_; ++i) { 93 | data_.push_back(i); 94 | } 95 | ::sboost::byteutils::bitpack(data_.data(), numEntry_, bitWidth_, output_); 96 | } 97 | 98 | virtual ~SearchBenchmark() { 99 | free(output_); 100 | free(res_); 101 | } 102 | }; 103 | 104 | BENCHMARK_F(SearchBenchmark, STL)(benchmark::State &state) { 105 | for (auto _: state) { 106 | // for (int i = 0; i < numEntry_; ++i) { 107 | auto i = numEntry_ / 5; 108 | result_ = binary_search(data_, i); 109 | // } 110 | } 111 | } 112 | 113 | BENCHMARK_F(SearchBenchmark, SBoost)(benchmark::State &state) { 114 | for (auto _ : state) { 115 | // for (int i = 0; i < numEntry_; ++i) { 116 | auto i = numEntry_ / 5; 117 | sboost::SortedBitpack sbp(bitWidth_, i); 118 | result_ = sbp.geq(output_, numEntry_); 119 | // } 120 | } 121 | } 122 | 123 | BENCHMARK_F(SearchBenchmark, SBoostSeq)(benchmark::State &state) { 124 | for (auto _ : state) { 125 | // for (int i = 0; i < numEntry_; ++i) { 126 | auto i = numEntry_ / 5; 127 | sboost::Bitpack bp(bitWidth_, i); 128 | result_ = bp.geq(output_, numEntry_); 129 | // } 130 | } 131 | } -------------------------------------------------------------------------------- /sboost_runner.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Hao Jiang on 12/8/20. 3 | // 4 | 5 | int main() { 6 | 7 | } -------------------------------------------------------------------------------- /sboost_test.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 8/18/19. 
3 | // 4 | 5 | #include 6 | #include "byteutils.h" 7 | #include "sboost.h" 8 | 9 | using namespace sboost; 10 | 11 | void prepareData(uint8_t *); 12 | 13 | TEST(SBoostTest, Equal) { 14 | uint8_t bitpacked2[192]; 15 | prepareData(bitpacked2); 16 | 17 | uint64_t result2_exp[8] = { 18 | 0x218012640444A10, 0xA20400042020005A, 0x24000184149010F, 0x8448308000120002, 19 | 0x3108154CA2000204, 0x80405C0088444832, 0x490043A00404246C, 0x24000201206016 20 | }; 21 | uint64_t result2[8] = {0}; 22 | 23 | sboost::Bitpack sBoost256(3, 3); 24 | 25 | sBoost256.equal(bitpacked2, 504, result2, 0); 26 | 27 | for (int i = 0; i < 8; i++) { 28 | EXPECT_EQ(result2[i], result2_exp[i]) << i; 29 | } 30 | } 31 | 32 | TEST(SBoostTest, Equal256SmallWidth) { 33 | uint32_t input[] = {1, 1, 2, 3, 1, 2, 1, 1, 2, 2, 1, 3, 1, 1, 2, 2, 34 | 1, 1, 2, 3, 1, 2, 1, 1, 2, 2, 1, 3, 1, 1, 2, 2, 35 | 1, 1, 2, 3, 1, 2, 1, 1, 2, 2, 1, 3, 1, 1, 2, 2, 36 | 1, 1, 2, 3, 1, 2, 1, 1, 2, 2, 1, 3, 1, 1, 2, 2, 37 | 1, 1, 2, 3, 1, 2, 1, 1, 2, 2, 1, 3, 1, 1, 2, 2, 38 | 1, 1, 2, 3, 1, 2, 1, 1, 2, 2, 1, 3, 1, 1, 2, 2 39 | }; 40 | uint8_t bitpacked[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 41 | byteutils::bitpack(input, 96, 2, bitpacked); 42 | 43 | sboost::Bitpack sBoost(2, 1); 44 | 45 | uint64_t result[] = {0, 0}; 46 | sBoost.equal(bitpacked, 96, result, 0); 47 | 48 | EXPECT_EQ(result[0], 0x34d334d334d334D3); 49 | EXPECT_EQ(result[1], 0x34d334D3); 50 | } 51 | 52 | TEST(SBoostTest, Less256) { 53 | uint32_t input[] = {13, 22, 1, 9, 25, 17, 6, 22, 12, 31, 12, 21, 0, 5}; 54 | uint8_t bitpacked[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 55 | byteutils::bitpack(input, 14, 5, bitpacked); 56 | 57 | sboost::Bitpack sBoost(5, 22); 58 | 59 | uint64_t result[] = {0}; 60 | sBoost.less(bitpacked, 14, result, 0); 61 | 62 | EXPECT_EQ(result[0], 0x3D6D); 63 | 64 | } 65 | 66 | TEST(SBoostTest, Greater256) { 67 | uint32_t input[] = {13, 22, 1, 9, 25, 17, 6, 22, 12, 31, 12, 21, 0, 5}; 68 | uint8_t bitpacked[] = {0, 0, 
0, 0, 0, 0, 0, 0, 0}; 69 | byteutils::bitpack(input, 14, 5, bitpacked); 70 | 71 | sboost::Bitpack sBoost(5, 22); 72 | 73 | uint64_t result[] = {0}; 74 | sBoost.greater(bitpacked, 14, result, 0); 75 | 76 | EXPECT_EQ(result[0], 0x210); 77 | } 78 | 79 | 80 | TEST(SBoostTest, RangeLe256) { 81 | uint32_t input[] = {13, 22, 1, 9, 25, 17, 6, 22, 12, 31, 12, 21, 0, 5}; 82 | uint8_t bitpacked[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 83 | byteutils::bitpack(input, 14, 5, bitpacked); 84 | 85 | sboost::Bitpack sBoost(5, 17, 23); 86 | 87 | uint64_t result[] = {0}; 88 | sBoost.rangele(bitpacked, 14, result, 0); 89 | 90 | EXPECT_EQ(result[0], 0x8A2); 91 | } 92 | 93 | TEST(SBoostTest, Between256) { 94 | uint32_t input[] = {13, 22, 1, 9, 25, 17, 6, 22, 12, 31, 12, 21, 0, 5}; 95 | uint8_t bitpacked[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 96 | byteutils::bitpack(input, 14, 5, bitpacked); 97 | 98 | sboost::Bitpack sBoost(5, 17, 22); 99 | 100 | uint64_t result[] = {0}; 101 | sBoost.between(bitpacked, 14, result, 0); 102 | 103 | EXPECT_EQ(result[0], 0x8A2); 104 | } 105 | 106 | 107 | TEST(SBoostTest, Equal512) { 108 | uint8_t bitpacked2[192]; 109 | prepareData(bitpacked2); 110 | 111 | uint64_t result2_exp[8] = { 112 | 0x218012640444A10, 0xA20400042020005A, 0x24000184149010F, 0x8448308000120002, 113 | 0x3108154CA2000204, 0x80405C0088444832, 0x490043A00404246C, 0x24000201206016 114 | }; 115 | uint64_t result2[8] = {0}; 116 | sboost::Bitpack sBoost512(3, 3); 117 | sBoost512.equal(bitpacked2, 504, result2, 0); 118 | 119 | for (int i = 0; i < 8; i++) { 120 | EXPECT_EQ(result2[i], result2_exp[i]) << i; 121 | } 122 | } 123 | 124 | TEST(SortedBitpackTest, GeqOnEq) { 125 | uint32_t input[10000]; 126 | for (int i = 0; i < 10000; ++i) { 127 | input[i] = i; 128 | } 129 | uint8_t bitpacked[10000 * 4]; 130 | memset(bitpacked, 0, 40000); 131 | byteutils::bitpack(input, 10000, 14, bitpacked); 132 | 133 | 134 | auto result = 0; 135 | for (int i = 0; i < 10000; ++i) { 136 | // Test search in the middle 137 | 
SortedBitpack sbp(14, i); 138 | result = sbp.geq(bitpacked, 10000); 139 | EXPECT_EQ(i, result) << i; 140 | } 141 | } 142 | 143 | TEST(SortedBitpackTest, GeqOnSkip) { 144 | uint32_t input[10000]; 145 | for (int i = 0; i < 10000; ++i) { 146 | input[i] = i * 2; 147 | } 148 | uint8_t bitpacked[10000 * 4]; 149 | memset(bitpacked, 0, 40000); 150 | byteutils::bitpack(input, 10000, 15, bitpacked); 151 | 152 | auto result = 0; 153 | for (int i = 0; i < 19999; ++i) { 154 | // Test search in the middle 155 | SortedBitpack sbp(15, i); 156 | result = sbp.geq(bitpacked, 10000); 157 | ASSERT_EQ(((i + 1) / 2), result) << i; 158 | } 159 | } 160 | 161 | TEST(SortedBitpackTest, EqOnEq) { 162 | uint32_t input[10000]; 163 | for (int i = 0; i < 10000; ++i) { 164 | input[i] = i; 165 | } 166 | uint8_t bitpacked[10000 * 4]; 167 | memset(bitpacked, 0, 40000); 168 | byteutils::bitpack(input, 10000, 14, bitpacked); 169 | 170 | 171 | auto result = 0; 172 | for (int i = 0; i < 10000; ++i) { 173 | // Test search in the middle 174 | SortedBitpack sbp(14, i); 175 | result = sbp.equal(bitpacked, 10000); 176 | EXPECT_EQ(i, result) << i; 177 | } 178 | 179 | // SortedBitpack sbp(14, 32); 180 | // result = sbp.equal(bitpacked, 10000); 181 | // EXPECT_EQ(32, result); 182 | } 183 | 184 | TEST(SortedBitpackTest, EqOnSkip) { 185 | uint32_t input[10000]; 186 | for (int i = 0; i < 10000; ++i) { 187 | input[i] = i * 2; 188 | } 189 | uint8_t bitpacked[10000 * 4]; 190 | memset(bitpacked, 0, 40000); 191 | byteutils::bitpack(input, 10000, 15, bitpacked); 192 | 193 | auto result = 0; 194 | for (int i = 0; i < 19999; ++i) { 195 | // Test search in the middle 196 | SortedBitpack sbp(15, i); 197 | result = sbp.equal(bitpacked, 10000); 198 | if (i % 2) { 199 | ASSERT_EQ(-1, result) << i; 200 | } else { 201 | ASSERT_EQ(i / 2, result) << i; 202 | } 203 | } 204 | } 205 | 206 | TEST(BitpackCompareTest, Less) { 207 | uint32_t input1[] = {13, 22, 1, 9, 25, 17, 6, 22, 12, 31, 12, 21, 0, 5}; 208 | uint8_t bitpacked1[] = {0, 
0, 0, 0, 0, 0, 0, 0, 0}; 209 | byteutils::bitpack(input1, 14, 5, bitpacked1); 210 | 211 | uint32_t input2[] = {30, 10, 9, 29, 5, 17, 8, 2, 2, 8, 22, 10, 0, 9}; 212 | uint8_t bitpacked2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 213 | byteutils::bitpack(input2, 14, 5, bitpacked2); 214 | 215 | uint64_t result[] = {0, 0}; 216 | 217 | BitpackCompare compare(5); 218 | compare.less(bitpacked1, bitpacked2, 14, result, 0); 219 | 220 | EXPECT_EQ(0x244d, /* 10 0100 0100 1101*/ result[0]); 221 | } 222 | 223 | TEST(SDeltaTest, Cumsum) { 224 | // int32_t values[8] = {32432, 42442, 529532, 13442, 2525232, 3143243, 423432, 23232}; 225 | int32_t values[8] = {1, 2, 3, 4, 5, 6, 7, 8}; 226 | __m256i holder = _mm256_setr_epi32(values[0], values[1], values[2], values[3], 227 | values[4], values[5], values[6], values[7]); 228 | __m256i cs = cumsum32(holder); 229 | 230 | for (int i = 1; i < 8; ++i) { 231 | values[i] += values[i - 1]; 232 | } 233 | 234 | for (int i = 0; i < 8; i++) { 235 | EXPECT_EQ(values[i], (cs[i / 2] >> (i % 2) * 32) & 0xFFFFFFFF); 236 | } 237 | } 238 | 239 | namespace sboost { 240 | extern uint8_t 241 | writeNext(uint64_t *res, uint64_t bits, uint32_t entryInBlock, uint32_t *resindex, uint32_t *resoffset); 242 | } 243 | 244 | #include "loader.h" 245 | 246 | namespace sboost { 247 | extern const uint64_t EXTRACT_64[]; 248 | } 249 | 250 | TEST(LoaderTest, Writer) { 251 | __m512i target = _mm512_setr4_epi64(0x12345678ABCDEF2, 0x112234552DEF4233, 0x2424249873DDDEFF, 0x375928DEab558901); 252 | for (int i = 2; i < 32; ++i) { 253 | auto extract = sboost::EXTRACT_64[i]; 254 | auto entryInBlock = loader::entryInBlocks[i]; 255 | uint64_t buffer1[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 256 | 257 | loader::writers[i](target, buffer1, extract); 258 | 259 | uint64_t buffer2[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 260 | uint32_t resindex = 0; 261 | uint32_t resoffset = 0; 262 | for (int i = 0; i < 8; i++) { 263 | writeNext(buffer2, _pext_u64(target[i], extract) & ((1L << entryInBlock[i]) - 1), 264 | 
/**
 * Fill `bitpacked` (at least 192 bytes) with the fixed packed fixture used by
 * the Equal / Equal512 tests. The last three bytes are zero.
 */
void prepareData(uint8_t *bitpacked) {
    static const uint8_t kFixture[192] = {
            0x48, 0xB4, 0x04, 0x1C, 0x06, 0x8E, 0xC1, 0x28,  //   0 -   7
            0x4D, 0x0C, 0x48, 0x0C, 0xD9, 0xC0, 0x05, 0x83,  //   8 -  15
            0x92, 0x00, 0x14, 0xB7, 0x88, 0x99, 0x44, 0x05,  //  16 -  23
            0x99, 0x36, 0x8E, 0x50, 0x08, 0x8A, 0x08, 0xC8,  //  24 -  31
            0x31, 0x44, 0xA4, 0x29, 0xC1, 0x28, 0x80, 0x52,  //  32 -  39
            0x80, 0x88, 0xE0, 0xC0, 0x50, 0x18, 0xC2, 0x69,  //  40 -  47
            0xDB, 0x16, 0x90, 0x93, 0xC4, 0x48, 0x63, 0x16,  //  48 -  55
            0x0D, 0x63, 0x82, 0x4C, 0x12, 0x37, 0x86, 0x84,  //  56 -  63
            0x04, 0x92, 0xA0, 0xC8, 0x2C, 0x18, 0xA1, 0x40,  //  64 -  71
            0x5A, 0x98, 0x10, 0x8A, 0x80, 0x44, 0x1C, 0x30,  //  72 -  79
            0x91, 0x51, 0x00, 0x00, 0x89, 0x08, 0x61, 0x4C,  //  80 -  87
            0xB2, 0x51, 0x51, 0xA6, 0x8C, 0xE2, 0x18, 0x61,  //  88 -  95
            0xD1, 0x94, 0x50, 0x5A, 0xA4, 0x48, 0x64, 0x02,  //  96 - 103
            0x04, 0x58, 0xC2, 0x69, 0xD1, 0x26, 0x0C, 0xC3,  // 104 - 111
            0x32, 0x45, 0x0C, 0x47, 0x26, 0x13, 0xB5, 0x51,  // 112 - 119
            0x5C, 0xB8, 0x09, 0x50, 0x16, 0x2C, 0xE2, 0xC0,  // 120 - 127
            0x8C, 0x21, 0x06, 0x64, 0x00, 0x19, 0x91, 0xC8,  // 128 - 135
            0x36, 0x4D, 0x64, 0xA4, 0x2C, 0x42, 0x80, 0x68,  // 136 - 143
            0xC1, 0x96, 0x0D, 0xD4, 0xC0, 0x01, 0xC8, 0x20,  // 144 - 151
            0x0A, 0xC4, 0xC0, 0x50, 0x20, 0xC0, 0x61, 0x9B,  // 152 - 159
            0x40, 0x0C, 0x94, 0x18, 0x41, 0x23, 0x47, 0x0E,  // 160 - 167
            0xDA, 0xB0, 0x80, 0x84, 0x98, 0x2D, 0x94, 0x88,  // 168 - 175
            0x51, 0x13, 0x10, 0x84, 0x9C, 0xA0, 0x24, 0x61,  // 176 - 183
            0x22, 0x42, 0xC0, 0xC2, 0x89, 0x00, 0x00, 0x00   // 184 - 191
    };

    for (unsigned i = 0; i < sizeof(kFixture); ++i) {
        bitpacked[i] = kFixture[i];
    }
}
3 | // 4 | 5 | #include "simd.h" 6 | #include 7 | 8 | namespace sboost { 9 | 10 | namespace simd { 11 | 12 | void simd_or(uint64_t *a, uint64_t *b, uint32_t size) { 13 | uint32_t loop = size >> 3; 14 | uint64_t *inout = a; 15 | uint64_t *in = b; 16 | for (uint32_t i = 0; i < loop; ++i) { 17 | _mm512_store_si512((void *) inout, _mm512_or_si512(_mm512_load_si512((const void *) inout), 18 | _mm512_load_si512((const void *) in))); 19 | inout += 8; 20 | in += 8; 21 | } 22 | uint32_t remain = size & 0x7; 23 | for (uint32_t i = 0; i < remain; ++i) { 24 | *inout |= *in; 25 | inout++; 26 | in++; 27 | } 28 | } 29 | 30 | void simd_and(uint64_t *a, uint64_t *b, uint32_t size) { 31 | uint32_t loop = size >> 3; 32 | uint64_t *inout = a; 33 | uint64_t *in = b; 34 | for (uint32_t i = 0; i < loop; ++i) { 35 | _mm512_store_si512((void *) inout, _mm512_and_si512(_mm512_load_si512((const void *) inout), 36 | _mm512_load_si512((const void *) in))); 37 | inout += 8; 38 | in += 8; 39 | } 40 | uint32_t remain = size & 0x7; 41 | for (uint32_t i = 0; i < remain; ++i) { 42 | *inout &= *in; 43 | inout++; 44 | in++; 45 | } 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /simd.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 3/17/20. 3 | // 4 | 5 | #ifndef ARROW_SIMD_H 6 | #define ARROW_SIMD_H 7 | 8 | #include 9 | 10 | namespace sboost { 11 | 12 | namespace simd { 13 | 14 | void simd_or(uint64_t *a, uint64_t *b, uint32_t size); 15 | 16 | void simd_and(uint64_t *a, uint64_t *b, uint32_t size); 17 | } 18 | } 19 | 20 | 21 | #endif //ARROW_SIMD_H 22 | -------------------------------------------------------------------------------- /tool.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Hao Jiang on 11/30/20. 
3 | // 4 | #include 5 | #include "loader.h" 6 | 7 | using namespace std; 8 | 9 | int main() { 10 | for (int i = 1; i < 32; ++i) { 11 | auto bitwidth = i; 12 | auto entryInBlocks = sboost::loader::entryInBlocks[i]; 13 | auto resindex = 0; 14 | auto resoffset = 0; 15 | cout << "void write" << i << "(__m512i res,uint64_t* bitmap,uint64_t extract) {\n"; 16 | cout << "uint64_t bits = 0;\n"; 17 | for (int j = 0; j < 8; ++j) { 18 | auto eib = entryInBlocks[j]; 19 | cout << "bits = _pext_u64(res[" << j << "],extract) & ((1L << " << eib << ")-1);\n"; 20 | if (resoffset + eib < 64) { 21 | cout << "bitmap[" << resindex << "] |= bits << " << resoffset << ";\n"; 22 | resoffset += eib; 23 | } else if (resoffset + eib == 64) { 24 | cout << "bitmap[" << resindex << "] |= bits << " << resoffset << ";\n"; 25 | resindex += 1; 26 | resoffset = 0; 27 | } else { 28 | cout << "bitmap[" << resindex << "] |= bits << " << resoffset << ";\n"; 29 | resoffset += eib; 30 | resindex += 1; 31 | resoffset &= 0x3F; 32 | cout << "bitmap[" << resindex << "] |= bits >> " << (eib - resoffset) << ";\n"; 33 | } 34 | } 35 | cout << "}\n"; 36 | } 37 | } -------------------------------------------------------------------------------- /unpacker.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 3/15/18. 
3 | // 4 | 5 | #include 6 | #include 7 | #include "unpacker.h" 8 | 9 | namespace sboost { 10 | using namespace std; 11 | 12 | Small32Unpacker::Small32Unpacker(uint32_t es) : entrySize_(es) { 13 | assert(es < 26); 14 | if (es == 0) { 15 | return; 16 | } 17 | 18 | this->shuffleInst_ = (__m256i *) aligned_alloc(32, 32 * 8); 19 | this->shiftInst_ = (__m256i *) aligned_alloc(32, 32 * 8); 20 | 21 | this->mask_ = (__m256i *) aligned_alloc(32, 32); 22 | mask_[0] = _mm256_set1_epi32((1 << es) - 1); 23 | 24 | __m128i *shuffleBuffer = (__m128i *) aligned_alloc(16, 16 * 8); 25 | __m128i *shiftBuffer = (__m128i *) aligned_alloc(16, 16 * 8); 26 | uint32_t *shuffleDataBuffer = (uint32_t *) aligned_alloc(16, 16); 27 | uint32_t *shiftDataBuffer = (uint32_t *) aligned_alloc(16, 16); 28 | 29 | nextPos_ = (entrySize_ * 4) / 8; 30 | int higher = (entrySize_ * 4) % 8; 31 | // Compute shuffle and shift instructions 32 | std::vector cand{0, higher}; 33 | for (int const &offset : cand) { 34 | for (int idx = 0; idx < 4; idx++) { 35 | uint32_t entryoff = offset + entrySize_ * idx; 36 | shiftDataBuffer[idx] = entryoff % 8; 37 | 38 | uint8_t round = (shiftDataBuffer[idx] + entrySize_ - 1) / 8; 39 | shuffleDataBuffer[idx] = 0; 40 | uint32_t start = entryoff / 8; 41 | for (uint32_t bi = 0; bi <= round; bi++) { 42 | shuffleDataBuffer[idx] |= (start + bi) << (bi * 8); 43 | } 44 | for (uint32_t bi = round + 1; bi < 4; bi++) { 45 | shuffleDataBuffer[idx] |= 0xff << bi * 8; 46 | } 47 | } 48 | 49 | shuffleBuffer[offset] = _mm_load_si128((__m128i *) shuffleDataBuffer); 50 | shiftBuffer[offset] = _mm_load_si128((__m128i *) shiftDataBuffer); 51 | } 52 | 53 | // Combine them to make 256-bit shuffle and shift instructions 54 | this->shuffleInst_[0] = _mm256_set_m128i(shuffleBuffer[higher], shuffleBuffer[0]); 55 | this->shiftInst_[0] = _mm256_set_m128i(shiftBuffer[higher], shiftBuffer[0]); 56 | 57 | free(shuffleDataBuffer); 58 | free(shiftDataBuffer); 59 | free(shuffleBuffer); 60 | free(shiftBuffer); 61 | } 
62 | 63 | Small32Unpacker::~Small32Unpacker() { 64 | free(shuffleInst_); 65 | free(mask_); 66 | free(shiftInst_); 67 | } 68 | 69 | __m256i Small32Unpacker::unpack(const uint8_t *data) { 70 | // Load data into 2 128 buffer and combine as 256 71 | __m256i main = _mm256_loadu2_m128i((__m128i *) (data + nextPos_), (__m128i *) data); 72 | // Shuffle 73 | __m256i shuffle = _mm256_shuffle_epi8(main, shuffleInst_[0]); 74 | // Shift 75 | __m256i shift = _mm256_srlv_epi32(shuffle, shiftInst_[0]); 76 | // Mask 77 | return _mm256_and_si256(shift, mask_[0]); 78 | } 79 | 80 | 81 | Large32Unpacker::Large32Unpacker(uint32_t es) : entrySize_(es) { 82 | assert(es >= 26 && es < 32); 83 | 84 | shiftInst_ = (__m512i *) aligned_alloc(64, 8 * 64); 85 | shuffleInst_ = (__m512i *) aligned_alloc(64, 8 * 64); 86 | 87 | mask_ = (__m256i *) aligned_alloc(32, 32); 88 | mask_[0] = _mm256_set1_epi32((1u << entrySize_) - 1); 89 | 90 | __m128i *shuffleBuffer = (__m128i *) aligned_alloc(16, 16 * 8); 91 | __m128i *shiftBuffer = (__m128i *) aligned_alloc(16, 16 * 8); 92 | uint64_t *shuffleDataBuffer = (uint64_t *) aligned_alloc(16, 16); 93 | uint64_t *shiftDataBuffer = (uint64_t *) aligned_alloc(16, 16); 94 | 95 | for (int j = 0; j < 3; j++) { 96 | nextPos_[j] = (entrySize_ * 2 * (j + 1)) / 8; 97 | } 98 | 99 | int i = 0; 100 | int high = (i + entrySize_ * 2) % 8; 101 | int higher = (high + entrySize_ * 2) % 8; 102 | int evenHigher = (higher + entrySize_ * 2) % 8; 103 | 104 | // Compute shuffle and shift instructions 105 | for (int offset : {i, high, higher, evenHigher}) { 106 | for (int idx = 0; idx < 2; idx++) { 107 | uint32_t entryoff = offset + entrySize_ * idx; 108 | shiftDataBuffer[idx] = entryoff % 8; 109 | 110 | uint8_t round = (shiftDataBuffer[idx] + entrySize_) / 8; 111 | shuffleDataBuffer[idx] = 0; 112 | uint64_t start = entryoff / 8; 113 | for (int bi = 0; bi <= round; bi++) { 114 | shuffleDataBuffer[idx] |= (start + bi) << bi * 8; 115 | } 116 | for (int bi = round + 1; bi < 8; bi++) { 117 
| shuffleDataBuffer[idx] |= 0xffL << bi * 8; 118 | } 119 | } 120 | 121 | 122 | shuffleBuffer[offset] = _mm_load_si128((__m128i *) shuffleDataBuffer); 123 | shiftBuffer[offset] = _mm_load_si128((__m128i *) shiftDataBuffer); 124 | } 125 | 126 | // Combine them to make 512-bit shuffle and shift instructions 127 | 128 | __m128i su0 = shuffleBuffer[i]; 129 | __m128i su1 = shuffleBuffer[high]; 130 | __m128i su2 = shuffleBuffer[higher]; 131 | __m128i su3 = shuffleBuffer[evenHigher]; 132 | 133 | __m512i shuffle = _mm512_castsi128_si512(su0); 134 | shuffle = _mm512_inserti64x2(shuffle, su1, 1); 135 | shuffle = _mm512_inserti64x2(shuffle, su2, 2); 136 | shuffle = _mm512_inserti64x2(shuffle, su3, 3); 137 | this->shuffleInst_[0] = shuffle; 138 | 139 | __m128i sh0 = shiftBuffer[i]; 140 | __m128i sh1 = shiftBuffer[high]; 141 | __m128i sh2 = shiftBuffer[higher]; 142 | __m128i sh3 = shiftBuffer[evenHigher]; 143 | 144 | __m512i shift = _mm512_castsi128_si512(sh0); 145 | shift = _mm512_inserti64x2(shift, sh1, 1); 146 | shift = _mm512_inserti64x2(shift, sh2, 2); 147 | shift = _mm512_inserti64x2(shift, sh3, 3); 148 | this->shiftInst_[i] = shift; 149 | 150 | 151 | free(shuffleDataBuffer); 152 | free(shiftDataBuffer); 153 | free(shuffleBuffer); 154 | free(shiftBuffer); 155 | } 156 | 157 | Large32Unpacker::~Large32Unpacker() { 158 | free(shiftInst_); 159 | free(shuffleInst_); 160 | free(mask_); 161 | } 162 | 163 | __m256i Large32Unpacker::unpack(const uint8_t *data) { 164 | // Load 4 128 bit into a 512 bit register 165 | __m256i lower = _mm256_loadu2_m128i((__m128i *) (data + nextPos_[0]), (__m128i *) data); 166 | __m256i higher = _mm256_loadu2_m128i((__m128i *) (data + nextPos_[2]), 167 | (__m128i *) (data + nextPos_[1])); 168 | // Get a single 512 bit 169 | __m512i main = _mm512_castsi256_si512(lower); 170 | main = _mm512_inserti64x4(main, higher, 1); 171 | 172 | // Shuffle 173 | __m512i shuffle = _mm512_shuffle_epi8(main, shuffleInst_[0]); 174 | // Shift 175 | __m512i shift = 
_mm512_srlv_epi64(shuffle, shiftInst_[0]); 176 | // Mask 177 | return _mm256_and_si256(_mm512_cvtepi64_epi32(shift), mask_[0]); 178 | } 179 | 180 | void unpackScalar(const uint8_t *input, uint32_t numEntry, uint8_t bitWidth, uint32_t *output) { 181 | uint32_t index = 0; 182 | uint8_t offset = 0; 183 | uint32_t mask = (1u << bitWidth) - 1; 184 | const uint64_t *view = (const uint64_t *) input; 185 | for (uint32_t i = 0; i < numEntry; ++i) { 186 | output[i] = static_cast((view[index] >> offset) & mask); 187 | offset += bitWidth; 188 | index += offset >> 6; 189 | if (offset > 64) { 190 | int extra = offset - 64; 191 | output[i] |= ((view[index] & ((1 << extra) - 1))) << (bitWidth - extra); 192 | } 193 | offset &= 0x3F; 194 | } 195 | } 196 | } -------------------------------------------------------------------------------- /unpacker.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 3/15/18. 3 | // 4 | 5 | #ifndef SBOOST_UNPACKER_H 6 | #define SBOOST_UNPACKER_H 7 | 8 | 9 | /** 10 | * Implementations of unpacking integer into 256-bit SIMD 11 | */ 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #define _mm256_loadu2_m128i(vh, vl) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((vl))), _mm_loadu_si128(vh), 1) 18 | #define _mm256_set_m128i(vh, vl) _mm256_insertf128_si256(_mm256_castsi128_si256(vl), (vh), 1) 19 | 20 | using namespace std; 21 | using namespace std::placeholders; 22 | namespace sboost { 23 | 24 | class Unpacker { 25 | public: 26 | virtual __m256i unpack(const uint8_t *) = 0; 27 | }; 28 | 29 | class Small32Unpacker : public Unpacker { 30 | private: 31 | uint32_t entrySize_; 32 | uint8_t nextPos_; 33 | __m256i *shuffleInst_; 34 | __m256i *shiftInst_; 35 | __m256i *mask_; 36 | public: 37 | Small32Unpacker(uint32_t es); 38 | 39 | virtual ~Small32Unpacker(); 40 | 41 | __m256i unpack(const uint8_t *data) override; 42 | }; 43 | 44 | class Large32Unpacker : public 
Unpacker { 45 | private: 46 | uint32_t entrySize_; 47 | std::array nextPos_; 48 | __m512i *shuffleInst_; 49 | __m512i *shiftInst_; 50 | __m256i *mask_; 51 | public: 52 | Large32Unpacker(uint32_t es); 53 | 54 | virtual ~Large32Unpacker(); 55 | 56 | __m256i unpack(const uint8_t *data); 57 | }; 58 | 59 | /** 60 | * Do not use inheritance for performance 61 | */ 62 | static array unpackers = { 63 | new Small32Unpacker(0), new Small32Unpacker(1), new Small32Unpacker(2), 64 | new Small32Unpacker(3), new Small32Unpacker(4), new Small32Unpacker(5), 65 | new Small32Unpacker(6), new Small32Unpacker(7), new Small32Unpacker(8), 66 | new Small32Unpacker(9), new Small32Unpacker(10), new Small32Unpacker(11), 67 | new Small32Unpacker(12), new Small32Unpacker(13), new Small32Unpacker(14), 68 | new Small32Unpacker(15), new Small32Unpacker(16), new Small32Unpacker(17), 69 | new Small32Unpacker(18), new Small32Unpacker(19), new Small32Unpacker(20), 70 | new Small32Unpacker(21), new Small32Unpacker(22), new Small32Unpacker(23), 71 | new Small32Unpacker(24), new Small32Unpacker(25), new Large32Unpacker(26), 72 | new Large32Unpacker(27), new Large32Unpacker(28), new Large32Unpacker(29), 73 | new Large32Unpacker(30), new Large32Unpacker(31) 74 | }; 75 | 76 | void unpackScalar(const uint8_t *input, uint32_t numEntry, uint8_t bitWidth, uint32_t *output); 77 | 78 | template 79 | void unpack(const uint8_t *input, uint32_t numEntry, uint32_t *output) { 80 | auto upckr = unpackers[bitWidth]; 81 | uint32_t round = numEntry >> 3; 82 | uint32_t ioff = 0; 83 | for (uint i = 0; i < round; ++i) { 84 | __m256i result = upckr->unpack(input + ioff); 85 | _mm256_storeu_si256(((__m256i *) output) + i, result); 86 | ioff += bitWidth; 87 | } 88 | unpackScalar(input + (round * bitWidth), numEntry & 0x7, bitWidth, output + (round << 3)); 89 | } 90 | 91 | static std::array, 32> unpacks = 92 | {unpack<0>, unpack<1>, unpack<2>, unpack<3>, unpack<4>, unpack<5>, unpack<6>, unpack<7>, unpack<8>, 93 | unpack<9>, 
unpack<10>, unpack<11>, unpack<12>, unpack<13>, unpack<14>, unpack<15>, unpack<16>, unpack<17>, 94 | unpack<18>, unpack<19>, unpack<20>, unpack<21>, unpack<22>, unpack<23>, unpack<24>, unpack<25>, unpack<26>, 95 | unpack<27>, unpack<28>, unpack<29>, unpack<30>, unpack<31>}; 96 | } 97 | #endif //SBOOST_UNPACKER_H 98 | -------------------------------------------------------------------------------- /unpacker_benchmark.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 3/2/20. 3 | // 4 | 5 | #include 6 | #include "byteutils.h" 7 | #include "unpacker.h" 8 | 9 | class UnpackerBenchmark : public benchmark::Fixture { 10 | protected: 11 | uint8_t *data_; 12 | uint32_t *output_; 13 | uint64_t numEntry_; 14 | uint8_t bitWidth_; 15 | public: 16 | // add members as needed 17 | 18 | UnpackerBenchmark() { 19 | numEntry_ = 1000000; 20 | bitWidth_ = 11; 21 | uint32_t mask = (1 << bitWidth_) - 1; 22 | data_ = (uint8_t *) malloc((bitWidth_ * numEntry_ + 8) >> 3); 23 | output_ = (uint32_t *) malloc(sizeof(uint32_t) * numEntry_); 24 | 25 | srand(time(NULL)); 26 | uint32_t *values = (uint32_t *) malloc(sizeof(int32_t) * numEntry_); 27 | for (uint32_t i = 0; i < numEntry_; ++i) { 28 | values[i] = static_cast(rand() & mask); 29 | } 30 | ::sboost::byteutils::bitpack(values, numEntry_, bitWidth_, data_); 31 | free(values); 32 | } 33 | 34 | virtual ~UnpackerBenchmark() { 35 | free(data_); 36 | free(output_); 37 | } 38 | }; 39 | 40 | BENCHMARK_F(UnpackerBenchmark, Scalar)(benchmark::State &state) { 41 | for (auto _ : state) { 42 | //run your benchmark 43 | sboost::unpackScalar(data_, numEntry_, bitWidth_, output_); 44 | } 45 | } 46 | 47 | BENCHMARK_F(UnpackerBenchmark, SBoost)(benchmark::State &state) { 48 | for (auto _ : state) { 49 | sboost::unpacks[bitWidth_](data_, numEntry_, output_); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /unpacker_test.cc: 
-------------------------------------------------------------------------------- 1 | // 2 | // Created by harper on 3/15/18. 3 | // 4 | 5 | #include 6 | #include "byteutils.h" 7 | #include "unpacker.h" 8 | 9 | using namespace sboost; 10 | 11 | 12 | int extract_entry(int *input, int index, int offset, int entrySize) { 13 | int mask = (1 << entrySize) - 1; 14 | int word0 = *(input + index); 15 | if (offset + entrySize <= 32) { 16 | return (word0 >> offset) & mask; 17 | } else { 18 | int mask0 = (1 << (32 - offset)) - 1; 19 | int word1 = *(input + index + 1); 20 | return ((word0 >> offset & mask0) | word1 << (32 - offset)) & mask; 21 | } 22 | } 23 | 24 | TEST(Small32Unpacker, unpack) { 25 | int entrySize = 21; 26 | int entryCount = 17; 27 | uint32_t data[17] = {20124, 8831, 2575, 1977, 15, 42441, 302690, 871222, 323452, 424532, 29434, 28 | 939141, 4244, 324314, 13, 1, 874255}; 29 | uint8_t output[64]{0}; 30 | byteutils::bitpack(data, entryCount, entrySize, output); 31 | 32 | Small32Unpacker unpacker(entrySize); 33 | 34 | 35 | for (int o = 0; o < entryCount; o += 8) { 36 | __m256i unpacked = unpacker.unpack(output + entrySize * (o >> 3)); 37 | 38 | for (int i = 0; i < 8; i++) { 39 | int bitoff = (o + i) * entrySize; 40 | int extract = extract_entry((int *) output, bitoff / 32, bitoff % 32, entrySize); 41 | EXPECT_EQ(extract, (unpacked[i / 2] >> (i % 2) * 32) & 0xFFFFFFFF) << o << "," << i; 42 | } 43 | } 44 | } 45 | 46 | TEST(Small32Unpacker, unpackSmall) { 47 | int entrySize = 14; 48 | int entryCount = 17; 49 | uint32_t data[17] = {2124, 831, 2575, 1977, 15, 4241, 3090, 822, 3252, 4245, 934, 50 | 941, 4244, 3314, 13, 1, 874}; 51 | uint8_t output[64]{0}; 52 | byteutils::bitpack(data, 17, entrySize, output); 53 | 54 | Small32Unpacker unpacker(entrySize); 55 | 56 | 57 | for (int o = 0; o < entryCount; o += 8) { 58 | __m256i unpacked = unpacker.unpack(output + entrySize * (o >> 3)); 59 | 60 | for (int i = 0; i < 8; i++) { 61 | int bitoff = (o + i) * entrySize; 62 | int 
extract = extract_entry((int *) output, bitoff / 32, bitoff % 32, entrySize); 63 | EXPECT_EQ(extract, (unpacked[i / 2] >> (i % 2) * 32) & 0xFFFFFFFF) << o << "," << i; 64 | } 65 | } 66 | } 67 | 68 | 69 | TEST(Large32Unpacker, unpack) { 70 | int entrySize = 30; 71 | int entryCount = 17; 72 | uint32_t data[17] = {82934, 1941331, 224875, 4201277, 304135, 224241, 26, 112192, 99552, 4234532, 73 | 990342, 32342411, 42349022, 42431414, 324231342, 32324414, 32767}; 74 | uint8_t output[64]{0}; 75 | byteutils::bitpack(data, 17, entrySize, output); 76 | 77 | Large32Unpacker unpacker(entrySize); 78 | 79 | 80 | for (int o = 0; o < entryCount; o += 8) { 81 | __m256i unpacked = unpacker.unpack(output + entrySize * (o >> 3)); 82 | 83 | for (int i = 0; i < 8; i++) { 84 | int bitoff = (o + i) * entrySize; 85 | int extract = extract_entry((int *) output, bitoff / 32, bitoff % 32, entrySize); 86 | EXPECT_EQ(extract, (unpacked[i / 2] >> (i % 2) * 32) & 0xFFFFFFFF) << o << "," << i; 87 | } 88 | } 89 | } 90 | 91 | TEST(Unpacker, unpack) { 92 | std::array buffer; 93 | uint32_t size= 10007; 94 | uint8_t *bytebuffer = (uint8_t *) malloc(sizeof(uint8_t) * 4 * size); 95 | uint32_t *outputbuffer = (uint32_t *) malloc(sizeof(uint32_t) * size); 96 | srand(1024); 97 | for (int es = 2; es <= 31; ++es) { 98 | memset(bytebuffer, 0, sizeof(uint8_t) * 4 * size); 99 | uint32_t mask = (1u << es) - 1; 100 | for (uint32_t j = 0; j < size; ++j) { 101 | buffer[j] = mask & static_cast(rand() & 0xFF); 102 | } 103 | sboost::byteutils::bitpack(buffer.data(), size, es, bytebuffer); 104 | 105 | sboost::unpacks[es](bytebuffer, size, outputbuffer); 106 | for (uint32_t j = 0; j < size; ++j) { 107 | EXPECT_EQ(buffer[j], outputbuffer[j]) << es << "," << j; 108 | } 109 | } 110 | 111 | } 112 | 113 | --------------------------------------------------------------------------------