├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bench ├── CMakeLists.txt ├── bp_bench.cpp ├── cuda_bp_bench.cu ├── cuda_vbyte_bench.cu ├── simdbp_bench.cpp ├── streamvbyte_bench.cpp ├── synthetic.hpp └── varintgb_bench.cpp ├── external └── CMakeLists.txt ├── include └── gpu_ic │ ├── cuda_bp.cuh │ ├── cuda_vbyte.cuh │ └── utils │ ├── binary_collection.hpp │ ├── binary_freq_collection.hpp │ ├── bit_istream.hpp │ ├── bit_ostream.hpp │ ├── bit_vector.hpp │ ├── broadword.hpp │ ├── cuda_utils.hpp │ ├── index.cuh │ ├── index.hpp │ ├── intrinsics.hpp │ ├── mappable_vector.hpp │ ├── mapper.hpp │ ├── posting_list.hpp │ ├── progress.hpp │ ├── tables.hpp │ ├── tight_variable_byte.hpp │ └── utils.hpp ├── plot.png ├── src ├── CMakeLists.txt ├── compress_index.cpp ├── cuda_compress_index.cu ├── cuda_perf_decode.cu └── perf_decode.cpp └── test_data ├── queries ├── test_collection.docs └── test_collection.freqs /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AccessModifierOffset: -1 3 | AlignConsecutiveAssignments: true 4 | AlignConsecutiveDeclarations: true 5 | AlignEscapedNewlinesLeft: true 6 | AlignTrailingComments: false 7 | AllowAllParametersOfDeclarationOnNextLine: true 8 | AllowShortIfStatementsOnASingleLine: false 9 | AllowShortLoopsOnASingleLine: false 10 | AlwaysBreakBeforeMultilineStrings: true 11 | AlwaysBreakTemplateDeclarations: true 12 | BinPackArguments: false 13 | BinPackParameters: false 14 | BreakBeforeBinaryOperators: false 15 | BreakBeforeBraces: Attach 16 | BreakConstructorInitializersBeforeComma: false 17 | ColumnLimit: 100 18 | CommentPragmas: '^@atd' 19 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 20 | ConstructorInitializerIndentWidth: 4 21 | ContinuationIndentWidth: 4 22 | Cpp11BracedListStyle: true 23 | DerivePointerAlignment: false 24 | ExperimentalAutoDetectBinPacking: true 25 | IndentCaseLabels: false 26 | IndentFunctionDeclarationAfterType: false 27 | IndentWidth: 4 28 | MaxEmptyLinesToKeep: 1 29 | NamespaceIndentation: None 30 | ObjCSpaceBeforeProtocolList: false 31 | PenaltyBreakBeforeFirstCallParameter: 10 32 | PenaltyBreakComment: 60 33 | PenaltyBreakFirstLessLess: 20 34 | PenaltyBreakString: 1000 35 | PenaltyExcessCharacter: 1000000 36 | PenaltyReturnTypeOnItsOwnLine: 200 37 | PointerAlignment: Right 38 | SpaceAfterControlStatementKeyword: true 39 | SpaceBeforeAssignmentOperators: true 40 | SpaceInEmptyParentheses: false 41 | SpacesBeforeTrailingComments: 1 42 | SpacesInAngles: false 43 | SpacesInCStyleCastParentheses: false 44 | SpacesInParentheses: false 45 | Standard: Cpp11 46 | TabWidth: 8 47 | UseTab: Never -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build/ 35 | 36 | .DS_Store -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/googletest"] 2 | path = 
external/googletest 3 | url = https://github.com/google/googletest.git 4 | [submodule "external/benchmark"] 5 | path = external/benchmark 6 | url = https://github.com/google/benchmark.git 7 | [submodule "external/FastPFor"] 8 | path = external/FastPFor 9 | url = https://github.com/lemire/FastPFor.git 10 | [submodule "external/cub"] 11 | path = external/cub 12 | url = https://github.com/NVlabs/cub.git 13 | [submodule "external/CLI11"] 14 | path = external/CLI11 15 | url = https://github.com/CLIUtils/CLI11.git 16 | [submodule "external/mio"] 17 | path = external/mio 18 | url = https://github.com/mandreyel/mio.git 19 | [submodule "external/boost-cmake"] 20 | path = external/boost-cmake 21 | url = https://github.com/Orphis/boost-cmake.git 22 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(gpu_integers_compression) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(CMAKE_CXX_EXTENSIONS OFF) 7 | option(BUILD_CUDA "Build the CUDA library" ON) 8 | 9 | if (NOT CMAKE_BUILD_TYPE) 10 | message(STATUS "No build type selected, default to Release") 11 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) 12 | endif() 13 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} ) 14 | 15 | 16 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 17 | if (CXX_COMPILER_VERSION VERSION_LESS 4.7) 18 | message(STATUS "GCC version must be at least 4.7!") 19 | endif() 20 | set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -Ofast -lm -DNDEBUG -DHAVE_CXX0X -march=native") 21 | set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -lm -DHAVE_CXX0X -march=native") 22 | elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 23 | if (CXX_COMPILER_VERSION VERSION_LESS 4.2.1) 24 | message(STATUS "Clang version must be at least 4.2.1!" 
) 25 | endif() 26 | set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -DHAVE_CXX0X -msse4.1 -march=native") 27 | set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -DHAVE_CXX0X -msse4.1 -march=native") 28 | else () 29 | message(FATAL_ERROR "Please, use GCC or Clang compiler!") 30 | endif() 31 | 32 | if (USE_SANITIZERS) 33 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 34 | endif () 35 | 36 | set(gtest_disable_pthreads ON) 37 | 38 | 39 | include_directories(include) 40 | add_library(gpu_integers_compression INTERFACE) 41 | target_include_directories(gpu_integers_compression INTERFACE 42 | $ 43 | ) 44 | target_link_libraries(gpu_integers_compression INTERFACE 45 | mio 46 | FastPFor 47 | Boost::boost 48 | ) 49 | 50 | if (BUILD_CUDA) 51 | set(CMAKE_CUDA_STANDARD 11) 52 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 53 | set(CMAKE_CUDA_EXTENSIONS OFF) 54 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") 55 | 56 | find_package(CUDA) 57 | CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS "Auto") 58 | list(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS}) 59 | target_link_libraries(gpu_integers_compression INTERFACE cub) 60 | endif () 61 | MESSAGE( STATUS "BUILD_CUDA: " ${BUILD_CUDA} ) 62 | add_subdirectory(external) 63 | 64 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 65 | 66 | add_subdirectory(src) 67 | 68 | enable_testing() 69 | add_subdirectory(bench) 70 | 71 | 72 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GPU Integers Compression 2 | ========================== 3 | 4 | This code was used in the experiments of the following paper: 5 | 6 | Antonio Mallia, Michał Siedlaczek, Torsten Suel, Mohamed Zahran. **GPU-Accelerated Decoding of Integer Lists**. In The 28th ACM International Conference on Information and Knowledge Management (CIKM). 
2019 7 | 8 | ## Usage 9 | 10 | ### Build 11 | ``` 12 | git clone git@github.com:amallia/gpu-integers-compression.git 13 | cd gpu-integers-compression 14 | mkdir build 15 | cd build 16 | cmake .. 17 | make -j 18 | ``` 19 | 20 | ### External libraries 21 | - Google Test 22 | - Google Benchmark 23 | - NVlabs CUB 24 | - FastPFor 25 | - CLI11 26 | - Boost 27 | - mio 28 | 29 | ### Benchmark 30 | 31 | Benchmark tasks can be found in the `bench` folder. For example, you can run: 32 | ``` 33 | ./bench/cuda_bp_bench 34 | ./bench/cuda_vbyte_bench 35 | ``` 36 | 37 | ## Codecs 38 | 39 | ### GPU Binary-Packing 40 | ```cpp 41 | #include "gpu_ic/cuda_bp.cuh" 42 | 43 | // Values to encode 44 | std::vector<uint32_t> values = {/* ... */}; 45 | 46 | // Encode 47 | std::vector<uint8_t> encoded_values; 48 | encoded_values.resize(values.size() * 8); 49 | auto compressedsize = cuda_bp::encode(encoded_values.data(), values.data(), values.size()); 50 | encoded_values.resize(compressedsize); 51 | encoded_values.shrink_to_fit(); 52 | 53 | // Decode (d_encoded and d_decoded are device pointers of type uint8_t* and uint32_t*) 54 | std::vector<uint32_t> decoded_values; 55 | decoded_values.resize(values.size()); 56 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_encoded, encoded_values.size() * sizeof(uint8_t))); 57 | CUDA_CHECK_ERROR(cudaMemcpy(d_encoded, encoded_values.data(), encoded_values.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); 58 | 59 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_decoded, values.size() * sizeof(uint32_t))); 60 | cuda_bp::decode(d_decoded, d_encoded, decoded_values.size()); 61 | CUDA_CHECK_ERROR(cudaMemcpy(decoded_values.data(), d_decoded, values.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 62 | ``` 63 | 64 | ### GPU Vbyte 65 | 66 | ```cpp 67 | #include "gpu_ic/cuda_vbyte.cuh" 68 | 69 | // Values to encode 70 | std::vector<uint32_t> values = {/* ... */}; 71 | 72 | // Encode 73 | std::vector<uint8_t> encoded_values; 74 | encoded_values.resize(values.size() * 8); 75 | auto compressedsize = cuda_vbyte::encode(encoded_values.data(), values.data(), values.size()); 76 | encoded_values.resize(compressedsize); 77 | encoded_values.shrink_to_fit(); 78 | 79 | // Decode (d_encoded and d_decoded are device pointers of type uint8_t* and uint32_t*) 80 | std::vector<uint32_t> decoded_values; 81 | decoded_values.resize(values.size()); 82 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_encoded, encoded_values.size() * sizeof(uint8_t))); 83 | CUDA_CHECK_ERROR(cudaMemcpy(d_encoded, encoded_values.data(), encoded_values.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); 84 | 85 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_decoded, values.size() * sizeof(uint32_t))); 86 | cuda_vbyte::decode(d_decoded, d_encoded, decoded_values.size()); 87 | CUDA_CHECK_ERROR(cudaMemcpy(decoded_values.data(), d_decoded, values.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 88 | 89 | ``` 90 | 91 | 92 | ## Benchmarks 93 |
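The plot below summarizes the decoding benchmarks. It can be regenerated by running the benchmark binaries and collecting their output; as a sketch (the flags below are standard Google Benchmark options, not project-specific):
```
./bench/cuda_bp_bench    --benchmark_repetitions=3 --benchmark_format=csv > cuda_bp.csv
./bench/cuda_vbyte_bench --benchmark_repetitions=3 --benchmark_format=csv > cuda_vbyte.csv
```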

94 | ![Benchmark results](plot.png) 95 |
96 | 97 | -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB BENCH_SOURCES *_bench.cpp) 2 | foreach(BENCH_SRC ${BENCH_SOURCES}) 3 | get_filename_component (BENCH_SRC_NAME ${BENCH_SRC} NAME_WE) 4 | add_executable(${BENCH_SRC_NAME} ${BENCH_SRC}) 5 | target_link_libraries(${BENCH_SRC_NAME} 6 | benchmark 7 | gpu_integers_compression 8 | gtest 9 | gmock 10 | ) 11 | target_include_directories(${BENCH_SRC_NAME} BEFORE PRIVATE $ 12 | ) 13 | add_test(${BENCH_SRC_NAME} ${BENCH_SRC_NAME}) 14 | endforeach(BENCH_SRC) 15 | 16 | if(BUILD_CUDA) 17 | CUDA_ADD_EXECUTABLE(cuda_bp_bench cuda_bp_bench.cu) 18 | target_link_libraries(cuda_bp_bench 19 | benchmark 20 | gpu_integers_compression 21 | gtest 22 | gmock 23 | ) 24 | CUDA_ADD_EXECUTABLE(cuda_vbyte_bench cuda_vbyte_bench.cu) 25 | target_link_libraries(cuda_vbyte_bench 26 | benchmark 27 | gpu_integers_compression 28 | gtest 29 | gmock 30 | ) 31 | endif() 32 | -------------------------------------------------------------------------------- /bench/bp_bench.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include "benchmark/benchmark.h" 20 | #include "../external/FastPFor/headers/codecfactory.h" 21 | #include "synthetic.hpp" 22 | #include "gpu_ic/utils/utils.hpp" 23 | 24 | template 25 | class ValuesFixture : public ::benchmark::Fixture { 26 | 27 | public: 28 | using ::benchmark::Fixture::SetUp; 29 | using ::benchmark::Fixture::TearDown; 30 | 31 | virtual void SetUp(::benchmark::State& st) { 32 | using namespace FastPForLib; 33 | using namespace gpu_ic; 34 | 35 | IntegerCODEC &codec = *CODECFactory::getFromName("BP32"); 36 | Generator clu(1); 37 | values = clu.generate(st.range(0), 1U << 29); 38 | utils::delta_encode(values.data(), values.size()); 39 | 40 | encoded_values.resize(values.size() * 8); 41 | size_t compressedsize = 0; 42 | codec.encodeArray(values.data(), values.size(), encoded_values.data(), 43 | compressedsize); 44 | encoded_values.resize(compressedsize); 45 | encoded_values.shrink_to_fit(); 46 | 47 | decoded_values.resize(values.size()); 48 | } 49 | 50 | virtual void TearDown(::benchmark::State&) { 51 | ASSERT_EQ(decoded_values.size(), values.size()); 52 | for (size_t i = 0; i < values.size(); ++i) 53 | { 54 | ASSERT_EQ(decoded_values[i], values[i]); 55 | } 56 | values.clear(); 57 | encoded_values.clear(); 58 | decoded_values.clear(); 59 | } 60 | std::vector values; 61 | std::vector encoded_values; 62 | std::vector decoded_values; 63 | }; 64 | 65 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform, gpu_ic::UniformDataGenerator)(benchmark::State& state) { 66 | using namespace FastPForLib; 67 | IntegerCODEC &codec = *CODECFactory::getFromName("BP32"); 68 | 69 | while (state.KeepRunning()) { 70 | size_t recoveredsize = 0; 71 | codec.decodeArray(encoded_values.data(), encoded_values.size(), 72 | decoded_values.data(), recoveredsize); 73 | } 74 | auto bpi = double(32*encoded_values.size())/decoded_values.size(); 75 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 76 | } 77 | BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 78 | 79 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered, gpu_ic::ClusteredDataGenerator)(benchmark::State& state) { 80 | using namespace FastPForLib; 81 | IntegerCODEC &codec = *CODECFactory::getFromName("BP32"); 82 | 83 | while (state.KeepRunning()) { 84 | size_t recoveredsize = 0; 85 | codec.decodeArray(encoded_values.data(), encoded_values.size(), 86 | decoded_values.data(), recoveredsize); 87 | } 88 | auto bpi = double(32*encoded_values.size())/decoded_values.size(); 89 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 90 | } 91 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 92 | 93 | BENCHMARK_MAIN(); 94 | 95 | -------------------------------------------------------------------------------- /bench/cuda_bp_bench.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "gmock/gmock.h" 19 | #include "gtest/gtest.h" 20 | #include "benchmark/benchmark.h" 21 | 22 | 23 | #include "synthetic.hpp" 24 | #include "gpu_ic/cuda_bp.cuh" 25 | #include "gpu_ic/utils/utils.hpp" 26 | #include "gpu_ic/utils/cuda_utils.hpp" 27 | 28 | template 29 | class ValuesFixture : public ::benchmark::Fixture { 30 | 31 | public: 32 | using ::benchmark::Fixture::SetUp; 33 | using ::benchmark::Fixture::TearDown; 34 | 35 | virtual void SetUp(::benchmark::State& st) { 36 | using namespace gpu_ic; 37 | 38 | Generator clu(1); 39 | values = clu.generate(st.range(0), 1U << 29); 40 | utils::delta_encode(values.data(), values.size()); 41 | 42 | encoded_values.resize(values.size() * 8); 43 | auto compressedsize = cuda_bp::encode(encoded_values.data(), values.data(), values.size()); 44 | encoded_values.resize(compressedsize); 45 | encoded_values.shrink_to_fit(); 46 | 47 | decoded_values.resize(values.size()); 48 | CUDA_CHECK_ERROR(cudaSetDevice(0)); 49 | warmUpGPU<<<1, 1>>>(); 50 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_encoded, encoded_values.size() * sizeof(uint8_t))); 51 | CUDA_CHECK_ERROR(cudaMemcpy(d_encoded, encoded_values.data(), encoded_values.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); 52 | 53 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_decoded, values.size() * sizeof(uint32_t))); 54 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 55 | } 56 | 57 | virtual void TearDown(::benchmark::State&) { 58 | CUDA_CHECK_ERROR(cudaMemcpy(decoded_values.data(), d_decoded, values.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 59 | 60 | ASSERT_EQ(decoded_values.size(), values.size()); 61 | for (size_t i = 0; i < values.size(); ++i) 62 | { 63 | ASSERT_EQ(decoded_values[i], values[i]); 64 | } 65 | cudaFree(d_encoded); 66 | cudaFree(d_decoded); 67 | values.clear(); 68 | encoded_values.clear(); 69 | decoded_values.clear(); 70 | } 71 | std::vector values; 72 | std::vector encoded_values; 73 | std::vector decoded_values; 74 | uint8_t * d_encoded; 75 | uint32_t * d_decoded; 76 | }; 77 | 78 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform128, gpu_ic::UniformDataGenerator, 128)(benchmark::State& state) { 79 | while (state.KeepRunning()) { 80 | cuda_bp::decode<128>(d_decoded, d_encoded, decoded_values.size()); 81 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 82 | } 83 | auto bpi = double(8*encoded_values.size())/decoded_values.size(); 84 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 85 | } 86 | BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform128)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 87 | 88 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform256, gpu_ic::UniformDataGenerator, 256)(benchmark::State& state) { 89 | while (state.KeepRunning()) { 90 | cuda_bp::decode<256>(d_decoded, d_encoded, decoded_values.size()); 91 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 92 | } 93 | auto bpi = double(8*encoded_values.size())/decoded_values.size(); 94 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 95 | } 96 | 
BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform256)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 97 | 98 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered128, gpu_ic::ClusteredDataGenerator, 128)(benchmark::State& state) { 99 | while (state.KeepRunning()) { 100 | cuda_bp::decode<128>(d_decoded, d_encoded, decoded_values.size()); 101 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 102 | } 103 | auto bpi = double(8*encoded_values.size())/decoded_values.size(); 104 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 105 | } 106 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered128)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 107 | 108 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered256, gpu_ic::ClusteredDataGenerator, 256)(benchmark::State& state) { 109 | while (state.KeepRunning()) { 110 | cuda_bp::decode<256>(d_decoded, d_encoded, decoded_values.size()); 111 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 112 | } 113 | auto bpi = double(8*encoded_values.size())/decoded_values.size(); 114 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 115 | } 116 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered256)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 117 | 118 | BENCHMARK_MAIN(); 119 | -------------------------------------------------------------------------------- /bench/cuda_vbyte_bench.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "gmock/gmock.h" 19 | #include "gtest/gtest.h" 20 | #include "benchmark/benchmark.h" 21 | 22 | #include "gpu_ic/cuda_vbyte.cuh" 23 | #include "gpu_ic/utils/utils.hpp" 24 | #include "gpu_ic/utils/cuda_utils.hpp" 25 | 26 | #include "synthetic.hpp" 27 | 28 | template 29 | class ValuesFixture : public ::benchmark::Fixture { 30 | 31 | public: 32 | using ::benchmark::Fixture::SetUp; 33 | using ::benchmark::Fixture::TearDown; 34 | 35 | virtual void SetUp(::benchmark::State& st) { 36 | using namespace gpu_ic; 37 | 38 | Generator clu(1); 39 | values = clu.generate(st.range(0), 1U << 29); 40 | utils::delta_encode(values.data(), values.size()); 41 | 42 | encoded_values.resize(values.size() * 8); 43 | auto compressedsize = cuda_vbyte::encode(encoded_values.data(), values.data(), values.size()); 44 | encoded_values.resize(compressedsize); 45 | encoded_values.shrink_to_fit(); 46 | 47 | decoded_values.resize(values.size()); 48 | CUDA_CHECK_ERROR(cudaSetDevice(0)); 49 | warmUpGPU<<<1, 1>>>(); 50 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_encoded, encoded_values.size() * sizeof(uint8_t))); 51 | CUDA_CHECK_ERROR(cudaMemcpy(d_encoded, encoded_values.data(), encoded_values.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); 52 | 53 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_decoded, values.size() * sizeof(uint32_t))); 54 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 55 | } 56 | 57 | virtual void TearDown(::benchmark::State&) { 58 | CUDA_CHECK_ERROR(cudaMemcpy(decoded_values.data(), d_decoded, values.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 59 | 60 | ASSERT_EQ(decoded_values.size(), values.size()); 61 | for (size_t i = 0; i < values.size(); ++i) 62 | { 63 | ASSERT_EQ(decoded_values[i], values[i]); 64 | } 65 | 66 | cudaFree(d_encoded); 67 | cudaFree(d_decoded); 68 | values.clear(); 69 | encoded_values.clear(); 70 | decoded_values.clear(); 71 | } 72 | std::vector values; 73 | std::vector encoded_values; 74 | std::vector decoded_values; 75 | uint8_t * d_encoded; 76 | uint32_t * d_decoded; 77 | }; 78 | 79 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform128, gpu_ic::UniformDataGenerator, 128)(benchmark::State& state) { 80 | while (state.KeepRunning()) { 81 | cuda_vbyte::decode<128>(d_decoded, d_encoded, decoded_values.size()); 82 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 83 | } 84 | auto bpi = double(8*encoded_values.size())/decoded_values.size(); 85 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 86 | } 87 | BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform128)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 88 | 89 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform1024, gpu_ic::UniformDataGenerator, 1024)(benchmark::State& state) { 90 | while (state.KeepRunning()) { 91 | cuda_vbyte::decode<1024>(d_decoded, d_encoded, decoded_values.size()); 92 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 93 | } 94 | auto bpi = double(8*encoded_values.size())/decoded_values.size(); 95 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 96 | } 97 | BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform1024)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 98 | 99 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered128, gpu_ic::ClusteredDataGenerator, 128)(benchmark::State& state) { 100 | while (state.KeepRunning()) { 101 | cuda_vbyte::decode<128>(d_decoded, d_encoded, decoded_values.size()); 102 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 103 | } 104 | auto bpi = 
double(8*encoded_values.size())/decoded_values.size(); 105 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 106 | } 107 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered128)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 108 | 109 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered1024, gpu_ic::ClusteredDataGenerator, 1024)(benchmark::State& state) { 110 | while (state.KeepRunning()) { 111 | cuda_vbyte::decode<1024>(d_decoded, d_encoded, decoded_values.size()); 112 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 113 | } 114 | auto bpi = double(8*encoded_values.size())/decoded_values.size(); 115 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 116 | } 117 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered1024)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 118 | 119 | BENCHMARK_MAIN(); 120 | -------------------------------------------------------------------------------- /bench/simdbp_bench.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include "benchmark/benchmark.h" 20 | #include "../external/FastPFor/headers/codecfactory.h" 21 | #include "synthetic.hpp" 22 | #include "gpu_ic/utils/utils.hpp" 23 | 24 | template 25 | class ValuesFixture : public ::benchmark::Fixture { 26 | 27 | public: 28 | using ::benchmark::Fixture::SetUp; 29 | using ::benchmark::Fixture::TearDown; 30 | 31 | virtual void SetUp(::benchmark::State& st) { 32 | using namespace FastPForLib; 33 | using namespace gpu_ic; 34 | 35 | IntegerCODEC &codec = *CODECFactory::getFromName("simdbinarypacking"); 36 | 37 | Generator clu(1); 38 | auto tmp = clu.generate(st.range(0), 1U << 29); 39 | values = std::vector(tmp.begin(), tmp.end()); 40 | utils::delta_encode(values.data(), values.size()); 41 | 42 | encoded_values.resize(values.size() * 8); 43 | size_t compressedsize = 0; 44 | codec.encodeArray(values.data(), values.size(), encoded_values.data(), 45 | compressedsize); 46 | encoded_values.resize(compressedsize); 47 | encoded_values.shrink_to_fit(); 48 | 49 | decoded_values.resize(values.size()); 50 | } 51 | 52 | virtual void TearDown(::benchmark::State&) { 53 | ASSERT_EQ(decoded_values.size(), values.size()); 54 | for (size_t i = 0; i < values.size(); ++i) 55 | { 56 | ASSERT_EQ(decoded_values[i], values[i]); 57 | } 58 | values.clear(); 59 | encoded_values.clear(); 60 | decoded_values.clear(); 61 | } 62 | std::vector values; 63 | std::vector encoded_values; 64 | std::vector decoded_values; 65 | }; 66 | 67 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform, gpu_ic::UniformDataGenerator)(benchmark::State& state) { 68 | using namespace FastPForLib; 69 | IntegerCODEC &codec = *CODECFactory::getFromName("simdbinarypacking"); 70 | 71 | while (state.KeepRunning()) { 72 | size_t recoveredsize = 0; 73 | 
codec.decodeArray(encoded_values.data(), encoded_values.size(), 74 | decoded_values.data(), recoveredsize); 75 | } 76 | auto bpi = double(32*encoded_values.size())/decoded_values.size(); 77 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 78 | 79 | } 80 | BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 81 | 82 | 83 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered, gpu_ic::ClusteredDataGenerator)(benchmark::State& state) { 84 | using namespace FastPForLib; 85 | IntegerCODEC &codec = *CODECFactory::getFromName("simdbinarypacking"); 86 | 87 | while (state.KeepRunning()) { 88 | size_t recoveredsize = 0; 89 | codec.decodeArray(encoded_values.data(), encoded_values.size(), 90 | decoded_values.data(), recoveredsize); 91 | } 92 | auto bpi = double(32*encoded_values.size())/decoded_values.size(); 93 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 94 | 95 | } 96 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 97 | 98 | BENCHMARK_MAIN(); 99 | 100 | -------------------------------------------------------------------------------- /bench/streamvbyte_bench.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include "benchmark/benchmark.h" 20 | #include "../external/FastPFor/headers/codecfactory.h" 21 | #include "synthetic.hpp" 22 | #include "gpu_ic/utils/utils.hpp" 23 | 24 | template 25 | class ValuesFixture : public ::benchmark::Fixture { 26 | 27 | public: 28 | using ::benchmark::Fixture::SetUp; 29 | using ::benchmark::Fixture::TearDown; 30 | 31 | virtual void SetUp(::benchmark::State& st) { 32 | using namespace FastPForLib; 33 | IntegerCODEC &codec = *CODECFactory::getFromName("streamvbyte"); 34 | 35 | Generator clu(1); 36 | values = clu.generate(st.range(0), 1U << 29); 37 | utils::delta_encode(values.data(), values.size()); 38 | 39 | encoded_values.resize(values.size() * 8); 40 | size_t compressedsize = 0; 41 | codec.encodeArray(values.data(), values.size(), encoded_values.data(), 42 | compressedsize); 43 | encoded_values.resize(compressedsize); 44 | encoded_values.shrink_to_fit(); 45 | 46 | decoded_values.resize(values.size()); 47 | } 48 | 49 | virtual void TearDown(::benchmark::State&) { 50 | ASSERT_EQ(decoded_values.size(), values.size()); 51 | for (size_t i = 0; i < values.size(); ++i) 52 | { 53 | ASSERT_EQ(decoded_values[i], values[i]); 54 | } 55 | values.clear(); 56 | encoded_values.clear(); 57 | decoded_values.clear(); 58 | } 59 | std::vector values; 60 | std::vector encoded_values; 61 | std::vector decoded_values; 62 | }; 63 | 64 | 65 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform, gpu_ic::UniformDataGenerator)(benchmark::State& state) { 66 | using namespace FastPForLib; 67 | IntegerCODEC &codec = *CODECFactory::getFromName("streamvbyte"); 68 | 69 | while (state.KeepRunning()) { 70 | size_t recoveredsize = 0; 71 | codec.decodeArray(encoded_values.data(), encoded_values.size(), 72 | decoded_values.data(), recoveredsize); 73 | } 74 | auto bpi = double(32*encoded_values.size())/decoded_values.size(); 75 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 76 | 77 | } 78 | BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 79 | 80 | 81 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered, gpu_ic::ClusteredDataGenerator)(benchmark::State& state) { 82 | using namespace FastPForLib; 83 | IntegerCODEC &codec = *CODECFactory::getFromName("streamvbyte"); 84 | 85 | while (state.KeepRunning()) { 86 | size_t recoveredsize = 0; 87 | codec.decodeArray(encoded_values.data(), encoded_values.size(), 88 | decoded_values.data(), recoveredsize); 89 | } 90 | auto bpi = double(32*encoded_values.size())/decoded_values.size(); 91 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 92 | 93 | } 94 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered)->RangeMultiplier(2)->Range((1ULL << 15), (1ULL<<25)); 95 | 96 | BENCHMARK_MAIN(); 97 | 98 | -------------------------------------------------------------------------------- /bench/synthetic.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | namespace gpu_ic { 24 | 25 | class UniformDataGenerator { 26 | public: 27 | UniformDataGenerator(uint32_t seed = std::random_device{}()) : rand(seed) {} 28 | 29 | std::vector generate(uint32_t N, uint32_t Max) { 30 | if (Max < N) 31 | throw std::runtime_error("can't generate enough distinct elements in small interval"); 32 | 33 | std::uniform_int_distribution dis(1, Max - 1); 34 | std::vector ans; 35 | if (N == 0) 36 | return ans; // nothing to do 37 | ans.reserve(N); 38 | assert(Max >= 1); 39 | 40 | if (2 * N > Max) { 41 | std::set s; 42 | while (s.size() < Max - N) 43 | s.insert(dis(rand)); 44 | s.insert(Max); 45 | ans.resize(N); 46 | uint32_t i = 0; 47 | size_t c = 0; 48 | for (uint32_t v : s) { 49 | for (; i < v; ++i) 50 | ans[c++] = i; 51 | ++i; 52 | } 53 | assert(c == ans.size()); 54 | } else { 55 | std::set s; 56 | while (s.size() < N) 57 | s.insert(dis(rand)); 58 | ans.assign(s.begin(), s.end()); 59 | assert(N == ans.size()); 60 | } 61 | return ans; 62 | } 63 | std::mt19937 rand; 64 | }; 65 | 66 | class ClusteredDataGenerator { 67 | public: 68 | UniformDataGenerator unidg; 69 | ClusteredDataGenerator(uint32_t seed = std::random_device{}()) : unidg(seed) {} 70 | 71 | template 72 | void fillUniform(iterator begin, iterator end, uint32_t Min, uint32_t Max) { 73 | std::vector v = unidg.generate(static_cast(end - begin), Max - Min); 74 | for (size_t k = 0; k < v.size(); ++k) 75 | *(begin + k) = Min + v[k]; 76 | } 77 | template 78 | void fillClustered(iterator begin, iterator end, uint32_t Min, uint32_t Max) { 79 | const uint32_t N = static_cast(end - begin); 80 | const uint32_t range = Max - Min; 81 | if (range < N) 82 | throw std::runtime_error("can't generate that many in small interval."); 83 | assert(range >= N); 84 | if ((range == N) || (N < 10)) { 85 | fillUniform(begin, end, Min, Max); 86 | return; 87 | } 88 | std::uniform_int_distribution dis(1, range - N); 89 | const uint32_t cut = N / 2 + dis(unidg.rand); 90 | assert(cut >= N / 2); 91 | assert(Max - Min - cut >= N - N / 2); 92 | 93 | std::uniform_real_distribution urd_dis; 94 | const double p = urd_dis(unidg.rand); 95 | assert(p <= 1); 96 | assert(p >= 0); 97 | if (p <= 0.25) { 98 | fillUniform(begin, begin + N / 2, Min, Min + cut); 99 | fillClustered(begin + N / 2, end, Min + cut, Max); 100 | } else if (p <= 0.5) { 101 | fillClustered(begin, begin + N / 2, Min, Min + cut); 102 | fillUniform(begin + N / 2, end, Min + cut, Max); 103 | } else { 104 | fillClustered(begin, begin + N / 2, Min, Min + cut); 105 | fillClustered(begin + N / 2, end, Min + cut, Max); 106 | } 107 | } 108 | 109 | std::vector generate(uint32_t N, uint32_t Max) { 110 | std::vector ans(N); 111 | fillClustered(ans.begin(), ans.end(), 0, Max); 112 | return ans; 113 | } 114 | }; 115 | 116 | } // namespace gpu_ic 117 | -------------------------------------------------------------------------------- /bench/varintgb_bench.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 
2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "../external/FastPFor/headers/codecfactory.h" 18 | #include "benchmark/benchmark.h" 19 | #include "gpu_ic/utils/utils.hpp" 20 | #include 21 | #include 22 | 23 | #include "synthetic.hpp" 24 | 25 | template 26 | class ValuesFixture : public ::benchmark::Fixture { 27 | 28 | public: 29 | using ::benchmark::Fixture::SetUp; 30 | using ::benchmark::Fixture::TearDown; 31 | 32 | virtual void SetUp(::benchmark::State &st) { 33 | using namespace FastPForLib; 34 | using namespace gpu_ic; 35 | 36 | IntegerCODEC &codec = *CODECFactory::getFromName("varintgb"); 37 | 38 | Generator clu(1); 39 | values = clu.generate(st.range(0), 1U << 29); 40 | utils::delta_encode(values.data(), values.size()); 41 | 42 | encoded_values.resize(values.size() * 8); 43 | size_t compressedsize = 0; 44 | codec.encodeArray(values.data(), values.size(), encoded_values.data(), compressedsize); 45 | encoded_values.resize(compressedsize); 46 | encoded_values.shrink_to_fit(); 47 | 48 | decoded_values.resize(values.size()); 49 | } 50 | 51 | virtual void TearDown(::benchmark::State &) { 52 | ASSERT_EQ(decoded_values.size(), values.size()); 53 | for (size_t i = 0; i < values.size(); ++i) { 54 | ASSERT_EQ(decoded_values[i], values[i]); 55 | } 56 | values.clear(); 57 | encoded_values.clear(); 58 | decoded_values.clear(); 59 | } 60 | std::vector values; 61 | std::vector encoded_values; 62 | std::vector decoded_values; 63 | }; 64 | 65 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeUniform, gpu_ic::UniformDataGenerator) 66 | (benchmark::State &state) { 67 | using namespace FastPForLib; 68 | IntegerCODEC &codec = *CODECFactory::getFromName("varintgb"); 69 | 70 | while (state.KeepRunning()) { 71 | size_t recoveredsize = 0; 72 | codec.decodeArray( 73 | encoded_values.data(), encoded_values.size(), decoded_values.data(), recoveredsize); 74 | } 75 | auto bpi = double(32 * encoded_values.size()) / decoded_values.size(); 76 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 77 | } 78 | BENCHMARK_REGISTER_F(ValuesFixture, decodeUniform) 79 | ->RangeMultiplier(2) 80 | ->Range((1ULL << 15), (1ULL << 25)); 81 | 82 | BENCHMARK_TEMPLATE_DEFINE_F(ValuesFixture, decodeClustered, gpu_ic::ClusteredDataGenerator) 83 | (benchmark::State &state) { 84 | using namespace FastPForLib; 85 | IntegerCODEC &codec = *CODECFactory::getFromName("varintgb"); 86 | 87 | while (state.KeepRunning()) { 88 | size_t recoveredsize = 0; 89 | codec.decodeArray( 90 | encoded_values.data(), encoded_values.size(), decoded_values.data(), recoveredsize); 91 | } 92 | auto bpi = double(32 * encoded_values.size()) / decoded_values.size(); 93 | state.counters["bpi"] = benchmark::Counter(bpi, benchmark::Counter::kAvgThreads); 94 | } 95 | BENCHMARK_REGISTER_F(ValuesFixture, decodeClustered) 96 | ->RangeMultiplier(2) 97 | ->Range((1ULL << 15), (1ULL << 25)); 98 | 99 | BENCHMARK_MAIN(); 100 | 
-------------------------------------------------------------------------------- /external/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | EXECUTE_PROCESS(COMMAND git submodule update --init 2 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. 3 | OUTPUT_QUIET 4 | ) 5 | 6 | # Add FastPFor 7 | add_subdirectory(FastPFor EXCLUDE_FROM_ALL) 8 | 9 | add_subdirectory(googletest EXCLUDE_FROM_ALL) 10 | 11 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Suppressing benchmark's tests" FORCE) 12 | add_subdirectory(benchmark EXCLUDE_FROM_ALL) 13 | 14 | # Add CLI11 15 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/CLI11 EXCLUDE_FROM_ALL) 16 | 17 | # Add mio 18 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mio EXCLUDE_FROM_ALL) 19 | 20 | # Add Boost (via boost-cmake) 21 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/boost-cmake EXCLUDE_FROM_ALL) 22 | 23 | if (BUILD_CUDA) 24 | add_library(cub INTERFACE) 25 | target_include_directories(cub INTERFACE 26 | $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cub> 27 | ) 28 | endif () 29 | -------------------------------------------------------------------------------- /include/gpu_ic/cuda_bp.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "utils/bit_istream.hpp" 25 | #include "utils/bit_ostream.hpp" 26 | #include "utils/cuda_utils.hpp" 27 | #include "utils/utils.hpp" 28 | 29 | namespace cuda_bp { 30 | 31 | template 32 | static size_t encode(uint8_t *out, const uint32_t *in, size_t n) { 33 | bit_ostream bw(out); 34 | 35 | auto blocks = std::ceil((double)n / block_size); 36 | std::vector bits(blocks, 0); 37 | for (size_t i = 0; i < n; ++i) { 38 | auto value = in[i]; 39 | size_t bit = utils::bits(value); 40 | auto b = i / block_size; 41 | bits[b] = std::max(bit, bits[b]); 42 | } 43 | bw.write(0, 32); 44 | uint32_t offset = 0; 45 | for (auto b : bits) { 46 | offset += b * block_size/32; 47 | bw.write(offset, 32); 48 | } 49 | for (size_t i = 0; i < n; ++i) { 50 | auto value = in[i]; 51 | auto b = i / block_size; 52 | bw.write(value, bits[b]); 53 | } 54 | return ceil((double)bw.size() / 8); 55 | } 56 | 57 | template 58 | __global__ void kernel_decode(uint32_t * out, 59 | const uint32_t *in, 60 | size_t n, 61 | const uint32_t *offsets) { 62 | size_t index = blockIdx.x * blockDim.x + threadIdx.x; 63 | if (index < n) { 64 | uint8_t bit_size = (offsets[blockIdx.x + 1] - offsets[blockIdx.x])*32/block_size; 65 | uint32_t offset = offsets[blockIdx.x]; 66 | out[index] = extract(in + offset, threadIdx.x * bit_size, bit_size); 67 | } 68 | } 69 | 70 | template 71 | static void decode(uint32_t *d_out, const uint8_t *d_in, size_t n) { 72 | size_t header_len = 4 * (ceil((double)n / block_size) + 1); 73 | const uint8_t *d_payload = d_in + header_len; 74 | kernel_decode<<>>(d_out, 75 | reinterpret_cast(d_payload), 76 | n, 77 | reinterpret_cast(d_in)); 78 | } 79 | 80 | } // namespace cuda_bp 81 | -------------------------------------------------------------------------------- /include/gpu_ic/cuda_vbyte.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include "cub/cub.cuh" 23 | 24 | #include "utils/bit_ostream.hpp" 25 | #include "utils/cuda_utils.hpp" 26 | 27 | namespace cuda_vbyte { 28 | 29 | template 30 | static size_t encode(uint8_t *out, const uint32_t *in, size_t n) { 31 | 32 | bit_ostream bw_offset(out); 33 | 34 | size_t block_num = ceil((double)n / block_size); 35 | size_t offset_len = 4 * block_num + 4; 36 | size_t size = 0; 37 | 38 | bw_offset.write(0, 32); 39 | size_t i; 40 | for (i = 0; i + block_size < n; i += block_size) { 41 | bit_ostream bw_block(out + offset_len); 42 | for (int j = i; j < i + block_size and j < n; ++j) { 43 | const auto value = in[j]; 44 | if (value < (1U << 8)) { 45 | bw_block.write(0, 2); 46 | } else if (value < (1U << 16)) { 47 | bw_block.write(1, 2); 48 | } else if (value < (1U << 24)) { 49 | bw_block.write(2, 2); 50 | } else { 51 | bw_block.write(3, 2); 52 | } 53 | } 54 | for (int j = i; j < i + block_size and j < n; ++j) { 55 | const auto value = in[j]; 56 | if (value < (1U << 8)) { 57 | bw_block.write(value, 8); 58 | } else if (value < (1U << 16)) { 59 | bw_block.write(value, 16); 60 | } else if (value < (1U << 24)) { 61 | bw_block.write(value, 24); 62 | } else { 63 | bw_block.write(value, 32); 64 | } 65 | } 66 | auto padding = 32 - (bw_block.size() % 32); 67 | bw_block.write(0, padding); 68 | size += ceil((double)bw_block.size() / 8); 69 | bw_offset.write(size, 32); 70 | offset_len += ceil((double)(bw_block.size()) / 8); 71 | } 72 | bit_ostream bw_block(out + offset_len); 73 | auto s = i; 74 | size_t bit = 0; 75 | while(s 95 | __global__ void kernel_decode_vbyte(uint32_t * out, 96 | const uint32_t *in, 97 | size_t n, 98 | const uint32_t *offsets) { 99 | 100 | size_t index = blockIdx.x * blockDim.x + threadIdx.x; 101 | uint32_t offset = offsets[blockIdx.x] / 4; 102 | if ((blockIdx.x +1) * block_size < n) { 103 | __shared__ uint32_t min_offsets[block_size + 1]; 104 | min_offsets[0] = 0; 105 | min_offsets[threadIdx.x + 1] = (extract(in + offset, threadIdx.x * 2, 2) + 1) * 8; 106 | __syncthreads(); 107 | 108 | typedef cub::BlockScan BlockScan; 109 | __shared__ typename BlockScan::TempStorage temp_storage; 110 | BlockScan(temp_storage) 111 | .InclusiveSum(min_offsets[threadIdx.x + 1], min_offsets[threadIdx.x + 1]); 112 | __syncthreads(); 113 | uint32_t bit = min_offsets[threadIdx.x + 1] - min_offsets[threadIdx.x]; 114 | uint32_t header_len = 2 * (block_size/32); 115 | out[index] = extract(in + offset + header_len, min_offsets[threadIdx.x], bit); 116 | } else if(index < n){ 117 | uint8_t bit_size = *(in + offset); 118 | out[index] = extract(in + offset+1, threadIdx.x * bit_size, bit_size); 119 | } 120 | } 121 | template 122 | static void decode(uint32_t *d_out, const uint8_t *d_in, size_t n) { 123 | size_t block_num = ceil((double)n / block_size); 124 | size_t offset_len = 4 * block_num + 4; 125 | const uint8_t *d_payload = d_in + offset_len; 126 | kernel_decode_vbyte<<>>( 127 | d_out, 128 | reinterpret_cast(d_payload), 129 | n, 130 | reinterpret_cast(d_in)); 131 | } 132 | 133 | } // namespace cuda_vbyte 134 | -------------------------------------------------------------------------------- /include/gpu_ic/utils/binary_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "mio/mmap.hpp" 11 | 12 | 13 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 14 
| #include 15 | #endif 16 | 17 | namespace gpu_ic { 18 | 19 | template 20 | class base_binary_collection { 21 | public: 22 | using posting_type = uint32_t; 23 | using pointer = typename std::conditional::value, 24 | posting_type const, 25 | posting_type>::type *; 26 | 27 | base_binary_collection(const char *filename) { 28 | std::error_code error; 29 | m_file.map(filename, error); 30 | if ( error ) { 31 | std::cerr << "error mapping file: " << error.message() << ", exiting..." << std::endl; 32 | throw std::runtime_error("Error opening file"); 33 | } 34 | m_data = reinterpret_cast(m_file.data()); 35 | m_data_size = m_file.size() / sizeof(m_data[0]); 36 | 37 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 38 | // Indicates that the application expects to access this address range in a sequential manner 39 | auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL); 40 | if (ret) std::cerr << "Error calling madvice: " << errno << std::endl; 41 | #endif 42 | } 43 | 44 | class sequence { 45 | public: 46 | sequence(pointer begin, pointer end) : m_begin(begin), m_end(end) {} 47 | sequence() : m_begin(nullptr), m_end(nullptr) {} 48 | 49 | pointer begin() const { return m_begin; } 50 | pointer end() const { return m_end; } 51 | size_t size() const { return m_end - m_begin; } 52 | 53 | posting_type back() const 54 | { 55 | assert(size()); 56 | return *(m_end - 1); 57 | } 58 | 59 | private: 60 | pointer m_begin; 61 | pointer m_end; 62 | }; 63 | 64 | using const_sequence = sequence; 65 | 66 | template 67 | class base_iterator; 68 | 69 | using const_iterator = base_iterator; 70 | using iterator = typename std::conditional::value, 71 | const_iterator, 72 | base_iterator>::type; 73 | 74 | iterator begin() { return iterator(this, 0); } 75 | iterator end() { return iterator(this, m_data_size); } 76 | const_iterator begin() const { return const_iterator(this, 0); } 77 | const_iterator end() const { return const_iterator(this, m_data_size); } 78 | const_iterator cbegin() const { return const_iterator(this, 0); } 79 | const_iterator cend() const { return const_iterator(this, m_data_size); } 80 | 81 | template 82 | class base_iterator : public std::iterator { 83 | public: 84 | base_iterator() : m_collection(nullptr) {} 85 | 86 | S const &operator*() const { return m_cur_seq; } 87 | 88 | S const *operator-> () const { return &m_cur_seq; } 89 | 90 | base_iterator &operator++() { 91 | m_pos = m_next_pos; 92 | read(); 93 | return *this; 94 | } 95 | 96 | bool operator==(base_iterator const &other) const { 97 | assert(m_collection == other.m_collection); 98 | return m_pos == other.m_pos; 99 | } 100 | 101 | bool operator!=(base_iterator const &other) const { return !(*this == other); } 102 | 103 | private: 104 | friend class base_binary_collection; 105 | 106 | base_iterator(base_binary_collection const *coll, size_t pos) 107 | : m_collection(coll), m_pos(pos) { 108 | read(); 109 | } 110 | 111 | void read() 112 | { 113 | assert(m_pos <= m_collection->m_data_size); 114 | if (m_pos == m_collection->m_data_size) return; 115 | 116 | size_t n = 0; 117 | size_t pos = m_pos; 118 | n = m_collection->m_data[pos++]; 119 | // file might be truncated 120 | n = std::min(n, size_t(m_collection->m_data_size - pos)); 121 | auto begin = &m_collection->m_data[pos]; 122 | 123 | m_next_pos = pos + n; 124 | m_cur_seq = S(begin, begin + n); 125 | } 126 | 127 | base_binary_collection const * m_collection; 128 | size_t m_pos, m_next_pos; 129 | S m_cur_seq; 130 | }; 131 | 132 | private: 133 | Source 
m_file; 134 | pointer m_data; 135 | size_t m_data_size; 136 | }; 137 | 138 | using binary_collection = base_binary_collection<>; 139 | using writable_binary_collection = base_binary_collection; 140 | } 141 | -------------------------------------------------------------------------------- /include/gpu_ic/utils/binary_freq_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "binary_collection.hpp" 8 | 9 | namespace gpu_ic { 10 | 11 | class binary_freq_collection { 12 | public: 13 | 14 | binary_freq_collection(const char* basename) 15 | : m_docs((std::string(basename) + ".docs").c_str()) 16 | , m_freqs((std::string(basename) + ".freqs").c_str()) 17 | { 18 | auto firstseq = *m_docs.begin(); 19 | if (firstseq.size() != 1) { 20 | throw std::invalid_argument("First sequence should only contain number of documents"); 21 | } 22 | m_num_docs = *firstseq.begin(); 23 | } 24 | 25 | class iterator; 26 | 27 | iterator begin() const 28 | { 29 | auto docs_it = m_docs.begin(); 30 | return iterator(++docs_it, m_freqs.begin()); 31 | } 32 | 33 | iterator end() const 34 | { 35 | return iterator(m_docs.end(), m_freqs.end()); 36 | } 37 | 38 | size_t size() const 39 | { 40 | return std::distance(begin(), end()); 41 | } 42 | 43 | uint64_t num_docs() const 44 | { 45 | return m_num_docs; 46 | } 47 | 48 | struct sequence { 49 | binary_collection::const_sequence docs; 50 | binary_collection::const_sequence freqs; 51 | }; 52 | 53 | class iterator : public std::iterator { 55 | public: 56 | iterator() 57 | {} 58 | 59 | value_type const& operator*() const 60 | { 61 | return m_cur_seq; 62 | } 63 | 64 | value_type const* operator->() const 65 | { 66 | return &m_cur_seq; 67 | } 68 | 69 | iterator& operator++() 70 | { 71 | m_cur_seq.docs = *++m_docs_it; 72 | m_cur_seq.freqs = *++m_freqs_it; 73 | return *this; 74 | } 75 | 76 | bool operator==(iterator const& other) const 77 | { 78 | return m_docs_it == other.m_docs_it; 79 | } 80 | 81 | bool operator!=(iterator const& other) const 82 | { 83 | return !(*this == other); 84 | } 85 | 86 | private: 87 | friend class binary_freq_collection; 88 | 89 | iterator(binary_collection::const_iterator docs_it, 90 | binary_collection::const_iterator freqs_it) 91 | : m_docs_it(docs_it), m_freqs_it(freqs_it) { 92 | m_cur_seq.docs = *m_docs_it; 93 | m_cur_seq.freqs = *m_freqs_it; 94 | } 95 | 96 | binary_collection::const_iterator m_docs_it; 97 | binary_collection::const_iterator m_freqs_it; 98 | sequence m_cur_seq; 99 | }; 100 | 101 | private: 102 | binary_collection m_docs; 103 | binary_collection m_freqs; 104 | uint64_t m_num_docs; 105 | }; 106 | } 107 | -------------------------------------------------------------------------------- /include/gpu_ic/utils/bit_istream.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | class bit_istream { 20 | public: 21 | bit_istream(uint8_t const *in) 22 | : m_in(reinterpret_cast(in)), m_avail(0), m_buf(0), m_pos(0) {} 23 | 24 | size_t position() const { return m_pos; } 25 | 26 | uint32_t read(uint32_t len) { 27 | if (!len) 28 | return 0; 29 | 30 | if (m_avail < len) { 31 | m_buf |= uint64_t(*m_in++) << m_avail; 32 | m_avail += 32; 33 | } 34 | uint32_t val = m_buf & ((uint64_t(1) << len) - 1); 35 | m_buf >>= len; 36 | m_avail -= len; 37 | m_pos += len; 38 | 39 | return val; 40 | } 41 | 42 | inline uint8_t read_bit() { return read(1); } 43 | 44 | inline uint32_t read_unary() { 45 | uint32_t v = 0; 46 | while (read_bit() == 0) 47 | ++v; 48 | return v; 49 | } 50 | 51 | inline uint32_t read_elias_gamma() { 52 | auto bits = read_unary(); 53 | return read(bits); 54 | } 55 | 56 | inline uint32_t read_elias_delta() { 57 | auto bits = read_elias_gamma(); 58 | return read(bits); 59 | } 60 | 61 | inline uint32_t read_vbyte() { 62 | uint32_t val = 0; 63 | size_t i = 0; 64 | while (read_bit()) { 65 | val |= read(7) << (7 * i++); 66 | } 67 | val |= read(7) << (7 * i); 68 | return val; 69 | } 70 | 71 | private: 72 | uint32_t const *m_in; 73 | uint32_t m_avail; 74 | uint64_t m_buf; 75 | size_t m_pos; 76 | }; -------------------------------------------------------------------------------- /include/gpu_ic/utils/bit_ostream.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include "utils.hpp" 20 | 21 | class bit_ostream { 22 | public: 23 | bit_ostream(uint8_t *buf) : m_buf(reinterpret_cast(buf)), m_size(0) {} 24 | 25 | void write(uint32_t bits, uint32_t len) { 26 | if (!len) 27 | return; 28 | uint32_t pos_in_word = m_size % 32; 29 | 30 | m_size += len; 31 | if (pos_in_word == 0) { 32 | *m_buf = bits; 33 | if (len == 32) { 34 | m_buf += 1; 35 | } 36 | } else { 37 | *m_buf |= bits << pos_in_word; 38 | if (len >= 32 - pos_in_word) { 39 | m_buf += 1; 40 | *m_buf = bits >> (32 - pos_in_word); 41 | } 42 | } 43 | } 44 | 45 | size_t size() const { return m_size; } 46 | 47 | inline void write_bit(bool val) { write(val, 1); } 48 | 49 | inline void write_unary(uint32_t val) { 50 | while (val--) { 51 | write_bit(0); 52 | } 53 | write_bit(1); 54 | } 55 | 56 | inline void write_elias_gamma(uint32_t val) { 57 | write_unary(utils::bits(val)); 58 | write(val, utils::bits(val)); 59 | } 60 | 61 | inline void write_elias_delta(uint32_t val) { 62 | write_elias_gamma(utils::bits(val)); 63 | write(val, utils::bits(val)); 64 | } 65 | 66 | inline void write_vbyte(uint32_t val) { 67 | while (val >= 128) { 68 | write(0x80 | (val & 0x7f), 8); 69 | val >>= 7; 70 | } 71 | write(0, 1); 72 | write(val, 7); 73 | } 74 | 75 | private: 76 | uint32_t *m_buf; 77 | size_t m_size; 78 | }; 79 | -------------------------------------------------------------------------------- /include/gpu_ic/utils/bit_vector.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "boost/range.hpp" 6 | 7 | #include "broadword.hpp" 8 | 9 | #include "mappable_vector.hpp" 10 | 11 | namespace gpu_ic { 12 | 13 | namespace detail { 14 | 15 | template 16 | inline IntType1 ceil_div(IntType1 dividend, IntType2 divisor) 17 | { 18 | // XXX(ot): put some static check that IntType1 >= IntType2 19 | IntType1 d = IntType1(divisor); 20 | return IntType1(dividend + d - 1) / d; 21 | } 22 | 23 | inline size_t words_for(uint64_t n) { return ceil_div(n, 64); } 24 | } // namespace detail 25 | 26 | class bit_vector_builder { 27 | public: 28 | using bits_type = std::vector; 29 | 30 | bit_vector_builder(uint64_t size = 0, bool init = 0) : m_size(size) { 31 | m_bits.resize(detail::words_for(size), uint64_t(-init)); 32 | if (size) { 33 | m_cur_word = &m_bits.back(); 34 | // clear padding bits 35 | if (init && size % 64) { 36 | *m_cur_word >>= 64 - (size % 64); 37 | } 38 | } 39 | } 40 | bit_vector_builder(const bit_vector_builder &) = delete; 41 | bit_vector_builder &operator=(const bit_vector_builder &) = delete; 42 | 43 | void reserve(uint64_t size) { m_bits.reserve(detail::words_for(size)); } 44 | 45 | inline void push_back(bool b) { 46 | uint64_t pos_in_word = m_size % 64; 47 | if (pos_in_word == 0) { 48 | m_bits.push_back(0); 49 | m_cur_word = &m_bits.back(); 50 | } 51 | *m_cur_word |= (uint64_t)b << pos_in_word; 52 | ++m_size; 53 | } 54 | 55 | inline void set(uint64_t pos, bool b) { 56 | uint64_t word = pos / 64; 57 | uint64_t pos_in_word = pos % 64; 58 | 59 | m_bits[word] &= ~(uint64_t(1) << pos_in_word); 60 | m_bits[word] |= uint64_t(b) << pos_in_word; 61 | } 62 | 63 | inline void set_bits(uint64_t pos, uint64_t bits, size_t len) { 64 | assert(pos + len <= size()); 65 | // check there are no spurious bits 66 | assert(len == 64 || (bits >> len) == 0); 67 | if (!len) 68 | return; 69 | uint64_t mask = (len == 64) ? 
uint64_t(-1) : ((uint64_t(1) << len) - 1); 70 | uint64_t word = pos / 64; 71 | uint64_t pos_in_word = pos % 64; 72 | 73 | m_bits[word] &= ~(mask << pos_in_word); 74 | m_bits[word] |= bits << pos_in_word; 75 | 76 | uint64_t stored = 64 - pos_in_word; 77 | if (stored < len) { 78 | m_bits[word + 1] &= ~(mask >> stored); 79 | m_bits[word + 1] |= bits >> stored; 80 | } 81 | } 82 | 83 | inline void append_bits(uint64_t bits, size_t len) { 84 | // check there are no spurious bits 85 | assert(len == 64 || (bits >> len) == 0); 86 | if (!len) 87 | return; 88 | uint64_t pos_in_word = m_size % 64; 89 | m_size += len; 90 | if (pos_in_word == 0) { 91 | m_bits.push_back(bits); 92 | } else { 93 | *m_cur_word |= bits << pos_in_word; 94 | if (len > 64 - pos_in_word) { 95 | m_bits.push_back(bits >> (64 - pos_in_word)); 96 | } 97 | } 98 | m_cur_word = &m_bits.back(); 99 | } 100 | 101 | inline void zero_extend(uint64_t n) { 102 | m_size += n; 103 | uint64_t needed = detail::words_for(m_size) - m_bits.size(); 104 | if (needed) { 105 | m_bits.insert(m_bits.end(), needed, 0); 106 | m_cur_word = &m_bits.back(); 107 | } 108 | } 109 | 110 | inline void one_extend(uint64_t n) { 111 | while (n >= 64) { 112 | append_bits(uint64_t(-1), 64); 113 | n -= 64; 114 | } 115 | if (n) { 116 | append_bits(uint64_t(-1) >> (64 - n), n); 117 | } 118 | } 119 | 120 | void append(bit_vector_builder const &rhs) { 121 | if (!rhs.size()) 122 | return; 123 | 124 | uint64_t pos = m_bits.size(); 125 | uint64_t shift = size() % 64; 126 | m_size = size() + rhs.size(); 127 | m_bits.resize(detail::words_for(m_size)); 128 | 129 | if (shift == 0) { // word-aligned, easy case 130 | std::copy(rhs.m_bits.begin(), rhs.m_bits.end(), m_bits.begin() + ptrdiff_t(pos)); 131 | } else { 132 | uint64_t *cur_word = &m_bits.front() + pos - 1; 133 | for (size_t i = 0; i < rhs.m_bits.size() - 1; ++i) { 134 | uint64_t w = rhs.m_bits[i]; 135 | *cur_word |= w << shift; 136 | *++cur_word = w >> (64 - shift); 137 | } 138 | *cur_word |= rhs.m_bits.back() << shift; 139 | if (cur_word < &m_bits.back()) { 140 | *++cur_word = rhs.m_bits.back() >> (64 - shift); 141 | } 142 | } 143 | m_cur_word = &m_bits.back(); 144 | } 145 | 146 | // reverse in place 147 | void reverse() { 148 | uint64_t shift = 64 - (size() % 64); 149 | 150 | uint64_t remainder = 0; 151 | for (size_t i = 0; i < m_bits.size(); ++i) { 152 | uint64_t cur_word; 153 | if (shift != 64) { // this should be hoisted out 154 | cur_word = remainder | (m_bits[i] << shift); 155 | remainder = m_bits[i] >> (64 - shift); 156 | } else { 157 | cur_word = m_bits[i]; 158 | } 159 | m_bits[i] = broadword::reverse_bits(cur_word); 160 | } 161 | assert(remainder == 0); 162 | std::reverse(m_bits.begin(), m_bits.end()); 163 | } 164 | 165 | bits_type &move_bits() { 166 | assert(detail::words_for(m_size) == m_bits.size()); 167 | return m_bits; 168 | } 169 | 170 | uint64_t size() const { return m_size; } 171 | 172 | void swap(bit_vector_builder &other) { 173 | m_bits.swap(other.m_bits); 174 | std::swap(m_size, other.m_size); 175 | std::swap(m_cur_word, other.m_cur_word); 176 | } 177 | 178 | private: 179 | bits_type m_bits; 180 | uint64_t m_size; 181 | uint64_t *m_cur_word; 182 | }; 183 | 184 | class bit_vector { 185 | public: 186 | bit_vector() = default; 187 | 188 | template 189 | bit_vector(Range const &from) { 190 | std::vector bits; 191 | const uint64_t first_mask = uint64_t(1); 192 | uint64_t mask = first_mask; 193 | uint64_t cur_val = 0; 194 | m_size = 0; 195 | for (typename boost::range_const_iterator::type iter = 
boost::begin(from); 196 | iter != boost::end(from); 197 | ++iter) { 198 | if (*iter) { 199 | cur_val |= mask; 200 | } 201 | mask <<= 1; 202 | m_size += 1; 203 | if (!mask) { 204 | bits.push_back(cur_val); 205 | mask = first_mask; 206 | cur_val = 0; 207 | } 208 | } 209 | if (mask != first_mask) { 210 | bits.push_back(cur_val); 211 | } 212 | m_bits.steal(bits); 213 | } 214 | 215 | bit_vector(bit_vector_builder *from) { 216 | m_size = from->size(); 217 | m_bits.steal(from->move_bits()); 218 | } 219 | 220 | template 221 | void map(Visitor &visit) { 222 | visit(m_size, "m_size")(m_bits, "m_bits"); 223 | } 224 | 225 | void swap(bit_vector &other) { 226 | std::swap(other.m_size, m_size); 227 | other.m_bits.swap(m_bits); 228 | } 229 | 230 | inline size_t size() const { return m_size; } 231 | 232 | inline bool operator[](uint64_t pos) const { 233 | assert(pos < m_size); 234 | uint64_t block = pos / 64; 235 | assert(block < m_bits.size()); 236 | uint64_t shift = pos % 64; 237 | return (m_bits[block] >> shift) & 1; 238 | } 239 | 240 | inline uint64_t get_bits(uint64_t pos, uint64_t len) const { 241 | assert(pos + len <= size()); 242 | if (!len) { 243 | return 0; 244 | } 245 | uint64_t block = pos / 64; 246 | uint64_t shift = pos % 64; 247 | uint64_t mask = -(len == 64) | ((1ULL << len) - 1); 248 | if (shift + len <= 64) { 249 | return m_bits[block] >> shift & mask; 250 | } else { 251 | return (m_bits[block] >> shift) | (m_bits[block + 1] << (64 - shift) & mask); 252 | } 253 | } 254 | 255 | // same as get_bits(pos, 64) but it can extend further size(), padding with zeros 256 | inline uint64_t get_word(uint64_t pos) const { 257 | assert(pos < size()); 258 | uint64_t block = pos / 64; 259 | uint64_t shift = pos % 64; 260 | uint64_t word = m_bits[block] >> shift; 261 | if (shift && block + 1 < m_bits.size()) { 262 | word |= m_bits[block + 1] << (64 - shift); 263 | } 264 | return word; 265 | } 266 | 267 | // unsafe and fast version of get_word, it retrieves at least 56 bits 268 | inline uint64_t get_word56(uint64_t pos) const { 269 | // XXX check endianness? 
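// Note on the line below: this performs an unaligned 64-bit load from the byte containing pos
// and shifts out the low pos % 8 bits, so at least 57 valid bits survive, which satisfies the
// ">= 56 bits" contract stated above. It relies on a little-endian word layout (hence the XXX
// above) and may touch up to 7 bytes past the last 64-bit word, which is part of why the
// function is described as unsafe.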
270 | const char *ptr = reinterpret_cast(m_bits.data()); 271 | return *(reinterpret_cast(ptr + pos / 8)) >> (pos % 8); 272 | } 273 | 274 | inline uint64_t predecessor0(uint64_t pos) const { 275 | assert(pos < m_size); 276 | uint64_t block = pos / 64; 277 | uint64_t shift = 64 - pos % 64 - 1; 278 | uint64_t word = ~m_bits[block]; 279 | word = (word << shift) >> shift; 280 | 281 | unsigned long ret; 282 | while (!broadword::msb(word, ret)) { 283 | assert(block); 284 | word = ~m_bits[--block]; 285 | }; 286 | return block * 64 + ret; 287 | } 288 | 289 | inline uint64_t successor0(uint64_t pos) const { 290 | assert(pos < m_size); 291 | uint64_t block = pos / 64; 292 | uint64_t shift = pos % 64; 293 | uint64_t word = (~m_bits[block] >> shift) << shift; 294 | 295 | unsigned long ret; 296 | while (!broadword::lsb(word, ret)) { 297 | ++block; 298 | assert(block < m_bits.size()); 299 | word = ~m_bits[block]; 300 | }; 301 | return block * 64 + ret; 302 | } 303 | 304 | inline uint64_t predecessor1(uint64_t pos) const { 305 | assert(pos < m_size); 306 | uint64_t block = pos / 64; 307 | uint64_t shift = 64 - pos % 64 - 1; 308 | uint64_t word = m_bits[block]; 309 | word = (word << shift) >> shift; 310 | 311 | unsigned long ret; 312 | while (!broadword::msb(word, ret)) { 313 | assert(block); 314 | word = m_bits[--block]; 315 | }; 316 | return block * 64 + ret; 317 | } 318 | 319 | inline uint64_t successor1(uint64_t pos) const { 320 | assert(pos < m_size); 321 | uint64_t block = pos / 64; 322 | uint64_t shift = pos % 64; 323 | uint64_t word = (m_bits[block] >> shift) << shift; 324 | 325 | unsigned long ret; 326 | while (!broadword::lsb(word, ret)) { 327 | ++block; 328 | assert(block < m_bits.size()); 329 | word = m_bits[block]; 330 | }; 331 | return block * 64 + ret; 332 | } 333 | 334 | mapper::mappable_vector const &data() const { return m_bits; } 335 | 336 | struct enumerator { 337 | enumerator() : m_bv(0), m_pos(uint64_t(-1)) {} 338 | 339 | enumerator(bit_vector const &bv, size_t pos) : m_bv(&bv), m_pos(pos), m_buf(0), m_avail(0) { 340 | m_bv->data().prefetch(m_pos / 64); 341 | } 342 | 343 | inline bool next() { 344 | if (!m_avail) 345 | fill_buf(); 346 | bool b = m_buf & 1; 347 | m_buf >>= 1; 348 | m_avail -= 1; 349 | m_pos += 1; 350 | return b; 351 | } 352 | 353 | inline uint64_t take(size_t l) { 354 | if (m_avail < l) 355 | fill_buf(); 356 | uint64_t val; 357 | if (l != 64) { 358 | val = m_buf & ((uint64_t(1) << l) - 1); 359 | m_buf >>= l; 360 | } else { 361 | val = m_buf; 362 | } 363 | m_avail -= l; 364 | m_pos += l; 365 | return val; 366 | } 367 | 368 | inline uint64_t skip_zeros() { 369 | uint64_t zs = 0; 370 | // XXX the loop may be optimized by aligning access 371 | while (!m_buf) { 372 | m_pos += m_avail; 373 | zs += m_avail; 374 | m_avail = 0; 375 | fill_buf(); 376 | } 377 | 378 | uint64_t l = broadword::lsb(m_buf); 379 | m_buf >>= l; 380 | m_buf >>= 1; 381 | m_avail -= l + 1; 382 | m_pos += l + 1; 383 | return zs + l; 384 | } 385 | 386 | inline uint64_t position() const { return m_pos; } 387 | 388 | private: 389 | inline void fill_buf() { 390 | m_buf = m_bv->get_word(m_pos); 391 | m_avail = 64; 392 | } 393 | 394 | bit_vector const *m_bv; 395 | size_t m_pos; 396 | uint64_t m_buf; 397 | size_t m_avail; 398 | }; 399 | 400 | struct unary_enumerator { 401 | unary_enumerator() : m_data(0), m_position(0), m_buf(0) {} 402 | 403 | unary_enumerator(bit_vector const &bv, uint64_t pos) { 404 | m_data = bv.data().data(); 405 | m_position = pos; 406 | m_buf = m_data[pos / 64]; 407 | // clear low bits 408 | 
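// (masking the bits below pos ensures the first call to next() returns the first 1-bit
//  at or after pos, rather than an earlier bit in the same word)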
m_buf &= uint64_t(-1) << (pos % 64); 409 | } 410 | 411 | uint64_t position() const { return m_position; } 412 | 413 | uint64_t next() { 414 | unsigned long pos_in_word; 415 | uint64_t buf = m_buf; 416 | while (!broadword::lsb(buf, pos_in_word)) { 417 | m_position += 64; 418 | buf = m_data[m_position / 64]; 419 | } 420 | 421 | m_buf = buf & (buf - 1); // clear LSB 422 | m_position = (m_position & ~uint64_t(63)) + pos_in_word; 423 | return m_position; 424 | } 425 | 426 | // skip to the k-th one after the current position 427 | void skip(uint64_t k) { 428 | uint64_t skipped = 0; 429 | uint64_t buf = m_buf; 430 | uint64_t w = 0; 431 | while (skipped + (w = broadword::popcount(buf)) <= k) { 432 | skipped += w; 433 | m_position += 64; 434 | buf = m_data[m_position / 64]; 435 | } 436 | assert(buf); 437 | uint64_t pos_in_word = broadword::select_in_word(buf, k - skipped); 438 | m_buf = buf & (uint64_t(-1) << pos_in_word); 439 | m_position = (m_position & ~uint64_t(63)) + pos_in_word; 440 | } 441 | 442 | // return the position of the k-th one after the current position. 443 | uint64_t skip_no_move(uint64_t k) { 444 | uint64_t position = m_position; 445 | uint64_t skipped = 0; 446 | uint64_t buf = m_buf; 447 | uint64_t w = 0; 448 | while (skipped + (w = broadword::popcount(buf)) <= k) { 449 | skipped += w; 450 | position += 64; 451 | buf = m_data[position / 64]; 452 | } 453 | assert(buf); 454 | uint64_t pos_in_word = broadword::select_in_word(buf, k - skipped); 455 | position = (position & ~uint64_t(63)) + pos_in_word; 456 | return position; 457 | } 458 | 459 | // skip to the k-th zero after the current position 460 | void skip0(uint64_t k) { 461 | uint64_t skipped = 0; 462 | uint64_t pos_in_word = m_position % 64; 463 | uint64_t buf = ~m_buf & (uint64_t(-1) << pos_in_word); 464 | uint64_t w = 0; 465 | while (skipped + (w = broadword::popcount(buf)) <= k) { 466 | skipped += w; 467 | m_position += 64; 468 | buf = ~m_data[m_position / 64]; 469 | } 470 | assert(buf); 471 | pos_in_word = broadword::select_in_word(buf, k - skipped); 472 | m_buf = ~buf & (uint64_t(-1) << pos_in_word); 473 | m_position = (m_position & ~uint64_t(63)) + pos_in_word; 474 | } 475 | 476 | private: 477 | uint64_t const *m_data; 478 | uint64_t m_position; 479 | uint64_t m_buf; 480 | }; 481 | 482 | protected: 483 | size_t m_size; 484 | mapper::mappable_vector m_bits; 485 | }; 486 | 487 | } // namespace gpu_ic -------------------------------------------------------------------------------- /include/gpu_ic/utils/broadword.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "intrinsics.hpp" 5 | #include "tables.hpp" 6 | 7 | namespace gpu_ic { namespace broadword { 8 | 9 | static const uint64_t ones_step_4 = 0x1111111111111111ULL; 10 | static const uint64_t ones_step_8 = 0x0101010101010101ULL; 11 | static const uint64_t ones_step_9 = 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | 1ULL << 36 | 1ULL << 45 | 1ULL << 54; 12 | static const uint64_t msbs_step_8 = 0x80ULL * ones_step_8; 13 | static const uint64_t msbs_step_9 = 0x100ULL * ones_step_9; 14 | static const uint64_t incr_step_8 = 0x80ULL << 56 | 0x40ULL << 48 | 0x20ULL << 40 | 0x10ULL << 32 | 0x8ULL << 24 | 0x4ULL << 16 | 0x2ULL << 8 | 0x1; 15 | static const uint64_t inv_count_step_9 = 1ULL << 54 | 2ULL << 45 | 3ULL << 36 | 4ULL << 27 | 5ULL << 18 | 6ULL << 9 | 7ULL; 16 | 17 | static const uint64_t magic_mask_1 = 0x5555555555555555ULL; 18 | static const uint64_t magic_mask_2 = 
0x3333333333333333ULL; 19 | static const uint64_t magic_mask_3 = 0x0F0F0F0F0F0F0F0FULL; 20 | static const uint64_t magic_mask_4 = 0x00FF00FF00FF00FFULL; 21 | static const uint64_t magic_mask_5 = 0x0000FFFF0000FFFFULL; 22 | static const uint64_t magic_mask_6 = 0x00000000FFFFFFFFULL; 23 | 24 | inline uint64_t leq_step_8(uint64_t x, uint64_t y) 25 | { 26 | return ((((y | msbs_step_8) - (x & ~msbs_step_8)) ^ (x ^ y)) & msbs_step_8) >> 7; 27 | } 28 | 29 | inline uint64_t uleq_step_8(uint64_t x, uint64_t y) 30 | { 31 | return (((((y | msbs_step_8) - (x & ~msbs_step_8)) ^ (x ^ y)) ^ (x & ~y)) & msbs_step_8) >> 7; 32 | } 33 | 34 | inline uint64_t zcompare_step_8(uint64_t x) 35 | { 36 | return ((x | ((x | msbs_step_8) - ones_step_8)) & msbs_step_8) >> 7; 37 | } 38 | 39 | inline uint64_t uleq_step_9(uint64_t x, uint64_t y) 40 | { 41 | return (((((y | msbs_step_9) - (x & ~msbs_step_9)) | (x ^ y)) ^ (x & ~y)) & msbs_step_9 ) >> 8; 42 | } 43 | 44 | inline uint64_t byte_counts(uint64_t x) 45 | { 46 | x = x - ((x & 0xa * ones_step_4) >> 1); 47 | x = (x & 3 * ones_step_4) + ((x >> 2) & 3 * ones_step_4); 48 | x = (x + (x >> 4)) & 0x0f * ones_step_8; 49 | return x; 50 | } 51 | 52 | inline uint64_t bytes_sum(uint64_t x) 53 | { 54 | return x * ones_step_8 >> 56; 55 | } 56 | 57 | inline uint64_t popcount(uint64_t x) 58 | { 59 | #if USE_POPCNT 60 | return intrinsics::popcount(x); 61 | #else 62 | return bytes_sum(byte_counts(x)); 63 | #endif 64 | } 65 | 66 | inline uint64_t reverse_bytes(uint64_t x) 67 | { 68 | return intrinsics::byteswap64(x); 69 | } 70 | 71 | inline uint64_t reverse_bits(uint64_t x) 72 | { 73 | x = ((x >> 1) & magic_mask_1) | ((x & magic_mask_1) << 1); 74 | x = ((x >> 2) & magic_mask_2) | ((x & magic_mask_2) << 2); 75 | x = ((x >> 4) & magic_mask_3) | ((x & magic_mask_3) << 4); 76 | return reverse_bytes(x); 77 | } 78 | 79 | inline uint64_t select_in_word(const uint64_t x, const uint64_t k) 80 | { 81 | assert(k < popcount(x)); 82 | 83 | uint64_t byte_sums = byte_counts(x) * ones_step_8; 84 | 85 | const uint64_t k_step_8 = k * ones_step_8; 86 | const uint64_t geq_k_step_8 = (((k_step_8 | msbs_step_8) - byte_sums) & msbs_step_8); 87 | #if USE_POPCNT 88 | const uint64_t place = intrinsics::popcount(geq_k_step_8) * 8; 89 | #else 90 | const uint64_t place = ((geq_k_step_8 >> 7) * ones_step_8 >> 53) & ~uint64_t(0x7); 91 | #endif 92 | const uint64_t byte_rank = k - (((byte_sums << 8 ) >> place) & uint64_t(0xFF)); 93 | return place + tables::select_in_byte[((x >> place) & 0xFF ) | (byte_rank << 8)]; 94 | } 95 | 96 | inline uint64_t same_msb(uint64_t x, uint64_t y) 97 | { 98 | return (x ^ y) <= (x & y); 99 | } 100 | 101 | namespace detail { 102 | // Adapted from LSB of Chess Programming Wiki 103 | static const uint8_t debruijn64_mapping[64] = { 104 | 63, 0, 58, 1, 59, 47, 53, 2, 105 | 60, 39, 48, 27, 54, 33, 42, 3, 106 | 61, 51, 37, 40, 49, 18, 28, 20, 107 | 55, 30, 34, 11, 43, 14, 22, 4, 108 | 62, 57, 46, 52, 38, 26, 32, 41, 109 | 50, 36, 17, 19, 29, 10, 13, 21, 110 | 56, 45, 25, 31, 35, 16, 9, 12, 111 | 44, 24, 15, 8, 23, 7, 6, 5 112 | }; 113 | static const uint64_t debruijn64 = 0x07EDD5E59A4E28C2ULL; 114 | } 115 | 116 | // return the position of the single bit set in the word x 117 | inline uint8_t bit_position(uint64_t x) 118 | { 119 | assert(popcount(x) == 1); 120 | return detail::debruijn64_mapping 121 | [(x * detail::debruijn64) >> 58]; 122 | } 123 | 124 | inline uint8_t msb(uint64_t x, unsigned long& ret) 125 | { 126 | return intrinsics::bsr64(&ret, x); 127 | } 128 | 129 | inline uint8_t 
msb(uint64_t x) 130 | { 131 | assert(x); 132 | unsigned long ret = -1U; 133 | msb(x, ret); 134 | return (uint8_t)ret; 135 | } 136 | 137 | inline uint8_t lsb(uint64_t x, unsigned long& ret) 138 | { 139 | return intrinsics::bsf64(&ret, x); 140 | } 141 | 142 | inline uint8_t lsb(uint64_t x) 143 | { 144 | assert(x); 145 | unsigned long ret = -1U; 146 | lsb(x, ret); 147 | return (uint8_t)ret; 148 | } 149 | 150 | }} -------------------------------------------------------------------------------- /include/gpu_ic/utils/cuda_utils.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #define CUDA_CHECK_ERROR(err) __cudaSafeCall(err, __FILE__, __LINE__) 23 | 24 | inline void __cudaSafeCall(cudaError err, const char *file, const int line) { 25 | if (cudaSuccess != err) { 26 | std::ostringstream stringStream; 27 | stringStream << "cudaSafeCall() failed at " << file << ":" << line << " : " 28 | << cudaGetErrorString(err); 29 | throw(std::runtime_error(stringStream.str())); 30 | } 31 | } 32 | 33 | __global__ 34 | void warmUpGPU() 35 | { 36 | // do nothing 37 | } 38 | 39 | __device__ uint32_t extract(const uint32_t *in, size_t offset, size_t bit) { 40 | int firstBit = offset; 41 | int lastBit = firstBit + bit - 1; 42 | uint32_t packed = in[firstBit / 32]; 43 | int firstBitInPacked = firstBit % 32; 44 | uint32_t packedOverflow = in[lastBit / 32]; 45 | bool isOverflowing = lastBit % 32 < firstBitInPacked; 46 | int lastBitInPackedOverflow = !isOverflowing ? 
-1 : lastBit % 32; 47 | uint32_t outFromPacked = 48 | ((packed >> firstBitInPacked) & (0xFFFFFFFF >> (32 - (bit - lastBitInPackedOverflow - 1)))); 49 | uint32_t outFromOverflow = (packedOverflow & (0xFFFFFFFF >> (32 - lastBitInPackedOverflow - 1))) 50 | << (bit - lastBitInPackedOverflow - 1); 51 | return outFromPacked | outFromOverflow; 52 | } 53 | -------------------------------------------------------------------------------- /include/gpu_ic/utils/index.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "mappable_vector.hpp" 4 | #include "bit_vector.hpp" 5 | 6 | namespace gpu_ic { 7 | 8 | // template 9 | class index { 10 | public: 11 | index() 12 | : m_size(0) 13 | {} 14 | 15 | class builder { 16 | public: 17 | builder(uint64_t num_docs) 18 | { 19 | m_num_docs = num_docs; 20 | m_endpoints.push_back(0); 21 | } 22 | 23 | template 24 | void add_posting_list(uint64_t n, DocsIterator docs_begin, Encoder encoder_function, bool compress_freqs) 25 | { 26 | if (!n) throw std::invalid_argument("List must be nonempty"); 27 | tight_variable_byte::encode_single(n, m_lists); 28 | DocsIterator docs_it(docs_begin); 29 | std::vector docs_buf(n); 30 | 31 | uint32_t last_doc(*docs_it++);; 32 | for (size_t i = 1; i < n; ++i) { 33 | uint32_t doc(*docs_it++); 34 | if(not compress_freqs) { 35 | docs_buf[i] = doc - last_doc - 1; 36 | } else { 37 | docs_buf[i] = doc - 1; 38 | } 39 | last_doc = doc; 40 | } 41 | 42 | std::vector encoded_values(n*4+1024); 43 | size_t compressedsize = encoder_function(encoded_values.data(), docs_buf.data(), docs_buf.size()); 44 | encoded_values.resize(compressedsize); 45 | encoded_values.shrink_to_fit(); 46 | m_lists.insert(m_lists.end(), encoded_values.data(), encoded_values.data() + encoded_values.size()); 47 | m_endpoints.push_back(m_lists.size()); 48 | } 49 | 50 | 51 | size_t build(index& sq) 52 | { 53 | sq.m_size = m_endpoints.size() - 1; 54 | sq.m_num_docs = m_num_docs; 55 | sq.m_lists.steal(m_lists); 56 | sq.m_endpoints.steal(m_endpoints); 57 | return sq.m_lists.size(); 58 | } 59 | 60 | private: 61 | size_t m_num_docs; 62 | std::vector m_endpoints; 63 | std::vector m_lists; 64 | }; 65 | 66 | size_t size() const 67 | { 68 | return m_size; 69 | } 70 | 71 | uint64_t num_docs() const 72 | { 73 | return m_num_docs; 74 | } 75 | 76 | 77 | size_t get_data(std::vector &data, size_t i) const 78 | { 79 | assert(i < size()); 80 | uint32_t n; 81 | auto data_begin = tight_variable_byte::decode(m_lists.data() + m_endpoints[i], &n, 1); 82 | data.insert(data.end(), data_begin, m_lists.data() + m_endpoints[i+1] ); 83 | return n; 84 | } 85 | 86 | void warmup(size_t i) const 87 | { 88 | assert(i < size()); 89 | // compact_elias_fano::enumerator endpoints(m_endpoints, 0, 90 | // m_lists.size(), m_size, 91 | // m_params); 92 | 93 | auto begin = m_endpoints[i]; 94 | auto end = m_lists.size(); 95 | if (i + 1 != size()) { 96 | end = m_endpoints[i + 1]; 97 | } 98 | 99 | volatile uint32_t tmp; 100 | for (size_t i = begin; i != end; ++i) { 101 | tmp = m_lists[i]; 102 | } 103 | (void)tmp; 104 | } 105 | 106 | void swap(index& other) 107 | { 108 | std::swap(m_size, other.m_size); 109 | m_endpoints.swap(other.m_endpoints); 110 | m_lists.swap(other.m_lists); 111 | } 112 | 113 | template 114 | void map(Visitor& visit) 115 | { 116 | visit 117 | (m_size, "m_size") 118 | (m_num_docs, "m_num_docs") 119 | (m_endpoints, "m_endpoints") 120 | (m_lists, "m_lists"); 121 | } 122 | 123 | private: 124 | size_t m_size; 125 | size_t m_num_docs; 126 | 
mapper::mappable_vector m_endpoints; 127 | mapper::mappable_vector m_lists; 128 | }; 129 | } -------------------------------------------------------------------------------- /include/gpu_ic/utils/index.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "mappable_vector.hpp" 4 | #include "bit_vector.hpp" 5 | 6 | #include "posting_list.hpp" 7 | 8 | 9 | 10 | 11 | namespace gpu_ic { 12 | 13 | // template 14 | template 15 | class index { 16 | public: 17 | index() 18 | : m_size(0) 19 | {} 20 | 21 | class builder { 22 | public: 23 | builder(uint64_t num_docs) 24 | { 25 | m_num_docs = num_docs; 26 | m_endpoints.push_back(0); 27 | } 28 | 29 | template 30 | void add_posting_list(uint64_t n, DocsIterator docs_begin, Codec codec, bool compress_freqs) 31 | { 32 | if (!n) throw std::invalid_argument("List must be nonempty"); 33 | posting_list::write(m_lists, n, docs_begin, codec, compress_freqs); 34 | m_endpoints.push_back(m_lists.size()); 35 | } 36 | 37 | 38 | size_t build(index& sq) 39 | { 40 | sq.m_size = m_endpoints.size() - 1; 41 | sq.m_num_docs = m_num_docs; 42 | sq.m_lists.steal(m_lists); 43 | sq.m_endpoints.steal(m_endpoints); 44 | return sq.m_lists.size(); 45 | } 46 | 47 | private: 48 | size_t m_num_docs; 49 | std::vector m_endpoints; 50 | std::vector m_lists; 51 | }; 52 | 53 | size_t size() const 54 | { 55 | return m_size; 56 | } 57 | 58 | uint64_t num_docs() const 59 | { 60 | return m_num_docs; 61 | } 62 | 63 | typedef typename posting_list::document_enumerator document_enumerator; 64 | 65 | document_enumerator operator[](size_t i) const 66 | { 67 | assert(i < size()); 68 | auto endpoint = m_endpoints[i]; 69 | auto len = m_endpoints[i+1] - endpoint; 70 | return document_enumerator(m_lists.data() + endpoint, len, m_codec); 71 | } 72 | 73 | size_t get_data(std::vector &data, size_t i) const 74 | { 75 | assert(i < size()); 76 | uint32_t n; 77 | auto data_begin = tight_variable_byte::decode(m_lists.data() + m_endpoints[i], &n, 1); 78 | data.insert(data.end(), data_begin, m_lists.data() + m_endpoints[i+1] ); 79 | return n; 80 | } 81 | 82 | void warmup(size_t i) const 83 | { 84 | assert(i < size()); 85 | // compact_elias_fano::enumerator endpoints(m_endpoints, 0, 86 | // m_lists.size(), m_size, 87 | // m_params); 88 | 89 | auto begin = m_endpoints[i]; 90 | auto end = m_lists.size(); 91 | if (i + 1 != size()) { 92 | end = m_endpoints[i + 1]; 93 | } 94 | 95 | volatile uint32_t tmp; 96 | for (size_t i = begin; i != end; ++i) { 97 | tmp = m_lists[i]; 98 | } 99 | (void)tmp; 100 | } 101 | 102 | void swap(index& other) 103 | { 104 | std::swap(m_size, other.m_size); 105 | m_endpoints.swap(other.m_endpoints); 106 | m_lists.swap(other.m_lists); 107 | } 108 | 109 | template 110 | void map(Visitor& visit) 111 | { 112 | visit 113 | (m_size, "m_size") 114 | (m_num_docs, "m_num_docs") 115 | (m_endpoints, "m_endpoints") 116 | (m_lists, "m_lists"); 117 | } 118 | 119 | private: 120 | size_t m_size; 121 | size_t m_num_docs; 122 | mapper::mappable_vector m_endpoints; 123 | mapper::mappable_vector m_lists; 124 | Codec m_codec; 125 | }; 126 | } -------------------------------------------------------------------------------- /include/gpu_ic/utils/intrinsics.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #if defined(__SSE4_2__) 6 | #define USE_POPCNT 1 7 | #else 8 | #define USE_POPCNT 0 9 | #endif 10 | 11 | #if defined(__GNUC__) || defined(__clang__) 12 | #define 
__INTRIN_INLINE inline __attribute__((__always_inline__)) 13 | #elif defined(_MSC_VER) 14 | #define __INTRIN_INLINE inline __forceinline 15 | #else 16 | #define __INTRIN_INLINE inline 17 | #endif 18 | 19 | namespace gpu_ic { 20 | namespace intrinsics { 21 | 22 | __INTRIN_INLINE uint64_t byteswap64(uint64_t value) { 23 | #if defined(__GNUC__) || defined(__clang__) 24 | return __builtin_bswap64(value); 25 | #elif defined(_MSC_VER) 26 | return _byteswap_uint64(value); 27 | #else 28 | #error Unsupported platform 29 | #endif 30 | } 31 | 32 | __INTRIN_INLINE bool bsf64(unsigned long *const index, const uint64_t mask) { 33 | #if defined(__GNUC__) || defined(__clang__) 34 | if (mask) { 35 | *index = (unsigned long)__builtin_ctzll(mask); 36 | return true; 37 | } else { 38 | return false; 39 | } 40 | #elif defined(_MSC_VER) 41 | return _BitScanForward64(index, mask) != 0; 42 | #else 43 | #error Unsupported platform 44 | #endif 45 | } 46 | 47 | __INTRIN_INLINE bool bsr64(unsigned long *const index, const uint64_t mask) { 48 | #if defined(__GNUC__) || defined(__clang__) 49 | if (mask) { 50 | *index = (unsigned long)(63 - __builtin_clzll(mask)); 51 | return true; 52 | } else { 53 | return false; 54 | } 55 | #elif defined(_MSC_VER) 56 | return _BitScanReverse64(index, mask) != 0; 57 | #else 58 | #error Unsupported platform 59 | #endif 60 | } 61 | 62 | template 63 | __INTRIN_INLINE void prefetch(T const *ptr) { 64 | #if defined(__SSE__) 65 | _mm_prefetch((const char *)ptr, _MM_HINT_T0); 66 | #endif 67 | } 68 | 69 | #if USE_POPCNT 70 | 71 | __INTRIN_INLINE uint64_t popcount(uint64_t x) { return uint64_t(_mm_popcnt_u64(x)); } 72 | 73 | #endif /* USE_POPCNT */ 74 | 75 | } // namespace intrinsics 76 | } // namespace succinct -------------------------------------------------------------------------------- /include/gpu_ic/utils/mappable_vector.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "boost/utility.hpp" 7 | #include "boost/range.hpp" 8 | #include "boost/function.hpp" 9 | #include "boost/lambda/bind.hpp" 10 | #include "boost/lambda/construct.hpp" 11 | 12 | #include "intrinsics.hpp" 13 | 14 | namespace gpu_ic { namespace mapper { 15 | 16 | namespace detail { 17 | class freeze_visitor; 18 | class map_visitor; 19 | class sizeof_visitor; 20 | } 21 | 22 | typedef boost::function deleter_t; 23 | 24 | template // T must be a POD 25 | class mappable_vector { 26 | public: 27 | typedef T value_type; 28 | typedef const T* iterator; 29 | typedef const T* const_iterator; 30 | 31 | mappable_vector() 32 | : m_data(0) 33 | , m_size(0) 34 | , m_deleter() 35 | {} 36 | mappable_vector(const mappable_vector &) = delete; 37 | mappable_vector &operator=(const mappable_vector &) = delete; 38 | 39 | template 40 | mappable_vector(Range const& from) 41 | : m_data(0) 42 | , m_size(0) 43 | { 44 | size_t size = boost::size(from); 45 | T* data = new T[size]; 46 | m_deleter = boost::lambda::bind(boost::lambda::delete_array(), data); 47 | 48 | std::copy(boost::begin(from), 49 | boost::end(from), 50 | data); 51 | m_data = data; 52 | m_size = size; 53 | } 54 | 55 | ~mappable_vector() { 56 | if (m_deleter) { 57 | m_deleter(); 58 | } 59 | } 60 | 61 | void swap(mappable_vector& other) { 62 | using std::swap; 63 | swap(m_data, other.m_data); 64 | swap(m_size, other.m_size); 65 | swap(m_deleter, other.m_deleter); 66 | } 67 | 68 | void clear() { 69 | mappable_vector().swap(*this); 70 | } 71 | 72 | void steal(std::vector& vec) { 73 | clear(); 
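// Take ownership of the caller's buffer without copying: swap it into a heap-allocated
// vector, keep that vector alive through m_deleter, and point m_data at its storage.
// The caller's vec is left empty afterwards.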
74 | m_size = vec.size(); 75 | if (m_size) { 76 | std::vector* new_vec = new std::vector; 77 | new_vec->swap(vec); 78 | m_deleter = boost::lambda::bind(boost::lambda::delete_ptr(), new_vec); 79 | m_data = &(*new_vec)[0]; 80 | } 81 | } 82 | 83 | template 84 | void assign(Range const& from) { 85 | clear(); 86 | mappable_vector(from).swap(*this); 87 | } 88 | 89 | uint64_t size() const { 90 | return m_size; 91 | } 92 | 93 | inline const_iterator begin() const { 94 | return m_data; 95 | } 96 | 97 | inline const_iterator end() const { 98 | return m_data + m_size; 99 | } 100 | 101 | inline T const& operator[](uint64_t i) const { 102 | assert(i < m_size); 103 | return m_data[i]; 104 | } 105 | 106 | inline T const* data() const { 107 | return m_data; 108 | } 109 | 110 | inline void prefetch(size_t i) const { 111 | intrinsics::prefetch(m_data + i); 112 | } 113 | 114 | friend class detail::freeze_visitor; 115 | friend class detail::map_visitor; 116 | friend class detail::sizeof_visitor; 117 | 118 | protected: 119 | const T* m_data; 120 | uint64_t m_size; 121 | deleter_t m_deleter; 122 | }; 123 | 124 | }} -------------------------------------------------------------------------------- /include/gpu_ic/utils/mapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "mio/mmap.hpp" 7 | 8 | #include "mappable_vector.hpp" 9 | 10 | namespace gpu_ic { 11 | namespace mapper { 12 | 13 | struct map_flags { 14 | enum { warmup = 1 }; 15 | }; 16 | 17 | struct size_node; 18 | typedef std::shared_ptr size_node_ptr; 19 | 20 | struct size_node { 21 | size_node() : size(0) {} 22 | 23 | std::string name; 24 | size_t size; 25 | std::vector children; 26 | 27 | void dump(std::ostream &os = std::cerr, size_t depth = 0) { 28 | os << std::string(depth * 4, ' ') << name << ": " << size << '\n'; 29 | for (size_t i = 0; i < children.size(); ++i) { 30 | children[i]->dump(os, depth + 1); 31 | } 32 | } 33 | }; 34 | 35 | namespace detail { 36 | class freeze_visitor { 37 | public: 38 | freeze_visitor(std::ofstream &fout, uint64_t flags) 39 | : m_fout(fout), m_flags(flags), m_written(0) { 40 | // Save freezing flags 41 | m_fout.write(reinterpret_cast(&m_flags), sizeof(m_flags)); 42 | m_written += sizeof(m_flags); 43 | } 44 | 45 | freeze_visitor(const freeze_visitor &) = delete; 46 | freeze_visitor &operator=(const freeze_visitor &) = delete; 47 | 48 | template 49 | typename std::enable_if::value, freeze_visitor &>::type operator()( 50 | T &val, const char * /* friendly_name */) { 51 | val.map(*this); 52 | return *this; 53 | } 54 | 55 | template 56 | typename std::enable_if::value, freeze_visitor &>::type operator()( 57 | T &val, const char * /* friendly_name */) { 58 | m_fout.write(reinterpret_cast(&val), sizeof(T)); 59 | m_written += sizeof(T); 60 | return *this; 61 | } 62 | 63 | template 64 | freeze_visitor &operator()(mappable_vector &vec, const char * /* friendly_name */) { 65 | (*this)(vec.m_size, "size"); 66 | 67 | size_t n_bytes = static_cast(vec.m_size * sizeof(T)); 68 | m_fout.write(reinterpret_cast(vec.m_data), long(n_bytes)); 69 | m_written += n_bytes; 70 | 71 | return *this; 72 | } 73 | 74 | size_t written() const { return m_written; } 75 | 76 | protected: 77 | std::ofstream &m_fout; 78 | const uint64_t m_flags; 79 | uint64_t m_written; 80 | }; 81 | 82 | class map_visitor { 83 | public: 84 | map_visitor(const char *base_address, uint64_t flags) 85 | : m_base(base_address), m_cur(m_base), m_flags(flags) { 86 | m_freeze_flags = 
*reinterpret_cast(m_cur); 87 | m_cur += sizeof(m_freeze_flags); 88 | } 89 | 90 | map_visitor(const map_visitor &) = delete; 91 | map_visitor &operator=(const map_visitor &) = delete; 92 | 93 | template 94 | typename std::enable_if::value, map_visitor &>::type operator()( 95 | T &val, const char * /* friendly_name */) { 96 | val.map(*this); 97 | return *this; 98 | } 99 | 100 | template 101 | typename std::enable_if::value, map_visitor &>::type operator()( 102 | T &val, const char * /* friendly_name */) { 103 | val = *reinterpret_cast(m_cur); 104 | m_cur += sizeof(T); 105 | return *this; 106 | } 107 | 108 | template 109 | map_visitor &operator()(mappable_vector &vec, const char * /* friendly_name */) { 110 | vec.clear(); 111 | (*this)(vec.m_size, "size"); 112 | 113 | vec.m_data = reinterpret_cast(m_cur); 114 | size_t bytes = vec.m_size * sizeof(T); 115 | 116 | if (m_flags & map_flags::warmup) { 117 | T foo; 118 | volatile T *bar = &foo; 119 | for (size_t i = 0; i < vec.m_size; ++i) { 120 | *bar = vec.m_data[i]; 121 | } 122 | } 123 | 124 | m_cur += bytes; 125 | return *this; 126 | } 127 | 128 | size_t bytes_read() const { return size_t(m_cur - m_base); } 129 | 130 | protected: 131 | const char *m_base; 132 | const char *m_cur; 133 | const uint64_t m_flags; 134 | uint64_t m_freeze_flags; 135 | }; 136 | 137 | class sizeof_visitor { 138 | public: 139 | sizeof_visitor(bool with_tree = false) : m_size(0) { 140 | if (with_tree) { 141 | m_cur_size_node = std::make_shared(); 142 | } 143 | } 144 | 145 | sizeof_visitor(const sizeof_visitor &) = delete; 146 | sizeof_visitor &operator=(const sizeof_visitor &) = delete; 147 | 148 | template 149 | typename std::enable_if::value, sizeof_visitor &>::type operator()( 150 | T &val, const char *friendly_name) { 151 | size_t checkpoint = m_size; 152 | size_node_ptr parent_node; 153 | if (m_cur_size_node) { 154 | parent_node = m_cur_size_node; 155 | m_cur_size_node = make_node(friendly_name); 156 | } 157 | 158 | val.map(*this); 159 | 160 | if (m_cur_size_node) { 161 | m_cur_size_node->size = m_size - checkpoint; 162 | m_cur_size_node = parent_node; 163 | } 164 | return *this; 165 | } 166 | 167 | template 168 | typename std::enable_if::value, sizeof_visitor &>::type operator()( 169 | T & /* val */, const char * /* friendly_name */) { 170 | // don't track PODs in the size tree (they are constant sized) 171 | m_size += sizeof(T); 172 | return *this; 173 | } 174 | 175 | template 176 | sizeof_visitor &operator()(mappable_vector &vec, const char *friendly_name) { 177 | size_t checkpoint = m_size; 178 | (*this)(vec.m_size, "size"); 179 | m_size += static_cast(vec.m_size * sizeof(T)); 180 | 181 | if (m_cur_size_node) { 182 | make_node(friendly_name)->size = m_size - checkpoint; 183 | } 184 | 185 | return *this; 186 | } 187 | 188 | size_t size() const { return m_size; } 189 | 190 | size_node_ptr size_tree() const { 191 | assert(m_cur_size_node); 192 | return m_cur_size_node; 193 | } 194 | 195 | protected: 196 | size_node_ptr make_node(const char *name) { 197 | size_node_ptr node = std::make_shared(); 198 | m_cur_size_node->children.push_back(node); 199 | node->name = name; 200 | return node; 201 | } 202 | 203 | size_t m_size; 204 | size_node_ptr m_cur_size_node; 205 | }; 206 | 207 | } // namespace detail 208 | 209 | template 210 | size_t freeze(T &val, 211 | std::ofstream &fout, 212 | uint64_t flags = 0, 213 | const char *friendly_name = "") { 214 | detail::freeze_visitor freezer(fout, flags); 215 | freezer(val, friendly_name); 216 | return freezer.written(); 217 | } 218 
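// A minimal usage sketch (assumptions: an index type exposing map(Visitor&), e.g. gpu_ic::index
// from index.cuh, already built; "index.bin" is an arbitrary path chosen for illustration):
//
//     gpu_ic::index idx;                         // assumed built via index::builder
//     gpu_ic::mapper::freeze(idx, "index.bin");  // writes the flags word, then the members
//
//     mio::mmap_source m;
//     std::error_code ec;
//     m.map("index.bin", ec);
//     gpu_ic::index loaded;
//     gpu_ic::mapper::map(loaded, m);            // mappable_vectors point into the mapping, no copy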
| 219 | template 220 | size_t freeze(T &val, 221 | const char *filename, 222 | uint64_t flags = 0, 223 | const char *friendly_name = "") { 224 | std::ofstream fout(filename, std::ios::binary); 225 | return freeze(val, fout, flags, friendly_name); 226 | } 227 | 228 | template 229 | size_t map(T &val, 230 | const char *base_address, 231 | uint64_t flags = 0, 232 | const char *friendly_name = "") { 233 | detail::map_visitor mapper(base_address, flags); 234 | mapper(val, friendly_name); 235 | return mapper.bytes_read(); 236 | } 237 | 238 | template 239 | size_t map(T &val, 240 | const mio::mmap_source &m, 241 | uint64_t flags = 0, 242 | const char *friendly_name = "") { 243 | return map(val, m.data(), flags, friendly_name); 244 | } 245 | 246 | template 247 | size_t size_of(T &val) { 248 | detail::sizeof_visitor sizer; 249 | sizer(val, ""); 250 | return sizer.size(); 251 | } 252 | 253 | template 254 | size_node_ptr size_tree_of(T &val, const char *friendly_name = "") { 255 | detail::sizeof_visitor sizer(true); 256 | sizer(val, friendly_name); 257 | assert(sizer.size_tree()->children.size()); 258 | return sizer.size_tree()->children[0]; 259 | } 260 | 261 | } // namespace mapper 262 | } // namespace gpu_ic -------------------------------------------------------------------------------- /include/gpu_ic/utils/posting_list.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tight_variable_byte.hpp" 4 | 5 | namespace gpu_ic { 6 | 7 | struct posting_list { 8 | 9 | template 10 | static void write(std::vector& out, uint32_t n, DocsIterator docs_begin, Codec codec, bool compress_freqs) { 11 | tight_variable_byte::encode_single(n, out); 12 | 13 | DocsIterator docs_it(docs_begin); 14 | std::vector docs_buf(n); 15 | 16 | uint32_t last_doc(*docs_it++);; 17 | for (size_t i = 1; i < n; ++i) { 18 | uint32_t doc(*docs_it++); 19 | if(not compress_freqs) { 20 | docs_buf[i] = doc - last_doc - 1; 21 | } 22 | else { 23 | docs_buf[i] = doc - 1; 24 | } 25 | last_doc = doc; 26 | } 27 | 28 | size_t compressedsize = 0; 29 | std::vector encoded_values(n*4+1024); 30 | codec.encodeArray(docs_buf.data(), n, reinterpret_cast(encoded_values.data()), compressedsize); 31 | out.insert(out.end(), encoded_values.data(), encoded_values.data() + compressedsize*4); 32 | } 33 | 34 | class document_enumerator { 35 | public: 36 | 37 | template 38 | document_enumerator(uint8_t const* data, uint64_t len, Codec codec) 39 | : m_n(0) 40 | , m_base(tight_variable_byte::decode(data, &m_n, 1)) 41 | , m_len(len) 42 | { 43 | m_docs_buf.resize(m_n); 44 | decode_docs_block(codec); 45 | 46 | } 47 | 48 | void next() 49 | { 50 | ++m_pos_in_block; 51 | m_cur_docid = m_docs_buf[m_pos_in_block]; 52 | } 53 | 54 | 55 | uint64_t docid() const 56 | { 57 | return m_cur_docid; 58 | } 59 | 60 | 61 | uint64_t position() const 62 | { 63 | return m_pos_in_block; 64 | } 65 | 66 | uint64_t size() const 67 | { 68 | return m_n; 69 | } 70 | 71 | private: 72 | 73 | template 74 | void decode_docs_block(Codec codec) 75 | { 76 | 77 | size_t n =m_n; 78 | codec.decodeArray(reinterpret_cast(m_base), m_len/4, reinterpret_cast(m_docs_buf.data()), n); 79 | 80 | m_pos_in_block = 0; 81 | m_cur_docid = m_docs_buf[0]; 82 | 83 | } 84 | 85 | uint32_t m_n; 86 | uint8_t const* m_base; 87 | uint64_t m_len; 88 | 89 | uint32_t m_pos_in_block; 90 | uint32_t m_cur_docid; 91 | 92 | 93 | std::vector m_docs_buf; 94 | 95 | }; 96 | 97 | }; 98 | } -------------------------------------------------------------------------------- 
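A minimal usage sketch for posting_list above (illustrative only, not part of the repository): the replay() helper and the Codec template parameter are assumptions; the codec is expected to expose encodeArray/decodeArray in the FastPFor style, exactly as write() and document_enumerator call them. Recall that write() stores positions 1..n-1 as gaps and the enumerator returns the values exactly as the codec decodes them.

#include <cstddef>
#include <cstdint>
#include <vector>
#include "gpu_ic/utils/posting_list.hpp"

template <typename Codec>
std::vector<uint64_t> replay(const std::vector<uint32_t> &docs, Codec codec) {
    // docs is assumed non-empty and strictly increasing
    std::vector<uint8_t> buf;
    // compress_freqs = false: treat the input as docIDs and gap-encode positions 1..n-1
    gpu_ic::posting_list::write(buf, static_cast<uint32_t>(docs.size()), docs.begin(), codec, false);

    gpu_ic::posting_list::document_enumerator e(buf.data(), buf.size(), codec);
    std::vector<uint64_t> values;
    for (size_t i = 0; i < e.size(); ++i) {
        values.push_back(e.docid());     // values as decoded by the codec, no gap-undoing here
        if (i + 1 < e.size()) e.next();  // calling next() past the last element would read out of range
    }
    return values;
}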
/include/gpu_ic/utils/progress.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace gpu_ic { 9 | 10 | class progress { 11 | 12 | public: 13 | progress(const std::string &name, size_t goal) : m_name(name) { 14 | if (goal == 0) { 15 | throw std::runtime_error("goal must be positive"); 16 | } 17 | m_goal = goal; 18 | } 19 | ~progress() { 20 | m_status.notify_one(); 21 | std::unique_lock lock(m_mut); 22 | print_status(); 23 | std::cerr << std::endl; 24 | } 25 | 26 | void update(size_t inc) { 27 | std::unique_lock lock(m_mut); 28 | m_count += inc; 29 | print_status(); 30 | } 31 | 32 | private: 33 | std::string m_name; 34 | size_t m_count = 0; 35 | size_t m_goal = 0; 36 | 37 | std::chrono::time_point m_start = std::chrono::steady_clock::now(); 38 | 39 | std::mutex m_mut; 40 | std::condition_variable m_status; 41 | 42 | void print_status() { 43 | size_t progress = (100 * m_count) / m_goal; 44 | std::chrono::seconds elapsed = std::chrono::duration_cast( 45 | std::chrono::steady_clock::now() - m_start); 46 | std::cerr << '\r' << m_name << ": " << progress << "% ["; 47 | format_interval(std::cerr, elapsed); 48 | std::cerr << "]"; 49 | } 50 | 51 | std::ostream& format_interval(std::ostream& out, std::chrono::seconds time) { 52 | using std::chrono::hours; 53 | using std::chrono::minutes; 54 | using std::chrono::seconds; 55 | hours h = std::chrono::duration_cast(time); 56 | minutes m = std::chrono::duration_cast(time - h); 57 | seconds s = std::chrono::duration_cast(time - h - m); 58 | if (h.count() > 0) { out << h.count() << "h "; } 59 | if (m.count() > 0) { out << m.count() << "m "; } 60 | out << s.count() << "s"; 61 | return out; 62 | } 63 | }; 64 | 65 | } 66 | -------------------------------------------------------------------------------- /include/gpu_ic/utils/tables.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace gpu_ic { namespace tables { 6 | 7 | const uint8_t select_in_byte[2048] = { 8 | 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 9 | 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 10 | 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 11 | 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 12 | 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 13 | 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 14 | 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8, 15 | 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 16 | 4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 17 | 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, 1, 7, 2, 18 | 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 19 | 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 
6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 20 | 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 21 | 1, 3, 2, 2, 1, 8, 8, 8, 8, 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, 22 | 8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8, 23 | 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 24 | 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 25 | 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 26 | 3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 27 | 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 28 | 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8, 29 | 5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6, 30 | 6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 31 | 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, 32 | 8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 33 | 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, 34 | 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 35 | 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8, 36 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 37 | 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 38 | 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, 8, 7, 7, 5, 8, 39 | 7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4, 40 | 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 41 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 42 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 43 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 44 | 8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 45 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 46 | 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 47 | 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 48 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 49 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 50 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 51 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 52 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 53 | 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 54 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 55 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 56 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 57 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 58 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 59 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 60 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 61 | }; 62 | 63 | }} -------------------------------------------------------------------------------- /include/gpu_ic/utils/tight_variable_byte.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | class tight_variable_byte { 4 | public: 5 | template 6 | static uint8_t extract7bits(const uint32_t val) { 7 | return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); 8 | } 9 | 10 | template 11 | static uint8_t extract7bitsmaskless(const uint32_t val) { 12 | return static_cast((val >> (7 * i))); 13 | } 14 | 15 | static void encode(const uint32_t *in, const size_t length, uint8_t *out, size_t &nvalue) { 16 | uint8_t *bout = out; 17 | for (size_t k = 0; k < length; ++k) { 18 | const uint32_t val(in[k]); 19 | /** 20 | * Code below could be shorter. Whether it could be faster 21 | * depends on your compiler and machine. 
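 * Each value is emitted as 7-bit groups, least-significant group first, and the high bit is set only on the final byte of a value; decode() below keeps accumulating groups until it sees that stop bit.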
22 | */ 23 | if (val < (1U << 7)) { 24 | *bout = static_cast<uint8_t>(val | (1U << 7)); 25 | ++bout; 26 | } else if (val < (1U << 14)) { 27 | *bout = extract7bits<0>(val); 28 | ++bout; 29 | *bout = extract7bitsmaskless<1>(val) | (1U << 7); 30 | ++bout; 31 | } else if (val < (1U << 21)) { 32 | *bout = extract7bits<0>(val); 33 | ++bout; 34 | *bout = extract7bits<1>(val); 35 | ++bout; 36 | *bout = extract7bitsmaskless<2>(val) | (1U << 7); 37 | ++bout; 38 | } else if (val < (1U << 28)) { 39 | *bout = extract7bits<0>(val); 40 | ++bout; 41 | *bout = extract7bits<1>(val); 42 | ++bout; 43 | *bout = extract7bits<2>(val); 44 | ++bout; 45 | *bout = extract7bitsmaskless<3>(val) | (1U << 7); 46 | ++bout; 47 | } else { 48 | *bout = extract7bits<0>(val); 49 | ++bout; 50 | *bout = extract7bits<1>(val); 51 | ++bout; 52 | *bout = extract7bits<2>(val); 53 | ++bout; 54 | *bout = extract7bits<3>(val); 55 | ++bout; 56 | *bout = extract7bitsmaskless<4>(val) | (1U << 7); 57 | ++bout; 58 | } 59 | } 60 | nvalue = bout - out; 61 | } 62 | 63 | static void encode_single(uint32_t val, std::vector<uint8_t> &out) { 64 | uint8_t buf[5]; 65 | size_t nvalue; 66 | encode(&val, 1, buf, nvalue); 67 | out.insert(out.end(), buf, buf + nvalue); 68 | } 69 | 70 | static uint8_t const *decode(const uint8_t *in, uint32_t *out, size_t n) { 71 | const uint8_t *inbyte = in; 72 | for (size_t i = 0; i < n; ++i) { 73 | unsigned int shift = 0; 74 | for (uint32_t v = 0;; shift += 7) { 75 | uint8_t c = *inbyte++; 76 | v += ((c & 127) << shift); 77 | if ((c & 128)) { 78 | *out++ = v; 79 | break; 80 | } 81 | } 82 | } 83 | return inbyte; 84 | } 85 | }; 86 | -------------------------------------------------------------------------------- /include/gpu_ic/utils/utils.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018-present Antonio Mallia 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cstddef> 20 | #include <cstdint> 21 | 22 | namespace utils { 23 | 24 | template <typename T> 25 | inline void do_not_optimize_away(T&& datum) { 26 | asm volatile("" : "+r" (datum)); 27 | } 28 | 29 | inline void delta_encode(uint32_t *in, size_t n) { 30 | for (size_t i = n - 1; i > 0; --i) { 31 | in[i] -= in[i - 1]; 32 | } 33 | } 34 | 35 | inline void delta_decode(uint32_t *in, size_t n) { 36 | for (size_t i = 1; i < n; ++i) { 37 | in[i] += in[i - 1]; 38 | } 39 | } 40 | 41 | /* 42 | * Computes the number of bits required to store the given integer value. 43 | */ 44 | inline constexpr uint_fast8_t bits(size_t value) { 45 | return value == 0 ? 
1U : (64 - __builtin_clzll(value)); 46 | } 47 | 48 | } // namespace utils -------------------------------------------------------------------------------- /plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amallia/gpu-integers-compression/a120ed89d4c5eed5761402147525ef480cdcdc93/plot.png -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(compress_index compress_index.cpp) 2 | target_link_libraries(compress_index 3 | gpu_integers_compression 4 | CLI11 5 | ) 6 | 7 | add_executable(perf_decode perf_decode.cpp) 8 | target_link_libraries(perf_decode 9 | gpu_integers_compression 10 | CLI11 11 | ) 12 | 13 | if(BUILD_CUDA) 14 | CUDA_ADD_EXECUTABLE(cuda_compress_index cuda_compress_index.cu) 15 | target_link_libraries(cuda_compress_index 16 | gpu_integers_compression 17 | CLI11 18 | ) 19 | CUDA_ADD_EXECUTABLE(cuda_perf_decode cuda_perf_decode.cu) 20 | target_link_libraries(cuda_perf_decode 21 | gpu_integers_compression 22 | CLI11 23 | ) 24 | endif() 25 | -------------------------------------------------------------------------------- /src/compress_index.cpp: -------------------------------------------------------------------------------- 1 | #include "CLI/CLI.hpp" 2 | #include "../external/FastPFor/headers/codecfactory.h" 3 | #include "gpu_ic/utils/binary_freq_collection.hpp" 4 | #include "gpu_ic/utils/progress.hpp" 5 | #include "gpu_ic/utils/bit_ostream.hpp" 6 | #include "gpu_ic/utils/bit_istream.hpp" 7 | #include "gpu_ic/utils/tight_variable_byte.hpp" 8 | #include "gpu_ic/utils/index.hpp" 9 | #include "gpu_ic/utils/mapper.hpp" 10 | #include "mio/mmap.hpp" 11 | 12 | using namespace gpu_ic; 13 | using namespace FastPForLib; 14 | 15 | template 16 | void verify_index(InputCollection const &input, 17 | const std::string &filename, bool compress_freqs) { 18 | 19 | Codec codec; 20 | gpu_ic::index coll; 21 | mio::mmap_source m; 22 | std::error_code error; 23 | m.map(filename, error); 24 | mapper::map(coll, m); 25 | 26 | { 27 | progress progress("Verify index", input.size()); 28 | 29 | size_t i =0; 30 | for (auto const &plist : input) { 31 | auto docs_it = compress_freqs ? plist.freqs.begin() : plist.docs.begin(); 32 | 33 | std::vector values(plist.docs.size()); 34 | uint32_t last_doc(*docs_it++);; 35 | for (size_t j = 1; j < plist.docs.size(); ++j) { 36 | uint32_t doc(*docs_it++); 37 | if(not compress_freqs){ 38 | values[j] = doc - last_doc - 1; 39 | } 40 | else{ 41 | values[j] = doc - 1; 42 | } 43 | last_doc = doc; 44 | } 45 | 46 | std::vector tmp; 47 | auto n = coll.get_data(tmp, i); 48 | std::vector decode_values(n); 49 | codec.decodeArray(reinterpret_cast(tmp.data()), tmp.size()/4, reinterpret_cast(decode_values.data()), n); 50 | 51 | if(n != plist.docs.size()) 52 | { 53 | std::cerr << "Error: wrong list length. List: " << i << ", size: " << n << ", real_size: " << plist.docs.size() << std::endl; 54 | std::abort(); 55 | } 56 | 57 | for (size_t j = 0; j < n; ++j) { 58 | if(decode_values[j] != values[j]) { 59 | std::cerr << "Error: wrong decoded value. 
List: " << i << ", position: " << j << ", element: " << decode_values[j] << ", real_element: " << values[j] << std::endl; 60 | std::abort(); 61 | } 62 | } 63 | progress.update(1); 64 | i+=1; 65 | } 66 | } 67 | 68 | } 69 | 70 | template 71 | void create_collection(InputCollection const &input, 72 | const std::string &output_filename, 73 | Codec &codec, bool compress_freqs) { 74 | 75 | typename gpu_ic::index::builder builder(input.num_docs()); 76 | size_t postings = 0; 77 | { 78 | progress progress("Create index", input.size()); 79 | 80 | for (auto const &plist : input) { 81 | size_t size = plist.docs.size(); 82 | if(not compress_freqs) { 83 | builder.add_posting_list(size, plist.docs.begin(), codec, compress_freqs); 84 | } 85 | else { 86 | builder.add_posting_list(size, plist.freqs.begin(), codec, compress_freqs); 87 | } 88 | postings += size; 89 | progress.update(1); 90 | } 91 | } 92 | 93 | gpu_ic::index coll; 94 | auto data_len = builder.build(coll); 95 | auto byte= mapper::freeze(coll, output_filename.c_str()); 96 | 97 | 98 | double bits_per_doc = data_len * 8.0 / postings; 99 | std::cout << "Documents: " << postings << ", Total size bytes: " << byte << ", bits/doc: " << bits_per_doc << std::endl; 100 | 101 | verify_index(input, output_filename, compress_freqs); 102 | } 103 | 104 | 105 | int main(int argc, char const *argv[]) 106 | { 107 | std::string type; 108 | std::string input_basename; 109 | std::string output_filename; 110 | bool compress_freqs = false; 111 | 112 | CLI::App app{"compress_index - a tool for compressing an index."}; 113 | app.add_option("-t,--type", type, "Index type")->required(); 114 | app.add_option("-c,--collection", input_basename, "Collection basename")->required(); 115 | app.add_option("-o,--output", output_filename, "Output filename")->required(); 116 | app.add_flag("--freqs", compress_freqs, "Compress freqs instead of docs"); 117 | 118 | CLI11_PARSE(app, argc, argv); 119 | 120 | binary_freq_collection input(input_basename.c_str()); 121 | if (type == "simdbp") { 122 | CompositeCodec codec; 123 | create_collection(input, output_filename, codec, compress_freqs); 124 | } else if (type == "streamvbyte") { 125 | StreamVByte codec; 126 | create_collection(input, output_filename, codec, compress_freqs); 127 | } else if (type == "bp") { 128 | CompositeCodec codec; 129 | create_collection(input, output_filename, codec, compress_freqs); 130 | } else if (type == "varintgb") { 131 | VarIntGB<> codec; 132 | create_collection(input, output_filename, codec, compress_freqs); 133 | } else { 134 | std::cerr << "Unknown type" << std::endl; 135 | } 136 | 137 | return 0; 138 | } 139 | -------------------------------------------------------------------------------- /src/cuda_compress_index.cu: -------------------------------------------------------------------------------- 1 | #include "CLI/CLI.hpp" 2 | #include "gpu_ic/utils/binary_freq_collection.hpp" 3 | #include "gpu_ic/utils/progress.hpp" 4 | #include "gpu_ic/utils/tight_variable_byte.hpp" 5 | #include "gpu_ic/utils/index.cuh" 6 | #include "gpu_ic/utils/mapper.hpp" 7 | #include "mio/mmap.hpp" 8 | #include "gpu_ic/cuda_bp.cuh" 9 | #include "gpu_ic/cuda_vbyte.cuh" 10 | 11 | using namespace gpu_ic; 12 | 13 | template 14 | void verify_index(InputCollection const &input, 15 | const std::string &filename, Decoder decoder_function, bool compress_freqs) { 16 | 17 | // Codec codec; 18 | gpu_ic::index coll; 19 | mio::mmap_source m; 20 | std::error_code error; 21 | m.map(filename, error); 22 | mapper::map(coll, m); 23 | 24 | { 25 | 
progress progress("Verify index", input.size()); 26 | 27 | size_t i =0; 28 | for (auto const &plist : input) { 29 | auto docs_it = compress_freqs ? plist.freqs.begin() : plist.docs.begin(); 30 | 31 | std::vector values(plist.docs.size()); 32 | uint32_t last_doc(*docs_it++);; 33 | for (size_t j = 1; j < plist.docs.size(); ++j) { 34 | uint32_t doc(*docs_it++); 35 | if(not compress_freqs) { 36 | values[j] = doc - last_doc - 1; 37 | } else { 38 | values[j] = doc - 1; 39 | } 40 | last_doc = doc; 41 | } 42 | 43 | std::vector tmp; 44 | auto n = coll.get_data(tmp, i); 45 | std::vector decode_values(n); 46 | 47 | CUDA_CHECK_ERROR(cudaSetDevice(0)); 48 | warmUpGPU<<<1, 1>>>(); 49 | 50 | uint8_t * d_encoded; 51 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_encoded, tmp.size() * sizeof(uint8_t))); 52 | CUDA_CHECK_ERROR(cudaMemcpy(d_encoded, tmp.data(), tmp.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); 53 | 54 | uint32_t * d_decoded; 55 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_decoded, values.size() * sizeof(uint32_t))); 56 | decoder_function(d_decoded, d_encoded, decode_values.size()); 57 | CUDA_CHECK_ERROR(cudaMemcpy(decode_values.data(), d_decoded, values.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 58 | 59 | cudaFree(d_encoded); 60 | cudaFree(d_decoded); 61 | 62 | if(n != plist.docs.size()) 63 | { 64 | std::cerr << "Error: wrong list length. List: " << i << ", size: " << n << ", real_size: " << plist.docs.size() << std::endl; 65 | std::abort(); 66 | } 67 | 68 | for (size_t j = 0; j < n; ++j) { 69 | if(decode_values[j] != values[j]) { 70 | std::cerr << "Error: wrong decoded value. List: " << i << ", position: " << j << ", element: " << decode_values[j] << ", real_element: " << values[j] << std::endl; 71 | std::abort(); 72 | } 73 | } 74 | progress.update(1); 75 | i+=1; 76 | } 77 | } 78 | 79 | } 80 | 81 | template 82 | void create_collection(InputCollection const &input, 83 | const std::string &output_filename, 84 | Encoder &encoder_function, Decoder &decoder_function, bool compress_freqs) { 85 | 86 | typename gpu_ic::index::builder builder(input.num_docs()); 87 | size_t postings = 0; 88 | { 89 | progress progress("Create index", input.size()); 90 | 91 | for (auto const &plist : input) { 92 | size_t size = plist.docs.size(); 93 | if(not compress_freqs) { 94 | builder.add_posting_list(size, plist.docs.begin(), encoder_function, compress_freqs); 95 | } 96 | else { 97 | builder.add_posting_list(size, plist.freqs.begin(), encoder_function, compress_freqs); 98 | } 99 | 100 | postings += size; 101 | progress.update(1); 102 | } 103 | } 104 | 105 | gpu_ic::index coll; 106 | auto data_len = builder.build(coll); 107 | auto byte= mapper::freeze(coll, output_filename.c_str()); 108 | 109 | 110 | double bits_per_doc = data_len * 8.0 / postings; 111 | std::cout << "Documents: " << postings << ", total size bytes: " << byte << ", bits/doc: " << bits_per_doc << std::endl; 112 | 113 | verify_index(input, output_filename, decoder_function, compress_freqs); 114 | } 115 | 116 | 117 | int main(int argc, char** argv) 118 | { 119 | std::string type; 120 | std::string input_basename; 121 | std::string output_filename; 122 | bool compress_freqs = false; 123 | 124 | CLI::App app{"compress_index - a tool for compressing an index."}; 125 | app.add_option("-t,--type", type, "Index type")->required(); 126 | app.add_option("-c,--collection", input_basename, "Collection basename")->required(); 127 | app.add_option("-o,--output", output_filename, "Output filename")->required(); 128 | app.add_flag("--freqs", compress_freqs, 
"Compress freqs instead of docs"); 129 | CLI11_PARSE(app, argc, argv); 130 | 131 | binary_freq_collection input(input_basename.c_str()); 132 | if (type == "cuda_bp") { 133 | create_collection(input, output_filename, cuda_bp::encode<>, cuda_bp::decode<>, compress_freqs); 134 | } else if (type == "cuda_bp64") { 135 | create_collection(input, output_filename, cuda_bp::encode<64>, cuda_bp::decode<64>, compress_freqs); 136 | } else if (type == "cuda_bp128") { 137 | create_collection(input, output_filename, cuda_bp::encode<128>, cuda_bp::decode<128>, compress_freqs); 138 | } else if (type == "cuda_bp256") { 139 | create_collection(input, output_filename, cuda_bp::encode<256>, cuda_bp::decode<256>, compress_freqs); 140 | } else if (type == "cuda_bp512") { 141 | create_collection(input, output_filename, cuda_bp::encode<512>, cuda_bp::decode<512>, compress_freqs); 142 | } else if (type == "cuda_bp1024") { 143 | create_collection(input, output_filename, cuda_bp::encode<1024>, cuda_bp::decode<1024>, compress_freqs); 144 | } else if (type == "cuda_vbyte") { 145 | create_collection(input, output_filename, cuda_vbyte::encode<>, cuda_vbyte::decode<>, compress_freqs); 146 | } else if (type == "cuda_vbyte1024") { 147 | create_collection(input, output_filename, cuda_vbyte::encode<1024>, cuda_vbyte::decode<1024>, compress_freqs); 148 | } else { 149 | std::cerr << "Unknown type" << std::endl; 150 | } 151 | 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /src/cuda_perf_decode.cu: -------------------------------------------------------------------------------- 1 | #include "CLI/CLI.hpp" 2 | #include "gpu_ic/cuda_bp.cuh" 3 | #include "gpu_ic/cuda_vbyte.cuh" 4 | 5 | #include "gpu_ic/utils/binary_freq_collection.hpp" 6 | #include "gpu_ic/utils/progress.hpp" 7 | #include "gpu_ic/utils/tight_variable_byte.hpp" 8 | #include "gpu_ic/utils/index.cuh" 9 | #include "gpu_ic/utils/mapper.hpp" 10 | #include "gpu_ic/utils/utils.hpp" 11 | #include "mio/mmap.hpp" 12 | #include 13 | #include 14 | 15 | using namespace gpu_ic; 16 | using clock_type = std::chrono::high_resolution_clock; 17 | 18 | template 19 | void perftest(const std::string &filename, Decoder &decoder_function, const std::vector &terms) 20 | { 21 | gpu_ic::index coll; 22 | mio::mmap_source m; 23 | std::error_code error; 24 | m.map(filename, error); 25 | mapper::map(coll, m); 26 | 27 | std::vector>> long_lists; 28 | long_lists.reserve(terms.size()); 29 | for(auto&& t :terms) { 30 | std::vector tmp; 31 | auto n = coll.get_data(tmp, t); 32 | long_lists.push_back(std::make_pair(n, tmp)); 33 | } 34 | CUDA_CHECK_ERROR(cudaSetDevice(0)); 35 | warmUpGPU<<<1, 1>>>(); 36 | std::cout << "Scanning " << long_lists.size() << " posting lists" << std::endl; 37 | std::chrono::duration elapsed(0); 38 | size_t postings = 0; 39 | for (auto i: long_lists) { 40 | uint8_t * d_encoded; 41 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_encoded, i.second.size() * sizeof(uint8_t))); 42 | CUDA_CHECK_ERROR(cudaMemcpy(d_encoded, i.second.data(), i.second.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); 43 | 44 | std::vector decode_values(i.first); 45 | uint32_t * d_decoded; 46 | CUDA_CHECK_ERROR(cudaMalloc((void **)&d_decoded, decode_values.size() * sizeof(uint32_t))); 47 | CUDA_CHECK_ERROR(cudaDeviceSynchronize()); 48 | 49 | auto start = clock_type::now(); 50 | decoder_function(d_decoded, d_encoded, decode_values.size()); 51 | cudaDeviceSynchronize(); 52 | auto end = clock_type::now(); 53 | 54 | 
CUDA_CHECK_ERROR(cudaMemcpy(decode_values.data(), d_decoded, decode_values.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost)); 55 | 56 | CUDA_CHECK_ERROR(cudaFree(d_encoded)); 57 | CUDA_CHECK_ERROR(cudaFree(d_decoded)); 58 | 59 | elapsed += end - start; 60 | 61 | for (size_t j = 0; j < i.first; ++j) { 62 | // std::cerr << decode_values[j] << std::endl; 63 | utils::do_not_optimize_away(decode_values[j]); 64 | } 65 | postings += decode_values.size(); 66 | } 67 | 68 | double next_ns = elapsed.count() / postings * 1000000000; 69 | double b_int_s = postings / elapsed.count() / 1000000; 70 | std::cout << "Performed " << postings << " next()" 71 | << " in " << elapsed.count() << " [sec], " 72 | << std::fixed << std::setprecision(2) 73 | << next_ns << " [ns] x posting, " 74 | << b_int_s << " M ints/sec" 75 | << std::endl; 76 | 77 | } 78 | 79 | bool read_query(std::vector &ret, std::istream &is = std::cin) { 80 | ret.clear(); 81 | std::string line; 82 | if (!std::getline(is, line)) 83 | return false; 84 | std::istringstream iline(line); 85 | uint32_t term_id; 86 | while (iline >> term_id) { 87 | ret.push_back(term_id); 88 | } 89 | return true; 90 | } 91 | 92 | int main(int argc, char const *argv[]) 93 | { 94 | std::string type; 95 | std::string index_basename; 96 | std::string query_basename; 97 | 98 | CLI::App app{"compress_index - a tool for compressing an index."}; 99 | app.add_option("-t,--type", type, "Index type")->required(); 100 | app.add_option("-i,--index", index_basename, "Index basename")->required(); 101 | app.add_option("-q,--query", query_basename, "Query basename")->required(); 102 | CLI11_PARSE(app, argc, argv); 103 | 104 | std::vector terms; 105 | std::filebuf fb; 106 | size_t queries_num = 0; 107 | if (fb.open(query_basename, std::ios::in)) { 108 | std::istream is(&fb); 109 | std::vector q; 110 | while (read_query(q, is)) { 111 | queries_num+=1; 112 | terms.insert(terms.end(), q.begin(), q.end()); 113 | } 114 | } 115 | if (type == "cuda_bp") { 116 | perftest(index_basename, cuda_bp::decode<>, terms); 117 | } else if (type == "cuda_bp64") { 118 | perftest(index_basename, cuda_bp::decode<64>, terms); 119 | } else if (type == "cuda_bp128") { 120 | perftest(index_basename, cuda_bp::decode<128>, terms); 121 | } else if (type == "cuda_bp256") { 122 | perftest(index_basename, cuda_bp::decode<256>, terms); 123 | } else if (type == "cuda_bp512") { 124 | perftest(index_basename, cuda_bp::decode<512>, terms); 125 | } else if (type == "cuda_bp1024") { 126 | perftest(index_basename, cuda_bp::decode<1024>, terms); 127 | } else if (type == "cuda_vbyte") { 128 | perftest(index_basename, cuda_vbyte::decode<>, terms); 129 | } else if (type == "cuda_vbyte1024") { 130 | perftest(index_basename, cuda_vbyte::decode<1024>, terms); 131 | } else { 132 | std::cerr << "Unknown type" << std::endl; 133 | } 134 | 135 | // std::cout << "Queries: " << queries_num << ", terms: " << terms.size() << std::endl; 136 | 137 | 138 | return 0; 139 | } 140 | -------------------------------------------------------------------------------- /src/perf_decode.cpp: -------------------------------------------------------------------------------- 1 | #include "CLI/CLI.hpp" 2 | #include "../external/FastPFor/headers/codecfactory.h" 3 | #include "gpu_ic/utils/binary_freq_collection.hpp" 4 | #include "gpu_ic/utils/progress.hpp" 5 | #include "gpu_ic/utils/bit_ostream.hpp" 6 | #include "gpu_ic/utils/bit_istream.hpp" 7 | #include "gpu_ic/utils/tight_variable_byte.hpp" 8 | #include "gpu_ic/utils/index.hpp" 9 | #include 
"gpu_ic/utils/mapper.hpp" 10 | #include "gpu_ic/utils/utils.hpp" 11 | #include "mio/mmap.hpp" 12 | #include 13 | #include 14 | 15 | using namespace gpu_ic; 16 | using namespace FastPForLib; 17 | using clock_type = std::chrono::high_resolution_clock; 18 | 19 | template 20 | void perftest(const std::string &filename, const std::vector &terms) 21 | { 22 | Codec codec; 23 | gpu_ic::index coll; 24 | mio::mmap_source m; 25 | std::error_code error; 26 | m.map(filename, error); 27 | mapper::map(coll, m); 28 | 29 | std::vector>> long_lists; 30 | long_lists.reserve(terms.size()); 31 | for(auto&& t : terms) { 32 | std::vector tmp; 33 | auto n = coll.get_data(tmp, t); 34 | long_lists.push_back(std::make_pair(n, tmp)); 35 | } 36 | 37 | std::cout << "Scanning " << long_lists.size() << " posting list" << std::endl; 38 | std::chrono::duration elapsed(0); 39 | size_t postings = 0; 40 | for (auto i: long_lists) { 41 | std::vector decode_values(i.first); 42 | auto start = clock_type::now(); 43 | size_t n = 0; 44 | codec.decodeArray(reinterpret_cast(i.second.data()), i.second.size()/4, reinterpret_cast(decode_values.data()), n); 45 | auto end = clock_type::now(); 46 | elapsed += end - start; 47 | if(n != i.first) { 48 | std::cerr << "Error: number of decoded values " << n << ", actual number of values" << i.first << std::endl; 49 | } 50 | for (size_t i = 0; i < n; ++i) { 51 | utils::do_not_optimize_away(decode_values[i]); 52 | } 53 | postings += decode_values.size(); 54 | } 55 | 56 | double next_ns = elapsed.count() / postings * 1000000000; 57 | double b_int_s = postings / elapsed.count() / 1000000; 58 | std::cout << "Performed " << postings << " next()" 59 | << " in " << elapsed.count() << " [sec], " 60 | << std::fixed << std::setprecision(2) 61 | << next_ns << " [ns] x posting, " 62 | << b_int_s << " M ints/sec" 63 | << std::endl; 64 | 65 | } 66 | 67 | bool read_query(std::vector &ret, std::istream &is = std::cin) { 68 | ret.clear(); 69 | std::string line; 70 | if (!std::getline(is, line)) 71 | return false; 72 | std::istringstream iline(line); 73 | uint32_t term_id; 74 | while (iline >> term_id) { 75 | ret.push_back(term_id); 76 | } 77 | return true; 78 | } 79 | 80 | int main(int argc, char const *argv[]) 81 | { 82 | std::string type; 83 | std::string index_basename; 84 | std::string query_basename; 85 | 86 | CLI::App app{"compress_index - a tool for compressing an index."}; 87 | app.add_option("-t,--type", type, "Index type")->required(); 88 | app.add_option("-i,--index", index_basename, "Index basename")->required(); 89 | app.add_option("-q,--query", query_basename, "Query basename")->required(); 90 | CLI11_PARSE(app, argc, argv); 91 | 92 | std::vector terms; 93 | std::filebuf fb; 94 | size_t queries_num = 0; 95 | if (fb.open(query_basename, std::ios::in)) { 96 | std::istream is(&fb); 97 | std::vector q; 98 | while (read_query(q, is)) { 99 | queries_num+=1; 100 | terms.insert(terms.end(), q.begin(), q.end()); 101 | } 102 | } 103 | 104 | if (type == "simdbp") { 105 | CompositeCodec codec; 106 | perftest(index_basename, terms); 107 | } else if (type == "streamvbyte") { 108 | StreamVByte codec; 109 | perftest(index_basename, terms); 110 | } else if (type == "bp") { 111 | CompositeCodec codec; 112 | perftest(index_basename, terms); 113 | } else if (type == "varintgb") { 114 | VarIntGB<> codec; 115 | perftest(index_basename, terms); 116 | } else { 117 | std::cerr << "Unknown type" << std::endl; 118 | } 119 | 120 | 121 | 122 | // std::cout << "Queries: " << queries_num << ", terms: " << terms.size() << 
std::endl; 123 | 124 | 125 | return 0; 126 | } 127 | -------------------------------------------------------------------------------- /test_data/queries: -------------------------------------------------------------------------------- 1 | 101587 61936 2 | 40429 86328 3 | 13975 94987 102912 75488 86157 4 | 80811 110278 90269 96541 5 | 33726 6 | 78401 68238 7 | 59451 82510 8 | 110622 102912 53265 66945 43418 101818 99022 54523 54209 9 | 67842 54513 67848 10 | 55900 91909 11 | 51079 89883 12 | 38616 96982 13 | 97986 43403 14 | 106967 75552 59184 15 | 86328 82481 95555 80147 16 | 101785 47930 17 | 44232 103219 18 | 90882 72383 19 | 48145 68857 20 | 73102 55872 68283 21 | 43460 110362 22 | 46586 23 | 47320 33596 24 | 101682 72197 25 | 62885 43748 26 | 110278 44879 27 | 62574 93388 40150 68583 28 | 102046 74112 29 | 65953 111200 30 | 101365 17496 110642 53842 31 | 82777 83431 41152 44915 32 | 60341 49248 34323 95878 67486 75119 33 | 102133 112621 65989 34 | 60740 78250 62198 35 | 60392 75877 86281 36 | 67574 37 | 33856 88404 38 | 40975 39 | 97369 110949 40 | 110717 76695 110770 74156 102912 54599 42353 111450 41 | 73411 82481 72583 79520 46235 42 | 40013 42353 42958 106267 43 | 51571 51834 82481 91489 44 | 46410 47753 45 | 81496 33252 59377 46 | 80219 72531 82632 47 | 61559 110479 71821 48 | 46352 86758 75773 49 | 105328 44427 5924 86157 50 | 82607 103402 98558 51 | 59519 47436 39332 52 | 105329 61936 53 | 74447 49248 49285 54 | 67262 62044 105677 67262 62044 105677 96886 55 | 113086 52033 56 | 69774 67486 50806 57 | 43974 96023 91015 58 | 62557 59 | 86738 96807 40429 59978 57905 60 | 99001 78599 61 | 65446 91071 50240 93962 111030 62 | 55612 111457 63 | 82620 79303 111530 102324 97353 68820 34390 112715 66631 71126 69016 64 | 97366 85132 65 | 92457 91889 66 | 111200 67486 84677 67 | 86157 93388 82481 61684 41505 70086 8468 48343 68 | 111450 93388 91851 67486 94022 38961 69 | 102133 97188 47852 70 | 60392 44792 47436 39332 71 | 109782 78596 68754 42738 72 | 71780 82481 102046 91015 65989 73 | 102503 62083 74 | 34247 44390 75 | 42771 63843 76 | 93479 77 | 110622 68820 102133 47977 82481 102133 43302 78 | 45777 102533 61690 79 | 58684 42983 80 | 102133 39983 82481 77197 34202 76695 81 | 91753 40749 82 | 47487 57873 62029 83 | 33229 44941 84 | 69805 31550 42004 85 | 79610 98398 86 | 30300 32436 71869 59978 93645 94610 106016 87 | 90013 88 | 86938 74830 44915 89 | 69359 100705 58774 78596 102889 90 | 58625 48720 82481 78623 91 | 78500 49248 80811 90144 56796 92 | 33708 42738 93 | 93788 70008 93879 102339 96015 94 | 68581 58195 59978 53338 34202 88081 95 | 69805 96470 93944 96 | 102133 72383 48169 67558 97 | 110622 68820 53187 98 | 100035 68289 45194 79365 99 | 56694 100 | 92489 84496 47977 101 | 47675 44915 102 | 74156 53113 103 | 48792 82481 44782 44145 82481 104965 88209 104 | 50265 53793 95978 105 | 54599 34202 76462 106 | 52857 107 | 57681 47478 108 | 66014 102912 48819 58131 68462 98077 59953 109 | 60120 34194 110 | 64274 69016 83392 74156 69016 111 | 80432 102046 80422 112 | 51590 94716 79520 113 | 60317 75609 79072 114 | 76897 73806 115 | 101585 95555 116 | 99306 68335 68551 117 | 91214 95878 59978 96921 53338 93388 67486 77217 118 | 102133 57803 93670 44596 119 | 69571 74156 65063 120 | 57729 47034 101846 45930 121 | 70609 65356 122 | 71712 89029 105677 40967 99737 123 | 97478 32942 90144 56796 124 | 99856 42059 111730 125 | 101268 80644 126 | 75039 101681 127 | 81398 55245 84949 104433 128 | 68820 102133 40683 80689 51060 31550 46819 40683 129 | 93959 97650 33229 95458 130 | 61690 62697 131 
| 110278 44713 132 | 93788 69773 49248 49285 94399 133 | 52165 78514 89883 134 | 47089 82481 75567 135 | 58663 58634 69640 136 | 53889 105983 96013 105677 67486 59951 137 | 68645 58676 95458 103402 44145 138 | 59451 97116 139 | 90435 56089 88388 47753 140 | 66631 72410 141 | 65814 63815 75496 142 | 104388 44850 33229 47302 143 | 102555 87083 95997 91738 144 | 78567 65741 59978 93645 145 | 84890 98474 56035 146 | 66945 112832 74156 97319 95496 102176 60392 147 | 106967 33286 148 | 38616 76506 86773 149 | 98388 63026 150 | 69800 76231 151 | 60392 82179 82481 71537 152 | 41996 110299 153 | 42585 78960 46337 154 | 104317 72842 83942 57392 155 | 49314 47382 156 | 49251 59940 67558 157 | 102133 41029 47521 64342 158 | 44297 71101 159 | 96035 63790 99413 160 | 80377 83553 71627 161 | 33385 103552 51209 162 | 32556 34202 57081 163 | 104322 103029 164 | 79660 103590 98779 87320 165 | 62029 30298 4807 96598 26877 6386 46406 47487 166 | 87869 111161 80913 68238 167 | 98289 85861 98077 168 | 88154 110278 68583 60392 169 | 97600 96472 96062 170 | 80377 112825 47089 89876 88225 171 | 86000 101610 67910 172 | 54191 58195 173 | 106830 82481 104506 76023 58520 174 | 100072 175 | 64131 51040 92214 101985 176 | 86537 60870 177 | 88435 110278 32606 178 | 81950 47436 39332 179 | 100437 87304 100018 180 | 72377 87092 181 | 42250 182 | 44241 59978 93645 67486 59451 183 | 104801 98449 184 | 65447 82481 95754 92013 80811 185 | 31550 109770 82984 102133 78408 78623 186 | 54550 67486 67203 8802 187 | 110770 84205 43628 75415 38658 61157 100705 188 | 74433 49248 101444 38817 189 | 69571 61327 190 | 56809 41152 191 | 41911 103874 192 | 50855 82984 45058 47750 193 | 72231 54729 194 | 41173 195 | 105871 62567 62697 59978 54935 196 | 49806 65959 197 | 46444 47487 198 | 60845 91919 199 | 32007 200 | 102095 112839 201 | 43302 44961 73912 202 | 110349 45930 203 | 57771 64563 110245 96541 204 | 97919 48164 102749 205 | 94508 59978 63248 206 | 91705 73102 50393 207 | 60392 98612 101985 47427 67203 208 | 110786 86769 39667 109901 103219 2671 209 | 63254 47673 97604 210 | 82697 75944 103402 211 | 97593 34202 62207 47753 59978 46369 212 | 71089 34175 213 | 33300 76282 214 | 85795 33745 215 | 65959 80377 112825 216 | 48754 89457 50481 97213 217 | 45286 218 | 77016 65807 219 | 93959 89635 220 | 4542 84803 221 | 65543 222 | 105922 80724 60551 86294 105677 103960 223 | 69628 42585 33229 73293 224 | 106928 47521 67701 110389 225 | 96585 51814 226 | 109945 94508 82481 88549 227 | 59995 89564 49516 55913 228 | 98449 87992 69227 40277 85111 229 | 49254 65741 73764 230 | 46248 12608 60458 231 | 102133 51198 113242 232 | 45612 76695 233 | 43422 64630 234 | 86157 71692 75182 235 | 68689 62558 236 | 85853 60484 91015 67486 80377 69613 237 | 66333 96160 238 | 111542 39667 65741 239 | 65741 57563 73126 240 | 90595 97823 53778 45773 93388 79216 241 | 8859 242 | 66309 97919 243 | 84734 94508 60458 44782 244 | 34281 67486 33941 245 | 80377 69613 53287 246 | 46556 86987 247 | 69032 55929 52484 248 | 95458 60836 65741 88572 249 | 80913 46579 72575 41346 32477 250 | 70461 251 | 63837 93388 49024 78067 252 | 42353 51339 253 | 93176 254 | 97489 84852 255 | 84672 89564 256 | 67558 257 | 27281 258 | 47647 80143 78250 259 | 97863 32177 260 | 112916 61891 82620 261 | 39717 40712 64889 262 | 39749 80410 263 | 111497 49248 264 | 105871 51834 82481 54935 83069 85130 265 | 61508 109936 102679 266 | 111457 85054 267 | 33883 268 | 72160 95997 269 | 87439 82519 270 | 39363 100394 84617 271 | 57929 105065 68394 272 | 34267 273 | 75721 98492 42738 82481 49134 274 
| 77175 38658 275 | 98955 97248 96035 276 | 48062 99737 93880 47223 277 | 65741 86670 278 | 85319 71012 279 | 79365 104515 40277 66631 71573 54383 93388 79365 54383 100705 280 | 85853 64590 281 | 67567 282 | 94508 113242 44381 32606 283 | 59451 98492 53114 82481 49053 284 | 110952 80377 69613 82179 58076 9365 285 | 103393 84803 67486 47979 97986 286 | 32942 33944 87059 96541 287 | 111530 102912 77013 97353 288 | 44941 79216 45804 289 | 64169 51039 290 | 46607 100605 59978 79216 291 | 98705 45882 34202 111635 64585 292 | 32747 31550 85853 293 | 103368 63248 294 | 52853 112626 295 | 86783 72044 59439 296 | 49066 95458 297 | 44596 98492 48757 101985 298 | 43653 65886 96216 93536 299 | 63864 9072 103744 300 | 69032 84983 95868 99381 301 | 90640 102142 106822 80377 112825 302 | 47521 76492 303 | 80377 69613 80410 304 | 59951 99019 305 | 99373 67486 78960 306 | 110684 111455 79303 69453 53612 73754 307 | 43987 86092 308 | 34522 84496 49472 309 | 70624 310 | 102339 34202 74890 39919 48343 311 | 83993 48669 91087 312 | 51400 49583 313 | 106403 47089 99045 314 | 65959 55753 71627 75361 315 | 85065 89402 47930 316 | 42490 317 | 91013 102912 44347 60870 318 | 60392 88156 53847 319 | 76756 41520 104515 93388 31550 320 | 112799 41183 68820 96935 102181 102133 321 | 98819 49251 79216 322 | 95793 96987 323 | 80811 74112 324 | 40157 76848 43843 79303 101688 325 | 68271 101635 326 | 87885 64601 327 | 45967 104367 83015 60120 79315 328 | 9174 59995 57368 329 | 69553 106830 330 | 63974 331 | 84852 51834 78464 106255 332 | 54264 96107 333 | 104405 96293 48186 59978 110677 83392 81520 102265 334 | 99045 110663 102265 95217 78960 67486 69297 77095 335 | 47977 67266 336 | 44782 80410 84689 337 | 74964 64286 39332 338 | 79610 64619 9174 46410 47753 339 | 52853 65959 340 | 61566 70878 341 | 112601 79303 92489 342 | 94679 41646 32241 343 | 41650 83906 78567 344 | 55987 74044 63248 345 | 44878 92539 93143 346 | 56498 77200 347 | 39750 97650 348 | 87869 111450 349 | 60392 85801 94916 350 | 60830 351 | 60392 57206 82481 88464 111542 90847 352 | 93959 78586 353 | 69805 13974 6756 354 | 71860 355 | 86110 45512 356 | 56640 49248 94508 53047 357 | 32804 34202 75808 358 | 86610 19275 359 | 89012 360 | 40240 74112 361 | 66014 102912 68183 31550 45860 57755 362 | 97578 111530 59978 60484 363 | 61293 82481 87731 364 | 51656 97353 365 | 60612 366 | 90144 56796 67486 63326 82713 367 | 77811 60870 368 | 93925 76278 369 | 94904 85497 370 | 102168 371 | 95831 88277 372 | 86113 96015 373 | 80811 65959 374 | 75799 65907 86157 375 | 42395 84494 376 | 8682 102476 377 | 64756 70537 378 | 68910 379 | 97356 48943 380 | 94679 381 | 89613 63041 67486 42857 66839 382 | 105922 82549 88153 87992 383 | 41650 105871 41152 384 | 69291 54520 63814 385 | 91754 74719 110639 386 | 71730 49248 58828 387 | 48928 43179 63334 388 | 89621 98558 32804 87398 83459 389 | 59451 71899 101813 47753 390 | 73414 102912 50745 93388 78250 44381 91787 391 | 69227 78542 392 | 102133 104677 78266 80410 84689 393 | 89283 65959 110507 103834 32807 394 | 112590 46758 53831 48169 395 | 99008 89437 60535 78623 396 | 80377 112825 95668 80643 47521 397 | 92739 398 | 102339 91795 82984 103402 399 | 66204 49248 49285 400 | 105922 102133 61091 52558 13828 91223 42958 401 | 93190 50806 402 | 103189 101119 85189 403 | 99378 63218 59978 31550 58181 87083 63176 404 | 71101 405 | 59451 91355 83446 406 | 85003 82481 48062 96921 86799 59978 85314 407 | 70911 408 | 82834 111457 409 | 40467 46414 410 | 53778 76438 411 | 82070 69904 80410 412 | 66246 81952 413 | 66014 102912 48510 31550 
97640 95173 414 | 86157 82481 62521 63041 102133 110949 96530 415 | 103446 416 | 64855 60146 417 | 73885 44611 39332 418 | 34393 49569 91087 419 | 92007 59954 420 | 68789 95458 421 | 55364 75285 72096 33432 422 | 48731 33252 423 | 71102 58520 53718 86328 424 | 61647 34202 47415 56096 67486 102133 54523 74719 425 | 46630 106255 94508 426 | 82620 79303 75285 66631 74156 65357 97142 427 | 34281 67486 33941 89437 428 | 52672 104474 70970 429 | 54577 62065 430 | 53573 431 | 52309 44879 432 | 103552 67486 43388 433 | 91754 33634 434 | 59978 75428 82915 80081 435 | 88154 75471 102912 47977 82481 33353 436 | 61625 103347 437 | 44893 67486 102133 32674 438 | 73783 66358 439 | 81507 75476 440 | 10647 42254 66853 441 | 110622 66945 112850 53338 61784 42284 44381 31550 102533 442 | 48669 88081 67695 443 | 48343 67486 41597 67702 444 | 52079 59451 65664 68070 41158 445 | 90953 109804 446 | 89575 60535 102133 90144 111591 447 | 32556 84936 448 | 96392 46410 449 | 102133 50714 106909 47753 78623 450 | 55245 53484 49285 451 | 110291 452 | 94680 44882 44056 57457 66113 103219 453 | 98492 40732 82481 102046 454 | 60392 102116 112832 45169 455 | 46535 86587 39212 81926 456 | 111389 457 | 41389 49248 71537 61559 458 | 110230 48030 75739 74830 459 | 69876 51553 106251 90144 98492 39842 460 | 90144 39012 461 | 51039 43703 462 | 85497 98558 91767 463 | 97425 51021 87059 464 | 47089 82481 83588 33353 465 | 65959 34202 78553 59978 93645 67486 97478 102046 466 | 57916 49248 85515 88846 90374 467 | 88374 53793 468 | 65938 79568 58828 469 | 80926 85619 470 | 85975 102535 471 | 106830 61241 40467 472 | 83918 40702 91015 88438 473 | 80443 474 | 44189 44824 475 | 46556 48087 88438 476 | 33972 80525 477 | 65768 478 | 46328 34202 102133 46758 57613 78623 479 | 61684 45612 480 | 65357 481 | 46999 96987 90144 56796 33003 482 | 96267 32199 483 | 49569 45169 40150 88323 484 | 107372 40601 23611 485 | 46406 59377 486 | 56006 39992 67486 487 | 65356 86281 488 | 43616 95458 489 | 68565 94045 40702 490 | 86066 68381 33262 491 | 106170 65745 492 | 53576 55403 493 | 79075 494 | 85577 43189 495 | 46414 93766 496 | 96392 61241 497 | 101688 46344 498 | 45147 58429 96216 96676 499 | 39485 49251 500 | 43537 501 | -------------------------------------------------------------------------------- /test_data/test_collection.docs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amallia/gpu-integers-compression/a120ed89d4c5eed5761402147525ef480cdcdc93/test_data/test_collection.docs -------------------------------------------------------------------------------- /test_data/test_collection.freqs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amallia/gpu-integers-compression/a120ed89d4c5eed5761402147525ef480cdcdc93/test_data/test_collection.freqs --------------------------------------------------------------------------------
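The snippet below is a small round-trip sketch for the host-side utilities listed above (tight_variable_byte together with utils::delta_encode/delta_decode). It is not part of the repository sources; it assumes the project's include/ directory is on the compiler's include path (so that the "gpu_ic/utils/..." paths resolve) and that the headers carry their original template arguments (e.g. std::vector<uint8_t> in encode_single), which were stripped in parts of this dump. The docid values are made-up test input.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

#include "gpu_ic/utils/tight_variable_byte.hpp"
#include "gpu_ic/utils/utils.hpp"

int main() {
    // A small, strictly increasing docid list.
    std::vector<uint32_t> docs = {3, 7, 130, 16511, 16512};
    const std::vector<uint32_t> original = docs;

    // Turn the docids into d-gaps in place (the first element is kept as-is).
    utils::delta_encode(docs.data(), docs.size());

    // Variable-byte encode each gap; the high bit marks the last byte of a value.
    std::vector<uint8_t> buf;
    for (uint32_t gap : docs) {
        tight_variable_byte::encode_single(gap, buf);
    }

    // Decode the byte stream and undo the delta encoding.
    std::vector<uint32_t> decoded(docs.size());
    tight_variable_byte::decode(buf.data(), decoded.data(), decoded.size());
    utils::delta_decode(decoded.data(), decoded.size());

    // The round trip must reproduce the original docids exactly.
    for (std::size_t i = 0; i < original.size(); ++i) {
        assert(decoded[i] == original[i]);
    }
    std::cout << "Round-tripped " << original.size() << " docids in " << buf.size() << " bytes\n";
    return 0;
}

Note that the index-building and verification code in compress_index.cpp and cuda_compress_index.cu forms its gaps as doc - last_doc - 1, i.e. it also subtracts one; the sketch above uses the plain differences produced by utils::delta_encode.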