├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── data └── test_sequence ├── external └── CMakeLists.txt ├── include ├── builder.hpp ├── building_util.hpp ├── constants.hpp ├── contains.hpp ├── decode.hpp ├── enumerator.hpp ├── intersection.hpp ├── intersection_many.hpp ├── next_geq.hpp ├── next_geq_enumerator.hpp ├── s_index.hpp ├── s_sequence.hpp ├── select.hpp ├── table.hpp ├── uncompress.hpp ├── uncompress_chunk_and_intersect.hpp ├── uncompress_chunk_and_merge.hpp ├── union.hpp ├── union_many.hpp └── util.hpp ├── script ├── build.py └── queries.py ├── src ├── CMakeLists.txt ├── build.cpp ├── cardinality.cpp ├── contains.cpp ├── decode.cpp ├── example.cpp ├── intersect.cpp ├── next_geq.cpp ├── select.cpp ├── uncompress.cpp └── union.cpp ├── statistics └── README.md ├── test ├── CMakeLists.txt ├── test_common.hpp ├── test_contains.cpp ├── test_decode.cpp ├── test_enumerator.cpp ├── test_intersect.cpp ├── test_intersect_many.cpp ├── test_next_geq.cpp ├── test_next_geq_enumerator.cpp ├── test_select.cpp ├── test_uncompress.cpp ├── test_union.cpp └── test_union_many.cpp └── tools ├── CMakeLists.txt ├── gen_clustered_data.cpp ├── gen_random_next_geq_queries.cpp ├── gen_random_pairwise_queries.cpp ├── gen_random_select_queries.cpp └── gen_uniform_data.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -4 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: true 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Empty 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | 
AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeComma 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: true 45 | BreakConstructorInitializers: BeforeComma 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 80 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: false 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^' 66 | Priority: 2 67 | - Regex: '^<.*\.h>' 68 | Priority: 1 69 | - Regex: '^<.*' 70 | Priority: 2 71 | - Regex: '.*' 72 | Priority: 3 73 | IncludeIsMainRegex: '([-_](test|unittest))?$' 74 | IndentCaseLabels: true 75 | IndentPPDirectives: None 76 | IndentWidth: 4 77 | IndentWrappedFunctionNames: false 78 | JavaScriptQuotes: Leave 79 | JavaScriptWrapImports: true 80 | KeepEmptyLinesAtTheStartOfBlocks: false 81 | MacroBlockBegin: '' 82 | MacroBlockEnd: '' 
83 | MaxEmptyLinesToKeep: 1 84 | NamespaceIndentation: None 85 | ObjCBinPackProtocolList: Never 86 | ObjCBlockIndentWidth: 2 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: true 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyBreakTemplateDeclaration: 10 95 | PenaltyExcessCharacter: 1000000 96 | PenaltyReturnTypeOnItsOwnLine: 200 97 | PointerAlignment: Left 98 | RawStringFormats: 99 | - Language: Cpp 100 | Delimiters: 101 | - cc 102 | - CC 103 | - cpp 104 | - Cpp 105 | - CPP 106 | - 'c++' 107 | - 'C++' 108 | CanonicalDelimiter: '' 109 | BasedOnStyle: google 110 | - Language: TextProto 111 | Delimiters: 112 | - pb 113 | - PB 114 | - proto 115 | - PROTO 116 | EnclosingFunctions: 117 | - EqualsProto 118 | - EquivToProto 119 | - PARSE_PARTIAL_TEXT_PROTO 120 | - PARSE_TEST_PROTO 121 | - PARSE_TEXT_PROTO 122 | - ParseTextOrDie 123 | - ParseTextProtoOrDie 124 | CanonicalDelimiter: '' 125 | BasedOnStyle: google 126 | ReflowComments: true 127 | SortIncludes: false 128 | SortUsingDeclarations: false 129 | SpaceAfterCStyleCast: false 130 | SpaceAfterTemplateKeyword: true 131 | SpaceBeforeAssignmentOperators: true 132 | SpaceBeforeCpp11BracedList: false 133 | SpaceBeforeCtorInitializerColon: true 134 | SpaceBeforeInheritanceColon: true 135 | SpaceBeforeParens: ControlStatements 136 | SpaceBeforeRangeBasedForLoopColon: true 137 | SpaceInEmptyParentheses: false 138 | SpacesBeforeTrailingComments: 2 139 | SpacesInAngles: false 140 | SpacesInContainerLiterals: true 141 | SpacesInCStyleCastParentheses: false 142 | SpacesInParentheses: false 143 | SpacesInSquareBrackets: false 144 | Standard: Auto 145 | StatementMacros: 146 | - Q_UNUSED 147 | - QT_REQUIRE_VERSION 148 | TabWidth: 8 149 | UseTab: Never 150 | ... 
151 | 152 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | build 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/mm_file"] 2 | path = external/mm_file 3 | url = https://github.com/jermp/mm_file.git 4 | [submodule "external/essentials"] 5 | path = external/essentials 6 | url = https://github.com/jermp/essentials.git 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(S_INDEXES) 3 | 4 | if(NOT CMAKE_BUILD_TYPE) 5 | set(CMAKE_BUILD_TYPE "Release") 6 | endif() 7 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} ) 8 | 9 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 10 | 11 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 12 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 13 | endif () 14 | 15 | if(UNIX) 16 | 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") 18 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") 19 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces") 22 | 23 | if(USE_SANITIZERS) 24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 25 | endif() 26 | 27 | endif() 28 | 29 | include_directories(${S_INDEXES_SOURCE_DIR}/include) 30 | 31 | add_subdirectory(external) 32 | add_subdirectory(src) 33 | add_subdirectory(test) 34 | add_subdirectory(tools) -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 2019-2021 Giulio Ermanno Pibiri 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 13 | in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 | OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Sliced Indexes 2 | ============== 3 | 4 | A C++ implementation of [*sliced indexes*](https://arxiv.org/abs/1907.01032) [3,4], 5 | that can be used to compress bitmaps and inverted lists. 6 | 7 | Also refer to the [CSUR paper](https://arxiv.org/abs/1908.10598v2) [5] for further experiments and comparisons (Section 6). 8 | 9 | This guide is meant to provide a brief overview of the library and to illustrate its functionalities through some examples. 
10 | ##### Table of contents 11 | * [Compiling the code](#compiling-the-code) 12 | * [Quick Start](#quick-start) 13 | * [Building a collection of sequences](#building-a-collection-of-sequences) 14 | * [Operations](#operations) 15 | * [Testing](#testing) 16 | * [Tools](#tools) 17 | * [An example microbenchmark](#an-example-microbenchmark) 18 | * [Authors](#authors) 19 | * [References](#references) 20 | 21 | Compiling the code 22 | ------------------ 23 | 24 | The code is tested on Linux with `gcc` 7.3.0 and on Mac 10.14 with `clang` 10.0.0. 25 | To build the code, [`CMake`](https://cmake.org/) is required. 26 | 27 | The code has few external dependencies (for testing, serialization and memory-mapping facilities), so clone the repository with 28 | 29 | git clone --recursive https://github.com/jermp/s_indexes.git 30 | 31 | If you have cloned the repository without `--recursive`, you will need to perform the following commands before 32 | compiling: 33 | 34 | git submodule init 35 | git submodule update 36 | 37 | To compile the code for a release environment (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following: 38 | 39 | mkdir build 40 | cd build 41 | cmake .. 42 | make 43 | 44 | Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs. 45 | 46 | For a testing environment, use the following instead: 47 | 48 | mkdir debug_build 49 | cd debug_build 50 | cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On 51 | make 52 | 53 | Quick Start 54 | ------- 55 | 56 | For a quick start, see the source file `src/example.cpp`. 57 | After compilation, run this example with 58 | 59 | ./example < ../data/test_sequence 60 | 61 | which will: 62 | 63 | 1. read from the standard input using a test 64 | sequence from the directory `data`; 65 | 2. build the data structure in memory and perform some operations (decode and select). 
66 | 67 | By specifying an output file name, it is possible to 68 | serialize the data structure on disk. To perform the 69 | operations, the data structure is then memory mapped 70 | from such file. To do so, type 71 | 72 | ./example -o out.bin < ../data/test_sequence 73 | 74 | ```C++ 75 | #include 76 | 77 | #include "../external/essentials/include/essentials.hpp" 78 | #include "builder.hpp" 79 | #include "s_sequence.hpp" 80 | #include "select.hpp" 81 | #include "decode.hpp" 82 | 83 | using namespace sliced; 84 | 85 | int main(int argc, char** argv) { 86 | int mandatory = 1; 87 | char const* output_filename = nullptr; 88 | 89 | for (int i = mandatory; i != argc; ++i) { 90 | if (std::string(argv[i]) == "-o") { 91 | ++i; 92 | output_filename = argv[i]; 93 | } else if (std::string(argv[i]) == "-h") { 94 | std::cout << argv[0] << " -o output_filename < input" << std::endl; 95 | return 1; 96 | } else { 97 | std::cout << "unknown option '" << argv[i] << "'" << std::endl; 98 | return 1; 99 | } 100 | } 101 | 102 | std::vector input; 103 | 104 | { // read input from std::in 105 | uint32_t n, x; 106 | std::cin >> n; 107 | input.reserve(n); 108 | for (uint32_t i = 0; i != n; ++i) { 109 | std::cin >> x; 110 | input.push_back(x); 111 | } 112 | } 113 | 114 | // build the sequence and print statistics 115 | s_sequence::builder builder; 116 | auto stats = builder.build(input.data(), input.size()); 117 | stats.print(); 118 | 119 | mm::file_source mm_file; 120 | uint8_t const* data = nullptr; 121 | 122 | if (output_filename) { // if an output file is specified, then serialize 123 | essentials::print_size(builder); 124 | essentials::save(builder, output_filename); 125 | 126 | // mmap 127 | int advice = mm::advice::normal; // can be also random and sequential 128 | mm_file.open(output_filename, advice); 129 | 130 | // skip first 8 bytes storing the number of written bytes 131 | data = mm_file.data() + 8; 132 | 133 | } else { // otherwise work directly in memory 134 | data = 
builder.data(); 135 | } 136 | 137 | // initialize a s_sequence from data, regardless the source 138 | s_sequence ss(data); 139 | 140 | uint32_t size = ss.size(); 141 | 142 | // decode whole list to an output buffer 143 | std::vector out(size); 144 | ss.decode(out.data()); 145 | // check written values 146 | uint32_t value = 0; 147 | for (uint32_t i = 0; i != size; ++i) { 148 | if (input[i] != out[i]) { 149 | std::cout << "got " << out[i] << " but expected " << input[i] 150 | << std::endl; 151 | return 1; 152 | } 153 | 154 | ss.select(i, value); // select i-th element 155 | if (value != out[i]) { 156 | std::cout << "got " << value << " but expected " << out[i] 157 | << std::endl; 158 | return 1; 159 | } 160 | } 161 | 162 | return 0; 163 | } 164 | ``` 165 | 166 | Building a collection of sequences 167 | ---------------------------------- 168 | 169 | Typically, we want to build all the sequences from 170 | a collection. 171 | In this case, we assume that the input collection 172 | is a binary file with all the sequences being written 173 | as 32-bit integers, as is also common for other libraries 174 | such as [`ds2i`](https://github.com/ot/ds2i). 175 | In particular, each sequence is prefixed by an additional 176 | 32-bit integer representing the size of the sequence. 177 | The collection file starts with a singleton sequence 178 | containing the universe of representation of the sequences, i.e., the maximum representable value. 179 | 180 | For example, a test input collection with 100 sequences drawn 181 | from a universe of size 1,000,000 can be generated 182 | with 183 | 184 | ./gen_clustered_data 100 1000000 test_collection --binary 185 | 186 | To build an index from such a collection, use 187 | 188 | ./build test_collection --density 0.01 --out test_collection.out 189 | 190 | with a density threshold of 0.01 and an output file 191 | `test_collection.out` onto which the data structure is serialized. 
192 | You should get an output like: 193 | 194 | universe size: 1000000 195 | processed 100 sequences, 45911859 integers 196 | chunks: 1572 197 | full chunks: 466 (66.5183% of ints) 198 | empty chunks: 310 (19.7201% of chunks) 199 | dense chunks: 513 (30.3916% of ints) 200 | sparse chunks: 283 (3.09016% of ints) 201 | blocks: 23395 202 | empty blocks: 14 (0.0598418% of blocks) 203 | dense blocks: 7614 (2.53826% of ints) 204 | sparse blocks: 15767 (0.551905% of ints) 205 | 0.00179405 [bpi] for chunks' headers 206 | 0.00540078 [bpi] for blocks' headers 207 | 0.732272 [bpi] for dense chunks 208 | 0.0424549 [bpi] for dense blocks 209 | 0.0468998 [bpi] for sparse blocks 210 | total bytes: 4757416 211 | total bpi: 0.828965 212 | 213 | from which you can see some statistics about the built data structure. 214 | 215 | Operations 216 | ---------- 217 | 218 | Given a single *sliced* sequence, it is possible to execute the 219 | following operations (see also `include/s_sequence.hpp`): 220 | 221 | ```C++ 222 | /* decode the sequence to the output buffer */ 223 | size_t decode(uint32_t* out) const; 224 | 225 | /* convert the sequence to an output bitmap */ 226 | size_t uncompress(uint64_t* out) const; 227 | 228 | /* select the i-th value */ 229 | bool select(uint32_t i, uint32_t& value) const; 230 | 231 | /* check if value is present in the sequence */ 232 | bool contains(uint32_t value) const; 233 | 234 | /* returns the minimum value that is >= lower_bound 235 | if found, otherwise a "not found" value is returned */ 236 | uint32_t next_geq(uint32_t lower_bound) const; 237 | ``` 238 | 239 | Given a collection of (at least 2) *sliced* sequences, it is possible to perform intersection and merging of the sequences: 240 | 241 | ```C++ 242 | /* writes the result of the intersection between l and r to the output buffer, 243 | returning the size of the result */ 244 | size_t pairwise_intersection(s_sequence const& l, s_sequence const& r, uint32_t* out); 245 | 246 | /* writes the 
result of the union between l and r to the output buffer, 247 | returning the size of the result */ 248 | size_t pairwise_union(s_sequence const& l, s_sequence const& r, uint32_t* out); 249 | 250 | /* writes the result of the intersection between the 251 | sequences to the output buffer, returning the size of the result */ 252 | size_t intersection(std::vector& sequences, uint32_t* out); 253 | 254 | /* writes the result of the union between the 255 | sequences to the output buffer, returning the size of the result */ 256 | size_t union_many(std::vector& sequences, uint32_t* out); 257 | 258 | ``` 259 | 260 | The `src` folder contains programs to benchmark such operations. 261 | 262 | #### Example 1. 263 | Use: 264 | 265 | ./decode test_collection.out 266 | 267 | to decode all the sequences in the collection. You should get something 268 | like: 269 | 270 | decoded 100 sequences 271 | decoded 45911859 integers 272 | Elapsed time: 0.034721 [sec] 273 | Mean per sequence: 347.21 [musec] 274 | Mean per integer: 0.756253 [ns] 275 | 276 | #### Example 2. 277 | To execute some intersection operations, first generate some queries with 278 | 279 | ./gen_random_pairwise_queries 1000 100 > test_pairwise_queries 280 | 281 | and then run 282 | 283 | ./intersect test_collection.out 1000 < test_pairwise_queries 284 | 285 | You should get something like: 286 | 287 | performing 1000 pairwise-intersections... 288 | Mean per run: 136562 [musec] 289 | Mean per query: 136.562 [musec] 290 | 291 | Testing 292 | ------- 293 | The subfolder `test` contains testing programs to maintain 294 | the correctness of the implementation. 295 | 296 | To run a test, just run the corresponding program without 297 | arguments to see the required ones. 
298 | 299 | For example, to test decoding correctness, use 300 | 301 | ./test_decode test_collection.out ../data/test_collection 0.01 302 | 303 | which will check every decoded integer against the original input 304 | collection (note that you must provide the *correct* original input collection as well as the *density level* that was used during building). 305 | 306 | Tools 307 | ----- 308 | The subfolder `tools` contains some programs generating 309 | synthetic data to test the code. 310 | 311 | For example, the sequence `data/test_sequence` was generated with 312 | 313 | ./gen_clustered_data 1 1000000 test_sequence 314 | 315 | A test collection can be generated with 316 | 317 | ./gen_clustered_data 100 1000000 test_collection --binary 318 | 319 | A test query log can be generated with 320 | 321 | ./gen_random_pairwise_queries 1000 100 > test_pairwise_queries 322 | 323 | An example microbenchmark 324 | ----- 325 | In the following microbenchmark we show the number of bits per integer (bpi) and the average microseconds per list intersection query with 2 sequences. 326 | 327 | We compare Slicing with Roaring [1] and Partitioned Elias-Fano [2]. 328 | 329 | We use the datasets Census-Income, Census-1881, Weather and Wikileaks shipped with the [CRoaring Library](https://github.com/RoaringBitmap/CRoaring) (see directory `benchmarks/realdata`). 330 | See [1] for a description of such datasets. 331 | 332 | To measure the bpi rate, we serialize the data structures and take the written number of bytes. 333 | To measure query timings, we compute 1,000 intersections between 334 | random pairs of lists 10 times and report the average. 335 | 336 | The benchmark was executed on a Linux 4.4.0 server machine with 337 | an Intel i7-7700 CPU (@3.6 GHz) and 64 GB of RAM. 338 | The code was compiled with gcc 7.3.0 with all optimizations 339 | (see also `CMakeLists.txt`). 340 | 341 | #### Table 1. 
Bits per integer 342 | |**Dataset** |**Roaring** | **Slicing** | **PEF**| 343 | |------------|-----------:|-------------:|-------:| 344 | |Census-Income| 2.74 |2.23 |2.03| 345 | |Census-1881| 15.93 | 10.83| 7.28| 346 | |Weather | 5.43 | 4.05| 3.13| 347 | |Wikileaks| 16.30 | 10.18| 8.87| 348 | 349 | #### Table 2. µsec per list intersection 350 | |**Dataset** |**Roaring** | **Slicing** | **PEF**| 351 | |-------------|-----------:|-------------:|-------:| 352 | |Census-Income| 4.68 |11.56| 115.17| 353 | |Census-1881 | 0.15 |0.18 |0.92| 354 | |Weather | 13.37 |25.70 |213.00| 355 | |Wikileaks | 0.86 |0.47 |2.51| 356 | 357 | Authors 358 | ------- 359 | * [Giulio Ermanno Pibiri](http://pages.di.unipi.it/pibiri/), 360 | 361 | References 362 | ------- 363 | 364 | * [1] Daniel Lemire, Owen Kaser, Nathan Kurz, Luca Deri, Chris O’Hara, François Saint-Jacques, and Gregory Ssi-Yan-Kai. 2018. *Roaring bitmaps: Implementation of an optimized software library*. Software: Practice and Experience 48, 4, 365 | 867–895. 366 | * [2] Giuseppe Ottaviano and Rossano Venturini. *Partitioned Elias-Fano Indexes*. 2014. In Proceedings of the 37th International 367 | Conference on Research and Development in Information Retrieval. 273–282. 368 | * [3] Giulio Ermanno Pibiri. *Fast and Compact Set Intersection through Recursive Universe Partitioning*. 2021. IEEE Data Compression Conference (DCC). 369 | * [4] Giulio Ermanno Pibiri. *On Slicing Sorted Integer Sequences*. 2019. arXiv preprint. https://arxiv.org/abs/1907.01032 370 | * [5] Giulio Ermanno Pibiri and Rossano Venturini. *Techniques for Inverted Index Compression*. 2020. ACM Computing Surveys (CSUR). 
[https://arxiv.org/abs/1908.10598v2](https://arxiv.org/abs/1908.10598v2) 371 | -------------------------------------------------------------------------------- /external/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(mm_file/include/mm_file) 2 | include_directories(essentials/include) -------------------------------------------------------------------------------- /include/builder.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 8 | 9 | #include "s_index.hpp" 10 | #include "util.hpp" 11 | #include "building_util.hpp" 12 | 13 | namespace sliced { 14 | 15 | void encode_block(std::vector& block, uint32_t& id, 16 | std::vector& header, std::vector& data) { 17 | if (block.size() > 0) { 18 | write_uint(id, header); 19 | write_uint(block.size() - 1, header); 20 | if (block.size() >= constants::block_sparseness_threshold - 1) { 21 | write_bits(block.data(), block.size(), constants::block_size, 0, 22 | data); 23 | } else { 24 | for (auto pos : block) write_uint(pos, data); 25 | } 26 | block.clear(); 27 | } 28 | id += 1; 29 | } 30 | 31 | void encode_sparse_chunk(uint32_t const* begin, uint32_t const* end, slice s, 32 | std::vector& block, 33 | std::vector& out) { 34 | std::vector header; 35 | std::vector data; 36 | header.reserve(256 * 2); // at most 37 | data.reserve(256 * 32); // at most 38 | uint32_t id = 0; 39 | uint32_t base = s.left; 40 | while (begin != end and *begin < s.right) { 41 | uint32_t val = *begin - base; 42 | if (val >= constants::block_size) { 43 | encode_block(block, id, header, data); 44 | base += constants::block_size; 45 | } else { 46 | assert(val < constants::block_size); 47 | block.push_back(val); 48 | assert(block.size() <= constants::block_size); 49 | ++begin; 50 | } 51 | } 52 | encode_block(block, id, header, 
data); 53 | out.insert(out.end(), header.begin(), header.end()); 54 | out.insert(out.end(), data.begin(), data.end()); 55 | } 56 | 57 | void encode_sequence(uint32_t const* data, size_t n, 58 | std::vector& block, statistics& stats, 59 | std::vector& out) { 60 | assert(block.empty()); 61 | auto begin = data; 62 | auto end = data + n; 63 | uint32_t universe = *(data + n - 1); 64 | uint32_t chunks = num_chunks(universe); 65 | assert(chunks > 0 and chunks <= constants::chunk_size); 66 | 67 | stats.sequences += 1; 68 | stats.integers += n; 69 | stats.chunks += chunks; 70 | 71 | std::vector chunks_header; 72 | chunks_header.reserve(4 * constants::chunk_size); // at most 73 | std::vector tmp; 74 | 75 | const uint32_t dense_chunk_bytes = bytes_for(constants::chunk_size); 76 | slice s = {0, constants::chunk_size}; 77 | 78 | for (uint32_t i = 0; i != chunks; ++i) { 79 | uint32_t cardinality = 0; 80 | if (*begin < s.right) { 81 | cardinality = chunk_cardinality(begin, end, s); 82 | chunks_header.push_back(i); 83 | chunks_header.push_back(cardinality - 1); 84 | 85 | if (cardinality < constants::chunk_sparseness_threshold) { 86 | auto sparse_chunk_stats = sparse_chunk_bitsize(begin, end, s); 87 | 88 | uint64_t sparse_chunk_bytes = 89 | (sparse_chunk_stats.dense_blocks * 16 + 90 | // NOTE: the 8 bits for the cardinality of sparse 91 | // blocks are already accounted in sparse_blocks_bits 92 | sparse_chunk_stats.sparse_blocks * 8 + 93 | sparse_chunk_stats.dense_blocks_bits + 94 | sparse_chunk_stats.sparse_blocks_bits) / 95 | 8; 96 | 97 | if (sparse_chunk_bytes >= dense_chunk_bytes) { 98 | stats.dense_chunks += 1; 99 | stats.dense_chunks_bits += dense_chunk_bytes * 8; 100 | stats.integers_in_dense_chunks += cardinality; 101 | chunks_header.push_back(type::dense); 102 | chunks_header.push_back(constants::chunk_size / 8); 103 | write_bits(begin, cardinality, constants::chunk_size, 104 | s.left, tmp); 105 | } else { 106 | /* 107 | We would like clusters of: 108 | - few blocks in 
chunk + blocks have sufficiently large 109 | cardinality for method 1; 110 | - many blocks in chunk + blocks have low cardinality for 111 | method 2. 112 | */ 113 | 114 | stats.sparse_chunks += 1; 115 | stats.integers_in_sparse_chunks += cardinality; 116 | 117 | uint16_t num_non_empty_blocks = 118 | sparse_chunk_stats.dense_blocks + 119 | sparse_chunk_stats.sparse_blocks; 120 | assert(num_non_empty_blocks >= 1 and 121 | num_non_empty_blocks <= 122 | constants::chunk_size / constants::block_size); 123 | 124 | // how many chunks that have : 125 | // 1 block, 2 blocks, 3 blocks... 126 | stats.num_blocks_in_chunks[num_non_empty_blocks] += 1; 127 | stats.num_integers[num_non_empty_blocks] += cardinality; 128 | stats.blocks += 129 | num_non_empty_blocks + sparse_chunk_stats.empty_blocks; 130 | 131 | stats.accumulate(sparse_chunk_stats); 132 | 133 | uint16_t packed = type::sparse; 134 | packed |= (num_non_empty_blocks - 1) << 8; 135 | chunks_header.push_back(packed); 136 | 137 | chunks_header.push_back(sparse_chunk_bytes); 138 | 139 | encode_sparse_chunk(begin, end, s, block, tmp); 140 | } 141 | 142 | } else { 143 | if (cardinality == constants::chunk_size) { 144 | stats.full_chunks += 1; 145 | stats.integers_in_full_chunks += cardinality; 146 | chunks_header.push_back(type::full); 147 | chunks_header.push_back(0); 148 | } else { 149 | stats.dense_chunks += 1; 150 | stats.dense_chunks_bits += dense_chunk_bytes * 8; 151 | stats.integers_in_dense_chunks += cardinality; 152 | assert(dense_chunk_bytes * 8.0 / cardinality <= 2.0); 153 | chunks_header.push_back(type::dense); 154 | chunks_header.push_back(constants::chunk_size / 8); 155 | write_bits(begin, cardinality, constants::chunk_size, 156 | s.left, tmp); 157 | } 158 | } 159 | 160 | } else { 161 | stats.empty_chunks += 1; 162 | } 163 | 164 | s.left = s.right; 165 | s.right += constants::chunk_size; 166 | begin += cardinality; 167 | } 168 | 169 | assert(begin == end); 170 | chunks = chunks_header.size() / 4; 171 | 
write_uint(chunks - 1, out); 172 | 173 | // write chunks / constants::associativity pointers 174 | // NOTE: a pointer is 175 | // cardinality | byte_offset 176 | // ----------- ----------- 177 | // 32 bits 32 bits 178 | uint64_t offsets = chunks / constants::associativity; 179 | uint32_t offset = 0; 180 | uint32_t cardinality = 0; 181 | for (uint64_t i = 0; i != offsets; ++i) { 182 | uint32_t base = i * 4 * constants::associativity; 183 | offset = 0; 184 | cardinality = 0; 185 | for (uint32_t j = 1; j != constants::associativity + 1; ++j) { 186 | cardinality += chunks_header[base + 1] + 1; 187 | offset += chunks_header[base + 3]; 188 | base += 4; 189 | } 190 | write_uint(cardinality, out); 191 | write_uint(offset, out); 192 | } 193 | 194 | auto ptr = reinterpret_cast(chunks_header.data()); 195 | out.insert(out.end(), ptr, 196 | ptr + chunks_header.size() * sizeof(chunks_header.front())); 197 | out.insert(out.end(), tmp.begin(), tmp.end()); 198 | 199 | stats.chunks_header_bits += chunks * 16 * 4 + 16 + 200 | offsets * sizeof(offset) * 8 + 201 | offsets * sizeof(cardinality) * 8; 202 | } 203 | 204 | statistics encode_sequence(uint32_t const* data, size_t n, 205 | std::vector& out) { 206 | std::vector block; 207 | block.reserve(constants::block_size); 208 | statistics stats; 209 | 210 | encode_sequence(data, n, block, stats, out); 211 | 212 | stats.blocks_header_bits = 213 | stats.dense_blocks * 16 + stats.sparse_blocks * 8; 214 | stats.bits = stats.chunks_header_bits + stats.blocks_header_bits + 215 | stats.dense_chunks_bits + stats.dense_blocks_bits + 216 | stats.sparse_blocks_bits; 217 | 218 | stats.bits += 2 * 64; 219 | 220 | return stats; 221 | } 222 | 223 | struct s_index::builder { 224 | builder(parameters const& params) 225 | : m_params(params) {} 226 | 227 | statistics build() { 228 | mm::file_source input(m_params.collection_filename, 229 | mm::advice::sequential); 230 | uint32_t const* data = input.data(); 231 | std::vector block; 232 | 
block.reserve(constants::block_size); 233 | statistics stats; 234 | 235 | assert(data[0] == 1); 236 | std::cout << "universe size: " << data[1] << std::endl; 237 | 238 | m_offsets.push_back(data[1]); 239 | m_offsets.push_back(0); 240 | 241 | for (size_t i = 2; // first two values reserved for a singleton 242 | // sequence containing the universe size 243 | i < input.size();) { 244 | uint32_t n = data[i]; 245 | uint32_t universe = data[i + n]; 246 | if (pass(m_params, n, universe)) { 247 | encode_sequence(data + i + 1, n, block, stats, m_sequences); 248 | m_offsets.push_back(m_sequences.size()); 249 | if (stats.sequences % 1000 == 0) { 250 | std::cout << "processed " << stats.sequences << " sequences" 251 | << std::endl; 252 | } 253 | } 254 | i += n + 1; 255 | } 256 | 257 | m_offsets.pop_back(); 258 | 259 | stats.blocks_header_bits = 260 | stats.dense_blocks * 16 + stats.sparse_blocks * 8; 261 | stats.bits = stats.chunks_header_bits + stats.blocks_header_bits + 262 | stats.dense_chunks_bits + stats.dense_blocks_bits + 263 | stats.sparse_blocks_bits; 264 | 265 | stats.bits += 2 * 64; 266 | stats.bits += m_offsets.size() * 64; 267 | 268 | return stats; 269 | } 270 | 271 | template 272 | void visit(Visitor& visitor) { 273 | visitor.visit(m_offsets); 274 | visitor.visit(m_sequences); 275 | } 276 | 277 | private: 278 | parameters const& m_params; 279 | std::vector m_offsets; 280 | std::vector m_sequences; 281 | }; 282 | 283 | struct s_sequence::builder { 284 | builder() {} 285 | 286 | statistics build(uint32_t const* data, size_t n) { 287 | return encode_sequence(data, n, m_out); 288 | } 289 | 290 | uint8_t const* data() const { 291 | return m_out.data(); 292 | } 293 | 294 | template 295 | void visit(Visitor& visitor) { 296 | visitor.visit(m_out); 297 | } 298 | 299 | private: 300 | std::vector m_out; 301 | }; 302 | } // namespace sliced -------------------------------------------------------------------------------- /include/building_util.hpp: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "constants.hpp" 6 | #include "util.hpp" 7 | 8 | namespace sliced { 9 | 10 | struct slice { 11 | uint32_t left; 12 | uint32_t right; 13 | }; 14 | 15 | template 16 | void set_bit(size_t position, std::vector& bits) { 17 | assert(position < bits.size() * sizeof(W) * 8); 18 | size_t w = position / (sizeof(W) * 8); 19 | size_t o = position % (sizeof(W) * 8); 20 | bits[w] |= W(1) << o; 21 | } 22 | 23 | template 24 | void write_uint(T val, std::vector& out) { 25 | auto ptr = reinterpret_cast(&val); 26 | out.insert(out.end(), ptr, ptr + sizeof(T)); 27 | } 28 | 29 | template 30 | void write_uint(T x, std::ofstream& out) { 31 | out.write(reinterpret_cast(&x), sizeof(T)); 32 | } 33 | 34 | void write_bits(uint32_t const* begin, size_t n, size_t bits, uint32_t base, 35 | std::vector& out) { 36 | assert(bits % 64 == 0); 37 | std::vector bitmap(bits / 64, 0); 38 | for (uint32_t i = 0; i != n; ++i, ++begin) { 39 | uint32_t val = *begin - base; 40 | set_bit(val, bitmap); 41 | } 42 | auto ptr = reinterpret_cast(bitmap.data()); 43 | out.insert(out.end(), ptr, ptr + bitmap.size() * sizeof(bitmap.front())); 44 | } 45 | 46 | uint32_t chunk_cardinality(uint32_t const* begin, uint32_t const* end, 47 | slice s) { 48 | uint32_t c = 0; 49 | uint32_t prev = -1; 50 | while (begin != end and *begin < s.right) { 51 | assert(*begin >= s.left); 52 | assert(*begin - s.left < constants::chunk_size); 53 | if (*begin == prev) throw std::runtime_error("duplicate element"); 54 | prev = *begin; 55 | ++begin; 56 | ++c; 57 | } 58 | assert(c > 0 and c <= constants::chunk_size); 59 | return c; 60 | } 61 | 62 | void block_bitsize(size_t block_size, statistics& stats) { 63 | stats.blocks += 1; 64 | assert(block_size <= constants::block_size); 65 | if (block_size == 0) { 66 | stats.empty_blocks += 1; 67 | } else if (block_size >= constants::block_sparseness_threshold - 1) { 68 | 
stats.dense_blocks += 1; 69 | stats.dense_blocks_bits += constants::block_size; 70 | stats.integers_in_dense_blocks += block_size; 71 | } else { 72 | assert(block_size <= constants::block_sparseness_threshold - 2); 73 | stats.sparse_blocks += 1; 74 | stats.integers_in_sparse_blocks += block_size; 75 | stats.sparse_blocks_bits += 8 * (block_size + 1); 76 | stats.sparse_blocks_cardinalities[block_size] += 1; 77 | } 78 | } 79 | 80 | statistics sparse_chunk_bitsize(uint32_t const* begin, uint32_t const* end, 81 | slice s) { 82 | statistics stats; 83 | uint32_t base = s.left; 84 | size_t block_size = 0; 85 | while (begin != end and *begin < s.right) { 86 | uint32_t val = *begin - base; 87 | if (val >= constants::block_size) { 88 | block_bitsize(block_size, stats); 89 | base += constants::block_size; 90 | block_size = 0; 91 | } else { 92 | assert(val < constants::block_size); 93 | ++block_size; 94 | assert(block_size <= constants::block_size); 95 | ++begin; 96 | } 97 | } 98 | block_bitsize(block_size, stats); 99 | return stats; 100 | } 101 | 102 | } // namespace sliced -------------------------------------------------------------------------------- /include/constants.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace sliced { 4 | 5 | namespace constants { 6 | 7 | static const uint32_t chunk_size = uint32_t(1) << 16; 8 | static const uint32_t block_size = uint32_t(1) << 8; 9 | 10 | static const uint64_t chunk_size_in_64bit_words = chunk_size / 64; 11 | static const uint64_t block_size_in_64bit_words = block_size / 64; 12 | 13 | static const uint64_t chunk_sparseness_threshold = chunk_size / 2; 14 | static const uint64_t block_sparseness_threshold = block_size / 8; 15 | 16 | static const uint32_t associativity = 32; 17 | 18 | static const uint32_t not_found = uint32_t(-1); 19 | 20 | } // namespace constants 21 | } // namespace sliced 
-------------------------------------------------------------------------------- /include/contains.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // #include "immintrin.h" 4 | 5 | namespace sliced { 6 | 7 | bool sparse_block_contains(uint8_t const* begin, int cardinality, 8 | uint32_t value) { 9 | // scalar code as fast as SIMD approach 10 | for (int i = 0; i != cardinality; ++i) { 11 | if (begin[i] > value) return false; 12 | if (begin[i] == value) return true; 13 | } 14 | return false; 15 | 16 | // (void)cardinality; 17 | // __m256i c = _mm256_set1_epi8(value); 18 | // __m256i v = _mm256_loadu_si256((__m256i const*)begin); 19 | // __m256i res = _mm256_cmpeq_epi8(c, v); 20 | // return _mm256_testz_si256(res, _mm256_setzero_si256()); 21 | } 22 | 23 | bool contains_sparse_chunk(uint8_t const* begin, int blocks, uint32_t value) { 24 | assert(blocks >= 1 and blocks <= 256); 25 | uint8_t const* data = begin + blocks * 2; 26 | uint8_t const* end = data; 27 | uint32_t block_id = value >> 8; 28 | 29 | while (begin != end) { 30 | uint8_t id = *begin; 31 | if (id > block_id) return false; 32 | int c = *(begin + 1) + 1; 33 | int bytes = 32; 34 | int type = type::dense; 35 | if (LIKELY(c < 31)) { 36 | bytes = c; 37 | type = type::sparse; 38 | } 39 | if (id == block_id) { 40 | uint32_t base = id * 256; 41 | assert(value >= base); 42 | value -= base; 43 | if (type == type::sparse) { 44 | return sparse_block_contains(data, c, value); 45 | } else { 46 | return bitmap_contains(reinterpret_cast(data), 47 | value); 48 | } 49 | } 50 | data += bytes; 51 | begin += 2; 52 | } 53 | 54 | return false; 55 | } 56 | 57 | bool s_sequence::contains(uint32_t value) const { 58 | auto it = begin(); 59 | uint32_t chunk_id = value >> 16; 60 | it.skip_to_value(chunk_id); 61 | if (it.id() == chunk_id) { 62 | value &= 0xFFFF; 63 | switch (it.type()) { 64 | case type::sparse: 65 | return contains_sparse_chunk(it.data, it.blocks(), value); 
66 | case type::dense: 67 | return bitmap_contains( 68 | reinterpret_cast(it.data), value); 69 | case type::full: 70 | return true; 71 | default: 72 | assert(false); 73 | __builtin_unreachable(); 74 | } 75 | } 76 | return false; 77 | } 78 | 79 | } // namespace sliced -------------------------------------------------------------------------------- /include/decode.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" 4 | #include "constants.hpp" 5 | #include "decode.hpp" 6 | 7 | namespace sliced { 8 | 9 | inline uint32_t decode_bitmap(uint64_t const* bitmap, 10 | size_t size_in_64bit_words, uint32_t base, 11 | uint32_t* out) { 12 | uint32_t size = 0; 13 | for (size_t i = 0; i != size_in_64bit_words; ++i) { 14 | uint64_t w = bitmap[i]; 15 | while (w != 0) { 16 | uint64_t t = w & (~w + 1); 17 | int r = __builtin_ctzll(w); 18 | out[size++] = r + base; 19 | w ^= t; 20 | } 21 | base += 64; 22 | } 23 | return size; 24 | } 25 | 26 | uint32_t decode_sparse_block(uint8_t const* begin, int cardinality, 27 | uint32_t base, uint32_t* out) { 28 | __m128i in_vec; 29 | __m256i converted; 30 | __m256i base_vec = _mm256_set1_epi32(base); 31 | 32 | in_vec = _mm_lddqu_si128((__m128i const*)(begin + 0)); 33 | converted = _mm256_cvtepu8_epi32(in_vec); 34 | converted = _mm256_add_epi32(base_vec, converted); 35 | _mm256_storeu_si256((__m256i*)(out + 0), converted); 36 | 37 | // most likely 38 | if (cardinality <= 8) return cardinality; 39 | 40 | in_vec = _mm_lddqu_si128((__m128i const*)(begin + 8)); 41 | converted = _mm256_cvtepu8_epi32(in_vec); 42 | converted = _mm256_add_epi32(base_vec, converted); 43 | _mm256_storeu_si256((__m256i*)(out + 8), converted); 44 | 45 | in_vec = _mm_lddqu_si128((__m128i const*)(begin + 16)); 46 | converted = _mm256_cvtepu8_epi32(in_vec); 47 | converted = _mm256_add_epi32(base_vec, converted); 48 | _mm256_storeu_si256((__m256i*)(out + 16), converted); 49 | 50 | in_vec = 
_mm_lddqu_si128((__m128i const*)(begin + 24)); 51 | converted = _mm256_cvtepu8_epi32(in_vec); 52 | converted = _mm256_add_epi32(base_vec, converted); 53 | _mm256_storeu_si256((__m256i*)(out + 24), converted); 54 | 55 | return cardinality; 56 | } 57 | 58 | uint32_t decode_sparse_chunk(uint8_t const* begin, int blocks, uint32_t base, 59 | uint32_t* out) { 60 | assert(blocks >= 1 and blocks <= 256); 61 | uint8_t const* data = begin + blocks * 2; 62 | uint8_t const* end = data; 63 | uint32_t* tmp = out; 64 | while (begin != end) { 65 | uint8_t id = *begin; 66 | int c = *(begin + 1) + 1; 67 | int bytes = 32; 68 | int type = type::dense; 69 | if (LIKELY(c < 31)) { 70 | bytes = c; 71 | type = type::sparse; 72 | } 73 | uint32_t b = base + id * 256; 74 | if (type == type::sparse) { 75 | tmp += decode_sparse_block(data, c, b, tmp); 76 | } else { 77 | tmp += decode_bitmap(reinterpret_cast(data), 78 | constants::block_size_in_64bit_words, b, tmp); 79 | } 80 | data += bytes; 81 | begin += 2; 82 | } 83 | return size_t(tmp - out); 84 | } 85 | 86 | inline uint32_t decode_full_chunk(uint32_t base, uint32_t* out) { 87 | for (uint32_t i = 0; i != constants::chunk_size; ++i) out[i] = i + base; 88 | return constants::chunk_size; 89 | } 90 | 91 | size_t decode_chunk(s_sequence::iterator const& it, uint32_t* out) { 92 | uint32_t base = it.id() << 16; 93 | switch (it.type()) { 94 | case type::sparse: 95 | return decode_sparse_chunk(it.data, it.blocks(), base, out); 96 | case type::dense: 97 | return decode_bitmap(reinterpret_cast(it.data), 98 | constants::chunk_size_in_64bit_words, base, 99 | out); 100 | case type::full: 101 | return decode_full_chunk(base, out); 102 | default: 103 | assert(false); 104 | __builtin_unreachable(); 105 | } 106 | } 107 | 108 | size_t s_sequence::decode(uint32_t* out) const { 109 | auto it = begin(); 110 | uint32_t* in = out; 111 | for (uint32_t i = 0; i != chunks; ++i) { 112 | out += decode_chunk(it, out); 113 | it.next(); 114 | } 115 | return size_t(out - 
in); 116 | } 117 | } // namespace sliced -------------------------------------------------------------------------------- /include/enumerator.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "decode.hpp" 4 | 5 | namespace sliced { 6 | 7 | struct enumerator { 8 | enumerator() { 9 | m_buf.resize(constants::chunk_size); 10 | } 11 | 12 | void init(s_sequence const& s, uint32_t past_the_end) { 13 | m_it = s.begin(); 14 | m_chunk = 0; 15 | m_chunks = s.chunks; 16 | m_i = 0; 17 | m_cardinality = m_it.cardinality(); 18 | m_past_the_end = past_the_end; 19 | m_has_next = true; 20 | decode_chunk(m_it, m_buf.data()); 21 | } 22 | 23 | bool has_next() const { 24 | return m_has_next; 25 | } 26 | 27 | void next() { 28 | if (++m_i == m_cardinality) { 29 | if (++m_chunk == m_chunks) { 30 | m_has_next = false; 31 | return; 32 | } 33 | m_i = 0; 34 | m_it.next(); 35 | m_cardinality = m_it.cardinality(); 36 | decode_chunk(m_it, m_buf.data()); 37 | } 38 | } 39 | 40 | uint32_t value() const { 41 | return m_has_next ? 
m_buf[m_i] : m_past_the_end; 42 | } 43 | 44 | private: 45 | s_sequence::iterator m_it; 46 | uint32_t m_chunk; 47 | uint32_t m_chunks; 48 | uint32_t m_i; 49 | uint32_t m_cardinality; 50 | uint32_t m_past_the_end; 51 | bool m_has_next; 52 | std::vector m_buf; 53 | }; 54 | 55 | } // namespace sliced -------------------------------------------------------------------------------- /include/intersection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" 4 | 5 | #include "constants.hpp" 6 | #include "util.hpp" 7 | #include "decode.hpp" 8 | #include "uncompress.hpp" 9 | #include "table.hpp" 10 | 11 | namespace sliced { 12 | 13 | #define INIT \ 14 | __m256i base_v = _mm256_set1_epi32(base); \ 15 | __m128i v_l = _mm_lddqu_si128((__m128i const*)l); \ 16 | __m128i v_r = _mm_lddqu_si128((__m128i const*)r); \ 17 | __m256i converted_v; \ 18 | __m128i shuf, p, res; \ 19 | int mask, matched; 20 | 21 | #define INTERSECT \ 22 | res = \ 23 | _mm_cmpestrm(v_l, card_l, v_r, card_r, \ 24 | _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); \ 25 | mask = _mm_extract_epi32(res, 0); \ 26 | matched = _mm_popcnt_u32(mask); \ 27 | size += matched; \ 28 | shuf = _mm_load_si128((__m128i const*)shuffle_mask + mask); \ 29 | p = _mm_shuffle_epi8(v_r, shuf); \ 30 | converted_v = _mm256_cvtepu8_epi32(p); \ 31 | converted_v = _mm256_add_epi32(base_v, converted_v); \ 32 | _mm256_storeu_si256((__m256i*)out, converted_v); \ 33 | if (matched > 8) { \ 34 | p = _mm_bsrli_si128(p, 8); \ 35 | converted_v = _mm256_cvtepu8_epi32(p); \ 36 | converted_v = _mm256_add_epi32(base_v, converted_v); \ 37 | _mm256_storeu_si256((__m256i*)(out + 8), converted_v); \ 38 | } 39 | 40 | #define ADVANCE(ptr) \ 41 | out += size; \ 42 | ptr += 16; \ 43 | v_##ptr = _mm_lddqu_si128((__m128i const*)ptr); \ 44 | card_##ptr -= 16; 45 | 46 | size_t ss_intersect_block(uint8_t const* l, uint8_t const* r, int card_l, 47 | int card_r, uint32_t base, 
uint32_t* out) { 48 | assert(card_l > 0 and 49 | card_l <= int(constants::block_sparseness_threshold - 2)); 50 | assert(card_r > 0 and 51 | card_r <= int(constants::block_sparseness_threshold - 2)); 52 | size_t size = 0; 53 | 54 | if (LIKELY(card_l <= 16 and card_r <= 16)) { 55 | INIT INTERSECT return size; // 1 cmpestr 56 | } 57 | 58 | if (card_l <= 16 and card_r > 16) { 59 | INIT INTERSECT ADVANCE(r) INTERSECT return size; // 2 cmpestr 60 | } 61 | 62 | if (card_r <= 16 and card_l > 16) { 63 | INIT INTERSECT ADVANCE(l) INTERSECT return size; // 2 cmpestr 64 | } 65 | 66 | // card_l > 16 and card_r > 16 -> 4 cmpestr, but scalar may be more 67 | // convenient... 68 | 69 | uint8_t const* end_l = l + card_l; 70 | uint8_t const* end_r = r + card_r; 71 | while (true) { 72 | while (*l < *r) { 73 | if (++l == end_l) return size; 74 | } 75 | while (*l > *r) { 76 | if (++r == end_r) return size; 77 | } 78 | if (*l == *r) { 79 | out[size++] = *l + base; 80 | if (++l == end_l or ++r == end_r) return size; 81 | } 82 | } 83 | 84 | return size; 85 | } 86 | 87 | inline size_t dd_intersect_block(uint8_t const* l, uint8_t const* r, 88 | uint32_t base, uint32_t* out) { 89 | return and_bitmaps(l, r, constants::block_size_in_64bit_words, base, out); 90 | } 91 | 92 | size_t ds_intersect_block(uint8_t const* l, uint8_t const* r, int card, 93 | uint32_t base, uint32_t* out) { 94 | uint64_t const* bitmap = reinterpret_cast(l); 95 | uint32_t k = 0; 96 | for (int i = 0; i != card; ++i) { 97 | uint32_t key = r[i]; 98 | out[k] = key + base; 99 | k += bitmap_contains(bitmap, key); 100 | } 101 | return k; 102 | } 103 | 104 | size_t ss_intersect_chunk(uint8_t const* l, uint8_t const* r, int blocks_l, 105 | int blocks_r, uint32_t base, uint32_t* out) { 106 | assert(blocks_l >= 1 and blocks_l <= 256); 107 | assert(blocks_r >= 1 and blocks_r <= 256); 108 | uint8_t const* data_l = l + blocks_l * 2; 109 | uint8_t const* data_r = r + blocks_r * 2; 110 | uint8_t const* end_l = data_l; 111 | uint8_t 
const* end_r = data_r; 112 | uint32_t* tmp = out; 113 | 114 | while (true) { 115 | while (*l < *r) { 116 | if (l + 2 == end_l) return size_t(tmp - out); 117 | int c = *(l + 1) + 1; 118 | data_l += BYTES_BY_CARDINALITY(c); 119 | l += 2; 120 | } 121 | while (*l > *r) { 122 | if (r + 2 == end_r) return size_t(tmp - out); 123 | int c = *(r + 1) + 1; 124 | data_r += BYTES_BY_CARDINALITY(c); 125 | r += 2; 126 | } 127 | if (*l == *r) { 128 | uint8_t id = *l; 129 | ++l; 130 | ++r; 131 | int cl = *l + 1; 132 | int cr = *r + 1; 133 | int type_l = type::dense; 134 | int type_r = type::dense; 135 | int bl = 32; 136 | int br = 32; 137 | 138 | if (LIKELY(cl < 31)) { 139 | bl = cl; 140 | type_l = type::sparse; 141 | } 142 | 143 | if (LIKELY(cr < 31)) { 144 | br = cr; 145 | type_r = type::sparse; 146 | } 147 | 148 | uint32_t b = base + id * 256; 149 | uint32_t n = 0; 150 | 151 | switch (block_pair(type_l, type_r)) { 152 | case block_pair(type::sparse, type::sparse): 153 | n = ss_intersect_block(data_l, data_r, cl, cr, b, tmp); 154 | break; 155 | case block_pair(type::sparse, type::dense): 156 | n = ds_intersect_block(data_r, data_l, cl, b, tmp); 157 | break; 158 | case block_pair(type::dense, type::sparse): 159 | n = ds_intersect_block(data_l, data_r, cr, b, tmp); 160 | break; 161 | case block_pair(type::dense, type::dense): 162 | n = and_bitmaps(data_l, data_r, 163 | constants::block_size_in_64bit_words, b, 164 | tmp); 165 | break; 166 | default: 167 | assert(false); 168 | __builtin_unreachable(); 169 | } 170 | 171 | tmp += n; 172 | 173 | if (l + 1 == end_l or r + 1 == end_r) return size_t(tmp - out); 174 | 175 | data_l += bl; 176 | data_r += br; 177 | ++l; 178 | ++r; 179 | } 180 | } 181 | 182 | return size_t(tmp - out); 183 | } 184 | 185 | size_t ds_intersect_chunk(uint8_t const* l, uint8_t const* r, int blocks_r, 186 | uint32_t base, uint32_t* out) { 187 | static std::vector x(constants::chunk_size_in_64bit_words); 188 | std::fill(x.begin(), x.end(), 0); 189 | 
uncompress_sparse_chunk(r, blocks_r, x.data()); 190 | return and_bitmaps(l, reinterpret_cast(x.data()), 191 | constants::chunk_size_in_64bit_words, base, out); 192 | } 193 | 194 | size_t pairwise_intersection(s_sequence const& l, s_sequence const& r, 195 | uint32_t* out) { 196 | auto it_l = l.begin(); 197 | auto it_r = r.begin(); 198 | uint32_t* in = out; 199 | while (it_l.has_next() and it_r.has_next()) { 200 | uint16_t id_l = it_l.id(); 201 | uint16_t id_r = it_r.id(); 202 | 203 | if (id_l == id_r) { 204 | uint32_t n = 0; 205 | uint32_t base = id_l << 16; 206 | int blocks_l = 0; 207 | int blocks_r = 0; 208 | 209 | uint16_t type_l = it_l.type(); 210 | uint16_t type_r = it_r.type(); 211 | 212 | switch (chunk_pair(type_l, type_r)) { 213 | case chunk_pair(type::sparse, type::sparse): 214 | blocks_l = it_l.blocks(); 215 | blocks_r = it_r.blocks(); 216 | if (blocks_l < blocks_r) { 217 | n = ss_intersect_chunk(it_l.data, it_r.data, blocks_l, 218 | blocks_r, base, out); 219 | } else { 220 | n = ss_intersect_chunk(it_r.data, it_l.data, blocks_r, 221 | blocks_l, base, out); 222 | } 223 | break; 224 | case chunk_pair(type::sparse, type::dense): 225 | n = ds_intersect_chunk(it_r.data, it_l.data, it_l.blocks(), 226 | base, out); 227 | break; 228 | case chunk_pair(type::sparse, type::full): 229 | n = decode_sparse_chunk(it_l.data, it_l.blocks(), base, 230 | out); 231 | break; 232 | case chunk_pair(type::dense, type::sparse): 233 | n = ds_intersect_chunk(it_l.data, it_r.data, it_r.blocks(), 234 | base, out); 235 | break; 236 | case chunk_pair(type::dense, type::dense): 237 | n = and_bitmaps(it_l.data, it_r.data, 238 | constants::chunk_size_in_64bit_words, base, 239 | out); 240 | break; 241 | case chunk_pair(type::dense, type::full): 242 | n = decode_bitmap( 243 | reinterpret_cast(it_l.data), 244 | constants::chunk_size_in_64bit_words, base, out); 245 | break; 246 | case chunk_pair(type::full, type::sparse): 247 | n = decode_sparse_chunk(it_r.data, it_r.blocks(), base, 248 | 
out); 249 | break; 250 | case chunk_pair(type::full, type::dense): 251 | n = decode_bitmap( 252 | reinterpret_cast(it_r.data), 253 | constants::chunk_size_in_64bit_words, base, out); 254 | break; 255 | case chunk_pair(type::full, type::full): 256 | n = decode_full_chunk(base, out); 257 | break; 258 | default: 259 | assert(false); 260 | __builtin_unreachable(); 261 | } 262 | 263 | out += n; 264 | it_l.next(); 265 | it_r.next(); 266 | 267 | } else if (id_l < id_r) { 268 | it_l.advance(id_r); 269 | } else { 270 | it_r.advance(id_l); 271 | } 272 | } 273 | return size_t(out - in); 274 | } 275 | 276 | } // namespace sliced -------------------------------------------------------------------------------- /include/intersection_many.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "decode.hpp" 4 | #include "uncompress_chunk_and_intersect.hpp" 5 | 6 | namespace sliced { 7 | 8 | size_t intersection(std::vector& sequences, uint32_t* out) { 9 | uint32_t* in = out; 10 | std::sort(sequences.begin(), sequences.end(), 11 | [](auto const& l, auto const& r) { 12 | return l.cardinality() < r.cardinality(); 13 | }); 14 | std::vector iterators(sequences.size()); 15 | for (size_t i = 0; i != sequences.size(); ++i) { 16 | iterators[i] = sequences[i].begin(); 17 | } 18 | 19 | static std::vector headers(65536); 20 | uint64_t num_headers = 0; 21 | { 22 | uint32_t candidate = iterators[0].id(); 23 | size_t i = 1; 24 | while (candidate < 65536) { 25 | for (; i < iterators.size(); ++i) { 26 | iterators[i].skip_to_value(candidate); 27 | uint32_t val = iterators[i].id(); 28 | if (val != candidate) { 29 | candidate = val; 30 | i = 0; 31 | break; 32 | } 33 | } 34 | if (i == iterators.size()) { 35 | headers[num_headers++] = candidate; 36 | iterators[0].next(); 37 | candidate = iterators[0].id(); 38 | i = 1; 39 | } 40 | } 41 | } 42 | 43 | { 44 | static std::vector bitmap(1024); 45 | for (size_t i = 0; i != sequences.size(); ++i) { 
46 | iterators[i] = sequences[i].begin(); 47 | } 48 | for (uint64_t i = 0; i != num_headers; ++i) { 49 | uint32_t header = headers[i]; 50 | uint32_t base = header << 16; 51 | 52 | // std::sort(iterators.begin(), iterators.end(), 53 | // [](auto const& l, auto const& r) { 54 | // return l.cardinality() < r.cardinality(); 55 | // }); 56 | 57 | iterators[0].advance(header); 58 | assert(iterators[0].id() == header); 59 | uint32_t cardinality = 60 | uncompress_chunk(iterators[0], bitmap.data()); 61 | for (uint64_t i = 1; i != iterators.size(); ++i) { 62 | iterators[i].advance(header); 63 | assert(iterators[i].id() == header); 64 | cardinality = uncompress_chunk_and_intersect( 65 | iterators[i], bitmap.data(), cardinality); 66 | if (cardinality == 0) goto SKIP; 67 | } 68 | out += decode_bitmap(bitmap.data(), 1024, base, out); 69 | SKIP:; 70 | } 71 | } 72 | 73 | return size_t(out - in); 74 | } 75 | 76 | } // namespace sliced -------------------------------------------------------------------------------- /include/next_geq.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace sliced { 4 | 5 | #define BLOCK_MIN \ 6 | if (LIKELY(*(begin + 1) < 30)) return *data + base; \ 7 | return min_value_in_bitmap(data, constants::block_size_in_64bit_words) + \ 8 | base; 9 | 10 | #define BLOCK_MIN_ \ 11 | if (LIKELY(*(m_begin + 1) < 30)) { \ 12 | value = *m_data + base; \ 13 | } else { \ 14 | value = min_value_in_bitmap(m_data, \ 15 | constants::block_size_in_64bit_words) + \ 16 | base; \ 17 | } 18 | 19 | #define CHUNK_MIN(it) \ 20 | switch (it.type()) { \ 21 | case type::sparse: \ 22 | value = min_value_in_sparse_chunk(it.data, it.blocks()); \ 23 | break; \ 24 | case type::dense: \ 25 | value = min_value_in_bitmap(it.data, \ 26 | constants::chunk_size_in_64bit_words); \ 27 | break; \ 28 | case type::full: \ 29 | value = 0; \ 30 | break; \ 31 | default: \ 32 | assert(false); \ 33 | __builtin_unreachable(); \ 34 | } \ 35 | 
return value + it.base(); 36 | 37 | uint32_t next_geq_sparse_block(uint8_t const* begin, int cardinality, 38 | uint32_t value) { 39 | for (int i = 0; i != cardinality; ++i) { 40 | if (begin[i] >= value) return begin[i]; 41 | } 42 | return constants::not_found; 43 | } 44 | 45 | uint32_t max_value_in_bitmap(uint8_t const* data, size_t size_in_64bit_words) { 46 | uint64_t const* bitmap = reinterpret_cast(data); 47 | for (int32_t i = size_in_64bit_words - 1; i >= 0; --i) { 48 | uint64_t w = bitmap[i]; 49 | if (w != 0) { 50 | int r = __builtin_clzll(w); 51 | return i * 64 + 63 - r; 52 | } 53 | } 54 | return 0; 55 | } 56 | 57 | uint32_t min_value_in_bitmap(uint8_t const* data, size_t size_in_64bit_words) { 58 | uint64_t const* bitmap = reinterpret_cast(data); 59 | for (uint32_t i = 0; i != size_in_64bit_words; ++i) { 60 | uint64_t w = bitmap[i]; 61 | if (w != 0) return i * 64 + __builtin_ctzll(w); 62 | } 63 | return 0; 64 | } 65 | 66 | uint32_t next_geq_bitmap(uint8_t const* data, uint32_t size_in_64bit_words, 67 | uint32_t value) { 68 | uint64_t const* bitmap = reinterpret_cast(data); 69 | uint32_t k = value / 64; 70 | uint64_t word = bitmap[k]; 71 | const int diff = value - k * 64; 72 | word = (word >> diff) << diff; 73 | while (word == 0) { 74 | k++; 75 | if (k == size_in_64bit_words) return constants::not_found; 76 | word = bitmap[k]; 77 | } 78 | return k * 64 + __builtin_ctzll(word); 79 | } 80 | 81 | uint32_t max_value_in_block(uint8_t const* begin, uint8_t const* data) { 82 | int c = *(begin + 1); 83 | if (LIKELY(c < 30)) { // block type is sparse 84 | return *(data + c); 85 | } 86 | return max_value_in_bitmap(data, constants::block_size_in_64bit_words); 87 | } 88 | 89 | uint32_t next_geq_sparse_chunk(uint8_t const* begin, int blocks, 90 | uint32_t value) { 91 | assert(blocks >= 1 and blocks <= 256); 92 | uint8_t const* data = begin + blocks * 2; 93 | uint8_t const* end = data; 94 | uint32_t block_id = value >> 8; 95 | uint32_t id = *begin; 96 | 97 | while (id < 
block_id and begin != end) { 98 | int c = *(begin + 1) + 1; 99 | data += BYTES_BY_CARDINALITY(c); 100 | begin += 2; 101 | id = *begin; 102 | } 103 | 104 | if (begin != end) { 105 | uint32_t base = id * 256; 106 | if (base >= value) { // saturate 107 | BLOCK_MIN 108 | } 109 | 110 | value &= 0xFF; 111 | if (value > max_value_in_block(begin, data)) { // saturate 112 | if (begin + 2 == end) return constants::not_found; 113 | int c = *(begin + 1) + 1; 114 | data += BYTES_BY_CARDINALITY(c); 115 | begin += 2; 116 | id = *begin; 117 | base = id * 256; 118 | BLOCK_MIN 119 | } 120 | 121 | base = id * 256; 122 | int c = *(begin + 1) + 1; 123 | if (LIKELY(c < 31)) { // block type is sparse 124 | return next_geq_sparse_block(data, c, value) + base; 125 | } 126 | return next_geq_bitmap(data, constants::block_size_in_64bit_words, 127 | value) + 128 | base; 129 | } 130 | 131 | return constants::not_found; 132 | } 133 | 134 | uint32_t min_value_in_sparse_chunk(uint8_t const* begin, int blocks) { 135 | assert(blocks >= 1 and blocks <= 256); 136 | uint8_t const* data = begin + blocks * 2; 137 | uint32_t id = *begin; 138 | uint32_t base = id * 256; 139 | BLOCK_MIN 140 | } 141 | 142 | uint32_t s_sequence::next_geq(uint32_t value) const { 143 | auto it = begin(); 144 | uint32_t chunk_id = value >> 16; 145 | it.skip_to_value(chunk_id); 146 | 147 | if (it.base() >= value) { // saturate 148 | CHUNK_MIN(it) 149 | } 150 | 151 | if (it.has_next()) { 152 | value &= 0xFFFF; 153 | switch (it.type()) { 154 | case type::sparse: 155 | value = next_geq_sparse_chunk(it.data, it.blocks(), value); 156 | break; 157 | case type::dense: 158 | value = next_geq_bitmap( 159 | it.data, constants::chunk_size_in_64bit_words, value); 160 | break; 161 | case type::full: 162 | break; 163 | default: 164 | assert(false); 165 | __builtin_unreachable(); 166 | } 167 | 168 | if (value != constants::not_found) return value + it.base(); 169 | 170 | // saturate 171 | it.next(); 172 | if (it.has_next()) CHUNK_MIN(it) 173 | 
} 174 | 175 | return constants::not_found; 176 | } 177 | 178 | } // namespace sliced -------------------------------------------------------------------------------- /include/next_geq_enumerator.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "next_geq.hpp" 4 | 5 | namespace sliced { 6 | 7 | struct next_geq_enumerator { 8 | next_geq_enumerator() {} 9 | 10 | next_geq_enumerator(s_sequence const& s) 11 | : m_chunk_id(-1) 12 | , m_id(0) 13 | , m_begin(nullptr) 14 | , m_end(nullptr) 15 | , m_data(nullptr) 16 | , m_size(s.size()) 17 | , m_it(s.begin()) {} 18 | 19 | uint64_t size() const { 20 | return m_size; 21 | } 22 | 23 | uint32_t next_geq(uint32_t value) { 24 | uint32_t chunk_id = value >> 16; 25 | m_it.skip_to_value(chunk_id); 26 | 27 | if (m_it.base() >= value) { // saturate 28 | CHUNK_MIN(m_it) 29 | } 30 | 31 | if (m_it.has_next()) { 32 | value &= 0xFFFF; 33 | switch (m_it.type()) { 34 | case type::sparse: { 35 | uint32_t block_id = value >> 8; 36 | if (m_chunk_id != m_it.id()) { 37 | uint32_t blocks = m_it.blocks(); 38 | assert(blocks >= 1 and blocks <= 256); 39 | m_begin = m_it.data; 40 | m_data = m_begin + blocks * 2; 41 | m_end = m_data; 42 | m_id = *m_begin; 43 | m_chunk_id = m_it.id(); 44 | } 45 | 46 | while (m_id < block_id and m_begin != m_end) { 47 | int c = *(m_begin + 1) + 1; 48 | m_data += BYTES_BY_CARDINALITY(c); 49 | m_begin += 2; 50 | m_id = *m_begin; 51 | } 52 | 53 | if (m_begin != m_end) { 54 | uint32_t base = m_id * 256; 55 | if (base >= value) { // saturate 56 | BLOCK_MIN_ 57 | break; 58 | } 59 | 60 | value &= 0xFF; 61 | if (value > 62 | max_value_in_block(m_begin, m_data)) { // saturate 63 | if (m_begin + 2 == m_end) { 64 | value = constants::not_found; 65 | break; 66 | } 67 | int c = *(m_begin + 1) + 1; 68 | m_data += BYTES_BY_CARDINALITY(c); 69 | m_begin += 2; 70 | m_id = *m_begin; 71 | base = m_id * 256; 72 | BLOCK_MIN_ 73 | break; 74 | } 75 | base = m_id * 256; 76 | 
int c = *(m_begin + 1) + 1; 77 | if (LIKELY(c < 31)) { // block type is sparse 78 | value = 79 | next_geq_sparse_block(m_data, c, value) + base; 80 | } else { 81 | value = next_geq_bitmap( 82 | m_data, 83 | constants::block_size_in_64bit_words, 84 | value) + 85 | base; 86 | } 87 | } else { 88 | value = constants::not_found; 89 | } 90 | } break; 91 | case type::dense: 92 | value = next_geq_bitmap( 93 | m_it.data, constants::chunk_size_in_64bit_words, value); 94 | break; 95 | case type::full: 96 | break; 97 | default: 98 | assert(false); 99 | __builtin_unreachable(); 100 | } 101 | 102 | if (value != constants::not_found) return value + m_it.base(); 103 | 104 | // saturate 105 | m_it.next(); 106 | if (m_it.has_next()) CHUNK_MIN(m_it) 107 | } 108 | 109 | return constants::not_found; 110 | } 111 | 112 | private: 113 | uint32_t m_chunk_id; 114 | uint8_t m_id; 115 | uint8_t const* m_begin; 116 | uint8_t const* m_end; 117 | uint8_t const* m_data; 118 | uint64_t m_size; 119 | s_sequence::iterator m_it; 120 | }; 121 | 122 | } // namespace sliced -------------------------------------------------------------------------------- /include/s_index.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 7 | 8 | #include "s_sequence.hpp" 9 | 10 | namespace sliced { 11 | 12 | struct s_index { 13 | struct builder; 14 | 15 | inline size_t size() const { 16 | return m_size; 17 | } 18 | 19 | inline size_t universe() const { 20 | return m_universe; 21 | } 22 | 23 | s_sequence operator[](size_t i) const { 24 | assert(i < size()); 25 | return s_sequence(m_sequences + m_offsets[i]); 26 | } 27 | 28 | void mmap(char const* binary_filename) { 29 | m_input.open(binary_filename, mm::advice::sequential); 30 | auto ptr = reinterpret_cast(m_input.data()); 31 | m_size = *ptr++; 32 | m_universe = *ptr++; 33 | m_offsets = ptr; 34 | m_sequences = m_input.data() 
+ m_size * sizeof(uint64_t) + 35 | sizeof(m_size) + sizeof(m_universe); 36 | m_size -= 1; 37 | } 38 | 39 | private: 40 | mm::file_source m_input; 41 | uint64_t const* m_offsets; 42 | uint8_t const* m_sequences; 43 | uint64_t m_size; 44 | uint64_t m_universe; 45 | }; 46 | 47 | } // namespace sliced -------------------------------------------------------------------------------- /include/s_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "constants.hpp" 4 | 5 | namespace sliced { 6 | 7 | struct s_sequence { 8 | struct builder; 9 | 10 | s_sequence() 11 | : chunks(0) 12 | , m_pointers(nullptr) 13 | , m_header(nullptr) 14 | , m_data(nullptr) {} 15 | 16 | s_sequence(uint8_t const* addr) { 17 | uint16_t const* ptr = reinterpret_cast(addr); 18 | chunks = 1 + *ptr++; 19 | m_pointers = reinterpret_cast(ptr); 20 | uint64_t pointers_bytes = 21 | chunks / constants::associativity * sizeof(uint32_t) * 2; 22 | uint64_t header_bytes = chunks * 4 * sizeof(uint16_t); 23 | m_header = ptr + pointers_bytes / sizeof(uint16_t); 24 | m_data = addr + sizeof(uint16_t) + pointers_bytes + header_bytes; 25 | } 26 | 27 | size_t decode(uint32_t* out) const; 28 | size_t uncompress(uint64_t* out) const; 29 | bool select(uint32_t rank, uint32_t& value) const; 30 | bool contains(uint32_t value) const; 31 | uint32_t next_geq(uint32_t value) const; 32 | 33 | uint32_t const* pointers() const { 34 | return m_pointers; 35 | } 36 | 37 | uint16_t const* header() const { 38 | return m_header; 39 | } 40 | 41 | uint8_t const* data() const { 42 | return m_data; 43 | } 44 | 45 | inline uint32_t cardinality() const { 46 | auto const* h = header(); 47 | uint32_t c = 0; 48 | for (uint32_t i = 0; i != chunks; ++i) { 49 | c += *(h + 1) + 1; 50 | h += 4; 51 | } 52 | return c; 53 | } 54 | 55 | inline uint32_t size() const { 56 | return cardinality(); 57 | } 58 | 59 | struct iterator { 60 | iterator() 61 | : pointers(nullptr) 62 | , 
// Advances the iterator to the chunk containing the element of rank `rank`
// and returns the number of elements that precede that chunk.
// If `rank` is past the last element, the iterator ends exhausted
// (has_next() == false) and the total number of elements walked is returned.
uint32_t skip_to_position(uint32_t rank) {
    uint32_t elements = 0;
    // Coarse phase: jump constants::associativity chunks at a time using the
    // skip pointers. Each entry is a pair: [0] is an element count, [1] is
    // the number of bytes to advance `data`.
    while (skip_position() < end) {
        uint32_t c = *pointers;
        if (elements + c > rank) break;  // target lies inside this group
        elements += c;
        data += *(pointers + 1);
        pointers += 2;
        // header must be updated before begin: both helpers read the
        // current position.
        header = skip_header();
        begin = skip_position();
    }

    // Fine phase: walk chunk by chunk using each chunk's cardinality.
    while (has_next()) {
        uint32_t c = cardinality();
        if (elements + c > rank) return elements;
        elements += c;
        next();
    }

    return elements;
}
// Returns the position of the `rank`-th (0-based) set bit in the bitmap made
// of `size_in_64bit_words` 64-bit words starting at `data`.
// `rank` must be smaller than the number of set bits.
uint32_t select_bitmap(uint8_t const* data, size_t size_in_64bit_words,
                       uint32_t rank) {
    uint64_t const* words = reinterpret_cast<uint64_t const*>(data);
    uint32_t seen = 0;  // set bits in the words already skipped
    for (size_t word_index = 0; word_index != size_in_64bit_words;
         ++word_index) {
        uint64_t w = words[word_index];
        int ones = __builtin_popcountll(w);
        if (seen + ones <= rank) {
            seen += ones;
            continue;
        }

        // The answer is inside this word.
        uint32_t word_base = word_index * 64;
        assert(rank >= seen);
        rank -= seen;

        // Deposit the single bit (1 << rank) onto the set bits of w: the
        // result has exactly the rank-th set bit of w turned on.
        uint64_t mask = 1ULL << rank;
        asm("pdep %[w], %[mask], %[w]" : [ w ] "+r"(w) : [ mask ] "r"(mask));
        // Index of that bit.
        uint64_t bit_index;
        asm("tzcnt %[bit], %[index]"
            : [ index ] "=r"(bit_index)
            : [ bit ] "g"(w)
            : "cc");
        return bit_index + word_base;
    }
    assert(false);
    __builtin_unreachable();
    return seen;
}
(begin != end) { 48 | uint8_t id = *begin; 49 | int c = *(begin + 1) + 1; 50 | int bytes = 32; 51 | int type = type::dense; 52 | if (LIKELY(c < 31)) { 53 | bytes = c; 54 | type = type::sparse; 55 | } 56 | if (elements + c > rank) { 57 | rank -= elements; 58 | assert(int(rank) < c); 59 | uint32_t base = id * 256; 60 | if (type == type::sparse) { 61 | return *(data + rank) + base; 62 | } else { 63 | return select_bitmap(data, constants::block_size_in_64bit_words, 64 | rank) + 65 | base; 66 | } 67 | } 68 | elements += c; 69 | data += bytes; 70 | begin += 2; 71 | } 72 | assert(false); 73 | __builtin_unreachable(); 74 | return elements; 75 | } 76 | 77 | bool s_sequence::select(uint32_t rank, uint32_t& value) const { 78 | auto it = begin(); 79 | uint32_t elements = it.skip_to_position(rank); 80 | if (it.has_next()) { 81 | rank -= elements; 82 | assert(rank < constants::chunk_size); 83 | switch (it.type()) { 84 | case type::sparse: 85 | value = select_sparse_chunk(it.data, it.blocks(), rank); 86 | break; 87 | case type::dense: 88 | value = select_bitmap( 89 | it.data, constants::chunk_size_in_64bit_words, rank); 90 | break; 91 | case type::full: 92 | value = rank; 93 | break; 94 | default: 95 | assert(false); 96 | __builtin_unreachable(); 97 | } 98 | value += it.id() << 16; 99 | return true; 100 | } 101 | return false; 102 | } 103 | 104 | } // namespace sliced -------------------------------------------------------------------------------- /include/uncompress.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "constants.hpp" 4 | 5 | namespace sliced { 6 | 7 | inline void uncompress_sparse_block(uint8_t const* begin, int cardinality, 8 | uint64_t* out) { 9 | // for (int i = 0; i != cardinality; ++i) { 10 | // set_bit(*begin++, out); 11 | // } 12 | uint64_t offset, load, pos; 13 | const uint64_t shift = 6; 14 | uint8_t const* end = begin + cardinality; 15 | __asm volatile( 16 | "1:\n" 17 | "movzbq 
(%[begin]), %[pos]\n" 18 | "shrx %[shift], %[pos], %[offset]\n" 19 | "mov (%[out],%[offset],8), %[load]\n" 20 | "bts %[pos], %[load]\n" 21 | "mov %[load], (%[out],%[offset],8)\n" 22 | "add $1, %[begin]\n" 23 | "cmp %[begin], %[end]\n" 24 | "jnz 1b" 25 | : [ begin ] "+&r"(begin), [ load ] "=&r"(load), [ pos ] "=&r"(pos), 26 | [ offset ] "=&r"(offset) 27 | : [ end ] "r"(end), [ out ] "r"(out), [ shift ] "r"(shift)); 28 | } 29 | 30 | inline void uncompress_dense_block(uint8_t const* begin, uint64_t* out) { 31 | memcpy(out, begin, constants::block_size / 8); 32 | } 33 | 34 | void uncompress_sparse_chunk(uint8_t const* begin, int blocks, uint64_t* out) { 35 | assert(blocks >= 1 and blocks <= 256); 36 | uint8_t const* data = begin + blocks * 2; 37 | uint64_t* bitmap = out; 38 | uint8_t prev = 0; 39 | for (int i = 0; i != blocks; ++i) { 40 | uint8_t id = *begin; 41 | ++begin; 42 | int c = *begin; 43 | c += 1; 44 | int bytes = 32; 45 | int type = type::dense; 46 | if (LIKELY(c < 31)) { 47 | bytes = c; 48 | type = type::sparse; 49 | } 50 | bitmap += (id - prev) * constants::block_size_in_64bit_words; 51 | if (type == type::sparse) { 52 | uncompress_sparse_block(data, c, bitmap); 53 | } else if (type == type::dense) { 54 | uncompress_dense_block(data, bitmap); 55 | } 56 | data += bytes; 57 | ++begin; 58 | prev = id; 59 | } 60 | } 61 | 62 | inline void uncompress_dense_chunk(uint8_t const* begin, uint64_t* out) { 63 | memcpy(out, begin, constants::chunk_size / 8); 64 | } 65 | 66 | inline void uncompress_full_chunk(uint64_t* out) { 67 | for (uint32_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) { 68 | out[i] = uint64_t(-1); 69 | } 70 | } 71 | 72 | inline size_t uncompress_chunk(s_sequence::iterator const& it, uint64_t* out) { 73 | switch (it.type()) { 74 | case type::sparse: { 75 | for (uint64_t i = 0; i != 1024; ++i) out[i] = 0; 76 | uncompress_sparse_chunk(it.data, it.blocks(), out); 77 | break; 78 | } 79 | case type::dense: { 80 | uncompress_dense_chunk(it.data, 
out); 81 | break; 82 | } 83 | case type::full: { 84 | uncompress_full_chunk(out); 85 | break; 86 | } 87 | default: 88 | assert(false); 89 | __builtin_unreachable(); 90 | } 91 | return it.cardinality(); 92 | } 93 | 94 | size_t s_sequence::uncompress(uint64_t* out) const { 95 | auto it = begin(); 96 | size_t uncompressed = 0; 97 | uint16_t prev = 0; 98 | for (uint32_t i = 0; i != chunks; ++i) { 99 | uint16_t id = it.id(); 100 | out += (id - prev) * constants::chunk_size_in_64bit_words; 101 | uncompressed += uncompress_chunk(it, out); 102 | prev = id; 103 | it.next(); 104 | } 105 | assert(uncompressed > 0); 106 | return uncompressed; 107 | } 108 | 109 | } // namespace sliced -------------------------------------------------------------------------------- /include/uncompress_chunk_and_intersect.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "uncompress.hpp" 4 | 5 | namespace sliced { 6 | 7 | // uint32_t uncompress_sparse_block_and_intersect(uint8_t const* begin, 8 | // int cardinality, uint64_t* 9 | // out) { 10 | // static std::vector tmp(constants::block_size_in_64bit_words); 11 | // tmp[0] = 0; 12 | // tmp[1] = 0; 13 | // tmp[2] = 0; 14 | // tmp[3] = 0; 15 | // uncompress_sparse_block(begin, cardinality, tmp.data()); 16 | // uint32_t c = 0; 17 | // for (size_t i = 0; i != constants::block_size_in_64bit_words; ++i) { 18 | // out[i] &= tmp[i]; 19 | // c += __builtin_popcountll(out[i]); 20 | // } 21 | // return c; 22 | // } 23 | 24 | // uint32_t uncompress_dense_block_and_intersect(uint8_t const* begin, 25 | // uint64_t* out) { 26 | // uint64_t const* in = reinterpret_cast(begin); 27 | // uint32_t c = 0; 28 | // for (size_t i = 0; i != constants::block_size_in_64bit_words; ++i) { 29 | // out[i] &= in[i]; 30 | // c += __builtin_popcountll(out[i]); 31 | // } 32 | // return c; 33 | // } 34 | 35 | uint32_t uncompress_sparse_chunk_and_intersect(uint8_t const* begin, int blocks, 36 | uint64_t* out) { 37 
| static std::vector tmp(constants::chunk_size_in_64bit_words); 38 | std::fill(tmp.begin(), tmp.end(), 0); 39 | uncompress_sparse_chunk(begin, blocks, tmp.data()); 40 | uint32_t c = 0; 41 | for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) { 42 | out[i] &= tmp[i]; 43 | c += __builtin_popcountll(out[i]); 44 | } 45 | return c; 46 | 47 | // assert(blocks >= 1 and blocks <= 256); 48 | // uint8_t const* data = begin + blocks * 2; 49 | // uint64_t* tmp = out; 50 | 51 | // uint8_t prev = 0; 52 | // uint32_t uncompressed = 0; 53 | // for (int i = 0; i != blocks; ++i) { 54 | // uint8_t id = *begin; 55 | // ++begin; 56 | // int c = *begin + 1; 57 | // int bytes = 32; 58 | // int type = type::dense; 59 | // if (LIKELY(c < 31)) { 60 | // bytes = c; 61 | // type = type::sparse; 62 | // } 63 | 64 | // // zero out any blocks in the middle 65 | // for (uint64_t k = constants::block_size_in_64bit_words; 66 | // k < (id - prev) * constants::block_size_in_64bit_words; ++k) { 67 | // tmp[k] = 0; 68 | // } 69 | 70 | // tmp += (id - prev) * constants::block_size_in_64bit_words; 71 | // uint32_t u = 0; 72 | // if (type == type::sparse) { 73 | // u = uncompress_sparse_block_and_intersect(data, c, tmp); 74 | // } else if (type == type::dense) { 75 | // u = uncompress_dense_block_and_intersect(data, tmp); 76 | // } 77 | // uncompressed += u; 78 | // data += bytes; 79 | // ++begin; 80 | // prev = id; 81 | // } 82 | 83 | // // zero out any trailing blocks 84 | // tmp += constants::block_size_in_64bit_words; 85 | // for (uint64_t k = 0; 86 | // k != (255 - prev) * constants::block_size_in_64bit_words; ++k) { 87 | // tmp[k] = 0; 88 | // } 89 | 90 | // return uncompressed; 91 | } 92 | 93 | uint32_t uncompress_dense_chunk_and_intersect(uint8_t const* begin, 94 | uint64_t* out) { 95 | uint64_t const* in = reinterpret_cast(begin); 96 | uint32_t c = 0; 97 | for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) { 98 | out[i] &= in[i]; 99 | c += 
__builtin_popcountll(out[i]); 100 | } 101 | return c; 102 | } 103 | 104 | inline size_t uncompress_chunk_and_intersect(s_sequence::iterator const& it, 105 | uint64_t* out, 106 | uint64_t cardinality) { 107 | switch (it.type()) { 108 | case type::sparse: 109 | return uncompress_sparse_chunk_and_intersect(it.data, it.blocks(), 110 | out); 111 | case type::dense: 112 | return uncompress_dense_chunk_and_intersect(it.data, out); 113 | case type::full: 114 | return cardinality; 115 | default: 116 | assert(false); 117 | __builtin_unreachable(); 118 | } 119 | } 120 | 121 | } // namespace sliced -------------------------------------------------------------------------------- /include/uncompress_chunk_and_merge.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "uncompress.hpp" 4 | 5 | namespace sliced { 6 | 7 | void uncompress_sparse_chunk_and_merge(uint8_t const* begin, int blocks, 8 | uint64_t* out) { 9 | static std::vector tmp(constants::chunk_size_in_64bit_words); 10 | std::fill(tmp.begin(), tmp.end(), 0); 11 | uncompress_sparse_chunk(begin, blocks, tmp.data()); 12 | for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) { 13 | out[i] |= tmp[i]; 14 | } 15 | } 16 | 17 | void uncompress_dense_chunk_and_merge(uint8_t const* begin, uint64_t* out) { 18 | uint64_t const* in = reinterpret_cast(begin); 19 | for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) { 20 | out[i] |= in[i]; 21 | } 22 | } 23 | 24 | void uncompress_full_chunk_and_merge(uint64_t* out) { 25 | for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) { 26 | out[i] = uint64_t(-1); 27 | } 28 | } 29 | 30 | inline void uncompress_chunk_and_merge(s_sequence::iterator const& it, 31 | uint64_t* out) { 32 | switch (it.type()) { 33 | case type::sparse: { 34 | uncompress_sparse_chunk_and_merge(it.data, it.blocks(), out); 35 | break; 36 | } 37 | case type::dense: { 38 | uncompress_dense_chunk_and_merge(it.data, 
out); 39 | break; 40 | } 41 | case type::full: { 42 | uncompress_full_chunk_and_merge(out); 43 | break; 44 | } 45 | default: 46 | assert(false); 47 | __builtin_unreachable(); 48 | } 49 | } 50 | 51 | } // namespace sliced -------------------------------------------------------------------------------- /include/union.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "constants.hpp" 4 | #include "util.hpp" 5 | #include "uncompress.hpp" 6 | #include "decode.hpp" 7 | 8 | namespace sliced { 9 | 10 | #define DECODE(ptr) \ 11 | uint8_t id = *ptr; \ 12 | int c = *(ptr + 1) + 1; \ 13 | int type = type::dense; \ 14 | int bytes = 32; \ 15 | if (LIKELY(c < 31)) { \ 16 | bytes = c; \ 17 | type = type::sparse; \ 18 | } \ 19 | out += decode_block(data_##ptr, type, c, base + id * 256, out); \ 20 | data_##ptr += bytes; \ 21 | ptr += 2; 22 | 23 | size_t ss_union_block(uint8_t const* l, uint8_t const* r, int card_l, 24 | int card_r, uint32_t base, uint32_t* out) { 25 | assert(card_l > 0 and 26 | card_l <= int(constants::block_sparseness_threshold - 2)); 27 | assert(card_r > 0 and 28 | card_r <= int(constants::block_sparseness_threshold - 2)); 29 | size_t size = 0; 30 | 31 | uint8_t const* end_l = l + card_l; 32 | uint8_t const* end_r = r + card_r; 33 | 34 | while (true) { 35 | if (*l < *r) { 36 | out[size++] = *l + base; 37 | ++l; 38 | if (l == end_l) break; 39 | } else if (*r < *l) { 40 | out[size++] = *r + base; 41 | ++r; 42 | if (r == end_r) break; 43 | } else { 44 | out[size++] = *l + base; 45 | ++l; 46 | ++r; 47 | if (l == end_l or r == end_r) break; 48 | } 49 | } 50 | 51 | if (l != end_l) { 52 | size += decode_sparse_block(l, end_l - l, base, out + size); 53 | } 54 | 55 | if (r != end_l) { 56 | size += decode_sparse_block(r, end_r - r, base, out + size); 57 | } 58 | 59 | return size; 60 | } 61 | 62 | size_t ds_union_block(uint8_t const* l, uint8_t const* r, int cardinality, 63 | uint32_t base, uint32_t* out) { 
64 | static uint64_t x[4]; 65 | memcpy(x, reinterpret_cast(l), constants::block_size / 8); 66 | uncompress_sparse_block(r, cardinality, x); 67 | return or_bitmaps(l, reinterpret_cast(x), 68 | constants::block_size_in_64bit_words, base, out); 69 | } 70 | 71 | inline uint32_t decode_block(uint8_t const* data, int type, int cardinality, 72 | uint32_t base, uint32_t* out) { 73 | if (type == type::sparse) { 74 | return decode_sparse_block(data, cardinality, base, out); 75 | } 76 | return decode_bitmap(reinterpret_cast(data), 77 | constants::block_size_in_64bit_words, base, out); 78 | } 79 | 80 | size_t ss_union_chunk(uint8_t const* l, uint8_t const* r, int blocks_l, 81 | int blocks_r, uint32_t base, uint32_t* out) { 82 | assert(blocks_l >= 1 and blocks_l <= 256); 83 | assert(blocks_r >= 1 and blocks_r <= 256); 84 | 85 | uint8_t const* data_l = l + blocks_l * 2; 86 | uint8_t const* data_r = r + blocks_r * 2; 87 | uint8_t const* end_l = data_l; 88 | uint8_t const* end_r = data_r; 89 | uint32_t* in = out; 90 | 91 | while (true) { 92 | if (*l < *r) { 93 | DECODE(l) 94 | if (l == end_l) break; 95 | } else if (*l > *r) { 96 | DECODE(r) 97 | if (r == end_r) break; 98 | } else { 99 | uint8_t id = *l; 100 | ++l; 101 | ++r; 102 | int cl = *l + 1; 103 | int cr = *r + 1; 104 | int type_l = type::dense; 105 | int type_r = type::dense; 106 | int bl = 32; 107 | int br = 32; 108 | 109 | if (LIKELY(cl < 31)) { 110 | bl = cl; 111 | type_l = type::sparse; 112 | } 113 | 114 | if (LIKELY(cr < 31)) { 115 | br = cr; 116 | type_r = type::sparse; 117 | } 118 | 119 | uint32_t b = base + id * 256; 120 | uint32_t n = 0; 121 | 122 | switch (block_pair(type_l, type_r)) { 123 | case block_pair(type::sparse, type::sparse): 124 | n = ss_union_block(data_l, data_r, cl, cr, b, out); 125 | break; 126 | case block_pair(type::sparse, type::dense): 127 | n = ds_union_block(data_r, data_l, cl, b, out); 128 | break; 129 | case block_pair(type::dense, type::sparse): 130 | n = ds_union_block(data_l, data_r, cr, 
b, out); 131 | break; 132 | case block_pair(type::dense, type::dense): 133 | n = or_bitmaps(data_l, data_r, 134 | constants::block_size_in_64bit_words, b, 135 | out); 136 | break; 137 | default: 138 | assert(false); 139 | __builtin_unreachable(); 140 | } 141 | 142 | out += n; 143 | data_l += bl; 144 | data_r += br; 145 | ++l; 146 | ++r; 147 | 148 | if (l == end_l or r == end_r) { break; } 149 | } 150 | } 151 | 152 | while (l != end_l) { DECODE(l) } 153 | while (r != end_r) { DECODE(r) } 154 | 155 | return size_t(out - in); 156 | } 157 | 158 | size_t ds_union_chunk(uint8_t const* l, uint8_t const* r, int blocks_r, 159 | uint32_t base, uint32_t* out) { 160 | static std::vector x(1024); 161 | std::fill(x.begin(), x.end(), 0); 162 | uncompress_sparse_chunk(r, blocks_r, x.data()); 163 | return or_bitmaps(l, reinterpret_cast(x.data()), 164 | constants::chunk_size_in_64bit_words, base, out); 165 | } 166 | 167 | size_t pairwise_union(s_sequence const& l, s_sequence const& r, uint32_t* out) { 168 | auto it_l = l.begin(); 169 | auto it_r = r.begin(); 170 | uint32_t* in = out; 171 | 172 | while (true) { 173 | uint16_t id_l = it_l.id(); 174 | uint16_t id_r = it_r.id(); 175 | 176 | if (id_l == id_r) { 177 | uint32_t n = 0; 178 | uint32_t base = id_l << 16; 179 | int blocks_l = 0; 180 | int blocks_r = 0; 181 | 182 | uint16_t type_l = it_l.type(); 183 | uint16_t type_r = it_r.type(); 184 | 185 | switch (chunk_pair(type_l, type_r)) { 186 | case chunk_pair(type::sparse, type::sparse): 187 | blocks_l = it_l.blocks(); 188 | blocks_r = it_r.blocks(); 189 | if (blocks_l < blocks_r) { 190 | n = ss_union_chunk(it_l.data, it_r.data, blocks_l, 191 | blocks_r, base, out); 192 | } else { 193 | n = ss_union_chunk(it_r.data, it_l.data, blocks_r, 194 | blocks_l, base, out); 195 | } 196 | break; 197 | case chunk_pair(type::sparse, type::dense): 198 | n = ds_union_chunk(it_r.data, it_l.data, it_l.blocks(), 199 | base, out); 200 | break; 201 | case chunk_pair(type::sparse, type::full): 202 | n = 
decode_full_chunk(base, out); 203 | break; 204 | case chunk_pair(type::dense, type::sparse): 205 | n = ds_union_chunk(it_l.data, it_r.data, it_r.blocks(), 206 | base, out); 207 | break; 208 | case chunk_pair(type::dense, type::dense): 209 | n = or_bitmaps(it_l.data, it_r.data, 210 | constants::chunk_size_in_64bit_words, base, 211 | out); 212 | break; 213 | case chunk_pair(type::dense, type::full): 214 | n = decode_full_chunk(base, out); 215 | break; 216 | case chunk_pair(type::full, type::sparse): 217 | n = decode_full_chunk(base, out); 218 | break; 219 | case chunk_pair(type::full, type::dense): 220 | n = decode_full_chunk(base, out); 221 | break; 222 | case chunk_pair(type::full, type::full): 223 | n = decode_full_chunk(base, out); 224 | break; 225 | default: 226 | assert(false); 227 | __builtin_unreachable(); 228 | } 229 | 230 | out += n; 231 | 232 | it_l.next(); 233 | it_r.next(); 234 | if (!it_l.has_next() or !it_r.has_next()) break; 235 | 236 | } else if (id_l < id_r) { 237 | out += decode_chunk(it_l, out); 238 | it_l.next(); 239 | if (!it_l.has_next()) break; 240 | } else { 241 | out += decode_chunk(it_r, out); 242 | it_r.next(); 243 | if (!it_r.has_next()) break; 244 | } 245 | } 246 | 247 | while (it_l.has_next()) { 248 | out += decode_chunk(it_l, out); 249 | it_l.next(); 250 | } 251 | 252 | while (it_r.has_next()) { 253 | out += decode_chunk(it_r, out); 254 | it_r.next(); 255 | } 256 | 257 | return size_t(out - in); 258 | } 259 | 260 | } // namespace sliced -------------------------------------------------------------------------------- /include/union_many.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "decode.hpp" 4 | #include "uncompress_chunk_and_merge.hpp" 5 | 6 | namespace sliced { 7 | 8 | size_t union_many(std::vector& sequences, uint32_t* out) { 9 | uint32_t* in = out; 10 | std::vector iterators(sequences.size()); 11 | for (size_t i = 0; i != sequences.size(); ++i) { 12 | 
iterators[i] = sequences[i].begin(); 13 | } 14 | 15 | static std::vector bitmap(1024); 16 | uint32_t header = std::min_element(iterators.begin(), iterators.end(), 17 | [](auto const& l, auto const& r) { 18 | return l.id() < r.id(); 19 | }) 20 | ->id(); 21 | bool first = true; 22 | while (header < 65536) { 23 | uint32_t base = header << 16; 24 | for (size_t i = 0; i != iterators.size(); ++i) { 25 | if (iterators[i].id() == header) { 26 | if (first) { 27 | uncompress_chunk(iterators[i], bitmap.data()); 28 | first = false; 29 | } else { 30 | uncompress_chunk_and_merge(iterators[i], bitmap.data()); 31 | } 32 | } 33 | } 34 | first = true; 35 | out += decode_bitmap(bitmap.data(), 1024, base, out); 36 | uint32_t next = 65536; 37 | for (size_t i = 0; i != iterators.size(); ++i) { 38 | if (iterators[i].id() == header) iterators[i].next(); 39 | if (iterators[i].id() < next) next = iterators[i].id(); 40 | } 41 | header = next; 42 | } 43 | 44 | return size_t(out - in); 45 | } 46 | 47 | } // namespace sliced -------------------------------------------------------------------------------- /include/util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include // for log2 and ceil 4 | #include 5 | #include "constants.hpp" 6 | 7 | namespace sliced { 8 | 9 | #define chunk_pair(l, r) (3 * (l) + (r)) 10 | #define block_pair(l, r) (2 * (l) + (r)) 11 | 12 | #define BYTES_BY_CARDINALITY(c) LIKELY(c < 31) ? 
// Body shared by and_bitmaps / or_bitmaps: combines two bitmaps word by word
// with OP and writes the position of every set bit of the result, plus
// `base`, to `out`. Expands to a complete function body: it declares locals
// and ends with `return size;`.
#define OPERATE_BITMAPS(OP, l, r, size_in_64bit_words, base, out)     \
    uint64_t const* bitmap_l = reinterpret_cast<uint64_t const*>(l);  \
    uint64_t const* bitmap_r = reinterpret_cast<uint64_t const*>(r);  \
    size_t size = 0;                                                  \
    for (size_t i = 0; i != size_in_64bit_words; ++i) {               \
        uint64_t w = bitmap_l[i] OP bitmap_r[i];                      \
        while (w != 0) {                                              \
            uint64_t t = w & (~w + 1);                                \
            int r = __builtin_ctzll(w);                               \
            out[size++] = r + base;                                   \
            w ^= t;                                                   \
        }                                                             \
        base += 64;                                                   \
    }                                                                 \
    return size;

// Intersection of two bitmaps of `size_in_64bit_words` words each.
// Writes the positions set in both, offset by `base`, to `out` in increasing
// order and returns how many were written.
// `inline` because this is defined in a header.
inline size_t and_bitmaps(uint8_t const* l, uint8_t const* r,
                          size_t size_in_64bit_words, uint32_t base,
                          uint32_t* out) {
    OPERATE_BITMAPS(&, l, r, size_in_64bit_words, base, out)
}

// Union of two bitmaps of `size_in_64bit_words` words each.
// Writes the positions set in either, offset by `base`, to `out` in
// increasing order and returns how many were written.
// `inline` because this is defined in a header.
inline size_t or_bitmaps(uint8_t const* l, uint8_t const* r,
                         size_t size_in_64bit_words, uint32_t base,
                         uint32_t* out) {
    OPERATE_BITMAPS(|, l, r, size_in_64bit_words, base, out)
}
/* For a sorted list of size n whose universe is u:
   n * (ceil(log2(u / n)) + 2) bits, with the logarithmic term taken as 0
   when u <= n. An empty list takes 0 bits. */
uint64_t elias_fano_bitsize(uint64_t n, uint64_t u) {
    // Without this guard u / n is a division by zero, and the resulting
    // 0 * inf is NaN, whose conversion to uint64_t is undefined.
    if (n == 0) return 0;
    return n *
           ((u > n ? (std::ceil(std::log2(static_cast<double>(u) / n))) : 0) +
            2);
}
other.sparse_blocks_cardinalities[i]; 148 | } 149 | } 150 | 151 | void print() { 152 | std::cout << "processed " << sequences << " sequences, " << integers 153 | << " integers" << std::endl; 154 | 155 | std::cout << "chunks: " << chunks << std::endl; 156 | std::cout << "full chunks: " << full_chunks << " (" 157 | << integers_in_full_chunks * 100.0 / integers << "% of ints)" 158 | << std::endl; 159 | std::cout << "empty chunks: " << empty_chunks << " (" 160 | << empty_chunks * 100.0 / chunks << "% of chunks)" 161 | << std::endl; 162 | std::cout << "dense chunks: " << dense_chunks << " (" 163 | << integers_in_dense_chunks * 100.0 / integers << "% of ints)" 164 | << std::endl; 165 | std::cout << "sparse chunks: " << sparse_chunks << " (" 166 | << integers_in_sparse_chunks * 100.0 / integers 167 | << "% of ints)" << std::endl; 168 | 169 | std::cout << "blocks: " << blocks << std::endl; 170 | std::cout << "empty blocks: " << empty_blocks << " (" 171 | << empty_blocks * 100.0 / blocks << "% of blocks)" 172 | << std::endl; 173 | std::cout << "dense blocks: " << dense_blocks << " (" 174 | << integers_in_dense_blocks * 100.0 / integers << "% of ints)" 175 | << std::endl; 176 | std::cout << "sparse blocks: " << sparse_blocks << " (" 177 | << integers_in_sparse_blocks * 100.0 / integers 178 | << "% of ints)" << std::endl; 179 | 180 | std::cout << double(chunks_header_bits) / integers 181 | << " [bpi] for chunks' headers" << std::endl; 182 | std::cout << double(blocks_header_bits) / integers 183 | << " [bpi] for blocks' headers" << std::endl; 184 | std::cout << double(dense_chunks_bits) / integers 185 | << " [bpi] for dense chunks" << std::endl; 186 | std::cout << double(dense_blocks_bits) / integers 187 | << " [bpi] for dense blocks" << std::endl; 188 | std::cout << double(sparse_blocks_bits) / integers 189 | << " [bpi] for sparse blocks" << std::endl; 190 | 191 | std::cout << "total bytes: " << bits / 8 << std::endl; 192 | std::cout << "total bpi: " << double(bits) / 
integers << std::endl; 193 | 194 | std::cout << "== sparse blocks cardinalities (%) ==" << std::endl; 195 | double expected_value = 0.0; 196 | for (int i = 1; i != 30 + 1; ++i) { 197 | double p_i = static_cast(sparse_blocks_cardinalities[i]) / 198 | sparse_blocks; 199 | std::cout << "sparse blocks with card. " << i << ": " << p_i * 100.0 200 | << std::endl; 201 | expected_value += i * p_i; 202 | } 203 | std::cout << "expected_value " << expected_value << std::endl; 204 | 205 | std::cout << "== distribution of blocks in sparse chunks (" 206 | << integers_in_sparse_chunks * 100.0 / integers 207 | << "% of ints) ==" << std::endl; 208 | uint64_t covered_integers_in_sparse_chunks = 0; 209 | expected_value = 0.0; 210 | for (uint64_t i = 1; 211 | i != constants::chunk_size / constants::block_size + 1; ++i) { 212 | uint64_t avg_num_integers_per_chunk = 213 | static_cast(num_integers[i]) / num_blocks_in_chunks[i]; 214 | uint64_t elias_fano_bits = elias_fano_bitsize( 215 | avg_num_integers_per_chunk, constants::chunk_size); 216 | 217 | uint64_t total_num_blocks = i * num_blocks_in_chunks[i]; 218 | double avg_num_integers_per_block = 219 | static_cast(num_integers[i]) / total_num_blocks; 220 | double p_i = 221 | static_cast(num_blocks_in_chunks[i]) / sparse_chunks; 222 | std::cout << "sparse chunks with " << i 223 | << " blocks: " << p_i * 100.0 224 | << "%; avg_num_integers_per_block = " 225 | << avg_num_integers_per_block 226 | << "; avg_num_integers_per_chunk = " 227 | << avg_num_integers_per_chunk << std::endl; 228 | expected_value += i * p_i; 229 | covered_integers_in_sparse_chunks += num_integers[i]; 230 | 231 | std::cout << "Elias-Fano avg. bpi " 232 | << static_cast(elias_fano_bits) / 233 | avg_num_integers_per_chunk 234 | << " vs. 
8" << std::endl; 235 | 236 | std::cout << " -- total integers covered " 237 | << (covered_integers_in_sparse_chunks * 100.0) / 238 | integers_in_sparse_chunks 239 | << "%" << std::endl; 240 | } 241 | std::cout << "expected_value " << expected_value << std::endl; 242 | } 243 | }; 244 | 245 | } // namespace sliced -------------------------------------------------------------------------------- /script/build.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | 3 | input_filename = sys.argv[1] 4 | output_filename = sys.argv[2] 5 | 6 | os.system("./build " + input_filename + " 0.01 -o " + output_filename + ".0.01.bin") 7 | os.system("./build " + input_filename + " 0.001 -o " + output_filename + ".0.001.bin") 8 | os.system("./build " + input_filename + " 0.0001 -o " + output_filename + ".0.0001.bin") 9 | -------------------------------------------------------------------------------- /script/queries.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | 3 | index_filename = sys.argv[1] 4 | query_logs_path = sys.argv[2] 5 | runs = 3 6 | 7 | for i in xrange(0, runs): 8 | os.system("./decode " + index_filename) 9 | 10 | for i in xrange(0, runs): 11 | os.system("./intersect " + index_filename + " 1000 < " + query_logs_path + "/pairwise_queries.1k") 12 | 13 | for i in xrange(0, runs): 14 | os.system("./union " + index_filename + " 1000 < " + query_logs_path + "/pairwise_queries.1k") 15 | 16 | for i in xrange(0, runs): 17 | os.system("./select " + index_filename + " 1000 < " + query_logs_path + "/select_queries.1k") 18 | 19 | for i in xrange(0, runs): 20 | os.system("./next_geq " + index_filename + " 1000 < " + query_logs_path + "/next_geq_queries.1k") -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(build build.cpp) 2 | 
add_executable(decode decode.cpp) 3 | add_executable(uncompress uncompress.cpp) 4 | add_executable(intersect intersect.cpp) 5 | add_executable(union union.cpp) 6 | add_executable(cardinality cardinality.cpp) 7 | add_executable(select select.cpp) 8 | add_executable(contains contains.cpp) 9 | add_executable(next_geq next_geq.cpp) 10 | 11 | add_executable(example example.cpp) -------------------------------------------------------------------------------- /src/build.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | #include "builder.hpp" 5 | #include "s_index.hpp" 6 | 7 | using namespace sliced; 8 | 9 | void build(parameters const& params, char const* output_filename) { 10 | typedef s_index::builder builder_type; 11 | builder_type builder(params); 12 | auto stats = builder.build(); 13 | stats.print(); 14 | if (output_filename) { 15 | essentials::print_size(builder); 16 | std::cout << "saving data structure to disk..." 
<< std::endl; 17 | essentials::save(builder, output_filename); 18 | } 19 | } 20 | 21 | int main(int argc, char** argv) { 22 | int mandatory = 2; 23 | if (argc < mandatory) { 24 | std::cout << argv[0] 25 | << " collection_filename [--density d] [--size s] [--out " 26 | "output_filename]" 27 | << std::endl; 28 | return 1; 29 | } 30 | 31 | parameters params; 32 | params.collection_filename = argv[1]; 33 | char const* output_filename = nullptr; 34 | 35 | for (int i = mandatory; i != argc; ++i) { 36 | if (std::string(argv[i]) == "--density") { 37 | ++i; 38 | params.density = std::stod(argv[i]); 39 | } else if (std::string(argv[i]) == "--size") { 40 | ++i; 41 | params.size = std::atoi(argv[i]); 42 | } else if (std::string(argv[i]) == "--out") { 43 | ++i; 44 | output_filename = argv[i]; 45 | } 46 | } 47 | 48 | build(params, output_filename); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /src/cardinality.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | 8 | using namespace sliced; 9 | 10 | void perf_cardinality(char const* binary_filename) { 11 | s_index index; 12 | index.mmap(binary_filename); 13 | 14 | size_t total = 0; 15 | essentials::timer_type t; 16 | static const int runs = 10 + 1; 17 | for (int run = 0; run != runs; ++run) { 18 | t.start(); 19 | for (size_t i = 0; i != index.size(); ++i) { 20 | auto sequence = index[i]; 21 | total += sequence.cardinality(); 22 | } 23 | t.stop(); 24 | } 25 | std::cout << total / runs << std::endl; 26 | t.discard_first(); 27 | double avg = t.average(); 28 | std::cout << "Mean per run: " << avg << " [musec]\n"; 29 | std::cout << "Mean per query: " << avg / index.size() * 1000 << " [ns]"; 30 | std::cout << std::endl; 31 | } 32 | 33 | int main(int argc, char** argv) { 34 | int mandatory = 2; 35 | if 
(argc < mandatory) { 36 | std::cout << argv[0] << " " << std::endl; 37 | return 1; 38 | } 39 | 40 | perf_cardinality(argv[1]); 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /src/contains.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | #include "contains.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void perf_contains(char const* binary_filename, 12 | std::vector const& queries) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | size_t total = 0; 17 | std::cout << "performing " << queries.size() << " contains queries..." 18 | << std::endl; 19 | essentials::timer_type t; 20 | static const int runs = 3 + 1; 21 | for (int run = 0; run != runs; ++run) { 22 | t.start(); 23 | for (auto const& q : queries) total += index[q.i].contains(q.j); 24 | t.stop(); 25 | } 26 | std::cout << total << std::endl; 27 | t.discard_first(); 28 | double avg = t.average(); 29 | std::cout << "Mean per run: " << avg << " [musec]\n"; 30 | std::cout << "Mean per query: " << avg / queries.size() << " [musec]"; 31 | std::cout << std::endl; 32 | } 33 | 34 | int main(int argc, char** argv) { 35 | int mandatory = 3; 36 | if (argc < mandatory) { 37 | std::cout << argv[0] << " index_filename num_queries < queries" 38 | << std::endl; 39 | return 1; 40 | } 41 | 42 | char const* binary_filename = argv[1]; 43 | uint64_t num_queries = std::stoull(argv[2]); 44 | std::vector queries; 45 | queries.reserve(num_queries); 46 | 47 | std::cout << "reading queries..." 
<< std::endl; 48 | for (uint32_t i = 0; i != num_queries; ++i) { 49 | query q; 50 | int x = scanf("%d", &q.i); 51 | int y = scanf("%d", &q.j); 52 | if (x == EOF or y == EOF) break; 53 | queries.push_back(q); 54 | } 55 | std::cout << "DONE" << std::endl; 56 | 57 | perf_contains(binary_filename, queries); 58 | 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /src/decode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | #include "decode.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void decode(char const* binary_filename) { 12 | s_index index; 13 | index.mmap(binary_filename); 14 | 15 | std::vector out(index.universe()); 16 | uint64_t integers = 0; 17 | essentials::timer_type t; 18 | t.start(); 19 | for (size_t i = 0; i != index.size(); ++i) { 20 | auto sequence = index[i]; 21 | size_t decoded = sequence.decode(out.data()); 22 | integers += decoded; 23 | } 24 | t.stop(); 25 | 26 | std::cout << "decoded " << index.size() << " sequences" << std::endl; 27 | std::cout << "decoded " << integers << " integers" << std::endl; 28 | 29 | double elapsed = t.average(); 30 | std::cout << "Elapsed time: " << elapsed / 1000000 << " [sec]\n"; 31 | std::cout << "Mean per sequence: " << elapsed / index.size() 32 | << " [musec]\n"; 33 | std::cout << "Mean per integer: " << elapsed / integers * 1000 << " [ns]"; 34 | std::cout << std::endl; 35 | } 36 | 37 | int main(int argc, char** argv) { 38 | int mandatory = 2; 39 | if (argc < mandatory) { 40 | std::cout << argv[0] << " " << std::endl; 41 | return 1; 42 | } 43 | 44 | decode(argv[1]); 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /src/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 
#include "../external/essentials/include/essentials.hpp" 4 | #include "builder.hpp" 5 | #include "s_sequence.hpp" 6 | #include "select.hpp" 7 | #include "decode.hpp" 8 | 9 | using namespace sliced; 10 | 11 | int main(int argc, char** argv) { 12 | int mandatory = 1; 13 | char const* output_filename = nullptr; 14 | 15 | for (int i = mandatory; i != argc; ++i) { 16 | if (std::string(argv[i]) == "-o") { 17 | ++i; 18 | output_filename = argv[i]; 19 | } else if (std::string(argv[i]) == "-h") { 20 | std::cout << argv[0] << " -o output_filename < input" << std::endl; 21 | return 1; 22 | } else { 23 | std::cout << "unknown option '" << argv[i] << "'" << std::endl; 24 | return 1; 25 | } 26 | } 27 | 28 | std::vector input; 29 | 30 | { // read input from std::in 31 | uint32_t n, x; 32 | std::cin >> n; 33 | input.reserve(n); 34 | for (uint32_t i = 0; i != n; ++i) { 35 | std::cin >> x; 36 | input.push_back(x); 37 | } 38 | } 39 | 40 | // build the sequence and print statistics 41 | s_sequence::builder builder; 42 | auto stats = builder.build(input.data(), input.size()); 43 | stats.print(); 44 | 45 | mm::file_source mm_file; 46 | uint8_t const* data = nullptr; 47 | 48 | if (output_filename) { // if an output file is specified, then serialize 49 | essentials::print_size(builder); 50 | essentials::save(builder, output_filename); 51 | 52 | // mmap 53 | int advice = mm::advice::normal; // can be also random and sequential 54 | mm_file.open(output_filename, advice); 55 | 56 | // skip first 8 bytes storing the number of written bytes 57 | data = mm_file.data() + 8; 58 | 59 | } else { // otherwise work directly in memory 60 | data = builder.data(); 61 | } 62 | 63 | // initialize a s_sequence from data, regardless the source 64 | s_sequence ss(data); 65 | 66 | uint32_t size = ss.size(); 67 | 68 | // decode whole list to an output buffer 69 | std::vector out(size); 70 | ss.decode(out.data()); 71 | // check written values 72 | uint32_t value = 0; 73 | for (uint32_t i = 0; i != size; ++i) { 
74 | if (input[i] != out[i]) { 75 | std::cout << "got " << out[i] << " but expected " << input[i] 76 | << std::endl; 77 | return 1; 78 | } 79 | 80 | ss.select(i, value); // select i-th element 81 | if (value != out[i]) { 82 | std::cout << "got " << value << " but expected " << out[i] 83 | << std::endl; 84 | return 1; 85 | } 86 | } 87 | 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /src/intersect.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | #include "intersection.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void perf_intersection(char const* binary_filename, 12 | std::vector const& queries) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | std::vector out(index.universe()); 17 | size_t total = 0; 18 | std::cout << "performing " << queries.size() << " pairwise-intersections..." 
19 | << std::endl; 20 | essentials::timer_type t; 21 | static const int runs = 10 + 1; 22 | for (int run = 0; run != runs; ++run) { 23 | t.start(); 24 | for (auto const& q : queries) { 25 | total += pairwise_intersection(index[q.i], index[q.j], out.data()); 26 | } 27 | t.stop(); 28 | } 29 | std::cout << total << std::endl; 30 | t.discard_first(); 31 | double avg = t.average(); 32 | std::cout << "Mean per run: " << avg << " [musec]\n"; 33 | std::cout << "Mean per query: " << avg / queries.size() << " [musec]"; 34 | std::cout << std::endl; 35 | } 36 | 37 | int main(int argc, char** argv) { 38 | int mandatory = 3; 39 | if (argc < mandatory) { 40 | std::cout << argv[0] << " index_filename num_queries < queries" 41 | << std::endl; 42 | return 1; 43 | } 44 | 45 | char const* binary_filename = argv[1]; 46 | uint64_t num_queries = std::stoull(argv[2]); 47 | std::vector queries; 48 | queries.reserve(num_queries); 49 | 50 | std::cout << "reading queries..." << std::endl; 51 | for (uint32_t i = 0; i != num_queries; ++i) { 52 | query q; 53 | int x = scanf("%d", &q.i); 54 | int y = scanf("%d", &q.j); 55 | if (x == EOF or y == EOF) break; 56 | queries.push_back(q); 57 | } 58 | std::cout << "DONE" << std::endl; 59 | 60 | perf_intersection(binary_filename, queries); 61 | 62 | return 0; 63 | } 64 | -------------------------------------------------------------------------------- /src/next_geq.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | #include "next_geq.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void perf_next_geq(char const* binary_filename, 12 | uint64_t num_queries_per_sequence) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | uint64_t total_queries = index.size() * num_queries_per_sequence; 17 | std::vector queries; 18 | queries.reserve(total_queries); 19 | 20 | std::cout << 
"reading queries..." << std::endl; 21 | for (uint32_t i = 0; i != total_queries; ++i) { 22 | uint32_t q; 23 | int x = scanf("%d", &q); 24 | if (x == EOF) break; 25 | queries.push_back(q); 26 | } 27 | std::cout << "DONE" << std::endl; 28 | 29 | size_t total = 0; 30 | std::cout << "performing " << queries.size() << " next_geq queries..." 31 | << std::endl; 32 | essentials::timer_type t; 33 | static const int runs = 3 + 1; 34 | for (int run = 0; run != runs; ++run) { 35 | uint64_t q = 0; 36 | t.start(); 37 | for (uint32_t i = 0; i != index.size(); ++i) { 38 | auto sequence = index[i]; 39 | for (uint32_t j = 0; j != num_queries_per_sequence; ++j) { 40 | total += sequence.next_geq(queries[q++]); 41 | } 42 | } 43 | t.stop(); 44 | } 45 | std::cout << total << std::endl; 46 | t.discard_first(); 47 | double avg = t.average(); 48 | std::cout << "Mean per run: " << avg << " [musec]\n"; 49 | std::cout << "Mean per query: " << avg / total_queries << " [musec]"; 50 | std::cout << std::endl; 51 | } 52 | 53 | int main(int argc, char** argv) { 54 | int mandatory = 3; 55 | if (argc < mandatory) { 56 | std::cout << argv[0] 57 | << " index_filename num_queries_per_sequence < queries" 58 | << std::endl; 59 | return 1; 60 | } 61 | 62 | char const* binary_filename = argv[1]; 63 | uint64_t num_queries_per_sequence = std::stoull(argv[2]); 64 | perf_next_geq(binary_filename, num_queries_per_sequence); 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /src/select.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | #include "select.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void perf_select(char const* binary_filename, 12 | uint64_t num_queries_per_sequence) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | uint64_t total_queries = 
index.size() * num_queries_per_sequence; 17 | std::vector queries; 18 | queries.reserve(total_queries); 19 | 20 | std::cout << "reading queries..." << std::endl; 21 | for (uint32_t i = 0; i != total_queries; ++i) { 22 | uint32_t q; 23 | int x = scanf("%d", &q); 24 | if (x == EOF) break; 25 | queries.push_back(q); 26 | } 27 | std::cout << "DONE" << std::endl; 28 | 29 | size_t total = 0; 30 | std::cout << "performing " << queries.size() << " select queries..." 31 | << std::endl; 32 | essentials::timer_type t; 33 | static const int runs = 3 + 1; 34 | for (int run = 0; run != runs; ++run) { 35 | uint64_t q = 0; 36 | uint32_t value = 0; 37 | t.start(); 38 | for (uint32_t i = 0; i != index.size(); ++i) { 39 | auto sequence = index[i]; 40 | for (uint32_t j = 0; j != num_queries_per_sequence; ++j) { 41 | sequence.select(queries[q++], value); 42 | total += value; 43 | } 44 | } 45 | t.stop(); 46 | } 47 | std::cout << total << std::endl; 48 | t.discard_first(); 49 | double avg = t.average(); 50 | std::cout << "Mean per run: " << avg << " [musec]\n"; 51 | std::cout << "Mean per query: " << avg / total_queries << " [musec]"; 52 | std::cout << std::endl; 53 | } 54 | 55 | int main(int argc, char** argv) { 56 | int mandatory = 3; 57 | if (argc < mandatory) { 58 | std::cout << argv[0] 59 | << " index_filename num_queries_per_sequence < queries" 60 | << std::endl; 61 | return 1; 62 | } 63 | 64 | char const* binary_filename = argv[1]; 65 | uint64_t num_queries_per_sequence = std::stoull(argv[2]); 66 | perf_select(binary_filename, num_queries_per_sequence); 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /src/uncompress.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | #include "uncompress.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void 
uncompress(char const* binary_filename) { 12 | s_index index; 13 | index.mmap(binary_filename); 14 | 15 | uint64_t universe = index.universe(); 16 | std::cout << "universe size: " << universe << std::endl; 17 | size_t size_in_64bit_words = 18 | num_chunks(universe) * constants::chunk_size / 64; 19 | std::vector out(size_in_64bit_words, 0); 20 | uint64_t integers = 0; 21 | essentials::timer_type t; 22 | t.start(); 23 | for (size_t i = 0; i != index.size(); ++i) { 24 | auto sequence = index[i]; 25 | size_t decoded = sequence.uncompress(out.data()); 26 | integers += decoded; 27 | } 28 | t.stop(); 29 | 30 | std::cout << "decoded " << index.size() << " sequences" << std::endl; 31 | std::cout << "decoded " << integers << " integers" << std::endl; 32 | 33 | double elapsed = t.average(); 34 | std::cout << "Elapsed time: " << elapsed / 1000000 << " [sec]\n"; 35 | std::cout << "Mean per sequence: " << elapsed / index.size() 36 | << " [musec]\n"; 37 | std::cout << "Mean per integer: " << elapsed / integers * 1000 << " [ns]"; 38 | std::cout << std::endl; 39 | } 40 | 41 | int main(int argc, char** argv) { 42 | int mandatory = 2; 43 | if (argc < mandatory) { 44 | std::cout << argv[0] << " " << std::endl; 45 | return 1; 46 | } 47 | 48 | uncompress(argv[1]); 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /src/union.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | #include "util.hpp" 6 | #include "s_index.hpp" 7 | #include "union.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void perf_union(char const* binary_filename, 12 | std::vector const& queries) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | std::vector out(index.universe()); 17 | size_t total = 0; 18 | std::cout << "performing " << queries.size() << " pairwise-unions..." 
19 | << std::endl; 20 | essentials::timer_type t; 21 | static const int runs = 10 + 1; 22 | for (int run = 0; run != runs; ++run) { 23 | t.start(); 24 | for (auto const& q : queries) { 25 | total += pairwise_union(index[q.i], index[q.j], out.data()); 26 | } 27 | t.stop(); 28 | } 29 | std::cout << total << std::endl; 30 | t.discard_first(); 31 | double avg = t.average(); 32 | std::cout << "Mean per run: " << avg << " [musec]\n"; 33 | std::cout << "Mean per query: " << avg / queries.size() << " [musec]"; 34 | std::cout << std::endl; 35 | } 36 | 37 | int main(int argc, char** argv) { 38 | int mandatory = 3; 39 | if (argc < mandatory) { 40 | std::cout << argv[0] << " index_filename num_queries < queries" 41 | << std::endl; 42 | return 1; 43 | } 44 | 45 | char const* binary_filename = argv[1]; 46 | uint64_t num_queries = std::stoull(argv[2]); 47 | std::vector queries; 48 | queries.reserve(num_queries); 49 | 50 | std::cout << "reading queries..." << std::endl; 51 | for (uint32_t i = 0; i != num_queries; ++i) { 52 | query q; 53 | int x = scanf("%d", &q.i); 54 | int y = scanf("%d", &q.j); 55 | if (x == EOF or y == EOF) break; 56 | queries.push_back(q); 57 | } 58 | std::cout << "DONE" << std::endl; 59 | 60 | perf_union(binary_filename, queries); 61 | 62 | return 0; 63 | } 64 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(test_decode test_decode.cpp) 2 | add_executable(test_uncompress test_uncompress.cpp) 3 | add_executable(test_intersect test_intersect.cpp) 4 | add_executable(test_intersect_many test_intersect_many.cpp) 5 | add_executable(test_union test_union.cpp) 6 | add_executable(test_union_many test_union_many.cpp) 7 | add_executable(test_select test_select.cpp) 8 | add_executable(test_contains test_contains.cpp) 9 | add_executable(test_next_geq test_next_geq.cpp) 10 | add_executable(test_next_geq_enumerator 
namespace testing {
typedef std::vector<uint32_t> query_type;

/* Reads one line from `is` and parses it as a whitespace-separated list of
   unsigned integers, stored into `query` sorted and without duplicates.
   Parsing stops at the first token that is not an unsigned integer.
   Returns false, leaving `query` empty, when no line could be read (e.g. at
   end of input); returns true otherwise, even if the line was empty. */
// `inline` because this definition lives in a header: without it, two
// translation units including the header would each define the function.
inline bool read_query_and_remove_duplicates(query_type& query,
                                             std::istream& is = std::cin) {
    query.clear();
    std::string line;
    if (!std::getline(is, line)) return false;
    std::istringstream iline(line);
    uint32_t index;
    while (iline >> index) query.push_back(index);
    std::sort(query.begin(), query.end());
    query.erase(std::unique(query.begin(), query.end()), query.end());
    return true;
}
}  // namespace testing
9 | 10 | using namespace sliced; 11 | 12 | void test(char const* binary_filename, parameters const& params) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | mm::file_source input(params.collection_filename, 17 | mm::advice::sequential); 18 | uint32_t const* data = input.data(); 19 | assert(data[0] == 1); 20 | std::cout << "universe size: " << index.universe() << std::endl; 21 | size_t k = 0; 22 | bool good = true; 23 | 24 | for (size_t i = 2; i < input.size();) { 25 | uint32_t n = data[i]; 26 | uint32_t universe = data[i + n]; 27 | if (pass(params, n, universe)) { 28 | auto sequence = index[k]; 29 | uint32_t c = sequence.cardinality(); 30 | 31 | if (c != n) { 32 | good = false; 33 | std::cout << "cardinality " << c << ": expected " << n 34 | << std::endl; 35 | } 36 | 37 | uint32_t const* ptr = data + i + 1; 38 | for (size_t j = 0; j != n; ++j) { 39 | uint32_t value = *ptr++; 40 | bool in = sequence.contains(value); 41 | if (!in) { 42 | good = false; 43 | std::cout << value << " should have been found" 44 | << std::endl; 45 | } 46 | } 47 | 48 | ++k; 49 | if (k % 1000 == 0) { 50 | std::cout << "checked " << k << " sequences" << std::endl; 51 | } 52 | } 53 | i += n + 1; 54 | } 55 | std::cout << "checked " << k << " sequences" << std::endl; 56 | if (good) std::cout << "everything good" << std::endl; 57 | } 58 | 59 | int main(int argc, char** argv) { 60 | TEST return 0; 61 | } 62 | -------------------------------------------------------------------------------- /test/test_decode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 4 | 5 | #include "test_common.hpp" 6 | #include "util.hpp" 7 | #include "s_index.hpp" 8 | #include "decode.hpp" 9 | 10 | using namespace sliced; 11 | 12 | void test(char const* binary_filename, parameters const& params) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | mm::file_source 
input(params.collection_filename, 17 | mm::advice::sequential); 18 | uint32_t const* data = input.data(); 19 | assert(data[0] == 1); 20 | std::cout << "universe size: " << index.universe() << std::endl; 21 | std::vector out(index.universe()); 22 | size_t k = 0; 23 | bool good = true; 24 | 25 | for (size_t i = 2; i < input.size();) { 26 | uint32_t n = data[i]; 27 | uint32_t universe = data[i + n]; 28 | if (pass(params, n, universe)) { 29 | auto sequence = index[k]; 30 | size_t decoded = sequence.decode(out.data()); 31 | 32 | uint32_t c = sequence.cardinality(); 33 | if (c != n) { 34 | good = false; 35 | std::cout << "cardinality " << c << ": expected " << n 36 | << std::endl; 37 | } 38 | 39 | if (decoded != n) { 40 | good = false; 41 | std::cout << "decoded " << decoded << " integers: expected " 42 | << n << std::endl; 43 | } 44 | 45 | uint32_t const* ptr = data + i + 1; 46 | for (size_t j = 0; j != n; ++j) { 47 | uint32_t expected = *ptr++; 48 | if (expected != out[j]) { 49 | good = false; 50 | std::cout << "error at " << j << "/" << n << ": expected " 51 | << expected << " but got " << out[j] << std::endl; 52 | } 53 | } 54 | 55 | ++k; 56 | if (k % 1000 == 0) { 57 | std::cout << "decoded " << k << " sequences" << std::endl; 58 | } 59 | } 60 | i += n + 1; 61 | } 62 | std::cout << "decoded " << k << " sequences" << std::endl; 63 | if (good) std::cout << "everything good" << std::endl; 64 | } 65 | 66 | int main(int argc, char** argv) { 67 | TEST return 0; 68 | } 69 | -------------------------------------------------------------------------------- /test/test_enumerator.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 4 | 5 | #include "test_common.hpp" 6 | #include "util.hpp" 7 | #include "s_index.hpp" 8 | #include "enumerator.hpp" 9 | 10 | using namespace sliced; 11 | 12 | void test(char const* binary_filename, parameters const& params) { 13 | s_index 
index; 14 | index.mmap(binary_filename); 15 | 16 | mm::file_source input(params.collection_filename, 17 | mm::advice::sequential); 18 | uint32_t const* data = input.data(); 19 | assert(data[0] == 1); 20 | std::cout << "universe size: " << index.universe() << std::endl; 21 | size_t k = 0; 22 | bool good = true; 23 | 24 | enumerator e; 25 | 26 | for (size_t i = 2; i < input.size();) { 27 | uint32_t n = data[i]; 28 | uint32_t universe = data[i + n]; 29 | if (pass(params, n, universe)) { 30 | auto sequence = index[k]; 31 | e.init(sequence, index.universe()); 32 | 33 | uint32_t const* ptr = data + i + 1; 34 | for (size_t j = 0; j != n; ++j, e.next()) { 35 | uint32_t expected = *ptr++; 36 | uint32_t got = e.value(); 37 | if (expected != got) { 38 | good = false; 39 | std::cout << "error at " << j << "/" << n << ": expected " 40 | << expected << " but got " << got << std::endl; 41 | } 42 | } 43 | 44 | ++k; 45 | if (k % 1000 == 0) { 46 | std::cout << "decoded " << k << " sequences" << std::endl; 47 | } 48 | } 49 | i += n + 1; 50 | } 51 | std::cout << "decoded " << k << " sequences" << std::endl; 52 | if (good) std::cout << "everything good" << std::endl; 53 | } 54 | 55 | int main(int argc, char** argv) { 56 | TEST return 0; 57 | } 58 | -------------------------------------------------------------------------------- /test/test_intersect.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.hpp" 5 | #include "s_index.hpp" 6 | #include "intersection.hpp" 7 | 8 | #include "intersection_many.hpp" 9 | 10 | using namespace sliced; 11 | 12 | void test(char const* binary_filename, std::vector const& queries) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | uint64_t universe = index.universe(); 17 | std::vector out(universe); 18 | std::vector i(universe); 19 | std::vector j(universe); 20 | std::vector expected(universe); 21 | bool good = true; 22 | 23 | for (auto const& q : queries) { 
24 | size_t i_size = index[q.i].decode(i.data()); 25 | size_t j_size = index[q.j].decode(j.data()); 26 | auto it = 27 | std::set_intersection(i.begin(), i.begin() + i_size, j.begin(), 28 | j.begin() + j_size, expected.begin()); 29 | size_t expected_size = it - expected.begin(); 30 | 31 | std::vector sequences(2); 32 | sequences[0] = index[q.i]; 33 | sequences[1] = index[q.j]; 34 | size_t size = intersection(sequences, out.data()); 35 | // size_t size = pairwise_intersection(index[q.i], index[q.j], 36 | // out.data()); 37 | 38 | if (expected_size != size) { 39 | good = false; 40 | std::cout << "intersection has size " << size << " but expected " 41 | << expected_size << std::endl; 42 | } 43 | 44 | for (size_t i = 0; i != size; ++i) { 45 | if (expected[i] != out[i]) { 46 | good = false; 47 | std::cout << "error at " << i << "/" << size << ": expected " 48 | << expected[i] << " but got " << out[i] << std::endl; 49 | } 50 | } 51 | } 52 | std::cout << "tested " << queries.size() << " queries" << std::endl; 53 | if (good) std::cout << "everything good" << std::endl; 54 | } 55 | 56 | int main(int argc, char** argv) { 57 | int mandatory = 3; 58 | if (argc < mandatory) { 59 | std::cout << argv[0] << " index_filename num_queries < queries" 60 | << std::endl; 61 | return 1; 62 | } 63 | 64 | char const* binary_filename = argv[1]; 65 | uint64_t num_queries = std::stoull(argv[2]); 66 | std::vector queries; 67 | queries.reserve(num_queries); 68 | 69 | std::cout << "reading queries..." 
/* Reference intersection of two sorted lists; serves as the oracle for the
   index's many-way intersection.

   l, r: lists sorted in non-decreasing order.
   Returns the elements common to both lists, in sorted order.

   Delegates to std::set_intersection, the same reference the pairwise
   intersection test uses, instead of a hand-written merge loop. */
std::vector<uint32_t> pairwise_intersection(std::vector<uint32_t> const& l,
                                            std::vector<uint32_t> const& r) {
    /* the result can never be longer than the shorter input, so one
       allocation of that size is always enough */
    std::vector<uint32_t> out(std::min(l.size(), r.size()));
    auto end = std::set_intersection(l.begin(), l.end(), r.begin(), r.end(),
                                     out.begin());
    out.erase(end, out.end());
    return out;
}
binary_filename, std::vector const& queries) { 58 | s_index index; 59 | index.mmap(binary_filename); 60 | 61 | uint64_t universe = index.universe(); 62 | std::vector out(universe); 63 | bool good = true; 64 | 65 | std::vector sequences; 66 | uint64_t num_queries = 0; 67 | for (auto const& q : queries) { 68 | if (q.size() < 2) continue; 69 | sequences.clear(); 70 | auto expected = intersection(index, q); 71 | for (uint32_t i = 0; i != q.size(); ++i) { 72 | sequences.push_back(index[q[i]]); 73 | } 74 | size_t size = intersection(sequences, out.data()); 75 | std::cout << "intersection has size " << size << std::endl; 76 | if (expected.size() != size) { 77 | good = false; 78 | std::cout << "intersection has size " << size << " but expected " 79 | << expected.size() << std::endl; 80 | } 81 | 82 | for (size_t i = 0; i != size; ++i) { 83 | if (expected[i] != out[i]) { 84 | good = false; 85 | std::cout << "error at " << i << "/" << size << ": expected " 86 | << expected[i] << " but got " << out[i] << std::endl; 87 | } 88 | } 89 | ++num_queries; 90 | } 91 | std::cout << "tested " << num_queries << " queries" << std::endl; 92 | if (good) std::cout << "everything good" << std::endl; 93 | } 94 | 95 | int main(int argc, char** argv) { 96 | int mandatory = 3; 97 | if (argc < mandatory) { 98 | std::cout << argv[0] << " index_filename num_queries < queries" 99 | << std::endl; 100 | return 1; 101 | } 102 | 103 | char const* binary_filename = argv[1]; 104 | uint64_t num_queries = std::stoull(argv[2]); 105 | 106 | std::cout << "reading queries..." << std::endl; 107 | std::vector queries; 108 | queries.reserve(num_queries); 109 | query_type q; 110 | uint64_t i = 0; 111 | while (i != num_queries and testing::read_query_and_remove_duplicates(q)) { 112 | assert(!q.empty()); 113 | queries.push_back(q); 114 | ++i; 115 | } 116 | 117 | std::cout << "running test..." 
<< std::endl; 118 | test(binary_filename, queries); 119 | 120 | return 0; 121 | } 122 | -------------------------------------------------------------------------------- /test/test_next_geq.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 4 | 5 | #include "test_common.hpp" 6 | #include "util.hpp" 7 | #include "s_index.hpp" 8 | #include "next_geq.hpp" 9 | 10 | using namespace sliced; 11 | 12 | void test(char const* binary_filename, parameters const& params) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | mm::file_source input(params.collection_filename, 17 | mm::advice::sequential); 18 | uint32_t const* data = input.data(); 19 | assert(data[0] == 1); 20 | std::cout << "universe size: " << index.universe() << std::endl; 21 | size_t k = 0; 22 | bool good = true; 23 | 24 | for (size_t i = 2; i < input.size();) { 25 | uint32_t n = data[i]; 26 | uint32_t universe = data[i + n]; 27 | if (pass(params, n, universe)) { 28 | auto sequence = index[k]; 29 | uint32_t c = sequence.cardinality(); 30 | 31 | if (c != n) { 32 | good = false; 33 | std::cout << "cardinality " << c << ": expected " << n 34 | << std::endl; 35 | } 36 | 37 | uint32_t const* list = data + i + 1; 38 | 39 | /* run next_geq for all values in [0, universe] */ 40 | for (size_t lower_bound = 0; lower_bound != universe + 1; 41 | ++lower_bound) { 42 | auto it = std::lower_bound(list, list + n, lower_bound); 43 | uint32_t next_geq = sequence.next_geq(lower_bound); 44 | assert(next_geq >= lower_bound); 45 | if (next_geq != *it) { 46 | good = false; 47 | std::cout << "error at " << lower_bound << "/" << universe 48 | << ": got " << next_geq 49 | << " but expected next_geq(" << lower_bound 50 | << ") = " << *it << std::endl; 51 | } 52 | } 53 | 54 | /* test some out-of-bound values */ 55 | for (size_t lower_bound = universe + 1; 56 | lower_bound != universe + 1000000 + 1; lower_bound += 
10000) { 57 | uint32_t next_geq = sequence.next_geq(lower_bound); 58 | if (next_geq != constants::not_found) { 59 | good = false; 60 | std::cout << "error : got " << next_geq << " but expected " 61 | << constants::not_found << std::endl; 62 | } 63 | } 64 | 65 | ++k; 66 | if (k % 1000 == 0) { 67 | std::cout << "checked " << k << " sequences" << std::endl; 68 | } 69 | } 70 | i += n + 1; 71 | } 72 | std::cout << "checked " << k << " sequences" << std::endl; 73 | if (good) std::cout << "everything good" << std::endl; 74 | } 75 | 76 | int main(int argc, char** argv) { 77 | TEST return 0; 78 | } 79 | -------------------------------------------------------------------------------- /test/test_next_geq_enumerator.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 4 | 5 | #include "test_common.hpp" 6 | #include "util.hpp" 7 | #include "s_index.hpp" 8 | #include "next_geq_enumerator.hpp" 9 | 10 | using namespace sliced; 11 | 12 | void test(char const* binary_filename, parameters const& params) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | mm::file_source input(params.collection_filename, 17 | mm::advice::sequential); 18 | uint32_t const* data = input.data(); 19 | assert(data[0] == 1); 20 | std::cout << "universe size: " << index.universe() << std::endl; 21 | size_t k = 0; 22 | bool good = true; 23 | 24 | for (size_t i = 2; i < input.size();) { 25 | uint32_t n = data[i]; 26 | uint32_t universe = data[i + n]; 27 | 28 | if (pass(params, n, universe)) { 29 | auto sequence = index[k]; 30 | uint32_t c = sequence.cardinality(); 31 | 32 | if (c != n) { 33 | good = false; 34 | std::cout << "cardinality " << c << ": expected " << n 35 | << std::endl; 36 | } 37 | 38 | uint32_t const* list = data + i + 1; 39 | next_geq_enumerator e(sequence); 40 | 41 | /* run next_geq for all values in [0, universe] */ 42 | for (size_t lower_bound = 0; lower_bound != 
universe + 1; 43 | ++lower_bound) { 44 | auto it = std::lower_bound(list, list + n, lower_bound); 45 | uint32_t next_geq = e.next_geq(lower_bound); 46 | assert(next_geq >= lower_bound); 47 | if (next_geq != *it) { 48 | good = false; 49 | std::cout << "error at " << lower_bound << "/" << universe 50 | << ": got " << next_geq 51 | << " but expected next_geq(" << lower_bound 52 | << ") = " << *it << std::endl; 53 | } 54 | } 55 | 56 | /* test some out-of-bound values */ 57 | for (size_t lower_bound = universe + 1; 58 | lower_bound != universe + 1000000 + 1; lower_bound += 10000) { 59 | uint32_t next_geq = e.next_geq(lower_bound); 60 | if (next_geq != constants::not_found) { 61 | good = false; 62 | std::cout << "error : got " << next_geq << " but expected " 63 | << constants::not_found << std::endl; 64 | } 65 | } 66 | 67 | ++k; 68 | if (k % 1000 == 0) { 69 | std::cout << "checked " << k << " sequences" << std::endl; 70 | } 71 | } 72 | i += n + 1; 73 | } 74 | std::cout << "checked " << k << " sequences" << std::endl; 75 | if (good) std::cout << "everything good" << std::endl; 76 | } 77 | 78 | int main(int argc, char** argv) { 79 | TEST return 0; 80 | } 81 | -------------------------------------------------------------------------------- /test/test_select.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 4 | 5 | #include "test_common.hpp" 6 | #include "util.hpp" 7 | #include "s_index.hpp" 8 | #include "select.hpp" 9 | 10 | using namespace sliced; 11 | 12 | void test(char const* binary_filename, parameters const& params) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | mm::file_source input(params.collection_filename, 17 | mm::advice::sequential); 18 | uint32_t const* data = input.data(); 19 | assert(data[0] == 1); 20 | std::cout << "universe size: " << index.universe() << std::endl; 21 | size_t k = 0; 22 | bool good = true; 23 | 24 | for 
/* Writes the positions of all set bits of the bitmap to out, in increasing
   order, and zeroes every word of the bitmap while doing so.

   bitmap:              size_in_64bit_words words, read and then cleared
   size_in_64bit_words: number of 64-bit words in bitmap
   out:                 receives one uint32_t per set bit
   Returns the number of positions written to out. */
uint32_t decode_bitmap_and_reset(uint64_t* bitmap, size_t size_in_64bit_words,
                                 uint32_t* out) {
    uint32_t decoded = 0;
    for (size_t word = 0; word < size_in_64bit_words; ++word) {
        /* bit b of word w encodes the integer 64 * w + b */
        uint32_t const offset = static_cast<uint32_t>(word * 64);
        /* bits &= bits - 1 clears the lowest set bit */
        for (uint64_t bits = bitmap[word]; bits != 0; bits &= bits - 1) {
            out[decoded++] =
                offset + static_cast<uint32_t>(__builtin_ctzll(bits));
        }
        bitmap[word] = 0;
    }
    return decoded;
}
-------------------------------------------------------------------------------- /test/test_union.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.hpp" 5 | #include "s_index.hpp" 6 | #include "union.hpp" 7 | 8 | using namespace sliced; 9 | 10 | void test(char const* binary_filename, std::vector const& queries) { 11 | s_index index; 12 | index.mmap(binary_filename); 13 | 14 | uint64_t universe = index.universe(); 15 | std::vector out(universe); 16 | std::vector i(universe); 17 | std::vector j(universe); 18 | std::vector expected(universe); 19 | bool good = true; 20 | 21 | for (auto const& q : queries) { 22 | size_t i_size = index[q.i].decode(i.data()); 23 | size_t j_size = index[q.j].decode(j.data()); 24 | auto it = std::set_union(i.begin(), i.begin() + i_size, j.begin(), 25 | j.begin() + j_size, expected.begin()); 26 | size_t expected_size = it - expected.begin(); 27 | 28 | size_t size = pairwise_union(index[q.i], index[q.j], out.data()); 29 | 30 | if (expected_size != size) { 31 | good = false; 32 | std::cout << "union has size " << size << " but expected " 33 | << expected_size << std::endl; 34 | } 35 | 36 | for (size_t i = 0; i != size; ++i) { 37 | if (expected[i] != out[i]) { 38 | good = false; 39 | std::cout << "error at " << i << "/" << size << ": expected " 40 | << expected[i] << " but got " << out[i] << std::endl; 41 | } 42 | } 43 | } 44 | std::cout << "tested " << queries.size() << " queries" << std::endl; 45 | if (good) std::cout << "everything good" << std::endl; 46 | } 47 | 48 | int main(int argc, char** argv) { 49 | int mandatory = 3; 50 | if (argc < mandatory) { 51 | std::cout << argv[0] << " index_filename num_queries < queries" 52 | << std::endl; 53 | return 1; 54 | } 55 | 56 | char const* binary_filename = argv[1]; 57 | uint64_t num_queries = std::stoull(argv[2]); 58 | std::vector queries; 59 | queries.reserve(num_queries); 60 | 61 | std::cout << "reading 
/* Reference union of two sorted lists; serves as the oracle for the
   index's many-way union.

   l, r: lists sorted in non-decreasing order.
   Returns the sorted merge of both lists, where a value present in both
   appears once.

   Delegates to std::set_union, the same reference the pairwise union test
   uses, instead of a hand-written merge loop. */
std::vector<uint32_t> pairwise_union(std::vector<uint32_t> const& l,
                                     std::vector<uint32_t> const& r) {
    std::vector<uint32_t> out;
    /* upper bound on the result size: avoids repeated reallocation */
    out.reserve(l.size() + r.size());
    std::set_union(l.begin(), l.end(), r.begin(), r.end(),
                   std::back_inserter(out));
    return out;
}
sequences[i].end())); 60 | } 61 | 62 | std::vector result; 63 | for (uint32_t i = 0; i != sequences.size(); ++i) { 64 | result = pairwise_union(result, sequences[i]); 65 | } 66 | 67 | return result; 68 | } 69 | 70 | void test(char const* binary_filename, std::vector const& queries) { 71 | s_index index; 72 | index.mmap(binary_filename); 73 | 74 | uint64_t universe = index.universe(); 75 | std::vector out(universe); 76 | bool good = true; 77 | 78 | std::vector sequences; 79 | uint64_t num_queries = 0; 80 | for (auto const& q : queries) { 81 | if (q.size() < 2) continue; 82 | sequences.clear(); 83 | auto expected = union_many(index, q); 84 | for (uint32_t i = 0; i != q.size(); ++i) { 85 | sequences.push_back(index[q[i]]); 86 | } 87 | size_t size = union_many(sequences, out.data()); 88 | std::cout << "union has size " << size << std::endl; 89 | if (expected.size() != size) { 90 | good = false; 91 | std::cout << "union has size " << size << " but expected " 92 | << expected.size() << std::endl; 93 | } 94 | 95 | for (size_t i = 0; i != size; ++i) { 96 | if (expected[i] != out[i]) { 97 | good = false; 98 | std::cout << "error at " << i << "/" << size << ": expected " 99 | << expected[i] << " but got " << out[i] << std::endl; 100 | } 101 | } 102 | ++num_queries; 103 | } 104 | std::cout << "tested " << num_queries << " queries" << std::endl; 105 | if (good) std::cout << "everything good" << std::endl; 106 | } 107 | 108 | int main(int argc, char** argv) { 109 | int mandatory = 3; 110 | if (argc < mandatory) { 111 | std::cout << argv[0] << " index_filename num_queries < queries" 112 | << std::endl; 113 | return 1; 114 | } 115 | 116 | char const* binary_filename = argv[1]; 117 | uint64_t num_queries = std::stoull(argv[2]); 118 | 119 | std::cout << "reading queries..." 
<< std::endl; 120 | std::vector queries; 121 | queries.reserve(num_queries); 122 | query_type q; 123 | uint64_t i = 0; 124 | while (i != num_queries and testing::read_query_and_remove_duplicates(q)) { 125 | assert(!q.empty()); 126 | queries.push_back(q); 127 | ++i; 128 | } 129 | 130 | std::cout << "running test..." << std::endl; 131 | test(binary_filename, queries); 132 | 133 | return 0; 134 | } 135 | -------------------------------------------------------------------------------- /tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(gen_uniform_data gen_uniform_data.cpp) 2 | add_executable(gen_clustered_data gen_clustered_data.cpp) 3 | add_executable(gen_random_pairwise_queries gen_random_pairwise_queries.cpp) 4 | add_executable(gen_random_select_queries gen_random_select_queries.cpp) 5 | add_executable(gen_random_next_geq_queries gen_random_next_geq_queries.cpp) -------------------------------------------------------------------------------- /tools/gen_clustered_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "building_util.hpp" 7 | 8 | #include "../external/essentials/include/essentials.hpp" 9 | 10 | enum event_code { skip = 0, include_all = 1, include_some = 2 }; 11 | 12 | struct event { 13 | int code; 14 | float prob; 15 | }; 16 | 17 | int gen_event(std::vector const& events, float p) { 18 | assert(p >= 0 and p <= 1.0); 19 | float cumulative = events[0].prob; 20 | size_t i = 0; 21 | while (cumulative < p) { 22 | ++i; 23 | cumulative += events[i].prob; 24 | } 25 | assert(i < events.size()); 26 | return events[i].code; 27 | }; 28 | 29 | void gen(uint32_t num_lists, uint32_t universe, char const* output_filename, 30 | bool binary) { 31 | using namespace sliced; 32 | 33 | std::ofstream out; 34 | if (binary) { 35 | out.open(std::string(output_filename), 36 | std::ios_base::binary | 
std::ios_base::out); 37 | } else { 38 | out.open(std::string(output_filename)); 39 | } 40 | 41 | if (!out.is_open()) { 42 | std::cout << "error in opening file" << std::endl; 43 | return; 44 | } 45 | 46 | if (binary) { 47 | // header: singleton list containing the universe 48 | write_uint(uint32_t(1), out); 49 | write_uint(universe, out); 50 | } 51 | 52 | essentials::uniform_int_rng length(10, constants::chunk_size / 4); 53 | typedef essentials::uniform_int_rng random_int; 54 | random_int* element = nullptr; 55 | 56 | std::vector list; 57 | list.reserve(universe); 58 | 59 | std::vector events(3); 60 | events[0] = {event_code::skip, 0.3}; 61 | events[1] = {event_code::include_all, 0.2}; 62 | events[2] = {event_code::include_some, 0.5}; 63 | 64 | for (uint32_t i = 0; i != num_lists; ++i) { 65 | list.clear(); 66 | 67 | slice s = {0, constants::chunk_size}; 68 | while (s.left < universe) { 69 | float p = float(rand()) / RAND_MAX; 70 | int code = gen_event(events, p); 71 | uint32_t n = 0; 72 | uint32_t end = std::min(s.right, universe); 73 | switch (code) { 74 | case event_code::skip: 75 | break; 76 | case event_code::include_all: 77 | for (uint32_t k = s.left; k != end; ++k) { 78 | list.push_back(k); 79 | } 80 | break; 81 | case event_code::include_some: 82 | element = new random_int(s.left, end, 83 | essentials::get_random_seed()); 84 | n = length.gen(); 85 | for (uint32_t k = 0; k != n; ++k) { 86 | list.push_back(element->gen()); 87 | } 88 | break; 89 | default: 90 | assert(false); 91 | __builtin_unreachable(); 92 | } 93 | s.left = s.right; 94 | s.right += constants::chunk_size; 95 | } 96 | 97 | std::sort(list.begin(), list.end()); 98 | auto end = std::unique(list.begin(), list.end()); 99 | uint32_t n = std::distance(list.begin(), end); 100 | 101 | if (binary) { 102 | write_uint(n, out); 103 | char const* begin = reinterpret_cast(list.data()); 104 | out.write(begin, n * sizeof(uint32_t)); 105 | } else { 106 | out << n << "\n"; 107 | for (auto x : list) out << x << 
"\n"; 108 | } 109 | } 110 | out.close(); 111 | } 112 | 113 | int main(int argc, char** argv) { 114 | int mandatory = 4; 115 | if (argc < mandatory) { 116 | std::cout << argv[0] << " num_lists universe output_filename --binary" 117 | << std::endl; 118 | return 1; 119 | } 120 | 121 | uint32_t num_lists = std::atoi(argv[1]); 122 | uint32_t universe = std::atoi(argv[2]); 123 | char const* output_filename = argv[3]; 124 | bool binary = false; 125 | 126 | for (int i = mandatory; i != argc; ++i) { 127 | if (std::string(argv[i]) == "--binary") binary = true; 128 | } 129 | 130 | gen(num_lists, universe, output_filename, binary); 131 | 132 | return 0; 133 | } 134 | -------------------------------------------------------------------------------- /tools/gen_random_next_geq_queries.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../external/essentials/include/essentials.hpp" 5 | 6 | #include "util.hpp" 7 | #include "s_index.hpp" 8 | #include "decode.hpp" 9 | 10 | using namespace sliced; 11 | 12 | void generate(char const* binary_filename, uint32_t num_queries_per_sequence) { 13 | s_index index; 14 | index.mmap(binary_filename); 15 | 16 | std::vector queries; 17 | queries.reserve(index.size() * num_queries_per_sequence); 18 | std::vector out(index.universe()); 19 | 20 | for (size_t i = 0; i != index.size(); ++i) { 21 | auto sequence = index[i]; 22 | size_t decoded = sequence.decode(out.data()); 23 | essentials::uniform_int_rng rng(0, decoded - 1); 24 | for (uint32_t k = 0; k != num_queries_per_sequence; ++k) { 25 | queries.push_back(out[rng.gen()]); 26 | } 27 | } 28 | 29 | // std::random_shuffle(queries.begin(), queries.end()); 30 | for (auto q : queries) std::cout << q << "\n"; 31 | } 32 | 33 | int main(int argc, char** argv) { 34 | int mandatory = 3; 35 | if (argc < mandatory) { 36 | std::cout << argv[0] << " index_filename num_queries_per_sequence" 37 | << std::endl; 38 | return 1; 39 | } 40 | 
41 | generate(argv[1], std::atoi(argv[2])); 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /tools/gen_random_pairwise_queries.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../external/essentials/include/essentials.hpp" 4 | 5 | void generate(uint32_t num_queries, uint32_t num_sequences) { 6 | essentials::uniform_int_rng rng(0, num_sequences - 1); 7 | for (size_t i = 0; i != num_queries; ++i) { 8 | std::cout << rng.gen() << "\t" << rng.gen() << "\n"; 9 | } 10 | } 11 | 12 | int main(int argc, char** argv) { 13 | int mandatory = 3; 14 | if (argc < mandatory) { 15 | std::cout << argv[0] << " num_queries num_sequences" << std::endl; 16 | return 1; 17 | } 18 | 19 | generate(std::atoi(argv[1]), std::atoi(argv[2])); 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /tools/gen_random_select_queries.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../external/essentials/include/essentials.hpp" 5 | 6 | #include "util.hpp" 7 | #include "s_index.hpp" 8 | 9 | using namespace sliced; 10 | 11 | void generate(char const* binary_filename, uint32_t num_queries_per_sequence) { 12 | s_index index; 13 | index.mmap(binary_filename); 14 | 15 | std::vector queries; 16 | queries.reserve(index.size() * num_queries_per_sequence); 17 | 18 | for (size_t i = 0; i != index.size(); ++i) { 19 | auto sequence = index[i]; 20 | uint32_t cardinality = sequence.cardinality(); 21 | essentials::uniform_int_rng rng(0, cardinality - 1); 22 | for (uint32_t k = 0; k != num_queries_per_sequence; ++k) { 23 | queries.push_back(rng.gen()); 24 | } 25 | } 26 | 27 | // std::random_shuffle(queries.begin(), queries.end()); 28 | for (auto q : queries) std::cout << q << "\n"; 29 | } 30 | 31 | int main(int argc, char** argv) { 32 | int mandatory = 3; 33 | 
if (argc < mandatory) { 34 | std::cout << argv[0] << " index_filename num_queries_per_sequence" 35 | << std::endl; 36 | return 1; 37 | } 38 | 39 | generate(argv[1], std::atoi(argv[2])); 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /tools/gen_uniform_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "building_util.hpp" 7 | 8 | #include "../external/essentials/include/essentials.hpp" 9 | 10 | void gen(uint32_t num_lists, uint32_t min_length, uint32_t max_length, 11 | uint32_t universe, char const* output_filename) { 12 | using namespace sliced; 13 | std::ofstream out(std::string(output_filename), 14 | std::ios_base::binary | std::ios_base::out); 15 | // header: singleton list containing the universe 16 | write_uint(uint32_t(1), out); 17 | write_uint(universe, out); 18 | essentials::uniform_int_rng length(min_length, max_length); 19 | essentials::uniform_int_rng element(0, universe); 20 | std::vector list; 21 | list.reserve(max_length); 22 | for (uint32_t i = 0; i != num_lists; ++i) { 23 | list.clear(); 24 | uint32_t n = length.gen(); 25 | for (uint32_t k = 0; k != n; ++k) list.push_back(element.gen()); 26 | std::sort(list.begin(), list.end()); 27 | auto it = std::unique(list.begin(), list.end()); 28 | n = std::distance(list.begin(), it); 29 | write_uint(n, out); 30 | char const* begin = reinterpret_cast(list.data()); 31 | out.write(begin, n * sizeof(uint32_t)); 32 | } 33 | out.close(); 34 | } 35 | 36 | int main(int argc, char** argv) { 37 | int mandatory = 6; 38 | if (argc < mandatory) { 39 | std::cout << argv[0] 40 | << " num_lists min_length max_length universe output_filename" 41 | << std::endl; 42 | return 1; 43 | } 44 | 45 | uint32_t num_lists = std::atoi(argv[1]); 46 | uint32_t min_length = std::atoi(argv[2]); 47 | uint32_t max_length = std::atoi(argv[3]); 48 | uint32_t universe = 
std::atoi(argv[4]); 49 | char const* output_filename = argv[5]; 50 | 51 | gen(num_lists, min_length, max_length, universe, output_filename); 52 | 53 | return 0; 54 | } 55 | --------------------------------------------------------------------------------