├── .clang-format
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── data
    └── test_sequence
├── external
    └── CMakeLists.txt
├── include
    ├── builder.hpp
    ├── building_util.hpp
    ├── constants.hpp
    ├── contains.hpp
    ├── decode.hpp
    ├── enumerator.hpp
    ├── intersection.hpp
    ├── intersection_many.hpp
    ├── next_geq.hpp
    ├── next_geq_enumerator.hpp
    ├── s_index.hpp
    ├── s_sequence.hpp
    ├── select.hpp
    ├── table.hpp
    ├── uncompress.hpp
    ├── uncompress_chunk_and_intersect.hpp
    ├── uncompress_chunk_and_merge.hpp
    ├── union.hpp
    ├── union_many.hpp
    └── util.hpp
├── script
    ├── build.py
    └── queries.py
├── src
    ├── CMakeLists.txt
    ├── build.cpp
    ├── cardinality.cpp
    ├── contains.cpp
    ├── decode.cpp
    ├── example.cpp
    ├── intersect.cpp
    ├── next_geq.cpp
    ├── select.cpp
    ├── uncompress.cpp
    └── union.cpp
├── statistics
    └── README.md
├── test
    ├── CMakeLists.txt
    ├── test_common.hpp
    ├── test_contains.cpp
    ├── test_decode.cpp
    ├── test_enumerator.cpp
    ├── test_intersect.cpp
    ├── test_intersect_many.cpp
    ├── test_next_geq.cpp
    ├── test_next_geq_enumerator.cpp
    ├── test_select.cpp
    ├── test_uncompress.cpp
    ├── test_union.cpp
    └── test_union_many.cpp
└── tools
    ├── CMakeLists.txt
    ├── gen_clustered_data.cpp
    ├── gen_random_next_geq_queries.cpp
    ├── gen_random_pairwise_queries.cpp
    ├── gen_random_select_queries.cpp
    └── gen_uniform_data.cpp


/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | Language:        Cpp
  3 | # BasedOnStyle:  Google
  4 | AccessModifierOffset: -4
  5 | AlignAfterOpenBracket: Align
  6 | AlignConsecutiveAssignments: false
  7 | AlignConsecutiveDeclarations: false
  8 | AlignEscapedNewlines: Left
  9 | AlignOperands:   true
 10 | AlignTrailingComments: true
 11 | AllowAllParametersOfDeclarationOnNextLine: true
 12 | AllowShortBlocksOnASingleLine: true
 13 | AllowShortCaseLabelsOnASingleLine: false
 14 | AllowShortFunctionsOnASingleLine: Empty
 15 | AllowShortIfStatementsOnASingleLine: true
 16 | AllowShortLoopsOnASingleLine: true
 17 | AlwaysBreakAfterDefinitionReturnType: None
 18 | AlwaysBreakAfterReturnType: None
 19 | AlwaysBreakBeforeMultilineStrings: true
 20 | AlwaysBreakTemplateDeclarations: Yes
 21 | BinPackArguments: true
 22 | BinPackParameters: true
 23 | BraceWrapping:
 24 |   AfterClass:      false
 25 |   AfterControlStatement: false
 26 |   AfterEnum:       false
 27 |   AfterFunction:   false
 28 |   AfterNamespace:  false
 29 |   AfterObjCDeclaration: false
 30 |   AfterStruct:     false
 31 |   AfterUnion:      false
 32 |   AfterExternBlock: false
 33 |   BeforeCatch:     false
 34 |   BeforeElse:      false
 35 |   IndentBraces:    false
 36 |   SplitEmptyFunction: true
 37 |   SplitEmptyRecord: true
 38 |   SplitEmptyNamespace: true
 39 | BreakBeforeBinaryOperators: None
 40 | BreakBeforeBraces: Attach
 41 | BreakBeforeInheritanceComma: false
 42 | BreakInheritanceList: BeforeComma
 43 | BreakBeforeTernaryOperators: true
 44 | BreakConstructorInitializersBeforeComma: true
 45 | BreakConstructorInitializers: BeforeComma
 46 | BreakAfterJavaFieldAnnotations: false
 47 | BreakStringLiterals: true
 48 | ColumnLimit:     80
 49 | CommentPragmas:  '^ IWYU pragma:'
 50 | CompactNamespaces: false
 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: false
 52 | ConstructorInitializerIndentWidth: 4
 53 | ContinuationIndentWidth: 4
 54 | Cpp11BracedListStyle: true
 55 | DerivePointerAlignment: false
 56 | DisableFormat:   false
 57 | ExperimentalAutoDetectBinPacking: false
 58 | FixNamespaceComments: true
 59 | ForEachMacros:
 60 |   - foreach
 61 |   - Q_FOREACH
 62 |   - BOOST_FOREACH
 63 | IncludeBlocks:   Preserve
 64 | IncludeCategories:
 65 |   - Regex:           '^<ext/.*\.h>'
 66 |     Priority:        2
 67 |   - Regex:           '^<.*\.h>'
 68 |     Priority:        1
 69 |   - Regex:           '^<.*'
 70 |     Priority:        2
 71 |   - Regex:           '.*'
 72 |     Priority:        3
 73 | IncludeIsMainRegex: '([-_](test|unittest))?$'
 74 | IndentCaseLabels: true
 75 | IndentPPDirectives: None
 76 | IndentWidth:     4
 77 | IndentWrappedFunctionNames: false
 78 | JavaScriptQuotes: Leave
 79 | JavaScriptWrapImports: true
 80 | KeepEmptyLinesAtTheStartOfBlocks: false
 81 | MacroBlockBegin: ''
 82 | MacroBlockEnd:   ''
 83 | MaxEmptyLinesToKeep: 1
 84 | NamespaceIndentation: None
 85 | ObjCBinPackProtocolList: Never
 86 | ObjCBlockIndentWidth: 2
 87 | ObjCSpaceAfterProperty: false
 88 | ObjCSpaceBeforeProtocolList: true
 89 | PenaltyBreakAssignment: 2
 90 | PenaltyBreakBeforeFirstCallParameter: 1
 91 | PenaltyBreakComment: 300
 92 | PenaltyBreakFirstLessLess: 120
 93 | PenaltyBreakString: 1000
 94 | PenaltyBreakTemplateDeclaration: 10
 95 | PenaltyExcessCharacter: 1000000
 96 | PenaltyReturnTypeOnItsOwnLine: 200
 97 | PointerAlignment: Left
 98 | RawStringFormats:
 99 |   - Language:        Cpp
100 |     Delimiters:
101 |       - cc
102 |       - CC
103 |       - cpp
104 |       - Cpp
105 |       - CPP
106 |       - 'c++'
107 |       - 'C++'
108 |     CanonicalDelimiter: ''
109 |     BasedOnStyle:    google
110 |   - Language:        TextProto
111 |     Delimiters:
112 |       - pb
113 |       - PB
114 |       - proto
115 |       - PROTO
116 |     EnclosingFunctions:
117 |       - EqualsProto
118 |       - EquivToProto
119 |       - PARSE_PARTIAL_TEXT_PROTO
120 |       - PARSE_TEST_PROTO
121 |       - PARSE_TEXT_PROTO
122 |       - ParseTextOrDie
123 |       - ParseTextProtoOrDie
124 |     CanonicalDelimiter: ''
125 |     BasedOnStyle:    google
126 | ReflowComments:  true
127 | SortIncludes:    false
128 | SortUsingDeclarations: false
129 | SpaceAfterCStyleCast: false
130 | SpaceAfterTemplateKeyword: true
131 | SpaceBeforeAssignmentOperators: true
132 | SpaceBeforeCpp11BracedList: false
133 | SpaceBeforeCtorInitializerColon: true
134 | SpaceBeforeInheritanceColon: true
135 | SpaceBeforeParens: ControlStatements
136 | SpaceBeforeRangeBasedForLoopColon: true
137 | SpaceInEmptyParentheses: false
138 | SpacesBeforeTrailingComments: 2
139 | SpacesInAngles:  false
140 | SpacesInContainerLiterals: true
141 | SpacesInCStyleCastParentheses: false
142 | SpacesInParentheses: false
143 | SpacesInSquareBrackets: false
144 | Standard:        Auto
145 | StatementMacros:
146 |   - Q_UNUSED
147 |   - QT_REQUIRE_VERSION
148 | TabWidth:        8
149 | UseTab:          Never
150 | ...
151 | 
152 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | build
3 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/mm_file"]
2 | 	path = external/mm_file
3 | 	url = https://github.com/jermp/mm_file.git
4 | [submodule "external/essentials"]
5 | 	path = external/essentials
6 | 	url = https://github.com/jermp/essentials.git
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project(S_INDEXES)
 3 | 
 4 | if(NOT CMAKE_BUILD_TYPE)
 5 |   set(CMAKE_BUILD_TYPE "Release")
 6 | endif()
 7 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} )
 8 | 
 9 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
10 | 
11 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
12 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
13 | endif ()
14 | 
15 | if(UNIX)
16 | 
17 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
18 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
19 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
20 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
21 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces")
22 | 
23 |   if(USE_SANITIZERS)
24 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
25 |   endif()
26 | 
27 | endif()
28 | 
29 | include_directories(${S_INDEXES_SOURCE_DIR}/include)
30 | 
31 | add_subdirectory(external)
32 | add_subdirectory(src)
33 | add_subdirectory(test)
34 | add_subdirectory(tools)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright 2019-2021 Giulio Ermanno Pibiri
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included
13 | in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 | OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Sliced Indexes
  2 | ==============
  3 | 
  4 | A C++ implementation of [*sliced indexes*](https://arxiv.org/abs/1907.01032) [3,4],
  5 | that can be used to compress bitmaps and inverted lists.
  6 | 
  7 | Also refer to the [CSUR paper](https://arxiv.org/abs/1908.10598v2) [5] for further experiments and comparisons (Section 6).
  8 | 
  9 | This guide is meant to provide a brief overview of the library and to illustrate its functionalities through some examples.
 10 | ##### Table of contents
 11 | * [Compiling the code](#compiling-the-code)
 12 | * [Quick Start](#quick-start)
 13 | * [Building a collection of sequences](#building-a-collection-of-sequences)
 14 | * [Operations](#operations)
 15 | * [Testing](#testing)
 16 | * [Tools](#tools)
 17 | * [An example microbenchmark](#an-example-microbenchmark)
 18 | * [Authors](#authors)
 19 | * [References](#references)
 20 | 
 21 | Compiling the code
 22 | ------------------
 23 | 
 24 | The code is tested on Linux with `gcc` 7.3.0 and on Mac 10.14 with `clang` 10.0.0.
 25 | To build the code, [`CMake`](https://cmake.org/) is required.
 26 | 
 27 | The code has a few external dependencies (for testing, serialization and memory-mapping facilities), so clone the repository with
 28 | 
 29 | 	git clone --recursive https://github.com/jermp/s_indexes.git
 30 | 
 31 | If you have cloned the repository without `--recursive`, you will need to perform the following commands before
 32 | compiling:
 33 | 
 34 | 	git submodule init
 35 | 	git submodule update
 36 | 
 37 | To compile the code for a release environment (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following:
 38 | 
 39 |     mkdir build
 40 |     cd build
 41 |     cmake ..
 42 |     make
 43 | 
 44 | Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs.
 45 | 
 46 | For a testing environment, use the following instead:
 47 | 
 48 |     mkdir debug_build
 49 |     cd debug_build
 50 |     cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
 51 |     make
 52 | 
 53 | Quick Start
 54 | -------
 55 | 
 56 | For a quick start, see the source file `src/example.cpp`.
 57 | After compilation, run this example with
 58 | 
 59 | 	./example < ../data/test_sequence
 60 | 
 61 | which will:
 62 | 
 63 | 1. read from the standard input using a test
 64 | sequence from the directory `data`;
 65 | 2. build the data structure in memory and perform some operations (decode and select).
 66 | 
 67 | By specifying an output file name, it is possible to
 68 | serialize the data structure on disk. To perform the
 69 | operations, the data structure is then memory mapped
 70 | from such file. To do so, type
 71 | 
 72 | 	./example -o out.bin < ../data/test_sequence
 73 | 
 74 | ```C++
 75 | #include <iostream>
 76 | 
 77 | #include "../external/essentials/include/essentials.hpp"
 78 | #include "builder.hpp"
 79 | #include "s_sequence.hpp"
 80 | #include "select.hpp"
 81 | #include "decode.hpp"
 82 | 
 83 | using namespace sliced;
 84 | 
 85 | int main(int argc, char** argv) {
 86 |     int mandatory = 1;
 87 |     char const* output_filename = nullptr;
 88 | 
 89 |     for (int i = mandatory; i != argc; ++i) {
 90 |         if (std::string(argv[i]) == "-o") {
 91 |             ++i;
 92 |             output_filename = argv[i];
 93 |         } else if (std::string(argv[i]) == "-h") {
 94 |             std::cout << argv[0] << " -o output_filename < input" << std::endl;
 95 |             return 1;
 96 |         } else {
 97 |             std::cout << "unknown option '" << argv[i] << "'" << std::endl;
 98 |             return 1;
 99 |         }
100 |     }
101 | 
102 |     std::vector<uint32_t> input;
103 | 
104 |     {  // read input from std::in
105 |         uint32_t n, x;
106 |         std::cin >> n;
107 |         input.reserve(n);
108 |         for (uint32_t i = 0; i != n; ++i) {
109 |             std::cin >> x;
110 |             input.push_back(x);
111 |         }
112 |     }
113 | 
114 |     // build the sequence and print statistics
115 |     s_sequence::builder builder;
116 |     auto stats = builder.build(input.data(), input.size());
117 |     stats.print();
118 | 
119 |     mm::file_source<uint8_t> mm_file;
120 |     uint8_t const* data = nullptr;
121 | 
122 |     if (output_filename) {  // if an output file is specified, then serialize
123 |         essentials::print_size(builder);
124 |         essentials::save<s_sequence::builder>(builder, output_filename);
125 | 
126 |         // mmap
127 |         int advice = mm::advice::normal;  // can be also random and sequential
128 |         mm_file.open(output_filename, advice);
129 | 
130 |         // skip first 8 bytes storing the number of written bytes
131 |         data = mm_file.data() + 8;
132 | 
133 |     } else {  // otherwise work directly in memory
134 |         data = builder.data();
135 |     }
136 | 
137 |     // initialize a s_sequence from data, regardless the source
138 |     s_sequence ss(data);
139 | 
140 |     uint32_t size = ss.size();
141 | 
142 |     // decode whole list to an output buffer
143 |     std::vector<uint32_t> out(size);
144 |     ss.decode(out.data());
145 |     // check written values
146 |     uint32_t value = 0;
147 |     for (uint32_t i = 0; i != size; ++i) {
148 |         if (input[i] != out[i]) {
149 |             std::cout << "got " << out[i] << " but expected " << input[i]
150 |                       << std::endl;
151 |             return 1;
152 |         }
153 | 
154 |         ss.select(i, value);  // select i-th element
155 |         if (value != out[i]) {
156 |             std::cout << "got " << value << " but expected " << out[i]
157 |                       << std::endl;
158 |             return 1;
159 |         }
160 |     }
161 | 
162 |     return 0;
163 | }
164 | ```
165 | 
166 | Building a collection of sequences
167 | ----------------------------------
168 | 
169 | Typically, we want to build all the sequences from
170 | a collection.
171 | In this case, we assume that the input collection
172 | is a binary file with all the sequences being written
173 | as 32-bit integers, as is also common for other libraries
174 | such as [`ds2i`](https://github.com/ot/ds2i).
175 | In particular, each sequence is prefixed by an additional
176 | 32-bit integer representing the size of the sequence.
177 | The collection file starts with a singleton sequence
178 | containing the universe of representation of the sequences, i.e., the maximum representable value.
179 | 
180 | For example, a test input collection with 100 sequences drawn
181 | from a universe of size 1,000,000 can be generated
182 | with
183 | 
184 | 	./gen_clustered_data 100 1000000 test_collection --binary
185 | 
186 | To build an index from such a collection, use
187 | 
188 | 	./build test_collection --density 0.01 --out test_collection.out
189 | 
190 | with a density threshold of 0.01 and an output file
191 | `test_collection.out` onto which the data structure is serialized.
192 | You should get an output like:
193 | 
194 | 	universe size: 1000000
195 | 	processed 100 sequences, 45911859 integers
196 | 	chunks: 1572
197 | 	full chunks: 466 (66.5183% of ints)
198 | 	empty chunks: 310 (19.7201% of chunks)
199 | 	dense chunks: 513 (30.3916% of ints)
200 | 	sparse chunks: 283 (3.09016% of ints)
201 | 	blocks: 23395
202 | 	empty blocks: 14 (0.0598418% of blocks)
203 | 	dense blocks: 7614 (2.53826% of ints)
204 | 	sparse blocks: 15767 (0.551905% of ints)
205 | 	0.00179405 [bpi] for chunks' headers
206 | 	0.00540078 [bpi] for blocks' headers
207 | 	0.732272 [bpi] for dense chunks
208 | 	0.0424549 [bpi] for dense blocks
209 | 	0.0468998 [bpi] for sparse blocks
210 | 	total bytes: 4757416
211 | 	total bpi: 0.828965
212 | 
213 | from which you can see some statistics about the built data structure.
214 | 
215 | Operations
216 | ----------
217 | 
218 | Given a single *sliced* sequence, it is possible to execute the
219 | following operations (see also `include/s_sequence.hpp`):
220 | 
221 | ```C++
222 | /* decode the sequence to the output buffer */
223 | size_t decode(uint32_t* out) const;
224 | 
225 | /* convert the sequence to an output bitmap */
226 | size_t uncompress(uint64_t* out) const;
227 | 
228 | /* select the i-th value */
229 | bool select(uint32_t i, uint32_t& value) const;
230 | 
231 | /* check if value is present in the sequence */
232 | bool contains(uint32_t value) const;
233 | 
234 | /* returns the minimum value that is >= lower_bound
235 |    if found, otherwise a "not found" value is returned */
236 | uint32_t next_geq(uint32_t lower_bound) const;
237 | ```
238 | 
239 | Given a collection of (at least 2) *sliced* sequences, it is possible to perform intersection and merging of the sequences:
240 | 
241 | ```C++
242 | /* writes the result of the intersection between l and r to the output buffer,
243 |    returning the size of the result */
244 | size_t pairwise_intersection(s_sequence const& l, s_sequence const& r, uint32_t* out);
245 | 
246 | /* writes the result of the union between l and r to the output buffer,
247 |    returning the size of the result */
248 | size_t pairwise_union(s_sequence const& l, s_sequence const& r, uint32_t* out);
249 | 
250 | /* writes the result of the intersection between the
251 |    sequences to the output buffer, returning the size of the result */
252 | size_t intersection(std::vector<s_sequence>& sequences, uint32_t* out);
253 | 
254 | /* writes the result of the union between the
255 |    sequences to the output buffer, returning the size of the result */
256 | size_t union_many(std::vector<s_sequence>& sequences, uint32_t* out);
257 | 
258 | ```
259 | 
260 | The source `src` folder contains programs to benchmark such operations.
261 | 
262 | #### Example 1.
263 | Use:
264 | 
265 | 	./decode test_collection.out
266 | 
267 | to decode all the sequences in the collection. You should get something
268 | like:
269 | 
270 | 	decoded 100 sequences
271 | 	decoded 45911859 integers
272 | 	Elapsed time: 0.034721 [sec]
273 | 	Mean per sequence: 347.21 [musec]
274 | 	Mean per integer: 0.756253 [ns]
275 | 
276 | #### Example 2.
277 | To execute some intersection operations, first generate some queries with
278 | 
279 | 	./gen_random_pairwise_queries 1000 100 > test_pairwise_queries
280 | 
281 | and then run
282 | 
283 | 	./intersect test_collection.out 1000 < test_pairwise_queries
284 | 
285 | You should get something like:
286 | 
287 | 	performing 1000 pairwise-intersections...
288 | 	Mean per run: 136562 [musec]
289 | 	Mean per query: 136.562 [musec]
290 | 
291 | Testing
292 | -------
293 | The subfolder `test` contains testing programs to maintain
294 | the correctness of the implementation.
295 | 
296 | To run a test, just run the corresponding program without
297 | arguments to see the required ones.
298 | 
299 | For example, to test decoding correctness, use
300 | 
301 | 	./test_decode test_collection.out ../data/test_collection 0.01
302 | 
303 | which will check every decoded integer against the original input
304 | collection (note that you must provide the *correct* original input collection as well as the *density level* that was used during building).
305 | 
306 | Tools
307 | -----
308 | The subfolder `tools` contains some programs generating
309 | synthetic data to test the code.
310 | 
311 | For example, the sequence `data/test_sequence` was generated with
312 | 
313 | 	./gen_clustered_data 1 1000000 test_sequence
314 | 
315 | A test collection can be generated with
316 | 
317 | 	./gen_clustered_data 100 1000000 test_collection --binary
318 | 
319 | A test query log can be generated with
320 | 
321 | 	./gen_random_pairwise_queries 1000 100 > test_pairwise_queries
322 | 
323 | An example microbenchmark
324 | -----
325 | In the following microbenchmark we show the number of bits per integer (bpi) and average microseconds per list intersection query with 2 sequences.
326 | 
327 | We compare Slicing with Roaring [1] and Partitioned Elias-Fano [2].
328 | 
329 | We use the datasets Census-Income, Census-1881, Weather and Wikileaks shipped with the [CRoaring Library](https://github.com/RoaringBitmap/CRoaring) (see directory `benchmarks/realdata`).
330 | See [1] for a description of such datasets.
331 | 
332 | To measure the bpi rate, we serialize the data structures and take the written number of bytes.
333 | To measure query timings, we compute 1,000 intersections between
334 | random pairs of lists, repeat this 10 times, and report the average.
335 | 
336 | The benchmark was executed on a Linux 4.4.0 server machine with
337 | an Intel i7-7700 CPU (@3.6 GHz) and 64 GB of RAM.
338 | The code was compiled with gcc 7.3.0 with all optimizations
339 | (see also `CMakeLists.txt`).
340 | 
341 | #### Table 1. Bits per integer
342 | |**Dataset** |**Roaring** | **Slicing**  | **PEF**|
343 | |------------|-----------:|-------------:|-------:|
344 | |Census-Income|	2.74	   |2.23	|2.03|
345 | |Census-1881|	  15.93      |	10.83|	7.28|
346 | |Weather	|     5.43      |	4.05|	3.13|
347 | |Wikileaks|	  16.30      |	10.18|	8.87|
348 | 
349 | #### Table 2. µsec per list intersection
350 | |**Dataset**  |**Roaring** | **Slicing**  | **PEF**|
351 | |-------------|-----------:|-------------:|-------:|
352 | |Census-Income|	4.68	|11.56|	115.17|
353 | |Census-1881  |	0.15	|0.18	|0.92|
354 | |Weather	    |  13.37	|25.70	|213.00|
355 | |Wikileaks    |	0.86	|0.47	|2.51|
356 | 
357 | Authors
358 | -------
359 | * [Giulio Ermanno Pibiri](http://pages.di.unipi.it/pibiri/), <giulio.ermanno.pibiri@isti.cnr.it>
360 | 
361 | References
362 | -------
363 | 
364 | * [1] Daniel Lemire, Owen Kaser, Nathan Kurz, Luca Deri, Chris O’Hara, François Saint-Jacques, and Gregory Ssi-Yan-Kai. 2018. *Roaring bitmaps: Implementation of an optimized software library*. Software: Practice and Experience 48, 4,
365 | 867–895.
366 | * [2] Giuseppe Ottaviano and Rossano Venturini. *Partitioned Elias-Fano Indexes*. 2014. In Proceedings of the 37th International
367 | Conference on Research and Development in Information Retrieval. 273–282.
368 | * [3] Giulio Ermanno Pibiri. *Fast and Compact Set Intersection through Recursive Universe Partitioning*. 2021. IEEE Data Compression Conference (DCC).
369 | * [4] Giulio Ermanno Pibiri. *On Slicing Sorted Integer Sequences*. 2019. arXiv preprint. https://arxiv.org/abs/1907.01032
370 | * [5] Giulio Ermanno Pibiri and Rossano Venturini. *Techniques for Inverted Index Compression*. 2020. ACM Computing Surveys (CSUR). [https://arxiv.org/abs/1908.10598v2](https://arxiv.org/abs/1908.10598v2)
371 | 


--------------------------------------------------------------------------------
/external/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | include_directories(mm_file/include/mm_file)
2 | include_directories(essentials/include)


--------------------------------------------------------------------------------
/include/builder.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <iostream>
  4 | #include <cassert>
  5 | #include <vector>
  6 | 
  7 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
  8 | 
  9 | #include "s_index.hpp"
 10 | #include "util.hpp"
 11 | #include "building_util.hpp"
 12 | 
 13 | namespace sliced {
 14 | 
 15 | void encode_block(std::vector<uint32_t>& block, uint32_t& id,
 16 |                   std::vector<uint8_t>& header, std::vector<uint8_t>& data) {
 17 |     if (block.size() > 0) {
 18 |         write_uint<uint8_t>(id, header);
 19 |         write_uint<uint8_t>(block.size() - 1, header);
 20 |         if (block.size() >= constants::block_sparseness_threshold - 1) {
 21 |             write_bits(block.data(), block.size(), constants::block_size, 0,
 22 |                        data);
 23 |         } else {
 24 |             for (auto pos : block) write_uint<uint8_t>(pos, data);
 25 |         }
 26 |         block.clear();
 27 |     }
 28 |     id += 1;
 29 | }
 30 | 
 31 | void encode_sparse_chunk(uint32_t const* begin, uint32_t const* end, slice s,
 32 |                          std::vector<uint32_t>& block,
 33 |                          std::vector<uint8_t>& out) {
 34 |     std::vector<uint8_t> header;
 35 |     std::vector<uint8_t> data;
 36 |     header.reserve(256 * 2);  // at most
 37 |     data.reserve(256 * 32);   // at most
 38 |     uint32_t id = 0;
 39 |     uint32_t base = s.left;
 40 |     while (begin != end and *begin < s.right) {
 41 |         uint32_t val = *begin - base;
 42 |         if (val >= constants::block_size) {
 43 |             encode_block(block, id, header, data);
 44 |             base += constants::block_size;
 45 |         } else {
 46 |             assert(val < constants::block_size);
 47 |             block.push_back(val);
 48 |             assert(block.size() <= constants::block_size);
 49 |             ++begin;
 50 |         }
 51 |     }
 52 |     encode_block(block, id, header, data);
 53 |     out.insert(out.end(), header.begin(), header.end());
 54 |     out.insert(out.end(), data.begin(), data.end());
 55 | }
 56 | 
 57 | void encode_sequence(uint32_t const* data, size_t n,
 58 |                      std::vector<uint32_t>& block, statistics& stats,
 59 |                      std::vector<uint8_t>& out) {
 60 |     assert(block.empty());
 61 |     auto begin = data;
 62 |     auto end = data + n;
 63 |     uint32_t universe = *(data + n - 1);
 64 |     uint32_t chunks = num_chunks(universe);
 65 |     assert(chunks > 0 and chunks <= constants::chunk_size);
 66 | 
 67 |     stats.sequences += 1;
 68 |     stats.integers += n;
 69 |     stats.chunks += chunks;
 70 | 
 71 |     std::vector<uint16_t> chunks_header;
 72 |     chunks_header.reserve(4 * constants::chunk_size);  // at most
 73 |     std::vector<uint8_t> tmp;
 74 | 
 75 |     const uint32_t dense_chunk_bytes = bytes_for(constants::chunk_size);
 76 |     slice s = {0, constants::chunk_size};
 77 | 
 78 |     for (uint32_t i = 0; i != chunks; ++i) {
 79 |         uint32_t cardinality = 0;
 80 |         if (*begin < s.right) {
 81 |             cardinality = chunk_cardinality(begin, end, s);
 82 |             chunks_header.push_back(i);
 83 |             chunks_header.push_back(cardinality - 1);
 84 | 
 85 |             if (cardinality < constants::chunk_sparseness_threshold) {
 86 |                 auto sparse_chunk_stats = sparse_chunk_bitsize(begin, end, s);
 87 | 
 88 |                 uint64_t sparse_chunk_bytes =
 89 |                     (sparse_chunk_stats.dense_blocks * 16 +
 90 |                      // NOTE: the 8 bits for the cardinality of sparse
 91 |                      // blocks are already accounted in sparse_blocks_bits
 92 |                      sparse_chunk_stats.sparse_blocks * 8 +
 93 |                      sparse_chunk_stats.dense_blocks_bits +
 94 |                      sparse_chunk_stats.sparse_blocks_bits) /
 95 |                     8;
 96 | 
 97 |                 if (sparse_chunk_bytes >= dense_chunk_bytes) {
 98 |                     stats.dense_chunks += 1;
 99 |                     stats.dense_chunks_bits += dense_chunk_bytes * 8;
100 |                     stats.integers_in_dense_chunks += cardinality;
101 |                     chunks_header.push_back(type::dense);
102 |                     chunks_header.push_back(constants::chunk_size / 8);
103 |                     write_bits(begin, cardinality, constants::chunk_size,
104 |                                s.left, tmp);
105 |                 } else {
106 |                     /*
107 |                         We would like clusters of:
108 |                         - few blocks in chunk + blocks have sufficiently large
109 |                           cardinality for method 1;
110 |                         - many blocks in chunk + blocks have low cardinality for
111 |                           method 2.
112 |                     */
113 | 
114 |                     stats.sparse_chunks += 1;
115 |                     stats.integers_in_sparse_chunks += cardinality;
116 | 
117 |                     uint16_t num_non_empty_blocks =
118 |                         sparse_chunk_stats.dense_blocks +
119 |                         sparse_chunk_stats.sparse_blocks;
120 |                     assert(num_non_empty_blocks >= 1 and
121 |                            num_non_empty_blocks <=
122 |                                constants::chunk_size / constants::block_size);
123 | 
124 |                     // how many chunks that have :
125 |                     // 1 block, 2 blocks, 3 blocks...
126 |                     stats.num_blocks_in_chunks[num_non_empty_blocks] += 1;
127 |                     stats.num_integers[num_non_empty_blocks] += cardinality;
128 |                     stats.blocks +=
129 |                         num_non_empty_blocks + sparse_chunk_stats.empty_blocks;
130 | 
131 |                     stats.accumulate(sparse_chunk_stats);
132 | 
133 |                     uint16_t packed = type::sparse;
134 |                     packed |= (num_non_empty_blocks - 1) << 8;
135 |                     chunks_header.push_back(packed);
136 | 
137 |                     chunks_header.push_back(sparse_chunk_bytes);
138 | 
139 |                     encode_sparse_chunk(begin, end, s, block, tmp);
140 |                 }
141 | 
142 |             } else {
143 |                 if (cardinality == constants::chunk_size) {
144 |                     stats.full_chunks += 1;
145 |                     stats.integers_in_full_chunks += cardinality;
146 |                     chunks_header.push_back(type::full);
147 |                     chunks_header.push_back(0);
148 |                 } else {
149 |                     stats.dense_chunks += 1;
150 |                     stats.dense_chunks_bits += dense_chunk_bytes * 8;
151 |                     stats.integers_in_dense_chunks += cardinality;
152 |                     assert(dense_chunk_bytes * 8.0 / cardinality <= 2.0);
153 |                     chunks_header.push_back(type::dense);
154 |                     chunks_header.push_back(constants::chunk_size / 8);
155 |                     write_bits(begin, cardinality, constants::chunk_size,
156 |                                s.left, tmp);
157 |                 }
158 |             }
159 | 
160 |         } else {
161 |             stats.empty_chunks += 1;
162 |         }
163 | 
164 |         s.left = s.right;
165 |         s.right += constants::chunk_size;
166 |         begin += cardinality;
167 |     }
168 | 
169 |     assert(begin == end);
170 |     chunks = chunks_header.size() / 4;
171 |     write_uint<uint16_t>(chunks - 1, out);
172 | 
173 |     // write chunks / constants::associativity pointers
174 |     // NOTE: a pointer is
175 |     // cardinality | byte_offset
176 |     // -----------   -----------
177 |     //   32 bits       32 bits
178 |     uint64_t offsets = chunks / constants::associativity;
179 |     uint32_t offset = 0;
180 |     uint32_t cardinality = 0;
181 |     for (uint64_t i = 0; i != offsets; ++i) {
182 |         uint32_t base = i * 4 * constants::associativity;
183 |         offset = 0;
184 |         cardinality = 0;
185 |         for (uint32_t j = 1; j != constants::associativity + 1; ++j) {
186 |             cardinality += chunks_header[base + 1] + 1;
187 |             offset += chunks_header[base + 3];
188 |             base += 4;
189 |         }
190 |         write_uint<uint32_t>(cardinality, out);
191 |         write_uint<uint32_t>(offset, out);
192 |     }
193 | 
194 |     auto ptr = reinterpret_cast<uint8_t const*>(chunks_header.data());
195 |     out.insert(out.end(), ptr,
196 |                ptr + chunks_header.size() * sizeof(chunks_header.front()));
197 |     out.insert(out.end(), tmp.begin(), tmp.end());
198 | 
199 |     stats.chunks_header_bits += chunks * 16 * 4 + 16 +
200 |                                 offsets * sizeof(offset) * 8 +
201 |                                 offsets * sizeof(cardinality) * 8;
202 | }
203 | 
204 | statistics encode_sequence(uint32_t const* data, size_t n,
205 |                            std::vector<uint8_t>& out) {
206 |     std::vector<uint32_t> block;
207 |     block.reserve(constants::block_size);
208 |     statistics stats;
209 | 
210 |     encode_sequence(data, n, block, stats, out);
211 | 
212 |     stats.blocks_header_bits =
213 |         stats.dense_blocks * 16 + stats.sparse_blocks * 8;
214 |     stats.bits = stats.chunks_header_bits + stats.blocks_header_bits +
215 |                  stats.dense_chunks_bits + stats.dense_blocks_bits +
216 |                  stats.sparse_blocks_bits;
217 | 
218 |     stats.bits += 2 * 64;
219 | 
220 |     return stats;
221 | }
222 | 
223 | struct s_index::builder {
224 |     builder(parameters const& params)
225 |         : m_params(params) {}
226 | 
227 |     statistics build() {
228 |         mm::file_source<uint32_t> input(m_params.collection_filename,
229 |                                         mm::advice::sequential);
230 |         uint32_t const* data = input.data();
231 |         std::vector<uint32_t> block;
232 |         block.reserve(constants::block_size);
233 |         statistics stats;
234 | 
235 |         assert(data[0] == 1);
236 |         std::cout << "universe size: " << data[1] << std::endl;
237 | 
238 |         m_offsets.push_back(data[1]);
239 |         m_offsets.push_back(0);
240 | 
241 |         for (size_t i = 2;  // first two values reserved for a singleton
242 |                             // sequence containing the universe size
243 |              i < input.size();) {
244 |             uint32_t n = data[i];
245 |             uint32_t universe = data[i + n];
246 |             if (pass(m_params, n, universe)) {
247 |                 encode_sequence(data + i + 1, n, block, stats, m_sequences);
248 |                 m_offsets.push_back(m_sequences.size());
249 |                 if (stats.sequences % 1000 == 0) {
250 |                     std::cout << "processed " << stats.sequences << " sequences"
251 |                               << std::endl;
252 |                 }
253 |             }
254 |             i += n + 1;
255 |         }
256 | 
257 |         m_offsets.pop_back();
258 | 
259 |         stats.blocks_header_bits =
260 |             stats.dense_blocks * 16 + stats.sparse_blocks * 8;
261 |         stats.bits = stats.chunks_header_bits + stats.blocks_header_bits +
262 |                      stats.dense_chunks_bits + stats.dense_blocks_bits +
263 |                      stats.sparse_blocks_bits;
264 | 
265 |         stats.bits += 2 * 64;
266 |         stats.bits += m_offsets.size() * 64;
267 | 
268 |         return stats;
269 |     }
270 | 
271 |     template <typename Visitor>
272 |     void visit(Visitor& visitor) {
273 |         visitor.visit(m_offsets);
274 |         visitor.visit(m_sequences);
275 |     }
276 | 
277 | private:
278 |     parameters const& m_params;
279 |     std::vector<uint64_t> m_offsets;
280 |     std::vector<uint8_t> m_sequences;
281 | };
282 | 
283 | struct s_sequence::builder {
284 |     builder() {}
285 | 
286 |     statistics build(uint32_t const* data, size_t n) {
287 |         return encode_sequence(data, n, m_out);
288 |     }
289 | 
290 |     uint8_t const* data() const {
291 |         return m_out.data();
292 |     }
293 | 
294 |     template <typename Visitor>
295 |     void visit(Visitor& visitor) {
296 |         visitor.visit(m_out);
297 |     }
298 | 
299 | private:
300 |     std::vector<uint8_t> m_out;
301 | };
302 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/building_util.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cassert>
  4 | 
  5 | #include "constants.hpp"
  6 | #include "util.hpp"
  7 | 
  8 | namespace sliced {
  9 | 
 10 | struct slice {
 11 |     uint32_t left;
 12 |     uint32_t right;
 13 | };
 14 | 
 15 | template <typename W>
 16 | void set_bit(size_t position, std::vector<W>& bits) {
 17 |     assert(position < bits.size() * sizeof(W) * 8);
 18 |     size_t w = position / (sizeof(W) * 8);
 19 |     size_t o = position % (sizeof(W) * 8);
 20 |     bits[w] |= W(1) << o;
 21 | }
 22 | 
 23 | template <typename T>
 24 | void write_uint(T val, std::vector<uint8_t>& out) {
 25 |     auto ptr = reinterpret_cast<uint8_t const*>(&val);
 26 |     out.insert(out.end(), ptr, ptr + sizeof(T));
 27 | }
 28 | 
 29 | template <typename T = uint32_t>
 30 | void write_uint(T x, std::ofstream& out) {
 31 |     out.write(reinterpret_cast<char const*>(&x), sizeof(T));
 32 | }
 33 | 
 34 | void write_bits(uint32_t const* begin, size_t n, size_t bits, uint32_t base,
 35 |                 std::vector<uint8_t>& out) {
 36 |     assert(bits % 64 == 0);
 37 |     std::vector<uint64_t> bitmap(bits / 64, 0);
 38 |     for (uint32_t i = 0; i != n; ++i, ++begin) {
 39 |         uint32_t val = *begin - base;
 40 |         set_bit(val, bitmap);
 41 |     }
 42 |     auto ptr = reinterpret_cast<uint8_t const*>(bitmap.data());
 43 |     out.insert(out.end(), ptr, ptr + bitmap.size() * sizeof(bitmap.front()));
 44 | }
 45 | 
 46 | uint32_t chunk_cardinality(uint32_t const* begin, uint32_t const* end,
 47 |                            slice s) {
 48 |     uint32_t c = 0;
 49 |     uint32_t prev = -1;
 50 |     while (begin != end and *begin < s.right) {
 51 |         assert(*begin >= s.left);
 52 |         assert(*begin - s.left < constants::chunk_size);
 53 |         if (*begin == prev) throw std::runtime_error("duplicate element");
 54 |         prev = *begin;
 55 |         ++begin;
 56 |         ++c;
 57 |     }
 58 |     assert(c > 0 and c <= constants::chunk_size);
 59 |     return c;
 60 | }
 61 | 
 62 | void block_bitsize(size_t block_size, statistics& stats) {
 63 |     stats.blocks += 1;
 64 |     assert(block_size <= constants::block_size);
 65 |     if (block_size == 0) {
 66 |         stats.empty_blocks += 1;
 67 |     } else if (block_size >= constants::block_sparseness_threshold - 1) {
 68 |         stats.dense_blocks += 1;
 69 |         stats.dense_blocks_bits += constants::block_size;
 70 |         stats.integers_in_dense_blocks += block_size;
 71 |     } else {
 72 |         assert(block_size <= constants::block_sparseness_threshold - 2);
 73 |         stats.sparse_blocks += 1;
 74 |         stats.integers_in_sparse_blocks += block_size;
 75 |         stats.sparse_blocks_bits += 8 * (block_size + 1);
 76 |         stats.sparse_blocks_cardinalities[block_size] += 1;
 77 |     }
 78 | }
 79 | 
 80 | statistics sparse_chunk_bitsize(uint32_t const* begin, uint32_t const* end,
 81 |                                 slice s) {
 82 |     statistics stats;
 83 |     uint32_t base = s.left;
 84 |     size_t block_size = 0;
 85 |     while (begin != end and *begin < s.right) {
 86 |         uint32_t val = *begin - base;
 87 |         if (val >= constants::block_size) {
 88 |             block_bitsize(block_size, stats);
 89 |             base += constants::block_size;
 90 |             block_size = 0;
 91 |         } else {
 92 |             assert(val < constants::block_size);
 93 |             ++block_size;
 94 |             assert(block_size <= constants::block_size);
 95 |             ++begin;
 96 |         }
 97 |     }
 98 |     block_bitsize(block_size, stats);
 99 |     return stats;
100 | }
101 | 
102 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/constants.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace sliced {
 4 | 
 5 | namespace constants {
 6 | 
 7 | static const uint32_t chunk_size = uint32_t(1) << 16;
 8 | static const uint32_t block_size = uint32_t(1) << 8;
 9 | 
10 | static const uint64_t chunk_size_in_64bit_words = chunk_size / 64;
11 | static const uint64_t block_size_in_64bit_words = block_size / 64;
12 | 
13 | static const uint64_t chunk_sparseness_threshold = chunk_size / 2;
14 | static const uint64_t block_sparseness_threshold = block_size / 8;
15 | 
16 | static const uint32_t associativity = 32;
17 | 
18 | static const uint32_t not_found = uint32_t(-1);
19 | 
20 | }  // namespace constants
21 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/contains.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // #include "immintrin.h"
 4 | 
 5 | namespace sliced {
 6 | 
 7 | bool sparse_block_contains(uint8_t const* begin, int cardinality,
 8 |                            uint32_t value) {
 9 |     // scalar code as fast as SIMD approach
10 |     for (int i = 0; i != cardinality; ++i) {
11 |         if (begin[i] > value) return false;
12 |         if (begin[i] == value) return true;
13 |     }
14 |     return false;
15 | 
16 |     // (void)cardinality;
17 |     // __m256i c = _mm256_set1_epi8(value);
18 |     // __m256i v = _mm256_loadu_si256((__m256i const*)begin);
19 |     // __m256i res = _mm256_cmpeq_epi8(c, v);
20 |     // return _mm256_testz_si256(res, _mm256_setzero_si256());
21 | }
22 | 
23 | bool contains_sparse_chunk(uint8_t const* begin, int blocks, uint32_t value) {
24 |     assert(blocks >= 1 and blocks <= 256);
25 |     uint8_t const* data = begin + blocks * 2;
26 |     uint8_t const* end = data;
27 |     uint32_t block_id = value >> 8;
28 | 
29 |     while (begin != end) {
30 |         uint8_t id = *begin;
31 |         if (id > block_id) return false;
32 |         int c = *(begin + 1) + 1;
33 |         int bytes = 32;
34 |         int type = type::dense;
35 |         if (LIKELY(c < 31)) {
36 |             bytes = c;
37 |             type = type::sparse;
38 |         }
39 |         if (id == block_id) {
40 |             uint32_t base = id * 256;
41 |             assert(value >= base);
42 |             value -= base;
43 |             if (type == type::sparse) {
44 |                 return sparse_block_contains(data, c, value);
45 |             } else {
46 |                 return bitmap_contains(reinterpret_cast<uint64_t const*>(data),
47 |                                        value);
48 |             }
49 |         }
50 |         data += bytes;
51 |         begin += 2;
52 |     }
53 | 
54 |     return false;
55 | }
56 | 
57 | bool s_sequence::contains(uint32_t value) const {
58 |     auto it = begin();
59 |     uint32_t chunk_id = value >> 16;
60 |     it.skip_to_value(chunk_id);
61 |     if (it.id() == chunk_id) {
62 |         value &= 0xFFFF;
63 |         switch (it.type()) {
64 |             case type::sparse:
65 |                 return contains_sparse_chunk(it.data, it.blocks(), value);
66 |             case type::dense:
67 |                 return bitmap_contains(
68 |                     reinterpret_cast<uint64_t const*>(it.data), value);
69 |             case type::full:
70 |                 return true;
71 |             default:
72 |                 assert(false);
73 |                 __builtin_unreachable();
74 |         }
75 |     }
76 |     return false;
77 | }
78 | 
79 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/decode.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "immintrin.h"
  4 | #include "constants.hpp"
  5 | #include "decode.hpp"
  6 | 
  7 | namespace sliced {
  8 | 
  9 | inline uint32_t decode_bitmap(uint64_t const* bitmap,
 10 |                               size_t size_in_64bit_words, uint32_t base,
 11 |                               uint32_t* out) {
 12 |     uint32_t size = 0;
 13 |     for (size_t i = 0; i != size_in_64bit_words; ++i) {
 14 |         uint64_t w = bitmap[i];
 15 |         while (w != 0) {
 16 |             uint64_t t = w & (~w + 1);
 17 |             int r = __builtin_ctzll(w);
 18 |             out[size++] = r + base;
 19 |             w ^= t;
 20 |         }
 21 |         base += 64;
 22 |     }
 23 |     return size;
 24 | }
 25 | 
// Decodes a sparse block: widens the bytes at `begin` to 32-bit integers,
// adds `base` to each and stores them to `out`. Returns `cardinality`.
//
// The work is done in fixed groups of 8 values and does not depend on the
// exact cardinality:
//  - cardinality <= 8: 16 bytes are loaded from `begin` and 8 values are
//    stored to `out`;
//  - otherwise: three more groups are processed, so bytes up to
//    begin + 40 are loaded and 32 values are stored to `out`.
// Hence both `begin` and `out` must have that much accessible room even
// when fewer values are meaningful; only the first `cardinality` stored
// values are valid.
uint32_t decode_sparse_block(uint8_t const* begin, int cardinality,
                             uint32_t base, uint32_t* out) {
    __m128i in_vec;
    __m256i converted;
    __m256i base_vec = _mm256_set1_epi32(base);  // base in all eight lanes

    // values 0..7: unaligned 16-byte load, zero-extend the low 8 bytes to
    // 32 bits, add the base, unaligned store
    in_vec = _mm_lddqu_si128((__m128i const*)(begin + 0));
    converted = _mm256_cvtepu8_epi32(in_vec);
    converted = _mm256_add_epi32(base_vec, converted);
    _mm256_storeu_si256((__m256i*)(out + 0), converted);

    // most likely case: the first group already covers the whole block
    if (cardinality <= 8) return cardinality;

    // values 8..15
    in_vec = _mm_lddqu_si128((__m128i const*)(begin + 8));
    converted = _mm256_cvtepu8_epi32(in_vec);
    converted = _mm256_add_epi32(base_vec, converted);
    _mm256_storeu_si256((__m256i*)(out + 8), converted);

    // values 16..23
    in_vec = _mm_lddqu_si128((__m128i const*)(begin + 16));
    converted = _mm256_cvtepu8_epi32(in_vec);
    converted = _mm256_add_epi32(base_vec, converted);
    _mm256_storeu_si256((__m256i*)(out + 16), converted);

    // values 24..31
    in_vec = _mm_lddqu_si128((__m128i const*)(begin + 24));
    converted = _mm256_cvtepu8_epi32(in_vec);
    converted = _mm256_add_epi32(base_vec, converted);
    _mm256_storeu_si256((__m256i*)(out + 24), converted);

    return cardinality;
}
 57 | 
 58 | uint32_t decode_sparse_chunk(uint8_t const* begin, int blocks, uint32_t base,
 59 |                              uint32_t* out) {
 60 |     assert(blocks >= 1 and blocks <= 256);
 61 |     uint8_t const* data = begin + blocks * 2;
 62 |     uint8_t const* end = data;
 63 |     uint32_t* tmp = out;
 64 |     while (begin != end) {
 65 |         uint8_t id = *begin;
 66 |         int c = *(begin + 1) + 1;
 67 |         int bytes = 32;
 68 |         int type = type::dense;
 69 |         if (LIKELY(c < 31)) {
 70 |             bytes = c;
 71 |             type = type::sparse;
 72 |         }
 73 |         uint32_t b = base + id * 256;
 74 |         if (type == type::sparse) {
 75 |             tmp += decode_sparse_block(data, c, b, tmp);
 76 |         } else {
 77 |             tmp += decode_bitmap(reinterpret_cast<uint64_t const*>(data),
 78 |                                  constants::block_size_in_64bit_words, b, tmp);
 79 |         }
 80 |         data += bytes;
 81 |         begin += 2;
 82 |     }
 83 |     return size_t(tmp - out);
 84 | }
 85 | 
 86 | inline uint32_t decode_full_chunk(uint32_t base, uint32_t* out) {
 87 |     for (uint32_t i = 0; i != constants::chunk_size; ++i) out[i] = i + base;
 88 |     return constants::chunk_size;
 89 | }
 90 | 
 91 | size_t decode_chunk(s_sequence::iterator const& it, uint32_t* out) {
 92 |     uint32_t base = it.id() << 16;
 93 |     switch (it.type()) {
 94 |         case type::sparse:
 95 |             return decode_sparse_chunk(it.data, it.blocks(), base, out);
 96 |         case type::dense:
 97 |             return decode_bitmap(reinterpret_cast<uint64_t const*>(it.data),
 98 |                                  constants::chunk_size_in_64bit_words, base,
 99 |                                  out);
100 |         case type::full:
101 |             return decode_full_chunk(base, out);
102 |         default:
103 |             assert(false);
104 |             __builtin_unreachable();
105 |     }
106 | }
107 | 
108 | size_t s_sequence::decode(uint32_t* out) const {
109 |     auto it = begin();
110 |     uint32_t* in = out;
111 |     for (uint32_t i = 0; i != chunks; ++i) {
112 |         out += decode_chunk(it, out);
113 |         it.next();
114 |     }
115 |     return size_t(out - in);
116 | }
117 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/enumerator.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "decode.hpp"
 4 | 
 5 | namespace sliced {
 6 | 
 7 | struct enumerator {
 8 |     enumerator() {
 9 |         m_buf.resize(constants::chunk_size);
10 |     }
11 | 
12 |     void init(s_sequence const& s, uint32_t past_the_end) {
13 |         m_it = s.begin();
14 |         m_chunk = 0;
15 |         m_chunks = s.chunks;
16 |         m_i = 0;
17 |         m_cardinality = m_it.cardinality();
18 |         m_past_the_end = past_the_end;
19 |         m_has_next = true;
20 |         decode_chunk(m_it, m_buf.data());
21 |     }
22 | 
23 |     bool has_next() const {
24 |         return m_has_next;
25 |     }
26 | 
27 |     void next() {
28 |         if (++m_i == m_cardinality) {
29 |             if (++m_chunk == m_chunks) {
30 |                 m_has_next = false;
31 |                 return;
32 |             }
33 |             m_i = 0;
34 |             m_it.next();
35 |             m_cardinality = m_it.cardinality();
36 |             decode_chunk(m_it, m_buf.data());
37 |         }
38 |     }
39 | 
40 |     uint32_t value() const {
41 |         return m_has_next ? m_buf[m_i] : m_past_the_end;
42 |     }
43 | 
44 | private:
45 |     s_sequence::iterator m_it;
46 |     uint32_t m_chunk;
47 |     uint32_t m_chunks;
48 |     uint32_t m_i;
49 |     uint32_t m_cardinality;
50 |     uint32_t m_past_the_end;
51 |     bool m_has_next;
52 |     std::vector<uint32_t> m_buf;
53 | };
54 | 
55 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/intersection.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "immintrin.h"
  4 | 
  5 | #include "constants.hpp"
  6 | #include "util.hpp"
  7 | #include "decode.hpp"
  8 | #include "uncompress.hpp"
  9 | #include "table.hpp"
 10 | 
 11 | namespace sliced {
 12 | 
// Declares and initializes the SIMD state shared by INTERSECT and ADVANCE.
// It expands to declarations, so it must be used once, in a scope where
// `l`, `r` (byte pointers) and `base` are visible. It broadcasts `base` to
// eight 32-bit lanes and loads 16 bytes from each of `l` and `r` with
// unaligned loads, regardless of how many of those bytes are meaningful.
#define INIT                                          \
    __m256i base_v = _mm256_set1_epi32(base);         \
    __m128i v_l = _mm_lddqu_si128((__m128i const*)l); \
    __m128i v_r = _mm_lddqu_si128((__m128i const*)r); \
    __m256i converted_v;                              \
    __m128i shuf, p, res;                             \
    int mask, matched;
 20 | 
// One 16-byte against 16-byte intersection step. Requires the variables
// declared by INIT plus `card_l`, `card_r`, `size` and `out` in scope.
//
// _mm_cmpestrm in "equal any" mode yields a bit mask whose bit i is set
// when byte i of v_r equals one of the first card_l bytes of v_l (only the
// first card_r bytes of v_r are considered). The mask selects an entry of
// `shuffle_mask` (NOTE(review): defined outside this view, presumably in
// table.hpp; presumably the shuffle that packs the selected bytes of v_r to
// the front -- confirm). That entry is read with an aligned load, so the
// table entries must be 16-byte aligned.
// The shuffled bytes are zero-extended to 32 bits, offset by `base` and
// stored to `out`: eight lanes are always stored, sixteen when more than
// eight bytes matched, so `out` needs that much writable room whatever the
// number of matches.
// `size` is increased by the number of matches; `out` is not advanced.
#define INTERSECT                                                             \
    res =                                                                     \
        _mm_cmpestrm(v_l, card_l, v_r, card_r,                                \
                     _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); \
    mask = _mm_extract_epi32(res, 0);                                         \
    matched = _mm_popcnt_u32(mask);                                           \
    size += matched;                                                          \
    shuf = _mm_load_si128((__m128i const*)shuffle_mask + mask);               \
    p = _mm_shuffle_epi8(v_r, shuf);                                          \
    converted_v = _mm256_cvtepu8_epi32(p);                                    \
    converted_v = _mm256_add_epi32(base_v, converted_v);                      \
    _mm256_storeu_si256((__m256i*)out, converted_v);                          \
    if (matched > 8) {                                                        \
        p = _mm_bsrli_si128(p, 8);                                            \
        converted_v = _mm256_cvtepu8_epi32(p);                                \
        converted_v = _mm256_add_epi32(base_v, converted_v);                  \
        _mm256_storeu_si256((__m256i*)(out + 8), converted_v);                \
    }
 39 | 
// Moves side `ptr` (either l or r) to its next 16 bytes: advances the
// pointer, reloads v_<ptr> and reduces card_<ptr> by 16. It also advances
// `out` past the values produced so far.
// `out` is advanced by the running total `size`, so the result is right
// only when ADVANCE is used once, after the first INTERSECT. That is how
// ss_intersect_block uses it.
#define ADVANCE(ptr)                                \
    out += size;                                    \
    ptr += 16;                                      \
    v_##ptr = _mm_lddqu_si128((__m128i const*)ptr); \
    card_##ptr -= 16;
 45 | 
// Intersects two sparse blocks, given as the `card_l` bytes at `l` and the
// `card_r` bytes at `r`. Each common byte is written to `out` as
// (byte + base); the number of values written is returned.
// Both cardinalities must be in [1, block_sparseness_threshold - 2]
// (asserted), i.e. the blocks hold at most 30 bytes each.
//
// Strategy, by cardinality:
//  - both <= 16: one SIMD string comparison;
//  - exactly one side > 16: two comparisons, the second one after moving
//    the longer side forward by 16 bytes;
//  - both > 16: scalar merge of the two lists, which relies on them being
//    in increasing order.
// The SIMD paths load 16 bytes from `l` and `r` and store 8 or 16 values
// to `out` per comparison whatever the cardinalities and the number of
// matches (see INIT / INTERSECT), so all three buffers need some slack.
//
// NOTE: the local names `l`, `r`, `card_l`, `card_r`, `base`, `out` and
// `size` are referenced by the macros INIT, INTERSECT and ADVANCE.
size_t ss_intersect_block(uint8_t const* l, uint8_t const* r, int card_l,
                          int card_r, uint32_t base, uint32_t* out) {
    assert(card_l > 0 and
           card_l <= int(constants::block_sparseness_threshold - 2));
    assert(card_r > 0 and
           card_r <= int(constants::block_sparseness_threshold - 2));
    size_t size = 0;

    if (LIKELY(card_l <= 16 and card_r <= 16)) {
        INIT INTERSECT return size;  // 1 cmpestr
    }

    if (card_l <= 16 and card_r > 16) {
        // all of l against the first 16 bytes of r, then against the rest
        INIT INTERSECT ADVANCE(r) INTERSECT return size;  // 2 cmpestr
    }

    if (card_r <= 16 and card_l > 16) {
        // the first 16 bytes of l against all of r, then the rest of l
        INIT INTERSECT ADVANCE(l) INTERSECT return size;  // 2 cmpestr
    }

    // card_l > 16 and card_r > 16: SIMD would take 4 cmpestr, a scalar
    // merge is used instead

    uint8_t const* end_l = l + card_l;
    uint8_t const* end_r = r + card_r;
    while (true) {
        // skip the bytes of l smaller than the current byte of r
        while (*l < *r) {
            if (++l == end_l) return size;
        }
        // skip the bytes of r smaller than the current byte of l
        while (*l > *r) {
            if (++r == end_r) return size;
        }
        if (*l == *r) {
            out[size++] = *l + base;
            if (++l == end_l or ++r == end_r) return size;
        }
    }

    return size;  // not reached: the loop above only exits by returning
}
 86 | 
 87 | inline size_t dd_intersect_block(uint8_t const* l, uint8_t const* r,
 88 |                                  uint32_t base, uint32_t* out) {
 89 |     return and_bitmaps(l, r, constants::block_size_in_64bit_words, base, out);
 90 | }
 91 | 
 92 | size_t ds_intersect_block(uint8_t const* l, uint8_t const* r, int card,
 93 |                           uint32_t base, uint32_t* out) {
 94 |     uint64_t const* bitmap = reinterpret_cast<uint64_t const*>(l);
 95 |     uint32_t k = 0;
 96 |     for (int i = 0; i != card; ++i) {
 97 |         uint32_t key = r[i];
 98 |         out[k] = key + base;
 99 |         k += bitmap_contains(bitmap, key);
100 |     }
101 |     return k;
102 | }
103 | 
// Intersects two sparse chunks, made of `blocks_l` and `blocks_r`
// non-empty blocks respectively. The common integers (offset by `base`)
// are written to `out`; the number of integers written is returned.
//
// Layout read here, for both chunks: one two-byte header per block
// (block id, cardinality - 1), followed by the payloads of the blocks in
// the same order. A block with fewer than 31 integers is sparse and takes
// one byte per integer; any other block is a 32-byte bitmap.
//
// The two header lists are merged by block id, which relies on the ids
// being in increasing order; only blocks with the same id are intersected.
size_t ss_intersect_chunk(uint8_t const* l, uint8_t const* r, int blocks_l,
                          int blocks_r, uint32_t base, uint32_t* out) {
    assert(blocks_l >= 1 and blocks_l <= 256);
    assert(blocks_r >= 1 and blocks_r <= 256);
    // payloads start right after the headers; the end of the headers is
    // therefore the initial payload position
    uint8_t const* data_l = l + blocks_l * 2;
    uint8_t const* data_r = r + blocks_r * 2;
    uint8_t const* end_l = data_l;
    uint8_t const* end_r = data_r;
    uint32_t* tmp = out;  // output cursor

    while (true) {
        // skip the blocks of l whose id is smaller than the current id of
        // r, moving the payload pointer of l along
        while (*l < *r) {
            if (l + 2 == end_l) return size_t(tmp - out);
            int c = *(l + 1) + 1;
            data_l += BYTES_BY_CARDINALITY(c);
            l += 2;
        }
        // same on the other side
        while (*l > *r) {
            if (r + 2 == end_r) return size_t(tmp - out);
            int c = *(r + 1) + 1;
            data_r += BYTES_BY_CARDINALITY(c);
            r += 2;
        }
        if (*l == *r) {
            uint8_t id = *l;
            // move both pointers onto the second header byte, which holds
            // the cardinality minus one
            ++l;
            ++r;
            int cl = *l + 1;
            int cr = *r + 1;
            int type_l = type::dense;
            int type_r = type::dense;
            int bl = 32;  // payload bytes of the block of l
            int br = 32;  // payload bytes of the block of r

            if (LIKELY(cl < 31)) {
                bl = cl;
                type_l = type::sparse;
            }

            if (LIKELY(cr < 31)) {
                br = cr;
                type_r = type::sparse;
            }

            uint32_t b = base + id * 256;  // first value of the block
            uint32_t n = 0;

            switch (block_pair(type_l, type_r)) {
                case block_pair(type::sparse, type::sparse):
                    n = ss_intersect_block(data_l, data_r, cl, cr, b, tmp);
                    break;
                case block_pair(type::sparse, type::dense):
                    // ds_intersect_block takes the bitmap first
                    n = ds_intersect_block(data_r, data_l, cl, b, tmp);
                    break;
                case block_pair(type::dense, type::sparse):
                    n = ds_intersect_block(data_l, data_r, cr, b, tmp);
                    break;
                case block_pair(type::dense, type::dense):
                    n = and_bitmaps(data_l, data_r,
                                    constants::block_size_in_64bit_words, b,
                                    tmp);
                    break;
                default:
                    assert(false);
                    __builtin_unreachable();
            }

            tmp += n;

            // l and r are on the second header byte here: l + 1 (r + 1) is
            // the next header, or the end of the headers
            if (l + 1 == end_l or r + 1 == end_r) return size_t(tmp - out);

            data_l += bl;
            data_r += br;
            ++l;
            ++r;
        }
    }

    return size_t(tmp - out);  // not reached: the loop only exits by returning
}
184 | 
185 | size_t ds_intersect_chunk(uint8_t const* l, uint8_t const* r, int blocks_r,
186 |                           uint32_t base, uint32_t* out) {
187 |     static std::vector<uint64_t> x(constants::chunk_size_in_64bit_words);
188 |     std::fill(x.begin(), x.end(), 0);
189 |     uncompress_sparse_chunk(r, blocks_r, x.data());
190 |     return and_bitmaps(l, reinterpret_cast<uint8_t const*>(x.data()),
191 |                        constants::chunk_size_in_64bit_words, base, out);
192 | }
193 | 
194 | size_t pairwise_intersection(s_sequence const& l, s_sequence const& r,
195 |                              uint32_t* out) {
196 |     auto it_l = l.begin();
197 |     auto it_r = r.begin();
198 |     uint32_t* in = out;
199 |     while (it_l.has_next() and it_r.has_next()) {
200 |         uint16_t id_l = it_l.id();
201 |         uint16_t id_r = it_r.id();
202 | 
203 |         if (id_l == id_r) {
204 |             uint32_t n = 0;
205 |             uint32_t base = id_l << 16;
206 |             int blocks_l = 0;
207 |             int blocks_r = 0;
208 | 
209 |             uint16_t type_l = it_l.type();
210 |             uint16_t type_r = it_r.type();
211 | 
212 |             switch (chunk_pair(type_l, type_r)) {
213 |                 case chunk_pair(type::sparse, type::sparse):
214 |                     blocks_l = it_l.blocks();
215 |                     blocks_r = it_r.blocks();
216 |                     if (blocks_l < blocks_r) {
217 |                         n = ss_intersect_chunk(it_l.data, it_r.data, blocks_l,
218 |                                                blocks_r, base, out);
219 |                     } else {
220 |                         n = ss_intersect_chunk(it_r.data, it_l.data, blocks_r,
221 |                                                blocks_l, base, out);
222 |                     }
223 |                     break;
224 |                 case chunk_pair(type::sparse, type::dense):
225 |                     n = ds_intersect_chunk(it_r.data, it_l.data, it_l.blocks(),
226 |                                            base, out);
227 |                     break;
228 |                 case chunk_pair(type::sparse, type::full):
229 |                     n = decode_sparse_chunk(it_l.data, it_l.blocks(), base,
230 |                                             out);
231 |                     break;
232 |                 case chunk_pair(type::dense, type::sparse):
233 |                     n = ds_intersect_chunk(it_l.data, it_r.data, it_r.blocks(),
234 |                                            base, out);
235 |                     break;
236 |                 case chunk_pair(type::dense, type::dense):
237 |                     n = and_bitmaps(it_l.data, it_r.data,
238 |                                     constants::chunk_size_in_64bit_words, base,
239 |                                     out);
240 |                     break;
241 |                 case chunk_pair(type::dense, type::full):
242 |                     n = decode_bitmap(
243 |                         reinterpret_cast<uint64_t const*>(it_l.data),
244 |                         constants::chunk_size_in_64bit_words, base, out);
245 |                     break;
246 |                 case chunk_pair(type::full, type::sparse):
247 |                     n = decode_sparse_chunk(it_r.data, it_r.blocks(), base,
248 |                                             out);
249 |                     break;
250 |                 case chunk_pair(type::full, type::dense):
251 |                     n = decode_bitmap(
252 |                         reinterpret_cast<uint64_t const*>(it_r.data),
253 |                         constants::chunk_size_in_64bit_words, base, out);
254 |                     break;
255 |                 case chunk_pair(type::full, type::full):
256 |                     n = decode_full_chunk(base, out);
257 |                     break;
258 |                 default:
259 |                     assert(false);
260 |                     __builtin_unreachable();
261 |             }
262 | 
263 |             out += n;
264 |             it_l.next();
265 |             it_r.next();
266 | 
267 |         } else if (id_l < id_r) {
268 |             it_l.advance(id_r);
269 |         } else {
270 |             it_r.advance(id_l);
271 |         }
272 |     }
273 |     return size_t(out - in);
274 | }
275 | 
276 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/intersection_many.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "decode.hpp"
 4 | #include "uncompress_chunk_and_intersect.hpp"
 5 | 
 6 | namespace sliced {
 7 | 
 8 | size_t intersection(std::vector<s_sequence>& sequences, uint32_t* out) {
 9 |     uint32_t* in = out;
10 |     std::sort(sequences.begin(), sequences.end(),
11 |               [](auto const& l, auto const& r) {
12 |                   return l.cardinality() < r.cardinality();
13 |               });
14 |     std::vector<s_sequence::iterator> iterators(sequences.size());
15 |     for (size_t i = 0; i != sequences.size(); ++i) {
16 |         iterators[i] = sequences[i].begin();
17 |     }
18 | 
19 |     static std::vector<uint16_t> headers(65536);
20 |     uint64_t num_headers = 0;
21 |     {
22 |         uint32_t candidate = iterators[0].id();
23 |         size_t i = 1;
24 |         while (candidate < 65536) {
25 |             for (; i < iterators.size(); ++i) {
26 |                 iterators[i].skip_to_value(candidate);
27 |                 uint32_t val = iterators[i].id();
28 |                 if (val != candidate) {
29 |                     candidate = val;
30 |                     i = 0;
31 |                     break;
32 |                 }
33 |             }
34 |             if (i == iterators.size()) {
35 |                 headers[num_headers++] = candidate;
36 |                 iterators[0].next();
37 |                 candidate = iterators[0].id();
38 |                 i = 1;
39 |             }
40 |         }
41 |     }
42 | 
43 |     {
44 |         static std::vector<uint64_t> bitmap(1024);
45 |         for (size_t i = 0; i != sequences.size(); ++i) {
46 |             iterators[i] = sequences[i].begin();
47 |         }
48 |         for (uint64_t i = 0; i != num_headers; ++i) {
49 |             uint32_t header = headers[i];
50 |             uint32_t base = header << 16;
51 | 
52 |             // std::sort(iterators.begin(), iterators.end(),
53 |             //           [](auto const& l, auto const& r) {
54 |             //               return l.cardinality() < r.cardinality();
55 |             //           });
56 | 
57 |             iterators[0].advance(header);
58 |             assert(iterators[0].id() == header);
59 |             uint32_t cardinality =
60 |                 uncompress_chunk(iterators[0], bitmap.data());
61 |             for (uint64_t i = 1; i != iterators.size(); ++i) {
62 |                 iterators[i].advance(header);
63 |                 assert(iterators[i].id() == header);
64 |                 cardinality = uncompress_chunk_and_intersect(
65 |                     iterators[i], bitmap.data(), cardinality);
66 |                 if (cardinality == 0) goto SKIP;
67 |             }
68 |             out += decode_bitmap(bitmap.data(), 1024, base, out);
69 |         SKIP:;
70 |         }
71 |     }
72 | 
73 |     return size_t(out - in);
74 | }
75 | 
76 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/next_geq.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | namespace sliced {
  4 | 
  5 | #define BLOCK_MIN                                                            \
  6 |     if (LIKELY(*(begin + 1) < 30)) return *data + base;                      \
  7 |     return min_value_in_bitmap(data, constants::block_size_in_64bit_words) + \
  8 |            base;
  9 | 
 10 | #define BLOCK_MIN_                                                          \
 11 |     if (LIKELY(*(m_begin + 1) < 30)) {                                      \
 12 |         value = *m_data + base;                                             \
 13 |     } else {                                                                \
 14 |         value = min_value_in_bitmap(m_data,                                 \
 15 |                                     constants::block_size_in_64bit_words) + \
 16 |                 base;                                                       \
 17 |     }
 18 | 
 19 | #define CHUNK_MIN(it)                                                          \
 20 |     switch (it.type()) {                                                       \
 21 |         case type::sparse:                                                     \
 22 |             value = min_value_in_sparse_chunk(it.data, it.blocks());           \
 23 |             break;                                                             \
 24 |         case type::dense:                                                      \
 25 |             value = min_value_in_bitmap(it.data,                               \
 26 |                                         constants::chunk_size_in_64bit_words); \
 27 |             break;                                                             \
 28 |         case type::full:                                                       \
 29 |             value = 0;                                                         \
 30 |             break;                                                             \
 31 |         default:                                                               \
 32 |             assert(false);                                                     \
 33 |             __builtin_unreachable();                                           \
 34 |     }                                                                          \
 35 |     return value + it.base();
 36 | 
 37 | uint32_t next_geq_sparse_block(uint8_t const* begin, int cardinality,
 38 |                                uint32_t value) {
 39 |     for (int i = 0; i != cardinality; ++i) {
 40 |         if (begin[i] >= value) return begin[i];
 41 |     }
 42 |     return constants::not_found;
 43 | }
 44 | 
 45 | uint32_t max_value_in_bitmap(uint8_t const* data, size_t size_in_64bit_words) {
 46 |     uint64_t const* bitmap = reinterpret_cast<uint64_t const*>(data);
 47 |     for (int32_t i = size_in_64bit_words - 1; i >= 0; --i) {
 48 |         uint64_t w = bitmap[i];
 49 |         if (w != 0) {
 50 |             int r = __builtin_clzll(w);
 51 |             return i * 64 + 63 - r;
 52 |         }
 53 |     }
 54 |     return 0;
 55 | }
 56 | 
 57 | uint32_t min_value_in_bitmap(uint8_t const* data, size_t size_in_64bit_words) {
 58 |     uint64_t const* bitmap = reinterpret_cast<uint64_t const*>(data);
 59 |     for (uint32_t i = 0; i != size_in_64bit_words; ++i) {
 60 |         uint64_t w = bitmap[i];
 61 |         if (w != 0) return i * 64 + __builtin_ctzll(w);
 62 |     }
 63 |     return 0;
 64 | }
 65 | 
 66 | uint32_t next_geq_bitmap(uint8_t const* data, uint32_t size_in_64bit_words,
 67 |                          uint32_t value) {
 68 |     uint64_t const* bitmap = reinterpret_cast<uint64_t const*>(data);
 69 |     uint32_t k = value / 64;
 70 |     uint64_t word = bitmap[k];
 71 |     const int diff = value - k * 64;
 72 |     word = (word >> diff) << diff;
 73 |     while (word == 0) {
 74 |         k++;
 75 |         if (k == size_in_64bit_words) return constants::not_found;
 76 |         word = bitmap[k];
 77 |     }
 78 |     return k * 64 + __builtin_ctzll(word);
 79 | }
 80 | 
 81 | uint32_t max_value_in_block(uint8_t const* begin, uint8_t const* data) {
 82 |     int c = *(begin + 1);
 83 |     if (LIKELY(c < 30)) {  // block type is sparse
 84 |         return *(data + c);
 85 |     }
 86 |     return max_value_in_bitmap(data, constants::block_size_in_64bit_words);
 87 | }
 88 | 
 89 | uint32_t next_geq_sparse_chunk(uint8_t const* begin, int blocks,
 90 |                                uint32_t value) {
 91 |     assert(blocks >= 1 and blocks <= 256);
 92 |     uint8_t const* data = begin + blocks * 2;
 93 |     uint8_t const* end = data;
 94 |     uint32_t block_id = value >> 8;
 95 |     uint32_t id = *begin;
 96 | 
 97 |     while (id < block_id and begin != end) {
 98 |         int c = *(begin + 1) + 1;
 99 |         data += BYTES_BY_CARDINALITY(c);
100 |         begin += 2;
101 |         id = *begin;
102 |     }
103 | 
104 |     if (begin != end) {
105 |         uint32_t base = id * 256;
106 |         if (base >= value) {  // saturate
107 |             BLOCK_MIN
108 |         }
109 | 
110 |         value &= 0xFF;
111 |         if (value > max_value_in_block(begin, data)) {  // saturate
112 |             if (begin + 2 == end) return constants::not_found;
113 |             int c = *(begin + 1) + 1;
114 |             data += BYTES_BY_CARDINALITY(c);
115 |             begin += 2;
116 |             id = *begin;
117 |             base = id * 256;
118 |             BLOCK_MIN
119 |         }
120 | 
121 |         base = id * 256;
122 |         int c = *(begin + 1) + 1;
123 |         if (LIKELY(c < 31)) {  // block type is sparse
124 |             return next_geq_sparse_block(data, c, value) + base;
125 |         }
126 |         return next_geq_bitmap(data, constants::block_size_in_64bit_words,
127 |                                value) +
128 |                base;
129 |     }
130 | 
131 |     return constants::not_found;
132 | }
133 | 
134 | uint32_t min_value_in_sparse_chunk(uint8_t const* begin, int blocks) {
135 |     assert(blocks >= 1 and blocks <= 256);
136 |     uint8_t const* data = begin + blocks * 2;
137 |     uint32_t id = *begin;
138 |     uint32_t base = id * 256;
139 |     BLOCK_MIN
140 | }
141 | 
142 | uint32_t s_sequence::next_geq(uint32_t value) const {
143 |     auto it = begin();
144 |     uint32_t chunk_id = value >> 16;
145 |     it.skip_to_value(chunk_id);
146 | 
147 |     if (it.base() >= value) {  // saturate
148 |         CHUNK_MIN(it)
149 |     }
150 | 
151 |     if (it.has_next()) {
152 |         value &= 0xFFFF;
153 |         switch (it.type()) {
154 |             case type::sparse:
155 |                 value = next_geq_sparse_chunk(it.data, it.blocks(), value);
156 |                 break;
157 |             case type::dense:
158 |                 value = next_geq_bitmap(
159 |                     it.data, constants::chunk_size_in_64bit_words, value);
160 |                 break;
161 |             case type::full:
162 |                 break;
163 |             default:
164 |                 assert(false);
165 |                 __builtin_unreachable();
166 |         }
167 | 
168 |         if (value != constants::not_found) return value + it.base();
169 | 
170 |         // saturate
171 |         it.next();
172 |         if (it.has_next()) CHUNK_MIN(it)
173 |     }
174 | 
175 |     return constants::not_found;
176 | }
177 | 
178 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/next_geq_enumerator.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "next_geq.hpp"
  4 | 
  5 | namespace sliced {
  6 | 
  7 | struct next_geq_enumerator {
  8 |     next_geq_enumerator() {}
  9 | 
 10 |     next_geq_enumerator(s_sequence const& s)
 11 |         : m_chunk_id(-1)
 12 |         , m_id(0)
 13 |         , m_begin(nullptr)
 14 |         , m_end(nullptr)
 15 |         , m_data(nullptr)
 16 |         , m_size(s.size())
 17 |         , m_it(s.begin()) {}
 18 | 
 19 |     uint64_t size() const {
 20 |         return m_size;
 21 |     }
 22 | 
 23 |     uint32_t next_geq(uint32_t value) {
 24 |         uint32_t chunk_id = value >> 16;
 25 |         m_it.skip_to_value(chunk_id);
 26 | 
 27 |         if (m_it.base() >= value) {  // saturate
 28 |             CHUNK_MIN(m_it)
 29 |         }
 30 | 
 31 |         if (m_it.has_next()) {
 32 |             value &= 0xFFFF;
 33 |             switch (m_it.type()) {
 34 |                 case type::sparse: {
 35 |                     uint32_t block_id = value >> 8;
 36 |                     if (m_chunk_id != m_it.id()) {
 37 |                         uint32_t blocks = m_it.blocks();
 38 |                         assert(blocks >= 1 and blocks <= 256);
 39 |                         m_begin = m_it.data;
 40 |                         m_data = m_begin + blocks * 2;
 41 |                         m_end = m_data;
 42 |                         m_id = *m_begin;
 43 |                         m_chunk_id = m_it.id();
 44 |                     }
 45 | 
 46 |                     while (m_id < block_id and m_begin != m_end) {
 47 |                         int c = *(m_begin + 1) + 1;
 48 |                         m_data += BYTES_BY_CARDINALITY(c);
 49 |                         m_begin += 2;
 50 |                         m_id = *m_begin;
 51 |                     }
 52 | 
 53 |                     if (m_begin != m_end) {
 54 |                         uint32_t base = m_id * 256;
 55 |                         if (base >= value) {  // saturate
 56 |                             BLOCK_MIN_
 57 |                             break;
 58 |                         }
 59 | 
 60 |                         value &= 0xFF;
 61 |                         if (value >
 62 |                             max_value_in_block(m_begin, m_data)) {  // saturate
 63 |                             if (m_begin + 2 == m_end) {
 64 |                                 value = constants::not_found;
 65 |                                 break;
 66 |                             }
 67 |                             int c = *(m_begin + 1) + 1;
 68 |                             m_data += BYTES_BY_CARDINALITY(c);
 69 |                             m_begin += 2;
 70 |                             m_id = *m_begin;
 71 |                             base = m_id * 256;
 72 |                             BLOCK_MIN_
 73 |                             break;
 74 |                         }
 75 |                         base = m_id * 256;
 76 |                         int c = *(m_begin + 1) + 1;
 77 |                         if (LIKELY(c < 31)) {  // block type is sparse
 78 |                             value =
 79 |                                 next_geq_sparse_block(m_data, c, value) + base;
 80 |                         } else {
 81 |                             value = next_geq_bitmap(
 82 |                                         m_data,
 83 |                                         constants::block_size_in_64bit_words,
 84 |                                         value) +
 85 |                                     base;
 86 |                         }
 87 |                     } else {
 88 |                         value = constants::not_found;
 89 |                     }
 90 |                 } break;
 91 |                 case type::dense:
 92 |                     value = next_geq_bitmap(
 93 |                         m_it.data, constants::chunk_size_in_64bit_words, value);
 94 |                     break;
 95 |                 case type::full:
 96 |                     break;
 97 |                 default:
 98 |                     assert(false);
 99 |                     __builtin_unreachable();
100 |             }
101 | 
102 |             if (value != constants::not_found) return value + m_it.base();
103 | 
104 |             // saturate
105 |             m_it.next();
106 |             if (m_it.has_next()) CHUNK_MIN(m_it)
107 |         }
108 | 
109 |         return constants::not_found;
110 |     }
111 | 
112 | private:
113 |     uint32_t m_chunk_id;
114 |     uint8_t m_id;
115 |     uint8_t const* m_begin;
116 |     uint8_t const* m_end;
117 |     uint8_t const* m_data;
118 |     uint64_t m_size;
119 |     s_sequence::iterator m_it;
120 | };
121 | 
122 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/s_index.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cassert>
 4 | #include <vector>
 5 | 
 6 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 7 | 
 8 | #include "s_sequence.hpp"
 9 | 
10 | namespace sliced {
11 | 
12 | struct s_index {
13 |     struct builder;
14 | 
15 |     inline size_t size() const {
16 |         return m_size;
17 |     }
18 | 
19 |     inline size_t universe() const {
20 |         return m_universe;
21 |     }
22 | 
23 |     s_sequence operator[](size_t i) const {
24 |         assert(i < size());
25 |         return s_sequence(m_sequences + m_offsets[i]);
26 |     }
27 | 
28 |     void mmap(char const* binary_filename) {
29 |         m_input.open(binary_filename, mm::advice::sequential);
30 |         auto ptr = reinterpret_cast<uint64_t const*>(m_input.data());
31 |         m_size = *ptr++;
32 |         m_universe = *ptr++;
33 |         m_offsets = ptr;
34 |         m_sequences = m_input.data() + m_size * sizeof(uint64_t) +
35 |                       sizeof(m_size) + sizeof(m_universe);
36 |         m_size -= 1;
37 |     }
38 | 
39 | private:
40 |     mm::file_source<uint8_t> m_input;
41 |     uint64_t const* m_offsets;
42 |     uint8_t const* m_sequences;
43 |     uint64_t m_size;
44 |     uint64_t m_universe;
45 | };
46 | 
47 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/s_sequence.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "constants.hpp"
  4 | 
  5 | namespace sliced {
  6 | 
  7 | struct s_sequence {
  8 |     struct builder;
  9 | 
 10 |     s_sequence()
 11 |         : chunks(0)
 12 |         , m_pointers(nullptr)
 13 |         , m_header(nullptr)
 14 |         , m_data(nullptr) {}
 15 | 
 16 |     s_sequence(uint8_t const* addr) {
 17 |         uint16_t const* ptr = reinterpret_cast<uint16_t const*>(addr);
 18 |         chunks = 1 + *ptr++;
 19 |         m_pointers = reinterpret_cast<uint32_t const*>(ptr);
 20 |         uint64_t pointers_bytes =
 21 |             chunks / constants::associativity * sizeof(uint32_t) * 2;
 22 |         uint64_t header_bytes = chunks * 4 * sizeof(uint16_t);
 23 |         m_header = ptr + pointers_bytes / sizeof(uint16_t);
 24 |         m_data = addr + sizeof(uint16_t) + pointers_bytes + header_bytes;
 25 |     }
 26 | 
 27 |     size_t decode(uint32_t* out) const;
 28 |     size_t uncompress(uint64_t* out) const;
 29 |     bool select(uint32_t rank, uint32_t& value) const;
 30 |     bool contains(uint32_t value) const;
 31 |     uint32_t next_geq(uint32_t value) const;
 32 | 
 33 |     uint32_t const* pointers() const {
 34 |         return m_pointers;
 35 |     }
 36 | 
 37 |     uint16_t const* header() const {
 38 |         return m_header;
 39 |     }
 40 | 
 41 |     uint8_t const* data() const {
 42 |         return m_data;
 43 |     }
 44 | 
 45 |     inline uint32_t cardinality() const {
 46 |         auto const* h = header();
 47 |         uint32_t c = 0;
 48 |         for (uint32_t i = 0; i != chunks; ++i) {
 49 |             c += *(h + 1) + 1;
 50 |             h += 4;
 51 |         }
 52 |         return c;
 53 |     }
 54 | 
 55 |     inline uint32_t size() const {
 56 |         return cardinality();
 57 |     }
 58 | 
 59 |     struct iterator {
 60 |         iterator()
 61 |             : pointers(nullptr)
 62 |             , header(nullptr)
 63 |             , data(nullptr)
 64 |             , begin(0)
 65 |             , end(0) {}
 66 | 
 67 |         iterator(s_sequence const& s, uint32_t begin, uint32_t end)
 68 |             : pointers(s.pointers())
 69 |             , header(s.header())
 70 |             , data(s.data())
 71 |             , begin(begin)
 72 |             , end(end) {}
 73 | 
 74 |         inline uint32_t id() const {
 75 |             // return *header;
 76 |             return has_next() ? *header : 65536;  // saturate
 77 |         }
 78 | 
 79 |         inline uint32_t base() const {
 80 |             return id() << 16;
 81 |         }
 82 | 
 83 |         inline uint32_t cardinality() const {
 84 |             return *(header + 1) + 1;
 85 |         }
 86 | 
 87 |         inline uint32_t type() const {
 88 |             return *(header + 2) & 255;
 89 |         }
 90 | 
 91 |         inline uint32_t blocks() const {
 92 |             return (*(header + 2) >> 8) + 1;
 93 |         }
 94 | 
 95 |         inline uint32_t offset() const {
 96 |             return *(header + 3);
 97 |         }
 98 | 
 99 |         inline void next() {
100 |             data += offset();
101 |             header += 4;
102 |             begin += 1;
103 |         }
104 | 
105 |         inline bool has_next() const {
106 |             return begin < end;
107 |         }
108 | 
109 |         void advance(uint32_t lower_bound) {
110 |             while (id() < lower_bound and has_next()) next();
111 |         }
112 | 
113 |         void skip_to_value(uint32_t lower_bound) {
114 |             while (skip_position() < end and *skip_header() <= lower_bound) {
115 |                 data += *(pointers + 1);
116 |                 pointers += 2;
117 |                 header = skip_header();
118 |                 begin = skip_position();
119 |             }
120 |             advance(lower_bound);
121 |         }
122 | 
123 |         uint32_t skip_to_position(uint32_t rank) {
124 |             uint32_t elements = 0;
125 |             while (skip_position() < end) {
126 |                 uint32_t c = *pointers;
127 |                 if (elements + c > rank) break;
128 |                 elements += c;
129 |                 data += *(pointers + 1);
130 |                 pointers += 2;
131 |                 header = skip_header();
132 |                 begin = skip_position();
133 |             }
134 | 
135 |             while (has_next()) {
136 |                 uint32_t c = cardinality();
137 |                 if (elements + c > rank) return elements;
138 |                 elements += c;
139 |                 next();
140 |             }
141 | 
142 |             return elements;
143 |         }
144 | 
145 |         uint32_t const* pointers;
146 |         uint16_t const* header;
147 |         uint8_t const* data;
148 |         uint32_t begin;
149 |         uint32_t end;
150 | 
151 |     private:
152 |         inline uint32_t skip_position() const {
153 |             return begin + constants::associativity;
154 |         }
155 | 
156 |         inline uint16_t const* skip_header() const {
157 |             return header + 4 * constants::associativity;
158 |         }
159 |     };
160 | 
161 |     iterator begin() const {
162 |         return iterator(*this, 0, chunks);
163 |     }
164 | 
165 |     uint32_t chunks;
166 | 
167 | private:
168 |     uint32_t const* m_pointers;
169 |     uint16_t const* m_header;
170 |     uint8_t const* m_data;
171 | };
172 | 
173 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/select.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | namespace sliced {
  4 | 
  5 | uint32_t select_bitmap(uint8_t const* data, size_t size_in_64bit_words,
  6 |                        uint32_t rank) {
  7 |     uint64_t const* bitmap = reinterpret_cast<uint64_t const*>(data);
  8 |     uint32_t elements = 0;
  9 |     for (size_t i = 0; i != size_in_64bit_words; ++i) {
 10 |         uint64_t w = bitmap[i];
 11 |         int c = __builtin_popcountll(w);
 12 |         if (elements + c > rank) {
 13 |             uint32_t base = i * 64;
 14 | 
 15 |             assert(rank >= elements);
 16 |             rank -= elements;
 17 |             uint64_t i = 1ULL << rank;
 18 |             asm("pdep %[w], %[mask], %[w]" : [ w ] "+r"(w) : [ mask ] "r"(i));
 19 |             asm("tzcnt %[bit], %[index]"
 20 |                 : [ index ] "=r"(i)
 21 |                 : [ bit ] "g"(w)
 22 |                 : "cc");
 23 |             return i + base;
 24 | 
 25 |             // while (w != 0) {
 26 |             //     uint64_t t = w & (~w + 1);
 27 |             //     int r = __builtin_ctzll(w);
 28 |             //     if (elements == rank) {
 29 |             //         return r + base;
 30 |             //     }
 31 |             //     w ^= t;
 32 |             //     ++elements;
 33 |             // }
 34 |         }
 35 |         elements += c;
 36 |     }
 37 |     assert(false);
 38 |     __builtin_unreachable();
 39 |     return elements;
 40 | }
 41 | 
 42 | uint32_t select_sparse_chunk(uint8_t const* begin, int blocks, uint32_t rank) {
 43 |     assert(blocks >= 1 and blocks <= 256);
 44 |     uint8_t const* data = begin + blocks * 2;
 45 |     uint8_t const* end = data;
 46 |     uint32_t elements = 0;
 47 |     while (begin != end) {
 48 |         uint8_t id = *begin;
 49 |         int c = *(begin + 1) + 1;
 50 |         int bytes = 32;
 51 |         int type = type::dense;
 52 |         if (LIKELY(c < 31)) {
 53 |             bytes = c;
 54 |             type = type::sparse;
 55 |         }
 56 |         if (elements + c > rank) {
 57 |             rank -= elements;
 58 |             assert(int(rank) < c);
 59 |             uint32_t base = id * 256;
 60 |             if (type == type::sparse) {
 61 |                 return *(data + rank) + base;
 62 |             } else {
 63 |                 return select_bitmap(data, constants::block_size_in_64bit_words,
 64 |                                      rank) +
 65 |                        base;
 66 |             }
 67 |         }
 68 |         elements += c;
 69 |         data += bytes;
 70 |         begin += 2;
 71 |     }
 72 |     assert(false);
 73 |     __builtin_unreachable();
 74 |     return elements;
 75 | }
 76 | 
 77 | bool s_sequence::select(uint32_t rank, uint32_t& value) const {
 78 |     auto it = begin();
 79 |     uint32_t elements = it.skip_to_position(rank);
 80 |     if (it.has_next()) {
 81 |         rank -= elements;
 82 |         assert(rank < constants::chunk_size);
 83 |         switch (it.type()) {
 84 |             case type::sparse:
 85 |                 value = select_sparse_chunk(it.data, it.blocks(), rank);
 86 |                 break;
 87 |             case type::dense:
 88 |                 value = select_bitmap(
 89 |                     it.data, constants::chunk_size_in_64bit_words, rank);
 90 |                 break;
 91 |             case type::full:
 92 |                 value = rank;
 93 |                 break;
 94 |             default:
 95 |                 assert(false);
 96 |                 __builtin_unreachable();
 97 |         }
 98 |         value += it.id() << 16;
 99 |         return true;
100 |     }
101 |     return false;
102 | }
103 | 
104 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/uncompress.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "constants.hpp"
  4 | 
  5 | namespace sliced {
  6 | 
  7 | inline void uncompress_sparse_block(uint8_t const* begin, int cardinality,
  8 |                                     uint64_t* out) {
  9 |     // for (int i = 0; i != cardinality; ++i) {
 10 |     //     set_bit(*begin++, out);
 11 |     // }
 12 |     uint64_t offset, load, pos;
 13 |     const uint64_t shift = 6;
 14 |     uint8_t const* end = begin + cardinality;
 15 |     __asm volatile(
 16 |         "1:\n"
 17 |         "movzbq (%[begin]), %[pos]\n"
 18 |         "shrx %[shift], %[pos], %[offset]\n"
 19 |         "mov (%[out],%[offset],8), %[load]\n"
 20 |         "bts %[pos], %[load]\n"
 21 |         "mov %[load], (%[out],%[offset],8)\n"
 22 |         "add $1, %[begin]\n"
 23 |         "cmp %[begin], %[end]\n"
 24 |         "jnz 1b"
 25 |         : [ begin ] "+&r"(begin), [ load ] "=&r"(load), [ pos ] "=&r"(pos),
 26 |           [ offset ] "=&r"(offset)
 27 |         : [ end ] "r"(end), [ out ] "r"(out), [ shift ] "r"(shift));
 28 | }
 29 | 
 30 | inline void uncompress_dense_block(uint8_t const* begin, uint64_t* out) {
 31 |     memcpy(out, begin, constants::block_size / 8);
 32 | }
 33 | 
 34 | void uncompress_sparse_chunk(uint8_t const* begin, int blocks, uint64_t* out) {
 35 |     assert(blocks >= 1 and blocks <= 256);
 36 |     uint8_t const* data = begin + blocks * 2;
 37 |     uint64_t* bitmap = out;
 38 |     uint8_t prev = 0;
 39 |     for (int i = 0; i != blocks; ++i) {
 40 |         uint8_t id = *begin;
 41 |         ++begin;
 42 |         int c = *begin;
 43 |         c += 1;
 44 |         int bytes = 32;
 45 |         int type = type::dense;
 46 |         if (LIKELY(c < 31)) {
 47 |             bytes = c;
 48 |             type = type::sparse;
 49 |         }
 50 |         bitmap += (id - prev) * constants::block_size_in_64bit_words;
 51 |         if (type == type::sparse) {
 52 |             uncompress_sparse_block(data, c, bitmap);
 53 |         } else if (type == type::dense) {
 54 |             uncompress_dense_block(data, bitmap);
 55 |         }
 56 |         data += bytes;
 57 |         ++begin;
 58 |         prev = id;
 59 |     }
 60 | }
 61 | 
 62 | inline void uncompress_dense_chunk(uint8_t const* begin, uint64_t* out) {
 63 |     memcpy(out, begin, constants::chunk_size / 8);
 64 | }
 65 | 
 66 | inline void uncompress_full_chunk(uint64_t* out) {
 67 |     for (uint32_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) {
 68 |         out[i] = uint64_t(-1);
 69 |     }
 70 | }
 71 | 
 72 | inline size_t uncompress_chunk(s_sequence::iterator const& it, uint64_t* out) {
 73 |     switch (it.type()) {
 74 |         case type::sparse: {
 75 |             for (uint64_t i = 0; i != 1024; ++i) out[i] = 0;
 76 |             uncompress_sparse_chunk(it.data, it.blocks(), out);
 77 |             break;
 78 |         }
 79 |         case type::dense: {
 80 |             uncompress_dense_chunk(it.data, out);
 81 |             break;
 82 |         }
 83 |         case type::full: {
 84 |             uncompress_full_chunk(out);
 85 |             break;
 86 |         }
 87 |         default:
 88 |             assert(false);
 89 |             __builtin_unreachable();
 90 |     }
 91 |     return it.cardinality();
 92 | }
 93 | 
 94 | size_t s_sequence::uncompress(uint64_t* out) const {
 95 |     auto it = begin();
 96 |     size_t uncompressed = 0;
 97 |     uint16_t prev = 0;
 98 |     for (uint32_t i = 0; i != chunks; ++i) {
 99 |         uint16_t id = it.id();
100 |         out += (id - prev) * constants::chunk_size_in_64bit_words;
101 |         uncompressed += uncompress_chunk(it, out);
102 |         prev = id;
103 |         it.next();
104 |     }
105 |     assert(uncompressed > 0);
106 |     return uncompressed;
107 | }
108 | 
109 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/uncompress_chunk_and_intersect.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "uncompress.hpp"
  4 | 
  5 | namespace sliced {
  6 | 
  7 | // uint32_t uncompress_sparse_block_and_intersect(uint8_t const* begin,
  8 | //                                                int cardinality, uint64_t*
  9 | //                                                out) {
 10 | //     static std::vector<uint64_t> tmp(constants::block_size_in_64bit_words);
 11 | //     tmp[0] = 0;
 12 | //     tmp[1] = 0;
 13 | //     tmp[2] = 0;
 14 | //     tmp[3] = 0;
 15 | //     uncompress_sparse_block(begin, cardinality, tmp.data());
 16 | //     uint32_t c = 0;
 17 | //     for (size_t i = 0; i != constants::block_size_in_64bit_words; ++i) {
 18 | //         out[i] &= tmp[i];
 19 | //         c += __builtin_popcountll(out[i]);
 20 | //     }
 21 | //     return c;
 22 | // }
 23 | 
 24 | // uint32_t uncompress_dense_block_and_intersect(uint8_t const* begin,
 25 | //                                               uint64_t* out) {
 26 | //     uint64_t const* in = reinterpret_cast<uint64_t const*>(begin);
 27 | //     uint32_t c = 0;
 28 | //     for (size_t i = 0; i != constants::block_size_in_64bit_words; ++i) {
 29 | //         out[i] &= in[i];
 30 | //         c += __builtin_popcountll(out[i]);
 31 | //     }
 32 | //     return c;
 33 | // }
 34 | 
 35 | // Intersects the sparse chunk encoded at `begin` (made of `blocks` blocks)
 36 | // with the bitmap `out`, in place. The chunk is first expanded by
 37 | // uncompress_sparse_chunk into a zeroed scratch bitmap of
 38 | // constants::chunk_size_in_64bit_words words, which is then AND-ed word by
 39 | // word into `out`. Returns the number of bits left set in `out`.
 40 | //
 41 | // Defined inline, like uncompress_chunk_and_intersect below, so that this
 42 | // header can be included from several translation units.
 43 | // NOTE: the scratch bitmap is a function-local static that is reused across
 44 | // calls, so concurrent calls would share it. The commented-out per-block
 45 | // helpers at the top of this file belong to a disabled block-by-block variant
 46 | // of this function that intersected directly into `out`.
 47 | inline uint32_t uncompress_sparse_chunk_and_intersect(uint8_t const* begin,
 48 |                                                       int blocks,
 49 |                                                       uint64_t* out) {
 50 |     static std::vector<uint64_t> tmp(constants::chunk_size_in_64bit_words);
 51 |     std::fill(tmp.begin(), tmp.end(), 0);
 52 |     uncompress_sparse_chunk(begin, blocks, tmp.data());
 53 |     uint32_t c = 0;
 54 |     for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) {
 55 |         out[i] &= tmp[i];
 56 |         c += __builtin_popcountll(out[i]);
 57 |     }
 58 |     return c;
 59 | }
 92 | 
 93 | // Intersects the dense chunk stored at `begin` with the bitmap `out`, in
 94 | // place: the constants::chunk_size_in_64bit_words words read from `begin`
 95 | // are AND-ed into `out`. Returns the number of bits left set in `out`.
 96 | // Defined inline so that this header can be included from several
 97 | // translation units.
 98 | // NOTE(review): `begin` is read through a uint64_t pointer, which assumes the
 99 | // chunk payload is suitably aligned for 64-bit loads -- confirm.
100 | inline uint32_t uncompress_dense_chunk_and_intersect(uint8_t const* begin,
101 |                                                      uint64_t* out) {
102 |     uint64_t const* in = reinterpret_cast<uint64_t const*>(begin);
103 |     uint32_t c = 0;
104 |     for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) {
105 |         out[i] &= in[i];
106 |         c += __builtin_popcountll(out[i]);
107 |     }
108 |     return c;
109 | }
103 | 
104 | // Intersects the chunk `it` currently points to with the bitmap `out`,
105 | // choosing the routine from the chunk type: sparse and dense chunks are
106 | // AND-ed into `out` and the number of surviving bits is returned; for a full
107 | // chunk `out` is left untouched and the caller-supplied `cardinality` is
108 | // returned as is.
109 | inline size_t uncompress_chunk_and_intersect(s_sequence::iterator const& it,
110 |                                              uint64_t* out,
111 |                                              uint64_t cardinality) {
112 |     auto const chunk_type = it.type();
113 |     if (chunk_type == type::full) return cardinality;
114 |     if (chunk_type == type::dense) {
115 |         return uncompress_dense_chunk_and_intersect(it.data, out);
116 |     }
117 |     if (chunk_type == type::sparse) {
118 |         return uncompress_sparse_chunk_and_intersect(it.data, it.blocks(),
119 |                                                      out);
120 |     }
121 |     assert(false);  // no other chunk type is expected here
122 |     __builtin_unreachable();
123 | }
120 | 
121 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/uncompress_chunk_and_merge.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "uncompress.hpp"
 4 | 
 5 | namespace sliced {
 6 | 
 7 | // ORs the sparse chunk encoded at `begin` (made of `blocks` blocks) into the
 8 | // bitmap `out`. The chunk is first expanded by uncompress_sparse_chunk into
 9 | // a zeroed scratch bitmap of constants::chunk_size_in_64bit_words words.
10 | // Defined inline, like uncompress_chunk_and_merge below, so that this header
11 | // can be included from several translation units.
12 | // NOTE: the scratch bitmap is a function-local static reused across calls.
13 | inline void uncompress_sparse_chunk_and_merge(uint8_t const* begin,
14 |                                               int blocks, uint64_t* out) {
15 |     static std::vector<uint64_t> tmp(constants::chunk_size_in_64bit_words);
16 |     std::fill(tmp.begin(), tmp.end(), 0);
17 |     uncompress_sparse_chunk(begin, blocks, tmp.data());
18 |     for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) {
19 |         out[i] |= tmp[i];
20 |     }
21 | }
16 | 
17 | // ORs the dense chunk stored at `begin` into the bitmap `out`, one 64-bit
18 | // word at a time, for constants::chunk_size_in_64bit_words words.
19 | // Defined inline so that this header can be included from several
20 | // translation units.
21 | // NOTE(review): reads `begin` through a uint64_t pointer, which assumes the
22 | // payload is suitably aligned for 64-bit loads -- confirm.
23 | inline void uncompress_dense_chunk_and_merge(uint8_t const* begin,
24 |                                              uint64_t* out) {
25 |     uint64_t const* in = reinterpret_cast<uint64_t const*>(begin);
26 |     for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) {
27 |         out[i] |= in[i];
28 |     }
29 | }
23 | 
24 | // Merges a full chunk into the bitmap `out`: OR-ing with an all-ones chunk
25 | // yields all ones, so every one of the constants::chunk_size_in_64bit_words
26 | // words is simply set to ~0.
27 | // Defined inline so that this header can be included from several
28 | // translation units.
29 | inline void uncompress_full_chunk_and_merge(uint64_t* out) {
30 |     for (size_t i = 0; i != constants::chunk_size_in_64bit_words; ++i) {
31 |         out[i] = uint64_t(-1);
32 |     }
33 | }
29 | 
30 | // ORs the chunk `it` currently points to into the bitmap `out`, picking the
31 | // sparse, dense or full routine according to the chunk type.
32 | inline void uncompress_chunk_and_merge(s_sequence::iterator const& it,
33 |                                        uint64_t* out) {
34 |     auto const chunk_type = it.type();
35 |     if (chunk_type == type::sparse) {
36 |         uncompress_sparse_chunk_and_merge(it.data, it.blocks(), out);
37 |     } else if (chunk_type == type::dense) {
38 |         uncompress_dense_chunk_and_merge(it.data, out);
39 |     } else if (chunk_type == type::full) {
40 |         uncompress_full_chunk_and_merge(out);
41 |     } else {
42 |         assert(false);  // no other chunk type is expected here
43 |         __builtin_unreachable();
44 |     }
45 | }
50 | 
51 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/union.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "constants.hpp"
  4 | #include "util.hpp"
  5 | #include "uncompress.hpp"
  6 | #include "decode.hpp"
  7 | 
  8 | namespace sliced {
  9 | 
 10 | // Decodes the block whose 2-byte header starts at `ptr`: byte 0 is the block
 11 | // id, byte 1 is the cardinality minus one. A block with fewer than 31
 12 | // elements is sparse and its payload takes one byte per element; any other
 13 | // block is dense and takes 32 bytes. The decoded values are appended to
 14 | // `out`, `data_<ptr>` is moved past the payload and `ptr` past the header.
 15 | // Requires `base`, `out` and `data_<ptr>` in scope and declares the locals
 16 | // id, c, type and bytes, so each expansion needs its own braces.
 17 | #define DECODE(ptr)                                                 \
 18 |     uint8_t id = ptr[0];                                            \
 19 |     int c = ptr[1] + 1;                                             \
 20 |     int type = LIKELY(c < 31) ? type::sparse : type::dense;         \
 21 |     int bytes = LIKELY(c < 31) ? c : 32;                            \
 22 |     out += decode_block(data_##ptr, type, c, base + id * 256, out); \
 23 |     data_##ptr += bytes;                                            \
 24 |     ptr += 2;
 22 | 
 23 | // Computes the union of two sparse blocks. `l` and `r` point to `card_l` and
 24 | // `card_r` one-byte values; the merge below expects each list to be sorted in
 25 | // increasing order. Every distinct value is written once to `out` as
 26 | // value + base. Returns the number of integers written.
 27 | // Defined inline so that this header can be included from several
 28 | // translation units.
 29 | inline size_t ss_union_block(uint8_t const* l, uint8_t const* r, int card_l,
 30 |                              int card_r, uint32_t base, uint32_t* out) {
 31 |     assert(card_l > 0 and
 32 |            card_l <= int(constants::block_sparseness_threshold - 2));
 33 |     assert(card_r > 0 and
 34 |            card_r <= int(constants::block_sparseness_threshold - 2));
 35 |     size_t size = 0;
 36 | 
 37 |     uint8_t const* end_l = l + card_l;
 38 |     uint8_t const* end_r = r + card_r;
 39 | 
 40 |     // Two-way merge until one of the two lists is exhausted.
 41 |     while (true) {
 42 |         if (*l < *r) {
 43 |             out[size++] = *l + base;
 44 |             ++l;
 45 |             if (l == end_l) break;
 46 |         } else if (*r < *l) {
 47 |             out[size++] = *r + base;
 48 |             ++r;
 49 |             if (r == end_r) break;
 50 |         } else {
 51 |             out[size++] = *l + base;
 52 |             ++l;
 53 |             ++r;
 54 |             if (l == end_l or r == end_r) break;
 55 |         }
 56 |     }
 57 | 
 58 |     // Copy whatever is left of the list that was not exhausted.
 59 |     if (l != end_l) {
 60 |         size += decode_sparse_block(l, end_l - l, base, out + size);
 61 |     }
 62 | 
 63 |     // The remainder of `r` must be tested against its own end: comparing
 64 |     // with end_l (a pointer into the other block) is true almost always.
 65 |     if (r != end_r) {
 66 |         size += decode_sparse_block(r, end_r - r, base, out + size);
 67 |     }
 68 | 
 69 |     return size;
 70 | }
 61 | 
 62 | // Computes the union of a dense block `l` (a bitmap) and a sparse block `r`
 63 | // holding `cardinality` elements. The dense bitmap is copied into a local
 64 | // scratch bitmap, the sparse block is uncompressed on top of it, and the
 65 | // result is OR-ed with `l` and decoded into `out` as value + base.
 66 | // Returns the number of integers written.
 67 | // The scratch bitmap lives on the stack (it is fully overwritten by the copy
 68 | // on every call, so it does not need static storage) and the function is
 69 | // inline so that this header can be included from several translation units.
 70 | // NOTE(review): assumes constants::block_size / 8 == sizeof(x), i.e. blocks
 71 | // of 256 bits -- confirm against constants.hpp.
 72 | inline size_t ds_union_block(uint8_t const* l, uint8_t const* r,
 73 |                              int cardinality, uint32_t base, uint32_t* out) {
 74 |     uint64_t x[4];
 75 |     memcpy(x, l, constants::block_size / 8);
 76 |     uncompress_sparse_block(r, cardinality, x);
 77 |     return or_bitmaps(l, reinterpret_cast<uint8_t const*>(x),
 78 |                       constants::block_size_in_64bit_words, base, out);
 79 | }
 70 | 
 71 | // Decodes one block into `out`, adding `base` to every value, and returns
 72 | // what the underlying decoder returns. A sparse block is decoded from its
 73 | // `cardinality` bytes; anything else is treated as a bitmap of
 74 | // constants::block_size_in_64bit_words words.
 75 | inline uint32_t decode_block(uint8_t const* data, int type, int cardinality,
 76 |                              uint32_t base, uint32_t* out) {
 77 |     if (type != type::sparse) {
 78 |         uint64_t const* bitmap = reinterpret_cast<uint64_t const*>(data);
 79 |         return decode_bitmap(bitmap, constants::block_size_in_64bit_words,
 80 |                              base, out);
 81 |     }
 82 |     return decode_sparse_block(data, cardinality, base, out);
 83 | }
 79 | 
 80 | // Computes the union of two sparse chunks. Each chunk starts with one 2-byte
 81 | // header per block (block id, cardinality - 1), `blocks_l` / `blocks_r` of
 82 | // them, followed by the block payloads in the same order. The two header
 83 | // lists are merged by block id: a block present on one side only is decoded
 84 | // as is, a block present on both sides is united with the routine matching
 85 | // the (sparse/dense) types of the pair. Values are written to `out` with
 86 | // `base` added; the number of integers written is returned.
 87 | // Defined inline so that this header can be included from several
 88 | // translation units.
 89 | inline size_t ss_union_chunk(uint8_t const* l, uint8_t const* r, int blocks_l,
 90 |                              int blocks_r, uint32_t base, uint32_t* out) {
 91 |     assert(blocks_l >= 1 and blocks_l <= 256);
 92 |     assert(blocks_r >= 1 and blocks_r <= 256);
 93 | 
 94 |     // Payloads start right after the headers, so the first payload byte is
 95 |     // also the end of the header list.
 96 |     uint8_t const* data_l = l + blocks_l * 2;
 97 |     uint8_t const* data_r = r + blocks_r * 2;
 98 |     uint8_t const* end_l = data_l;
 99 |     uint8_t const* end_r = data_r;
100 |     uint32_t* in = out;
101 | 
102 |     while (true) {
103 |         if (*l < *r) {
104 |             DECODE(l)
105 |             if (l == end_l) break;
106 |         } else if (*l > *r) {
107 |             DECODE(r)
108 |             if (r == end_r) break;
109 |         } else {
110 |             // Same block id on both sides: read both headers by hand.
111 |             uint8_t id = *l;
112 |             ++l;
113 |             ++r;
114 |             int cl = *l + 1;
115 |             int cr = *r + 1;
116 |             int type_l = type::dense;
117 |             int type_r = type::dense;
118 |             int bl = 32;
119 |             int br = 32;
120 | 
121 |             if (LIKELY(cl < 31)) {
122 |                 bl = cl;
123 |                 type_l = type::sparse;
124 |             }
125 | 
126 |             if (LIKELY(cr < 31)) {
127 |                 br = cr;
128 |                 type_r = type::sparse;
129 |             }
130 | 
131 |             uint32_t b = base + id * 256;
132 |             uint32_t n = 0;
133 | 
134 |             switch (block_pair(type_l, type_r)) {
135 |                 case block_pair(type::sparse, type::sparse):
136 |                     n = ss_union_block(data_l, data_r, cl, cr, b, out);
137 |                     break;
138 |                 case block_pair(type::sparse, type::dense):
139 |                     // ds_union_block takes the dense block first
140 |                     n = ds_union_block(data_r, data_l, cl, b, out);
141 |                     break;
142 |                 case block_pair(type::dense, type::sparse):
143 |                     n = ds_union_block(data_l, data_r, cr, b, out);
144 |                     break;
145 |                 case block_pair(type::dense, type::dense):
146 |                     n = or_bitmaps(data_l, data_r,
147 |                                    constants::block_size_in_64bit_words, b,
148 |                                    out);
149 |                     break;
150 |                 default:
151 |                     assert(false);
152 |                     __builtin_unreachable();
153 |             }
154 | 
155 |             out += n;
156 |             data_l += bl;
157 |             data_r += br;
158 |             ++l;
159 |             ++r;
160 | 
161 |             if (l == end_l or r == end_r) { break; }
162 |         }
163 |     }
164 | 
165 |     // Decode the blocks left on the side that was not exhausted.
166 |     while (l != end_l) { DECODE(l) }
167 |     while (r != end_r) { DECODE(r) }
168 | 
169 |     return size_t(out - in);
170 | }
157 | 
158 | // Computes the union of a dense chunk `l` (a bitmap) and a sparse chunk `r`
159 | // made of `blocks_r` blocks. The sparse chunk is uncompressed into a zeroed
160 | // scratch bitmap, which is then OR-ed with `l` and decoded into `out` with
161 | // `base` added. Returns the number of integers written.
162 | // The scratch bitmap is sized with constants::chunk_size_in_64bit_words, the
163 | // same length or_bitmaps is asked to scan, instead of a literal 1024.
164 | // Defined inline so that this header can be included from several
165 | // translation units. NOTE: the scratch bitmap is a function-local static
166 | // reused across calls.
167 | inline size_t ds_union_chunk(uint8_t const* l, uint8_t const* r, int blocks_r,
168 |                              uint32_t base, uint32_t* out) {
169 |     static std::vector<uint64_t> x(constants::chunk_size_in_64bit_words);
170 |     std::fill(x.begin(), x.end(), 0);
171 |     uncompress_sparse_chunk(r, blocks_r, x.data());
172 |     return or_bitmaps(l, reinterpret_cast<uint8_t const*>(x.data()),
173 |                       constants::chunk_size_in_64bit_words, base, out);
174 | }
166 | 
167 | // Computes the union of the sequences `l` and `r` and writes it to `out`.
168 | // The chunks of the two sequences are merged by chunk id: a chunk present in
169 | // one sequence only is decoded as is, a chunk present in both is united with
170 | // the routine matching the types of the pair (any pair involving a full
171 | // chunk yields a full chunk). Returns the number of integers written.
172 | //
173 | // The merge loop tests has_next() before touching the iterators, so that no
174 | // chunk is read from a sequence whose iterator is already exhausted.
175 | // NOTE(review): this presumes has_next() is false for the begin iterator of
176 | // an empty sequence -- confirm against s_sequence::iterator.
177 | // Defined inline so that this header can be included from several
178 | // translation units.
179 | inline size_t pairwise_union(s_sequence const& l, s_sequence const& r,
180 |                              uint32_t* out) {
181 |     auto it_l = l.begin();
182 |     auto it_r = r.begin();
183 |     uint32_t* in = out;
184 | 
185 |     while (it_l.has_next() and it_r.has_next()) {
186 |         uint16_t id_l = it_l.id();
187 |         uint16_t id_r = it_r.id();
188 | 
189 |         if (id_l == id_r) {
190 |             uint32_t n = 0;
191 |             // widen before shifting: the chunk id fills the high 16 bits
192 |             uint32_t base = uint32_t(id_l) << 16;
193 |             int blocks_l = 0;
194 |             int blocks_r = 0;
195 | 
196 |             uint16_t type_l = it_l.type();
197 |             uint16_t type_r = it_r.type();
198 | 
199 |             switch (chunk_pair(type_l, type_r)) {
200 |                 case chunk_pair(type::sparse, type::sparse):
201 |                     blocks_l = it_l.blocks();
202 |                     blocks_r = it_r.blocks();
203 |                     if (blocks_l < blocks_r) {
204 |                         n = ss_union_chunk(it_l.data, it_r.data, blocks_l,
205 |                                            blocks_r, base, out);
206 |                     } else {
207 |                         n = ss_union_chunk(it_r.data, it_l.data, blocks_r,
208 |                                            blocks_l, base, out);
209 |                     }
210 |                     break;
211 |                 case chunk_pair(type::sparse, type::dense):
212 |                     // ds_union_chunk takes the dense chunk first
213 |                     n = ds_union_chunk(it_r.data, it_l.data, it_l.blocks(),
214 |                                        base, out);
215 |                     break;
216 |                 case chunk_pair(type::dense, type::sparse):
217 |                     n = ds_union_chunk(it_l.data, it_r.data, it_r.blocks(),
218 |                                        base, out);
219 |                     break;
220 |                 case chunk_pair(type::dense, type::dense):
221 |                     n = or_bitmaps(it_l.data, it_r.data,
222 |                                    constants::chunk_size_in_64bit_words, base,
223 |                                    out);
224 |                     break;
225 |                 // as soon as one side is full, so is the union
226 |                 case chunk_pair(type::sparse, type::full):
227 |                 case chunk_pair(type::dense, type::full):
228 |                 case chunk_pair(type::full, type::sparse):
229 |                 case chunk_pair(type::full, type::dense):
230 |                 case chunk_pair(type::full, type::full):
231 |                     n = decode_full_chunk(base, out);
232 |                     break;
233 |                 default:
234 |                     assert(false);
235 |                     __builtin_unreachable();
236 |             }
237 | 
238 |             out += n;
239 | 
240 |             it_l.next();
241 |             it_r.next();
242 |         } else if (id_l < id_r) {
243 |             out += decode_chunk(it_l, out);
244 |             it_l.next();
245 |         } else {
246 |             out += decode_chunk(it_r, out);
247 |             it_r.next();
248 |         }
249 |     }
250 | 
251 |     // Decode the chunks left in the sequence that was not exhausted.
252 |     while (it_l.has_next()) {
253 |         out += decode_chunk(it_l, out);
254 |         it_l.next();
255 |     }
256 | 
257 |     while (it_r.has_next()) {
258 |         out += decode_chunk(it_r, out);
259 |         it_r.next();
260 |     }
261 | 
262 |     return size_t(out - in);
263 | }
259 | 
260 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/union_many.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "decode.hpp"
 4 | #include "uncompress_chunk_and_merge.hpp"
 5 | 
 6 | namespace sliced {
 7 | 
 8 | // Computes the union of all the given sequences and writes it to `out`.
 9 | // Chunks are processed by increasing chunk id (`header`): all the chunks
10 | // with the current id are accumulated into one scratch bitmap (the first one
11 | // is uncompressed into it, the others are OR-ed in), the bitmap is decoded
12 | // into `out`, and the iterators positioned on that id are advanced.
13 | // Returns the number of integers written; 0 when `sequences` is empty.
14 | // The loop stops when no iterator reports an id below 65536.
15 | // Defined inline so that this header can be included from several
16 | // translation units. NOTE: the scratch bitmap is a function-local static
17 | // reused across calls.
18 | inline size_t union_many(std::vector<s_sequence>& sequences, uint32_t* out) {
19 |     // std::min_element would return end() below, which cannot be dereferenced
20 |     if (sequences.empty()) return 0;
21 | 
22 |     uint32_t* in = out;
23 |     std::vector<s_sequence::iterator> iterators(sequences.size());
24 |     for (size_t i = 0; i != sequences.size(); ++i) {
25 |         iterators[i] = sequences[i].begin();
26 |     }
27 | 
28 |     static std::vector<uint64_t> bitmap(constants::chunk_size_in_64bit_words);
29 |     uint32_t header = std::min_element(iterators.begin(), iterators.end(),
30 |                                        [](auto const& l, auto const& r) {
31 |                                            return l.id() < r.id();
32 |                                        })
33 |                           ->id();
34 |     while (header < 65536) {
35 |         uint32_t base = header << 16;
36 |         bool first = true;
37 |         for (size_t i = 0; i != iterators.size(); ++i) {
38 |             if (iterators[i].id() != header) continue;
39 |             if (first) {
40 |                 uncompress_chunk(iterators[i], bitmap.data());
41 |                 first = false;
42 |             } else {
43 |                 uncompress_chunk_and_merge(iterators[i], bitmap.data());
44 |             }
45 |         }
46 |         out += decode_bitmap(bitmap.data(),
47 |                              constants::chunk_size_in_64bit_words, base, out);
48 | 
49 |         // Advance past the current id and pick the smallest id left.
50 |         uint32_t next = 65536;
51 |         for (size_t i = 0; i != iterators.size(); ++i) {
52 |             if (iterators[i].id() == header) iterators[i].next();
53 |             if (iterators[i].id() < next) next = iterators[i].id();
54 |         }
55 |         header = next;
56 |     }
57 | 
58 |     return size_t(out - in);
59 | }
46 | 
47 | }  // namespace sliced


--------------------------------------------------------------------------------
/include/util.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cmath>  // for log2 and ceil
  4 | #include <cstring>
  5 | #include "constants.hpp"
  6 | 
  7 | namespace sliced {
  8 | 
  9 | // Hint that the condition x is expected to be true.
 10 | #define LIKELY(x) __builtin_expect(!!(x), 1)
 11 | 
 12 | // Combine the types of a pair of chunks (resp. blocks) into one integer that
 13 | // can be used as a switch label.
 14 | #define chunk_pair(l, r) (3 * (l) + (r))
 15 | #define block_pair(l, r) (2 * (l) + (r))
 16 | 
 17 | // c when c < 31, 32 otherwise. Fully parenthesized so that it can be used
 18 | // inside a larger expression.
 19 | #define BYTES_BY_CARDINALITY(c) (LIKELY((c) < 31) ? (c) : 32)
 15 | 
 16 | // Expands to the body of a function that combines two bitmaps word by word
 17 | // with the binary operator OP and writes the position of every set bit of
 18 | // the result, plus `base`, to `out`; the expansion ends by returning the
 19 | // number of positions written. `l` and `r` are reinterpreted as arrays of
 20 | // `size_in_64bit_words` 64-bit words. `base` is advanced by 64 per word, so
 21 | // it must name a modifiable variable.
 22 | // The bit position is kept in a local called `pos`: calling it `r` would
 23 | // clash with the macro parameter of the same name.
 24 | #define OPERATE_BITMAPS(OP, l, r, size_in_64bit_words, base, out)    \
 25 |     uint64_t const* bitmap_l = reinterpret_cast<uint64_t const*>(l); \
 26 |     uint64_t const* bitmap_r = reinterpret_cast<uint64_t const*>(r); \
 27 |     size_t size = 0;                                                 \
 28 |     for (size_t i = 0; i != (size_in_64bit_words); ++i) {            \
 29 |         uint64_t w = bitmap_l[i] OP bitmap_r[i];                     \
 30 |         while (w != 0) {                                             \
 31 |             uint64_t t = w & (~w + 1);                               \
 32 |             int pos = __builtin_ctzll(w);                            \
 33 |             (out)[size++] = pos + (base);                            \
 34 |             w ^= t;                                                  \
 35 |         }                                                            \
 36 |         (base) += 64;                                                \
 37 |     }                                                                \
 38 |     return size;
 31 | 
 32 | // Intersects the bitmaps `l` and `r`, both `size_in_64bit_words` 64-bit
 33 | // words long, and writes the position of every bit set in both, plus `base`,
 34 | // to `out`. Returns the number of integers written.
 35 | // Defined inline so that this header can be included from several
 36 | // translation units.
 37 | inline size_t and_bitmaps(uint8_t const* l, uint8_t const* r,
 38 |                           size_t size_in_64bit_words, uint32_t base,
 39 |                           uint32_t* out) {
 40 |     OPERATE_BITMAPS(&, l, r, size_in_64bit_words, base, out)
 41 | }
 35 | 
 36 | // Unites the bitmaps `l` and `r`, both `size_in_64bit_words` 64-bit words
 37 | // long, and writes the position of every bit set in either, plus `base`, to
 38 | // `out`. Returns the number of integers written.
 39 | // Defined inline so that this header can be included from several
 40 | // translation units.
 41 | inline size_t or_bitmaps(uint8_t const* l, uint8_t const* r,
 42 |                          size_t size_in_64bit_words, uint32_t base,
 43 |                          uint32_t* out) {
 44 |     OPERATE_BITMAPS(|, l, r, size_in_64bit_words, base, out)
 45 | }
 40 | 
 41 | inline bool bitmap_contains(uint64_t const* bitmap, uint64_t pos) {
 42 |     // uint64_t w = bitmap[pos >> 6];
 43 |     // w >>= pos & 63;
 44 |     // return w & 1;
 45 | 
 46 |     uint64_t r;
 47 |     uint64_t w = bitmap[pos >> 6];
 48 |     __asm volatile(
 49 |         "bt %2,%1\n"
 50 |         "sbb %0,%0"
 51 |         : "=r"(r)
 52 |         : "r"(w), "r"(pos));
 53 |     return r;
 54 | }
 55 | 
 56 | // Number of bytes needed to hold `bits` bits (rounded up).
 57 | // Defined inline so that this header can be included from several
 58 | // translation units.
 59 | inline size_t bytes_for(size_t bits) {
 60 |     return (bits + 8 - 1) / 8;
 61 | }
 59 | 
 60 | // Returns universe / constants::chunk_size + 1 (integer division).
 61 | // NOTE(review): this is one more than the rounded-up quotient whenever
 62 | // `universe` is a multiple of the chunk size; presumably `universe` is the
 63 | // largest representable value rather than a count -- confirm with callers.
 64 | // Defined inline so that this header can be included from several
 65 | // translation units.
 66 | inline uint32_t num_chunks(uint64_t universe) {
 67 |     return (universe + constants::chunk_size) / constants::chunk_size;
 68 | }
 63 | 
 64 | // Kinds of chunks and blocks. The numeric values are significant: the
 65 | // chunk_pair and block_pair macros combine them into switch labels.
 66 | enum type {
 67 |     empty = 0,
 68 |     sparse = 1,
 69 |     full = 2,
 70 |     dense = 3
 71 | };
 65 | 
 66 | // Options controlling which sequences of a collection are selected (see
 67 | // pass()). Defaults: empty filename, density -1.0 (a negative density
 68 | // disables the density test in pass()), size 0.
 69 | struct parameters {
 70 |     std::string collection_filename;
 71 |     double density;
 72 |     uint32_t size;
 73 | 
 74 |     parameters()
 75 |         : collection_filename()
 76 |         , density(-1.0)
 77 |         , size(0) {}
 78 | };
 76 | 
 77 | // Tells whether a sequence of `n` integers over `universe` is selected:
 78 | // true when the density test is enabled (params.density >= 0) and
 79 | // n / universe exceeds params.density, or when n exceeds params.size.
 80 | // NOTE(review): with the default params.size of 0 the second test accepts
 81 | // every non-empty sequence regardless of the density -- confirm that this
 82 | // is intended.
 83 | // Defined inline so that this header can be included from several
 84 | // translation units.
 85 | inline bool pass(parameters const& params, uint32_t n, uint32_t universe) {
 86 |     bool const dense_enough =
 87 |         params.density >= 0.0 and double(n) / universe > params.density;
 88 |     return dense_enough or n > params.size;
 89 | }
 83 | 
 84 | struct query {  // a pair of 32-bit values describing one query
 85 |     uint32_t i;  // presumably the index of the first sequence -- TODO confirm
 86 |     uint32_t j;  // presumably the index of the second sequence -- TODO confirm
 87 | };
 88 | 
 89 | /* For a sorted list of size n whose universe is u: returns
 90 |    n * (ceil(log2(u / n)) + 2) when u > n and 2 * n otherwise.
 91 |    An empty list (n == 0) takes 0 bits; it is handled up front because
 92 |    u / 0 would otherwise lead to a NaN being converted to an integer.
 93 |    Defined inline so that this header can be included from several
 94 |    translation units. */
 95 | inline uint64_t elias_fano_bitsize(uint64_t n, uint64_t u) {
 96 |     if (n == 0) return 0;
 97 |     double const low_bits =
 98 |         u > n ? std::ceil(std::log2(static_cast<double>(u) / n)) : 0.0;
 99 |     return n * (low_bits + 2);
100 | }
 95 | 
 96 | // Counters collected while building an index, plus a textual report.
 97 | // All the counters start at zero. accumulate() adds only the block-level
 98 | // counters of another instance; print() writes the report to std::cout.
 99 | struct statistics {
100 |     statistics() {
101 |         // every member is a uint64_t (or an array of them): zero them all
102 |         memset(this, 0, sizeof(*this));
103 |     }
104 | 
105 |     uint64_t sequences;
106 | 
107 |     uint64_t integers;
108 |     uint64_t integers_in_sparse_chunks;
109 |     uint64_t integers_in_dense_chunks;
110 |     uint64_t integers_in_full_chunks;
111 |     uint64_t integers_in_sparse_blocks;
112 |     uint64_t integers_in_dense_blocks;
113 | 
114 |     uint64_t chunks;
115 |     uint64_t empty_chunks;
116 |     uint64_t sparse_chunks;
117 |     uint64_t very_sparse_chunks;
118 |     uint64_t dense_chunks;
119 |     uint64_t full_chunks;
120 | 
121 |     uint64_t blocks;
122 |     uint64_t empty_blocks;
123 |     uint64_t sparse_blocks;
124 |     uint64_t dense_blocks;
125 | 
126 |     uint64_t bits;
127 |     uint64_t chunks_header_bits;
128 |     uint64_t blocks_header_bits;
129 |     uint64_t dense_chunks_bits;
130 |     uint64_t dense_blocks_bits;
131 |     uint64_t sparse_blocks_bits;
132 | 
133 |     // indexed by cardinality, entries 1..30 are used
134 |     uint64_t sparse_blocks_cardinalities[1 + 30];
135 | 
136 |     // (non-empty blocks)
137 |     uint64_t
138 |         num_blocks_in_chunks[1 + constants::chunk_size / constants::block_size];
139 |     uint64_t num_integers[1 + constants::chunk_size / constants::block_size];
140 | 
141 |     // Adds the block-level counters of `other` to this instance; all the
142 |     // other counters are left untouched.
143 |     void accumulate(statistics const& other) {
144 |         dense_blocks += other.dense_blocks;
145 |         sparse_blocks += other.sparse_blocks;
146 |         empty_blocks += other.empty_blocks;
147 |         integers_in_dense_blocks += other.integers_in_dense_blocks;
148 |         integers_in_sparse_blocks += other.integers_in_sparse_blocks;
149 |         dense_blocks_bits += other.dense_blocks_bits;
150 |         sparse_blocks_bits += other.sparse_blocks_bits;
151 | 
152 |         for (int i = 1; i != 30 + 1; i++) {
153 |             sparse_blocks_cardinalities[i] +=
154 |                 other.sparse_blocks_cardinalities[i];
155 |         }
156 |     }
157 | 
158 |     // Writes the report to std::cout. Ratios are computed in floating
159 |     // point, so a zero denominator shows up as inf/nan in the output.
160 |     void print() {
161 |         std::cout << "processed " << sequences << " sequences, " << integers
162 |                   << " integers" << std::endl;
163 | 
164 |         std::cout << "chunks: " << chunks << std::endl;
165 |         std::cout << "full chunks: " << full_chunks << " ("
166 |                   << integers_in_full_chunks * 100.0 / integers << "% of ints)"
167 |                   << std::endl;
168 |         std::cout << "empty chunks: " << empty_chunks << " ("
169 |                   << empty_chunks * 100.0 / chunks << "% of chunks)"
170 |                   << std::endl;
171 |         std::cout << "dense chunks: " << dense_chunks << " ("
172 |                   << integers_in_dense_chunks * 100.0 / integers << "% of ints)"
173 |                   << std::endl;
174 |         std::cout << "sparse chunks: " << sparse_chunks << " ("
175 |                   << integers_in_sparse_chunks * 100.0 / integers
176 |                   << "% of ints)" << std::endl;
177 | 
178 |         std::cout << "blocks: " << blocks << std::endl;
179 |         std::cout << "empty blocks: " << empty_blocks << " ("
180 |                   << empty_blocks * 100.0 / blocks << "% of blocks)"
181 |                   << std::endl;
182 |         std::cout << "dense blocks: " << dense_blocks << " ("
183 |                   << integers_in_dense_blocks * 100.0 / integers << "% of ints)"
184 |                   << std::endl;
185 |         std::cout << "sparse blocks: " << sparse_blocks << " ("
186 |                   << integers_in_sparse_blocks * 100.0 / integers
187 |                   << "% of ints)" << std::endl;
188 | 
189 |         std::cout << double(chunks_header_bits) / integers
190 |                   << " [bpi] for chunks' headers" << std::endl;
191 |         std::cout << double(blocks_header_bits) / integers
192 |                   << " [bpi] for blocks' headers" << std::endl;
193 |         std::cout << double(dense_chunks_bits) / integers
194 |                   << " [bpi] for dense chunks" << std::endl;
195 |         std::cout << double(dense_blocks_bits) / integers
196 |                   << " [bpi] for dense blocks" << std::endl;
197 |         std::cout << double(sparse_blocks_bits) / integers
198 |                   << " [bpi] for sparse blocks" << std::endl;
199 | 
200 |         std::cout << "total bytes: " << bits / 8 << std::endl;
201 |         std::cout << "total bpi: " << double(bits) / integers << std::endl;
202 | 
203 |         std::cout << "== sparse blocks cardinalities (%) ==" << std::endl;
204 |         double expected_value = 0.0;
205 |         for (int i = 1; i != 30 + 1; ++i) {
206 |             double p_i = static_cast<double>(sparse_blocks_cardinalities[i]) /
207 |                          sparse_blocks;
208 |             std::cout << "sparse blocks with card. " << i << ": " << p_i * 100.0
209 |                       << std::endl;
210 |             expected_value += i * p_i;
211 |         }
212 |         std::cout << "expected_value " << expected_value << std::endl;
213 | 
214 |         std::cout << "== distribution of blocks in sparse chunks ("
215 |                   << integers_in_sparse_chunks * 100.0 / integers
216 |                   << "% of ints) ==" << std::endl;
217 |         uint64_t covered_integers_in_sparse_chunks = 0;
218 |         expected_value = 0.0;
219 |         for (uint64_t i = 1;
220 |              i != constants::chunk_size / constants::block_size + 1; ++i) {
221 |             // A bucket without chunks would give 0.0 / 0 = NaN, and
222 |             // converting NaN to an integer is undefined: use 0 instead.
223 |             uint64_t avg_num_integers_per_chunk = 0;
224 |             if (num_blocks_in_chunks[i] != 0) {
225 |                 avg_num_integers_per_chunk =
226 |                     static_cast<double>(num_integers[i]) /
227 |                     num_blocks_in_chunks[i];
228 |             }
229 |             // Likewise, do not ask for the size of an empty list.
230 |             uint64_t elias_fano_bits = 0;
231 |             if (avg_num_integers_per_chunk != 0) {
232 |                 elias_fano_bits = elias_fano_bitsize(
233 |                     avg_num_integers_per_chunk, constants::chunk_size);
234 |             }
235 | 
236 |             uint64_t total_num_blocks = i * num_blocks_in_chunks[i];
237 |             double avg_num_integers_per_block =
238 |                 static_cast<double>(num_integers[i]) / total_num_blocks;
239 |             double p_i =
240 |                 static_cast<double>(num_blocks_in_chunks[i]) / sparse_chunks;
241 |             std::cout << "sparse chunks with " << i
242 |                       << " blocks: " << p_i * 100.0
243 |                       << "%; avg_num_integers_per_block = "
244 |                       << avg_num_integers_per_block
245 |                       << "; avg_num_integers_per_chunk = "
246 |                       << avg_num_integers_per_chunk << std::endl;
247 |             expected_value += i * p_i;
248 |             covered_integers_in_sparse_chunks += num_integers[i];
249 | 
250 |             std::cout << "Elias-Fano avg. bpi "
251 |                       << static_cast<double>(elias_fano_bits) /
252 |                              avg_num_integers_per_chunk
253 |                       << " vs. 8" << std::endl;
254 | 
255 |             std::cout << " -- total integers covered "
256 |                       << (covered_integers_in_sparse_chunks * 100.0) /
257 |                              integers_in_sparse_chunks
258 |                       << "%" << std::endl;
259 |         }
260 |         std::cout << "expected_value " << expected_value << std::endl;
261 |     }
262 | };
244 | 
245 | }  // namespace sliced


--------------------------------------------------------------------------------
/script/build.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | 
3 | input_filename = sys.argv[1]
4 | output_filename = sys.argv[2]
5 | 
6 | os.system("./build " + input_filename + " 0.01 -o " + output_filename + ".0.01.bin")
7 | os.system("./build " + input_filename + " 0.001 -o " + output_filename + ".0.001.bin")
8 | os.system("./build " + input_filename + " 0.0001 -o " + output_filename + ".0.0001.bin")
9 | 


--------------------------------------------------------------------------------
/script/queries.py:
--------------------------------------------------------------------------------
 1 | import sys, os
 2 | 
 3 | index_filename = sys.argv[1]
 4 | query_logs_path = sys.argv[2]
 5 | runs = 3
 6 | 
 7 | for i in xrange(0, runs):
 8 |     os.system("./decode " + index_filename)
 9 | 
10 | for i in xrange(0, runs):
11 |     os.system("./intersect " + index_filename + " 1000 < " + query_logs_path + "/pairwise_queries.1k")
12 | 
13 | for i in xrange(0, runs):
14 |     os.system("./union " + index_filename + " 1000 < " + query_logs_path + "/pairwise_queries.1k")
15 | 
16 | for i in xrange(0, runs):
17 |     os.system("./select " + index_filename + " 1000 < " + query_logs_path + "/select_queries.1k")
18 | 
19 | for i in xrange(0, runs):
20 |     os.system("./next_geq " + index_filename + " 1000 < " + query_logs_path + "/next_geq_queries.1k")


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # One executable per tool, each built from the source file of the same name.
 2 | foreach(tool build decode uncompress intersect union cardinality select
 3 |         contains next_geq example)
 4 |     add_executable(${tool} ${tool}.cpp)
 5 | endforeach()


--------------------------------------------------------------------------------
/src/build.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | #include "builder.hpp"
 5 | #include "s_index.hpp"
 6 | 
 7 | using namespace sliced;
 8 | 
 9 | // Builds the index described by `params` and prints the statistics returned
10 | // by the builder. When `output_filename` is not null, the size of the
11 | // builder is printed as well and the builder is saved to that file.
12 | void build(parameters const& params, char const* output_filename) {
13 |     using builder_type = s_index::builder;
14 |     builder_type builder(params);
15 |     auto stats = builder.build();
16 |     stats.print();
17 |     if (!output_filename) return;
18 | 
19 |     essentials::print_size(builder);
20 |     std::cout << "saving data structure to disk..." << std::endl;
21 |     essentials::save<builder_type>(builder, output_filename);
22 | }
20 | 
21 | // Usage: build collection_filename [--density d] [--size s] [--out file].
22 | // Returns 1 when the collection filename is missing or when an option is
23 | // given without its value, 0 otherwise. Arguments that are not one of the
24 | // three options are ignored.
25 | int main(int argc, char** argv) {
26 |     int mandatory = 2;
27 |     if (argc < mandatory) {
28 |         std::cout << argv[0]
29 |                   << " collection_filename [--density d] [--size s] [--out "
30 |                      "output_filename]"
31 |                   << std::endl;
32 |         return 1;
33 |     }
34 | 
35 |     parameters params;
36 |     params.collection_filename = argv[1];
37 |     char const* output_filename = nullptr;
38 | 
39 |     for (int i = mandatory; i < argc; ++i) {
40 |         std::string const option(argv[i]);
41 |         if (option != "--density" and option != "--size" and
42 |             option != "--out") {
43 |             continue;
44 |         }
45 | 
46 |         // Every option takes a value: make sure it is there before reading
47 |         // it, argv[argc] is a null pointer.
48 |         if (i + 1 == argc) {
49 |             std::cerr << "missing value for option " << option << std::endl;
50 |             return 1;
51 |         }
52 |         char const* value = argv[++i];
53 | 
54 |         if (option == "--density") {
55 |             params.density = std::stod(value);
56 |         } else if (option == "--size") {
57 |             params.size = std::atoi(value);
58 |         } else {
59 |             output_filename = value;
60 |         }
61 |     }
62 | 
63 |     build(params, output_filename);
64 | 
65 |     return 0;
66 | }
52 | 


--------------------------------------------------------------------------------
/src/cardinality.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | 
 8 | using namespace sliced;
 9 | 
10 | void perf_cardinality(char const* binary_filename) {
11 |     s_index index;
12 |     index.mmap(binary_filename);
13 | 
14 |     size_t total = 0;
15 |     essentials::timer_type t;
16 |     static const int runs = 10 + 1;
17 |     for (int run = 0; run != runs; ++run) {
18 |         t.start();
19 |         for (size_t i = 0; i != index.size(); ++i) {
20 |             auto sequence = index[i];
21 |             total += sequence.cardinality();
22 |         }
23 |         t.stop();
24 |     }
25 |     std::cout << total / runs << std::endl;
26 |     t.discard_first();
27 |     double avg = t.average();
28 |     std::cout << "Mean per run: " << avg << " [musec]\n";
29 |     std::cout << "Mean per query: " << avg / index.size() * 1000 << " [ns]";
30 |     std::cout << std::endl;
31 | }
32 | 
33 | int main(int argc, char** argv) {
34 |     int mandatory = 2;
35 |     if (argc < mandatory) {
36 |         std::cout << argv[0] << " <index_filename>" << std::endl;
37 |         return 1;
38 |     }
39 | 
40 |     perf_cardinality(argv[1]);
41 |     return 0;
42 | }
43 | 


--------------------------------------------------------------------------------
/src/contains.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | #include "contains.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void perf_contains(char const* binary_filename,
12 |                    std::vector<query> const& queries) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     size_t total = 0;
17 |     std::cout << "performing " << queries.size() << " contains queries..."
18 |               << std::endl;
19 |     essentials::timer_type t;
20 |     static const int runs = 3 + 1;
21 |     for (int run = 0; run != runs; ++run) {
22 |         t.start();
23 |         for (auto const& q : queries) total += index[q.i].contains(q.j);
24 |         t.stop();
25 |     }
26 |     std::cout << total << std::endl;
27 |     t.discard_first();
28 |     double avg = t.average();
29 |     std::cout << "Mean per run: " << avg << " [musec]\n";
30 |     std::cout << "Mean per query: " << avg / queries.size() << " [musec]";
31 |     std::cout << std::endl;
32 | }
33 | 
34 | int main(int argc, char** argv) {
35 |     int mandatory = 3;
36 |     if (argc < mandatory) {
37 |         std::cout << argv[0] << " index_filename num_queries < queries"
38 |                   << std::endl;
39 |         return 1;
40 |     }
41 | 
42 |     char const* binary_filename = argv[1];
43 |     uint64_t num_queries = std::stoull(argv[2]);
44 |     std::vector<query> queries;
45 |     queries.reserve(num_queries);
46 | 
47 |     std::cout << "reading queries..." << std::endl;
48 |     for (uint32_t i = 0; i != num_queries; ++i) {
49 |         query q;
50 |         int x = scanf("%d", &q.i);
51 |         int y = scanf("%d", &q.j);
52 |         if (x == EOF or y == EOF) break;
53 |         queries.push_back(q);
54 |     }
55 |     std::cout << "DONE" << std::endl;
56 | 
57 |     perf_contains(binary_filename, queries);
58 | 
59 |     return 0;
60 | }
61 | 


--------------------------------------------------------------------------------
/src/decode.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | #include "decode.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void decode(char const* binary_filename) {
12 |     s_index index;
13 |     index.mmap(binary_filename);
14 | 
15 |     std::vector<uint32_t> out(index.universe());
16 |     uint64_t integers = 0;
17 |     essentials::timer_type t;
18 |     t.start();
19 |     for (size_t i = 0; i != index.size(); ++i) {
20 |         auto sequence = index[i];
21 |         size_t decoded = sequence.decode(out.data());
22 |         integers += decoded;
23 |     }
24 |     t.stop();
25 | 
26 |     std::cout << "decoded " << index.size() << " sequences" << std::endl;
27 |     std::cout << "decoded " << integers << " integers" << std::endl;
28 | 
29 |     double elapsed = t.average();
30 |     std::cout << "Elapsed time: " << elapsed / 1000000 << " [sec]\n";
31 |     std::cout << "Mean per sequence: " << elapsed / index.size()
32 |               << " [musec]\n";
33 |     std::cout << "Mean per integer: " << elapsed / integers * 1000 << " [ns]";
34 |     std::cout << std::endl;
35 | }
36 | 
37 | int main(int argc, char** argv) {
38 |     int mandatory = 2;
39 |     if (argc < mandatory) {
40 |         std::cout << argv[0] << " <index_filename>" << std::endl;
41 |         return 1;
42 |     }
43 | 
44 |     decode(argv[1]);
45 |     return 0;
46 | }
47 | 


--------------------------------------------------------------------------------
/src/example.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | #include "builder.hpp"
 5 | #include "s_sequence.hpp"
 6 | #include "select.hpp"
 7 | #include "decode.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | int main(int argc, char** argv) {
12 |     int mandatory = 1;
13 |     char const* output_filename = nullptr;
14 | 
15 |     for (int i = mandatory; i != argc; ++i) {
16 |         if (std::string(argv[i]) == "-o") {
17 |             ++i;
18 |             output_filename = argv[i];
19 |         } else if (std::string(argv[i]) == "-h") {
20 |             std::cout << argv[0] << " -o output_filename < input" << std::endl;
21 |             return 1;
22 |         } else {
23 |             std::cout << "unknown option '" << argv[i] << "'" << std::endl;
24 |             return 1;
25 |         }
26 |     }
27 | 
28 |     std::vector<uint32_t> input;
29 | 
30 |     {  // read input from std::in
31 |         uint32_t n, x;
32 |         std::cin >> n;
33 |         input.reserve(n);
34 |         for (uint32_t i = 0; i != n; ++i) {
35 |             std::cin >> x;
36 |             input.push_back(x);
37 |         }
38 |     }
39 | 
40 |     // build the sequence and print statistics
41 |     s_sequence::builder builder;
42 |     auto stats = builder.build(input.data(), input.size());
43 |     stats.print();
44 | 
45 |     mm::file_source<uint8_t> mm_file;
46 |     uint8_t const* data = nullptr;
47 | 
48 |     if (output_filename) {  // if an output file is specified, then serialize
49 |         essentials::print_size(builder);
50 |         essentials::save<s_sequence::builder>(builder, output_filename);
51 | 
52 |         // mmap
53 |         int advice = mm::advice::normal;  // can be also random and sequential
54 |         mm_file.open(output_filename, advice);
55 | 
56 |         // skip first 8 bytes storing the number of written bytes
57 |         data = mm_file.data() + 8;
58 | 
59 |     } else {  // otherwise work directly in memory
60 |         data = builder.data();
61 |     }
62 | 
63 |     // initialize a s_sequence from data, regardless the source
64 |     s_sequence ss(data);
65 | 
66 |     uint32_t size = ss.size();
67 | 
68 |     // decode whole list to an output buffer
69 |     std::vector<uint32_t> out(size);
70 |     ss.decode(out.data());
71 |     // check written values
72 |     uint32_t value = 0;
73 |     for (uint32_t i = 0; i != size; ++i) {
74 |         if (input[i] != out[i]) {
75 |             std::cout << "got " << out[i] << " but expected " << input[i]
76 |                       << std::endl;
77 |             return 1;
78 |         }
79 | 
80 |         ss.select(i, value);  // select i-th element
81 |         if (value != out[i]) {
82 |             std::cout << "got " << value << " but expected " << out[i]
83 |                       << std::endl;
84 |             return 1;
85 |         }
86 |     }
87 | 
88 |     return 0;
89 | }
90 | 


--------------------------------------------------------------------------------
/src/intersect.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | #include "intersection.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void perf_intersection(char const* binary_filename,
12 |                        std::vector<query> const& queries) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     std::vector<uint32_t> out(index.universe());
17 |     size_t total = 0;
18 |     std::cout << "performing " << queries.size() << " pairwise-intersections..."
19 |               << std::endl;
20 |     essentials::timer_type t;
21 |     static const int runs = 10 + 1;
22 |     for (int run = 0; run != runs; ++run) {
23 |         t.start();
24 |         for (auto const& q : queries) {
25 |             total += pairwise_intersection(index[q.i], index[q.j], out.data());
26 |         }
27 |         t.stop();
28 |     }
29 |     std::cout << total << std::endl;
30 |     t.discard_first();
31 |     double avg = t.average();
32 |     std::cout << "Mean per run: " << avg << " [musec]\n";
33 |     std::cout << "Mean per query: " << avg / queries.size() << " [musec]";
34 |     std::cout << std::endl;
35 | }
36 | 
37 | int main(int argc, char** argv) {
38 |     int mandatory = 3;
39 |     if (argc < mandatory) {
40 |         std::cout << argv[0] << " index_filename num_queries < queries"
41 |                   << std::endl;
42 |         return 1;
43 |     }
44 | 
45 |     char const* binary_filename = argv[1];
46 |     uint64_t num_queries = std::stoull(argv[2]);
47 |     std::vector<query> queries;
48 |     queries.reserve(num_queries);
49 | 
50 |     std::cout << "reading queries..." << std::endl;
51 |     for (uint32_t i = 0; i != num_queries; ++i) {
52 |         query q;
53 |         int x = scanf("%d", &q.i);
54 |         int y = scanf("%d", &q.j);
55 |         if (x == EOF or y == EOF) break;
56 |         queries.push_back(q);
57 |     }
58 |     std::cout << "DONE" << std::endl;
59 | 
60 |     perf_intersection(binary_filename, queries);
61 | 
62 |     return 0;
63 | }
64 | 


--------------------------------------------------------------------------------
/src/next_geq.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | #include "next_geq.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void perf_next_geq(char const* binary_filename,
12 |                    uint64_t num_queries_per_sequence) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     uint64_t total_queries = index.size() * num_queries_per_sequence;
17 |     std::vector<uint32_t> queries;
18 |     queries.reserve(total_queries);
19 | 
20 |     std::cout << "reading queries..." << std::endl;
21 |     for (uint32_t i = 0; i != total_queries; ++i) {
22 |         uint32_t q;
23 |         int x = scanf("%d", &q);
24 |         if (x == EOF) break;
25 |         queries.push_back(q);
26 |     }
27 |     std::cout << "DONE" << std::endl;
28 | 
29 |     size_t total = 0;
30 |     std::cout << "performing " << queries.size() << " next_geq queries..."
31 |               << std::endl;
32 |     essentials::timer_type t;
33 |     static const int runs = 3 + 1;
34 |     for (int run = 0; run != runs; ++run) {
35 |         uint64_t q = 0;
36 |         t.start();
37 |         for (uint32_t i = 0; i != index.size(); ++i) {
38 |             auto sequence = index[i];
39 |             for (uint32_t j = 0; j != num_queries_per_sequence; ++j) {
40 |                 total += sequence.next_geq(queries[q++]);
41 |             }
42 |         }
43 |         t.stop();
44 |     }
45 |     std::cout << total << std::endl;
46 |     t.discard_first();
47 |     double avg = t.average();
48 |     std::cout << "Mean per run: " << avg << " [musec]\n";
49 |     std::cout << "Mean per query: " << avg / total_queries << " [musec]";
50 |     std::cout << std::endl;
51 | }
52 | 
53 | int main(int argc, char** argv) {
54 |     int mandatory = 3;
55 |     if (argc < mandatory) {
56 |         std::cout << argv[0]
57 |                   << " index_filename num_queries_per_sequence < queries"
58 |                   << std::endl;
59 |         return 1;
60 |     }
61 | 
62 |     char const* binary_filename = argv[1];
63 |     uint64_t num_queries_per_sequence = std::stoull(argv[2]);
64 |     perf_next_geq(binary_filename, num_queries_per_sequence);
65 | 
66 |     return 0;
67 | }
68 | 


--------------------------------------------------------------------------------
/src/select.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | #include "select.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void perf_select(char const* binary_filename,
12 |                  uint64_t num_queries_per_sequence) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     uint64_t total_queries = index.size() * num_queries_per_sequence;
17 |     std::vector<uint32_t> queries;
18 |     queries.reserve(total_queries);
19 | 
20 |     std::cout << "reading queries..." << std::endl;
21 |     for (uint32_t i = 0; i != total_queries; ++i) {
22 |         uint32_t q;
23 |         int x = scanf("%d", &q);
24 |         if (x == EOF) break;
25 |         queries.push_back(q);
26 |     }
27 |     std::cout << "DONE" << std::endl;
28 | 
29 |     size_t total = 0;
30 |     std::cout << "performing " << queries.size() << " select queries..."
31 |               << std::endl;
32 |     essentials::timer_type t;
33 |     static const int runs = 3 + 1;
34 |     for (int run = 0; run != runs; ++run) {
35 |         uint64_t q = 0;
36 |         uint32_t value = 0;
37 |         t.start();
38 |         for (uint32_t i = 0; i != index.size(); ++i) {
39 |             auto sequence = index[i];
40 |             for (uint32_t j = 0; j != num_queries_per_sequence; ++j) {
41 |                 sequence.select(queries[q++], value);
42 |                 total += value;
43 |             }
44 |         }
45 |         t.stop();
46 |     }
47 |     std::cout << total << std::endl;
48 |     t.discard_first();
49 |     double avg = t.average();
50 |     std::cout << "Mean per run: " << avg << " [musec]\n";
51 |     std::cout << "Mean per query: " << avg / total_queries << " [musec]";
52 |     std::cout << std::endl;
53 | }
54 | 
55 | int main(int argc, char** argv) {
56 |     int mandatory = 3;
57 |     if (argc < mandatory) {
58 |         std::cout << argv[0]
59 |                   << " index_filename num_queries_per_sequence < queries"
60 |                   << std::endl;
61 |         return 1;
62 |     }
63 | 
64 |     char const* binary_filename = argv[1];
65 |     uint64_t num_queries_per_sequence = std::stoull(argv[2]);
66 |     perf_select(binary_filename, num_queries_per_sequence);
67 | 
68 |     return 0;
69 | }
70 | 


--------------------------------------------------------------------------------
/src/uncompress.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | #include "uncompress.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void uncompress(char const* binary_filename) {
12 |     s_index index;
13 |     index.mmap(binary_filename);
14 | 
15 |     uint64_t universe = index.universe();
16 |     std::cout << "universe size: " << universe << std::endl;
17 |     size_t size_in_64bit_words =
18 |         num_chunks(universe) * constants::chunk_size / 64;
19 |     std::vector<uint64_t> out(size_in_64bit_words, 0);
20 |     uint64_t integers = 0;
21 |     essentials::timer_type t;
22 |     t.start();
23 |     for (size_t i = 0; i != index.size(); ++i) {
24 |         auto sequence = index[i];
25 |         size_t decoded = sequence.uncompress(out.data());
26 |         integers += decoded;
27 |     }
28 |     t.stop();
29 | 
30 |     std::cout << "decoded " << index.size() << " sequences" << std::endl;
31 |     std::cout << "decoded " << integers << " integers" << std::endl;
32 | 
33 |     double elapsed = t.average();
34 |     std::cout << "Elapsed time: " << elapsed / 1000000 << " [sec]\n";
35 |     std::cout << "Mean per sequence: " << elapsed / index.size()
36 |               << " [musec]\n";
37 |     std::cout << "Mean per integer: " << elapsed / integers * 1000 << " [ns]";
38 |     std::cout << std::endl;
39 | }
40 | 
41 | int main(int argc, char** argv) {
42 |     int mandatory = 2;
43 |     if (argc < mandatory) {
44 |         std::cout << argv[0] << " <index_filename>" << std::endl;
45 |         return 1;
46 |     }
47 | 
48 |     uncompress(argv[1]);
49 |     return 0;
50 | }
51 | 


--------------------------------------------------------------------------------
/src/union.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | #include "util.hpp"
 6 | #include "s_index.hpp"
 7 | #include "union.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void perf_union(char const* binary_filename,
12 |                 std::vector<query> const& queries) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     std::vector<uint32_t> out(index.universe());
17 |     size_t total = 0;
18 |     std::cout << "performing " << queries.size() << " pairwise-unions..."
19 |               << std::endl;
20 |     essentials::timer_type t;
21 |     static const int runs = 10 + 1;
22 |     for (int run = 0; run != runs; ++run) {
23 |         t.start();
24 |         for (auto const& q : queries) {
25 |             total += pairwise_union(index[q.i], index[q.j], out.data());
26 |         }
27 |         t.stop();
28 |     }
29 |     std::cout << total << std::endl;
30 |     t.discard_first();
31 |     double avg = t.average();
32 |     std::cout << "Mean per run: " << avg << " [musec]\n";
33 |     std::cout << "Mean per query: " << avg / queries.size() << " [musec]";
34 |     std::cout << std::endl;
35 | }
36 | 
37 | int main(int argc, char** argv) {
38 |     int mandatory = 3;
39 |     if (argc < mandatory) {
40 |         std::cout << argv[0] << " index_filename num_queries < queries"
41 |                   << std::endl;
42 |         return 1;
43 |     }
44 | 
45 |     char const* binary_filename = argv[1];
46 |     uint64_t num_queries = std::stoull(argv[2]);
47 |     std::vector<query> queries;
48 |     queries.reserve(num_queries);
49 | 
50 |     std::cout << "reading queries..." << std::endl;
51 |     for (uint32_t i = 0; i != num_queries; ++i) {
52 |         query q;
53 |         int x = scanf("%d", &q.i);
54 |         int y = scanf("%d", &q.j);
55 |         if (x == EOF or y == EOF) break;
56 |         queries.push_back(q);
57 |     }
58 |     std::cout << "DONE" << std::endl;
59 | 
60 |     perf_union(binary_filename, queries);
61 | 
62 |     return 0;
63 | }
64 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
# Test programs: each target is built from the single source file of the same
# name in this directory.
add_executable(test_decode test_decode.cpp)
add_executable(test_uncompress test_uncompress.cpp)
add_executable(test_intersect test_intersect.cpp)
add_executable(test_intersect_many test_intersect_many.cpp)
add_executable(test_union test_union.cpp)
add_executable(test_union_many test_union_many.cpp)
add_executable(test_select test_select.cpp)
add_executable(test_contains test_contains.cpp)
add_executable(test_next_geq test_next_geq.cpp)
add_executable(test_next_geq_enumerator test_next_geq_enumerator.cpp)
add_executable(test_enumerator test_enumerator.cpp)


--------------------------------------------------------------------------------
/test/test_common.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | #include <sstream>
 5 | #include <algorithm>
 6 | 
 7 | namespace sliced {
 8 | 
 9 | namespace testing {
10 | typedef std::vector<uint32_t> query_type;
11 | 
12 | bool read_query_and_remove_duplicates(query_type& query,
13 |                                       std::istream& is = std::cin) {
14 |     query.clear();
15 |     std::string line;
16 |     if (!std::getline(is, line)) return false;
17 |     std::istringstream iline(line);
18 |     uint32_t index;
19 |     while (iline >> index) query.push_back(index);
20 |     std::sort(query.begin(), query.end());
21 |     query.erase(std::unique(query.begin(), query.end()), query.end());
22 |     return true;
23 | }
24 | }  // namespace testing
25 | 
26 | #define TEST                                                                  \
27 |     int mandatory = 3;                                                        \
28 |     if (argc < mandatory) {                                                   \
29 |         std::cout                                                             \
30 |             << argv[0]                                                        \
31 |             << " index_filename collection_filename [--density d] [--size s]" \
32 |             << std::endl;                                                     \
33 |         return 1;                                                             \
34 |     }                                                                         \
35 |                                                                               \
36 |     char const* index_filename = argv[1];                                     \
37 |     parameters params;                                                        \
38 |     params.collection_filename = argv[2];                                     \
39 |                                                                               \
40 |     for (int i = mandatory; i != argc; ++i) {                                 \
41 |         if (std::string(argv[i]) == "--density") {                            \
42 |             ++i;                                                              \
43 |             params.density = std::stod(argv[i]);                              \
44 |         } else if (std::string(argv[i]) == "--size") {                        \
45 |             ++i;                                                              \
46 |             params.size = std::atoi(argv[i]);                                 \
47 |         }                                                                     \
48 |     }                                                                         \
49 |                                                                               \
50 |     test(index_filename, params);
51 | 
52 | }  // namespace sliced


--------------------------------------------------------------------------------
/test/test_contains.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 4 | 
 5 | #include "test_common.hpp"
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "contains.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void test(char const* binary_filename, parameters const& params) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     mm::file_source<uint32_t> input(params.collection_filename,
17 |                                     mm::advice::sequential);
18 |     uint32_t const* data = input.data();
19 |     assert(data[0] == 1);
20 |     std::cout << "universe size: " << index.universe() << std::endl;
21 |     size_t k = 0;
22 |     bool good = true;
23 | 
24 |     for (size_t i = 2; i < input.size();) {
25 |         uint32_t n = data[i];
26 |         uint32_t universe = data[i + n];
27 |         if (pass(params, n, universe)) {
28 |             auto sequence = index[k];
29 |             uint32_t c = sequence.cardinality();
30 | 
31 |             if (c != n) {
32 |                 good = false;
33 |                 std::cout << "cardinality " << c << ": expected " << n
34 |                           << std::endl;
35 |             }
36 | 
37 |             uint32_t const* ptr = data + i + 1;
38 |             for (size_t j = 0; j != n; ++j) {
39 |                 uint32_t value = *ptr++;
40 |                 bool in = sequence.contains(value);
41 |                 if (!in) {
42 |                     good = false;
43 |                     std::cout << value << " should have been found"
44 |                               << std::endl;
45 |                 }
46 |             }
47 | 
48 |             ++k;
49 |             if (k % 1000 == 0) {
50 |                 std::cout << "checked " << k << " sequences" << std::endl;
51 |             }
52 |         }
53 |         i += n + 1;
54 |     }
55 |     std::cout << "checked " << k << " sequences" << std::endl;
56 |     if (good) std::cout << "everything good" << std::endl;
57 | }
58 | 
59 | int main(int argc, char** argv) {
60 |     TEST return 0;
61 | }
62 | 


--------------------------------------------------------------------------------
/test/test_decode.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 4 | 
 5 | #include "test_common.hpp"
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "decode.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void test(char const* binary_filename, parameters const& params) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     mm::file_source<uint32_t> input(params.collection_filename,
17 |                                     mm::advice::sequential);
18 |     uint32_t const* data = input.data();
19 |     assert(data[0] == 1);
20 |     std::cout << "universe size: " << index.universe() << std::endl;
21 |     std::vector<uint32_t> out(index.universe());
22 |     size_t k = 0;
23 |     bool good = true;
24 | 
25 |     for (size_t i = 2; i < input.size();) {
26 |         uint32_t n = data[i];
27 |         uint32_t universe = data[i + n];
28 |         if (pass(params, n, universe)) {
29 |             auto sequence = index[k];
30 |             size_t decoded = sequence.decode(out.data());
31 | 
32 |             uint32_t c = sequence.cardinality();
33 |             if (c != n) {
34 |                 good = false;
35 |                 std::cout << "cardinality " << c << ": expected " << n
36 |                           << std::endl;
37 |             }
38 | 
39 |             if (decoded != n) {
40 |                 good = false;
41 |                 std::cout << "decoded " << decoded << " integers: expected "
42 |                           << n << std::endl;
43 |             }
44 | 
45 |             uint32_t const* ptr = data + i + 1;
46 |             for (size_t j = 0; j != n; ++j) {
47 |                 uint32_t expected = *ptr++;
48 |                 if (expected != out[j]) {
49 |                     good = false;
50 |                     std::cout << "error at " << j << "/" << n << ": expected "
51 |                               << expected << " but got " << out[j] << std::endl;
52 |                 }
53 |             }
54 | 
55 |             ++k;
56 |             if (k % 1000 == 0) {
57 |                 std::cout << "decoded " << k << " sequences" << std::endl;
58 |             }
59 |         }
60 |         i += n + 1;
61 |     }
62 |     std::cout << "decoded " << k << " sequences" << std::endl;
63 |     if (good) std::cout << "everything good" << std::endl;
64 | }
65 | 
66 | int main(int argc, char** argv) {
67 |     TEST return 0;
68 | }
69 | 


--------------------------------------------------------------------------------
/test/test_enumerator.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 4 | 
 5 | #include "test_common.hpp"
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "enumerator.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void test(char const* binary_filename, parameters const& params) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     mm::file_source<uint32_t> input(params.collection_filename,
17 |                                     mm::advice::sequential);
18 |     uint32_t const* data = input.data();
19 |     assert(data[0] == 1);
20 |     std::cout << "universe size: " << index.universe() << std::endl;
21 |     size_t k = 0;
22 |     bool good = true;
23 | 
24 |     enumerator e;
25 | 
26 |     for (size_t i = 2; i < input.size();) {
27 |         uint32_t n = data[i];
28 |         uint32_t universe = data[i + n];
29 |         if (pass(params, n, universe)) {
30 |             auto sequence = index[k];
31 |             e.init(sequence, index.universe());
32 | 
33 |             uint32_t const* ptr = data + i + 1;
34 |             for (size_t j = 0; j != n; ++j, e.next()) {
35 |                 uint32_t expected = *ptr++;
36 |                 uint32_t got = e.value();
37 |                 if (expected != got) {
38 |                     good = false;
39 |                     std::cout << "error at " << j << "/" << n << ": expected "
40 |                               << expected << " but got " << got << std::endl;
41 |                 }
42 |             }
43 | 
44 |             ++k;
45 |             if (k % 1000 == 0) {
46 |                 std::cout << "decoded " << k << " sequences" << std::endl;
47 |             }
48 |         }
49 |         i += n + 1;
50 |     }
51 |     std::cout << "decoded " << k << " sequences" << std::endl;
52 |     if (good) std::cout << "everything good" << std::endl;
53 | }
54 | 
55 | int main(int argc, char** argv) {
56 |     TEST return 0;
57 | }
58 | 


--------------------------------------------------------------------------------
/test/test_intersect.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <algorithm>
 3 | 
 4 | #include "util.hpp"
 5 | #include "s_index.hpp"
 6 | #include "intersection.hpp"
 7 | 
 8 | #include "intersection_many.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void test(char const* binary_filename, std::vector<query> const& queries) {  // Cross-checks intersection() against std::set_intersection for every query pair.
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     uint64_t universe = index.universe();
17 |     std::vector<uint32_t> out(universe);  // output of the function under test
18 |     std::vector<uint32_t> i(universe);  // decoded copy of sequence q.i
19 |     std::vector<uint32_t> j(universe);  // decoded copy of sequence q.j
20 |     std::vector<uint32_t> expected(universe);  // reference intersection
21 |     bool good = true;
22 | 
23 |     for (auto const& q : queries) {
24 |         size_t i_size = index[q.i].decode(i.data());
25 |         size_t j_size = index[q.j].decode(j.data());
26 |         auto it =
27 |             std::set_intersection(i.begin(), i.begin() + i_size, j.begin(),
28 |                                   j.begin() + j_size, expected.begin());
29 |         size_t expected_size = it - expected.begin();  // number of reference values written
30 | 
31 |         std::vector<s_sequence> sequences(2);
32 |         sequences[0] = index[q.i];
33 |         sequences[1] = index[q.j];
34 |         size_t size = intersection(sequences, out.data());  // result under test
35 |         // size_t size = pairwise_intersection(index[q.i], index[q.j],
36 |         // out.data());
37 | 
38 |         if (expected_size != size) {
39 |             good = false;
40 |             std::cout << "intersection has size " << size << " but expected "
41 |                       << expected_size << std::endl;
42 |         }
43 | 
44 |         for (size_t i = 0; i != size; ++i) {  // note: this index shadows the vector `i` declared above
45 |             if (expected[i] != out[i]) {
46 |                 good = false;
47 |                 std::cout << "error at " << i << "/" << size << ": expected "
48 |                           << expected[i] << " but got " << out[i] << std::endl;
49 |             }
50 |         }
51 |     }
52 |     std::cout << "tested " << queries.size() << " queries" << std::endl;
53 |     if (good) std::cout << "everything good" << std::endl;  // printed only if no mismatch was reported
54 | }
55 | 
56 | int main(int argc, char** argv) {  // Usage: <prog> index_filename num_queries < queries (pairs of integers on stdin).
57 |     int mandatory = 3;
58 |     if (argc < mandatory) {
59 |         std::cout << argv[0] << " index_filename num_queries < queries"
60 |                   << std::endl;
61 |         return 1;
62 |     }
63 | 
64 |     char const* binary_filename = argv[1];
65 |     uint64_t num_queries = std::stoull(argv[2]);
66 |     std::vector<query> queries;
67 |     queries.reserve(num_queries);
68 | 
69 |     std::cout << "reading queries..." << std::endl;
70 |     for (uint64_t i = 0; i != num_queries; ++i) {  // 64-bit counter: same width as num_queries
71 |         query q;
72 |         int x = scanf("%d", &q.i);
73 |         int y = scanf("%d", &q.j);
74 |         if (x != 1 or y != 1) break;  // stop on EOF and also on malformed input (scanf returns 0), so no uninitialised query is stored
75 |         queries.push_back(q);
76 |     }
77 | 
78 |     test(binary_filename, queries);
79 | 
80 |     return 0;
81 | }
82 | 


--------------------------------------------------------------------------------
/test/test_intersect_many.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <algorithm>
  3 | 
  4 | #include "util.hpp"
  5 | #include "s_index.hpp"
  6 | #include "enumerator.hpp"
  7 | #include "test_common.hpp"
  8 | #include "intersection_many.hpp"
  9 | 
 10 | using namespace sliced;
 11 | using namespace sliced::testing;
 12 | 
 13 | std::vector<uint32_t> pairwise_intersection(std::vector<uint32_t> const& l,
 14 |                                             std::vector<uint32_t> const& r) {  // Reference merge-style intersection; returns the values present in both l and r (the merge walk assumes both are sorted ascending).
 15 |     auto it_l = l.begin();
 16 |     auto it_r = r.begin();
 17 |     std::vector<uint32_t> out;
 18 |     while (it_l != l.end() and it_r != r.end()) {  // stop as soon as either input is exhausted
 19 |         if (*it_l < *it_r) {
 20 |             ++it_l;  // left value is smaller: it cannot match, skip it
 21 |         } else if (*it_r < *it_l) {
 22 |             ++it_r;  // right value is smaller: skip it
 23 |         } else {
 24 |             out.push_back(*it_l);  // equal values: part of the intersection
 25 |             ++it_l;
 26 |             ++it_r;
 27 |         }
 28 |     }
 29 |     return out;
 30 | }
 31 | 
 32 | std::vector<uint32_t> intersection(s_index const& index,
 33 |                                    query_type const& query) {  // Reference result: decodes every queried sequence with an enumerator and intersects them pairwise.
 34 |     assert(query.size() >= 2);
 35 |     std::vector<std::vector<uint32_t>> sequences(query.size());
 36 |     for (uint32_t i = 0; i != query.size(); ++i) {
 37 |         auto sequence = index[query[i]];
 38 |         enumerator e;
 39 |         e.init(sequence, index.universe());
 40 |         sequences[i].reserve(sequence.cardinality());
 41 |         while (e.has_next()) {  // materialise the whole sequence
 42 |             sequences[i].push_back(e.value());
 43 |             e.next();
 44 |         }
 45 |         assert(sequences[i].size() == sequence.cardinality());
 46 |         assert(std::is_sorted(sequences[i].begin(), sequences[i].end()));
 47 |     }
 48 | 
 49 |     auto result = pairwise_intersection(sequences[0], sequences[1]);
 50 |     for (uint32_t i = 2; i != sequences.size(); ++i) {  // fold the remaining sequences into the running result
 51 |         result = pairwise_intersection(result, sequences[i]);
 52 |     }
 53 | 
 54 |     return result;
 55 | }
 56 | 
 57 | void test(char const* binary_filename, std::vector<query_type> const& queries) {  // Checks the library intersection() against the enumerator-based reference for every query with at least 2 terms.
 58 |     s_index index;
 59 |     index.mmap(binary_filename);
 60 | 
 61 |     uint64_t universe = index.universe();
 62 |     std::vector<uint32_t> out(universe);  // output buffer of the function under test
 63 |     bool good = true;
 64 | 
 65 |     std::vector<s_sequence> sequences;
 66 |     uint64_t num_queries = 0;  // counts only the queries actually tested
 67 |     for (auto const& q : queries) {
 68 |         if (q.size() < 2) continue;  // the reference implementation asserts at least 2 terms
 69 |         sequences.clear();
 70 |         auto expected = intersection(index, q);
 71 |         for (uint32_t i = 0; i != q.size(); ++i) {
 72 |             sequences.push_back(index[q[i]]);
 73 |         }
 74 |         size_t size = intersection(sequences, out.data());
 75 |         std::cout << "intersection has size " << size << std::endl;
 76 |         if (expected.size() != size) {
 77 |             good = false;
 78 |             std::cout << "intersection has size " << size << " but expected "
 79 |                       << expected.size() << std::endl;
 80 |         }
 81 | 
 82 |         for (size_t i = 0; i != std::min(size, expected.size()); ++i) {  // compare only the common prefix: `expected` may be shorter than `size` after a size mismatch
 83 |             if (expected[i] != out[i]) {
 84 |                 good = false;
 85 |                 std::cout << "error at " << i << "/" << size << ": expected "
 86 |                           << expected[i] << " but got " << out[i] << std::endl;
 87 |             }
 88 |         }
 89 |         ++num_queries;
 90 |     }
 91 |     std::cout << "tested " << num_queries << " queries" << std::endl;
 92 |     if (good) std::cout << "everything good" << std::endl;
 93 | }
 94 | 
 95 | int main(int argc, char** argv) {  // Usage: <prog> index_filename num_queries < queries
 96 |     int mandatory = 3;
 97 |     if (argc < mandatory) {
 98 |         std::cout << argv[0] << " index_filename num_queries < queries"
 99 |                   << std::endl;
100 |         return 1;
101 |     }
102 | 
103 |     char const* binary_filename = argv[1];
104 |     uint64_t num_queries = std::stoull(argv[2]);
105 | 
106 |     std::cout << "reading queries..." << std::endl;
107 |     std::vector<query_type> queries;
108 |     queries.reserve(num_queries);
109 |     query_type q;
110 |     uint64_t i = 0;
111 |     while (i != num_queries and testing::read_query_and_remove_duplicates(q)) {  // stops at num_queries or when the reader returns false
112 |         assert(!q.empty());
113 |         queries.push_back(q);
114 |         ++i;
115 |     }
116 | 
117 |     std::cout << "running test..." << std::endl;
118 |     test(binary_filename, queries);
119 | 
120 |     return 0;
121 | }
122 | 


--------------------------------------------------------------------------------
/test/test_next_geq.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 4 | 
 5 | #include "test_common.hpp"
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "next_geq.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void test(char const* binary_filename, parameters const& params) {  // Checks s_sequence::next_geq against std::lower_bound on the raw collection, for every list accepted by pass().
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     mm::file_source<uint32_t> input(params.collection_filename,
17 |                                     mm::advice::sequential);
18 |     uint32_t const* data = input.data();
19 |     assert(data[0] == 1);  // the first record is expected to have length 1; it is skipped (the scan below starts at word 2)
20 |     std::cout << "universe size: " << index.universe() << std::endl;
21 |     size_t k = 0;  // position in the index of the current accepted list
22 |     bool good = true;
23 | 
24 |     for (size_t i = 2; i < input.size();) {  // each record is a length n followed by n values
25 |         uint32_t n = data[i];
26 |         uint32_t universe = data[i + n];  // last value of the current list
27 |         if (pass(params, n, universe)) {  // pass() is defined elsewhere; presumably the same filter used when the index was built -- confirm
28 |             auto sequence = index[k];
29 |             uint32_t c = sequence.cardinality();
30 | 
31 |             if (c != n) {
32 |                 good = false;
33 |                 std::cout << "cardinality " << c << ": expected " << n
34 |                           << std::endl;
35 |             }
36 | 
37 |             uint32_t const* list = data + i + 1;  // first value of the current list
38 | 
39 |             /* run next_geq for all values in [0, universe] */
40 |             for (size_t lower_bound = 0; lower_bound != universe + 1;
41 |                  ++lower_bound) {
42 |                 auto it = std::lower_bound(list, list + n, lower_bound);  // reference answer; always in range because lower_bound <= last value
43 |                 uint32_t next_geq = sequence.next_geq(lower_bound);
44 |                 assert(next_geq >= lower_bound);
45 |                 if (next_geq != *it) {
46 |                     good = false;
47 |                     std::cout << "error at " << lower_bound << "/" << universe
48 |                               << ": got " << next_geq
49 |                               << " but expected next_geq(" << lower_bound
50 |                               << ") = " << *it << std::endl;
51 |                 }
52 |             }
53 | 
54 |             /* test some out-of-bound values */
55 |             for (size_t lower_bound = universe + 1;
56 |                  lower_bound != universe + 1000000 + 1; lower_bound += 10000) {  // 100 probes, 10000 apart, all above the last value
57 |                 uint32_t next_geq = sequence.next_geq(lower_bound);
58 |                 if (next_geq != constants::not_found) {
59 |                     good = false;
60 |                     std::cout << "error : got " << next_geq << " but expected "
61 |                               << constants::not_found << std::endl;
62 |                 }
63 |             }
64 | 
65 |             ++k;
66 |             if (k % 1000 == 0) {
67 |                 std::cout << "checked " << k << " sequences" << std::endl;
68 |             }
69 |         }
70 |         i += n + 1;  // jump to the next record
71 |     }
72 |     std::cout << "checked " << k << " sequences" << std::endl;
73 |     if (good) std::cout << "everything good" << std::endl;
74 | }
75 | 
76 | int main(int argc, char** argv) {  // Test driver entry point.
77 |     TEST return 0;  // TEST is a macro not defined in this file (presumably from test_common.hpp -- confirm); it is expected to use argc/argv and call test().
78 | }
79 | 


--------------------------------------------------------------------------------
/test/test_next_geq_enumerator.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 4 | 
 5 | #include "test_common.hpp"
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "next_geq_enumerator.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void test(char const* binary_filename, parameters const& params) {  // Checks next_geq_enumerator::next_geq against std::lower_bound on the raw collection, for every list accepted by pass().
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     mm::file_source<uint32_t> input(params.collection_filename,
17 |                                     mm::advice::sequential);
18 |     uint32_t const* data = input.data();
19 |     assert(data[0] == 1);  // the first record is expected to have length 1; it is skipped (the scan below starts at word 2)
20 |     std::cout << "universe size: " << index.universe() << std::endl;
21 |     size_t k = 0;  // position in the index of the current accepted list
22 |     bool good = true;
23 | 
24 |     for (size_t i = 2; i < input.size();) {  // each record is a length n followed by n values
25 |         uint32_t n = data[i];
26 |         uint32_t universe = data[i + n];  // last value of the current list
27 | 
28 |         if (pass(params, n, universe)) {  // pass() is defined elsewhere; presumably the same filter used when the index was built -- confirm
29 |             auto sequence = index[k];
30 |             uint32_t c = sequence.cardinality();
31 | 
32 |             if (c != n) {
33 |                 good = false;
34 |                 std::cout << "cardinality " << c << ": expected " << n
35 |                           << std::endl;
36 |             }
37 | 
38 |             uint32_t const* list = data + i + 1;  // first value of the current list
39 |             next_geq_enumerator e(sequence);  // one enumerator serves all queries below, which are issued in increasing order
40 | 
41 |             /* run next_geq for all values in [0, universe] */
42 |             for (size_t lower_bound = 0; lower_bound != universe + 1;
43 |                  ++lower_bound) {
44 |                 auto it = std::lower_bound(list, list + n, lower_bound);  // reference answer; always in range because lower_bound <= last value
45 |                 uint32_t next_geq = e.next_geq(lower_bound);
46 |                 assert(next_geq >= lower_bound);
47 |                 if (next_geq != *it) {
48 |                     good = false;
49 |                     std::cout << "error at " << lower_bound << "/" << universe
50 |                               << ": got " << next_geq
51 |                               << " but expected next_geq(" << lower_bound
52 |                               << ") = " << *it << std::endl;
53 |                 }
54 |             }
55 | 
56 |             /* test some out-of-bound values */
57 |             for (size_t lower_bound = universe + 1;
58 |                  lower_bound != universe + 1000000 + 1; lower_bound += 10000) {  // 100 probes, 10000 apart, all above the last value
59 |                 uint32_t next_geq = e.next_geq(lower_bound);
60 |                 if (next_geq != constants::not_found) {
61 |                     good = false;
62 |                     std::cout << "error : got " << next_geq << " but expected "
63 |                               << constants::not_found << std::endl;
64 |                 }
65 |             }
66 | 
67 |             ++k;
68 |             if (k % 1000 == 0) {
69 |                 std::cout << "checked " << k << " sequences" << std::endl;
70 |             }
71 |         }
72 |         i += n + 1;  // jump to the next record
73 |     }
74 |     std::cout << "checked " << k << " sequences" << std::endl;
75 |     if (good) std::cout << "everything good" << std::endl;
76 | }
77 | 
78 | int main(int argc, char** argv) {  // Test driver entry point.
79 |     TEST return 0;  // TEST is a macro not defined in this file (presumably from test_common.hpp -- confirm); it is expected to use argc/argv and call test().
80 | }
81 | 


--------------------------------------------------------------------------------
/test/test_select.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 4 | 
 5 | #include "test_common.hpp"
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "select.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void test(char const* binary_filename, parameters const& params) {  // Checks s_sequence::select(j, value) against the j-th value of each raw list accepted by pass().
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     mm::file_source<uint32_t> input(params.collection_filename,
17 |                                     mm::advice::sequential);
18 |     uint32_t const* data = input.data();
19 |     assert(data[0] == 1);  // the first record is expected to have length 1; it is skipped (the scan below starts at word 2)
20 |     std::cout << "universe size: " << index.universe() << std::endl;
21 |     size_t k = 0;  // position in the index of the current accepted list
22 |     bool good = true;
23 | 
24 |     for (size_t i = 2; i < input.size();) {  // each record is a length n followed by n values
25 |         uint32_t n = data[i];
26 |         uint32_t universe = data[i + n];  // last value of the current list
27 |         if (pass(params, n, universe)) {  // pass() is defined elsewhere; presumably the same filter used when the index was built -- confirm
28 |             auto sequence = index[k];
29 |             uint32_t c = sequence.cardinality();
30 | 
31 |             if (c != n) {
32 |                 good = false;
33 |                 std::cout << "cardinality " << c << ": expected " << n
34 |                           << std::endl;
35 |             }
36 | 
37 |             uint32_t const* ptr = data + i + 1;  // walks the raw values of the current list
38 |             uint32_t value = 0;  // out-parameter filled by select()
39 |             for (size_t j = 0; j != n; ++j) {
40 |                 uint32_t expected = *ptr++;
41 | 
42 |                 bool valid = sequence.select(j, value);  // every rank j < n must be reported as valid
43 |                 if (!valid) {
44 |                     good = false;
45 |                     std::cout << "rank " << j << " should have been valid"
46 |                               << std::endl;
47 |                 }
48 | 
49 |                 if (expected != value) {
50 |                     good = false;
51 |                     std::cout << "error at " << j << "/" << n << ": expected "
52 |                               << expected << " but got " << value << std::endl;
53 |                 }
54 |             }
55 | 
56 |             ++k;
57 |             if (k % 1000 == 0) {
58 |                 std::cout << "checked " << k << " sequences" << std::endl;
59 |             }
60 |         }
61 |         i += n + 1;  // jump to the next record
62 |     }
63 |     std::cout << "checked " << k << " sequences" << std::endl;
64 |     if (good) std::cout << "everything good" << std::endl;
65 | }
66 | 
67 | int main(int argc, char** argv) {  // Test driver entry point.
68 |     TEST return 0;  // TEST is a macro not defined in this file (presumably from test_common.hpp -- confirm); it is expected to use argc/argv and call test().
69 | }
70 | 


--------------------------------------------------------------------------------
/test/test_uncompress.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 4 | 
 5 | #include "test_common.hpp"
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "uncompress.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | uint32_t decode_bitmap_and_reset(uint64_t* bitmap, size_t size_in_64bit_words,
13 |                                  uint32_t* out) {  // Writes the positions of all set bits to `out` in increasing order, zeroes the bitmap, and returns how many positions were written.
14 |     uint32_t size = 0;  // number of positions written so far
15 |     uint32_t base = 0;  // bit offset of the current 64-bit word
16 |     for (size_t i = 0; i != size_in_64bit_words; ++i) {
17 |         uint64_t w = bitmap[i];
18 |         while (w != 0) {
19 |             uint64_t t = w & (~w + 1);  // isolates the lowest set bit of w
20 |             int r = __builtin_ctzll(w);  // index of that bit within the word
21 |             out[size++] = r + base;
22 |             w ^= t;  // clear the bit just emitted
23 |         }
24 |         bitmap[i] = 0;  // reset the word so the bitmap can be reused
25 |         base += 64;
26 |     }
27 |     return size;
28 | }
29 | 
30 | void test(char const* binary_filename, parameters const& params) {  // Checks s_sequence::uncompress (to a bitmap) against each raw list accepted by pass().
31 |     s_index index;
32 |     index.mmap(binary_filename);
33 | 
34 |     mm::file_source<uint32_t> input(params.collection_filename,
35 |                                     mm::advice::sequential);
36 |     uint32_t const* data = input.data();
37 | 
38 |     assert(data[0] == 1);  // the first record is expected to have length 1; it is skipped (the scan below starts at word 2)
39 |     uint64_t universe = index.universe();
40 |     std::cout << "universe size: " << universe << std::endl;
41 |     size_t size_in_64bit_words =
42 |         num_chunks(universe) * constants::chunk_size / 64;  // bitmap sized in whole chunks
43 |     std::vector<uint64_t> bitmap(size_in_64bit_words, 0);
44 |     std::vector<uint32_t> out(index.universe());  // positions decoded from the bitmap
45 |     size_t k = 0;  // position in the index of the current accepted list
46 |     bool good = true;
47 | 
48 |     for (size_t i = 2; i < input.size();) {  // each record is a length n followed by n values
49 |         uint32_t n = data[i];
50 |         uint32_t universe = data[i + n];  // last value of the current list; shadows the index-wide `universe` above
51 | 
52 |         if (pass(params, n, universe)) {  // pass() is defined elsewhere; presumably the same filter used when the index was built -- confirm
53 |             auto sequence = index[k];
54 |             size_t decoded = sequence.uncompress(bitmap.data());
55 |             size_t d = decode_bitmap_and_reset(bitmap.data(),
56 |                                                size_in_64bit_words, out.data());  // also clears the bitmap for the next list
57 |             if (decoded != d) {
58 |                 good = false;
59 |                 std::cout << "decoded " << decoded << " integers: expected "
60 |                           << d << std::endl;
61 |             }
62 | 
63 |             if (decoded != n) {
64 |                 good = false;
65 |                 std::cout << "decoded " << decoded << " integers: expected "
66 |                           << n << std::endl;
67 |             }
68 | 
69 |             uint32_t const* ptr = data + i + 1;  // walks the raw values of the current list
70 |             for (size_t j = 0; j != n; ++j) {
71 |                 uint32_t expected = *ptr++;
72 |                 if (expected != out[j]) {
73 |                     good = false;
74 |                     std::cout << "error at " << j << "/" << n << ": expected "
75 |                               << expected << " but got " << out[j] << std::endl;
76 |                 }
77 |             }
78 | 
79 |             ++k;
80 |             if (k % 1000 == 0) {
81 |                 std::cout << "decoded " << k << " sequences" << std::endl;
82 |             }
83 |         }
84 |         i += n + 1;  // jump to the next record
85 |     }
86 |     std::cout << "uncompressed " << k << " sequences" << std::endl;
87 |     if (good) std::cout << "everything good" << std::endl;
88 | }
89 | 
90 | int main(int argc, char** argv) {  // Test driver entry point.
91 |     TEST return 0;  // TEST is a macro not defined in this file (presumably from test_common.hpp -- confirm); it is expected to use argc/argv and call test().
92 | }
93 | 


--------------------------------------------------------------------------------
/test/test_union.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <algorithm>
 3 | 
 4 | #include "util.hpp"
 5 | #include "s_index.hpp"
 6 | #include "union.hpp"
 7 | 
 8 | using namespace sliced;
 9 | 
10 | void test(char const* binary_filename, std::vector<query> const& queries) {  // Cross-checks pairwise_union() against std::set_union for every query pair.
11 |     s_index index;
12 |     index.mmap(binary_filename);
13 | 
14 |     uint64_t universe = index.universe();
15 |     std::vector<uint32_t> out(universe);  // output of the function under test
16 |     std::vector<uint32_t> i(universe);  // decoded copy of sequence q.i
17 |     std::vector<uint32_t> j(universe);  // decoded copy of sequence q.j
18 |     std::vector<uint32_t> expected(universe);  // reference union
19 |     bool good = true;
20 | 
21 |     for (auto const& q : queries) {
22 |         size_t i_size = index[q.i].decode(i.data());
23 |         size_t j_size = index[q.j].decode(j.data());
24 |         auto it = std::set_union(i.begin(), i.begin() + i_size, j.begin(),
25 |                                  j.begin() + j_size, expected.begin());
26 |         size_t expected_size = it - expected.begin();  // number of reference values written
27 | 
28 |         size_t size = pairwise_union(index[q.i], index[q.j], out.data());  // result under test
29 | 
30 |         if (expected_size != size) {
31 |             good = false;
32 |             std::cout << "union has size " << size << " but expected "
33 |                       << expected_size << std::endl;
34 |         }
35 | 
36 |         for (size_t i = 0; i != size; ++i) {  // note: this index shadows the vector `i` declared above
37 |             if (expected[i] != out[i]) {
38 |                 good = false;
39 |                 std::cout << "error at " << i << "/" << size << ": expected "
40 |                           << expected[i] << " but got " << out[i] << std::endl;
41 |             }
42 |         }
43 |     }
44 |     std::cout << "tested " << queries.size() << " queries" << std::endl;
45 |     if (good) std::cout << "everything good" << std::endl;  // printed only if no mismatch was reported
46 | }
47 | 
48 | int main(int argc, char** argv) {  // Usage: <prog> index_filename num_queries < queries (pairs of integers on stdin).
49 |     int mandatory = 3;
50 |     if (argc < mandatory) {
51 |         std::cout << argv[0] << " index_filename num_queries < queries"
52 |                   << std::endl;
53 |         return 1;
54 |     }
55 | 
56 |     char const* binary_filename = argv[1];
57 |     uint64_t num_queries = std::stoull(argv[2]);
58 |     std::vector<query> queries;
59 |     queries.reserve(num_queries);
60 | 
61 |     std::cout << "reading queries..." << std::endl;
62 |     for (uint64_t i = 0; i != num_queries; ++i) {  // 64-bit counter: same width as num_queries
63 |         query q;
64 |         int x = scanf("%d", &q.i);
65 |         int y = scanf("%d", &q.j);
66 |         if (x != 1 or y != 1) break;  // stop on EOF and also on malformed input (scanf returns 0), so no uninitialised query is stored
67 |         queries.push_back(q);
68 |     }
69 | 
70 |     test(binary_filename, queries);
71 | 
72 |     return 0;
73 | }
74 | 


--------------------------------------------------------------------------------
/test/test_union_many.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <algorithm>
  3 | #include <sstream>
  4 | 
  5 | #include "util.hpp"
  6 | #include "s_index.hpp"
  7 | #include "enumerator.hpp"
  8 | #include "test_common.hpp"
  9 | #include "union_many.hpp"
 10 | 
 11 | using namespace sliced;
 12 | using namespace sliced::testing;
 13 | 
 14 | typedef std::vector<uint32_t> query_type;  // A query is a list of sequence identifiers. NOTE(review): test_intersect_many.cpp uses query_type without a local typedef, so test_common.hpp presumably declares it too -- confirm.
 15 | 
 16 | std::vector<uint32_t> pairwise_union(std::vector<uint32_t> const& l,
 17 |                                      std::vector<uint32_t> const& r) {  // Reference merge-style union; values present in both inputs are emitted once (the merge walk assumes both are sorted ascending).
 18 |     auto it_l = l.begin();
 19 |     auto it_r = r.begin();
 20 |     std::vector<uint32_t> out;
 21 |     while (true) {
 22 |         if (it_l == l.end()) {  // left exhausted: append the rest of the right input and stop
 23 |             std::copy(it_r, r.end(), std::back_inserter(out));
 24 |             break;
 25 |         }
 26 |         if (it_r == r.end()) {  // right exhausted: append the rest of the left input and stop
 27 |             std::copy(it_l, l.end(), std::back_inserter(out));
 28 |             break;
 29 |         }
 30 |         if (*it_l < *it_r) {
 31 |             out.push_back(*it_l);
 32 |             ++it_l;
 33 |         } else if (*it_r < *it_l) {
 34 |             out.push_back(*it_r);
 35 |             ++it_r;
 36 |         } else {  // equal values: emit once, advance both sides
 37 |             out.push_back(*it_l);
 38 |             ++it_l;
 39 |             ++it_r;
 40 |         }
 41 |     }
 42 |     return out;
 43 | }
 44 | 
 45 | std::vector<uint32_t> union_many(s_index const& index,
 46 |                                  query_type const& query) {  // Reference result: decodes every queried sequence with an enumerator and unions them one by one.
 47 |     assert(query.size() >= 2);
 48 |     std::vector<std::vector<uint32_t>> sequences(query.size());
 49 |     for (uint32_t i = 0; i != query.size(); ++i) {
 50 |         auto sequence = index[query[i]];
 51 |         enumerator e;
 52 |         e.init(sequence, index.universe());
 53 |         sequences[i].reserve(sequence.cardinality());
 54 |         while (e.has_next()) {  // materialise the whole sequence
 55 |             sequences[i].push_back(e.value());
 56 |             e.next();
 57 |         }
 58 |         assert(sequences[i].size() == sequence.cardinality());
 59 |         assert(std::is_sorted(sequences[i].begin(), sequences[i].end()));
 60 |     }
 61 | 
 62 |     std::vector<uint32_t> result;  // starts empty, so the first union copies sequences[0]
 63 |     for (uint32_t i = 0; i != sequences.size(); ++i) {
 64 |         result = pairwise_union(result, sequences[i]);
 65 |     }
 66 | 
 67 |     return result;
 68 | }
 69 | 
 70 | void test(char const* binary_filename, std::vector<query_type> const& queries) {  // Checks the library union_many() against the enumerator-based reference for every query with at least 2 terms.
 71 |     s_index index;
 72 |     index.mmap(binary_filename);
 73 | 
 74 |     uint64_t universe = index.universe();
 75 |     std::vector<uint32_t> out(universe);  // output buffer of the function under test
 76 |     bool good = true;
 77 | 
 78 |     std::vector<s_sequence> sequences;
 79 |     uint64_t num_queries = 0;  // counts only the queries actually tested
 80 |     for (auto const& q : queries) {
 81 |         if (q.size() < 2) continue;  // the reference implementation asserts at least 2 terms
 82 |         sequences.clear();
 83 |         auto expected = union_many(index, q);
 84 |         for (uint32_t i = 0; i != q.size(); ++i) {
 85 |             sequences.push_back(index[q[i]]);
 86 |         }
 87 |         size_t size = union_many(sequences, out.data());
 88 |         std::cout << "union has size " << size << std::endl;
 89 |         if (expected.size() != size) {
 90 |             good = false;
 91 |             std::cout << "union has size " << size << " but expected "
 92 |                       << expected.size() << std::endl;
 93 |         }
 94 | 
 95 |         for (size_t i = 0; i != std::min(size, expected.size()); ++i) {  // compare only the common prefix: `expected` may be shorter than `size` after a size mismatch
 96 |             if (expected[i] != out[i]) {
 97 |                 good = false;
 98 |                 std::cout << "error at " << i << "/" << size << ": expected "
 99 |                           << expected[i] << " but got " << out[i] << std::endl;
100 |             }
101 |         }
102 |         ++num_queries;
103 |     }
104 |     std::cout << "tested " << num_queries << " queries" << std::endl;
105 |     if (good) std::cout << "everything good" << std::endl;
106 | }
107 | 
108 | int main(int argc, char** argv) {  // Usage: <prog> index_filename num_queries < queries
109 |     int mandatory = 3;
110 |     if (argc < mandatory) {
111 |         std::cout << argv[0] << " index_filename num_queries < queries"
112 |                   << std::endl;
113 |         return 1;
114 |     }
115 | 
116 |     char const* binary_filename = argv[1];
117 |     uint64_t num_queries = std::stoull(argv[2]);
118 | 
119 |     std::cout << "reading queries..." << std::endl;
120 |     std::vector<query_type> queries;
121 |     queries.reserve(num_queries);
122 |     query_type q;
123 |     uint64_t i = 0;
124 |     while (i != num_queries and testing::read_query_and_remove_duplicates(q)) {  // stops at num_queries or when the reader returns false
125 |         assert(!q.empty());
126 |         queries.push_back(q);
127 |         ++i;
128 |     }
129 | 
130 |     std::cout << "running test..." << std::endl;
131 |     test(binary_filename, queries);
132 | 
133 |     return 0;
134 | }
135 | 


--------------------------------------------------------------------------------
/tools/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(gen_uniform_data gen_uniform_data.cpp)  # one standalone executable per tool source; no libraries are linked in this file
2 | add_executable(gen_clustered_data gen_clustered_data.cpp)
3 | add_executable(gen_random_pairwise_queries gen_random_pairwise_queries.cpp)
4 | add_executable(gen_random_select_queries gen_random_select_queries.cpp)
5 | add_executable(gen_random_next_geq_queries gen_random_next_geq_queries.cpp)


--------------------------------------------------------------------------------
/tools/gen_clustered_data.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <fstream>
  3 | #include <vector>
  4 | #include <algorithm>
  5 | 
  6 | #include "building_util.hpp"
  7 | 
  8 | #include "../external/essentials/include/essentials.hpp"
  9 | 
 10 | enum event_code { skip = 0, include_all = 1, include_some = 2 };  // per-chunk action in gen(): add nothing, add every value of the chunk, or add a random sample of it
 11 | 
 12 | struct event {  // an action together with the probability of drawing it
 13 |     int code;  // one of the event_code values
 14 |     float prob;  // probability mass of this event; gen() sets them to 0.3, 0.2 and 0.5
 15 | };
 16 | 
 17 | int gen_event(std::vector<event> const& events, float p) {  // Maps p in [0,1] to the code of the first event whose cumulative probability reaches p.
 18 |     assert(!events.empty() and p >= 0 and p <= 1.0);
 19 |     float cumulative = events[0].prob;
 20 |     size_t i = 0;
 21 |     while (cumulative < p and i + 1 != events.size()) {  // never step past the last event
 22 |         ++i;
 23 |         cumulative += events[i].prob;
 24 |     }
 25 |     // If float rounding leaves the total just below p, the last event is returned instead of reading out of bounds.
 26 |     return events[i].code;
 27 | }
 28 | 
 29 | void gen(uint32_t num_lists, uint32_t universe, char const* output_filename,
 30 |          bool binary) {  // Writes num_lists sorted, duplicate-free lists over [0, universe) to output_filename, chunk by chunk, in binary or text form.
 31 |     using namespace sliced;
 32 | 
 33 |     std::ofstream out;
 34 |     if (binary) {
 35 |         out.open(std::string(output_filename),
 36 |                  std::ios_base::binary | std::ios_base::out);
 37 |     } else {
 38 |         out.open(std::string(output_filename));
 39 |     }
 40 | 
 41 |     if (!out.is_open()) {
 42 |         std::cout << "error in opening file" << std::endl;
 43 |         return;
 44 |     }
 45 | 
 46 |     if (binary) {
 47 |         // header: singleton list containing the universe
 48 |         write_uint(uint32_t(1), out);
 49 |         write_uint(universe, out);
 50 |     }
 51 | 
 52 |     essentials::uniform_int_rng<uint32_t> length(10, constants::chunk_size / 4);  // how many values an include_some chunk draws
 53 |     typedef essentials::uniform_int_rng<uint32_t> random_int;
 54 |     // The per-chunk value generator is a stack object inside the include_some case, so nothing is leaked.
 55 | 
 56 |     std::vector<uint32_t> list;
 57 |     list.reserve(universe);
 58 | 
 59 |     std::vector<event> events(3);
 60 |     events[0] = {event_code::skip, 0.3};
 61 |     events[1] = {event_code::include_all, 0.2};
 62 |     events[2] = {event_code::include_some, 0.5};
 63 | 
 64 |     for (uint32_t i = 0; i != num_lists; ++i) {
 65 |         list.clear();
 66 | 
 67 |         slice s = {0, constants::chunk_size};  // current chunk [left, right)
 68 |         while (s.left < universe) {
 69 |             float p = float(rand()) / RAND_MAX;
 70 |             int code = gen_event(events, p);
 71 |             uint32_t n = 0;
 72 |             uint32_t end = std::min<uint32_t>(s.right, universe);  // clip the last chunk to the universe
 73 |             switch (code) {
 74 |                 case event_code::skip:
 75 |                     break;
 76 |                 case event_code::include_all:
 77 |                     for (uint32_t k = s.left; k != end; ++k) {
 78 |                         list.push_back(k);
 79 |                     }
 80 |                     break;
 81 |                 case event_code::include_some: {  // braces scope the generator to this case
 82 |                     random_int element(s.left, end,
 83 |                                        essentials::get_random_seed());  // NOTE(review): if uniform_int_rng is inclusive, `end` itself can be drawn -- confirm
 84 |                     n = length.gen();
 85 |                     for (uint32_t k = 0; k != n; ++k) {
 86 |                         list.push_back(element.gen());
 87 |                     }
 88 |                 } break;
 89 |                 default:
 90 |                     assert(false);
 91 |                     __builtin_unreachable();
 92 |             }
 93 |             s.left = s.right;
 94 |             s.right += constants::chunk_size;
 95 |         }
 96 | 
 97 |         std::sort(list.begin(), list.end());
 98 |         list.erase(std::unique(list.begin(), list.end()), list.end());  // drop the unspecified tail left by std::unique
 99 |         uint32_t n = list.size();
100 | 
101 |         if (binary) {
102 |             write_uint(n, out);
103 |             char const* begin = reinterpret_cast<char const*>(list.data());
104 |             out.write(begin, n * sizeof(uint32_t));
105 |         } else {
106 |             out << n << "\n";
107 |             for (auto x : list) out << x << "\n";  // exactly n values now that the tail has been erased
108 |         }
109 |     }
110 |     out.close();
111 | }
112 | 
113 | int main(int argc, char** argv) {
114 |     int mandatory = 4;
115 |     if (argc < mandatory) {
116 |         std::cout << argv[0] << " num_lists universe output_filename --binary"
117 |                   << std::endl;
118 |         return 1;
119 |     }
120 | 
121 |     uint32_t num_lists = std::atoi(argv[1]);
122 |     uint32_t universe = std::atoi(argv[2]);
123 |     char const* output_filename = argv[3];
124 |     bool binary = false;
125 | 
126 |     for (int i = mandatory; i != argc; ++i) {
127 |         if (std::string(argv[i]) == "--binary") binary = true;
128 |     }
129 | 
130 |     gen(num_lists, universe, output_filename, binary);
131 | 
132 |     return 0;
133 | }
134 | 


--------------------------------------------------------------------------------
/tools/gen_random_next_geq_queries.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <algorithm>
 3 | 
 4 | #include "../external/essentials/include/essentials.hpp"
 5 | 
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | #include "decode.hpp"
 9 | 
10 | using namespace sliced;
11 | 
12 | void generate(char const* binary_filename, uint32_t num_queries_per_sequence) {
13 |     s_index index;
14 |     index.mmap(binary_filename);
15 | 
16 |     std::vector<uint32_t> queries;
17 |     queries.reserve(index.size() * num_queries_per_sequence);
18 |     std::vector<uint32_t> out(index.universe());
19 | 
20 |     for (size_t i = 0; i != index.size(); ++i) {
21 |         auto sequence = index[i];
22 |         size_t decoded = sequence.decode(out.data());
23 |         essentials::uniform_int_rng<uint32_t> rng(0, decoded - 1);
24 |         for (uint32_t k = 0; k != num_queries_per_sequence; ++k) {
25 |             queries.push_back(out[rng.gen()]);
26 |         }
27 |     }
28 | 
29 |     // std::random_shuffle(queries.begin(), queries.end());
30 |     for (auto q : queries) std::cout << q << "\n";
31 | }
32 | 
33 | int main(int argc, char** argv) {
34 |     int mandatory = 3;
35 |     if (argc < mandatory) {
36 |         std::cout << argv[0] << " index_filename num_queries_per_sequence"
37 |                   << std::endl;
38 |         return 1;
39 |     }
40 | 
41 |     generate(argv[1], std::atoi(argv[2]));
42 |     return 0;
43 | }
44 | 


--------------------------------------------------------------------------------
/tools/gen_random_pairwise_queries.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "../external/essentials/include/essentials.hpp"
 4 | 
 5 | void generate(uint32_t num_queries, uint32_t num_sequences) {
 6 |     essentials::uniform_int_rng<uint32_t> rng(0, num_sequences - 1);
 7 |     for (size_t i = 0; i != num_queries; ++i) {
 8 |         std::cout << rng.gen() << "\t" << rng.gen() << "\n";
 9 |     }
10 | }
11 | 
12 | int main(int argc, char** argv) {
13 |     int mandatory = 3;
14 |     if (argc < mandatory) {
15 |         std::cout << argv[0] << " num_queries num_sequences" << std::endl;
16 |         return 1;
17 |     }
18 | 
19 |     generate(std::atoi(argv[1]), std::atoi(argv[2]));
20 |     return 0;
21 | }
22 | 


--------------------------------------------------------------------------------
/tools/gen_random_select_queries.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <algorithm>
 3 | 
 4 | #include "../external/essentials/include/essentials.hpp"
 5 | 
 6 | #include "util.hpp"
 7 | #include "s_index.hpp"
 8 | 
 9 | using namespace sliced;
10 | 
11 | void generate(char const* binary_filename, uint32_t num_queries_per_sequence) {
12 |     s_index index;
13 |     index.mmap(binary_filename);
14 | 
15 |     std::vector<uint32_t> queries;
16 |     queries.reserve(index.size() * num_queries_per_sequence);
17 | 
18 |     for (size_t i = 0; i != index.size(); ++i) {
19 |         auto sequence = index[i];
20 |         uint32_t cardinality = sequence.cardinality();
21 |         essentials::uniform_int_rng<uint32_t> rng(0, cardinality - 1);
22 |         for (uint32_t k = 0; k != num_queries_per_sequence; ++k) {
23 |             queries.push_back(rng.gen());
24 |         }
25 |     }
26 | 
27 |     // std::random_shuffle(queries.begin(), queries.end());
28 |     for (auto q : queries) std::cout << q << "\n";
29 | }
30 | 
31 | int main(int argc, char** argv) {
32 |     int mandatory = 3;
33 |     if (argc < mandatory) {
34 |         std::cout << argv[0] << " index_filename num_queries_per_sequence"
35 |                   << std::endl;
36 |         return 1;
37 |     }
38 | 
39 |     generate(argv[1], std::atoi(argv[2]));
40 |     return 0;
41 | }
42 | 


--------------------------------------------------------------------------------
/tools/gen_uniform_data.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <vector>
 4 | #include <algorithm>
 5 | 
 6 | #include "building_util.hpp"
 7 | 
 8 | #include "../external/essentials/include/essentials.hpp"
 9 | 
10 | void gen(uint32_t num_lists, uint32_t min_length, uint32_t max_length,
11 |          uint32_t universe, char const* output_filename) {
12 |     using namespace sliced;
13 |     std::ofstream out(std::string(output_filename),
14 |                       std::ios_base::binary | std::ios_base::out);
15 |     // header: singleton list containing the universe
16 |     write_uint(uint32_t(1), out);
17 |     write_uint(universe, out);
18 |     essentials::uniform_int_rng<uint32_t> length(min_length, max_length);
19 |     essentials::uniform_int_rng<uint32_t> element(0, universe);
20 |     std::vector<uint32_t> list;
21 |     list.reserve(max_length);
22 |     for (uint32_t i = 0; i != num_lists; ++i) {
23 |         list.clear();
24 |         uint32_t n = length.gen();
25 |         for (uint32_t k = 0; k != n; ++k) list.push_back(element.gen());
26 |         std::sort(list.begin(), list.end());
27 |         auto it = std::unique(list.begin(), list.end());
28 |         n = std::distance(list.begin(), it);
29 |         write_uint(n, out);
30 |         char const* begin = reinterpret_cast<char const*>(list.data());
31 |         out.write(begin, n * sizeof(uint32_t));
32 |     }
33 |     out.close();
34 | }
35 | 
36 | int main(int argc, char** argv) {
37 |     int mandatory = 6;
38 |     if (argc < mandatory) {
39 |         std::cout << argv[0]
40 |                   << " num_lists min_length max_length universe output_filename"
41 |                   << std::endl;
42 |         return 1;
43 |     }
44 | 
45 |     uint32_t num_lists = std::atoi(argv[1]);
46 |     uint32_t min_length = std::atoi(argv[2]);
47 |     uint32_t max_length = std::atoi(argv[3]);
48 |     uint32_t universe = std::atoi(argv[4]);
49 |     char const* output_filename = argv[5];
50 | 
51 |     gen(num_lists, min_length, max_length, universe, output_filename);
52 | 
53 |     return 0;
54 | }
55 | 


--------------------------------------------------------------------------------