├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── external └── CMakeLists.txt ├── include ├── dint │ ├── block_statistics.hpp │ ├── dict_freq_index.hpp │ ├── dict_posting_list.hpp │ ├── dictionary_builders.hpp │ ├── dictionary_building_utils.hpp │ ├── dictionary_types.hpp │ ├── dint_codecs.hpp │ ├── dint_configuration.hpp │ ├── hash_utils.hpp │ ├── multi_dictionary.hpp │ ├── rectangular_dictionary.hpp │ ├── single_dictionary.hpp │ └── statistics_collectors.hpp ├── ds2i │ ├── LICENSE │ ├── VarIntG8IU.h │ ├── all_ones_sequence.hpp │ ├── binary_blocks_collection.hpp │ ├── binary_collection.hpp │ ├── binary_freq_collection.hpp │ ├── bitvector_collection.hpp │ ├── block_codecs.hpp │ ├── block_freq_index.hpp │ ├── block_posting_list.hpp │ ├── block_profiler.hpp │ ├── bm25.hpp │ ├── compact_elias_fano.hpp │ ├── compact_ranked_bitvector.hpp │ ├── configuration.hpp │ ├── dec_time_prediction.hpp │ ├── dec_time_regression.py │ ├── ds2i_config.hpp.in │ ├── freq_index.hpp │ ├── global_parameters.hpp │ ├── index_build_utils.hpp │ ├── indexed_sequence.hpp │ ├── integer_codes.hpp │ ├── interpolative_coding.hpp │ ├── mixed_block.hpp │ ├── optimal_partition.hpp │ ├── partitioned_sequence.hpp │ ├── positive_sequence.hpp │ ├── qmx.hpp │ ├── qmx_codec.hpp │ ├── queries.hpp │ ├── semiasync_queue.hpp │ ├── sequence_collection.hpp │ ├── strict_elias_fano.hpp │ ├── strict_sequence.hpp │ ├── uniform_partitioned_sequence.hpp │ ├── varintgb.h │ ├── verify_collection.hpp │ └── wand_data.hpp ├── index_types.hpp └── util.hpp ├── scripts ├── build.py ├── build_and_query.py ├── build_dint_indexes.py ├── collect_timings.py ├── query.py ├── stat_all.py └── test_all.py ├── src ├── CMakeLists.txt ├── check_index.cpp ├── create_freq_index.cpp ├── create_wand_data.cpp ├── dict_perf_test.cpp ├── pair_wise_intersect.cpp └── queries.cpp ├── test ├── CMakeLists.txt ├── test_block_codecs.cpp ├── test_block_freq_index.cpp ├── test_block_posting_list.cpp ├── 
test_compact_elias_fano.cpp ├── test_compact_ranked_bitvector.cpp ├── test_data │ ├── queries │ ├── test_collection.docs │ ├── test_collection.freqs │ └── test_collection.sizes ├── test_freq_index.cpp ├── test_generic_sequence.hpp ├── test_indexed_sequence.cpp ├── test_partitioned_sequence.cpp ├── test_positive_sequence.cpp ├── test_ranked_queries.cpp ├── test_sequence_collection.cpp ├── test_strict_elias_fano.cpp └── test_uniform_partitioned_sequence.cpp └── vroom_env ├── CMakeLists.txt ├── check_encoded_data.cpp ├── codecs.hpp ├── decode.cpp ├── dint_codecs.hpp ├── encode.cpp ├── jobs.hpp └── statistics.hpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -4 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Empty 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | 
BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeComma 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: true 45 | BreakConstructorInitializers: BeforeComma 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 80 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: false 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^' 66 | Priority: 2 67 | - Regex: '^<.*\.h>' 68 | Priority: 1 69 | - Regex: '^<.*' 70 | Priority: 2 71 | - Regex: '.*' 72 | Priority: 3 73 | IncludeIsMainRegex: '([-_](test|unittest))?$' 74 | IndentCaseLabels: true 75 | IndentPPDirectives: None 76 | IndentWidth: 4 77 | IndentWrappedFunctionNames: false 78 | JavaScriptQuotes: Leave 79 | JavaScriptWrapImports: true 80 | KeepEmptyLinesAtTheStartOfBlocks: false 81 | MacroBlockBegin: '' 82 | MacroBlockEnd: '' 83 | MaxEmptyLinesToKeep: 1 84 | NamespaceIndentation: None 85 | ObjCBinPackProtocolList: Never 86 | ObjCBlockIndentWidth: 2 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: true 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyBreakTemplateDeclaration: 10 95 | PenaltyExcessCharacter: 1000000 96 | PenaltyReturnTypeOnItsOwnLine: 200 97 | PointerAlignment: Left 98 | RawStringFormats: 99 | - Language: Cpp 100 | Delimiters: 101 | - cc 102 | - CC 103 | - cpp 104 | - Cpp 105 | - CPP 106 | - 'c++' 107 | - 'C++' 108 | CanonicalDelimiter: '' 109 | BasedOnStyle: google 
110 | - Language: TextProto 111 | Delimiters: 112 | - pb 113 | - PB 114 | - proto 115 | - PROTO 116 | EnclosingFunctions: 117 | - EqualsProto 118 | - EquivToProto 119 | - PARSE_PARTIAL_TEXT_PROTO 120 | - PARSE_TEST_PROTO 121 | - PARSE_TEXT_PROTO 122 | - ParseTextOrDie 123 | - ParseTextProtoOrDie 124 | CanonicalDelimiter: '' 125 | BasedOnStyle: google 126 | ReflowComments: true 127 | SortIncludes: false 128 | SortUsingDeclarations: false 129 | SpaceAfterCStyleCast: false 130 | SpaceAfterTemplateKeyword: true 131 | SpaceBeforeAssignmentOperators: true 132 | SpaceBeforeCpp11BracedList: false 133 | SpaceBeforeCtorInitializerColon: true 134 | SpaceBeforeInheritanceColon: true 135 | SpaceBeforeParens: ControlStatements 136 | SpaceBeforeRangeBasedForLoopColon: true 137 | SpaceInEmptyParentheses: false 138 | SpacesBeforeTrailingComments: 2 139 | SpacesInAngles: false 140 | SpacesInContainerLiterals: true 141 | SpacesInCStyleCastParentheses: false 142 | SpacesInParentheses: false 143 | SpacesInSquareBrackets: false 144 | Standard: Auto 145 | StatementMacros: 146 | - Q_UNUSED 147 | - QT_REQUIRE_VERSION 148 | TabWidth: 8 149 | UseTab: Never 150 | ... 
151 | 152 | 153 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | data/ 3 | ds2i_config.hpp 4 | 5 | # Python junk 6 | *.pyc 7 | 8 | # CMake junk 9 | CMakeCache.txt 10 | CMakeFiles/ 11 | *.cmake 12 | Makefile 13 | Testing/ 14 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/succinct"] 2 | path = external/succinct 3 | url = https://github.com/ot/succinct.git 4 | [submodule "external/streamvbyte"] 5 | path = external/streamvbyte 6 | url = https://github.com/lemire/streamvbyte.git 7 | [submodule "external/MaskedVByte"] 8 | path = external/MaskedVByte 9 | url = https://github.com/lemire/MaskedVByte.git 10 | [submodule "external/FastPFor"] 11 | path = external/FastPFor 12 | url = https://github.com/ot/FastPFor.git 13 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(DS2I) 3 | 4 | configure_file( 5 | ${DS2I_SOURCE_DIR}/include/ds2i/ds2i_config.hpp.in 6 | ${DS2I_SOURCE_DIR}/include/ds2i/ds2i_config.hpp 7 | ESCAPE_QUOTES) 8 | 9 | if(NOT CMAKE_BUILD_TYPE) 10 | set(CMAKE_BUILD_TYPE "Release") 11 | endif() 12 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} ) 13 | 14 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 15 | 16 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 18 | endif () 19 | 20 | if (UNIX) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") 24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
-ggdb") 26 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces") 27 | 28 | if (USE_SANITIZERS) 29 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 30 | endif () 31 | 32 | endif() 33 | 34 | find_package(Boost COMPONENTS iostreams unit_test_framework filesystem system log log_setup date_time chrono REQUIRED) 35 | include_directories(${Boost_INCLUDE_DIRS}) 36 | link_directories(${Boost_LIBRARY_DIRS}) 37 | 38 | include_directories(${DS2I_SOURCE_DIR}/include/ds2i) 39 | include_directories(${DS2I_SOURCE_DIR}/include/dint) 40 | 41 | add_subdirectory(external) 42 | 43 | include_directories(${DS2I_SOURCE_DIR}/external 44 | ${DS2I_SOURCE_DIR}/include 45 | ) 46 | 47 | add_subdirectory(src) 48 | add_subdirectory(vroom_env) 49 | 50 | enable_testing() 51 | add_subdirectory(test) 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 2 | * Giulio Ermanno Pibiri, 3 | * Matthias Petri, 4 | * Alistair Moffat, 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | `dint` - Dictionary of INTeger sequences 2 | ------- 3 | 4 | This is the code used for the experiments in the paper [*Fast Dictionary-based Compression for Inverted Indexes*](http://pages.di.unipi.it/pibiri/papers/WSDM19.pdf) [1], by Giulio Ermanno Pibiri, Matthias Petri and Alistair Moffat. 5 | 6 | This guide is meant to provide a brief overview of the library and to illustrate its functionalities through some examples. 7 | ##### Table of contents 8 | * [Building the code](#building-the-code) 9 | * [Input data format](#input-data-format) 10 | * [Building the indexes](#building-the-indexes) 11 | * [Vroom environment](#vroom-environment) 12 | * [Benchmark](#benchmark) 13 | * [Authors](#authors) 14 | * [Bibliography](#bibliography) 15 | 16 | Building the code 17 | ----------------- 18 | 19 | The code is tested on Linux Ubuntu with `gcc` 7.3.0. The following dependencies are needed for the build: `CMake` >= 2.8 and `Boost`. 20 | 21 | The code is largely based on the [`ds2i`](https://github.com/ot/ds2i) project, so it depends on several submodules. If you have cloned the repository without `--recursive`, you will need to perform the following commands before 22 | building: 23 | 24 | $ git submodule init 25 | $ git submodule update 26 | 27 | To build the code on Unix systems (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following: 28 | 29 | $ mkdir build 30 | $ cd build 31 | $ cmake .. -DCMAKE_BUILD_TYPE=Release 32 | $ make -j[number of jobs] 33 | 34 | Setting `[number of jobs]` is recommended, e.g., `make -j4`. 35 | 36 | Unless otherwise specified, for the rest of this guide we assume that we type the terminal commands of the following examples from the created directory `build`. 
37 | 38 | 39 | Input data format 40 | ----------------- 41 | The collection containing the docID and frequency lists follows the format of [`ds2i`](https://github.com/ot/ds2i), that is, all integer lists are prefixed by their length written as 32-bit little-endian unsigned integers: 42 | 43 | * `.docs` starts with a singleton binary sequence where its only 44 | integer is the number of documents in the collection. It is then followed by 45 | one binary sequence for each posting list, in order of term-ids. Each posting 46 | list contains the sequence of docIDs containing the term. 47 | 48 | * `.freqs` is composed of one binary sequence per posting list, where 49 | each sequence contains the occurrence counts of the postings, aligned with the 50 | previous file (note however that this file does not have an additional 51 | singleton list at its beginning). 52 | 53 | The `data` subfolder contains an example of such collection organization, for a total of 113,306 sequences and 3,327,520 postings. The `queries` file is, instead, a collection of 500 (multi-term) queries. 54 | 55 | For the following examples, we assume that we are working with the sample data contained in `data`. 56 | 57 | Building the indexes 58 | -------------------- 59 | 60 | The executable `create_freq_index` should be used to build the indexes, given an input collection. To see the parameters needed by the executable, just type 61 | 62 | $ ./create_freq_index 63 | 64 | without any parameters. You will get: 65 | 66 | $ Usage ./create_freq_index: 67 | $ [output_filename] [--check] 68 | 69 | Below we show some examples. 70 | 71 | ##### Example 1. 
72 | The commands 73 | 74 | $ ./create_freq_index single_rect_dint ../test/test_data/test_collection single_rect_dint.bin 75 | $ ./create_freq_index single_packed_dint ../test/test_data/test_collection single_packed_dint.bin 76 | $ ./create_freq_index multi_packed_dint ../test/test_data/test_collection multi_packed_dint.bin 77 | 78 | can be used to build three DINT indexes that use: a single, rectangular dictionary; a single, packed dictionary; and multiple, packed dictionaries, respectively. 79 | 80 | ##### Example 2. 81 | The command 82 | 83 | $ ./queries single_packed_dint and single_packed_dint.bin < ../test/test_data/queries 84 | 85 | performs the boolean AND queries contained in the data file `queries` over the index serialized to `single_packed_dint.bin`. 86 | 87 | Vroom environment 88 | ----------------- 89 | The "vroom" environment is designed to test the raw sequential decoding speed 90 | of the encoders. See the folder `vroom_env` and the following example. 91 | 92 | ##### Example. 93 | After building a `single_packed_dint`, we can encode all the sequences in a collection 94 | (without any blocking mechanism), using the following command 95 | 96 | $ ./encode single_packed_dint ../test/test_data/test_collection.docs --dict dict.test_collection.docs.single_packed.DSF-65536-16 --out test.bin 97 | 98 | that serializes all the compressed lists to the file `test.bin`. Then we can sequentially decode all the lists in that file by using 99 | 100 | $ ./decode single_packed_dint test.bin --dict dict.test_collection.docs.single_packed.DSF-65536-16 101 | 102 | Benchmark 103 | --------- 104 | 105 | A comparison between the space of `single_rect`, `single_packed` and `multi_packed` on the provided `test_collection` is shown below (`bpi` stands for "bits per integer"). 106 | For this small test collection, we exclude the space for the 107 | dictionaries. 
108 | Results have been collected on a machine with an Intel i7-7700 processor clocked at 3.6 GHz and running Linux 4.4.0, 64 bits. The code was compiled using the highest optimization setting (see CMakeLists.txt). 109 | 110 | | **Index** |**docs [bpi]** |**freqs [bpi]** | 111 | |-------------------|---------------:|----------------:| 112 | |`single_rect` | 5.939 | 3.047 | 113 | |`single_packed` | 5.939 | 3.047 | 114 | |`multi_packed` | 4.766 | 2.455 | 115 | |`PEF eps-opt` | 6.369 | 3.479 | 116 | 117 | 118 | Authors 119 | ------- 120 | * Giulio Ermanno Pibiri, 121 | * Matthias Petri, 122 | * Alistair Moffat, 123 | 124 | Bibliography 125 | ------------ 126 | * [1] Giulio Ermanno Pibiri, Matthias Petri and Alistair Moffat, *Fast Dictionary-based Compression for Inverted Indexes*. In the Proceedings of the 12-th ACM Conference on Web Search and Data Mining (WSDM 2019). -------------------------------------------------------------------------------- /external/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | EXECUTE_PROCESS(COMMAND git submodule update --init 2 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. 
3 | OUTPUT_QUIET 4 | ) 5 | 6 | add_subdirectory(succinct EXCLUDE_FROM_ALL) 7 | add_subdirectory(FastPFor EXCLUDE_FROM_ALL) 8 | 9 | # Add streamvbyte 10 | include_directories(streamvbyte/include) 11 | add_library(streamvbyte STATIC streamvbyte/src/streamvbyte_encode.c 12 | streamvbyte/src/streamvbyte_decode.c 13 | ) 14 | 15 | # Add maskedvbyte 16 | include_directories(MaskedVByte/include) 17 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -march=native") 18 | add_library(MaskedVByte STATIC MaskedVByte/src/varintdecode.c 19 | MaskedVByte/src/varintencode.c 20 | ) 21 | -------------------------------------------------------------------------------- /include/dint/dictionary_builders.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "dint_configuration.hpp" 8 | #include "statistics_collectors.hpp" 9 | #include "binary_blocks_collection.hpp" 10 | #include "hash_utils.hpp" 11 | #include "util.hpp" 12 | 13 | namespace ds2i { 14 | 15 | static const double codeword_bits = std::log2(constants::num_entries); 16 | static const double initial_bpi = 3 * codeword_bits; 17 | static const double eps = 0.0001; 18 | 19 | double cost(uint32_t block_size, uint32_t block_frequency) { 20 | return block_frequency * (initial_bpi * block_size - codeword_bits); 21 | } 22 | 23 | double compute_saving(uint32_t block_size, uint32_t block_frequency, 24 | uint64_t total_integers) { 25 | return cost(block_size, block_frequency) / total_integers; 26 | }; 27 | 28 | struct cost_filter { 29 | cost_filter(double threshold = eps) : m_threshold(threshold) {} 30 | 31 | bool operator()(block_type const& block, uint64_t total_integers) const { 32 | return compute_saving(block.data.size(), block.freq, total_integers) > 33 | m_threshold; 34 | } 35 | 36 | private: 37 | double m_threshold; 38 | }; 39 | 40 | template 41 | struct decreasing_static_frequencies { 42 | typedef Dictionary dictionary_type; 43 
| typedef Statistics statistics_type; 44 | 45 | static std::string type() { 46 | return "DSF-" + std::to_string(dictionary_type::num_entries) + "-" + 47 | std::to_string(dictionary_type::max_entry_size); 48 | } 49 | 50 | static auto filter() { 51 | cost_filter filter(eps / 1000); 52 | return filter; 53 | } 54 | 55 | static void build(typename dictionary_type::builder& dict_builder, 56 | statistics_type& stats) { 57 | logger() << "building " << type() << " dictionary for " 58 | << stats.total_integers << " integers" << std::endl; 59 | 60 | dict_builder.init(); 61 | for (uint64_t s = 0; s != stats.blocks.size(); ++s) { 62 | uint64_t n = dictionary_type::num_entries; 63 | if (stats.blocks[s].size() < n) { 64 | n = stats.blocks[s].size(); 65 | } 66 | 67 | auto it = stats.blocks[s].begin(); 68 | for (uint64_t i = 0; i != n; ++i, ++it) { 69 | auto const& block = *it; 70 | dict_builder.append(block.data.data(), block.data.size(), s); 71 | } 72 | } 73 | 74 | dict_builder.build(); 75 | } 76 | }; 77 | } // namespace ds2i 78 | -------------------------------------------------------------------------------- /include/dint/dictionary_types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "dint_configuration.hpp" 4 | #include "rectangular_dictionary.hpp" 5 | #include "single_dictionary.hpp" 6 | #include "multi_dictionary.hpp" 7 | 8 | namespace ds2i { 9 | 10 | using single_dictionary_rectangular_type = 11 | rectangular_dictionary; 12 | using single_dictionary_packed_type = 13 | single_dictionary; 15 | using single_dictionary_overlapped_type = 16 | single_dictionary; 18 | 19 | using multi_dictionary_packed_type = 20 | multi_dictionary; 22 | using multi_dictionary_overlapped_type = 23 | multi_dictionary; 25 | 26 | } // namespace ds2i 27 | -------------------------------------------------------------------------------- /include/dint/dint_configuration.hpp: 
// 64-bit hash of an arbitrary byte buffer; the constants and mixing steps are
// those of the 64-bit MurmurHash2 variant ("MurmurHash64A").
//
// key  : pointer to the first byte to hash (no alignment requirement)
// len  : number of bytes to hash
// seed : initial value mixed into the state
//
// The buffer is consumed in 8-byte words followed by a tail of 0..7 bytes.
// Each word is loaded with memcpy instead of dereferencing a uint64_t*: the
// key may be a byte range or an array of uint32_t (see hash_bytes64), which
// is not guaranteed to be 8-byte aligned and may not legally be read through
// a uint64_t lvalue. The memcpy load yields the same value wherever the old
// direct load happened to work, so hash values are unchanged.
//
// Declared `inline` because the definition lives in a header.
inline uint64_t murmur_hash64(const void* key, size_t len, uint64_t seed) {
    const uint64_t m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;

    uint64_t h = seed ^ (len * m);

    const unsigned char* data = static_cast<const unsigned char*>(key);
    const unsigned char* const end = data + (len / 8) * sizeof(uint64_t);

    // Body: one mixing round per complete 8-byte word.
    for (; data != end; data += sizeof(uint64_t)) {
        uint64_t k;
        memcpy(&k, data, sizeof(k));

        k *= m;
        k ^= k >> r;
        k *= m;

        h ^= k;
        h *= m;
    }

    // Tail: fold the remaining (len % 8) bytes, lowest address into the
    // lowest-order byte. Every case deliberately falls into the next one.
    switch (len & 7) {
        case 7:
            h ^= uint64_t(data[6]) << 48;
            // fall through
        case 6:
            h ^= uint64_t(data[5]) << 40;
            // fall through
        case 5:
            h ^= uint64_t(data[4]) << 32;
            // fall through
        case 4:
            h ^= uint64_t(data[3]) << 24;
            // fall through
        case 3:
            h ^= uint64_t(data[2]) << 16;
            // fall through
        case 2:
            h ^= uint64_t(data[1]) << 8;
            // fall through
        case 1:
            h ^= uint64_t(data[0]);
            h *= m;
    }

    // Final avalanche.
    h ^= h >> r;
    h *= m;
    h ^= h >> r;

    return h;
}
1)); 37 | } 38 | return selector_code; 39 | } 40 | }; 41 | 42 | struct freq_sorter { 43 | bool operator()(block_type const& l, block_type const& r) { 44 | return l.freq > r.freq; 45 | } 46 | }; 47 | 48 | struct length_freq_sorter { 49 | bool operator()(block_type const& l, block_type const& r) { 50 | if (l.data.size() == r.data.size()) { 51 | return l.freq > r.freq; 52 | } 53 | return l.data.size() > r.data.size(); 54 | } 55 | }; 56 | 57 | struct freq_length_sorter { 58 | bool operator()(block_type const& l, block_type const& r) { 59 | if (l.freq == r.freq) { 60 | return l.data.size() > r.data.size(); 61 | } 62 | return l.freq > r.freq; 63 | } 64 | }; 65 | 66 | void increase_frequency(uint32_t const* entry, size_t n, map_type& bmap, 67 | uint32_t amount = 1) { 68 | auto hash = hash_bytes64(entry, n); 69 | auto it = bmap.find(hash); 70 | if (it != bmap.end()) { 71 | (*it).second.freq += amount; 72 | } else { 73 | block_type block; 74 | block.data.reserve(n); 75 | while (block.data.size() < n) { 76 | block.data.push_back(*entry++); 77 | } 78 | bmap[hash] = std::move(block); 79 | } 80 | } 81 | 82 | template 83 | struct adjusted { 84 | static const uint32_t max_block_size = t_max_block_size; 85 | 86 | static std::string type() { 87 | return "adjusted"; 88 | } 89 | 90 | static void collect(std::vector& buf, 91 | std::vector& block_maps) { 92 | auto b = buf.data(); 93 | uint32_t blocks = buf.size() / constants::block_size; 94 | selector sct; 95 | for (uint32_t i = 0, pos = 0; i < blocks; 96 | ++i, pos += constants::block_size) { 97 | uint32_t index = sct.get(b + pos, constants::block_size); 98 | for (uint32_t s = 0; s < constants::num_target_sizes; ++s) { 99 | uint32_t jump_size = constants::target_sizes[s]; 100 | uint32_t jumps = constants::block_size / jump_size; 101 | for (uint32_t j = 0, p = 0; j < jumps; ++j, p += jump_size) { 102 | increase_frequency(b + pos + p, jump_size, 103 | block_maps[index]); 104 | } 105 | } 106 | } 107 | } 108 | 109 | static void 
collect(std::vector& buf, map_type& block_map) { 110 | auto b = buf.data(); 111 | for (uint32_t s = 0; s < constants::num_target_sizes; ++s) { 112 | uint32_t block_size = constants::target_sizes[s]; 113 | uint32_t blocks = buf.size() / block_size; 114 | for (uint32_t i = 0, pos = 0; i < blocks; ++i, pos += block_size) { 115 | increase_frequency(b + pos, block_size, block_map); 116 | } 117 | } 118 | } 119 | }; 120 | }; // namespace ds2i 121 | -------------------------------------------------------------------------------- /include/ds2i/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015 2 | Giuseppe Ottaviano 3 | Rossano Venturini 4 | Nicola Tonellotto 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | -------------------------------------------------------------------------------- /include/ds2i/all_ones_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "global_parameters.hpp" 4 | #include "util.hpp" 5 | 6 | namespace ds2i { 7 | 8 | struct all_ones_sequence { 9 | 10 | inline static uint64_t 11 | bitsize(global_parameters const& /* params */, uint64_t universe, uint64_t n) 12 | { 13 | return (universe == n) ? 
0 : uint64_t(-1); 14 | } 15 | 16 | template 17 | static void write(succinct::bit_vector_builder&, 18 | Iterator, 19 | uint64_t universe, uint64_t n, 20 | global_parameters const&) 21 | { 22 | assert(universe == n); (void)universe; (void)n; 23 | } 24 | 25 | class enumerator { 26 | public: 27 | 28 | typedef std::pair value_type; // (position, value) 29 | 30 | enumerator(succinct::bit_vector const&, uint64_t, 31 | uint64_t universe, uint64_t n, 32 | global_parameters const&) 33 | : m_universe(universe) 34 | , m_position(size()) 35 | { 36 | assert(universe == n); (void)n; 37 | } 38 | 39 | value_type move(uint64_t position) 40 | { 41 | assert(position <= size()); 42 | m_position = position; 43 | return value_type(m_position, m_position); 44 | } 45 | 46 | value_type next_geq(uint64_t lower_bound) 47 | { 48 | assert(lower_bound <= size()); 49 | m_position = lower_bound; 50 | return value_type(m_position, m_position); 51 | } 52 | 53 | value_type next() 54 | { 55 | m_position += 1; 56 | return value_type(m_position, m_position); 57 | } 58 | 59 | uint64_t size() const 60 | { 61 | return m_universe; 62 | } 63 | 64 | uint64_t prev_value() const 65 | { 66 | if (m_position == 0) { 67 | return 0; 68 | } 69 | return m_position - 1; 70 | } 71 | 72 | private: 73 | uint64_t m_universe; 74 | uint64_t m_position; 75 | }; 76 | }; 77 | } 78 | -------------------------------------------------------------------------------- /include/ds2i/binary_blocks_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "util.hpp" 10 | 11 | namespace ds2i { 12 | 13 | struct binary_blocks_collection { 14 | binary_blocks_collection(const char* filename) 15 | { 16 | m_file.open(filename); 17 | if (!m_file.is_open()) { 18 | throw std::runtime_error("Error opening file"); 19 | } 20 | m_data = (uint32_t const*) m_file.data(); 21 | m_data_size = m_file.size() / 
sizeof(m_data[0]); 22 | auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL); 23 | if (ret) { 24 | std::cerr << "Error calling madvice: " << errno; 25 | } 26 | 27 | // parse header 28 | m_num_blocks = *m_data; 29 | if (!m_num_blocks) { 30 | throw std::runtime_error("Number of blocks must not be 0"); 31 | } 32 | } 33 | 34 | struct iterator; 35 | 36 | iterator begin() const { 37 | return iterator(this, 1); 38 | } 39 | 40 | iterator end() const { 41 | return iterator(this, m_data_size); 42 | } 43 | 44 | struct block { 45 | block() 46 | : m_freq(0) 47 | , m_begin(nullptr) 48 | , m_end(nullptr) 49 | {} 50 | 51 | uint32_t const* begin() const { 52 | return m_begin; 53 | } 54 | 55 | uint32_t const* end() const { 56 | return m_end; 57 | } 58 | 59 | uint32_t freq() const { 60 | return m_freq; 61 | } 62 | 63 | size_t size() const { 64 | return m_end - m_begin; 65 | } 66 | 67 | private: 68 | friend struct binary_blocks_collection::iterator; 69 | 70 | block(uint32_t freq, 71 | uint32_t const* begin, 72 | uint32_t const* end) 73 | : m_freq(freq) 74 | , m_begin(begin) 75 | , m_end(end) 76 | {} 77 | 78 | uint32_t m_freq; 79 | uint32_t const* m_begin; 80 | uint32_t const* m_end; 81 | }; 82 | 83 | struct iterator { 84 | iterator() 85 | : m_collection(nullptr) 86 | {} 87 | 88 | block const& operator*() const { 89 | return m_cur_block; 90 | } 91 | 92 | block const* operator->() const { 93 | return &m_cur_block; 94 | } 95 | 96 | iterator& operator++() { 97 | m_pos = m_next_pos; 98 | read(); 99 | return *this; 100 | } 101 | 102 | bool operator==(iterator const& other) const { 103 | assert(m_collection == other.m_collection); 104 | return m_pos == other.m_pos; 105 | } 106 | 107 | bool operator!=(iterator const& other) const { 108 | return !(*this == other); 109 | } 110 | 111 | private: 112 | friend struct binary_blocks_collection; 113 | 114 | iterator(binary_blocks_collection const* coll, uint64_t pos) 115 | : m_collection(coll) 116 | , m_pos(pos) 117 | { 118 | 
read(); 119 | } 120 | 121 | void read() 122 | { 123 | assert(m_pos <= m_collection->m_data_size); 124 | if (m_pos == m_collection->m_data_size) return; 125 | 126 | size_t n = 0; 127 | size_t pos = m_pos; 128 | while (!(n = m_collection->m_data[pos++])); // skip empty seqs 129 | // file might be truncated 130 | n = std::min(n, size_t(m_collection->m_data_size - pos)); 131 | 132 | uint32_t freq = m_collection->m_data[pos]; 133 | uint32_t const* begin = &m_collection->m_data[pos + 1]; 134 | uint32_t const* end = begin + n; 135 | 136 | m_next_pos = pos + n + 1; 137 | m_cur_block = block(freq, begin, end); 138 | } 139 | 140 | binary_blocks_collection const* m_collection; 141 | size_t m_pos, m_next_pos; 142 | block m_cur_block; 143 | }; 144 | 145 | uint64_t num_blocks() const { 146 | return m_num_blocks; 147 | } 148 | 149 | private: 150 | boost::iostreams::mapped_file_source m_file; 151 | uint32_t const* m_data; 152 | size_t m_data_size; 153 | uint64_t m_num_blocks; 154 | }; 155 | } 156 | -------------------------------------------------------------------------------- /include/ds2i/binary_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "util.hpp" 10 | 11 | namespace ds2i { 12 | 13 | class binary_collection { 14 | public: 15 | typedef uint32_t posting_type; 16 | 17 | binary_collection(const char* filename) 18 | { 19 | m_file.open(filename); 20 | if ( !m_file.is_open() ) { 21 | throw std::runtime_error("Error opening file"); 22 | } 23 | m_data = (posting_type const*)m_file.data(); 24 | m_data_size = m_file.size() / sizeof(m_data[0]); 25 | 26 | auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL); 27 | if (ret) { 28 | logger() << "Error calling madvice: " << errno; 29 | } 30 | } 31 | 32 | class iterator; 33 | 34 | iterator begin() const { 35 | return iterator(this, 0); 36 | } 37 | 38 | iterator end() 
const { 39 | return iterator(this, m_data_size); 40 | } 41 | 42 | size_t num_postings() const { 43 | return m_data_size; 44 | } 45 | 46 | class sequence { 47 | public: 48 | sequence() 49 | : m_begin(nullptr) 50 | , m_end(nullptr) 51 | {} 52 | 53 | posting_type const* begin() const 54 | { 55 | return m_begin; 56 | } 57 | 58 | posting_type const* end() const 59 | { 60 | return m_end; 61 | } 62 | 63 | posting_type back() const 64 | { 65 | assert(size()); 66 | return *(m_end - 1); 67 | } 68 | 69 | size_t size() const 70 | { 71 | return m_end - m_begin; 72 | } 73 | 74 | private: 75 | friend class binary_collection::iterator; 76 | 77 | sequence(posting_type const* begin, posting_type const* end) 78 | : m_begin(begin) 79 | , m_end(end) 80 | {} 81 | 82 | posting_type const* m_begin; 83 | posting_type const* m_end; 84 | }; 85 | 86 | class iterator : public std::iterator { 88 | public: 89 | iterator() 90 | : m_collection(nullptr) 91 | {} 92 | 93 | value_type const& operator*() const 94 | { 95 | return m_cur_seq; 96 | } 97 | 98 | value_type const* operator->() const 99 | { 100 | return &m_cur_seq; 101 | } 102 | 103 | iterator& operator++() 104 | { 105 | m_pos = m_next_pos; 106 | read(); 107 | return *this; 108 | } 109 | 110 | bool operator==(iterator const& other) const 111 | { 112 | assert(m_collection == other.m_collection); 113 | return m_pos == other.m_pos; 114 | } 115 | 116 | bool operator!=(iterator const& other) const 117 | { 118 | return !(*this == other); 119 | } 120 | 121 | private: 122 | friend class binary_collection; 123 | 124 | iterator(binary_collection const* coll, size_t pos) 125 | : m_collection(coll) 126 | , m_pos(pos) 127 | { 128 | read(); 129 | } 130 | 131 | void read() 132 | { 133 | assert(m_pos <= m_collection->m_data_size); 134 | if (m_pos == m_collection->m_data_size) return; 135 | 136 | size_t n = 0; 137 | size_t pos = m_pos; 138 | while (!(n = m_collection->m_data[pos++])); // skip empty seqs 139 | // file might be truncated 140 | n = std::min(n, 
size_t(m_collection->m_data_size - pos)); 141 | posting_type const* begin = &m_collection->m_data[pos]; 142 | posting_type const* end = begin + n; 143 | 144 | m_next_pos = pos + n; 145 | m_cur_seq = sequence(begin, end); 146 | } 147 | 148 | binary_collection const* m_collection; 149 | size_t m_pos, m_next_pos; 150 | sequence m_cur_seq; 151 | }; 152 | 153 | private: 154 | boost::iostreams::mapped_file_source m_file; 155 | posting_type const* m_data; 156 | size_t m_data_size; 157 | }; 158 | } 159 | -------------------------------------------------------------------------------- /include/ds2i/binary_freq_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "binary_collection.hpp" 8 | 9 | namespace ds2i { 10 | 11 | class binary_freq_collection { 12 | public: 13 | 14 | binary_freq_collection(const char* basename) 15 | : m_docs((std::string(basename) + ".docs").c_str()) 16 | , m_freqs((std::string(basename) + ".freqs").c_str()) 17 | { 18 | auto firstseq = *m_docs.begin(); 19 | if (firstseq.size() != 1) { 20 | throw std::invalid_argument("First sequence should only contain number of documents"); 21 | } 22 | m_num_docs = *firstseq.begin(); 23 | } 24 | 25 | class iterator; 26 | 27 | iterator begin() const { 28 | auto docs_it = m_docs.begin(); 29 | return iterator(++docs_it, m_freqs.begin()); 30 | } 31 | 32 | iterator end() const { 33 | return iterator(m_docs.end(), m_freqs.end()); 34 | } 35 | 36 | uint64_t num_docs() const { 37 | return m_num_docs; 38 | } 39 | 40 | uint64_t num_postings() const { 41 | return m_docs.num_postings() + m_freqs.num_postings() 42 | - 2; // skip fist singleton sequence, containing num. 
of docs 43 | } 44 | 45 | struct sequence { 46 | binary_collection::sequence docs; 47 | binary_collection::sequence freqs; 48 | }; 49 | 50 | class iterator : public std::iterator { 52 | public: 53 | iterator() 54 | {} 55 | 56 | value_type const& operator*() const { 57 | return m_cur_seq; 58 | } 59 | 60 | value_type const* operator->() const { 61 | return &m_cur_seq; 62 | } 63 | 64 | iterator& operator++() { 65 | m_cur_seq.docs = *++m_docs_it; 66 | m_cur_seq.freqs = *++m_freqs_it; 67 | return *this; 68 | } 69 | 70 | bool operator==(iterator const& other) const { 71 | return m_docs_it == other.m_docs_it; 72 | } 73 | 74 | bool operator!=(iterator const& other) const { 75 | return !(*this == other); 76 | } 77 | 78 | private: 79 | friend class binary_freq_collection; 80 | 81 | iterator(binary_collection::iterator docs_it, 82 | binary_collection::iterator freqs_it) 83 | : m_docs_it(docs_it) 84 | , m_freqs_it(freqs_it) 85 | { 86 | m_cur_seq.docs = *m_docs_it; 87 | m_cur_seq.freqs = *m_freqs_it; 88 | } 89 | 90 | binary_collection::iterator m_docs_it; 91 | binary_collection::iterator m_freqs_it; 92 | sequence m_cur_seq; 93 | }; 94 | 95 | private: 96 | binary_collection m_docs; 97 | binary_collection m_freqs; 98 | uint64_t m_num_docs; 99 | }; 100 | } 101 | -------------------------------------------------------------------------------- /include/ds2i/bitvector_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "compact_elias_fano.hpp" 6 | 7 | namespace ds2i { 8 | 9 | class bitvector_collection { 10 | public: 11 | bitvector_collection() 12 | : m_size(0) 13 | {} 14 | 15 | class builder { 16 | public: 17 | builder(global_parameters const& params) 18 | : m_params(params) 19 | { 20 | m_endpoints.push_back(0); 21 | } 22 | 23 | void append(succinct::bit_vector_builder& bvb) 24 | { 25 | m_bitvectors.append(bvb); 26 | m_endpoints.push_back(m_bitvectors.size()); 27 | } 28 | 29 | void 
build(bitvector_collection& sq) 30 | { 31 | sq.m_size = m_endpoints.size() - 1; 32 | succinct::bit_vector(&m_bitvectors).swap(sq.m_bitvectors); 33 | 34 | succinct::bit_vector_builder bvb; 35 | compact_elias_fano::write(bvb, m_endpoints.begin(), 36 | m_bitvectors.size(), sq.m_size, 37 | m_params); 38 | succinct::bit_vector(&bvb).swap(sq.m_endpoints); 39 | } 40 | 41 | private: 42 | global_parameters m_params; 43 | std::vector m_endpoints; 44 | succinct::bit_vector_builder m_bitvectors; 45 | }; 46 | 47 | size_t size() const 48 | { 49 | return m_size; 50 | } 51 | 52 | succinct::bit_vector const& bits() const 53 | { 54 | return m_bitvectors; 55 | } 56 | 57 | succinct::bit_vector::enumerator 58 | get(global_parameters const& params, size_t i) const 59 | { 60 | assert(i < size()); 61 | compact_elias_fano::enumerator endpoints(m_endpoints, 0, 62 | m_bitvectors.size(), m_size, 63 | params); 64 | 65 | auto endpoint = endpoints.move(i).second; 66 | return succinct::bit_vector::enumerator(m_bitvectors, endpoint); 67 | } 68 | 69 | void swap(bitvector_collection& other) 70 | { 71 | std::swap(m_size, other.m_size); 72 | m_endpoints.swap(other.m_endpoints); 73 | m_bitvectors.swap(other.m_bitvectors); 74 | } 75 | 76 | template 77 | void map(Visitor& visit) 78 | { 79 | visit 80 | (m_size, "m_size") 81 | (m_endpoints, "m_endpoints") 82 | (m_bitvectors, "m_bitvectors") 83 | ; 84 | } 85 | 86 | private: 87 | size_t m_size; 88 | succinct::bit_vector m_endpoints; 89 | succinct::bit_vector m_bitvectors; 90 | }; 91 | } 92 | -------------------------------------------------------------------------------- /include/ds2i/block_freq_index.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "compact_elias_fano.hpp" 6 | #include "block_posting_list.hpp" 7 | #include "semiasync_queue.hpp" 8 | 9 | namespace ds2i { 10 | 11 | template 12 | class block_freq_index { 13 | public: 14 | block_freq_index() 15 | : 
m_size(0) 16 | {} 17 | 18 | class builder { 19 | public: 20 | builder(uint64_t num_docs, global_parameters const& params) 21 | : m_queue(1 << 24) 22 | , m_params(params) 23 | { 24 | m_num_docs = num_docs; 25 | m_endpoints.push_back(0); 26 | } 27 | 28 | template 29 | void add_posting_list(uint64_t n, DocsIterator docs_begin, 30 | FreqsIterator freqs_begin, uint64_t /* occurrences */) 31 | { 32 | if (!n) throw std::invalid_argument("List must be nonempty"); 33 | block_posting_list::write(m_lists, n, 34 | docs_begin, freqs_begin); 35 | m_endpoints.push_back(m_lists.size()); 36 | 37 | // if (!n) throw std::invalid_argument("List must be nonempty"); 38 | // std::shared_ptr> 39 | // ptr(new list_adder 40 | // (*this, docs_begin, freqs_begin, n)); 41 | // m_queue.add_job(ptr, 2 * n); 42 | } 43 | 44 | template 45 | void add_posting_list(uint64_t n, BlockDataRange const& blocks) 46 | { 47 | if (!n) throw std::invalid_argument("List must be nonempty"); 48 | block_posting_list::write_blocks(m_lists, n, blocks); 49 | m_endpoints.push_back(m_lists.size()); 50 | } 51 | 52 | template 53 | void add_posting_list(BytesRange const& data) 54 | { 55 | m_lists.insert(m_lists.end(), std::begin(data), std::end(data)); 56 | m_endpoints.push_back(m_lists.size()); 57 | } 58 | 59 | void build_model(std::string const&) 60 | {} 61 | 62 | void build(block_freq_index& sq) 63 | { 64 | m_queue.complete(); 65 | sq.m_params = m_params; 66 | sq.m_size = m_endpoints.size() - 1; 67 | sq.m_num_docs = m_num_docs; 68 | sq.m_lists.steal(m_lists); 69 | 70 | succinct::bit_vector_builder bvb; 71 | compact_elias_fano::write(bvb, m_endpoints.begin(), 72 | sq.m_lists.size(), sq.m_size, 73 | m_params); // XXX 74 | succinct::bit_vector(&bvb).swap(sq.m_endpoints); 75 | } 76 | 77 | private: 78 | 79 | template 80 | struct list_adder : semiasync_queue::job { 81 | list_adder(builder& b, 82 | DocsIterator docs_begin, 83 | FreqsIterator freqs_begin, 84 | uint64_t n) 85 | : b(b) 86 | , docs_begin(docs_begin) 87 | , 
freqs_begin(freqs_begin) 88 | , n(n) 89 | {} 90 | 91 | virtual void prepare() 92 | { 93 | block_posting_list::write( 94 | list, n, docs_begin, freqs_begin 95 | ); 96 | } 97 | 98 | virtual void commit() 99 | { 100 | b.m_lists.insert(b.m_lists.end(), list.begin(), list.end()); 101 | b.m_endpoints.push_back(b.m_lists.size()); 102 | } 103 | 104 | builder& b; 105 | DocsIterator docs_begin; 106 | FreqsIterator freqs_begin; 107 | uint64_t n; 108 | std::vector list; 109 | }; 110 | 111 | semiasync_queue m_queue; 112 | global_parameters m_params; 113 | size_t m_num_docs; 114 | std::vector m_endpoints; 115 | std::vector m_lists; 116 | }; 117 | 118 | size_t size() const 119 | { 120 | return m_size; 121 | } 122 | 123 | uint64_t num_docs() const 124 | { 125 | return m_num_docs; 126 | } 127 | 128 | typedef typename block_posting_list::document_enumerator document_enumerator; 129 | 130 | document_enumerator operator[](size_t i) const 131 | { 132 | assert(i < size()); 133 | compact_elias_fano::enumerator endpoints(m_endpoints, 0, 134 | m_lists.size(), m_size, 135 | m_params); 136 | 137 | auto endpoint = endpoints.move(i).second; 138 | return document_enumerator(m_lists.data() + endpoint, num_docs(), i); 139 | } 140 | 141 | void warmup(size_t i) const 142 | { 143 | assert(i < size()); 144 | compact_elias_fano::enumerator endpoints(m_endpoints, 0, 145 | m_lists.size(), m_size, 146 | m_params); 147 | 148 | auto begin = endpoints.move(i).second; 149 | auto end = m_lists.size(); 150 | if (i + 1 != size()) { 151 | end = endpoints.move(i + 1).second; 152 | } 153 | 154 | volatile uint32_t tmp; 155 | for (size_t i = begin; i != end; ++i) { 156 | tmp = m_lists[i]; 157 | } 158 | (void)tmp; 159 | } 160 | 161 | void swap(block_freq_index& other) 162 | { 163 | std::swap(m_params, other.m_params); 164 | std::swap(m_size, other.m_size); 165 | m_endpoints.swap(other.m_endpoints); 166 | m_lists.swap(other.m_lists); 167 | } 168 | 169 | template 170 | void map(Visitor& visit) 171 | { 172 | visit 173 | 
// Process-wide registry of per-term access counters. Every term owns an
// array of 2 * blocks atomic counters that is created on first request and
// released when the singleton is destroyed.
class block_profiler {
public:
    typedef std::atomic_uint_fast32_t counter_type;

    ~block_profiler()
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        for (auto const& entry : m_block_freqs) {
            delete[] entry.second.second;
        }
    }

    // Returns the singleton instance.
    static block_profiler& get()
    {
        static block_profiler instance;
        return instance;
    }

    // Returns the counter array of `term_id`, allocating 2 * blocks zeroed
    // counters the first time the term is seen. Later calls return the
    // existing array whatever `blocks` is passed, so callers must use the
    // same value for a given term.
    static counter_type* open_list(uint32_t term_id, uint32_t blocks)
    {
        block_profiler& instance = get();
        std::lock_guard<std::mutex> lock(instance.m_mutex);
        auto& slot = instance.m_block_freqs[term_id];
        if (slot.second == nullptr) {
            // Widen before doubling so that large block counts do not wrap.
            size_t const count = 2 * size_t(blocks);
            // Allocate before recording the count: if new[] throws, the
            // entry stays (0, nullptr) and dump() has nothing to read.
            counter_type* counters = new counter_type[count];
            for (size_t i = 0; i < count; ++i) {
                counters[i].store(0);
            }
            slot.first = count;
            slot.second = counters;
        }
        return slot.second;
    }

    // Writes one line per term: the term id followed by its tab-separated
    // counters.
    static void dump(std::ostream& os)
    {
        block_profiler& instance = get();
        std::lock_guard<std::mutex> lock(instance.m_mutex);

        for (auto const& entry : instance.m_block_freqs) {
            os << entry.first;

            for (size_t i = 0; i < entry.second.first; ++i) {
                os << '\t' << entry.second.second[i].load();
            }

            os << '\n';
        }
    }

private:
    block_profiler() {}

    // term id -> (number of counters, counter array); atomics cannot be
    // stored in a std::vector, hence the manually managed arrays.
    std::map<uint32_t, std::pair<size_t, counter_type*>> m_block_freqs;
    std::mutex m_mutex;
};
// BM25 scoring split into a per-document and a per-query-term factor.
struct bm25 {
    static constexpr float b = 0.5;
    static constexpr float k1 = 1.2;

    // Document-side factor for a term occurring `freq` times in a document
    // whose normalized length is `norm_len`.
    static float doc_term_weight(uint64_t freq, float norm_len)
    {
        float const tf = (float)freq;
        float const length_factor = 1.0f - b + b * norm_len;
        return tf / (tf + k1 * length_factor);
    }

    // Query-side factor for a term occurring `freq` times in the query and
    // in `df` of the `num_docs` documents. The idf is clamped from below so
    // that the weight never becomes negative.
    static float query_term_weight(uint64_t freq, uint64_t df, uint64_t num_docs)
    {
        static const float epsilon_score = 1.0E-6;

        float const qf = (float)freq;
        float const doc_freq = (float)df;
        float const idf =
            std::log((float(num_docs) - doc_freq + 0.5f) / (doc_freq + 0.5f));
        float const clamped_idf = epsilon_score < idf ? idf : epsilon_score;
        return qf * clamped_idf * (1.0f + k1);
    }
};
!strlen(val)) { 42 | var = def; 43 | } else { 44 | var = boost::lexical_cast(val); 45 | } 46 | } 47 | }; 48 | 49 | } 50 | -------------------------------------------------------------------------------- /include/ds2i/dec_time_prediction.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "util.hpp" 12 | 13 | #define DS2I_FEATURE_TYPES (n)(size)(sum_of_logs)(entropy)(nonzeros)(max_b)(pfor_b)(pfor_exceptions) 14 | 15 | namespace ds2i { namespace time_prediction { 16 | 17 | constexpr size_t num_features = BOOST_PP_SEQ_SIZE(DS2I_FEATURE_TYPES); 18 | 19 | enum class feature_type { 20 | BOOST_PP_SEQ_ENUM(DS2I_FEATURE_TYPES), end 21 | }; 22 | 23 | feature_type parse_feature_type(std::string const& name) 24 | { 25 | if (false) { 26 | #define LOOP_BODY(R, DATA, T) \ 27 | } else if (name == BOOST_PP_STRINGIZE(T)) { \ 28 | return feature_type::T; \ 29 | /**/ 30 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, DS2I_FEATURE_TYPES); 31 | #undef LOOP_BODY 32 | } else { 33 | throw std::invalid_argument("Invalid feature name " + name); 34 | } 35 | 36 | } 37 | 38 | std::string feature_name(feature_type f) 39 | { 40 | switch (f) { 41 | #define LOOP_BODY(R, DATA, T) \ 42 | case feature_type::T: return BOOST_PP_STRINGIZE(T); \ 43 | /**/ 44 | 45 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, DS2I_FEATURE_TYPES); 46 | #undef LOOP_BODY 47 | default: throw std::invalid_argument("Invalid feature type"); 48 | } 49 | } 50 | 51 | class feature_vector { 52 | public: 53 | feature_vector() 54 | { 55 | std::fill(m_features.begin(), m_features.end(), 0); 56 | } 57 | 58 | float& operator[](feature_type f) { return m_features[(size_t)f]; } 59 | float const& operator[](feature_type f) const { return m_features[(size_t)f]; } 60 | 61 | stats_line& dump(stats_line& sl) const 62 | { 63 | for (size_t i = 0; i < num_features; ++i) { 64 | feature_type ft = (feature_type)i; 
65 | sl(feature_name(ft), (*this)[ft]); 66 | } 67 | return sl; 68 | } 69 | 70 | protected: 71 | std::array m_features; 72 | }; 73 | 74 | class predictor : public feature_vector { 75 | public: 76 | predictor() 77 | : m_bias(0) 78 | {} 79 | 80 | predictor(std::vector> const& values) 81 | { 82 | for (auto const& kv: values) { 83 | if (kv.first == "bias") { 84 | bias() = kv.second; 85 | } else { 86 | (*this)[parse_feature_type(kv.first)] = kv.second; 87 | } 88 | } 89 | } 90 | 91 | float& bias() { return m_bias; } 92 | float const& bias() const { return m_bias; } 93 | 94 | float operator()(feature_vector const& f) const 95 | { 96 | float result = bias(); 97 | for (size_t i = 0; i < num_features; ++i) { 98 | feature_type ft = (feature_type)i; 99 | result += (*this)[ft] * f[ft]; 100 | } 101 | return result; 102 | } 103 | 104 | protected: 105 | float m_bias; 106 | }; 107 | 108 | void values_statistics(std::vector values, feature_vector& f) 109 | { 110 | std::sort(values.begin(), values.end()); 111 | f[feature_type::n] = values.size(); 112 | if (values.empty()) return; 113 | 114 | uint32_t last_value = values.front(); 115 | size_t group_begin = 0; 116 | double entropy = 0; 117 | double sum_of_logs = 0; 118 | double nonzeros = 0; 119 | uint32_t max_b = 0; 120 | 121 | for (size_t i = 1; i <= values.size(); ++i) { 122 | if (i == values.size() || values[i] != last_value) { 123 | size_t group_size = i - group_begin; 124 | entropy += group_size * log2(double(values.size()) / group_size); 125 | sum_of_logs += group_size * log2(double(last_value) + 1); 126 | if (last_value != 0) { 127 | nonzeros += group_size; 128 | } 129 | uint32_t b = last_value ? 
// Reads one line of block statistics from `is`: a list id followed by zero
// or more whitespace-separated counts.
//
// `block_counts` is always cleared first. Returns false (leaving `list_id`
// untouched) when no further line can be read, true otherwise.
//
// Declared inline because the definition lives in a header and would
// otherwise be emitted by every translation unit that includes it.
inline bool read_block_stats(std::istream& is, uint32_t& list_id,
                             std::vector<uint32_t>& block_counts)
{
    // Reused across calls to avoid reallocating the line buffer.
    thread_local std::string line;
    block_counts.clear();
    if (!std::getline(is, line)) return false;

    std::istringstream iss(line);
    iss >> list_id;
    uint32_t count = 0;
    while (iss >> count) block_counts.push_back(count);

    return true;
}
gdf.ix[idxs[split_point:]] 43 | 44 | median = training['time'].median() 45 | logging.info('Median time %.3f', median) 46 | 47 | median_pred_err = np.mean(np.abs(test['time'] - median)) 48 | logging.info('Error for constant predictor %.3f', 49 | median_pred_err) 50 | 51 | to_drop = ['type', 'time', 'n', 'entropy'] 52 | training_X = training.drop(to_drop, axis=1) 53 | test_X = test.drop(to_drop, axis=1) 54 | 55 | opt = l1l1.solve_l1l1_approx(training_X.values, training['time'], 0.01) 56 | weights = opt[:-1] 57 | bias = opt[-1] 58 | 59 | predict = lambda X: np.dot(X.values, weights) + bias 60 | 61 | lr_pred_err = np.mean(np.abs(predict(test_X) - test['time'])) 62 | 63 | logging.info('Error for linear predictor %.3f', 64 | lr_pred_err) 65 | 66 | weights_str = ' + '.join('%.3f * %s' % (coef, col) 67 | for coef, col in zip(weights, training_X.columns.values) 68 | if abs(coef) > 1e-08) \ 69 | + ' + %.3f' % bias 70 | logging.info('Linear params: %s', weights_str) 71 | 72 | # output 73 | fields = ['type', t, 'bias', bias] 74 | 75 | for coef, col in zip(weights, training_X.columns.values): 76 | fields += [col, coef] 77 | 78 | print '\t'.join(map(str, fields)) 79 | 80 | 81 | if __name__ == '__main__': 82 | random.seed(1729) 83 | logging.basicConfig(level=logging.INFO, 84 | format='%(asctime)s:%(levelname)s: %(message)s') 85 | baker.run() 86 | -------------------------------------------------------------------------------- /include/ds2i/ds2i_config.hpp.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define DS2I_SOURCE_DIR "@DS2I_SOURCE_DIR@" 4 | -------------------------------------------------------------------------------- /include/ds2i/freq_index.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "bitvector_collection.hpp" 4 | #include "compact_elias_fano.hpp" 5 | #include "integer_codes.hpp" 6 | #include "global_parameters.hpp" 7 | #include 
"semiasync_queue.hpp" 8 | 9 | namespace ds2i { 10 | 11 | template 12 | class freq_index { 13 | public: 14 | freq_index() : m_num_docs(0) {} 15 | 16 | class builder { 17 | public: 18 | builder(uint64_t num_docs, global_parameters const& params) 19 | : m_queue(1 << 24) 20 | , m_params(params) 21 | , m_num_docs(num_docs) 22 | , m_docs_sequences(params) 23 | , m_freqs_sequences(params) {} 24 | 25 | template 26 | void add_posting_list(uint64_t n, DocsIterator docs_begin, 27 | FreqsIterator freqs_begin, uint64_t occurrences) { 28 | if (!n) 29 | throw std::invalid_argument("List must be nonempty"); 30 | std::shared_ptr> ptr( 31 | new list_adder( 32 | *this, docs_begin, freqs_begin, occurrences, n)); 33 | m_queue.add_job(ptr, 2 * n); 34 | } 35 | 36 | void build_model(std::string const&) {} 37 | 38 | void build(freq_index& sq) { 39 | m_queue.complete(); 40 | sq.m_num_docs = m_num_docs; 41 | sq.m_params = m_params; 42 | 43 | m_docs_sequences.build(sq.m_docs_sequences); 44 | m_freqs_sequences.build(sq.m_freqs_sequences); 45 | } 46 | 47 | private: 48 | template 49 | struct list_adder : semiasync_queue::job { 50 | list_adder(builder& b, DocsIterator docs_begin, 51 | FreqsIterator freqs_begin, uint64_t occurrences, 52 | uint64_t n) 53 | : b(b) 54 | , docs_begin(docs_begin) 55 | , freqs_begin(freqs_begin) 56 | , occurrences(occurrences) 57 | , n(n) {} 58 | 59 | virtual void prepare() { 60 | write_gamma_nonzero(docs_bits, occurrences); 61 | if (occurrences > 1) { 62 | docs_bits.append_bits(n, ceil_log2(occurrences + 1)); 63 | } 64 | 65 | DocsSequence::write(docs_bits, docs_begin, b.m_num_docs, n, 66 | b.m_params); 67 | 68 | FreqsSequence::write(freqs_bits, freqs_begin, occurrences + 1, 69 | n, b.m_params); 70 | } 71 | 72 | virtual void commit() { 73 | b.m_docs_sequences.append(docs_bits); 74 | b.m_freqs_sequences.append(freqs_bits); 75 | } 76 | 77 | builder& b; 78 | DocsIterator docs_begin; 79 | FreqsIterator freqs_begin; 80 | uint64_t occurrences; 81 | uint64_t n; 82 | 
succinct::bit_vector_builder docs_bits; 83 | succinct::bit_vector_builder freqs_bits; 84 | }; 85 | 86 | semiasync_queue m_queue; 87 | global_parameters m_params; 88 | uint64_t m_num_docs; 89 | bitvector_collection::builder m_docs_sequences; 90 | bitvector_collection::builder m_freqs_sequences; 91 | }; 92 | 93 | uint64_t size() const { 94 | return m_docs_sequences.size(); 95 | } 96 | 97 | uint64_t num_docs() const { 98 | return m_num_docs; 99 | } 100 | 101 | class document_enumerator { 102 | public: 103 | void reset() { 104 | m_cur_pos = 0; 105 | m_cur_docid = m_docs_enum.move(0).second; 106 | } 107 | 108 | void DS2I_FLATTEN_FUNC next() { 109 | auto val = m_docs_enum.next(); 110 | m_cur_pos = val.first; 111 | m_cur_docid = val.second; 112 | } 113 | 114 | void DS2I_FLATTEN_FUNC next_geq(uint64_t lower_bound) { 115 | auto val = m_docs_enum.next_geq(lower_bound); 116 | m_cur_pos = val.first; 117 | m_cur_docid = val.second; 118 | } 119 | 120 | void DS2I_FLATTEN_FUNC move(uint64_t position) { 121 | auto val = m_docs_enum.move(position); 122 | m_cur_pos = val.first; 123 | m_cur_docid = val.second; 124 | } 125 | 126 | uint64_t docid() const { 127 | return m_cur_docid; 128 | } 129 | 130 | uint64_t DS2I_FLATTEN_FUNC freq() { 131 | return m_freqs_enum.move(m_cur_pos).second; 132 | } 133 | 134 | uint64_t position() const { 135 | return m_cur_pos; 136 | } 137 | 138 | uint64_t size() const { 139 | return m_docs_enum.size(); 140 | } 141 | 142 | typename DocsSequence::enumerator const& docs_enum() const { 143 | return m_docs_enum; 144 | } 145 | 146 | typename FreqsSequence::enumerator const& freqs_enum() const { 147 | return m_freqs_enum; 148 | } 149 | 150 | private: 151 | friend class freq_index; 152 | 153 | document_enumerator(typename DocsSequence::enumerator docs_enum, 154 | typename FreqsSequence::enumerator freqs_enum) 155 | : m_docs_enum(docs_enum), m_freqs_enum(freqs_enum) { 156 | reset(); 157 | } 158 | 159 | uint64_t m_cur_pos; 160 | uint64_t m_cur_docid; 161 | typename 
DocsSequence::enumerator m_docs_enum; 162 | typename FreqsSequence::enumerator m_freqs_enum; 163 | }; 164 | 165 | document_enumerator operator[](size_t i) const { 166 | assert(i < size()); 167 | auto docs_it = m_docs_sequences.get(m_params, i); 168 | uint64_t occurrences = read_gamma_nonzero(docs_it); 169 | uint64_t n = 1; 170 | if (occurrences > 1) { 171 | n = docs_it.take(ceil_log2(occurrences + 1)); 172 | } 173 | 174 | typename DocsSequence::enumerator docs_enum(m_docs_sequences.bits(), 175 | docs_it.position(), 176 | num_docs(), n, m_params); 177 | 178 | auto freqs_it = m_freqs_sequences.get(m_params, i); 179 | typename FreqsSequence::enumerator freqs_enum( 180 | m_freqs_sequences.bits(), freqs_it.position(), occurrences + 1, n, 181 | m_params); 182 | 183 | return document_enumerator(docs_enum, freqs_enum); 184 | } 185 | 186 | void warmup(size_t /* i */) const { 187 | // XXX implement this 188 | } 189 | 190 | global_parameters const& params() const { 191 | return m_params; 192 | } 193 | 194 | void swap(freq_index& other) { 195 | std::swap(m_params, other.m_params); 196 | std::swap(m_num_docs, other.m_num_docs); 197 | m_docs_sequences.swap(other.m_docs_sequences); 198 | m_freqs_sequences.swap(other.m_freqs_sequences); 199 | } 200 | 201 | template 202 | void map(Visitor& visit) { 203 | visit(m_params, "m_params")(m_num_docs, "m_num_docs")( 204 | m_docs_sequences, "m_docs_sequences")(m_freqs_sequences, 205 | "m_freqs_sequences"); 206 | } 207 | 208 | private: 209 | global_parameters m_params; 210 | uint64_t m_num_docs; 211 | bitvector_collection m_docs_sequences; 212 | bitvector_collection m_freqs_sequences; 213 | }; 214 | } // namespace ds2i 215 | -------------------------------------------------------------------------------- /include/ds2i/global_parameters.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace ds2i { 4 | 5 | struct global_parameters { 6 | global_parameters() 7 | : 
ef_log_sampling0(9) 8 | , ef_log_sampling1(8) 9 | , rb_log_rank1_sampling(9) 10 | , rb_log_sampling1(8) 11 | , log_partition_size(7) 12 | {} 13 | 14 | template 15 | void map(Visitor& visit) 16 | { 17 | visit 18 | (ef_log_sampling0, "ef_log_sampling0") 19 | (ef_log_sampling1, "ef_log_sampling1") 20 | (rb_log_rank1_sampling, "rb_log_rank1_sampling") 21 | (rb_log_sampling1, "rb_log_sampling1") 22 | (log_partition_size, "log_partition_size") 23 | ; 24 | } 25 | 26 | uint8_t ef_log_sampling0; 27 | uint8_t ef_log_sampling1; 28 | uint8_t rb_log_rank1_sampling; 29 | uint8_t rb_log_sampling1; 30 | uint8_t log_partition_size; 31 | }; 32 | 33 | } 34 | -------------------------------------------------------------------------------- /include/ds2i/index_build_utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "index_types.hpp" 4 | #include "util.hpp" 5 | #include "succinct/mapper.hpp" 6 | 7 | namespace ds2i { 8 | 9 | struct progress_logger { 10 | progress_logger(std::string msg_type) 11 | : msg(msg_type) 12 | , sequences(0) 13 | , postings(0) 14 | {} 15 | 16 | void log() { 17 | logger() << msg << " " << sequences << " sequences, " 18 | << postings << " postings" << std::endl; 19 | } 20 | 21 | void done_sequence(size_t n) { 22 | sequences += 1; 23 | postings += n; 24 | // if (sequences % 1000000 == 0) { 25 | // log(); 26 | // } 27 | } 28 | 29 | std::string msg; 30 | size_t sequences, postings; 31 | }; 32 | 33 | template 34 | size_t get_size_stats(freq_index& coll, 35 | uint64_t& docs_size, uint64_t& freqs_size) 36 | { 37 | auto size_tree = succinct::mapper::size_tree_of(coll); 38 | size_tree->dump(); 39 | for (auto const& node: size_tree->children) { 40 | if (node->name == "m_docs_sequences") { 41 | docs_size = node->size; 42 | } else if (node->name == "m_freqs_sequences") { 43 | freqs_size = node->size; 44 | } 45 | } 46 | return size_tree->size; 47 | } 48 | 49 | template 50 | size_t 
get_size_stats(block_freq_index& coll, 51 | uint64_t& docs_size, uint64_t& freqs_size) 52 | { 53 | auto size_tree = succinct::mapper::size_tree_of(coll); 54 | size_tree->dump(); 55 | uint64_t total_size = 0; 56 | for (auto const& node: size_tree->children) { 57 | if (node->name == "m_lists") { 58 | total_size = node->size; 59 | } 60 | } 61 | 62 | freqs_size = 0; 63 | for (size_t i = 0; i < coll.size(); ++i) { 64 | freqs_size += coll[i].stats_freqs_size(); 65 | } 66 | docs_size = total_size - freqs_size; 67 | return size_tree->size; 68 | } 69 | 70 | template 71 | size_t get_size_stats(dict_freq_index& coll, 72 | uint64_t& docs_size, uint64_t& freqs_size) 73 | { 74 | auto size_tree = succinct::mapper::size_tree_of(coll); 75 | size_tree->dump(); 76 | uint64_t total_size = 0; 77 | for (auto const& node: size_tree->children) { 78 | if (node->name == "m_lists") { 79 | total_size = node->size; 80 | } 81 | } 82 | 83 | freqs_size = 0; 84 | for (size_t i = 0; i < coll.size(); ++i) { 85 | freqs_size += coll[i].stats_freqs_size(); 86 | } 87 | docs_size = total_size - freqs_size; 88 | return size_tree->size; 89 | } 90 | 91 | template 92 | void dump_stats(Collection& coll, 93 | std::string const& type, 94 | uint64_t postings) 95 | { 96 | 97 | uint64_t docs_size = 0, freqs_size = 0; 98 | size_t total_index_size = 99 | get_size_stats(coll, docs_size, freqs_size); 100 | 101 | double bits_per_doc = docs_size * 8.0 / postings; 102 | double bits_per_freq = freqs_size * 8.0 / postings; 103 | logger() << "Documents: " << docs_size << " bytes, " 104 | << bits_per_doc << " bits per element" << std::endl; 105 | logger() << "Frequencies: " << freqs_size << " bytes, " 106 | << bits_per_freq << " bits per element" << std::endl; 107 | logger() << "Index size: " 108 | << double(total_index_size) / (uint64_t(1) << 30) << " [GiB]" << std::endl; 109 | 110 | stats_line() 111 | ("type", type) 112 | ("size", total_index_size) 113 | ("docs_size", docs_size) 114 | ("freqs_size", freqs_size) 115 | 
("bits_per_doc", bits_per_doc) 116 | ("bits_per_freq", bits_per_freq) 117 | ; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /include/ds2i/indexed_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "compact_elias_fano.hpp" 6 | #include "compact_ranked_bitvector.hpp" 7 | #include "all_ones_sequence.hpp" 8 | #include "global_parameters.hpp" 9 | 10 | namespace ds2i { 11 | 12 | struct indexed_sequence { 13 | 14 | enum index_type { 15 | elias_fano = 0, 16 | ranked_bitvector = 1, 17 | all_ones = 2, 18 | 19 | index_types = 3 20 | }; 21 | 22 | static const uint64_t type_bits = 1; // all_ones is implicit 23 | 24 | static DS2I_FLATTEN_FUNC uint64_t 25 | bitsize(global_parameters const& params, uint64_t universe, uint64_t n) 26 | { 27 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 28 | 29 | uint64_t ef_cost = compact_elias_fano::bitsize(params, universe, n) + type_bits; 30 | if (ef_cost < best_cost) { 31 | best_cost = ef_cost; 32 | } 33 | 34 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(params, universe, n) + type_bits; 35 | if (rb_cost < best_cost) { 36 | best_cost = rb_cost; 37 | } 38 | 39 | return best_cost; 40 | } 41 | 42 | template 43 | static void write(succinct::bit_vector_builder& bvb, 44 | Iterator begin, 45 | uint64_t universe, uint64_t n, 46 | global_parameters const& params) 47 | { 48 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 49 | int best_type = all_ones; 50 | 51 | if (best_cost) { 52 | uint64_t ef_cost = compact_elias_fano::bitsize(params, universe, n) + type_bits; 53 | if (ef_cost < best_cost) { 54 | best_cost = ef_cost; 55 | best_type = elias_fano; 56 | } 57 | 58 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(params, universe, n) + type_bits; 59 | if (rb_cost < best_cost) { 60 | best_cost = rb_cost; 61 | best_type = ranked_bitvector; 62 | } 
63 | 64 | bvb.append_bits(best_type, type_bits); 65 | } 66 | 67 | 68 | switch (best_type) { 69 | case elias_fano: 70 | compact_elias_fano::write(bvb, begin, 71 | universe, n, 72 | params); 73 | break; 74 | case ranked_bitvector: 75 | compact_ranked_bitvector::write(bvb, begin, 76 | universe, n, 77 | params); 78 | break; 79 | case all_ones: 80 | all_ones_sequence::write(bvb, begin, 81 | universe, n, 82 | params); 83 | break; 84 | default: 85 | assert(false); 86 | } 87 | } 88 | 89 | class enumerator { 90 | public: 91 | 92 | typedef std::pair value_type; // (position, value) 93 | 94 | enumerator() 95 | {} 96 | 97 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 98 | uint64_t universe, uint64_t n, 99 | global_parameters const& params) 100 | { 101 | if (all_ones_sequence::bitsize(params, universe, n) == 0) { 102 | m_type = all_ones; 103 | } else { 104 | m_type = index_type(bv.get_word56(offset) 105 | & ((uint64_t(1) << type_bits) - 1)); 106 | } 107 | 108 | switch (m_type) { 109 | case elias_fano: 110 | m_ef_enumerator = compact_elias_fano::enumerator(bv, offset + type_bits, 111 | universe, n, 112 | params); 113 | break; 114 | case ranked_bitvector: 115 | m_rb_enumerator = compact_ranked_bitvector::enumerator(bv, offset + type_bits, 116 | universe, n, 117 | params); 118 | break; 119 | case all_ones: 120 | m_ao_enumerator = all_ones_sequence::enumerator(bv, offset + type_bits, 121 | universe, n, 122 | params); 123 | break; 124 | default: 125 | throw std::invalid_argument("Unsupported type"); 126 | } 127 | } 128 | 129 | #define ENUMERATOR_METHOD(RETURN_TYPE, METHOD, FORMALS, ACTUALS) \ 130 | RETURN_TYPE DS2I_FLATTEN_FUNC METHOD FORMALS \ 131 | { \ 132 | switch (__builtin_expect(m_type, elias_fano)) { \ 133 | case elias_fano: \ 134 | return m_ef_enumerator.METHOD ACTUALS; \ 135 | case ranked_bitvector: \ 136 | return m_rb_enumerator.METHOD ACTUALS; \ 137 | case all_ones: \ 138 | return m_ao_enumerator.METHOD ACTUALS; \ 139 | default: \ 140 | assert(false); \ 
141 | __builtin_unreachable(); \ 142 | } \ 143 | } \ 144 | /**/ 145 | 146 | // semicolons are redundant but they are needed to get emacs to 147 | // align the lines properly 148 | ENUMERATOR_METHOD(value_type, move, (uint64_t position), (position)); 149 | ENUMERATOR_METHOD(value_type, next_geq, (uint64_t lower_bound), (lower_bound)); 150 | ENUMERATOR_METHOD(value_type, next, (), ()); 151 | ENUMERATOR_METHOD(uint64_t, size, () const, ()); 152 | ENUMERATOR_METHOD(uint64_t, prev_value, () const, ()); 153 | 154 | #undef ENUMERATOR_METHOD 155 | #undef ENUMERATOR_VOID_METHOD 156 | 157 | private: 158 | index_type m_type; 159 | union { 160 | compact_elias_fano::enumerator m_ef_enumerator; 161 | compact_ranked_bitvector::enumerator m_rb_enumerator; 162 | all_ones_sequence::enumerator m_ao_enumerator; 163 | }; 164 | }; 165 | }; 166 | } 167 | -------------------------------------------------------------------------------- /include/ds2i/integer_codes.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace ds2i { 4 | 5 | // note: n can be 0 6 | void write_gamma(succinct::bit_vector_builder& bvb, uint64_t n) 7 | { 8 | uint64_t nn = n + 1; 9 | uint64_t l = succinct::broadword::msb(nn); 10 | uint64_t hb = uint64_t(1) << l; 11 | bvb.append_bits(hb, l + 1); 12 | bvb.append_bits(nn ^ hb, l); 13 | } 14 | 15 | void write_gamma_nonzero(succinct::bit_vector_builder& bvb, uint64_t n) 16 | { 17 | assert(n > 0); 18 | write_gamma(bvb, n - 1); 19 | } 20 | 21 | uint64_t read_gamma(succinct::bit_vector::enumerator& it) 22 | { 23 | uint64_t l = it.skip_zeros(); 24 | return (it.take(l) | (uint64_t(1) << l)) - 1; 25 | } 26 | 27 | uint64_t read_gamma_nonzero(succinct::bit_vector::enumerator& it) 28 | { 29 | return read_gamma(it) + 1; 30 | } 31 | 32 | void write_delta(succinct::bit_vector_builder& bvb, uint64_t n) 33 | { 34 | uint64_t nn = n + 1; 35 | uint64_t l = succinct::broadword::msb(nn); 36 | uint64_t hb = uint64_t(1) << l; 37 | 
write_gamma(bvb, l); 38 | bvb.append_bits(nn ^ hb, l); 39 | } 40 | 41 | uint64_t read_delta(succinct::bit_vector::enumerator& it) 42 | { 43 | uint64_t l = read_gamma(it); 44 | return (it.take(l) | (uint64_t(1) << l)) - 1; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /include/ds2i/interpolative_coding.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "succinct/broadword.hpp" 7 | 8 | namespace ds2i { 9 | 10 | class bit_writer { 11 | public: 12 | bit_writer(std::vector& buf) 13 | : m_buf(buf) 14 | , m_size(0) 15 | , m_cur_word(nullptr) 16 | { 17 | m_buf.clear(); 18 | } 19 | 20 | void write(uint32_t bits, uint32_t len) 21 | { 22 | if (!len) return; 23 | uint32_t pos_in_word = m_size % 32; 24 | m_size += len; 25 | if (pos_in_word == 0) { 26 | m_buf.push_back(bits); 27 | } else { 28 | *m_cur_word |= bits << pos_in_word; 29 | if (len > 32 - pos_in_word) { 30 | m_buf.push_back(bits >> (32 - pos_in_word)); 31 | } 32 | } 33 | m_cur_word = &m_buf.back(); 34 | } 35 | 36 | size_t size() const { 37 | return m_size; 38 | } 39 | 40 | void write_int(uint32_t val, uint32_t u) 41 | { 42 | assert(u > 0); 43 | assert(val < u); 44 | auto b = succinct::broadword::msb(u); 45 | uint64_t m = (uint64_t(1) << (b + 1)) - u; 46 | 47 | if (val < m) { 48 | write(val, b); 49 | } else { 50 | val += m; 51 | // since we use little-endian we must split the writes 52 | write(val >> 1, b); 53 | write(val & 1, 1); 54 | } 55 | } 56 | 57 | void write_interpolative(uint32_t const* in, 58 | size_t n, 59 | uint32_t low, 60 | uint32_t high) 61 | { 62 | if (!n) return; 63 | assert(low <= high); 64 | 65 | size_t h = n / 2; 66 | uint32_t val = in[h]; 67 | write_int(val - low, high - low + 1); 68 | write_interpolative(in, h, low, val); 69 | write_interpolative(in + h + 1, n - h - 1, val, high); 70 | } 71 | 72 | 73 | private: 74 | std::vector& m_buf; 75 | size_t 
m_size; 76 | uint32_t* m_cur_word; 77 | }; 78 | 79 | class bit_reader { 80 | public: 81 | bit_reader(uint32_t const* in) 82 | : m_in(in) 83 | , m_avail(0) 84 | , m_buf(0) 85 | , m_pos(0) 86 | {} 87 | 88 | size_t position() const 89 | { 90 | return m_pos; 91 | } 92 | 93 | uint32_t read(uint32_t len) 94 | { 95 | if (!len) return 0; 96 | 97 | if (m_avail < len) { 98 | m_buf |= uint64_t(*m_in++) << m_avail; 99 | m_avail += 32; 100 | } 101 | uint32_t val = m_buf & ((uint64_t(1) << len) - 1); 102 | m_buf >>= len; 103 | m_avail -= len; 104 | m_pos += len; 105 | 106 | return val; 107 | } 108 | 109 | uint32_t read_int(uint32_t u) 110 | { 111 | assert(u > 0); 112 | auto b = succinct::broadword::msb(u); 113 | uint64_t m = (uint64_t(1) << (b + 1)) - u; 114 | 115 | uint32_t val = read(b); 116 | if (val >= m) { 117 | val = (val << 1) + read(1) - m; 118 | } 119 | 120 | assert(val < u); 121 | return val; 122 | } 123 | 124 | void read_interpolative(uint32_t* out, 125 | size_t n, 126 | uint32_t low, 127 | uint32_t high) 128 | { 129 | assert(low <= high); 130 | assert(n > 0); 131 | 132 | size_t h = n / 2; 133 | uint32_t val = low + read_int(high - low + 1); 134 | out[h] = val; 135 | if (n == 1) { 136 | // optimization to avoid two unpredictable ifs 137 | return; 138 | } 139 | // the two ifs are a bit ugly but it is faster than postponing them 140 | if (h) { 141 | read_interpolative(out, h, low, val); 142 | } 143 | if (n - h - 1) { 144 | read_interpolative(out + h + 1, n - h - 1, val, high); 145 | } 146 | } 147 | 148 | private: 149 | uint32_t const* m_in; 150 | uint32_t m_avail; 151 | uint64_t m_buf; 152 | size_t m_pos; 153 | }; 154 | 155 | 156 | 157 | } 158 | -------------------------------------------------------------------------------- /include/ds2i/optimal_partition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "util.hpp" 7 | 8 | namespace ds2i { 9 | 10 | typedef 
uint32_t posting_t ; 11 | typedef uint64_t cost_t; 12 | 13 | struct optimal_partition { 14 | 15 | std::vector partition; 16 | cost_t cost_opt = 0; // the costs are in bits! 17 | 18 | template 19 | struct cost_window { 20 | // a window reppresent the cost of the interval [start, end) 21 | 22 | ForwardIterator start_it; 23 | ForwardIterator end_it; 24 | // starting and ending position of the window 25 | posting_t start = 0; 26 | posting_t end = 0; // end-th position is not in the current window 27 | posting_t min_p = 0; // element that preceed the first element of the window 28 | posting_t max_p = 0; 29 | 30 | cost_t cost_upper_bound; // The maximum cost for this window 31 | 32 | cost_window(ForwardIterator begin, cost_t cost_upper_bound) 33 | : start_it(begin) 34 | , end_it(begin) 35 | , min_p(*begin) 36 | , max_p(0) 37 | , cost_upper_bound(cost_upper_bound) 38 | {} 39 | 40 | uint64_t universe() const 41 | { 42 | return max_p - min_p + 1; 43 | } 44 | 45 | uint64_t size() const 46 | { 47 | return end - start; 48 | } 49 | 50 | void advance_start() 51 | { 52 | min_p = *start_it + 1; 53 | ++start; 54 | ++start_it; 55 | } 56 | 57 | void advance_end() 58 | { 59 | max_p = *end_it; 60 | ++end; 61 | ++end_it; 62 | } 63 | 64 | }; 65 | 66 | optimal_partition() 67 | {} 68 | 69 | template 70 | optimal_partition(ForwardIterator begin, uint64_t universe, uint64_t size, 71 | CostFunction cost_fun, double eps1, double eps2) 72 | { 73 | cost_t single_block_cost = cost_fun(universe, size); 74 | std::vector min_cost(size+1, single_block_cost); 75 | min_cost[0] = 0; 76 | 77 | // create the required window: one for each power of approx_factor 78 | std::vector> windows; 79 | cost_t cost_lb = cost_fun(1, 1); // minimum cost 80 | cost_t cost_bound = cost_lb; 81 | while (eps1 == 0 || cost_bound < cost_lb / eps1) { 82 | windows.emplace_back(begin, cost_bound); 83 | if (cost_bound >= single_block_cost) break; 84 | cost_bound = cost_bound * (1 + eps2); 85 | } 86 | 87 | std::vector path(size + 
1, 0); 88 | for (posting_t i = 0; i < size; i++) { 89 | size_t last_end = i + 1; 90 | for (auto& window: windows) { 91 | 92 | assert(window.start == i); 93 | while (window.end < last_end) { 94 | window.advance_end(); 95 | } 96 | 97 | cost_t window_cost; 98 | while (true) { 99 | window_cost = cost_fun(window.universe(), window.size()); 100 | if ((min_cost[i] + window_cost < min_cost[window.end])) { 101 | min_cost[window.end] = min_cost[i] + window_cost; 102 | path[window.end] = i; 103 | } 104 | last_end = window.end; 105 | if (window.end == size) break; 106 | if (window_cost >= window.cost_upper_bound) break; 107 | window.advance_end(); 108 | } 109 | 110 | window.advance_start(); 111 | } 112 | } 113 | 114 | posting_t curr_pos = size; 115 | while( curr_pos != 0 ) { 116 | partition.push_back(curr_pos); 117 | curr_pos = path[curr_pos]; 118 | } 119 | std::reverse(partition.begin(), partition.end()); 120 | cost_opt = min_cost[size]; 121 | } 122 | }; 123 | 124 | } 125 | -------------------------------------------------------------------------------- /include/ds2i/positive_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "global_parameters.hpp" 4 | #include "strict_sequence.hpp" 5 | #include "util.hpp" 6 | 7 | namespace ds2i { 8 | 9 | template 10 | struct positive_sequence { 11 | 12 | typedef BaseSequence base_sequence_type; 13 | typedef typename base_sequence_type::enumerator base_sequence_enumerator; 14 | 15 | template 16 | static void write(succinct::bit_vector_builder& bvb, 17 | Iterator begin, 18 | uint64_t universe, uint64_t n, 19 | global_parameters const& params) 20 | { 21 | assert(n > 0); 22 | auto cumulative_begin = 23 | make_function_iterator(std::make_pair(uint64_t(*begin), begin), 24 | [](std::pair& state) { 25 | state.first += *++state.second; 26 | }, [](std::pair const& state) { 27 | return state.first; 28 | }); 29 | base_sequence_type::write(bvb, cumulative_begin, universe, n, 
params); 30 | } 31 | 32 | class enumerator { 33 | public: 34 | 35 | typedef std::pair value_type; // (position, value) 36 | 37 | enumerator() 38 | {} 39 | 40 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 41 | uint64_t universe, uint64_t n, 42 | global_parameters const& params) 43 | : m_base_enum(bv, offset, universe, n, params) 44 | , m_position(m_base_enum.size()) 45 | {} 46 | 47 | value_type move(uint64_t position) { 48 | // we cache m_position and m_cur to avoid the call overhead in 49 | // the most common cases 50 | uint64_t prev = m_cur; 51 | if (position != m_position + 1) { 52 | if (DS2I_UNLIKELY(position == 0)) { 53 | // we need to special-case position 0 54 | m_cur = m_base_enum.move(0).second; 55 | m_position = 0; 56 | return value_type(m_position, m_cur); 57 | } 58 | prev = m_base_enum.move(position - 1).second; 59 | } 60 | 61 | m_cur = m_base_enum.next().second; 62 | m_position = position; 63 | return value_type(position, m_cur - prev); 64 | } 65 | 66 | base_sequence_enumerator const& base() const { 67 | return m_base_enum; 68 | } 69 | 70 | value_type next() { 71 | return m_base_enum.next(); 72 | } 73 | 74 | private: 75 | base_sequence_enumerator m_base_enum; 76 | uint64_t m_position; 77 | uint64_t m_cur; 78 | }; 79 | }; 80 | } 81 | -------------------------------------------------------------------------------- /include/ds2i/semiasync_queue.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "configuration.hpp" 8 | #include "util.hpp" 9 | 10 | namespace ds2i { 11 | 12 | class semiasync_queue { 13 | public: 14 | semiasync_queue(double work_per_thread) 15 | : m_expected_work(0) 16 | , m_work_per_thread(work_per_thread) 17 | { 18 | m_max_threads = configuration::get().worker_threads; 19 | // logger() << "semiasync_queue using " << m_max_threads 20 | // << " worker threads" << std::endl; 21 | } 22 | 23 | class job { 24 | public: 25 
| virtual void prepare() = 0; 26 | virtual void commit() = 0; 27 | }; 28 | 29 | typedef std::shared_ptr job_ptr_type; 30 | 31 | void add_job(job_ptr_type j, double expected_work) 32 | { 33 | if (m_max_threads) { 34 | m_next_thread.first.push_back(j); 35 | m_expected_work += expected_work; 36 | if (m_expected_work >= m_work_per_thread) { 37 | spawn_next_thread(); 38 | } 39 | } else { // all in main thread 40 | j->prepare(); 41 | j->commit(); 42 | j.reset(); 43 | } 44 | } 45 | 46 | void complete() { 47 | if (!m_next_thread.first.empty()) { 48 | spawn_next_thread(); 49 | } 50 | while (!m_running_threads.empty()) { 51 | commit_thread(); 52 | } 53 | } 54 | 55 | private: 56 | 57 | void spawn_next_thread() 58 | { 59 | if (m_running_threads.size() == m_max_threads) { 60 | commit_thread(); 61 | } 62 | 63 | m_running_threads.emplace_back(); 64 | std::swap(m_next_thread, m_running_threads.back()); 65 | 66 | std::vector const& cur_queue = m_running_threads.back().first; 67 | m_running_threads.back().second = std::thread([&]() { 68 | for (auto const& j: cur_queue) { 69 | j->prepare(); 70 | } 71 | }); 72 | 73 | m_expected_work = 0; 74 | } 75 | 76 | void commit_thread() 77 | { 78 | assert(!m_running_threads.empty()); 79 | m_running_threads.front().second.join(); 80 | for (auto& j: m_running_threads.front().first) { 81 | j->commit(); 82 | j.reset(); 83 | } 84 | m_running_threads.pop_front(); 85 | } 86 | 87 | typedef std::pair, std::thread> thread_t; 88 | thread_t m_next_thread; 89 | std::deque m_running_threads; 90 | 91 | size_t m_expected_work; 92 | double m_work_per_thread; 93 | size_t m_max_threads; 94 | }; 95 | 96 | } 97 | -------------------------------------------------------------------------------- /include/ds2i/sequence_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "bitvector_collection.hpp" 4 | #include "compact_elias_fano.hpp" 5 | #include "integer_codes.hpp" 6 | #include 
"global_parameters.hpp" 7 | #include "semiasync_queue.hpp" 8 | 9 | namespace ds2i { 10 | 11 | template 12 | class sequence_collection { 13 | public: 14 | typedef typename IndexedSequence::enumerator enumerator_type; 15 | 16 | sequence_collection() 17 | {} 18 | 19 | class builder { 20 | public: 21 | builder(global_parameters const& params) 22 | : m_queue(1 << 24) 23 | , m_params(params) 24 | , m_sequences(params) 25 | {} 26 | 27 | template 28 | void add_sequence(Iterator begin, uint64_t last_element, uint64_t n) 29 | { 30 | if (!n) throw std::invalid_argument("Sequence must be nonempty"); 31 | 32 | // make_shared does not seem to work 33 | std::shared_ptr> 34 | ptr(new sequence_adder(*this, begin, last_element, n)); 35 | m_queue.add_job(ptr, n); 36 | } 37 | 38 | void build(sequence_collection& sq) 39 | { 40 | m_queue.complete(); 41 | sq.m_params = m_params; 42 | m_sequences.build(sq.m_sequences); 43 | } 44 | 45 | private: 46 | 47 | template 48 | struct sequence_adder : semiasync_queue::job { 49 | sequence_adder(builder& b, 50 | Iterator begin, 51 | uint64_t last_element, 52 | uint64_t n) 53 | : b(b) 54 | , begin(begin) 55 | , last_element(last_element) 56 | , n(n) 57 | {} 58 | 59 | virtual void prepare() 60 | { 61 | // store approximation of the universe as smallest power of two 62 | // that can represent last_element 63 | uint64_t universe_bits = ceil_log2(last_element); 64 | write_gamma(bits, universe_bits); 65 | write_gamma_nonzero(bits, n); 66 | IndexedSequence::write(bits, begin, 67 | (uint64_t(1) << universe_bits) + 1, n, 68 | b.m_params); 69 | } 70 | 71 | virtual void commit() 72 | { 73 | b.m_sequences.append(bits); 74 | } 75 | 76 | builder& b; 77 | Iterator begin; 78 | uint64_t last_element; 79 | uint64_t n; 80 | succinct::bit_vector_builder bits; 81 | }; 82 | 83 | semiasync_queue m_queue; 84 | global_parameters m_params; 85 | bitvector_collection::builder m_sequences; 86 | }; 87 | 88 | size_t size() const 89 | { 90 | return m_sequences.size(); 91 | } 92 | 
93 | enumerator_type operator[](size_t i) const 94 | { 95 | assert(i < size()); 96 | auto it = m_sequences.get(m_params, i); 97 | uint64_t universe_bits = read_gamma(it); 98 | uint64_t n = read_gamma_nonzero(it); 99 | 100 | return enumerator_type(m_sequences.bits(), it.position(), 101 | (uint64_t(1) << universe_bits) + 1, n, 102 | m_params); 103 | } 104 | 105 | void swap(sequence_collection& other) 106 | { 107 | std::swap(m_params, other.m_params); 108 | std::swap(m_size, other.m_size); 109 | m_sequences.swap(other.m_sequences); 110 | } 111 | 112 | template 113 | void map(Visitor& visit) 114 | { 115 | visit 116 | (m_params, "m_params") 117 | (m_size, "m_size") 118 | (m_sequences, "m_sequences") 119 | ; 120 | } 121 | 122 | private: 123 | global_parameters m_params; 124 | size_t m_size; 125 | bitvector_collection m_sequences; 126 | }; 127 | } 128 | -------------------------------------------------------------------------------- /include/ds2i/strict_elias_fano.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "compact_elias_fano.hpp" 6 | #include "util.hpp" 7 | 8 | namespace ds2i { 9 | 10 | struct strict_elias_fano { 11 | 12 | static DS2I_FLATTEN_FUNC uint64_t 13 | bitsize(global_parameters const& params, uint64_t universe, uint64_t n) 14 | { 15 | assert(universe >= n); 16 | return compact_elias_fano::bitsize(params, universe - n + 1, n); 17 | } 18 | 19 | template 20 | static void write(succinct::bit_vector_builder& bvb, 21 | Iterator begin, 22 | uint64_t universe, uint64_t n, 23 | global_parameters const& params) 24 | { 25 | uint64_t new_universe = universe - n + 1; 26 | typedef typename std::iterator_traits::value_type value_type; 27 | auto new_begin = 28 | make_function_iterator(std::make_pair(value_type(0), begin), 29 | [](std::pair& state) { 30 | ++state.first; 31 | ++state.second; 32 | }, [](std::pair const& state) { 33 | return *state.second - state.first; 34 | }); 35 | 
compact_elias_fano::write(bvb, new_begin, new_universe, n, params); 36 | } 37 | 38 | class enumerator { 39 | public: 40 | 41 | typedef std::pair value_type; // (position, value) 42 | 43 | enumerator() 44 | {} 45 | 46 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 47 | uint64_t universe, uint64_t n, 48 | global_parameters const& params) 49 | : m_ef_enum(bv, offset, universe - n + 1, n, params) 50 | {} 51 | 52 | value_type move(uint64_t position) 53 | { 54 | auto val = m_ef_enum.move(position); 55 | return value_type(val.first, val.second + val.first); 56 | } 57 | 58 | value_type next() 59 | { 60 | auto val = m_ef_enum.next(); 61 | return value_type(val.first, val.second + val.first); 62 | } 63 | 64 | uint64_t size() const 65 | { 66 | return m_ef_enum.size(); 67 | } 68 | 69 | uint64_t prev_value() const 70 | { 71 | if (m_ef_enum.position()) { 72 | return m_ef_enum.prev_value() + m_ef_enum.position() - 1; 73 | } else { 74 | return 0; 75 | } 76 | } 77 | 78 | private: 79 | compact_elias_fano::enumerator m_ef_enum; 80 | }; 81 | 82 | }; 83 | } 84 | -------------------------------------------------------------------------------- /include/ds2i/strict_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "strict_elias_fano.hpp" 6 | #include "compact_ranked_bitvector.hpp" 7 | #include "all_ones_sequence.hpp" 8 | #include "global_parameters.hpp" 9 | 10 | namespace ds2i { 11 | 12 | struct strict_sequence { 13 | 14 | enum index_type { 15 | elias_fano = 0, 16 | ranked_bitvector = 1, 17 | all_ones = 2, 18 | 19 | index_types = 3 20 | }; 21 | 22 | static const uint64_t type_bits = 1; // all_ones is implicit 23 | 24 | static global_parameters strict_params(global_parameters params) 25 | { 26 | // we do not need to index the zeros 27 | params.ef_log_sampling0 = 63; 28 | params.rb_log_rank1_sampling = 63; 29 | return params; 30 | } 31 | 32 | static DS2I_FLATTEN_FUNC uint64_t 33 | 
bitsize(global_parameters const& params, uint64_t universe, uint64_t n) 34 | { 35 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 36 | auto sparams = strict_params(params); 37 | 38 | uint64_t ef_cost = strict_elias_fano::bitsize(sparams, universe, n) + type_bits; 39 | if (ef_cost < best_cost) { 40 | best_cost = ef_cost; 41 | } 42 | 43 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(sparams, universe, n) + type_bits; 44 | if (rb_cost < best_cost) { 45 | best_cost = rb_cost; 46 | } 47 | 48 | return best_cost; 49 | } 50 | 51 | template 52 | static void write(succinct::bit_vector_builder& bvb, 53 | Iterator begin, 54 | uint64_t universe, uint64_t n, 55 | global_parameters const& params) 56 | { 57 | auto sparams = strict_params(params); 58 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 59 | int best_type = all_ones; 60 | 61 | if (best_cost) { 62 | uint64_t ef_cost = strict_elias_fano::bitsize(sparams, universe, n) + type_bits; 63 | if (ef_cost < best_cost) { 64 | best_cost = ef_cost; 65 | best_type = elias_fano; 66 | } 67 | 68 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(sparams, universe, n) + type_bits; 69 | if (rb_cost < best_cost) { 70 | best_cost = rb_cost; 71 | best_type = ranked_bitvector; 72 | } 73 | 74 | bvb.append_bits(best_type, type_bits); 75 | } 76 | 77 | switch (best_type) { 78 | case elias_fano: 79 | strict_elias_fano::write(bvb, begin, 80 | universe, n, 81 | sparams); 82 | break; 83 | case ranked_bitvector: 84 | compact_ranked_bitvector::write(bvb, begin, 85 | universe, n, 86 | sparams); 87 | break; 88 | case all_ones: 89 | all_ones_sequence::write(bvb, begin, 90 | universe, n, 91 | sparams); 92 | break; 93 | default: 94 | assert(false); 95 | } 96 | } 97 | 98 | class enumerator { 99 | public: 100 | 101 | typedef std::pair value_type; // (position, value) 102 | 103 | enumerator() 104 | {} 105 | 106 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 107 | uint64_t universe, 
uint64_t n, 108 | global_parameters const& params) 109 | { 110 | auto sparams = strict_params(params); 111 | 112 | if (all_ones_sequence::bitsize(params, universe, n) == 0) { 113 | m_type = all_ones; 114 | } else { 115 | m_type = index_type(bv.get_word56(offset) 116 | & ((uint64_t(1) << type_bits) - 1)); 117 | } 118 | 119 | switch (m_type) { 120 | case elias_fano: 121 | m_ef_enumerator = strict_elias_fano::enumerator(bv, offset + type_bits, 122 | universe, n, 123 | sparams); 124 | break; 125 | case ranked_bitvector: 126 | m_rb_enumerator = compact_ranked_bitvector::enumerator(bv, offset + type_bits, 127 | universe, n, 128 | sparams); 129 | break; 130 | case all_ones: 131 | m_ao_enumerator = all_ones_sequence::enumerator(bv, offset + type_bits, 132 | universe, n, 133 | sparams); 134 | break; 135 | default: 136 | throw std::invalid_argument("Unsupported type"); 137 | } 138 | } 139 | 140 | #define ENUMERATOR_METHOD(RETURN_TYPE, METHOD, FORMALS, ACTUALS) \ 141 | RETURN_TYPE DS2I_FLATTEN_FUNC METHOD FORMALS \ 142 | { \ 143 | switch (__builtin_expect(m_type, elias_fano)) { \ 144 | case elias_fano: \ 145 | return m_ef_enumerator.METHOD ACTUALS; \ 146 | case ranked_bitvector: \ 147 | return m_rb_enumerator.METHOD ACTUALS; \ 148 | case all_ones: \ 149 | return m_ao_enumerator.METHOD ACTUALS; \ 150 | default: \ 151 | assert(false); \ 152 | __builtin_unreachable(); \ 153 | } \ 154 | } \ 155 | /**/ 156 | 157 | // semicolons are redundant but they are needed to get emacs to 158 | // align the lines properly 159 | ENUMERATOR_METHOD(value_type, move, (uint64_t position), (position)); 160 | ENUMERATOR_METHOD(value_type, next, (), ()); 161 | ENUMERATOR_METHOD(uint64_t, size, () const, ()); 162 | ENUMERATOR_METHOD(uint64_t, prev_value, () const, ()); 163 | 164 | #undef ENUMERATOR_METHOD 165 | #undef ENUMERATOR_VOID_METHOD 166 | 167 | private: 168 | index_type m_type; 169 | union { 170 | strict_elias_fano::enumerator m_ef_enumerator; 171 | compact_ranked_bitvector::enumerator 
m_rb_enumerator; 172 | all_ones_sequence::enumerator m_ao_enumerator; 173 | }; 174 | }; 175 | }; 176 | } 177 | -------------------------------------------------------------------------------- /include/ds2i/verify_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "util.hpp" 5 | 6 | namespace ds2i { 7 | template 8 | void verify_collection(InputCollection const& input, const char* filename) 9 | { 10 | Collection coll; 11 | boost::iostreams::mapped_file_source m(filename); 12 | succinct::mapper::map(coll, m); 13 | 14 | logger() << "Checking the written data, just to be extra safe..." << std::endl; 15 | size_t s = 0; 16 | for (auto seq: input) { 17 | auto e = coll[s]; 18 | if (e.size() != seq.docs.size()) { 19 | logger() << "sequence " << s 20 | << " has wrong length! (" 21 | << e.size() << " != " << seq.docs.size() << ")" 22 | ; 23 | exit(1); 24 | } 25 | 26 | for (size_t i = 0; i < e.size(); ++i, e.next()) { 27 | uint64_t docid = *(seq.docs.begin() + i); 28 | uint64_t freq = *(seq.freqs.begin() + i); 29 | 30 | if (docid != e.docid()) { 31 | logger() << "docid in sequence " << s 32 | << " differs at position " << i << "!"; 33 | logger() << e.docid() << " != " << docid; 34 | logger() << "sequence length: " << seq.docs.size(); 35 | 36 | exit(1); 37 | } 38 | 39 | if (freq != e.freq()) { 40 | logger() << "freq in sequence " << s 41 | << " differs at position " << i << "!"; 42 | logger() << e.freq() << " != " << freq; 43 | logger() << "sequence length: " << seq.docs.size(); 44 | 45 | exit(1); 46 | } 47 | } 48 | 49 | s += 1; 50 | } 51 | logger() << "Everything is OK!" 
<< std::endl; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /include/ds2i/wand_data.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "binary_freq_collection.hpp" 6 | #include "bm25.hpp" 7 | #include "util.hpp" 8 | 9 | namespace ds2i { 10 | 11 | template 12 | class wand_data { 13 | public: 14 | wand_data() 15 | {} 16 | 17 | template 18 | wand_data(LengthsIterator len_it, uint64_t num_docs, 19 | binary_freq_collection const& coll) 20 | { 21 | std::vector norm_lens(num_docs); 22 | double lens_sum = 0; 23 | logger() << "Reading sizes..."; 24 | for (size_t i = 0; i < num_docs; ++i) { 25 | float len = *len_it++; 26 | norm_lens[i] = len; 27 | lens_sum += len; 28 | } 29 | float avg_len = float(lens_sum / double(num_docs)); 30 | for (size_t i = 0; i < num_docs; ++i) { 31 | norm_lens[i] /= avg_len; 32 | } 33 | 34 | logger() << "Storing max weight for each list..."; 35 | std::vector max_term_weight; 36 | for (auto const& seq: coll) { 37 | float max_score = 0; 38 | for (size_t i = 0; i < seq.docs.size(); ++i) { 39 | uint64_t docid = *(seq.docs.begin() + i); 40 | uint64_t freq = *(seq.freqs.begin() + i); 41 | float score = Scorer::doc_term_weight(freq, norm_lens[docid]); 42 | max_score = std::max(max_score, score); 43 | } 44 | max_term_weight.push_back(max_score); 45 | if ((max_term_weight.size() % 1000000) == 0) { 46 | logger() << max_term_weight.size() << " list processed"; 47 | } 48 | } 49 | logger() << max_term_weight.size() << " list processed"; 50 | 51 | m_norm_lens.steal(norm_lens); 52 | m_max_term_weight.steal(max_term_weight); 53 | } 54 | 55 | float norm_len(uint64_t doc_id) const 56 | { 57 | return m_norm_lens[doc_id]; 58 | } 59 | 60 | float max_term_weight(uint64_t term_id) const 61 | { 62 | return m_max_term_weight[term_id]; 63 | } 64 | 65 | void swap(wand_data& other) 66 | { 67 | m_norm_lens.swap(other.m_norm_lens); 68 | 
m_max_term_weight.swap(other.m_max_term_weight); 69 | } 70 | 71 | template 72 | void map(Visitor& visit) 73 | { 74 | visit 75 | (m_norm_lens, "m_norm_lens") 76 | (m_max_term_weight, "m_max_term_weight") 77 | ; 78 | } 79 | 80 | private: 81 | succinct::mapper::mappable_vector m_norm_lens; 82 | succinct::mapper::mappable_vector m_max_term_weight; 83 | }; 84 | 85 | } 86 | -------------------------------------------------------------------------------- /include/index_types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "freq_index.hpp" 8 | #include "positive_sequence.hpp" 9 | #include "partitioned_sequence.hpp" 10 | #include "uniform_partitioned_sequence.hpp" 11 | #include "binary_freq_collection.hpp" 12 | #include "block_freq_index.hpp" 13 | #include "block_codecs.hpp" 14 | #include "mixed_block.hpp" 15 | 16 | #include "dint_configuration.hpp" 17 | #include "dint_codecs.hpp" 18 | #include "dictionary_types.hpp" 19 | #include "dictionary_builders.hpp" 20 | #include "block_statistics.hpp" 21 | #include "dict_freq_index.hpp" 22 | 23 | namespace ds2i { 24 | 25 | typedef freq_index> 26 | ef_index; 27 | 28 | typedef freq_index> single_index; 29 | 30 | typedef freq_index< 31 | uniform_partitioned_sequence<>, 32 | positive_sequence>> 33 | uniform_index; 34 | 35 | typedef freq_index, 36 | positive_sequence>> 37 | opt_index; 38 | 39 | typedef block_freq_index block_optpfor_index; 40 | typedef block_freq_index block_varintg8iu_index; 41 | typedef block_freq_index block_interpolative_index; 42 | typedef block_freq_index block_qmx_index; 43 | typedef block_freq_index block_mixed_index; 44 | typedef block_freq_index block_u32_index; 45 | typedef block_freq_index block_vbyte_index; 46 | typedef block_freq_index block_simple16_index; 47 | typedef block_freq_index block_varintgb_index; 48 | typedef block_freq_index block_maskedvbyte_index; 49 | typedef block_freq_index 
block_streamvbyte_index; 50 | 51 | // DINT indexes 52 | 53 | // collector type 54 | using adjusted_collector_type = adjusted; 55 | 56 | // statistic types 57 | using adjusted_block_stats_type = block_statistics; 58 | using adjusted_block_multi_stats_type = 59 | block_multi_statistics; 60 | 61 | // dictionary_builders 62 | using single_rectangular_builder = 63 | decreasing_static_frequencies; 65 | 66 | using single_packed_builder = 67 | decreasing_static_frequencies; 69 | 70 | using multi_packed_builder = 71 | decreasing_static_frequencies; 73 | 74 | // DINT configurations (all use optimal block parsing) 75 | using single_rect_dint_index = 76 | dict_freq_index; 77 | using single_packed_dint_index = 78 | dict_freq_index; 79 | using multi_packed_dint_index = 80 | dict_freq_index; 81 | } // namespace ds2i 82 | 83 | #define DS2I_INDEX_TYPES \ 84 | (ef)(single)(uniform)(opt)(block_optpfor)(block_varintg8iu)( \ 85 | block_interpolative)(block_qmx)(block_mixed)(block_u32)(block_vbyte)( \ 86 | block_simple16)(block_varintgb)(block_maskedvbyte)(block_streamvbyte)( \ 87 | single_rect_dint)(single_packed_dint)(multi_packed_dint) 88 | #define DS2I_BLOCK_INDEX_TYPES \ 89 | (block_optpfor)(block_varintg8iu)(block_interpolative)(block_qmx)( \ 90 | block_mixed)(block_u32)(block_vbyte)(block_simple16)(block_varintgb)( \ 91 | block_maskedvbyte)(block_streamvbyte) -------------------------------------------------------------------------------- /include/util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "succinct/broadword.hpp" 17 | 18 | #define DS2I_LIKELY(x) __builtin_expect(!!(x), 1) 19 | #define DS2I_UNLIKELY(x) __builtin_expect(!!(x), 0) 20 | #define DS2I_NOINLINE __attribute__((noinline)) 21 | #define DS2I_ALWAYSINLINE 
// Returns true iff exactly one bit of x is set (1, 2, 4, 8, ...).
//
// Zero is rejected explicitly: the bare (x & (x - 1)) == 0 test accepts it,
// because 0 - 1 wraps around to all ones and the AND is still zero.
constexpr bool is_power_of_two(uint64_t x) {
    return x != 0 && (x & (x - 1)) == 0;
}
// Writes a "YYYY-MM-DD HH:MM:SS: " prefix to std::cerr and returns the stream
// so that the caller can append the message with operator<<.
inline std::ostream& logger() {
    static const char fmt[] = "%F %T";
    const std::time_t now = std::time(nullptr);

    // localtime_r fills a caller-owned struct. std::localtime hands back a
    // pointer to shared static storage, which is unsafe if logger() is ever
    // called from more than one thread.
    std::tm local_time;
    if (localtime_r(&now, &local_time) != nullptr) {
        const std::locale loc;
        const std::time_put<char>& formatter =
            std::use_facet<std::time_put<char>>(loc);
        // sizeof(fmt) - 1 is the format length without the terminating NUL.
        formatter.put(std::cerr, std::cerr, ' ', &local_time, fmt,
                      fmt + sizeof(fmt) - 1);
    }
    return std::cerr << ": ";
}
// Prints one JSON-like object on std::cout: "{" on construction, a
// `"key": value` pair for every operator() call, "}" plus a newline on
// destruction. Typical use is a chained temporary:
//
//     stats_line()("type", t)("avg", x);
struct stats_line {
    stats_line() : first(true) {
        std::cout << "{";
    }

    ~stats_line() {
        std::cout << "}" << std::endl;
    }

    // Appends `key: value`; every pair but the first is preceded by ", ".
    template <typename K, typename T>
    stats_line& operator()(K const& key, T const& value) {
        if (first) {
            first = false;
        } else {
            std::cout << ", ";
        }

        emit(key);
        std::cout << ": ";
        emit(value);
        return *this;
    }

    // Lets an object append its own fields; obj.dump(stats_line&) must return
    // the stats_line it was given so that the chain can continue.
    template <typename T>
    stats_line& operator()(T const& obj) {
        return obj.dump(*this);
    }

private:
    // Fallback: anything streamable is written verbatim (numbers, ...).
    template <typename T>
    void emit(T const& v) const {
        std::cout << v;
    }

    // Strings are quoted, and the characters that would terminate or corrupt
    // a quoted string (quote, backslash, control characters) are escaped.
    void emit(const char* s) const {
        static const char hex[] = "0123456789abcdef";
        std::cout << '"';
        for (; *s != '\0'; ++s) {
            const unsigned char c = static_cast<unsigned char>(*s);
            switch (c) {
                case '"':
                    std::cout << "\\\"";
                    break;
                case '\\':
                    std::cout << "\\\\";
                    break;
                case '\n':
                    std::cout << "\\n";
                    break;
                case '\r':
                    std::cout << "\\r";
                    break;
                case '\t':
                    std::cout << "\\t";
                    break;
                default:
                    if (c < 0x20) {
                        // remaining control characters: \u00XX
                        std::cout << "\\u00" << hex[c >> 4] << hex[c & 0xF];
                    } else {
                        std::cout << *s;
                    }
                    break;
            }
        }
        std::cout << '"';
    }

    void emit(std::string const& s) const {
        emit(s.c_str());
    }

    // Vectors become "[a, b, ...]".
    template <typename T>
    void emit(std::vector<T> const& v) const {
        std::cout << "[";
        bool first_elem = true;
        for (auto const& elem : v) {
            if (!first_elem) {
                std::cout << ", ";
            }
            first_elem = false;
            emit(elem);
        }
        std::cout << "]";
    }

    // Maps are printed as a list of [key, value] pairs, in map order.
    template <typename K, typename V>
    void emit(std::map<K, V> const& m) const {
        std::vector<std::pair<K, V>> v(m.begin(), m.end());
        emit(v);
    }

    // Recursive case: emit elements 0..Pos-1, then ", " and element Pos.
    template <typename Tuple, size_t Pos>
    typename std::enable_if<Pos != 0, void>::type emit_tuple_helper(
        Tuple const& t) const {
        emit_tuple_helper<Tuple, Pos - 1>(t);
        std::cout << ", ";
        emit(std::get<Pos>(t));
    }

    // Base case: the first element has no leading separator.
    template <typename Tuple, size_t Pos>
    typename std::enable_if<Pos == 0, void>::type emit_tuple_helper(
        Tuple const& t) const {
        emit(std::get<0>(t));
    }

    // Tuples become "[e0, e1, ...]"; at least one element is required.
    template <typename... Tp>
    void emit(std::tuple<Tp...> const& t) const {
        std::cout << "[";
        emit_tuple_helper<std::tuple<Tp...>, sizeof...(Tp) - 1>(t);
        std::cout << "]";
    }

    template <typename T1, typename T2>
    void emit(std::pair<T1, T2> const& p) const {
        emit(std::make_tuple(p.first, p.second));
    }

    bool first;  // true until the first key/value pair has been written
};
# Builds one index per strategy, runs the "and" query benchmark against it
# for several query logs, and removes the index file afterwards.
#
# Arguments: <collection_basename> <output_prefix_name> <query_logs_basename>
import os
import sys

collection_basename = sys.argv[1]
output_prefix_name = sys.argv[2]
query_logs_basename = sys.argv[3]

strategies = [
    # "opt",
    "block_interpolative",
    "block_qmx",
    "block_simple16",
    "block_optpfor",
    "block_vbyte",
    "block_varintgb",
    "block_varintg8iu",
    "block_streamvbyte",
    "block_maskedvbyte"
]

# suffixes appended to query_logs_basename to form each query log path
query_log_suffixes = ["0.mapped.1k", "0.mapped.selective", "1.mapped.1k", "1.mapped.selective"]

results_directory = "./results." + output_prefix_name
if not os.path.exists(results_directory):
    os.makedirs(results_directory)

bins_directory = "./bins." + output_prefix_name
if not os.path.exists(bins_directory):
    os.makedirs(bins_directory)

# `index_type` rather than `type`, which would shadow the builtin
for index_type in strategies:
    output = output_prefix_name + "." + index_type
    index_filename = bins_directory + "/" + output + ".bin"

    # build index; stderr of the builder goes to the per-index .log file
    cmd = "./create_freq_index " + index_type + " " + collection_basename + " " + index_filename
    cmd += " 2> " + results_directory + "/" + output + ".log"
    os.system(cmd)

    # perform queries; every log is run four times, appending to one file
    for suffix in query_log_suffixes:
        cmd = "./queries " + index_type + " and " + index_filename + " < " + query_logs_basename + suffix + " >> " + results_directory + "/" + output + ".querytime"
        # range() works on both Python 2 and 3; xrange() is Python 2 only
        for _ in range(4):
            os.system(cmd)

    # remove the index file once its queries have run; a missing file (e.g.
    # the build step failed) is skipped instead of aborting the whole sweep
    if os.path.exists(index_filename):
        os.remove(index_filename)
# Runs the "and" query benchmark three times for every prebuilt index and
# appends the output to <path_to_results>/<prefix_name>.<type>.querytime.
import os
import sys

path_to_basename = sys.argv[1]  # e.g., '/data2/inverted_indexes/gov2/gov2.sorted-text'
path_to_binaries = sys.argv[2]  # e.g., './bin'
path_to_results = sys.argv[3]   # e.g., './results'
prefix_name = sys.argv[4]       # e.g., 'gov2'
query_log = sys.argv[5]

# These names are passed to ./queries as the index type and are also part of
# the .bin file name, so they must match the index type names exactly
# (block_streamvbyte / block_maskedvbyte, not *_block).
strategies = ["opt",
              "block_interpolative",
              "block_qmx",
              "block_simple16",
              "block_optpfor",
              "block_vbyte",
              "block_varintgb",
              "block_varintg8iu",
              "block_streamvbyte",
              "block_maskedvbyte"]

# `index_type` rather than `type`, which would shadow the builtin
for index_type in strategies:
    cmd = "./queries " + index_type + " and " + path_to_binaries + "/" + prefix_name + "." + index_type + ".bin < " + query_log + " >> " + path_to_results + "/" + prefix_name + "." + index_type + ".querytime"
    # print(cmd)
    # range() works on both Python 2 and 3; xrange() is Python 2 only
    for _ in range(3):
        os.system(cmd)
# Encodes the dataset with every codec, decodes the result three times under
# `perf record -e cache-misses`, then removes the encoded file.
#
# Arguments: <dataset_filename> <output_prefix> <dictionary_filename>
import os
import sys

dataset_filename = sys.argv[1]
output_prefix = sys.argv[2]
dictionary_filename = sys.argv[3]

codecs = ["interpolative", "optpfor", "varintg8iu", "qmx", "vbyte", "u32",
          "simple16", "streamvbyte", "maskedvbyte", "varintgb", "dint"]

for codec in codecs:
    output = output_prefix + "." + codec + ".out"
    enc_cmd = "./encode " + codec + " " + dataset_filename + " --out " + output
    dec_cmd = "perf record -e cache-misses ./decode " + codec + " " + output  # perf stat
    if codec == "dint":
        # only the dint codec takes a dictionary, for both directions
        enc_cmd += " " + "--dict " + dictionary_filename
        dec_cmd += " " + "--dict " + dictionary_filename
    os.system(enc_cmd)
    # range() works on both Python 2 and 3; xrange() is Python 2 only
    for _ in range(3):
        os.system(dec_cmd)
    # a missing output (e.g. the encoder failed) is skipped, not fatal
    if os.path.exists(output):
        os.remove(output)
+ codec + ".out" 18 | enc_cmd = "./encode " + codec + " " + dataset_filename + " --out " + output 19 | dec_cmd = "./decode " + codec + " " + output 20 | os.system(enc_cmd) 21 | for i in xrange(0, 3): 22 | os.system(dec_cmd) 23 | os.system("rm " + output) 24 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(create_freq_index create_freq_index.cpp) 2 | target_link_libraries(create_freq_index 3 | ${Boost_LIBRARIES} 4 | FastPFor_lib 5 | streamvbyte 6 | MaskedVByte 7 | ) 8 | 9 | add_executable(create_wand_data create_wand_data.cpp) 10 | target_link_libraries(create_wand_data 11 | ${Boost_LIBRARIES} 12 | ) 13 | 14 | add_executable(queries queries.cpp) 15 | target_link_libraries(queries 16 | ${Boost_LIBRARIES} 17 | FastPFor_lib 18 | streamvbyte 19 | MaskedVByte 20 | ) 21 | 22 | add_executable(pair_wise_intersect pair_wise_intersect.cpp) 23 | target_link_libraries(pair_wise_intersect 24 | ${Boost_LIBRARIES} 25 | FastPFor_lib 26 | streamvbyte 27 | MaskedVByte 28 | ) 29 | 30 | add_executable(check_index check_index.cpp) 31 | target_link_libraries(check_index 32 | ${Boost_LIBRARIES} 33 | FastPFor_lib 34 | streamvbyte 35 | MaskedVByte 36 | ) 37 | 38 | add_executable(dict_perf_test dict_perf_test.cpp) 39 | target_link_libraries(dict_perf_test 40 | ${Boost_LIBRARIES} 41 | ) -------------------------------------------------------------------------------- /src/check_index.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "configuration.hpp" 5 | #include "index_build_utils.hpp" 6 | #include "index_types.hpp" 7 | #include "util.hpp" 8 | #include "verify_collection.hpp" 9 | 10 | int main(int argc, char** argv) { 11 | if (argc < 4) { 12 | std::cerr << "Usage " << argv[0] << ":\n" 13 | << "\t " 14 | << std::endl; 15 | return 1; 16 | } 17 | 18 | using 
namespace ds2i; 19 | std::string index_type = argv[1]; 20 | const char* index_filename = argv[2]; 21 | const char* collection_filename = argv[3]; 22 | 23 | binary_freq_collection input(collection_filename); 24 | 25 | if (false) { 26 | #define LOOP_BODY(R, DATA, T) \ 27 | } \ 28 | else if (index_type == BOOST_PP_STRINGIZE(T)) { \ 29 | verify_collection( \ 30 | input, index_filename); 31 | 32 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, DS2I_INDEX_TYPES); 33 | #undef LOOP_BODY 34 | } else { 35 | logger() << "ERROR: Unknown type " << index_type << std::endl; 36 | } 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /src/create_freq_index.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "configuration.hpp" 10 | #include "index_types.hpp" 11 | #include "util.hpp" 12 | #include "verify_collection.hpp" 13 | #include "index_build_utils.hpp" 14 | 15 | using namespace ds2i; 16 | 17 | template 18 | void dump_index_specific_stats(Collection const&, std::string const&) {} 19 | 20 | void dump_index_specific_stats(uniform_index const& coll, 21 | std::string const& type) { 22 | stats_line()("type", type)("log_partition_size", 23 | int(coll.params().log_partition_size)); 24 | } 25 | 26 | void dump_index_specific_stats(opt_index const& coll, std::string const& type) { 27 | auto const& conf = configuration::get(); 28 | 29 | double long_postings = 0; 30 | double docs_partitions = 0; 31 | double freqs_partitions = 0; 32 | 33 | for (size_t s = 0; s < coll.size(); ++s) { 34 | auto const& list = coll[s]; 35 | if (list.size() > constants::min_size) { 36 | long_postings += list.size(); 37 | docs_partitions += list.docs_enum().num_partitions(); 38 | freqs_partitions += list.freqs_enum().base().num_partitions(); 39 | } 40 | } 41 | 42 | stats_line()("type", type)("eps1", conf.eps1)("eps2", conf.eps2)( 43 | 
"fix_cost", conf.fix_cost)("docs_avg_part", 44 | long_postings / docs_partitions)( 45 | "freqs_avg_part", long_postings / freqs_partitions); 46 | } 47 | 48 | template 49 | void build_model(std::string input_basename, 50 | typename CollectionType::builder& builder) { 51 | builder.build_model(input_basename); 52 | } 53 | 54 | template 55 | void create_collection(std::string input_basename, 56 | global_parameters const& params, 57 | const char* output_filename, bool check, 58 | std::string const& seq_type) { 59 | binary_freq_collection input(input_basename.c_str()); 60 | size_t num_docs = input.num_docs(); 61 | double tick = get_time_usecs(); 62 | double user_tick = get_user_time_usecs(); 63 | 64 | typename CollectionType::builder builder(num_docs, params); 65 | build_model(input_basename, builder); 66 | 67 | logger() << "Processing " << input.num_docs() << " documents..." 68 | << std::endl; 69 | progress_logger plog("Encoded"); 70 | 71 | boost::progress_display progress(input.num_postings()); 72 | 73 | for (auto const& plist : input) { 74 | uint64_t n = plist.docs.size(); 75 | if (n > constants::min_size) { 76 | uint64_t freqs_sum = std::accumulate( 77 | plist.freqs.begin(), plist.freqs.end(), uint64_t(0)); 78 | builder.add_posting_list(n, plist.docs.begin(), plist.freqs.begin(), 79 | freqs_sum); 80 | plog.done_sequence(n); 81 | progress += n + plist.freqs.size() + 2; 82 | } 83 | } 84 | 85 | std::cerr << std::endl; 86 | 87 | plog.log(); 88 | CollectionType coll; 89 | builder.build(coll); 90 | double elapsed_secs = (get_time_usecs() - tick) / 1000000; 91 | double user_elapsed_secs = (get_user_time_usecs() - user_tick) / 1000000; 92 | logger() << seq_type << " collection built in " << elapsed_secs 93 | << " seconds" << std::endl; 94 | 95 | stats_line()("type", seq_type)("worker_threads", 96 | configuration::get().worker_threads)( 97 | "construction_time", elapsed_secs)("construction_user_time", 98 | user_elapsed_secs); 99 | 100 | dump_stats(coll, seq_type, 
plog.postings); 101 | dump_index_specific_stats(coll, seq_type); 102 | 103 | if (output_filename) { 104 | succinct::mapper::freeze(coll, output_filename); 105 | if (check) { 106 | verify_collection( 107 | input, output_filename); 108 | } 109 | } 110 | } 111 | 112 | int main(int argc, const char** argv) { 113 | int mandatory = 3; 114 | if (argc < mandatory) { 115 | std::cerr << "Usage: " << argv[0] << ":\n" 116 | << "\t [] " 117 | "[--check]" 118 | << std::endl; 119 | return 1; 120 | } 121 | 122 | std::string type = argv[1]; 123 | const char* input_basename = argv[2]; 124 | const char* output_filename = nullptr; 125 | if (argc > mandatory) { 126 | output_filename = argv[mandatory]; 127 | } 128 | 129 | bool check = false; 130 | if (argc > mandatory + 1 and 131 | std::string(argv[mandatory + 1]) == "--check") { 132 | check = true; 133 | } 134 | 135 | ds2i::global_parameters params; 136 | params.log_partition_size = configuration::get().log_partition_size; 137 | 138 | if (false) { 139 | #define LOOP_BODY(R, DATA, T) \ 140 | } \ 141 | else if (type == BOOST_PP_STRINGIZE(T)) { \ 142 | create_collection( \ 143 | input_basename, params, output_filename, check, type); \ 144 | /**/ 145 | 146 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, DS2I_INDEX_TYPES); 147 | #undef LOOP_BODY 148 | } else { 149 | logger() << "ERROR: Unknown type " << type << std::endl; 150 | } 151 | 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /src/create_wand_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "succinct/mapper.hpp" 5 | #include "binary_freq_collection.hpp" 6 | #include "binary_collection.hpp" 7 | #include "wand_data.hpp" 8 | #include "util.hpp" 9 | 10 | int main(int argc, const char** argv) { 11 | using namespace ds2i; 12 | 13 | if (argc != 3) { 14 | std::cerr << "Usage: " << argv[0] 15 | << " " << std::endl; 16 | return 1; 17 | } 18 | 19 | 
std::string input_basename = argv[1]; 20 | const char* output_filename = argv[2]; 21 | 22 | binary_collection sizes_coll((input_basename + ".sizes").c_str()); 23 | binary_freq_collection coll(input_basename.c_str()); 24 | 25 | wand_data<> wdata(sizes_coll.begin()->begin(), coll.num_docs(), coll); 26 | succinct::mapper::freeze(wdata, output_filename); 27 | } 28 | -------------------------------------------------------------------------------- /src/dict_perf_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "util.hpp" 7 | #include "dictionary_types.hpp" 8 | #include "dint_configuration.hpp" 9 | 10 | using namespace ds2i; 11 | 12 | typedef single_dictionary_rectangular_type dictionary_type; 13 | 14 | int main(int argc, char** argv) { 15 | if (argc < 1) { 16 | std::cerr << "Usage " << argv[0] << ":\n" 17 | << "\t" << std::endl; 18 | return 1; 19 | } 20 | 21 | char const* dictionary_filename = argv[1]; 22 | 23 | dictionary_type dict; 24 | typename dictionary_type::builder builder; 25 | std::ifstream dictionary_file(dictionary_filename); 26 | builder.load(dictionary_file); 27 | uint64_t dict_size = builder.size(); 28 | logger() << "loaded a dictionary with " << dict_size << " entries" 29 | << std::endl; 30 | builder.build(dict); 31 | 32 | constexpr uint64_t n = 10000000; 33 | std::random_device rd; 34 | std::default_random_engine eng(rd()); 35 | std::uniform_int_distribution uniform_dist(0, dict_size); 36 | 37 | std::vector indexes; 38 | indexes.reserve(n); 39 | for (uint64_t i = 0; i < n; ++i) { 40 | indexes.push_back(uniform_dist(eng)); 41 | } 42 | 43 | constexpr uint32_t runs = 10; 44 | std::vector out(dictionary_type::max_entry_size, 45 | 0); // output buffer 46 | double elapsed_time = 0; 47 | for (uint32_t run = 0; run < runs; ++run) { 48 | auto start = clock_type::now(); 49 | for (auto index : indexes) { 50 | uint32_t decoded_ints = dict.copy(index, 
out.data()); 51 | do_not_optimize_away(decoded_ints); 52 | } 53 | auto end = clock_type::now(); 54 | std::chrono::nanoseconds elapsed = end - start; 55 | elapsed_time += elapsed.count(); 56 | } 57 | 58 | logger() << "total elapsed time: " << elapsed_time / 1000000000 << " [secs]" 59 | << std::endl; 60 | logger() << "avg. time x run: " << elapsed_time / runs / 1000000000 61 | << " [secs]" << std::endl; 62 | logger() << "avg. time x copy: " << elapsed_time / runs / n << " [ns]" 63 | << std::endl; 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /src/pair_wise_intersect.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "index_types.hpp" 9 | #include "util.hpp" 10 | 11 | typedef uint32_t term_id_type; 12 | typedef std::vector term_id_vec; 13 | 14 | bool read_query(term_id_vec& ret, std::istream& is = std::cin) { 15 | ret.clear(); 16 | std::string line; 17 | if (!std::getline(is, line)) 18 | return false; 19 | std::istringstream iline(line); 20 | term_id_type term_id; 21 | while (iline >> term_id) { 22 | ret.push_back(term_id); 23 | } 24 | 25 | return true; 26 | } 27 | 28 | template 29 | static uint64_t intersect(uint64_t num_docs, std::vector& enums, 30 | std::vector& out) { 31 | // increasing frequency 32 | if (enums[0].size() > enums[1].size()) { 33 | std::swap(enums[0], enums[1]); 34 | } 35 | 36 | uint64_t results = 0; 37 | uint64_t candidate = enums[0].docid(); 38 | size_t i = 1; 39 | while (candidate < num_docs) { 40 | for (; i < 2; ++i) { 41 | enums[i].next_geq(candidate); 42 | if (enums[i].docid() != candidate) { 43 | candidate = enums[i].docid(); 44 | i = 0; 45 | break; 46 | } 47 | } 48 | 49 | if (i == 2) { 50 | out[results] = candidate; 51 | ++results; 52 | enums[0].next(); 53 | candidate = enums[0].docid(); 54 | i = 1; 55 | } 56 | } 57 | 58 | return results; 59 | } 60 | 61 | template 
62 | void perftest(const char* index_filename) { 63 | using namespace ds2i; 64 | 65 | Index index; 66 | logger() << "Loading index from " << index_filename << std::endl; 67 | boost::iostreams::mapped_file_source m(index_filename); 68 | succinct::mapper::map(index, m); 69 | 70 | std::vector queries; 71 | term_id_vec q; 72 | while (read_query(q)) { 73 | assert(q.size() == 2); 74 | queries.push_back(q); 75 | } 76 | 77 | uint32_t num_queries = queries.size(); 78 | logger() << "Executing " << num_queries << " pair-wise intersections..." 79 | << std::endl; 80 | 81 | uint64_t num_docs = index.num_docs(); 82 | std::vector out(num_docs); 83 | 84 | double total_usecs = 0.0; 85 | // first run if for warming up 86 | static const int runs = 10 + 1; 87 | size_t total = 0; 88 | 89 | typedef typename Index::document_enumerator enum_type; 90 | std::vector qq; 91 | qq.reserve(2); 92 | for (int run = 0; run != runs; ++run) { 93 | double start = get_time_usecs(); 94 | for (uint32_t i = 0; i != num_queries; ++i) { 95 | qq.clear(); 96 | for (auto term : queries[i]) { 97 | qq.push_back(index[term]); 98 | } 99 | uint64_t size = intersect(num_docs, qq, out); 100 | total += size; 101 | } 102 | double end = get_time_usecs(); 103 | double elapsed = end - start; 104 | if (run) { 105 | total_usecs += elapsed; 106 | } 107 | } 108 | 109 | // for debug 110 | std::cout << total << std::endl; 111 | 112 | printf( 113 | "\t %d intersections took %lf [musecs] (avg. among %d " 114 | "runs)\n", 115 | num_queries, total_usecs / (runs - 1), runs - 1); 116 | printf( 117 | "\t %lf [musecs] per intersection (avg. 
among %d " 118 | "queries)\n", 119 | total_usecs / (runs - 1) / num_queries, num_queries); 120 | } 121 | 122 | int main(int argc, const char** argv) { 123 | using namespace ds2i; 124 | 125 | int mandatory = 3; 126 | if (argc < mandatory) { 127 | std::cerr << argv[0] << " < query_log" 128 | << std::endl; 129 | return 1; 130 | } 131 | 132 | std::string index_type = argv[1]; 133 | const char* index_filename = argv[2]; 134 | 135 | if (false) { 136 | #define LOOP_BODY(R, DATA, T) \ 137 | } \ 138 | else if (index_type == BOOST_PP_STRINGIZE(T)) { \ 139 | perftest(index_filename); \ 140 | /**/ 141 | 142 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, DS2I_INDEX_TYPES); 143 | #undef LOOP_BODY 144 | } else { 145 | logger() << "ERROR: Unknown index type " << index_type << std::endl; 146 | } 147 | 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /src/queries.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "index_types.hpp" 9 | #include "wand_data.hpp" 10 | #include "queries.hpp" 11 | #include "util.hpp" 12 | 13 | const size_t runs = 10 + 1; 14 | 15 | template 16 | void op_perftest(IndexType const& index, 17 | QueryOperator&& query_op, // XXX!!! 
18 | std::vector const& queries, 19 | std::string const& index_type, std::string const& query_type, 20 | size_t runs) { 21 | using namespace ds2i; 22 | 23 | std::vector query_times; 24 | size_t total = 0; 25 | for (size_t run = 0; run != runs; ++run) { 26 | for (auto const& query : queries) { 27 | auto tick = get_time_usecs(); 28 | uint64_t results = query_op(index, query); 29 | // do_not_optimize_away(results); 30 | total += results; 31 | double elapsed = double(get_time_usecs() - tick); 32 | if (run != 0) { // first run is not timed 33 | query_times.push_back(elapsed); 34 | } 35 | } 36 | } 37 | 38 | std::cout << total << std::endl; 39 | 40 | if (false) { 41 | for (auto t : query_times) { 42 | std::cout << (t / 1000) << std::endl; 43 | } 44 | } else { 45 | std::sort(query_times.begin(), query_times.end()); 46 | double avg = 47 | std::accumulate(query_times.begin(), query_times.end(), double()) / 48 | query_times.size(); 49 | double q50 = query_times[query_times.size() / 2]; 50 | double q90 = query_times[90 * query_times.size() / 100]; 51 | double q95 = query_times[95 * query_times.size() / 100]; 52 | // logger() << "---- " << index_type << " " << query_type; 53 | // logger() << "Mean: " << avg; 54 | // logger() << "50% quantile: " << q50; 55 | // logger() << "90% quantile: " << q90; 56 | // logger() << "95% quantile: " << q95; 57 | 58 | stats_line()("type", index_type)("query", query_type)("avg", avg)( 59 | "q50", q50)("q90", q90)("q95", q95); 60 | } 61 | } 62 | 63 | template 64 | void perftest(const char* index_filename, const char* wand_data_filename, 65 | std::vector const& queries, 66 | std::string const& type, std::string const& query_type) { 67 | using namespace ds2i; 68 | 69 | IndexType index; 70 | logger() << "Loading index from " << index_filename << std::endl; 71 | boost::iostreams::mapped_file_source m(index_filename); 72 | succinct::mapper::map(index, m); 73 | 74 | logger() << "Warming up posting lists" << std::endl; 75 | std::unordered_set warmed_up; 
76 | for (auto const& q : queries) { 77 | for (auto t : q) { 78 | if (!warmed_up.count(t)) { 79 | index.warmup(t); 80 | warmed_up.insert(t); 81 | } 82 | } 83 | } 84 | 85 | wand_data<> wdata; 86 | boost::iostreams::mapped_file_source md; 87 | if (wand_data_filename) { 88 | md.open(wand_data_filename); 89 | succinct::mapper::map(wdata, md, succinct::mapper::map_flags::warmup); 90 | } 91 | 92 | std::vector query_types; 93 | boost::algorithm::split(query_types, query_type, boost::is_any_of(":")); 94 | 95 | for (auto const& t : query_types) { 96 | if (t == "and") { 97 | op_perftest(index, and_query(), queries, type, t, runs); 98 | } else if (t == "and_freq") { 99 | op_perftest(index, and_query(), queries, type, t, runs); 100 | } else if (t == "or") { 101 | op_perftest(index, or_query(), queries, type, t, runs); 102 | } else if (t == "or_freq") { 103 | op_perftest(index, or_query(), queries, type, t, runs); 104 | } else if (t == "wand" && wand_data_filename) { 105 | op_perftest(index, wand_query(wdata, 10), queries, type, t, runs); 106 | } else if (t == "ranked_and" && wand_data_filename) { 107 | op_perftest(index, ranked_and_query(wdata, 10), queries, type, t, 108 | runs); 109 | } else if (t == "maxscore" && wand_data_filename) { 110 | op_perftest(index, maxscore_query(wdata, 10), queries, type, t, 111 | runs); 112 | } else { 113 | logger() << "Unsupported query type: " << t << std::endl; 114 | } 115 | } 116 | } 117 | 118 | int main(int argc, const char** argv) { 119 | using namespace ds2i; 120 | 121 | int mandatory = 4; 122 | if (argc < mandatory) { 123 | std::cerr << argv[0] 124 | << " " 125 | "[wand_filename] < query_log" 126 | << std::endl; 127 | return 1; 128 | } 129 | 130 | std::string type = argv[1]; 131 | std::string query_type = argv[2]; 132 | const char* index_filename = argv[3]; 133 | const char* wand_data_filename = nullptr; 134 | 135 | if (argc > mandatory) { 136 | wand_data_filename = argv[mandatory]; 137 | } 138 | 139 | std::vector queries; 140 | 
term_id_vec q; 141 | while (read_query(q)) queries.push_back(q); 142 | 143 | if (false) { 144 | #define LOOP_BODY(R, DATA, T) \ 145 | } \ 146 | else if (type == BOOST_PP_STRINGIZE(T)) { \ 147 | perftest(index_filename, wand_data_filename, \ 148 | queries, type, query_type); \ 149 | /**/ 150 | 151 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, DS2I_INDEX_TYPES); 152 | #undef LOOP_BODY 153 | } else { 154 | logger() << "ERROR: Unknown type " << type << std::endl; 155 | } 156 | 157 | return 0; 158 | } 159 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB TEST_SOURCES test_*.cpp) 2 | foreach(TEST_SRC ${TEST_SOURCES}) 3 | get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) 4 | add_executable(${TEST_SRC_NAME} ${TEST_SRC}) 5 | target_link_libraries(${TEST_SRC_NAME} 6 | succinct 7 | ${Boost_LIBRARIES} 8 | ) 9 | add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME}) 10 | endforeach(TEST_SRC) 11 | 12 | target_link_libraries(test_block_codecs 13 | FastPFor_lib) 14 | 15 | target_link_libraries(test_block_posting_list 16 | FastPFor_lib) 17 | 18 | target_link_libraries(test_block_freq_index 19 | FastPFor_lib) 20 | 21 | -------------------------------------------------------------------------------- /test/test_block_codecs.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE block_codecs 2 | 3 | #include "succinct/test_common.hpp" 4 | #include "block_codecs.hpp" 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void test_block_codec() 11 | { 12 | std::vector sizes = {1, 16, BlockCodec::block_size - 1, BlockCodec::block_size}; 13 | for(size_t mag=1;mag<25;mag++) { 14 | for (auto size: sizes) { 15 | std::vector values(size); 16 | std::mt19937 gen(12345); 17 | std::uniform_int_distribution dis(0, (1< encoded; 26 | BlockCodec::encode(values.data(), sum_of_values, values.size(), 
encoded); 27 | 28 | std::vector decoded(values.size() + BlockCodec::overflow); 29 | uint8_t const* out = BlockCodec::decode(encoded.data(), decoded.data(), 30 | sum_of_values, values.size()); 31 | 32 | BOOST_REQUIRE_EQUAL(encoded.size(), out - encoded.data()); 33 | BOOST_REQUIRE_EQUAL_COLLECTIONS(values.begin(), values.end(), 34 | decoded.begin(), decoded.begin()+ values.size()); 35 | } 36 | } 37 | } 38 | } 39 | 40 | BOOST_AUTO_TEST_CASE(block_codecs) 41 | { 42 | test_block_codec(); 43 | test_block_codec(); 44 | test_block_codec(); 45 | test_block_codec(); 46 | test_block_codec(); 47 | test_block_codec(); 48 | test_block_codec(); 49 | } 50 | -------------------------------------------------------------------------------- /test/test_block_freq_index.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE block_freq_index 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "block_freq_index.hpp" 6 | #include "block_codecs.hpp" 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | template 14 | void test_block_freq_index() 15 | { 16 | ds2i::global_parameters params; 17 | uint64_t universe = 20000; 18 | typedef ds2i::block_freq_index collection_type; 19 | typename collection_type::builder b(universe, params); 20 | 21 | typedef std::vector vec_type; 22 | std::vector> posting_lists(30); 23 | for (auto& plist: posting_lists) { 24 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 25 | uint64_t n = uint64_t(universe / avg_gap); 26 | plist.first = random_sequence(universe, n, true); 27 | plist.second.resize(n); 28 | std::generate(plist.second.begin(), plist.second.end(), 29 | []() { return (rand() % 256) + 1; }); 30 | 31 | b.add_posting_list(n, plist.first.begin(), 32 | plist.second.begin(), 0); 33 | 34 | } 35 | 36 | { 37 | collection_type coll; 38 | b.build(coll); 39 | succinct::mapper::freeze(coll, "temp.bin"); 40 | } 41 | 42 | { 43 | collection_type coll; 44 | 
boost::iostreams::mapped_file_source m("temp.bin"); 45 | succinct::mapper::map(coll, m); 46 | 47 | for (size_t i = 0; i < posting_lists.size(); ++i) { 48 | auto const& plist = posting_lists[i]; 49 | auto doc_enum = coll[i]; 50 | BOOST_REQUIRE_EQUAL(plist.first.size(), doc_enum.size()); 51 | for (size_t p = 0; p < plist.first.size(); ++p, doc_enum.next()) { 52 | MY_REQUIRE_EQUAL(plist.first[p], doc_enum.docid(), 53 | "i = " << i << " p = " << p); 54 | MY_REQUIRE_EQUAL(plist.second[p], doc_enum.freq(), 55 | "i = " << i << " p = " << p); 56 | } 57 | BOOST_REQUIRE_EQUAL(coll.num_docs(), doc_enum.docid()); 58 | } 59 | } 60 | } 61 | 62 | BOOST_AUTO_TEST_CASE(block_freq_index) 63 | { 64 | test_block_freq_index(); 65 | test_block_freq_index(); 66 | test_block_freq_index(); 67 | test_block_freq_index(); 68 | test_block_freq_index(); 69 | test_block_freq_index(); 70 | test_block_freq_index(); 71 | } 72 | -------------------------------------------------------------------------------- /test/test_block_posting_list.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE block_posting_list 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "block_posting_list.hpp" 6 | #include "block_codecs.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | void test_block_posting_list_ops(uint8_t const* data, uint64_t n, uint64_t universe, 14 | std::vector const& docs, 15 | std::vector const& freqs) 16 | { 17 | typename PostingList::document_enumerator e(data, universe); 18 | BOOST_REQUIRE_EQUAL(n, e.size()); 19 | for (size_t i = 0; i < n; ++i, e.next()) { 20 | MY_REQUIRE_EQUAL(docs[i], e.docid(), 21 | "i = " << i << " size = " << n); 22 | MY_REQUIRE_EQUAL(freqs[i], e.freq(), 23 | "i = " << i << " size = " << n); 24 | } 25 | // XXX better testing of next_geq 26 | for (size_t i = 0; i < n; ++i) { 27 | e.reset(); 28 | e.next_geq(docs[i]); 29 | MY_REQUIRE_EQUAL(docs[i], e.docid(), 30 | "i = " << i << " 
size = " << n); 31 | MY_REQUIRE_EQUAL(freqs[i], e.freq(), 32 | "i = " << i << " size = " << n); 33 | } 34 | e.reset(); e.next_geq(docs.back() + 1); 35 | BOOST_REQUIRE_EQUAL(universe, e.docid()); 36 | e.reset(); e.next_geq(universe); 37 | BOOST_REQUIRE_EQUAL(universe, e.docid()); 38 | } 39 | 40 | void random_posting_data(uint64_t n, uint64_t universe, 41 | std::vector& docs, 42 | std::vector& freqs) 43 | { 44 | docs = random_sequence(universe, n, true); 45 | freqs.resize(n); 46 | std::generate(freqs.begin(), freqs.end(), 47 | []() { return (rand() % 256) + 1; }); 48 | } 49 | 50 | template 51 | void test_block_posting_list() 52 | { 53 | typedef ds2i::block_posting_list posting_list_type; 54 | uint64_t universe = 20000; 55 | for (size_t t = 0; t < 20; ++t) { 56 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 57 | uint64_t n = uint64_t(universe / avg_gap); 58 | 59 | std::vector docs, freqs; 60 | random_posting_data(n, universe, docs, freqs); 61 | std::vector data; 62 | posting_list_type::write(data, n, docs.begin(), freqs.begin()); 63 | 64 | test_block_posting_list_ops(data.data(), n, universe, 65 | docs, freqs); 66 | } 67 | } 68 | 69 | template 70 | void test_block_posting_list_reordering() 71 | { 72 | typedef ds2i::block_posting_list posting_list_type; 73 | uint64_t universe = 20000; 74 | for (size_t t = 0; t < 20; ++t) { 75 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 76 | uint64_t n = uint64_t(universe / avg_gap); 77 | 78 | std::vector docs, freqs; 79 | random_posting_data(n, universe, docs, freqs); 80 | std::vector data; 81 | posting_list_type::write(data, n, docs.begin(), freqs.begin()); 82 | 83 | // reorder blocks 84 | typename posting_list_type::document_enumerator e(data.data(), universe); 85 | auto blocks = e.get_blocks(); 86 | std::random_shuffle(blocks.begin() + 1, blocks.end()); // leave first block in place 87 | 88 | std::vector reordered_data; 89 | posting_list_type::write_blocks(reordered_data, n, blocks); 90 | 91 | 
test_block_posting_list_ops(reordered_data.data(), n, universe, 92 | docs, freqs); 93 | } 94 | } 95 | 96 | BOOST_AUTO_TEST_CASE(block_posting_list) 97 | { 98 | test_block_posting_list(); 99 | test_block_posting_list(); 100 | test_block_posting_list(); 101 | test_block_posting_list(); 102 | test_block_posting_list(); 103 | test_block_posting_list(); 104 | test_block_posting_list(); 105 | } 106 | 107 | BOOST_AUTO_TEST_CASE(block_posting_list_reordering) 108 | { 109 | test_block_posting_list_reordering(); 110 | test_block_posting_list_reordering(); 111 | } 112 | -------------------------------------------------------------------------------- /test/test_compact_elias_fano.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE compact_elias_fano 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "compact_elias_fano.hpp" 6 | #include 7 | #include 8 | 9 | struct sequence_initialization { 10 | sequence_initialization() 11 | { 12 | n = 100000; 13 | universe = n * 1024; 14 | seq = random_sequence(universe, n); 15 | 16 | // high granularity to test more corner cases 17 | params.ef_log_sampling0 = 4; 18 | params.ef_log_sampling1 = 5; 19 | succinct::bit_vector_builder bvb; 20 | ds2i::compact_elias_fano::write(bvb, 21 | seq.begin(), 22 | universe, seq.size(), 23 | params); 24 | succinct::bit_vector(&bvb).swap(bv); 25 | } 26 | 27 | ds2i::global_parameters params; 28 | size_t n; 29 | size_t universe; 30 | std::vector seq; 31 | succinct::bit_vector bv; 32 | }; 33 | 34 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_singleton, 35 | sequence_initialization) 36 | { 37 | // test singleton sequences 38 | std::vector short_seq; 39 | short_seq.push_back(0); 40 | test_sequence(ds2i::compact_elias_fano(), params, 1, short_seq); 41 | short_seq[0] = 1; 42 | test_sequence(ds2i::compact_elias_fano(), params, 2, short_seq); 43 | } 44 | 45 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_construction, 46 | sequence_initialization) 
47 | { 48 | 49 | // test pointers and low-level values 50 | ds2i::compact_elias_fano::offsets of(0, 51 | universe, seq.size(), 52 | params); 53 | uint64_t rank = 0; 54 | for (uint64_t pos = 0; pos < of.higher_bits_length; ++pos) { 55 | bool b = bv[of.higher_bits_offset + pos]; 56 | uint64_t rank0 = pos - rank; 57 | 58 | if (b) { 59 | uint64_t read_v = ((pos - rank - 1) << of.lower_bits) | 60 | bv.get_bits(of.lower_bits_offset + rank * of.lower_bits, 61 | of.lower_bits); 62 | MY_REQUIRE_EQUAL(seq[rank], read_v, "rank = " << rank); 63 | } 64 | 65 | if (b && rank && (rank % (1 << of.log_sampling1)) == 0) { 66 | uint64_t ptr_offset = of.pointers1_offset + 67 | ((rank >> of.log_sampling1) - 1) * of.pointer_size; 68 | MY_REQUIRE_EQUAL(pos, bv.get_bits(ptr_offset, of.pointer_size), 69 | "rank = " << rank); 70 | } 71 | 72 | if (!b && rank0 && (rank0 % (1 << of.log_sampling0)) == 0) { 73 | uint64_t ptr_offset = of.pointers0_offset + 74 | ((rank0 >> of.log_sampling0) - 1) * of.pointer_size; 75 | MY_REQUIRE_EQUAL(pos, bv.get_bits(ptr_offset, of.pointer_size), 76 | "rank0 = " << rank0); 77 | } 78 | rank += b; 79 | } 80 | } 81 | 82 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_enumerator, 83 | sequence_initialization) 84 | { 85 | ds2i::compact_elias_fano::enumerator r(bv, 0, 86 | universe, seq.size(), 87 | params); 88 | test_sequence(r, seq); 89 | } 90 | 91 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_weakly_monotone, 92 | sequence_initialization) 93 | { 94 | n = 100000; 95 | universe = n * 3; 96 | std::vector seq = random_sequence(universe, n, false); 97 | test_sequence(ds2i::compact_elias_fano(), params, universe, seq); 98 | } 99 | 100 | -------------------------------------------------------------------------------- /test/test_compact_ranked_bitvector.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE compact_ranked_bitvector 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include 
"compact_ranked_bitvector.hpp" 6 | #include 7 | #include 8 | 9 | struct sequence_initialization { 10 | sequence_initialization() 11 | { 12 | n = 100000; 13 | universe = n * 3; 14 | seq = random_sequence(universe, n, true); 15 | 16 | // high granularity to test more corner cases 17 | params.rb_log_rank1_sampling = 6; 18 | params.rb_log_sampling1 = 5; 19 | succinct::bit_vector_builder bvb; 20 | ds2i::compact_ranked_bitvector::write(bvb, 21 | seq.begin(), 22 | universe, seq.size(), 23 | params); 24 | succinct::bit_vector(&bvb).swap(bv); 25 | } 26 | 27 | ds2i::global_parameters params; 28 | size_t n; 29 | size_t universe; 30 | uint64_t log_rank1_sampling; 31 | uint64_t log_sampling1; 32 | std::vector seq; 33 | succinct::bit_vector bv; 34 | }; 35 | 36 | BOOST_FIXTURE_TEST_CASE(compact_ranked_bitvector_construction, 37 | sequence_initialization) 38 | { 39 | 40 | // test pointers and rank samples 41 | ds2i::compact_ranked_bitvector::offsets of(0, 42 | universe, seq.size(), 43 | params); 44 | uint64_t rank = 0; 45 | for (uint64_t pos = 0; pos < of.universe; ++pos) { 46 | bool b = bv[of.bits_offset + pos]; 47 | 48 | if (b) { 49 | MY_REQUIRE_EQUAL(seq[rank], pos, "rank = " << rank); 50 | } 51 | 52 | if (b && rank && (rank % (1 << of.log_sampling1)) == 0) { 53 | uint64_t ptr_offset = of.pointers1_offset + 54 | ((rank >> of.log_sampling1) - 1) * of.pointer_size; 55 | MY_REQUIRE_EQUAL(pos, bv.get_bits(ptr_offset, of.pointer_size), 56 | "rank = " << rank); 57 | } 58 | 59 | if (pos && (pos % (1 << of.log_rank1_sampling) == 0)) { 60 | uint64_t sample_offset = of.rank1_samples_offset + 61 | ((pos >> of.log_rank1_sampling) - 1) * of.rank1_sample_size; 62 | MY_REQUIRE_EQUAL(rank, bv.get_bits(sample_offset, of.rank1_sample_size), 63 | "pos = " << pos); 64 | } 65 | 66 | rank += b; 67 | } 68 | } 69 | 70 | BOOST_FIXTURE_TEST_CASE(compact_ranked_bitvector_singleton, 71 | sequence_initialization) 72 | { 73 | // test singleton sequences 74 | std::vector short_seq; 75 | 
short_seq.push_back(0); 76 | test_sequence(ds2i::compact_ranked_bitvector(), params, 1, short_seq); 77 | short_seq[0] = 1; 78 | test_sequence(ds2i::compact_ranked_bitvector(), params, 2, short_seq); 79 | } 80 | 81 | BOOST_FIXTURE_TEST_CASE(compact_ranked_bitvector_enumerator, 82 | sequence_initialization) 83 | { 84 | ds2i::compact_ranked_bitvector::enumerator r(bv, 0, 85 | universe, seq.size(), 86 | params); 87 | test_sequence(r, seq); 88 | } 89 | -------------------------------------------------------------------------------- /test/test_data/test_collection.docs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/dint/d5b4b9ef1998119b2f06b9143153f7e82f12cb56/test/test_data/test_collection.docs -------------------------------------------------------------------------------- /test/test_data/test_collection.freqs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/dint/d5b4b9ef1998119b2f06b9143153f7e82f12cb56/test/test_data/test_collection.freqs -------------------------------------------------------------------------------- /test/test_data/test_collection.sizes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/dint/d5b4b9ef1998119b2f06b9143153f7e82f12cb56/test/test_data/test_collection.sizes -------------------------------------------------------------------------------- /test/test_freq_index.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE freq_index 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "freq_index.hpp" 6 | #include "indexed_sequence.hpp" 7 | #include "partitioned_sequence.hpp" 8 | #include "positive_sequence.hpp" 9 | #include "uniform_partitioned_sequence.hpp" 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | template 18 | void 
test_freq_index() 19 | { 20 | ds2i::global_parameters params; 21 | uint64_t universe = 20000; 22 | typedef ds2i::freq_index 23 | collection_type; 24 | typename collection_type::builder b(universe, params); 25 | 26 | typedef std::vector vec_type; 27 | std::vector> posting_lists(30); 28 | for (auto& plist: posting_lists) { 29 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 30 | uint64_t n = uint64_t(universe / avg_gap); 31 | plist.first = random_sequence(universe, n, true); 32 | plist.second.resize(n); 33 | std::generate(plist.second.begin(), plist.second.end(), 34 | []() { return (rand() % 256) + 1; }); 35 | uint64_t freqs_sum = std::accumulate(plist.second.begin(), 36 | plist.second.end(), uint64_t(0)); 37 | 38 | b.add_posting_list(n, plist.first.begin(), 39 | plist.second.begin(), freqs_sum); 40 | 41 | } 42 | 43 | { 44 | collection_type coll; 45 | b.build(coll); 46 | succinct::mapper::freeze(coll, "temp.bin"); 47 | } 48 | 49 | { 50 | collection_type coll; 51 | boost::iostreams::mapped_file_source m("temp.bin"); 52 | succinct::mapper::map(coll, m); 53 | 54 | for (size_t i = 0; i < posting_lists.size(); ++i) { 55 | auto const& plist = posting_lists[i]; 56 | auto doc_enum = coll[i]; 57 | BOOST_REQUIRE_EQUAL(plist.first.size(), doc_enum.size()); 58 | for (size_t p = 0; p < plist.first.size(); ++p, doc_enum.next()) { 59 | MY_REQUIRE_EQUAL(plist.first[p], doc_enum.docid(), 60 | "i = " << i << " p = " << p); 61 | MY_REQUIRE_EQUAL(plist.second[p], doc_enum.freq(), 62 | "i = " << i << " p = " << p); 63 | } 64 | BOOST_REQUIRE_EQUAL(coll.num_docs(), doc_enum.docid()); 65 | } 66 | } 67 | } 68 | 69 | BOOST_AUTO_TEST_CASE(freq_index) 70 | { 71 | using ds2i::indexed_sequence; 72 | using ds2i::strict_sequence; 73 | using ds2i::positive_sequence; 74 | using ds2i::partitioned_sequence; 75 | using ds2i::uniform_partitioned_sequence; 76 | 77 | test_freq_index>(); 79 | 80 | test_freq_index, 81 | positive_sequence>>(); 82 | test_freq_index, 83 | positive_sequence>>(); 84 | } 
85 | -------------------------------------------------------------------------------- /test/test_generic_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "succinct/test_common.hpp" 4 | #include "succinct/bit_vector.hpp" 5 | #include "util.hpp" 6 | 7 | std::vector random_sequence(size_t universe, size_t n, 8 | bool strict = true) 9 | { 10 | srand(42); 11 | std::vector seq; 12 | 13 | uint64_t u = strict ? (universe - n) : universe; 14 | for (size_t i = 0; i < n; ++i) { 15 | seq.push_back(rand() % u); 16 | } 17 | std::sort(seq.begin(), seq.end()); 18 | 19 | if (strict) { 20 | for (size_t i = 0; i < n; ++i) { 21 | seq[i] += i; 22 | } 23 | } 24 | 25 | return seq; 26 | } 27 | 28 | template 29 | void test_move_next(SequenceReader r, std::vector const& seq) 30 | { 31 | BOOST_REQUIRE_EQUAL(seq.size(), r.size()); 32 | if (seq.empty()) { 33 | // just check that move works 34 | BOOST_REQUIRE_EQUAL(seq.size(), r.move(seq.size()).first); 35 | return; 36 | } 37 | 38 | typename SequenceReader::value_type val; 39 | 40 | // test random access and enumeration 41 | for (uint64_t i = 0; i < seq.size(); ++i) { 42 | val = r.move(i); 43 | MY_REQUIRE_EQUAL(i, val.first, 44 | "i = " << i); 45 | MY_REQUIRE_EQUAL(seq[i], val.second, 46 | "i = " << i); 47 | 48 | if (i) { 49 | MY_REQUIRE_EQUAL(seq[i - 1], r.prev_value(), 50 | "i = " << i); 51 | } else { 52 | MY_REQUIRE_EQUAL(0, r.prev_value(), 53 | "i = " << i); 54 | } 55 | } 56 | r.move(seq.size()); 57 | BOOST_REQUIRE_EQUAL(seq.back(), r.prev_value()); 58 | 59 | val = r.move(0); 60 | for (uint64_t i = 0; i < seq.size(); ++i) { 61 | MY_REQUIRE_EQUAL(seq[i], val.second, 62 | "i = " << i); 63 | 64 | if (i) { 65 | MY_REQUIRE_EQUAL(seq[i - 1], r.prev_value(), 66 | "i = " << i); 67 | } else { 68 | MY_REQUIRE_EQUAL(0, r.prev_value(), 69 | "i = " << i); 70 | } 71 | val = r.next(); 72 | } 73 | BOOST_REQUIRE_EQUAL(r.size(), val.first); 74 | BOOST_REQUIRE_EQUAL(seq.back(), 
r.prev_value()); 75 | 76 | // test small skips 77 | for (size_t i = 0; i < seq.size(); ++i) { 78 | for (size_t skip = 1; skip < seq.size() - i; skip <<= 1) { 79 | auto rr = r; 80 | rr.move(i); 81 | auto val = rr.move(i + skip); 82 | MY_REQUIRE_EQUAL(i + skip, val.first, 83 | "i = " << i << " skip = " << skip); 84 | MY_REQUIRE_EQUAL(seq[i + skip], val.second, 85 | "i = " << i << " skip = " << skip); 86 | } 87 | } 88 | } 89 | 90 | template 91 | void test_next_geq(SequenceReader r, std::vector const& seq) 92 | { 93 | BOOST_REQUIRE_EQUAL(seq.size(), r.size()); 94 | if (seq.empty()) { 95 | // just check that next_geq works 96 | BOOST_REQUIRE_EQUAL(seq.size(), r.next_geq(1).first); 97 | return; 98 | } 99 | 100 | typename SequenceReader::value_type val; 101 | 102 | // test successor 103 | uint64_t last = 0; 104 | for (size_t i = 0; i < seq.size(); ++i) { 105 | if (seq[i] == last) continue; 106 | 107 | auto rr = r; 108 | for (size_t t = 0; t < 10; ++t) { 109 | uint64_t p = 0; 110 | switch (i) { 111 | case 0: 112 | p = last + 1; break; 113 | case 1: 114 | p = seq[i]; break; 115 | default: 116 | p = last + 1 + (rand() % (seq[i] - last)); 117 | } 118 | 119 | val = rr.next_geq(p); 120 | BOOST_REQUIRE_EQUAL(i, val.first); 121 | MY_REQUIRE_EQUAL(seq[i], val.second, 122 | "p = " << p); 123 | 124 | if (val.first) { 125 | MY_REQUIRE_EQUAL(seq[val.first - 1], rr.prev_value(), 126 | "i = " << i); 127 | } else { 128 | MY_REQUIRE_EQUAL(0, rr.prev_value(), 129 | "i = " << i); 130 | } 131 | } 132 | last = seq[i]; 133 | } 134 | 135 | val = r.next_geq(seq.back() + 1); 136 | BOOST_REQUIRE_EQUAL(r.size(), val.first); 137 | BOOST_REQUIRE_EQUAL(seq.back(), r.prev_value()); 138 | 139 | // check next_geq beyond universe 140 | val = r.next_geq(2 * seq.back() + 1); 141 | BOOST_REQUIRE_EQUAL(r.size(), val.first); 142 | 143 | // test small skips 144 | for (size_t i = 0; i < seq.size(); ++i) { 145 | for (size_t skip = 1; skip < seq.size() - i; skip <<= 1) { 146 | size_t exp_pos = i + skip; 147 | // 
for weakly monotone sequences, next_at returns the first of the 148 | // run of equal values 149 | while ((exp_pos > 0) && seq[exp_pos - 1] == seq[i + skip]) { 150 | exp_pos -= 1; 151 | } 152 | 153 | auto rr = r; 154 | rr.move(i); 155 | val = rr.next_geq(seq[i + skip]); 156 | MY_REQUIRE_EQUAL(exp_pos, val.first, 157 | "i = " << i << " skip = " << skip 158 | << " value expected = " << seq[i + skip] 159 | << " got = " << val.second); 160 | MY_REQUIRE_EQUAL(seq[i + skip], val.second, 161 | "i = " << i << " skip = " << skip); 162 | } 163 | } 164 | } 165 | 166 | // oh, C++ 167 | struct no_next_geq_tag {}; 168 | struct next_geq_tag : no_next_geq_tag {}; 169 | 170 | template 171 | void test_sequence(SequenceReader r, std::vector const& seq, 172 | no_next_geq_tag const&) 173 | { 174 | test_move_next(r, seq); 175 | } 176 | 177 | template 178 | typename std::enable_if::value, void>::type 179 | test_sequence(SequenceReader r, std::vector const& seq, 180 | next_geq_tag const&) 181 | { 182 | test_move_next(r, seq); 183 | test_next_geq(r, seq); 184 | } 185 | 186 | template 187 | void test_sequence(SequenceReader r, std::vector const& seq) 188 | { 189 | test_sequence(r, seq, next_geq_tag()); 190 | } 191 | 192 | template 193 | inline void test_sequence(SequenceType, 194 | ParamsType const& params, 195 | uint64_t universe, 196 | std::vector const& seq) 197 | { 198 | succinct::bit_vector_builder bvb; 199 | SequenceType::write(bvb, seq.begin(), universe, seq.size(), params); 200 | succinct::bit_vector bv(&bvb); 201 | typename SequenceType::enumerator r(bv, 0, universe, seq.size(), params); 202 | test_sequence(r, seq); 203 | } 204 | 205 | -------------------------------------------------------------------------------- /test/test_indexed_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE indexed_sequence 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "indexed_sequence.hpp" 6 | #include 7 | #include 
8 | 9 | BOOST_AUTO_TEST_CASE(indexed_sequence) 10 | { 11 | ds2i::global_parameters params; 12 | 13 | std::vector avg_gaps = { 1.1, 1.9, 2.5, 3, 4, 5, 10 }; 14 | for (auto avg_gap: avg_gaps) { 15 | uint64_t n = 10000; 16 | uint64_t universe = uint64_t(n * avg_gap); 17 | auto seq = random_sequence(universe, n, true); 18 | 19 | test_sequence(ds2i::indexed_sequence(), params, universe, seq); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /test/test_partitioned_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE partitioned_sequence 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "test_generic_sequence.hpp" 8 | #include "partitioned_sequence.hpp" 9 | #include "strict_sequence.hpp" 10 | 11 | namespace ds2i { 12 | 13 | class partitioned_sequence_test { 14 | public: 15 | template 16 | static void test_construction(Enumerator& r, std::vector const& seq) 17 | { 18 | if (r.m_partitions == 1) { // nothing to test here 19 | return; 20 | } 21 | 22 | for (size_t p = 0; p < r.m_partitions; ++p) { 23 | r.switch_partition(p); 24 | 25 | uint64_t cur_begin = r.m_cur_begin; 26 | uint64_t cur_end = r.m_cur_end; 27 | 28 | uint64_t cur_base = p ? 
seq[cur_begin - 1] + 1 : seq[0]; 29 | uint64_t cur_upper_bound = seq[cur_end - 1]; 30 | MY_REQUIRE_EQUAL(cur_base, r.m_cur_base, 31 | "p = " << p); 32 | MY_REQUIRE_EQUAL(cur_upper_bound, r.m_cur_upper_bound, 33 | "p = " << p); 34 | 35 | for (uint64_t i = cur_begin; i < cur_end; ++i) { 36 | auto val = r.m_partition_enum.move(i - cur_begin); 37 | MY_REQUIRE_EQUAL(seq[i], cur_base + val.second, 38 | "p = " << p << " i = " << i); 39 | } 40 | } 41 | } 42 | }; 43 | } 44 | 45 | template 46 | void test_partitioned_sequence(uint64_t universe, 47 | std::vector const& seq) 48 | { 49 | ds2i::global_parameters params; 50 | typedef ds2i::partitioned_sequence sequence_type; 51 | 52 | succinct::bit_vector_builder bvb; 53 | sequence_type::write(bvb, seq.begin(), universe, seq.size(), params); 54 | succinct::bit_vector bv(&bvb); 55 | 56 | typename sequence_type::enumerator r(bv, 0, universe, seq.size(), params); 57 | ds2i::partitioned_sequence_test::test_construction(r, seq); 58 | test_sequence(r, seq); 59 | } 60 | 61 | BOOST_AUTO_TEST_CASE(partitioned_sequence) 62 | { 63 | using ds2i::indexed_sequence; 64 | using ds2i::strict_sequence; 65 | 66 | if (boost::unit_test::framework::master_test_suite().argc == 2) { 67 | const char* filename = boost::unit_test::framework::master_test_suite().argv[1]; 68 | std::cerr << "Testing sequence from file " << filename << std::endl; 69 | std::ifstream is(filename); 70 | uint64_t v; 71 | std::vector seq; 72 | while (is >> v) { 73 | seq.push_back(v); 74 | } /* a missing, unreadable or integer-free file leaves seq empty; fail here instead of calling back() on an empty vector */ BOOST_REQUIRE_MESSAGE(!seq.empty(), "no values read from " << filename); 75 | uint64_t universe = seq.back() + 1; 76 | test_partitioned_sequence(universe, seq); 77 | test_partitioned_sequence(universe, seq); 78 | return; 79 | } 80 | 81 | // test singleton sequences 82 | { 83 | std::vector seq; 84 | seq.push_back(0); 85 | test_partitioned_sequence(1, seq); 86 | test_partitioned_sequence(1, seq); 87 | seq[0] = 1; 88 | test_partitioned_sequence(2, seq); 89 | test_partitioned_sequence(2, seq); 90 | } 91 | 92 | std::vector avg_gaps = { 1.1, 1.9, 2.5, 3, 4, 5, 10 }; 93 | 
for (auto avg_gap: avg_gaps) { 94 | uint64_t n = 10000; 95 | uint64_t universe = uint64_t(n * avg_gap); 96 | auto seq = random_sequence(universe, n, true); 97 | test_partitioned_sequence(universe, seq); 98 | test_partitioned_sequence(universe, seq); 99 | } 100 | 101 | // test also short (singleton partition) sequences with large universe 102 | for (size_t i = 1; i < 512; i += 41) { 103 | uint64_t universe = 100000; 104 | uint64_t initial_gap = rand() % 50000; 105 | auto short_seq = random_sequence(universe - initial_gap, i, true); 106 | for (auto& v: short_seq) v += initial_gap; 107 | test_partitioned_sequence(universe, short_seq); 108 | test_partitioned_sequence(universe, short_seq); 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /test/test_positive_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE positive_sequence 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "positive_sequence.hpp" 6 | #include "partitioned_sequence.hpp" 7 | #include "uniform_partitioned_sequence.hpp" 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | template 14 | void test_positive_sequence() 15 | { 16 | srand(42); 17 | ds2i::global_parameters params; 18 | size_t n = 50000; 19 | std::vector values(n); 20 | std::generate(values.begin(), values.end(), []() { return (rand() % 256) + 1; }); 21 | uint64_t universe = std::accumulate(values.begin(), values.end(), 0) + 1; 22 | 23 | typedef ds2i::positive_sequence sequence_type; 24 | succinct::bit_vector_builder bvb; 25 | sequence_type::write(bvb, values.begin(), universe, values.size(), params); 26 | succinct::bit_vector bv(&bvb); 27 | typename sequence_type::enumerator r(bv, 0, universe, values.size(), params); 28 | 29 | for (size_t i = 0; i < n; ++i) { 30 | auto val = r.move(i); 31 | MY_REQUIRE_EQUAL(i, val.first, 32 | "i = " << i); 33 | MY_REQUIRE_EQUAL(values[i], val.second, 34 | "i 
= " << i); 35 | } 36 | } 37 | 38 | BOOST_AUTO_TEST_CASE(positive_sequence) 39 | { 40 | test_positive_sequence(); 41 | test_positive_sequence>(); 42 | test_positive_sequence>(); 43 | } 44 | -------------------------------------------------------------------------------- /test/test_ranked_queries.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE ranked_queries 2 | 3 | #include "succinct/test_common.hpp" 4 | #include 5 | 6 | #include "ds2i_config.hpp" 7 | #include "index_types.hpp" 8 | #include "queries.hpp" 9 | 10 | namespace ds2i { namespace test { 11 | 12 | /* Fixture: builds an index and WAND data from the bundled test collection and loads the query log that the ranked-query test cases replay. */ struct index_initialization { 13 | 14 | typedef single_index index_type; 15 | 16 | index_initialization() 17 | : collection(DS2I_SOURCE_DIR "/test/test_data/test_collection") 18 | , document_sizes(DS2I_SOURCE_DIR "/test/test_data/test_collection.sizes") 19 | , wdata(document_sizes.begin()->begin(), collection.num_docs(), collection) 20 | { 21 | index_type::builder builder(collection.num_docs(), params); 22 | for (auto const& plist: collection) { 23 | uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), 24 | plist.freqs.end(), uint64_t(0)); 25 | builder.add_posting_list(plist.docs.size(), plist.docs.begin(), 26 | plist.freqs.begin(), freqs_sum); 27 | } 28 | builder.build(index); 29 | 30 | term_id_vec q; /* resolve the query log against DS2I_SOURCE_DIR like the collection files above: a cwd-relative path that fails to open leaves queries empty and the comparison loops then pass without checking anything */ 31 | std::ifstream qfile(DS2I_SOURCE_DIR "/test/test_data/queries"); 32 | while (read_query(q, qfile)) queries.push_back(q); 33 | } 34 | 35 | global_parameters params; 36 | binary_freq_collection collection; 37 | binary_collection document_sizes; 38 | index_type index; 39 | std::vector queries; 40 | wand_data<> wdata; 41 | 42 | template 43 | void test_against_or(QueryOp& op_q) const 44 | { 45 | ranked_or_query or_q(wdata, 10); 46 | 47 | for (auto const& q: queries) { 48 | or_q(index, q); 49 | op_q(index, q); 50 | BOOST_REQUIRE_EQUAL(or_q.topk().size(), op_q.topk().size()); 51 | for (size_t i = 0; i < or_q.topk().size(); ++i) { 52 | 
BOOST_REQUIRE_CLOSE(or_q.topk()[i], op_q.topk()[i], 0.1); // tolerance is % relative 53 | } 54 | } 55 | } 56 | 57 | 58 | }; 59 | 60 | }} 61 | 62 | BOOST_FIXTURE_TEST_CASE(wand, 63 | ds2i::test::index_initialization) 64 | { 65 | ds2i::wand_query wand_q(wdata, 10); 66 | test_against_or(wand_q); 67 | } 68 | 69 | BOOST_FIXTURE_TEST_CASE(maxscore, 70 | ds2i::test::index_initialization) 71 | { 72 | ds2i::maxscore_query maxscore_q(wdata, 10); 73 | test_against_or(maxscore_q); 74 | } 75 | -------------------------------------------------------------------------------- /test/test_sequence_collection.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE sequence_collection 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "sequence_collection.hpp" 6 | #include "indexed_sequence.hpp" 7 | #include "partitioned_sequence.hpp" 8 | #include "uniform_partitioned_sequence.hpp" 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | template 15 | void test_sequence_collection() 16 | { 17 | ds2i::global_parameters params; 18 | uint64_t universe = 10000; 19 | typedef ds2i::sequence_collection 20 | collection_type; 21 | typename collection_type::builder b(params); 22 | 23 | std::vector> sequences(30); 24 | for (auto& seq: sequences) { 25 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 26 | uint64_t n = uint64_t(universe / avg_gap); 27 | seq = random_sequence(universe, n, true); 28 | b.add_sequence(seq.begin(), seq.back() + 1, n); 29 | } 30 | 31 | { 32 | collection_type coll; 33 | b.build(coll); 34 | succinct::mapper::freeze(coll, "temp.bin"); 35 | } 36 | 37 | { 38 | collection_type coll; 39 | boost::iostreams::mapped_file_source m("temp.bin"); 40 | succinct::mapper::map(coll, m); 41 | 42 | for (size_t i = 0; i < sequences.size(); ++i) { 43 | test_sequence(coll[i], sequences[i]); 44 | } 45 | } 46 | } 47 | 48 | BOOST_AUTO_TEST_CASE(sequence_collection) 49 | { 50 | test_sequence_collection(); 51 | 
test_sequence_collection>(); 52 | test_sequence_collection>(); 53 | } 54 | -------------------------------------------------------------------------------- /test/test_strict_elias_fano.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE strict_elias_fano 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "strict_elias_fano.hpp" 6 | #include 7 | #include 8 | /* Generates a random sequence of n = 10000 values over a universe of 2 * n and hands it to test_sequence together with a ds2i::strict_elias_fano instance (test_sequence is declared outside this listing chunk). */ 9 | BOOST_AUTO_TEST_CASE(strict_elias_fano) 10 | { 11 | ds2i::global_parameters params; 12 | 13 | uint64_t n = 10000; 14 | uint64_t universe = uint64_t(2 * n); 15 | auto seq = random_sequence(universe, n, true); 16 | 17 | test_sequence(ds2i::strict_elias_fano(), params, universe, seq); 18 | } 19 | -------------------------------------------------------------------------------- /test/test_uniform_partitioned_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE uniform_partitioned_sequence 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "uniform_partitioned_sequence.hpp" 6 | #include "strict_sequence.hpp" 7 | #include 8 | #include 9 | /* Exercises uniform_partitioned_sequence first on one-element sequences ({0} with universe 1, {1} with universe 2) and then on random sequences of 10000 values for each average gap listed in avg_gaps. */ 10 | BOOST_AUTO_TEST_CASE(uniform_partitioned_sequence) 11 | { 12 | ds2i::global_parameters params; 13 | using ds2i::indexed_sequence; 14 | using ds2i::strict_sequence; 15 | 16 | // test singleton sequences 17 | std::vector short_seq; 18 | short_seq.push_back(0); 19 | test_sequence(ds2i::uniform_partitioned_sequence(), 20 | params, 1, short_seq); 21 | test_sequence(ds2i::uniform_partitioned_sequence(), 22 | params, 1, short_seq); 23 | short_seq[0] = 1; 24 | test_sequence(ds2i::uniform_partitioned_sequence(), 25 | params, 2, short_seq); 26 | test_sequence(ds2i::uniform_partitioned_sequence(), 27 | params, 2, short_seq); 28 | 29 | std::vector avg_gaps = { 1.1, 1.9, 2.5, 3, 4, 5, 10 }; 30 | for (auto avg_gap: avg_gaps) { 31 | uint64_t n = 10000; 32 | uint64_t universe = uint64_t(n * avg_gap); 33 | auto seq = 
random_sequence(universe, n, true); 34 | 35 | test_sequence(ds2i::uniform_partitioned_sequence(), 36 | params, universe, seq); 37 | test_sequence(ds2i::uniform_partitioned_sequence(), 38 | params, universe, seq); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /vroom_env/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(encode encode.cpp) 2 | target_link_libraries(encode 3 | ${Boost_LIBRARIES} 4 | FastPFor_lib 5 | streamvbyte 6 | MaskedVByte 7 | ) 8 | 9 | add_executable(decode decode.cpp) 10 | target_link_libraries(decode 11 | ${Boost_LIBRARIES} 12 | FastPFor_lib 13 | streamvbyte 14 | MaskedVByte 15 | ) 16 | 17 | add_executable(check_encoded_data check_encoded_data.cpp) 18 | target_link_libraries(check_encoded_data 19 | ${Boost_LIBRARIES} 20 | FastPFor_lib 21 | streamvbyte 22 | MaskedVByte 23 | ) -------------------------------------------------------------------------------- /vroom_env/check_encoded_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #include "binary_collection.hpp" 14 | #include "codecs.hpp" 15 | #include "dint_codecs.hpp" 16 | #include "util.hpp" 17 | 18 | using namespace ds2i; 19 | /* Verifies a DINT-encoded file against its source collection: memory-maps encoded_data_filename, loads and builds the dictionary from dictionary_filename, then for every list longer than constants::min_size reads the header, decodes with Decoder and compares each decoded value with the gap recomputed from the collection, reporting mismatches on std::cerr. Throws std::runtime_error when dictionary_filename is null, when the encoded file cannot be opened, or when the collection extension is neither ".docs" nor ".freqs". NOTE(review): the template parameter list is missing from this listing; the body uses the names Decoder and Dictionary. */ 20 | template 21 | void check_dint(char const* collection_filename, 22 | char const* encoded_data_filename, 23 | char const* dictionary_filename) { 24 | if (!dictionary_filename) { 25 | throw std::runtime_error("dictionary_filename must be specified"); 26 | } 27 | 28 | boost::iostreams::mapped_file_source file; 29 | file.open(encoded_data_filename); 30 | if (!file.is_open()) { 31 | throw std::runtime_error("Error opening index file"); 32 | } 33 | 34 | uint8_t const* begin = (uint8_t const*)file.data(); 35 | uint64_t size = file.size() / sizeof(uint8_t); 36 | auto ret = 
posix_madvise((void*)begin, size, POSIX_MADV_SEQUENTIAL); 37 | if (ret) { 38 | logger() << "Error calling madvice: " << errno << std::endl; 39 | } 40 | 41 | binary_collection input(collection_filename); 42 | auto it = input.begin(); 43 | 44 | Dictionary dict; 45 | size_t dictionaries_bytes = 0; 46 | typename Dictionary::builder builder; 47 | std::ifstream dictionary_file(dictionary_filename); 48 | dictionaries_bytes += builder.load(dictionary_file); 49 | builder.print_usage(); 50 | builder.build(dict); 51 | dictionary_file.close(); 52 | logger() << "Dictionary memory: " 53 | << double(dictionaries_bytes) / constants::MiB << " [MiB]" 54 | << std::endl; 55 | 56 | std::vector decoded; 57 | decoded.resize(constants::max_size, 0); 58 | 59 | bool docs = true; 60 | boost::filesystem::path collection_path(collection_filename); 61 | if (collection_path.extension() == ".freqs") { 62 | docs = false; 63 | logger() << "checking freqs..." << std::endl; 64 | } else if (collection_path.extension() == ".docs") { 65 | ++it; // skip first singleton sequence, containing num. of docs 66 | logger() << "checking docs..." << std::endl; 67 | } else { 68 | throw std::runtime_error("unsupported file format"); 69 | } 70 | 71 | uint64_t total_decoded_ints = 0; 72 | uint64_t sequence = 0; 73 | 74 | dint_statistics stats; 75 | 76 | for (; it != input.end(); ++it) { 77 | auto const& list = *it; 78 | uint32_t size = list.size(); 79 | if (size > constants::min_size) { 80 | uint32_t n, universe; 81 | begin = header::read(begin, &n, &universe); 82 | 83 | if (n != size) { 84 | std::cerr << "sequence has wrong length: got " << n 85 | << " but expected " << sequence << std::endl; 86 | } 87 | 88 | begin = Decoder::decode(dict, begin, decoded.data(), universe, n); 89 | total_decoded_ints += n; 90 | 91 | uint32_t prev = docs ? 
-1 : 0; 92 | uint64_t j = 0; 93 | for (auto b = list.begin(); b != list.end(); ++b, ++j) { 94 | uint32_t expected = *b - prev - 1; 95 | if (docs) { 96 | prev = *b; 97 | } 98 | if (decoded[j] != expected) { 99 | std::cerr << "Sequence " << sequence 100 | << ": error at position " << j << "/" << n 101 | << " (got " << decoded[j] << " but expected " 102 | << expected << ")" << std::endl; 103 | } 104 | decoded[j] = 0; 105 | } 106 | 107 | for (; j != n + constants::max_entry_size; ++j) { 108 | decoded[j] = 0; 109 | } 110 | } 111 | 112 | ++sequence; 113 | } 114 | 115 | logger() << "checked " << total_decoded_ints << " integers: OK!" 116 | << std::endl; 117 | 118 | file.close(); 119 | } 120 | 121 | int main(int argc, char** argv) { 122 | int mandatory = 4; 123 | if (argc < mandatory) { 124 | std::cerr << "Usage " << argv[0] << ":\n" 125 | << "\t " 126 | "[--dict ]" 127 | << std::endl; 128 | return 1; 129 | } 130 | 131 | using namespace ds2i; 132 | std::string type = argv[1]; 133 | char const* collection_filename = argv[2]; 134 | char const* encoded_data_filename = argv[3]; 135 | char const* dictionary_filename = nullptr; 136 | 137 | for (int i = mandatory; i < argc; ++i) { 138 | if (argv[i] == std::string("--dict")) { 139 | ++i; 140 | dictionary_filename = argv[i]; 141 | } else { 142 | throw std::runtime_error("unknown parameter"); 143 | } 144 | } 145 | 146 | if (type == std::string("single_rect_dint")) { 147 | check_dint( 148 | collection_filename, encoded_data_filename, dictionary_filename); 149 | } else if (type == std::string("single_packed_dint")) { 150 | check_dint( 151 | collection_filename, encoded_data_filename, dictionary_filename); 152 | } else if (type == std::string("multi_packed_dint")) { 153 | check_dint( 154 | collection_filename, encoded_data_filename, dictionary_filename); 155 | } 156 | 157 | return 0; 158 | } 159 | -------------------------------------------------------------------------------- /vroom_env/jobs.hpp: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "semiasync_queue.hpp" 4 | 5 | namespace ds2i { 6 | 7 | // for generic block-codec 8 | /* Queue job that encodes one list with Encoder. prepare() converts the n input values into gaps (value - previous - 1 when docs is true, value - 1 otherwise), accumulates the gap sum in universe and calls Encoder::encode into tmp; commit() writes the header (n, universe) followed by tmp to output, advances the progress display by n + 1 and updates the two counters. NOTE(review): the template parameter list is missing from this listing; the body uses the names Iterator and Encoder. */ template 9 | struct enc_sequence_adder : semiasync_queue::job { 10 | enc_sequence_adder(Iterator begin, uint64_t n, 11 | boost::progress_display& progress, 12 | std::vector& output, bool docs, 13 | uint64_t& num_processed_lists, uint64_t& num_total_ints) 14 | : begin(begin) 15 | , n(n) 16 | , universe(0) 17 | , progress(progress) 18 | , output(output) 19 | , docs(docs) 20 | , num_processed_lists(num_processed_lists) 21 | , num_total_ints(num_total_ints) {} 22 | 23 | virtual void prepare() { 24 | std::vector buf; 25 | buf.reserve(n); 26 | /* -1 wraps in uint32_t so the first docs gap equals the first value itself */ uint32_t prev = docs ? -1 : 0; 27 | for (uint64_t i = 0; i != n; ++i, ++begin) { 28 | buf.push_back(*begin - prev - 1); 29 | if (docs) 30 | prev = *begin; 31 | universe += buf.back(); 32 | } 33 | assert(buf.size() == n); 34 | 35 | Encoder::encode(buf.data(), universe, buf.size(), tmp); 36 | } 37 | 38 | virtual void commit() { 39 | header::write(n, universe, output); 40 | output.insert(output.end(), tmp.begin(), tmp.end()); 41 | progress += n + 1; 42 | ++num_processed_lists; 43 | num_total_ints += n; 44 | } 45 | 46 | Iterator begin; 47 | uint64_t n; 48 | uint32_t universe; 49 | boost::progress_display& progress; 50 | std::vector tmp; 51 | std::vector& output; 52 | bool docs; 53 | uint64_t& num_processed_lists; 54 | uint64_t& num_total_ints; 55 | }; 56 | 57 | // for DINT 58 | /* Same job shape as enc_sequence_adder, with an extra Builder reference that prepare() passes to Encoder::encode. */ template 59 | struct dint_sequence_adder : semiasync_queue::job { 60 | dint_sequence_adder(Iterator begin, uint64_t n, Builder& builder, 61 | boost::progress_display& progress, 62 | std::vector& output, bool docs, 63 | uint64_t& num_processed_lists, uint64_t& num_total_ints) 64 | : begin(begin) 65 | , n(n) 66 | , universe(0) 67 | , builder(builder) 68 | , progress(progress) 69 | , output(output) 70 | , docs(docs) 71 | , num_processed_lists(num_processed_lists) 72 | 
, num_total_ints(num_total_ints) {} 73 | /* builds the gap buffer (value - previous - 1 for docs, value - 1 otherwise), sums the gaps into universe and encodes into tmp using the builder */ 74 | virtual void prepare() { 75 | std::vector buf; 76 | buf.reserve(n); 77 | uint32_t prev = docs ? -1 : 0; 78 | for (uint64_t i = 0; i != n; ++i, ++begin) { 79 | buf.push_back(*begin - prev - 1); 80 | if (docs) 81 | prev = *begin; 82 | universe += buf.back(); 83 | } 84 | assert(buf.size() == n); 85 | 86 | Encoder::encode(builder, buf.data(), universe, buf.size(), tmp); 87 | } 88 | /* writes the header and the encoded bytes to output, then updates progress and counters */ 89 | virtual void commit() { 90 | header::write(n, universe, output); 91 | output.insert(output.end(), tmp.begin(), tmp.end()); 92 | progress += n + 1; 93 | ++num_processed_lists; 94 | num_total_ints += n; 95 | } 96 | 97 | Iterator begin; 98 | uint64_t n; 99 | uint32_t universe; 100 | Builder& builder; 101 | boost::progress_display& progress; 102 | std::vector tmp; 103 | std::vector& output; 104 | bool docs; 105 | uint64_t& num_processed_lists; 106 | uint64_t& num_total_ints; 107 | }; 108 | 109 | // for PEF 110 | /* Queue job that encodes one list with pef::encode into a private bit_vector_builder (tmp) and appends it to the shared builder bvb on commit. For non-docs input, prepare() recomputes universe as the sum of the n values plus one before encoding. NOTE(review): the template parameter list is missing from this listing; the body uses the name Iterator. */ template 111 | struct pef_sequence_adder : semiasync_queue::job { 112 | pef_sequence_adder(Iterator begin, uint64_t n, uint64_t universe, 113 | succinct::bit_vector_builder& bvb, 114 | boost::progress_display& progress, bool docs, 115 | uint64_t& num_processed_lists, uint64_t& num_total_ints) 116 | : begin(begin) 117 | , n(n) 118 | , universe(universe) 119 | , bvb(bvb) 120 | , progress(progress) 121 | , docs(docs) 122 | , num_processed_lists(num_processed_lists) 123 | , num_total_ints(num_total_ints) {} 124 | 125 | virtual void prepare() { 126 | if (not docs) { // on freqs, PEF needs the prefix sums 127 | universe = 0; 128 | auto in = begin; 129 | for (uint64_t i = 0; i != n; ++i, ++in) { 130 | universe += *in; 131 | } 132 | universe += 1; 133 | } 134 | pef::encode(begin, universe, n, tmp, not docs); 135 | } 136 | /* appends a 64-bit offset field, then universe and n as 32-bit fields, then the encoded bits */ 137 | virtual void commit() { 138 | progress += n + 1; 139 | ++num_processed_lists; 140 | num_total_ints += n; 141 | uint64_t offset = bvb.size() + tmp.size(); 142 | bvb.append_bits(offset + 64 + 32 + 32, 64); 143 | 
bvb.append_bits(universe, 32); 144 | bvb.append_bits(n, 32); 145 | bvb.append(tmp); 146 | } 147 | 148 | Iterator begin; 149 | uint64_t n; 150 | uint64_t universe; 151 | succinct::bit_vector_builder& bvb; 152 | succinct::bit_vector_builder tmp; 153 | boost::progress_display& progress; 154 | bool docs; 155 | uint64_t& num_processed_lists; 156 | uint64_t& num_total_ints; 157 | }; 158 | 159 | } // namespace ds2i 160 | -------------------------------------------------------------------------------- /vroom_env/statistics.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.hpp" 4 | 5 | namespace ds2i { 6 | /* Logs the total elapsed time (sum of timings, in seconds), the nanoseconds spent per decoded integer and the integers decoded per second, then prints the same figures as one JSON object on std::cout. Declared inline because it is defined in a header. When num_decoded_ints or the elapsed time is zero the derived rates are reported as 0 instead of dividing by zero. */ 7 | inline void print_statistics(std::string type, char const* encoded_data_filename, 8 | std::vector const& timings, 9 | uint64_t num_decoded_ints, uint64_t num_decoded_lists 10 | // , dint_statistics const& stats 11 | ) { 12 | static const uint64_t billion = 1000000000; 13 | double tot_elapsed = 14 | std::accumulate(timings.begin(), timings.end(), double(0.0)); 15 | double ns_x_int = num_decoded_ints != 0 ? tot_elapsed * billion / num_decoded_ints : 0.0; 16 | uint64_t ints_x_sec = ns_x_int > 0.0 ? uint64_t(1 / ns_x_int * billion) : 0; 17 | 18 | logger() << "elapsed time " << tot_elapsed << " [sec]" << std::endl; 19 | logger() << ns_x_int << " [ns] x int" << std::endl; 20 | logger() << ints_x_sec << " ints x [sec]" << std::endl; 21 | 22 | // logger() << "avg. 
# of decoded integers x codeword: " << 23 | // double(stats.decoded_ints_from_dict) / stats.dict_codewords << std::endl; 24 | 25 | // stats to std output 26 | std::cout << "{"; 27 | std::cout << "\"filename\": \"" << encoded_data_filename << "\", "; 28 | std::cout << "\"num_sequences\": \"" << num_decoded_lists << "\", "; 29 | std::cout << "\"num_integers\": \"" << num_decoded_ints << "\", "; 30 | std::cout << "\"type\": \"" << type << "\", "; 31 | std::cout << "\"tot_elapsed_time\": \"" << tot_elapsed << "\", "; 32 | std::cout << "\"ns_x_int\": \"" << ns_x_int << "\", "; 33 | std::cout << "\"ints_x_sec\": \"" << ints_x_sec << "\""; 34 | std::cout << "}" << std::endl; 35 | 36 | // // binary entropy 37 | // { 38 | // uint64_t total_exceptions = 0; 39 | 40 | // uint64_t total_codewords = 0; 41 | // for (uint64_t i = 0; i != constants::num_entries; ++i) { 42 | // total_codewords += stats.occs[i]; 43 | // } 44 | 45 | // for (auto const& pair: stats.exceptions) { 46 | // total_exceptions += pair.second; 47 | // } 48 | 49 | // logger() << "total_exceptions " << total_exceptions << std::endl; 50 | 51 | // double exceptions_entropy = 0.0; 52 | // for (auto const& pair: stats.exceptions) { 53 | // double p = double(pair.second) / total_exceptions; 54 | // exceptions_entropy += p * std::log2(1.0 / p); 55 | // } 56 | 57 | // double entropy = 0.0; 58 | // for (uint64_t i = 0; i != constants::num_entries; ++i) { 59 | // double x = stats.occs[i]; 60 | // if (x != 0) { 61 | // double p = x / total_codewords; 62 | // entropy += p * std::log2(1.0 / p); 63 | // } 64 | // } 65 | 66 | // logger() << "total_ints " << stats.total_ints << std::endl; 67 | // logger() << "binary entropy of the source: " << (entropy * 68 | // total_codewords + exceptions_entropy * total_exceptions) / 8.0 / 69 | // constants::GiB << " [GiB]" << std::endl; logger() << "binary entropy 70 | // x codeword: " << entropy << " bits" << std::endl; logger() << "binary 71 | // entropy x exception: " << 
exceptions_entropy << " bits" << std::endl; 72 | // logger() << "binary entropy x integer: " << (entropy * 73 | // total_codewords + exceptions_entropy * total_exceptions) / 74 | // stats.total_ints << " bits" << std::endl; 75 | // } 76 | 77 | // uint64_t total_codewords = 0; 78 | // uint64_t total_decoded_ints = 0; 79 | // for (uint64_t i = 0; i < stats.codewords_distr.size(); ++i) { 80 | // total_codewords += stats.codewords_distr[i]; 81 | // total_decoded_ints += stats.ints_distr[i]; 82 | // } 83 | 84 | // std::cout << "total_codewords " << total_codewords << std::endl; 85 | // std::cout << "total_decoded_ints " << total_decoded_ints << std::endl; 86 | 87 | // for (uint64_t i = 0; i < stats.codewords_distr.size(); ++i) { 88 | // if (i == 0) std::cout << "freq:\n"; 89 | // else if (i == stats.codewords_distr.size() - 1) std::cout << 90 | // "rare:\n"; else std::cout << (uint32_t(1) << (i - 1)) << ":\n"; 91 | // std::cout << "\t codewords: " << stats.codewords_distr[i] * 100.0 / 92 | // total_codewords << "%" << std::endl; std::cout << "\t integers: " << 93 | // stats.ints_distr[i] * 100.0 / total_decoded_ints << "%" << std::endl; 94 | // } 95 | } 96 | 97 | } // namespace ds2i --------------------------------------------------------------------------------