├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── all_ones_sequence.hpp ├── binary_collection.hpp ├── binary_freq_collection.hpp ├── bitvector_collection.hpp ├── block_codecs.cpp ├── block_codecs.hpp ├── block_freq_index.hpp ├── block_posting_list.hpp ├── bm25.hpp ├── compact_elias_fano.hpp ├── compact_ranked_bitvector.hpp ├── configuration.hpp ├── create_freq_index.cpp ├── create_wand_data.cpp ├── freq_index.hpp ├── global_parameters.hpp ├── index_types.hpp ├── indexed_sequence.hpp ├── integer_codes.hpp ├── optimal_partition.hpp ├── partitioned_sequence.hpp ├── positive_sequence.hpp ├── queries.cpp ├── queries.hpp ├── semiasync_queue.hpp ├── sequence_collection.hpp ├── strict_elias_fano.hpp ├── strict_sequence.hpp ├── test ├── CMakeLists.txt ├── test_block_codecs.cpp ├── test_block_freq_index.cpp ├── test_block_posting_list.cpp ├── test_compact_elias_fano.cpp ├── test_compact_ranked_bitvector.cpp ├── test_data │ ├── queries │ ├── test_collection.docs │ ├── test_collection.freqs │ └── test_collection.sizes ├── test_freq_index.cpp ├── test_generic_sequence.hpp ├── test_indexed_sequence.cpp ├── test_partitioned_sequence.cpp ├── test_positive_sequence.cpp ├── test_ranked_queries.cpp ├── test_sequence_collection.cpp ├── test_strict_elias_fano.cpp └── test_uniform_partitioned_sequence.cpp ├── uniform_partitioned_sequence.hpp ├── util.hpp └── wand_data.hpp /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | test/temp.bin 3 | 4 | # Python junk 5 | *.pyc 6 | 7 | # Java junk 8 | .classpath 9 | .project 10 | *.jar 11 | .settings 12 | target/ 13 | 14 | # CMake junk 15 | CMakeCache.txt 16 | CMakeFiles/ 17 | *.cmake 18 | Makefile 19 | Testing/ 20 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "succinct"] 2 | path = succinct 3 | url = 
https://github.com/ot/succinct.git 4 | [submodule "FastPFor"] 5 | path = FastPFor 6 | url = https://github.com/lemire/FastPFor.git 7 | [submodule "integer_encoding_library"] 8 | path = integer_encoding_library 9 | url = https://github.com/maropu/integer_encoding_library 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(INDEX_PARTITIONING) 3 | 4 | if(NOT CMAKE_BUILD_TYPE) 5 | set(CMAKE_BUILD_TYPE "Release") 6 | endif() 7 | 8 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 10 | endif () 11 | 12 | if (UNIX) 13 | # C++11 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 15 | 16 | # For hardware popcount 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") 18 | 19 | # Extensive warnings 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces") 21 | # Silence a warning bug in Boost 22 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs") 24 | endif () 25 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion") 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") # Add debug info anyway 28 | 29 | endif() 30 | 31 | find_package(Boost 1.42.0 COMPONENTS iostreams unit_test_framework REQUIRED) 32 | include_directories(${Boost_INCLUDE_DIRS}) 33 | link_directories(${Boost_LIBRARY_DIRS}) 34 | 35 | # add the root directory to include path to make includes absolute 36 | include_directories(${INDEX_PARTITIONING_SOURCE_DIR}) 37 | 38 | add_subdirectory(succinct EXCLUDE_FROM_ALL) 39 | add_subdirectory(FastPFor EXCLUDE_FROM_ALL) 40 | 41 | # bypass integer_encoding_library build system, only take what we need 42 | include_directories(${INDEX_PARTITIONING_SOURCE_DIR}/integer_encoding_library/include) 43 | add_library(block_codecs 44 | 
block_codecs.cpp 45 | integer_encoding_library/src/compress/table/decUnary.cpp 46 | integer_encoding_library/src/compress/table/decGamma.cpp 47 | integer_encoding_library/src/compress/table/decDelta.cpp 48 | integer_encoding_library/src/io/BitsReader.cpp 49 | integer_encoding_library/src/io/BitsWriter.cpp 50 | ) 51 | 52 | add_executable(create_freq_index create_freq_index.cpp) 53 | target_link_libraries(create_freq_index 54 | ${Boost_LIBRARIES} 55 | FastPFor_lib 56 | block_codecs 57 | ) 58 | 59 | add_executable(create_wand_data create_wand_data.cpp) 60 | target_link_libraries(create_wand_data 61 | ${Boost_LIBRARIES} 62 | ) 63 | 64 | add_executable(queries queries.cpp) 65 | target_link_libraries(queries 66 | ${Boost_LIBRARIES} 67 | FastPFor_lib 68 | block_codecs 69 | ) 70 | 71 | enable_testing() 72 | add_subdirectory(test) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2014 Giuseppe Ottaviano , Rossano Venturini 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | partitioned_elias_fano 2 | ====================== 3 | 4 | NOTE: This repository is maintained only for historical reasons. 
This code is 5 | now part of [ds2i](https://github.com/ot/ds2i). 6 | 7 | This repository contains the code used for the experiments in the paper 8 | 9 | * Giuseppe Ottaviano and Rossano Venturini, _Partitioned Elias-Fano Indexes_, 10 | ACM SIGIR 2014. 11 | 12 | 13 | Building the code 14 | ----------------- 15 | 16 | The code is tested on Linux with GCC 4.9 and OSX Mavericks with Clang. 17 | 18 | The following dependencies are needed for the build. 19 | 20 | * CMake >= 2.8, for the build system 21 | * Boost >= 1.42 22 | 23 | The code depends on several git submodules. If you have cloned the repository 24 | without `--recursive`, you will need to perform the following commands before 25 | building: 26 | 27 | $ git submodule init 28 | $ git submodule update 29 | 30 | To build, it should be sufficient to do: 31 | 32 | $ cmake . -DCMAKE_BUILD_TYPE=Release 33 | $ make 34 | 35 | It is also preferable to perform a `make test`, which runs the unit tests. 36 | 37 | 38 | Running the experiments 39 | ----------------------- 40 | 41 | The directory `test/test_data` contains a small document collection used in the 42 | unit tests. The binary format of the collection is described in the next 43 | section. 44 | 45 | To create an index use the command `create_freq_index`. The available index 46 | types are listed in `index_types.hpp`. For example, to create an index using the 47 | optimal partitioning algorithm on the test collection, execute the command: 48 | 49 | $ ./create_freq_index opt test/test_data/test_collection test_collection.index.opt --check 50 | 51 | where `test/test_data/test_collection` is the _basename_ of the collection, that 52 | is the name without the `.{docs,freqs,sizes}` extensions, and 53 | `test_collection.index.opt` is the filename of the output index. `--check` 54 | performs a verification step to check the correctness of the index. 
55 | 56 | To perform BM25 queries it is necessary to build an additional file containing 57 | the parameters needed to compute the score, such as the document lengths. The 58 | file can be built with the following command: 59 | 60 | $ ./create_wand_data test/test_data/test_collection test_collection.wand 61 | 62 | Now it is possible to query the index. The command `queries` parses each line of 63 | the standard input as a tab-separated collection of term-ids, where the i-th 64 | term is the i-th list in the input collection. An example set of queries is 65 | again in `test/test_data`. 66 | 67 | $ ./queries opt test_collection.index.opt test_collection.wand < test/test_data/queries 68 | 69 | 70 | Collection input format 71 | ----------------------- 72 | 73 | A _binary sequence_ is a sequence of integers prefixed by its length, where both 74 | the sequence integers and the length are written as 32-bit little-endian 75 | unsigned integers. 76 | 77 | A _collection_ consists of 3 files, `basename.docs`, `basename.freqs`, 78 | `basename.sizes`. 79 | 80 | * `basename.docs` starts with a singleton binary sequence whose only 81 | integer is the number of documents in the collection. It is then followed by 82 | one binary sequence for each posting list, in order of term-ids. Each posting 83 | list contains the sequence of document-ids containing the term. 84 | 85 | * `basename.freqs` is composed of one binary sequence per posting list, where 86 | each sequence contains the occurrence counts of the postings, aligned with the 87 | previous file (note however that this file does not have an additional 88 | singleton list at its beginning). 89 | 90 | * `basename.sizes` is composed of a single binary sequence whose length is the 91 | same as the number of documents in the collection, and the i-th element of the 92 | sequence is the size (number of terms) of the i-th document. 
93 | 94 | 95 | Authors 96 | ------- 97 | 98 | * Giuseppe Ottaviano 99 | * Rossano Venturini 100 | -------------------------------------------------------------------------------- /all_ones_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "global_parameters.hpp" 4 | #include "util.hpp" 5 | 6 | namespace quasi_succinct { 7 | /* Sequence in which the value at every position equals the position itself; it is only valid when universe == n and it stores no bits. */ 8 | struct all_ones_sequence { 9 | /* Returns 0 when universe == n, otherwise uint64_t(-1) (presumably a "cannot be used" cost for callers that compare sizes -- confirm against the callers). */ 10 | inline static uint64_t 11 | bitsize(global_parameters const& /* params */, uint64_t universe, uint64_t n) 12 | { 13 | return (universe == n) ? 0 : uint64_t(-1); 14 | } 15 | /* Writes nothing to the builder and ignores the iterator; it only asserts universe == n. */ 16 | template 17 | static void write(succinct::bit_vector_builder&, 18 | Iterator, 19 | uint64_t universe, uint64_t n, 20 | global_parameters const&) 21 | { 22 | assert(universe == n); (void)universe; (void)n; 23 | } 24 | /* Cursor that never reads the bit vector or offset it is given; it starts at position size(). */ 25 | class enumerator { 26 | public: 27 | 28 | typedef std::pair value_type; // (position, value) 29 | 30 | enumerator(succinct::bit_vector const&, uint64_t, 31 | uint64_t universe, uint64_t n, 32 | global_parameters const&) 33 | : m_universe(universe) 34 | , m_position(size()) 35 | { 36 | assert(universe == n); (void)n; 37 | } 38 | /* Jumps to `position` (asserted <= size()) and returns (position, position). */ 39 | value_type move(uint64_t position) 40 | { 41 | assert(position <= size()); 42 | m_position = position; 43 | return value_type(m_position, m_position); 44 | } 45 | /* Same effect as move(): the value equals the position, so the first value >= lower_bound is lower_bound itself. */ 46 | value_type next_geq(uint64_t lower_bound) 47 | { 48 | assert(lower_bound <= size()); 49 | m_position = lower_bound; 50 | return value_type(m_position, m_position); 51 | } 52 | /* Advances by one position; no bound check is performed here. */ 53 | value_type next() 54 | { 55 | m_position += 1; 56 | return value_type(m_position, m_position); 57 | } 58 | /* Number of elements, which is the universe given at construction. */ 59 | uint64_t size() const 60 | { 61 | return m_universe; 62 | } 63 | /* Returns m_position - 1, or 0 when the cursor is at position 0. */ 64 | uint64_t prev_value() const 65 | { 66 | if (m_position == 0) { 67 | return 0; 68 | } 69 | return m_position - 1; 70 | } 71 | 72 | private: 73 | uint64_t m_universe; 74 | uint64_t m_position; 75 | }; 76 | }; 77 | } 78 | 
-------------------------------------------------------------------------------- /binary_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "util.hpp" 10 | 11 | namespace quasi_succinct { 12 | 13 | class binary_collection { 14 | public: 15 | typedef uint32_t posting_type; 16 | 17 | binary_collection(const char* filename) 18 | { 19 | m_file.open(filename); 20 | if ( !m_file.is_open() ) { 21 | throw std::runtime_error("Error opening file"); 22 | } 23 | m_data = (posting_type const*)m_file.data(); 24 | m_data_size = m_file.size() / sizeof(m_data[0]); 25 | 26 | auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL); 27 | if (ret) logger() << "Error calling madvice: " << errno << std::endl; 28 | } 29 | 30 | class iterator; 31 | 32 | iterator begin() const 33 | { 34 | return iterator(this, 0); 35 | } 36 | 37 | iterator end() const 38 | { 39 | return iterator(this, m_data_size); 40 | } 41 | 42 | class sequence { 43 | public: 44 | sequence() 45 | : m_begin(nullptr) 46 | , m_end(nullptr) 47 | {} 48 | 49 | posting_type const* begin() const 50 | { 51 | return m_begin; 52 | } 53 | 54 | posting_type const* end() const 55 | { 56 | return m_end; 57 | } 58 | 59 | posting_type back() const 60 | { 61 | assert(size()); 62 | return *(m_end - 1); 63 | } 64 | 65 | size_t size() const 66 | { 67 | return m_end - m_begin; 68 | } 69 | 70 | private: 71 | friend class binary_collection::iterator; 72 | 73 | sequence(posting_type const* begin, posting_type const* end) 74 | : m_begin(begin) 75 | , m_end(end) 76 | {} 77 | 78 | posting_type const* m_begin; 79 | posting_type const* m_end; 80 | }; 81 | 82 | class iterator : public std::iterator { 84 | public: 85 | iterator() 86 | : m_collection(nullptr) 87 | {} 88 | 89 | value_type const& operator*() const 90 | { 91 | return m_cur_seq; 92 | } 93 | 94 | value_type const* operator->() 
const 95 | { 96 | return &m_cur_seq; 97 | } 98 | 99 | iterator& operator++() 100 | { 101 | m_pos = m_next_pos; 102 | read(); 103 | return *this; 104 | } 105 | 106 | bool operator==(iterator const& other) const 107 | { 108 | assert(m_collection == other.m_collection); 109 | return m_pos == other.m_pos; 110 | } 111 | 112 | bool operator!=(iterator const& other) const 113 | { 114 | return !(*this == other); 115 | } 116 | 117 | private: 118 | friend class binary_collection; 119 | 120 | iterator(binary_collection const* coll, size_t pos) 121 | : m_collection(coll) 122 | , m_pos(pos) 123 | { 124 | read(); 125 | } 126 | 127 | void read() 128 | { 129 | assert(m_pos <= m_collection->m_data_size); 130 | if (m_pos == m_collection->m_data_size) return; 131 | 132 | size_t n = 0; 133 | size_t pos = m_pos; 134 | while (!(n = m_collection->m_data[pos++])); // skip empty seqs 135 | // file might be truncated 136 | n = std::min(n, size_t(m_collection->m_data_size - pos)); 137 | posting_type const* begin = &m_collection->m_data[pos]; 138 | posting_type const* end = begin + n; 139 | 140 | m_next_pos = pos + n; 141 | m_cur_seq = sequence(begin, end); 142 | } 143 | 144 | binary_collection const* m_collection; 145 | size_t m_pos, m_next_pos; 146 | sequence m_cur_seq; 147 | }; 148 | 149 | private: 150 | boost::iostreams::mapped_file_source m_file; 151 | posting_type const* m_data; 152 | size_t m_data_size; 153 | }; 154 | } 155 | -------------------------------------------------------------------------------- /binary_freq_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "binary_collection.hpp" 8 | 9 | namespace quasi_succinct { 10 | 11 | class binary_freq_collection { 12 | public: 13 | 14 | binary_freq_collection(const char* basename) 15 | : m_docs((std::string(basename) + ".docs").c_str()) 16 | , m_freqs((std::string(basename) + ".freqs").c_str()) 17 | { 18 | auto 
firstseq = *m_docs.begin(); 19 | if (firstseq.size() != 1) { 20 | throw std::invalid_argument("First sequence should only contain number of documents"); 21 | } 22 | m_num_docs = *firstseq.begin(); 23 | } 24 | 25 | class iterator; 26 | 27 | iterator begin() const 28 | { 29 | auto docs_it = m_docs.begin(); 30 | return iterator(++docs_it, m_freqs.begin()); 31 | } 32 | 33 | iterator end() const 34 | { 35 | return iterator(m_docs.end(), m_freqs.end()); 36 | } 37 | 38 | uint64_t num_docs() const 39 | { 40 | return m_num_docs; 41 | } 42 | 43 | struct sequence { 44 | binary_collection::sequence docs; 45 | binary_collection::sequence freqs; 46 | }; 47 | 48 | class iterator : public std::iterator { 50 | public: 51 | iterator() 52 | {} 53 | 54 | value_type const& operator*() const 55 | { 56 | return m_cur_seq; 57 | } 58 | 59 | value_type const* operator->() const 60 | { 61 | return &m_cur_seq; 62 | } 63 | 64 | iterator& operator++() 65 | { 66 | m_cur_seq.docs = *++m_docs_it; 67 | m_cur_seq.freqs = *++m_freqs_it; 68 | return *this; 69 | } 70 | 71 | bool operator==(iterator const& other) const 72 | { 73 | return m_docs_it == other.m_docs_it; 74 | } 75 | 76 | bool operator!=(iterator const& other) const 77 | { 78 | return !(*this == other); 79 | } 80 | 81 | private: 82 | friend class binary_freq_collection; 83 | 84 | iterator(binary_collection::iterator docs_it, 85 | binary_collection::iterator freqs_it) 86 | : m_docs_it(docs_it) 87 | , m_freqs_it(freqs_it) 88 | { 89 | m_cur_seq.docs = *m_docs_it; 90 | m_cur_seq.freqs = *m_freqs_it; 91 | } 92 | 93 | binary_collection::iterator m_docs_it; 94 | binary_collection::iterator m_freqs_it; 95 | sequence m_cur_seq; 96 | }; 97 | 98 | private: 99 | binary_collection m_docs; 100 | binary_collection m_freqs; 101 | uint64_t m_num_docs; 102 | }; 103 | } 104 | -------------------------------------------------------------------------------- /bitvector_collection.hpp: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "compact_elias_fano.hpp" 6 | 7 | namespace quasi_succinct { 8 | 9 | class bitvector_collection { 10 | public: 11 | bitvector_collection() 12 | : m_size(0) 13 | {} 14 | 15 | class builder { 16 | public: 17 | builder(global_parameters const& params) 18 | : m_params(params) 19 | { 20 | m_endpoints.push_back(0); 21 | } 22 | 23 | void append(succinct::bit_vector_builder& bvb) 24 | { 25 | m_bitvectors.append(bvb); 26 | m_endpoints.push_back(m_bitvectors.size()); 27 | } 28 | 29 | void build(bitvector_collection& sq) 30 | { 31 | sq.m_size = m_endpoints.size() - 1; 32 | succinct::bit_vector(&m_bitvectors).swap(sq.m_bitvectors); 33 | 34 | succinct::bit_vector_builder bvb; 35 | compact_elias_fano::write(bvb, m_endpoints.begin(), 36 | m_bitvectors.size(), sq.m_size, 37 | m_params); 38 | succinct::bit_vector(&bvb).swap(sq.m_endpoints); 39 | } 40 | 41 | private: 42 | global_parameters m_params; 43 | std::vector m_endpoints; 44 | succinct::bit_vector_builder m_bitvectors; 45 | }; 46 | 47 | size_t size() const 48 | { 49 | return m_size; 50 | } 51 | 52 | succinct::bit_vector const& bits() const 53 | { 54 | return m_bitvectors; 55 | } 56 | 57 | succinct::bit_vector::enumerator 58 | get(global_parameters const& params, size_t i) const 59 | { 60 | assert(i < size()); 61 | compact_elias_fano::enumerator endpoints(m_endpoints, 0, 62 | m_bitvectors.size(), m_size, 63 | params); 64 | 65 | auto endpoint = endpoints.move(i).second; 66 | return succinct::bit_vector::enumerator(m_bitvectors, endpoint); 67 | } 68 | 69 | void swap(bitvector_collection& other) 70 | { 71 | std::swap(m_size, other.m_size); 72 | m_endpoints.swap(other.m_endpoints); 73 | m_bitvectors.swap(other.m_bitvectors); 74 | } 75 | 76 | template 77 | void map(Visitor& visit) 78 | { 79 | visit 80 | (m_size, "m_size") 81 | (m_endpoints, "m_endpoints") 82 | (m_bitvectors, "m_bitvectors") 83 | ; 84 | 
} 85 | 86 | private: 87 | size_t m_size; 88 | succinct::bit_vector m_endpoints; 89 | succinct::bit_vector m_bitvectors; 90 | }; 91 | } 92 | -------------------------------------------------------------------------------- /block_codecs.cpp: -------------------------------------------------------------------------------- 1 | #include "block_codecs.hpp" 2 | 3 | namespace quasi_succinct { 4 | optpfor_block::codec_type optpfor_block::optpfor_codec; 5 | TightVariableByte optpfor_block::vbyte_codec; 6 | 7 | VarIntG8IU varint_G8IU_block::varint_codec; 8 | TightVariableByte varint_G8IU_block::vbyte_codec; 9 | } 10 | -------------------------------------------------------------------------------- /block_codecs.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "FastPFor/headers/optpfor.h" 4 | #include "FastPFor/headers/variablebyte.h" 5 | #include "FastPFor/headers/VarIntG8IU.h" 6 | 7 | // from integer_encoding_library 8 | #undef ASSERT // XXX WHERE IS THIS DEFINED?? 9 | #include "io/BitsReader.hpp" 10 | #include "io/BitsWriter.hpp" 11 | 12 | namespace quasi_succinct { 13 | 14 | // workaround: VariableByte::decodeArray needs the buffer size, while we 15 | // only know the number of values. It also pads to 32 bits. We need to 16 | // rewrite 17 | class TightVariableByte { 18 | public: 19 | template 20 | static uint8_t extract7bits(const uint32_t val) { 21 | return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); 22 | } 23 | 24 | template 25 | static uint8_t extract7bitsmaskless(const uint32_t val) { 26 | return static_cast((val >> (7 * i))); 27 | } 28 | 29 | static void encode(const uint32_t *in, const size_t length, 30 | uint8_t *out, size_t& nvalue) 31 | { 32 | uint8_t * bout = out; 33 | for (size_t k = 0; k < length; ++k) { 34 | const uint32_t val(in[k]); 35 | /** 36 | * Code below could be shorter. Whether it could be faster 37 | * depends on your compiler and machine. 
38 | */ 39 | if (val < (1U << 7)) { 40 | *bout = static_cast(val | (1U << 7)); 41 | ++bout; 42 | } else if (val < (1U << 14)) { 43 | *bout = extract7bits<0> (val); 44 | ++bout; 45 | *bout = extract7bitsmaskless<1> (val) | (1U << 7); 46 | ++bout; 47 | } else if (val < (1U << 21)) { 48 | *bout = extract7bits<0> (val); 49 | ++bout; 50 | *bout = extract7bits<1> (val); 51 | ++bout; 52 | *bout = extract7bitsmaskless<2> (val) | (1U << 7); 53 | ++bout; 54 | } else if (val < (1U << 28)) { 55 | *bout = extract7bits<0> (val); 56 | ++bout; 57 | *bout = extract7bits<1> (val); 58 | ++bout; 59 | *bout = extract7bits<2> (val); 60 | ++bout; 61 | *bout = extract7bitsmaskless<3> (val) | (1U << 7); 62 | ++bout; 63 | } else { 64 | *bout = extract7bits<0> (val); 65 | ++bout; 66 | *bout = extract7bits<1> (val); 67 | ++bout; 68 | *bout = extract7bits<2> (val); 69 | ++bout; 70 | *bout = extract7bits<3> (val); 71 | ++bout; 72 | *bout = extract7bitsmaskless<4> (val) | (1U << 7); 73 | ++bout; 74 | } 75 | } 76 | nvalue = bout - out; 77 | } 78 | 79 | static void encode_single(uint32_t val, std::vector& out) 80 | { 81 | uint8_t buf[5]; 82 | size_t nvalue; 83 | encode(&val, 1, buf, nvalue); 84 | out.insert(out.end(), buf, buf + nvalue); 85 | } 86 | 87 | static uint8_t const* decode(const uint8_t *in, uint32_t *out, size_t n) 88 | { 89 | const uint8_t * inbyte = in; 90 | for (size_t i = 0; i < n; ++i) { 91 | unsigned int shift = 0; 92 | for (uint32_t v = 0; ; shift += 7) { 93 | uint8_t c = *inbyte++; 94 | v += ((c & 127) << shift); 95 | if ((c & 128)) { 96 | *out++ = v; 97 | break; 98 | } 99 | } 100 | } 101 | return inbyte; 102 | } 103 | }; 104 | 105 | struct optpfor_block { 106 | 107 | struct codec_type : OPTPFor<4, Simple16> { 108 | // workaround: OPTPFor does not define decodeBlock, so we cut&paste 109 | // the code 110 | uint32_t const* decodeBlock(const uint32_t *in, uint32_t *out, size_t& nvalue) 111 | { 112 | const uint32_t * const initout(out); 113 | const uint32_t b = *in >> (32 - 
PFORDELTA_B); 114 | const size_t nExceptions = (*in >> (32 - (PFORDELTA_B 115 | + PFORDELTA_NEXCEPT))) & ((1 << PFORDELTA_NEXCEPT) - 1); 116 | const uint32_t encodedExceptionsSize = *in & ((1 << PFORDELTA_EXCEPTSZ) 117 | - 1); 118 | 119 | size_t twonexceptions = 2 * nExceptions; 120 | ++in; 121 | if (encodedExceptionsSize > 0) 122 | ecoder.decodeArray(in, encodedExceptionsSize, &exceptions[0], 123 | twonexceptions); 124 | assert(twonexceptions >= 2 * nExceptions); 125 | in += encodedExceptionsSize; 126 | 127 | uint32_t * beginout(out);// we use this later 128 | 129 | for (uint32_t j = 0; j < BlockSize; j += 32) { 130 | fastunpack(in, out, b); 131 | in += b; 132 | out += 32; 133 | } 134 | 135 | for (uint32_t e = 0, lpos = -1; e < nExceptions; e++) { 136 | lpos += exceptions[e] + 1; 137 | beginout[lpos] |= (exceptions[e + nExceptions] + 1) << b; 138 | } 139 | 140 | nvalue = out - initout; 141 | return in; 142 | } 143 | }; 144 | 145 | static codec_type optpfor_codec; 146 | static TightVariableByte vbyte_codec; 147 | 148 | static const uint64_t block_size = codec_type::BlockSize; 149 | 150 | static void encode(uint32_t const* in, uint32_t /* sum_of_values */, 151 | size_t n, std::vector& out) 152 | { 153 | assert(n <= block_size); 154 | // XXX this could be threadlocal static 155 | std::vector buf(2 * 4 * block_size); 156 | size_t out_len = buf.size(); 157 | 158 | if (n == block_size) { 159 | optpfor_codec.encodeBlock(in, reinterpret_cast(buf.data()), 160 | out_len); 161 | out_len *= 4; 162 | } else { 163 | vbyte_codec.encode(in, n, buf.data(), out_len); 164 | } 165 | out.insert(out.end(), buf.data(), buf.data() + out_len); 166 | } 167 | 168 | static uint8_t const* decode(uint8_t const* in, uint32_t* out, 169 | uint32_t /* sum_of_values */, size_t n) 170 | { 171 | assert(n <= block_size); 172 | size_t out_len = block_size; 173 | uint8_t const* ret; 174 | 175 | if (n == block_size) { 176 | ret = reinterpret_cast 177 | (optpfor_codec.decodeBlock(reinterpret_cast(in), 178 
| out, out_len)); 179 | assert(out_len == n); 180 | } else { 181 | ret = vbyte_codec.decode(in, out, n); 182 | } 183 | return ret; 184 | } 185 | }; 186 | 187 | struct varint_G8IU_block { 188 | static VarIntG8IU varint_codec; 189 | static TightVariableByte vbyte_codec; 190 | 191 | static const uint64_t block_size = 128; 192 | 193 | static void encode(uint32_t const* in, uint32_t /* sum_of_values */, 194 | size_t n, std::vector& out) 195 | { 196 | assert(n <= block_size); 197 | // XXX this could be threadlocal static 198 | std::vector buf(2 * 4 * block_size); 199 | size_t out_len = buf.size(); 200 | 201 | if (n == block_size) { 202 | const uint32_t * src = in; 203 | unsigned char* dst = buf.data(); 204 | size_t srclen = n * 4; 205 | size_t dstlen = out_len; 206 | out_len = 0; 207 | while (srclen > 0 && dstlen >= 9) { 208 | out_len += varint_codec.encodeBlock(src, srclen, dst, dstlen); 209 | } 210 | assert(srclen == 0); 211 | } else { 212 | vbyte_codec.encode(in, n, buf.data(), out_len); 213 | } 214 | out.insert(out.end(), buf.data(), buf.data() + out_len); 215 | } 216 | 217 | static uint8_t const* decode(uint8_t const* in, uint32_t* out, 218 | uint32_t /* sum_of_values */, size_t n) 219 | { 220 | assert(n <= block_size); 221 | size_t out_len = block_size; 222 | uint8_t const* ret; 223 | 224 | if (n == block_size) { 225 | const uint8_t * src = in; 226 | uint32_t* dst = out; 227 | size_t srclen = 2 * out_len * 4; // upper bound 228 | size_t dstlen = out_len * 4; 229 | out_len = 0; 230 | while (out_len <= (n - 8)) { 231 | out_len += varint_codec.decodeBlock(src, srclen, dst, dstlen); 232 | } 233 | 234 | // decodeBlock can overshoot, so we decode the last blocks in a 235 | // local buffer 236 | while (out_len < n) { 237 | uint32_t buf[8]; 238 | uint32_t* bufptr = buf; 239 | size_t buflen = 8 * 4; 240 | size_t read = varint_codec.decodeBlock(src, srclen, bufptr, buflen); 241 | size_t needed = std::min(read, n - out_len); 242 | memcpy(dst, buf, needed * 4); 243 | dst += 
needed; 244 | out_len += needed; 245 | } 246 | assert(out_len == n); 247 | ret = src; 248 | } else { 249 | ret = vbyte_codec.decode(in, out, n); 250 | } 251 | return ret; 252 | } 253 | }; 254 | 255 | struct interpolative_block { 256 | static const uint64_t block_size = 128; 257 | 258 | static void encode(uint32_t const* in, uint32_t sum_of_values, 259 | size_t n, std::vector& out) 260 | { 261 | assert(n <= block_size); 262 | std::vector inbuf(n); 263 | inbuf[0] = *in; 264 | for (size_t i = 1; i < n; ++i) { 265 | inbuf[i] = inbuf[i - 1] + in[i] + 1; 266 | } 267 | std::vector buf(2 * block_size); 268 | if (sum_of_values == uint32_t(-1)) { 269 | sum_of_values = inbuf.back() - (n - 1); 270 | TightVariableByte::encode_single(sum_of_values, out); 271 | } 272 | 273 | if (n > 1) { 274 | uint32_t high = sum_of_values + n - 1; 275 | integer_encoding::internals::BitsWriter bw(buf.data(), buf.size()); 276 | bw.intrpolatvArray(inbuf.data(), n - 1, 0, 0, high); 277 | bw.flush_bits(); 278 | uint8_t const* bufptr = (uint8_t const*)buf.data(); 279 | out.insert(out.end(), bufptr, bufptr + bw.size() * 4); // XXX wasting one word! 
280 | } 281 | } 282 | 283 | static uint8_t const* decode(uint8_t const* in, uint32_t* out, 284 | uint32_t sum_of_values, size_t n) 285 | { 286 | assert(n <= block_size); 287 | uint8_t const* inbuf = in; 288 | if (sum_of_values == uint32_t(-1)) { 289 | inbuf = TightVariableByte::decode(inbuf, &sum_of_values, 1); 290 | } 291 | 292 | uint32_t high = sum_of_values + n - 1; 293 | out[n - 1] = high; 294 | if (n > 1) { 295 | integer_encoding::internals::BitsReader br((uint32_t const*)inbuf, 2 * n); 296 | br.intrpolatvArray(out, n - 1, 0, 0, high); 297 | for (size_t i = n - 1; i > 0; --i) { 298 | out[i] -= out[i - 1] + 1; 299 | } 300 | return (uint8_t const*)(br.pos() + 1); 301 | } else { 302 | return inbuf; 303 | } 304 | } 305 | }; 306 | } 307 | -------------------------------------------------------------------------------- /block_freq_index.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "compact_elias_fano.hpp" 7 | #include "block_posting_list.hpp" 8 | 9 | namespace quasi_succinct { 10 | 11 | template 12 | class block_freq_index { 13 | public: 14 | block_freq_index() 15 | : m_size(0) 16 | {} 17 | 18 | class builder { 19 | public: 20 | builder(uint64_t num_docs, global_parameters const& params) 21 | : m_params(params) 22 | { 23 | m_num_docs = num_docs; 24 | m_endpoints.push_back(0); 25 | } 26 | template 27 | void add_posting_list(uint64_t n, DocsIterator docs_begin, 28 | FreqsIterator freqs_begin, uint64_t /* occurrences */) 29 | { 30 | if (!n) throw std::invalid_argument("List must be nonempty"); 31 | block_posting_list::write(m_lists, n, 32 | docs_begin, freqs_begin); 33 | m_endpoints.push_back(m_lists.size()); 34 | } 35 | 36 | void build(block_freq_index& sq) 37 | { 38 | sq.m_params = m_params; 39 | sq.m_size = m_endpoints.size() - 1; 40 | sq.m_num_docs = m_num_docs; 41 | sq.m_lists.steal(m_lists); 42 | 43 | succinct::bit_vector_builder bvb; 44 | 
compact_elias_fano::write(bvb, m_endpoints.begin(), 45 | sq.m_lists.size(), sq.m_size, 46 | m_params); // XXX 47 | succinct::bit_vector(&bvb).swap(sq.m_endpoints); 48 | } 49 | 50 | private: 51 | global_parameters m_params; 52 | size_t m_num_docs; 53 | std::vector m_endpoints; 54 | std::vector m_lists; 55 | }; 56 | 57 | size_t size() const 58 | { 59 | return m_size; 60 | } 61 | 62 | uint64_t num_docs() const 63 | { 64 | return m_num_docs; 65 | } 66 | 67 | typedef typename block_posting_list::document_enumerator document_enumerator; 68 | 69 | document_enumerator operator[](size_t i) const 70 | { 71 | assert(i < size()); 72 | compact_elias_fano::enumerator endpoints(m_endpoints, 0, 73 | m_lists.size(), m_size, 74 | m_params); 75 | 76 | auto endpoint = endpoints.move(i).second; 77 | return document_enumerator(m_lists.data() + endpoint, num_docs()); 78 | } 79 | 80 | void swap(block_freq_index& other) 81 | { 82 | std::swap(m_params, other.m_params); 83 | std::swap(m_size, other.m_size); 84 | m_endpoints.swap(other.m_endpoints); 85 | m_lists.swap(other.m_lists); 86 | } 87 | 88 | template 89 | void map(Visitor& visit) 90 | { 91 | visit 92 | (m_params, "m_params") 93 | (m_size, "m_size") 94 | (m_num_docs, "m_num_docs") 95 | (m_endpoints, "m_endpoints") 96 | (m_lists, "m_lists") 97 | ; 98 | } 99 | 100 | private: 101 | global_parameters m_params; 102 | size_t m_size; 103 | size_t m_num_docs; 104 | succinct::bit_vector m_endpoints; 105 | succinct::mapper::mappable_vector m_lists; 106 | }; 107 | } 108 | -------------------------------------------------------------------------------- /block_posting_list.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "succinct/util.hpp" 4 | #include "block_codecs.hpp" 5 | #include "util.hpp" 6 | 7 | namespace quasi_succinct { 8 | 9 | template 10 | struct block_posting_list { 11 | 12 | template 13 | static void write(std::vector& out, uint32_t n, 14 | DocsIterator docs_begin, 
// Builds an enumerator over the encoded posting list that starts at `data`.
//
// The list length is decoded into m_n by TightVariableByte::decode; the
// pointer that call returns (m_base) is where the per-list tables begin:
//   m_block_maxs      - 4 bytes per block (read back by block_max())
//   m_block_endpoints - 4 bytes per block except the last one
//   m_blocks_data     - the encoded blocks themselves
// `universe` is kept in m_universe; next() and next_geq() assign it to the
// current docid once the list is exhausted.
//
// The initializers depend on each other (m_n -> m_base -> m_blocks -> the
// three pointers), so they rely on the members being declared in this order.
//
// NOTE(review): `m_blocks - 1` wraps around when m_n == 0; presumably empty
// posting lists are never written - confirm against the builder.
document_enumerator(uint8_t const* data, uint64_t universe)
    : m_n(0) // just to silence warnings
    , m_base(TightVariableByte::decode(data, &m_n, 1))
    , m_blocks(succinct::util::ceil_div(m_n, BlockCodec::block_size))
    , m_block_maxs(m_base)
    , m_block_endpoints(m_block_maxs + 4 * m_blocks)
    , m_blocks_data(m_block_endpoints + 4 * (m_blocks - 1))
    , m_universe(universe)
{
    // Both scratch buffers are sized for one full block.
    m_docs_buf.resize(BlockCodec::block_size);
    m_freqs_buf.resize(BlockCodec::block_size);
    // Position the enumerator on the first posting (decodes block 0).
    reset();
}
71 | { 72 | decode_docs_block(0); 73 | } 74 | 75 | void QS_ALWAYSINLINE next() 76 | { 77 | ++m_pos_in_block; 78 | if (QS_UNLIKELY(m_pos_in_block == m_cur_block_size)) { 79 | if (m_cur_block + 1 == m_blocks) { 80 | m_cur_docid = m_universe; 81 | return; 82 | } 83 | decode_docs_block(m_cur_block + 1); 84 | } else { 85 | m_cur_docid += m_docs_buf[m_pos_in_block] + 1; 86 | } 87 | } 88 | 89 | void QS_ALWAYSINLINE next_geq(uint64_t lower_bound) 90 | { 91 | assert(lower_bound >= m_cur_docid); 92 | if (QS_UNLIKELY(lower_bound > m_cur_block_max)) { 93 | // binary search seems to perform worse here 94 | if (lower_bound > block_max(m_blocks - 1)) { 95 | m_cur_docid = m_universe; 96 | return; 97 | } 98 | 99 | uint64_t block = m_cur_block + 1; 100 | while (block_max(block) < lower_bound) { 101 | ++block; 102 | } 103 | 104 | decode_docs_block(block); 105 | } 106 | 107 | while (docid() < lower_bound) { 108 | m_cur_docid += m_docs_buf[++m_pos_in_block] + 1; 109 | assert(m_pos_in_block < m_cur_block_size); 110 | } 111 | } 112 | 113 | void QS_ALWAYSINLINE move(uint64_t pos) 114 | { 115 | assert(pos >= position()); 116 | uint64_t block = pos / BlockCodec::block_size; 117 | if (QS_UNLIKELY(block != m_cur_block)) { 118 | decode_docs_block(block); 119 | } 120 | while (position() < pos) { 121 | m_cur_docid += m_docs_buf[++m_pos_in_block] + 1; 122 | } 123 | } 124 | 125 | uint64_t docid() const 126 | { 127 | return m_cur_docid; 128 | } 129 | 130 | uint64_t QS_ALWAYSINLINE freq() 131 | { 132 | if (!m_freqs_decoded) { 133 | decode_freqs_block(); 134 | } 135 | return m_freqs_buf[m_pos_in_block] + 1; 136 | } 137 | 138 | uint64_t position() const 139 | { 140 | return m_cur_block * BlockCodec::block_size + m_pos_in_block; 141 | } 142 | 143 | uint64_t size() const 144 | { 145 | return m_n; 146 | } 147 | 148 | uint64_t stats_freqs_size() const 149 | { 150 | uint64_t bytes = 0; 151 | uint8_t const* ptr = m_blocks_data; 152 | static const uint64_t block_size = BlockCodec::block_size; 153 | 
std::vector buf(block_size); 154 | for (size_t b = 0; b < m_blocks; ++b) { 155 | uint32_t cur_block_size = 156 | ((b + 1) * block_size <= size()) 157 | ? block_size : (size() % block_size); 158 | 159 | uint32_t cur_base = (b ? block_max(b - 1) : uint32_t(-1)) + 1; 160 | uint8_t const* freq_ptr = 161 | BlockCodec::decode(ptr, buf.data(), 162 | block_max(b) - cur_base - (cur_block_size - 1), 163 | cur_block_size); 164 | ptr = BlockCodec::decode(freq_ptr, buf.data(), 165 | uint32_t(-1), cur_block_size); 166 | bytes += ptr - freq_ptr; 167 | } 168 | 169 | return bytes; 170 | } 171 | 172 | private: 173 | uint32_t block_max(uint32_t block) const 174 | { 175 | return ((uint32_t const*)m_block_maxs)[block]; 176 | } 177 | 178 | void QS_NOINLINE decode_docs_block(uint64_t block) 179 | { 180 | static const uint64_t block_size = BlockCodec::block_size; 181 | uint32_t endpoint = block 182 | ? ((uint32_t const*)m_block_endpoints)[block - 1] 183 | : 0; 184 | uint8_t const* block_data = m_blocks_data + endpoint; 185 | m_cur_block_size = 186 | ((block + 1) * block_size <= size()) 187 | ? block_size : (size() % block_size); 188 | uint32_t cur_base = (block ? 
// BM25 scoring helpers, split into a per-document and a per-query factor.
struct bm25 {
    static constexpr float b = 0.5;
    static constexpr float k1 = 1.2;

    // Document-side factor: tf / (tf + k1 * (1 - b + b * norm_len)).
    // `norm_len` is used as given; it is not normalised here.
    static float doc_term_weight(uint64_t freq, float norm_len)
    {
        float tf = (float)freq;
        float denominator = tf + k1 * (1.0f - b + b * norm_len);
        return tf / denominator;
    }

    // Query-side factor: qf * max(epsilon, idf) * (1 + k1), where
    // idf = log((num_docs - df + 0.5) / (df + 0.5)). The clamp keeps the
    // weight strictly positive for terms whose idf would be <= 0.
    static float query_term_weight(uint64_t freq, uint64_t df, uint64_t num_docs)
    {
        static const float epsilon_score = 1.0E-6;

        float qf = (float)freq;
        float fdf = (float)df;
        float idf = std::log((float(num_docs) - fdf + 0.5f) / (fdf + 0.5f));
        float clamped_idf = std::max(epsilon_score, idf);
        return qf * clamped_idf * (1.0f + k1);
    }
};
n) 65 | { 66 | return offsets(0, universe, n, params).end; 67 | } 68 | 69 | template 70 | static void write(succinct::bit_vector_builder& bvb, 71 | Iterator begin, 72 | uint64_t universe, uint64_t n, 73 | global_parameters const& params) 74 | { 75 | using succinct::util::ceil_div; 76 | uint64_t base_offset = bvb.size(); 77 | offsets of(base_offset, universe, n, params); 78 | // initialize all the bits to 0 79 | bvb.zero_extend(of.end - base_offset); 80 | 81 | uint64_t sample1_mask = (uint64_t(1) << of.log_sampling1) - 1; 82 | uint64_t offset; 83 | 84 | // utility function to set 0 pointers 85 | auto set_ptr0s = [&](uint64_t begin, uint64_t end, 86 | uint64_t rank_end) { 87 | 88 | uint64_t begin_zeros = begin - rank_end; 89 | uint64_t end_zeros = end - rank_end; 90 | 91 | for (uint64_t ptr0 = ceil_div(begin_zeros, uint64_t(1) << of.log_sampling0); 92 | (ptr0 << of.log_sampling0) < end_zeros; 93 | ++ptr0) { 94 | if (!ptr0) continue; 95 | offset = of.pointers0_offset + (ptr0 - 1) * of.pointer_size; 96 | assert(offset + of.pointer_size <= of.pointers1_offset); 97 | bvb.set_bits(offset, (ptr0 << of.log_sampling0) + rank_end, 98 | of.pointer_size); 99 | } 100 | }; 101 | 102 | uint64_t last = 0; 103 | uint64_t last_high = 0; 104 | Iterator it = begin; 105 | for (size_t i = 0; i < n; ++i) { 106 | uint64_t v = *it++; 107 | if (i && v < last) { 108 | throw std::runtime_error("Sequence is not sorted"); 109 | } 110 | assert(v < universe); 111 | uint64_t high = (v >> of.lower_bits) + i + 1; 112 | uint64_t low = v & of.mask; 113 | 114 | bvb.set(of.higher_bits_offset + high, 1); 115 | 116 | offset = of.lower_bits_offset + i * of.lower_bits; 117 | assert(offset + of.lower_bits <= of.end); 118 | bvb.set_bits(offset, low, of.lower_bits); 119 | 120 | if (i && (i & sample1_mask) == 0) { 121 | uint64_t ptr1 = i >> of.log_sampling1; 122 | assert(ptr1 > 0); 123 | offset = of.pointers1_offset + (ptr1 - 1) * of.pointer_size; 124 | assert(offset + of.pointer_size <= of.higher_bits_offset); 
125 | bvb.set_bits(offset, high, of.pointer_size); 126 | } 127 | 128 | // write pointers for the run of zeros in [last_high, high) 129 | set_ptr0s(last_high + 1, high, i); 130 | last_high = high; 131 | last = v; 132 | } 133 | 134 | // pointers to zeros after the last 1 135 | set_ptr0s(last_high + 1, of.higher_bits_length, n); // XXX 136 | } 137 | 138 | class enumerator { 139 | public: 140 | 141 | typedef std::pair value_type; // (position, value) 142 | 143 | enumerator() 144 | {} 145 | 146 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 147 | uint64_t universe, uint64_t n, 148 | global_parameters const& params) 149 | : m_bv(&bv) 150 | , m_of(offset, universe, n, params) 151 | , m_position(size()) 152 | , m_value(m_of.universe) 153 | {} 154 | 155 | value_type move(uint64_t position) 156 | { 157 | assert(position <= m_of.n); 158 | 159 | if (position == m_position) { 160 | return value(); 161 | } 162 | 163 | uint64_t skip = position - m_position; 164 | // optimize small forward skips 165 | if (QS_LIKELY(position > m_position && skip <= linear_scan_threshold)) { 166 | m_position = position; 167 | if (QS_UNLIKELY(m_position == size())) { 168 | m_value = m_of.universe; 169 | } else { 170 | succinct::bit_vector::unary_enumerator he = m_high_enumerator; 171 | for (size_t i = 0; i < skip; ++i) { 172 | he.next(); 173 | } 174 | m_value = ((he.position() - m_of.higher_bits_offset - m_position - 1) 175 | << m_of.lower_bits) | read_low(); 176 | m_high_enumerator = he; 177 | } 178 | return value(); 179 | } 180 | 181 | return slow_move(position); 182 | } 183 | 184 | value_type next_geq(uint64_t lower_bound) 185 | { 186 | if (lower_bound == m_value) { 187 | return value(); 188 | } 189 | 190 | uint64_t high_lower_bound = lower_bound >> m_of.lower_bits; 191 | uint64_t cur_high = m_value >> m_of.lower_bits; 192 | uint64_t high_diff = high_lower_bound - cur_high; 193 | 194 | if (QS_LIKELY(lower_bound > m_value 195 | && high_diff <= linear_scan_threshold)) { 196 | // 
optimize small skips 197 | next_reader next_value(*this, m_position + 1); 198 | uint64_t val; 199 | do { 200 | m_position += 1; 201 | if (QS_LIKELY(m_position < size())) { 202 | val = next_value(); 203 | } else { 204 | val = m_of.universe; 205 | break; 206 | } 207 | } while (val < lower_bound); 208 | 209 | m_value = val; 210 | return value(); 211 | } else { 212 | return slow_next_geq(lower_bound); 213 | } 214 | } 215 | 216 | uint64_t size() const 217 | { 218 | return m_of.n; 219 | } 220 | 221 | value_type next() 222 | { 223 | m_position += 1; 224 | assert(m_position <= size()); 225 | 226 | if (QS_LIKELY(m_position < size())) { 227 | m_value = read_next(); 228 | } else { 229 | m_value = m_of.universe; 230 | } 231 | return value(); 232 | } 233 | 234 | uint64_t prev_value() const 235 | { 236 | if (m_position == 0) { 237 | return 0; 238 | } 239 | 240 | uint64_t prev_high = 0; 241 | if (QS_LIKELY(m_position < size())) { 242 | prev_high = m_bv->predecessor1(m_high_enumerator.position() - 1); 243 | } else { 244 | prev_high = m_bv->predecessor1(m_of.lower_bits_offset - 1); 245 | } 246 | prev_high -= m_of.higher_bits_offset; 247 | 248 | uint64_t prev_pos = m_position - 1; 249 | uint64_t prev_low = 250 | m_bv->get_word56(m_of.lower_bits_offset + 251 | prev_pos * m_of.lower_bits) 252 | & m_of.mask; 253 | return ((prev_high - prev_pos - 1) << m_of.lower_bits) | prev_low; 254 | } 255 | 256 | uint64_t position() const 257 | { 258 | return m_position; 259 | } 260 | 261 | private: 262 | 263 | value_type QS_NOINLINE slow_move(uint64_t position) 264 | { 265 | if (QS_UNLIKELY(position == size())) { 266 | m_position = position; 267 | m_value = m_of.universe; 268 | return value(); 269 | } 270 | 271 | uint64_t skip = position - m_position; 272 | uint64_t to_skip; 273 | if (position > m_position 274 | && (skip >> m_of.log_sampling1) == 0) { 275 | to_skip = skip - 1; 276 | } else { 277 | uint64_t ptr = position >> m_of.log_sampling1; 278 | uint64_t high_pos = pointer1(ptr); 279 | uint64_t 
// Out-of-line path of next_geq(): positions the enumerator on the first
// element whose value is >= lower_bound and returns (position, value).
// A lower_bound at or beyond the universe moves to the end, size().
//
// The search works on the high parts (value >> lower_bits): it skips zeros
// in the high-bits area until the bucket of lower_bound is reached, then
// scans element by element with a next_reader.
value_type QS_NOINLINE slow_next_geq(uint64_t lower_bound)
{
    if (QS_UNLIKELY(lower_bound >= m_of.universe)) {
        return move(size());
    }

    uint64_t high_lower_bound = lower_bound >> m_of.lower_bits;
    uint64_t cur_high = m_value >> m_of.lower_bits;
    uint64_t high_diff = high_lower_bound - cur_high;

    // XXX bounds checking!
    uint64_t to_skip;
    if (lower_bound > m_value
        && (high_diff >> m_of.log_sampling0) == 0) {
        // Target is ahead and less than one pointers0 sampling interval
        // away: keep going from the current enumerator state.
        // note: at the current position in the bitvector there
        // should be a 1, but since we already consumed it, it
        // is 0 in the enumerator, so we need to skip it
        to_skip = high_diff;
    } else {
        // Otherwise restart from the sampled zero pointer at or before the
        // target bucket and skip only the remaining zeros.
        uint64_t ptr = high_lower_bound >> m_of.log_sampling0;
        uint64_t high_pos = pointer0(ptr);
        uint64_t high_rank0 = ptr << m_of.log_sampling0;

        m_high_enumerator = succinct::bit_vector::unary_enumerator
            (*m_bv, m_of.higher_bits_offset + high_pos);
        to_skip = high_lower_bound - high_rank0;
    }

    m_high_enumerator.skip0(to_skip);
    // NOTE(review): presumably "bits passed minus zeros passed" gives the
    // number of ones passed, i.e. the element index - confirm against the
    // layout produced by write().
    m_position = m_high_enumerator.position() - m_of.higher_bits_offset
        - high_lower_bound;

    // Linear scan inside the bucket; the reader stores the advanced high
    // enumerator back into *this when it goes out of scope.
    next_reader read_value(*this, m_position);
    while (true) {
        if (QS_UNLIKELY(m_position == size())) {
            m_value = m_of.universe;
            return value();
        }
        auto val = read_value();
        if (val >= lower_bound) {
            m_value = val;
            return value();
        }
        m_position++;
    }
}
// Helper that reads consecutive values starting at element `position`.
//
// It works on a private copy of the owning enumerator's high-bits
// enumerator plus cached offsets, and copies the advanced high-bits
// enumerator back into the owner in its destructor. It does not touch the
// owner's m_position or m_value; callers update those themselves.
struct next_reader {
    // `lower_base` is computed from `lower_bits`, so `lower_bits` has to be
    // declared (and therefore initialised) before it - see the member list.
    next_reader(enumerator& e, uint64_t position)
        : e(e)
        , high_enumerator(e.m_high_enumerator)
        , high_base(e.m_of.higher_bits_offset + position + 1)
        , lower_bits(e.m_of.lower_bits)
        , lower_base(e.m_of.lower_bits_offset + position * lower_bits)
        , mask(e.m_of.mask)
        , bv(*e.m_bv)
    {}

    // Publishes the advanced high-bits enumerator back to the owner.
    ~next_reader()
    {
        e.m_high_enumerator = high_enumerator;
    }

    // Returns the next value and advances both cursors by one element.
    uint64_t operator()()
    {
        // high part: position of the next set bit relative to high_base
        uint64_t high = high_enumerator.next() - high_base;
        uint64_t low = bv.get_word56(lower_base) & mask;
        high_base += 1;
        lower_base += lower_bits;
        return (high << lower_bits) | low;
    }

    enumerator& e;
    succinct::bit_vector::unary_enumerator high_enumerator;
    uint64_t high_base, lower_bits, lower_base, mask;
    succinct::bit_vector const& bv;
};
succinct::bit_vector::unary_enumerator m_high_enumerator; 417 | }; 418 | 419 | }; 420 | } 421 | -------------------------------------------------------------------------------- /compact_ranked_bitvector.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "global_parameters.hpp" 8 | #include "util.hpp" 9 | 10 | namespace quasi_succinct { 11 | 12 | struct compact_ranked_bitvector { 13 | 14 | struct offsets { 15 | offsets(uint64_t base_offset, 16 | uint64_t universe, 17 | uint64_t n, 18 | global_parameters const& params) 19 | : universe(universe) 20 | , n(n) 21 | , log_rank1_sampling(params.rb_log_rank1_sampling) 22 | , log_sampling1(params.rb_log_sampling1) 23 | 24 | , rank1_sample_size(ceil_log2(n + 1)) 25 | , pointer_size(ceil_log2(universe)) 26 | , rank1_samples(universe >> params.rb_log_rank1_sampling) 27 | , pointers1(n >> params.rb_log_sampling1) 28 | 29 | , rank1_samples_offset(base_offset) 30 | , pointers1_offset(rank1_samples_offset + rank1_samples * rank1_sample_size) 31 | , bits_offset(pointers1_offset + pointers1 * pointer_size) 32 | , end(bits_offset + universe) 33 | {} 34 | 35 | uint64_t universe; 36 | uint64_t n; 37 | uint64_t log_rank1_sampling; 38 | uint64_t log_sampling1; 39 | 40 | uint64_t rank1_sample_size; 41 | uint64_t pointer_size; 42 | 43 | uint64_t rank1_samples; 44 | uint64_t pointers1; 45 | 46 | uint64_t rank1_samples_offset; 47 | uint64_t pointers1_offset; 48 | uint64_t bits_offset; 49 | uint64_t end; 50 | }; 51 | 52 | static QS_FLATTEN_FUNC uint64_t 53 | bitsize(global_parameters const& params, uint64_t universe, uint64_t n) 54 | { 55 | return offsets(0, universe, n, params).end; 56 | } 57 | 58 | template 59 | static void write(succinct::bit_vector_builder& bvb, 60 | Iterator begin, 61 | uint64_t universe, uint64_t n, 62 | global_parameters const& params) 63 | { 64 | using succinct::util::ceil_div; 65 | 66 | uint64_t base_offset 
= bvb.size(); 67 | offsets of(base_offset, universe, n, params); 68 | // initialize all the bits to 0 69 | bvb.zero_extend(of.end - base_offset); 70 | 71 | uint64_t offset; 72 | 73 | auto set_rank1_samples = [&](uint64_t begin, uint64_t end, 74 | uint64_t rank) { 75 | for (uint64_t sample = ceil_div(begin, uint64_t(1) << of.log_rank1_sampling); 76 | (sample << of.log_rank1_sampling) < end; 77 | ++sample) { 78 | if (!sample) continue; 79 | offset = of.rank1_samples_offset + (sample - 1) * of.rank1_sample_size; 80 | assert(offset + of.rank1_sample_size <= of.pointers1_offset); 81 | bvb.set_bits(offset, rank, of.rank1_sample_size); 82 | } 83 | }; 84 | 85 | uint64_t sample1_mask = (uint64_t(1) << of.log_sampling1) - 1; 86 | uint64_t last = 0; 87 | Iterator it = begin; 88 | for (size_t i = 0; i < n; ++i) { 89 | uint64_t v = *it++; 90 | if (i && v == last) { 91 | throw std::runtime_error("Duplicate element"); 92 | } 93 | if (i && v < last) { 94 | throw std::runtime_error("Sequence is not sorted"); 95 | } 96 | 97 | assert(!i || v > last); 98 | assert(v <= universe); 99 | 100 | bvb.set(of.bits_offset + v, 1); 101 | 102 | if (i && (i & sample1_mask) == 0) { 103 | uint64_t ptr1 = i >> of.log_sampling1; 104 | assert(ptr1 > 0); 105 | offset = of.pointers1_offset + (ptr1 - 1) * of.pointer_size; 106 | assert(offset + of.pointer_size <= of.bits_offset); 107 | bvb.set_bits(offset, v, of.pointer_size); 108 | } 109 | 110 | set_rank1_samples(last + 1, v + 1, i); 111 | last = v; 112 | } 113 | 114 | set_rank1_samples(last + 1, universe, n); 115 | } 116 | 117 | class enumerator { 118 | public: 119 | 120 | typedef std::pair value_type; // (position, value) 121 | 122 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 123 | uint64_t universe, uint64_t n, 124 | global_parameters const& params) 125 | : m_bv(&bv) 126 | , m_of(offset, universe, n, params) 127 | , m_position(size()) 128 | , m_value(m_of.universe) 129 | {} 130 | 131 | value_type move(uint64_t position) 132 | { 133 | 
assert(position <= size()); 134 | 135 | if (position == m_position) { 136 | return value(); 137 | } 138 | 139 | // optimize small forward skips 140 | uint64_t skip = position - m_position; 141 | if (QS_LIKELY(position > m_position && skip <= linear_scan_threshold)) { 142 | m_position = position; 143 | if (QS_UNLIKELY(m_position == size())) { 144 | m_value = m_of.universe; 145 | } else { 146 | succinct::bit_vector::unary_enumerator he = m_enumerator; 147 | for (size_t i = 0; i < skip; ++i) { 148 | he.next(); 149 | } 150 | m_value = he.position() - m_of.bits_offset; 151 | m_enumerator = he; 152 | } 153 | 154 | return value(); 155 | } 156 | 157 | return slow_move(position); 158 | } 159 | 160 | value_type next_geq(uint64_t lower_bound) 161 | { 162 | if (lower_bound == m_value) { 163 | return value(); 164 | } 165 | 166 | uint64_t diff = lower_bound - m_value; 167 | if (QS_LIKELY(lower_bound > m_value 168 | && diff <= linear_scan_threshold)) { 169 | // optimize small skips 170 | succinct::bit_vector::unary_enumerator he = m_enumerator; 171 | uint64_t val; 172 | do { 173 | m_position += 1; 174 | if (QS_LIKELY(m_position < size())) { 175 | val = he.next() - m_of.bits_offset; 176 | } else { 177 | val = m_of.universe; 178 | break; 179 | } 180 | } while (val < lower_bound); 181 | 182 | m_value = val; 183 | m_enumerator = he; 184 | return value(); 185 | } else { 186 | return slow_next_geq(lower_bound); 187 | } 188 | } 189 | 190 | value_type next() 191 | { 192 | m_position += 1; 193 | assert(m_position <= size()); 194 | 195 | if (QS_LIKELY(m_position < size())) { 196 | m_value = read_next(); 197 | } else { 198 | m_value = m_of.universe; 199 | } 200 | return value(); 201 | } 202 | 203 | uint64_t size() const 204 | { 205 | return m_of.n; 206 | } 207 | 208 | uint64_t prev_value() const 209 | { 210 | if (m_position == 0) { 211 | return 0; 212 | } 213 | 214 | uint64_t pos = 0; 215 | if (QS_LIKELY(m_position < size())) { 216 | pos = m_bv->predecessor1(m_enumerator.position() - 1); 
217 | } else { 218 | pos = m_bv->predecessor1(m_of.end - 1); 219 | } 220 | 221 | return pos - m_of.bits_offset; 222 | } 223 | 224 | private: 225 | 226 | value_type QS_NOINLINE slow_move(uint64_t position) 227 | { 228 | uint64_t skip = position - m_position; 229 | if (QS_UNLIKELY(position == size())) { 230 | m_position = position; 231 | m_value = m_of.universe; 232 | return value(); 233 | } 234 | 235 | uint64_t to_skip; 236 | if (position > m_position 237 | && (skip >> m_of.log_sampling1) == 0) { 238 | to_skip = skip - 1; 239 | } else { 240 | uint64_t ptr = position >> m_of.log_sampling1; 241 | uint64_t ptr_pos = pointer1(ptr); 242 | 243 | m_enumerator = succinct::bit_vector::unary_enumerator 244 | (*m_bv, m_of.bits_offset + ptr_pos); 245 | to_skip = position - (ptr << m_of.log_sampling1); 246 | } 247 | 248 | m_enumerator.skip(to_skip); 249 | m_position = position; 250 | m_value = read_next(); 251 | 252 | return value(); 253 | } 254 | 255 | 256 | value_type QS_NOINLINE slow_next_geq(uint64_t lower_bound) 257 | { 258 | using succinct::broadword::popcount; 259 | 260 | if (QS_UNLIKELY(lower_bound >= m_of.universe)) { 261 | return move(size()); 262 | } 263 | 264 | uint64_t skip = lower_bound - m_value; 265 | m_enumerator = succinct::bit_vector::unary_enumerator 266 | (*m_bv, m_of.bits_offset + lower_bound); 267 | 268 | uint64_t begin; 269 | if (lower_bound > m_value 270 | && (skip >> m_of.log_rank1_sampling) == 0) { 271 | begin = m_of.bits_offset + m_value; 272 | } else { 273 | uint64_t block = lower_bound >> m_of.log_rank1_sampling; 274 | m_position = rank1_sample(block); 275 | 276 | begin = m_of.bits_offset + (block << m_of.log_rank1_sampling); 277 | } 278 | 279 | uint64_t end = m_of.bits_offset + lower_bound; 280 | uint64_t begin_word = begin / 64; 281 | uint64_t begin_shift = begin % 64; 282 | uint64_t end_word = end / 64; 283 | uint64_t end_shift = end % 64; 284 | uint64_t word = 285 | (m_bv->data()[begin_word] >> begin_shift) << begin_shift; 286 | 287 | while 
(begin_word < end_word) { 288 | m_position += popcount(word); 289 | word = m_bv->data()[++begin_word]; 290 | } 291 | if (end_shift) { 292 | m_position += popcount(word << (64 - end_shift)); 293 | } 294 | 295 | if (m_position < size()) { 296 | m_value = read_next(); 297 | } else { 298 | m_value = m_of.universe; 299 | } 300 | 301 | return value(); 302 | } 303 | 304 | 305 | static const uint64_t linear_scan_threshold = 8; 306 | 307 | inline value_type value() const 308 | { 309 | return value_type(m_position, m_value); 310 | } 311 | 312 | inline uint64_t read_next() 313 | { 314 | return m_enumerator.next() - m_of.bits_offset; 315 | } 316 | 317 | inline uint64_t pointer(uint64_t offset, uint64_t i, uint64_t size) const 318 | { 319 | if (i == 0) { 320 | return 0; 321 | } else { 322 | return 323 | m_bv->get_word56(offset + (i - 1) * size) 324 | & ((uint64_t(1) << size) - 1); 325 | } 326 | } 327 | 328 | inline uint64_t pointer1(uint64_t i) const 329 | { 330 | return pointer(m_of.pointers1_offset, i, m_of.pointer_size); 331 | } 332 | 333 | inline uint64_t rank1_sample(uint64_t i) const 334 | { 335 | return pointer(m_of.rank1_samples_offset, i, 336 | m_of.rank1_sample_size); 337 | } 338 | 339 | succinct::bit_vector const* m_bv; 340 | offsets m_of; 341 | 342 | uint64_t m_position; 343 | uint64_t m_value; 344 | succinct::bit_vector::unary_enumerator m_enumerator; 345 | }; 346 | }; 347 | } 348 | -------------------------------------------------------------------------------- /configuration.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace quasi_succinct { 9 | 10 | class configuration { 11 | public: 12 | static configuration const& get() { 13 | static configuration instance; 14 | return instance; 15 | } 16 | 17 | double eps1; 18 | double eps2; 19 | uint64_t fix_cost; 20 | 21 | size_t log_partition_size; 22 | size_t worker_threads; 23 | 24 | private: 25 | 
configuration() 26 | { 27 | fillvar("QS_EPS1", eps1, 0.03); 28 | fillvar("QS_EPS2", eps2, 0.3); 29 | fillvar("QS_FIXCOST", fix_cost, 64); 30 | fillvar("QS_LOG_PART", log_partition_size, 7); 31 | fillvar("QS_THREADS", worker_threads, std::thread::hardware_concurrency()); 32 | } 33 | 34 | template 35 | void fillvar(const char* envvar, T& var, T2 def) 36 | { 37 | const char* val = std::getenv(envvar); 38 | if (!val || !strlen(val)) { 39 | var = def; 40 | } else { 41 | var = boost::lexical_cast(val); 42 | } 43 | } 44 | }; 45 | 46 | } 47 | -------------------------------------------------------------------------------- /create_freq_index.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "configuration.hpp" 10 | #include "index_types.hpp" 11 | #include "util.hpp" 12 | 13 | using quasi_succinct::logger; 14 | 15 | template 16 | void verify_collection(InputCollection const& input, const char* filename) 17 | { 18 | Collection coll; 19 | boost::iostreams::mapped_file_source m(filename); 20 | succinct::mapper::map(coll, m); 21 | 22 | logger() << "Checking the written data, just to be extra safe..." << std::endl; 23 | size_t s = 0; 24 | for (auto seq: input) { 25 | auto e = coll[s]; 26 | if (e.size() != seq.docs.size()) { 27 | logger() << "sequence " << s 28 | << " has wrong length! (" 29 | << e.size() << " != " << seq.docs.size() << ")" 30 | << std::endl; 31 | exit(1); 32 | } 33 | 34 | for (size_t i = 0; i < e.size(); ++i, e.next()) { 35 | uint64_t docid = *(seq.docs.begin() + i); 36 | uint64_t freq = *(seq.freqs.begin() + i); 37 | 38 | if (docid != e.docid()) { 39 | logger() << "docid in sequence " << s 40 | << " differs at position " << i << "!" 
<< std::endl; 41 | logger() << e.docid() << " != " << docid << std::endl; 42 | logger() << "sequence length: " << seq.docs.size() << std::endl; 43 | 44 | exit(1); 45 | } 46 | 47 | if (freq != e.freq()) { 48 | logger() << "freq in sequence " << s 49 | << " differs at position " << i << "!" << std::endl; 50 | logger() << e.freq() << " != " << freq << std::endl; 51 | logger() << "sequence length: " << seq.docs.size() << std::endl; 52 | 53 | exit(1); 54 | } 55 | } 56 | 57 | s += 1; 58 | } 59 | logger() << "Everything is OK!" << std::endl; 60 | } 61 | 62 | 63 | template 64 | void get_size_stats(quasi_succinct::freq_index& coll, 65 | uint64_t& docs_size, uint64_t& freqs_size) 66 | { 67 | auto size_tree = succinct::mapper::size_tree_of(coll); 68 | size_tree->dump(); 69 | for (auto const& node: size_tree->children) { 70 | if (node->name == "m_docs_sequences") { 71 | docs_size = node->size; 72 | } else if (node->name == "m_freqs_sequences") { 73 | freqs_size = node->size; 74 | } 75 | } 76 | } 77 | 78 | template 79 | void get_size_stats(quasi_succinct::block_freq_index& coll, 80 | uint64_t& docs_size, uint64_t& freqs_size) 81 | { 82 | auto size_tree = succinct::mapper::size_tree_of(coll); 83 | size_tree->dump(); 84 | uint64_t total_size = size_tree->size; 85 | freqs_size = 0; 86 | for (size_t i = 0; i < coll.size(); ++i) { 87 | freqs_size += coll[i].stats_freqs_size(); 88 | } 89 | docs_size = total_size - freqs_size; 90 | } 91 | 92 | template 93 | void dump_stats(Collection& coll, 94 | std::string const& type, 95 | uint64_t postings) 96 | { 97 | 98 | uint64_t docs_size = 0, freqs_size = 0; 99 | get_size_stats(coll, docs_size, freqs_size); 100 | 101 | double bits_per_doc = docs_size * 8.0 / postings; 102 | double bits_per_freq = freqs_size * 8.0 / postings; 103 | logger() << "Documents: " << docs_size << " bytes, " 104 | << bits_per_doc << " bits per element" << std::endl; 105 | logger() << "Frequencies: " << freqs_size << " bytes, " 106 | << bits_per_freq << " bits per 
element" << std::endl; 107 | 108 | quasi_succinct::stats_line() 109 | ("type", type) 110 | ("docs_size", docs_size) 111 | ("freqs_size", freqs_size) 112 | ("bits_per_doc", bits_per_doc) 113 | ("bits_per_freq", bits_per_freq) 114 | ; 115 | } 116 | 117 | template 118 | void dump_index_specific_stats(Collection const&, std::string const&) 119 | {} 120 | 121 | 122 | void dump_index_specific_stats(quasi_succinct::uniform_index const& coll, 123 | std::string const& type) 124 | { 125 | quasi_succinct::stats_line() 126 | ("type", type) 127 | ("log_partition_size", int(coll.params().log_partition_size)) 128 | ; 129 | } 130 | 131 | 132 | void dump_index_specific_stats(quasi_succinct::opt_index const& coll, 133 | std::string const& type) 134 | { 135 | auto const& conf = quasi_succinct::configuration::get(); 136 | 137 | uint64_t length_threshold = 4096; 138 | double long_postings = 0; 139 | double docs_partitions = 0; 140 | double freqs_partitions = 0; 141 | 142 | for (size_t s = 0; s < coll.size(); ++s) { 143 | auto const& list = coll[s]; 144 | if (list.size() >= length_threshold) { 145 | long_postings += list.size(); 146 | docs_partitions += list.docs_enum().num_partitions(); 147 | freqs_partitions += list.freqs_enum().base().num_partitions(); 148 | } 149 | } 150 | 151 | quasi_succinct::stats_line() 152 | ("type", type) 153 | ("eps1", conf.eps1) 154 | ("eps2", conf.eps2) 155 | ("fix_cost", conf.fix_cost) 156 | ("docs_avg_part", long_postings / docs_partitions) 157 | ("freqs_avg_part", long_postings / freqs_partitions) 158 | ; 159 | } 160 | 161 | 162 | struct progress_logger { 163 | progress_logger() 164 | : sequences(0) 165 | , postings(0) 166 | {} 167 | 168 | void log() 169 | { 170 | logger() << "Processed " << sequences << " sequences, " 171 | << postings << " postings" << std::endl; 172 | } 173 | 174 | void done_sequence(size_t n) 175 | { 176 | sequences += 1; 177 | postings += n; 178 | if (sequences % 1000000 == 0) { 179 | log(); 180 | } 181 | } 182 | 183 | size_t 
sequences, postings; 184 | }; 185 | 186 | template 187 | void create_collection(InputCollection const& input, 188 | quasi_succinct::global_parameters const& params, 189 | const char* output_filename, bool check, 190 | std::string const& seq_type) 191 | { 192 | using namespace quasi_succinct; 193 | 194 | logger() << "Processing " << input.num_docs() << " documents" << std::endl; 195 | double tick = get_time_usecs(); 196 | double user_tick = get_user_time_usecs(); 197 | 198 | typename CollectionType::builder builder(input.num_docs(), params); 199 | progress_logger plog; 200 | for (auto const& plist: input) { 201 | uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), 202 | plist.freqs.end(), uint64_t(0)); 203 | 204 | builder.add_posting_list(plist.docs.size(), plist.docs.begin(), 205 | plist.freqs.begin(), freqs_sum); 206 | plog.done_sequence(plist.docs.size()); 207 | } 208 | 209 | plog.log(); 210 | CollectionType coll; 211 | builder.build(coll); 212 | double elapsed_secs = (get_time_usecs() - tick) / 1000000; 213 | double user_elapsed_secs = (get_user_time_usecs() - user_tick) / 1000000; 214 | logger() << seq_type << " collection built in " 215 | << elapsed_secs << " seconds" << std::endl; 216 | 217 | stats_line() 218 | ("type", seq_type) 219 | ("worker_threads", configuration::get().worker_threads) 220 | ("construction_time", elapsed_secs) 221 | ("construction_user_time", user_elapsed_secs) 222 | ; 223 | 224 | dump_stats(coll, seq_type, plog.postings); 225 | dump_index_specific_stats(coll, seq_type); 226 | 227 | if (output_filename) { 228 | succinct::mapper::freeze(coll, output_filename); 229 | if (check) { 230 | verify_collection(input, output_filename); 231 | } 232 | } 233 | } 234 | 235 | 236 | int main(int argc, const char** argv) { 237 | 238 | using namespace quasi_succinct; 239 | 240 | if (argc < 3) { 241 | std::cerr << "Usage: " << argv[0] 242 | << " []" 243 | << std::endl; 244 | return 1; 245 | } 246 | 247 | std::string type = argv[1]; 248 | const char* 
input_basename = argv[2]; 249 | const char* output_filename = nullptr; 250 | if (argc > 3) { 251 | output_filename = argv[3]; 252 | } 253 | 254 | bool check = false; 255 | if (argc > 4 && std::string(argv[4]) == "--check") { 256 | check = true; 257 | } 258 | 259 | binary_freq_collection input(input_basename); 260 | quasi_succinct::global_parameters params; 261 | params.log_partition_size = configuration::get().log_partition_size; 262 | 263 | if (false) { 264 | #define LOOP_BODY(R, DATA, T) \ 265 | } else if (type == BOOST_PP_STRINGIZE(T)) { \ 266 | create_collection \ 268 | (input, params, output_filename, check, type); \ 269 | /**/ 270 | 271 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, QS_INDEX_TYPES); 272 | #undef LOOP_BODY 273 | } else { 274 | logger() << "ERROR: Unknown type " << type << std::endl; 275 | } 276 | 277 | return 0; 278 | } 279 | -------------------------------------------------------------------------------- /create_wand_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "succinct/mapper.hpp" 5 | #include "binary_freq_collection.hpp" 6 | #include "binary_collection.hpp" 7 | #include "wand_data.hpp" 8 | #include "util.hpp" 9 | 10 | int main(int argc, const char** argv) { 11 | 12 | using namespace quasi_succinct; 13 | 14 | if (argc != 3) { 15 | std::cerr << "Usage: " << argv[0] 16 | << " " 17 | << std::endl; 18 | return 1; 19 | } 20 | 21 | std::string input_basename = argv[1]; 22 | const char* output_filename = argv[2]; 23 | 24 | binary_collection sizes_coll((input_basename + ".sizes").c_str()); 25 | binary_freq_collection coll(input_basename.c_str()); 26 | 27 | wand_data<> wdata(sizes_coll.begin()->begin(), coll.num_docs(), coll); 28 | succinct::mapper::freeze(wdata, output_filename); 29 | } 30 | -------------------------------------------------------------------------------- /freq_index.hpp: -------------------------------------------------------------------------------- 1 
| #pragma once 2 | 3 | #include "bitvector_collection.hpp" 4 | #include "compact_elias_fano.hpp" 5 | #include "integer_codes.hpp" 6 | #include "global_parameters.hpp" 7 | #include "semiasync_queue.hpp" 8 | 9 | namespace quasi_succinct { 10 | 11 | template 12 | class freq_index { 13 | public: 14 | freq_index() 15 | : m_num_docs(0) 16 | {} 17 | 18 | class builder { 19 | public: 20 | builder(uint64_t num_docs, global_parameters const& params) 21 | : m_queue(1 << 24) 22 | , m_params(params) 23 | , m_num_docs(num_docs) 24 | , m_docs_sequences(params) 25 | , m_freqs_sequences(params) 26 | {} 27 | 28 | template 29 | void add_posting_list(uint64_t n, DocsIterator docs_begin, 30 | FreqsIterator freqs_begin, uint64_t occurrences) 31 | { 32 | if (!n) throw std::invalid_argument("List must be nonempty"); 33 | 34 | // make_shared does not seem to work 35 | std::shared_ptr> 36 | ptr(new list_adder 37 | (*this, docs_begin, 38 | freqs_begin, occurrences, n)); 39 | m_queue.add_job(ptr, 2 * n); 40 | } 41 | 42 | void build(freq_index& sq) 43 | { 44 | m_queue.complete(); 45 | sq.m_num_docs = m_num_docs; 46 | sq.m_params = m_params; 47 | 48 | m_docs_sequences.build(sq.m_docs_sequences); 49 | m_freqs_sequences.build(sq.m_freqs_sequences); 50 | } 51 | 52 | private: 53 | 54 | template 55 | struct list_adder : semiasync_queue::job { 56 | list_adder(builder& b, 57 | DocsIterator docs_begin, 58 | FreqsIterator freqs_begin, 59 | uint64_t occurrences, 60 | uint64_t n) 61 | : b(b) 62 | , docs_begin(docs_begin) 63 | , freqs_begin(freqs_begin) 64 | , occurrences(occurrences) 65 | , n(n) 66 | {} 67 | 68 | virtual void prepare() 69 | { 70 | write_gamma_nonzero(docs_bits, occurrences); 71 | if (occurrences > 1) { 72 | docs_bits.append_bits(n, ceil_log2(occurrences + 1)); 73 | } 74 | 75 | DocsSequence::write(docs_bits, docs_begin, 76 | b.m_num_docs, n, 77 | b.m_params); 78 | 79 | FreqsSequence::write(freqs_bits, freqs_begin, 80 | occurrences + 1, n, 81 | b.m_params); 82 | } 83 | 84 | virtual void 
commit() 85 | { 86 | b.m_docs_sequences.append(docs_bits); 87 | b.m_freqs_sequences.append(freqs_bits); 88 | } 89 | 90 | builder& b; 91 | DocsIterator docs_begin; 92 | FreqsIterator freqs_begin; 93 | uint64_t occurrences; 94 | uint64_t n; 95 | succinct::bit_vector_builder docs_bits; 96 | succinct::bit_vector_builder freqs_bits; 97 | }; 98 | 99 | semiasync_queue m_queue; 100 | global_parameters m_params; 101 | uint64_t m_num_docs; 102 | bitvector_collection::builder m_docs_sequences; 103 | bitvector_collection::builder m_freqs_sequences; 104 | }; 105 | 106 | uint64_t size() const 107 | { 108 | return m_docs_sequences.size(); 109 | } 110 | 111 | uint64_t num_docs() const 112 | { 113 | return m_num_docs; 114 | } 115 | 116 | class document_enumerator { 117 | public: 118 | void reset() 119 | { 120 | m_cur_pos = 0; 121 | m_cur_docid = m_docs_enum.move(0).second; 122 | } 123 | 124 | void QS_FLATTEN_FUNC next() 125 | { 126 | auto val = m_docs_enum.next(); 127 | m_cur_pos = val.first; 128 | m_cur_docid = val.second; 129 | } 130 | 131 | void QS_FLATTEN_FUNC next_geq(uint64_t lower_bound) 132 | { 133 | auto val = m_docs_enum.next_geq(lower_bound); 134 | m_cur_pos = val.first; 135 | m_cur_docid = val.second; 136 | } 137 | 138 | void QS_FLATTEN_FUNC move(uint64_t position) 139 | { 140 | auto val = m_docs_enum.move(position); 141 | m_cur_pos = val.first; 142 | m_cur_docid = val.second; 143 | } 144 | 145 | uint64_t docid() const 146 | { 147 | return m_cur_docid; 148 | } 149 | 150 | uint64_t QS_FLATTEN_FUNC freq() 151 | { 152 | return m_freqs_enum.move(m_cur_pos).second; 153 | } 154 | 155 | uint64_t position() const 156 | { 157 | return m_cur_pos; 158 | } 159 | 160 | uint64_t size() const 161 | { 162 | return m_docs_enum.size(); 163 | } 164 | 165 | typename DocsSequence::enumerator const& docs_enum() const 166 | { 167 | return m_docs_enum; 168 | } 169 | 170 | typename FreqsSequence::enumerator const& freqs_enum() const 171 | { 172 | return m_freqs_enum; 173 | } 174 | 175 | 
private: 176 | friend class freq_index; 177 | 178 | document_enumerator(typename DocsSequence::enumerator docs_enum, 179 | typename FreqsSequence::enumerator freqs_enum) 180 | : m_docs_enum(docs_enum) 181 | , m_freqs_enum(freqs_enum) 182 | { 183 | reset(); 184 | } 185 | 186 | uint64_t m_cur_pos; 187 | uint64_t m_cur_docid; 188 | typename DocsSequence::enumerator m_docs_enum; 189 | typename FreqsSequence::enumerator m_freqs_enum; 190 | }; 191 | 192 | document_enumerator operator[](size_t i) const 193 | { 194 | assert(i < size()); 195 | auto docs_it = m_docs_sequences.get(m_params, i); 196 | uint64_t occurrences = read_gamma_nonzero(docs_it); 197 | uint64_t n = 1; 198 | if (occurrences > 1) { 199 | n = docs_it.take(ceil_log2(occurrences + 1)); 200 | } 201 | 202 | typename DocsSequence::enumerator docs_enum(m_docs_sequences.bits(), 203 | docs_it.position(), 204 | num_docs(), n, 205 | m_params); 206 | 207 | auto freqs_it = m_freqs_sequences.get(m_params, i); 208 | typename FreqsSequence::enumerator freqs_enum(m_freqs_sequences.bits(), 209 | freqs_it.position(), 210 | occurrences + 1, n, 211 | m_params); 212 | 213 | return document_enumerator(docs_enum, freqs_enum); 214 | } 215 | 216 | global_parameters const& params() const 217 | { 218 | return m_params; 219 | } 220 | 221 | void swap(freq_index& other) 222 | { 223 | std::swap(m_params, other.m_params); 224 | std::swap(m_num_docs, other.m_num_docs); 225 | m_docs_sequences.swap(other.m_docs_sequences); 226 | m_freqs_sequences.swap(other.m_freqs_sequences); 227 | } 228 | 229 | template 230 | void map(Visitor& visit) 231 | { 232 | visit 233 | (m_params, "m_params") 234 | (m_num_docs, "m_num_docs") 235 | (m_docs_sequences, "m_docs_sequences") 236 | (m_freqs_sequences, "m_freqs_sequences") 237 | ; 238 | } 239 | 240 | private: 241 | global_parameters m_params; 242 | uint64_t m_num_docs; 243 | bitvector_collection m_docs_sequences; 244 | bitvector_collection m_freqs_sequences; 245 | }; 246 | } 247 | 
-------------------------------------------------------------------------------- /global_parameters.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace quasi_succinct { 4 | 5 | struct global_parameters { 6 | global_parameters() 7 | : ef_log_sampling0(9) 8 | , ef_log_sampling1(8) 9 | , rb_log_rank1_sampling(9) 10 | , rb_log_sampling1(8) 11 | , log_partition_size(7) 12 | {} 13 | 14 | template 15 | void map(Visitor& visit) 16 | { 17 | visit 18 | (ef_log_sampling0, "ef_log_sampling0") 19 | (ef_log_sampling1, "ef_log_sampling1") 20 | (rb_log_rank1_sampling, "rb_log_rank1_sampling") 21 | (rb_log_sampling1, "rb_log_sampling1") 22 | (log_partition_size, "log_partition_size") 23 | ; 24 | } 25 | 26 | uint8_t ef_log_sampling0; 27 | uint8_t ef_log_sampling1; 28 | uint8_t rb_log_rank1_sampling; 29 | uint8_t rb_log_sampling1; 30 | uint8_t log_partition_size; 31 | }; 32 | 33 | } 34 | -------------------------------------------------------------------------------- /index_types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "freq_index.hpp" 8 | #include "positive_sequence.hpp" 9 | #include "partitioned_sequence.hpp" 10 | #include "uniform_partitioned_sequence.hpp" 11 | #include "binary_freq_collection.hpp" 12 | #include "block_freq_index.hpp" 13 | #include "block_codecs.hpp" 14 | 15 | namespace quasi_succinct { 16 | 17 | typedef freq_index> ef_index; 19 | 20 | typedef freq_index> single_index; 22 | 23 | typedef freq_index< 24 | uniform_partitioned_sequence<>, 25 | positive_sequence> 26 | > uniform_index; 27 | 28 | typedef freq_index< 29 | partitioned_sequence<>, 30 | positive_sequence> 31 | > opt_index; 32 | 33 | typedef block_freq_index block_optpfor_index; 34 | 35 | typedef block_freq_index block_varint_index; 36 | 37 | typedef block_freq_index block_interpolative_index; 38 | } 39 | 40 | #define 
QS_INDEX_TYPES (ef)(single)(uniform)(opt)(block_optpfor)(block_varint)(block_interpolative) 41 | -------------------------------------------------------------------------------- /indexed_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "compact_elias_fano.hpp" 6 | #include "compact_ranked_bitvector.hpp" 7 | #include "all_ones_sequence.hpp" 8 | #include "global_parameters.hpp" 9 | 10 | namespace quasi_succinct { 11 | 12 | struct indexed_sequence { 13 | 14 | enum index_type { 15 | elias_fano = 0, 16 | ranked_bitvector = 1, 17 | all_ones = 2, 18 | 19 | index_types = 3 20 | }; 21 | 22 | static const uint64_t type_bits = 1; // all_ones is implicit 23 | 24 | static QS_FLATTEN_FUNC uint64_t 25 | bitsize(global_parameters const& params, uint64_t universe, uint64_t n) 26 | { 27 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 28 | 29 | uint64_t ef_cost = compact_elias_fano::bitsize(params, universe, n) + type_bits; 30 | if (ef_cost < best_cost) { 31 | best_cost = ef_cost; 32 | } 33 | 34 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(params, universe, n) + type_bits; 35 | if (rb_cost < best_cost) { 36 | best_cost = rb_cost; 37 | } 38 | 39 | return best_cost; 40 | } 41 | 42 | template 43 | static void write(succinct::bit_vector_builder& bvb, 44 | Iterator begin, 45 | uint64_t universe, uint64_t n, 46 | global_parameters const& params) 47 | { 48 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 49 | int best_type = all_ones; 50 | 51 | if (best_cost) { 52 | uint64_t ef_cost = compact_elias_fano::bitsize(params, universe, n) + type_bits; 53 | if (ef_cost < best_cost) { 54 | best_cost = ef_cost; 55 | best_type = elias_fano; 56 | } 57 | 58 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(params, universe, n) + type_bits; 59 | if (rb_cost < best_cost) { 60 | best_cost = rb_cost; 61 | best_type = ranked_bitvector; 62 | } 63 | 64 
| bvb.append_bits(best_type, type_bits); 65 | } 66 | 67 | 68 | switch (best_type) { 69 | case elias_fano: 70 | compact_elias_fano::write(bvb, begin, 71 | universe, n, 72 | params); 73 | break; 74 | case ranked_bitvector: 75 | compact_ranked_bitvector::write(bvb, begin, 76 | universe, n, 77 | params); 78 | break; 79 | case all_ones: 80 | all_ones_sequence::write(bvb, begin, 81 | universe, n, 82 | params); 83 | break; 84 | default: 85 | assert(false); 86 | } 87 | } 88 | 89 | class enumerator { 90 | public: 91 | 92 | typedef std::pair value_type; // (position, value) 93 | 94 | enumerator() 95 | {} 96 | 97 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 98 | uint64_t universe, uint64_t n, 99 | global_parameters const& params) 100 | { 101 | if (all_ones_sequence::bitsize(params, universe, n) == 0) { 102 | m_type = all_ones; 103 | } else { 104 | m_type = index_type(bv.get_word56(offset) 105 | & ((uint64_t(1) << type_bits) - 1)); 106 | } 107 | 108 | switch (m_type) { 109 | case elias_fano: 110 | m_ef_enumerator = compact_elias_fano::enumerator(bv, offset + type_bits, 111 | universe, n, 112 | params); 113 | break; 114 | case ranked_bitvector: 115 | m_rb_enumerator = compact_ranked_bitvector::enumerator(bv, offset + type_bits, 116 | universe, n, 117 | params); 118 | break; 119 | case all_ones: 120 | m_ao_enumerator = all_ones_sequence::enumerator(bv, offset + type_bits, 121 | universe, n, 122 | params); 123 | break; 124 | default: 125 | throw std::invalid_argument("Unsupported type"); 126 | } 127 | } 128 | 129 | #define ENUMERATOR_METHOD(RETURN_TYPE, METHOD, FORMALS, ACTUALS) \ 130 | RETURN_TYPE QS_FLATTEN_FUNC METHOD FORMALS \ 131 | { \ 132 | switch (__builtin_expect(m_type, elias_fano)) { \ 133 | case elias_fano: \ 134 | return m_ef_enumerator.METHOD ACTUALS; \ 135 | case ranked_bitvector: \ 136 | return m_rb_enumerator.METHOD ACTUALS; \ 137 | case all_ones: \ 138 | return m_ao_enumerator.METHOD ACTUALS; \ 139 | default: \ 140 | assert(false); \ 141 | 
__builtin_unreachable(); \ 142 | } \ 143 | } \ 144 | /**/ 145 | 146 | // semicolons are redundant but they are needed to get emacs to 147 | // align the lines properly 148 | ENUMERATOR_METHOD(value_type, move, (uint64_t position), (position)); 149 | ENUMERATOR_METHOD(value_type, next_geq, (uint64_t lower_bound), (lower_bound)); 150 | ENUMERATOR_METHOD(value_type, next, (), ()); 151 | ENUMERATOR_METHOD(uint64_t, size, () const, ()); 152 | ENUMERATOR_METHOD(uint64_t, prev_value, () const, ()); 153 | 154 | #undef ENUMERATOR_METHOD 155 | #undef ENUMERATOR_VOID_METHOD 156 | 157 | private: 158 | index_type m_type; 159 | union { 160 | compact_elias_fano::enumerator m_ef_enumerator; 161 | compact_ranked_bitvector::enumerator m_rb_enumerator; 162 | all_ones_sequence::enumerator m_ao_enumerator; 163 | }; 164 | }; 165 | }; 166 | } 167 | -------------------------------------------------------------------------------- /integer_codes.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace quasi_succinct { 4 | 5 | // note: n can be 0 6 | void write_gamma(succinct::bit_vector_builder& bvb, uint64_t n) 7 | { 8 | uint64_t nn = n + 1; 9 | uint64_t l = succinct::broadword::msb(nn); 10 | uint64_t hb = uint64_t(1) << l; 11 | bvb.append_bits(hb, l + 1); 12 | bvb.append_bits(nn ^ hb, l); 13 | } 14 | 15 | void write_gamma_nonzero(succinct::bit_vector_builder& bvb, uint64_t n) 16 | { 17 | assert(n > 0); 18 | write_gamma(bvb, n - 1); 19 | } 20 | 21 | uint64_t read_gamma(succinct::bit_vector::enumerator& it) 22 | { 23 | uint64_t l = it.skip_zeros(); 24 | return (it.take(l) | (uint64_t(1) << l)) - 1; 25 | } 26 | 27 | uint64_t read_gamma_nonzero(succinct::bit_vector::enumerator& it) 28 | { 29 | return read_gamma(it) + 1; 30 | } 31 | 32 | void write_delta(succinct::bit_vector_builder& bvb, uint64_t n) 33 | { 34 | uint64_t nn = n + 1; 35 | uint64_t l = succinct::broadword::msb(nn); 36 | uint64_t hb = uint64_t(1) << l; 37 | 
write_gamma(bvb, l); 38 | bvb.append_bits(nn ^ hb, l); 39 | } 40 | 41 | uint64_t read_delta(succinct::bit_vector::enumerator& it) 42 | { 43 | uint64_t l = read_gamma(it); 44 | return (it.take(l) | (uint64_t(1) << l)) - 1; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /optimal_partition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "util.hpp" 7 | 8 | namespace quasi_succinct { 9 | 10 | typedef uint32_t posting_t ; 11 | typedef uint64_t cost_t; 12 | 13 | struct optimal_partition { 14 | 15 | std::vector partition; 16 | cost_t cost_opt = 0; // the costs are in bits! 17 | 18 | template 19 | struct cost_window { 20 | // a window reppresent the cost of the interval [start, end) 21 | 22 | ForwardIterator start_it; 23 | ForwardIterator end_it; 24 | // starting and ending position of the window 25 | posting_t start = 0; 26 | posting_t end = 0; // end-th position is not in the current window 27 | posting_t min_p = 0; // element that preceed the first element of the window 28 | posting_t max_p = 0; 29 | 30 | cost_t cost_upper_bound; // The maximum cost for this window 31 | 32 | cost_window(ForwardIterator begin, cost_t cost_upper_bound) 33 | : start_it(begin) 34 | , end_it(begin) 35 | , min_p(*begin) 36 | , max_p(0) 37 | , cost_upper_bound(cost_upper_bound) 38 | {} 39 | 40 | uint64_t universe() const 41 | { 42 | return max_p - min_p + 1; 43 | } 44 | 45 | uint64_t size() const 46 | { 47 | return end - start; 48 | } 49 | 50 | void advance_start() 51 | { 52 | min_p = *start_it + 1; 53 | ++start; 54 | ++start_it; 55 | } 56 | 57 | void advance_end() 58 | { 59 | max_p = *end_it; 60 | ++end; 61 | ++end_it; 62 | } 63 | 64 | }; 65 | 66 | optimal_partition() 67 | {} 68 | 69 | template 70 | optimal_partition(ForwardIterator begin, uint64_t universe, uint64_t size, 71 | CostFunction cost_fun, double eps1, double eps2) 72 | { 
73 | cost_t single_block_cost = cost_fun(universe, size); 74 | std::vector min_cost(size+1, single_block_cost); 75 | min_cost[0] = 0; 76 | 77 | // create the required window: one for each power of approx_factor 78 | std::vector> windows; 79 | cost_t cost_lb = cost_fun(1, 1); // minimum cost 80 | cost_t cost_bound = cost_lb; 81 | while (eps1 == 0 || cost_bound < cost_lb / eps1) { 82 | windows.emplace_back(begin, cost_bound); 83 | if (cost_bound >= single_block_cost) break; 84 | cost_bound = cost_bound * (1 + eps2); 85 | } 86 | 87 | std::vector path(size + 1, 0); 88 | for (posting_t i = 0; i < size; i++) { 89 | size_t last_end = i + 1; 90 | for (auto& window: windows) { 91 | 92 | assert(window.start == i); 93 | while (window.end < last_end) { 94 | window.advance_end(); 95 | } 96 | 97 | cost_t window_cost; 98 | while (true) { 99 | window_cost = cost_fun(window.universe(), window.size()); 100 | if ((min_cost[i] + window_cost < min_cost[window.end])) { 101 | min_cost[window.end] = min_cost[i] + window_cost; 102 | path[window.end] = i; 103 | } 104 | last_end = window.end; 105 | if (window.end == size) break; 106 | if (window_cost >= window.cost_upper_bound) break; 107 | window.advance_end(); 108 | } 109 | 110 | window.advance_start(); 111 | } 112 | } 113 | 114 | posting_t curr_pos = size; 115 | while( curr_pos != 0 ) { 116 | partition.push_back(curr_pos); 117 | curr_pos = path[curr_pos]; 118 | } 119 | std::reverse(partition.begin(), partition.end()); 120 | cost_opt = min_cost[size]; 121 | } 122 | }; 123 | 124 | } 125 | -------------------------------------------------------------------------------- /partitioned_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "configuration.hpp" 6 | #include "global_parameters.hpp" 7 | #include "compact_elias_fano.hpp" 8 | #include "indexed_sequence.hpp" 9 | #include "integer_codes.hpp" 10 | #include "util.hpp" 11 | #include 
"optimal_partition.hpp" 12 | 13 | namespace quasi_succinct { 14 | 15 | template 16 | struct partitioned_sequence { 17 | 18 | typedef BaseSequence base_sequence_type; 19 | typedef typename base_sequence_type::enumerator base_sequence_enumerator; 20 | 21 | template 22 | static void write(succinct::bit_vector_builder& bvb, 23 | Iterator begin, 24 | uint64_t universe, uint64_t n, 25 | global_parameters const& params) 26 | { 27 | assert(n > 0); 28 | auto const& conf = configuration::get(); 29 | 30 | auto cost_fun = [&](uint64_t universe, uint64_t n) { 31 | return base_sequence_type::bitsize(params, universe, n) + conf.fix_cost; 32 | }; 33 | 34 | optimal_partition opt(begin, universe, n, cost_fun, conf.eps1, conf.eps2); 35 | 36 | size_t partitions = opt.partition.size(); 37 | assert(partitions > 0); 38 | assert(opt.partition.front() != 0); 39 | assert(opt.partition.back() == n); 40 | write_gamma_nonzero(bvb, partitions); 41 | 42 | std::vector cur_partition; 43 | uint64_t cur_base = 0; 44 | if (partitions == 1) { 45 | cur_base = *begin; 46 | Iterator it = begin; 47 | 48 | for (size_t i = 0; i < n; ++i, ++it) { 49 | cur_partition.push_back(*it - cur_base); 50 | } 51 | 52 | uint64_t universe_bits = ceil_log2(universe); 53 | bvb.append_bits(cur_base, universe_bits); 54 | 55 | // write universe only if non-singleton and not tight 56 | if (n > 1) { 57 | if (cur_base + cur_partition.back() + 1 == universe) { 58 | // tight universe 59 | write_delta(bvb, 0); 60 | } else { 61 | write_delta(bvb, cur_partition.back()); 62 | } 63 | } 64 | 65 | base_sequence_type::write(bvb, cur_partition.begin(), 66 | cur_partition.back() + 1, 67 | cur_partition.size(), 68 | params); 69 | } else { 70 | succinct::bit_vector_builder bv_sequences; 71 | std::vector endpoints; 72 | std::vector upper_bounds; 73 | 74 | uint64_t cur_i = 0; 75 | Iterator it = begin; 76 | cur_base = *begin; 77 | upper_bounds.push_back(cur_base); 78 | 79 | for (size_t p = 0; p < opt.partition.size(); ++p) { 80 | 
cur_partition.clear(); 81 | uint64_t value = 0; 82 | for (; cur_i < opt.partition[p]; ++cur_i, ++it) { 83 | value = *it; 84 | cur_partition.push_back(value - cur_base); 85 | } 86 | 87 | uint64_t upper_bound = value; 88 | assert(cur_partition.size() > 0); 89 | base_sequence_type::write(bv_sequences, cur_partition.begin(), 90 | cur_partition.back() + 1, 91 | cur_partition.size(), // XXX skip last one? 92 | params); 93 | endpoints.push_back(bv_sequences.size()); 94 | upper_bounds.push_back(upper_bound); 95 | cur_base = upper_bound + 1; 96 | } 97 | 98 | succinct::bit_vector_builder bv_sizes; 99 | compact_elias_fano::write(bv_sizes, opt.partition.begin(), 100 | n, partitions - 1, 101 | params); 102 | 103 | succinct::bit_vector_builder bv_upper_bounds; 104 | compact_elias_fano::write(bv_upper_bounds, upper_bounds.begin(), 105 | universe, partitions + 1, 106 | params); 107 | 108 | uint64_t endpoint_bits = ceil_log2(bv_sequences.size() + 1); 109 | write_gamma(bvb, endpoint_bits); 110 | 111 | bvb.append(bv_sizes); 112 | bvb.append(bv_upper_bounds); 113 | 114 | for (uint64_t p = 0; p < endpoints.size() - 1; ++p) { 115 | bvb.append_bits(endpoints[p], endpoint_bits); 116 | } 117 | 118 | bvb.append(bv_sequences); 119 | } 120 | } 121 | 122 | class enumerator { 123 | public: 124 | 125 | typedef std::pair value_type; // (position, value) 126 | 127 | enumerator() 128 | {} 129 | 130 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 131 | uint64_t universe, uint64_t n, 132 | global_parameters const& params) 133 | : m_params(params) 134 | , m_size(n) 135 | , m_universe(universe) 136 | , m_bv(&bv) 137 | { 138 | succinct::bit_vector::enumerator it(bv, offset); 139 | m_partitions = read_gamma_nonzero(it); 140 | if (m_partitions == 1) { 141 | m_cur_partition = 0; 142 | m_cur_begin = 0; 143 | m_cur_end = n; 144 | 145 | uint64_t universe_bits = ceil_log2(universe); 146 | m_cur_base = it.take(universe_bits); 147 | auto ub = 0; 148 | if (n > 1) { 149 | uint64_t universe_delta = 
read_delta(it); 150 | ub = universe_delta ? universe_delta : (universe - m_cur_base - 1); 151 | } 152 | 153 | m_partition_enum = base_sequence_enumerator 154 | (*m_bv, it.position(), ub + 1, n, m_params); 155 | 156 | m_cur_upper_bound = m_cur_base + ub; 157 | } else { 158 | m_endpoint_bits = read_gamma(it); 159 | 160 | uint64_t cur_offset = it.position(); 161 | m_sizes = compact_elias_fano::enumerator(bv, cur_offset, 162 | n, m_partitions - 1, 163 | params); 164 | cur_offset += compact_elias_fano::bitsize(params, n, 165 | m_partitions - 1); 166 | 167 | m_upper_bounds = compact_elias_fano::enumerator(bv, cur_offset, 168 | universe, m_partitions + 1, 169 | params); 170 | cur_offset += compact_elias_fano::bitsize(params, universe, 171 | m_partitions + 1); 172 | 173 | m_endpoints_offset = cur_offset; 174 | uint64_t endpoints_size = m_endpoint_bits * (m_partitions - 1); 175 | cur_offset += endpoints_size; 176 | 177 | m_sequences_offset = cur_offset; 178 | } 179 | 180 | m_position = size(); 181 | slow_move(); 182 | } 183 | 184 | value_type QS_ALWAYSINLINE move(uint64_t position) 185 | { 186 | assert(position <= size()); 187 | m_position = position; 188 | 189 | if (m_position >= m_cur_begin && m_position < m_cur_end) { 190 | uint64_t val = m_cur_base + m_partition_enum.move(m_position - m_cur_begin).second; 191 | return value_type(m_position, val); 192 | } 193 | 194 | return slow_move(); 195 | } 196 | 197 | // note: this is instantiated oly if BaseSequence has next_geq 198 | value_type QS_ALWAYSINLINE next_geq(uint64_t lower_bound) 199 | { 200 | if (QS_LIKELY(lower_bound >= m_cur_base && lower_bound <= m_cur_upper_bound)) { 201 | auto val = m_partition_enum.next_geq(lower_bound - m_cur_base); 202 | m_position = m_cur_begin + val.first; 203 | return value_type(m_position, m_cur_base + val.second); 204 | } 205 | return slow_next_geq(lower_bound); 206 | } 207 | 208 | value_type QS_ALWAYSINLINE next() 209 | { 210 | ++m_position; 211 | 212 | if (QS_LIKELY(m_position < 
m_cur_end)) { 213 | uint64_t val = m_cur_base + m_partition_enum.next().second; 214 | return value_type(m_position, val); 215 | } 216 | return slow_next(); 217 | } 218 | 219 | uint64_t size() const 220 | { 221 | return m_size; 222 | } 223 | 224 | uint64_t prev_value() const 225 | { 226 | if (QS_UNLIKELY(m_position == m_cur_begin)) { 227 | return m_cur_partition ? m_cur_base - 1 : 0; 228 | } else { 229 | return m_cur_base + m_partition_enum.prev_value(); 230 | } 231 | } 232 | 233 | uint64_t num_partitions() const 234 | { 235 | return m_partitions; 236 | } 237 | 238 | friend class partitioned_sequence_test; 239 | 240 | private: 241 | 242 | // the compiler does not seem smart enough to figure out that this 243 | // is a very unlikely condition, and inlines the move(0) inside the 244 | // next(), causing the code to grow. Since next is called in very 245 | // tight loops, on microbenchmarks this causes an improvement of 246 | // about 3ns on my i7 3Ghz 247 | value_type QS_NOINLINE slow_next() 248 | { 249 | if (QS_UNLIKELY(m_position == m_size)) { 250 | assert(m_cur_partition == m_partitions - 1); 251 | auto val = m_partition_enum.next(); 252 | assert(val.first == m_partition_enum.size()); (void)val; 253 | return value_type(m_position, m_universe); 254 | } 255 | 256 | switch_partition(m_cur_partition + 1); 257 | uint64_t val = m_cur_base + m_partition_enum.move(0).second; 258 | return value_type(m_position, val); 259 | } 260 | 261 | value_type QS_NOINLINE slow_move() 262 | { 263 | if (m_position == size()) { 264 | if (m_partitions > 1) { 265 | switch_partition(m_partitions - 1); 266 | } 267 | m_partition_enum.move(m_partition_enum.size()); 268 | return value_type(m_position, m_universe); 269 | } 270 | auto size_it = m_sizes.next_geq(m_position + 1); // need endpoint strictly > m_position 271 | switch_partition(size_it.first); 272 | uint64_t val = m_cur_base + m_partition_enum.move(m_position - m_cur_begin).second; 273 | return value_type(m_position, val); 274 | } 275 | 
276 | value_type QS_NOINLINE slow_next_geq(uint64_t lower_bound) 277 | { 278 | if (m_partitions == 1) { 279 | if (lower_bound < m_cur_base) { 280 | return move(0); 281 | } else { 282 | return move(size()); 283 | } 284 | } 285 | 286 | auto ub_it = m_upper_bounds.next_geq(lower_bound); 287 | if (ub_it.first == 0) { 288 | return move(0); 289 | } 290 | 291 | if (ub_it.first == m_upper_bounds.size()) { 292 | return move(size()); 293 | } 294 | 295 | switch_partition(ub_it.first - 1); 296 | return next_geq(lower_bound); 297 | } 298 | 299 | void switch_partition(uint64_t partition) 300 | { 301 | assert(m_partitions > 1); 302 | 303 | uint64_t endpoint = partition 304 | ? (m_bv->get_word56(m_endpoints_offset + 305 | (partition - 1) * m_endpoint_bits) 306 | & ((uint64_t(1) << m_endpoint_bits) - 1)) 307 | : 0; 308 | 309 | uint64_t partition_begin = m_sequences_offset + endpoint; 310 | m_bv->data().prefetch(partition_begin / 64); 311 | 312 | m_cur_partition = partition; 313 | auto size_it = m_sizes.move(partition); 314 | m_cur_end = size_it.second; 315 | m_cur_begin = m_sizes.prev_value(); 316 | 317 | auto ub_it = m_upper_bounds.move(partition + 1); 318 | m_cur_upper_bound = ub_it.second; 319 | m_cur_base = m_upper_bounds.prev_value() + (partition ? 
1 : 0); 320 | 321 | m_partition_enum = base_sequence_enumerator 322 | (*m_bv, partition_begin, 323 | m_cur_upper_bound - m_cur_base + 1, 324 | m_cur_end - m_cur_begin, 325 | m_params); 326 | } 327 | 328 | global_parameters m_params; 329 | uint64_t m_partitions; 330 | uint64_t m_endpoints_offset; 331 | uint64_t m_endpoint_bits; 332 | uint64_t m_sequences_offset; 333 | uint64_t m_size; 334 | uint64_t m_universe; 335 | 336 | uint64_t m_position; 337 | uint64_t m_cur_partition; 338 | uint64_t m_cur_begin; 339 | uint64_t m_cur_end; 340 | uint64_t m_cur_base; 341 | uint64_t m_cur_upper_bound; 342 | 343 | succinct::bit_vector const* m_bv; 344 | compact_elias_fano::enumerator m_sizes; 345 | compact_elias_fano::enumerator m_upper_bounds; 346 | base_sequence_enumerator m_partition_enum; 347 | }; 348 | }; 349 | } 350 | -------------------------------------------------------------------------------- /positive_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "global_parameters.hpp" 4 | #include "strict_sequence.hpp" 5 | #include "util.hpp" 6 | 7 | namespace quasi_succinct { 8 | 9 | template 10 | struct positive_sequence { 11 | 12 | typedef BaseSequence base_sequence_type; 13 | typedef typename base_sequence_type::enumerator base_sequence_enumerator; 14 | 15 | template 16 | static void write(succinct::bit_vector_builder& bvb, 17 | Iterator begin, 18 | uint64_t universe, uint64_t n, 19 | global_parameters const& params) 20 | { 21 | assert(n > 0); 22 | auto cumulative_begin = 23 | make_function_iterator(std::make_pair(uint64_t(*begin), begin), 24 | [](std::pair& state) { 25 | state.first += *++state.second; 26 | }, [](std::pair const& state) { 27 | return state.first; 28 | }); 29 | base_sequence_type::write(bvb, cumulative_begin, universe, n, params); 30 | 31 | } 32 | 33 | class enumerator { 34 | public: 35 | 36 | typedef std::pair value_type; // (position, value) 37 | 38 | enumerator() 39 | {} 40 | 41 
| enumerator(succinct::bit_vector const& bv, uint64_t offset, 42 | uint64_t universe, uint64_t n, 43 | global_parameters const& params) 44 | : m_base_enum(bv, offset, universe, n, params) 45 | , m_position(m_base_enum.size()) 46 | {} 47 | 48 | value_type move(uint64_t position) 49 | { 50 | // we cache m_position and m_cur to avoid the call overhead in 51 | // the most common cases 52 | uint64_t prev = m_cur; 53 | if (position != m_position + 1) { 54 | if (QS_UNLIKELY(position == 0)) { 55 | // we need to special-case position 0 56 | m_cur = m_base_enum.move(0).second; 57 | m_position = 0; 58 | return value_type(m_position, m_cur); 59 | } 60 | prev = m_base_enum.move(position - 1).second; 61 | } 62 | 63 | m_cur = m_base_enum.next().second; 64 | m_position = position; 65 | return value_type(position, m_cur - prev); 66 | } 67 | 68 | base_sequence_enumerator const& base() const 69 | { 70 | return m_base_enum; 71 | } 72 | 73 | private: 74 | 75 | base_sequence_enumerator m_base_enum; 76 | uint64_t m_position; 77 | uint64_t m_cur; 78 | }; 79 | }; 80 | } 81 | -------------------------------------------------------------------------------- /queries.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include "index_types.hpp" 6 | #include "wand_data.hpp" 7 | #include "queries.hpp" 8 | #include "util.hpp" 9 | 10 | template 11 | void op_perftest(IndexType const& index, 12 | QueryOperator&& query_op, // XXX!!! 
13 | std::vector const& queries, 14 | std::string const& index_type, 15 | std::string const& query_type, 16 | size_t runs) 17 | { 18 | using namespace quasi_succinct; 19 | 20 | std::vector query_times; 21 | 22 | for (size_t run = 0; run <= runs; ++run) { 23 | for (auto const& query: queries) { 24 | auto tick = get_time_usecs(); 25 | uint64_t result = query_op(index, query); 26 | do_not_optimize_away(result); 27 | double elapsed = double(get_time_usecs() - tick); 28 | if (run != 0) { // first run is not timed 29 | query_times.push_back(elapsed); 30 | } 31 | } 32 | } 33 | 34 | if (false) { 35 | for (auto t: query_times) { 36 | std::cout << (t / 1000) << std::endl; 37 | } 38 | } else { 39 | std::sort(query_times.begin(), query_times.end()); 40 | double avg = std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); 41 | double q50 = query_times[query_times.size() / 2]; 42 | double q90 = query_times[90 * query_times.size() / 100]; 43 | double q95 = query_times[95 * query_times.size() / 100]; 44 | logger() << "---- " << index_type << " " << query_type << std::endl; 45 | logger() << "Mean: " << avg << std::endl; 46 | logger() << "50% quantile: " << q50 << std::endl; 47 | logger() << "90% quantile: " << q90 << std::endl; 48 | logger() << "95% quantile: " << q95 << std::endl; 49 | 50 | stats_line() 51 | ("type", index_type) 52 | ("query", query_type) 53 | ("avg", avg) 54 | ("q50", q50) 55 | ("q90", q90) 56 | ("q95", q95) 57 | ; 58 | } 59 | } 60 | 61 | 62 | template 63 | void perftest(const char* index_filename, 64 | const char* wand_data_filename, 65 | std::vector const& queries, 66 | std::string const& type) 67 | { 68 | using namespace quasi_succinct; 69 | 70 | IndexType index; 71 | logger() << "Loading index from " << index_filename << std::endl; 72 | boost::iostreams::mapped_file_source m(index_filename); 73 | succinct::mapper::map(index, m, succinct::mapper::map_flags::warmup); 74 | 75 | logger() << "Performing " << type << " queries" << 
std::endl; 76 | op_perftest(index, and_query(), queries, type, "and", 3); 77 | op_perftest(index, and_query(), queries, type, "and_freq", 3); 78 | op_perftest(index, or_query(), queries, type, "or", 1); 79 | op_perftest(index, or_query(), queries, type, "or_freq", 1); 80 | 81 | if (wand_data_filename) { 82 | wand_data<> wdata; 83 | boost::iostreams::mapped_file_source md(wand_data_filename); 84 | succinct::mapper::map(wdata, md, succinct::mapper::map_flags::warmup); 85 | op_perftest(index, ranked_and_query(wdata, 10), queries, type, "ranked_and", 3); 86 | op_perftest(index, ranked_or_query(wdata, 10), queries, type, "ranked_or", 1); 87 | op_perftest(index, wand_query(wdata, 10), queries, type, "wand", 1); 88 | op_perftest(index, maxscore_query(wdata, 10), queries, type, "maxscore", 1); 89 | } 90 | 91 | } 92 | 93 | int main(int argc, const char** argv) 94 | { 95 | using namespace quasi_succinct; 96 | 97 | std::string type = argv[1]; 98 | const char* index_filename = argv[2]; 99 | const char* wand_data_filename = nullptr; 100 | if (argc >= 4) { 101 | wand_data_filename = argv[3]; 102 | } 103 | 104 | std::vector queries; 105 | term_id_vec q; 106 | while (read_query(q)) queries.push_back(q); 107 | 108 | if (false) { 109 | #define LOOP_BODY(R, DATA, T) \ 110 | } else if (type == BOOST_PP_STRINGIZE(T)) { \ 111 | perftest \ 112 | (index_filename, wand_data_filename, queries, type); \ 113 | /**/ 114 | 115 | BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, QS_INDEX_TYPES); 116 | #undef LOOP_BODY 117 | } else { 118 | logger() << "ERROR: Unknown type " << type << std::endl; 119 | } 120 | 121 | } 122 | -------------------------------------------------------------------------------- /semiasync_queue.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "configuration.hpp" 8 | #include "util.hpp" 9 | 10 | namespace quasi_succinct { 11 | 12 | class semiasync_queue { 13 | public: 14 | 
15 | semiasync_queue(double work_per_thread) 16 | : m_expected_work(0) 17 | , m_work_per_thread(work_per_thread) 18 | { 19 | m_max_threads = configuration::get().worker_threads; 20 | logger() << "semiasync_queue using " << m_max_threads 21 | << " worker threads" << std::endl; 22 | } 23 | 24 | class job { 25 | public: 26 | virtual void prepare() = 0; 27 | virtual void commit() = 0; 28 | }; 29 | 30 | typedef std::shared_ptr job_ptr_type; 31 | 32 | void add_job(job_ptr_type j, double expected_work) 33 | { 34 | if (m_max_threads) { 35 | m_next_thread.first.push_back(j); 36 | m_expected_work += expected_work; 37 | if (m_expected_work >= m_work_per_thread) { 38 | spawn_next_thread(); 39 | } 40 | } else { // all in main thread 41 | j->prepare(); 42 | j->commit(); 43 | j.reset(); 44 | } 45 | } 46 | 47 | void complete() 48 | { 49 | if (!m_next_thread.first.empty()) { 50 | spawn_next_thread(); 51 | while (!m_running_threads.empty()) { 52 | commit_thread(); 53 | } 54 | } 55 | } 56 | 57 | private: 58 | 59 | void spawn_next_thread() 60 | { 61 | if (m_running_threads.size() == m_max_threads) { 62 | commit_thread(); 63 | } 64 | 65 | m_running_threads.emplace_back(); 66 | std::swap(m_next_thread, m_running_threads.back()); 67 | 68 | std::vector const& cur_queue = m_running_threads.back().first; 69 | m_running_threads.back().second = std::thread([&]() { 70 | for (auto const& j: cur_queue) { 71 | j->prepare(); 72 | } 73 | }); 74 | 75 | m_expected_work = 0; 76 | } 77 | 78 | void commit_thread() 79 | { 80 | assert(!m_running_threads.empty()); 81 | m_running_threads.front().second.join(); 82 | for (auto& j: m_running_threads.front().first) { 83 | j->commit(); 84 | j.reset(); 85 | } 86 | m_running_threads.pop_front(); 87 | } 88 | 89 | typedef std::pair, std::thread> thread_t; 90 | thread_t m_next_thread; 91 | std::deque m_running_threads; 92 | 93 | size_t m_expected_work; 94 | double m_work_per_thread; 95 | size_t m_max_threads; 96 | }; 97 | 98 | } 99 | 
-------------------------------------------------------------------------------- /sequence_collection.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "bitvector_collection.hpp" 4 | #include "compact_elias_fano.hpp" 5 | #include "integer_codes.hpp" 6 | #include "global_parameters.hpp" 7 | #include "semiasync_queue.hpp" 8 | 9 | namespace quasi_succinct { 10 | 11 | template 12 | class sequence_collection { 13 | public: 14 | typedef typename IndexedSequence::enumerator enumerator_type; 15 | 16 | sequence_collection() 17 | {} 18 | 19 | class builder { 20 | public: 21 | builder(global_parameters const& params) 22 | : m_queue(1 << 24) 23 | , m_params(params) 24 | , m_sequences(params) 25 | {} 26 | 27 | template 28 | void add_sequence(Iterator begin, uint64_t last_element, uint64_t n) 29 | { 30 | if (!n) throw std::invalid_argument("Sequence must be nonempty"); 31 | 32 | // make_shared does not seem to work 33 | std::shared_ptr> 34 | ptr(new sequence_adder(*this, begin, last_element, n)); 35 | m_queue.add_job(ptr, n); 36 | } 37 | 38 | void build(sequence_collection& sq) 39 | { 40 | m_queue.complete(); 41 | sq.m_params = m_params; 42 | m_sequences.build(sq.m_sequences); 43 | } 44 | 45 | private: 46 | 47 | template 48 | struct sequence_adder : semiasync_queue::job { 49 | sequence_adder(builder& b, 50 | Iterator begin, 51 | uint64_t last_element, 52 | uint64_t n) 53 | : b(b) 54 | , begin(begin) 55 | , last_element(last_element) 56 | , n(n) 57 | {} 58 | 59 | virtual void prepare() 60 | { 61 | // store approximation of the universe as smallest power of two 62 | // that can represent last_element 63 | uint64_t universe_bits = ceil_log2(last_element); 64 | write_gamma(bits, universe_bits); 65 | write_gamma_nonzero(bits, n); 66 | IndexedSequence::write(bits, begin, 67 | (uint64_t(1) << universe_bits) + 1, n, 68 | b.m_params); 69 | } 70 | 71 | virtual void commit() 72 | { 73 | b.m_sequences.append(bits); 74 | } 
75 | 76 | builder& b; 77 | Iterator begin; 78 | uint64_t last_element; 79 | uint64_t n; 80 | succinct::bit_vector_builder bits; 81 | }; 82 | 83 | semiasync_queue m_queue; 84 | global_parameters m_params; 85 | bitvector_collection::builder m_sequences; 86 | }; 87 | 88 | size_t size() const 89 | { 90 | return m_sequences.size(); 91 | } 92 | 93 | enumerator_type operator[](size_t i) const 94 | { 95 | assert(i < size()); 96 | auto it = m_sequences.get(m_params, i); 97 | uint64_t universe_bits = read_gamma(it); 98 | uint64_t n = read_gamma_nonzero(it); 99 | 100 | return enumerator_type(m_sequences.bits(), it.position(), 101 | (uint64_t(1) << universe_bits) + 1, n, 102 | m_params); 103 | } 104 | 105 | void swap(sequence_collection& other) 106 | { 107 | std::swap(m_params, other.m_params); 108 | std::swap(m_size, other.m_size); 109 | m_sequences.swap(other.m_sequences); 110 | } 111 | 112 | template 113 | void map(Visitor& visit) 114 | { 115 | visit 116 | (m_params, "m_params") 117 | (m_size, "m_size") 118 | (m_sequences, "m_sequences") 119 | ; 120 | } 121 | 122 | private: 123 | global_parameters m_params; 124 | size_t m_size; 125 | bitvector_collection m_sequences; 126 | }; 127 | } 128 | -------------------------------------------------------------------------------- /strict_elias_fano.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "compact_elias_fano.hpp" 6 | #include "util.hpp" 7 | 8 | namespace quasi_succinct { 9 | 10 | struct strict_elias_fano { 11 | 12 | static QS_FLATTEN_FUNC uint64_t 13 | bitsize(global_parameters const& params, uint64_t universe, uint64_t n) 14 | { 15 | assert(universe >= n); 16 | return compact_elias_fano::bitsize(params, universe - n + 1, n); 17 | } 18 | 19 | template 20 | static void write(succinct::bit_vector_builder& bvb, 21 | Iterator begin, 22 | uint64_t universe, uint64_t n, 23 | global_parameters const& params) 24 | { 25 | uint64_t new_universe = 
universe - n + 1; 26 | typedef typename std::iterator_traits::value_type value_type; 27 | auto new_begin = 28 | make_function_iterator(std::make_pair(value_type(0), begin), 29 | [](std::pair& state) { 30 | ++state.first; 31 | ++state.second; 32 | }, [](std::pair const& state) { 33 | return *state.second - state.first; 34 | }); 35 | compact_elias_fano::write(bvb, new_begin, new_universe, n, params); 36 | } 37 | 38 | class enumerator { 39 | public: 40 | 41 | typedef std::pair value_type; // (position, value) 42 | 43 | enumerator() 44 | {} 45 | 46 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 47 | uint64_t universe, uint64_t n, 48 | global_parameters const& params) 49 | : m_ef_enum(bv, offset, universe - n + 1, n, params) 50 | {} 51 | 52 | value_type move(uint64_t position) 53 | { 54 | auto val = m_ef_enum.move(position); 55 | return value_type(val.first, val.second + val.first); 56 | } 57 | 58 | value_type next() 59 | { 60 | auto val = m_ef_enum.next(); 61 | return value_type(val.first, val.second + val.first); 62 | } 63 | 64 | uint64_t size() const 65 | { 66 | return m_ef_enum.size(); 67 | } 68 | 69 | uint64_t prev_value() const 70 | { 71 | if (m_ef_enum.position()) { 72 | return m_ef_enum.prev_value() + m_ef_enum.position() - 1; 73 | } else { 74 | return 0; 75 | } 76 | } 77 | 78 | private: 79 | compact_elias_fano::enumerator m_ef_enum; 80 | }; 81 | 82 | }; 83 | } 84 | -------------------------------------------------------------------------------- /strict_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "strict_elias_fano.hpp" 6 | #include "compact_ranked_bitvector.hpp" 7 | #include "all_ones_sequence.hpp" 8 | #include "global_parameters.hpp" 9 | 10 | namespace quasi_succinct { 11 | 12 | struct strict_sequence { 13 | 14 | enum index_type { 15 | elias_fano = 0, 16 | ranked_bitvector = 1, 17 | all_ones = 2, 18 | 19 | index_types = 3 20 | }; 21 | 22 | 
static const uint64_t type_bits = 1; // all_ones is implicit 23 | 24 | static global_parameters strict_params(global_parameters params) 25 | { 26 | // we do not need to index the zeros 27 | params.ef_log_sampling0 = 63; 28 | params.rb_log_rank1_sampling = 63; 29 | return params; 30 | } 31 | 32 | static QS_FLATTEN_FUNC uint64_t 33 | bitsize(global_parameters const& params, uint64_t universe, uint64_t n) 34 | { 35 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 36 | auto sparams = strict_params(params); 37 | 38 | uint64_t ef_cost = strict_elias_fano::bitsize(sparams, universe, n) + type_bits; 39 | if (ef_cost < best_cost) { 40 | best_cost = ef_cost; 41 | } 42 | 43 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(sparams, universe, n) + type_bits; 44 | if (rb_cost < best_cost) { 45 | best_cost = rb_cost; 46 | } 47 | 48 | return best_cost; 49 | } 50 | 51 | template 52 | static void write(succinct::bit_vector_builder& bvb, 53 | Iterator begin, 54 | uint64_t universe, uint64_t n, 55 | global_parameters const& params) 56 | { 57 | auto sparams = strict_params(params); 58 | uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); 59 | int best_type = all_ones; 60 | 61 | if (best_cost) { 62 | uint64_t ef_cost = strict_elias_fano::bitsize(sparams, universe, n) + type_bits; 63 | if (ef_cost < best_cost) { 64 | best_cost = ef_cost; 65 | best_type = elias_fano; 66 | } 67 | 68 | uint64_t rb_cost = compact_ranked_bitvector::bitsize(sparams, universe, n) + type_bits; 69 | if (rb_cost < best_cost) { 70 | best_cost = rb_cost; 71 | best_type = ranked_bitvector; 72 | } 73 | 74 | bvb.append_bits(best_type, type_bits); 75 | } 76 | 77 | switch (best_type) { 78 | case elias_fano: 79 | strict_elias_fano::write(bvb, begin, 80 | universe, n, 81 | sparams); 82 | break; 83 | case ranked_bitvector: 84 | compact_ranked_bitvector::write(bvb, begin, 85 | universe, n, 86 | sparams); 87 | break; 88 | case all_ones: 89 | all_ones_sequence::write(bvb, begin, 
90 | universe, n, 91 | sparams); 92 | break; 93 | default: 94 | assert(false); 95 | } 96 | } 97 | 98 | class enumerator { 99 | public: 100 | 101 | typedef std::pair value_type; // (position, value) 102 | 103 | enumerator() 104 | {} 105 | 106 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 107 | uint64_t universe, uint64_t n, 108 | global_parameters const& params) 109 | { 110 | auto sparams = strict_params(params); 111 | 112 | if (all_ones_sequence::bitsize(params, universe, n) == 0) { 113 | m_type = all_ones; 114 | } else { 115 | m_type = index_type(bv.get_word56(offset) 116 | & ((uint64_t(1) << type_bits) - 1)); 117 | } 118 | 119 | switch (m_type) { 120 | case elias_fano: 121 | m_ef_enumerator = strict_elias_fano::enumerator(bv, offset + type_bits, 122 | universe, n, 123 | sparams); 124 | break; 125 | case ranked_bitvector: 126 | m_rb_enumerator = compact_ranked_bitvector::enumerator(bv, offset + type_bits, 127 | universe, n, 128 | sparams); 129 | break; 130 | case all_ones: 131 | m_ao_enumerator = all_ones_sequence::enumerator(bv, offset + type_bits, 132 | universe, n, 133 | sparams); 134 | break; 135 | default: 136 | throw std::invalid_argument("Unsupported type"); 137 | } 138 | } 139 | 140 | #define ENUMERATOR_METHOD(RETURN_TYPE, METHOD, FORMALS, ACTUALS) \ 141 | RETURN_TYPE QS_FLATTEN_FUNC METHOD FORMALS \ 142 | { \ 143 | switch (__builtin_expect(m_type, elias_fano)) { \ 144 | case elias_fano: \ 145 | return m_ef_enumerator.METHOD ACTUALS; \ 146 | case ranked_bitvector: \ 147 | return m_rb_enumerator.METHOD ACTUALS; \ 148 | case all_ones: \ 149 | return m_ao_enumerator.METHOD ACTUALS; \ 150 | default: \ 151 | assert(false); \ 152 | __builtin_unreachable(); \ 153 | } \ 154 | } \ 155 | /**/ 156 | 157 | // semicolons are redundant but they are needed to get emacs to 158 | // align the lines properly 159 | ENUMERATOR_METHOD(value_type, move, (uint64_t position), (position)); 160 | ENUMERATOR_METHOD(value_type, next, (), ()); 161 | 
ENUMERATOR_METHOD(uint64_t, size, () const, ()); 162 | ENUMERATOR_METHOD(uint64_t, prev_value, () const, ()); 163 | 164 | #undef ENUMERATOR_METHOD 165 | #undef ENUMERATOR_VOID_METHOD 166 | 167 | private: 168 | index_type m_type; 169 | union { 170 | strict_elias_fano::enumerator m_ef_enumerator; 171 | compact_ranked_bitvector::enumerator m_rb_enumerator; 172 | all_ones_sequence::enumerator m_ao_enumerator; 173 | }; 174 | }; 175 | }; 176 | } 177 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB TEST_SOURCES test_*.cpp) 2 | foreach(TEST_SRC ${TEST_SOURCES}) 3 | get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) 4 | add_executable(${TEST_SRC_NAME} ${TEST_SRC}) 5 | target_link_libraries(${TEST_SRC_NAME} 6 | succinct 7 | ${Boost_LIBRARIES} 8 | ) 9 | add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME}) 10 | endforeach(TEST_SRC) 11 | 12 | target_link_libraries(test_block_codecs 13 | FastPFor_lib 14 | block_codecs) 15 | 16 | target_link_libraries(test_block_posting_list 17 | FastPFor_lib 18 | block_codecs) 19 | 20 | target_link_libraries(test_block_freq_index 21 | FastPFor_lib 22 | block_codecs) 23 | 24 | -------------------------------------------------------------------------------- /test/test_block_codecs.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE block_codecs 2 | 3 | #include "succinct/test_common.hpp" 4 | #include "block_codecs.hpp" 5 | #include 6 | #include 7 | 8 | template 9 | void test_block_codec() 10 | { 11 | std::vector sizes = {1, 16, BlockCodec::block_size - 1, BlockCodec::block_size}; 12 | for (auto size: sizes) { 13 | std::vector values(size); 14 | std::generate(values.begin(), values.end(), []() { return (uint32_t)rand() % (1 << 24); }); 15 | 16 | for (size_t tcase = 0; tcase < 2; ++tcase) { 17 | // test both undefined and given 
sum_of_values 18 | uint32_t sum_of_values(-1); 19 | if (tcase == 1) { 20 | sum_of_values = std::accumulate(values.begin(), values.end(), 0); 21 | } 22 | std::vector encoded; 23 | BlockCodec::encode(values.data(), sum_of_values, values.size(), encoded); 24 | 25 | std::vector decoded(values.size()); 26 | uint8_t const* out = BlockCodec::decode(encoded.data(), decoded.data(), 27 | sum_of_values, values.size()); 28 | 29 | BOOST_REQUIRE_EQUAL(encoded.size(), out - encoded.data()); 30 | BOOST_REQUIRE_EQUAL_COLLECTIONS(values.begin(), values.end(), 31 | decoded.begin(), decoded.end()); 32 | } 33 | } 34 | } 35 | 36 | BOOST_AUTO_TEST_CASE(block_codecs) 37 | { 38 | test_block_codec(); 39 | test_block_codec(); 40 | test_block_codec(); 41 | } 42 | -------------------------------------------------------------------------------- /test/test_block_freq_index.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE block_freq_index 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "block_freq_index.hpp" 6 | #include "block_codecs.hpp" 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | template 14 | void test_block_freq_index() 15 | { 16 | quasi_succinct::global_parameters params; 17 | uint64_t universe = 20000; 18 | typedef quasi_succinct::block_freq_index collection_type; 19 | typename collection_type::builder b(universe, params); 20 | 21 | typedef std::vector vec_type; 22 | std::vector> posting_lists(30); 23 | for (auto& plist: posting_lists) { 24 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 25 | uint64_t n = uint64_t(universe / avg_gap); 26 | plist.first = random_sequence(universe, n, true); 27 | plist.second.resize(n); 28 | std::generate(plist.second.begin(), plist.second.end(), 29 | []() { return (rand() % 256) + 1; }); 30 | 31 | b.add_posting_list(n, plist.first.begin(), 32 | plist.second.begin(), 0); 33 | 34 | } 35 | 36 | { 37 | collection_type coll; 38 | b.build(coll); 39 | 
succinct::mapper::freeze(coll, "temp.bin"); 40 | } 41 | 42 | { 43 | collection_type coll; 44 | boost::iostreams::mapped_file_source m("temp.bin"); 45 | succinct::mapper::map(coll, m); 46 | 47 | for (size_t i = 0; i < posting_lists.size(); ++i) { 48 | auto const& plist = posting_lists[i]; 49 | auto doc_enum = coll[i]; 50 | BOOST_REQUIRE_EQUAL(plist.first.size(), doc_enum.size()); 51 | for (size_t p = 0; p < plist.first.size(); ++p, doc_enum.next()) { 52 | MY_REQUIRE_EQUAL(plist.first[p], doc_enum.docid(), 53 | "i = " << i << " p = " << p); 54 | MY_REQUIRE_EQUAL(plist.second[p], doc_enum.freq(), 55 | "i = " << i << " p = " << p); 56 | } 57 | BOOST_REQUIRE_EQUAL(coll.num_docs(), doc_enum.docid()); 58 | } 59 | } 60 | } 61 | 62 | BOOST_AUTO_TEST_CASE(block_freq_index) 63 | { 64 | test_block_freq_index(); 65 | test_block_freq_index(); 66 | test_block_freq_index(); 67 | } 68 | -------------------------------------------------------------------------------- /test/test_block_posting_list.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE block_posting_list 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "block_posting_list.hpp" 6 | #include "block_codecs.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | void test_block_posting_list() 14 | { 15 | typedef quasi_succinct::block_posting_list posting_list_type; 16 | uint64_t universe = 20000; 17 | for (size_t t = 0; t < 20; ++t) { 18 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 19 | uint64_t n = uint64_t(universe / avg_gap); 20 | std::vector docs = random_sequence(universe, n, true); 21 | std::vector freqs(n); 22 | std::generate(freqs.begin(), freqs.end(), 23 | []() { return (rand() % 256) + 1; }); 24 | 25 | std::vector data; 26 | posting_list_type::write(data, n, docs.begin(), freqs.begin()); 27 | 28 | typename posting_list_type::document_enumerator e(data.data(), universe); 29 | BOOST_REQUIRE_EQUAL(n, e.size()); 30 
| for (size_t i = 0; i < n; ++i, e.next()) { 31 | MY_REQUIRE_EQUAL(docs[i], e.docid(), 32 | "i = " << i << " size = " << n); 33 | MY_REQUIRE_EQUAL(freqs[i], e.freq(), 34 | "i = " << i << " size = " << n); 35 | } 36 | // XXX better testing of next_geq 37 | for (size_t i = 0; i < n; ++i) { 38 | e.reset(); 39 | e.next_geq(docs[i]); 40 | MY_REQUIRE_EQUAL(docs[i], e.docid(), 41 | "i = " << i << " size = " << n); 42 | MY_REQUIRE_EQUAL(freqs[i], e.freq(), 43 | "i = " << i << " size = " << n); 44 | } 45 | e.reset(); e.next_geq(docs.back() + 1); 46 | BOOST_REQUIRE_EQUAL(universe, e.docid()); 47 | e.reset(); e.next_geq(universe); 48 | BOOST_REQUIRE_EQUAL(universe, e.docid()); 49 | } 50 | } 51 | 52 | BOOST_AUTO_TEST_CASE(block_posting_list) 53 | { 54 | test_block_posting_list(); 55 | test_block_posting_list(); 56 | test_block_posting_list(); 57 | } 58 | -------------------------------------------------------------------------------- /test/test_compact_elias_fano.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE compact_elias_fano 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "compact_elias_fano.hpp" 6 | #include 7 | #include 8 | 9 | struct sequence_initialization { 10 | sequence_initialization() 11 | { 12 | n = 100000; 13 | universe = n * 1024; 14 | seq = random_sequence(universe, n); 15 | 16 | // high granularity to test more corner cases 17 | params.ef_log_sampling0 = 4; 18 | params.ef_log_sampling1 = 5; 19 | succinct::bit_vector_builder bvb; 20 | quasi_succinct::compact_elias_fano::write(bvb, 21 | seq.begin(), 22 | universe, seq.size(), 23 | params); 24 | succinct::bit_vector(&bvb).swap(bv); 25 | } 26 | 27 | quasi_succinct::global_parameters params; 28 | size_t n; 29 | size_t universe; 30 | std::vector seq; 31 | succinct::bit_vector bv; 32 | }; 33 | 34 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_singleton, 35 | sequence_initialization) 36 | { 37 | // test singleton sequences 38 | std::vector 
short_seq; 39 | short_seq.push_back(0); 40 | test_sequence(quasi_succinct::compact_elias_fano(), params, 1, short_seq); 41 | short_seq[0] = 1; 42 | test_sequence(quasi_succinct::compact_elias_fano(), params, 2, short_seq); 43 | } 44 | 45 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_construction, 46 | sequence_initialization) 47 | { 48 | 49 | // test pointers and low-level values 50 | quasi_succinct::compact_elias_fano::offsets of(0, 51 | universe, seq.size(), 52 | params); 53 | uint64_t rank = 0; 54 | for (uint64_t pos = 0; pos < of.higher_bits_length; ++pos) { 55 | bool b = bv[of.higher_bits_offset + pos]; 56 | uint64_t rank0 = pos - rank; 57 | 58 | if (b) { 59 | uint64_t read_v = ((pos - rank - 1) << of.lower_bits) | 60 | bv.get_bits(of.lower_bits_offset + rank * of.lower_bits, 61 | of.lower_bits); 62 | MY_REQUIRE_EQUAL(seq[rank], read_v, "rank = " << rank); 63 | } 64 | 65 | if (b && rank && (rank % (1 << of.log_sampling1)) == 0) { 66 | uint64_t ptr_offset = of.pointers1_offset + 67 | ((rank >> of.log_sampling1) - 1) * of.pointer_size; 68 | MY_REQUIRE_EQUAL(pos, bv.get_bits(ptr_offset, of.pointer_size), 69 | "rank = " << rank); 70 | } 71 | 72 | if (!b && rank0 && (rank0 % (1 << of.log_sampling0)) == 0) { 73 | uint64_t ptr_offset = of.pointers0_offset + 74 | ((rank0 >> of.log_sampling0) - 1) * of.pointer_size; 75 | MY_REQUIRE_EQUAL(pos, bv.get_bits(ptr_offset, of.pointer_size), 76 | "rank0 = " << rank0); 77 | } 78 | rank += b; 79 | } 80 | } 81 | 82 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_enumerator, 83 | sequence_initialization) 84 | { 85 | quasi_succinct::compact_elias_fano::enumerator r(bv, 0, 86 | universe, seq.size(), 87 | params); 88 | test_sequence(r, seq); 89 | } 90 | 91 | BOOST_FIXTURE_TEST_CASE(compact_elias_fano_weakly_monotone, 92 | sequence_initialization) 93 | { 94 | n = 100000; 95 | universe = n * 3; 96 | std::vector seq = random_sequence(universe, n, false); 97 | test_sequence(quasi_succinct::compact_elias_fano(), params, universe, seq); 98 | } 
99 | 100 | -------------------------------------------------------------------------------- /test/test_compact_ranked_bitvector.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE compact_ranked_bitvector 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "compact_ranked_bitvector.hpp" 6 | #include 7 | #include 8 | 9 | struct sequence_initialization { 10 | sequence_initialization() 11 | { 12 | n = 100000; 13 | universe = n * 3; 14 | seq = random_sequence(universe, n, true); 15 | 16 | // high granularity to test more corner cases 17 | params.rb_log_rank1_sampling = 6; 18 | params.rb_log_sampling1 = 5; 19 | succinct::bit_vector_builder bvb; 20 | quasi_succinct::compact_ranked_bitvector::write(bvb, 21 | seq.begin(), 22 | universe, seq.size(), 23 | params); 24 | succinct::bit_vector(&bvb).swap(bv); 25 | } 26 | 27 | quasi_succinct::global_parameters params; 28 | size_t n; 29 | size_t universe; 30 | uint64_t log_rank1_sampling; 31 | uint64_t log_sampling1; 32 | std::vector seq; 33 | succinct::bit_vector bv; 34 | }; 35 | 36 | BOOST_FIXTURE_TEST_CASE(compact_ranked_bitvector_construction, 37 | sequence_initialization) 38 | { 39 | 40 | // test pointers and rank samples 41 | quasi_succinct::compact_ranked_bitvector::offsets of(0, 42 | universe, seq.size(), 43 | params); 44 | uint64_t rank = 0; 45 | for (uint64_t pos = 0; pos < of.universe; ++pos) { 46 | bool b = bv[of.bits_offset + pos]; 47 | 48 | if (b) { 49 | MY_REQUIRE_EQUAL(seq[rank], pos, "rank = " << rank); 50 | } 51 | 52 | if (b && rank && (rank % (1 << of.log_sampling1)) == 0) { 53 | uint64_t ptr_offset = of.pointers1_offset + 54 | ((rank >> of.log_sampling1) - 1) * of.pointer_size; 55 | MY_REQUIRE_EQUAL(pos, bv.get_bits(ptr_offset, of.pointer_size), 56 | "rank = " << rank); 57 | } 58 | 59 | if (pos && (pos % (1 << of.log_rank1_sampling) == 0)) { 60 | uint64_t sample_offset = of.rank1_samples_offset + 61 | ((pos >> of.log_rank1_sampling) - 1) 
* of.rank1_sample_size; 62 | MY_REQUIRE_EQUAL(rank, bv.get_bits(sample_offset, of.rank1_sample_size), 63 | "pos = " << pos); 64 | } 65 | 66 | rank += b; 67 | } 68 | } 69 | 70 | BOOST_FIXTURE_TEST_CASE(compact_ranked_bitvector_singleton, 71 | sequence_initialization) 72 | { 73 | // test singleton sequences 74 | std::vector short_seq; 75 | short_seq.push_back(0); 76 | test_sequence(quasi_succinct::compact_ranked_bitvector(), params, 1, short_seq); 77 | short_seq[0] = 1; 78 | test_sequence(quasi_succinct::compact_ranked_bitvector(), params, 2, short_seq); 79 | } 80 | 81 | BOOST_FIXTURE_TEST_CASE(compact_ranked_bitvector_enumerator, 82 | sequence_initialization) 83 | { 84 | quasi_succinct::compact_ranked_bitvector::enumerator r(bv, 0, 85 | universe, seq.size(), 86 | params); 87 | test_sequence(r, seq); 88 | } 89 | -------------------------------------------------------------------------------- /test/test_data/queries: -------------------------------------------------------------------------------- 1 | 101587 61936 2 | 40429 86328 3 | 13975 94987 102912 75488 86157 4 | 80811 110278 90269 96541 5 | 33726 6 | 78401 68238 7 | 59451 82510 8 | 110622 102912 53265 66945 43418 101818 99022 54523 54209 9 | 67842 54513 67848 10 | 55900 91909 11 | 51079 89883 12 | 38616 96982 13 | 97986 43403 14 | 106967 75552 59184 15 | 86328 82481 95555 80147 16 | 101785 47930 17 | 44232 103219 18 | 90882 72383 19 | 48145 68857 20 | 73102 55872 68283 21 | 43460 110362 22 | 46586 23 | 47320 33596 24 | 101682 72197 25 | 62885 43748 26 | 110278 44879 27 | 62574 93388 40150 68583 28 | 102046 74112 29 | 65953 111200 30 | 101365 17496 110642 53842 31 | 82777 83431 41152 44915 32 | 60341 49248 34323 95878 67486 75119 33 | 102133 112621 65989 34 | 60740 78250 62198 35 | 60392 75877 86281 36 | 67574 37 | 33856 88404 38 | 40975 39 | 97369 110949 40 | 110717 76695 110770 74156 102912 54599 42353 111450 41 | 73411 82481 72583 79520 46235 42 | 40013 42353 42958 106267 43 | 51571 51834 82481 91489 44 | 46410 
47753 45 | 81496 33252 59377 46 | 80219 72531 82632 47 | 61559 110479 71821 48 | 46352 86758 75773 49 | 105328 44427 5924 86157 50 | 82607 103402 98558 51 | 59519 47436 39332 52 | 105329 61936 53 | 74447 49248 49285 54 | 67262 62044 105677 67262 62044 105677 96886 55 | 113086 52033 56 | 69774 67486 50806 57 | 43974 96023 91015 58 | 62557 59 | 86738 96807 40429 59978 57905 60 | 99001 78599 61 | 65446 91071 50240 93962 111030 62 | 55612 111457 63 | 82620 79303 111530 102324 97353 68820 34390 112715 66631 71126 69016 64 | 97366 85132 65 | 92457 91889 66 | 111200 67486 84677 67 | 86157 93388 82481 61684 41505 70086 8468 48343 68 | 111450 93388 91851 67486 94022 38961 69 | 102133 97188 47852 70 | 60392 44792 47436 39332 71 | 109782 78596 68754 42738 72 | 71780 82481 102046 91015 65989 73 | 102503 62083 74 | 34247 44390 75 | 42771 63843 76 | 93479 77 | 110622 68820 102133 47977 82481 102133 43302 78 | 45777 102533 61690 79 | 58684 42983 80 | 102133 39983 82481 77197 34202 76695 81 | 91753 40749 82 | 47487 57873 62029 83 | 33229 44941 84 | 69805 31550 42004 85 | 79610 98398 86 | 30300 32436 71869 59978 93645 94610 106016 87 | 90013 88 | 86938 74830 44915 89 | 69359 100705 58774 78596 102889 90 | 58625 48720 82481 78623 91 | 78500 49248 80811 90144 56796 92 | 33708 42738 93 | 93788 70008 93879 102339 96015 94 | 68581 58195 59978 53338 34202 88081 95 | 69805 96470 93944 96 | 102133 72383 48169 67558 97 | 110622 68820 53187 98 | 100035 68289 45194 79365 99 | 56694 100 | 92489 84496 47977 101 | 47675 44915 102 | 74156 53113 103 | 48792 82481 44782 44145 82481 104965 88209 104 | 50265 53793 95978 105 | 54599 34202 76462 106 | 52857 107 | 57681 47478 108 | 66014 102912 48819 58131 68462 98077 59953 109 | 60120 34194 110 | 64274 69016 83392 74156 69016 111 | 80432 102046 80422 112 | 51590 94716 79520 113 | 60317 75609 79072 114 | 76897 73806 115 | 101585 95555 116 | 99306 68335 68551 117 | 91214 95878 59978 96921 53338 93388 67486 77217 118 | 102133 57803 93670 44596 119 | 69571 
74156 65063 120 | 57729 47034 101846 45930 121 | 70609 65356 122 | 71712 89029 105677 40967 99737 123 | 97478 32942 90144 56796 124 | 99856 42059 111730 125 | 101268 80644 126 | 75039 101681 127 | 81398 55245 84949 104433 128 | 68820 102133 40683 80689 51060 31550 46819 40683 129 | 93959 97650 33229 95458 130 | 61690 62697 131 | 110278 44713 132 | 93788 69773 49248 49285 94399 133 | 52165 78514 89883 134 | 47089 82481 75567 135 | 58663 58634 69640 136 | 53889 105983 96013 105677 67486 59951 137 | 68645 58676 95458 103402 44145 138 | 59451 97116 139 | 90435 56089 88388 47753 140 | 66631 72410 141 | 65814 63815 75496 142 | 104388 44850 33229 47302 143 | 102555 87083 95997 91738 144 | 78567 65741 59978 93645 145 | 84890 98474 56035 146 | 66945 112832 74156 97319 95496 102176 60392 147 | 106967 33286 148 | 38616 76506 86773 149 | 98388 63026 150 | 69800 76231 151 | 60392 82179 82481 71537 152 | 41996 110299 153 | 42585 78960 46337 154 | 104317 72842 83942 57392 155 | 49314 47382 156 | 49251 59940 67558 157 | 102133 41029 47521 64342 158 | 44297 71101 159 | 96035 63790 99413 160 | 80377 83553 71627 161 | 33385 103552 51209 162 | 32556 34202 57081 163 | 104322 103029 164 | 79660 103590 98779 87320 165 | 62029 30298 4807 96598 26877 6386 46406 47487 166 | 87869 111161 80913 68238 167 | 98289 85861 98077 168 | 88154 110278 68583 60392 169 | 97600 96472 96062 170 | 80377 112825 47089 89876 88225 171 | 86000 101610 67910 172 | 54191 58195 173 | 106830 82481 104506 76023 58520 174 | 100072 175 | 64131 51040 92214 101985 176 | 86537 60870 177 | 88435 110278 32606 178 | 81950 47436 39332 179 | 100437 87304 100018 180 | 72377 87092 181 | 42250 182 | 44241 59978 93645 67486 59451 183 | 104801 98449 184 | 65447 82481 95754 92013 80811 185 | 31550 109770 82984 102133 78408 78623 186 | 54550 67486 67203 8802 187 | 110770 84205 43628 75415 38658 61157 100705 188 | 74433 49248 101444 38817 189 | 69571 61327 190 | 56809 41152 191 | 41911 103874 192 | 50855 82984 45058 47750 193 | 72231 
54729 194 | 41173 195 | 105871 62567 62697 59978 54935 196 | 49806 65959 197 | 46444 47487 198 | 60845 91919 199 | 32007 200 | 102095 112839 201 | 43302 44961 73912 202 | 110349 45930 203 | 57771 64563 110245 96541 204 | 97919 48164 102749 205 | 94508 59978 63248 206 | 91705 73102 50393 207 | 60392 98612 101985 47427 67203 208 | 110786 86769 39667 109901 103219 2671 209 | 63254 47673 97604 210 | 82697 75944 103402 211 | 97593 34202 62207 47753 59978 46369 212 | 71089 34175 213 | 33300 76282 214 | 85795 33745 215 | 65959 80377 112825 216 | 48754 89457 50481 97213 217 | 45286 218 | 77016 65807 219 | 93959 89635 220 | 4542 84803 221 | 65543 222 | 105922 80724 60551 86294 105677 103960 223 | 69628 42585 33229 73293 224 | 106928 47521 67701 110389 225 | 96585 51814 226 | 109945 94508 82481 88549 227 | 59995 89564 49516 55913 228 | 98449 87992 69227 40277 85111 229 | 49254 65741 73764 230 | 46248 12608 60458 231 | 102133 51198 113242 232 | 45612 76695 233 | 43422 64630 234 | 86157 71692 75182 235 | 68689 62558 236 | 85853 60484 91015 67486 80377 69613 237 | 66333 96160 238 | 111542 39667 65741 239 | 65741 57563 73126 240 | 90595 97823 53778 45773 93388 79216 241 | 8859 242 | 66309 97919 243 | 84734 94508 60458 44782 244 | 34281 67486 33941 245 | 80377 69613 53287 246 | 46556 86987 247 | 69032 55929 52484 248 | 95458 60836 65741 88572 249 | 80913 46579 72575 41346 32477 250 | 70461 251 | 63837 93388 49024 78067 252 | 42353 51339 253 | 93176 254 | 97489 84852 255 | 84672 89564 256 | 67558 257 | 27281 258 | 47647 80143 78250 259 | 97863 32177 260 | 112916 61891 82620 261 | 39717 40712 64889 262 | 39749 80410 263 | 111497 49248 264 | 105871 51834 82481 54935 83069 85130 265 | 61508 109936 102679 266 | 111457 85054 267 | 33883 268 | 72160 95997 269 | 87439 82519 270 | 39363 100394 84617 271 | 57929 105065 68394 272 | 34267 273 | 75721 98492 42738 82481 49134 274 | 77175 38658 275 | 98955 97248 96035 276 | 48062 99737 93880 47223 277 | 65741 86670 278 | 85319 71012 279 | 79365 
104515 40277 66631 71573 54383 93388 79365 54383 100705 280 | 85853 64590 281 | 67567 282 | 94508 113242 44381 32606 283 | 59451 98492 53114 82481 49053 284 | 110952 80377 69613 82179 58076 9365 285 | 103393 84803 67486 47979 97986 286 | 32942 33944 87059 96541 287 | 111530 102912 77013 97353 288 | 44941 79216 45804 289 | 64169 51039 290 | 46607 100605 59978 79216 291 | 98705 45882 34202 111635 64585 292 | 32747 31550 85853 293 | 103368 63248 294 | 52853 112626 295 | 86783 72044 59439 296 | 49066 95458 297 | 44596 98492 48757 101985 298 | 43653 65886 96216 93536 299 | 63864 9072 103744 300 | 69032 84983 95868 99381 301 | 90640 102142 106822 80377 112825 302 | 47521 76492 303 | 80377 69613 80410 304 | 59951 99019 305 | 99373 67486 78960 306 | 110684 111455 79303 69453 53612 73754 307 | 43987 86092 308 | 34522 84496 49472 309 | 70624 310 | 102339 34202 74890 39919 48343 311 | 83993 48669 91087 312 | 51400 49583 313 | 106403 47089 99045 314 | 65959 55753 71627 75361 315 | 85065 89402 47930 316 | 42490 317 | 91013 102912 44347 60870 318 | 60392 88156 53847 319 | 76756 41520 104515 93388 31550 320 | 112799 41183 68820 96935 102181 102133 321 | 98819 49251 79216 322 | 95793 96987 323 | 80811 74112 324 | 40157 76848 43843 79303 101688 325 | 68271 101635 326 | 87885 64601 327 | 45967 104367 83015 60120 79315 328 | 9174 59995 57368 329 | 69553 106830 330 | 63974 331 | 84852 51834 78464 106255 332 | 54264 96107 333 | 104405 96293 48186 59978 110677 83392 81520 102265 334 | 99045 110663 102265 95217 78960 67486 69297 77095 335 | 47977 67266 336 | 44782 80410 84689 337 | 74964 64286 39332 338 | 79610 64619 9174 46410 47753 339 | 52853 65959 340 | 61566 70878 341 | 112601 79303 92489 342 | 94679 41646 32241 343 | 41650 83906 78567 344 | 55987 74044 63248 345 | 44878 92539 93143 346 | 56498 77200 347 | 39750 97650 348 | 87869 111450 349 | 60392 85801 94916 350 | 60830 351 | 60392 57206 82481 88464 111542 90847 352 | 93959 78586 353 | 69805 13974 6756 354 | 71860 355 | 86110 
45512 356 | 56640 49248 94508 53047 357 | 32804 34202 75808 358 | 86610 19275 359 | 89012 360 | 40240 74112 361 | 66014 102912 68183 31550 45860 57755 362 | 97578 111530 59978 60484 363 | 61293 82481 87731 364 | 51656 97353 365 | 60612 366 | 90144 56796 67486 63326 82713 367 | 77811 60870 368 | 93925 76278 369 | 94904 85497 370 | 102168 371 | 95831 88277 372 | 86113 96015 373 | 80811 65959 374 | 75799 65907 86157 375 | 42395 84494 376 | 8682 102476 377 | 64756 70537 378 | 68910 379 | 97356 48943 380 | 94679 381 | 89613 63041 67486 42857 66839 382 | 105922 82549 88153 87992 383 | 41650 105871 41152 384 | 69291 54520 63814 385 | 91754 74719 110639 386 | 71730 49248 58828 387 | 48928 43179 63334 388 | 89621 98558 32804 87398 83459 389 | 59451 71899 101813 47753 390 | 73414 102912 50745 93388 78250 44381 91787 391 | 69227 78542 392 | 102133 104677 78266 80410 84689 393 | 89283 65959 110507 103834 32807 394 | 112590 46758 53831 48169 395 | 99008 89437 60535 78623 396 | 80377 112825 95668 80643 47521 397 | 92739 398 | 102339 91795 82984 103402 399 | 66204 49248 49285 400 | 105922 102133 61091 52558 13828 91223 42958 401 | 93190 50806 402 | 103189 101119 85189 403 | 99378 63218 59978 31550 58181 87083 63176 404 | 71101 405 | 59451 91355 83446 406 | 85003 82481 48062 96921 86799 59978 85314 407 | 70911 408 | 82834 111457 409 | 40467 46414 410 | 53778 76438 411 | 82070 69904 80410 412 | 66246 81952 413 | 66014 102912 48510 31550 97640 95173 414 | 86157 82481 62521 63041 102133 110949 96530 415 | 103446 416 | 64855 60146 417 | 73885 44611 39332 418 | 34393 49569 91087 419 | 92007 59954 420 | 68789 95458 421 | 55364 75285 72096 33432 422 | 48731 33252 423 | 71102 58520 53718 86328 424 | 61647 34202 47415 56096 67486 102133 54523 74719 425 | 46630 106255 94508 426 | 82620 79303 75285 66631 74156 65357 97142 427 | 34281 67486 33941 89437 428 | 52672 104474 70970 429 | 54577 62065 430 | 53573 431 | 52309 44879 432 | 103552 67486 43388 433 | 91754 33634 434 | 59978 75428 82915 
80081 435 | 88154 75471 102912 47977 82481 33353 436 | 61625 103347 437 | 44893 67486 102133 32674 438 | 73783 66358 439 | 81507 75476 440 | 10647 42254 66853 441 | 110622 66945 112850 53338 61784 42284 44381 31550 102533 442 | 48669 88081 67695 443 | 48343 67486 41597 67702 444 | 52079 59451 65664 68070 41158 445 | 90953 109804 446 | 89575 60535 102133 90144 111591 447 | 32556 84936 448 | 96392 46410 449 | 102133 50714 106909 47753 78623 450 | 55245 53484 49285 451 | 110291 452 | 94680 44882 44056 57457 66113 103219 453 | 98492 40732 82481 102046 454 | 60392 102116 112832 45169 455 | 46535 86587 39212 81926 456 | 111389 457 | 41389 49248 71537 61559 458 | 110230 48030 75739 74830 459 | 69876 51553 106251 90144 98492 39842 460 | 90144 39012 461 | 51039 43703 462 | 85497 98558 91767 463 | 97425 51021 87059 464 | 47089 82481 83588 33353 465 | 65959 34202 78553 59978 93645 67486 97478 102046 466 | 57916 49248 85515 88846 90374 467 | 88374 53793 468 | 65938 79568 58828 469 | 80926 85619 470 | 85975 102535 471 | 106830 61241 40467 472 | 83918 40702 91015 88438 473 | 80443 474 | 44189 44824 475 | 46556 48087 88438 476 | 33972 80525 477 | 65768 478 | 46328 34202 102133 46758 57613 78623 479 | 61684 45612 480 | 65357 481 | 46999 96987 90144 56796 33003 482 | 96267 32199 483 | 49569 45169 40150 88323 484 | 107372 40601 23611 485 | 46406 59377 486 | 56006 39992 67486 487 | 65356 86281 488 | 43616 95458 489 | 68565 94045 40702 490 | 86066 68381 33262 491 | 106170 65745 492 | 53576 55403 493 | 79075 494 | 85577 43189 495 | 46414 93766 496 | 96392 61241 497 | 101688 46344 498 | 45147 58429 96216 96676 499 | 39485 49251 500 | 43537 501 | -------------------------------------------------------------------------------- /test/test_data/test_collection.docs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ot/partitioned_elias_fano/92c939c0840e81c58a903f5094be5f963760c3cf/test/test_data/test_collection.docs 
-------------------------------------------------------------------------------- /test/test_data/test_collection.freqs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ot/partitioned_elias_fano/92c939c0840e81c58a903f5094be5f963760c3cf/test/test_data/test_collection.freqs -------------------------------------------------------------------------------- /test/test_data/test_collection.sizes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ot/partitioned_elias_fano/92c939c0840e81c58a903f5094be5f963760c3cf/test/test_data/test_collection.sizes -------------------------------------------------------------------------------- /test/test_freq_index.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE freq_index 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "freq_index.hpp" 6 | #include "indexed_sequence.hpp" 7 | #include "partitioned_sequence.hpp" 8 | #include "positive_sequence.hpp" 9 | #include "uniform_partitioned_sequence.hpp" 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | template 18 | void test_freq_index() 19 | { 20 | quasi_succinct::global_parameters params; 21 | uint64_t universe = 20000; 22 | typedef quasi_succinct::freq_index 23 | collection_type; 24 | typename collection_type::builder b(universe, params); 25 | 26 | typedef std::vector vec_type; 27 | std::vector> posting_lists(30); 28 | for (auto& plist: posting_lists) { 29 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 30 | uint64_t n = uint64_t(universe / avg_gap); 31 | plist.first = random_sequence(universe, n, true); 32 | plist.second.resize(n); 33 | std::generate(plist.second.begin(), plist.second.end(), 34 | []() { return (rand() % 256) + 1; }); 35 | uint64_t freqs_sum = std::accumulate(plist.second.begin(), 36 | plist.second.end(), uint64_t(0)); 37 | 
38 | b.add_posting_list(n, plist.first.begin(), 39 | plist.second.begin(), freqs_sum); 40 | 41 | } 42 | 43 | { 44 | collection_type coll; 45 | b.build(coll); 46 | succinct::mapper::freeze(coll, "temp.bin"); 47 | } 48 | 49 | { 50 | collection_type coll; 51 | boost::iostreams::mapped_file_source m("temp.bin"); 52 | succinct::mapper::map(coll, m); 53 | 54 | for (size_t i = 0; i < posting_lists.size(); ++i) { 55 | auto const& plist = posting_lists[i]; 56 | auto doc_enum = coll[i]; 57 | BOOST_REQUIRE_EQUAL(plist.first.size(), doc_enum.size()); 58 | for (size_t p = 0; p < plist.first.size(); ++p, doc_enum.next()) { 59 | MY_REQUIRE_EQUAL(plist.first[p], doc_enum.docid(), 60 | "i = " << i << " p = " << p); 61 | MY_REQUIRE_EQUAL(plist.second[p], doc_enum.freq(), 62 | "i = " << i << " p = " << p); 63 | } 64 | BOOST_REQUIRE_EQUAL(coll.num_docs(), doc_enum.docid()); 65 | } 66 | } 67 | } 68 | 69 | BOOST_AUTO_TEST_CASE(freq_index) 70 | { 71 | using quasi_succinct::indexed_sequence; 72 | using quasi_succinct::strict_sequence; 73 | using quasi_succinct::positive_sequence; 74 | using quasi_succinct::partitioned_sequence; 75 | using quasi_succinct::uniform_partitioned_sequence; 76 | 77 | test_freq_index>(); 79 | 80 | test_freq_index, 81 | positive_sequence>>(); 82 | test_freq_index, 83 | positive_sequence>>(); 84 | } 85 | -------------------------------------------------------------------------------- /test/test_generic_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "succinct/test_common.hpp" 4 | #include "succinct/bit_vector.hpp" 5 | #include "util.hpp" 6 | 7 | std::vector random_sequence(size_t universe, size_t n, 8 | bool strict = true) 9 | { 10 | srand(42); 11 | std::vector seq; 12 | 13 | uint64_t u = strict ? 
(universe - n) : universe; 14 | for (size_t i = 0; i < n; ++i) { 15 | seq.push_back(rand() % u); 16 | } 17 | std::sort(seq.begin(), seq.end()); 18 | 19 | if (strict) { 20 | for (size_t i = 0; i < n; ++i) { 21 | seq[i] += i; 22 | } 23 | } 24 | 25 | return seq; 26 | } 27 | 28 | template 29 | void test_move_next(SequenceReader r, std::vector const& seq) 30 | { 31 | BOOST_REQUIRE_EQUAL(seq.size(), r.size()); 32 | if (seq.empty()) { 33 | // just check that move works 34 | BOOST_REQUIRE_EQUAL(seq.size(), r.move(seq.size()).first); 35 | return; 36 | } 37 | 38 | typename SequenceReader::value_type val; 39 | 40 | // test random access and enumeration 41 | for (uint64_t i = 0; i < seq.size(); ++i) { 42 | val = r.move(i); 43 | MY_REQUIRE_EQUAL(i, val.first, 44 | "i = " << i); 45 | MY_REQUIRE_EQUAL(seq[i], val.second, 46 | "i = " << i); 47 | 48 | if (i) { 49 | MY_REQUIRE_EQUAL(seq[i - 1], r.prev_value(), 50 | "i = " << i); 51 | } else { 52 | MY_REQUIRE_EQUAL(0, r.prev_value(), 53 | "i = " << i); 54 | } 55 | } 56 | r.move(seq.size()); 57 | BOOST_REQUIRE_EQUAL(seq.back(), r.prev_value()); 58 | 59 | val = r.move(0); 60 | for (uint64_t i = 0; i < seq.size(); ++i) { 61 | MY_REQUIRE_EQUAL(seq[i], val.second, 62 | "i = " << i); 63 | 64 | if (i) { 65 | MY_REQUIRE_EQUAL(seq[i - 1], r.prev_value(), 66 | "i = " << i); 67 | } else { 68 | MY_REQUIRE_EQUAL(0, r.prev_value(), 69 | "i = " << i); 70 | } 71 | val = r.next(); 72 | } 73 | BOOST_REQUIRE_EQUAL(r.size(), val.first); 74 | BOOST_REQUIRE_EQUAL(seq.back(), r.prev_value()); 75 | 76 | // test small skips 77 | for (size_t i = 0; i < seq.size(); ++i) { 78 | for (size_t skip = 1; skip < seq.size() - i; skip <<= 1) { 79 | auto rr = r; 80 | rr.move(i); 81 | auto val = rr.move(i + skip); 82 | MY_REQUIRE_EQUAL(i + skip, val.first, 83 | "i = " << i << " skip = " << skip); 84 | MY_REQUIRE_EQUAL(seq[i + skip], val.second, 85 | "i = " << i << " skip = " << skip); 86 | } 87 | } 88 | } 89 | 90 | template 91 | void test_next_geq(SequenceReader r, 
std::vector const& seq) 92 | { 93 | BOOST_REQUIRE_EQUAL(seq.size(), r.size()); 94 | if (seq.empty()) { 95 | // just check that next_geq works 96 | BOOST_REQUIRE_EQUAL(seq.size(), r.next_geq(1).first); 97 | return; 98 | } 99 | 100 | typename SequenceReader::value_type val; 101 | 102 | // test successor 103 | uint64_t last = 0; 104 | for (size_t i = 0; i < seq.size(); ++i) { 105 | if (seq[i] == last) continue; 106 | 107 | auto rr = r; 108 | for (size_t t = 0; t < 10; ++t) { 109 | uint64_t p = 0; 110 | switch (i) { 111 | case 0: 112 | p = last + 1; break; 113 | case 1: 114 | p = seq[i]; break; 115 | default: 116 | p = last + 1 + (rand() % (seq[i] - last)); 117 | } 118 | 119 | val = rr.next_geq(p); 120 | BOOST_REQUIRE_EQUAL(i, val.first); 121 | MY_REQUIRE_EQUAL(seq[i], val.second, 122 | "p = " << p); 123 | 124 | if (val.first) { 125 | MY_REQUIRE_EQUAL(seq[val.first - 1], rr.prev_value(), 126 | "i = " << i); 127 | } else { 128 | MY_REQUIRE_EQUAL(0, rr.prev_value(), 129 | "i = " << i); 130 | } 131 | } 132 | last = seq[i]; 133 | } 134 | 135 | val = r.next_geq(seq.back() + 1); 136 | BOOST_REQUIRE_EQUAL(r.size(), val.first); 137 | BOOST_REQUIRE_EQUAL(seq.back(), r.prev_value()); 138 | 139 | // check next_geq beyond universe 140 | val = r.next_geq(2 * seq.back() + 1); 141 | BOOST_REQUIRE_EQUAL(r.size(), val.first); 142 | 143 | // test small skips 144 | for (size_t i = 0; i < seq.size(); ++i) { 145 | for (size_t skip = 1; skip < seq.size() - i; skip <<= 1) { 146 | size_t exp_pos = i + skip; 147 | // for weakly monotone sequences, next_at returns the first of the 148 | // run of equal values 149 | while ((exp_pos > 0) && seq[exp_pos - 1] == seq[i + skip]) { 150 | exp_pos -= 1; 151 | } 152 | 153 | auto rr = r; 154 | rr.move(i); 155 | val = rr.next_geq(seq[i + skip]); 156 | MY_REQUIRE_EQUAL(exp_pos, val.first, 157 | "i = " << i << " skip = " << skip 158 | << " value expected = " << seq[i + skip] 159 | << " got = " << val.second); 160 | MY_REQUIRE_EQUAL(seq[i + skip], val.second, 
161 | "i = " << i << " skip = " << skip); 162 | } 163 | } 164 | } 165 | 166 | // oh, C++ 167 | struct no_next_geq_tag {}; 168 | struct next_geq_tag : no_next_geq_tag {}; 169 | 170 | template 171 | void test_sequence(SequenceReader r, std::vector const& seq, 172 | no_next_geq_tag const&) 173 | { 174 | test_move_next(r, seq); 175 | } 176 | 177 | template 178 | typename std::enable_if::value, void>::type 179 | test_sequence(SequenceReader r, std::vector const& seq, 180 | next_geq_tag const&) 181 | { 182 | test_move_next(r, seq); 183 | test_next_geq(r, seq); 184 | } 185 | 186 | template 187 | void test_sequence(SequenceReader r, std::vector const& seq) 188 | { 189 | test_sequence(r, seq, next_geq_tag()); 190 | } 191 | 192 | template 193 | inline void test_sequence(SequenceType, 194 | ParamsType const& params, 195 | uint64_t universe, 196 | std::vector const& seq) 197 | { 198 | succinct::bit_vector_builder bvb; 199 | SequenceType::write(bvb, seq.begin(), universe, seq.size(), params); 200 | succinct::bit_vector bv(&bvb); 201 | typename SequenceType::enumerator r(bv, 0, universe, seq.size(), params); 202 | test_sequence(r, seq); 203 | } 204 | 205 | -------------------------------------------------------------------------------- /test/test_indexed_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE indexed_sequence 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "indexed_sequence.hpp" 6 | #include 7 | #include 8 | 9 | BOOST_AUTO_TEST_CASE(indexed_sequence) 10 | { 11 | quasi_succinct::global_parameters params; 12 | 13 | std::vector avg_gaps = { 1.1, 1.9, 2.5, 3, 4, 5, 10 }; 14 | for (auto avg_gap: avg_gaps) { 15 | uint64_t n = 10000; 16 | uint64_t universe = uint64_t(n * avg_gap); 17 | auto seq = random_sequence(universe, n, true); 18 | 19 | test_sequence(quasi_succinct::indexed_sequence(), params, universe, seq); 20 | } 21 | } 22 | 
-------------------------------------------------------------------------------- /test/test_partitioned_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE partitioned_sequence 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "test_generic_sequence.hpp" 8 | #include "partitioned_sequence.hpp" 9 | #include "strict_sequence.hpp" 10 | 11 | namespace quasi_succinct { 12 | 13 | class partitioned_sequence_test { 14 | public: 15 | template 16 | static void test_construction(Enumerator& r, std::vector const& seq) 17 | { 18 | if (r.m_partitions == 1) { // nothing to test here 19 | return; 20 | } 21 | 22 | for (size_t p = 0; p < r.m_partitions; ++p) { 23 | r.switch_partition(p); 24 | 25 | uint64_t cur_begin = r.m_cur_begin; 26 | uint64_t cur_end = r.m_cur_end; 27 | 28 | uint64_t cur_base = p ? seq[cur_begin - 1] + 1 : seq[0]; 29 | uint64_t cur_upper_bound = seq[cur_end - 1]; 30 | MY_REQUIRE_EQUAL(cur_base, r.m_cur_base, 31 | "p = " << p); 32 | MY_REQUIRE_EQUAL(cur_upper_bound, r.m_cur_upper_bound, 33 | "p = " << p); 34 | 35 | for (uint64_t i = cur_begin; i < cur_end; ++i) { 36 | auto val = r.m_partition_enum.move(i - cur_begin); 37 | MY_REQUIRE_EQUAL(seq[i], cur_base + val.second, 38 | "p = " << p << " i = " << i); 39 | } 40 | } 41 | } 42 | }; 43 | } 44 | 45 | template 46 | void test_partitioned_sequence(uint64_t universe, 47 | std::vector const& seq) 48 | { 49 | quasi_succinct::global_parameters params; 50 | typedef quasi_succinct::partitioned_sequence sequence_type; 51 | 52 | succinct::bit_vector_builder bvb; 53 | sequence_type::write(bvb, seq.begin(), universe, seq.size(), params); 54 | succinct::bit_vector bv(&bvb); 55 | 56 | typename sequence_type::enumerator r(bv, 0, universe, seq.size(), params); 57 | quasi_succinct::partitioned_sequence_test::test_construction(r, seq); 58 | test_sequence(r, seq); 59 | } 60 | 61 | BOOST_AUTO_TEST_CASE(partitioned_sequence) 62 | { 63 | using 
quasi_succinct::indexed_sequence; 64 | using quasi_succinct::strict_sequence; 65 | 66 | if (boost::unit_test::framework::master_test_suite().argc == 2) { 67 | const char* filename = boost::unit_test::framework::master_test_suite().argv[1]; 68 | std::cerr << "Testing sequence from file " << filename << std::endl; 69 | std::ifstream is(filename); 70 | uint64_t v; 71 | std::vector seq; 72 | while (is >> v) { 73 | seq.push_back(v); 74 | } 75 | uint64_t universe = seq.back() + 1; 76 | test_partitioned_sequence(universe, seq); 77 | test_partitioned_sequence(universe, seq); 78 | return; 79 | } 80 | 81 | // test singleton sequences 82 | { 83 | std::vector seq; 84 | seq.push_back(0); 85 | test_partitioned_sequence(1, seq); 86 | test_partitioned_sequence(1, seq); 87 | seq[0] = 1; 88 | test_partitioned_sequence(2, seq); 89 | test_partitioned_sequence(2, seq); 90 | } 91 | 92 | std::vector avg_gaps = { 1.1, 1.9, 2.5, 3, 4, 5, 10 }; 93 | for (auto avg_gap: avg_gaps) { 94 | uint64_t n = 10000; 95 | uint64_t universe = uint64_t(n * avg_gap); 96 | auto seq = random_sequence(universe, n, true); 97 | test_partitioned_sequence(universe, seq); 98 | test_partitioned_sequence(universe, seq); 99 | } 100 | 101 | // test also short (singleton partition) sequences with large universe 102 | for (size_t i = 1; i < 512; i += 41) { 103 | uint64_t universe = 100000; 104 | uint64_t initial_gap = rand() % 50000; 105 | auto short_seq = random_sequence(universe - initial_gap, i, true); 106 | for (auto& v: short_seq) v += initial_gap; 107 | test_partitioned_sequence(universe, short_seq); 108 | test_partitioned_sequence(universe, short_seq); 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /test/test_positive_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE positive_sequence 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "positive_sequence.hpp" 6 | 
#include "partitioned_sequence.hpp" 7 | #include "uniform_partitioned_sequence.hpp" 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | template 14 | void test_positive_sequence() 15 | { 16 | srand(42); 17 | quasi_succinct::global_parameters params; 18 | size_t n = 50000; 19 | std::vector values(n); 20 | std::generate(values.begin(), values.end(), []() { return (rand() % 256) + 1; }); 21 | uint64_t universe = std::accumulate(values.begin(), values.end(), 0) + 1; 22 | 23 | typedef quasi_succinct::positive_sequence sequence_type; 24 | succinct::bit_vector_builder bvb; 25 | sequence_type::write(bvb, values.begin(), universe, values.size(), params); 26 | succinct::bit_vector bv(&bvb); 27 | typename sequence_type::enumerator r(bv, 0, universe, values.size(), params); 28 | 29 | for (size_t i = 0; i < n; ++i) { 30 | auto val = r.move(i); 31 | MY_REQUIRE_EQUAL(i, val.first, 32 | "i = " << i); 33 | MY_REQUIRE_EQUAL(values[i], val.second, 34 | "i = " << i); 35 | } 36 | } 37 | 38 | BOOST_AUTO_TEST_CASE(positive_sequence) 39 | { 40 | test_positive_sequence(); 41 | test_positive_sequence>(); 42 | test_positive_sequence>(); 43 | } 44 | -------------------------------------------------------------------------------- /test/test_ranked_queries.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE ranked_queries 2 | 3 | #include "succinct/test_common.hpp" 4 | #include 5 | 6 | #include "index_types.hpp" 7 | #include "queries.hpp" 8 | 9 | namespace quasi_succinct { namespace test { 10 | 11 | struct index_initialization { 12 | 13 | typedef single_index index_type; 14 | 15 | index_initialization() 16 | : collection("test_data/test_collection") // XXX path should be absolute 17 | , document_sizes("test_data/test_collection.sizes") 18 | , wdata(document_sizes.begin()->begin(), collection.num_docs(), collection) 19 | { 20 | index_type::builder builder(collection.num_docs(), params); 21 | for (auto const& plist: 
collection) { 22 | uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), 23 | plist.freqs.end(), uint64_t(0)); 24 | builder.add_posting_list(plist.docs.size(), plist.docs.begin(), 25 | plist.freqs.begin(), freqs_sum); 26 | } 27 | builder.build(index); 28 | 29 | term_id_vec q; 30 | std::ifstream qfile("test_data/queries"); 31 | while (read_query(q, qfile)) queries.push_back(q); 32 | } 33 | 34 | global_parameters params; 35 | binary_freq_collection collection; 36 | binary_collection document_sizes; 37 | index_type index; 38 | std::vector queries; 39 | wand_data<> wdata; 40 | 41 | template 42 | void test_against_or(QueryOp& op_q) const 43 | { 44 | ranked_or_query or_q(wdata, 10); 45 | 46 | for (auto const& q: queries) { 47 | or_q(index, q); 48 | op_q(index, q); 49 | BOOST_REQUIRE_EQUAL(or_q.topk().size(), op_q.topk().size()); 50 | for (size_t i = 0; i < or_q.topk().size(); ++i) { 51 | BOOST_REQUIRE_CLOSE(or_q.topk()[i], op_q.topk()[i], 0.1); // tolerance is % relative 52 | } 53 | } 54 | } 55 | 56 | 57 | }; 58 | 59 | }} 60 | 61 | 62 | BOOST_FIXTURE_TEST_CASE(wand, 63 | quasi_succinct::test::index_initialization) 64 | { 65 | quasi_succinct::wand_query wand_q(wdata, 10); 66 | test_against_or(wand_q); 67 | } 68 | 69 | BOOST_FIXTURE_TEST_CASE(maxscore, 70 | quasi_succinct::test::index_initialization) 71 | { 72 | quasi_succinct::maxscore_query maxscore_q(wdata, 10); 73 | test_against_or(maxscore_q); 74 | } 75 | -------------------------------------------------------------------------------- /test/test_sequence_collection.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE sequence_collection 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "sequence_collection.hpp" 6 | #include "indexed_sequence.hpp" 7 | #include "partitioned_sequence.hpp" 8 | #include "uniform_partitioned_sequence.hpp" 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | template 15 | void test_sequence_collection() 16 | { 
17 | quasi_succinct::global_parameters params; 18 | uint64_t universe = 10000; 19 | typedef quasi_succinct::sequence_collection 20 | collection_type; 21 | typename collection_type::builder b(params); 22 | 23 | std::vector> sequences(30); 24 | for (auto& seq: sequences) { 25 | double avg_gap = 1.1 + double(rand()) / RAND_MAX * 10; 26 | uint64_t n = uint64_t(universe / avg_gap); 27 | seq = random_sequence(universe, n, true); 28 | b.add_sequence(seq.begin(), seq.back() + 1, n); 29 | } 30 | 31 | { 32 | collection_type coll; 33 | b.build(coll); 34 | succinct::mapper::freeze(coll, "temp.bin"); 35 | } 36 | 37 | { 38 | collection_type coll; 39 | boost::iostreams::mapped_file_source m("temp.bin"); 40 | succinct::mapper::map(coll, m); 41 | 42 | for (size_t i = 0; i < sequences.size(); ++i) { 43 | test_sequence(coll[i], sequences[i]); 44 | } 45 | } 46 | } 47 | 48 | BOOST_AUTO_TEST_CASE(sequence_collection) 49 | { 50 | test_sequence_collection(); 51 | test_sequence_collection>(); 52 | test_sequence_collection>(); 53 | } 54 | -------------------------------------------------------------------------------- /test/test_strict_elias_fano.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE strict_elias_fano 2 | 3 | #include "test_generic_sequence.hpp" 4 | 5 | #include "strict_elias_fano.hpp" 6 | #include 7 | #include 8 | 9 | BOOST_AUTO_TEST_CASE(strict_elias_fano) 10 | { 11 | quasi_succinct::global_parameters params; 12 | 13 | uint64_t n = 10000; 14 | uint64_t universe = uint64_t(2 * n); 15 | auto seq = random_sequence(universe, n, true); 16 | 17 | test_sequence(quasi_succinct::strict_elias_fano(), params, universe, seq); 18 | } 19 | -------------------------------------------------------------------------------- /test/test_uniform_partitioned_sequence.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE uniform_partitioned_sequence 2 | 3 | #include 
"test_generic_sequence.hpp" 4 | 5 | #include "uniform_partitioned_sequence.hpp" 6 | #include "strict_sequence.hpp" 7 | #include 8 | #include 9 | 10 | BOOST_AUTO_TEST_CASE(uniform_partitioned_sequence) 11 | { 12 | quasi_succinct::global_parameters params; 13 | using quasi_succinct::indexed_sequence; 14 | using quasi_succinct::strict_sequence; 15 | 16 | // test singleton sequences 17 | std::vector short_seq; 18 | short_seq.push_back(0); 19 | test_sequence(quasi_succinct::uniform_partitioned_sequence(), 20 | params, 1, short_seq); 21 | test_sequence(quasi_succinct::uniform_partitioned_sequence(), 22 | params, 1, short_seq); 23 | short_seq[0] = 1; 24 | test_sequence(quasi_succinct::uniform_partitioned_sequence(), 25 | params, 2, short_seq); 26 | test_sequence(quasi_succinct::uniform_partitioned_sequence(), 27 | params, 2, short_seq); 28 | 29 | std::vector avg_gaps = { 1.1, 1.9, 2.5, 3, 4, 5, 10 }; 30 | for (auto avg_gap: avg_gaps) { 31 | uint64_t n = 10000; 32 | uint64_t universe = uint64_t(n * avg_gap); 33 | auto seq = random_sequence(universe, n, true); 34 | 35 | test_sequence(quasi_succinct::uniform_partitioned_sequence(), 36 | params, universe, seq); 37 | test_sequence(quasi_succinct::uniform_partitioned_sequence(), 38 | params, universe, seq); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /uniform_partitioned_sequence.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "global_parameters.hpp" 6 | #include "compact_elias_fano.hpp" 7 | #include "indexed_sequence.hpp" 8 | #include "integer_codes.hpp" 9 | #include "util.hpp" 10 | 11 | namespace quasi_succinct { 12 | 13 | template 14 | struct uniform_partitioned_sequence { 15 | 16 | typedef BaseSequence base_sequence_type; 17 | typedef typename base_sequence_type::enumerator base_sequence_enumerator; 18 | 19 | template 20 | static void write(succinct::bit_vector_builder& bvb, 21 | 
Iterator begin, 22 | uint64_t universe, uint64_t n, 23 | global_parameters const& params) 24 | { 25 | using succinct::util::ceil_div; 26 | assert(n > 0); 27 | uint64_t partition_size = uint64_t(1) << params.log_partition_size; 28 | size_t partitions = ceil_div(n, partition_size); 29 | write_gamma_nonzero(bvb, partitions); 30 | /* a single partition is written inline, without upper bounds or endpoints */ 31 | std::vector cur_partition; 32 | uint64_t cur_base = 0; 33 | if (partitions == 1) { 34 | cur_base = *begin; 35 | Iterator it = begin; 36 | /* values are stored as offsets from the first element */ 37 | for (size_t i = 0; i < n; ++i, ++it) { 38 | cur_partition.push_back(*it - cur_base); 39 | } 40 | 41 | uint64_t universe_bits = ceil_log2(universe); 42 | bvb.append_bits(cur_base, universe_bits); 43 | // write universe only if non-singleton and not tight 44 | if (n > 1) { 45 | if (cur_base + cur_partition.back() + 1 == universe) { 46 | // tight universe 47 | write_delta(bvb, 0); 48 | } else { 49 | write_delta(bvb, cur_partition.back()); 50 | } 51 | } 52 | 53 | base_sequence_type::write(bvb, cur_partition.begin(), 54 | cur_partition.back() + 1, 55 | cur_partition.size(), 56 | params); 57 | } else { 58 | succinct::bit_vector_builder bv_sequences; 59 | std::vector endpoints; 60 | std::vector upper_bounds; 61 | /* the first element doubles as the initial upper bound and as the base of partition 0 */ 62 | uint64_t cur_i = 0; 63 | Iterator it = begin; 64 | cur_base = *begin; 65 | upper_bounds.push_back(cur_base); 66 | /* each partition holds partition_size values (the last one may be shorter), stored as offsets from cur_base */ 67 | for (size_t p = 0; p < partitions; ++p) { 68 | cur_partition.clear(); 69 | uint64_t value = 0; 70 | for (; cur_i < ((p + 1) * partition_size) && cur_i < n; 71 | ++cur_i, ++it) { 72 | value = *it; 73 | cur_partition.push_back(value - cur_base); 74 | } 75 | assert(cur_partition.size() <= partition_size); 76 | assert((p == partitions - 1) 77 | || cur_partition.size() == partition_size); 78 | /* the last value read is this partition's upper bound */ 79 | uint64_t upper_bound = value; 80 | assert(cur_partition.size() > 0); 81 | base_sequence_type::write(bv_sequences, cur_partition.begin(), 82 | cur_partition.back() + 1, 83 | cur_partition.size(), // XXX skip last one?
84 | params); 85 | endpoints.push_back(bv_sequences.size()); 86 | upper_bounds.push_back(upper_bound); 87 | cur_base = upper_bound + 1; 88 | } 89 | /* partitions + 1 upper bounds, written with compact_elias_fano */ 90 | succinct::bit_vector_builder bv_upper_bounds; 91 | compact_elias_fano::write(bv_upper_bounds, upper_bounds.begin(), 92 | universe, partitions + 1, 93 | params); 94 | /* output order: endpoint width, upper bounds, endpoints, partition payloads */ 95 | uint64_t endpoint_bits = ceil_log2(bv_sequences.size() + 1); 96 | write_gamma(bvb, endpoint_bits); 97 | bvb.append(bv_upper_bounds); 98 | /* every endpoint except the last is stored, endpoint_bits each */ 99 | for (uint64_t p = 0; p < endpoints.size() - 1; ++p) { 100 | bvb.append_bits(endpoints[p], endpoint_bits); 101 | } 102 | 103 | bvb.append(bv_sequences); 104 | } 105 | } 106 | /* Cursor over a sequence produced by write(); results are (position, value) pairs. */ 107 | class enumerator { 108 | public: 109 | 110 | typedef std::pair value_type; // (position, value) 111 | 112 | enumerator() 113 | {} 114 | /* Reads the header that starts at offset in bv for a sequence of n values bounded by universe, then leaves the cursor at position size(). */ 115 | enumerator(succinct::bit_vector const& bv, uint64_t offset, 116 | uint64_t universe, uint64_t n, 117 | global_parameters const& params) 118 | : m_params(params) 119 | , m_size(n) 120 | , m_universe(universe) 121 | , m_bv(&bv) 122 | { 123 | succinct::bit_vector::enumerator it(bv, offset); 124 | m_partitions = read_gamma_nonzero(it); 125 | if (m_partitions == 1) { 126 | m_cur_partition = 0; 127 | m_cur_begin = 0; 128 | m_cur_end = n; 129 | /* single partition: base, optional universe delta, then the payload */ 130 | uint64_t universe_bits = ceil_log2(universe); 131 | m_cur_base = it.take(universe_bits); 132 | uint64_t ub = 0; /* 64-bit on purpose: "auto ub = 0" deduces int and would truncate the values assigned below */ 133 | if (n > 1) { 134 | uint64_t universe_delta = read_delta(it); 135 | ub = universe_delta ?
universe_delta : (universe - m_cur_base - 1); 136 | } 137 | 138 | m_partition_enum = base_sequence_enumerator 139 | (*m_bv, it.position(), ub + 1, n, m_params); 140 | /* ub is the largest stored offset, so the last value is m_cur_base + ub */ 141 | m_cur_upper_bound = m_cur_base + ub; 142 | } else { 143 | m_endpoint_bits = read_gamma(it); 144 | uint64_t cur_offset = it.position(); 145 | 146 | m_upper_bounds = compact_elias_fano::enumerator(bv, cur_offset, 147 | universe, m_partitions + 1, 148 | params); 149 | cur_offset += compact_elias_fano::offsets(0, universe, 150 | m_partitions + 1, 151 | params).end; 152 | /* m_partitions - 1 fixed-width endpoints follow the upper bounds, then the payloads */ 153 | m_endpoints_offset = cur_offset; 154 | uint64_t endpoints_size = m_endpoint_bits * (m_partitions - 1); 155 | cur_offset += endpoints_size; 156 | 157 | m_sequences_offset = cur_offset; 158 | } 159 | /* begin positioned at size() */ 160 | m_position = size(); 161 | slow_move(); 162 | } 163 | /* Moves to position (at most size()) and returns (position, value); positions outside the loaded partition go through slow_move(). */ 164 | value_type QS_ALWAYSINLINE move(uint64_t position) 165 | { 166 | assert(position <= size()); 167 | m_position = position; 168 | 169 | if (m_position >= m_cur_begin && m_position < m_cur_end) { 170 | uint64_t val = m_cur_base + m_partition_enum.move(m_position - m_cur_begin).second; 171 | return value_type(m_position, val); 172 | } 173 | 174 | return slow_move(); 175 | } 176 | /* Forwards to the loaded partition's next_geq when lower_bound lies in [m_cur_base, m_cur_upper_bound], otherwise to slow_next_geq(). */ 177 | // note: this is instantiated only if BaseSequence has next_geq 178 | value_type QS_ALWAYSINLINE next_geq(uint64_t lower_bound) 179 | { 180 | if (QS_LIKELY(lower_bound >= m_cur_base && lower_bound <= m_cur_upper_bound)) { 181 | auto val = m_partition_enum.next_geq(lower_bound - m_cur_base); 182 | m_position = m_cur_begin + val.first; 183 | return value_type(m_position, m_cur_base + val.second); 184 | } 185 | return slow_next_geq(lower_bound); 186 | } 187 | /* Advances one position; slow_next() takes over once the loaded partition is exhausted. */ 188 | value_type QS_ALWAYSINLINE next() 189 | { 190 | ++m_position; 191 | 192 | if (QS_LIKELY(m_position < m_cur_end)) { 193 | uint64_t val = m_cur_base + m_partition_enum.next().second; 194 | return value_type(m_position, val); 195 | } 196 | return slow_next(); 197 | } 198 | /* Number of elements in the sequence (the n given to the constructor). */ 199 | uint64_t size() const 200 | { 201 | return m_size; 202 | }
203 | /* Value just before the current position: m_cur_base - 1 at the start of a later partition, 0 at the start of the first. */ 204 | uint64_t prev_value() const 205 | { 206 | if (QS_UNLIKELY(m_position == m_cur_begin)) { 207 | return m_cur_partition ? m_cur_base - 1 : 0; 208 | } else { 209 | return m_cur_base + m_partition_enum.prev_value(); 210 | } 211 | } 212 | 213 | private: 214 | 215 | // the compiler does not seem smart enough to figure out that this 216 | // is a very unlikely condition, and inlines the move(0) inside the 217 | // next(), causing the code to grow. Since next is called in very 218 | // tight loops, on microbenchmarks this causes an improvement of 219 | // about 3ns on my i7 3Ghz 220 | value_type QS_NOINLINE slow_next() 221 | { 222 | if (QS_UNLIKELY(m_position == m_size)) { 223 | assert(m_cur_partition == m_partitions - 1); 224 | auto val = m_partition_enum.next(); 225 | assert(val.first == m_partition_enum.size()); (void)val; 226 | return value_type(m_position, m_universe); 227 | } 228 | /* not at the end: continue with the first element of the next partition */ 229 | switch_partition(m_cur_partition + 1); 230 | uint64_t val = m_cur_base + m_partition_enum.move(0).second; 231 | return value_type(m_position, val); 232 | } 233 | /* Slow path of move(): handles position == size() (m_universe is returned as the value) and positions outside the loaded partition. */ 234 | value_type QS_NOINLINE slow_move() 235 | { 236 | if (m_position == size()) { 237 | if (m_partitions > 1) { 238 | switch_partition(m_partitions - 1); 239 | } 240 | m_partition_enum.move(m_partition_enum.size()); 241 | return value_type(m_position, m_universe); 242 | } 243 | uint64_t partition = m_position >> m_params.log_partition_size; 244 | switch_partition(partition); 245 | uint64_t val = m_cur_base + m_partition_enum.move(m_position - m_cur_begin).second; 246 | return value_type(m_position, val); 247 | } 248 | /* Slow path of next_geq(), taken when lower_bound is outside [m_cur_base, m_cur_upper_bound]. */ 249 | value_type QS_NOINLINE slow_next_geq(uint64_t lower_bound) 250 | { 251 | if (m_partitions == 1) { 252 | if (lower_bound < m_cur_base) { 253 | return move(0); 254 | } else { 255 | return move(size()); 256 | } 257 | } 258 | /* look the target partition up in the upper bounds */ 259 | auto ub_it = m_upper_bounds.next_geq(lower_bound); 260 | if (ub_it.first == 0) { 261 | return move(0); 262 | } 263 | 264 | if (ub_it.first == m_upper_bounds.size()) {
265 | return move(size()); 266 | } 267 | /* load the partition that precedes the found upper bound and retry there */ 268 | switch_partition(ub_it.first - 1); 269 | return next_geq(lower_bound); 270 | } 271 | /* Makes partition the loaded one: recomputes its position range, base and upper bound and re-creates m_partition_enum. Requires more than one partition. */ 272 | void switch_partition(uint64_t partition) 273 | { 274 | assert(m_partitions > 1); 275 | /* partition 0 starts at bit offset 0 of the payloads; later ones start at the stored endpoint of their predecessor */ 276 | uint64_t endpoint = partition 277 | ? m_bv->get_bits(m_endpoints_offset + 278 | (partition - 1) * m_endpoint_bits, 279 | m_endpoint_bits) 280 | : 0; 281 | m_bv->data().prefetch((m_sequences_offset + endpoint) / 64); 282 | 283 | m_cur_partition = partition; 284 | m_cur_begin = partition << m_params.log_partition_size; 285 | m_cur_end = std::min(size(), (partition + 1) << m_params.log_partition_size); 286 | /* base = previous upper bound + 1, except for partition 0 where it is the first upper bound itself */ 287 | auto ub_it = m_upper_bounds.move(partition + 1); 288 | m_cur_upper_bound = ub_it.second; 289 | m_cur_base = m_upper_bounds.prev_value() + (partition ? 1 : 0); 290 | 291 | m_partition_enum = base_sequence_enumerator 292 | (*m_bv, m_sequences_offset + endpoint, 293 | m_cur_upper_bound - m_cur_base + 1, 294 | m_cur_end - m_cur_begin, 295 | m_params); 296 | } 297 | /* sequence parameters and layout */ 298 | global_parameters m_params; 299 | uint64_t m_partitions; 300 | uint64_t m_endpoints_offset; 301 | uint64_t m_endpoint_bits; 302 | uint64_t m_sequences_offset; 303 | uint64_t m_size; 304 | uint64_t m_universe; 305 | /* cursor state for the loaded partition */ 306 | uint64_t m_position; 307 | uint64_t m_cur_partition; 308 | uint64_t m_cur_begin; 309 | uint64_t m_cur_end; 310 | uint64_t m_cur_base; 311 | uint64_t m_cur_upper_bound; 312 | 313 | succinct::bit_vector const* m_bv; 314 | compact_elias_fano::enumerator m_upper_bounds; 315 | base_sequence_enumerator m_partition_enum; 316 | }; 317 | }; 318 | } 319 | -------------------------------------------------------------------------------- /util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "succinct/broadword.hpp" 12 | /* branch-prediction and inlining hints built on compiler builtins and attributes */ 13 | #define QS_LIKELY(x) __builtin_expect(!!(x), 1) 14 | #define
QS_UNLIKELY(x) __builtin_expect(!!(x), 0) 15 | #define QS_NOINLINE __attribute__((noinline)) 16 | #define QS_ALWAYSINLINE __attribute__((always_inline)) 17 | /* flatten is only requested on GCC proper; every other compiler falls back to QS_ALWAYSINLINE */ 18 | #if defined(__GNUC__) && !defined(__clang__) 19 | # define QS_FLATTEN_FUNC __attribute__((always_inline,flatten)) 20 | #else 21 | # define QS_FLATTEN_FUNC QS_ALWAYSINLINE 22 | #endif 23 | 24 | namespace quasi_succinct { 25 | /* Ceiling of log2(x), computed as msb(x - 1) + 1, or 0 when x is 1; x must be positive. */ 26 | inline uint64_t ceil_log2(const uint64_t x) { 27 | assert(x > 0); 28 | return (x > 1) ? succinct::broadword::msb(x - 1) + 1 : 0; 29 | } 30 | /* Writes the current local time formatted with "%F %T", followed by ": ", to std::cerr and returns std::cerr so the caller can append the message. */ 31 | inline std::ostream& logger() 32 | { 33 | time_t t = std::time(nullptr); 34 | // XXX(ot): put_time unsupported in g++ 4.7 35 | // return std::cerr 36 | // << std::put_time(std::localtime(&t), "%F %T") 37 | // << ": "; 38 | std::locale loc; 39 | const std::time_put& tp = 40 | std::use_facet>(loc); 41 | const char *fmt = "%F %T"; 42 | tp.put(std::cerr, std::cerr, ' ', 43 | std::localtime(&t), fmt, fmt + strlen(fmt)); 44 | return std::cerr << ": "; 45 | } 46 | /* Time reported by gettimeofday(), in microseconds. */ 47 | inline double get_time_usecs() { 48 | timeval tv; 49 | gettimeofday(&tv, NULL); 50 | return double(tv.tv_sec) * 1000000 + double(tv.tv_usec); 51 | } 52 | /* User CPU time reported by getrusage(RUSAGE_SELF), in microseconds. */ 53 | inline double get_user_time_usecs() { 54 | rusage ru; 55 | getrusage(RUSAGE_SELF, &ru); 56 | return double(ru.ru_utime.tv_sec) * 1000000 + double(ru.ru_utime.tv_usec); 57 | } 58 | /* Passes datum through an empty asm statement as a read-write register operand. */ 59 | // stolen from folly 60 | template 61 | inline void do_not_optimize_away(T&& datum) { 62 | asm volatile("" : "+r" (datum)); 63 | } 64 | /* Compile-time trait; value is chosen by which test() overload is selected. NOTE(review): presumably detects a next_geq member, but the template arguments are missing from this listing - confirm against the original header. */ 65 | template 66 | struct has_next_geq 67 | { 68 | template struct sfinae {}; 69 | template static char test(sfinae); 70 | template static int test(...); 71 | enum { value = sizeof(test(0)) == sizeof(char) }; 72 | }; 73 | 74 | // A more powerful version of boost::function_input_iterator that also works 75 | // with lambdas. 76 | // 77 | // Important: the functors must be stateless, otherwise the behavior is 78 | // undefined.
79 | template 80 | class function_iterator 81 | : public std::iterator::type> { 83 | 84 | public: 85 | function_iterator() 86 | {} 87 | /* iteration starts from initial_state */ 88 | function_iterator(State initial_state) 89 | : m_state(initial_state) 90 | {} 91 | 92 | friend inline 93 | void swap(function_iterator& lhs, function_iterator& rhs) 94 | { 95 | using std::swap; 96 | swap(lhs.m_state, rhs.m_state); 97 | } 98 | 99 | // XXX why isn't this inherited from std::iterator? 100 | typedef typename std::result_of::type value_type; 101 | 102 | value_type operator*() const 103 | { 104 | // XXX I do not know if this trick is legal for stateless lambdas, 105 | // but it seems to work on GCC and Clang 106 | return (*static_cast(nullptr))(m_state); 107 | } 108 | 109 | function_iterator& operator++() 110 | { 111 | (*static_cast(nullptr))(m_state); 112 | return *this; 113 | } 114 | 115 | function_iterator operator++(int) 116 | { 117 | function_iterator it(*this); 118 | operator++(); 119 | return it; 120 | } 121 | /* two iterators are equal when their states compare equal */ 122 | bool operator==(function_iterator const& other) const 123 | { 124 | return m_state == other.m_state; 125 | } 126 | 127 | bool operator!=(function_iterator const& other) const 128 | { 129 | return !(*this == other); 130 | } 131 | 132 | private: 133 | State m_state; 134 | }; 135 | /* Builds a function_iterator from initial_state; the two functor arguments are unnamed and unused in the body. */ 136 | template 137 | function_iterator 138 | make_function_iterator(State initial_state, AdvanceFunctor, ValueFunctor) 139 | { 140 | return function_iterator(initial_state); 141 | } 142 | 143 | /* Prints one brace-delimited line of key: value entries to std::cout: the opening brace on construction, the closing brace and std::endl on destruction. */ 144 | struct stats_line { 145 | stats_line() 146 | : first(true) 147 | { 148 | std::cout << "{"; 149 | } 150 | 151 | ~stats_line() 152 | { 153 | std::cout << "}" << std::endl; 154 | } 155 | /* Adds one entry as key: value, writing ", " before every entry but the first; returns *this so calls can be chained. */ 156 | template 157 | stats_line& operator()(K const& key, T const& value) 158 | { 159 | if (!first) { 160 | std::cout << ", "; 161 | } else { 162 | first = false; 163 | } 164 | 165 | emit(key); 166 | std::cout << ": "; 167 | emit(value); 168 | return *this; 169 | } 170 | 171 | private: 172 | /* generic values are streamed as-is */ 173 | template 174 | void emit(T const& v) const 175 | {
176 | std::cout << v; 177 | } 178 | 179 | // XXX properly escape strings 180 | void emit(const char* s) const 181 | { 182 | std::cout << '"' << s << '"'; 183 | } 184 | /* std::string goes through the const char* overload and is quoted the same way */ 185 | void emit(std::string const& s) const 186 | { 187 | emit(s.c_str()); 188 | } 189 | /* true until the first entry has been written */ 190 | bool first; 191 | }; 192 | } 193 | -------------------------------------------------------------------------------- /wand_data.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "binary_freq_collection.hpp" 6 | #include "bm25.hpp" 7 | #include "util.hpp" 8 | 9 | namespace quasi_succinct { 10 | /* Holds one normalized length per document and one maximum Scorer::doc_term_weight per posting list. */ 11 | template 12 | class wand_data { 13 | public: 14 | wand_data() 15 | {} 16 | /* Reads num_docs lengths from len_it, divides each by their average, then scans coll to record the highest score of every list. */ 17 | template 18 | wand_data(LengthsIterator len_it, uint64_t num_docs, 19 | binary_freq_collection const& coll) 20 | { 21 | std::vector norm_lens(num_docs); 22 | double lens_sum = 0; 23 | logger() << "Reading sizes..." << std::endl; 24 | for (size_t i = 0; i < num_docs; ++i) { 25 | float len = *len_it++; 26 | norm_lens[i] = len; 27 | lens_sum += len; 28 | } 29 | float avg_len = float(lens_sum / double(num_docs)); 30 | for (size_t i = 0; i < num_docs; ++i) { 31 | norm_lens[i] /= avg_len; 32 | } 33 | 34 | logger() << "Storing max weight for each list..."
<< std::endl; 35 | std::vector max_term_weight; 36 | for (auto const& seq: coll) { 37 | float max_score = 0; 38 | for (size_t i = 0; i < seq.docs.size(); ++i) { 39 | uint64_t docid = *(seq.docs.begin() + i); 40 | uint64_t freq = *(seq.freqs.begin() + i); 41 | float score = Scorer::doc_term_weight(freq, norm_lens[docid]); 42 | max_score = std::max(max_score, score); 43 | } 44 | max_term_weight.push_back(max_score); 45 | if ((max_term_weight.size() % 1000000) == 0) { 46 | logger() << max_term_weight.size() << " list processed" << std::endl; 47 | } 48 | } 49 | logger() << max_term_weight.size() << " list processed" << std::endl; 50 | /* hand both temporaries to the mappable members via steal() */ 51 | m_norm_lens.steal(norm_lens); 52 | m_max_term_weight.steal(max_term_weight); 53 | } 54 | /* Length of document doc_id divided by the average length. */ 55 | float norm_len(uint64_t doc_id) const 56 | { 57 | return m_norm_lens[doc_id]; 58 | } 59 | /* Highest score recorded for the term_id-th list of the collection. */ 60 | float max_term_weight(uint64_t term_id) const 61 | { 62 | return m_max_term_weight[term_id]; 63 | } 64 | /* Exchanges both vectors with other. */ 65 | void swap(wand_data& other) 66 | { 67 | m_norm_lens.swap(other.m_norm_lens); 68 | m_max_term_weight.swap(other.m_max_term_weight); 69 | } 70 | /* Passes both vectors to the visitor together with their names. */ 71 | template 72 | void map(Visitor& visit) 73 | { 74 | visit 75 | (m_norm_lens, "m_norm_lens") 76 | (m_max_term_weight, "m_max_term_weight") 77 | ; 78 | } 79 | 80 | private: 81 | succinct::mapper::mappable_vector m_norm_lens; 82 | succinct::mapper::mappable_vector m_max_term_weight; 83 | }; 84 | 85 | } 86 | --------------------------------------------------------------------------------