├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── external └── external.cmake ├── include └── rgb │ ├── rgb.hpp │ └── util.hpp ├── src ├── CMakeLists.txt └── rgb.cpp └── test └── CMakeLists.txt /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: WebKit 2 | IndentWidth: 4 3 | Language: Cpp 4 | DerivePointerAlignment: false 5 | PointerAlignment: Left 6 | ColumnLimit: 80 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/tbb"] 2 | path = external/tbb 3 | url = https://github.com/01org/tbb.git 4 | [submodule "external/parallelstl"] 5 | path = external/parallelstl 6 | url = https://github.com/intel/parallelstl.git 7 | [submodule "external/CLI11"] 8 | path = external/CLI11 9 | url = https://github.com/CLIUtils/CLI11.git 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | if (NOT CMAKE_BUILD_TYPE) 4 | message(STATUS "No build type selected, default to Release") 5 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) 6 | endif() 7 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} ) 8 | 9 | project(rgb) 10 | 11 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 12 | 13 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 14 | if (CXX_COMPILER_VERSION VERSION_LESS 4.7) 15 | message(STATUS "GCC version must be at least 4.7!") 16 | endif() 17 | set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -fcilkplus -O3 -lm -DNDEBUG -std=c++17 -DHAVE_CXX0X -march=native") 18 | set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -fcilkplus -ggdb -lm -std=c++17 -DHAVE_CXX0X -march=native") 19 | elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 20 | if (CXX_COMPILER_VERSION VERSION_LESS 4.2.1) 21 | message(STATUS "Clang version must be at least 4.2.1!" ) 22 | endif() 23 | set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -fcilkplus -O3 -DNDEBUG -std=c++17 -DHAVE_CXX0X -msse4.1 -march=native") 24 | set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -fcilkplus -ggdb -std=c++17 -DHAVE_CXX0X -msse4.1 -march=native") 25 | else () 26 | message(FATAL_ERROR "Please, use GCC or Clang compiler!") 27 | endif() 28 | 29 | include_directories(include) 30 | add_library(rgb INTERFACE) 31 | target_include_directories(rgb INTERFACE 32 | $ 33 | ) 34 | 35 | include(external/external.cmake) 36 | 37 | add_subdirectory(src) 38 | 39 | 40 | enable_testing() 41 | add_subdirectory(test) 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Matthias Petri 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recursive graph bisection 2 | 3 | This program implements the following graph reordering technique: 4 | 5 | - Laxman Dhulipala, Igor Kabiljo, Brian Karrer, Giuseppe Ottaviano, Sergey Pupyrev, Alon Shalita: 6 | Compressing Graphs and Indexes with Recursive Graph Bisection. KDD 2016: 1535-1544 7 | 8 | 9 | ## Requirements 10 | 11 | The program requires `cilkplus` which should be available in all newish compilers (e.g. 
gcc7) 12 | 13 | ## Installing and Compiling 14 | 15 | To install and compile the code run the following commands 16 | 17 | ``` 18 | git clone https://github.com/mpetri/recursive_graph_bisection.git 19 | ``` 20 | 21 | And compile using `make`: 22 | 23 | ``` 24 | make 25 | ``` 26 | 27 | which produces the `rec_graph_bisect.x` binary. 28 | 29 | 30 | ## Input format 31 | 32 | We use the [ds2i](https://github.com/ot/ds2i) input format (description taken from the repo): 33 | 34 | 35 | A _binary sequence_ is a sequence of integers prefixed by its length, where both 36 | the sequence integers and the length are written as 32-bit little-endian 37 | unsigned integers. 38 | 39 | A _collection_ consists of 3 files, `.docs`, `.freqs`, 40 | `.sizes`. 41 | 42 | * `.docs` starts with a singleton binary sequence where its only 43 | integer is the number of documents in the collection. It is then followed by 44 | one binary sequence for each posting list, in order of term-ids. Each posting 45 | list contains the sequence of document-ids containing the term. 46 | 47 | * `.freqs` is composed of a one binary sequence per posting list, where 48 | each sequence contains the occurrence counts of the postings, aligned with the 49 | previous file (note however that this file does not have an additional 50 | singleton list at its beginning). 51 | 52 | * `.sizes` is composed of a single binary sequence whose length is the 53 | same as the number of documents in the collection, and the i-th element of the 54 | sequence is the size (number of terms) of the i-th document. 55 | 56 | 57 | ## Running command 58 | 59 | To reorder the index the following options are provided: 60 | 61 | `rec_graph_bisect.x ` 62 | 63 | where 64 | 65 | * `ds2i_prefix` is the `` specified above 66 | 67 | * `ds2i_out_prefix` is the output prefix where the reordered index should be stored 68 | 69 | * `min_list_len` specifies a minimum list threshold which should be ignored during reordering. 
This does not mean the lists will be lost. Lists below the threshold are just not considered in the reordering phase but will still appear in the final output. 70 | 71 | * `num_threads` specifies the number of threads to use during computation 72 | 73 | ## Example 74 | 75 | Say you have stored `gov2` in `ds2i` format described above in a directory: 76 | 77 | 78 | ``` 79 | [10:56:28 mpetri]$ ls -l /storage/gov2-d2si/ 80 | total 43084248 81 | -rw-r--r-- 1 mpetri mpetri 21765004632 Jul 18 14:37 gov2.docs 82 | -rw-r--r-- 1 mpetri mpetri 21765004624 Jul 18 14:37 gov2.freqs 83 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 14:37 gov2.sizes 84 | ``` 85 | 86 | so the `ds2i_prefix` would be `/storage/gov2-ds2i/gov2` 87 | 88 | and we execute the bisection command with the parameters 89 | 90 | ``` 91 | rec_graph_bisect.x /storage/gov2-ds2i/gov2 /storage/gov2-ds2i/gov2-bisected 256 32 92 | ``` 93 | 94 | which uses `32` threads, a minimum list length of `256` and stores the result as: 95 | 96 | ``` 97 | [10:56:28 mpetri]$ ls -l /storage/gov2-d2si/ 98 | total 43084248 99 | -rw-r--r-- 1 mpetri mpetri 21765004632 Jul 18 14:37 gov2.docs 100 | -rw-r--r-- 1 mpetri mpetri 21765004624 Jul 18 14:37 gov2.freqs 101 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 14:37 gov2.sizes 102 | -rw-r--r-- 1 mpetri mpetri 21765004632 Jul 18 18:32 gov2-bisected.docs 103 | -rw-r--r-- 1 mpetri mpetri 21765004624 Jul 18 18:32 gov2-bisected.freqs 104 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 18:32 gov2-bisected.sizes 105 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 18:32 gov2-bisected.mapping 106 | ``` 107 | 108 | where `gov2-bisected.mapping` specifies how the document identifiers were remapped in the following format: 109 | 110 | ``` 111 | 112 | ``` 113 | 114 | ## Runtime 115 | 116 | The code is not as optimized as in the paper but finishes in reasonable time frame. For example, `gov2` 117 | can be reordered in less than two hours. 118 | 119 | However, memory consumption is quite high. 
It requires at least O(size of input files) RAM. 120 | 121 | ## Authors 122 | 123 | * **Matthias Petri** - [mpetri](https://github.com/mpetri) 124 | 125 | * **Joel Mackenzie** - [JMMackenzie](https://github.com/JMMackenzie) 126 | 127 | ## License 128 | 129 | This project is licensed under the BSD 3-Clause License - see the [LICENSE.md](LICENSE.md) file for details 130 | -------------------------------------------------------------------------------- /external/external.cmake: -------------------------------------------------------------------------------- 1 | EXECUTE_PROCESS(COMMAND git submodule update --init 2 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. 3 | OUTPUT_QUIET 4 | ) 5 | 6 | 7 | # Add CLI11 8 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/CLI11 EXCLUDE_FROM_ALL) 9 | 10 | # Add TBB 11 | set(TBB_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb/) 12 | include(${TBB_ROOT}/cmake/TBBBuild.cmake) 13 | tbb_build( 14 | TBB_ROOT ${TBB_ROOT} 15 | CONFIG_DIR TBB_DIR) 16 | 17 | # Add ParallelSTL 18 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/parallelstl EXCLUDE_FROM_ALL) 19 | -------------------------------------------------------------------------------- /include/rgb/rgb.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "util.hpp" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace constants { 19 | const int MAX_ITER = 20; 20 | const uint64_t PARALLEL_SWITCH_DEPTH = 6; 21 | } 22 | 23 | struct docid_node { 24 | uint64_t initial_id; 25 | uint32_t* terms; 26 | uint32_t* freqs; 27 | size_t num_terms; 28 | size_t num_terms_not_pruned; 29 | }; 30 | 31 | std::vector log2_precomp; 32 | 33 | float log2_cmp(uint32_t idx) 34 | { 35 | if (idx < 256) 36 | return log2_precomp[idx]; 37 | return log2f(idx); 38 | } 39 | 40 | void swap_nodes(docid_node* a, docid_node* 
b) 41 | { 42 | std::swap(a->initial_id, b->initial_id); 43 | std::swap(a->terms, b->terms); 44 | std::swap(a->freqs, b->freqs); 45 | std::swap(a->num_terms, b->num_terms); 46 | std::swap(a->num_terms_not_pruned, b->num_terms_not_pruned); 47 | } 48 | 49 | void swap_nodes(docid_node* a, docid_node* b, std::vector& deg1, 50 | std::vector& deg2, std::vector& queries_changed) 51 | { 52 | { 53 | size_t n = a->num_terms / 4; 54 | size_t m = a->num_terms % 4; 55 | for (size_t i = 0; i < n * 4; i+=4) { 56 | auto q0 = a->terms[i]; 57 | auto q1 = a->terms[i + 1]; 58 | auto q2 = a->terms[i + 2]; 59 | auto q3 = a->terms[i + 3]; 60 | __m128i _one = _mm_set1_epi32(1); 61 | 62 | { 63 | __m128i _deg1 = _mm_set_epi32(deg1[q0], deg1[q1], deg1[q2], deg1[q3]); 64 | __m128i _result = _mm_sub_epi32(_deg1, _one); 65 | 66 | deg1[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 67 | deg1[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 68 | deg1[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 69 | deg1[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 70 | } 71 | { 72 | __m128i _deg2 = _mm_set_epi32(deg2[q0], deg2[q1], deg2[q2], deg2[q3]); 73 | __m128i _result = _mm_add_epi32(_deg2, _one); 74 | deg2[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 75 | deg2[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 76 | deg2[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 77 | deg2[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 78 | 79 | } 80 | queries_changed[q0] = _mm_cvtsi128_si32(_one); 81 | queries_changed[q1] = _mm_cvtsi128_si32(_one); 82 | queries_changed[q2] = _mm_cvtsi128_si32(_one); 83 | queries_changed[q3] = _mm_cvtsi128_si32(_one); 84 | } 85 | for (size_t i = 0; i < m; i++) { 86 | auto qry = a->terms[n * 4 + i]; 87 | deg1[qry]--; 88 | deg2[qry]++; 89 | 
queries_changed[qry] = 1; 90 | } 91 | } 92 | { 93 | size_t n = b->num_terms / 4; 94 | size_t m = b->num_terms % 4; 95 | for (size_t i = 0; i < n * 4; i+=4) { 96 | auto q0 = b->terms[i]; 97 | auto q1 = b->terms[i + 1]; 98 | auto q2 = b->terms[i + 2]; 99 | auto q3 = b->terms[i + 3]; 100 | __m128i _one = _mm_set1_epi32(1); 101 | 102 | { 103 | __m128i _deg1 = _mm_set_epi32(deg1[q0], deg1[q1], deg1[q2], deg1[q3]); 104 | __m128i _result = _mm_add_epi32(_deg1, _one); 105 | deg1[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 106 | deg1[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 107 | deg1[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 108 | deg1[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 109 | } 110 | { 111 | __m128i _deg2 = _mm_set_epi32(deg2[q0], deg2[q1], deg2[q2], deg2[q3]); 112 | __m128i _result = _mm_sub_epi32(_deg2, _one); 113 | deg2[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 114 | deg2[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 115 | deg2[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 116 | deg2[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 117 | 118 | } 119 | queries_changed[q0] = _mm_cvtsi128_si32(_one); 120 | queries_changed[q1] = _mm_cvtsi128_si32(_one); 121 | queries_changed[q2] = _mm_cvtsi128_si32(_one); 122 | queries_changed[q3] = _mm_cvtsi128_si32(_one); 123 | } 124 | for (size_t i = 0; i < m; i++) { 125 | auto qry = b->terms[n * 4 + i]; 126 | deg1[qry]++; 127 | deg2[qry]--; 128 | queries_changed[qry] = 1; 129 | } 130 | } 131 | swap_nodes(a, b); 132 | } 133 | 134 | struct bipartite_graph { 135 | size_t num_queries; 136 | size_t num_docs; 137 | size_t num_docs_inc_empty; 138 | std::vector graph; 139 | std::vector doc_contents; 140 | std::vector doc_freqs; 141 | }; 142 | 143 | struct partition_t { 
144 | docid_node* V1; 145 | docid_node* V2; 146 | size_t n1; 147 | size_t n2; 148 | }; 149 | 150 | void compute_doc_sizes(inverted_index& idx, std::vector& doc_sizes, 151 | std::vector& doc_sizes_non_pruned, uint32_t min_doc_id, 152 | uint32_t max_doc_id, size_t min_list_len) 153 | { 154 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 155 | const auto& plist = idx.docids[termid]; 156 | for (const auto& doc_id : plist) { 157 | if (min_doc_id <= doc_id && doc_id < max_doc_id) { 158 | if (plist.size() >= min_list_len) { 159 | doc_sizes[doc_id] += 1; 160 | } 161 | doc_sizes_non_pruned[doc_id] += 1; 162 | } 163 | } 164 | } 165 | } 166 | 167 | void create_graph(bipartite_graph& bg, inverted_index& idx, uint32_t min_doc_id, 168 | uint32_t max_doc_id, size_t min_list_len) 169 | { 170 | std::vector doc_offset(idx.max_doc_id + 1, 0); 171 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 172 | const auto& dlist = idx.docids[termid]; 173 | const auto& flist = idx.freqs[termid]; 174 | if (dlist.size() >= min_list_len) { 175 | for (size_t pos = 0; pos < dlist.size(); pos++) { 176 | const auto& doc_id = dlist[pos]; 177 | if (min_doc_id <= doc_id && doc_id < max_doc_id) { 178 | bg.graph[doc_id].initial_id = doc_id; 179 | bg.graph[doc_id].freqs[doc_offset[doc_id]] = flist[pos]; 180 | bg.graph[doc_id].terms[doc_offset[doc_id]++] = termid; 181 | } 182 | } 183 | } 184 | } 185 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 186 | const auto& dlist = idx.docids[termid]; 187 | const auto& flist = idx.freqs[termid]; 188 | if (dlist.size() < min_list_len) { 189 | for (size_t pos = 0; pos < dlist.size(); pos++) { 190 | const auto& doc_id = dlist[pos]; 191 | if (min_doc_id <= doc_id && doc_id < max_doc_id) { 192 | bg.graph[doc_id].initial_id = doc_id; 193 | bg.graph[doc_id].freqs[doc_offset[doc_id]] = flist[pos]; 194 | bg.graph[doc_id].terms[doc_offset[doc_id]++] = termid; 195 | } 196 | } 197 | } 198 | } 199 | } 200 | 201 | bipartite_graph 
construct_bipartite_graph( 202 | inverted_index& idx, size_t min_list_len) 203 | { 204 | timer t("construct_bipartite_graph"); 205 | bipartite_graph bg; 206 | bg.num_queries = idx.size(); 207 | { 208 | timer t("determine doc sizes"); 209 | size_t workers = __cilkrts_get_nworkers(); 210 | std::vector doc_sizes(idx.max_doc_id + 1); 211 | std::vector doc_sizes_non_pruned(idx.max_doc_id + 1); 212 | std::vector> tmp_doc_sizes(workers); 213 | std::vector> tmp_doc_sizes_non_pruned(workers); 214 | for (auto& v : tmp_doc_sizes) 215 | v.resize(idx.max_doc_id + 1); 216 | for (auto& v : tmp_doc_sizes_non_pruned) 217 | v.resize(idx.max_doc_id + 1); 218 | size_t doc_ids_in_slice = idx.max_doc_id / workers; 219 | for (size_t id = 0; id < workers; id++) { 220 | size_t min_doc_id = id * doc_ids_in_slice; 221 | size_t max_doc_id = min_doc_id + doc_ids_in_slice; 222 | if (id + 1 == workers) { 223 | max_doc_id = idx.max_doc_id + 1; 224 | compute_doc_sizes(idx, tmp_doc_sizes[id], 225 | tmp_doc_sizes_non_pruned[id], min_doc_id, max_doc_id, 226 | min_list_len); 227 | } else { 228 | cilk_spawn compute_doc_sizes(idx, tmp_doc_sizes[id], 229 | tmp_doc_sizes_non_pruned[id], min_doc_id, max_doc_id, 230 | min_list_len); 231 | } 232 | } 233 | cilk_sync; 234 | for (auto& v : tmp_doc_sizes) { 235 | for (size_t i = 0; i < v.size(); i++) { 236 | if (v[i] != 0) 237 | doc_sizes[i] = v[i]; 238 | } 239 | } 240 | for (auto& v : tmp_doc_sizes_non_pruned) { 241 | for (size_t i = 0; i < v.size(); i++) { 242 | if (v[i] != 0) 243 | doc_sizes_non_pruned[i] = v[i]; 244 | } 245 | } 246 | bg.doc_contents.resize(idx.num_postings); 247 | bg.doc_freqs.resize(idx.num_postings); 248 | bg.graph.resize(idx.max_doc_id + 1); 249 | bg.num_docs_inc_empty = idx.max_doc_id + 1; 250 | bg.graph[0].terms = bg.doc_contents.data(); 251 | bg.graph[0].freqs = bg.doc_freqs.data(); 252 | bg.graph[0].num_terms = doc_sizes[0]; 253 | bg.graph[0].num_terms_not_pruned = doc_sizes_non_pruned[0]; 254 | for (size_t i = 1; i < 
doc_sizes.size(); i++) { 255 | bg.graph[i].terms 256 | = bg.graph[i - 1].terms + bg.graph[i - 1].num_terms_not_pruned; 257 | bg.graph[i].freqs 258 | = bg.graph[i - 1].freqs + bg.graph[i - 1].num_terms_not_pruned; 259 | bg.graph[i].num_terms = doc_sizes[i]; 260 | bg.graph[i].num_terms_not_pruned = doc_sizes_non_pruned[i]; 261 | } 262 | } 263 | { 264 | timer t("create forward index"); 265 | size_t workers = __cilkrts_get_nworkers(); 266 | size_t doc_ids_in_slice = idx.max_doc_id / workers; 267 | for (size_t id = 0; id < workers; id++) { 268 | size_t min_doc_id = id * doc_ids_in_slice; 269 | size_t max_doc_id = min_doc_id + doc_ids_in_slice; 270 | if (id + 1 == workers) { 271 | max_doc_id = idx.max_doc_id + 1; 272 | create_graph(bg, idx, min_doc_id, max_doc_id, min_list_len); 273 | } else { 274 | cilk_spawn create_graph( 275 | bg, idx, min_doc_id, max_doc_id, min_list_len); 276 | } 277 | } 278 | cilk_sync; 279 | } 280 | 281 | // Set ID for empty documents. 282 | for (uint32_t doc_id = 0; doc_id < idx.num_docs; ++doc_id) { 283 | if (bg.graph[doc_id].initial_id != doc_id) { 284 | bg.graph[doc_id].initial_id = doc_id; 285 | } 286 | } 287 | size_t num_empty = 0; 288 | { 289 | // all docs with 0 size go to the back! 
290 | auto empty_cmp = [](const auto& a, const auto& b) { 291 | return a.num_terms > b.num_terms; 292 | }; 293 | std::sort(bg.graph.begin(), bg.graph.end(), empty_cmp); 294 | auto ritr = bg.graph.end() - 1; 295 | auto itr = bg.graph.begin(); 296 | while (itr != ritr) { 297 | if (itr->num_terms == 0) { 298 | num_empty++; 299 | } else { 300 | break; 301 | } 302 | --ritr; 303 | } 304 | bg.num_docs = bg.num_docs_inc_empty - num_empty; 305 | } 306 | 307 | size_t num_skipped_lists = 0; 308 | size_t num_lists = 0; 309 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 310 | const auto& dlist = idx.docids[termid]; 311 | if (dlist.size() < min_list_len) { 312 | num_skipped_lists++; 313 | } else { 314 | num_lists++; 315 | } 316 | } 317 | std::cout << "\tnum_empty docs = " << num_empty << std::endl; 318 | std::cout << "\tnum_skipped lists = " << num_skipped_lists << std::endl; 319 | std::cout << "\tnum_lists = " << num_lists << std::endl; 320 | std::cout << "\tnum_docs = " << bg.num_docs << std::endl; 321 | return bg; 322 | } 323 | 324 | void recreate_lists(const bipartite_graph& bg, inverted_index& idx, 325 | uint32_t min_q_id, uint32_t max_q_id, std::vector& qmap, 326 | std::vector& dsizes) 327 | { 328 | for (size_t docid = 0; docid < bg.num_docs_inc_empty; docid++) { 329 | const auto& doc = bg.graph[docid]; 330 | for (size_t i = 0; i < doc.num_terms_not_pruned; i++) { 331 | auto qid = doc.terms[i]; 332 | if (min_q_id <= qmap[qid] && qmap[qid] < max_q_id) { 333 | auto freq = doc.freqs[i]; 334 | idx.docids[qid].push_back(docid); 335 | idx.freqs[qid].push_back(freq); 336 | dsizes[docid] += freq; 337 | } 338 | } 339 | } 340 | } 341 | 342 | inverted_index recreate_invidx(const bipartite_graph& bg, size_t num_lists) 343 | { 344 | timer t("recreate_invidx"); 345 | inverted_index idx; 346 | size_t num_postings = 0; 347 | idx.resize(num_lists); 348 | { 349 | size_t workers = __cilkrts_get_nworkers(); 350 | size_t qids_in_slice = num_lists / workers; 351 | std::vector 
qids_map(num_lists); 352 | for (size_t i = 0; i < qids_map.size(); i++) 353 | qids_map[i] = i; 354 | std::mt19937 rnd(1); 355 | std::shuffle(qids_map.begin(), qids_map.end(), rnd); 356 | std::vector> doc_sizes(workers); 357 | for (size_t id = 0; id < workers; id++) { 358 | doc_sizes[id].resize(bg.num_docs_inc_empty); 359 | size_t min_q_id = id * qids_in_slice; 360 | size_t max_q_id = min_q_id + qids_in_slice; 361 | if (id + 1 == workers) { 362 | max_q_id = num_lists; 363 | recreate_lists( 364 | bg, idx, min_q_id, max_q_id, qids_map, doc_sizes[id]); 365 | } else { 366 | cilk_spawn recreate_lists( 367 | bg, idx, min_q_id, max_q_id, qids_map, doc_sizes[id]); 368 | } 369 | } 370 | cilk_sync; 371 | idx.doc_lengths.resize(bg.num_docs_inc_empty); 372 | for (size_t id = 0; id < workers; id++) { 373 | for (size_t docid = 0; docid < bg.num_docs_inc_empty; docid++) { 374 | idx.doc_lengths[docid] += doc_sizes[id][docid]; 375 | } 376 | } 377 | } 378 | { 379 | 380 | for (size_t docid = 0; docid < bg.num_docs_inc_empty; docid++) { 381 | const auto& doc = bg.graph[docid]; 382 | idx.doc_id_mapping.push_back(doc.initial_id); 383 | num_postings += doc.num_terms_not_pruned; 384 | } 385 | } 386 | idx.num_docs = bg.num_docs_inc_empty; 387 | idx.max_doc_id = idx.num_docs - 1; 388 | idx.num_postings = num_postings; 389 | std::cout << "\tnum_docs = " << idx.num_docs << std::endl; 390 | std::cout << "\tmax_doc_id = " << idx.max_doc_id << std::endl; 391 | std::cout << "\tnum_lists = " << idx.docids.size() << std::endl; 392 | std::cout << "\tnum_postings = " << idx.num_postings << std::endl; 393 | return idx; 394 | } 395 | 396 | /* random shuffle seems to do ok */ 397 | partition_t initial_partition(docid_node* G, size_t n) 398 | { 399 | partition_t p; 400 | std::mt19937 rnd(n); 401 | std::shuffle(G, G + n, rnd); 402 | p.V1 = G; 403 | p.n1 = (n / 2); 404 | p.V2 = G + p.n1; 405 | p.n2 = n - p.n1; 406 | return p; 407 | } 408 | 409 | struct move_gain { 410 | double gain; 411 | docid_node* node; 
412 | move_gain() 413 | : gain(0) 414 | , node(nullptr) 415 | { 416 | } 417 | move_gain(double g, docid_node* n) 418 | : gain(g) 419 | , node(n) 420 | { 421 | } 422 | bool operator<(const move_gain& other) { return gain > other.gain; } 423 | }; 424 | 425 | struct move_gains_t { 426 | std::vector V1; 427 | std::vector V2; 428 | }; 429 | 430 | move_gain compute_single_gain(docid_node* doc, 431 | std::vector& before, std::vector& after) 432 | { 433 | __m128 _vsum = _mm_set1_ps(0); 434 | float gain[4]; 435 | size_t n = doc->num_terms / 4; 436 | size_t m = doc->num_terms % 4; 437 | for (size_t j = 0; j < n * 4; j+=4) { 438 | auto q0 = doc->terms[j]; 439 | auto q1 = doc->terms[j + 1]; 440 | auto q2 = doc->terms[j + 2]; 441 | auto q3 = doc->terms[j + 3]; 442 | __m128 _before = _mm_set_ps(before[q0], before[q1], before[q2], before[q3]); 443 | __m128 _after = _mm_set_ps(after[q0], after[q1], after[q2], after[q3]); 444 | __m128 _val = _mm_sub_ps(_before, _after); 445 | _vsum = _mm_add_ps(_vsum, _val); 446 | } 447 | _mm_store_ps(gain, _vsum); 448 | auto total = gain[0] + gain[1] + gain[2] + gain[3]; 449 | for (size_t j = 0; j < m; j++) { 450 | auto q = doc->terms[n * 4 + j]; 451 | total += before[q] - after[q]; 452 | } 453 | return move_gain(total, doc); 454 | } 455 | 456 | void compute_deg(docid_node* docs, size_t n, std::vector& deg, std::vector &query_changed) 457 | { 458 | for (size_t i = 0; i < n; i++) { 459 | auto doc = docs + i; 460 | size_t n = doc->num_terms / 4; 461 | size_t m = doc->num_terms % 4; 462 | for (size_t j = 0; j < n * 4; j+=4) { 463 | auto q0 = doc->terms[j]; 464 | auto q1 = doc->terms[j + 1]; 465 | auto q2 = doc->terms[j + 2]; 466 | auto q3 = doc->terms[j + 3]; 467 | __m128i _one = _mm_set1_epi32(1); 468 | __m128i _deg = _mm_set_epi32(deg[q0], deg[q1], deg[q2], deg[q3]); 469 | __m128i _result = _mm_add_epi32(_deg, _one); 470 | 471 | deg[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 472 | deg[q1] = 
_mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 473 | deg[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 474 | deg[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 475 | 476 | query_changed[q0] = _mm_cvtsi128_si32(_one); 477 | query_changed[q1] = _mm_cvtsi128_si32(_one); 478 | query_changed[q2] = _mm_cvtsi128_si32(_one); 479 | query_changed[q3] = _mm_cvtsi128_si32(_one); 480 | } 481 | for (size_t j = 0; j < m; j++) { 482 | auto qry = doc->terms[n * 4 + j]; 483 | deg[qry]++; 484 | query_changed[qry] = 1; 485 | } 486 | } 487 | } 488 | 489 | void compute_gains(docid_node* docs, size_t n, std::vector& before, 490 | std::vector& after, std::vector& res) 491 | { 492 | res.resize(n); 493 | cilk_for(size_t i = 0; i < n; i++) 494 | { 495 | auto doc = docs + i; 496 | res[i] = compute_single_gain(doc, before, after); 497 | } 498 | } 499 | 500 | void compute_gains_np(docid_node* docs, size_t n, std::vector& before, 501 | std::vector& after, std::vector& res) 502 | { 503 | res.resize(n); 504 | for (size_t i = 0; i < n; i++) { 505 | auto doc = docs + i; 506 | res[i] = compute_single_gain(doc, before, after); 507 | } 508 | } 509 | 510 | move_gains_t compute_move_gains(partition_t& P, size_t num_queries, 511 | std::vector& deg1, std::vector& deg2, 512 | std::vector& before, std::vector& left2right, 513 | std::vector& right2left, std::vector& qry_changed) 514 | { 515 | move_gains_t gains; 516 | 517 | float logn1 = log2f(P.n1); 518 | float logn2 = log2f(P.n2); 519 | cilk_for(size_t q = 0; q < num_queries; q++) 520 | { 521 | if (qry_changed[q] == 1) { 522 | qry_changed[q] = 0; 523 | before[q] = 0; 524 | left2right[q] = 0; 525 | right2left[q] = 0; 526 | if (deg1[q] or deg2[q]) { 527 | before[q] = deg1[q] * logn1 - deg1[q] * log2_cmp(deg1[q] + 1) 528 | + deg2[q] * logn2 - deg2[q] * log2_cmp(deg2[q] + 1); 529 | } 530 | if (deg1[q]) { 531 | left2right[q] = (deg1[q] - 1) * logn1 532 | - (deg1[q] - 1) * 
log2_cmp(deg1[q]) + (deg2[q] + 1) * logn2 533 | - (deg2[q] + 1) * log2_cmp(deg2[q] + 2); 534 | } 535 | if (deg2[q]) 536 | right2left[q] = (deg1[q] + 1) * logn1 537 | - (deg1[q] + 1) * log2_cmp(deg1[q] + 2) 538 | + (deg2[q] - 1) * logn2 - (deg2[q] - 1) * log2_cmp(deg2[q]); 539 | } 540 | } 541 | 542 | // (2) compute gains from moving docs 543 | cilk_spawn compute_gains(P.V1, P.n1, before, left2right, gains.V1); 544 | compute_gains(P.V2, P.n2, before, right2left, gains.V2); 545 | cilk_sync; 546 | 547 | return gains; 548 | } 549 | 550 | move_gains_t compute_move_gains_np(partition_t& P, size_t num_queries, 551 | std::vector& deg1, std::vector& deg2, 552 | std::vector& before, std::vector& left2right, 553 | std::vector& right2left, std::vector& qry_changed) 554 | { 555 | move_gains_t gains; 556 | 557 | float logn1 = log2f(P.n1); 558 | float logn2 = log2f(P.n2); 559 | for (size_t q = 0; q < num_queries; q++) { 560 | if (qry_changed[q] == 1) { 561 | qry_changed[q] = 0; 562 | before[q] = 0; 563 | left2right[q] = 0; 564 | right2left[q] = 0; 565 | if (deg1[q] or deg2[q]) { 566 | before[q] = deg1[q] * logn1 - deg1[q] * log2_cmp(deg1[q] + 1) 567 | + deg2[q] * logn2 - deg2[q] * log2_cmp(deg2[q] + 1); 568 | } 569 | if (deg1[q]) { 570 | left2right[q] = (deg1[q] - 1) * logn1 571 | - (deg1[q] - 1) * log2_cmp(deg1[q]) + (deg2[q] + 1) * logn2 572 | - (deg2[q] + 1) * log2_cmp(deg2[q] + 2); 573 | } 574 | if (deg2[q]) 575 | right2left[q] = (deg1[q] + 1) * logn1 576 | - (deg1[q] + 1) * log2_cmp(deg1[q] + 2) 577 | + (deg2[q] - 1) * logn2 - (deg2[q] - 1) * log2_cmp(deg2[q]); 578 | } 579 | } 580 | 581 | // (2) compute gains from moving docs 582 | compute_gains(P.V1, P.n1, before, left2right, gains.V1); 583 | compute_gains(P.V2, P.n2, before, right2left, gains.V2); 584 | 585 | return gains; 586 | } 587 | 588 | void recursive_bisection_np(progress_bar& progress, docid_node* G, 589 | size_t num_queries, size_t n,uint64_t depth,uint64_t max_depth) 590 | { 591 | // (1) create the initial 
partition. O(n) 592 | auto partition = initial_partition(G, n); 593 | 594 | { 595 | // (2) we compute deg1 and deg2 only once 596 | std::vector deg1(num_queries, 0); 597 | std::vector deg2(num_queries, 0); 598 | std::vector before(num_queries); 599 | std::vector left2right(num_queries); 600 | std::vector right2left(num_queries); 601 | 602 | std::vector query_changed(num_queries, 0); 603 | { 604 | compute_deg(partition.V1, partition.n1, deg1, query_changed); 605 | compute_deg(partition.V2, partition.n2, deg2, query_changed); 606 | } 607 | 608 | // (3) perform bisection. constant number of iterations 609 | for (int cur_iter = 1; cur_iter <= constants::MAX_ITER; cur_iter++) { 610 | // (3a) compute move gains 611 | auto gains = compute_move_gains_np(partition, num_queries, deg1, 612 | deg2, before, left2right, right2left, query_changed); 613 | memset(query_changed.data(), 0, num_queries); 614 | 615 | // (3b) sort by decreasing gain. O(n log n) 616 | { 617 | std::sort(gains.V1.begin(), gains.V1.end()); 618 | std::sort(gains.V2.begin(), gains.V2.end()); 619 | } 620 | 621 | // (3c) swap. O(n) 622 | size_t num_swaps = 0; 623 | { 624 | auto itr_v1 = gains.V1.begin(); 625 | auto itr_v2 = gains.V2.begin(); 626 | while (itr_v1 != gains.V1.end() && itr_v2 != gains.V2.end()) { 627 | if (itr_v1->gain + itr_v2->gain > 0) { 628 | // maybe we need to do something here to make 629 | // compute_move_gains() efficient? 630 | swap_nodes(itr_v1->node, itr_v2->node, deg1, deg2, 631 | query_changed); 632 | num_swaps++; 633 | } else { 634 | break; 635 | } 636 | ++itr_v1; 637 | ++itr_v2; 638 | } 639 | } 640 | 641 | // (3d) converged? 642 | if (num_swaps == 0) { 643 | break; 644 | } 645 | } 646 | } 647 | 648 | // (4) recurse. 
at most O(log n) recursion steps 649 | if (depth + 1 <= max_depth) { 650 | if (partition.n1 > 1) 651 | recursive_bisection_np( 652 | progress, partition.V1, num_queries, partition.n1, depth + 1,max_depth); 653 | if (partition.n2 > 1) 654 | recursive_bisection_np( 655 | progress, partition.V2, num_queries, partition.n2, depth + 1,max_depth); 656 | 657 | if (partition.n1 == 1) 658 | progress.done(1); 659 | if (partition.n2 == 1) 660 | progress.done(1); 661 | } else { 662 | progress.done(n); 663 | } 664 | } 665 | 666 | void recursive_bisection(progress_bar& progress, docid_node* G, 667 | size_t num_queries, size_t n, uint64_t depth,uint64_t max_depth) 668 | { 669 | // (1) create the initial partition. O(n) 670 | auto partition = initial_partition(G, n); 671 | 672 | { 673 | // (2) we compute deg1 and deg2 only once 674 | std::vector deg1(num_queries, 0); 675 | std::vector deg2(num_queries, 0); 676 | std::vector before(num_queries); 677 | std::vector left2right(num_queries); 678 | std::vector right2left(num_queries); 679 | 680 | std::vector query_changed(num_queries, 0); 681 | { 682 | cilk_spawn compute_deg(partition.V1, partition.n1, deg1, query_changed); 683 | compute_deg(partition.V2, partition.n2, deg2, query_changed); 684 | cilk_sync; 685 | } 686 | 687 | // (3) perform bisection. constant number of iterations 688 | for (int cur_iter = 1; cur_iter <= constants::MAX_ITER; cur_iter++) { 689 | // (3a) compute move gains 690 | auto gains = compute_move_gains(partition, num_queries, deg1, deg2, 691 | before, left2right, right2left, query_changed); 692 | memset(query_changed.data(), 0, num_queries); 693 | 694 | // (3b) sort by decreasing gain. O(n log n) 695 | { 696 | cilk_spawn std::sort(gains.V1.begin(), gains.V1.end()); 697 | std::sort(gains.V2.begin(), gains.V2.end()); 698 | cilk_sync; 699 | } 700 | 701 | // (3c) swap. 
O(n) 702 | size_t num_swaps = 0; 703 | { 704 | auto itr_v1 = gains.V1.begin(); 705 | auto itr_v2 = gains.V2.begin(); 706 | while (itr_v1 != gains.V1.end() && itr_v2 != gains.V2.end()) { 707 | if (itr_v1->gain + itr_v2->gain > 0) { 708 | // maybe we need to do something here to make 709 | // compute_move_gains() efficient? 710 | swap_nodes(itr_v1->node, itr_v2->node, deg1, deg2, 711 | query_changed); 712 | num_swaps++; 713 | } else { 714 | break; 715 | } 716 | ++itr_v1; 717 | ++itr_v2; 718 | } 719 | } 720 | 721 | // (3d) converged? 722 | if (num_swaps == 0) { 723 | break; 724 | } 725 | } 726 | } 727 | 728 | // (4) recurse. at most O(log n) recursion steps 729 | if (depth + 1 <= max_depth) { 730 | if (depth < constants::PARALLEL_SWITCH_DEPTH) { 731 | if (partition.n1 > 1) { 732 | cilk_spawn recursive_bisection(progress, partition.V1, 733 | num_queries, partition.n1, depth + 1,max_depth); 734 | } 735 | if (partition.n2 > 1) { 736 | recursive_bisection(progress, partition.V2, num_queries, 737 | partition.n2, depth + 1,max_depth); 738 | } 739 | cilk_sync; 740 | } else { 741 | if (partition.n1 > 1) { 742 | recursive_bisection_np(progress, partition.V1, num_queries, 743 | partition.n1, depth + 1,max_depth); 744 | } 745 | if (partition.n2 > 1) { 746 | recursive_bisection_np(progress, partition.V2, num_queries, 747 | partition.n2, depth + 1,max_depth); 748 | } 749 | } 750 | if (partition.n1 == 1) 751 | progress.done(1); 752 | if (partition.n2 == 1) 753 | progress.done(1); 754 | } else { 755 | progress.done(n); 756 | } 757 | } 758 | 759 | inverted_index reorder_docids_graph_bisection( 760 | inverted_index& invidx, size_t min_list_len) 761 | { 762 | auto num_lists = invidx.docids.size(); 763 | auto bg = construct_bipartite_graph(invidx, min_list_len); 764 | 765 | // free up some space 766 | invidx.clear(); 767 | 768 | // make things faster by precomputing some logs 769 | log2_precomp.resize(256); 770 | for(size_t i = 0; i < 256; i++) { log2_precomp[i] = log2f(i); } 771 | 772 
| { 773 | auto max_depth = std::max(1.0,ceil(log2(bg.num_docs)-5)); 774 | std::cout << "recursion depth = " << max_depth << std::endl; 775 | timer t("recursive_bisection"); 776 | progress_bar bp("recursive_bisection", bg.num_docs); 777 | recursive_bisection(bp, bg.graph.data(), bg.num_queries, bg.num_docs, 0, max_depth); 778 | } 779 | return recreate_invidx(bg, num_lists); 780 | } -------------------------------------------------------------------------------- /include/rgb/util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std::chrono; 12 | 13 | using postings_list = std::vector; 14 | 15 | struct inverted_index { 16 | size_t num_postings; 17 | uint32_t num_docs; 18 | uint32_t max_doc_id; 19 | std::vector docids; 20 | std::vector freqs; 21 | std::vector doc_lengths; 22 | std::vector doc_id_mapping; 23 | void resize(size_t new_size) 24 | { 25 | docids.resize(new_size); 26 | freqs.resize(new_size); 27 | } 28 | 29 | size_t size() const { return docids.size(); } 30 | 31 | void clear() 32 | { 33 | docids.resize(0); 34 | freqs.resize(0); 35 | doc_lengths.resize(0); 36 | doc_id_mapping.resize(0); 37 | } 38 | }; 39 | 40 | int tsfprintff(FILE* f, const char* format, ...) 
// Thread-safe fprintf: serializes writes through a function-local mutex and
// flushes after every call.
// FIX: these free functions are defined in a header; without `inline` every
// translation unit that includes util.hpp emits its own definition, which is
// an ODR violation / multiple-definition link error. Marked inline.
inline int tsfprintff(FILE* f, const char* format, ...)
{
    static std::mutex pmutex;
    std::lock_guard<std::mutex> lock(pmutex);
    va_list args;
    va_start(args, format);
    int ret = vfprintf(f, format, args);
    va_end(args);
    fflush(f);
    return ret;
}

// Scoped wall-clock timer: prints START(name) on construction and
// STOP(name) with elapsed seconds on destruction.
struct timer {
    std::chrono::high_resolution_clock::time_point start;
    std::string name;
    timer(const std::string& _n)
        : name(_n)
    {
        tsfprintff(stdout, "START(%s)\n", name.c_str());
        start = std::chrono::high_resolution_clock::now();
    }
    ~timer()
    {
        auto stop = std::chrono::high_resolution_clock::now();
        // milliseconds / 1000 -> seconds with 3 decimals
        tsfprintff(stdout, "STOP(%s) - %.3f sec\n", name.c_str(),
            std::chrono::duration_cast<std::chrono::milliseconds>(
                stop - start)
                    .count()
                / 1000.0f);
    }
};

// Console progress bar, updated concurrently by cilk workers. Redraws only
// when the integer percentage changes, to bound the output volume.
struct progress_bar {
    // NOTE(review): `start` was never read in the original either; kept for
    // layout compatibility.
    std::chrono::high_resolution_clock::time_point start;
    size_t total;
    size_t current;
    size_t cur_percent;
    // FIX: operator++ and done() previously each used their *own*
    // function-local static mutex, so they did not actually exclude each
    // other. One member mutex now guards current/cur_percent.
    std::mutex mtx;
    progress_bar(std::string str, size_t t)
        : total(t)
        , current(0)
        , cur_percent(0)
    {
        std::cout << str << ":" << std::endl;
        tsfprintff(stdout, "[ 0/100] |");
        for (size_t i = 0; i < 50; i++)
            tsfprintff(stdout, " ");
        tsfprintff(stdout, "|\r");
    }
    progress_bar& operator++()
    {
        std::lock_guard<std::mutex> lock(mtx);
        advance(1);
        return *this;
    }
    void done(size_t num)
    {
        std::lock_guard<std::mutex> lock(mtx);
        advance(num);
    }
    ~progress_bar()
    {
        tsfprintff(stdout, "[100/100] |");
        for (size_t i = 0; i < 50; i++)
            tsfprintff(stdout, "=");
        tsfprintff(stdout, ">|\n");
    }

private:
    // Must be called with mtx held. Adds `num` finished units and redraws
    // the bar when the integer percentage advanced. (This deduplicates the
    // identical bodies the original had in operator++ and done().)
    void advance(size_t num)
    {
        current += num;
        float fcp = float(current) / float(total) * 100;
        size_t cp = fcp; // truncate to a whole percent
        if (cp != cur_percent) {
            cur_percent = cp;
            tsfprintff(stdout, "[%3d/100] |", (int)cur_percent);
            size_t print_percent = cur_percent / 2; // bar is 50 chars wide
            for (size_t i = 0; i < print_percent; i++)
                tsfprintff(stdout, "=");
            tsfprintff(stdout, ">");
            for (size_t i = print_percent; i < 50; i++)
                tsfprintff(stdout, " ");
            tsfprintff(stdout, "|\r");
        }
    }
};

// Plain fprintf + fflush (no locking).
inline int fprintff(FILE* f, const char* format, ...)
{
    va_list args;
    va_start(args, format);
    int ret = vfprintf(f, format, args);
    va_end(args);
    fflush(f);
    return ret;
}

// Print "error: <message>[: strerror(errno)]" to stderr and terminate.
[[noreturn]] inline void quit(const char* format, ...)
{
    va_list args;
    va_start(args, format);
    fprintf(stderr, "error: ");
    vfprintf(stderr, format, args);
    va_end(args);
    if (errno != 0) {
        fprintf(stderr, ": %s\n", strerror(errno));
    } else {
        fprintf(stderr, "\n");
    }
    fflush(stderr);
    exit(EXIT_FAILURE);
}

// fopen() or die with a readable error.
inline FILE* fopen_or_fail(std::string file_name, const char* mode)
{
    FILE* out_file = fopen(file_name.c_str(), mode);
    if (!out_file) {
        // FIX: message used to say "output file" even when opening for
        // reading ("rb"); made generic.
        quit("opening file %s failed", file_name.c_str());
    }
    return out_file;
}

inline void fclose_or_fail(FILE* f)
{
    int ret = fclose(f);
    if (ret != 0) {
        quit("closing file failed");
    }
}

// Read one native-endian uint32_t. Returns 0 at EOF (callers use 0 as the
// end-of-stream sentinel); dies on a short read that is not EOF.
inline uint32_t read_u32(FILE* f)
{
    uint32_t x;
    // FIX: fread returns size_t; it was stored in int and printed with %d,
    // which is the wrong conversion for size_t.
    size_t ret = fread(&x, sizeof(uint32_t), 1, f);
    if (feof(f)) {
        return 0;
    }
    if (ret != 1) {
        quit("read u32 from file failed: %zu != %zu", ret, size_t(1));
    }
    return x;
}
%d != %d", ret, n); 194 | } 195 | } 196 | 197 | std::vector read_uint32_list(FILE* f) 198 | { 199 | uint32_t list_len = read_u32(f); 200 | if (list_len == 0) 201 | return std::vector(); 202 | std::vector list(list_len); 203 | read_u32s(f, list.data(), list_len); 204 | return list; 205 | } 206 | 207 | size_t write_u32(FILE* f, uint32_t x) 208 | { 209 | size_t ret = fwrite(&x, sizeof(uint32_t), 1u, f); 210 | if (ret != 1u) { 211 | quit("writing byte to file: %u != %u", ret, 1u); 212 | } 213 | return sizeof(uint32_t); 214 | } 215 | 216 | size_t write_u32s(FILE* f, uint32_t* buf, size_t n) 217 | { 218 | size_t ret = fwrite(buf, sizeof(uint32_t), n, f); 219 | if (ret != n) { 220 | quit("writing byte to file: %u != %u", ret, n); 221 | } 222 | return n * sizeof(uint32_t); 223 | } 224 | 225 | size_t write_uint32_list(FILE* f, std::vector& list) 226 | { 227 | size_t written_bytes = write_u32(f, list.size()); 228 | written_bytes += write_u32s(f, list.data(), list.size()); 229 | return written_bytes; 230 | } 231 | 232 | inverted_index read_ds2i_files(std::string ds2i_prefix) 233 | { 234 | inverted_index idx; 235 | std::string docs_file = ds2i_prefix + ".docs"; 236 | timer t("read input list from " + docs_file); 237 | auto df = fopen_or_fail(docs_file, "rb"); 238 | size_t num_docs = 0; 239 | size_t num_postings = 0; 240 | size_t num_lists = 0; 241 | uint32_t max_doc_id = 0; 242 | { 243 | // (1) skip the numdocs list 244 | read_uint32_list(df); 245 | // (2) keep reading lists 246 | while (!feof(df)) { 247 | const auto& list = read_uint32_list(df); 248 | size_t n = list.size(); 249 | if (n == 0) { 250 | break; 251 | } 252 | max_doc_id = std::max(max_doc_id, list.back()); 253 | num_lists++; 254 | num_postings += n; 255 | idx.docids.emplace_back(std::move(list)); 256 | } 257 | num_docs = max_doc_id + 1; 258 | } 259 | fclose_or_fail(df); 260 | std::string freqs_file = ds2i_prefix + ".freqs"; 261 | auto ff = fopen_or_fail(freqs_file, "rb"); 262 | { 263 | while (!feof(ff)) { 264 | 
const auto& list = read_uint32_list(ff); 265 | size_t n = list.size(); 266 | if (n == 0) { 267 | break; 268 | } 269 | idx.freqs.emplace_back(std::move(list)); 270 | } 271 | } 272 | fclose_or_fail(ff); 273 | idx.num_docs = num_docs; 274 | idx.max_doc_id = max_doc_id; 275 | idx.num_postings = num_postings; 276 | std::cout << "\tnum_docs = " << num_docs << std::endl; 277 | std::cout << "\tmax_doc_id = " << max_doc_id << std::endl; 278 | std::cout << "\tnum_lists = " << num_lists << std::endl; 279 | std::cout << "\tnum_postings = " << num_postings << std::endl; 280 | return idx; 281 | } 282 | 283 | void write_ds2i_files(inverted_index& idx, std::string ds2i_out_prefix) 284 | { 285 | std::string docs_file = ds2i_out_prefix + ".docs"; 286 | std::string freqs_file = ds2i_out_prefix + ".freqs"; 287 | std::string lens_file = ds2i_out_prefix + ".sizes"; 288 | std::string mapping_file = ds2i_out_prefix + ".mapping"; 289 | { 290 | auto df = fopen_or_fail(docs_file, "wb"); 291 | { 292 | // ds2i: 1st list contains num docs 293 | std::vector tmp(1); 294 | tmp[0] = idx.num_docs; 295 | write_uint32_list(df, tmp); 296 | } 297 | for (size_t i = 0; i < idx.docids.size(); i++) { 298 | write_uint32_list(df, idx.docids[i]); 299 | } 300 | fclose_or_fail(df); 301 | } 302 | { 303 | auto ff = fopen_or_fail(freqs_file, "wb"); 304 | for (size_t i = 0; i < idx.freqs.size(); i++) { 305 | write_uint32_list(ff, idx.freqs[i]); 306 | } 307 | fclose_or_fail(ff); 308 | } 309 | { 310 | auto sf = fopen_or_fail(lens_file, "wb"); 311 | write_uint32_list(sf, idx.doc_lengths); 312 | fclose_or_fail(sf); 313 | } 314 | { 315 | auto mf = fopen_or_fail(mapping_file, "w"); 316 | for (size_t i = 0; i < idx.doc_id_mapping.size(); ++i) { 317 | fprintff(mf, "%zu %zu\n", idx.doc_id_mapping[i], i); 318 | } 319 | fclose_or_fail(mf); 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: 
// Sum of log2 of the d-gaps of a docid list; the first element contributes
// log2(id + 1). Gaps below 256 are looked up in the precomputed table,
// larger gaps call log2f directly.
//
// FIX(robustness): returns 0.0 for an empty list — the original read
// ids[0] unconditionally, which is undefined behavior on an empty vector.
double comp_sum_log_gap(
    const std::vector<uint32_t>& ids, const std::vector<float>& log2_precomp)
{
    if (ids.empty())
        return 0.0;
    double sum_log_gaps = log2f(ids[0] + 1);
    for (size_t i = 1; i < ids.size(); i++) {
        // NOTE(review): assumes ids are strictly increasing; a zero gap
        // would contribute log2_precomp[0] = log2(0) = -inf.
        auto gap = ids[i] - ids[i - 1];
        if (gap < 256)
            sum_log_gaps += log2_precomp[gap];
        else
            sum_log_gaps += log2f(gap);
    }
    return sum_log_gaps;
}
= reorder_docids_graph_bisection(invidx, min_list_len); 61 | 62 | std::cout << "AFTER average LogGap " << compute_avg_loggap(reordered_invidx) 63 | << std::endl; 64 | 65 | { 66 | timer t("write ds2i files"); 67 | write_ds2i_files(reordered_invidx, ds2i_out_prefix); 68 | } 69 | 70 | return EXIT_SUCCESS; 71 | } -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpetri/recursive_graph_bisection/ac9fde47e45e01d5149de0dcf0500d677bd699d9/test/CMakeLists.txt --------------------------------------------------------------------------------