├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── external └── external.cmake ├── include └── rgb │ ├── rgb.hpp │ └── util.hpp ├── src ├── CMakeLists.txt └── rgb.cpp └── test └── CMakeLists.txt /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: WebKit 2 | IndentWidth: 4 3 | Language: Cpp 4 | DerivePointerAlignment: false 5 | PointerAlignment: Left 6 | ColumnLimit: 80 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/tbb"] 2 | path = external/tbb 3 | url = https://github.com/01org/tbb.git 4 | [submodule "external/parallelstl"] 5 | path = external/parallelstl 6 | url = https://github.com/intel/parallelstl.git 7 | [submodule "external/CLI11"] 8 | path = external/CLI11 9 | url = https://github.com/CLIUtils/CLI11.git 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | if (NOT CMAKE_BUILD_TYPE) 4 | message(STATUS "No build type selected, default to Release") 5 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) 6 | endif() 7 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} ) 8 | 9 | project(rgb) 10 | 11 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 12 | 13 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 14 | if (CXX_COMPILER_VERSION VERSION_LESS 4.7) 15 | message(STATUS "GCC version must be at least 4.7!") 16 | endif() 17 | set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -fcilkplus -O3 -lm -DNDEBUG -std=c++17 -DHAVE_CXX0X -march=native") 18 | set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -fcilkplus -ggdb -lm -std=c++17 -DHAVE_CXX0X -march=native") 19 | elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 20 | if (CXX_COMPILER_VERSION VERSION_LESS 4.2.1) 21 | message(STATUS "Clang version must be at least 4.2.1!" ) 22 | endif() 23 | set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -fcilkplus -O3 -DNDEBUG -std=c++17 -DHAVE_CXX0X -msse4.1 -march=native") 24 | set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -fcilkplus -ggdb -std=c++17 -DHAVE_CXX0X -msse4.1 -march=native") 25 | else () 26 | message(FATAL_ERROR "Please, use GCC or Clang compiler!") 27 | endif() 28 | 29 | include_directories(include) 30 | add_library(rgb INTERFACE) 31 | target_include_directories(rgb INTERFACE 32 | $ 33 | ) 34 | 35 | include(external/external.cmake) 36 | 37 | add_subdirectory(src) 38 | 39 | 40 | enable_testing() 41 | add_subdirectory(test) 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Matthias Petri 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recursive graph bisection 2 | 3 | This program implements the following graph reordering technique: 4 | 5 | - Laxman Dhulipala, Igor Kabiljo, Brian Karrer, Giuseppe Ottaviano, Sergey Pupyrev, Alon Shalita: 6 | Compressing Graphs and Indexes with Recursive Graph Bisection. KDD 2016: 1535-1544 7 | 8 | 9 | ## Requirements 10 | 11 | The program requires `cilkplus` which should be available in all newish compilers (e.g. 
gcc7) 12 | 13 | ## Installing and Compiling 14 | 15 | To install and compile the code run the following commands 16 | 17 | ``` 18 | git clone https://github.com/mpetri/recursive_graph_bisection.git 19 | ``` 20 | 21 | And compile using `make`: 22 | 23 | ``` 24 | make 25 | ``` 26 | 27 | which produces the `rec_graph_bisect.x` binary. 28 | 29 | 30 | ## Input format 31 | 32 | We use the [ds2i](https://github.com/ot/ds2i) input format (description taken from the repo): 33 | 34 | 35 | A _binary sequence_ is a sequence of integers prefixed by its length, where both 36 | the sequence integers and the length are written as 32-bit little-endian 37 | unsigned integers. 38 | 39 | A _collection_ consists of 3 files, `.docs`, `.freqs`, 40 | `.sizes`. 41 | 42 | * `.docs` starts with a singleton binary sequence where its only 43 | integer is the number of documents in the collection. It is then followed by 44 | one binary sequence for each posting list, in order of term-ids. Each posting 45 | list contains the sequence of document-ids containing the term. 46 | 47 | * `.freqs` is composed of a one binary sequence per posting list, where 48 | each sequence contains the occurrence counts of the postings, aligned with the 49 | previous file (note however that this file does not have an additional 50 | singleton list at its beginning). 51 | 52 | * `.sizes` is composed of a single binary sequence whose length is the 53 | same as the number of documents in the collection, and the i-th element of the 54 | sequence is the size (number of terms) of the i-th document. 55 | 56 | 57 | ## Running command 58 | 59 | To reorder the index the following options are provided: 60 | 61 | `rec_graph_bisect.x ` 62 | 63 | where 64 | 65 | * `ds2i_prefix` is the `` specified above 66 | 67 | * `ds2i_out_prefix` is the output prefix where the reordered index should be stored 68 | 69 | * `min_list_len` specifies a minimum list threshold which should be ignored during reordering. 
This does not mean the lists will be lost. Lists below the threshold are just not considered in the reordering phase but will still appear in the final output. 70 | 71 | * `num_threads` specifies the number of threads to use during computation 72 | 73 | ## Example 74 | 75 | Say you have stored `gov2` in `ds2i` format described above in a directory: 76 | 77 | 78 | ``` 79 | [10:56:28 mpetri]$ ls -l /storage/gov2-d2si/ 80 | total 43084248 81 | -rw-r--r-- 1 mpetri mpetri 21765004632 Jul 18 14:37 gov2.docs 82 | -rw-r--r-- 1 mpetri mpetri 21765004624 Jul 18 14:37 gov2.freqs 83 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 14:37 gov2.sizes 84 | ``` 85 | 86 | so the `ds2i_prefix` would be `/storage/gov2-ds2i/gov2` 87 | 88 | and we execute the bisection command with the parameters 89 | 90 | ``` 91 | rec_graph_bisect.x /storage/gov2-ds2i/gov2 /storage/gov2-ds2i/gov2-bisected 256 32 92 | ``` 93 | 94 | which uses `32` threads, a minimum list length of `256` and stores the result as: 95 | 96 | ``` 97 | [10:56:28 mpetri]$ ls -l /storage/gov2-d2si/ 98 | total 43084248 99 | -rw-r--r-- 1 mpetri mpetri 21765004632 Jul 18 14:37 gov2.docs 100 | -rw-r--r-- 1 mpetri mpetri 21765004624 Jul 18 14:37 gov2.freqs 101 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 14:37 gov2.sizes 102 | -rw-r--r-- 1 mpetri mpetri 21765004632 Jul 18 18:32 gov2-bisected.docs 103 | -rw-r--r-- 1 mpetri mpetri 21765004624 Jul 18 18:32 gov2-bisected.freqs 104 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 18:32 gov2-bisected.sizes 105 | -rw-r--r-- 1 mpetri mpetri 98831272 Jul 18 18:32 gov2-bisected.mapping 106 | ``` 107 | 108 | where `gov2-bisected.mapping` specifies how the document identifiers were remapped in the following format: 109 | 110 | ``` 111 | 112 | ``` 113 | 114 | ## Runtime 115 | 116 | The code is not as optimized as in the paper but finishes in reasonable time frame. For example, `gov2` 117 | can be reordered in less than two hours. 118 | 119 | However, memory consumption is quite high. 
It requires at least O(size of input files) RAM. 120 | 121 | ## Authors 122 | 123 | * **Matthias Petri** - [mpetri](https://github.com/mpetri) 124 | 125 | * **Joel Mackenzie** - [JMMackenzie](https://github.com/JMMackenzie) 126 | 127 | ## License 128 | 129 | This project is licensed under the BSD 3-Clause License - see the [LICENSE.md](LICENSE.md) file for details 130 | -------------------------------------------------------------------------------- /external/external.cmake: -------------------------------------------------------------------------------- 1 | EXECUTE_PROCESS(COMMAND git submodule update --init 2 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. 3 | OUTPUT_QUIET 4 | ) 5 | 6 | 7 | # Add CLI11 8 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/CLI11 EXCLUDE_FROM_ALL) 9 | 10 | # Add TBB 11 | set(TBB_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb/) 12 | include(${TBB_ROOT}/cmake/TBBBuild.cmake) 13 | tbb_build( 14 | TBB_ROOT ${TBB_ROOT} 15 | CONFIG_DIR TBB_DIR) 16 | 17 | # Add ParallelSTL 18 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/parallelstl EXCLUDE_FROM_ALL) 19 | -------------------------------------------------------------------------------- /include/rgb/rgb.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "util.hpp" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace constants { 19 | const int MAX_ITER = 20; 20 | const uint64_t PARALLEL_SWITCH_DEPTH = 6; 21 | } 22 | 23 | struct docid_node { 24 | uint64_t initial_id; 25 | uint32_t* terms; 26 | uint32_t* freqs; 27 | size_t num_terms; 28 | size_t num_terms_not_pruned; 29 | }; 30 | 31 | std::vector log2_precomp; 32 | 33 | float log2_cmp(uint32_t idx) 34 | { 35 | if (idx < 256) 36 | return log2_precomp[idx]; 37 | return log2f(idx); 38 | } 39 | 40 | void swap_nodes(docid_node* a, docid_node* 
b) 41 | { 42 | std::swap(a->initial_id, b->initial_id); 43 | std::swap(a->terms, b->terms); 44 | std::swap(a->freqs, b->freqs); 45 | std::swap(a->num_terms, b->num_terms); 46 | std::swap(a->num_terms_not_pruned, b->num_terms_not_pruned); 47 | } 48 | 49 | void swap_nodes(docid_node* a, docid_node* b, std::vector& deg1, 50 | std::vector& deg2, std::vector& queries_changed) 51 | { 52 | { 53 | size_t n = a->num_terms / 4; 54 | size_t m = a->num_terms % 4; 55 | for (size_t i = 0; i < n * 4; i+=4) { 56 | auto q0 = a->terms[i]; 57 | auto q1 = a->terms[i + 1]; 58 | auto q2 = a->terms[i + 2]; 59 | auto q3 = a->terms[i + 3]; 60 | __m128i _one = _mm_set1_epi32(1); 61 | 62 | { 63 | __m128i _deg1 = _mm_set_epi32(deg1[q0], deg1[q1], deg1[q2], deg1[q3]); 64 | __m128i _result = _mm_sub_epi32(_deg1, _one); 65 | 66 | deg1[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 67 | deg1[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 68 | deg1[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 69 | deg1[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 70 | } 71 | { 72 | __m128i _deg2 = _mm_set_epi32(deg2[q0], deg2[q1], deg2[q2], deg2[q3]); 73 | __m128i _result = _mm_add_epi32(_deg2, _one); 74 | deg2[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 75 | deg2[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 76 | deg2[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 77 | deg2[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 78 | 79 | } 80 | queries_changed[q0] = _mm_cvtsi128_si32(_one); 81 | queries_changed[q1] = _mm_cvtsi128_si32(_one); 82 | queries_changed[q2] = _mm_cvtsi128_si32(_one); 83 | queries_changed[q3] = _mm_cvtsi128_si32(_one); 84 | } 85 | for (size_t i = 0; i < m; i++) { 86 | auto qry = a->terms[n * 4 + i]; 87 | deg1[qry]--; 88 | deg2[qry]++; 89 | 
queries_changed[qry] = 1; 90 | } 91 | } 92 | { 93 | size_t n = b->num_terms / 4; 94 | size_t m = b->num_terms % 4; 95 | for (size_t i = 0; i < n * 4; i+=4) { 96 | auto q0 = b->terms[i]; 97 | auto q1 = b->terms[i + 1]; 98 | auto q2 = b->terms[i + 2]; 99 | auto q3 = b->terms[i + 3]; 100 | __m128i _one = _mm_set1_epi32(1); 101 | 102 | { 103 | __m128i _deg1 = _mm_set_epi32(deg1[q0], deg1[q1], deg1[q2], deg1[q3]); 104 | __m128i _result = _mm_add_epi32(_deg1, _one); 105 | deg1[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 106 | deg1[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 107 | deg1[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 108 | deg1[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 109 | } 110 | { 111 | __m128i _deg2 = _mm_set_epi32(deg2[q0], deg2[q1], deg2[q2], deg2[q3]); 112 | __m128i _result = _mm_sub_epi32(_deg2, _one); 113 | deg2[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 114 | deg2[q1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 115 | deg2[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 116 | deg2[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 117 | 118 | } 119 | queries_changed[q0] = _mm_cvtsi128_si32(_one); 120 | queries_changed[q1] = _mm_cvtsi128_si32(_one); 121 | queries_changed[q2] = _mm_cvtsi128_si32(_one); 122 | queries_changed[q3] = _mm_cvtsi128_si32(_one); 123 | } 124 | for (size_t i = 0; i < m; i++) { 125 | auto qry = b->terms[n * 4 + i]; 126 | deg1[qry]++; 127 | deg2[qry]--; 128 | queries_changed[qry] = 1; 129 | } 130 | } 131 | swap_nodes(a, b); 132 | } 133 | 134 | struct bipartite_graph { 135 | size_t num_queries; 136 | size_t num_docs; 137 | size_t num_docs_inc_empty; 138 | std::vector graph; 139 | std::vector doc_contents; 140 | std::vector doc_freqs; 141 | }; 142 | 143 | struct partition_t { 
144 | docid_node* V1; 145 | docid_node* V2; 146 | size_t n1; 147 | size_t n2; 148 | }; 149 | 150 | void compute_doc_sizes(inverted_index& idx, std::vector& doc_sizes, 151 | std::vector& doc_sizes_non_pruned, uint32_t min_doc_id, 152 | uint32_t max_doc_id, size_t min_list_len) 153 | { 154 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 155 | const auto& plist = idx.docids[termid]; 156 | for (const auto& doc_id : plist) { 157 | if (min_doc_id <= doc_id && doc_id < max_doc_id) { 158 | if (plist.size() >= min_list_len) { 159 | doc_sizes[doc_id] += 1; 160 | } 161 | doc_sizes_non_pruned[doc_id] += 1; 162 | } 163 | } 164 | } 165 | } 166 | 167 | void create_graph(bipartite_graph& bg, inverted_index& idx, uint32_t min_doc_id, 168 | uint32_t max_doc_id, size_t min_list_len) 169 | { 170 | std::vector doc_offset(idx.max_doc_id + 1, 0); 171 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 172 | const auto& dlist = idx.docids[termid]; 173 | const auto& flist = idx.freqs[termid]; 174 | if (dlist.size() >= min_list_len) { 175 | for (size_t pos = 0; pos < dlist.size(); pos++) { 176 | const auto& doc_id = dlist[pos]; 177 | if (min_doc_id <= doc_id && doc_id < max_doc_id) { 178 | bg.graph[doc_id].initial_id = doc_id; 179 | bg.graph[doc_id].freqs[doc_offset[doc_id]] = flist[pos]; 180 | bg.graph[doc_id].terms[doc_offset[doc_id]++] = termid; 181 | } 182 | } 183 | } 184 | } 185 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 186 | const auto& dlist = idx.docids[termid]; 187 | const auto& flist = idx.freqs[termid]; 188 | if (dlist.size() < min_list_len) { 189 | for (size_t pos = 0; pos < dlist.size(); pos++) { 190 | const auto& doc_id = dlist[pos]; 191 | if (min_doc_id <= doc_id && doc_id < max_doc_id) { 192 | bg.graph[doc_id].initial_id = doc_id; 193 | bg.graph[doc_id].freqs[doc_offset[doc_id]] = flist[pos]; 194 | bg.graph[doc_id].terms[doc_offset[doc_id]++] = termid; 195 | } 196 | } 197 | } 198 | } 199 | } 200 | 201 | bipartite_graph 
construct_bipartite_graph( 202 | inverted_index& idx, size_t min_list_len) 203 | { 204 | timer t("construct_bipartite_graph"); 205 | bipartite_graph bg; 206 | bg.num_queries = idx.size(); 207 | { 208 | timer t("determine doc sizes"); 209 | size_t workers = __cilkrts_get_nworkers(); 210 | std::vector doc_sizes(idx.max_doc_id + 1); 211 | std::vector doc_sizes_non_pruned(idx.max_doc_id + 1); 212 | std::vector> tmp_doc_sizes(workers); 213 | std::vector> tmp_doc_sizes_non_pruned(workers); 214 | for (auto& v : tmp_doc_sizes) 215 | v.resize(idx.max_doc_id + 1); 216 | for (auto& v : tmp_doc_sizes_non_pruned) 217 | v.resize(idx.max_doc_id + 1); 218 | size_t doc_ids_in_slice = idx.max_doc_id / workers; 219 | for (size_t id = 0; id < workers; id++) { 220 | size_t min_doc_id = id * doc_ids_in_slice; 221 | size_t max_doc_id = min_doc_id + doc_ids_in_slice; 222 | if (id + 1 == workers) { 223 | max_doc_id = idx.max_doc_id + 1; 224 | compute_doc_sizes(idx, tmp_doc_sizes[id], 225 | tmp_doc_sizes_non_pruned[id], min_doc_id, max_doc_id, 226 | min_list_len); 227 | } else { 228 | cilk_spawn compute_doc_sizes(idx, tmp_doc_sizes[id], 229 | tmp_doc_sizes_non_pruned[id], min_doc_id, max_doc_id, 230 | min_list_len); 231 | } 232 | } 233 | cilk_sync; 234 | for (auto& v : tmp_doc_sizes) { 235 | for (size_t i = 0; i < v.size(); i++) { 236 | if (v[i] != 0) 237 | doc_sizes[i] = v[i]; 238 | } 239 | } 240 | for (auto& v : tmp_doc_sizes_non_pruned) { 241 | for (size_t i = 0; i < v.size(); i++) { 242 | if (v[i] != 0) 243 | doc_sizes_non_pruned[i] = v[i]; 244 | } 245 | } 246 | bg.doc_contents.resize(idx.num_postings); 247 | bg.doc_freqs.resize(idx.num_postings); 248 | bg.graph.resize(idx.max_doc_id + 1); 249 | bg.num_docs_inc_empty = idx.max_doc_id + 1; 250 | bg.graph[0].terms = bg.doc_contents.data(); 251 | bg.graph[0].freqs = bg.doc_freqs.data(); 252 | bg.graph[0].num_terms = doc_sizes[0]; 253 | bg.graph[0].num_terms_not_pruned = doc_sizes_non_pruned[0]; 254 | for (size_t i = 1; i < 
doc_sizes.size(); i++) { 255 | bg.graph[i].terms 256 | = bg.graph[i - 1].terms + bg.graph[i - 1].num_terms_not_pruned; 257 | bg.graph[i].freqs 258 | = bg.graph[i - 1].freqs + bg.graph[i - 1].num_terms_not_pruned; 259 | bg.graph[i].num_terms = doc_sizes[i]; 260 | bg.graph[i].num_terms_not_pruned = doc_sizes_non_pruned[i]; 261 | } 262 | } 263 | { 264 | timer t("create forward index"); 265 | size_t workers = __cilkrts_get_nworkers(); 266 | size_t doc_ids_in_slice = idx.max_doc_id / workers; 267 | for (size_t id = 0; id < workers; id++) { 268 | size_t min_doc_id = id * doc_ids_in_slice; 269 | size_t max_doc_id = min_doc_id + doc_ids_in_slice; 270 | if (id + 1 == workers) { 271 | max_doc_id = idx.max_doc_id + 1; 272 | create_graph(bg, idx, min_doc_id, max_doc_id, min_list_len); 273 | } else { 274 | cilk_spawn create_graph( 275 | bg, idx, min_doc_id, max_doc_id, min_list_len); 276 | } 277 | } 278 | cilk_sync; 279 | } 280 | 281 | // Set ID for empty documents. 282 | for (uint32_t doc_id = 0; doc_id < idx.num_docs; ++doc_id) { 283 | if (bg.graph[doc_id].initial_id != doc_id) { 284 | bg.graph[doc_id].initial_id = doc_id; 285 | } 286 | } 287 | size_t num_empty = 0; 288 | { 289 | // all docs with 0 size go to the back! 
290 | auto empty_cmp = [](const auto& a, const auto& b) { 291 | return a.num_terms > b.num_terms; 292 | }; 293 | std::sort(bg.graph.begin(), bg.graph.end(), empty_cmp); 294 | auto ritr = bg.graph.end() - 1; 295 | auto itr = bg.graph.begin(); 296 | while (itr != ritr) { 297 | if (itr->num_terms == 0) { 298 | num_empty++; 299 | } else { 300 | break; 301 | } 302 | --ritr; 303 | } 304 | bg.num_docs = bg.num_docs_inc_empty - num_empty; 305 | } 306 | 307 | size_t num_skipped_lists = 0; 308 | size_t num_lists = 0; 309 | for (size_t termid = 0; termid < idx.docids.size(); termid++) { 310 | const auto& dlist = idx.docids[termid]; 311 | if (dlist.size() < min_list_len) { 312 | num_skipped_lists++; 313 | } else { 314 | num_lists++; 315 | } 316 | } 317 | std::cout << "\tnum_empty docs = " << num_empty << std::endl; 318 | std::cout << "\tnum_skipped lists = " << num_skipped_lists << std::endl; 319 | std::cout << "\tnum_lists = " << num_lists << std::endl; 320 | std::cout << "\tnum_docs = " << bg.num_docs << std::endl; 321 | return bg; 322 | } 323 | 324 | void recreate_lists(const bipartite_graph& bg, inverted_index& idx, 325 | uint32_t min_q_id, uint32_t max_q_id, std::vector& qmap, 326 | std::vector& dsizes) 327 | { 328 | for (size_t docid = 0; docid < bg.num_docs_inc_empty; docid++) { 329 | const auto& doc = bg.graph[docid]; 330 | for (size_t i = 0; i < doc.num_terms_not_pruned; i++) { 331 | auto qid = doc.terms[i]; 332 | if (min_q_id <= qmap[qid] && qmap[qid] < max_q_id) { 333 | auto freq = doc.freqs[i]; 334 | idx.docids[qid].push_back(docid); 335 | idx.freqs[qid].push_back(freq); 336 | dsizes[docid] += freq; 337 | } 338 | } 339 | } 340 | } 341 | 342 | inverted_index recreate_invidx(const bipartite_graph& bg, size_t num_lists) 343 | { 344 | timer t("recreate_invidx"); 345 | inverted_index idx; 346 | size_t num_postings = 0; 347 | idx.resize(num_lists); 348 | { 349 | size_t workers = __cilkrts_get_nworkers(); 350 | size_t qids_in_slice = num_lists / workers; 351 | std::vector 
qids_map(num_lists); 352 | for (size_t i = 0; i < qids_map.size(); i++) 353 | qids_map[i] = i; 354 | std::mt19937 rnd(1); 355 | std::shuffle(qids_map.begin(), qids_map.end(), rnd); 356 | std::vector> doc_sizes(workers); 357 | for (size_t id = 0; id < workers; id++) { 358 | doc_sizes[id].resize(bg.num_docs_inc_empty); 359 | size_t min_q_id = id * qids_in_slice; 360 | size_t max_q_id = min_q_id + qids_in_slice; 361 | if (id + 1 == workers) { 362 | max_q_id = num_lists; 363 | recreate_lists( 364 | bg, idx, min_q_id, max_q_id, qids_map, doc_sizes[id]); 365 | } else { 366 | cilk_spawn recreate_lists( 367 | bg, idx, min_q_id, max_q_id, qids_map, doc_sizes[id]); 368 | } 369 | } 370 | cilk_sync; 371 | idx.doc_lengths.resize(bg.num_docs_inc_empty); 372 | for (size_t id = 0; id < workers; id++) { 373 | for (size_t docid = 0; docid < bg.num_docs_inc_empty; docid++) { 374 | idx.doc_lengths[docid] += doc_sizes[id][docid]; 375 | } 376 | } 377 | } 378 | { 379 | 380 | for (size_t docid = 0; docid < bg.num_docs_inc_empty; docid++) { 381 | const auto& doc = bg.graph[docid]; 382 | idx.doc_id_mapping.push_back(doc.initial_id); 383 | num_postings += doc.num_terms_not_pruned; 384 | } 385 | } 386 | idx.num_docs = bg.num_docs_inc_empty; 387 | idx.max_doc_id = idx.num_docs - 1; 388 | idx.num_postings = num_postings; 389 | std::cout << "\tnum_docs = " << idx.num_docs << std::endl; 390 | std::cout << "\tmax_doc_id = " << idx.max_doc_id << std::endl; 391 | std::cout << "\tnum_lists = " << idx.docids.size() << std::endl; 392 | std::cout << "\tnum_postings = " << idx.num_postings << std::endl; 393 | return idx; 394 | } 395 | 396 | /* random shuffle seems to do ok */ 397 | partition_t initial_partition(docid_node* G, size_t n) 398 | { 399 | partition_t p; 400 | std::mt19937 rnd(n); 401 | std::shuffle(G, G + n, rnd); 402 | p.V1 = G; 403 | p.n1 = (n / 2); 404 | p.V2 = G + p.n1; 405 | p.n2 = n - p.n1; 406 | return p; 407 | } 408 | 409 | struct move_gain { 410 | double gain; 411 | docid_node* node; 
412 | move_gain() 413 | : gain(0) 414 | , node(nullptr) 415 | { 416 | } 417 | move_gain(double g, docid_node* n) 418 | : gain(g) 419 | , node(n) 420 | { 421 | } 422 | bool operator<(const move_gain& other) { return gain > other.gain; } 423 | }; 424 | 425 | struct move_gains_t { 426 | std::vector V1; 427 | std::vector V2; 428 | }; 429 | 430 | move_gain compute_single_gain(docid_node* doc, 431 | std::vector& before, std::vector& after) 432 | { 433 | __m128 _vsum = _mm_set1_ps(0); 434 | float gain[4]; 435 | size_t n = doc->num_terms / 4; 436 | size_t m = doc->num_terms % 4; 437 | for (size_t j = 0; j < n * 4; j+=4) { 438 | auto q0 = doc->terms[j]; 439 | auto q1 = doc->terms[j + 1]; 440 | auto q2 = doc->terms[j + 2]; 441 | auto q3 = doc->terms[j + 3]; 442 | __m128 _before = _mm_set_ps(before[q0], before[q1], before[q2], before[q3]); 443 | __m128 _after = _mm_set_ps(after[q0], after[q1], after[q2], after[q3]); 444 | __m128 _val = _mm_sub_ps(_before, _after); 445 | _vsum = _mm_add_ps(_vsum, _val); 446 | } 447 | _mm_store_ps(gain, _vsum); 448 | auto total = gain[0] + gain[1] + gain[2] + gain[3]; 449 | for (size_t j = 0; j < m; j++) { 450 | auto q = doc->terms[n * 4 + j]; 451 | total += before[q] - after[q]; 452 | } 453 | return move_gain(total, doc); 454 | } 455 | 456 | void compute_deg(docid_node* docs, size_t n, std::vector& deg, std::vector &query_changed) 457 | { 458 | for (size_t i = 0; i < n; i++) { 459 | auto doc = docs + i; 460 | size_t n = doc->num_terms / 4; 461 | size_t m = doc->num_terms % 4; 462 | for (size_t j = 0; j < n * 4; j+=4) { 463 | auto q0 = doc->terms[j]; 464 | auto q1 = doc->terms[j + 1]; 465 | auto q2 = doc->terms[j + 2]; 466 | auto q3 = doc->terms[j + 3]; 467 | __m128i _one = _mm_set1_epi32(1); 468 | __m128i _deg = _mm_set_epi32(deg[q0], deg[q1], deg[q2], deg[q3]); 469 | __m128i _result = _mm_add_epi32(_deg, _one); 470 | 471 | deg[q0] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 3))); 472 | deg[q1] = 
_mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 2))); 473 | deg[q2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 1))); 474 | deg[q3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(_result, _MM_SHUFFLE(0, 0, 0, 0))); 475 | 476 | query_changed[q0] = _mm_cvtsi128_si32(_one); 477 | query_changed[q1] = _mm_cvtsi128_si32(_one); 478 | query_changed[q2] = _mm_cvtsi128_si32(_one); 479 | query_changed[q3] = _mm_cvtsi128_si32(_one); 480 | } 481 | for (size_t j = 0; j < m; j++) { 482 | auto qry = doc->terms[n * 4 + j]; 483 | deg[qry]++; 484 | query_changed[qry] = 1; 485 | } 486 | } 487 | } 488 | 489 | void compute_gains(docid_node* docs, size_t n, std::vector& before, 490 | std::vector& after, std::vector& res) 491 | { 492 | res.resize(n); 493 | cilk_for(size_t i = 0; i < n; i++) 494 | { 495 | auto doc = docs + i; 496 | res[i] = compute_single_gain(doc, before, after); 497 | } 498 | } 499 | 500 | void compute_gains_np(docid_node* docs, size_t n, std::vector& before, 501 | std::vector& after, std::vector& res) 502 | { 503 | res.resize(n); 504 | for (size_t i = 0; i < n; i++) { 505 | auto doc = docs + i; 506 | res[i] = compute_single_gain(doc, before, after); 507 | } 508 | } 509 | 510 | move_gains_t compute_move_gains(partition_t& P, size_t num_queries, 511 | std::vector& deg1, std::vector& deg2, 512 | std::vector& before, std::vector& left2right, 513 | std::vector& right2left, std::vector& qry_changed) 514 | { 515 | move_gains_t gains; 516 | 517 | float logn1 = log2f(P.n1); 518 | float logn2 = log2f(P.n2); 519 | cilk_for(size_t q = 0; q < num_queries; q++) 520 | { 521 | if (qry_changed[q] == 1) { 522 | qry_changed[q] = 0; 523 | before[q] = 0; 524 | left2right[q] = 0; 525 | right2left[q] = 0; 526 | if (deg1[q] or deg2[q]) { 527 | before[q] = deg1[q] * logn1 - deg1[q] * log2_cmp(deg1[q] + 1) 528 | + deg2[q] * logn2 - deg2[q] * log2_cmp(deg2[q] + 1); 529 | } 530 | if (deg1[q]) { 531 | left2right[q] = (deg1[q] - 1) * logn1 532 | - (deg1[q] - 1) * 
log2_cmp(deg1[q]) + (deg2[q] + 1) * logn2 533 | - (deg2[q] + 1) * log2_cmp(deg2[q] + 2); 534 | } 535 | if (deg2[q]) 536 | right2left[q] = (deg1[q] + 1) * logn1 537 | - (deg1[q] + 1) * log2_cmp(deg1[q] + 2) 538 | + (deg2[q] - 1) * logn2 - (deg2[q] - 1) * log2_cmp(deg2[q]); 539 | } 540 | } 541 | 542 | // (2) compute gains from moving docs 543 | cilk_spawn compute_gains(P.V1, P.n1, before, left2right, gains.V1); 544 | compute_gains(P.V2, P.n2, before, right2left, gains.V2); 545 | cilk_sync; 546 | 547 | return gains; 548 | } 549 | 550 | move_gains_t compute_move_gains_np(partition_t& P, size_t num_queries, 551 | std::vector& deg1, std::vector& deg2, 552 | std::vector& before, std::vector& left2right, 553 | std::vector& right2left, std::vector& qry_changed) 554 | { 555 | move_gains_t gains; 556 | 557 | float logn1 = log2f(P.n1); 558 | float logn2 = log2f(P.n2); 559 | for (size_t q = 0; q < num_queries; q++) { 560 | if (qry_changed[q] == 1) { 561 | qry_changed[q] = 0; 562 | before[q] = 0; 563 | left2right[q] = 0; 564 | right2left[q] = 0; 565 | if (deg1[q] or deg2[q]) { 566 | before[q] = deg1[q] * logn1 - deg1[q] * log2_cmp(deg1[q] + 1) 567 | + deg2[q] * logn2 - deg2[q] * log2_cmp(deg2[q] + 1); 568 | } 569 | if (deg1[q]) { 570 | left2right[q] = (deg1[q] - 1) * logn1 571 | - (deg1[q] - 1) * log2_cmp(deg1[q]) + (deg2[q] + 1) * logn2 572 | - (deg2[q] + 1) * log2_cmp(deg2[q] + 2); 573 | } 574 | if (deg2[q]) 575 | right2left[q] = (deg1[q] + 1) * logn1 576 | - (deg1[q] + 1) * log2_cmp(deg1[q] + 2) 577 | + (deg2[q] - 1) * logn2 - (deg2[q] - 1) * log2_cmp(deg2[q]); 578 | } 579 | } 580 | 581 | // (2) compute gains from moving docs 582 | compute_gains(P.V1, P.n1, before, left2right, gains.V1); 583 | compute_gains(P.V2, P.n2, before, right2left, gains.V2); 584 | 585 | return gains; 586 | } 587 | 588 | void recursive_bisection_np(progress_bar& progress, docid_node* G, 589 | size_t num_queries, size_t n,uint64_t depth,uint64_t max_depth) 590 | { 591 | // (1) create the initial 
partition. O(n) 592 | auto partition = initial_partition(G, n); 593 | 594 | { 595 | // (2) we compute deg1 and deg2 only once 596 | std::vector deg1(num_queries, 0); 597 | std::vector deg2(num_queries, 0); 598 | std::vector before(num_queries); 599 | std::vector left2right(num_queries); 600 | std::vector right2left(num_queries); 601 | 602 | std::vector query_changed(num_queries, 0); 603 | { 604 | compute_deg(partition.V1, partition.n1, deg1, query_changed); 605 | compute_deg(partition.V2, partition.n2, deg2, query_changed); 606 | } 607 | 608 | // (3) perform bisection. constant number of iterations 609 | for (int cur_iter = 1; cur_iter <= constants::MAX_ITER; cur_iter++) { 610 | // (3a) compute move gains 611 | auto gains = compute_move_gains_np(partition, num_queries, deg1, 612 | deg2, before, left2right, right2left, query_changed); 613 | memset(query_changed.data(), 0, num_queries); 614 | 615 | // (3b) sort by decreasing gain. O(n log n) 616 | { 617 | std::sort(gains.V1.begin(), gains.V1.end()); 618 | std::sort(gains.V2.begin(), gains.V2.end()); 619 | } 620 | 621 | // (3c) swap. O(n) 622 | size_t num_swaps = 0; 623 | { 624 | auto itr_v1 = gains.V1.begin(); 625 | auto itr_v2 = gains.V2.begin(); 626 | while (itr_v1 != gains.V1.end() && itr_v2 != gains.V2.end()) { 627 | if (itr_v1->gain + itr_v2->gain > 0) { 628 | // maybe we need to do something here to make 629 | // compute_move_gains() efficient? 630 | swap_nodes(itr_v1->node, itr_v2->node, deg1, deg2, 631 | query_changed); 632 | num_swaps++; 633 | } else { 634 | break; 635 | } 636 | ++itr_v1; 637 | ++itr_v2; 638 | } 639 | } 640 | 641 | // (3d) converged? 642 | if (num_swaps == 0) { 643 | break; 644 | } 645 | } 646 | } 647 | 648 | // (4) recurse. 
at most O(log n) recursion steps 649 | if (depth + 1 <= max_depth) { 650 | if (partition.n1 > 1) 651 | recursive_bisection_np( 652 | progress, partition.V1, num_queries, partition.n1, depth + 1,max_depth); 653 | if (partition.n2 > 1) 654 | recursive_bisection_np( 655 | progress, partition.V2, num_queries, partition.n2, depth + 1,max_depth); 656 | 657 | if (partition.n1 == 1) 658 | progress.done(1); 659 | if (partition.n2 == 1) 660 | progress.done(1); 661 | } else { 662 | progress.done(n); 663 | } 664 | } 665 | 666 | void recursive_bisection(progress_bar& progress, docid_node* G, 667 | size_t num_queries, size_t n, uint64_t depth,uint64_t max_depth) 668 | { 669 | // (1) create the initial partition. O(n) 670 | auto partition = initial_partition(G, n); 671 | 672 | { 673 | // (2) we compute deg1 and deg2 only once 674 | std::vector deg1(num_queries, 0); 675 | std::vector deg2(num_queries, 0); 676 | std::vector before(num_queries); 677 | std::vector left2right(num_queries); 678 | std::vector right2left(num_queries); 679 | 680 | std::vector query_changed(num_queries, 0); 681 | { 682 | cilk_spawn compute_deg(partition.V1, partition.n1, deg1, query_changed); 683 | compute_deg(partition.V2, partition.n2, deg2, query_changed); 684 | cilk_sync; 685 | } 686 | 687 | // (3) perform bisection. constant number of iterations 688 | for (int cur_iter = 1; cur_iter <= constants::MAX_ITER; cur_iter++) { 689 | // (3a) compute move gains 690 | auto gains = compute_move_gains(partition, num_queries, deg1, deg2, 691 | before, left2right, right2left, query_changed); 692 | memset(query_changed.data(), 0, num_queries); 693 | 694 | // (3b) sort by decreasing gain. O(n log n) 695 | { 696 | cilk_spawn std::sort(gains.V1.begin(), gains.V1.end()); 697 | std::sort(gains.V2.begin(), gains.V2.end()); 698 | cilk_sync; 699 | } 700 | 701 | // (3c) swap. 
O(n) 702 | size_t num_swaps = 0; 703 | { 704 | auto itr_v1 = gains.V1.begin(); 705 | auto itr_v2 = gains.V2.begin(); 706 | while (itr_v1 != gains.V1.end() && itr_v2 != gains.V2.end()) { 707 | if (itr_v1->gain + itr_v2->gain > 0) { 708 | // maybe we need to do something here to make 709 | // compute_move_gains() efficient? 710 | swap_nodes(itr_v1->node, itr_v2->node, deg1, deg2, 711 | query_changed); 712 | num_swaps++; 713 | } else { 714 | break; 715 | } 716 | ++itr_v1; 717 | ++itr_v2; 718 | } 719 | } 720 | 721 | // (3d) converged? 722 | if (num_swaps == 0) { 723 | break; 724 | } 725 | } 726 | } 727 | 728 | // (4) recurse. at most O(log n) recursion steps 729 | if (depth + 1 <= max_depth) { 730 | if (depth < constants::PARALLEL_SWITCH_DEPTH) { 731 | if (partition.n1 > 1) { 732 | cilk_spawn recursive_bisection(progress, partition.V1, 733 | num_queries, partition.n1, depth + 1,max_depth); 734 | } 735 | if (partition.n2 > 1) { 736 | recursive_bisection(progress, partition.V2, num_queries, 737 | partition.n2, depth + 1,max_depth); 738 | } 739 | cilk_sync; 740 | } else { 741 | if (partition.n1 > 1) { 742 | recursive_bisection_np(progress, partition.V1, num_queries, 743 | partition.n1, depth + 1,max_depth); 744 | } 745 | if (partition.n2 > 1) { 746 | recursive_bisection_np(progress, partition.V2, num_queries, 747 | partition.n2, depth + 1,max_depth); 748 | } 749 | } 750 | if (partition.n1 == 1) 751 | progress.done(1); 752 | if (partition.n2 == 1) 753 | progress.done(1); 754 | } else { 755 | progress.done(n); 756 | } 757 | } 758 | 759 | inverted_index reorder_docids_graph_bisection( 760 | inverted_index& invidx, size_t min_list_len) 761 | { 762 | auto num_lists = invidx.docids.size(); 763 | auto bg = construct_bipartite_graph(invidx, min_list_len); 764 | 765 | // free up some space 766 | invidx.clear(); 767 | 768 | // make things faster by precomputing some logs 769 | log2_precomp.resize(256); 770 | for(size_t i = 0; i < 256; i++) { log2_precomp[i] = log2f(i); } 771 | 772 
| { 773 | auto max_depth = std::max(1.0,ceil(log2(bg.num_docs)-5)); 774 | std::cout << "recursion depth = " << max_depth << std::endl; 775 | timer t("recursive_bisection"); 776 | progress_bar bp("recursive_bisection", bg.num_docs); 777 | recursive_bisection(bp, bg.graph.data(), bg.num_queries, bg.num_docs, 0, max_depth); 778 | } 779 | return recreate_invidx(bg, num_lists); 780 | } -------------------------------------------------------------------------------- /include/rgb/util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std::chrono; 12 | 13 | using postings_list = std::vector; 14 | 15 | struct inverted_index { 16 | size_t num_postings; 17 | uint32_t num_docs; 18 | uint32_t max_doc_id; 19 | std::vector docids; 20 | std::vector freqs; 21 | std::vector doc_lengths; 22 | std::vector doc_id_mapping; 23 | void resize(size_t new_size) 24 | { 25 | docids.resize(new_size); 26 | freqs.resize(new_size); 27 | } 28 | 29 | size_t size() const { return docids.size(); } 30 | 31 | void clear() 32 | { 33 | docids.resize(0); 34 | freqs.resize(0); 35 | doc_lengths.resize(0); 36 | doc_id_mapping.resize(0); 37 | } 38 | }; 39 | 40 | int tsfprintff(FILE* f, const char* format, ...) 
// Thread-safe fprintf: serializes writes through a function-local mutex and
// flushes after every call.
// FIX: these free functions are defined in a header; without `inline` every
// translation unit that includes util.hpp emits its own definition, which is
// an ODR violation / multiple-definition link error. Marked inline.
inline int tsfprintff(FILE* f, const char* format, ...)
{
    static std::mutex pmutex;
    std::lock_guard<std::mutex> lock(pmutex);
    va_list args;
    va_start(args, format);
    int ret = vfprintf(f, format, args);
    va_end(args);
    fflush(f);
    return ret;
}

// Scoped wall-clock timer: prints START(name) on construction and
// STOP(name) with elapsed seconds on destruction.
struct timer {
    std::chrono::high_resolution_clock::time_point start;
    std::string name;
    timer(const std::string& _n)
        : name(_n)
    {
        tsfprintff(stdout, "START(%s)\n", name.c_str());
        start = std::chrono::high_resolution_clock::now();
    }
    ~timer()
    {
        auto stop = std::chrono::high_resolution_clock::now();
        // milliseconds / 1000 -> seconds with 3 decimals
        tsfprintff(stdout, "STOP(%s) - %.3f sec\n", name.c_str(),
            std::chrono::duration_cast<std::chrono::milliseconds>(
                stop - start)
                    .count()
                / 1000.0f);
    }
};

// Console progress bar, updated concurrently by cilk workers. Redraws only
// when the integer percentage changes, to bound the output volume.
struct progress_bar {
    // NOTE(review): `start` was never read in the original either; kept for
    // layout compatibility.
    std::chrono::high_resolution_clock::time_point start;
    size_t total;
    size_t current;
    size_t cur_percent;
    // FIX: operator++ and done() previously each used their *own*
    // function-local static mutex, so they did not actually exclude each
    // other. One member mutex now guards current/cur_percent.
    std::mutex mtx;
    progress_bar(std::string str, size_t t)
        : total(t)
        , current(0)
        , cur_percent(0)
    {
        std::cout << str << ":" << std::endl;
        tsfprintff(stdout, "[ 0/100] |");
        for (size_t i = 0; i < 50; i++)
            tsfprintff(stdout, " ");
        tsfprintff(stdout, "|\r");
    }
    progress_bar& operator++()
    {
        std::lock_guard<std::mutex> lock(mtx);
        advance(1);
        return *this;
    }
    void done(size_t num)
    {
        std::lock_guard<std::mutex> lock(mtx);
        advance(num);
    }
    ~progress_bar()
    {
        tsfprintff(stdout, "[100/100] |");
        for (size_t i = 0; i < 50; i++)
            tsfprintff(stdout, "=");
        tsfprintff(stdout, ">|\n");
    }

private:
    // Must be called with mtx held. Adds `num` finished units and redraws
    // the bar when the integer percentage advanced. (This deduplicates the
    // identical bodies the original had in operator++ and done().)
    void advance(size_t num)
    {
        current += num;
        float fcp = float(current) / float(total) * 100;
        size_t cp = fcp; // truncate to a whole percent
        if (cp != cur_percent) {
            cur_percent = cp;
            tsfprintff(stdout, "[%3d/100] |", (int)cur_percent);
            size_t print_percent = cur_percent / 2; // bar is 50 chars wide
            for (size_t i = 0; i < print_percent; i++)
                tsfprintff(stdout, "=");
            tsfprintff(stdout, ">");
            for (size_t i = print_percent; i < 50; i++)
                tsfprintff(stdout, " ");
            tsfprintff(stdout, "|\r");
        }
    }
};

// Plain fprintf + fflush (no locking).
inline int fprintff(FILE* f, const char* format, ...)
{
    va_list args;
    va_start(args, format);
    int ret = vfprintf(f, format, args);
    va_end(args);
    fflush(f);
    return ret;
}

// Print "error: <message>[: strerror(errno)]" to stderr and terminate.
[[noreturn]] inline void quit(const char* format, ...)
{
    va_list args;
    va_start(args, format);
    fprintf(stderr, "error: ");
    vfprintf(stderr, format, args);
    va_end(args);
    if (errno != 0) {
        fprintf(stderr, ": %s\n", strerror(errno));
    } else {
        fprintf(stderr, "\n");
    }
    fflush(stderr);
    exit(EXIT_FAILURE);
}

// fopen() or die with a readable error.
inline FILE* fopen_or_fail(std::string file_name, const char* mode)
{
    FILE* out_file = fopen(file_name.c_str(), mode);
    if (!out_file) {
        // FIX: message used to say "output file" even when opening for
        // reading ("rb"); made generic.
        quit("opening file %s failed", file_name.c_str());
    }
    return out_file;
}

inline void fclose_or_fail(FILE* f)
{
    int ret = fclose(f);
    if (ret != 0) {
        quit("closing file failed");
    }
}

// Read one native-endian uint32_t. Returns 0 at EOF (callers use 0 as the
// end-of-stream sentinel); dies on a short read that is not EOF.
inline uint32_t read_u32(FILE* f)
{
    uint32_t x;
    // FIX: fread returns size_t; it was stored in int and printed with %d,
    // which is the wrong conversion for size_t.
    size_t ret = fread(&x, sizeof(uint32_t), 1, f);
    if (feof(f)) {
        return 0;
    }
    if (ret != 1) {
        quit("read u32 from file failed: %zu != %zu", ret, size_t(1));
    }
    return x;
}
%d != %d", ret, n); 194 | } 195 | } 196 | 197 | std::vector read_uint32_list(FILE* f) 198 | { 199 | uint32_t list_len = read_u32(f); 200 | if (list_len == 0) 201 | return std::vector(); 202 | std::vector list(list_len); 203 | read_u32s(f, list.data(), list_len); 204 | return list; 205 | } 206 | 207 | size_t write_u32(FILE* f, uint32_t x) 208 | { 209 | size_t ret = fwrite(&x, sizeof(uint32_t), 1u, f); 210 | if (ret != 1u) { 211 | quit("writing byte to file: %u != %u", ret, 1u); 212 | } 213 | return sizeof(uint32_t); 214 | } 215 | 216 | size_t write_u32s(FILE* f, uint32_t* buf, size_t n) 217 | { 218 | size_t ret = fwrite(buf, sizeof(uint32_t), n, f); 219 | if (ret != n) { 220 | quit("writing byte to file: %u != %u", ret, n); 221 | } 222 | return n * sizeof(uint32_t); 223 | } 224 | 225 | size_t write_uint32_list(FILE* f, std::vector& list) 226 | { 227 | size_t written_bytes = write_u32(f, list.size()); 228 | written_bytes += write_u32s(f, list.data(), list.size()); 229 | return written_bytes; 230 | } 231 | 232 | inverted_index read_ds2i_files(std::string ds2i_prefix) 233 | { 234 | inverted_index idx; 235 | std::string docs_file = ds2i_prefix + ".docs"; 236 | timer t("read input list from " + docs_file); 237 | auto df = fopen_or_fail(docs_file, "rb"); 238 | size_t num_docs = 0; 239 | size_t num_postings = 0; 240 | size_t num_lists = 0; 241 | uint32_t max_doc_id = 0; 242 | { 243 | // (1) skip the numdocs list 244 | read_uint32_list(df); 245 | // (2) keep reading lists 246 | while (!feof(df)) { 247 | const auto& list = read_uint32_list(df); 248 | size_t n = list.size(); 249 | if (n == 0) { 250 | break; 251 | } 252 | max_doc_id = std::max(max_doc_id, list.back()); 253 | num_lists++; 254 | num_postings += n; 255 | idx.docids.emplace_back(std::move(list)); 256 | } 257 | num_docs = max_doc_id + 1; 258 | } 259 | fclose_or_fail(df); 260 | std::string freqs_file = ds2i_prefix + ".freqs"; 261 | auto ff = fopen_or_fail(freqs_file, "rb"); 262 | { 263 | while (!feof(ff)) { 264 | 
const auto& list = read_uint32_list(ff); 265 | size_t n = list.size(); 266 | if (n == 0) { 267 | break; 268 | } 269 | idx.freqs.emplace_back(std::move(list)); 270 | } 271 | } 272 | fclose_or_fail(ff); 273 | idx.num_docs = num_docs; 274 | idx.max_doc_id = max_doc_id; 275 | idx.num_postings = num_postings; 276 | std::cout << "\tnum_docs = " << num_docs << std::endl; 277 | std::cout << "\tmax_doc_id = " << max_doc_id << std::endl; 278 | std::cout << "\tnum_lists = " << num_lists << std::endl; 279 | std::cout << "\tnum_postings = " << num_postings << std::endl; 280 | return idx; 281 | } 282 | 283 | void write_ds2i_files(inverted_index& idx, std::string ds2i_out_prefix) 284 | { 285 | std::string docs_file = ds2i_out_prefix + ".docs"; 286 | std::string freqs_file = ds2i_out_prefix + ".freqs"; 287 | std::string lens_file = ds2i_out_prefix + ".sizes"; 288 | std::string mapping_file = ds2i_out_prefix + ".mapping"; 289 | { 290 | auto df = fopen_or_fail(docs_file, "wb"); 291 | { 292 | // ds2i: 1st list contains num docs 293 | std::vector tmp(1); 294 | tmp[0] = idx.num_docs; 295 | write_uint32_list(df, tmp); 296 | } 297 | for (size_t i = 0; i < idx.docids.size(); i++) { 298 | write_uint32_list(df, idx.docids[i]); 299 | } 300 | fclose_or_fail(df); 301 | } 302 | { 303 | auto ff = fopen_or_fail(freqs_file, "wb"); 304 | for (size_t i = 0; i < idx.freqs.size(); i++) { 305 | write_uint32_list(ff, idx.freqs[i]); 306 | } 307 | fclose_or_fail(ff); 308 | } 309 | { 310 | auto sf = fopen_or_fail(lens_file, "wb"); 311 | write_uint32_list(sf, idx.doc_lengths); 312 | fclose_or_fail(sf); 313 | } 314 | { 315 | auto mf = fopen_or_fail(mapping_file, "w"); 316 | for (size_t i = 0; i < idx.doc_id_mapping.size(); ++i) { 317 | fprintff(mf, "%zu %zu\n", idx.doc_id_mapping[i], i); 318 | } 319 | fclose_or_fail(mf); 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: 
// Sum of log2 of the d-gaps of a docid list; the first element contributes
// log2(id + 1). Gaps below 256 are looked up in the precomputed table,
// larger gaps call log2f directly.
//
// FIX(robustness): returns 0.0 for an empty list — the original read
// ids[0] unconditionally, which is undefined behavior on an empty vector.
double comp_sum_log_gap(
    const std::vector<uint32_t>& ids, const std::vector<float>& log2_precomp)
{
    if (ids.empty())
        return 0.0;
    double sum_log_gaps = log2f(ids[0] + 1);
    for (size_t i = 1; i < ids.size(); i++) {
        // NOTE(review): assumes ids are strictly increasing; a zero gap
        // would contribute log2_precomp[0] = log2(0) = -inf.
        auto gap = ids[i] - ids[i - 1];
        if (gap < 256)
            sum_log_gaps += log2_precomp[gap];
        else
            sum_log_gaps += log2f(gap);
    }
    return sum_log_gaps;
}
= reorder_docids_graph_bisection(invidx, min_list_len); 61 | 62 | std::cout << "AFTER average LogGap " << compute_avg_loggap(reordered_invidx) 63 | << std::endl; 64 | 65 | { 66 | timer t("write ds2i files"); 67 | write_ds2i_files(reordered_invidx, ds2i_out_prefix); 68 | } 69 | 70 | return EXIT_SUCCESS; 71 | } -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpetri/recursive_graph_bisection/ac9fde47e45e01d5149de0dcf0500d677bd699d9/test/CMakeLists.txt --------------------------------------------------------------------------------