├── .circleci └── config.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── src ├── bbsearch.c └── bbsearch.h └── test └── test_bbsearch.cpp /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | jobs: 4 | build: 5 | docker: 6 | - image: "debian:buster" 7 | steps: 8 | - checkout 9 | - run: 10 | name: Apt update 11 | command: 'apt-get update' 12 | - run: 13 | name: Install dependencies 14 | command: 'apt-get install -y gcc g++ cmake git' 15 | - run: 16 | name: "Pull Submodules" 17 | command: | 18 | git submodule init 19 | git submodule update --remote 20 | - run: 21 | name: Create make file 22 | command: 'mkdir cmake-build-release && cd cmake-build-release && cmake ..' 23 | - run: 24 | name: Building 25 | command: 'cd cmake-build-release && make' 26 | - run: 27 | name: Testing 28 | command: './cmake-build-release/test_bbsearch' 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cmake-build-* 2 | .idea 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extern/googletest"] 2 | path = extern/googletest 3 | url = https://github.com/google/googletest.git 4 | shallow = true 5 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | project(branchless_binary_search C CXX) 3 | set(CMAKE_C_STANDARD 11) 4 | enable_testing() 5 | 6 | #CMAKE_C_FLAGS_[DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL] 7 | #set(CMAKE_CXX_FLAGS "-O3 -Winline") 8 | 9 | # extern 10 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extern/googletest) 11 | 12 | # src 13 | add_library(bbsearch SHARED 14 | src/bbsearch.c 15 | src/bbsearch.h 16 | ) 17 | target_include_directories(bbsearch PUBLIC src) 18 | target_compile_options(bbsearch PRIVATE "-O3" "-Winline") 19 | 20 | # test 21 | add_executable(test_bbsearch 22 | test/test_bbsearch.cpp 23 | ) 24 | target_include_directories(test_bbsearch 25 | PUBLIC ${gtest_SOURCE_DIR}/include 26 | PUBLIC ${gtest_SOURCE_DIR} 27 | ) 28 | target_link_libraries(test_bbsearch 29 | PUBLIC bbsearch 30 | PUBLIC gtest 31 | PUBLIC gtest_main 32 | ) 33 | add_test(NAME test_bbsearch 34 | COMMAND test_bbsearch 35 | ) 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Branchless Binary Search 2 | [![CircleCI](https://circleci.com/gh/SiftScience/sift-java.svg?style=svg)](https://circleci.com/gh/SiftScience/sift-java) 3 | 4 | This is a proof-of-concept binary search implementation that entirely avoids branching, 5 | i.e. `if-then-else`. It accomplishes this with a mix of bit twiddling, 6 | jump tables, and access to the x86 `BSR` instruction through a compiler builtin. 7 | 8 | ## Background 9 | * [Binary search](https://en.wikipedia.org/wiki/Binary_search_algorithm) 10 | * [Branch prediction](https://en.wikipedia.org/wiki/Branch_predictor) 11 | 12 | ## Motivation 13 | Branch mispredictions can be expensive, especially on architectures with long 14 | pipelines. This approach avoids branching entirely, at the expense of code size 15 | and more overall instructions. 16 | 17 | Additionally, because the same code is run regardless of input, this approach can be 18 | extended to have SIMD/vector support, parallelizing searches across the same 19 | (or different) data. 20 | 21 | ## Implementation 22 | At a high level, the search works as follows: 23 | 24 | 1. compute `log2(length)` 25 | 2. `switch` on the result 26 | 3. Taking advantage of case fall-through, run a modified binary search update 27 | `log2(length)` times. 28 | 4. Interpret the values of the search window to get the result 29 | 30 | ### `log2(length)` call 31 | This is an optimization. The can work correctly by running `log2(sizeof(length))` 32 | (e.g. 32, 64) times. 33 | 34 | A standard binary search implicitly computes this empirically, essentially 35 | dividing `length` by 2 until it's 0. Because this implementation uses a jump 36 | table to avoid a loop condition, it needs to be known before the search. 37 | 38 | ```c 39 | unsigned int bits = 64 - __builtin_clzl(size); 40 | ``` 41 | 42 | ### Jump table 43 | This is functionally the same as the loop in an iterative binary search 44 | implementation⁠—it runs `update_search_window` `log2(length)` times. 45 | 46 | Without `bits % 64`, the compiler adds a check to make sure there's a case 47 | for `bits`. 48 | 49 | ```c 50 | switch (bits % 64) { 51 | case 63: update_search_window(array, &start, &end, key); 52 | case 62: update_search_window(array, &start, &end, key); 53 | case 61: update_search_window(array, &start, &end, key); 54 | ... 55 | ``` 56 | 57 | ### Window update function 58 | For the most part, the update function looks similar to a standard binary 59 | search, but instead of returning when the result is found, the window remains 60 | the same allowing the function to be run again. 61 | 62 | The second difference is more in implementation than function. Rather than 63 | using an `if-then-else` construct to updated `start` or `end`, both are 64 | updated regardless, with bit masks used to select the next value. 65 | 66 | ```c 67 | __attribute__((always_inline)) static inline void update_search_window( 68 | const int32_t *array, ptrdiff_t *start, ptrdiff_t *end, const int32_t key) { 69 | 70 | ptrdiff_t median = (*start + *end) / 2; 71 | int32_t value = array[median]; 72 | 73 | uintptr_t start_mask = -(key < value); 74 | uintptr_t end_mask = -(key > value); 75 | 76 | *start = (median & ~start_mask) | (*start & start_mask); 77 | *end = ((median & ~end_mask) | (*end & end_mask)) + 78 | (~start_mask & ~end_mask & 1); 79 | } 80 | ``` 81 | 82 | ### Computing the result 83 | Like a recursive binary search's base case, the result is determined by 84 | comparing the `key` being looked up the current window. Like in the window 85 | update function, bitmasks are used to avoid branching. 86 | 87 | Unlike `bsearch()`, but like Java's `Arrays.binarySearch()`, 88 | this returns `-insertion_point - 1` when `key` isn't found. 89 | 90 | ```c 91 | int is_match_mask = -(array[start] == key); 92 | int is_after_start = -(array[start] < key); 93 | int is_before_end = -(array[end] > key); 94 | 95 | return (is_empty_mask & -1) | (is_non_empty_mask & ( 96 | (is_match_mask & start) | (~is_match_mask & ( 97 | (is_after_start & (-end - 1)) | 98 | (~is_after_start & (is_before_end & (-start - 1))))))); 99 | ``` 100 | -------------------------------------------------------------------------------- /src/bbsearch.c: -------------------------------------------------------------------------------- 1 | #include "bbsearch.h" 2 | 3 | static int32_t EMPTY_INT32[1] = {INT32_MAX}; 4 | 5 | __attribute__((always_inline)) static inline void update_search_window(const int32_t *array, ptrdiff_t *start, ptrdiff_t *end, const int32_t key) { 6 | ptrdiff_t median = (*start + *end) / 2; 7 | int32_t value = array[median]; 8 | 9 | uintptr_t start_mask = -(key < value); 10 | uintptr_t end_mask = -(key > value); 11 | 12 | *start = (median & ~start_mask) | (*start & start_mask); 13 | *end = ((median & ~end_mask) | (*end & end_mask)) + 14 | (~start_mask & ~end_mask & 1); 15 | } 16 | 17 | ptrdiff_t bbsearch(int32_t *array, size_t size, int32_t key) { 18 | unsigned int bits = 64 - __builtin_clzl(size); 19 | int is_non_empty_mask = -(size > 0); 20 | int is_empty_mask = ~is_non_empty_mask; 21 | 22 | // __builtin_clzl has undefined behavior for 0, so set bits to 0 in that case 23 | bits = (unsigned int) is_non_empty_mask & bits; 24 | 25 | // Clear most bits to make a jump table more likely 26 | // The switch below has extra cases to avoid checking if bits is handled by 27 | // a case 28 | bits = bits % 64; 29 | 30 | ptrdiff_t start = 0; 31 | 32 | // use a dummy array if size == 0 33 | array = (int32_t *) (((uintptr_t) is_empty_mask & (uintptr_t) EMPTY_INT32) | 34 | ((uintptr_t) is_non_empty_mask & (uintptr_t) array)); 35 | 36 | ptrdiff_t end = is_non_empty_mask & size | is_empty_mask & 1; 37 | 38 | switch (bits) { // NOLINT(hicpp-multiway-paths-covered) 39 | case 63: update_search_window(array, &start, &end, key); 40 | case 62: update_search_window(array, &start, &end, key); 41 | case 61: update_search_window(array, &start, &end, key); 42 | case 60: update_search_window(array, &start, &end, key); 43 | case 59: update_search_window(array, &start, &end, key); 44 | case 58: update_search_window(array, &start, &end, key); 45 | case 57: update_search_window(array, &start, &end, key); 46 | case 56: update_search_window(array, &start, &end, key); 47 | case 55: update_search_window(array, &start, &end, key); 48 | case 54: update_search_window(array, &start, &end, key); 49 | case 53: update_search_window(array, &start, &end, key); 50 | case 52: update_search_window(array, &start, &end, key); 51 | case 51: update_search_window(array, &start, &end, key); 52 | case 50: update_search_window(array, &start, &end, key); 53 | case 49: update_search_window(array, &start, &end, key); 54 | case 48: update_search_window(array, &start, &end, key); 55 | case 47: update_search_window(array, &start, &end, key); 56 | case 46: update_search_window(array, &start, &end, key); 57 | case 45: update_search_window(array, &start, &end, key); 58 | case 44: update_search_window(array, &start, &end, key); 59 | case 43: update_search_window(array, &start, &end, key); 60 | case 42: update_search_window(array, &start, &end, key); 61 | case 41: update_search_window(array, &start, &end, key); 62 | case 40: update_search_window(array, &start, &end, key); 63 | case 39: update_search_window(array, &start, &end, key); 64 | case 38: update_search_window(array, &start, &end, key); 65 | case 37: update_search_window(array, &start, &end, key); 66 | case 36: update_search_window(array, &start, &end, key); 67 | case 35: update_search_window(array, &start, &end, key); 68 | case 34: update_search_window(array, &start, &end, key); 69 | case 33: update_search_window(array, &start, &end, key); 70 | case 32: update_search_window(array, &start, &end, key); 71 | case 31: update_search_window(array, &start, &end, key); 72 | case 30: update_search_window(array, &start, &end, key); 73 | case 29: update_search_window(array, &start, &end, key); 74 | case 28: update_search_window(array, &start, &end, key); 75 | case 27: update_search_window(array, &start, &end, key); 76 | case 26: update_search_window(array, &start, &end, key); 77 | case 25: update_search_window(array, &start, &end, key); 78 | case 24: update_search_window(array, &start, &end, key); 79 | case 23: update_search_window(array, &start, &end, key); 80 | case 22: update_search_window(array, &start, &end, key); 81 | case 21: update_search_window(array, &start, &end, key); 82 | case 20: update_search_window(array, &start, &end, key); 83 | case 19: update_search_window(array, &start, &end, key); 84 | case 18: update_search_window(array, &start, &end, key); 85 | case 17: update_search_window(array, &start, &end, key); 86 | case 16: update_search_window(array, &start, &end, key); 87 | case 15: update_search_window(array, &start, &end, key); 88 | case 14: update_search_window(array, &start, &end, key); 89 | case 13: update_search_window(array, &start, &end, key); 90 | case 12: update_search_window(array, &start, &end, key); 91 | case 11: update_search_window(array, &start, &end, key); 92 | case 10: update_search_window(array, &start, &end, key); 93 | case 9: update_search_window(array, &start, &end, key); 94 | case 8: update_search_window(array, &start, &end, key); 95 | case 7: update_search_window(array, &start, &end, key); 96 | case 6: update_search_window(array, &start, &end, key); 97 | case 5: update_search_window(array, &start, &end, key); 98 | case 4: update_search_window(array, &start, &end, key); 99 | case 3: update_search_window(array, &start, &end, key); 100 | case 2: update_search_window(array, &start, &end, key); 101 | case 1: update_search_window(array, &start, &end, key); 102 | case 0: {} 103 | } 104 | 105 | /* this is equivalent to the bitwise operations below 106 | if (is_empty_mask) { 107 | return -1; 108 | } else if (array[start] == key) { 109 | return start; 110 | } else if (array[start] < key) { 111 | return -end - 1; 112 | } else if (array[end] > key) { 113 | return -start - 1; 114 | } else { 115 | return 0; 116 | } 117 | */ 118 | int is_match_mask = -(array[start] == key); 119 | int is_after_start = -(array[start] < key); 120 | int is_before_end = -(array[end] > key); 121 | 122 | return (is_empty_mask & -1) | (is_non_empty_mask & ( 123 | (is_match_mask & start) | (~is_match_mask & ( 124 | (is_after_start & (-end - 1)) | 125 | (~is_after_start & (is_before_end & (-start - 1))))))); 126 | } 127 | -------------------------------------------------------------------------------- /src/bbsearch.h: -------------------------------------------------------------------------------- 1 | #ifndef BBSEARCH_H 2 | #define BBSEARCH_H 3 | 4 | #include 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | extern ptrdiff_t bbsearch(int32_t *array, size_t size, int32_t key); 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | #endif //BBSEARCH_H 18 | -------------------------------------------------------------------------------- /test/test_bbsearch.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "bbsearch.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | static const int TOTAL_TRIALS = 20000; 13 | 14 | static int compare_int32(const void * a, const void * b) { 15 | if (*(int32_t *) a < *(int32_t *) b) { 16 | return -1; 17 | } else if (*(int32_t *) a > *(int32_t *) b) { 18 | return 1; 19 | } else { 20 | return 0; 21 | } 22 | } 23 | 24 | TEST(BranchlessBinarySearchTest, RandomSparse) { 25 | std::default_random_engine generator; 26 | std::vector array; 27 | for (int trial = 0; trial < TOTAL_TRIALS; ++trial) { 28 | array.clear(); 29 | 30 | for (auto i = std::uniform_int_distribution(0, 1000)(generator); i >= 0; --i) { 31 | array.push_back(generator()); 32 | } 33 | 34 | int32_t key = array[0]; 35 | std::sort(array.begin(), array.end()); 36 | 37 | ptrdiff_t index = bbsearch(array.data(), array.size(), key); 38 | ASSERT_EQ(key, array[index]); 39 | } 40 | } 41 | 42 | TEST(BranchlessBinarySearchTest, RandomDense) { 43 | std::default_random_engine generator; 44 | std::vector array; 45 | 46 | for (int trial = 0; trial < TOTAL_TRIALS; ++trial) { 47 | array.clear(); 48 | 49 | for (auto i = std::uniform_int_distribution(0, 1000)(generator); i >= 0; --i) { 50 | array.push_back(generator() % 100); 51 | } 52 | 53 | int32_t key = array[0]; 54 | std::sort(array.begin(), array.end()); 55 | 56 | ptrdiff_t index = bbsearch(array.data(), array.size(), key); 57 | ASSERT_EQ(key, array[index]); 58 | } 59 | } 60 | 61 | TEST(BranchlessBinarySearchTest, Empty) { 62 | std::vector array; 63 | ptrdiff_t index = bbsearch(array.data(), array.size(), 42); 64 | ASSERT_EQ(-1, index); 65 | } 66 | 67 | TEST(BranchlessBinarySearchTest, MissingSparse) { 68 | std::default_random_engine generator; 69 | std::vector array; 70 | for (int trial = 0; trial < TOTAL_TRIALS; ++trial) { 71 | array.clear(); 72 | std::set set; 73 | 74 | for (auto i = std::uniform_int_distribution(0, 1000)(generator); i >= 0; --i) { 75 | auto val = generator(); 76 | set.insert(val); 77 | array.push_back(val); 78 | } 79 | 80 | std::sort(array.begin(), array.end()); 81 | 82 | int32_t key; 83 | do { 84 | key = generator(); 85 | } while (set.find(key) != set.end()); 86 | 87 | ptrdiff_t index = bbsearch(array.data(), array.size(), key); 88 | 89 | ASSERT_LT(index, 0); 90 | ptrdiff_t insertion_index = -(index + 1); 91 | if (insertion_index > 0) { 92 | ASSERT_GE(key, array[insertion_index - 1]); 93 | } 94 | if (insertion_index < array.size()) { 95 | ASSERT_LE(key,array[insertion_index]); 96 | } 97 | } 98 | } 99 | 100 | TEST(BranchlessBinarySearchTest, MissingDense) { 101 | std::default_random_engine generator; 102 | std::vector array; 103 | for (int trial = 0; trial < TOTAL_TRIALS; ++trial) { 104 | array.clear(); 105 | int32_t key = generator() % 100; 106 | 107 | for (auto i = std::uniform_int_distribution(0, 1000)(generator); i >= 0; --i) { 108 | int32_t val; 109 | while ((val = generator()) == key); 110 | array.push_back(val); 111 | } 112 | 113 | std::sort(array.begin(), array.end()); 114 | 115 | ptrdiff_t index = bbsearch(array.data(), array.size(), key); 116 | 117 | ASSERT_LT(index, 0); 118 | ptrdiff_t insertion_index = -(index + 1); 119 | if (insertion_index > 0) { 120 | ASSERT_GE(key, array[insertion_index - 1]); 121 | } 122 | if (insertion_index < array.size()) { 123 | ASSERT_LE(key,array[insertion_index]); 124 | } 125 | } 126 | } 127 | 128 | TEST(BranchlessBinarySearchTest, BenchmarkSparse) { 129 | std::default_random_engine generator; 130 | std::vector array; 131 | 132 | for (int i = 0; i < 65536; ++i) { 133 | array.push_back(i); 134 | } 135 | 136 | int64_t branchless_nanos = 0; 137 | int64_t branchless_trials = 0; 138 | int64_t builtin_nanos = 0; 139 | int64_t builtin_trials = 0; 140 | 141 | int use_branchless = 0; 142 | 143 | for (int trial = 0; trial < 1000000; ++trial) { 144 | int32_t key = std::uniform_int_distribution(0, array.size() - 1)(generator); 145 | 146 | auto begin = std::chrono::high_resolution_clock::now(); 147 | if (use_branchless) { 148 | bbsearch(array.data(), array.size(), key); 149 | } else { 150 | bsearch(&key, array.data(), array.size(), sizeof(array[0]), compare_int32); 151 | } 152 | auto end = std::chrono::high_resolution_clock::now(); 153 | 154 | if (use_branchless) { 155 | branchless_nanos += std::chrono::duration_cast(end-begin).count(); 156 | ++branchless_trials; 157 | } else { 158 | builtin_nanos += std::chrono::duration_cast(end-begin).count(); 159 | ++builtin_trials; 160 | } 161 | use_branchless = !use_branchless; 162 | } 163 | 164 | printf("mean branchless time: %ldns; mean builtin time: %ldns\n", 165 | branchless_nanos / branchless_trials, 166 | builtin_nanos / builtin_trials); 167 | } 168 | 169 | int main(int argc, char **argv) { 170 | ::testing::InitGoogleTest(&argc, argv); 171 | return RUN_ALL_TESTS(); 172 | } 173 | --------------------------------------------------------------------------------