├── .gitignore ├── README ├── skarupke_binary_search.h ├── bitwise_binary_search.h ├── plot.ipynb └── eval.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.svg 3 | .ipynb_checkpoints 4 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This is supporting code for my blog post at https://orlp.net/blog/bitwise-binary-search/. 2 | -------------------------------------------------------------------------------- /skarupke_binary_search.h: -------------------------------------------------------------------------------- 1 | /* Copyright Malte Skarupke 2023. 2 | Boost Software License - Version 1.0 - August 17th, 2003 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | The copyright notices in the Software and this entire statement, including 10 | the above license grant, this restriction and the following disclaimer, 11 | must be included in all copies of the Software, in whole or in part, and 12 | all derivative works of the Software, unless such copies or derivative 13 | works are solely in the form of machine-executable object code generated by 14 | a source language processor. 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
/* IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.*/

#pragma once

#include <cstddef>
#include <functional>
#include <limits>

// <bit> provides std::countl_zero from C++20 on. It is included only when the
// toolchain ships it; otherwise leading_zeros() below uses a portable loop.
#if defined(__has_include)
#if __has_include(<bit>)
#include <bit>
#endif
#endif

namespace branchless_detail
{
// Number of leading zero bits in `i`; the full bit width when `i` is zero.
inline int leading_zeros(std::size_t i)
{
#if defined(__cpp_lib_bitops)
    return std::countl_zero(i);
#else
    int zeros = std::numeric_limits<std::size_t>::digits;
    for (; i != 0; i >>= 1)
        --zeros;
    return zeros;
#endif
}
} // namespace branchless_detail

// Largest power of two that is <= i. Returns 0 for i == 0 (the unguarded
// shift would be by -1, which is undefined).
inline std::size_t bit_floor(std::size_t i)
{
    if (i == 0)
        return 0;
    constexpr int num_bits = std::numeric_limits<std::size_t>::digits;
    return std::size_t(1) << (num_bits - branchless_detail::leading_zeros(i) - 1);
}

// Smallest power of two that is >= i. Returns 1 for i <= 1 (for i == 0 the
// unguarded shift would be by the full word width, which is undefined).
// The result must be representable in std::size_t.
inline std::size_t bit_ceil(std::size_t i)
{
    if (i <= 1)
        return 1;
    constexpr int num_bits = std::numeric_limits<std::size_t>::digits;
    return std::size_t(1) << (num_bits - branchless_detail::leading_zeros(i - 1));
}

// Binary search over the random-access range [begin, end), which must be
// partitioned with respect to compare(element, value). Returns the first
// position whose element does not satisfy compare(element, value), or `end`
// if every element satisfies it.
//
// The main loop halves a power-of-two step, so the only data-dependent branch
// is the initial one that handles lengths that are not a power of two.
template<typename It, typename T, typename Cmp>
It branchless_lower_bound(It begin, It end, const T & value, Cmp compare)
{
    std::size_t length = static_cast<std::size_t>(end - begin);
    if (length == 0)
        return end;
    std::size_t step = bit_floor(length);
    if (step != length && compare(begin[step], value))
    {
        // The answer lies strictly after begin[step]; continue in a
        // power-of-two sized window that ends exactly at `end`.
        length -= step + 1;
        if (length == 0)
            return end;
        step = bit_ceil(length);
        begin = end - step;
    }
    for (step /= 2; step != 0; step /= 2)
    {
        if (compare(begin[step], value))
            begin += step;
    }
    // begin itself has not been tested yet; step past it if it compares less.
    return begin + compare(*begin, value);
}

// Overload that orders elements with operator< (via std::less<>).
template<typename It, typename T>
It branchless_lower_bound(It begin, It end, const T & value)
{
    return branchless_lower_bound(begin, end, value, std::less<>{});
}
/*
    See https://orlp.net/blog/bitwise-binary-search.

    Copyright (c) 2023 Orson Peters

    This software is provided 'as-is', without any express or implied warranty. In
    no event will the authors be held liable for any damages arising from the use of
    this software.

    Permission is granted to anyone to use this software for any purpose, including
    commercial applications, and to alter it and redistribute it freely, subject to
    the following restrictions:

    1. The origin of this software must not be misrepresented; you must not claim
        that you wrote the original software. If you use this software in a product,
        an acknowledgment in the product documentation would be appreciated but is
        not required.

    2. Altered source versions must be plainly marked as such, and must not be
        misrepresented as being the original software.

    3. This notice may not be removed or altered from any source distribution.
*/

// Altered source version: std_bit_floor falls back to a portable loop when
// std::bit_width is not available (pre-C++20 toolchains).

#include <cstddef>

// <bit> provides std::bit_width from C++20 on; include it only when present.
#if defined(__has_include)
#if __has_include(<bit>)
#include <bit>
#endif
#endif


// More efficient shim for std::bit_floor: largest power of two <= n, 0 for n == 0.
inline std::size_t std_bit_floor(std::size_t n) {
    if (n == 0) return 0;
#if defined(__cpp_lib_bitops)
    return std::size_t(1) << (std::bit_width(n) - 1);
#else
    std::size_t floor = 1;
    while ((n >>= 1) != 0) floor <<= 1;
    return floor;
#endif
}


// Lower bound over the random-access range [begin, end) using comp(element, value).
// `b` is the greatest index found so far whose element compares before `value`,
// starting at -1 (stored wrapped in an unsigned type); b + 1 is the result.
// Probes past the end of the range are rejected by the `b + bit < n` check.
template<typename It, typename T, typename Cmp>
It lower_bound_pad(It begin, It end, const T& value, Cmp comp) {
    const std::size_t n = static_cast<std::size_t>(end - begin);
    std::size_t b = static_cast<std::size_t>(-1);
    for (std::size_t bit = std_bit_floor(n); bit != 0; bit >>= 1) {
        if (b + bit < n && comp(begin[b + bit], value)) b += bit;
    }
    return begin + (b + 1);
}


// Lower bound without a per-iteration bounds check: one initial probe at n / 2
// picks a starting `b` of either -1 or n - two_k, after which the loop adds
// each remaining bit below two_k at most once.
template<typename It, typename T, typename Cmp>
It lower_bound_overlap(It begin, It end, const T& value, Cmp comp) {
    const std::size_t n = static_cast<std::size_t>(end - begin);
    if (n == 0) return begin;

    const std::size_t two_k = std_bit_floor(n);
    std::size_t b = comp(begin[n / 2], value) ? n - two_k : static_cast<std::size_t>(-1);
    for (std::size_t bit = two_k >> 1; bit != 0; bit >>= 1) {
        if (comp(begin[b + bit], value)) b += bit;
    }
    return begin + (b + 1);
}


// Lower bound that probes begin[two_l - 1] once to choose between two windows:
// if that element compares before `value`, the last two_r - 1 elements are
// searched with two_k = two_r; otherwise the search starts at the original
// begin with two_k = two_l. The loop then runs over the bits below two_k.
template<typename It, typename T, typename Cmp>
It lower_bound_opt(It begin, It end, const T& value, Cmp comp) {
    const std::size_t n = static_cast<std::size_t>(end - begin);
    if (n == 0) return begin;

    const std::size_t two_r = std_bit_floor(n);
    // two_l equals two_r / 2 when the bit just below n's top bit is clear,
    // and two_r otherwise.
    const std::size_t two_l = two_r - ((two_r >> 1) & ~n);
    const bool use_r = comp(begin[two_l - 1], value);
    const std::size_t two_k = use_r ? two_r : two_l;
    begin = use_r ? end - (two_r - 1) : begin;

    std::size_t b = static_cast<std::size_t>(-1);
    for (std::size_t bit = two_k >> 1; bit != 0; bit >>= 1) {
        if (comp(begin[b + bit], value)) b += bit;
    }
    return begin + (b + 1);
}
"source": [ 56 | "algos = [\"lower_bound_\" + a for a in \"skarupke,pad,overlap,opt,std\".split(\",\")]\n", 57 | "\n", 58 | "std = comp_counts[comp_counts[\"algo\"] == \"lower_bound_std\"][\"num_cmp\"].mean()\n", 59 | "for algo in algos:\n", 60 | " data = comp_counts[comp_counts[\"algo\"] == algo]\n", 61 | " print(algo, data[\"num_cmp\"].mean() - std)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "9740fc78-5e00-4df4-80c9-45860bc25ed0", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "algos = [a for a in \"skarupke,pad,overlap,opt,std\".split(\",\")]\n", 72 | "for algo in algos:\n", 73 | " data = speed[speed[\"algo\"] == \"lower_bound_\" + algo]\n", 74 | " data = data[data[\"n\"] <= 256]\n", 75 | " plt.plot(data[\"n\"], data[\"nanosec\"], label = algo)\n", 76 | "\n", 77 | "plt.legend(loc=\"upper left\")\n", 78 | "plt.xlabel(\"n\")\n", 79 | "plt.ylabel(\"Nanoseconds\")\n", 80 | "plt.tight_layout()\n", 81 | "plt.savefig(\"runtime.svg\", bbox_inches='tight', pad_inches=0.05)\n", 82 | "plt.show()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "ce1f612f-460a-4eaa-9da9-1dfea4e29159", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "algos = [a for a in \"skarupke,pad,overlap,opt,std\".split(\",\")]\n", 93 | "for algo in algos:\n", 94 | " data = speed[speed[\"algo\"] == \"lower_bound_\" + algo]\n", 95 | " data = data[data[\"n\"] >= 3]\n", 96 | " plt.plot(data[\"n\"], data[\"nanosec\"], label = algo)\n", 97 | "\n", 98 | "plt.legend(loc=\"upper left\")\n", 99 | "plt.xlabel(\"n\")\n", 100 | "plt.ylabel(\"Nanoseconds\")\n", 101 | "plt.xscale(\"log\", base=2)\n", 102 | "plt.tight_layout()\n", 103 | "plt.savefig(\"runtime-large.svg\", bbox_inches='tight', pad_inches=0.05)\n", 104 | "plt.show()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "a6384e9e-abc5-4aac-9f47-de1c144b0b67", 111 | "metadata": {}, 112 | "outputs": 
[], 113 | "source": [ 114 | "algos = [a for a in \"linear,skarupke,pad,overlap,opt,std\".split(\",\")]\n", 115 | "for algo in algos:\n", 116 | " data = speed[speed[\"algo\"] == \"lower_bound_\" + algo]\n", 117 | " data = data[data[\"n\"] <= 32]\n", 118 | " plt.plot(data[\"n\"], data[\"nanosec\"], label = algo)\n", 119 | "\n", 120 | "plt.legend(loc=\"upper left\")\n", 121 | "plt.xlabel(\"n\")\n", 122 | "plt.ylabel(\"Nanoseconds\")\n", 123 | "plt.tight_layout()\n", 124 | "plt.savefig(\"runtime-small.svg\", bbox_inches='tight', pad_inches=0.05)\n", 125 | "plt.show()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "195d0264-9c31-4b93-81b9-f3599c50a921", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "algos = [a for a in \"skarupke,pad,overlap,opt,std\".split(\",\")]\n", 136 | "for algo in algos:\n", 137 | " data = speed_str[speed_str[\"algo\"] == \"lower_bound_\" + algo]\n", 138 | " data = data[data[\"n\"] <= 256]\n", 139 | " plt.plot(data[\"n\"], data[\"nanosec\"], label = algo)\n", 140 | "\n", 141 | "plt.legend(loc=\"upper left\")\n", 142 | "plt.ylim(20, 75)\n", 143 | "plt.xlabel(\"n\")\n", 144 | "plt.ylabel(\"Nanoseconds\")\n", 145 | "plt.tight_layout()\n", 146 | "plt.savefig(\"runtime-str.svg\", bbox_inches='tight', pad_inches=0.05)\n", 147 | "plt.show()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "id": "192eeafa-4038-4960-b165-ab37c0d2b980", 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "algos = [a for a in \"skarupke,pad,overlap,opt,std\".split(\",\")]\n", 158 | "for algo in algos:\n", 159 | " data = speed_str[speed_str[\"algo\"] == \"lower_bound_\" + algo]\n", 160 | " data = data[data[\"n\"] >= 3]\n", 161 | " plt.plot(data[\"n\"], data[\"nanosec\"], label = algo)\n", 162 | "\n", 163 | "plt.legend(loc=\"upper left\")\n", 164 | "plt.xlabel(\"n\")\n", 165 | "plt.ylabel(\"Nanoseconds\")\n", 166 | "plt.ylim(0, 300)\n", 167 | 
"plt.xscale(\"log\", base=2)\n", 168 | "plt.tight_layout()\n", 169 | "plt.savefig(\"runtime-str-large.svg\", bbox_inches='tight', pad_inches=0.05)\n", 170 | "plt.show()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "f1615e67-7351-4c86-abc5-0e1a4e1422a0", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "algos = [a for a in \"linear,skarupke,pad,overlap,opt,std\".split(\",\")]\n", 181 | "for algo in algos:\n", 182 | " data = speed_str[speed_str[\"algo\"] == \"lower_bound_\" + algo]\n", 183 | " data = data[data[\"n\"] <= 32]\n", 184 | " plt.plot(data[\"n\"], data[\"nanosec\"], label = algo)\n", 185 | "\n", 186 | "plt.legend(loc=\"upper left\")\n", 187 | "plt.xlabel(\"n\")\n", 188 | "plt.ylabel(\"Nanoseconds\")\n", 189 | "plt.tight_layout()\n", 190 | "plt.savefig(\"runtime-str-small.svg\", bbox_inches='tight', pad_inches=0.05)\n", 191 | "plt.show()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "fea4afce-0fab-4751-82a3-16ac15d65a39", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3 (ipykernel)", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.10.2" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 5 224 | } 225 | -------------------------------------------------------------------------------- /eval.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "skarupke_binary_search.h" 11 | #include 
"bitwise_binary_search.h" 12 | 13 | template 14 | It lower_bound_linear(It begin, It end, const T& value, Cmp comp) { 15 | size_t n = end - begin; 16 | size_t i = 0; 17 | while (i < n && comp(begin[i], value)) i += 1; 18 | return begin + i; 19 | } 20 | 21 | std::vector sorted_int(size_t n) { 22 | std::vector v; v.reserve(n); 23 | for (uint32_t i = 0; i < n; ++i) v.push_back(i); 24 | return v; 25 | } 26 | 27 | template 28 | std::vector to_strings(const std::vector& v, size_t width) { 29 | std::vector vs; vs.reserve(v.size()); 30 | for (auto i : v) { 31 | std::ostringstream ss; 32 | ss << std::setw(width) << std::setfill('0') << i << "\n"; 33 | vs.push_back(ss.str()); 34 | } 35 | return vs; 36 | } 37 | 38 | std::vector rand_ints(size_t n, size_t min, size_t max, std::mt19937_64& rng) { 39 | std::uniform_int_distribution<> distrib(min, max); 40 | std::vector v; v.reserve(n); 41 | for (size_t i = 0; i < n; ++i) v.push_back(distrib(rng)); 42 | return v; 43 | } 44 | 45 | 46 | struct CountCmp { 47 | size_t& num; 48 | 49 | CountCmp(size_t& num) : num(num) { } 50 | 51 | template 52 | bool operator()(const T& lhs, const T& rhs) const { 53 | this->num += 1; 54 | return lhs < rhs; 55 | } 56 | }; 57 | 58 | typedef const uint32_t* (*IntBoundF)(const uint32_t*, const uint32_t*, const uint32_t&, std::less); 59 | typedef const uint32_t* (*CountBoundF)(const uint32_t*, const uint32_t*, const uint32_t&, CountCmp); 60 | typedef const std::string* (*StrBoundF)(const std::string*, const std::string*, const std::string&, std::less); 61 | 62 | std::tuple algos[] = { 63 | { 64 | "lower_bound_linear", 65 | lower_bound_linear>, 66 | lower_bound_linear, 67 | lower_bound_linear> 68 | }, 69 | 70 | { 71 | "lower_bound_std", 72 | std::lower_bound>, 73 | std::lower_bound, 74 | std::lower_bound> 75 | }, 76 | 77 | { 78 | "lower_bound_pad", 79 | lower_bound_pad>, 80 | lower_bound_pad, 81 | lower_bound_pad> 82 | }, 83 | 84 | { 85 | "lower_bound_overlap", 86 | lower_bound_overlap>, 87 | 
lower_bound_overlap, 88 | lower_bound_overlap> 89 | }, 90 | 91 | { 92 | "lower_bound_opt", 93 | lower_bound_opt>, 94 | lower_bound_opt, 95 | lower_bound_opt> 96 | }, 97 | 98 | { 99 | "lower_bound_skarupke", 100 | branchless_lower_bound>, 101 | branchless_lower_bound, 102 | branchless_lower_bound> 103 | }, 104 | }; 105 | 106 | 107 | 108 | int main(int argc, char** argv) { 109 | std::mt19937_64 rng; 110 | 111 | // Correctness and comparison count. 112 | std::ofstream comp_count_out("comp-counts.csv"); 113 | comp_count_out << "algo,n,num_cmp\n"; 114 | for (auto [algo, bound_int, bound_cmp, bound_str] : algos) { 115 | for (size_t n = 0; n <= 256; ++n) { 116 | std::vector arr = sorted_int(n); 117 | 118 | size_t num_cmp = 0; 119 | CountCmp cmp(num_cmp); 120 | 121 | for (uint32_t rank = 0; rank <= n; ++rank) { 122 | size_t ret = bound_cmp(arr.data(), arr.data() + n, rank, cmp) - arr.data(); 123 | if (ret != rank) { 124 | std::cerr << "Algorithm " << algo << " returned incorrect result on size " << n << ".\n"; 125 | return 1; 126 | } 127 | } 128 | 129 | comp_count_out << algo << "," << n << "," << double(num_cmp) / double(n + 1) << std::endl; 130 | } 131 | } 132 | comp_count_out.close(); 133 | 134 | std::vector sizes; 135 | for (size_t n = 0; n < 256; ++n) { 136 | sizes.push_back(n); 137 | } 138 | double scale = std::pow(2.0, 1.0/12.0); 139 | for (double nf = 256.0; nf <= double(1 << 20) + 0.45; nf *= scale) { 140 | sizes.push_back(size_t(nf + 0.5)); 141 | } 142 | 143 | // Speed. 
144 | const size_t repeats = 1000000; 145 | std::ofstream speed_out("speed.csv"); 146 | speed_out << "algo,n,nanosec,dummy\n"; 147 | for (auto [algo, bound_int, bound_cmp, bound_str] : algos) { 148 | rng.seed(0xdeadbeef); 149 | 150 | for (auto n : sizes) { 151 | if (algo == "lower_bound_linear" and n > 256) continue; 152 | std::cout << "Benchmarking " << algo << " " << n << " (int).\n"; 153 | std::vector arr = sorted_int(n); 154 | std::vector ranks = rand_ints(repeats, 0, n, rng); 155 | 156 | size_t dummy = 0; // Prevents optimizing out. 157 | auto start = std::chrono::high_resolution_clock::now(); 158 | for (auto rank : ranks) { 159 | size_t ret = bound_int(arr.data(), arr.data() + n, rank, std::less()) - arr.data(); 160 | dummy += ret; 161 | } 162 | auto stop = std::chrono::high_resolution_clock::now(); 163 | 164 | uint64_t ns = std::chrono::duration_cast(stop - start).count(); 165 | speed_out << algo << "," << n << "," << double(ns) / double(repeats) << "," << dummy << std::endl; 166 | } 167 | } 168 | speed_out.close(); 169 | 170 | 171 | // Speed for strings. 172 | const size_t str_repeats = 300000; 173 | const size_t str_width = 4; 174 | std::ofstream speed_str_out("speed-str.csv"); 175 | speed_str_out << "algo,n,nanosec,dummy\n"; 176 | for (auto [algo, bound_int, bound_cmp, bound_str] : algos) { 177 | rng.seed(0xdeadbeef); 178 | 179 | for (auto n : sizes) { 180 | if (algo == "lower_bound_linear" and n > 256) continue; 181 | std::cout << "Benchmarking " << algo << " " << n << " (str).\n"; 182 | std::vector arr = to_strings(sorted_int(n), str_width); 183 | std::vector ranks = to_strings(rand_ints(str_repeats, 0, n, rng), str_width); 184 | 185 | size_t dummy = 0; // Prevents optimizing out. 
186 | auto start = std::chrono::high_resolution_clock::now(); 187 | for (auto rank : ranks) { 188 | size_t ret = bound_str(arr.data(), arr.data() + n, rank, std::less()) - arr.data(); 189 | dummy += ret; 190 | } 191 | auto stop = std::chrono::high_resolution_clock::now(); 192 | 193 | uint64_t ns = std::chrono::duration_cast(stop - start).count(); 194 | speed_str_out << algo << "," << n << "," << double(ns) / double(str_repeats) << "," << dummy << std::endl; 195 | } 196 | } 197 | speed_str_out.close(); 198 | 199 | return 0; 200 | } 201 | --------------------------------------------------------------------------------