├── LICENSE ├── README.md ├── c.bat ├── collectdata.py ├── makeplots.py └── search.cpp /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 stgatilov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # linear-vs-binary-search 2 | 3 | These are the materials for the blog post: 4 | https://dirtyhandscoding.github.io/posts/performance-comparison-linear-search-vs-binary-search.html 5 | 6 | It compares several implementations of linear or binary search intended to find a position within a sorted array. 7 | Here is the main plot with results (Broadwell 2 GHz CPU); see the blog post for more information. 
8 | 9 | 10 | 11 | 12 | Contents 13 | -------- 14 | 15 | All the C++ code is in the `search.cpp` file; see the comments inside it for more details. 16 | 17 | The tiny script `c.bat` gives the command line for compiling `search.cpp` using the MSVC compiler. Aside from the obvious `/O2` setting, `/arch:AVX2` and `/D NDEBUG` are also important for proper performance measurements. 18 | It also contains a commented-out command line for GCC compilation, which additionally needs one specific flag: `-fno-strict-overflow`. 19 | 20 | The Python scripts work as follows (Python 3 is required). 21 | First you run `collectdata.py`, which takes the `search.cpp` source, copies it into `search_tmp.cpp` with some changes, compiles it using the `c.bat` script and runs `search_tmp.exe` to generate text logs in the `/res` subdirectory. 22 | Then you run `makeplots.py`, which takes all logs in the `/res` subdirectory and generates PNG images with plots (in the same subdirectory). 23 | -------------------------------------------------------------------------------- /c.bat: -------------------------------------------------------------------------------- 1 | cl search.cpp /O2 /W2 /EHsc /D _CRT_SECURE_NO_DEPRECATE /D NDEBUG /arch:AVX2 /FAs 2 | rem g++ search.cpp --std=c++11 -fno-strict-overflow -O3 -D NDEBUG -mavx2 -o search_gcc.exe 3 | -------------------------------------------------------------------------------- /collectdata.py: -------------------------------------------------------------------------------- 1 | import re, os, glob 2 | 3 | def CollectData(N, mem): 4 | for fn in glob.glob("search_tmp.*"): 5 | os.remove(fn) 6 | 7 | with open("search.cpp", "rt") as f: 8 | src = f.read() 9 | ints = mem / 8 10 | src = re.sub(r"const int SIZE = (\d*);", r"const int SIZE = %d;" % N, src) 11 | src = re.sub(r"const int ARR_SAMPLES = (\(.*?\))", r"const int ARR_SAMPLES = %d" % ints, src) 12 | src = re.sub(r"const int KEY_SAMPLES = (\(.*?\))", r"const int KEY_SAMPLES = %d" % ints, src) 13 | with open("search_tmp.cpp", "wt") as f: 14 | f.write(src) 
15 | 16 | with open("c.bat", "rt") as f: 17 | bat = f.read() 18 | bat = bat.replace("search.cpp", "search_tmp.cpp") 19 | os.system(bat) 20 | 21 | logname = "res_%04d_%d.log" % (N, mem) 22 | os.system("search_tmp >res/" + logname) 23 | os.system("search_tmp >res/" + logname) 24 | 25 | 26 | for fn in glob.glob("res/*"): 27 | os.remove(fn) 28 | 29 | sizes = [16, 32, 64, 128, 256, 512, 1024] 30 | #sizes = [128, 256, 512, 1024, 2048, 4096] 31 | for s in sizes: 32 | CollectData(s, 64<<10) 33 | # CollectData(s, 512<<10) 34 | -------------------------------------------------------------------------------- /makeplots.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.ticker as ticker 3 | import sys, os, glob, re, math 4 | 5 | 6 | results = {} 7 | def AddRes(name, mem, N, eltime): 8 | global results 9 | results.setdefault(mem, {}).setdefault(name, {})[N] = eltime 10 | 11 | 12 | def ReadRes(fn): 13 | with open(fn, "rt") as f: 14 | data = f.read() 15 | 16 | match = re.search(r"Arrays: (\d*) x (\d*)", data) 17 | N = int(match.group(2)) 18 | match = re.search(r"Memory: (\d*)", data) 19 | mem = int(match.group(1)) 20 | 21 | for match in re.finditer(r"\s*([0-9.]+)\s*ns\s*:\s*(\S+)", data): 22 | eltime = float(match.group(1)) 23 | name = match.group(2) 24 | AddRes(name, mem, N, eltime) 25 | 26 | 27 | for fn in glob.glob("res/*.log"): 28 | ReadRes(fn) 29 | 30 | # plt.loglog([1,2,3,4], [1,4,9,16], 'bo', [1,2,3,4], [16,9,9,10], 'ro', basex=2, basey=2, linestyle='-') 31 | # plt.show() 32 | 33 | styles = ['yx', 'rx', 'r+', 'mx', 'm+', 'k.', 'ko', 'bo', 'bs', 'yo', 'g*', 'gP', 'gd', 'm*', 'c*'] 34 | 35 | dpi = 150 36 | 37 | for mem, graphs in results.items(): 38 | args = [] 39 | names = [] 40 | argsPE = [] 41 | argsLog = [] 42 | 43 | idx = 0 44 | for name, graph in graphs.items(): 45 | if ('linear' in name and 'scalar' in name): 46 | continue 47 | X = [] 48 | Y = [] 49 | Z = [] 50 | W = [] 51 | 
for N, eltime in graph.items(): 52 | X.append(N) 53 | Y.append(eltime) 54 | Z.append(eltime / N) 55 | W.append(eltime / math.log(N, 2.0)) 56 | args += [X, Y, styles[idx]] 57 | argsPE += [X, Z, styles[idx]] 58 | argsLog += [X, W, styles[idx]] 59 | names.append(name) 60 | idx += 1 61 | print("%s: %s" % (name, args[-1])) 62 | 63 | 64 | title = "(memory = %dB)" % mem 65 | if len(sys.argv) > 1: 66 | title = sys.argv[1] + " " + title 67 | 68 | ax = plt.axes() 69 | ax.set_title(title) 70 | ax.loglog(*args, basex=2, basey=2, linestyle='-') 71 | ax.set_xlabel("Array length (N)") 72 | ax.set_ylabel("Time per search, ns") 73 | ax.grid(True, which="major") 74 | ax.grid(True, which="minor", color='0.8', linestyle=':') 75 | ax.legend(names, loc=2, prop={'size': 6}) 76 | ax.get_yaxis().get_minor_locator().subs([1.25, 1.5, 1.75]) 77 | ax.get_yaxis().set_minor_formatter(ticker.FuncFormatter(lambda x,p: str(int(x)))) 78 | ax.get_yaxis().set_major_formatter(ticker.ScalarFormatter()) 79 | ax.get_xaxis().set_major_formatter(ticker.ScalarFormatter()) 80 | #plt.show() 81 | plt.savefig('res/plot_search_%d.png' % mem, bbox_inches='tight', dpi=dpi) 82 | plt.gcf().clear() 83 | 84 | ax = plt.axes() 85 | ax.set_title(title) 86 | ax.semilogx(*argsPE, basex=2, linestyle='-') 87 | ax.set_xlabel("Array length (N)") 88 | ax.set_ylabel("Time per element, ns") 89 | ax.grid(True, which="major") 90 | ax.grid(True, which="minor", color='0.8', linestyle=':') 91 | ax.legend(names, loc=1, prop={'size': 6}) 92 | ax.set_ylim(0.0, 0.5) 93 | ax.get_yaxis().set_minor_locator(ticker.MultipleLocator(0.01)) 94 | ax.get_yaxis().tick_right() 95 | ax.get_xaxis().set_major_formatter(ticker.ScalarFormatter()) 96 | #plt.show() 97 | plt.savefig('res/plot_elem_%d.png' % mem, bbox_inches='tight', dpi=dpi) 98 | plt.gcf().clear() 99 | 100 | ax = plt.axes() 101 | ax.set_title(title) 102 | ax.semilogx(*argsLog, basex=2, linestyle='-') 103 | ax.set_xlabel("Array length (N)") 104 | ax.set_ylabel("Time per one bin.search 
comparison, ns") 105 | ax.grid(True, which="major") 106 | ax.grid(True, which="minor", color='0.8', linestyle=':') 107 | ax.legend(names, loc=2, prop={'size': 6}) 108 | ax.set_ylim(1.0, 7.0) 109 | ax.get_yaxis().set_minor_locator(ticker.MultipleLocator(0.5)) 110 | ax.get_yaxis().tick_right() 111 | ax.get_xaxis().set_major_formatter(ticker.ScalarFormatter()) 112 | #plt.show() 113 | plt.savefig('res/plot_log_%d.png' % mem, bbox_inches='tight', dpi=dpi) 114 | plt.gcf().clear() 115 | -------------------------------------------------------------------------------- /search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | //little-endian _MM_SHUFFLE 11 | #define SHUF(i0, i1, i2, i3) (i0 + i1*4 + i2*16 + i3*64) 12 | 13 | #ifdef _MSC_VER 14 | #define FORCEINLINE __forceinline 15 | #define NOINLINE __declspec(noinline) 16 | #define ALIGN(n) __declspec(align(n)) 17 | FORCEINLINE uint32_t bsr(uint32_t x) { 18 | unsigned long res; 19 | _BitScanReverse(&res, x); 20 | return res; 21 | } 22 | FORCEINLINE uint32_t bsf(uint32_t x) { 23 | unsigned long res; 24 | _BitScanForward(&res, x); 25 | return res; 26 | } 27 | #else 28 | #define FORCEINLINE __attribute__((always_inline)) inline 29 | #define NOINLINE __attribute__((noinline)) 30 | #define ALIGN(n) __attribute__((aligned(n))) 31 | FORCEINLINE uint32_t bsr(uint32_t x) { 32 | return 31 - __builtin_clz(x); 33 | } 34 | FORCEINLINE uint32_t bsf(uint32_t x) { 35 | return __builtin_ctz(x); 36 | } 37 | #endif 38 | 39 | //if true, then average latency of one search is measured 40 | //if false, then average throughput performance of one search is measured 41 | //implementation-wise, setting it makes the index of the next array/key 42 | //to be searched dependent on the answer of the current search 43 | #define MEASURE_LATENCY false 44 | 45 | //controls inlining of all search functions being 
tested 46 | //NOINLINE means that inlining is forbidden: 47 | //in this case benchmarking code is less likely to influence search performance 48 | #define TESTINLINE NOINLINE 49 | 50 | //======================= search implementations ======================= 51 | 52 | static TESTINLINE int binary_search_std (const int *arr, int n, int key) { 53 | return std::lower_bound(arr, arr + n, key) - arr; 54 | } 55 | 56 | static TESTINLINE int binary_search_simple (const int *arr, int n, int key) { 57 | intptr_t left = -1; 58 | intptr_t right = n; 59 | while (right - left > 1) { 60 | intptr_t middle = (left + right) >> 1; 61 | if (arr[middle] < key) 62 | left = middle; 63 | else 64 | right = middle; 65 | } 66 | return right; 67 | } 68 | 69 | intptr_t MINUS_ONE = -1; 70 | static TESTINLINE int binary_search_branchless (const int *arr, int n, int key) { 71 | assert((n & (n+1)) == 0); //n = 2^k - 1 72 | //intptr_t pos = -1; //generates "or r9, -1" on MSVC -- false dependency harms throughput 73 | intptr_t pos = MINUS_ONE; //workaround for MSVC: generates mov without dependency 74 | intptr_t logstep = bsr(n); 75 | intptr_t step = intptr_t(1) << logstep; 76 | while (step > 0) { 77 | pos = (arr[pos + step] < key ? 
pos + step : pos); 78 | step >>= 1; 79 | } 80 | return pos + 1; 81 | } 82 | 83 | template static TESTINLINE int binary_search_branchless_UR (const int *arr, int n, int key) { 84 | assert((n & (n+1)) == 0); //n = 2^k - 1 85 | assert(n+1 == MAXN); 86 | assert(n < (1<<20)); 87 | 88 | //intptr_t pos = -1; 89 | intptr_t pos = MINUS_ONE; 90 | #define STEP(logstep) \ 91 | if ((1< static TESTINLINE int linear_search_sse_UR (const int *arr, int n, int key) { 141 | assert(size_t(arr) % 16 == 0); 142 | assert(n <= 1024); 143 | __m128i vkey = _mm_set1_epi32(key); 144 | __m128i cnt = _mm_setzero_si128(); 145 | intptr_t i = 0; 146 | #define STEP \ 147 | if (i < MAXN) {\ 148 | __m128i mask0 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+0]), vkey); \ 149 | __m128i mask1 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+4]), vkey); \ 150 | __m128i mask2 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+8]), vkey); \ 151 | __m128i mask3 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+12]), vkey); \ 152 | __m128i sum = _mm_add_epi32(_mm_add_epi32(mask0, mask1), _mm_add_epi32(mask2, mask3)); \ 153 | cnt = _mm_sub_epi32(cnt, sum); \ 154 | } i += 16; 155 | STEP STEP STEP STEP STEP STEP STEP STEP 156 | STEP STEP STEP STEP STEP STEP STEP STEP 157 | STEP STEP STEP STEP STEP STEP STEP STEP 158 | STEP STEP STEP STEP STEP STEP STEP STEP 159 | STEP STEP STEP STEP STEP STEP STEP STEP 160 | STEP STEP STEP STEP STEP STEP STEP STEP 161 | STEP STEP STEP STEP STEP STEP STEP STEP 162 | STEP STEP STEP STEP STEP STEP STEP STEP 163 | #undef STEP 164 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(2, 3, 0, 1))); 165 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(1, 0, 3, 2))); 166 | return _mm_cvtsi128_si32(cnt); 167 | } 168 | 169 | static TESTINLINE int linear_search_avx (const int *arr, int n, int key) { 170 | assert(size_t(arr) % 32 == 0); 171 | __m256i vkey = _mm256_set1_epi32(key); 172 | __m256i cnt = _mm256_setzero_si256(); 173 | for (int i = 0; i < n; i += 16) { 174 | 
__m256i mask0 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+0])); 175 | __m256i mask1 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+8])); 176 | __m256i sum = _mm256_add_epi32(mask0, mask1); 177 | cnt = _mm256_sub_epi32(cnt, sum); 178 | } 179 | __m128i xcnt = _mm_add_epi32(_mm256_extracti128_si256(cnt, 1), _mm256_castsi256_si128(cnt)); 180 | xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(2, 3, 0, 1))); 181 | xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(1, 0, 3, 2))); 182 | return _mm_cvtsi128_si32(xcnt); 183 | } 184 | 185 | template static TESTINLINE int linear_search_avx_UR (const int *arr, int n, int key) { 186 | assert(size_t(arr) % 32 == 0); 187 | assert(n <= 1024); 188 | __m256i vkey = _mm256_set1_epi32(key); 189 | __m256i cnt = _mm256_setzero_si256(); 190 | intptr_t i = 0; 191 | #define STEP \ 192 | if (i < MAXN) {\ 193 | __m256i mask0 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+0])); \ 194 | __m256i mask1 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+8])); \ 195 | __m256i sum = _mm256_add_epi32(mask0, mask1); \ 196 | cnt = _mm256_sub_epi32(cnt, sum); \ 197 | } i += 16; 198 | STEP STEP STEP STEP STEP STEP STEP STEP 199 | STEP STEP STEP STEP STEP STEP STEP STEP 200 | STEP STEP STEP STEP STEP STEP STEP STEP 201 | STEP STEP STEP STEP STEP STEP STEP STEP 202 | STEP STEP STEP STEP STEP STEP STEP STEP 203 | STEP STEP STEP STEP STEP STEP STEP STEP 204 | STEP STEP STEP STEP STEP STEP STEP STEP 205 | STEP STEP STEP STEP STEP STEP STEP STEP 206 | #undef STEP 207 | __m128i xcnt = _mm_add_epi32(_mm256_extracti128_si256(cnt, 1), _mm256_castsi256_si128(cnt)); 208 | xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(2, 3, 0, 1))); 209 | xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(1, 0, 3, 2))); 210 | return _mm_cvtsi128_si32(xcnt); 211 | } 212 | 213 | // from https://stackoverflow.com/questions/2741859/how-fast-can-you-make-linear-search 214 | // type of "i" changed 
from int to intptr_t 215 | static TESTINLINE int linearX_search_scalar (const int *arr, int n, int key) { 216 | intptr_t i = 0; 217 | while (i < n) { 218 | if (arr[i] >= key) 219 | break; 220 | ++i; 221 | } 222 | return i; 223 | } 224 | 225 | // from https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/ 226 | // type of "i" changed from int to intptr_t 227 | static TESTINLINE int linearX_search_sse (const int *arr, int n, int key) { 228 | __m128i *in_data = (__m128i*)arr; 229 | __m128i key4 = _mm_set1_epi32(key); 230 | intptr_t i = 0; 231 | int res; 232 | for (;;) { 233 | __m128i cmp0 = _mm_cmpgt_epi32 (key4, in_data [i + 0]); 234 | __m128i cmp1 = _mm_cmpgt_epi32 (key4, in_data [i + 1]); 235 | __m128i cmp2 = _mm_cmpgt_epi32 (key4, in_data [i + 2]); 236 | __m128i cmp3 = _mm_cmpgt_epi32 (key4, in_data [i + 3]); 237 | __m128i pack01 = _mm_packs_epi32 (cmp0, cmp1); 238 | __m128i pack23 = _mm_packs_epi32 (cmp2, cmp3); 239 | __m128i pack0123 = _mm_packs_epi16 (pack01, pack23); 240 | res = _mm_movemask_epi8 (pack0123); 241 | if (res != 0xFFFF) 242 | break; 243 | i += 4; 244 | } 245 | return i * 4 + bsf(~res); 246 | } 247 | 248 | //additional versions (experimental) 249 | 250 | static TESTINLINE int hybrid_search (const int *arr, int n, int key) { 251 | assert((n & (n+1)) == 0); //n = 2^k - 1 252 | intptr_t pos = MINUS_ONE; 253 | intptr_t logstep = bsr(n); 254 | intptr_t step = intptr_t(1) << logstep; 255 | while (step > 8) { 256 | pos = (arr[pos + step] < key ? 
pos + step : pos); 257 | step >>= 1; 258 | } 259 | pos++; 260 | step <<= 1; 261 | 262 | assert(size_t(arr) % 16 == 0); 263 | assert(pos % 16 == 0); 264 | __m128i vkey = _mm_set1_epi32(key); 265 | __m128i mask0 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+0]), vkey); 266 | __m128i mask1 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+4]), vkey); 267 | __m128i mask2 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+8]), vkey); 268 | __m128i mask3 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+12]), vkey); 269 | __m128i cnt = _mm_add_epi32(_mm_add_epi32(mask0, mask1), _mm_add_epi32(mask2, mask3)); 270 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(2, 3, 0, 1))); 271 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(1, 0, 3, 2))); 272 | 273 | return pos - _mm_cvtsi128_si32(cnt); 274 | } 275 | 276 | static TESTINLINE int hybridX_search (const int *arr, int n, int key) { 277 | assert((n & (n+1)) == 0); //n = 2^k - 1 278 | intptr_t pos = MINUS_ONE; 279 | intptr_t logstep = bsr(n); 280 | intptr_t step = intptr_t(1) << logstep; 281 | while (step > 8) { 282 | pos = (arr[pos + step] < key ? 
pos + step : pos); 283 | step >>= 1; 284 | } 285 | pos++; 286 | step <<= 1; 287 | 288 | assert(size_t(arr) % 16 == 0); 289 | assert(pos % 16 == 0); 290 | __m128i vkey = _mm_set1_epi32(key); 291 | __m128i cnt = _mm_setzero_si128(); 292 | __m128i cmp0 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+0])); 293 | __m128i cmp1 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+4])); 294 | __m128i cmp2 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+8])); 295 | __m128i cmp3 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+12])); 296 | __m128i pack01 = _mm_packs_epi32 (cmp0, cmp1); 297 | __m128i pack23 = _mm_packs_epi32 (cmp2, cmp3); 298 | __m128i pack0123 = _mm_packs_epi16 (pack01, pack23); 299 | uint32_t res = _mm_movemask_epi8 (pack0123); 300 | 301 | return pos + bsf(~res); 302 | } 303 | 304 | static TESTINLINE int binary_search_branchlessM (const int *arr, int n, int key) { 305 | assert((n & (n+1)) == 0); //n = 2^k - 1 306 | //intptr_t pos = -1; //generates "or r9, -1" on MSVC -- false dependency harms throughput 307 | intptr_t pos = MINUS_ONE; //workaround for MSVC: generates mov without dependency 308 | intptr_t logstep = bsr(n); 309 | intptr_t step = intptr_t(1) << logstep; 310 | while (step > 0) { 311 | pos += (arr[pos + step] < key) * step; 312 | step >>= 1; 313 | } 314 | return pos + 1; 315 | } 316 | 317 | static TESTINLINE int binary_search_branchlessA (const int *arr, int n, int key) { 318 | assert((n & (n+1)) == 0); //n = 2^k - 1 319 | //intptr_t pos = -1; //generates "or r9, -1" on MSVC -- false dependency harms throughput 320 | intptr_t pos = MINUS_ONE; //workaround for MSVC: generates mov without dependency 321 | intptr_t logstep = bsr(n); 322 | intptr_t step = intptr_t(1) << logstep; 323 | while (step > 0) { 324 | pos += (-(arr[pos + step] < key)) & step; 325 | step >>= 1; 326 | } 327 | return pos + 1; 328 | } 329 | 330 | static TESTINLINE int binary_search_branchlessS (const int *arr, int n, int key) { 331 | 
assert((n & (n+1)) == 0); //n = 2^k - 1 332 | //intptr_t pos = -1; //generates "or r9, -1" on MSVC -- false dependency harms throughput 333 | intptr_t pos = MINUS_ONE; //workaround for MSVC: generates mov without dependency 334 | intptr_t logstep = bsr(n); 335 | intptr_t step = intptr_t(1) << logstep; 336 | while (step > 0) { 337 | pos += ((arr[pos + step] - key) >> 31) & step; 338 | step >>= 1; 339 | } 340 | return pos + 1; 341 | } 342 | 343 | static TESTINLINE int binary_search_branchless_pre (const int *arr, int n, int key) { 344 | assert((n & (n+1)) == 0); //n = 2^k - 1 345 | //intptr_t pos = -1; //generates "or r9, -1" on MSVC -- false dependency harms throughput 346 | intptr_t pos = MINUS_ONE; //workaround for MSVC: generates mov without dependency 347 | intptr_t logstep = bsr(n); 348 | intptr_t step = intptr_t(1) << logstep; 349 | int pivot = arr[pos + step]; 350 | while (step > 1) { 351 | intptr_t nextstep = step >> 1; 352 | int pivotL = arr[pos + nextstep]; 353 | int pivotR = arr[pos + step + nextstep]; 354 | pos = (pivot < key ? pos + step : pos); 355 | pivot = (pivot < key ? pivotR : pivotL); 356 | step = nextstep; 357 | } 358 | pos = (pivot < key ? pos + step : pos); 359 | return pos + 1; 360 | } 361 | 362 | static TESTINLINE int quaternary_search_branchless (const int *arr, int n, int key) { 363 | assert((n & (n+1)) == 0); //n = 2^k - 1 364 | //intptr_t pos = -1; //generates "or r9, -1" on MSVC -- false dependency harms throughput 365 | intptr_t pos = MINUS_ONE; //workaround for MSVC: generates mov without dependency 366 | intptr_t logstep = bsr(n) - 1; 367 | intptr_t step = intptr_t(1) << logstep; 368 | while (step > 0) { 369 | int pivotL = arr[pos + step * 1]; 370 | int pivotM = arr[pos + step * 2]; 371 | int pivotR = arr[pos + step * 3]; 372 | pos = (pivotL < key ? pos + step : pos); 373 | pos = (pivotM < key ? pos + step : pos); 374 | pos = (pivotR < key ? pos + step : pos); 375 | step >>= 2; 376 | } 377 | pos = (arr[pos + 1] < key ? 
pos + 1 : pos); 378 | return pos + 1; 379 | } 380 | 381 | static TESTINLINE int quaternary_search_branchless2 (const int *arr, int n, int key) { 382 | assert((n & (n+1)) == 0); //n = 2^k - 1 383 | //intptr_t pos = -1; //generates "or r9, -1" on MSVC -- false dependency harms throughput 384 | intptr_t pos = MINUS_ONE; //workaround for MSVC: generates mov without dependency 385 | intptr_t logstep = bsr(n); 386 | intptr_t step2 = intptr_t(1) << logstep; 387 | intptr_t step = step2 >> 1; 388 | while (step > 0) { 389 | int pivotL = arr[pos + step]; 390 | int pivotM = arr[pos + step2]; 391 | int pivotR = arr[pos + step2 + step]; 392 | pos = (pivotM < key ? pos + step2 : pos); 393 | int pivotX = (pivotM < key ? pivotR : pivotL); 394 | pos = (pivotX < key ? pos + step : pos); 395 | step >>= 2; 396 | step2 >>= 2; 397 | } 398 | pos = (arr[pos + 1] < key ? pos + 1 : pos); 399 | return pos + 1; 400 | } 401 | 402 | //======================= testing code ======================= 403 | 404 | //length of each input array (including one sentinel element) 405 | //must be power of two 406 | static const int SIZE = 64; 407 | //how many searches are done in benchmark 408 | static const int TRIES = (1<<30) / SIZE; 409 | //number of pre-generated input arrays (rotated cyclically) 410 | static const int ARR_SAMPLES = (4<<10) / SIZE; 411 | //number of pre-generated keys for search (rotated cyclically) 412 | static const int KEY_SAMPLES = (4<<10); 413 | 414 | //number of elements in every array to be searched (excluding sentinel element) 415 | int n = SIZE - 1; 416 | 417 | //input arrays for search (aligned for AVX) 418 | ALIGN(32) int input[ARR_SAMPLES][SIZE]; 419 | //keys to be searched 420 | int keys[KEY_SAMPLES]; 421 | 422 | 423 | int main() { 424 | //used RNG 425 | std::mt19937 rnd; 426 | std::uniform_int_distribution distr(0, SIZE); 427 | 428 | //generate all input arrays 429 | for (int s = 0; s < ARR_SAMPLES; s++) { 430 | for (int i = 0; i < n; i++) 431 | input[s][i] = distr(rnd); 432 
| std::sort(input[s], input[s] + n); 433 | //set sentinel element to INT_MAX 434 | for (int i = n; i < (n+15)/16*16; i++) 435 | input[s][i] = INT_MAX; 436 | } 437 | 438 | //test correctness of searches AND generate all keys to be searched 439 | const int ITERS = std::max(TRIES / 10, std::max(KEY_SAMPLES, ARR_SAMPLES)) + 10; 440 | for (int t = 0; t < ITERS; t++) { 441 | const int *arr = input[t % ARR_SAMPLES]; 442 | int key = keys[t % KEY_SAMPLES] = distr(rnd); 443 | 444 | int res[32], sk = 0; 445 | //res[sk++] = linearX_search_scalar(arr, n, key); 446 | //res[sk++] = linear_search_scalar(arr, n, key); 447 | res[sk++] = linearX_search_sse(arr, n, key); 448 | res[sk++] = linear_search_sse(arr, n, key); 449 | res[sk++] = linear_search_sse_UR(arr, n, key); 450 | res[sk++] = linear_search_avx(arr, n, key); 451 | res[sk++] = linear_search_avx_UR(arr, n, key); 452 | res[sk++] = binary_search_std(arr, n, key); 453 | res[sk++] = binary_search_simple(arr, n, key); 454 | res[sk++] = binary_search_branchless(arr, n, key); 455 | res[sk++] = binary_search_branchless_UR(arr, n, key); 456 | //some experimental implementations: 457 | res[sk++] = hybridX_search(arr, n, key); 458 | res[sk++] = binary_search_branchlessM(arr, n, key); 459 | res[sk++] = binary_search_branchlessA(arr, n, key); 460 | res[sk++] = binary_search_branchlessS(arr, n, key); 461 | res[sk++] = binary_search_branchless_pre(arr, n, key); 462 | res[sk++] = quaternary_search_branchless(arr, n, key); 463 | 464 | //program terminates if any search gives different answer 465 | for (int i = 1; i < sk; i++) 466 | if (res[i-1] != res[i]) { 467 | printf("ERROR: "); 468 | for (int j = 0; j < sk; j++) 469 | printf(" %d", res[j]); 470 | printf("\n"); 471 | exit(0); 472 | } 473 | } 474 | 475 | //print some info about current benchmark parameters 476 | static const int DARR = 1779033703 & (ARR_SAMPLES - 1); 477 | static const int DKEY = 2654435769 & (KEY_SAMPLES - 1); 478 | printf("Arrays: %d x %d (+%d)\n", ARR_SAMPLES, SIZE, 
DARR); 479 | printf("Keys: %d (+%d)\n", KEY_SAMPLES, DKEY); 480 | printf("Memory: %d B\n", int(sizeof(input) + sizeof(keys))); 481 | 482 | //benchmark environment 483 | //note: it could have been a function instead of a macro 484 | //but originally I wanted to benchmark inlined code of every search 485 | #define TEST_SEARCH(func) \ 486 | { \ 487 | int start = clock(); \ 488 | int check = 0; \ 489 | for (int t = 0; t < TRIES; t++) { \ 490 | int i = (t * DARR + (MEASURE_LATENCY ? check&1 : 0)) & (ARR_SAMPLES - 1); \ 491 | int j = (t * DKEY + (MEASURE_LATENCY ? check&1 : 0)) & (KEY_SAMPLES - 1); \ 492 | const int *arr = input[i]; \ 493 | int key = keys[j]; \ 494 | int res = func(arr, n, key); \ 495 | check += res; \ 496 | } \ 497 | double elapsed = double(clock() - start) / CLOCKS_PER_SEC; \ 498 | printf("%8.1lf ns : %40s (%d)\n", 1e+9 * elapsed / TRIES, #func, check); \ 499 | } 500 | 501 | //run performance benchmark and print formatted results 502 | //TEST_SEARCH(linearX_search_scalar); 503 | //TEST_SEARCH(linear_search_scalar); 504 | 505 | TEST_SEARCH(linearX_search_sse); 506 | TEST_SEARCH(linear_search_sse); 507 | TEST_SEARCH(linear_search_sse_UR); 508 | TEST_SEARCH(linear_search_avx); 509 | TEST_SEARCH(linear_search_avx_UR); 510 | 511 | TEST_SEARCH(binary_search_std); 512 | TEST_SEARCH(binary_search_simple); 513 | TEST_SEARCH(binary_search_branchless); 514 | TEST_SEARCH(binary_search_branchless_UR); 515 | 516 | //some experimental implementations: 517 | TEST_SEARCH(hybridX_search); 518 | 519 | TEST_SEARCH(binary_search_branchlessM); 520 | TEST_SEARCH(binary_search_branchlessA); 521 | TEST_SEARCH(binary_search_branchlessS); 522 | 523 | TEST_SEARCH(binary_search_branchless_pre); 524 | TEST_SEARCH(quaternary_search_branchless); 525 | 526 | return 0; 527 | } 528 | --------------------------------------------------------------------------------