├── LICENSE
├── README.md
├── c.bat
├── collectdata.py
├── makeplots.py
└── search.cpp
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 stgatilov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # linear-vs-binary-search
2 |
3 | These are the materials for the blog post:
4 | https://dirtyhandscoding.github.io/posts/performance-comparison-linear-search-vs-binary-search.html
5 |
6 | It compares several implementations of linear and binary search that find the position of a key within a sorted array.
7 | Here is the main plot with results (Broadwell 2 GHz CPU); see the blog post for more information.
8 |
9 |
10 |
11 |
12 | Contents
13 | --------
14 |
15 | All the C++ code is in the `search.cpp` file; see the comments inside it for details.
16 |
17 | The tiny script `c.bat` contains the command line for compiling `search.cpp` with the MSVC compiler. Besides the obvious `/O2` setting, the `/arch:AVX2` and `/D NDEBUG` options are also important for proper performance measurements.
18 | It also contains a commented-out command line for compiling with GCC, which additionally needs the `-fno-strict-overflow` flag.
19 |
20 | The Python scripts work as follows (Python 3 is required).
21 | First, run `collectdata.py`: it takes the `search.cpp` source, copies it into `search_tmp.cpp` with some changes, compiles it using the `c.bat` script, and runs `search_tmp.exe` to generate text logs in the `/res` subdirectory.
22 | Then run `makeplots.py`: it reads all the logs in the `/res` subdirectory and generates PNG images with plots (in the same subdirectory).
23 |
--------------------------------------------------------------------------------
/c.bat:
--------------------------------------------------------------------------------
1 | cl search.cpp /O2 /W2 /EHsc /D _CRT_SECURE_NO_DEPRECATE /D NDEBUG /arch:AVX2 /FAs
2 | rem g++ search.cpp --std=c++11 -fno-strict-overflow -O3 -D NDEBUG -mavx2 -o search_gcc.exe
3 |
--------------------------------------------------------------------------------
/collectdata.py:
--------------------------------------------------------------------------------
1 | import re, os, glob
2 |
def CollectData(N, mem):
    """Build and run the benchmark for one array size and memory budget.

    A copy of ``search.cpp`` is written to ``search_tmp.cpp`` with its
    ``SIZE``, ``ARR_SAMPLES`` and ``KEY_SAMPLES`` constants patched, compiled
    with the command taken from ``c.bat`` and executed; its output is
    redirected to ``res/res_<N>_<mem>.log``.

    Args:
        N: value substituted for the ``SIZE`` constant (length of every
            searched array, including the sentinel element).
        mem: memory budget in bytes; it is split evenly between the input
            arrays and the keys, both of which store 4-byte ints.
    """
    # Remove the artifacts of the previous build so that a failed compilation
    # cannot silently reuse a stale executable.
    for fn in glob.glob("search_tmp.*"):
        os.remove(fn)

    with open("search.cpp", "rt") as f:
        src = f.read()
    # Half of the budget for arrays, half for keys, 4 bytes per int.
    # Integer division: the value ends up in a C++ integer constant.
    ints = mem // 8
    src = re.sub(r"const int SIZE = (\d*);", r"const int SIZE = %d;" % N, src)
    # Only the parenthesized part is replaced, so a trailing "/ SIZE" in the
    # ARR_SAMPLES definition is preserved.
    src = re.sub(r"const int ARR_SAMPLES = (\(.*?\))", r"const int ARR_SAMPLES = %d" % ints, src)
    src = re.sub(r"const int KEY_SAMPLES = (\(.*?\))", r"const int KEY_SAMPLES = %d" % ints, src)
    with open("search_tmp.cpp", "wt") as f:
        f.write(src)

    with open("c.bat", "rt") as f:
        bat = f.read()
    bat = bat.replace("search.cpp", "search_tmp.cpp")
    os.system(bat)

    # The shell redirection below fails if the output directory is missing.
    os.makedirs("res", exist_ok=True)
    logname = "res_%04d_%d.log" % (N, mem)
    # NOTE(review): the benchmark is run twice and the second run overwrites
    # the log of the first one -- presumably a warm-up run; confirm.
    os.system("search_tmp >res/" + logname)
    os.system("search_tmp >res/" + logname)
24 |
25 |
# Start from a clean slate: delete everything left in res/ by earlier runs.
for stale in glob.glob("res/*"):
    os.remove(stale)

# Array lengths to benchmark (alternative set: 128 .. 4096).
sizes = [16, 32, 64, 128, 256, 512, 1024]
#sizes = [128, 256, 512, 1024, 2048, 4096]
for s in sizes:
    # 64 KB memory budget; a 512 KB budget is the alternative configuration.
    CollectData(s, 64 << 10)
#    CollectData(s, 512 << 10)
34 |
--------------------------------------------------------------------------------
/makeplots.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import matplotlib.ticker as ticker
3 | import sys, os, glob, re, math
4 |
5 |
# Nested mapping: memory size -> implementation name -> array length -> time.
results = {}


def AddRes(name, mem, N, eltime):
    """Store one measurement in the module-level ``results`` mapping.

    Args:
        name: name of the search implementation.
        mem: memory size reported by the benchmark run.
        N: array length of the benchmark run.
        eltime: measured time per search.
    """
    per_name = results.setdefault(mem, {})
    per_size = per_name.setdefault(name, {})
    per_size[N] = eltime
10 |
11 |
def ReadRes(fn):
    """Parse one benchmark log and record its timings via ``AddRes``.

    The log is expected to contain an ``Arrays: <count> x <N>`` line, a
    ``Memory: <bytes>`` line and any number of ``<time> ns : <name>`` lines.

    Args:
        fn: path of the log file to read.

    Raises:
        ValueError: if the ``Arrays:`` or ``Memory:`` header is missing,
            e.g. because the benchmark stopped before printing it.
    """
    with open(fn, "rt") as f:
        data = f.read()

    # "\d+" rather than "\d*": an empty match could not be converted to int.
    arrays = re.search(r"Arrays: (\d+) x (\d+)", data)
    memory = re.search(r"Memory: (\d+)", data)
    if arrays is None or memory is None:
        raise ValueError("%s: missing 'Arrays:' or 'Memory:' header" % fn)
    N = int(arrays.group(2))
    mem = int(memory.group(1))

    for match in re.finditer(r"\s*([0-9.]+)\s*ns\s*:\s*(\S+)", data):
        eltime = float(match.group(1))
        name = match.group(2)
        AddRes(name, mem, N, eltime)
25 |
26 |
# Load every benchmark log found in res/ into `results`.
for fn in glob.glob("res/*.log"):
    ReadRes(fn)

# One matplotlib format string (colour + marker) per plotted series.
styles = ['yx', 'rx', 'r+', 'mx', 'm+', 'k.', 'ko', 'bo', 'bs', 'yo', 'g*', 'gP', 'gd', 'm*', 'c*']

dpi = 150

# Three plots are produced for every memory size found in the logs.
# NOTE(review): the basex/basey keywords used below were renamed to `base`
# in newer matplotlib releases -- confirm against the installed version.
for mem, graphs in results.items():
    args = []       # time per search
    names = []
    argsPE = []     # time per array element
    argsLog = []    # time per log2(N)

    idx = 0
    for name, graph in graphs.items():
        # Series whose name contains both 'linear' and 'scalar' are not plotted.
        if ('linear' in name and 'scalar' in name):
            continue
        X = []
        Y = []
        Z = []
        W = []
        # Logs are read in glob order, which is arbitrary; the points are
        # joined by a line, so they have to be ordered by N.
        for N, eltime in sorted(graph.items()):
            X.append(N)
            Y.append(eltime)
            Z.append(eltime / N)
            W.append(eltime / math.log(N, 2.0))
        # Wrap around instead of failing when there are more series than styles.
        style = styles[idx % len(styles)]
        args += [X, Y, style]
        argsPE += [X, Z, style]
        argsLog += [X, W, style]
        names.append(name)
        idx += 1
        print("%s: %s" % (name, args[-1]))

    # An optional command-line argument is prepended to every plot title.
    title = "(memory = %dB)" % mem
    if len(sys.argv) > 1:
        title = sys.argv[1] + " " + title

    # Plot 1: time per search, log-log scale with base 2.
    ax = plt.axes()
    ax.set_title(title)
    ax.loglog(*args, basex=2, basey=2, linestyle='-')
    ax.set_xlabel("Array length (N)")
    ax.set_ylabel("Time per search, ns")
    ax.grid(True, which="major")
    ax.grid(True, which="minor", color='0.8', linestyle=':')
    ax.legend(names, loc=2, prop={'size': 6})
    ax.get_yaxis().get_minor_locator().subs([1.25, 1.5, 1.75])
    ax.get_yaxis().set_minor_formatter(ticker.FuncFormatter(lambda x,p: str(int(x))))
    ax.get_yaxis().set_major_formatter(ticker.ScalarFormatter())
    ax.get_xaxis().set_major_formatter(ticker.ScalarFormatter())
    plt.savefig('res/plot_search_%d.png' % mem, bbox_inches='tight', dpi=dpi)
    plt.gcf().clear()

    # Plot 2: time divided by array length, logarithmic X axis only.
    ax = plt.axes()
    ax.set_title(title)
    ax.semilogx(*argsPE, basex=2, linestyle='-')
    ax.set_xlabel("Array length (N)")
    ax.set_ylabel("Time per element, ns")
    ax.grid(True, which="major")
    ax.grid(True, which="minor", color='0.8', linestyle=':')
    ax.legend(names, loc=1, prop={'size': 6})
    ax.set_ylim(0.0, 0.5)
    ax.get_yaxis().set_minor_locator(ticker.MultipleLocator(0.01))
    ax.get_yaxis().tick_right()
    ax.get_xaxis().set_major_formatter(ticker.ScalarFormatter())
    plt.savefig('res/plot_elem_%d.png' % mem, bbox_inches='tight', dpi=dpi)
    plt.gcf().clear()

    # Plot 3: time divided by log2(N), logarithmic X axis only.
    ax = plt.axes()
    ax.set_title(title)
    ax.semilogx(*argsLog, basex=2, linestyle='-')
    ax.set_xlabel("Array length (N)")
    ax.set_ylabel("Time per one bin.search comparison, ns")
    ax.grid(True, which="major")
    ax.grid(True, which="minor", color='0.8', linestyle=':')
    ax.legend(names, loc=2, prop={'size': 6})
    ax.set_ylim(1.0, 7.0)
    ax.get_yaxis().set_minor_locator(ticker.MultipleLocator(0.5))
    ax.get_yaxis().tick_right()
    ax.get_xaxis().set_major_formatter(ticker.ScalarFormatter())
    plt.savefig('res/plot_log_%d.png' % mem, bbox_inches='tight', dpi=dpi)
    plt.gcf().clear()
115 |
--------------------------------------------------------------------------------
/search.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
//little-endian _MM_SHUFFLE: the first argument lands in the lowest two bits of the immediate
//every argument is parenthesized so that expression arguments expand correctly
#define SHUF(i0, i1, i2, i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64)
12 |
//compiler-specific spellings of the inlining/alignment attributes,
//plus thin wrappers around the bit-scan instructions:
//  bsr(x) -- index of the highest set bit of x
//  bsf(x) -- index of the lowest set bit of x
//both wrappers must not be called with x == 0
#ifdef _MSC_VER
    #define FORCEINLINE __forceinline
    #define NOINLINE __declspec(noinline)
    #define ALIGN(n) __declspec(align(n))
    FORCEINLINE uint32_t bsr(uint32_t x) {
        unsigned long res;
        _BitScanReverse(&res, x);
        return res;
    }
    FORCEINLINE uint32_t bsf(uint32_t x) {
        unsigned long res;
        _BitScanForward(&res, x);
        return res;
    }
#else
    #define FORCEINLINE __attribute__((always_inline)) inline
    #define NOINLINE __attribute__((noinline))
    #define ALIGN(n) __attribute__((aligned(n)))
    FORCEINLINE uint32_t bsr(uint32_t x) {
        //count of leading zeros converted into a bit index
        return 31 - __builtin_clz(x);
    }
    FORCEINLINE uint32_t bsf(uint32_t x) {
        return __builtin_ctz(x);
    }
#endif
38 |
39 | //if true, then average latency of one search is measured
40 | //if false, then average throughput performance of one search is measured
41 | //implementation-wise, setting it makes the index of the next array/key
42 | //to be searched dependent on the answer of the current search
43 | #define MEASURE_LATENCY false
44 |
45 | //controls inlining of all search functions being tested
46 | //NOINLINE means that inlining is forbidden:
47 | //in this case benchmarking code is less likely to influence search performance
48 | #define TESTINLINE NOINLINE
49 |
50 | //======================= search implementations =======================
51 |
//reference implementation: delegates to std::lower_bound
//returns the index of the first element of arr[0..n) that is not less than key (n if there is none)
static TESTINLINE int binary_search_std (const int *arr, int n, int key) {
    return std::lower_bound(arr, arr + n, key) - arr;
}
55 |
//textbook binary search with a branch in every iteration
//the answer always stays within (left, right]: left starts before the array, right after it
//returns the index of the first element that is not less than key (n if there is none)
static TESTINLINE int binary_search_simple (const int *arr, int n, int key) {
    intptr_t left = -1;
    intptr_t right = n;
    while (right - left > 1) {
        intptr_t middle = (left + right) >> 1;
        if (arr[middle] < key)
            left = middle;
        else
            right = middle;
    }
    return right;
}
68 |
//non-const global holding -1: reading it makes the compiler emit a plain load (see below)
intptr_t MINUS_ONE = -1;
//binary search written with a conditional expression instead of an if/else
//pos is the index of the last element known to be less than key (-1 if none yet)
//the step is halved each iteration, so the number of iterations depends only on n
//requires n = 2^k - 1; returns the index of the first element not less than key
static TESTINLINE int binary_search_branchless (const int *arr, int n, int key) {
    assert((n & (n+1)) == 0); //n = 2^k - 1
    //intptr_t pos = -1;                       //generates "or r9, -1" on MSVC -- false dependency harms throughput
    intptr_t pos = MINUS_ONE;                  //workaround for MSVC: generates mov without dependency
    intptr_t logstep = bsr(n);
    intptr_t step = intptr_t(1) << logstep;    //highest power of two not exceeding n
    while (step > 0) {
        pos = (arr[pos + step] < key ? pos + step : pos);
        step >>= 1;
    }
    return pos + 1;
}
82 |
83 | template static TESTINLINE int binary_search_branchless_UR (const int *arr, int n, int key) {
84 | assert((n & (n+1)) == 0); //n = 2^k - 1
85 | assert(n+1 == MAXN);
86 | assert(n < (1<<20));
87 |
88 | //intptr_t pos = -1;
89 | intptr_t pos = MINUS_ONE;
90 | #define STEP(logstep) \
91 | if ((1< static TESTINLINE int linear_search_sse_UR (const int *arr, int n, int key) {
141 | assert(size_t(arr) % 16 == 0);
142 | assert(n <= 1024);
143 | __m128i vkey = _mm_set1_epi32(key);
144 | __m128i cnt = _mm_setzero_si128();
145 | intptr_t i = 0;
146 | #define STEP \
147 | if (i < MAXN) {\
148 | __m128i mask0 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+0]), vkey); \
149 | __m128i mask1 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+4]), vkey); \
150 | __m128i mask2 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+8]), vkey); \
151 | __m128i mask3 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+12]), vkey); \
152 | __m128i sum = _mm_add_epi32(_mm_add_epi32(mask0, mask1), _mm_add_epi32(mask2, mask3)); \
153 | cnt = _mm_sub_epi32(cnt, sum); \
154 | } i += 16;
155 | STEP STEP STEP STEP STEP STEP STEP STEP
156 | STEP STEP STEP STEP STEP STEP STEP STEP
157 | STEP STEP STEP STEP STEP STEP STEP STEP
158 | STEP STEP STEP STEP STEP STEP STEP STEP
159 | STEP STEP STEP STEP STEP STEP STEP STEP
160 | STEP STEP STEP STEP STEP STEP STEP STEP
161 | STEP STEP STEP STEP STEP STEP STEP STEP
162 | STEP STEP STEP STEP STEP STEP STEP STEP
163 | #undef STEP
164 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(2, 3, 0, 1)));
165 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(1, 0, 3, 2)));
166 | return _mm_cvtsi128_si32(cnt);
167 | }
168 |
//counting linear search with AVX2: returns the number of elements that are less than key
//the loop consumes 16 ints per iteration and does not stop at n exactly, so the elements
//up to the next multiple of 16 are read as well and must not compare less than key
//arr must be 32-byte aligned (aligned loads are used)
static TESTINLINE int linear_search_avx (const int *arr, int n, int key) {
    assert(size_t(arr) % 32 == 0);
    __m256i vkey = _mm256_set1_epi32(key);
    __m256i cnt = _mm256_setzero_si256();
    for (int i = 0; i < n; i += 16) {
        //a lane of the mask is -1 where key > element and 0 otherwise
        __m256i mask0 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+0]));
        __m256i mask1 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+8]));
        __m256i sum = _mm256_add_epi32(mask0, mask1);
        //subtracting the negative masks accumulates per-lane counts
        cnt = _mm256_sub_epi32(cnt, sum);
    }
    //horizontal sum: fold the two 128-bit halves, then the four 32-bit lanes
    __m128i xcnt = _mm_add_epi32(_mm256_extracti128_si256(cnt, 1), _mm256_castsi256_si128(cnt));
    xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(2, 3, 0, 1)));
    xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(1, 0, 3, 2)));
    return _mm_cvtsi128_si32(xcnt);
}
184 |
185 | template static TESTINLINE int linear_search_avx_UR (const int *arr, int n, int key) {
186 | assert(size_t(arr) % 32 == 0);
187 | assert(n <= 1024);
188 | __m256i vkey = _mm256_set1_epi32(key);
189 | __m256i cnt = _mm256_setzero_si256();
190 | intptr_t i = 0;
191 | #define STEP \
192 | if (i < MAXN) {\
193 | __m256i mask0 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+0])); \
194 | __m256i mask1 = _mm256_cmpgt_epi32(vkey, _mm256_load_si256((__m256i *)&arr[i+8])); \
195 | __m256i sum = _mm256_add_epi32(mask0, mask1); \
196 | cnt = _mm256_sub_epi32(cnt, sum); \
197 | } i += 16;
198 | STEP STEP STEP STEP STEP STEP STEP STEP
199 | STEP STEP STEP STEP STEP STEP STEP STEP
200 | STEP STEP STEP STEP STEP STEP STEP STEP
201 | STEP STEP STEP STEP STEP STEP STEP STEP
202 | STEP STEP STEP STEP STEP STEP STEP STEP
203 | STEP STEP STEP STEP STEP STEP STEP STEP
204 | STEP STEP STEP STEP STEP STEP STEP STEP
205 | STEP STEP STEP STEP STEP STEP STEP STEP
206 | #undef STEP
207 | __m128i xcnt = _mm_add_epi32(_mm256_extracti128_si256(cnt, 1), _mm256_castsi256_si128(cnt));
208 | xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(2, 3, 0, 1)));
209 | xcnt = _mm_add_epi32(xcnt, _mm_shuffle_epi32(xcnt, SHUF(1, 0, 3, 2)));
210 | return _mm_cvtsi128_si32(xcnt);
211 | }
212 |
// from https://stackoverflow.com/questions/2741859/how-fast-can-you-make-linear-search
// type of "i" changed from int to intptr_t
// scalar linear search with early exit: stops at the first element that is not less than key
// returns the index of that element (n if there is none)
static TESTINLINE int linearX_search_scalar (const int *arr, int n, int key) {
    intptr_t i = 0;
    while (i < n) {
        if (arr[i] >= key)
            break;
        ++i;
    }
    return i;
}
224 |
// from https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/
// type of "i" changed from int to intptr_t
// SSE linear search with early exit, 16 elements per iteration
// note: n is never checked, the loop ends only when some element is not less than key
// (the benchmark data ends with an INT_MAX sentinel)
// NOTE(review): in_data[i] is read directly as __m128i, so arr presumably has to be 16-byte aligned
static TESTINLINE int linearX_search_sse (const int *arr, int n, int key) {
    __m128i *in_data = (__m128i*)arr;
    __m128i key4 = _mm_set1_epi32(key);
    intptr_t i = 0;    //index in units of __m128i (4 ints)
    int res;
    for (;;) {
        __m128i cmp0 = _mm_cmpgt_epi32 (key4, in_data [i + 0]);
        __m128i cmp1 = _mm_cmpgt_epi32 (key4, in_data [i + 1]);
        __m128i cmp2 = _mm_cmpgt_epi32 (key4, in_data [i + 2]);
        __m128i cmp3 = _mm_cmpgt_epi32 (key4, in_data [i + 3]);
        //narrow the sixteen 32-bit masks to bytes, then to one bit per element
        __m128i pack01 = _mm_packs_epi32 (cmp0, cmp1);
        __m128i pack23 = _mm_packs_epi32 (cmp2, cmp3);
        __m128i pack0123 = _mm_packs_epi16 (pack01, pack23);
        res = _mm_movemask_epi8 (pack0123);
        //0xFFFF means that all 16 elements are less than key: keep going
        if (res != 0xFFFF)
            break;
        i += 4;
    }
    //the lowest zero bit of res marks the first element that is not less than key
    return i * 4 + bsf(~res);
}
247 |
248 | //additional versions (experimental)
249 |
250 | static TESTINLINE int hybrid_search (const int *arr, int n, int key) {
251 | assert((n & (n+1)) == 0); //n = 2^k - 1
252 | intptr_t pos = MINUS_ONE;
253 | intptr_t logstep = bsr(n);
254 | intptr_t step = intptr_t(1) << logstep;
255 | while (step > 8) {
256 | pos = (arr[pos + step] < key ? pos + step : pos);
257 | step >>= 1;
258 | }
259 | pos++;
260 | step <<= 1;
261 |
262 | assert(size_t(arr) % 16 == 0);
263 | assert(pos % 16 == 0);
264 | __m128i vkey = _mm_set1_epi32(key);
265 | __m128i mask0 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+0]), vkey);
266 | __m128i mask1 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+4]), vkey);
267 | __m128i mask2 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+8]), vkey);
268 | __m128i mask3 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[pos+12]), vkey);
269 | __m128i cnt = _mm_add_epi32(_mm_add_epi32(mask0, mask1), _mm_add_epi32(mask2, mask3));
270 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(2, 3, 0, 1)));
271 | cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, SHUF(1, 0, 3, 2)));
272 |
273 | return pos - _mm_cvtsi128_si32(cnt);
274 | }
275 |
276 | static TESTINLINE int hybridX_search (const int *arr, int n, int key) {
277 | assert((n & (n+1)) == 0); //n = 2^k - 1
278 | intptr_t pos = MINUS_ONE;
279 | intptr_t logstep = bsr(n);
280 | intptr_t step = intptr_t(1) << logstep;
281 | while (step > 8) {
282 | pos = (arr[pos + step] < key ? pos + step : pos);
283 | step >>= 1;
284 | }
285 | pos++;
286 | step <<= 1;
287 |
288 | assert(size_t(arr) % 16 == 0);
289 | assert(pos % 16 == 0);
290 | __m128i vkey = _mm_set1_epi32(key);
291 | __m128i cnt = _mm_setzero_si128();
292 | __m128i cmp0 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+0]));
293 | __m128i cmp1 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+4]));
294 | __m128i cmp2 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+8]));
295 | __m128i cmp3 = _mm_cmpgt_epi32 (vkey, _mm_load_si128((__m128i *)&arr[pos+12]));
296 | __m128i pack01 = _mm_packs_epi32 (cmp0, cmp1);
297 | __m128i pack23 = _mm_packs_epi32 (cmp2, cmp3);
298 | __m128i pack0123 = _mm_packs_epi16 (pack01, pack23);
299 | uint32_t res = _mm_movemask_epi8 (pack0123);
300 |
301 | return pos + bsf(~res);
302 | }
303 |
//branchless binary search, "M" variant: the comparison result (0 or 1) is multiplied by the step
//requires n = 2^k - 1; returns the index of the first element that is not less than key
static TESTINLINE int binary_search_branchlessM (const int *arr, int n, int key) {
    assert((n & (n+1)) == 0); //n = 2^k - 1
    //intptr_t pos = -1;                       //generates "or r9, -1" on MSVC -- false dependency harms throughput
    intptr_t pos = MINUS_ONE;                  //workaround for MSVC: generates mov without dependency
    intptr_t logstep = bsr(n);
    intptr_t step = intptr_t(1) << logstep;
    while (step > 0) {
        pos += (arr[pos + step] < key) * step;
        step >>= 1;
    }
    return pos + 1;
}
316 |
//branchless binary search, "A" variant: the negated comparison result (0 or -1) is used
//as a bit mask that is ANDed with the step
//requires n = 2^k - 1; returns the index of the first element that is not less than key
static TESTINLINE int binary_search_branchlessA (const int *arr, int n, int key) {
    assert((n & (n+1)) == 0); //n = 2^k - 1
    //intptr_t pos = -1;                       //generates "or r9, -1" on MSVC -- false dependency harms throughput
    intptr_t pos = MINUS_ONE;                  //workaround for MSVC: generates mov without dependency
    intptr_t logstep = bsr(n);
    intptr_t step = intptr_t(1) << logstep;
    while (step > 0) {
        pos += (-(arr[pos + step] < key)) & step;
        step >>= 1;
    }
    return pos + 1;
}
329 |
//branchless binary search, "S" variant: the mask is the sign of (element - key),
//spread over all bits by shifting right by 31
//caveats: relies on 32-bit int, on arithmetic right shift of negative values,
//and on the subtraction not overflowing (true for the small values used by the benchmark)
//requires n = 2^k - 1; returns the index of the first element that is not less than key
static TESTINLINE int binary_search_branchlessS (const int *arr, int n, int key) {
    assert((n & (n+1)) == 0); //n = 2^k - 1
    //intptr_t pos = -1;                       //generates "or r9, -1" on MSVC -- false dependency harms throughput
    intptr_t pos = MINUS_ONE;                  //workaround for MSVC: generates mov without dependency
    intptr_t logstep = bsr(n);
    intptr_t step = intptr_t(1) << logstep;
    while (step > 0) {
        pos += ((arr[pos + step] - key) >> 31) & step;
        step >>= 1;
    }
    return pos + 1;
}
342 |
//branchless binary search with preloaded pivots: both candidates for the next pivot
//are loaded before the current comparison is used, and one of them is selected afterwards
//requires n = 2^k - 1; returns the index of the first element that is not less than key
static TESTINLINE int binary_search_branchless_pre (const int *arr, int n, int key) {
    assert((n & (n+1)) == 0); //n = 2^k - 1
    //intptr_t pos = -1;                       //generates "or r9, -1" on MSVC -- false dependency harms throughput
    intptr_t pos = MINUS_ONE;                  //workaround for MSVC: generates mov without dependency
    intptr_t logstep = bsr(n);
    intptr_t step = intptr_t(1) << logstep;
    int pivot = arr[pos + step];
    while (step > 1) {
        intptr_t nextstep = step >> 1;
        int pivotL = arr[pos + nextstep];          //next pivot if pos stays
        int pivotR = arr[pos + step + nextstep];   //next pivot if pos advances by step
        pos = (pivot < key ? pos + step : pos);
        pivot = (pivot < key ? pivotR : pivotL);
        step = nextstep;
    }
    //last comparison (step == 1) uses the pivot selected in the previous iteration
    pos = (pivot < key ? pos + step : pos);
    return pos + 1;
}
361 |
//branchless quaternary search: three pivots are loaded per iteration and
//the step shrinks four times, so one iteration replaces two binary search iterations
//all three pivots are loaded relative to the pos value from the start of the iteration
//requires n = 2^k - 1; returns the index of the first element that is not less than key
static TESTINLINE int quaternary_search_branchless (const int *arr, int n, int key) {
    assert((n & (n+1)) == 0); //n = 2^k - 1
    //intptr_t pos = -1;                       //generates "or r9, -1" on MSVC -- false dependency harms throughput
    intptr_t pos = MINUS_ONE;                  //workaround for MSVC: generates mov without dependency
    intptr_t logstep = bsr(n) - 1;
    intptr_t step = intptr_t(1) << logstep;    //a quarter of the padded array length
    while (step > 0) {
        int pivotL = arr[pos + step * 1];
        int pivotM = arr[pos + step * 2];
        int pivotR = arr[pos + step * 3];
        pos = (pivotL < key ? pos + step : pos);
        pos = (pivotM < key ? pos + step : pos);
        pos = (pivotR < key ? pos + step : pos);
        step >>= 2;
    }
    //final binary step: needed when the loop ends with a range of two elements
    //(may read arr[n], which holds the sentinel in the benchmark data)
    pos = (arr[pos + 1] < key ? pos + 1 : pos);
    return pos + 1;
}
380 |
//second quaternary variant: every iteration performs two binary search steps (step2, then step)
//all three candidate pivots are loaded up front, and the pivot for the second step is
//selected by the outcome of the first one
//requires n = 2^k - 1; returns the index of the first element that is not less than key
static TESTINLINE int quaternary_search_branchless2 (const int *arr, int n, int key) {
    assert((n & (n+1)) == 0); //n = 2^k - 1
    //intptr_t pos = -1;                       //generates "or r9, -1" on MSVC -- false dependency harms throughput
    intptr_t pos = MINUS_ONE;                  //workaround for MSVC: generates mov without dependency
    intptr_t logstep = bsr(n);
    intptr_t step2 = intptr_t(1) << logstep;   //first (larger) step of the iteration
    intptr_t step = step2 >> 1;                //second (smaller) step of the iteration
    while (step > 0) {
        int pivotL = arr[pos + step];
        int pivotM = arr[pos + step2];
        int pivotR = arr[pos + step2 + step];
        pos = (pivotM < key ? pos + step2 : pos);
        int pivotX = (pivotM < key ? pivotR : pivotL);
        pos = (pivotX < key ? pos + step : pos);
        step >>= 2;
        step2 >>= 2;
    }
    //final binary step: needed when the loop ends with a range of two elements
    //(may read arr[n], which holds the sentinel in the benchmark data)
    pos = (arr[pos + 1] < key ? pos + 1 : pos);
    return pos + 1;
}
401 |
402 | //======================= testing code =======================
403 |
//NOTE: collectdata.py rewrites SIZE, ARR_SAMPLES and KEY_SAMPLES with regular expressions,
//so the exact text of these three definitions must be kept in sync with that script

//length of each input array (including one sentinel element)
//must be power of two
static const int SIZE = 64;
//how many searches are done in benchmark
static const int TRIES = (1<<30) / SIZE;
//number of pre-generated input arrays (rotated cyclically)
static const int ARR_SAMPLES = (4<<10) / SIZE;
//number of pre-generated keys for search (rotated cyclically)
static const int KEY_SAMPLES = (4<<10);

//number of elements in every array to be searched (excluding sentinel element)
int n = SIZE - 1;

//input arrays for search (aligned for AVX)
ALIGN(32) int input[ARR_SAMPLES][SIZE];
//keys to be searched
int keys[KEY_SAMPLES];
421 |
422 |
423 | int main() {
424 | //used RNG
425 | std::mt19937 rnd;
426 | std::uniform_int_distribution distr(0, SIZE);
427 |
428 | //generate all input arrays
429 | for (int s = 0; s < ARR_SAMPLES; s++) {
430 | for (int i = 0; i < n; i++)
431 | input[s][i] = distr(rnd);
432 | std::sort(input[s], input[s] + n);
433 | //set sentinel element to INT_MAX
434 | for (int i = n; i < (n+15)/16*16; i++)
435 | input[s][i] = INT_MAX;
436 | }
437 |
438 | //test correctness of searches AND generate all keys to be searched
439 | const int ITERS = std::max(TRIES / 10, std::max(KEY_SAMPLES, ARR_SAMPLES)) + 10;
440 | for (int t = 0; t < ITERS; t++) {
441 | const int *arr = input[t % ARR_SAMPLES];
442 | int key = keys[t % KEY_SAMPLES] = distr(rnd);
443 |
444 | int res[32], sk = 0;
445 | //res[sk++] = linearX_search_scalar(arr, n, key);
446 | //res[sk++] = linear_search_scalar(arr, n, key);
447 | res[sk++] = linearX_search_sse(arr, n, key);
448 | res[sk++] = linear_search_sse(arr, n, key);
449 | res[sk++] = linear_search_sse_UR(arr, n, key);
450 | res[sk++] = linear_search_avx(arr, n, key);
451 | res[sk++] = linear_search_avx_UR(arr, n, key);
452 | res[sk++] = binary_search_std(arr, n, key);
453 | res[sk++] = binary_search_simple(arr, n, key);
454 | res[sk++] = binary_search_branchless(arr, n, key);
455 | res[sk++] = binary_search_branchless_UR(arr, n, key);
456 | //some experimental implementations:
457 | res[sk++] = hybridX_search(arr, n, key);
458 | res[sk++] = binary_search_branchlessM(arr, n, key);
459 | res[sk++] = binary_search_branchlessA(arr, n, key);
460 | res[sk++] = binary_search_branchlessS(arr, n, key);
461 | res[sk++] = binary_search_branchless_pre(arr, n, key);
462 | res[sk++] = quaternary_search_branchless(arr, n, key);
463 |
464 | //program terminates if any search gives different answer
465 | for (int i = 1; i < sk; i++)
466 | if (res[i-1] != res[i]) {
467 | printf("ERROR: ");
468 | for (int j = 0; j < sk; j++)
469 | printf(" %d", res[j]);
470 | printf("\n");
471 | exit(0);
472 | }
473 | }
474 |
475 | //print some info about current benchmark parameters
476 | static const int DARR = 1779033703 & (ARR_SAMPLES - 1);
477 | static const int DKEY = 2654435769 & (KEY_SAMPLES - 1);
478 | printf("Arrays: %d x %d (+%d)\n", ARR_SAMPLES, SIZE, DARR);
479 | printf("Keys: %d (+%d)\n", KEY_SAMPLES, DKEY);
480 | printf("Memory: %d B\n", int(sizeof(input) + sizeof(keys)));
481 |
482 | //benchmark environment
483 | //note: it could had been a function instead of macro
484 | //but originally I wanted to benchmark inlined code of every search
485 | #define TEST_SEARCH(func) \
486 | { \
487 | int start = clock(); \
488 | int check = 0; \
489 | for (int t = 0; t < TRIES; t++) { \
490 | int i = (t * DARR + (MEASURE_LATENCY ? check&1 : 0)) & (ARR_SAMPLES - 1); \
491 | int j = (t * DKEY + (MEASURE_LATENCY ? check&1 : 0)) & (KEY_SAMPLES - 1); \
492 | const int *arr = input[i]; \
493 | int key = keys[j]; \
494 | int res = func(arr, n, key); \
495 | check += res; \
496 | } \
497 | double elapsed = double(clock() - start) / CLOCKS_PER_SEC; \
498 | printf("%8.1lf ns : %40s (%d)\n", 1e+9 * elapsed / TRIES, #func, check); \
499 | }
500 |
501 | //run performance benchmark and print formatted results
502 | //TEST_SEARCH(linearX_search_scalar);
503 | //TEST_SEARCH(linear_search_scalar);
504 |
505 | TEST_SEARCH(linearX_search_sse);
506 | TEST_SEARCH(linear_search_sse);
507 | TEST_SEARCH(linear_search_sse_UR);
508 | TEST_SEARCH(linear_search_avx);
509 | TEST_SEARCH(linear_search_avx_UR);
510 |
511 | TEST_SEARCH(binary_search_std);
512 | TEST_SEARCH(binary_search_simple);
513 | TEST_SEARCH(binary_search_branchless);
514 | TEST_SEARCH(binary_search_branchless_UR);
515 |
516 | //some experimental implementations:
517 | TEST_SEARCH(hybridX_search);
518 |
519 | TEST_SEARCH(binary_search_branchlessM);
520 | TEST_SEARCH(binary_search_branchlessA);
521 | TEST_SEARCH(binary_search_branchlessS);
522 |
523 | TEST_SEARCH(binary_search_branchless_pre);
524 | TEST_SEARCH(quaternary_search_branchless);
525 |
526 | return 0;
527 | }
528 |
--------------------------------------------------------------------------------