├── images
├── AVX_chart.png
└── NEON_chart.png
├── test
├── simd_bitonic.xcodeproj
│ ├── project.xcworkspace
│ │ └── contents.xcworkspacedata
│ └── project.pbxproj
└── simd_bitonic
│ ├── random.h
│ ├── main.cpp
│ └── sokol_time.h
├── .gitignore
├── README.md
└── simd_bitonic.h
/images/AVX_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Geolm/simd_bitonic/HEAD/images/AVX_chart.png
--------------------------------------------------------------------------------
/images/NEON_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Geolm/simd_bitonic/HEAD/images/NEON_chart.png
--------------------------------------------------------------------------------
/test/simd_bitonic.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Object files
5 | *.o
6 | *.ko
7 | *.obj
8 | *.elf
9 |
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 |
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 |
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 |
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 |
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 |
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 |
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 |
54 | # MacOS
55 | *.DS_Store
56 |
57 | # XCode
58 | xcuserdata/
59 | xcshareddata/
60 |
61 | # visual studio code
62 | *.code-workspace
63 | *.vscode
64 |
--------------------------------------------------------------------------------
/test/simd_bitonic/random.h:
--------------------------------------------------------------------------------
1 | #ifndef __INSHAPESRANDOM__
2 | #define __INSHAPESRANDOM__
3 |
4 | #define _USE_MATH_DEFINES
5 | #include <math.h>
6 | #include <stdlib.h>
7 |
8 | // based on https://www.iquilezles.org/www/articles/sfrand/sfrand.htm
9 |
// Lehmer-style LCG step; returns a pseudo-random int derived from the new seed.
// The multiply is done in unsigned arithmetic: signed integer overflow is
// undefined behavior, while unsigned wraps — the result is bit-identical to
// the intended wrap-around on two's-complement targets.
static inline int iq_random( int* seed)
{
    *seed = (int)((unsigned int)*seed * 16807u);
    return (*seed) >> 9;
}
15 |
// returns a random number in the range [min, max]
static inline int iq_random_clamped(int* seed, int min, int max)
{
    const int span = max - min + 1;
    const int r = iq_random(seed) % span;
    return min + abs(r);
}
22 |
// returns a random float in the range [0 ; 1)
static inline float iq_random_float(int* seed)
{
    // union-based type punning is well-defined in C (unlike pointer casts,
    // which would violate strict aliasing)
    union
    {
        float fres;
        unsigned int ires;
    } float2int;

    // step the LCG in unsigned arithmetic: signed overflow is UB,
    // unsigned wraps with the intended two's-complement result
    *seed = (int)((unsigned int)*seed * 16807u);

    // splat 23 random mantissa bits under exponent 0x3f800000 to build a
    // float in [1;2), then shift the range down to [0;1)
    float2int.ires = ((((unsigned int)*seed)>>9 ) | 0x3f800000);
    return float2int.fres - 1.0f;
}
37 |
38 | // returns a random float in the range [0 ; 2pi]
39 | static inline float iq_random_angle(int* seed)
40 | {
41 | return iq_random_float(seed) * (float)M_PI * 2.f;
42 | }
43 |
44 |
45 | #endif
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SIMD bitonic sort
2 |
3 | This library works with AVX or NEON instructions.
4 |
5 | ## Bitonic sort
6 |
7 | Sort small arrays of float using SIMD instructions to parallelize work.
8 |
9 | Based on "Fast Sorting Algorithms using AVX-512 on Intel Knights Landing" https://hal.inria.fr/hal-01512970v1/document
10 |
11 | ## Merge sort
12 |
13 | Tiled merge sort using SIMD merge sort based on "Efficient Implementation of Sorting on Multi-Core SIMD CPU Architecture" http://www.vldb.org/pvldb/vol1/1454171.pdf
14 |
15 |
16 | # Library
17 | One C99 header file, simd_bitonic.h. A C99-capable compiler is required. Tested with clang -mavx -O3.
18 |
19 | * simd_small_sort_max(), returns the maximum number of floats that can be sorted with the small sort function
20 | * simd_small_sort(), bitonic sort small arrays
21 | * simd_merge_sort(), tiled merge sort
22 |
23 | # Results
24 |
25 | Profile was done by sorting 10,000,000 times an array from random elements.
26 |
27 | ## Mac mini 2018, i7, AVX
28 | Array sizes vary from 2 to 128 elements. The simd bitonic sort is almost 7x faster than std::sort() at best.
29 |
30 | 
31 |
32 | Note that the sort is clearly faster when the array size is a multiple of 8, because loading data is faster and all floats in the registers are used to do the sort.
33 |
34 | ## M1 macbook air (2020), NEON
35 | Array sizes vary from 2 to 64 elements.
36 |
37 | 
38 |
39 | This chart is more all over the place, but the gains are still impressive.
40 |
41 | # Why is it faster?
42 | * SIMD Bitonic sort runs in parallel thanks to SIMD wide register (4 floats NEON / 8 floats AVX)
43 | * There are less access to memory as most of the work is done inside SIMD registers
44 |
45 | # Drawbacks
46 | * Only for small arrays, merge-sort is not as efficient
47 | * Sort only "pure" floats, cannot sort an array of struct {float a; int b;} for example
48 |
49 | # What are the typical use-case?
50 | * Sorting values for image compression, usually 8x8 or 4x4 pixels
51 | * Sorting values for kdtree building, for example each leaf of kdtree could have 16 points and when we need to split the node we sort the points using one axis
52 | * Sorting values that are already in SIMD registers
53 |
--------------------------------------------------------------------------------
/test/simd_bitonic/main.cpp:
--------------------------------------------------------------------------------
1 | #define __SIMD_BITONIC_IMPLEMENTATION__
2 | #include "../../simd_bitonic.h"
3 |
4 | #define SOKOL_IMPL
5 | #include "sokol_time.h"
6 |
7 | #include "random.h"
8 |
9 | #include <stdio.h>
10 | #include <algorithm>
11 | #include <vector>
12 |
13 | #define NUMBER_OF_SORTS (1000000)
14 | #define MAX_ARRAY_SIZE (SIMD_VECTOR_WIDTH * 32)
15 |
16 | int seed = 0x12345678;
17 |
18 | void profile_small(int array_size)
19 | {
20 | float array[MAX_ARRAY_SIZE];
21 |
22 | std::vector vector;
23 | vector.resize(array_size);
24 |
25 | uint64_t start_time;
26 | uint64_t diff = 0;
27 | float result = 0.f;
28 |
29 | for(int i=0; i vector;
74 | vector.resize(array_size);
75 |
76 | for(int j=0; j vector;
96 | vector.resize(array_size);
97 |
98 | for(int j=0; j vector;
122 | vector.resize(array_size);
123 |
124 | uint64_t stl_diff = 0;
125 | uint64_t simd_diff = 0;
126 |
127 | for(int i=0; i<100; ++i)
128 | {
129 | for(int j=0; j
103 |
104 | #ifndef SOKOL_API_DECL
105 | #if defined(_WIN32) && defined(SOKOL_DLL) && defined(SOKOL_IMPL)
106 | #define SOKOL_API_DECL __declspec(dllexport)
107 | #elif defined(_WIN32) && defined(SOKOL_DLL)
108 | #define SOKOL_API_DECL __declspec(dllimport)
109 | #else
110 | #define SOKOL_API_DECL extern
111 | #endif
112 | #endif
113 |
114 | #ifdef __cplusplus
115 | extern "C" {
116 | #endif
117 |
118 | SOKOL_API_DECL void stm_setup(void);
119 | SOKOL_API_DECL uint64_t stm_now(void);
120 | SOKOL_API_DECL uint64_t stm_diff(uint64_t new_ticks, uint64_t old_ticks);
121 | SOKOL_API_DECL uint64_t stm_since(uint64_t start_ticks);
122 | SOKOL_API_DECL uint64_t stm_laptime(uint64_t* last_time);
123 | SOKOL_API_DECL uint64_t stm_round_to_common_refresh_rate(uint64_t frame_ticks);
124 | SOKOL_API_DECL double stm_sec(uint64_t ticks);
125 | SOKOL_API_DECL double stm_ms(uint64_t ticks);
126 | SOKOL_API_DECL double stm_us(uint64_t ticks);
127 | SOKOL_API_DECL double stm_ns(uint64_t ticks);
128 |
129 | #ifdef __cplusplus
130 | } /* extern "C" */
131 | #endif
132 | #endif // SOKOL_TIME_INCLUDED
133 |
134 | /*-- IMPLEMENTATION ----------------------------------------------------------*/
135 | #ifdef SOKOL_IMPL
136 | #define SOKOL_TIME_IMPL_INCLUDED (1)
137 | #include <string.h> /* memset */
138 |
139 | #ifndef SOKOL_API_IMPL
140 | #define SOKOL_API_IMPL
141 | #endif
142 | #ifndef SOKOL_ASSERT
143 | #include <assert.h>
144 | #define SOKOL_ASSERT(c) assert(c)
145 | #endif
146 | #ifndef _SOKOL_PRIVATE
147 | #if defined(__GNUC__) || defined(__clang__)
148 | #define _SOKOL_PRIVATE __attribute__((unused)) static
149 | #else
150 | #define _SOKOL_PRIVATE static
151 | #endif
152 | #endif
153 |
154 | #if defined(_WIN32)
155 | #ifndef WIN32_LEAN_AND_MEAN
156 | #define WIN32_LEAN_AND_MEAN
157 | #endif
158 | #include <windows.h>
159 | typedef struct {
160 | uint32_t initialized;
161 | LARGE_INTEGER freq;
162 | LARGE_INTEGER start;
163 | } _stm_state_t;
164 | #elif defined(__APPLE__) && defined(__MACH__)
165 | #include <mach/mach_time.h>
166 | typedef struct {
167 | uint32_t initialized;
168 | mach_timebase_info_data_t timebase;
169 | uint64_t start;
170 | } _stm_state_t;
171 | #elif defined(__EMSCRIPTEN__)
172 | #include <emscripten/emscripten.h>
173 | typedef struct {
174 | uint32_t initialized;
175 | double start;
176 | } _stm_state_t;
177 | #else /* anything else, this will need more care for non-Linux platforms */
178 | #ifdef ESP8266
179 | // On the ESP8266, clock_gettime ignores the first argument and CLOCK_MONOTONIC isn't defined
180 | #define CLOCK_MONOTONIC 0
181 | #endif
182 | #include <time.h>
183 | typedef struct {
184 | uint32_t initialized;
185 | uint64_t start;
186 | } _stm_state_t;
187 | #endif
188 | static _stm_state_t _stm;
189 |
190 | /* prevent 64-bit overflow when computing relative timestamp
191 | see https://gist.github.com/jspohr/3dc4f00033d79ec5bdaf67bc46c813e3
192 | */
193 | #if defined(_WIN32) || (defined(__APPLE__) && defined(__MACH__))
194 | _SOKOL_PRIVATE int64_t int64_muldiv(int64_t value, int64_t numer, int64_t denom) {
195 | int64_t q = value / denom;
196 | int64_t r = value % denom;
197 | return q * numer + r * numer / denom;
198 | }
199 | #endif
200 |
201 | #if defined(__EMSCRIPTEN__)
202 | EM_JS(double, stm_js_perfnow, (void), {
203 | return performance.now();
204 | });
205 | #endif
206 |
SOKOL_API_IMPL void stm_setup(void) {
    /* initializes the global timer state and records the startup reference
       time; must be called once before any other stm_*() function
       (stm_now() asserts on the 0xABCDABCD marker written here) */
    memset(&_stm, 0, sizeof(_stm));
    _stm.initialized = 0xABCDABCD;
#if defined(_WIN32)
    QueryPerformanceFrequency(&_stm.freq);
    QueryPerformanceCounter(&_stm.start);
#elif defined(__APPLE__) && defined(__MACH__)
    mach_timebase_info(&_stm.timebase);
    _stm.start = mach_absolute_time();
#elif defined(__EMSCRIPTEN__)
    /* performance.now() returns milliseconds as a double */
    _stm.start = stm_js_perfnow();
#else
    /* POSIX fallback: monotonic clock, stored as nanoseconds */
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    _stm.start = (uint64_t)ts.tv_sec*1000000000 + (uint64_t)ts.tv_nsec;
#endif
}
224 |
SOKOL_API_IMPL uint64_t stm_now(void) {
    /* returns ticks (nanoseconds) elapsed since stm_setup() was called */
    SOKOL_ASSERT(_stm.initialized == 0xABCDABCD);
    uint64_t now;
#if defined(_WIN32)
    LARGE_INTEGER qpc_t;
    QueryPerformanceCounter(&qpc_t);
    /* convert QPC counts to nanoseconds with the overflow-safe muldiv */
    now = int64_muldiv(qpc_t.QuadPart - _stm.start.QuadPart, 1000000000, _stm.freq.QuadPart);
#elif defined(__APPLE__) && defined(__MACH__)
    /* scale mach ticks by the timebase ratio to get nanoseconds */
    const uint64_t mach_now = mach_absolute_time() - _stm.start;
    now = int64_muldiv(mach_now, _stm.timebase.numer, _stm.timebase.denom);
#elif defined(__EMSCRIPTEN__)
    /* performance.now() delta is in (fractional) milliseconds */
    double js_now = stm_js_perfnow() - _stm.start;
    SOKOL_ASSERT(js_now >= 0.0);
    now = (uint64_t) (js_now * 1000000.0);
#else
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    now = ((uint64_t)ts.tv_sec*1000000000 + (uint64_t)ts.tv_nsec) - _stm.start;
#endif
    return now;
}
246 |
247 | SOKOL_API_IMPL uint64_t stm_diff(uint64_t new_ticks, uint64_t old_ticks) {
248 | if (new_ticks > old_ticks) {
249 | return new_ticks - old_ticks;
250 | }
251 | else {
252 | return 1;
253 | }
254 | }
255 |
256 | SOKOL_API_IMPL uint64_t stm_since(uint64_t start_ticks) {
257 | return stm_diff(stm_now(), start_ticks);
258 | }
259 |
260 | SOKOL_API_IMPL uint64_t stm_laptime(uint64_t* last_time) {
261 | SOKOL_ASSERT(last_time);
262 | uint64_t dt = 0;
263 | uint64_t now = stm_now();
264 | if (0 != *last_time) {
265 | dt = stm_diff(now, *last_time);
266 | }
267 | *last_time = now;
268 | return dt;
269 | }
270 |
271 | // first number is frame duration in ns, second number is tolerance in ns,
272 | // the resulting min/max values must not overlap!
273 | static const uint64_t _stm_refresh_rates[][2] = {
274 | { 16666667, 1000000 }, // 60 Hz: 16.6667 +- 1ms
275 | { 13888889, 250000 }, // 72 Hz: 13.8889 +- 0.25ms
276 | { 13333333, 250000 }, // 75 Hz: 13.3333 +- 0.25ms
277 | { 11764706, 250000 }, // 85 Hz: 11.7647 +- 0.25
278 | { 11111111, 250000 }, // 90 Hz: 11.1111 +- 0.25ms
279 | { 8333333, 500000 }, // 120 Hz: 8.3333 +- 0.5ms
280 | { 6944445, 500000 }, // 144 Hz: 6.9445 +- 0.5ms
281 | { 4166667, 1000000 }, // 240 Hz: 4.1666 +- 1ms
282 | { 0, 0 }, // keep the last element always at zero
283 | };
284 |
285 | SOKOL_API_IMPL uint64_t stm_round_to_common_refresh_rate(uint64_t ticks) {
286 | uint64_t ns;
287 | int i = 0;
288 | while (0 != (ns = _stm_refresh_rates[i][0])) {
289 | uint64_t tol = _stm_refresh_rates[i][1];
290 | if ((ticks > (ns - tol)) && (ticks < (ns + tol))) {
291 | return ns;
292 | }
293 | i++;
294 | }
295 | // fallthough: didn't fit into any buckets
296 | return ticks;
297 | }
298 |
299 | SOKOL_API_IMPL double stm_sec(uint64_t ticks) {
300 | return (double)ticks / 1000000000.0;
301 | }
302 |
303 | SOKOL_API_IMPL double stm_ms(uint64_t ticks) {
304 | return (double)ticks / 1000000.0;
305 | }
306 |
307 | SOKOL_API_IMPL double stm_us(uint64_t ticks) {
308 | return (double)ticks / 1000.0;
309 | }
310 |
311 | SOKOL_API_IMPL double stm_ns(uint64_t ticks) {
312 | return (double)ticks;
313 | }
314 | #endif /* SOKOL_IMPL */
315 |
316 |
--------------------------------------------------------------------------------
/simd_bitonic.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | /*
4 | To use this library, do this in *one* C or C++ file:
5 | #define __SIMD_BITONIC_IMPLEMENTATION__
6 | #include "simd_bitonic.h"
7 |
8 | COMPILATION
9 |
10 | DOCUMENTATION
11 |
12 | int simd_small_sort_max();
13 |
14 | Returns the maximum number of floats that the library can sort with simd_small_sort()
15 |
16 | void simd_small_sort(float* array, int element_count);
17 |
18 | Sorts a small array of floats. Does nothing if there are too many elements in the array (more than simd_small_sort_max())
19 |
20 | void simd_merge_sort(float* array, int element_count);
21 |
22 | Sorts an array of floats using a mix of merge sort and bitonic sort
23 | */
24 |
25 |
26 | #ifndef __SIMD_BITONIC__
27 | #define __SIMD_BITONIC__
28 |
29 |
30 | //----------------------------------------------------------------------------------------------------------------------
31 | // Prototypes
32 | //----------------------------------------------------------------------------------------------------------------------
33 |
34 | #ifdef __cplusplus
35 | extern "C" {
36 | #endif
37 |
38 | int simd_small_sort_max();
39 | void simd_small_sort(float* array, int element_count);
40 | void simd_merge_sort(float* array, int element_count);
41 |
42 | #ifdef __cplusplus
43 | }
44 | #endif
45 |
46 | //----------------------------------------------------------------------------------------------------------------------
47 | // Implementation
48 | //----------------------------------------------------------------------------------------------------------------------
49 |
50 | #ifdef __SIMD_BITONIC_IMPLEMENTATION__
51 | #undef __SIMD_BITONIC_IMPLEMENTATION__
52 |
53 | #include <stdint.h>
54 | #include <stddef.h>
55 | #include <string.h>
56 |
57 | // positive infinity float hexadecimal value
58 | #define FLOAT_PINF (0x7F800000)
59 |
60 | //----------------------------------------------------------------------------------------------------------------------
61 | // Neon
62 | //----------------------------------------------------------------------------------------------------------------------
63 | #if defined(__ARM_NEON) && defined(__ARM_NEON__)
64 |
65 | #include <arm_neon.h>
66 |
67 | #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
68 | #define SIMD_VECTOR_WIDTH (4)
69 |
70 | typedef float32x4_t simd_vector;
71 |
72 | //----------------------------------------------------------------------------------------------------------------------
// sorts the 4 floats of one NEON register with a 3-stage bitonic network
static inline float32x4_t simd_sort_1V(float32x4_t input)
{
    {
        // stage 1: compare-exchange neighbours inside each 64-bit pair
        float32x4_t perm_neigh = vrev64q_f32(input);
        float32x4_t perm_neigh_min = vminq_f32(input, perm_neigh);
        float32x4_t perm_neigh_max = vmaxq_f32(input, perm_neigh);
        input = vtrn2q_f32(perm_neigh_min, perm_neigh_max);
    }
    {
        // stage 2: compare against the fully reversed vector
        float32x4_t perm_neigh = __builtin_shufflevector(input, input, 3, 2, 1, 0);
        float32x4_t perm_neigh_min = vminq_f32(input, perm_neigh);
        float32x4_t perm_neigh_max = vmaxq_f32(input, perm_neigh);
        // vextq_u64 takes uint64x2_t operands: go through explicit
        // reinterprets instead of implicit float32x4_t -> uint64x2_t
        // conversion, which is a compile error outside lax-conversion modes
        input = vreinterpretq_f32_u64(vextq_u64(vreinterpretq_u64_f32(perm_neigh_min),
                                                vreinterpretq_u64_f32(perm_neigh_max), 1));
    }
    {
        // stage 3: final neighbour cleanup
        float32x4_t perm_neigh = vrev64q_f32(input);
        float32x4_t perm_neigh_min = vminq_f32(input, perm_neigh);
        float32x4_t perm_neigh_max = vmaxq_f32(input, perm_neigh);
        input = vtrn2q_f32(perm_neigh_min, perm_neigh_max);
    }
    return input;
}
95 |
96 | //----------------------------------------------------------------------------------------------------------------------
// cleanup pass of a bitonic merge inside one register:
// compare-exchange at stride 2, then at stride 1
static inline float32x4_t simd_aftermerge_1V(float32x4_t input)
{
    {
        float32x4_t perm_neigh = __builtin_shufflevector(input, input, 2, 3, 0, 1);
        float32x4_t perm_neigh_min = vminq_f32(input, perm_neigh);
        float32x4_t perm_neigh_max = vmaxq_f32(input, perm_neigh);
        // vextq_u64 takes uint64x2_t operands: reinterpret explicitly
        // instead of relying on implicit vector conversions, which are a
        // compile error outside lax-conversion modes
        input = vreinterpretq_f32_u64(vextq_u64(vreinterpretq_u64_f32(perm_neigh_min),
                                                vreinterpretq_u64_f32(perm_neigh_max), 1));
    }
    {
        float32x4_t perm_neigh = vrev64q_f32(input);
        float32x4_t perm_neigh_min = vminq_f32(input, perm_neigh);
        float32x4_t perm_neigh_max = vmaxq_f32(input, perm_neigh);
        input = vtrn2q_f32(perm_neigh_min, perm_neigh_max);
    }
    return input;
}
113 |
114 | //----------------------------------------------------------------------------------------------------------------------
// compares *a against the reversed *b; per-lane minima land in *a,
// maxima in *b (the cross-vector exchange of a bitonic merge)
static inline void simd_permute_minmax_2V(float32x4_t *a, float32x4_t *b)
{
    float32x4_t reversed_b = __builtin_shufflevector(*b, *b, 3, 2, 1, 0);
    float32x4_t lo = vminq_f32(*a, reversed_b);
    float32x4_t hi = vmaxq_f32(*a, reversed_b);
    *a = lo;
    *b = hi;
}
123 |
124 | //----------------------------------------------------------------------------------------------------------------------
// lane-wise sort of a vector pair: *a keeps the minima, *b the maxima
static inline void simd_minmax_2V(float32x4_t *a, float32x4_t *b)
{
    const float32x4_t va = *a;
    const float32x4_t vb = *b;
    *a = vminq_f32(va, vb);
    *b = vmaxq_f32(va, vb);
}
132 |
133 | //----------------------------------------------------------------------------------------------------------------------
// loads element_count (1..4) floats from vector slot 'index'; missing lanes
// are filled with +inf so they sort to the end
static inline float32x4_t simd_load_partial(const float* array, int index, int element_count)
{
    int array_index = SIMD_VECTOR_WIDTH * index;
    if (element_count == SIMD_VECTOR_WIDTH)
        return vld1q_f32(array + array_index);

    // build the +inf constant with a reinterpret instead of the previous
    // *(float*)&bits pun, which violates strict aliasing
    float32x4_t result = vreinterpretq_f32_u32(vdupq_n_u32(FLOAT_PINF));
    result = vsetq_lane_f32(array[array_index + 0], result, 0);

    if (element_count > 1)
        result = vsetq_lane_f32(array[array_index + 1], result, 1);

    if (element_count > 2)
        result = vsetq_lane_f32(array[array_index + 2], result, 2);

    return result;
}
152 |
153 | //----------------------------------------------------------------------------------------------------------------------
// writes back only the first element_count (1..4) lanes of the vector
static inline void simd_store_partial(float* array, float32x4_t a, int index, int element_count)
{
    float* dst = array + SIMD_VECTOR_WIDTH * index;
    if (element_count == SIMD_VECTOR_WIDTH)
    {
        vst1q_f32(dst, a);
        return;
    }

    dst[0] = vgetq_lane_f32(a, 0);

    if (element_count > 1)
        dst[1] = vgetq_lane_f32(a, 1);

    if (element_count > 2)
        dst[2] = vgetq_lane_f32(a, 2);
}
172 |
173 | //----------------------------------------------------------------------------------------------------------------------
// loads one full vector (4 contiguous floats) from vector slot 'index'
static inline float32x4_t simd_load_vector(const float* array, int index)
{
    return vld1q_f32(array + SIMD_VECTOR_WIDTH * index);
}

//----------------------------------------------------------------------------------------------------------------------
// stores one full vector (4 contiguous floats) at vector slot 'index'
static inline void simd_store_vector(float* array, float32x4_t a, int index)
{
    vst1q_f32(array + SIMD_VECTOR_WIDTH * index, a);
}
184 |
185 | #else
186 |
187 | //----------------------------------------------------------------------------------------------------------------------
188 | // AVX
189 | //----------------------------------------------------------------------------------------------------------------------
190 |
191 | #include <immintrin.h>
192 | #include <emmintrin.h>
193 |
194 | #define SIMD_VECTOR_WIDTH (8)
195 |
196 | typedef __m256 simd_vector;
197 |
198 | //----------------------------------------------------------------------------------------------------------------------
// swaps the two 128-bit halves of the register: [low|high] -> [high|low]
// NOTE(review): the _mm256_* prefix is reserved for the compiler/intrinsics
// namespace — consider renaming (e.g. simd_swap_halves)
static inline simd_vector _mm256_swap(__m256 input)
{
    return _mm256_permute2f128_ps(input, input, _MM_SHUFFLE(0, 0, 1, 1));
}
203 |
204 | //----------------------------------------------------------------------------------------------------------------------
// lane-wise sort of a vector pair: minima land in *a, maxima in *b
// (operand order of min/max preserved to keep NaN behavior unchanged)
static inline void simd_minmax_2V(__m256* a, __m256* b)
{
    const __m256 lhs = *a;
    const __m256 rhs = *b;
    *a = _mm256_min_ps(rhs, lhs);
    *b = _mm256_max_ps(rhs, lhs);
}
211 |
212 | //----------------------------------------------------------------------------------------------------------------------
213 | static inline void simd_permute_minmax_2V(__m256* a, __m256* b)
214 | {
215 | __m256 swap = _mm256_swap(*b);
216 | __m256 perm_neigh = _mm256_permute_ps(swap, _MM_SHUFFLE(0, 1, 2, 3));
217 | __m256 perm_neigh_min = _mm256_min_ps(*a, perm_neigh);
218 | __m256 perm_neigh_max = _mm256_max_ps(*a, perm_neigh);
219 | *a = perm_neigh_min;
220 | *b = perm_neigh_max;
221 | }
222 |
223 | //----------------------------------------------------------------------------------------------------------------------
// sorts the 8 floats of one AVX register with a bitonic network; each stage
// compares the lanes against a permuted copy and blends per-lane min/max
// back into place (the blend mask selects which lanes keep the max)
static inline __m256 simd_sort_1V(simd_vector input)
{
    {
        // exchange neighbours (0<->1, 2<->3, ...)
        __m256 perm_neigh = _mm256_permute_ps(input, _MM_SHUFFLE(2, 3, 0, 1));
        __m256 perm_neigh_min = _mm256_min_ps(input, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(input, perm_neigh);
        input = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xAA);
    }
    {
        // reverse within each group of 4
        __m256 perm_neigh = _mm256_permute_ps(input, _MM_SHUFFLE(0, 1, 2, 3));
        __m256 perm_neigh_min = _mm256_min_ps(input, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(input, perm_neigh);
        input = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xCC);
    }
    {
        // neighbours again
        __m256 perm_neigh = _mm256_permute_ps(input, _MM_SHUFFLE(2, 3, 0, 1));
        __m256 perm_neigh_min = _mm256_min_ps(input, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(input, perm_neigh);
        input = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xAA);
    }
    {
        // compare against the fully reversed 8-lane vector
        __m256 swap = _mm256_swap(input);
        __m256 perm_neigh = _mm256_permute_ps(swap, _MM_SHUFFLE(0, 1, 2, 3));
        __m256 perm_neigh_min = _mm256_min_ps(input, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(input, perm_neigh);
        input = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xF0);
    }
    {
        // stride-2 exchange
        __m256 perm_neigh = _mm256_permute_ps(input, _MM_SHUFFLE(1, 0, 3, 2));
        __m256 perm_neigh_min = _mm256_min_ps(input, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(input, perm_neigh);
        input = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xCC);
    }
    {
        // final neighbour cleanup
        __m256 perm_neigh = _mm256_permute_ps(input, _MM_SHUFFLE(2, 3, 0, 1));
        __m256 perm_neigh_min = _mm256_min_ps(input, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(input, perm_neigh);
        input = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xAA);
    }
    return input;
}
265 |
266 | //----------------------------------------------------------------------------------------------------------------------
// cleanup pass of a bitonic merge inside one register:
// compare-exchange at stride 4, then 2, then 1
static inline __m256 simd_aftermerge_1V(simd_vector a)
{
    {
        // stride 4: compare the two 128-bit halves lane-by-lane
        __m256 swap = _mm256_swap(a);
        __m256 perm_neigh = _mm256_permute_ps(swap, _MM_SHUFFLE(3, 2, 1, 0));
        __m256 perm_neigh_min = _mm256_min_ps(a, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(a, perm_neigh);
        a = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xF0);
    }
    {
        // stride 2
        __m256 perm_neigh = _mm256_permute_ps(a, _MM_SHUFFLE(1, 0, 3, 2));
        __m256 perm_neigh_min = _mm256_min_ps(a, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(a, perm_neigh);
        a = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xCC);
    }
    {
        // stride 1
        __m256 perm_neigh = _mm256_permute_ps(a, _MM_SHUFFLE(2, 3, 0, 1));
        __m256 perm_neigh_min = _mm256_min_ps(a, perm_neigh);
        __m256 perm_neigh_max = _mm256_max_ps(a, perm_neigh);
        a = _mm256_blend_ps(perm_neigh_min, perm_neigh_max, 0xAA);
    }
    return a;
}
290 |
291 | //----------------------------------------------------------------------------------------------------------------------
// builds the lane mask for _mm256_maskload_ps/_mm256_maskstore_ps:
// lane i is active (sign bit set) when i < element_count; lane 7 is never
// active since partial loads/stores handle at most 7 elements.
// use -1 instead of 0xffffffff: _mm256_set_epi32 takes signed ints and
// converting an out-of-range unsigned constant is implementation-defined
static inline __m256i loadstore_mask(int element_count)
{
    return _mm256_set_epi32(0,
                            (element_count>6) ? -1 : 0,
                            (element_count>5) ? -1 : 0,
                            (element_count>4) ? -1 : 0,
                            (element_count>3) ? -1 : 0,
                            (element_count>2) ? -1 : 0,
                            (element_count>1) ? -1 : 0,
                            (element_count>0) ? -1 : 0);
}
303 |
304 | //----------------------------------------------------------------------------------------------------------------------
305 | static inline __m256 simd_load_partial(const float* array, int index, int element_count)
306 | {
307 | if (element_count == SIMD_VECTOR_WIDTH)
308 | return _mm256_loadu_ps(array + index * SIMD_VECTOR_WIDTH);
309 |
310 | __m256 inf_mask = _mm256_cvtepi32_ps(_mm256_set_epi32(FLOAT_PINF,
311 | (element_count>6) ? 0 : FLOAT_PINF,
312 | (element_count>5) ? 0 : FLOAT_PINF,
313 | (element_count>4) ? 0 : FLOAT_PINF,
314 | (element_count>3) ? 0 : FLOAT_PINF,
315 | (element_count>2) ? 0 : FLOAT_PINF,
316 | (element_count>1) ? 0 : FLOAT_PINF,
317 | (element_count>0) ? 0 : FLOAT_PINF));
318 |
319 | __m256 a = _mm256_maskload_ps(array + index * SIMD_VECTOR_WIDTH, loadstore_mask(element_count));
320 | return _mm256_or_ps(a, inf_mask);
321 | }
322 |
323 | //----------------------------------------------------------------------------------------------------------------------
324 | static inline void simd_store_partial(float* array, __m256 a, int index, int element_count)
325 | {
326 | if (element_count == SIMD_VECTOR_WIDTH)
327 | {
328 | _mm256_storeu_ps(array + index * SIMD_VECTOR_WIDTH, a);
329 | }
330 | else
331 | {
332 | _mm256_maskstore_ps(array + index * SIMD_VECTOR_WIDTH, loadstore_mask(element_count), a);
333 | }
334 | }
335 |
336 | //----------------------------------------------------------------------------------------------------------------------
// loads one full vector (8 contiguous floats) from vector slot 'vector_index'
static inline __m256 simd_load_vector(const float* array, int vector_index)
{
    return _mm256_loadu_ps(array + SIMD_VECTOR_WIDTH * vector_index);
}

//----------------------------------------------------------------------------------------------------------------------
// stores one full vector (8 contiguous floats) at vector slot 'vector_index'
static inline void simd_store_vector(float* array, __m256 a, int vector_index)
{
    _mm256_storeu_ps(array + SIMD_VECTOR_WIDTH * vector_index, a);
}
347 |
348 | //----------------------------------------------------------------------------------------------------------------------
349 | #endif // AVX
350 | //----------------------------------------------------------------------------------------------------------------------
351 |
352 | //----------------------------------------------------------------------------------------------------------------------
// after-merge over 2 vectors: exchange across the pair, then finish each
// vector with the in-register cleanup
static inline void simd_aftermerge_2V(simd_vector *a, simd_vector *b)
{
    simd_minmax_2V(a, b);
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
}
359 |
360 | //----------------------------------------------------------------------------------------------------------------------
// after-merge over 3 vectors: cross-vector exchanges (a<->c, then a<->b),
// then per-vector cleanup; statement order matters (sorting network)
static inline void simd_aftermerge_3V(simd_vector *a, simd_vector *b, simd_vector *c)
{
    simd_minmax_2V(a, c);
    simd_minmax_2V(a, b);
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
}
369 |
370 | //----------------------------------------------------------------------------------------------------------------------
// after-merge over 4 vectors: distance-2 exchanges, then distance-1,
// then per-vector cleanup; statement order matters (sorting network)
static inline void simd_aftermerge_4V(simd_vector *a, simd_vector *b, simd_vector *c, simd_vector *d)
{
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
}
382 |
383 | //----------------------------------------------------------------------------------------------------------------------
// after-merge over 5 vectors: distance-4, distance-2, then distance-1
// exchanges, followed by per-vector cleanup (order matters)
static inline void simd_aftermerge_5V(simd_vector *a, simd_vector *b, simd_vector *c, simd_vector *d, simd_vector* e)
{
    simd_minmax_2V(a, e);
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
}
397 |
398 | //----------------------------------------------------------------------------------------------------------------------
// after-merge over 6 vectors: distance-4, distance-2, then distance-1
// exchanges, followed by per-vector cleanup (order matters)
static inline void simd_aftermerge_6V(simd_vector *a, simd_vector *b, simd_vector *c, simd_vector *d, simd_vector* e, simd_vector* f)
{
    simd_minmax_2V(a, e);
    simd_minmax_2V(b, f);
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    simd_minmax_2V(e, f);
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
    *f = simd_aftermerge_1V(*f);
}
415 |
416 | //----------------------------------------------------------------------------------------------------------------------
// after-merge over 7 vectors: distance-4, distance-2, then distance-1
// exchanges, followed by per-vector cleanup (order matters)
static inline void simd_aftermerge_7V(simd_vector *a, simd_vector *b, simd_vector *c, simd_vector *d, simd_vector* e, simd_vector* f, simd_vector *g)
{
    simd_minmax_2V(a, e);
    simd_minmax_2V(b, f);
    simd_minmax_2V(c, g);
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    simd_minmax_2V(e, g);
    simd_minmax_2V(e, f);
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
    *f = simd_aftermerge_1V(*f);
    *g = simd_aftermerge_1V(*g);
}
436 |
437 | //----------------------------------------------------------------------------------------------------------------------
// after-merge over 8 vectors: exchange across the two halves, clean up each
// half of 4, then each individual vector (order matters)
static inline void simd_aftermerge_8V(simd_vector *a, simd_vector *b, simd_vector *c, simd_vector *d, simd_vector* e, simd_vector* f, simd_vector *g, simd_vector* h)
{
    // distance-4 exchanges across the halves
    simd_minmax_2V(a, e);
    simd_minmax_2V(b, f);
    simd_minmax_2V(c, g);
    simd_minmax_2V(d, h);

    // distance-2 then distance-1 exchanges in the first half
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);

    // same for the second half
    simd_minmax_2V(e, g);
    simd_minmax_2V(f, h);
    simd_minmax_2V(e, f);
    simd_minmax_2V(g, h);

    // per-vector cleanup
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
    *f = simd_aftermerge_1V(*f);
    *g = simd_aftermerge_1V(*g);
    *h = simd_aftermerge_1V(*h);
}
464 |
465 | //----------------------------------------------------------------------------------------------------------------------
466 | static inline void simd_aftermerge_16V(simd_vector *a, simd_vector *b, simd_vector *c, simd_vector *d, simd_vector* e, simd_vector* f, simd_vector *g, simd_vector* h,
467 | simd_vector *i, simd_vector *j, simd_vector *k, simd_vector *l, simd_vector *m, simd_vector *n, simd_vector *o, simd_vector *p)
468 | {
469 | simd_minmax_2V(a, i);
470 | simd_minmax_2V(b, j);
471 | simd_minmax_2V(c, k);
472 | simd_minmax_2V(d, l);
473 | simd_minmax_2V(e, m);
474 | simd_minmax_2V(f, n);
475 | simd_minmax_2V(g, o);
476 | simd_minmax_2V(h, p);
477 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
478 | simd_aftermerge_8V(i, j, k, l, m, n, o, p);
479 | }
480 |
481 | //----------------------------------------------------------------------------------------------------------------------
482 | static inline void simd_merge_2V_sorted(simd_vector* a, simd_vector* b)
483 | {
484 | simd_permute_minmax_2V(a, b);
485 | *a = simd_aftermerge_1V(*a);
486 | *b = simd_aftermerge_1V(*b);
487 | }
488 |
489 | //----------------------------------------------------------------------------------------------------------------------
490 | static inline void simd_sort_2V(simd_vector* a, simd_vector* b)
491 | {
492 | *a = simd_sort_1V(*a);
493 | *b = simd_sort_1V(*b);
494 | simd_merge_2V_sorted(a, b);
495 | }
496 |
497 | //----------------------------------------------------------------------------------------------------------------------
static inline void simd_merge_3V_sorted(simd_vector* a, simd_vector* b, simd_vector* c)
{
    // Merge a sorted two-vector run (a,b) with a sorted single vector c.
    // Boundary compare-exchange between the two runs, then a cleanup
    // exchange on the lower pair; order is significant.
    simd_permute_minmax_2V(b, c);
    simd_minmax_2V(a, b);
    // Per-vector lane cleanup.
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
}
506 |
507 | //----------------------------------------------------------------------------------------------------------------------
508 | static inline void simd_sort_3V(simd_vector* a, simd_vector* b, simd_vector* c)
509 | {
510 | simd_sort_2V(a, b);
511 | *c = simd_sort_1V(*c);
512 | simd_merge_3V_sorted(a, b, c);
513 | }
514 |
515 | //----------------------------------------------------------------------------------------------------------------------
static inline void simd_merge_4V_sorted(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d)
{
    // Merge two sorted two-vector runs (a,b) and (c,d) into one sorted
    // four-vector run; statement order encodes the network.
    // Boundary compare-exchanges between the runs, outermost pair first.
    simd_permute_minmax_2V(a, d);
    simd_permute_minmax_2V(b, c);
    // Distance-1 exchanges within each half.
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    // Per-vector lane cleanup.
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
}
527 |
528 | //----------------------------------------------------------------------------------------------------------------------
529 | static inline void simd_sort_4V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d)
530 | {
531 | simd_sort_2V(a, b);
532 | simd_sort_2V(c, d);
533 | simd_merge_4V_sorted(a, b, c, d);
534 | }
535 |
536 | //----------------------------------------------------------------------------------------------------------------------
static inline void simd_sort_5V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e)
{
    // Sort five vectors: sort a..d and e separately, then merge in place.
    simd_sort_4V(a, b, c, d);
    *e = simd_sort_1V(*e);
    // Boundary compare-exchange between the 4-vector run and e.
    simd_permute_minmax_2V(d, e);
    // After-merge style cleanup of a..d (distance 2 then distance 1);
    // statement order is significant.
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    // Per-vector lane cleanup.
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
}
552 |
553 | //----------------------------------------------------------------------------------------------------------------------
static inline void simd_sort_6V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f)
{
    // Sort six vectors: sort a..d and e..f separately, then merge in place.
    simd_sort_4V(a, b, c, d);
    simd_sort_2V(e, f);
    // Boundary compare-exchanges between the two runs.
    simd_permute_minmax_2V(c, f);
    simd_permute_minmax_2V(d, e);
    // Cleanup exchanges (lower four at distance 2 then 1, upper pair);
    // statement order is significant.
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    simd_minmax_2V(e, f);
    // Per-vector lane cleanup.
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
    *f = simd_aftermerge_1V(*f);
}
572 |
573 | //----------------------------------------------------------------------------------------------------------------------
static inline void simd_sort_7V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g)
{
    // Sort seven vectors: sort a..d and e..g separately, then merge in place.
    simd_sort_4V(a, b, c, d);
    simd_sort_3V(e, f, g);
    // Boundary compare-exchanges between the two runs.
    simd_permute_minmax_2V(c, f);
    simd_permute_minmax_2V(d, e);
    simd_permute_minmax_2V(b, g);
    // Cleanup network: lower four (distance 2 then 1), then upper three;
    // statement order is significant.
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    simd_minmax_2V(e, g);
    simd_minmax_2V(e, f);
    // Per-vector lane cleanup.
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
    *f = simd_aftermerge_1V(*f);
    *g = simd_aftermerge_1V(*g);
}
595 |
596 | //----------------------------------------------------------------------------------------------------------------------
static inline void simd_sort_8V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h)
{
    // Sort eight vectors: sort each half of four, then merge in place.
    simd_sort_4V(a, b, c, d);
    simd_sort_4V(e, f, g, h);
    // Boundary compare-exchanges between the two runs, outermost pair first.
    simd_permute_minmax_2V(a, h);
    simd_permute_minmax_2V(b, g);
    simd_permute_minmax_2V(c, f);
    simd_permute_minmax_2V(d, e);
    // Cleanup network over each half (distance 2 then distance 1);
    // statement order is significant.
    simd_minmax_2V(a, c);
    simd_minmax_2V(b, d);
    simd_minmax_2V(a, b);
    simd_minmax_2V(c, d);
    simd_minmax_2V(e, g);
    simd_minmax_2V(f, h);
    simd_minmax_2V(e, f);
    simd_minmax_2V(g, h);
    // Per-vector lane cleanup.
    *a = simd_aftermerge_1V(*a);
    *b = simd_aftermerge_1V(*b);
    *c = simd_aftermerge_1V(*c);
    *d = simd_aftermerge_1V(*d);
    *e = simd_aftermerge_1V(*e);
    *f = simd_aftermerge_1V(*f);
    *g = simd_aftermerge_1V(*g);
    *h = simd_aftermerge_1V(*h);
}
622 |
623 | //----------------------------------------------------------------------------------------------------------------------
624 | static inline void simd_sort_9V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i)
625 | {
626 | simd_sort_8V(a, b, c, d, e, f, g, h);
627 | *i = simd_sort_1V(*i);
628 | simd_permute_minmax_2V(h, i);
629 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
630 | *i = simd_aftermerge_1V(*i);
631 | }
632 |
633 | //----------------------------------------------------------------------------------------------------------------------
634 | static inline void simd_sort_10V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i, simd_vector* j)
635 | {
636 | simd_sort_8V(a, b, c, d, e, f, g, h);
637 | simd_sort_2V(i, j);
638 | simd_permute_minmax_2V(g, j);
639 | simd_permute_minmax_2V(h, i);
640 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
641 | simd_aftermerge_2V(i, j);
642 | }
643 |
644 | //----------------------------------------------------------------------------------------------------------------------
645 | static inline void simd_sort_11V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i, simd_vector* j, simd_vector* k)
646 | {
647 | simd_sort_8V(a, b, c, d, e, f, g, h);
648 | simd_sort_3V(i, j, k);
649 | simd_permute_minmax_2V(f, k);
650 | simd_permute_minmax_2V(g, j);
651 | simd_permute_minmax_2V(h, i);
652 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
653 | simd_aftermerge_3V(i, j, k);
654 | }
655 |
656 | //----------------------------------------------------------------------------------------------------------------------
657 | static inline void simd_sort_12V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l)
658 | {
659 | simd_sort_8V(a, b, c, d, e, f, g, h);
660 | simd_sort_4V(i, j, k, l);
661 | simd_permute_minmax_2V(e, l);
662 | simd_permute_minmax_2V(f, k);
663 | simd_permute_minmax_2V(g, j);
664 | simd_permute_minmax_2V(h, i);
665 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
666 | simd_aftermerge_4V(i, j, k, l);
667 | }
668 |
669 | //----------------------------------------------------------------------------------------------------------------------
670 | static inline void simd_sort_13V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m)
671 | {
672 | simd_sort_8V(a, b, c, d, e, f, g, h);
673 | simd_sort_5V(i, j, k, l, m);
674 | simd_permute_minmax_2V(d, m);
675 | simd_permute_minmax_2V(e, l);
676 | simd_permute_minmax_2V(f, k);
677 | simd_permute_minmax_2V(g, j);
678 | simd_permute_minmax_2V(h, i);
679 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
680 | simd_aftermerge_5V(i, j, k, l, m);
681 | }
682 |
683 | //----------------------------------------------------------------------------------------------------------------------
684 | static inline void simd_sort_14V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n)
685 | {
686 | simd_sort_8V(a, b, c, d, e, f, g, h);
687 | simd_sort_6V(i, j, k, l, m, n);
688 | simd_permute_minmax_2V(c, n);
689 | simd_permute_minmax_2V(d, m);
690 | simd_permute_minmax_2V(e, l);
691 | simd_permute_minmax_2V(f, k);
692 | simd_permute_minmax_2V(g, j);
693 | simd_permute_minmax_2V(h, i);
694 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
695 | simd_aftermerge_6V(i, j, k, l, m, n);
696 | }
697 |
698 | //----------------------------------------------------------------------------------------------------------------------
699 | static inline void simd_sort_15V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o)
700 | {
701 | simd_sort_8V(a, b, c, d, e, f, g, h);
702 | simd_sort_7V(i, j, k, l, m, n, o);
703 | simd_permute_minmax_2V(b, o);
704 | simd_permute_minmax_2V(c, n);
705 | simd_permute_minmax_2V(d, m);
706 | simd_permute_minmax_2V(e, l);
707 | simd_permute_minmax_2V(f, k);
708 | simd_permute_minmax_2V(g, j);
709 | simd_permute_minmax_2V(h, i);
710 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
711 | simd_aftermerge_7V(i, j, k, l, m, n, o);
712 | }
713 |
714 | //----------------------------------------------------------------------------------------------------------------------
715 | static inline void simd_sort_16V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h, simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p)
716 | {
717 | simd_sort_8V(a, b, c, d, e, f, g, h);
718 | simd_sort_8V(i, j, k, l, m, n, o, p);
719 | simd_permute_minmax_2V(a, p);
720 | simd_permute_minmax_2V(b, o);
721 | simd_permute_minmax_2V(c, n);
722 | simd_permute_minmax_2V(d, m);
723 | simd_permute_minmax_2V(e, l);
724 | simd_permute_minmax_2V(f, k);
725 | simd_permute_minmax_2V(g, j);
726 | simd_permute_minmax_2V(h, i);
727 | simd_aftermerge_8V(a, b, c, d, e, f, g, h);
728 | simd_aftermerge_8V(i, j, k, l, m, n, o, p);
729 | }
730 |
731 | //----------------------------------------------------------------------------------------------------------------------
732 | static inline void simd_sort_17V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
733 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
734 | simd_vector* q)
735 | {
736 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
737 | *q = simd_sort_1V(*q);
738 | simd_permute_minmax_2V(p, q);
739 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
740 | *q = simd_aftermerge_1V(*q);
741 | }
742 |
743 | //----------------------------------------------------------------------------------------------------------------------
744 | static inline void simd_sort_18V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
745 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
746 | simd_vector* q, simd_vector* r)
747 | {
748 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
749 | simd_sort_2V(q, r);
750 | simd_permute_minmax_2V(p, q);
751 | simd_permute_minmax_2V(o, r);
752 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
753 | simd_aftermerge_2V(q, r);
754 | }
755 |
756 | //----------------------------------------------------------------------------------------------------------------------
757 | static inline void simd_sort_19V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
758 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
759 | simd_vector* q, simd_vector* r, simd_vector* s)
760 | {
761 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
762 | simd_sort_3V(q, r, s);
763 | simd_permute_minmax_2V(p, q);
764 | simd_permute_minmax_2V(o, r);
765 | simd_permute_minmax_2V(n, s);
766 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
767 | simd_aftermerge_3V(q, r, s);
768 | }
769 |
770 | //----------------------------------------------------------------------------------------------------------------------
771 | static inline void simd_sort_20V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
772 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
773 | simd_vector* q, simd_vector* r, simd_vector* s, simd_vector* t)
774 | {
775 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
776 | simd_sort_4V(q, r, s, t);
777 | simd_permute_minmax_2V(p, q);
778 | simd_permute_minmax_2V(o, r);
779 | simd_permute_minmax_2V(n, s);
780 | simd_permute_minmax_2V(m, t);
781 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
782 | simd_aftermerge_4V(q, r, s, t);
783 | }
784 |
785 | //----------------------------------------------------------------------------------------------------------------------
786 | static inline void simd_sort_21V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
787 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
788 | simd_vector* q, simd_vector* r, simd_vector* s, simd_vector* t, simd_vector* u)
789 | {
790 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
791 | simd_sort_5V(q, r, s, t, u);
792 | simd_permute_minmax_2V(p, q);
793 | simd_permute_minmax_2V(o, r);
794 | simd_permute_minmax_2V(n, s);
795 | simd_permute_minmax_2V(m, t);
796 | simd_permute_minmax_2V(l, u);
797 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
798 | simd_aftermerge_5V(q, r, s, t, u);
799 | }
800 |
801 | //----------------------------------------------------------------------------------------------------------------------
802 | static inline void simd_sort_22V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
803 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
804 | simd_vector* q, simd_vector* r, simd_vector* s, simd_vector* t, simd_vector* u, simd_vector* v)
805 | {
806 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
807 | simd_sort_6V(q, r, s, t, u, v);
808 | simd_permute_minmax_2V(p, q);
809 | simd_permute_minmax_2V(o, r);
810 | simd_permute_minmax_2V(n, s);
811 | simd_permute_minmax_2V(m, t);
812 | simd_permute_minmax_2V(l, u);
813 | simd_permute_minmax_2V(k, v);
814 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
815 | simd_aftermerge_6V(q, r, s, t, u, v);
816 | }
817 |
818 | //----------------------------------------------------------------------------------------------------------------------
819 | static inline void simd_sort_23V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
820 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
821 | simd_vector* q, simd_vector* r, simd_vector* s, simd_vector* t, simd_vector* u, simd_vector* v, simd_vector* x)
822 | {
823 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
824 | simd_sort_7V(q, r, s, t, u, v, x);
825 | simd_permute_minmax_2V(p, q);
826 | simd_permute_minmax_2V(o, r);
827 | simd_permute_minmax_2V(n, s);
828 | simd_permute_minmax_2V(m, t);
829 | simd_permute_minmax_2V(l, u);
830 | simd_permute_minmax_2V(k, v);
831 | simd_permute_minmax_2V(j, x);
832 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
833 | simd_aftermerge_7V(q, r, s, t, u, v, x);
834 | }
835 |
836 | //----------------------------------------------------------------------------------------------------------------------
837 | static inline void simd_sort_24V(simd_vector* a, simd_vector* b, simd_vector* c, simd_vector* d, simd_vector* e, simd_vector* f, simd_vector* g, simd_vector* h,
838 | simd_vector* i, simd_vector* j, simd_vector* k, simd_vector* l, simd_vector *m, simd_vector* n, simd_vector* o, simd_vector* p,
839 | simd_vector* q, simd_vector* r, simd_vector* s, simd_vector* t, simd_vector* u, simd_vector* v, simd_vector* x, simd_vector* y)
840 | {
841 | simd_sort_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
842 | simd_sort_8V(q, r, s, t, u, v, x, y);
843 | simd_permute_minmax_2V(p, q);
844 | simd_permute_minmax_2V(o, r);
845 | simd_permute_minmax_2V(n, s);
846 | simd_permute_minmax_2V(m, t);
847 | simd_permute_minmax_2V(l, u);
848 | simd_permute_minmax_2V(k, v);
849 | simd_permute_minmax_2V(j, x);
850 | simd_permute_minmax_2V(i, y);
851 | simd_aftermerge_16V(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
852 | simd_aftermerge_8V(q, r, s, t, u, v, x, y);
853 | }
854 |
855 | //----------------------------------------------------------------------------------------------------------------------
856 | int simd_small_sort_max()
857 | {
858 | return SIMD_VECTOR_WIDTH * 24;
859 | }
860 |
861 | //----------------------------------------------------------------------------------------------------------------------
862 | void simd_small_sort(float* array, int element_count)
863 | {
864 | if (element_count <= 1)
865 | return;
866 |
867 | const int full_vec_count = element_count / SIMD_VECTOR_WIDTH;
868 | const int last_vec_size = element_count - (full_vec_count * SIMD_VECTOR_WIDTH);
869 |
870 | simd_vector data[24];
871 |
872 | for(int i=0; i size) ? simd_load_partial(array + *index, 0, size - *index) : simd_load_vector(array + *index, 0);
986 | *index += SIMD_VECTOR_WIDTH;
987 | return result;
988 | }
989 |
990 | //----------------------------------------------------------------------------------------------------------------------
991 | static inline void simd_store_vector_overflow(float* array, int size, int *index, simd_vector a)
992 | {
993 | if (*index + SIMD_VECTOR_WIDTH > size)
994 | {
995 | simd_store_partial(array + *index, a, 0, size - *index);
996 | }
997 | else
998 | {
999 | simd_store_vector(array + *index, a, 0);
1000 | }
1001 | *index += SIMD_VECTOR_WIDTH;
1002 | }
1003 |
1004 | //----------------------------------------------------------------------------------------------------------------------
1005 | // based on Efficient Implementation of Sorting on MultiCore SIMD CPU Architecture paper
1006 | void merge_arrays(float* array, int left, int middle, int right)
1007 | {
1008 | int left_element_count = middle - left + 1;
1009 | int right_element_count = right - middle;
1010 |
1011 | float* left_array = (float*) malloc(sizeof(float) * left_element_count);
1012 | float* right_array = (float*) malloc(sizeof(float) * right_element_count);
1013 |
1014 | memcpy(left_array, array + left, sizeof(float) * left_element_count);
1015 | memcpy(right_array, array + middle + 1, sizeof(float) * right_element_count);
1016 |
1017 | int left_index, right_index, output_index;
1018 | left_index = 0;
1019 | right_index = 0;
1020 | output_index = left;
1021 |
1022 | simd_vector a = simd_load_vector_overflow(left_array, left_element_count, &left_index);
1023 | simd_vector b = simd_load_vector_overflow(right_array, right_element_count, &right_index);
1024 |
1025 | simd_merge_2V_sorted(&a, &b);
1026 | simd_store_vector_overflow(array, right+1, &output_index, a);
1027 |
1028 | while (left_index < left_element_count && right_index < right_element_count)
1029 | {
1030 | if (left_array[left_index] MERGE_SORT_TILE)
1070 | {
1071 | middle = left + MERGE_SORT_TILE - 1;
1072 | }
1073 | else
1074 | {
1075 | middle = left + (right - left) / 2;
1076 | }
1077 |
1078 | left_element_count = middle - left + 1;
1079 | right_element_count = right - middle;
1080 |
1081 | if (left_element_count <= MERGE_SORT_TILE)
1082 | simd_small_sort(array + left, left_element_count);
1083 | else
1084 | merge_sort(array, left, middle);
1085 |
1086 | if (right_element_count <= MERGE_SORT_TILE)
1087 | simd_small_sort(array + middle + 1, right_element_count);
1088 | else
1089 | merge_sort(array, middle + 1, right);
1090 |
1091 | merge_arrays(array, left, middle, right);
1092 | }
1093 | }
1094 |
1095 | //----------------------------------------------------------------------------------------------------------------------
void simd_merge_sort(float* array, int element_count)
{
    // Sort `element_count` floats in place: inputs small enough for the
    // vectorized sorting networks go straight through simd_small_sort,
    // anything larger is handled by the recursive SIMD merge sort.
    if (element_count > simd_small_sort_max())
    {
        merge_sort(array, 0, element_count - 1);
    }
    else
    {
        simd_small_sort(array, element_count);
    }
}
1107 |
1108 | #endif
1109 | #endif
1110 |
--------------------------------------------------------------------------------