├── .github └── workflows │ └── test.yml ├── LICENSE ├── README.md ├── missing_sse.h └── test ├── .gitignore ├── Makefile ├── bswap.c ├── bswap_trampoline.c ├── exhaustive_16bit.c └── exhaustive_8bit.c /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push] 4 | 5 | jobs: 6 | Test: 7 | strategy: 8 | matrix: 9 | cc: [gcc, clang] 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v3 14 | - name: Run tests 15 | env: 16 | CC: ${{matrix.cc}} 17 | working-directory: test 18 | run: make 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A few missing SSE intrinsics 2 | 3 | [![Test](https://github.com/aklomp/missing-sse-intrinsics/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/missing-sse-intrinsics/actions/workflows/test.yml) 4 | 5 | C and C++ programmers already have many 6 | [SSE intrinsics](https://software.intel.com/sites/landingpage/IntrinsicsGuide/) 7 | available to them. Most of those map straightforwardly to their matching 8 | hardware instructions, but there are holes where the hardware doesn't natively 9 | support a given operation or type. For instance, there's an `_mm_cmpgt_epi8` 10 | intrinsic, but not an `_mm_cmpgt_epu8`. The first one exists because there's a 11 | hardware primitive, the second one is missing because its implementation would 12 | have to be composite. 13 | 14 | The set of intrinsics that we do have is quite rich, and can be combined in 15 | many ways to make up for the ones we don't. Because SSE programming is tricky 16 | enough as it is, I'm a big fan of building ever more small, self-contained 17 | "primitives" that are composited inline into ever larger functional units. We 18 | create the building blocks, forge them into subassemblies, and then let the 19 | compiler inline everything into an optimized function body. 
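
For a taste of what that composition looks like in practice, here is a minimal, hedged sketch (the helper name `_mm_inrange_epu8` is mine, not part of the header): two of the comparison polyfills documented below combine into a per-byte range test, and the compiler inlines the whole thing into a handful of instructions.

```c
#include "missing_sse.h"

// 0xFF where lo <= x <= hi (unsigned, per byte), else 0x00.
// Built entirely from the "missing" primitives documented below.
static inline __m128i
_mm_inrange_epu8 (__m128i x, __m128i lo, __m128i hi)
{
	return _mm_and_si128(
		_mm_cmpge_epu8(x, lo),
		_mm_cmple_epu8(x, hi));
}
```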
20 | 21 | This repo contains some of the new "missing intrinsics" that I built on top of 22 | the existing ones, and gradually also on top of the ones defined earlier. I 23 | purposefully followed the existing naming conventions for type and operator 24 | names, so that these functions act as a kind of "polyfill" for the holes in the 25 | existing set (to use a JavaScript term). The naming convention probably 26 | violates a standard somewhere, but I think it's defensible: most of these 27 | functions are small enough to be considered almost-primitives, some of the 28 | original intrinsics are also implemented as composites, and most operate in the 29 | same way as the actual intrinsics, namely in vertical fashion. (`x0` and `y0` 30 | form a pair, `x1` and `y1` form a pair, and so on.) With these new 31 | "primitives", SSE code becomes easier to read, test and reason about. 32 | 33 | ## License 34 | 35 | Informally speaking, these are fundamental building blocks as far as I'm 36 | concerned, and I wouldn't want to claim any rights to these. For what it's 37 | worth, I declare that I place these in the public domain. Do what you want with 38 | this stuff. 39 | 40 | Formally speaking, this repo is released under the 41 | [Unlicense](https://unlicense.org/). The contents of this license can be found 42 | in the `LICENSE` file. 43 | 44 | ## This repository 45 | 46 | This repository contains the code for some of the "missing SSE intrinsics" 47 | documented below. There are also some exhaustive tests for all 8-bit and 16-bit 48 | inputs. The only function that doesn't pass these tests is 49 | `_mm_divfast_epu8()`, and that's deliberate. It trades speed and simplicity for 50 | a few rare off-by-one errors. 51 | 52 | ## Documentation 53 | 54 | ### \_mm\_cmple\_epu8 55 | 56 | Compare each of the 8-bit unsigned ints in `x` to those in `y` and set the 57 | result to `0xFF` where `x <= y`. Equivalent to taking the minimum of both 58 | vectors and checking where `x` equals this minimum. 59 | 60 | | R0 | R1 | ... | R15 | 61 | |----------------------------|----------------------------|-----|------------------------------| 62 | | `(x0 <= y0) ? 0xFF : 0x00` | `(x1 <= y1) ? 0xFF : 0x00` | ... | `(x15 <= y15) ? 0xFF : 0x00` | 63 | 64 | ### \_mm\_cmpge\_epu8 65 | 66 | Compare each of the 8-bit unsigned ints in `x` to those in `y` and set the 67 | result to `0xFF` where `x >= y`. Equivalent to calling `_mm_cmple_epu8` above 68 | with the arguments swapped. 69 | 70 | | R0 | R1 | ... | R15 | 71 | |---------------------------|---------------------------|-----|-----------------------------| 72 | | `(x0 >= y0) ? 0xFF : 0x0` | `(x1 >= y1) ? 0xFF : 0x0` | ... | `(x15 >= y15) ? 0xFF : 0x0` | 73 | 74 | ### \_mm\_cmpgt\_epu8 75 | 76 | Compare each of the 8-bit unsigned ints in `x` to those in `y` and set the 77 | result to `0xFF` where `x > y`. Equivalent to checking whether `x` is equal to 78 | the maximum of `x` and `y`, but not equal to `y` itself. 79 | 80 | | R0 | R1 | ... | R15 | 81 | |--------------------------|--------------------------|-----|----------------------------| 82 | | `(x0 > y0) ? 0xFF : 0x0` | `(x1 > y1) ? 0xFF : 0x0` | ... | `(x15 > y15) ? 0xFF : 0x0` | 83 | 84 | ### \_mm\_cmplt\_epu8 85 | 86 | Compare each of the 8-bit unsigned ints in `x` to those in `y` and set the 87 | result to `0xFF` where `x < y`. Equivalent to calling `_mm_cmpgt_epu8` above 88 | with swapped arguments. 89 | 90 | | R0 | R1 | ... 
| R15 |
91 | |--------------------------|--------------------------|-----|----------------------------|
92 | | `(x0 < y0) ? 0xFF : 0x0` | `(x1 < y1) ? 0xFF : 0x0` | ... | `(x15 < y15) ? 0xFF : 0x0` |
93 | 
94 | ### \_mm\_cmple\_epu16
95 | 
96 | Compare each of the eight 16-bit unsigned ints in `x` to those in `y` and set
97 | the result to `0xFFFF` where `x <= y`. Equivalent to computing the saturating
98 | subtraction `x − y` and checking where the result is zero (which is the case
99 | exactly when `x` is less than or equal to `y`).
100 | 
101 | | R0 | R1 | ... | R7 |
102 | |-----------------------------|-----------------------------|-----|-----------------------------|
103 | | `(x0 <= y0) ? 0xFFFF : 0x0` | `(x1 <= y1) ? 0xFFFF : 0x0` | ... | `(x7 <= y7) ? 0xFFFF : 0x0` |
104 | 
105 | ### \_mm\_cmpge\_epu16
106 | 
107 | Compare each of the eight 16-bit unsigned ints in `x` to those in `y` and set
108 | the result to `0xFFFF` where `x >= y`. Equivalent to calling `_mm_cmple_epu16`
109 | above with the arguments swapped.
110 | 
111 | | R0 | R1 | ... | R7 |
112 | |-----------------------------|-----------------------------|-----|-----------------------------|
113 | | `(x0 >= y0) ? 0xFFFF : 0x0` | `(x1 >= y1) ? 0xFFFF : 0x0` | ... | `(x7 >= y7) ? 0xFFFF : 0x0` |
114 | 
115 | ### \_mm\_cmpgt\_epu16
116 | 
117 | Compare each of the eight 16-bit unsigned ints in `x` to those in `y` and set
118 | the result to `0xFFFF` where `x > y`. Equivalent to checking whether `x` is
119 | greater than or equal to `y`, but not equal to `y`.
120 | 
121 | | R0 | R1 | ... | R7 |
122 | |----------------------------|----------------------------|-----|----------------------------|
123 | | `(x0 > y0) ? 0xFFFF : 0x0` | `(x1 > y1) ? 0xFFFF : 0x0` | ... | `(x7 > y7) ? 0xFFFF : 0x0` |
124 | 
125 | ### \_mm\_cmplt\_epu16
126 | 
127 | Compare each of the eight 16-bit unsigned ints in `x` to those in `y` and set
128 | the result to `0xFFFF` where `x < y`. Equivalent to calling `_mm_cmpgt_epu16`
129 | above with swapped arguments.
130 | 
131 | | R0 | R1 | ... | R7 |
132 | |----------------------------|----------------------------|-----|----------------------------|
133 | | `(x0 < y0) ? 0xFFFF : 0x0` | `(x1 < y1) ? 0xFFFF : 0x0` | ... | `(x7 < y7) ? 0xFFFF : 0x0` |
134 | 
135 | ### \_mm\_cmpge\_epi16
136 | 
137 | Compare each of the eight 16-bit signed ints in `x` to those in `y` and set the
138 | result to `0xFFFF` where `x >= y`. Equivalent to `or`-ing the results of the
139 | native greater-than and equal-to comparisons.
140 | 
141 | | R0 | R1 | ... | R7 |
142 | |-----------------------------|-----------------------------|-----|-----------------------------|
143 | | `(x0 >= y0) ? 0xFFFF : 0x0` | `(x1 >= y1) ? 0xFFFF : 0x0` | ... | `(x7 >= y7) ? 0xFFFF : 0x0` |
144 | 
145 | ### \_mm\_not\_si128
146 | 
147 | Returns `~x`, the bitwise complement (inverse) of `x`. It performs an `xor` of
148 | every bit with 1, because `0 xor 1 = 1` and `1 xor 1 = 0`. This function can
149 | usually be avoided by applying
150 | [De Morgan's laws](https://en.wikipedia.org/wiki/De_Morgan's_laws) to your
151 | logic statements, so that they use the native `_mm_andnot_si128` (see the sketch below).
152 | 
153 | | R0    |
154 | |-------|
155 | | `~x0` |
156 | 
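As a quick, hedged illustration of that last point (the helper names are mine and purely for show): keeping the bytes of `x` where a mask is *not* set needs no explicit complement at all.

```c
#include "missing_sse.h"

// Both return the bits of x where mask is NOT set. The first spells out the
// complement with _mm_not_si128; the second reaches the native and-not
// instruction directly, which is what restructuring your logic usually allows.
static inline __m128i keep_where_clear_v1 (__m128i x, __m128i mask)
{
	return _mm_and_si128(_mm_not_si128(mask), x);
}

static inline __m128i keep_where_clear_v2 (__m128i x, __m128i mask)
{
	return _mm_andnot_si128(mask, x);
}
```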
157 | ### \_mm\_setone\_epi8
158 | 
159 | Just like `_mm_setzero_si128` sets everything to zero, this function sets all
160 | bytes in the vector to the value `1` without touching memory. A vector where
161 | every lane is `1` can be useful for incrementing a set of counters or
162 | performing a shift. However, a `_mm_set1_epi8(1)` will probably still be
163 | faster.
164 | 
165 | | R0    | R1    | ... | R15   |
166 | |-------|-------|-----|-------|
167 | | `0x1` | `0x1` | ... | `0x1` |
168 | 
169 | ### \_mm\_setone\_epi16
170 | 
171 | See above. Sets all words in the vector to the value `1` without touching
172 | memory. A `_mm_set1_epi16(1)` is probably faster.
173 | 
174 | | R0    | R1    | ... | R7    |
175 | |-------|-------|-----|-------|
176 | | `0x1` | `0x1` | ... | `0x1` |
177 | 
178 | ### \_mm\_blendv\_si128
179 | 
180 | Copies the bits from `x` to the output where `mask` is not set, and the bits
181 | from `y` where `mask` is set. Combines the values in `x` with those in `y` based
182 | on the bitmask. This is useful for replacing branching logic with masking
183 | logic, because the SSE truth operations return all-ones for the bytes where a
184 | given condition holds; see the sketch below.
185 | 
186 | | R0                            |
187 | |-------------------------------|
188 | | `(x0 & ~mask) \| (y0 & mask)` |
189 | 
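To make that concrete, here is a minimal, hedged sketch (the name `select_gt_epu8` is mine): a branch-free, per-byte version of `out = (x > y) ? a : b`, built from `_mm_cmpgt_epu8` above and `_mm_blendv_si128`.

```c
#include "missing_sse.h"

// Per byte: out[i] = (x[i] > y[i]) ? a[i] : b[i].
// The unsigned compare yields 0xFF or 0x00 per byte, which is exactly the
// kind of full-width mask that _mm_blendv_si128 expects.
static inline __m128i
select_gt_epu8 (__m128i x, __m128i y, __m128i a, __m128i b)
{
	__m128i mask = _mm_cmpgt_epu8(x, y);
	return _mm_blendv_si128(b, a, mask);
}
```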
190 | ### \_mm\_blendv\_epi8
191 | 
192 | This function was introduced as a hardware primitive in SSE 4.1. For each of
193 | the 16 bytes, copy either the byte from `x` or the byte from `y` to the output,
194 | depending on whether the matching byte in `mask` has its most significant bit
195 | (MSB) set. If the MSB is set, the byte in `y` is copied, else the one from `x`
196 | is passed through.
197 | 
198 | On hardware without SSE 4.1, when you're using logic masks in which each
199 | byte is known to be `0x00` or `0xFF`, it's faster to use
200 | `_mm_blendv_si128` above.
201 | 
202 | | R0 | R1 | ... | R15 |
203 | |----------------------------|----------------------------|-----|-------------------------------|
204 | | `(mask0 & 0x80) ? y0 : x0` | `(mask1 & 0x80) ? y1 : x1` | ... | `(mask15 & 0x80) ? y15 : x15` |
205 | 
206 | ### \_mm\_min\_epu16
207 | 
208 | Compare each of the eight 16-bit unsigned ints in `x` to those in `y` and place
209 | the one with the lowest value in the result. Implemented with a saturating
210 | subtraction: `x − max(x − y, 0)` equals `min(x, y)`. This instruction was
211 | introduced as a hardware primitive in SSE 4.1.
212 | 
213 | | R0 | R1 | ... | R7 |
214 | |---------------|---------------|-----|---------------|
215 | | `min(x0, y0)` | `min(x1, y1)` | ... | `min(x7, y7)` |
216 | 
217 | ### \_mm\_max\_epu16
218 | 
219 | Compare each of the eight 16-bit unsigned ints in `x` to those in `y` and place
220 | the one with the highest value in the result. The counterpart of `_mm_min_epu16`
221 | above: adding the saturating difference, `x + max(y − x, 0)`, gives `max(x, y)`.
222 | This instruction was introduced as a hardware primitive in SSE 4.1.
223 | 
224 | | R0 | R1 | ... | R7 |
225 | |---------------|---------------|-----|---------------|
226 | | `max(x0, y0)` | `max(x1, y1)` | ... | `max(x7, y7)` |
227 | 
228 | ### \_mm\_absdiff\_epu8
229 | 
230 | Calculate the absolute difference between the 8-bit unsigned ints in `x` and
231 | `y`, using unsigned saturating subtraction for speed. Saturating
232 | subtraction clamps negative results to zero. The absolute difference is
233 | `subs(x, y) + subs(y, x)`. For example, the absolute difference between 16 and
234 | 7 is `subs(16 − 7) + subs(7 − 16)`, or `9 + 0`. Since at least one of the
235 | subtractions will be zero, we can use an `or` to combine them.
236 | 
237 | | R0 | R1 | ... | R15 |
238 | |----------------|----------------|-----|------------------|
239 | | `abs(x0 − y0)` | `abs(x1 − y1)` | ... | `abs(x15 − y15)` |
240 | 
241 | ### \_mm\_absdiff\_epu16
242 | 
243 | Same as above, but for 16-bit unsigned ints.
244 | 
245 | | R0 | R1 | ... | R7 |
246 | |----------------|----------------|-----|----------------|
247 | | `abs(x0 − y0)` | `abs(x1 − y1)` | ... | `abs(x7 − y7)` |
248 | 
249 | ### \_mm\_div255\_epu16
250 | 
251 | Divides all eight 16-bit unsigned ints in `x` by the constant value 255, using
252 | the formula `x := ((x + 1) + (x >> 8)) >> 8`.
253 | 
254 | | R0 | R1 | ... | R7 |
255 | |------------|------------|-----|------------|
256 | | `x0 ∕ 255` | `x1 ∕ 255` | ... | `x7 ∕ 255` |
257 | 
258 | ### \_mm\_scale\_epu8
259 | 
260 | "Alpha blend" the 8-bit unsigned ints in `x` with the 8-bit unsigned "opacity"
261 | value in `y`. Calculates `x := x * (y / 255)`, i.e. it scales `x` by the ratio
262 | in `y`. Useful for image alpha blending.
263 | 
264 | | R0 | R1 | ... | R15 |
265 | |-------------------|-------------------|-----|---------------------|
266 | | `(x0 * y0) ∕ 255` | `(x1 * y1) ∕ 255` | ... | `(x15 * y15) ∕ 255` |
267 | 
268 | ### \_mm\_divfast\_epu8
269 | 
270 | Divide the 8-bit unsigned ints in `x` by the (scalar, non-vector) 8-bit
271 | unsigned integer `d`, accepting a slight error for 0.12% of the input space.
272 | This works on the principle that `a / b` is equal to `(a * c) / (b * c)`, where
273 | `c` is some arbitrary constant. After rearranging parentheses, we have `(a * (c / b)) / c`.
274 | This reduces the problem to one multiplication and two easy divisions: `c / b` is a
275 | constant computed once up front, and the division by `c` becomes a shift. The trick is
276 | finding a properly sized constant `c` such that `c / b` is precise enough (implying that
277 | `c` should be at least `256 * b`), but not so big that multiplying by `c / b` overflows
278 | the 16-bit intermediate result. `c` is a power of two, so the final division by `c` is a right shift. A worked example follows the result table below.
279 | 
280 | For all 65280 possible inputs (256 numerators with 255 denominators, since zero
281 | is not a denominator), this function is wrong in just 78 cases, as compared to
282 | regular truncating integer division. In each of those cases, the error is the
283 | same, namely off by +1. If that is acceptable, this method is fast. Or we can
284 | correct the error at a small performance cost, as shown below with
285 | `_mm_div_epu8`.
286 | 
287 | | R0 | R1 | ... | R15 |
288 | |----------|----------|-----|-----------|
289 | | `x0 ∕ d` | `x1 ∕ d` | ... | `x15 ∕ d` |
290 | 
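To make the constants concrete, here is a hedged scalar model of the same trick (the function name `divfast_scalar` is mine; the real code does this on whole vectors). For `d = 7` the shift ladder in the header picks `n = 10`, so `c = 1024` and `inv = 1024 / 7 + 1 = 147`; an input of `x = 200` then gives `(200 * 147) >> 10 = 29400 >> 10 = 28`, which matches `200 / 7`.

```c
#include <stdint.h>

// Scalar sketch of the arithmetic behind _mm_divfast_epu8, for illustration.
static uint8_t divfast_scalar (uint8_t x, uint8_t d)
{
	// n = 8 + floor(log2(d)), the same value the header's ladder produces:
	unsigned n = 8;
	for (uint8_t t = d; t > 1; t >>= 1)
		n++;

	// The "inverse sensitivity": c / d + 1, with c = 1 << n:
	uint16_t inv = (uint16_t)((1u << n) / d + 1);

	// Multiply and shift. The SSE version keeps this product in 16 bits,
	// which is why c must not be chosen any larger:
	return (uint8_t)(((uint32_t)x * inv) >> n);
}
```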
291 | ### \_mm\_div\_epu8
292 | 
293 | Divide the 8-bit unsigned ints in `x` by the (non-vector) 8-bit unsigned
294 | integer `d`, doing back-substitution to check for and correct the 78 error
295 | cases of the faster but inaccurate method above. Once we have found the result
296 | the fast way, we multiply it by the divisor and check whether the product is
297 | close enough to the original numerator. If not, we splice in the result minus
298 | one. Surprisingly, this is just 10 to 20 percent slower than the fast version above.
299 | 
300 | | R0 | R1 | ... | R15 |
301 | |----------|----------|-----|-----------|
302 | | `x0 ∕ d` | `x1 ∕ d` | ... | `x15 ∕ d` |
303 | 
304 | ### \_mm\_bswap\_epi16
305 | 
306 | Change endianness (reverse byte order) in each 16-bit word by exchanging the
307 | high and the low byte.
308 | 
309 | | R0 | R1 | R2 | R3 | R4 | ... | R15 |
310 | |------|------|------|------|------|-----|-------|
311 | | `R1` | `R0` | `R3` | `R2` | `R5` | ... | `R14` |
312 | 
313 | ### \_mm\_bswap\_epi32
314 | 
315 | Change endianness (reverse byte order) in each 32-bit word by reversing all
316 | four bytes. If SSSE3 is available, we assume that `_mm_shuffle_epi8` with a
317 | constant shuffle mask does the job faster.
318 | 
319 | | R0 | R1 | R2 | R3 | R4 | ... | R15 |
320 | |------|------|------|------|------|-----|-------|
321 | | `R3` | `R2` | `R1` | `R0` | `R7` | ... | `R12` |
322 | 
323 | ### \_mm\_bswap\_epi64
324 | 
325 | Change endianness (reverse byte order) in each 64-bit word by reversing all
326 | eight bytes.
327 | 
328 | | R0 | R1 | R2 | R3 | R4 | ... | R15 |
329 | |------|------|------|------|------|-----|------|
330 | | `R7` | `R6` | `R5` | `R4` | `R3` | ... | `R8` |
331 | 
332 | ### \_mm\_bswap\_si128
333 | 
334 | Change endianness (reverse byte order) in the 128-bit vector by reversing all
335 | sixteen bytes.
336 | 
337 | | R0 | R1 | R2 | R3 | R4 | ... | R15 |
338 | |-------|-------|-------|-------|-------|-----|------|
339 | | `R15` | `R14` | `R13` | `R12` | `R11` | ... | `R0` |
340 | 
--------------------------------------------------------------------------------
/missing_sse.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <stdint.h>
4 | 
5 | #if defined(__SSSE3__)
6 | #include <tmmintrin.h>
7 | #elif defined(__SSE2__)
8 | #include <emmintrin.h>
9 | #else
10 | #error "No SSE support"
11 | #endif
12 | 
13 | // Return 0xFF where x <= y, else 0x00.
14 | static inline __m128i
15 | _mm_cmple_epu8 (__m128i x, __m128i y)
16 | {
17 | 	return _mm_cmpeq_epi8(_mm_min_epu8(x, y), x);
18 | }
19 | 
20 | // Return 0xFF where x >= y, else 0x00.
21 | static inline __m128i
22 | _mm_cmpge_epu8 (__m128i x, __m128i y)
23 | {
24 | 	return _mm_cmple_epu8(y, x);
25 | }
26 | 
27 | // Return 0xFF where x > y, else 0x00.
28 | static inline __m128i
29 | _mm_cmpgt_epu8 (__m128i x, __m128i y)
30 | {
31 | 	return _mm_andnot_si128(
32 | 		_mm_cmpeq_epi8(x, y),
33 | 		_mm_cmpeq_epi8(_mm_max_epu8(x, y), x)
34 | 	);
35 | }
36 | 
37 | // Return 0xFF where x < y, else 0x00.
38 | static inline __m128i
39 | _mm_cmplt_epu8 (__m128i x, __m128i y)
40 | {
41 | 	return _mm_cmpgt_epu8(y, x);
42 | }
43 | 
44 | // Return 0xFFFF where x <= y, else 0x0000.
45 | static inline __m128i
46 | _mm_cmple_epu16 (__m128i x, __m128i y)
47 | {
48 | 	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
49 | }
50 | 
51 | // Return 0xFFFF where x >= y, else 0x0000.
52 | static inline __m128i
53 | _mm_cmpge_epu16 (__m128i x, __m128i y)
54 | {
55 | 	return _mm_cmple_epu16(y, x);
56 | }
57 | 
58 | // Return 0xFFFF where x > y, else 0x0000.
59 | static inline __m128i
60 | _mm_cmpgt_epu16 (__m128i x, __m128i y)
61 | {
62 | 	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
63 | }
64 | 
65 | // Return 0xFFFF where x < y, else 0x0000.
66 | static inline __m128i
67 | _mm_cmplt_epu16 (__m128i x, __m128i y)
68 | {
69 | 	return _mm_cmpgt_epu16(y, x);
70 | }
71 | 
72 | // Return 0xFFFF where x >= y, else 0x0000.
73 | static inline __m128i
74 | _mm_cmpge_epi16 (__m128i x, __m128i y)
75 | {
76 | 	return _mm_or_si128(_mm_cmpeq_epi16(x, y), _mm_cmpgt_epi16(x, y));
77 | }
78 | 
79 | // Return ~x, the bitwise complement of x.
80 | static inline __m128i
81 | _mm_not_si128 (__m128i x)
82 | {
83 | 	return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
84 | }
85 | 
86 | // Return a vector where every byte has the value 1.
87 | static inline __m128i
88 | _mm_setone_epi8 (void)
89 | {
90 | 	__m128i x = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
91 | 	return _mm_xor_si128(_mm_add_epi8(x, x), x);
92 | }
93 | 
94 | // Return a vector where every word has the value 1.
95 | static inline __m128i 96 | _mm_setone_epi16 (void) 97 | { 98 | __m128i x = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()); 99 | return _mm_xor_si128(_mm_add_epi16(x, x), x); 100 | } 101 | 102 | // Replace the bit in x with the bit in y when the matching bit in the mask is 103 | // set. 104 | static inline __m128i 105 | _mm_blendv_si128 (__m128i x, __m128i y, __m128i mask) 106 | { 107 | return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y)); 108 | } 109 | 110 | // Replace the byte in x with the byte in y when the MSB of the corresponding 111 | // byte in the mask is set. 112 | #ifndef __SSE4_1__ 113 | static inline __m128i 114 | _mm_blendv_epi8 (__m128i x, __m128i y, __m128i mask) 115 | { 116 | return _mm_blendv_si128(x, y, _mm_cmplt_epi8(mask, _mm_setzero_si128())); 117 | } 118 | 119 | // Return x where x <= y, else y. 120 | static inline __m128i 121 | _mm_min_epu16 (__m128i x, __m128i y) 122 | { 123 | return _mm_sub_epi16(x, _mm_subs_epu16(x, y)); 124 | } 125 | 126 | // Return x where x >= y, else y. 127 | static inline __m128i 128 | _mm_max_epu16 (__m128i x, __m128i y) 129 | { 130 | return _mm_add_epi16(x, _mm_subs_epu16(y, x)); 131 | } 132 | #endif 133 | 134 | // Calculate the absolute difference: abs(x - y). 135 | static inline __m128i 136 | _mm_absdiff_epu8 (__m128i x, __m128i y) 137 | { 138 | return _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x)); 139 | } 140 | 141 | // Calculate the absolute difference: abs(x - y). 142 | static inline __m128i 143 | _mm_absdiff_epu16 (__m128i x, __m128i y) 144 | { 145 | return _mm_or_si128(_mm_subs_epu16(x, y), _mm_subs_epu16(y, x)); 146 | } 147 | 148 | // Divide 8 16-bit uints by 255: 149 | // x := ((x + 1) + (x >> 8)) >> 8 150 | static inline __m128i 151 | _mm_div255_epu16 (__m128i x) 152 | { 153 | return _mm_srli_epi16(_mm_adds_epu16( 154 | _mm_adds_epu16(x, _mm_set1_epi16(1)), 155 | _mm_srli_epi16(x, 8)), 8); 156 | } 157 | 158 | // Returns an "alpha blend" of x scaled by y/255; 159 | // x := x * (y / 255) 160 | // Reorder: x := (x * y) / 255 161 | static inline __m128i 162 | _mm_scale_epu8 (__m128i x, __m128i y) 163 | { 164 | // Unpack x and y into 16-bit uints: 165 | __m128i xlo = _mm_unpacklo_epi8(x, _mm_setzero_si128()); 166 | __m128i ylo = _mm_unpacklo_epi8(y, _mm_setzero_si128()); 167 | __m128i xhi = _mm_unpackhi_epi8(x, _mm_setzero_si128()); 168 | __m128i yhi = _mm_unpackhi_epi8(y, _mm_setzero_si128()); 169 | 170 | // Multiply x with y, keeping the low 16 bits: 171 | xlo = _mm_mullo_epi16(xlo, ylo); 172 | xhi = _mm_mullo_epi16(xhi, yhi); 173 | 174 | // Divide by 255: 175 | xlo = _mm_div255_epu16(xlo); 176 | xhi = _mm_div255_epu16(xhi); 177 | 178 | // Repack the 16-bit uints to clamped 8-bit values: 179 | return _mm_packus_epi16(xlo, xhi); 180 | } 181 | 182 | static inline __m128i 183 | _mm_divfast_epu8 (__m128i x, uint8_t d) 184 | { 185 | // Find shift factor: 186 | // This is actually much faster than using __builtin_clz(): 187 | uint8_t n 188 | = (d >= 128) ? 15 189 | : (d >= 64) ? 14 190 | : (d >= 32) ? 13 191 | : (d >= 16) ? 12 192 | : (d >= 8) ? 11 193 | : (d >= 4) ? 10 194 | : (d >= 2) ? 
9 195 | : 8; 196 | 197 | // Set 8 words of "inverse sensitivity": 198 | // Multiplying by this amount and right-shifting will give a 199 | // very good approximation of the result: 200 | __m128i inv = _mm_set1_epi16((1 << n) / d + 1); 201 | 202 | // Unpack input into two 16-bit uints: 203 | __m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128()); 204 | __m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128()); 205 | 206 | // Multiply with the "inverse sensitivity" and divide: 207 | lo = _mm_srli_epi16(_mm_mullo_epi16(lo, inv), n); 208 | hi = _mm_srli_epi16(_mm_mullo_epi16(hi, inv), n); 209 | 210 | // Repack: 211 | return _mm_packus_epi16(lo, hi); 212 | } 213 | 214 | static inline __m128i 215 | _mm_div_epu8 (__m128i x, uint8_t d) 216 | { 217 | // This is the same routine as above, but exact; it includes a 218 | // correction step where we back-substitute the quotient and correct 219 | // the off-by-one errors of the faster routine. 220 | uint8_t n 221 | = (d >= 128) ? 15 222 | : (d >= 64) ? 14 223 | : (d >= 32) ? 13 224 | : (d >= 16) ? 12 225 | : (d >= 8) ? 11 226 | : (d >= 4) ? 10 227 | : (d >= 2) ? 9 228 | : 8; 229 | 230 | // Set 8 words of "inverse sensitivity": 231 | // Multiplying by this amount and right-shifting will give a 232 | // very good approximation of the result: 233 | __m128i inv = _mm_set1_epi16((1 << n) / d + 1); 234 | 235 | // Unpack input into two 16-bit uints: 236 | __m128i xlo = _mm_unpacklo_epi8(x, _mm_setzero_si128()); 237 | __m128i xhi = _mm_unpackhi_epi8(x, _mm_setzero_si128()); 238 | 239 | // Multiply with the "inverse sensitivity" and divide: 240 | __m128i alo = _mm_srli_epi16(_mm_mullo_epi16(xlo, inv), n); 241 | __m128i ahi = _mm_srli_epi16(_mm_mullo_epi16(xhi, inv), n); 242 | 243 | // alo and ahi contain the quotients. The result is sometimes too large 244 | // by one, so we introduce a correction step. Decrement the quotients: 245 | __m128i blo = _mm_subs_epu16(alo, _mm_set1_epi16(1)); 246 | __m128i bhi = _mm_subs_epu16(ahi, _mm_set1_epi16(1)); 247 | 248 | // Multiply with the divisor to get something back in the neighbourhood 249 | // of the original numbers: 250 | __m128i mdd = _mm_set1_epi16(d); 251 | __m128i plo = _mm_mullo_epi16(blo, mdd); 252 | __m128i phi = _mm_mullo_epi16(bhi, mdd); 253 | 254 | // If (orig - new) >= d, the existing quotient is a better fit: 255 | // We can use native epi16 ops because these are all 8-bit numbers: 256 | __m128i masklo = _mm_cmpeq_epi16(_mm_min_epi16(_mm_subs_epu16(xlo, plo), mdd), mdd); 257 | __m128i maskhi = _mm_cmpeq_epi16(_mm_min_epi16(_mm_subs_epu16(xhi, phi), mdd), mdd); 258 | 259 | // Decrement the original divisors according to the inverse masks: 260 | alo = _mm_subs_epu16(alo, _mm_andnot_si128(masklo, _mm_set1_epi16(1))); 261 | ahi = _mm_subs_epu16(ahi, _mm_andnot_si128(maskhi, _mm_set1_epi16(1))); 262 | 263 | // Repack: 264 | return _mm_packus_epi16(alo, ahi); 265 | } 266 | 267 | // Swap the upper and lower byte in each 16-bit word. 268 | static inline __m128i 269 | _mm_bswap_epi16 (__m128i x) 270 | { 271 | return _mm_or_si128( 272 | _mm_slli_epi16(x, 8), 273 | _mm_srli_epi16(x, 8)); 274 | } 275 | 276 | // Reverse the order of all bytes in each 32-bit word. 277 | static inline __m128i 278 | _mm_bswap_epi32 (__m128i x) 279 | { 280 | #ifdef __SSSE3__ 281 | return _mm_shuffle_epi8(x, 282 | _mm_set_epi8( 283 | 12, 13, 14, 15, 284 | 8, 9, 10, 11, 285 | 4, 5, 6, 7, 286 | 0, 1, 2, 3)); 287 | #else 288 | // First swap bytes in each 16-bit word. 
289 | __m128i a = _mm_bswap_epi16(x); 290 | 291 | // Then swap all 16-bit words. 292 | a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); 293 | a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); 294 | 295 | return a; 296 | #endif 297 | } 298 | 299 | // Reverse the order of all bytes in each 64-bit word. 300 | static inline __m128i 301 | _mm_bswap_epi64 (__m128i x) 302 | { 303 | #ifdef __SSSE3__ 304 | return _mm_shuffle_epi8(x, 305 | _mm_set_epi8( 306 | 8, 9, 10, 11, 307 | 12, 13, 14, 15, 308 | 0, 1, 2, 3, 309 | 4, 5, 6, 7)); 310 | #else 311 | // Swap bytes in each 16-bit word. 312 | __m128i a = _mm_bswap_epi16(x); 313 | 314 | // Reverse all 16-bit words in 64-bit halves. 315 | a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3)); 316 | a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(0, 1, 2, 3)); 317 | 318 | return a; 319 | #endif 320 | } 321 | 322 | // Reverse the order of all bytes in the 128-bit word. 323 | static inline __m128i 324 | _mm_bswap_si128 (__m128i x) 325 | { 326 | #ifdef __SSSE3__ 327 | return _mm_shuffle_epi8(x, 328 | _mm_set_epi8( 329 | 0, 1, 2, 3, 330 | 4, 5, 6, 7, 331 | 8, 9, 10, 11, 332 | 12, 13, 14, 15)); 333 | #else 334 | // Swap bytes in each 16-bit word. 335 | __m128i a = _mm_bswap_epi16(x); 336 | 337 | // Reverse all 16-bit words in 64-bit halves. 338 | a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3)); 339 | a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(0, 1, 2, 3)); 340 | 341 | // Reverse 64-bit halves: 342 | return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)); 343 | #endif 344 | } 345 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | bswap 3 | exhaustive_8bit 4 | exhaustive_16bit 5 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS += -std=c99 -Wall -Wextra -Werror -pedantic -msse2 -O3 2 | 3 | .PHONY: test clean 4 | 5 | PROGS = \ 6 | bswap \ 7 | exhaustive_8bit \ 8 | exhaustive_16bit 9 | 10 | # The default target rebuilds and runs all tests. 
11 | test: clean all
12 | 	./bswap
13 | 	./exhaustive_8bit
14 | 	./exhaustive_16bit
15 | 
16 | all: $(PROGS)
17 | 
18 | exhaustive_%bit: exhaustive_%bit.o
19 | 	$(CC) $(LDFLAGS) -o $@ $^
20 | 
21 | exhaustive_%bit.o: exhaustive_%bit.c ../missing_sse.h
22 | 	$(CC) $(CFLAGS) -o $@ -c $<
23 | 
24 | bswap: bswap.o bswap_sse2.o bswap_ssse3.o
25 | 	$(CC) $(LDFLAGS) -o $@ $^
26 | 
27 | bswap_sse2.o: CFLAGS += -DFUNC_POSTFIX=sse2
28 | bswap_ssse3.o: CFLAGS += -DFUNC_POSTFIX=ssse3 -mssse3
29 | 
30 | bswap_%.o: bswap_trampoline.c
31 | 	$(CC) $(CFLAGS) -o $@ -c $<
32 | 
33 | %.o: %.c
34 | 	$(CC) $(CFLAGS) -o $@ -c $<
35 | 
36 | clean:
37 | 	$(RM) *.o $(PROGS)
38 | 
--------------------------------------------------------------------------------
/test/bswap.c:
--------------------------------------------------------------------------------
1 | #include <emmintrin.h>
2 | #include <stdio.h>
3 | #include <string.h>
4 | 
5 | // Forward declarations of the functions under test:
6 | __m128i _mm_bswap_epi16_sse2 (__m128i);
7 | __m128i _mm_bswap_epi32_sse2 (__m128i);
8 | __m128i _mm_bswap_epi64_sse2 (__m128i);
9 | __m128i _mm_bswap_si128_sse2 (__m128i);
10 | __m128i _mm_bswap_epi16_ssse3 (__m128i);
11 | __m128i _mm_bswap_epi32_ssse3 (__m128i);
12 | __m128i _mm_bswap_epi64_ssse3 (__m128i);
13 | __m128i _mm_bswap_si128_ssse3 (__m128i);
14 | 
15 | #define TESTCASE(__name, __input, __expect) \
16 | 	{ .name = # __name \
17 | 	, .input = __input \
18 | 	, .expect = __expect \
19 | 	, .func = _mm_bswap_ ## __name \
20 | 	}
21 | 
22 | static struct testcase {
23 | 	char *name;
24 | 	char *input;
25 | 	char *expect;
26 | 	__m128i (* func)(__m128i);
27 | }
28 | testcases[] =
29 | {
30 | 	TESTCASE (epi16_sse2, "ABCDEFGHIJKLMNPO", "BADCFEHGJILKNMOP"),
31 | 	TESTCASE (epi16_ssse3, "ABCDEFGHIJKLMNPO", "BADCFEHGJILKNMOP"),
32 | 	TESTCASE (epi32_sse2, "ABCDEFGHIJKLMNPO", "DCBAHGFELKJIOPNM"),
33 | 	TESTCASE (epi32_ssse3, "ABCDEFGHIJKLMNPO", "DCBAHGFELKJIOPNM"),
34 | 	TESTCASE (epi64_sse2, "ABCDEFGHIJKLMNPO", "HGFEDCBAOPNMLKJI"),
35 | 	TESTCASE (epi64_ssse3, "ABCDEFGHIJKLMNPO", "HGFEDCBAOPNMLKJI"),
36 | 	TESTCASE (si128_sse2, "ABCDEFGHIJKLMNPO", "OPNMLKJIHGFEDCBA"),
37 | 	TESTCASE (si128_ssse3, "ABCDEFGHIJKLMNPO", "OPNMLKJIHGFEDCBA"),
38 | };
39 | 
40 | static int
41 | run_testcase (struct testcase *t)
42 | {
43 | 	char buf[17];
44 | 	__m128i in = _mm_loadu_si128((__m128i *)t->input);
45 | 	__m128i out = t->func(in);
46 | 	_mm_storeu_si128((__m128i *)buf, out);
47 | 	buf[16] = '\0';
48 | 
49 | 	if (memcmp(t->expect, buf, 16) == 0) {
50 | 		return 0;
51 | 	}
52 | 	printf("FAIL: %s:\n for: %s\n exp: %s\n got: %s\n", t->name, t->input, t->expect, buf);
53 | 	return 1;
54 | }
55 | 
56 | int
57 | main (void)
58 | {
59 | 	unsigned int i, ret = 0;
60 | 
61 | 	for (i = 0; i < sizeof(testcases) / sizeof(testcases[0]); i++) {
62 | 		ret |= run_testcase(&testcases[i]);
63 | 	}
64 | 	return ret;
65 | }
66 | 
--------------------------------------------------------------------------------
/test/bswap_trampoline.c:
--------------------------------------------------------------------------------
1 | #if __SSSE3__
2 | #include <tmmintrin.h>
3 | #elif __SSE2__
4 | #include <emmintrin.h>
5 | #endif
6 | 
7 | // Include inline function definitions:
8 | #include "../missing_sse.h"
9 | 
10 | // Machinery to create trampoline functions for different
11 | // word widths and instruction sets (SSE2 and SSSE3):
12 | #define BSWAP(a) _BSWAP(a,FUNC_POSTFIX)
13 | #define _BSWAP(a,b) __BSWAP(a,b)
14 | #define __BSWAP(a,b) \
15 | 	__m128i _mm_bswap_##a##_##b (__m128i x) \
16 | 	{ \
17 | 		return _mm_bswap_##a(x); \
18 | 	}
19 | 
20 | BSWAP(epi16)
21 | BSWAP(epi32)
22 | BSWAP(epi64)
23 | BSWAP(si128)
24 | 
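// For reference, and assuming the -DFUNC_POSTFIX=sse2 build from the Makefile,
// the BSWAP(epi16) line above expands to roughly the following wrapper; the
// detour through _BSWAP/__BSWAP exists so that FUNC_POSTFIX is macro-expanded
// to sse2/ssse3 before the tokens are pasted together:
//
//     __m128i _mm_bswap_epi16_sse2 (__m128i x)
//     {
//         return _mm_bswap_epi16(x);
//     }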
--------------------------------------------------------------------------------
/test/exhaustive_16bit.c:
--------------------------------------------------------------------------------
1 | #include <stdbool.h>
2 | #include <stddef.h>
3 | #include <stdint.h>
4 | #include <stdio.h>
5 | 
6 | #include "../missing_sse.h"
7 | 
8 | // Inputs to test function:
9 | struct testcase {
10 | 	const uint16_t i;	// First input value
11 | 	const uint16_t j;	// Second input value
12 | 	uint16_t expect;	// Expected output value
13 | };
14 | 
15 | // Struct to define a simple test:
16 | struct test {
17 | 	const char *name;
18 | 	const char *op;
19 | 	__m128i (*run) (struct testcase *);
20 | };
21 | 
22 | // Check if all 8 epu16's are identical:
23 | static bool
24 | epu16_all_same (uint16_t buf[8])
25 | {
26 | 	for (int i = 1; i < 8; i++) {
27 | 		if (buf[0] != buf[i]) {
28 | 			return false;
29 | 		}
30 | 	}
31 | 	return true;
32 | }
33 | 
34 | static bool
35 | test_epu16_two (struct test *test)
36 | {
37 | 	uint16_t buf[8] __attribute__ ((aligned(16)));
38 | 	bool pass = true;
39 | 
40 | 	puts(test->name);
41 | 
42 | 	for (int i = 0; i < 0x10000; i++) {
43 | 		for (int j = 0; j < 0x10000; j++) {
44 | 
45 | 			// Create testcase
46 | 			struct testcase tc = {
47 | 				.i = i,
48 | 				.j = j,
49 | 			};
50 | 
51 | 			// Run testcase:
52 | 			__m128i c = test->run(&tc);
53 | 
54 | 			// Save result to array:
55 | 			_mm_store_si128((__m128i *)buf, c);
56 | 
57 | 			// Check that all elements in the result are identical:
58 | 			if (!epu16_all_same(buf)) {
59 | 				printf("FAIL: %d %s %d, not all identical\n", tc.i, test->op, tc.j);
60 | 				pass = false;
61 | 				continue;
62 | 			}
63 | 			// Does the expected result differ?
64 | 			if (buf[0] != tc.expect) {
65 | 				printf("FAIL: %d %s %d, expected %d, got %d\n", tc.i, test->op, tc.j, tc.expect, buf[0]);
66 | 				pass = false;
67 | 				continue;
68 | 			}
69 | 		}
70 | 	}
71 | 	return pass;
72 | }
73 | 
74 | static __m128i
75 | test_mm_cmplt_epu16 (struct testcase *tc)
76 | {
77 | 	tc->expect = (tc->i < tc->j) ? 0xFFFF : 0x0000;
78 | 	return _mm_cmplt_epu16(
79 | 		_mm_set1_epi16(tc->i),
80 | 		_mm_set1_epi16(tc->j));
81 | }
82 | 
83 | static __m128i
84 | test_mm_cmple_epu16 (struct testcase *tc)
85 | {
86 | 	tc->expect = (tc->i <= tc->j) ? 0xFFFF : 0x0000;
87 | 	return _mm_cmple_epu16(
88 | 		_mm_set1_epi16(tc->i),
89 | 		_mm_set1_epi16(tc->j));
90 | }
91 | 
92 | static __m128i
93 | test_mm_cmpge_epu16 (struct testcase *tc)
94 | {
95 | 	tc->expect = (tc->i >= tc->j) ? 0xFFFF : 0x0000;
96 | 	return _mm_cmpge_epu16(
97 | 		_mm_set1_epi16(tc->i),
98 | 		_mm_set1_epi16(tc->j));
99 | }
100 | 
101 | static __m128i
102 | test_mm_cmpgt_epu16 (struct testcase *tc)
103 | {
104 | 	tc->expect = (tc->i > tc->j) ? 0xFFFF : 0x0000;
105 | 	return _mm_cmpgt_epu16(
106 | 		_mm_set1_epi16(tc->i),
107 | 		_mm_set1_epi16(tc->j));
108 | }
109 | 
110 | static __m128i
111 | test_mm_min_epu16 (struct testcase *tc)
112 | {
113 | 	tc->expect = (tc->i > tc->j) ? tc->j : tc->i;
114 | 	return _mm_min_epu16(
115 | 		_mm_set1_epi16(tc->i),
116 | 		_mm_set1_epi16(tc->j));
117 | }
118 | 
119 | static __m128i
120 | test_mm_max_epu16 (struct testcase *tc)
121 | {
122 | 	tc->expect = (tc->i > tc->j) ? tc->i : tc->j;
123 | 	return _mm_max_epu16(
124 | 		_mm_set1_epi16(tc->i),
125 | 		_mm_set1_epi16(tc->j));
126 | }
127 | 
128 | static __m128i
129 | test_mm_absdiff_epu16 (struct testcase *tc)
130 | {
131 | 	tc->expect = (tc->i > tc->j) ? (tc->i - tc->j) : (tc->j - tc->i);
132 | 	return _mm_absdiff_epu16(
133 | 		_mm_set1_epi16(tc->i),
134 | 		_mm_set1_epi16(tc->j));
135 | }
136 | 
137 | static bool
138 | test_mm_div255_epu16 (void)
139 | {
140 | 	bool pass = true;
141 | 
142 | 	// Only works when i < 256*255 = 65280:
143 | 	// (result is 8-bit):
144 | 	puts("_mm_div255_epu16");
145 | 	for (int i = 0; i < 0xFF00; i++) {
146 | 		uint16_t c = _mm_extract_epi16(_mm_div255_epu16(_mm_set1_epi16(i)), 1);
147 | 		if (c != (i / 255)) {
148 | 			printf("FAIL: div255(%d), got %d, expected %d\n", i, c, (i / 255));
149 | 			pass = false;
150 | 		}
151 | 	}
152 | 	return pass;
153 | }
154 | 
155 | int
156 | main (void)
157 | {
158 | 	// Map for testing simple bytewise functions:
159 | 	struct test map[] = {
160 | 		{ "_mm_cmplt_epu16", "<", test_mm_cmplt_epu16 },
161 | 		{ "_mm_cmple_epu16", "<=", test_mm_cmple_epu16 },
162 | 		{ "_mm_cmpge_epu16", ">=", test_mm_cmpge_epu16 },
163 | 		{ "_mm_cmpgt_epu16", ">", test_mm_cmpgt_epu16 },
164 | 		{ "_mm_min_epu16", "min", test_mm_min_epu16 },
165 | 		{ "_mm_max_epu16", "max", test_mm_max_epu16 },
166 | 		{ "_mm_absdiff_epu16", "absdiff", test_mm_absdiff_epu16 },
167 | 	};
168 | 
169 | 	bool pass = true;
170 | 
171 | 	for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++)
172 | 		pass &= test_epu16_two(&map[i]);
173 | 
174 | 	// Handle this one separately:
175 | 	pass &= test_mm_div255_epu16();
176 | 
177 | 	return (pass) ? 0 : 1;
178 | }
179 | 
--------------------------------------------------------------------------------
/test/exhaustive_8bit.c:
--------------------------------------------------------------------------------
1 | #include <stdbool.h>
2 | #include <stddef.h>
3 | #include <stdint.h>
4 | #include <stdio.h>
5 | 
6 | #include "../missing_sse.h"
7 | 
8 | // Inputs to test function:
9 | struct testcase {
10 | 	const uint8_t i;	// First input value
11 | 	const uint8_t j;	// Second input value
12 | 	uint8_t expect;	// Expected output value
13 | };
14 | 
15 | // Struct to define a simple test:
16 | struct test {
17 | 	const char *name;
18 | 	const char *op;
19 | 	__m128i (*run) (struct testcase *);
20 | };
21 | 
22 | // Check if all 16 epu8's are identical:
23 | static bool
24 | epu8_all_same (uint8_t buf[16])
25 | {
26 | 	for (int i = 1; i < 16; i++)
27 | 		if (buf[0] != buf[i])
28 | 			return false;
29 | 
30 | 	return true;
31 | }
32 | 
33 | static bool
34 | test_epu8_two (struct test *test)
35 | {
36 | 	uint8_t buf[16] __attribute__ ((aligned(16)));
37 | 	bool pass = true;
38 | 
39 | 	puts(test->name);
40 | 
41 | 	for (int i = 0; i < 0x100; i++) {
42 | 		for (int j = 0; j < 0x100; j++) {
43 | 
44 | 			// Create testcase
45 | 			struct testcase tc = {
46 | 				.i = i,
47 | 				.j = j,
48 | 			};
49 | 
50 | 			// Run testcase:
51 | 			__m128i c = test->run(&tc);
52 | 
53 | 			// Save result to array:
54 | 			_mm_store_si128((__m128i *)buf, c);
55 | 
56 | 			// Check that all elements in the result are identical:
57 | 			if (!epu8_all_same(buf)) {
58 | 				printf("FAIL: %d %s %d, not all identical\n", tc.i, test->op, tc.j);
59 | 				pass = false;
60 | 				continue;
61 | 			}
62 | 
63 | 			// Does the expected result differ?
64 | 			if (buf[0] != tc.expect) {
65 | 				printf("FAIL: %d %s %d, expected %d, got %d\n", tc.i, test->op, tc.j, tc.expect, buf[0]);
66 | 				pass = false;
67 | 				continue;
68 | 			}
69 | 		}
70 | 	}
71 | 	return pass;
72 | }
73 | 
74 | static __m128i
75 | test_mm_cmplt_epu8 (struct testcase *tc)
76 | {
77 | 	tc->expect = (tc->i < tc->j) ? 0xFF : 0x00;
78 | 	return _mm_cmplt_epu8(
79 | 		_mm_set1_epi8(tc->i),
80 | 		_mm_set1_epi8(tc->j));
81 | }
82 | 
83 | static __m128i
84 | test_mm_cmple_epu8 (struct testcase *tc)
85 | {
86 | 	tc->expect = (tc->i <= tc->j) ?
0xFF : 0x00; 87 | return _mm_cmple_epu8( 88 | _mm_set1_epi8(tc->i), 89 | _mm_set1_epi8(tc->j)); 90 | } 91 | 92 | static __m128i 93 | test_mm_cmpge_epu8 (struct testcase *tc) 94 | { 95 | tc->expect = (tc->i >= tc->j) ? 0xFF : 0x00; 96 | return _mm_cmpge_epu8( 97 | _mm_set1_epi8(tc->i), 98 | _mm_set1_epi8(tc->j)); 99 | } 100 | 101 | static __m128i 102 | test_mm_cmpgt_epu8 (struct testcase *tc) 103 | { 104 | tc->expect = (tc->i > tc->j) ? 0xFF : 0x00; 105 | return _mm_cmpgt_epu8( 106 | _mm_set1_epi8(tc->i), 107 | _mm_set1_epi8(tc->j)); 108 | } 109 | 110 | static __m128i 111 | test_mm_absdiff_epu8 (struct testcase *tc) 112 | { 113 | tc->expect = (tc->i > tc->j) ? (tc->i - tc->j) : (tc->j - tc->i); 114 | return _mm_absdiff_epu8( 115 | _mm_set1_epi8(tc->i), 116 | _mm_set1_epi8(tc->j)); 117 | } 118 | 119 | static __m128i 120 | test_mm_scale_epu8 (struct testcase *tc) 121 | { 122 | tc->expect = (tc->i * tc->j) / 255; 123 | return _mm_scale_epu8( 124 | _mm_set1_epi8(tc->i), 125 | _mm_set1_epi8(tc->j)); 126 | } 127 | 128 | static bool 129 | test_mm_divfast_epu8 (void) 130 | { 131 | bool pass = true; 132 | 133 | puts("_mm_divfast_epu8"); 134 | for (int i = 0; i < 0x100; i++) { 135 | for (int j = 1; j < 0x100; j++) { 136 | __m128i a = _mm_set1_epi8(i); 137 | uint8_t c = _mm_extract_epi16(_mm_divfast_epu8(a, j), 1) & 0xFF; 138 | uint8_t s = i / j; 139 | if (c != s) { 140 | printf("FAIL: div(%d, %d), got %d, expected %d\n", i, j, c, s); 141 | pass = false; 142 | } 143 | } 144 | } 145 | return pass; 146 | } 147 | 148 | static bool 149 | test_mm_div_epu8 (void) 150 | { 151 | bool pass = true; 152 | 153 | puts("_mm_div_epu8"); 154 | for (int i = 0; i < 0x100; i++) { 155 | for (int j = 1; j < 0x100; j++) { 156 | __m128i a = _mm_set1_epi8(i); 157 | uint8_t c = _mm_extract_epi16(_mm_div_epu8(a, j), 1) & 0xFF; 158 | uint8_t s = i / j; 159 | if (c != s) { 160 | printf("FAIL: divp(%d, %d), got %d, expected %d\n", i, j, c, s); 161 | pass = false; 162 | } 163 | } 164 | } 165 | return pass; 166 | } 167 | 168 | int 169 | main (void) 170 | { 171 | // Map for testing simple bytewise functions: 172 | struct test map[] = { 173 | { "_mm_cmplt_epu8", "<", test_mm_cmplt_epu8 }, 174 | { "_mm_cmple_epu8", "<=", test_mm_cmple_epu8 }, 175 | { "_mm_cmpge_epu8", ">=", test_mm_cmpge_epu8 }, 176 | { "_mm_cmpgt_epu8", ">", test_mm_cmpgt_epu8 }, 177 | { "_mm_absdiff_epu8", "absdiff", test_mm_absdiff_epu8 }, 178 | { "_mm_scale_epu8", "scale", test_mm_scale_epu8 }, 179 | }; 180 | 181 | bool pass = true; 182 | 183 | for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++) 184 | pass &= test_epu8_two(&map[i]); 185 | 186 | pass &= test_mm_div_epu8(); 187 | 188 | // This function is inexact by design: 189 | (void) test_mm_divfast_epu8(); 190 | 191 | return (pass) ? 0 : 1; 192 | } 193 | --------------------------------------------------------------------------------