├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── bin
    └── test
├── graphs
    ├── \
    ├── data.txt
    ├── exp_graph.cpp
    ├── graph.png
    ├── graph.svg
    ├── optimize.cpp
    └── plot.py
├── include
    ├── simdexp.h
    ├── simdmath.h
    ├── simdrcp.h
    ├── simdrsqrt.h
    ├── simdtools.h
    └── simdtri.h
└── test
    ├── test.cpp
    ├── test_common.h
    ├── test_exp.h
    ├── test_rcp.h
    └── test_rsqrt.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | *.obj
 6 | 
 7 | # Precompiled Headers
 8 | *.gch
 9 | *.pch
10 | 
11 | # Compiled Dynamic libraries
12 | *.so
13 | *.dylib
14 | *.dll
15 | 
16 | # Fortran module files
17 | *.mod
18 | 
19 | # Compiled Static libraries
20 | *.lai
21 | *.la
22 | *.a
23 | *.lib
24 | 
25 | # Executables
26 | *.exe
27 | *.out
28 | *.app
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org>
25 | 
26 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds test/test.cpp into bin/test and runs it, with either GCC or the Intel compiler.
 2 | GXX=g++
 3 | ICPC=icpc
 4 | 
 5 | GCC_AVX_FLAGS=-O3 -mavx2 -mfma -ffast-math
 6 | INTEL_AVX_FLAGS=-O3 -fma -march=core-avx2
 7 | 
 8 | # These targets are commands, not files, so they must always be re-run.
 9 | .PHONY: all test-gcc test-intel clean
10 | 
11 | all: test-intel
12 | 
13 | test-gcc:
14 | 	mkdir -p bin/
15 | 	$(GXX) test/test.cpp -o bin/test $(GCC_AVX_FLAGS)
16 | 	bin/test
17 | 
18 | test-intel:
19 | 	mkdir -p bin/
20 | 	$(ICPC) test/test.cpp -o bin/test $(INTEL_AVX_FLAGS)
21 | 	bin/test
22 | 
23 | # -f: succeed even when bin/test has not been built yet.
24 | clean:
25 | 	@rm -f bin/test
26 | 
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | libsimd.h
 2 | ----------
 3 | 
 4 | C++ library of fast, approximate math functions for Intel AVX.
 5 | 
 6 | How to use?
 7 | ----------
 8 | Include the relevant header in your file, e.g.:
 9 | 
10 |     #include "simdrcp.h"
11 | 
12 | You need to enable the relevant compiler flags, see below for GCC and Intel compilers.
13 | 
14 | To compile with gcc, use e.g.
15 | 
16 |     g++ my_source.cpp -o my_exec -O3 -mfma -mavx2 -ffast-math
17 | 
18 | To compile with icpc, use e.g.
19 | 
20 |     icpc my_source.cpp -o my_exec -O3 -march=core-avx2 -fma
21 | 
22 | 
23 | Currently supported functions:
24 | ----------
25 | 
26 | Calculates 1/x, using one Newton-Raphson iteration on the start guess from the approximate AVX intrinsic _mm256_rcp_ps. Accurate to 32-bit precision, but faster than `_mm256_div_ps(ONE, q)`.
27 | 
28 |     #include "simdrcp.h"
29 |     __m256 _mm256_rcp1s_ps(const __m256 &q)
30 | 
31 | Calculates 1/sqrt(x), using one Newton-Raphson iteration on the start guess from the approximate AVX intrinsic _mm256_rsqrt_ps. Accurate to 32-bit precision, but faster than `_mm256_div_ps(ONE, _mm256_sqrt_ps(q))`.
32 | 
33 |     #include "simdrsqrt.h"
34 |     __m256 _mm256_rsqrt1s_ps(const __m256 &q)
35 | 
36 | Calculates exp(x) using a bit shifting technique. Extremely fast, but has an error of about 10%.
37 | 
38 |     #include "simdexp.h"
39 |     __m256 _mm256_expfaster_ps(const __m256 &q)
40 | 
41 | Only valid for -126 < x < 0.0.  Calculates exp(x) via bit shifting techniques and the Newton-Raphson approximation. Not super fast, but has an error of about 0.0001% or 0.001%, depending on which approximation is used for the reciprocal 1/x. If high accuracy is desired, use _mm256_rcp1s_ps() instead of _mm256_rcp_ps().
42 | 
43 |     #include "simdexp.h"
44 |     __m256 _mm256_expfastsmallneg_ps(const __m256 &q)
45 | 
46 | Prints a __m256 vector to standard output.
47 | 
48 |     #include "simdtools.h"
49 |     void _mm256_print_ps(const __m256 &q)
50 | 
51 | 
52 | License:
53 | ----------
54 | This software is released into the public domain (see LICENSE) and carries very few restrictions. If you use libsimd in academic publications, please cite this GitHub repository.
55 | 
56 |     Anders S. Christensen (2015) "libsimd - a C++ library of fast, approximate math functions for Intel AVX." https://github.com/andersx/libsimd
57 | 
58 | There will be a preprint on arXiv soon, if I have enough time to wrap things up.
59 | 
60 | 


--------------------------------------------------------------------------------
/bin/test:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersx/libsimdmath/c72b0cebd0595ea69f2e851a279823bd7567b688/bin/test


--------------------------------------------------------------------------------
/graphs/\:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2015 Anders S. Christensen
 2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 3 | //
 4 | // This is free and unencumbered software released into the public domain.
 5 | //
 6 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 7 | // distribute this software, either in source code form or as a compiled
 8 | // binary, for any purpose, commercial or non-commercial, and by any
 9 | // means.
10 | //
11 | // In jurisdictions that recognize copyright laws, the author or authors
12 | // of this software dedicate any and all copyright interest in the
13 | // software to the public domain. We make this dedication for the benefit
14 | // of the public at large and to the detriment of our heirs and
15 | // successors. We intend this dedication to be an overt act of
16 | // relinquishment in perpetuity of all present and future rights to this
17 | // software under copyright law.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 | // OTHER DEALINGS IN THE SOFTWARE.
26 | //
27 | // For more information, please refer to <http://unlicense.org>
28 | 
29 | 
30 | #include <stdio.h>
31 | #include <immintrin.h>
32 | #include <iostream>
33 | #include <math.h> 
34 | 
35 | #include "../include/simdexp.h"
36 | #include "../include/simdrsqrt.h"
37 | #include "../include/simdtools.h"
38 | 
39 | int main() {
40 | 
41 |     // Constants that turn a ratio approx/reference into a signed error in percent.
42 |     __m256 ONE = _mm256_set1_ps(1.0f);
43 |     __m256 ONE_HUNDRED = _mm256_set1_ps(100.0f);
44 | 
45 |     for (float i = -63.0f; i < 63.0f; i += 0.001f) {
46 |         // Every lane holds the same x; only lane 0 is printed below.
47 |         __m256 t = _mm256_set1_ps(i);
48 | 
49 |         // Error in percent, 100 * (approx / reference - 1), for each routine.
50 |         __m256 rcp = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(_mm256_rcp_ps(t), _mm256_div_ps(ONE, t)), ONE));
51 |         __m256 rcp1s = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_rcp1s_ps(t), _mm256_div_ps(ONE, t)), ONE));
52 |         __m256 rsqrt = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(_mm256_rsqrt_ps(t), _mm256_div_ps(ONE, _mm256_sqrt_ps(t))), ONE));
53 |         __m256 rsqrt1s = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_rsqrt1s_ps(t), _mm256_div_ps(ONE, _mm256_sqrt_ps(t))), ONE));
54 |         __m256 expfast = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_expfast_ps(t), _mm256_exp_ps(t)), ONE));
55 |         __m256 expfast1s = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_expfast1s_ps(t), _mm256_exp_ps(t)), ONE));
56 |         __m256 expfaster = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_expfaster_ps(t), _mm256_exp_ps(t)), ONE));
57 | 
58 |         float tvec[8];
59 |         float frcp[8];
60 |         float frcp1s[8];
61 |         float frsqrt[8];
62 |         float frsqrt1s[8];
63 |         float fexpfast[8];
64 |         float fexpfast1s[8];
65 |         float fexpfaster[8];
66 | 
67 |         // Unaligned stores: these stack arrays need not be 32-byte aligned.
68 |         _mm256_storeu_ps(tvec, t);
69 |         _mm256_storeu_ps(frcp, rcp);
70 |         _mm256_storeu_ps(frcp1s, rcp1s);
71 |         _mm256_storeu_ps(frsqrt, rsqrt);
72 |         _mm256_storeu_ps(frsqrt1s, rsqrt1s);
73 |         _mm256_storeu_ps(fexpfast, expfast);
74 |         _mm256_storeu_ps(fexpfast1s, expfast1s);
75 |         _mm256_storeu_ps(fexpfaster, expfaster);
76 |         printf("%16.10f %16.10f  %16.10f  %16.10f  %16.10f  %16.10f  %16.10f  %16.10f \n", tvec[0], frcp[0], frcp1s[0],
77 |                 frsqrt[0], frsqrt1s[0], fexpfast[0], fexpfast1s[0], fexpfaster[0]);
78 |     }
79 |     return 0;
80 | }
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/graphs/exp_graph.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2015 Anders S. Christensen
 2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 3 | //
 4 | // This is free and unencumbered software released into the public domain.
 5 | //
 6 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 7 | // distribute this software, either in source code form or as a compiled
 8 | // binary, for any purpose, commercial or non-commercial, and by any
 9 | // means.
10 | //
11 | // In jurisdictions that recognize copyright laws, the author or authors
12 | // of this software dedicate any and all copyright interest in the
13 | // software to the public domain. We make this dedication for the benefit
14 | // of the public at large and to the detriment of our heirs and
15 | // successors. We intend this dedication to be an overt act of
16 | // relinquishment in perpetuity of all present and future rights to this
17 | // software under copyright law.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 | // OTHER DEALINGS IN THE SOFTWARE.
26 | //
27 | // For more information, please refer to <http://unlicense.org>
28 | 
29 | 
30 | #include <stdio.h>
31 | #include <immintrin.h>
32 | #include <iostream>
33 | #include <math.h> 
34 | 
35 | #include "../include/simdexp.h"
36 | #include "../include/simdrsqrt.h"
37 | #include "../include/simdtools.h"
38 | 
39 | int main() {
40 | 
41 |     // Constants that turn a ratio approx/reference into a signed error in percent.
42 |     __m256 ONE = _mm256_set1_ps(1.0f);
43 |     __m256 ONE_HUNDRED = _mm256_set1_ps(100.0f);
44 | 
45 |     for (float i = -63.0f; i < 63.0f; i += 0.001f) {
46 | 
47 |         // Every lane holds the same x; only lane 0 is printed below.
48 |         __m256 t = _mm256_set1_ps(i);
49 | 
50 |         // Error in percent, 100 * (approx / reference - 1), for each routine.
51 |         __m256 rcp = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(_mm256_rcp_ps(t), _mm256_div_ps(ONE, t)), ONE));
52 |         __m256 rcp1s = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_rcp1s_ps(t), _mm256_div_ps(ONE, t)), ONE));
53 |         __m256 rsqrt = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(_mm256_rsqrt_ps(t), _mm256_div_ps(ONE, _mm256_sqrt_ps(t))), ONE));
54 |         __m256 rsqrt1s = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_rsqrt1s_ps(t), _mm256_div_ps(ONE, _mm256_sqrt_ps(t))), ONE));
55 |         __m256 expfast = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_expfast_ps(t), _mm256_exp_ps(t)), ONE));
56 |         __m256 expfast1s = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_expfast1s_ps(t), _mm256_exp_ps(t)), ONE));
57 |         __m256 expfaster = _mm256_mul_ps(ONE_HUNDRED, _mm256_sub_ps(_mm256_div_ps(simdmath::_mm256_expfaster_ps(t), _mm256_exp_ps(t)), ONE));
58 | 
59 |         float tvec[8];
60 | 
61 |         float frcp[8];
62 |         float frcp1s[8];
63 |         float frsqrt[8];
64 |         float frsqrt1s[8];
65 |         float fexpfast[8];
66 |         float fexpfast1s[8];
67 |         float fexpfaster[8];
68 | 
69 |         // Unaligned stores: these stack arrays are not guaranteed to be
70 |         // 32-byte aligned, which _mm256_store_ps() would require.
71 |         _mm256_storeu_ps(tvec, t);
72 |         _mm256_storeu_ps(frcp, rcp);
73 |         _mm256_storeu_ps(frcp1s, rcp1s);
74 |         _mm256_storeu_ps(frsqrt, rsqrt);
75 |         _mm256_storeu_ps(frsqrt1s, rsqrt1s);
76 |         _mm256_storeu_ps(fexpfast, expfast);
77 |         _mm256_storeu_ps(fexpfast1s, expfast1s);
78 |         _mm256_storeu_ps(fexpfaster, expfaster);
79 | 
80 |         printf("%16.10f %16.10f  %16.10f  %16.10f  %16.10f  %16.10f  %16.10f  %16.10f \n", tvec[0], frcp[0], frcp1s[0],
81 |                 frsqrt[0], frsqrt1s[0], fexpfast[0], fexpfast1s[0], fexpfaster[0]);
82 | 
83 |     }
84 | 
85 |     return 0;
86 | 
87 | }
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/graphs/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersx/libsimdmath/c72b0cebd0595ea69f2e851a279823bd7567b688/graphs/graph.png


--------------------------------------------------------------------------------
/graphs/optimize.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2015 Anders S. Christensen
 2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 3 | //
 4 | // This is free and unencumbered software released into the public domain.
 5 | //
 6 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 7 | // distribute this software, either in source code form or as a compiled
 8 | // binary, for any purpose, commercial or non-commercial, and by any
 9 | // means.
10 | //
11 | // In jurisdictions that recognize copyright laws, the author or authors
12 | // of this software dedicate any and all copyright interest in the
13 | // software to the public domain. We make this dedication for the benefit
14 | // of the public at large and to the detriment of our heirs and
15 | // successors. We intend this dedication to be an overt act of
16 | // relinquishment in perpetuity of all present and future rights to this
17 | // software under copyright law.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 | // OTHER DEALINGS IN THE SOFTWARE.
26 | //
27 | // For more information, please refer to <http://unlicense.org>
28 | 
29 | 
30 | #include <stdio.h>
31 | #include <immintrin.h>
32 | #include <iostream>
33 | #include <math.h> 
34 | #include <time.h> 
35 | 
36 | #include "../include/simdexp.h"
37 | #include "../include/simdtools.h"
38 | #include "../test/test_common.h"
39 | 
40 | int main() {
41 | 
42 |     srand (time(NULL));
43 | 
44 |     __m256 ONE_HUNDRED = _mm256_set1_ps(100.0f);
45 | 
46 |     // Constants currently hard-coded in simdmath::_mm256_expfaster_ps():
47 |     // float C1 = 1064872507.1541044f;
48 |     // float C2 = 12102203.161561485f;
49 | 
50 |     // Trial constants scored by this run.
51 |     float C1 = 1064870507.1541044f;
52 |     float C2 = 12102000.161561485f;
53 | 
54 |     // NOTE(review): a half-written outer loop stood here (`unsigned int steps = 100`
55 |     // without ';' and an unterminated `for (unsigned int n = 0.0; n < steps;`). It did
56 |     // not compile and is dropped until the search over C1/C2 is actually written.
57 | 
58 |     __m256 C1v = _mm256_set1_ps(C1);
59 |     __m256 C2v = _mm256_set1_ps(C2);
60 | 
61 |     // Sum of squared percentage errors, accumulated independently in each lane.
62 |     __m256 error_sum = _mm256_set1_ps(0.0f);
63 | 
64 |     // Kahan-summation compensation term for error_sum.
65 |     // NOTE(review): -ffast-math may let the compiler cancel the compensation -- verify.
66 |     __m256 c = _mm256_set1_ps(0.0f);
67 | 
68 |     for (unsigned int i = 0; i < 10000000; i++) {
69 | 
70 |         // generate_vector() comes from test_common.h (not visible here);
71 |         // presumably eight sample points between the two bounds -- TODO confirm.
72 |         __m256 q = generate_vector(-63.0f, 63.0f);
73 | 
74 |         __m256 approx = _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_fmadd_ps(C2v, q, C1v)));
75 |         __m256 exact = _mm256_exp_ps(q);
76 | 
77 |         // Squared deviation in percent: (100 - 100 * approx / exact)^2.
78 |         __m256 error = _mm256_sub_ps(ONE_HUNDRED, _mm256_mul_ps(ONE_HUNDRED, _mm256_div_ps(approx, exact)));
79 |         error = _mm256_mul_ps(error, error);
80 | 
81 |         // Compensated accumulation of error into error_sum.
82 |         __m256 y = _mm256_sub_ps(error, c);
83 |         __m256 t = _mm256_add_ps(error_sum, y);
84 |         c = _mm256_sub_ps(_mm256_sub_ps(t, error_sum), y);
85 |         error_sum = t;
86 |     }
87 | 
88 |     // Unaligned store: the stack array is not guaranteed to be 32-byte aligned.
89 |     float tvec[8];
90 |     _mm256_storeu_ps(tvec, error_sum);
91 | 
92 |     // Add the eight per-lane partial sums into one scalar score.
93 |     float e = tvec[0] + tvec[1] + tvec[2] + tvec[3] + tvec[4] + tvec[5] + tvec[6] + tvec[7];
94 |     std::cout << e << std::endl;
95 |     return 0;
96 | }
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/graphs/plot.py:
--------------------------------------------------------------------------------
  1 | from matplotlib import pyplot
  2 | import numpy
  3 | 
  4 | f = open("data.txt", "r")
  5 | lines = f.readlines()
  6 | f.close()
  7 | 
  8 | 
  9 | n = len(lines)
 10 | 
 11 | tvec = numpy.empty(n)
 12 | rcp = numpy.empty(n)
 13 | rcp1s = numpy.empty(n)
 14 | rsqrt = numpy.empty(n)
 15 | rsqrt1s = numpy.empty(n)
 16 | fast = numpy.empty(n)
 17 | fast1s = numpy.empty(n)
 18 | faster = numpy.empty(n)
 19 | 
 20 | 
 21 | for i, line in enumerate(lines):
 22 | 
 23 |     tokens = line.split()
 24 | 
 25 |     tvec[i] = float(tokens[0])
 26 | 
 27 |     rcp[i] = float(tokens[1])
 28 |     rcp1s[i] = float(tokens[2])
 29 |     rsqrt[i] = float(tokens[3])
 30 |     rsqrt1s[i] = float(tokens[4])
 31 |     fast[i] = float(tokens[5])
 32 |     fast1s[i] = float(tokens[6])
 33 |     faster[i] = float(tokens[7])
 34 | 
 35 | 
 36 | x_min = -4.0
 37 | x_max = 4.0
 38 | 
 39 | pyplot.figure(figsize=[24,13.5])
 40 | 
 41 | cols = 4
 42 | rows = 2
 43 | 
 44 | pyplot.subplot(rows,cols,1)
 45 | pyplot.plot(tvec, rcp)
 46 | pyplot.xlim([x_min, x_max])
 47 | pyplot.title("_mm256_rcp_ps()")
 48 | pyplot.xlabel("x")
 49 | pyplot.ylabel("1.0f/x error [%]")
 50 | pyplot.grid(True)
 51 | 
 52 | 
 53 | pyplot.subplot(rows,cols,2)
 54 | pyplot.plot(tvec, rcp1s)
 55 | pyplot.xlim([x_min, x_max])
 56 | pyplot.title("simdmath::_mm256_rcp1s_ps()")
 57 | pyplot.xlabel("x")
 58 | pyplot.ylabel("1.0f/x error [%]")
 59 | pyplot.grid(True)
 60 | 
 61 | 
 62 | pyplot.subplot(rows,cols,3)
 63 | pyplot.plot(tvec, rsqrt)
 64 | pyplot.xlim([0.0, x_max*2])
 65 | pyplot.title("_mm256_rsqrt_ps()")
 66 | pyplot.xlabel("x")
 67 | pyplot.ylabel("1.0f/sqrt(x) error [%]")
 68 | pyplot.grid(True)
 69 | 
 70 | 
 71 | pyplot.subplot(rows,cols,4)
 72 | pyplot.plot(tvec, rsqrt1s)
 73 | pyplot.xlim([0.0, x_max*2])
 74 | pyplot.title("simdmath::_mm256_rsqrt1s_ps()")
 75 | pyplot.xlabel("x")
 76 | pyplot.ylabel("1.0f/sqrt(x) error [%]")
 77 | pyplot.grid(True)
 78 | 
 79 | 
 80 | pyplot.subplot(rows,cols,5)
 81 | pyplot.plot(tvec, fast)
 82 | pyplot.xlim([x_min, x_max])
 83 | pyplot.title("simdmath::_mm256_expfast_ps()")
 84 | pyplot.xlabel("x")
 85 | pyplot.ylabel("exp(x) error [%]")
 86 | pyplot.grid(True)
 87 | 
 88 | 
 89 | pyplot.subplot(rows,cols,6)
 90 | pyplot.plot(tvec, fast1s)
 91 | pyplot.xlim([x_min, x_max])
 92 | pyplot.title("simdmath::_mm256_expfast1s_ps()")
 93 | pyplot.xlabel("x")
 94 | pyplot.ylabel("exp(x) error [%]")
 95 | pyplot.grid(True)
 96 | 
 97 | 
 98 | pyplot.subplot(rows,cols,7)
 99 | pyplot.plot(tvec, faster)
100 | pyplot.xlim([x_min, x_max])
101 | pyplot.title("simdmath::_mm256_expfaster_ps()")
102 | pyplot.xlabel("x")
103 | pyplot.ylabel("exp(x) error [%]")
104 | pyplot.grid(True)
105 | 
106 | 
107 | pyplot.savefig("graph.png")
108 | 


--------------------------------------------------------------------------------
/include/simdexp.h:
--------------------------------------------------------------------------------
  1 | // libsimdexp -- Approximate EXP(x) implementations for SSE and AVX.
  2 | // Copyright (C) 2015 Anders S. Christensen
  3 | // Report bugs, etc at: https://github.com/andersx/simd-exp
  4 | //
  5 | // This is free and unencumbered software released into the public domain.
  6 | //
  7 | // Anyone is free to copy, modify, publish, use, compile, sell, or
  8 | // distribute this software, either in source code form or as a compiled
  9 | // binary, for any purpose, commercial or non-commercial, and by any
 10 | // means.
 11 | //
 12 | // In jurisdictions that recognize copyright laws, the author or authors
 13 | // of this software dedicate any and all copyright interest in the
 14 | // software to the public domain. We make this dedication for the benefit
 15 | // of the public at large and to the detriment of our heirs and
 16 | // successors. We intend this dedication to be an overt act of
 17 | // relinquishment in perpetuity of all present and future rights to this
 18 | // software under copyright law.
 19 | //
 20 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 21 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 22 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 23 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 24 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 25 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 26 | // OTHER DEALINGS IN THE SOFTWARE.
 27 | //
 28 | // For more information, please refer to <http://unlicense.org>
 29 | 
 30 | 
 31 | #ifndef INCLUDE_SIMD_EXP_H
 32 | #define INCLUDE_SIMD_EXP_H
 33 | 
 34 | #include <immintrin.h>
 35 | #include "simdrcp.h"
 36 | 
 37 | 
 38 | namespace simdmath {
 39 | 
 40 | 
 41 |     // Approximation for EXP(x), only valid for -63.0f < x < 63.0f.
 42 |     // Error is about 4%.
 43 |     static inline __m256 _mm256_expfaster_ps(const __m256 &q) {
 44 | 
 45 |         const __m256 C1 = _mm256_set1_ps(1064872507.1541044f);
 46 |         const __m256 C2 = _mm256_set1_ps(12102203.161561485f);
 47 | 
 48 |         return _mm256_castsi256_ps(_mm256_cvttps_epi32(
 49 |                     _mm256_fmadd_ps(C2, q, C1)));
 50 |     }
 51 | 
 52 | 
 53 |     // Approximation for EXP(x), only valid for -63.0f < x < 63.0f.
 54 |     // Error is about 0.2%. With p = x / ln(2) and z = p - floor(p), the result
 55 |     // has the bit pattern int(2^23 * (C1 + p + C2 / (C3 - z) - C4 * z)).
 56 |     static inline __m256 _mm256_expfast_ps(const __m256 &q) {
 57 | 
 58 |         const __m256 inv_ln2 = _mm256_set1_ps(1.442695040f);
 59 |         const __m256 two_pow_23 = _mm256_set1_ps(8388608);
 60 |         const __m256 c1 = _mm256_set1_ps(121.2740838f);
 61 |         const __m256 c2 = _mm256_set1_ps(27.7280233f);
 62 |         const __m256 c3 = _mm256_set1_ps(4.84252568f);
 63 |         const __m256 c4 = _mm256_set1_ps(1.49012907f);
 64 | 
 65 |         const __m256 p = _mm256_mul_ps(inv_ln2, q);
 66 |         const __m256 frac = _mm256_sub_ps(p, _mm256_floor_ps(p));
 67 | 
 68 |         // Note: the approximate intrinsic is used for 1 / (C3 - z) here.
 69 |         const __m256 inv = _mm256_rcp_ps(_mm256_sub_ps(c3, frac));
 70 |         const __m256 rational = _mm256_fmadd_ps(inv, c2, _mm256_add_ps(c1, p));
 71 |         const __m256 exponent = _mm256_fnmadd_ps(c4, frac, rational);
 72 |         const __m256 scaled = _mm256_mul_ps(two_pow_23, exponent);
 73 | 
 74 |         return _mm256_castsi256_ps(_mm256_cvttps_epi32(scaled));
 75 | 
 76 |     }
 77 | 
 78 | 
 79 |     // Approximation for EXP(x), only valid for -63.0f < x < 63.0f.
 80 |     // Error is about 0.005%. Same formula as _mm256_expfast_ps(), i.e. the bit
 81 |     // pattern int(2^23 * (C1 + p + C2 / (C3 - z) - C4 * z)) with p = x / ln(2)
 82 |     // and z = p - floor(p), but with an exact division for 1 / (C3 - z).
 83 |     static inline __m256 _mm256_expfast1s_ps(const __m256 &q) {
 84 | 
 85 |         const __m256 inv_ln2 = _mm256_set1_ps(1.442695040f);
 86 |         const __m256 two_pow_23 = _mm256_set1_ps(8388608);
 87 |         const __m256 one = _mm256_set1_ps(1.0f);
 88 | 
 89 |         const __m256 c1 = _mm256_set1_ps(121.2740838f);
 90 |         const __m256 c2 = _mm256_set1_ps(27.7280233f);
 91 |         const __m256 c3 = _mm256_set1_ps(4.84252568f);
 92 |         const __m256 c4 = _mm256_set1_ps(1.49012907f);
 93 | 
 94 |         const __m256 p = _mm256_mul_ps(inv_ln2, q);
 95 |         const __m256 frac = _mm256_sub_ps(p, _mm256_floor_ps(p));
 96 | 
 97 |         // Note: exact 1/x is used here; _mm256_rcp1s_ps() from simdrcp.h is the
 98 |         // approximate alternative.
 99 |         const __m256 inv = _mm256_div_ps(one, _mm256_sub_ps(c3, frac));
100 |         const __m256 rational = _mm256_fmadd_ps(inv, c2, _mm256_add_ps(c1, p));
101 |         const __m256 exponent = _mm256_fnmadd_ps(c4, frac, rational);
102 |         const __m256 scaled = _mm256_mul_ps(two_pow_23, exponent);
103 | 
104 |         return _mm256_castsi256_ps(_mm256_cvttps_epi32(scaled));
105 |     }
106 | 
107 | } // end namespace simdmath
108 | 
109 | 
110 | #endif // INCLUDE_SIMD_EXP_H
111 | 


--------------------------------------------------------------------------------
/include/simdmath.h:
--------------------------------------------------------------------------------
 1 | // libsimdmath -- Convenience header that includes simdexp.h, simdrcp.h, simdrsqrt.h and simdtools.h.
 2 | // Copyright (C) 2015 Anders S. Christensen
 3 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 4 | //
 5 | // This is free and unencumbered software released into the public domain.
 6 | //
 7 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 8 | // distribute this software, either in source code form or as a compiled
 9 | // binary, for any purpose, commercial or non-commercial, and by any
10 | // means.
11 | //
12 | // In jurisdictions that recognize copyright laws, the author or authors
13 | // of this software dedicate any and all copyright interest in the
14 | // software to the public domain. We make this dedication for the benefit
15 | // of the public at large and to the detriment of our heirs and
16 | // successors. We intend this dedication to be an overt act of
17 | // relinquishment in perpetuity of all present and future rights to this
18 | // software under copyright law.
19 | //
20 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26 | // OTHER DEALINGS IN THE SOFTWARE.
27 | //
28 | // For more information, please refer to <http://unlicense.org>
29 | 
30 | #ifndef LIB_SIMD_MATH
31 | #define LIB_SIMD_MATH
32 | 
33 | #include "simdexp.h"
34 | #include "simdrcp.h"
35 | #include "simdrsqrt.h"
36 | #include "simdtools.h"
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/include/simdrcp.h:
--------------------------------------------------------------------------------
  1 | // libsimdrcp -- Approximate RCP(x) implementations for SSE and AVX.
  2 | // Copyright (C) 2015 Anders S. Christensen
  3 | // Report bugs, etc at: https://github.com/andersx/simd-exp
  4 | //
  5 | // This is free and unencumbered software released into the public domain.
  6 | // 
  7 | // Anyone is free to copy, modify, publish, use, compile, sell, or
  8 | // distribute this software, either in source code form or as a compiled
  9 | // binary, for any purpose, commercial or non-commercial, and by any
 10 | // means.
 11 | // 
 12 | // In jurisdictions that recognize copyright laws, the author or authors
 13 | // of this software dedicate any and all copyright interest in the
 14 | // software to the public domain. We make this dedication for the benefit
 15 | // of the public at large and to the detriment of our heirs and
 16 | // successors. We intend this dedication to be an overt act of
 17 | // relinquishment in perpetuity of all present and future rights to this
 18 | // software under copyright law.
 19 | // 
 20 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 21 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 22 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 23 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 24 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 25 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 26 | // OTHER DEALINGS IN THE SOFTWARE.
 27 | // 
 28 | // For more information, please refer to <http://unlicense.org>
 29 | 
 30 | 
 31 | #ifndef LIB_SIMD_DIV
 32 | #define LIB_SIMD_DIV
 33 | 
 34 | #include <immintrin.h>
 35 | 
 36 | 
 37 | namespace simdmath {
 38 | 
 39 |     // Reciprocal 1/x: the estimate from the intrinsic _mm256_rcp_ps(x), refined
 40 |     // by a single Newton step, r' = r * (2 - r * x).
 41 |     // Speed lies between _mm256_rcp_ps(x) and _mm256_div_ps(ONE, x);
 42 |     // accuracy is practically that of _mm256_div_ps(ONE, x).
 43 |     // Probably the only useful function in this header.
 44 |     // Error of _mm256_rcp1s_ps is 0.00001%, compared with
 45 |     // 0.02% for the bare intrinsic _mm256_rcp_ps.
 46 |     static inline __m256 _mm256_rcp1s_ps(const __m256 &q) {
 47 | 
 48 |         const __m256 two = _mm256_set1_ps(2.0f);
 49 |         const __m256 estimate = _mm256_rcp_ps(q);
 50 | 
 51 |         // correction = 2 - estimate * q
 52 |         const __m256 correction = _mm256_fnmadd_ps(estimate, q, two);
 53 | 
 54 |         return _mm256_mul_ps(estimate, correction);
 55 |     }
 56 | 
 57 | 
 58 |     // Approximation for 1/x -- Newtons method to 0th order, i.e. only the
 59 |     // initial guess with no refinement step: the bit pattern of x is
 60 |     // subtracted from a magic constant using 32-bit integer arithmetic.
 61 |     // Same speed as _mm256_rcp_ps(), but not as accurate.
 62 |     // Error of _mm256_rcp0_ps is about 3%.
 63 |     static inline __m256 _mm256_rcp0_ps(const __m256 &q) {
 64 | 
 65 |         const __m256i MAGIC_NUMBER = _mm256_set1_epi32(0x7EF311C2);
 66 | 
 67 |         // Reinterpret the float lanes as integers (a cast, not a conversion).
 68 |         const __m256i bits = _mm256_castps_si256(q);
 69 |         const __m256i guess = _mm256_sub_epi32(MAGIC_NUMBER, bits);
 70 | 
 71 |         return _mm256_castsi256_ps(guess);
 72 |     }
 73 | 
 74 | 
 75 |     // Approximation for 1/x -- Newtons method to 1st order: a magic-number
 76 |     // integer guess followed by one step r' = r * (2 - r * x).
 77 |     // Slightly slower than _mm256_rcp_ps(), and not as accurate.
 78 |     // Error of _mm256_rcp1_ps is about 0.2%.
 79 |     static inline __m256 _mm256_rcp1_ps(const __m256 &q) {
 80 | 
 81 |         const __m256 two = _mm256_set1_ps(2.0f);
 82 |         const __m256i magic = _mm256_set1_epi32(0x7EF311C3);
 83 | 
 84 |         // Initial guess: magic constant minus the raw bits of q.
 85 |         const __m256 guess = _mm256_castsi256_ps(
 86 |                 _mm256_sub_epi32(magic, _mm256_castps_si256(q)));
 87 | 
 88 |         // One Newton refinement.
 89 |         return _mm256_mul_ps(guess, _mm256_fnmadd_ps(guess, q, two));
 90 | 
 91 |     }
 92 | 
 93 | 
 94 |     // Approximation for 1/x -- Newtons method to 2nd order: a magic-number
 95 |     // integer guess refined by two steps r' = r * (2 - r * x).
 96 |     // Slightly faster than _mm256_div_ps(ONE, x), but not as accurate.
 97 |     // More accurate than _mm256_rcp_ps().
 98 |     // Error of _mm256_rcp2_ps is about 0.001%.
 99 |     static inline __m256 _mm256_rcp2_ps(const __m256 &q) {
100 | 
101 |         const __m256 two = _mm256_set1_ps(2.0f);
102 |         const __m256i magic = _mm256_set1_epi32(0x7EF312AC);
103 | 
104 |         // Initial guess: magic constant minus the raw bits of q.
105 |         __m256 estimate = _mm256_castsi256_ps(
106 |                 _mm256_sub_epi32(magic, _mm256_castps_si256(q)));
107 | 
108 |         for (int step = 0; step < 2; ++step) {
109 |             estimate = _mm256_mul_ps(estimate, _mm256_fnmadd_ps(estimate, q, two));
110 |         }
111 |         return estimate;
112 |     }
113 | 
114 | 
115 |     // Approximation for 1/x -- Newtons method to 3rd order.
116 |     // Slower than _mm256_div_ps(ONE, x), and just as accurate.
117 |     // Error of _mm256_rcp3_ps is about 0.00001%.
118 |     static inline __m256 _mm256_rcp3_ps(const __m256 &q) {
119 |         const __m256 two = _mm256_set1_ps(2.0f);
120 | 
121 |         // Starting estimate from integer arithmetic on the bits of q.
122 |         const __m256i magic = _mm256_set1_epi32(0x7EEEEBB3);
123 |         const __m256i q_bits = _mm256_castps_si256(q);
124 |         __m256 est = _mm256_castsi256_ps(_mm256_sub_epi32(magic, q_bits));
125 | 
126 |         // Three Newton-Raphson steps, each est <- est * (2 - est * q).
127 |         int steps_left = 3;
128 |         while (steps_left-- > 0) {
129 |             est = _mm256_mul_ps(est, _mm256_fnmadd_ps(est, q, two));
130 |         }
131 | 
132 |         return est;
133 |     }
134 | 
135 | 
136 |     // Approximation for 1/x -- Newtons method to 4th order.
137 |     // Much slower than _mm256_div_ps(ONE, x), and just as accurate.
138 |     // Error of _mm256_rcp4_ps is about 0.00001%.
139 |     static inline __m256 _mm256_rcp4_ps(const __m256 &q) {
140 |         const __m256 two = _mm256_set1_ps(2.0f);
141 |         const __m256i magic = _mm256_set1_epi32(0x7EEEEEEE);
142 | 
143 |         // Starting estimate: constant minus the raw bits of q, read back
144 |         // as packed single-precision floats.
145 |         const __m256i raw = _mm256_castps_si256(q);
146 |         const __m256i diff = _mm256_sub_epi32(magic, raw);
147 |         __m256 est = _mm256_castsi256_ps(diff);
148 | 
149 |         // Four Newton-Raphson steps, each est <- est * (2 - est * q).
150 |         for (int step = 0; step < 4; ++step) {
151 |             est = _mm256_mul_ps(est, _mm256_fnmadd_ps(est, q, two));
152 |         }
153 | 
154 |         return est;
155 |     }
156 | 
157 | }
158 | 
159 | #endif
160 | 
161 | 


--------------------------------------------------------------------------------
/include/simdrsqrt.h:
--------------------------------------------------------------------------------
 1 | // libsimdrsqrt -- Approximate RSQRT(x) implementations for Intel AVX.
 2 | // Copyright (C) 2015 Anders S. Christensen
 3 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 4 | //
 5 | // This is free and unencumbered software released into the public domain.
 6 | //
 7 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 8 | // distribute this software, either in source code form or as a compiled
 9 | // binary, for any purpose, commercial or non-commercial, and by any
10 | // means.
11 | //
12 | // In jurisdictions that recognize copyright laws, the author or authors
13 | // of this software dedicate any and all copyright interest in the
14 | // software to the public domain. We make this dedication for the benefit
15 | // of the public at large and to the detriment of our heirs and
16 | // successors. We intend this dedication to be an overt act of
17 | // relinquishment in perpetuity of all present and future rights to this
18 | // software under copyright law.
19 | //
20 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26 | // OTHER DEALINGS IN THE SOFTWARE.
27 | //
28 | // For more information, please refer to <http://unlicense.org>
29 | 
30 | 
31 | #ifndef INCLUDE_SIMD_RSQRT_H
32 | #define INCLUDE_SIMD_RSQRT_H
33 | 
34 | #include <immintrin.h>
35 | 
36 | 
37 | namespace simdmath {
38 | 
39 | 
40 |     // Approximation to 1/sqrt(x) -- accurate to 32-bit precision,
41 |     // but faster than the exact _mm256_div_ps(ONE, _mm256_sqrt_ps(x)).
42 |     // Error is about 0.00002%
43 |     static inline __m256 _mm256_rsqrt1s_ps(const __m256 &q) {
44 |         // Start from the hardware reciprocal-square-root estimate.
45 |         const __m256 y = _mm256_rsqrt_ps(q);
46 | 
47 |         // One Newton-Raphson step: 0.5 * (y * (3 - (y * y) * q)).
48 |         const __m256 three = _mm256_set1_ps(3.0f);
49 |         const __m256 y_sq = _mm256_mul_ps(y, y);
50 |         const __m256 residual = _mm256_fnmadd_ps(y_sq, q, three);
51 |         const __m256 scaled = _mm256_mul_ps(y, residual);
52 | 
53 |         const __m256 half = _mm256_set1_ps(0.5f);
54 |         return _mm256_mul_ps(half, scaled);
55 |     }
56 | 
57 | 
58 | }
59 | 
60 | 
61 | #endif // INCLUDE_SIMD_RSQRT_H
62 | 


--------------------------------------------------------------------------------
/include/simdtools.h:
--------------------------------------------------------------------------------
  1 | // libsimdtools -- Tools for vector math - SSE and AVX.
  2 | // Copyright (C) 2015 Anders S. Christensen
  3 | // Report bugs, etc at: https://github.com/andersx/simd-exp
  4 | //
  5 | // This is free and unencumbered software released into the public domain.
  6 | //
  7 | // Anyone is free to copy, modify, publish, use, compile, sell, or
  8 | // distribute this software, either in source code form or as a compiled
  9 | // binary, for any purpose, commercial or non-commercial, and by any
 10 | // means.
 11 | //
 12 | // In jurisdictions that recognize copyright laws, the author or authors
 13 | // of this software dedicate any and all copyright interest in the
 14 | // software to the public domain. We make this dedication for the benefit
 15 | // of the public at large and to the detriment of our heirs and
 16 | // successors. We intend this dedication to be an overt act of
 17 | // relinquishment in perpetuity of all present and future rights to this
 18 | // software under copyright law.
 19 | //
 20 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 21 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 22 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 23 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 24 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 25 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 26 | // OTHER DEALINGS IN THE SOFTWARE.
 27 | //
 28 | // For more information, please refer to <http://unlicense.org>
 29 | 
 30 | 
 31 | #ifndef INCLUDE_SIMD_TOOLS_H
 32 | #define INCLUDE_SIMD_TOOLS_H
 33 | 
 34 | #include <stdio.h>
 35 | #include <immintrin.h>
 36 | 
 37 | 
 38 | namespace simdmath {
 39 | 
 40 | 
 41 |     // Prints a __m256i vector to std out.
 42 |     static void _mm256_print_si256(const __m256i &q) {
 43 |         // Copy the eight 32-bit lanes into real storage; the unaligned
 44 |         // store places no alignment requirement on the buffer.
 45 |         int temp[8];
 46 |         _mm256_storeu_si256((__m256i*)temp, q);
 47 | 
 48 |         // Lanes are integers, so print them with %d rather than %f.
 49 |         printf("0: %16d  1: %16d  2: %16d  3: %16d\n",
 50 |                 temp[0], temp[1], temp[2], temp[3]);
 51 |         printf("4: %16d  5: %16d  6: %16d  7: %16d\n",
 52 |                 temp[4], temp[5], temp[6], temp[7]);
 53 |     }
 54 | 
 55 | 
 56 |     // Prints a __m256 vector to std out.
 57 |     static void _mm256_print_ps(const __m256 &q) {
 58 | 
 59 |         // A plain float[8] is not guaranteed to be 32-byte aligned, which
 60 |         // the aligned store requires, so use the unaligned store.
 61 |         float temp[8];
 62 |         _mm256_storeu_ps(temp, q);
 63 | 
 64 |         printf("0: %16.10f  1: %16.10f  2: %16.10f  3: %16.10f\n",
 65 |                 temp[0], temp[1], temp[2], temp[3]);
 66 |         printf("4: %16.10f  5: %16.10f  6: %16.10f  7: %16.10f\n",
 67 |                 temp[4], temp[5], temp[6], temp[7]);
 68 |     }
 69 | 
 70 | 
 71 |     // Return the lowest element from a __m256 vector as float.
 72 |     static float _mm256_minelement_ps(const __m256 &q) {
 73 | 
 74 |         // A plain float[8] is not guaranteed to be 32-byte aligned, which
 75 |         // the aligned store requires, so use the unaligned store.
 76 |         float temp[8];
 77 |         _mm256_storeu_ps(temp, q);
 78 | 
 79 |         // Linear scan over the eight lanes, starting from lane 0.
 80 |         float min = temp[0];
 81 |         for (unsigned int i = 1; i < 8; i++) {
 82 |             if (temp[i] < min) min = temp[i];
 83 |         }
 84 | 
 85 |         return min;
 86 |     }
 87 | 
 88 | 
 89 |     // Return the largest element from a __m256 vector as float.
 90 |     static float _mm256_maxelement_ps(const __m256 &q) {
 91 | 
 92 |         // A plain float[8] is not guaranteed to be 32-byte aligned, which
 93 |         // the aligned store requires, so use the unaligned store.
 94 |         float temp[8];
 95 |         _mm256_storeu_ps(temp, q);
 96 | 
 97 |         // Linear scan over the eight lanes, starting from lane 0.
 98 |         float max = temp[0];
 99 |         for (unsigned int i = 1; i < 8; i++) {
100 |             if (temp[i] > max) max = temp[i];
101 |         }
102 | 
103 |         return max;
104 |     }
105 | 
106 | 
107 | } // namespace simdmath
108 | 
109 | 
110 | #endif // INCLUDE_SIMD_TOOLS_H
111 | 


--------------------------------------------------------------------------------
/include/simdtri.h:
--------------------------------------------------------------------------------
 1 | // simdtri.h -- Approximate COS(X), SIN(X) implementations for Intel AVX2.
 2 | // Copyright (C) 2015 Anders S. Christensen
 3 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 4 | //
 5 | // This is free and unencumbered software released into the public domain.
 6 | //
 7 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 8 | // distribute this software, either in source code form or as a compiled
 9 | // binary, for any purpose, commercial or non-commercial, and by any
10 | // means.
11 | //
12 | // In jurisdictions that recognize copyright laws, the author or authors
13 | // of this software dedicate any and all copyright interest in the
14 | // software to the public domain. We make this dedication for the benefit
15 | // of the public at large and to the detriment of our heirs and
16 | // successors. We intend this dedication to be an overt act of
17 | // relinquishment in perpetuity of all present and future rights to this
18 | // software under copyright law.
19 | //
20 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26 | // OTHER DEALINGS IN THE SOFTWARE.
27 | //
28 | // For more information, please refer to <http://unlicense.org>
29 | 
30 | 
31 | #ifndef INCLUDE_SIMD_TRI_H
32 | #define INCLUDE_SIMD_TRI_H
33 | 
34 | #include <immintrin.h>
35 | 
36 | 
37 | namespace simdmath {
38 | 
39 | 
40 | 
41 | 
42 | }
43 | 
44 | 
45 | #endif // INCLUDE_SIMD_TRI_H
46 | 


--------------------------------------------------------------------------------
/test/test.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2015 Anders S. Christensen
 2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 3 | //
 4 | // This is free and unencumbered software released into the public domain.
 5 | //
 6 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 7 | // distribute this software, either in source code form or as a compiled
 8 | // binary, for any purpose, commercial or non-commercial, and by any
 9 | // means.
10 | //
11 | // In jurisdictions that recognize copyright laws, the author or authors
12 | // of this software dedicate any and all copyright interest in the
13 | // software to the public domain. We make this dedication for the benefit
14 | // of the public at large and to the detriment of our heirs and
15 | // successors. We intend this dedication to be an overt act of
16 | // relinquishment in perpetuity of all present and future rights to this
17 | // software under copyright law.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 | // OTHER DEALINGS IN THE SOFTWARE.
26 | //
27 | // For more information, please refer to <http://unlicense.org>
28 | 
29 | 
30 | #include <iostream>
31 | 
32 | #include "test_rcp.h"
33 | #include "test_exp.h"
34 | #include "test_rsqrt.h"
35 | 
36 | 
37 | int main() {
38 |     // Seed rand() from the current time so each run draws new inputs.
39 |     srand(time(NULL));
40 | 
41 |     // Every test function returns 1 on failure and 0 on success.
42 |     int failed_tests = 0;
43 |     failed_tests += test_mm256_rcp_ps();
44 |     failed_tests += test_mm256_rcp1s_ps();
45 |     failed_tests += test_mm256_expfaster_ps();
46 |     failed_tests += test_mm256_expfast_ps();
47 |     failed_tests += test_mm256_expfast1s_ps();
48 |     failed_tests += test_mm256_rsqrt_ps();
49 |     failed_tests += test_mm256_rsqrt1s_ps();
50 | 
51 |     std::cout << "Number of failed tests: " << failed_tests << std::endl;
52 | 
53 |     // Report failures through the exit status so scripts can detect them.
54 |     return failed_tests == 0 ? 0 : 1;
55 | }
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/test/test_common.h:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2015 Anders S. Christensen
  2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
  3 | //
  4 | // This is free and unencumbered software released into the public domain.
  5 | //
  6 | // Anyone is free to copy, modify, publish, use, compile, sell, or
  7 | // distribute this software, either in source code form or as a compiled
  8 | // binary, for any purpose, commercial or non-commercial, and by any
  9 | // means.
 10 | //
 11 | // In jurisdictions that recognize copyright laws, the author or authors
 12 | // of this software dedicate any and all copyright interest in the
 13 | // software to the public domain. We make this dedication for the benefit
 14 | // of the public at large and to the detriment of our heirs and
 15 | // successors. We intend this dedication to be an overt act of
 16 | // relinquishment in perpetuity of all present and future rights to this
 17 | // software under copyright law.
 18 | //
 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 20 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 21 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 22 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 23 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 24 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 25 | // OTHER DEALINGS IN THE SOFTWARE.
 26 | //
 27 | // For more information, please refer to <http://unlicense.org>
 28 | 
 29 | 
 30 | #ifndef LIB_SIMD_TEST_COMMON_H
 31 | #define LIB_SIMD_TEST_COMMON_H
 32 | 
 33 | #include <stdlib.h>
 34 | #include <cstdlib>
 35 | 
 36 | #include "../include/simdtools.h"
 37 | 
 38 | 
 39 | using namespace simdmath;
 40 | 
 41 | // Lowest guaranteed accuracy of a float, in percent (used as a test tolerance).
 42 | const float FLOAT_ACCURACY = 0.00002f;
 43 | 
 44 | 
 45 | // Return a random float in a given range.
 46 | static inline float rand_float(float min, float max) {
 47 |     // rand()/RAND_MAX lies in [0, 1]; scale by the width and shift by min.
 48 |     const float unit = (float)rand() / RAND_MAX;
 49 |     return ((max - min) * unit) + min;
 50 | }
 51 | 
 52 | 
 53 | // Return a SIMD-vector of random floats in a given range.
 54 | static inline __m256 generate_vector(float min, float max) {
 55 |     // Eight separate rand_float() draws, one per lane of the vector.
 56 |     // Internal linkage keeps this header safe to include more than once.
 57 |     return _mm256_set_ps(rand_float(min, max), rand_float(min, max),
 58 |                          rand_float(min, max), rand_float(min, max),
 59 |                          rand_float(min, max), rand_float(min, max),
 60 |                          rand_float(min, max), rand_float(min, max));
 61 | }
 62 | 
 63 | 
 64 | // Run a test case, given the input vector, the approximate and
 65 | // exact results for a function and a given target accuracy [%].
 66 | static inline float compare_results(__m256 input, __m256 approx,
 67 |                                     __m256 exact, float accuracy) {
 68 |     // Prints the input and the approx/exact ratio, then returns 1 if any
 69 |     // lane's ratio is outside the tolerance (or is NaN) and 0 otherwise.
 70 |     // The ratio is expressed in percent, so 100 means exact agreement.
 71 |     __m256 error = _mm256_mul_ps(_mm256_set1_ps(100.0f),
 72 |                    _mm256_div_ps(approx, exact));
 73 | 
 74 |     std::cout << "Random input:" << std::endl;
 75 |     _mm256_print_ps(input);
 76 |     std::cout << "Ratio deviation[%]:      Allowed error = " << accuracy << std::endl;
 77 |     _mm256_print_ps(error);
 78 | 
 79 |     // Check every lane against [100 - accuracy, 100 + accuracy]. The
 80 |     // negated form makes a NaN ratio count as a failure.
 81 |     float lanes[8];
 82 |     _mm256_storeu_ps(lanes, error);
 83 |     bool within_tolerance = true;
 84 |     for (int i = 0; i < 8; ++i) {
 85 |         if (!(lanes[i] <= 100.0f + accuracy && lanes[i] >= 100.0f - accuracy)) {
 86 |             within_tolerance = false;
 87 |         }
 88 |     }
 89 | 
 90 |     if (!within_tolerance) {
 91 |         std::cout << "FAILED ..." << std::endl;
 92 |         std::cout << std::endl;
 93 |         _mm256_print_ps(approx);
 94 |         _mm256_print_ps(exact);
 95 |         return 1;
 96 |     }
 97 |     std::cout << "PASSED ..." << std::endl;
 98 |     std::cout << std::endl;
 99 |     return 0;
100 | }
101 | 
102 | 
103 | #endif // LIB_SIMD_TEST_COMMON_H
104 | 


--------------------------------------------------------------------------------
/test/test_exp.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2015 Anders S. Christensen
 2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 3 | //
 4 | // This is free and unencumbered software released into the public domain.
 5 | //
 6 | // Anyone is free to copy, modify, publish, use, compile, sell, or
 7 | // distribute this software, either in source code form or as a compiled
 8 | // binary, for any purpose, commercial or non-commercial, and by any
 9 | // means.
10 | //
11 | // In jurisdictions that recognize copyright laws, the author or authors
12 | // of this software dedicate any and all copyright interest in the
13 | // software to the public domain. We make this dedication for the benefit
14 | // of the public at large and to the detriment of our heirs and
15 | // successors. We intend this dedication to be an overt act of
16 | // relinquishment in perpetuity of all present and future rights to this
17 | // software under copyright law.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 | // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 | // OTHER DEALINGS IN THE SOFTWARE.
26 | //
27 | // For more information, please refer to <http://unlicense.org>
28 | 
29 | 
30 | #ifndef LIB_SIMD_TEST_EXP_H
31 | #define LIB_SIMD_TEST_EXP_H
32 | 
33 | #include <iostream>
34 | 
35 | #include "../include/simdexp.h"
36 | #include "../include/simdtools.h"
37 | #include "test_common.h"
38 | 
39 | 
40 | static int test_mm256_expfaster_ps() {
41 |     // Compares _mm256_expfaster_ps() with _mm256_exp_ps() on random input.
42 |     const float accuracy = 4.0f;  // Target accuracy [%]
43 | 
44 |     __m256 input = generate_vector(-63.0f, 63.0f);
45 | 
46 |     __m256 approx = _mm256_expfaster_ps(input);
47 |     __m256 exact = _mm256_exp_ps(input);
48 | 
49 |     std::cout << "Testing: _mm256_expfaster_ps()     -63.0f < x < 63.0" << std::endl;
50 |     return compare_results(input, approx, exact, accuracy);
51 | 
52 | }
53 | 
54 | 
55 | static int test_mm256_expfast_ps() {
56 |     // Compares _mm256_expfast_ps() with _mm256_exp_ps() on random input.
57 |     const float accuracy = 0.2f;  // Target accuracy [%]
58 | 
59 |     __m256 input = generate_vector(-63.0f, 63.0f);
60 | 
61 |     __m256 approx = _mm256_expfast_ps(input);
62 |     __m256 exact = _mm256_exp_ps(input);
63 | 
64 |     std::cout << "Testing: _mm256_expfast_ps()     -63.0f < x < 63.0" << std::endl;
65 |     return compare_results(input, approx, exact, accuracy);
66 | 
67 | }
68 | 
69 | 
70 | static int test_mm256_expfast1s_ps() {
71 |     // Compares _mm256_expfast1s_ps() with _mm256_exp_ps() on random input.
72 |     const float accuracy = 0.005f;  // Target accuracy [%]
73 | 
74 |     __m256 input = generate_vector(-63.0f, 63.0f);
75 | 
76 |     __m256 approx = _mm256_expfast1s_ps(input);
77 |     __m256 exact = _mm256_exp_ps(input);
78 | 
79 |     std::cout << "Testing: _mm256_expfast1s_ps()     -63.0f < x < 63.0" << std::endl;
80 |     return compare_results(input, approx, exact, accuracy);
81 | 
82 | }
83 | 
84 | 
85 | #endif
86 | 


--------------------------------------------------------------------------------
/test/test_rcp.h:
--------------------------------------------------------------------------------
 1 | // Test cases for libsimdrcp
 2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 3 | //
 4 | // The MIT License (MIT)
 5 | // 
 6 | // Copyright (C) 2015 Anders S. Christensen
 7 | // 
 8 | // Permission is hereby granted, free of charge, to any person obtaining a copy
 9 | // of this software and associated documentation files (the "Software"), to deal
10 | // in the Software without restriction, including without limitation the rights
11 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | // copies of the Software, and to permit persons to whom the Software is
13 | // furnished to do so, subject to the following conditions:
14 | // 
15 | // The above copyright notice and this permission notice shall be included in all
16 | // copies or substantial portions of the Software.
17 | // 
18 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | // SOFTWARE.
25 | 
26 | 
27 | #ifndef LIB_SIMD_TEST_RCP_H
28 | #define LIB_SIMD_TEST_RCP_H
29 | 
30 | #include <iostream>
31 | 
32 | #include "../include/simdrcp.h"
33 | #include "../include/simdtools.h"
34 | #include "test_common.h"
35 | 
36 | 
37 | // Test case for the function _mm256_rcp1s_ps()
38 | static int test_mm256_rcp1s_ps() {
39 |     // Draw eight random operands from the tested interval.
40 |     const __m256 x = generate_vector(-10000.0f, 10000.0f);
41 | 
42 |     // Result of the routine under test.
43 |     const __m256 estimate = _mm256_rcp1s_ps(x);
44 | 
45 |     // Reference result from a true division, 1/x.
46 |     const __m256 one = _mm256_set1_ps(1.0f);
47 |     const __m256 reference = _mm256_div_ps(one, x);
48 | 
49 |     // Announce the test, then hand over to compare_results(), which
50 |     // prints the outcome and returns 1 on failure, 0 on success.
51 |     // The tolerance is FLOAT_ACCURACY percent.
52 |     std::cout << "Testing: _mm256_rcp1s_ps()      -10000.0 < x 10000.0" << std::endl;
53 |     const float tolerance = FLOAT_ACCURACY;
54 |     return compare_results(x, estimate, reference, tolerance);
55 | 
56 | }
57 | 
58 | 
59 | // Test case for the intrinsic function _mm256_rcp_ps()
60 | static int test_mm256_rcp_ps() {
61 |     // Tolerance in percent for the intrinsic estimate.
62 |     const float tolerance = 0.04f;
63 | 
64 |     // Eight random operands from the tested interval.
65 |     const __m256 x = generate_vector(-10000.0f, 10000.0f);
66 | 
67 |     // Intrinsic estimate of 1/x.
68 |     const __m256 estimate = _mm256_rcp_ps(x);
69 | 
70 |     // Reference 1/x from a true division.
71 |     const __m256 reference = _mm256_div_ps(_mm256_set1_ps(1.0f), x);
72 | 
73 |     // Announce the test and delegate the check to compare_results().
74 |     std::cout << "Testing: _mm256_rcp_ps()      -10000.0 < x 10000.0" << std::endl;
75 |     const int outcome = compare_results(x, estimate, reference, tolerance);
76 | 
77 |     return outcome;
78 | }
79 | 
80 | 
81 | #endif // LIB_SIMD_TEST_RCP_H
82 | 


--------------------------------------------------------------------------------
/test/test_rsqrt.h:
--------------------------------------------------------------------------------
 1 | // Test cases for libsimdrsqrt
 2 | // Report bugs, etc at: https://github.com/andersx/simd-exp
 3 | //
 4 | // The MIT License (MIT)
 5 | // 
 6 | // Copyright (C) 2015 Anders S. Christensen
 7 | // 
 8 | // Permission is hereby granted, free of charge, to any person obtaining a copy
 9 | // of this software and associated documentation files (the "Software"), to deal
10 | // in the Software without restriction, including without limitation the rights
11 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | // copies of the Software, and to permit persons to whom the Software is
13 | // furnished to do so, subject to the following conditions:
14 | // 
15 | // The above copyright notice and this permission notice shall be included in all
16 | // copies or substantial portions of the Software.
17 | // 
18 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | // SOFTWARE.
25 | 
26 | 
27 | #ifndef LIB_SIMD_TEST_RSQRT_H
28 | #define LIB_SIMD_TEST_RSQRT_H
29 | 
30 | #include <iostream>
31 | 
32 | #include "../include/simdrsqrt.h"
33 | #include "../include/simdtools.h"
34 | #include "test_common.h"
35 | 
36 | 
37 | // Test case for the intrinsic function _mm256_rsqrt_ps()
38 | static int test_mm256_rsqrt_ps() {
39 |     // Tolerance in percent for the intrinsic estimate.
40 |     const float tolerance = 0.03f;
41 | 
42 |     // Eight random operands from the tested interval.
43 |     const __m256 x = generate_vector(0.0f, 10000.0f);
44 | 
45 |     // Intrinsic estimate of 1/sqrt(x).
46 |     const __m256 estimate = _mm256_rsqrt_ps(x);
47 | 
48 |     // Reference 1/sqrt(x): square root first, then a true division.
49 |     const __m256 root = _mm256_sqrt_ps(x);
50 |     const __m256 reference = _mm256_div_ps(_mm256_set1_ps(1.0f), root);
51 | 
52 |     // Announce the test and delegate the check to compare_results().
53 |     std::cout << "Testing: _mm256_rsqrt_ps()      0.0 < x 10000.0" << std::endl;
54 |     return compare_results(x, estimate, reference, tolerance);
55 | 
56 | }
57 | 
58 | 
59 | // Test case for the function _mm256_rsqrt1s_ps()
60 | static int test_mm256_rsqrt1s_ps() {
61 |     // Eight random operands from the tested interval.
62 |     const __m256 x = generate_vector(0.0f, 10000.0f);
63 | 
64 |     // Result of the routine under test.
65 |     const __m256 estimate = _mm256_rsqrt1s_ps(x);
66 | 
67 |     // Reference 1/sqrt(x): square root first, then a true division.
68 |     const __m256 one = _mm256_set1_ps(1.0f);
69 |     const __m256 reference = _mm256_div_ps(one, _mm256_sqrt_ps(x));
70 | 
71 |     // Announce the test, then let compare_results() judge it against a
72 |     // tolerance of FLOAT_ACCURACY percent.
73 |     std::cout << "Testing: _mm256_rsqrt1s_ps()      0.0 < x 10000.0" << std::endl;
74 |     const float tolerance = FLOAT_ACCURACY;
75 |     const int outcome = compare_results(x, estimate, reference, tolerance);
76 | 
77 |     return outcome;
78 | }
79 | 
80 | 
81 | #endif // LIB_SIMD_TEST_RSQRT_H
82 | 


--------------------------------------------------------------------------------