├── include └── machine.h ├── .gitignore ├── result ├── makefile ├── Corei5-3330S@2.70GHz_mac64_clang6.1_normaldist_time.png ├── Corei5-3330S@2.70GHz_mac64_clang6.1_normaldistf_time.png ├── Corei5-3330S@2.70GHz_mac64_clang6.1.csv ├── template.php └── Corei5-3330S@2.70GHz_mac64_clang6.1.html ├── src ├── null.cpp ├── cpp11random.cpp ├── marsagliapolar.cpp ├── boxmuller.cpp ├── boxmuller_sse2.cpp ├── clt.cpp ├── boxmuller_avx.cpp ├── clt_sse2.cpp ├── clt_avx.cpp ├── timer.h ├── test.h ├── resultfilename.h ├── inverse.cpp ├── lcg.h ├── main.cpp ├── ziggurat.cpp ├── sse_mathfun.h └── avx_mathfun.h ├── LICENSE └── readme.md /include/machine.h: -------------------------------------------------------------------------------- 1 | // If src/machine.h does not exists, it will include this empty header 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin/* 2 | /build/gmake 3 | /build/vs*/ 4 | /intermediate 5 | result/result.csv 6 | result/result.html 7 | 8 | src/machine.h 9 | -------------------------------------------------------------------------------- /result/makefile: -------------------------------------------------------------------------------- 1 | %.html: %.csv template.php 2 | php template.php $< > $@ 3 | 4 | CSVFILES = $(basename $(wildcard *.csv)) 5 | all: $(addsuffix .html, $(CSVFILES)) 6 | -------------------------------------------------------------------------------- /result/Corei5-3330S@2.70GHz_mac64_clang6.1_normaldist_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miloyip/normaldist-benchmark/HEAD/result/Corei5-3330S@2.70GHz_mac64_clang6.1_normaldist_time.png -------------------------------------------------------------------------------- /result/Corei5-3330S@2.70GHz_mac64_clang6.1_normaldistf_time.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/miloyip/normaldist-benchmark/HEAD/result/Corei5-3330S@2.70GHz_mac64_clang6.1_normaldistf_time.png -------------------------------------------------------------------------------- /src/null.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "lcg.h" 3 | 4 | static void normaldistf_null(float* data, size_t count) { 5 | LCG r; 6 | for (size_t i = 0; i < count; i++) 7 | data[i] = r(); 8 | } 9 | 10 | static void normaldist_null(double* data, size_t count) { 11 | LCG r; 12 | for (size_t i = 0; i < count; i++) 13 | data[i] = r(); 14 | } 15 | 16 | REGISTER_TEST(null); 17 | -------------------------------------------------------------------------------- /src/cpp11random.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER > 1700) // VS2012 4 | 5 | #include 6 | 7 | template 8 | static void genericNormalDist(RealType* data, size_t count) { 9 | std::minstd_rand gen; 10 | std::normal_distribution d; 11 | for (size_t i = 0; i < count; i++) 12 | data[i] = d(gen); 13 | } 14 | 15 | static void normaldistf_cpp11random(float* data, size_t count) { 16 | genericNormalDist(data, count); 17 | } 18 | 19 | static void normaldist_cpp11random(double* data, size_t count) { 20 | genericNormalDist(data, count); 21 | } 22 | 23 | REGISTER_TEST(cpp11random); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/marsagliapolar.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "lcg.h" 3 | #include 4 | #include 5 | 6 | template 7 | void marsagliapolar(T* data, size_t count) { 8 | assert(count % 2 == 0); 9 | 10 | LCG r; 11 | for (size_t i = 0; i < count; i += 2) { 12 | T u, v, 
s; 13 | do { 14 | u = r() * 2 - 1; 15 | v = r() * 2 - 1; 16 | s = u * u + v * v; 17 | } while (s >= 1 || s == 0); 18 | 19 | T mul = std::sqrt(-2 * std::log(s) / s); 20 | data[i ] = mul * u; 21 | data[i + 1] = mul * v; 22 | } 23 | } 24 | 25 | static void normaldistf_marsagliapolar(float* data, size_t count) { 26 | marsagliapolar(data, count); 27 | } 28 | 29 | static void normaldist_marsagliapolar(double* data, size_t count) { 30 | marsagliapolar(data, count); 31 | } 32 | 33 | REGISTER_TEST(marsagliapolar); 34 | -------------------------------------------------------------------------------- /src/boxmuller.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "lcg.h" 3 | #include 4 | #include 5 | 6 | template 7 | void boxmuller(T* data, size_t count) { 8 | assert(count % 2 == 0); 9 | static const T twopi = T(2.0 * 3.14159265358979323846); 10 | 11 | LCG r; 12 | for (size_t i = 0; i < count; i += 2) { 13 | T u1 = 1.0f - r(); // [0, 1) -> (0, 1] 14 | T u2 = r(); 15 | T radius = std::sqrt(-2 * std::log(u1)); 16 | T theta = twopi * u2; 17 | data[i ] = radius * std::cos(theta); 18 | data[i + 1] = radius * std::sin(theta); 19 | } 20 | } 21 | 22 | static void normaldistf_boxmuller(float* data, size_t count) { 23 | boxmuller(data, count); 24 | } 25 | 26 | static void normaldist_boxmuller(double* data, size_t count) { 27 | boxmuller(data, count); 28 | } 29 | 30 | REGISTER_TEST(boxmuller); 31 | -------------------------------------------------------------------------------- /result/Corei5-3330S@2.70GHz_mac64_clang6.1.csv: -------------------------------------------------------------------------------- 1 | Type,Function,Time(ns) 2 | normaldistf,boxmuller,10.548000 3 | normaldistf,boxmuller_avx,2.253000 4 | normaldistf,boxmuller_sse2,3.752000 5 | normaldistf,clt4,5.542000 6 | normaldistf,clt8,10.683000 7 | normaldistf,clt16,21.384000 8 | normaldistf,clt4_avx,2.730000 9 | normaldistf,clt8_avx,7.636000 10 | 
normaldistf,clt16_avx,16.295000 11 | normaldistf,clt4_sse2,3.557000 12 | normaldistf,clt8_sse2,7.056000 13 | normaldistf,clt16_sse2,14.585000 14 | normaldistf,cpp11random,18.642000 15 | normaldistf,inverse,13.090000 16 | normaldistf,marsagliapolar,10.926000 17 | normaldistf,null,1.253000 18 | normaldistf,ziggurat,6.731000 19 | normaldist,boxmuller,16.427000 20 | normaldist,clt4,7.402000 21 | normaldist,clt8,14.178000 22 | normaldist,clt16,28.113000 23 | normaldist,cpp11random,32.245000 24 | normaldist,inverse,14.625000 25 | normaldist,marsagliapolar,12.837000 26 | normaldist,null,1.456000 27 | normaldist,ziggurat,7.086000 28 | -------------------------------------------------------------------------------- /src/boxmuller_sse2.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #ifdef USE_SSE2 4 | 5 | #include "lcg.h" 6 | #include 7 | #include 8 | 9 | static void normaldistf_boxmuller_sse2(float* data, size_t count) { 10 | assert(count % 8 == 0); 11 | const __m128 twopi = _mm_set1_ps(2.0f * 3.14159265358979323846f); 12 | const __m128 one = _mm_set1_ps(1.0f); 13 | const __m128 minustwo = _mm_set1_ps(-2.0f); 14 | 15 | LCG<__m128> r; 16 | for (size_t i = 0; i < count; i += 8) { 17 | __m128 u1 = _mm_sub_ps(one, r()); // [0, 1) -> (0, 1] 18 | __m128 u2 = r(); 19 | __m128 radius = _mm_sqrt_ps(_mm_mul_ps(minustwo, log_ps(u1))); 20 | __m128 theta = _mm_mul_ps(twopi, u2); 21 | __m128 sintheta, costheta; 22 | sincos_ps(theta, &sintheta, &costheta); 23 | _mm_store_ps(&data[i ], _mm_mul_ps(radius, costheta)); 24 | _mm_store_ps(&data[i + 4], _mm_mul_ps(radius, sintheta)); 25 | } 26 | } 27 | 28 | REGISTER_TEST_FLOATONLY(boxmuller_sse2); 29 | 30 | #endif // USE_SSE2 31 | -------------------------------------------------------------------------------- /src/clt.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "lcg.h" 3 | #include 4 | 5 | // By 
central limit theorem (CLT), 6 | // U ~ [0, 1] 7 | // S = sum U_i for i = 1 to M 8 | // S ~ N(M / 2, M / 12) 9 | // Z = (S - M / 2) / sqrt(M / 12) 10 | // Z ~ N(0, 1) 11 | 12 | template 13 | static inline T clt(RNG& r) { 14 | static T inv = 1 / std::sqrt(T(M) / 12); 15 | 16 | T sum = r(); 17 | for (int i = 1; i < M; i++) 18 | sum += r(); 19 | 20 | return (sum - M / T(2)) * inv; 21 | } 22 | 23 | template 24 | static void clt(T* data, size_t count) { 25 | LCG r; 26 | for (size_t i = 0; i < count; i++) 27 | data[i] = clt, M>(r); 28 | } 29 | 30 | #define CLT_TEST(M)\ 31 | static void normaldistf_clt##M(float* data, size_t count) {\ 32 | clt(data, count);\ 33 | }\ 34 | static void normaldist_clt##M(double* data, size_t count) {\ 35 | clt(data, count);\ 36 | }\ 37 | REGISTER_TEST(clt##M) 38 | 39 | CLT_TEST(4); 40 | CLT_TEST(8); 41 | CLT_TEST(16); 42 | -------------------------------------------------------------------------------- /src/boxmuller_avx.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #ifdef USE_AVX 4 | 5 | #include "lcg.h" 6 | #include 7 | #include 8 | 9 | static void normaldistf_boxmuller_avx(float* data, size_t count) { 10 | assert(count % 16 == 0); 11 | const __m256 twopi = _mm256_set1_ps(2.0f * 3.14159265358979323846f); 12 | const __m256 one = _mm256_set1_ps(1.0f); 13 | const __m256 minustwo = _mm256_set1_ps(-2.0f); 14 | 15 | LCG<__m256> r; 16 | for (size_t i = 0; i < count; i += 16) { 17 | __m256 u1 = _mm256_sub_ps(one, r()); // [0, 1) -> (0, 1] 18 | __m256 u2 = r(); 19 | __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(minustwo, log256_ps(u1))); 20 | __m256 theta = _mm256_mul_ps(twopi, u2); 21 | __m256 sintheta, costheta; 22 | sincos256_ps(theta, &sintheta, &costheta); 23 | _mm256_store_ps(&data[i ], _mm256_mul_ps(radius, costheta)); 24 | _mm256_store_ps(&data[i + 8], _mm256_mul_ps(radius, sintheta)); 25 | } 26 | } 27 | 28 | REGISTER_TEST_FLOATONLY(boxmuller_avx); 29 | 30 | #endif // 
USE_AVX 31 | -------------------------------------------------------------------------------- /src/clt_sse2.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #ifdef USE_SSE2 4 | 5 | #include "lcg.h" 6 | #include 7 | 8 | // By central limit theorem (CLT), 9 | // U ~ [0, 1] 10 | // S = sum U_i for i = 1 to M 11 | // S ~ N(M / 2, M / 12) 12 | // Z = (S - M / 2) / sqrt(M / 12) 13 | // Z ~ N(0, 1) 14 | 15 | template 16 | static void clt_sse2(float* data, size_t count) { 17 | static const __m128 halfm = _mm_set1_ps(float(M) / 2); 18 | static const __m128 inv = _mm_set1_ps(1 / std::sqrt(float(M) / 12)); 19 | 20 | LCG<__m128> r; 21 | for (size_t i = 0; i < count; i += 4) { 22 | __m128 sum = r(); 23 | for (int j = 1; j < M; j++) 24 | sum = _mm_add_ps(sum, r()); 25 | 26 | _mm_store_ps(&data[i], _mm_mul_ps(_mm_sub_ps(sum, halfm), inv)); 27 | } 28 | } 29 | 30 | #define CLT_SSE2_TEST(M)\ 31 | static void normaldistf_clt##M##_sse2(float* data, size_t count) {\ 32 | clt_sse2(data, count);\ 33 | }\ 34 | REGISTER_TEST_FLOATONLY(clt##M##_sse2) 35 | 36 | CLT_SSE2_TEST(4); 37 | CLT_SSE2_TEST(8); 38 | CLT_SSE2_TEST(16); 39 | 40 | #endif -------------------------------------------------------------------------------- /src/clt_avx.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #ifdef USE_AVX 4 | 5 | #include "lcg.h" 6 | #include 7 | 8 | // By central limit theorem (CLT), 9 | // U ~ [0, 1] 10 | // S = sum U_i for i = 1 to M 11 | // S ~ N(M / 2, M / 12) 12 | // Z = (S - M / 2) / sqrt(M / 12) 13 | // Z ~ N(0, 1) 14 | 15 | template 16 | static void clt_avx(float* data, size_t count) { // Fills data[0..count) with CLT samples; data must be 32-byte aligned and count a multiple of 8 17 | static const __m256 halfm = _mm256_set1_ps(float(M) / 2); 18 | static const __m256 inv = _mm256_set1_ps(1 / std::sqrt(float(M) / 12)); 19 | 20 | LCG<__m256> r; 21 | for (size_t i = 0; i < count; i += 8) { // each __m256 store writes 8 floats, so advance by 8 (a stride of 4 overlaps stores, misaligns them and overruns the buffer) 22 | __m256 sum = r(); 23 | for (int j = 1; j < M; j++) 24 | sum =
_mm256_add_ps(sum, r()); 25 | 26 | _mm256_store_ps(&data[i], _mm256_mul_ps(_mm256_sub_ps(sum, halfm), inv)); 27 | } 28 | } 29 | 30 | #define CLT_AVX_TEST(M)\ 31 | static void normaldistf_clt##M##_avx(float* data, size_t count) {\ 32 | clt_avx(data, count);\ 33 | }\ 34 | REGISTER_TEST_FLOATONLY(clt##M##_avx) 35 | 36 | CLT_AVX_TEST(4); 37 | CLT_AVX_TEST(8); 38 | CLT_AVX_TEST(16); 39 | 40 | #endif -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Milo Yip 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef _WIN32 4 | 5 | #define WIN32_LEAN_AND_MEAN 6 | #include 7 | 8 | class Timer { 9 | public: 10 | Timer() : start_(), end_() { 11 | } 12 | 13 | void Start() { 14 | QueryPerformanceCounter(&start_); 15 | } 16 | 17 | void Stop() { 18 | QueryPerformanceCounter(&end_); 19 | } 20 | 21 | double GetElapsedMilliseconds() { 22 | LARGE_INTEGER freq; 23 | QueryPerformanceFrequency(&freq); 24 | return (end_.QuadPart - start_.QuadPart) * 1000.0 / freq.QuadPart; 25 | } 26 | 27 | private: 28 | LARGE_INTEGER start_; 29 | LARGE_INTEGER end_; 30 | }; 31 | 32 | // Undefine Windows bad macros 33 | #undef min 34 | #undef max 35 | 36 | #else 37 | 38 | #include 39 | 40 | class Timer { 41 | public: 42 | Timer() : start_(), end_() { 43 | } 44 | 45 | void Start() { 46 | gettimeofday(&start_, NULL); 47 | } 48 | 49 | void Stop() { 50 | gettimeofday(&end_, NULL); 51 | } 52 | 53 | double GetElapsedMilliseconds() { 54 | return (end_.tv_sec - start_.tv_sec) * 1000.0 55 | + (end_.tv_usec - start_.tv_usec) / 1000.0; 56 | } 57 | 58 | private: 59 | struct timeval start_; 60 | struct timeval end_; 61 | }; 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define USE_SSE2 4 | #define USE_AVX 5 | 6 | #include 7 | #include 8 | 9 | struct Test; 10 | typedef std::vector TestList; 11 | class TestManager { 12 | public: 13 | static TestManager& Instance() { 14 | static TestManager singleton; 15 | return singleton; 16 | } 17 | 18 | void AddTest(const Test* test) { 19 | mTests.push_back(test); 20 | } 21 | 22 | const TestList& GetTests() const { 23 | return mTests; 24 | } 25 | 26 | TestList& GetTests() { 27 | return mTests; 28 | } 29 | 
30 | private: 31 | TestList mTests; 32 | }; 33 | 34 | struct Test { 35 | Test( 36 | const char* fname, 37 | void (*normaldistf)(float*, size_t), 38 | void (*normaldist)(double*, size_t)) 39 | : 40 | fname(fname), 41 | normaldistf(normaldistf), 42 | normaldist(normaldist) 43 | { 44 | TestManager::Instance().AddTest(this); 45 | } 46 | 47 | bool operator<(const Test& rhs) const { 48 | return strcmp(fname, rhs.fname) < 0; 49 | } 50 | 51 | const char* fname; 52 | void (*normaldistf)(float*, size_t); 53 | void (*normaldist)(double*, size_t); 54 | }; 55 | 56 | 57 | #define STRINGIFY(x) #x 58 | #define REGISTER_TEST(f) static Test gRegister##f(STRINGIFY(f), normaldistf##_##f, normaldist##_##f) 59 | #define REGISTER_TEST_FLOATONLY(f) static Test gRegister##f(STRINGIFY(f), normaldistf##_##f, 0) 60 | -------------------------------------------------------------------------------- /src/resultfilename.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "machine.h" 4 | 5 | #ifndef MACHINE 6 | #define MACHINE "unknown" 7 | #endif 8 | 9 | #if defined(_WIN64) 10 | # define OS "win64" 11 | #elif defined(_WIN32) 12 | # define OS "win32" 13 | #elif defined(__CYGWIN__) && defined(__x86_64) 14 | # define OS "cygwin64" 15 | #elif defined(__CYGWIN__) 16 | # define OS "cygwin32" 17 | #elif defined(__APPLE__) 18 | # include "TargetConditionals.h" 19 | # if TARGET_OS_IPHONE 20 | # ifdef __LP64__ 21 | # define OS "ios64" 22 | # else 23 | # define OS "ios32" 24 | # endif 25 | # elif TARGET_OS_MAC 26 | # ifdef __amd64__ 27 | # define OS "mac64" 28 | # else 29 | # define OS "mac32" 30 | # endif 31 | # endif 32 | #elif defined(__linux) 33 | # ifdef __LP64__ 34 | # define OS "linux64" 35 | # else 36 | # define OS "linux32" 37 | # endif 38 | #endif 39 | 40 | #ifndef OS 41 | #define OS "unknown" 42 | #endif 43 | 44 | #define STR_HELPER(x) #x 45 | #define STR(x) STR_HELPER(x) 46 | 47 | #if defined(_MSC_VER) 48 | # if _MSC_VER >= 1800 
49 | # define COMPILER "vc2013" 50 | # elif _MSC_VER >= 1700 51 | # define COMPILER "vc2012" 52 | # elif _MSC_VER >= 1600 53 | # define COMPILER "vc2010" 54 | # elif _MSC_VER >= 1500 55 | # define COMPILER "vc2008" 56 | # elif _MSC_VER >= 1400 57 | # define COMPILER "vc2005" 58 | # else 59 | # define COMPILER "vc" 60 | # endif 61 | #elif defined(__clang__) 62 | # define COMPILER "clang" STR(__clang_major__) "." STR(__clang_minor__) 63 | #elif defined(__GNUC__) 64 | # define COMPILER "gcc" STR(__GNUC__) "." STR(__GNUC_MINOR__) 65 | #else 66 | # define COMPILER "Unknown" 67 | #endif 68 | 69 | #define RESULT_FILENAME MACHINE "_" OS "_" COMPILER ".csv" 70 | -------------------------------------------------------------------------------- /src/inverse.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "lcg.h" 3 | #include 4 | #include 5 | #include 6 | 7 | // http://home.online.no/~pjacklam/notes/invnorm/ 8 | template 9 | static T normsinv(T p) { 10 | static const T a1 = T(-3.969683028665376e+01); 11 | static const T a2 = T( 2.209460984245205e+02); 12 | static const T a3 = T(-2.759285104469687e+02); 13 | static const T a4 = T( 1.383577518672690e+02); 14 | static const T a5 = T(-3.066479806614716e+01); 15 | static const T a6 = T( 2.506628277459239e+00); 16 | static const T b1 = T(-5.447609879822406e+01); 17 | static const T b2 = T( 1.615858368580409e+02); 18 | static const T b3 = T(-1.556989798598866e+02); 19 | static const T b4 = T( 6.680131188771972e+01); 20 | static const T b5 = T(-1.328068155288572e+01); 21 | static const T c1 = T(-7.784894002430293e-03); 22 | static const T c2 = T(-3.223964580411365e-01); 23 | static const T c3 = T(-2.400758277161838e+00); 24 | static const T c4 = T(-2.549732539343734e+00); 25 | static const T c5 = T( 4.374664141464968e+00); 26 | static const T c6 = T( 2.938163982698783e+00); 27 | static const T d1 = T( 7.784695709041462e-03); 28 | static const T d2 = T( 
3.224671290700398e-01); 29 | static const T d3 = T( 2.445134137142996e+00); 30 | static const T d4 = T( 3.754408661907416e+00); 31 | 32 | T q = std::min(p, 1 - p), u; 33 | if (q > 0.02425) { 34 | // Central region 35 | T d = q - T(0.5); 36 | T t = d * d; 37 | u = d * (((((a1 * t + a2) * t + a3) * t + a4) * t + a5) * t + a6) / 38 | (((((b1 * t + b2) * t + b3) * t + b4) * t + b5) * t + 1); 39 | } 40 | else { 41 | // Tail region 42 | T t = std::sqrt(-2 * std::log(q)); 43 | u = (((((c1 * t + c2) * t + c3) * t + c4) * t + c5) * t + c6) / 44 | ((((d1 * t + d2) * t + d3) * t + d4) * t + 1); 45 | } 46 | 47 | return p > 0.5 ? -u : u; 48 | } 49 | 50 | template 51 | static void inverse(T* data, size_t count) { 52 | LCG r; 53 | for (size_t i = 0; i < count; i++) 54 | data[i] = normsinv(r()); 55 | } 56 | 57 | static void normaldistf_inverse(float* data, size_t count) { 58 | inverse(data, count); 59 | } 60 | 61 | static void normaldist_inverse(double* data, size_t count) { 62 | inverse(data, count); 63 | } 64 | 65 | REGISTER_TEST(inverse); 66 | -------------------------------------------------------------------------------- /src/lcg.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | class LCG; 7 | 8 | template <> 9 | class LCG { 10 | public: 11 | LCG() : x(1) {} 12 | 13 | uint32_t operator()() { 14 | return x = x * 1664525 + 1013904223; // Numerical Recipes 15 | } 16 | 17 | private: 18 | uint32_t x; 19 | }; 20 | 21 | template <> 22 | class LCG { 23 | public: 24 | LCG() : x(1) {} 25 | 26 | float operator()() { 27 | x = x * 1664525 + 1013904223; // Numerical Recipes 28 | union { 29 | uint32_t u; 30 | float f; 31 | }u; 32 | u.u = (x >> 9) | 0x3F800000; 33 | return u.f - 1.0f; 34 | } 35 | 36 | private: 37 | uint32_t x; 38 | }; 39 | 40 | template <> 41 | class LCG { 42 | public: 43 | LCG() : x(1) {} 44 | 45 | double operator()() { 46 | x = x * 2862933555777941757 + 3037000493; // 
http://nuclear.llnl.gov/CNP/rng/rngman/node4.html 47 | union { 48 | uint64_t u; 49 | double f; 50 | }u; 51 | u.u = (x >> 12) | (uint64_t(0x3FF00000) << 32); 52 | return u.f - 1; 53 | } 54 | 55 | private: 56 | uint64_t x; 57 | }; 58 | 59 | //////////////////////////////////////////////////////////////////////////////// 60 | // SSE2 61 | 62 | #ifdef USE_SSE2 63 | #include "sse_mathfun.h" 64 | 65 | _PS_CONST_TYPE(lcg_a, uint32_t, 1664525); 66 | _PS_CONST_TYPE(lcg_b, uint32_t, 1013904223); 67 | _PS_CONST_TYPE(lcg_mask, uint32_t, 0x3F800000); 68 | 69 | template <> 70 | class LCG<__m128> { 71 | public: 72 | LCG() : x(_mm_setr_epi32(1, 2, 3, 4)) {} 73 | 74 | __m128 operator()() { 75 | x = _mm_add_epi32(mullo_epi32(x, *(__m128i*)_ps_lcg_a), *(__m128i*)_ps_lcg_b); 76 | __m128i u = _mm_or_si128(_mm_srli_epi32(x, 9), *(__m128i*)_ps_lcg_mask); 77 | __m128 f = _mm_sub_ps(_mm_castsi128_ps(u), *(__m128*)_ps_1); 78 | return f; 79 | } 80 | 81 | private: 82 | // _mm_mullo_epi32() is in SSE4.1 83 | static __m128i mullo_epi32(__m128i a, __m128i b) { 84 | const __m128i tmp1 = _mm_mul_epu32(a, b); /* mul 2,0*/ 85 | const __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); /* mul 3,1 */ 86 | return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */ 87 | } 88 | __m128i x; 89 | }; 90 | 91 | #endif // USE_SSE2 92 | 93 | //////////////////////////////////////////////////////////////////////////////// 94 | // AVX 95 | #ifdef USE_AVX 96 | #include "avx_mathfun.h" 97 | 98 | _PS256_CONST_TYPE(lcg_a, uint32_t, 1664525); 99 | _PS256_CONST_TYPE(lcg_b, uint32_t, 1013904223); 100 | _PS256_CONST_TYPE(lcg_mask, uint32_t, 0x3F800000); 101 | AVX2_INTOP_USING_SSE2(mullo_epi32); // Actually uses SSE4.1 _mm_mullo_epi32() 102 | AVX2_INTOP_USING_SSE2(or_si128); 103 | 104 | template <> 105 | class LCG<__m256> { 106 | public: 107 | LCG() : x(_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8)) 
{} 108 | 109 | __m256 operator()() { 110 | x = _mm256_add_epi32(_mm256_mullo_epi32(x, *(__m256i*)_ps256_lcg_a), *(__m256i*)_ps256_lcg_b); 111 | __m256i u = _mm256_or_si128_sse2(_mm256_srli_epi32(x, 9), *(__m256i*)_ps256_lcg_mask); 112 | __m256 f = _mm256_sub_ps(_mm256_castsi256_ps(u), *(__m256*)_ps256_1); 113 | return f; 114 | } 115 | 116 | private: 117 | __m256i x; 118 | }; 119 | 120 | #endif // USE_AVX 121 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "resultfilename.h" 11 | #include "timer.h" 12 | #include "test.h" 13 | 14 | static const size_t kTrial = 10; 15 | static const size_t kCount = 1000000; 16 | 17 | template 18 | static void Verify(void(*f)(T*, size_t), const char* test, const char* fname) { 19 | printf("Verifying %s %s ... 
", test, fname); 20 | 21 | void* p = malloc(kCount * sizeof(T) + 32); 22 | T* data = reinterpret_cast(((uintptr_t)p + 32) & ~31); 23 | f(data, kCount); 24 | 25 | // Compute mean, minimum and maximum 26 | double sum = 0.0; 27 | double minimum = data[0]; 28 | double maximum = data[0]; 29 | for (size_t i = 0; i < kCount; i++) { 30 | sum += data[i]; 31 | if (data[i] < minimum) minimum = data[i]; 32 | if (data[i] > maximum) maximum = data[i]; 33 | } 34 | 35 | double mean = sum / kCount; 36 | 37 | // Compute standard deviation 38 | double sqDeltaSum = 0.0; 39 | for (size_t i = 0; i < kCount; i++) { 40 | double delta = data[i] - mean; 41 | sqDeltaSum += delta * delta; 42 | } 43 | double sd = std::sqrt(sqDeltaSum / kCount); 44 | 45 | // Compute skewness 46 | double skewnessSum = 0.0; 47 | for (size_t i = 0; i < kCount; i++) { 48 | double term = (data[i] - mean) / sd; 49 | skewnessSum += term * term * term; 50 | } 51 | double skewness = skewnessSum / kCount; 52 | 53 | // Compute kurtosis 54 | double kurtosisSum = 0.0; 55 | for (size_t i = 0; i < kCount; i++) { 56 | double delta = (data[i] - mean); 57 | kurtosisSum += (delta * delta) * (delta * delta); 58 | } 59 | double kurtosis = (kurtosisSum / kCount) / ((sd * sd) * (sd * sd)) - 3.0; 60 | 61 | free(p); 62 | 63 | if (std::abs(mean) < 0.01 && std::abs(sd - 1.0) < 0.01 && std::abs(skewness) < 0.01 && std::abs(kurtosis) < 0.01) 64 | printf("OK\n"); 65 | else 66 | printf("Fail\n"); 67 | 68 | printf( 69 | "mean = % .6f\n" 70 | "SD = % .6f\n" 71 | "minimum = % .6f\n" 72 | "maximum = % .6f\n" 73 | "skewness = % .6f\n" 74 | "kurtosis = % .6f\n\n", 75 | mean, sd, minimum, maximum, skewness, kurtosis); 76 | } 77 | 78 | static void VerifyAll() { 79 | const TestList& tests = TestManager::Instance().GetTests(); 80 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) { 81 | if (strcmp((*itr)->fname, "null") != 0) { // skip null 82 | if ((*itr)->normaldistf) 83 | Verify((*itr)->normaldistf, "normaldistf", 
(*itr)->fname); 84 | if ((*itr)->normaldist) 85 | Verify((*itr)->normaldist, "normaldist", (*itr)->fname); 86 | } 87 | } 88 | } 89 | 90 | template 91 | static void Bench(void(*f)(T*, size_t), const char* type, const char* fname, FILE* fp) { 92 | printf("Benchmarking %-20s ... ", fname); 93 | 94 | double duration = std::numeric_limits::max(); 95 | char* p = static_cast(malloc(kCount * sizeof(T) + 32)); 96 | T* data = reinterpret_cast(((uintptr_t)p + 32) & ~31); 97 | f(data, kCount); 98 | for (unsigned trial = 0; trial < kTrial; trial++) { 99 | Timer timer; 100 | timer.Start(); 101 | 102 | f(data, kCount); 103 | 104 | timer.Stop(); 105 | duration = std::min(duration, timer.GetElapsedMilliseconds()); 106 | } 107 | free(p); 108 | 109 | duration *= 1e6 / kCount; // convert to nano second per operation 110 | fprintf(fp, "%s,%s,%f\n", type, fname, duration); 111 | 112 | printf("%8.3fns\n", duration); 113 | } 114 | 115 | static void BenchAll() { 116 | // Try to write to /result path, where template.php exists 117 | FILE *fp; 118 | if ((fp = fopen("../../result/template.php", "r")) != NULL) { 119 | fclose(fp); 120 | fp = fopen("../../result/" RESULT_FILENAME, "w"); 121 | } 122 | else if ((fp = fopen("../result/template.php", "r")) != NULL) { 123 | fclose(fp); 124 | fp = fopen("../result/" RESULT_FILENAME, "w"); 125 | } 126 | else 127 | fp = fopen(RESULT_FILENAME, "w"); 128 | if (fp == NULL) { perror(RESULT_FILENAME); return; } // the result file could not be created; fprintf/fclose on NULL would crash 129 | fprintf(fp, "Type,Function,Time(ns)\n"); 130 | 131 | const TestList& tests = TestManager::Instance().GetTests(); 132 | 133 | puts("normaldistf"); 134 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) 135 | if ((*itr)->normaldistf) 136 | Bench((*itr)->normaldistf, "normaldistf", (*itr)->fname, fp); 137 | 138 | puts("\nnormaldist"); 139 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) 140 | if ((*itr)->normaldist) 141 | Bench((*itr)->normaldist, "normaldist", (*itr)->fname, fp); 142 | 143 | fclose(fp); 144 | } 145 | 146 | int main() { 147
| // Sort tests by name. The list stores Test pointers, so std::sort needs a comparator that dereferences them; without one it orders by address and Test::operator< is never used. 148 | struct ByName { static bool Less(const Test* a, const Test* b) { return *a < *b; } }; 149 | TestList& tests = TestManager::Instance().GetTests(); 150 | std::sort(tests.begin(), tests.end(), &ByName::Less); 151 | VerifyAll(); 152 | BenchAll(); 153 | } 154 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Normally Distributed Random Number Generator Benchmark 2 | 3 | Copyright(c) 2015 Milo Yip (miloyip@gmail.com) 4 | 5 | ## Introduction 6 | 7 | This benchmark evaluates the performance of generating random numbers with standard normal distribution. The function prototypes are: 8 | 9 | ~~~~~~~~cpp 10 | void normaldistf(float* data, size_t n); 11 | void normaldist(double* data, size_t n); 12 | ~~~~~~~~ 13 | 14 | These functions generate `n` standard normal distributed random numbers (samples), in `float` and `double` respectively. 15 | 16 | Generating multiple random numbers, instead of generating a single random number, can be suitable for some algorithms (such as Box-Muller, which generates two numbers at once, and the SIMD versions). 17 | 18 | Some implementations require `data` to be 16 or 32 byte aligned, and `n` to be multiples of 2, 8, 16, 32 etc. 19 | 20 | ## Procedure 21 | 22 | Firstly the program verifies the correctness of implementations. Correctness is simply checked using the following criteria: 23 | 24 | ~~~cpp 25 | bool correctness = 26 | std::abs(mean ) < 0.01 && 27 | std::abs(sd - 1.0) < 0.01 && 28 | std::abs(skewness) < 0.01 && 29 | std::abs(kurtosis) < 0.01; 30 | ~~~ 31 | 32 | where skewness is [Pearson's moment coefficient of skewness](https://en.wikipedia.org/wiki/Skewness#Pearson.27s_moment_coefficient_of_skewness) and kurtosis is [excess kurtosis](https://en.wikipedia.org/wiki/Kurtosis#Pearson_moments). 33 | 34 | In the benchmark, each trial generates `n = 1000000` (1 million) samples. The minimum time duration is measured for 10 trials. 35 | 36 | ## Build and Run 37 | 38 | 1. 
Obtain [premake4](http://industriousone.com/premake/download). 39 | 2. Copy the premake4 executable to the `normaldist-benchmark/build` folder (or system path). 40 | 3. Run `premake.bat` or `premake.sh` in `normaldist-benchmark/build`. 41 | 4. On Windows, build the solution at `normaldist-benchmark/build/vs2008/` or `/vs2010/`. 42 | 5. On other platforms, run GNU `make config=release32` (or `release64`) at `normaldist-benchmark/build/gmake/`. 43 | 6. On success, run the `normaldistXXX` executable that is generated at `normaldist-benchmark/`. 44 | 7. The results in CSV format will be written to `normaldist-benchmark/result`. 45 | 8. Run GNU `make` in `normaldist-benchmark/result` to generate results in HTML. 46 | 47 | Note that, for platforms not supporting SSE2/AVX, please modify `build/premake4.lua` and `src/test.h`. 48 | 49 | ## Implementations 50 | 51 | Function | Description 52 | ---------------|----------- 53 | boxmuller | [Box-Muller transform](https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform) [1]. Requires `n % 2 == 0`. 54 | cpp11random | `std::normal_distribution` with `std::minstd_rand`. 55 | clt`m` | By [central limit theorem](https://en.wikipedia.org/wiki/Central_limit_theorem) (CLT), sum `m` uniform random numbers, then adjust the mean and re-scale for standard deviation. 56 | inverse | [Inverse transform sampling](https://en.wikipedia.org/wiki/Inverse_transform_sampling) with inverse normal CDF developed by [Peter John Acklam](http://home.online.no/~pjacklam/notes/invnorm/). 57 | marsagliapolar | [Marsaglia polar method](https://en.wikipedia.org/wiki/Marsaglia_polar_method) [2]. Requires `n % 2 == 0`. 58 | ziggurat | [Ziggurat algorithm](https://en.wikipedia.org/wiki/Ziggurat_algorithm) by Marsaglia et al [3], using Jochen Voss's [implementation](http://www.seehuhn.de/pages/ziggurat). 59 | null | Generates uniform random numbers. 60 | 61 | Note that the `null` implementation generates uniform random numbers. 
It measures the overhead of looping, memory writing, and uniform random number generation. Uniform number generation is included because normally distributed random number generators are based on at least one uniform random number generation. 62 | 63 | CLT implementations were actually unable to pass the correctness tests, as their kurtosis is outside the allowed threshold. 64 | 65 | All implementations except `cpp11random` use the simplest [linear congruential generator](https://en.wikipedia.org/wiki/Linear_congruential_generator) as the uniformly distributed pseudo-random number generator (PRNG). 66 | 67 | Suffixes | Description 68 | ---------------|----------- 69 | sse2 | SSE2 version (`data` requires 16-byte alignment) 70 | avx | AVX version (`data` requires 32-byte alignment) 71 | 72 | Some of the sse2 and avx implementations use the math libraries [sse_mathfun](http://gruntthepeon.free.fr/ssemath/) and [avx_mathfun](http://software-lisc.fbk.eu/avx_mathfun/), which provide logarithm and sine/cosine functions. 73 | 74 | ## Results 75 | 76 | The following are results measured on an iMac (Core i5 3330S @2.70GHz).
77 | 78 | normaldistf (single precision): 79 | 80 | Function | Time (ns) | Speedup 81 | ----------------|------------:|--------: 82 | clt16 | 21.384 | 1.00x 83 | cpp11random | 18.642 | 1.15x 84 | clt16_avx | 16.295 | 1.31x 85 | clt16_sse2 | 14.585 | 1.47x 86 | inverse | 13.090 | 1.63x 87 | marsagliapolar | 10.926 | 1.96x 88 | clt8 | 10.683 | 2.00x 89 | boxmuller | 10.548 | 2.03x 90 | clt8_avx | 7.636 | 2.80x 91 | clt8_sse2 | 7.056 | 3.03x 92 | ziggurat | 6.731 | 3.18x 93 | clt4 | 5.542 | 3.86x 94 | boxmuller_sse2 | 3.752 | 5.70x 95 | clt4_sse2 | 3.557 | 6.01x 96 | clt4_avx | 2.730 | 7.83x 97 | boxmuller_avx | 2.253 | 9.49x 98 | null | 1.253 | 17.07x 99 | 100 | ![Corei5-3330S@2.70GHz_mac64_clang6.1_normaldistf_time](result/Corei5-3330S@2.70GHz_mac64_clang6.1_normaldistf_time.png) 101 | 102 | normaldist (double precision): 103 | 104 | Function | Time (ns) | Speedup 105 | ---------------|-----------:|--------: 106 | cpp11random | 32.245 | 1.00x 107 | clt16 | 28.113 | 1.15x 108 | boxmuller | 16.427 | 1.96x 109 | inverse | 14.625 | 2.20x 110 | clt8 | 14.178 | 2.27x 111 | marsagliapolar | 12.837 | 2.51x 112 | clt4 | 7.402 | 4.36x 113 | ziggurat | 7.086 | 4.55x 114 | null | 1.456 | 22.15x 115 | 116 | ![Corei5-3330S@2.70GHz_mac64_clang6.1_normaldist_time](result/Corei5-3330S@2.70GHz_mac64_clang6.1_normaldist_time.png) 117 | 118 | * [Corei5-3330S@2.70GHz_mac64_clang6.1](http://rawgit.com/miloyip/normaldist-benchmark/master/result/Corei5-3330S@2.70GHz_mac64_clang6.1.html) 119 | 120 | ## FAQ 121 | 122 | 1. How to add an implementation? 123 | 124 | You may clone an existing implementation file (e.g. `boxmuller.cpp`) and then modify it. Re-run `premake` to add it to the project or makefile. Note that it will be registered with the benchmark automatically by the macro `REGISTER_TEST(name)`. 125 | 126 | Pull requests for new implementations are welcome. 127 | 128 | ## References 129 | 130 | [1] G. E. P. Box and Mervin E.
Muller, A Note on the Generation of Random Normal Deviates, The Annals of Mathematical Statistics (1958), Vol. 29, No. 2 pp. 610–611. 131 | 132 | [2] Marsaglia, George, and Thomas A. Bray. "A convenient method for generating normal variables." SIAM review 6.3 (1964): 260-264. 133 | 134 | [3] Marsaglia, George, and Wai Wan Tsang. "The ziggurat method for generating random variables." Journal of statistical software 5.8 (2000): 1-7. 135 | 136 | ## Related Benchmarks and Discussions 137 | 138 | * [Generate random numbers following a normal distribution in C/C++](http://stackoverflow.com/questions/2325472/generate-random-numbers-following-a-normal-distribution-in-c-c) 139 | -------------------------------------------------------------------------------- /result/template.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 187 | 208 | 209 | 210 |

211 | 242 | 245 |

246 |

Source CSV

247 |

248 | <?php include $argv[1] ?>
249 |

250 |

251 | 262 | 268 | 269 | 270 | -------------------------------------------------------------------------------- /result/Corei5-3330S@2.70GHz_mac64_clang6.1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 181 | 202 | 203 | 204 |

205 | 236 | 239 |

240 |

Source CSV

241 |

242 | Type,Function,Time(ns)
243 | normaldistf,boxmuller,10.548000
244 | normaldistf,boxmuller_avx,2.253000
245 | normaldistf,boxmuller_sse2,3.752000
246 | normaldistf,clt4,5.542000
247 | normaldistf,clt8,10.683000
248 | normaldistf,clt16,21.384000
249 | normaldistf,clt4_avx,2.730000
250 | normaldistf,clt8_avx,7.636000
251 | normaldistf,clt16_avx,16.295000
252 | normaldistf,clt4_sse2,3.557000
253 | normaldistf,clt8_sse2,7.056000
254 | normaldistf,clt16_sse2,14.585000
255 | normaldistf,cpp11random,18.642000
256 | normaldistf,inverse,13.090000
257 | normaldistf,marsagliapolar,10.926000
258 | normaldistf,null,1.253000
259 | normaldistf,ziggurat,6.731000
260 | normaldist,boxmuller,16.427000
261 | normaldist,clt4,7.402000
262 | normaldist,clt8,14.178000
263 | normaldist,clt16,28.113000
264 | normaldist,cpp11random,32.245000
265 | normaldist,inverse,14.625000
266 | normaldist,marsagliapolar,12.837000
267 | normaldist,null,1.456000
268 | normaldist,ziggurat,7.086000
269 |

270 |

271 | 282 | 288 | 289 | 290 | -------------------------------------------------------------------------------- /src/ziggurat.cpp: -------------------------------------------------------------------------------- 1 | // MILO { 2 | #include "test.h" 3 | #include "lcg.h" 4 | // MILO } 5 | 6 | /* gauss.c - gaussian random numbers, using the Ziggurat method 7 | * 8 | * Copyright (C) 2005 Jochen Voss. 9 | * 10 | * For details see the following article. 11 | * 12 | * George Marsaglia, Wai Wan Tsang 13 | * The Ziggurat Method for Generating Random Variables 14 | * Journal of Statistical Software, vol. 5 (2000), no. 8 15 | * http://www.jstatsoft.org/v05/i08/ 16 | * 17 | * This program is free software; you can redistribute it and/or modify 18 | * it under the terms of the GNU General Public License as published by 19 | * the Free Software Foundation; either version 2 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * This program is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU General Public License for more details. 
26 | * 27 | * You should have received a copy of the GNU General Public License 28 | * along with this program; if not, write to the Free Software 29 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 30 | * 31 | * $Id: gauss.c 6739 2005-11-12 02:47:20Z voss $ 32 | */ 33 | 34 | // MILO { 35 | //#include 36 | #include 37 | // MILO } 38 | #include 39 | 40 | // MILO { 41 | // #include 42 | // MILO } 43 | 44 | /* position of right-most step */ 45 | #define PARAM_R 3.44428647676 46 | 47 | /* tabulated values for the heigt of the Ziggurat levels */ 48 | static const double ytab[128] = { 49 | 1, 0.963598623011, 0.936280813353, 0.913041104253, 50 | 0.892278506696, 0.873239356919, 0.855496407634, 0.838778928349, 51 | 0.822902083699, 0.807732738234, 0.793171045519, 0.779139726505, 52 | 0.765577436082, 0.752434456248, 0.739669787677, 0.727249120285, 53 | 0.715143377413, 0.703327646455, 0.691780377035, 0.68048276891, 54 | 0.669418297233, 0.65857233912, 0.647931876189, 0.637485254896, 55 | 0.62722199145, 0.617132611532, 0.607208517467, 0.597441877296, 56 | 0.587825531465, 0.578352913803, 0.569017984198, 0.559815170911, 57 | 0.550739320877, 0.541785656682, 0.532949739145, 0.524227434628, 58 | 0.515614886373, 0.507108489253, 0.498704867478, 0.490400854812, 59 | 0.482193476986, 0.47407993601, 0.466057596125, 0.458123971214, 60 | 0.450276713467, 0.442513603171, 0.434832539473, 0.427231532022, 61 | 0.419708693379, 0.41226223212, 0.404890446548, 0.397591718955, 62 | 0.390364510382, 0.383207355816, 0.376118859788, 0.369097692334, 63 | 0.362142585282, 0.355252328834, 0.348425768415, 0.341661801776, 64 | 0.334959376311, 0.328317486588, 0.321735172063, 0.31521151497, 65 | 0.308745638367, 0.302336704338, 0.29598391232, 0.289686497571, 66 | 0.283443729739, 0.27725491156, 0.271119377649, 0.265036493387, 67 | 0.259005653912, 0.253026283183, 0.247097833139, 0.241219782932, 68 | 0.235391638239, 0.229612930649, 0.223883217122, 0.218202079518, 69 | 0.212569124201, 
0.206983981709, 0.201446306496, 0.195955776745, 70 | 0.190512094256, 0.185114984406, 0.179764196185, 0.174459502324, 71 | 0.169200699492, 0.1639876086, 0.158820075195, 0.153697969964, 72 | 0.148621189348, 0.143589656295, 0.138603321143, 0.133662162669, 73 | 0.128766189309, 0.123915440582, 0.119109988745, 0.114349940703, 74 | 0.10963544023, 0.104966670533, 0.100343857232, 0.0957672718266, 75 | 0.0912372357329, 0.0867541250127, 0.082318375932, 0.0779304915295, 76 | 0.0735910494266, 0.0693007111742, 0.065060233529, 0.0608704821745, 77 | 0.056732448584, 0.05264727098, 0.0486162607163, 0.0446409359769, 78 | 0.0407230655415, 0.0368647267386, 0.0330683839378, 0.0293369977411, 79 | 0.0256741818288, 0.0220844372634, 0.0185735200577, 0.0151490552854, 80 | 0.0118216532614, 0.00860719483079, 0.00553245272614, 0.00265435214565 81 | }; 82 | 83 | // MILO { 84 | static const float ytabf[128] = { 85 | 1.0f, 0.963598623011f, 0.936280813353f, 0.913041104253f, 86 | 0.892278506696f, 0.873239356919f, 0.855496407634f, 0.838778928349f, 87 | 0.822902083699f, 0.807732738234f, 0.793171045519f, 0.779139726505f, 88 | 0.765577436082f, 0.752434456248f, 0.739669787677f, 0.727249120285f, 89 | 0.715143377413f, 0.703327646455f, 0.691780377035f, 0.68048276891f, 90 | 0.669418297233f, 0.65857233912f, 0.647931876189f, 0.637485254896f, 91 | 0.62722199145f, 0.617132611532f, 0.607208517467f, 0.597441877296f, 92 | 0.587825531465f, 0.578352913803f, 0.569017984198f, 0.559815170911f, 93 | 0.550739320877f, 0.541785656682f, 0.532949739145f, 0.524227434628f, 94 | 0.515614886373f, 0.507108489253f, 0.498704867478f, 0.490400854812f, 95 | 0.482193476986f, 0.47407993601f, 0.466057596125f, 0.458123971214f, 96 | 0.450276713467f, 0.442513603171f, 0.434832539473f, 0.427231532022f, 97 | 0.419708693379f, 0.41226223212f, 0.404890446548f, 0.397591718955f, 98 | 0.390364510382f, 0.383207355816f, 0.376118859788f, 0.369097692334f, 99 | 0.362142585282f, 0.355252328834f, 0.348425768415f, 0.341661801776f, 100 | 0.334959376311f, 
0.328317486588f, 0.321735172063f, 0.31521151497f, 101 | 0.308745638367f, 0.302336704338f, 0.29598391232f, 0.289686497571f, 102 | 0.283443729739f, 0.27725491156f, 0.271119377649f, 0.265036493387f, 103 | 0.259005653912f, 0.253026283183f, 0.247097833139f, 0.241219782932f, 104 | 0.235391638239f, 0.229612930649f, 0.223883217122f, 0.218202079518f, 105 | 0.212569124201f, 0.206983981709f, 0.201446306496f, 0.195955776745f, 106 | 0.190512094256f, 0.185114984406f, 0.179764196185f, 0.174459502324f, 107 | 0.169200699492f, 0.1639876086f, 0.158820075195f, 0.153697969964f, 108 | 0.148621189348f, 0.143589656295f, 0.138603321143f, 0.133662162669f, 109 | 0.128766189309f, 0.123915440582f, 0.119109988745f, 0.114349940703f, 110 | 0.10963544023f, 0.104966670533f, 0.100343857232f, 0.0957672718266f, 111 | 0.0912372357329f, 0.0867541250127f, 0.082318375932f, 0.0779304915295f, 112 | 0.0735910494266f, 0.0693007111742f, 0.065060233529f, 0.0608704821745f, 113 | 0.056732448584f, 0.05264727098f, 0.0486162607163f, 0.0446409359769f, 114 | 0.0407230655415f, 0.0368647267386f, 0.0330683839378f, 0.0293369977411f, 115 | 0.0256741818288f, 0.0220844372634f, 0.0185735200577f, 0.0151490552854f, 116 | 0.0118216532614f, 0.00860719483079f, 0.00553245272614f, 0.00265435214565f 117 | }; 118 | // MILO } 119 | 120 | /* tabulated values for 2^24 times x[i]/x[i+1], 121 | * used to accept for U*x[i+1]<=x[i] without any floating point operations */ 122 | static const unsigned long ktab[128] = { 123 | 0, 12590644, 14272653, 14988939, 124 | 15384584, 15635009, 15807561, 15933577, 125 | 16029594, 16105155, 16166147, 16216399, 126 | 16258508, 16294295, 16325078, 16351831, 127 | 16375291, 16396026, 16414479, 16431002, 128 | 16445880, 16459343, 16471578, 16482744, 129 | 16492970, 16502368, 16511031, 16519039, 130 | 16526459, 16533352, 16539769, 16545755, 131 | 16551348, 16556584, 16561493, 16566101, 132 | 16570433, 16574511, 16578353, 16581977, 133 | 16585398, 16588629, 16591685, 16594575, 134 | 16597311, 16599901, 
16602354, 16604679, 135 | 16606881, 16608968, 16610945, 16612818, 136 | 16614592, 16616272, 16617861, 16619363, 137 | 16620782, 16622121, 16623383, 16624570, 138 | 16625685, 16626730, 16627708, 16628619, 139 | 16629465, 16630248, 16630969, 16631628, 140 | 16632228, 16632768, 16633248, 16633671, 141 | 16634034, 16634340, 16634586, 16634774, 142 | 16634903, 16634972, 16634980, 16634926, 143 | 16634810, 16634628, 16634381, 16634066, 144 | 16633680, 16633222, 16632688, 16632075, 145 | 16631380, 16630598, 16629726, 16628757, 146 | 16627686, 16626507, 16625212, 16623794, 147 | 16622243, 16620548, 16618698, 16616679, 148 | 16614476, 16612071, 16609444, 16606571, 149 | 16603425, 16599973, 16596178, 16591995, 150 | 16587369, 16582237, 16576520, 16570120, 151 | 16562917, 16554758, 16545450, 16534739, 152 | 16522287, 16507638, 16490152, 16468907, 153 | 16442518, 16408804, 16364095, 16301683, 154 | 16207738, 16047994, 15704248, 15472926 155 | }; 156 | 157 | /* tabulated values of 2^{-24}*x[i] */ 158 | static const double wtab[128] = { 159 | 1.62318314817e-08, 2.16291505214e-08, 2.54246305087e-08, 2.84579525938e-08, 160 | 3.10340022482e-08, 3.33011726243e-08, 3.53439060345e-08, 3.72152672658e-08, 161 | 3.8950989572e-08, 4.05763964764e-08, 4.21101548915e-08, 4.35664624904e-08, 162 | 4.49563968336e-08, 4.62887864029e-08, 4.75707945735e-08, 4.88083237257e-08, 163 | 5.00063025384e-08, 5.11688950428e-08, 5.22996558616e-08, 5.34016475624e-08, 164 | 5.44775307871e-08, 5.55296344581e-08, 5.65600111659e-08, 5.75704813695e-08, 165 | 5.85626690412e-08, 5.95380306862e-08, 6.04978791776e-08, 6.14434034901e-08, 166 | 6.23756851626e-08, 6.32957121259e-08, 6.42043903937e-08, 6.51025540077e-08, 167 | 6.59909735447e-08, 6.68703634341e-08, 6.77413882848e-08, 6.8604668381e-08, 168 | 6.94607844804e-08, 7.03102820203e-08, 7.11536748229e-08, 7.1991448372e-08, 169 | 7.2824062723e-08, 7.36519550992e-08, 7.44755422158e-08, 7.52952223703e-08, 170 | 7.61113773308e-08, 7.69243740467e-08, 7.77345662086e-08, 
7.85422956743e-08, 171 | 7.93478937793e-08, 8.01516825471e-08, 8.09539758128e-08, 8.17550802699e-08, 172 | 8.25552964535e-08, 8.33549196661e-08, 8.41542408569e-08, 8.49535474601e-08, 173 | 8.57531242006e-08, 8.65532538723e-08, 8.73542180955e-08, 8.8156298059e-08, 174 | 8.89597752521e-08, 8.97649321908e-08, 9.05720531451e-08, 9.138142487e-08, 175 | 9.21933373471e-08, 9.30080845407e-08, 9.38259651738e-08, 9.46472835298e-08, 176 | 9.54723502847e-08, 9.63014833769e-08, 9.71350089201e-08, 9.79732621669e-08, 177 | 9.88165885297e-08, 9.96653446693e-08, 1.00519899658e-07, 1.0138063623e-07, 178 | 1.02247952126e-07, 1.03122261554e-07, 1.04003996769e-07, 1.04893609795e-07, 179 | 1.05791574313e-07, 1.06698387725e-07, 1.07614573423e-07, 1.08540683296e-07, 180 | 1.09477300508e-07, 1.1042504257e-07, 1.11384564771e-07, 1.12356564007e-07, 181 | 1.13341783071e-07, 1.14341015475e-07, 1.15355110887e-07, 1.16384981291e-07, 182 | 1.17431607977e-07, 1.18496049514e-07, 1.19579450872e-07, 1.20683053909e-07, 183 | 1.21808209468e-07, 1.2295639141e-07, 1.24129212952e-07, 1.25328445797e-07, 184 | 1.26556042658e-07, 1.27814163916e-07, 1.29105209375e-07, 1.30431856341e-07, 185 | 1.31797105598e-07, 1.3320433736e-07, 1.34657379914e-07, 1.36160594606e-07, 186 | 1.37718982103e-07, 1.39338316679e-07, 1.41025317971e-07, 1.42787873535e-07, 187 | 1.44635331499e-07, 1.4657889173e-07, 1.48632138436e-07, 1.50811780719e-07, 188 | 1.53138707402e-07, 1.55639532047e-07, 1.58348931426e-07, 1.61313325908e-07, 189 | 1.64596952856e-07, 1.68292495203e-07, 1.72541128694e-07, 1.77574279496e-07, 190 | 1.83813550477e-07, 1.92166040885e-07, 2.05295471952e-07, 2.22600839893e-07 191 | }; 192 | 193 | // MILO { 194 | static const float wtabf[128] = { 195 | 1.62318314817e-08f, 2.16291505214e-08f, 2.54246305087e-08f, 2.84579525938e-08f, 196 | 3.10340022482e-08f, 3.33011726243e-08f, 3.53439060345e-08f, 3.72152672658e-08f, 197 | 3.8950989572e-08f, 4.05763964764e-08f, 4.21101548915e-08f, 4.35664624904e-08f, 198 | 
4.49563968336e-08f, 4.62887864029e-08f, 4.75707945735e-08f, 4.88083237257e-08f, 199 | 5.00063025384e-08f, 5.11688950428e-08f, 5.22996558616e-08f, 5.34016475624e-08f, 200 | 5.44775307871e-08f, 5.55296344581e-08f, 5.65600111659e-08f, 5.75704813695e-08f, 201 | 5.85626690412e-08f, 5.95380306862e-08f, 6.04978791776e-08f, 6.14434034901e-08f, 202 | 6.23756851626e-08f, 6.32957121259e-08f, 6.42043903937e-08f, 6.51025540077e-08f, 203 | 6.59909735447e-08f, 6.68703634341e-08f, 6.77413882848e-08f, 6.8604668381e-08f, 204 | 6.94607844804e-08f, 7.03102820203e-08f, 7.11536748229e-08f, 7.1991448372e-08f, 205 | 7.2824062723e-08f, 7.36519550992e-08f, 7.44755422158e-08f, 7.52952223703e-08f, 206 | 7.61113773308e-08f, 7.69243740467e-08f, 7.77345662086e-08f, 7.85422956743e-08f, 207 | 7.93478937793e-08f, 8.01516825471e-08f, 8.09539758128e-08f, 8.17550802699e-08f, 208 | 8.25552964535e-08f, 8.33549196661e-08f, 8.41542408569e-08f, 8.49535474601e-08f, 209 | 8.57531242006e-08f, 8.65532538723e-08f, 8.73542180955e-08f, 8.8156298059e-08f, 210 | 8.89597752521e-08f, 8.97649321908e-08f, 9.05720531451e-08f, 9.138142487e-08f, 211 | 9.21933373471e-08f, 9.30080845407e-08f, 9.38259651738e-08f, 9.46472835298e-08f, 212 | 9.54723502847e-08f, 9.63014833769e-08f, 9.71350089201e-08f, 9.79732621669e-08f, 213 | 9.88165885297e-08f, 9.96653446693e-08f, 1.00519899658e-07f, 1.0138063623e-07f, 214 | 1.02247952126e-07f, 1.03122261554e-07f, 1.04003996769e-07f, 1.04893609795e-07f, 215 | 1.05791574313e-07f, 1.06698387725e-07f, 1.07614573423e-07f, 1.08540683296e-07f, 216 | 1.09477300508e-07f, 1.1042504257e-07f, 1.11384564771e-07f, 1.12356564007e-07f, 217 | 1.13341783071e-07f, 1.14341015475e-07f, 1.15355110887e-07f, 1.16384981291e-07f, 218 | 1.17431607977e-07f, 1.18496049514e-07f, 1.19579450872e-07f, 1.20683053909e-07f, 219 | 1.21808209468e-07f, 1.2295639141e-07f, 1.24129212952e-07f, 1.25328445797e-07f, 220 | 1.26556042658e-07f, 1.27814163916e-07f, 1.29105209375e-07f, 1.30431856341e-07f, 221 | 1.31797105598e-07f, 
1.3320433736e-07f, 1.34657379914e-07f, 1.36160594606e-07f, 222 | 1.37718982103e-07f, 1.39338316679e-07f, 1.41025317971e-07f, 1.42787873535e-07f, 223 | 1.44635331499e-07f, 1.4657889173e-07f, 1.48632138436e-07f, 1.50811780719e-07f, 224 | 1.53138707402e-07f, 1.55639532047e-07f, 1.58348931426e-07f, 1.61313325908e-07f, 225 | 1.64596952856e-07f, 1.68292495203e-07f, 1.72541128694e-07f, 1.77574279496e-07f, 226 | 1.83813550477e-07f, 1.92166040885e-07f, 2.05295471952e-07f, 2.22600839893e-07f 227 | }; 228 | // MILO } 229 | 230 | // MILO { 231 | #if 0 232 | static unsigned long 233 | gsl_rng_uint32 (gsl_rng *r) 234 | /* the uniform distribution on 0..2^{32}-1 */ 235 | { 236 | unsigned long min = gsl_rng_min (r); 237 | unsigned long max = gsl_rng_max (r); 238 | 239 | if (min == 0 && max == 4294967295U) { /* we have full 32 bit values */ 240 | return gsl_rng_get (r); 241 | } else { 242 | assert (max-min >= 65536); /* make sure we have at least 16 bit */ 243 | unsigned long a = (gsl_rng_get (r)-min)&0xFFFF; 244 | unsigned long b = (gsl_rng_get (r)-min)&0xFFFF; 245 | return (a<<16)|b; 246 | } 247 | } 248 | #endif 249 | 250 | template 251 | static double gsl_rng_uniform(RNGReal& r) { 252 | return r(); 253 | } 254 | // MILO } 255 | 256 | // MILO { 257 | //double 258 | //gsl_ran_gaussian_ziggurat (gsl_rng *r, double sigma) 259 | template 260 | double ziggurat(RNGReal& r, RNGInt& ri) 261 | { 262 | double sigma = 1; 263 | // MILO } 264 | unsigned long U, sign, i, j; 265 | double x, y; 266 | 267 | for(;;) { 268 | // MILO { 269 | //U = gsl_rng_uint32 (r); 270 | U = ri(); 271 | // MILO } 272 | i = U & 0x0000007F; /* 7 bit to choose the step */ 273 | sign = U & 0x00000080; /* 1 bit for the sign */ 274 | j = U>>8; /* 24 bit for the x-value */ 275 | 276 | x = j*wtab[i]; 277 | if (j < ktab[i]) break; 278 | 279 | if (i<127) { 280 | double y0, y1; 281 | y0 = ytab[i]; 282 | y1 = ytab[i+1]; 283 | y = y1+(y0-y1)*gsl_rng_uniform(r); 284 | } else { 285 | x = PARAM_R - 
log(1.0-gsl_rng_uniform(r))/PARAM_R; 286 | y = exp(-PARAM_R*(x-0.5*PARAM_R))*gsl_rng_uniform(r); 287 | } 288 | if (y < exp(-0.5*x*x)) break; 289 | } 290 | return sign ? sigma*x : -sigma*x; 291 | } 292 | 293 | // MILO { 294 | template 295 | float zigguratf(RNGReal& r, RNGInt& ri) 296 | { 297 | uint32_t U, sign, i, j; 298 | float x, y; 299 | 300 | for(;;) { 301 | U = ri(); 302 | i = U & 0x0000007F; /* 7 bit to choose the step */ 303 | sign = U & 0x00000080; /* 1 bit for the sign */ 304 | j = U >> 8; /* 24 bit for the x-value */ 305 | 306 | x = j * wtabf[i]; 307 | if (j < ktab[i]) break; 308 | 309 | if (i < 127) { 310 | float y0, y1; 311 | y0 = ytabf[i]; 312 | y1 = ytabf[i+1]; 313 | y = y1 + (y0 - y1) * r(); 314 | } else { 315 | x = float(PARAM_R) - std::log(1.0f - r()) / float(PARAM_R); 316 | y = std::exp(float(-PARAM_R) * (x - 0.5f * float(PARAM_R))) * r(); 317 | } 318 | if (y < std::exp(-0.5f * x * x)) break; 319 | } 320 | return sign ? x : -x; 321 | } 322 | // MILO } 323 | 324 | // MILO { 325 | 326 | static void normaldistf_ziggurat(float* data, size_t count) { 327 | LCG r; 328 | LCG ri; 329 | for (size_t i = 0; i < count; i++) 330 | data[i] = zigguratf(r, ri); 331 | } 332 | 333 | static void normaldist_ziggurat(double* data, size_t count) { 334 | LCG r; 335 | LCG ri; 336 | for (size_t i = 0; i < count; i++) 337 | data[i] = ziggurat(r, ri); 338 | } 339 | 340 | REGISTER_TEST(ziggurat); 341 | 342 | // MILO } -------------------------------------------------------------------------------- /src/sse_mathfun.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log 4 | 5 | Inspired by Intel Approximate Math library, and based on the 6 | corresponding algorithms of the cephes math library 7 | 8 | The default is to use the SSE1 version. If you define USE_SSE2 the 9 | the SSE2 intrinsics will be used in place of the MMX intrinsics. 
Do 10 | not expect any significant performance improvement with SSE2. 11 | */ 12 | 13 | /* Copyright (C) 2007 Julien Pommier 14 | 15 | This software is provided 'as-is', without any express or implied 16 | warranty. In no event will the authors be held liable for any damages 17 | arising from the use of this software. 18 | 19 | Permission is granted to anyone to use this software for any purpose, 20 | including commercial applications, and to alter it and redistribute it 21 | freely, subject to the following restrictions: 22 | 23 | 1. The origin of this software must not be misrepresented; you must not 24 | claim that you wrote the original software. If you use this software 25 | in a product, an acknowledgment in the product documentation would be 26 | appreciated but is not required. 27 | 2. Altered source versions must be plainly marked as such, and must not be 28 | misrepresented as being the original software. 29 | 3. This notice may not be removed or altered from any source distribution. 30 | 31 | (this is the zlib license) 32 | */ 33 | 34 | #include 35 | 36 | /* yes I know, the top of this file is quite ugly */ 37 | 38 | #ifdef _MSC_VER /* visual c++ */ 39 | # define ALIGN16_BEG __declspec(align(16)) 40 | # define ALIGN16_END 41 | #else /* gcc or icc */ 42 | # define ALIGN16_BEG 43 | # define ALIGN16_END __attribute__((aligned(16))) 44 | #endif 45 | 46 | /* __m128 is ugly to write */ 47 | typedef __m128 v4sf; // vector of 4 float (sse1) 48 | 49 | #ifdef USE_SSE2 50 | # include 51 | typedef __m128i v4si; // vector of 4 int (sse2) 52 | #else 53 | typedef __m64 v2si; // vector of 2 int (mmx) 54 | #endif 55 | 56 | /* declare some SSE constants -- why can't I figure a better way to do that? 
*/ 57 | #define _PS_CONST(Name, Val) \ 58 | static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 59 | #define _PI32_CONST(Name, Val) \ 60 | static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 61 | #define _PS_CONST_TYPE(Name, Type, Val) \ 62 | static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 63 | 64 | _PS_CONST(1 , 1.0f); 65 | _PS_CONST(0p5, 0.5f); 66 | /* the smallest non denormalized float number */ 67 | _PS_CONST_TYPE(min_norm_pos, int, 0x00800000); 68 | _PS_CONST_TYPE(mant_mask, int, 0x7f800000); 69 | _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); 70 | 71 | _PS_CONST_TYPE(sign_mask, int, (int)0x80000000); 72 | _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); 73 | 74 | _PI32_CONST(1, 1); 75 | _PI32_CONST(inv1, ~1); 76 | _PI32_CONST(2, 2); 77 | _PI32_CONST(4, 4); 78 | _PI32_CONST(0x7f, 0x7f); 79 | 80 | _PS_CONST(cephes_SQRTHF, 0.707106781186547524f); 81 | _PS_CONST(cephes_log_p0, 7.0376836292E-2f); 82 | _PS_CONST(cephes_log_p1, - 1.1514610310E-1f); 83 | _PS_CONST(cephes_log_p2, 1.1676998740E-1f); 84 | _PS_CONST(cephes_log_p3, - 1.2420140846E-1f); 85 | _PS_CONST(cephes_log_p4, + 1.4249322787E-1f); 86 | _PS_CONST(cephes_log_p5, - 1.6668057665E-1f); 87 | _PS_CONST(cephes_log_p6, + 2.0000714765E-1f); 88 | _PS_CONST(cephes_log_p7, - 2.4999993993E-1f); 89 | _PS_CONST(cephes_log_p8, + 3.3333331174E-1f); 90 | _PS_CONST(cephes_log_q1, -2.12194440e-4f); 91 | _PS_CONST(cephes_log_q2, 0.693359375f); 92 | 93 | #ifndef USE_SSE2 94 | typedef union xmm_mm_union { 95 | __m128 xmm; 96 | __m64 mm[2]; 97 | } xmm_mm_union; 98 | 99 | #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ 100 | xmm_mm_union u; u.xmm = xmm_; \ 101 | mm0_ = u.mm[0]; \ 102 | mm1_ = u.mm[1]; \ 103 | } 104 | 105 | #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ 106 | xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ 107 | } 108 | 109 | #endif // USE_SSE2 110 | 111 | /* natural logarithm computed for 4 simultaneous 
float
   return NaN for x <= 0

   Steps, as implemented below:
     - lanes with x <= 0 are recorded in invalid_mask and OR-ed with all-ones
       at the very end, which turns them into NaN;
     - x is clamped from below to the bit pattern in _ps_min_norm_pos;
     - the biased exponent is taken with a 23-bit right shift, then 0x7f is
       subtracted and 1 is added;
     - the exponent bits of x are cleared and replaced by those of 0.5f;
     - a degree-8 polynomial in x (coefficients cephes_log_p0..p8) is
       evaluated and combined with e * (q1 + q2).
   Without USE_SSE2 the integer steps run on MMX registers and _mm_empty()
   is called before floating-point work resumes.
*/
inline v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* all-ones in every lane where x <= 0 */
  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  /* branch-free form: tmp is x where x < SQRTHF and 0 elsewhere, so adding
     it after the subtraction yields x + x - 1 only in the selected lanes */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);


  v4sf z = _mm_mul_ps(x,x);

  /* polynomial in x, highest-order coefficient first */
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  /* y += e * q1 (the small part of ln 2) */
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  /* y -= 0.5 * x^2 */
  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  /* result = x + y + e * q2 (the large part of ln 2) */
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}

/* exp_ps clamps its argument to [exp_lo, exp_hi] before doing anything else */
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341f);
/* C1 + C2 = 0.69314718056, i.e. ln(2) split into a large and a small part */
_PS_CONST(cephes_exp_C1, 0.693359375f);
_PS_CONST(cephes_exp_C2, -2.12194440e-4f);

/* polynomial coefficients consumed in order p0..p5 by exp_ps */
_PS_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1f);

/* exponential computed for 4 simultaneous floats

   Steps, as implemented below:
     - x is clamped to [exp_lo, exp_hi];
     - n = floor(x * LOG2EF + 0.5) is computed by truncating to integer and
       subtracting 1 wherever the truncated value is greater than the input;
     - x is reduced by n * C1 and n * C2;
     - a polynomial in the reduced x (coefficients cephes_exp_p0..p5) is
       evaluated;
     - the result is multiplied by 2^n, built by placing n + 0x7f in the
       exponent field (left shift by 23).
   Without USE_SSE2 the integer steps run on MMX registers and _mm_empty()
   is called before the final multiply.
*/
inline v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);   /* upper two lanes of fx moved down */
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp  = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);   /* 1.0f in the selected lanes, 0 elsewhere */
  fx = _mm_sub_ps(tmp, mask);

  /* x -= fx * C1; x -= fx * C2 */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  /* polynomial in x, highest-order coefficient first */
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}

_PS_CONST(minus_cephes_DP1, -0.78515625f);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
_PS_CONST(sincof_p0, -1.9515295891E-4f);
_PS_CONST(sincof_p1,  8.3321608736E-3f);
_PS_CONST(sincof_p2, -1.6666654611E-1f);
_PS_CONST(coscof_p0,  2.443315711809948E-005f);
_PS_CONST(coscof_p1, -1.388731625493765E-003f);
_PS_CONST(coscof_p2,  4.166664568298827E-002f);
_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it also runs on old Athlon XPs and the Pentium III of your
   grandmother.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
   too bad for an SSE1 function (with no special tuning)!
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching to an
   SSE2 version would improve the perf by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
333 | */ 334 | inline v4sf sin_ps(v4sf x) { // any x 335 | v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; 336 | 337 | #ifdef USE_SSE2 338 | v4si emm0, emm2; 339 | #else 340 | v2si mm0, mm1, mm2, mm3; 341 | #endif 342 | sign_bit = x; 343 | /* take the absolute value */ 344 | x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 345 | /* extract the sign bit (upper one) */ 346 | sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); 347 | 348 | /* scale by 4/Pi */ 349 | y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 350 | 351 | #ifdef USE_SSE2 352 | /* store the integer part of y in mm0 */ 353 | emm2 = _mm_cvttps_epi32(y); 354 | /* j=(j+1) & (~1) (see the cephes sources) */ 355 | emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 356 | emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 357 | y = _mm_cvtepi32_ps(emm2); 358 | 359 | /* get the swap sign flag */ 360 | emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); 361 | emm0 = _mm_slli_epi32(emm0, 29); 362 | /* get the polynom selection mask 363 | there is one polynom for 0 <= x <= Pi/4 364 | and another one for Pi/4 35 | 36 | /* yes I know, the top of this file is quite ugly */ 37 | #ifdef _MSC_VER /* visual c++ */ 38 | # define ALIGN32_BEG __declspec(align(32)) 39 | # define ALIGN32_END 40 | #else /* gcc or icc */ 41 | # define ALIGN32_BEG 42 | # define ALIGN32_END __attribute__((aligned(32))) 43 | #endif 44 | 45 | /* __m128 is ugly to write */ 46 | typedef __m256 v8sf; // vector of 8 float (avx) 47 | typedef __m256i v8si; // vector of 8 int (avx) 48 | typedef __m128i v4si; // vector of 8 int (avx) 49 | 50 | #define _PI32AVX_CONST(Name, Val) \ 51 | static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val } 52 | 53 | _PI32AVX_CONST(1, 1); 54 | _PI32AVX_CONST(inv1, ~1); 55 | _PI32AVX_CONST(2, 2); 56 | _PI32AVX_CONST(4, 4); 57 | 58 | 59 | /* declare some AVX constants -- why can't I figure a better way to do that? 
*/
/* Each macro below declares a file-local, 32-byte aligned array of eight
   identical elements so it can be loaded as one 256-bit vector:
     _PS256_CONST(Name, Val)            -> float _ps256_<Name>[8]
     _PI32_CONST256(Name, Val)          -> int   _pi32_256_<Name>[8]
     _PS256_CONST_TYPE(Name, Type, Val) -> Type  _ps256_<Name>[8]  (bit
                                           patterns later read back as floats) */
#define _PS256_CONST(Name, Val)                                            \
  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val)                                            \
  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val)                                 \
  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }

_PS256_CONST(1  , 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
/* NOTE: despite its name, 0x7f800000 has the eight exponent bits of a
   single-precision float set; inv_mant_mask therefore keeps the sign and
   fraction bits and clears the exponent. */
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

/* bit 31 only, and every bit except bit 31 */
_PS256_CONST_TYPE(sign_mask, unsigned, 0x80000000u);
_PS256_CONST_TYPE(inv_sign_mask, unsigned, ~0x80000000u);

_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);  /* exponent bias subtracted/added in log256_ps and exp256_ps */

/* sqrt(1/2): threshold used by log256_ps when normalising the mantissa */
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f);
/* polynomial coefficients consumed in order p0..p8 by log256_ps */
_PS256_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS256_CONST(cephes_log_p1, - 1.1514610310E-1f);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS256_CONST(cephes_log_p3, - 1.2420140846E-1f);
_PS256_CONST(cephes_log_p4, + 1.4249322787E-1f);
_PS256_CONST(cephes_log_p5, - 1.6668057665E-1f);
_PS256_CONST(cephes_log_p6, + 2.0000714765E-1f);
_PS256_CONST(cephes_log_p7, - 2.4999993993E-1f);
_PS256_CONST(cephes_log_p8, + 3.3333331174E-1f);
/* q1 + q2 = 0.69314718056, i.e. ln(2) split into a small and a large part */
_PS256_CONST(cephes_log_q1, -2.12194440e-4f);
_PS256_CONST(cephes_log_q2, 0.693359375f);

/* Everything from here to the matching #endif is compiled only when the
   compiler does not define __AVX2__. */
#ifndef __AVX2__

/* lets one 256-bit integer vector be accessed as two 128-bit halves */
typedef union imm_xmm_union {
  v8si imm;
  v4si xmm[2];
} imm_xmm_union;

/* split imm_ into xmm0_ (first half) and xmm1_ (second half) */
#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) {    \
    imm_xmm_union u; \
    u.imm = imm_;                                \
    xmm0_ = u.xmm[0];                            \
    xmm1_ = u.xmm[1];                            \
}

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \ 112 | imm_xmm_union u; \ 113 | u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \ 114 | } 115 | 116 | 117 | #define AVX2_BITOP_USING_SSE2(fn) \ 118 | inline v8si _mm256_##fn##_sse2(v8si x, int a) \ 119 | { \ 120 | /* use SSE2 instruction to perform the bitop AVX2 */ \ 121 | v4si x1, x2; \ 122 | v8si ret; \ 123 | COPY_IMM_TO_XMM(x, x1, x2); \ 124 | x1 = _mm_##fn(x1,a); \ 125 | x2 = _mm_##fn(x2,a); \ 126 | COPY_XMM_TO_IMM(x1, x2, ret); \ 127 | return(ret); \ 128 | } 129 | 130 | //#warning "Using SSE2 to perform AVX2 bitshift ops" 131 | AVX2_BITOP_USING_SSE2(slli_epi32) 132 | 133 | #define AVX2_INTOP_USING_SSE2(fn) \ 134 | inline v8si _mm256_##fn##_sse2(v8si x, v8si y) \ 135 | { \ 136 | /* use SSE2 instructions to perform the AVX2 integer operation */ \ 137 | v4si x1, x2; \ 138 | v4si y1, y2; \ 139 | v8si ret; \ 140 | COPY_IMM_TO_XMM(x, x1, x2); \ 141 | COPY_IMM_TO_XMM(y, y1, y2); \ 142 | x1 = _mm_##fn(x1,y1); \ 143 | x2 = _mm_##fn(x2,y2); \ 144 | COPY_XMM_TO_IMM(x1, x2, ret); \ 145 | return(ret); \ 146 | } 147 | 148 | //#warning "Using SSE2 to perform AVX2 integer ops" 149 | AVX2_INTOP_USING_SSE2(and_si128) 150 | AVX2_INTOP_USING_SSE2(andnot_si128) 151 | AVX2_INTOP_USING_SSE2(cmpeq_epi32) 152 | AVX2_INTOP_USING_SSE2(sub_epi32) 153 | AVX2_INTOP_USING_SSE2(add_epi32) 154 | 155 | #endif /* __AVX2__ */ 156 | 157 | 158 | /* natural logarithm computed for 8 simultaneous float 159 | return NaN for x <= 0 160 | */ 161 | inline v8sf log256_ps(v8sf x) { 162 | v8si imm0; 163 | v8sf one = *(v8sf*)_ps256_1; 164 | 165 | //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); 166 | v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); 167 | 168 | x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ 169 | 170 | // can be done with AVX2 171 | imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23); 172 | 173 | /* keep only the fractional part */ 174 | x = _mm256_and_ps(x, 
*(v8sf*)_ps256_inv_mant_mask); 175 | x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); 176 | 177 | // this is again another AVX2 instruction 178 | imm0 = _mm256_sub_epi32_sse2(imm0, *(v8si*)_pi32_256_0x7f); 179 | v8sf e = _mm256_cvtepi32_ps(imm0); 180 | 181 | e = _mm256_add_ps(e, one); 182 | 183 | /* part2: 184 | if( x < SQRTHF ) { 185 | e -= 1; 186 | x = x + x - 1.0; 187 | } else { x = x - 1.0; } 188 | */ 189 | //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); 190 | v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); 191 | v8sf tmp = _mm256_and_ps(x, mask); 192 | x = _mm256_sub_ps(x, one); 193 | e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); 194 | x = _mm256_add_ps(x, tmp); 195 | 196 | v8sf z = _mm256_mul_ps(x,x); 197 | 198 | v8sf y = *(v8sf*)_ps256_cephes_log_p0; 199 | y = _mm256_mul_ps(y, x); 200 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); 201 | y = _mm256_mul_ps(y, x); 202 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); 203 | y = _mm256_mul_ps(y, x); 204 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); 205 | y = _mm256_mul_ps(y, x); 206 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); 207 | y = _mm256_mul_ps(y, x); 208 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); 209 | y = _mm256_mul_ps(y, x); 210 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); 211 | y = _mm256_mul_ps(y, x); 212 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); 213 | y = _mm256_mul_ps(y, x); 214 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); 215 | y = _mm256_mul_ps(y, x); 216 | 217 | y = _mm256_mul_ps(y, z); 218 | 219 | tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); 220 | y = _mm256_add_ps(y, tmp); 221 | 222 | 223 | tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); 224 | y = _mm256_sub_ps(y, tmp); 225 | 226 | tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); 227 | x = _mm256_add_ps(x, y); 228 | x = _mm256_add_ps(x, tmp); 229 | x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN 230 | return x; 231 | } 
232 | 233 | _PS256_CONST(exp_hi, 88.3762626647949f); 234 | _PS256_CONST(exp_lo, -88.3762626647949f); 235 | 236 | _PS256_CONST(cephes_LOG2EF, 1.44269504088896341f); 237 | _PS256_CONST(cephes_exp_C1, 0.693359375f); 238 | _PS256_CONST(cephes_exp_C2, -2.12194440e-4f); 239 | 240 | _PS256_CONST(cephes_exp_p0, 1.9875691500E-4f); 241 | _PS256_CONST(cephes_exp_p1, 1.3981999507E-3f); 242 | _PS256_CONST(cephes_exp_p2, 8.3334519073E-3f); 243 | _PS256_CONST(cephes_exp_p3, 4.1665795894E-2f); 244 | _PS256_CONST(cephes_exp_p4, 1.6666665459E-1f); 245 | _PS256_CONST(cephes_exp_p5, 5.0000001201E-1f); 246 | 247 | inline v8sf exp256_ps(v8sf x) { 248 | v8sf tmp = _mm256_setzero_ps(), fx; 249 | v8si imm0; 250 | v8sf one = *(v8sf*)_ps256_1; 251 | 252 | x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); 253 | x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); 254 | 255 | /* express exp(x) as exp(g + n*log(2)) */ 256 | fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); 257 | fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); 258 | 259 | /* how to perform a floorf with SSE: just below */ 260 | //imm0 = _mm256_cvttps_epi32(fx); 261 | //tmp = _mm256_cvtepi32_ps(imm0); 262 | 263 | tmp = _mm256_floor_ps(fx); 264 | 265 | /* if greater, substract 1 */ 266 | //v8sf mask = _mm256_cmpgt_ps(tmp, fx); 267 | v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); 268 | mask = _mm256_and_ps(mask, one); 269 | fx = _mm256_sub_ps(tmp, mask); 270 | 271 | tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); 272 | v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); 273 | x = _mm256_sub_ps(x, tmp); 274 | x = _mm256_sub_ps(x, z); 275 | 276 | z = _mm256_mul_ps(x,x); 277 | 278 | v8sf y = *(v8sf*)_ps256_cephes_exp_p0; 279 | y = _mm256_mul_ps(y, x); 280 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); 281 | y = _mm256_mul_ps(y, x); 282 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); 283 | y = _mm256_mul_ps(y, x); 284 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); 285 | y = _mm256_mul_ps(y, x); 286 | y = 
_mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); 287 | y = _mm256_mul_ps(y, x); 288 | y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); 289 | y = _mm256_mul_ps(y, z); 290 | y = _mm256_add_ps(y, x); 291 | y = _mm256_add_ps(y, one); 292 | 293 | /* build 2^n */ 294 | imm0 = _mm256_cvttps_epi32(fx); 295 | // another two AVX2 instructions 296 | imm0 = _mm256_add_epi32_sse2(imm0, *(v8si*)_pi32_256_0x7f); 297 | imm0 = _mm256_slli_epi32_sse2(imm0, 23); 298 | v8sf pow2n = _mm256_castsi256_ps(imm0); 299 | y = _mm256_mul_ps(y, pow2n); 300 | return y; 301 | } 302 | 303 | _PS256_CONST(minus_cephes_DP1, -0.78515625f); 304 | _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); 305 | _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); 306 | _PS256_CONST(sincof_p0, -1.9515295891E-4f); 307 | _PS256_CONST(sincof_p1, 8.3321608736E-3f); 308 | _PS256_CONST(sincof_p2, -1.6666654611E-1f); 309 | _PS256_CONST(coscof_p0, 2.443315711809948E-005f); 310 | _PS256_CONST(coscof_p1, -1.388731625493765E-003f); 311 | _PS256_CONST(coscof_p2, 4.166664568298827E-002f); 312 | _PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI 313 | 314 | 315 | /* evaluation of 8 sines at onces using AVX intrisics 316 | 317 | The code is the exact rewriting of the cephes sinf function. 318 | Precision is excellent as long as x < 8192 (I did not bother to 319 | take into account the special handling they have for greater values 320 | -- it does not return garbage for arguments over 8192, though, but 321 | the extra precision is missing). 322 | 323 | Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the 324 | surprising but correct result. 
325 | 326 | */ 327 | inline v8sf sin256_ps(v8sf x) { // any x 328 | v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; 329 | v8si imm0, imm2; 330 | 331 | #ifndef __AVX2__ 332 | v4si imm0_1, imm0_2; 333 | v4si imm2_1, imm2_2; 334 | #endif 335 | 336 | sign_bit = x; 337 | /* take the absolute value */ 338 | x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); 339 | /* extract the sign bit (upper one) */ 340 | sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); 341 | 342 | /* scale by 4/Pi */ 343 | y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); 344 | 345 | /* 346 | Here we start a series of integer operations, which are in the 347 | realm of AVX2. 348 | If we don't have AVX, let's perform them using SSE2 directives 349 | */ 350 | 351 | #ifdef __AVX2__ 352 | /* store the integer part of y in mm0 */ 353 | imm2 = _mm256_cvttps_epi32(y); 354 | /* j=(j+1) & (~1) (see the cephes sources) */ 355 | // another two AVX2 instruction 356 | imm2 = _mm256_add_epi32_sse2(imm2, *(v8si*)_pi32_256_1); 357 | imm2 = _mm256_and_si128_sse2(imm2, *(v8si*)_pi32_256_inv1); 358 | y = _mm256_cvtepi32_ps(imm2); 359 | 360 | /* get the swap sign flag */ 361 | imm0 = _mm256_and_si128_sse2(imm2, *(v8si*)_pi32_256_4); 362 | imm0 = _mm256_slli_epi32_sse2(imm0, 29); 363 | /* get the polynom selection mask 364 | there is one polynom for 0 <= x <= Pi/4 365 | and another one for Pi/4