├── fastapprox ├── NEWS ├── AUTHORS ├── README ├── fw-pkgin │ ├── Makefile.am.local │ ├── stop │ ├── start │ ├── pre-install │ ├── post-install │ ├── pre-remove │ ├── post-remove │ └── config ├── Makefile.am.local ├── tests │ ├── testfastonebigheader.c │ ├── testfastlog.c │ ├── testfastpow.c │ ├── testfastsigmoid.c │ ├── testfastexp.c │ ├── testfasthyperbolic.c │ ├── testfastgamma.c │ ├── testfasterf.c │ ├── Makefile.am.local │ ├── testfastlambertw.c │ ├── testfasttrig.c │ └── testmacros.h ├── BUILD_HOWTO ├── configure.ac.local ├── bootstrap ├── src │ ├── Makefile.am.local │ ├── cast.h │ ├── fastsigmoid.h │ ├── fastpow.h │ ├── fasthyperbolic.h │ ├── fastlog.h │ ├── fastexp.h │ ├── fastgamma.h │ ├── fasterf.h │ ├── sse.h │ ├── fastlambertw.h │ ├── fasttrig.h │ └── fastonebigheader.h ├── ChangeLog ├── COPYING └── ax_check_compile_flag.m4 └── README.md /fastapprox/NEWS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastapprox/AUTHORS: -------------------------------------------------------------------------------- 1 | Paul Mineiro 2 | -------------------------------------------------------------------------------- /fastapprox/README: -------------------------------------------------------------------------------- 1 | Fast approximate versions of certain functions that arise in machine learning. 
2 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/Makefile.am.local: -------------------------------------------------------------------------------- 1 | # put whatever (auto)make commands here, they will be included from Makefile.am 2 | -------------------------------------------------------------------------------- /fastapprox/Makefile.am.local: -------------------------------------------------------------------------------- 1 | # put whatever (auto)make commands here, they will be included from Makefile.am 2 | 3 | EXTRA_DIST += BUILD_HOWTO 4 | -------------------------------------------------------------------------------- /fastapprox/tests/testfastonebigheader.c: -------------------------------------------------------------------------------- 1 | #include "../src/fastonebigheader.h" 2 | 3 | int 4 | main (void) 5 | { 6 | return fastexp (1) < 0; 7 | } 8 | -------------------------------------------------------------------------------- /fastapprox/BUILD_HOWTO: -------------------------------------------------------------------------------- 1 | The library consists entirely of header files, so there is no building per se. 2 | 3 | You can run the tests via 4 | 5 | ./configure && make check 6 | 7 | After that you can make install if you want, or, you can grab 8 | src/fastonebigheader.h and just drop it into whatever you're working on. 9 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/stop: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | #--------------------------------------------------------------------- 4 | # stop 5 | # 6 | # Executed when the package (service) is shut down. 7 | # Not supported by all package formats. 
8 | #--------------------------------------------------------------------- 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/start: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | #--------------------------------------------------------------------- 4 | # start 5 | # 6 | # Executed when the package (service) is started up. 7 | # Not supported by all package formats. 8 | #--------------------------------------------------------------------- 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /fastapprox/configure.ac.local: -------------------------------------------------------------------------------- 1 | dnl -- include additional autoconf commands here 2 | dnl -- do not include AC_OUTPUT, this is called for you 3 | 4 | m4_include([ax_check_compile_flag.m4]) 5 | 6 | AX_CHECK_COMPILE_FLAG([-std=c++0x], 7 | [CXXFLAGS="$CXXFLAGS -std=c++0x"]) 8 | 9 | AC_CHECK_HEADERS([emmintrin.h boost/math/special_functions/digamma.hpp gsl/gsl_sf_lambert.h]) 10 | 11 | PKG_CHECK_MODULES([GSL], [gsl], [], [ printf "" ]) 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastapprox 2 | Approximate and vectorized versions of common mathematical functions 3 | 4 | The easiest way to include this in your projects is via the one big standalone header file which works with both C and C++. 5 | 6 | ### Current functions: 7 | 8 | - exponential, logarithm, and power 9 | - lgamma and digamma 10 | - cosh, sinh, tanh 11 | - cos, sin, tan 12 | - sigmoid and erf 13 | - Lambert W 14 | 15 | There's a Mathematica notebook which helps explain the techniques. 
16 | -------------------------------------------------------------------------------- /fastapprox/bootstrap: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | if test -d fw/bin 4 | then 5 | PATH="`pwd`/fw/bin:$PATH" 6 | export PATH 7 | fi 8 | 9 | fwb=`which fw-bootstrap` 10 | 11 | if test -z "$fwb" 12 | then 13 | echo "bootstrap: fatal: fw-bootstrap not installed or not in PATH" 1>&2 14 | exit 1 15 | fi 16 | 17 | "$fwb" --fw_version "0.3.3" --name fastapprox --template Cxx --revision svn --svn_project_path https://fastapprox.googlecode.com/svn/trunk/fastapprox --svn_tag_root https://fastapprox.googlecode.com/svn/tags/fastapprox "$@" 18 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/pre-install: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | set -e 4 | 5 | #--------------------------------------------------------------------- 6 | # pre-install 7 | # 8 | # Executed before the package is installed. 9 | # 10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks 11 | #--------------------------------------------------------------------- 12 | 13 | case "$1" in 14 | install) 15 | ;; 16 | upgrade) 17 | # old version is $2 18 | ;; 19 | *) 20 | ;; 21 | esac 22 | 23 | exit 0 24 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/post-install: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | set -e 4 | 5 | #--------------------------------------------------------------------- 6 | # post-install 7 | # 8 | # Executed after the package is installed. 
9 | # 10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks 11 | #--------------------------------------------------------------------- 12 | 13 | case "$1" in 14 | configure) 15 | # most recently configured version is $2 (possibly empty string) 16 | ;; 17 | *) 18 | ;; 19 | esac 20 | 21 | exit 0 22 | -------------------------------------------------------------------------------- /fastapprox/src/Makefile.am.local: -------------------------------------------------------------------------------- 1 | # put whatever (auto)make commands here, they will be included from Makefile.am 2 | # 3 | 4 | fastonebigheader.h: $(filter-out config.h fastonebigheader.h, $(wildcard *.h)) 5 | cat \ 6 | cast.h \ 7 | sse.h \ 8 | fastexp.h \ 9 | fastlog.h \ 10 | fasterf.h \ 11 | fastgamma.h \ 12 | fasthyperbolic.h \ 13 | fastlambertw.h \ 14 | fastpow.h \ 15 | fastsigmoid.h \ 16 | fasttrig.h \ 17 | | grep -v '#include "' > "$@" 18 | 19 | myinclude_HEADERS += \ 20 | fastonebigheader.h \ 21 | $(filter-out config.h fastonebigheader.h, $(wildcard *.h)) 22 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/pre-remove: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | set -e 4 | 5 | #--------------------------------------------------------------------- 6 | # pre-remove 7 | # 8 | # Executed before the package is removed. 
9 | # 10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks 11 | #--------------------------------------------------------------------- 12 | 13 | case "$1" in 14 | upgrade) 15 | # defer to newer package's script 16 | exit 1 17 | ;; 18 | failed-upgrade) 19 | # actually handle the upgrade here 20 | # old-version is $2 21 | ;; 22 | remove) 23 | ;; 24 | *) 25 | ;; 26 | esac 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/post-remove: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | set -e 4 | 5 | #--------------------------------------------------------------------- 6 | # post-remove 7 | # 8 | # Executed after the package is removed. 9 | # 10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks 11 | #--------------------------------------------------------------------- 12 | 13 | case "$1" in 14 | upgrade) 15 | # defer to newer package's script 16 | exit 1 17 | ;; 18 | failed-upgrade) 19 | # actually handle the upgrade here 20 | # old-version is $2 21 | ;; 22 | remove) 23 | ;; 24 | *) 25 | ;; 26 | esac 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /fastapprox/ChangeLog: -------------------------------------------------------------------------------- 1 | Version 0.3.2 2 | * fix some compile warnings 3 | 4 | Version 0.3.1 5 | * somewhat faster fasterlog 6 | * Lambert W, use negative exponential in Halley's method 7 | 8 | Version 0.3.0 9 | * Lambert W 10 | 11 | Version 0.2.0 12 | * 10% faster fastdigamma/vfastdigamma (same formula with terms rearranged) 13 | * handle underflow in fastexp (25% speed penalty, but ubiquitous so necessary) 14 | * purge last remaining __attribute__ (gcc specific) 15 | * trade some overall pow2 accuracy to improve accuracy on integral powers 16 | 17 | Version 0.1.0 18 | * Inverse erf 19 | * Trade some overall log accuracy to improve accuracy on 
exact powers of 2 20 | * Whoops, left out power 21 | 22 | Version 0.0.0 23 | * Fast logarithm, exponential. 24 | * Fast trigonometrics and hyperbolics. 25 | * Fast sigmoid and erf. 26 | -------------------------------------------------------------------------------- /fastapprox/tests/testfastlog.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fastlog.h" 9 | 10 | #include "testmacros.h" 11 | 12 | test_scalar (fastlog, logf, 1e-10f + 10.0f * drand48 (), 1e-4f, 100000000) 13 | test_scalar (fasterlog, logf, 1e-10f + 10.0f * drand48 (), 2e-2f, 100000000) 14 | 15 | test_vector (vfastlog, logf, 1e-10f + 10.0f * drand48 (), 1e-4f, 100000000) 16 | test_vector (vfasterlog, logf, 1e-10f + 10.0f * drand48 (), 2e-2f, 100000000) 17 | 18 | int 19 | main (int argc, 20 | char *argv[]) 21 | { 22 | char buf[4096]; 23 | 24 | (void) argc; 25 | 26 | srand48 (69); 27 | 28 | strncpy (buf, argv[0], sizeof (buf) - 5); 29 | strncat (buf, ".out", 5); 30 | 31 | fclose (stderr); 32 | stderr = fopen (buf, "w"); 33 | 34 | test_fastlog (); 35 | test_fasterlog (); 36 | test_vfastlog (); 37 | test_vfasterlog (); 38 | 39 | time_fastlog (); 40 | time_fasterlog (); 41 | time_vfastlog (); 42 | time_vfasterlog (); 43 | 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /fastapprox/tests/testfastpow.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fastpow.h" 9 | 10 | #include "testmacros.h" 11 | 12 | test_scalar2 (fastpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 1e-4f, 100000000) 13 | test_scalar2 (fasterpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 2e-2f, 100000000) 14 | 15 | test_vector2 (vfastpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 
1e-4f, 100000000) 16 | test_vector2 (vfasterpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 2e-2f, 100000000) 17 | 18 | int 19 | main (int argc, 20 | char *argv[]) 21 | { 22 | char buf[4096]; 23 | 24 | (void) argc; 25 | 26 | srand48 (69); 27 | 28 | strncpy (buf, argv[0], sizeof (buf) - 5); 29 | strncat (buf, ".out", 5); 30 | 31 | fclose (stderr); 32 | stderr = fopen (buf, "w"); 33 | 34 | test_fastpow (); 35 | test_fasterpow (); 36 | test_vfastpow (); 37 | test_vfasterpow (); 38 | 39 | time_fastpow (); 40 | time_fasterpow (); 41 | time_vfastpow (); 42 | time_vfasterpow (); 43 | 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /fastapprox/tests/testfastsigmoid.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fastsigmoid.h" 9 | 10 | #include "testmacros.h" 11 | 12 | static inline float 13 | sigmoidf (float x) 14 | { 15 | return 1.0 / (1.0 + expf (-x)); 16 | } 17 | 18 | test_scalar (fastsigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 1e-4f, 100000000) 19 | test_scalar (fastersigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 2e-2f, 100000000) 20 | 21 | test_vector (vfastsigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 1e-4f, 100000000) 22 | test_vector (vfastersigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 2e-2f, 100000000) 23 | 24 | int 25 | main (int argc, 26 | char *argv[]) 27 | { 28 | char buf[4096]; 29 | 30 | (void) argc; 31 | 32 | srand48 (69); 33 | 34 | strncpy (buf, argv[0], sizeof (buf) - 5); 35 | strncat (buf, ".out", 5); 36 | 37 | fclose (stderr); 38 | stderr = fopen (buf, "w"); 39 | 40 | test_fastsigmoid (); 41 | test_fastersigmoid (); 42 | test_vfastsigmoid (); 43 | test_vfastersigmoid (); 44 | 45 | time_fastsigmoid (); 46 | time_fastersigmoid (); 47 | time_vfastsigmoid (); 48 | time_vfastersigmoid (); 49 | 50 | return 0; 51 | } 52 | 
-------------------------------------------------------------------------------- /fastapprox/tests/testfastexp.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fastexp.h" 9 | 10 | #include "testmacros.h" 11 | 12 | test_scalar (fastexp, expf, -5.0f + 10.0f * drand48 (), 1e-4f, 100000000) 13 | test_scalar (fasterexp, expf, -5.0f + 10.0f * drand48 (), 2e-2f, 100000000) 14 | 15 | test_vector (vfastexp, expf, -5.0f + 10.0f * drand48 (), 1e-4f, 100000000) 16 | test_vector (vfasterexp, expf, -5.0f + 10.0f * drand48 (), 2e-2f, 100000000) 17 | 18 | int 19 | main (int argc, 20 | char *argv[]) 21 | { 22 | char buf[4096]; 23 | 24 | (void) argc; 25 | 26 | float x; 27 | for (x = -50; x > -1000; x -= 100) 28 | { 29 | assert (fastexp (x) >= 0); 30 | assert (fasterexp (x) >= 0); 31 | #ifdef __SSE2__ 32 | v4sf vx = v4sfl (x); 33 | assert (v4sf_index (vfastexp (vx), 0) >= 0); 34 | assert (v4sf_index (vfasterexp (vx), 0) >= 0); 35 | #endif 36 | } 37 | 38 | srand48 (69); 39 | 40 | strncpy (buf, argv[0], sizeof (buf) - 5); 41 | strncat (buf, ".out", 5); 42 | 43 | fclose (stderr); 44 | stderr = fopen (buf, "w"); 45 | 46 | 47 | test_fastexp (); 48 | test_fasterexp (); 49 | test_vfastexp (); 50 | test_vfasterexp (); 51 | 52 | time_fastexp (); 53 | time_fasterexp (); 54 | time_vfastexp (); 55 | time_vfasterexp (); 56 | 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /fastapprox/COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011, Paul Mineiro 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of Paul Mineiro nor the names of its contributors 14 | may be used to endorse or promote products derived from this software 15 | without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 27 | THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | -------------------------------------------------------------------------------- /fastapprox/tests/testfasthyperbolic.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fasthyperbolic.h" 9 | 10 | #include "testmacros.h" 11 | 12 | test_scalar (fastsinh, sinhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000) 13 | test_scalar (fastersinh, sinhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000) 14 | 15 | test_vector (vfastsinh, sinhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000) 16 | test_vector (vfastersinh, sinhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000) 17 | 18 | test_scalar (fastcosh, coshf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000) 19 | test_scalar (fastercosh, coshf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000) 20 | 21 | test_vector (vfastcosh, coshf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000) 22 | test_vector (vfastercosh, coshf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000) 23 | 24 | test_scalar (fasttanh, tanhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000) 25 | test_scalar (fastertanh, tanhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000) 26 | 27 | test_vector (vfasttanh, tanhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000) 28 | test_vector (vfastertanh, tanhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000) 29 | 30 | int 31 | main (int argc, 32 | char *argv[]) 33 | { 34 | char buf[4096]; 35 | 36 | (void) argc; 37 | 38 | srand48 (69); 39 | 40 | strncpy (buf, argv[0], sizeof (buf) - 5); 41 | strncat (buf, ".out", 5); 42 | 43 | fclose (stderr); 44 | stderr = fopen (buf, "w"); 45 | 46 | test_fastsinh (); 47 | test_fastersinh (); 48 | test_fastcosh (); 49 | test_fastercosh (); 50 | test_fasttanh (); 51 | test_fastertanh (); 52 | test_vfastsinh (); 53 | test_vfastersinh (); 54 | test_vfastcosh (); 55 | test_vfastercosh (); 56 | test_vfasttanh (); 57 | test_vfastertanh (); 58 | 59 | time_fastsinh (); 60 
| time_fastersinh (); 61 | time_fastcosh (); 62 | time_fastercosh (); 63 | time_fasttanh (); 64 | time_fastertanh (); 65 | time_vfastsinh (); 66 | time_vfastersinh (); 67 | time_vfastcosh (); 68 | time_vfastercosh (); 69 | time_vfasttanh (); 70 | time_vfastertanh (); 71 | 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /fastapprox/tests/testfastgamma.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/config.h" 9 | 10 | #ifdef __cplusplus 11 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP 12 | #include 13 | #endif 14 | #endif 15 | 16 | #include "../src/fastgamma.h" 17 | 18 | #include "testmacros.h" 19 | 20 | test_scalar (fastlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000) 21 | test_scalar (fasterlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000) 22 | 23 | test_vector (vfastlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000) 24 | test_vector (vfasterlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000) 25 | 26 | #ifdef __cplusplus 27 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP 28 | test_scalar (fastdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000) 29 | test_scalar (fasterdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000) 30 | 31 | test_vector (vfastdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000) 32 | test_vector (vfasterdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000) 33 | #endif 34 | #endif 35 | 36 | int 37 | main (int argc, 38 | char *argv[]) 39 | { 40 | char buf[4096]; 41 | 42 | (void) argc; 43 | 44 | srand48 (69); 45 | 46 | strncpy (buf, argv[0], sizeof (buf) - 5); 47 | strncat (buf, ".out", 5); 48 | 49 | fclose (stderr); 50 | stderr = fopen (buf, "w"); 51 | 52 | test_fastlgamma (); 53 | 
test_fasterlgamma (); 54 | test_vfastlgamma (); 55 | test_vfasterlgamma (); 56 | 57 | #ifdef __cplusplus 58 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP 59 | test_fastdigamma (); 60 | test_fasterdigamma (); 61 | test_vfastdigamma (); 62 | test_vfasterdigamma (); 63 | #endif 64 | #endif 65 | 66 | time_fastlgamma (); 67 | time_fasterlgamma (); 68 | time_vfastlgamma (); 69 | time_vfasterlgamma (); 70 | 71 | #ifdef __cplusplus 72 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP 73 | time_fastdigamma (); 74 | time_fasterdigamma (); 75 | time_vfastdigamma (); 76 | time_vfasterdigamma (); 77 | #endif 78 | #endif 79 | 80 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP 81 | return 0; 82 | #else 83 | return 77; 84 | #endif 85 | } 86 | -------------------------------------------------------------------------------- /fastapprox/tests/testfasterf.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fasterf.h" 9 | 10 | #include "testmacros.h" 11 | 12 | static float 13 | slowinverseerf (float x) 14 | { 15 | float y0 = 0.886227f * x; 16 | float sqrtpi = 1.7724538509055160f; 17 | 18 | y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0)); 19 | y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0)); 20 | y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0)); 21 | y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0)); 22 | y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0)); 23 | 24 | return y0; 25 | } 26 | 27 | test_scalar (fasterf, erff, -6.0f + 12.0f * drand48 (), 1e-3f, 100000000) 28 | test_scalar (fastererf, erff, -6.0f + 12.0f * drand48 (), 2e-2f, 100000000) 29 | test_vector (vfasterf, erff, -6.0f + 12.0f * drand48 (), 1e-3f, 100000000) 30 | test_vector (vfastererf, erff, -6.0f + 12.0f * drand48 (), 2e-2f, 100000000) 31 | 32 | test_scalar (fastinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 2e-3f, 100000000) 33 | 
test_vector (vfastinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 2e-3f, 100000000) 34 | test_scalar (fasterinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 4e-2f, 100000000) 35 | test_vector (vfasterinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 4e-2f, 100000000) 36 | 37 | test_scalar (fasterfc, erfcf, -2.0f + 4.0f * drand48 (), 5e-3f, 100000000) 38 | test_scalar (fastererfc, erfcf, -2.0f + 4.0f * drand48 (), 8e-2f, 100000000) 39 | test_vector (vfasterfc, erfcf, -2.0f + 4.0f * drand48 (), 5e-3f, 100000000) 40 | test_vector (vfastererfc, erfcf, -2.0f + 4.0f * drand48 (), 8e-2f, 100000000) 41 | 42 | int 43 | main (int argc, 44 | char *argv[]) 45 | { 46 | char buf[4096]; 47 | 48 | (void) argc; 49 | 50 | srand48 (69); 51 | 52 | strncpy (buf, argv[0], sizeof (buf) - 5); 53 | strncat (buf, ".out", 5); 54 | 55 | fclose (stderr); 56 | stderr = fopen (buf, "w"); 57 | 58 | test_fasterf (); 59 | test_fastererf (); 60 | test_vfasterf (); 61 | test_vfastererf (); 62 | test_fasterfc (); 63 | test_fastererfc (); 64 | test_vfasterfc (); 65 | test_vfastererfc (); 66 | test_fastinverseerf (); 67 | test_vfastinverseerf (); 68 | test_fasterinverseerf (); 69 | test_vfasterinverseerf (); 70 | 71 | time_fasterf (); 72 | time_fastererf (); 73 | time_vfasterf (); 74 | time_vfastererf (); 75 | time_fasterfc (); 76 | time_fastererfc (); 77 | time_vfasterfc (); 78 | time_vfastererfc (); 79 | time_fastinverseerf (); 80 | time_vfastinverseerf (); 81 | time_fasterinverseerf (); 82 | time_vfasterinverseerf (); 83 | 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /fastapprox/fw-pkgin/config: -------------------------------------------------------------------------------- 1 | # The FW_PACKAGE_MAINTAINER field is populated with the environment 2 | # variable FW_PACKAGE_DEFAULT_MAINTAINER if non-empty at init time 3 | 4 | FW_PACKAGE_NAME="fastapprox" 5 | FW_PACKAGE_VERSION="0.3.2" 6 | FW_PACKAGE_MAINTAINER="Paul 
Mineiro " 7 | FW_PACKAGE_SHORT_DESCRIPTION="Fast approximate function library." 8 | FW_PACKAGE_DESCRIPTION=`cat README` 9 | FW_PACKAGE_ARCHITECTURE_DEPENDENT="0" 10 | 11 | # Dependency information. The native syntax corresponds to Debian, 12 | # http://www.debian.org/doc/debian-policy/ch-relationships.html 13 | # Section 7.1 "Syntax of Relationship Fields" 14 | # 15 | # For other packaging systems, the syntax is translated for you. 16 | 17 | FW_PACKAGE_DEPENDS="" 18 | FW_PACKAGE_CONFLICTS="" 19 | FW_PACKAGE_PROVIDES="" 20 | FW_PACKAGE_REPLACES="" 21 | FW_PACKAGE_SUGGESTS="" 22 | 23 | FW_PACKAGE_BUILD_DEPENDS="" 24 | FW_PACKAGE_BUILD_CONFLICTS="" 25 | 26 | # dupload is used for submitting debian packages to a package archive 27 | # The FW_DUPLOAD_ARGS field is populated with the environment variable 28 | # FW_DEFAULT_DUPLOAD_ARGS if non-empty at init time 29 | 30 | FW_DUPLOAD_ARGS=${FW_DUPLOAD_ARGS-"-t dukeslucid"} 31 | 32 | # scp+createrepo is used for submitting rpm packages to a package archive 33 | # The FW_RPM_REPO_USER, FW_RPM_REPO_HOST, FW_RPM_REPO_BASEDIR, 34 | # and FW_RPM_POSTCREATEREPO_COMMANDS variables are populated with 35 | # FW_RPM_REPO_USER_DEFAULT, FW_RPM_REPO_HOST_DEFAULT, 36 | # FW_RPM_REPO_BASEDIR_DEFAULT, and FW_RPM_POSTCREATEREPO_COMMANDS_DEFAULT 37 | # respectively if non-empty at init time 38 | 39 | FW_RPM_REPO_USER=${FW_RPM_REPO_USER-"`whoami`"} 40 | FW_RPM_REPO_HOST=${FW_RPM_REPO_HOST-"ub32srvvmw"} 41 | FW_RPM_REPO_BASEDIR=${FW_RPM_REPO_BASEDIR-"/var/yum"} 42 | FW_RPM_CREATEREPO_ARGS=${FW_RPM_CREATEREPO_ARGS-"-q --database"} 43 | 44 | # this variable controls whether createrepo is run incrementally (--update). 
45 | # possible settings are yes (always do it), no (never do it), and 46 | # auto (do it if the repository has been previously initialized) 47 | FW_RPM_CREATEREPO_INCREMENTAL=${FW_RPM_CREATEREPO_INCREMENTAL-"auto"} 48 | 49 | # these commands will be run after a successful createrepo run 50 | FW_RPM_POSTCREATEREPO_COMMANDS=${FW_RPM_POSTCREATEREPO_COMMANDS-"{ cd /var; rsync -a yum /var/package/dukes; }"} 51 | # here's a suggestion: 52 | # FW_RPM_POSTCREATEREPO_COMMANDS="gpg --detach-sign --armor repodata/repomd.xml" 53 | 54 | # set to the directory in which version-named tags will be created 55 | FW_SUBVERSION_TAG_ROOT="https://fastapprox.googlecode.com/svn/tags/fastapprox" 56 | 57 | # uncomment and set to specify additional pkg-config packages on the Requires: 58 | # line of the generated .pc file 59 | # FW_PKGCONFIG_REQUIRES_EXTRA="" 60 | 61 | # uncomment and set to specify additional content for the Libs: 62 | # line of the generated .pc file 63 | # FW_PKGCONFIG_LIBS_EXTRA="" 64 | 65 | # uncomment and set to specify additional content for the Cflags: 66 | # line of the generated .pc file 67 | # FW_PKGCONFIG_CFLAGS_EXTRA="" 68 | 69 | # uncomment and set to add arbitrary additional content to the 70 | # generated .pc file 71 | # FW_PKGCONFIG_EXTRA="" 72 | -------------------------------------------------------------------------------- /fastapprox/src/cast.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2012 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. 
* 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. 
* 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __CAST_H_ 42 | 43 | #ifdef __cplusplus 44 | #define cast_uint32_t static_cast 45 | #else 46 | #define cast_uint32_t (uint32_t) 47 | #endif 48 | 49 | #endif // __CAST_H_ 50 | -------------------------------------------------------------------------------- /fastapprox/tests/Makefile.am.local: -------------------------------------------------------------------------------- 1 | # put whatever (auto)make commands here, they will be included from Makefile.am 2 | 3 | check_PROGRAMS = \ 4 | testfastexp \ 5 | testfastexpcpp \ 6 | testfasterf \ 7 | testfasterfcpp \ 8 | testfastlog \ 9 | testfastlogcpp \ 10 | testfastpow \ 11 | testfastpowcpp \ 12 | testfastsigmoid \ 13 | testfastsigmoidcpp \ 14 | testfasthyperbolic \ 15 | testfasthyperboliccpp \ 16 | testfasttrig \ 17 | testfasttrigcpp \ 18 | testfastgamma \ 19 | testfastgammacpp \ 20 | testfastlambertw \ 21 | testfastlambertwcpp \ 22 | testfastonebigheader \ 23 | testfastonebigheadercpp 24 | 25 | testfasterf_SOURCES = \ 26 | testfasterf.c 27 | 28 | testfasterfcpp_SOURCES = \ 29 | testfasterfcpp.cc 30 | 31 | testfasterf_LDADD = \ 32 | -lm 33 | 34 | testfasterfcpp_LDADD = \ 35 | -lm 36 | 37 | testfastexp_SOURCES = \ 38 | testfastexp.c 39 | 40 | testfastexpcpp_SOURCES = \ 41 | testfastexpcpp.cc 42 | 43 | testfastexp_LDADD = \ 44 | -lm 45 | 46 | testfastexpcpp_LDADD = \ 47 | -lm 48 | 49 | testfastgamma_SOURCES = \ 50 | testfastgamma.c 51 | 52 | testfastgammacpp_SOURCES = \ 53 | testfastgammacpp.cc 54 | 55 | testfastgamma_LDADD = \ 56 | -lm 57 | 58 | testfastgammacpp_LDADD = \ 59 | -lm 60 | 61 | testfasthyperbolic_SOURCES = \ 62 | testfasthyperbolic.c 63 | 64 | testfasthyperboliccpp_SOURCES = \ 65 | testfasthyperboliccpp.cc 66 | 67 | testfasthyperbolic_LDADD = \ 68 | -lm 69 | 70 | testfasthyperboliccpp_LDADD = \ 71 | -lm 72 | 73 | testfastlambertw_SOURCES = \ 74 | testfastlambertw.c 75 | 76 | 
testfastlambertwcpp_SOURCES = \ 77 | testfastlambertwcpp.cc 78 | 79 | testfastlambertw_LDADD = \ 80 | -lm ${GSL_LIBS} 81 | 82 | testfastlambertwcpp_LDADD = \ 83 | -lm ${GSL_LIBS} 84 | 85 | testfastlog_SOURCES = \ 86 | testfastlog.c 87 | 88 | testfastlogcpp_SOURCES = \ 89 | testfastlogcpp.cc 90 | 91 | testfastlog_LDADD = \ 92 | -lm 93 | 94 | testfastlogcpp_LDADD = \ 95 | -lm 96 | 97 | testfastonebigheader_SOURCES = \ 98 | testfastonebigheader.c 99 | 100 | testfastonebigheadercpp_SOURCES = \ 101 | testfastonebigheadercpp.cc 102 | 103 | testfastonebigheader_LDADD = \ 104 | -lm 105 | 106 | testfastonebigheadercpp_LDADD = \ 107 | -lm 108 | 109 | testfastpow_SOURCES = \ 110 | testfastpow.c 111 | 112 | testfastpowcpp_SOURCES = \ 113 | testfastpowcpp.cc 114 | 115 | testfastpow_LDADD = \ 116 | -lm 117 | 118 | testfastpowcpp_LDADD = \ 119 | -lm 120 | 121 | testfastsigmoid_SOURCES = \ 122 | testfastsigmoid.c 123 | 124 | testfastsigmoidcpp_SOURCES = \ 125 | testfastsigmoidcpp.cc 126 | 127 | testfastsigmoid_LDADD = \ 128 | -lm 129 | 130 | testfastsigmoidcpp_LDADD = \ 131 | -lm 132 | 133 | testfasttrig_SOURCES = \ 134 | testfasttrig.c 135 | 136 | testfasttrigcpp_SOURCES = \ 137 | testfasttrigcpp.cc 138 | 139 | testfasttrig_LDADD = \ 140 | -lm 141 | 142 | testfasttrigcpp_LDADD = \ 143 | -lm 144 | 145 | %cpp.cc: %.c 146 | cat $< > $@ 147 | 148 | TESTS = \ 149 | $(check_PROGRAMS) 150 | 151 | CLEANFILES += \ 152 | $(wildcard *cpp.cc) \ 153 | $(wildcard *.out) 154 | 155 | EXTRA_DIST = testmacros.h fastapprox.nb 156 | -------------------------------------------------------------------------------- /fastapprox/tests/testfastlambertw.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fastlambertw.h" 9 | 10 | #include "testmacros.h" 11 | 12 | #include "../src/config.h" 13 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H 14 | #include 15 | #endif 16 | 17 | static 
inline float 18 | lambertwrange (void) 19 | { 20 | if (drand48 () < 0.5) 21 | { 22 | return -0.36787944117144232f + 1.36787944117144232f * drand48 (); 23 | } 24 | else 25 | { 26 | return 100.0f * drand48 (); 27 | } 28 | } 29 | 30 | static inline float 31 | lambertwf (float x) 32 | { 33 | float w = (x < 5) ? 0 : log (x) - log (log (x)) + log (log (x)) / log (x); 34 | unsigned int n; 35 | 36 | for (n = 0; n < 20; ++n) 37 | { 38 | w = (w * w + exp (-w) * x) / (1.0 + w); 39 | } 40 | 41 | return w; 42 | } 43 | 44 | static inline float 45 | lambertwexpxf (float x) 46 | { 47 | return lambertwf (expf (x)); 48 | } 49 | 50 | test_scalar (fastlambertw, lambertwf, lambertwrange (), 1e-4f, 100000000) 51 | test_scalar (fasterlambertw, lambertwf, lambertwrange (), 1e-2f, 100000000) 52 | test_scalar (fastlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-3f, 100000000) 53 | test_scalar (fasterlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-2f, 100000000) 54 | 55 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H 56 | test_scalar (gsl_sf_lambert_W0, lambertwf, lambertwrange (), 1e-2f, 1000000) 57 | #endif 58 | 59 | test_vector (vfastlambertw, lambertwf, lambertwrange (), 1e-4f, 100000000) 60 | test_vector (vfasterlambertw, lambertwf, lambertwrange (), 1e-2f, 100000000) 61 | test_vector (vfastlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-3f, 100000000) 62 | test_vector (vfasterlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-2f, 100000000) 63 | 64 | int 65 | main (int argc, 66 | char *argv[]) 67 | { 68 | char buf[4096]; 69 | 70 | (void) argc; 71 | 72 | srand48 (69); 73 | 74 | // fprintf (stderr, "fastlambertw (%g) = %g, " 75 | // "fastlambertw (%g) = %g, " 76 | // "fasterlambertwexpx (%g) = %g (%g)\n", 77 | // -0.36787944117144232f, 78 | // fastlambertw (-0.36787944117144232f), 79 | // -0.36787944117144232f + 0.01f, 80 | // fastlambertw (-0.36787944117144232f + 0.01f), 81 | // -5.0f, 82 | // fasterlambertwexpx (-5.0f), 83 | // v4sf_index 
(vfasterlambertwexpx (v4sfl (-5.0f)), 0)); 84 | 85 | strncpy (buf, argv[0], sizeof (buf) - 5); 86 | strncat (buf, ".out", 5); 87 | 88 | fclose (stderr); 89 | stderr = fopen (buf, "w"); 90 | 91 | test_fastlambertw (); 92 | test_fasterlambertw (); 93 | test_vfastlambertw (); 94 | test_vfasterlambertw (); 95 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H 96 | test_gsl_sf_lambert_W0 (); 97 | #endif 98 | test_fastlambertwexpx (); 99 | test_fasterlambertwexpx (); 100 | test_vfastlambertwexpx (); 101 | test_vfasterlambertwexpx (); 102 | 103 | time_fastlambertw (); 104 | time_fasterlambertw (); 105 | time_vfastlambertw (); 106 | time_vfasterlambertw (); 107 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H 108 | time_gsl_sf_lambert_W0 (); 109 | #endif 110 | time_fastlambertwexpx (); 111 | time_fasterlambertwexpx (); 112 | time_vfastlambertwexpx (); 113 | time_vfasterlambertwexpx (); 114 | 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /fastapprox/ax_check_compile_flag.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check whether the given FLAG works with the current language's compiler 12 | # or gives an error. (Warnings, however, are ignored) 13 | # 14 | # ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on 15 | # success/failure. 16 | # 17 | # If EXTRA-FLAGS is defined, it is added to the current language's default 18 | # flags (e.g. CFLAGS) when the check is done. The check is thus made with 19 | # the flags: "CFLAGS EXTRA-FLAGS FLAG". 
This can for example be used to 20 | # force the compiler to issue an error when a bad flag is given. 21 | # 22 | # NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this 23 | # macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. 24 | # 25 | # LICENSE 26 | # 27 | # Copyright (c) 2008 Guido U. Draheim 28 | # Copyright (c) 2011 Maarten Bosmans 29 | # 30 | # This program is free software: you can redistribute it and/or modify it 31 | # under the terms of the GNU General Public License as published by the 32 | # Free Software Foundation, either version 3 of the License, or (at your 33 | # option) any later version. 34 | # 35 | # This program is distributed in the hope that it will be useful, but 36 | # WITHOUT ANY WARRANTY; without even the implied warranty of 37 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 38 | # Public License for more details. 39 | # 40 | # You should have received a copy of the GNU General Public License along 41 | # with this program. If not, see <http://www.gnu.org/licenses/>. 42 | # 43 | # As a special exception, the respective Autoconf Macro's copyright owner 44 | # gives unlimited permission to copy, distribute and modify the configure 45 | # scripts that are the output of Autoconf when processing the Macro. You 46 | # need not follow the terms of the GNU General Public License when using 47 | # or distributing such scripts, even though portions of the text of the 48 | # Macro appear in them. The GNU General Public License (GPL) does govern 49 | # all other use of the material that constitutes the Autoconf Macro. 50 | # 51 | # This special exception to the GPL applies to versions of the Autoconf 52 | # Macro released by the Autoconf Archive. When you make and distribute a 53 | # modified version of the Autoconf Macro, you may extend this special 54 | # exception to the GPL to apply to your modified version as well.
55 | 56 | #serial 2 57 | 58 | AC_DEFUN([AX_CHECK_COMPILE_FLAG], 59 | [AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX 60 | AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl 61 | AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ 62 | ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS 63 | _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" 64 | AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], 65 | [AS_VAR_SET(CACHEVAR,[yes])], 66 | [AS_VAR_SET(CACHEVAR,[no])]) 67 | _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) 68 | AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], 69 | [m4_default([$2], :)], 70 | [m4_default([$3], :)]) 71 | AS_VAR_POPDEF([CACHEVAR])dnl 72 | ])dnl AX_CHECK_COMPILE_FLAGS 73 | -------------------------------------------------------------------------------- /fastapprox/src/fastsigmoid.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. 
* 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. * 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_SIGMOID_H_ 42 | #define __FAST_SIGMOID_H_ 43 | 44 | #include 45 | #include "sse.h" 46 | #include "fastexp.h" 47 | /* fastsigmoid: returns 1 / (1 + fastexp (-x)), i.e. the logistic sigmoid with the exponential supplied by fastexp from fastexp.h. */ 48 | static inline float 49 | fastsigmoid (float x) 50 | { 51 | return 1.0f / (1.0f + fastexp (-x)); 52 | } 53 | /* fastersigmoid: same formula as fastsigmoid but built on fasterexp instead of fastexp. */ 54 | static inline float 55 | fastersigmoid (float x) 56 | { 57 | return 1.0f / (1.0f + fasterexp (-x)); 58 | } 59 | 60 | #ifdef __SSE2__ 61 | /* vfastsigmoid: v4sf counterpart of fastsigmoid, compiled only when __SSE2__ is defined; c_1 is the constant 1.0f built with v4sfl. NOTE(review): v4sf and v4sfl come from sse.h, which is not visible here - presumably the arithmetic is element-wise; confirm. */ 62 | static inline v4sf 63 | vfastsigmoid (const v4sf x) 64 | { 65 | const v4sf c_1 = v4sfl (1.0f); 66 | 67 | return c_1 / (c_1 + vfastexp (-x)); 68 | } 69 | /* vfastersigmoid: v4sf counterpart of fastersigmoid; identical to vfastsigmoid except that it calls vfasterexp. */ 70 | static inline v4sf 71 | vfastersigmoid (const v4sf x) 72 | { 73 | const v4sf c_1 = v4sfl (1.0f); 74 | 75 | return c_1 / (c_1 + vfasterexp (-x)); 76 | } 77 | 78 | #endif //__SSE2__ 79 | 80 | #endif // __FAST_SIGMOID_H_ 81 | -------------------------------------------------------------------------------- /fastapprox/src/fastpow.h: -------------------------------------------------------------------------------- 1 | 2 | 
/*=====================================================================* 3 | * Copyright (C) 2011 Paul Mineiro * 4 | * All rights reserved. * 5 | * * 6 | * Redistribution and use in source and binary forms, with * 7 | * or without modification, are permitted provided that the * 8 | * following conditions are met: * 9 | * * 10 | * * Redistributions of source code must retain the * 11 | * above copyright notice, this list of conditions and * 12 | * the following disclaimer. * 13 | * * 14 | * * Redistributions in binary form must reproduce the * 15 | * above copyright notice, this list of conditions and * 16 | * the following disclaimer in the documentation and/or * 17 | * other materials provided with the distribution. * 18 | * * 19 | * * Neither the name of Paul Mineiro nor the names * 20 | * of other contributors may be used to endorse or promote * 21 | * products derived from this software without specific * 22 | * prior written permission. * 23 | * * 24 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 25 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 26 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 27 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 29 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 30 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 31 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 32 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 33 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 34 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 35 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 36 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 37 | * POSSIBILITY OF SUCH DAMAGE. 
* 38 | * * 39 | * Contact: Paul Mineiro * 40 | *=====================================================================*/ 41 | 42 | #ifndef __FAST_POW_H_ 43 | #define __FAST_POW_H_ 44 | 45 | #include 46 | #include "sse.h" 47 | #include "fastexp.h" 48 | #include "fastlog.h" 49 | 50 | static inline float 51 | fastpow (float x, 52 | float p) 53 | { 54 | return fastpow2 (p * fastlog2 (x)); 55 | } 56 | 57 | static inline float 58 | fasterpow (float x, 59 | float p) 60 | { 61 | return fasterpow2 (p * fasterlog2 (x)); 62 | } 63 | 64 | #ifdef __SSE2__ 65 | 66 | static inline v4sf 67 | vfastpow (const v4sf x, 68 | const v4sf p) 69 | { 70 | return vfastpow2 (p * vfastlog2 (x)); 71 | } 72 | 73 | static inline v4sf 74 | vfasterpow (const v4sf x, 75 | const v4sf p) 76 | { 77 | return vfasterpow2 (p * vfasterlog2 (x)); 78 | } 79 | 80 | #endif //__SSE2__ 81 | 82 | #endif // __FAST_POW_H_ 83 | -------------------------------------------------------------------------------- /fastapprox/tests/testfasttrig.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../src/fasttrig.h" 9 | 10 | #include "testmacros.h" 11 | 12 | static const float pi = 3.1415926535897932f; 13 | static const float twopi = 6.2831853071795865f; 14 | static const float halfpi = 1.5707963267948966f; 15 | 16 | static inline float 17 | no_half_pi () 18 | { 19 | float rv = -100.0 + 200.0 * drand48 (); 20 | int k = rv / halfpi; 21 | float delta = rv - k * halfpi; 22 | 23 | while (fabsf (delta) < 0.001f) 24 | { 25 | rv = -100.0 + 200.0 * drand48 (); 26 | k = rv / halfpi; 27 | delta = rv - k * halfpi; 28 | } 29 | 30 | return rv; 31 | } 32 | 33 | test_scalar (fastsin, sinf, -pi + twopi * drand48 (), 1e-4f, 100000000) 34 | test_scalar (fastersin, sinf, -pi + twopi * drand48 (), 2e-2f, 100000000) 35 | test_scalar (fastsinfull, sinf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000) 36 | test_scalar 
(fastersinfull, sinf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000) 37 | 38 | test_vector (vfastsin, sinf, -pi + twopi * drand48 (), 1e-4f, 100000000) 39 | test_vector (vfastersin, sinf, -pi + twopi * drand48 (), 2e-2f, 100000000) 40 | test_vector (vfastsinfull, sinf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000) 41 | test_vector (vfastersinfull, sinf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000) 42 | 43 | test_scalar (fastcos, cosf, -pi + twopi * drand48 (), 1e-4f, 100000000) 44 | test_scalar (fastercos, cosf, -pi + twopi * drand48 (), 2e-2f, 100000000) 45 | test_scalar (fastcosfull, cosf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000) 46 | test_scalar (fastercosfull, cosf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000) 47 | 48 | test_vector (vfastcos, cosf, -pi + twopi * drand48 (), 1e-4f, 100000000) 49 | test_vector (vfastercos, cosf, -pi + twopi * drand48 (), 2e-2f, 100000000) 50 | test_vector (vfastcosfull, cosf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000) 51 | test_vector (vfastercosfull, cosf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000) 52 | 53 | test_scalar (fasttan, tanf, -0.5f * pi + pi * drand48 (), 1e-4f, 100000000) 54 | test_scalar (fastertan, tanf, -0.5f * pi + pi * drand48 (), 2e-2f, 100000000) 55 | test_scalar (fasttanfull, tanf, no_half_pi (), 1e-4f, 100000000) 56 | test_scalar (fastertanfull, tanf, no_half_pi (), 2e-2f, 100000000) 57 | 58 | test_vector (vfasttan, tanf, -0.5f * pi + pi * drand48 (), 1e-4f, 100000000) 59 | test_vector (vfastertan, tanf, -0.5f * pi + pi * drand48 (), 2e-2f, 100000000) 60 | test_vector (vfasttanfull, tanf, no_half_pi (), 1e-4f, 100000000) 61 | test_vector (vfastertanfull, tanf, no_half_pi (), 2e-2f, 100000000) 62 | 63 | int 64 | main (int argc, 65 | char *argv[]) 66 | { 67 | (void) argc; 68 | (void) argv; 69 | 70 | char buf[4096]; 71 | 72 | srand48 (69); 73 | 74 | strncpy (buf, argv[0], sizeof (buf) - 5); 75 | strncat (buf, ".out", 5); 76 | 77 | fclose (stderr); 78 | stderr = fopen (buf, "w"); 79 | 80 | 
test_fastsin (); 81 | test_fastersin (); 82 | test_fastsinfull (); 83 | test_fastersinfull (); 84 | test_fastcos (); 85 | test_fastercos (); 86 | test_fastcosfull (); 87 | test_fastercosfull (); 88 | test_fasttan (); 89 | test_fastertan (); 90 | test_fasttanfull (); 91 | test_fastertanfull (); 92 | test_vfastsin (); 93 | test_vfastersin (); 94 | test_vfastsinfull (); 95 | test_vfastersinfull (); 96 | test_vfastcos (); 97 | test_vfastercos (); 98 | test_vfastcosfull (); 99 | test_vfastercosfull (); 100 | test_vfasttan (); 101 | test_vfastertan (); 102 | test_vfasttanfull (); 103 | test_vfastertanfull (); 104 | 105 | time_fastsin (); 106 | time_fastersin (); 107 | time_fastsinfull (); 108 | time_fastersinfull (); 109 | time_fastcos (); 110 | time_fastercos (); 111 | time_fastcosfull (); 112 | time_fastercosfull (); 113 | time_fasttan (); 114 | time_fastertan (); 115 | time_fasttanfull (); 116 | time_fastertanfull (); 117 | time_vfastsin (); 118 | time_vfastersin (); 119 | time_vfastsinfull (); 120 | time_vfastersinfull (); 121 | time_vfastcos (); 122 | time_vfastercos (); 123 | time_vfastcosfull (); 124 | time_vfastercosfull (); 125 | time_vfasttan (); 126 | time_vfastertan (); 127 | time_vfasttanfull (); 128 | time_vfastertanfull (); 129 | 130 | return 0; 131 | } 132 | -------------------------------------------------------------------------------- /fastapprox/src/fasthyperbolic.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. 
* 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. 
* 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_HYPERBOLIC_H_ 42 | #define __FAST_HYPERBOLIC_H_ 43 | 44 | #include 45 | #include "sse.h" 46 | #include "fastexp.h" 47 | 48 | static inline float 49 | fastsinh (float p) 50 | { 51 | return 0.5f * (fastexp (p) - fastexp (-p)); 52 | } 53 | 54 | static inline float 55 | fastersinh (float p) 56 | { 57 | return 0.5f * (fasterexp (p) - fasterexp (-p)); 58 | } 59 | 60 | static inline float 61 | fastcosh (float p) 62 | { 63 | return 0.5f * (fastexp (p) + fastexp (-p)); 64 | } 65 | 66 | static inline float 67 | fastercosh (float p) 68 | { 69 | return 0.5f * (fasterexp (p) + fasterexp (-p)); 70 | } 71 | 72 | static inline float 73 | fasttanh (float p) 74 | { 75 | return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p)); 76 | } 77 | 78 | static inline float 79 | fastertanh (float p) 80 | { 81 | return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p)); 82 | } 83 | 84 | #ifdef __SSE2__ 85 | 86 | static inline v4sf 87 | vfastsinh (const v4sf p) 88 | { 89 | const v4sf c_0_5 = v4sfl (0.5f); 90 | 91 | return c_0_5 * (vfastexp (p) - vfastexp (-p)); 92 | } 93 | 94 | static inline v4sf 95 | vfastersinh (const v4sf p) 96 | { 97 | const v4sf c_0_5 = v4sfl (0.5f); 98 | 99 | return c_0_5 * (vfasterexp (p) - vfasterexp (-p)); 100 | } 101 | 102 | static inline v4sf 103 | vfastcosh (const v4sf p) 104 | { 105 | const v4sf c_0_5 = v4sfl (0.5f); 106 | 107 | return c_0_5 * (vfastexp (p) + vfastexp (-p)); 108 | } 109 | 110 | static inline v4sf 111 | vfastercosh (const v4sf p) 112 | { 113 | const v4sf c_0_5 = v4sfl (0.5f); 114 | 115 | return c_0_5 * (vfasterexp (p) + vfasterexp (-p)); 116 | } 117 | 118 | static inline v4sf 119 | vfasttanh (const v4sf p) 120 | { 121 | const v4sf c_1 = v4sfl (1.0f); 122 | const v4sf c_2 = v4sfl (2.0f); 123 | 124 | return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p)); 125 | } 126 | 127 | static inline v4sf 128 | vfastertanh (const v4sf p) 
129 | { 130 | const v4sf c_1 = v4sfl (1.0f); 131 | const v4sf c_2 = v4sfl (2.0f); 132 | 133 | return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p)); 134 | } 135 | 136 | #endif //__SSE2__ 137 | 138 | #endif // __FAST_HYPERBOLIC_H_ 139 | -------------------------------------------------------------------------------- /fastapprox/src/fastlog.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. * 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_LOG_H_ 42 | #define __FAST_LOG_H_ 43 | 44 | #include 45 | #include "sse.h" 46 | 47 | static inline float 48 | fastlog2 (float x) 49 | { 50 | union { float f; uint32_t i; } vx = { x }; 51 | union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 }; 52 | float y = vx.i; 53 | y *= 1.1920928955078125e-7f; 54 | 55 | return y - 124.22551499f 56 | - 1.498030302f * mx.f 57 | - 1.72587999f / (0.3520887068f + mx.f); 58 | } 59 | 60 | static inline float 61 | fastlog (float x) 62 | { 63 | return 0.69314718f * fastlog2 (x); 64 | } 65 | 66 | static inline float 67 | fasterlog2 (float x) 68 | { 69 | union { float f; uint32_t i; } vx = { x }; 70 | float y = vx.i; 71 | y *= 1.1920928955078125e-7f; 72 | return y - 126.94269504f; 73 | } 74 | 75 | static inline float 76 | fasterlog (float x) 77 | { 78 | // return 0.69314718f * fasterlog2 (x); 79 | 80 | union { float f; uint32_t i; } vx = { x }; 81 | float y = vx.i; 82 | y *= 8.2629582881927490e-8f; 83 | return y - 87.989971088f; 84 | } 85 | 86 | #ifdef __SSE2__ 87 | 88 | static inline v4sf 89 | vfastlog2 (v4sf x) 90 | { 91 | union { v4sf f; v4si i; } vx = { x }; 92 | union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000); 93 | v4sf y = v4si_to_v4sf (vx.i); 94 | y 
*= v4sfl (1.1920928955078125e-7f); 95 | 96 | const v4sf c_124_22551499 = v4sfl (124.22551499f); 97 | const v4sf c_1_498030302 = v4sfl (1.498030302f); 98 | const v4sf c_1_725877999 = v4sfl (1.72587999f); 99 | const v4sf c_0_3520087068 = v4sfl (0.3520887068f); 100 | 101 | return y - c_124_22551499 102 | - c_1_498030302 * mx.f 103 | - c_1_725877999 / (c_0_3520087068 + mx.f); 104 | } 105 | 106 | static inline v4sf 107 | vfastlog (v4sf x) 108 | { 109 | const v4sf c_0_69314718 = v4sfl (0.69314718f); 110 | 111 | return c_0_69314718 * vfastlog2 (x); 112 | } 113 | 114 | static inline v4sf 115 | vfasterlog2 (v4sf x) 116 | { 117 | union { v4sf f; v4si i; } vx = { x }; 118 | v4sf y = v4si_to_v4sf (vx.i); 119 | y *= v4sfl (1.1920928955078125e-7f); 120 | 121 | const v4sf c_126_94269504 = v4sfl (126.94269504f); 122 | 123 | return y - c_126_94269504; 124 | } 125 | 126 | static inline v4sf 127 | vfasterlog (v4sf x) 128 | { 129 | // const v4sf c_0_69314718 = v4sfl (0.69314718f); 130 | // 131 | // return c_0_69314718 * vfasterlog2 (x); 132 | 133 | union { v4sf f; v4si i; } vx = { x }; 134 | v4sf y = v4si_to_v4sf (vx.i); 135 | y *= v4sfl (8.2629582881927490e-8f); 136 | 137 | const v4sf c_87_989971088 = v4sfl (87.989971088f); 138 | 139 | return y - c_87_989971088; 140 | } 141 | 142 | #endif // __SSE2__ 143 | 144 | #endif // __FAST_LOG_H_ 145 | -------------------------------------------------------------------------------- /fastapprox/src/fastexp.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. 
* 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. 
* 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_EXP_H_ 42 | #define __FAST_EXP_H_ 43 | 44 | #include 45 | #include "cast.h" 46 | #include "sse.h" 47 | 48 | // Underflow of exponential is common practice in numerical routines, 49 | // so handle it here. 50 | 51 | static inline float 52 | fastpow2 (float p) 53 | { 54 | float offset = (p < 0) ? 1.0f : 0.0f; 55 | float clipp = (p < -126) ? -126.0f : p; 56 | int w = clipp; 57 | float z = clipp - w + offset; 58 | union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) }; 59 | 60 | return v.f; 61 | } 62 | 63 | static inline float 64 | fastexp (float p) 65 | { 66 | return fastpow2 (1.442695040f * p); 67 | } 68 | 69 | static inline float 70 | fasterpow2 (float p) 71 | { 72 | float clipp = (p < -126) ? -126.0f : p; 73 | union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) }; 74 | return v.f; 75 | } 76 | 77 | static inline float 78 | fasterexp (float p) 79 | { 80 | return fasterpow2 (1.442695040f * p); 81 | } 82 | 83 | #ifdef __SSE2__ 84 | 85 | static inline v4sf 86 | vfastpow2 (const v4sf p) 87 | { 88 | v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f)); 89 | v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f)); 90 | v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); 91 | v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); 92 | v4si w = v4sf_to_v4si (clipp); 93 | v4sf z = clipp - v4si_to_v4sf (w) + offset; 94 | 95 | const v4sf c_121_2740838 = v4sfl (121.2740575f); 96 | const v4sf c_27_7280233 = v4sfl (27.7280233f); 97 | const v4sf c_4_84252568 = v4sfl (4.84252568f); 98 | const v4sf c_1_49012907 = v4sfl (1.49012907f); 99 | union { v4si i; v4sf f; } v = { 100 | v4sf_to_v4si ( 101 | v4sfl (1 << 23) * 102 | (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z) 103 | 
) 104 | }; 105 | 106 | return v.f; 107 | } 108 | 109 | static inline v4sf 110 | vfastexp (const v4sf p) 111 | { 112 | const v4sf c_invlog_2 = v4sfl (1.442695040f); 113 | 114 | return vfastpow2 (c_invlog_2 * p); 115 | } 116 | 117 | static inline v4sf 118 | vfasterpow2 (const v4sf p) 119 | { 120 | const v4sf c_126_94269504 = v4sfl (126.94269504f); 121 | v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); 122 | v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); 123 | union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) }; 124 | return v.f; 125 | } 126 | 127 | static inline v4sf 128 | vfasterexp (const v4sf p) 129 | { 130 | const v4sf c_invlog_2 = v4sfl (1.442695040f); 131 | 132 | return vfasterpow2 (c_invlog_2 * p); 133 | } 134 | 135 | #endif //__SSE2__ 136 | 137 | #endif // __FAST_EXP_H_ 138 | -------------------------------------------------------------------------------- /fastapprox/src/fastgamma.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. 
* 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. 
* 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_GAMMA_H_ 42 | #define __FAST_GAMMA_H_ 43 | 44 | #include 45 | #include "sse.h" 46 | #include "fastlog.h" 47 | 48 | /* gamma/digamma functions only work for positive inputs */ 49 | 50 | static inline float 51 | fastlgamma (float x) 52 | { 53 | float logterm = fastlog (x * (1.0f + x) * (2.0f + x)); 54 | float xp3 = 3.0f + x; 55 | 56 | return - 2.081061466f 57 | - x 58 | + 0.0833333f / xp3 59 | - logterm 60 | + (2.5f + x) * fastlog (xp3); 61 | } 62 | 63 | static inline float 64 | fasterlgamma (float x) 65 | { 66 | return - 0.0810614667f 67 | - x 68 | - fasterlog (x) 69 | + (0.5f + x) * fasterlog (1.0f + x); 70 | } 71 | 72 | static inline float 73 | fastdigamma (float x) 74 | { 75 | float twopx = 2.0f + x; 76 | float logterm = fastlog (twopx); 77 | 78 | return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) / 79 | (12.0f * x * (1.0f + x) * twopx * twopx) 80 | + logterm; 81 | } 82 | 83 | static inline float 84 | fasterdigamma (float x) 85 | { 86 | float onepx = 1.0f + x; 87 | 88 | return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx); 89 | } 90 | 91 | #ifdef __SSE2__ 92 | 93 | static inline v4sf 94 | vfastlgamma (v4sf x) 95 | { 96 | const v4sf c_1_0 = v4sfl (1.0f); 97 | const v4sf c_2_0 = v4sfl (2.0f); 98 | const v4sf c_3_0 = v4sfl (3.0f); 99 | const v4sf c_2_081061466 = v4sfl (2.081061466f); 100 | const v4sf c_0_0833333 = v4sfl (0.0833333f); 101 | const v4sf c_2_5 = v4sfl (2.5f); 102 | 103 | v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x)); 104 | v4sf xp3 = c_3_0 + x; 105 | 106 | return - c_2_081061466 107 | - x 108 | + c_0_0833333 / xp3 109 | - logterm 110 | + (c_2_5 + x) * vfastlog (xp3); 111 | } 112 | 113 | static inline v4sf 114 | vfasterlgamma (v4sf x) 115 | { 116 | const v4sf c_0_0810614667 = v4sfl (0.0810614667f); 117 | const v4sf c_0_5 = v4sfl (0.5f); 118 | const v4sf c_1 = v4sfl (1.0f); 119 | 120 | 
return - c_0_0810614667 121 | - x 122 | - vfasterlog (x) 123 | + (c_0_5 + x) * vfasterlog (c_1 + x); 124 | } 125 | 126 | static inline v4sf 127 | vfastdigamma (v4sf x) 128 | { 129 | v4sf twopx = v4sfl (2.0f) + x; 130 | v4sf logterm = vfastlog (twopx); 131 | 132 | return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) / 133 | (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx) 134 | + logterm; 135 | } 136 | 137 | static inline v4sf 138 | vfasterdigamma (v4sf x) 139 | { 140 | const v4sf c_1_0 = v4sfl (1.0f); 141 | const v4sf c_2_0 = v4sfl (2.0f); 142 | v4sf onepx = c_1_0 + x; 143 | 144 | return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx); 145 | } 146 | 147 | #endif //__SSE2__ 148 | 149 | #endif // __FAST_GAMMA_H_ 150 | -------------------------------------------------------------------------------- /fastapprox/src/fasterf.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. 
* 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. * 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_ERF_H_ 42 | #define __FAST_ERF_H_ 43 | 44 | #include 45 | #include 46 | #include "sse.h" 47 | #include "fastexp.h" 48 | #include "fastlog.h" 49 | 50 | // fasterfc: not actually faster than erfcf(3) on newer machines! 51 | // ... 
although vectorized version is interesting 52 | // and fastererfc is very fast 53 | 54 | static inline float 55 | fasterfc (float x) 56 | { 57 | static const float k = 3.3509633149424609f; 58 | static const float a = 0.07219054755431126f; 59 | static const float b = 15.418191568719577f; 60 | static const float c = 5.609846028328545f; 61 | 62 | union { float f; uint32_t i; } vc = { c * x }; 63 | float xsq = x * x; 64 | float xquad = xsq * xsq; 65 | 66 | vc.i |= 0x80000000; 67 | 68 | return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f); 69 | } 70 | 71 | static inline float 72 | fastererfc (float x) 73 | { 74 | static const float k = 3.3509633149424609f; 75 | 76 | return 2.0f / (1.0f + fasterpow2 (k * x)); 77 | } 78 | 79 | // fasterf: not actually faster than erff(3) on newer machines! 80 | // ... although vectorized version is interesting 81 | // and fastererf is very fast 82 | 83 | static inline float 84 | fasterf (float x) 85 | { 86 | return 1.0f - fasterfc (x); 87 | } 88 | 89 | static inline float 90 | fastererf (float x) 91 | { 92 | return 1.0f - fastererfc (x); 93 | } 94 | 95 | static inline float 96 | fastinverseerf (float x) 97 | { 98 | static const float invk = 0.30004578719350504f; 99 | static const float a = 0.020287853348211326f; 100 | static const float b = 0.07236892874789555f; 101 | static const float c = 0.9913030456864257f; 102 | static const float d = 0.8059775923760193f; 103 | 104 | float xsq = x * x; 105 | 106 | return invk * fastlog2 ((1.0f + x) / (1.0f - x)) 107 | + x * (a - b * xsq) / (c - d * xsq); 108 | } 109 | 110 | static inline float 111 | fasterinverseerf (float x) 112 | { 113 | static const float invk = 0.30004578719350504f; 114 | 115 | return invk * fasterlog2 ((1.0f + x) / (1.0f - x)); 116 | } 117 | 118 | #ifdef __SSE2__ 119 | 120 | static inline v4sf 121 | vfasterfc (v4sf x) 122 | { 123 | const v4sf k = v4sfl (3.3509633149424609f); 124 | const v4sf a = v4sfl (0.07219054755431126f); 125 | const v4sf b 
= v4sfl (15.418191568719577f); 126 | const v4sf c = v4sfl (5.609846028328545f); 127 | 128 | union { v4sf f; v4si i; } vc; vc.f = c * x; 129 | vc.i |= v4sil (0x80000000); 130 | 131 | v4sf xsq = x * x; 132 | v4sf xquad = xsq * xsq; 133 | 134 | return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f); 135 | } 136 | 137 | static inline v4sf 138 | vfastererfc (const v4sf x) 139 | { 140 | const v4sf k = v4sfl (3.3509633149424609f); 141 | 142 | return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x)); 143 | } 144 | 145 | static inline v4sf 146 | vfasterf (v4sf x) 147 | { 148 | return v4sfl (1.0f) - vfasterfc (x); 149 | } 150 | 151 | static inline v4sf 152 | vfastererf (const v4sf x) 153 | { 154 | return v4sfl (1.0f) - vfastererfc (x); 155 | } 156 | 157 | static inline v4sf 158 | vfastinverseerf (v4sf x) 159 | { 160 | const v4sf invk = v4sfl (0.30004578719350504f); 161 | const v4sf a = v4sfl (0.020287853348211326f); 162 | const v4sf b = v4sfl (0.07236892874789555f); 163 | const v4sf c = v4sfl (0.9913030456864257f); 164 | const v4sf d = v4sfl (0.8059775923760193f); 165 | 166 | v4sf xsq = x * x; 167 | 168 | return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) 169 | + x * (a - b * xsq) / (c - d * xsq); 170 | } 171 | 172 | static inline v4sf 173 | vfasterinverseerf (v4sf x) 174 | { 175 | const v4sf invk = v4sfl (0.30004578719350504f); 176 | 177 | return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)); 178 | } 179 | 180 | #endif //__SSE2__ 181 | 182 | #endif // __FAST_ERF_H_ 183 | -------------------------------------------------------------------------------- /fastapprox/src/sse.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. 
* 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. 
* 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __SSE_H_ 42 | #define __SSE_H_ 43 | 44 | #ifdef __SSE2__ 45 | 46 | #include 47 | 48 | #ifdef __cplusplus 49 | namespace { 50 | #endif // __cplusplus 51 | 52 | typedef __m128 v4sf; 53 | typedef __m128i v4si; 54 | 55 | #define v4si_to_v4sf _mm_cvtepi32_ps 56 | #define v4sf_to_v4si _mm_cvttps_epi32 57 | 58 | #if _MSC_VER && !__INTEL_COMPILER 59 | template 60 | __forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; } 61 | 62 | #define AS_4CHARS(a) \ 63 | GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \ 64 | GetChar(int32_t(a), 2), GetChar(int32_t(a), 3) 65 | 66 | #define _MM_SETR_EPI32(a0, a1, a2, a3) \ 67 | { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) } 68 | 69 | #define v4sfl(x) (const v4sf { (x), (x), (x), (x) }) 70 | #define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x)) 71 | 72 | __forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); } 73 | __forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); } 74 | __forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); } 75 | __forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); } 76 | 77 | __forceinline const v4sf operator+(const v4sf& a) { return a; } 78 | __forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } 79 | 80 | __forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); } 81 | __forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); } 82 | __forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); } 83 | 84 | __forceinline const v4si operator&(const v4si& a, const v4si& b) { return _mm_and_si128(a,b); } 85 | __forceinline const v4si 
operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); } 86 | __forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); } 87 | 88 | __forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; } 89 | __forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; } 90 | __forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; } 91 | __forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; } 92 | 93 | __forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; } 94 | __forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; } 95 | __forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; } 96 | #else 97 | #define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) }) 98 | #define v2dil(x) ((const v4si) { (x), (x) }) 99 | #define v4sil(x) v2dil((((long long) (x)) << 32) | (long long) (x)) 100 | #endif 101 | 102 | typedef union { v4sf f; float array[4]; } v4sfindexer; 103 | #define v4sf_index(_findx, _findi) \ 104 | ({ \ 105 | v4sfindexer _findvx = { _findx } ; \ 106 | _findvx.array[_findi]; \ 107 | }) 108 | typedef union { v4si i; int array[4]; } v4siindexer; 109 | #define v4si_index(_iindx, _iindi) \ 110 | ({ \ 111 | v4siindexer _iindvx = { _iindx } ; \ 112 | _iindvx.array[_iindi]; \ 113 | }) 114 | 115 | typedef union { v4sf f; v4si i; } v4sfv4sipun; 116 | #if _MSC_VER && !__INTEL_COMPILER 117 | #define v4sf_fabs(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) 118 | #else 119 | #define v4sf_fabs(x) \ 120 | ({ \ 121 | v4sfv4sipun vx; \ 122 | vx.f = x; \ 123 | vx.i &= v4sil (0x7FFFFFFF); \ 124 | vx.f; \ 125 | }) 126 | #endif 127 | 128 | #ifdef __cplusplus 129 | } // end namespace 130 | #endif // __cplusplus 131 | 132 | #endif // __SSE2__ 133 | 134 | #endif // __SSE_H_ 135 | -------------------------------------------------------------------------------- 
/fastapprox/src/fastlambertw.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. 
* 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_LAMBERT_W_H_ 42 | #define __FAST_LAMBERT_W_H_ 43 | 44 | #include 45 | #include "fastexp.h" 46 | #include "fastlog.h" 47 | #include "sse.h" 48 | 49 | // these functions compute the upper branch aka W_0 50 | 51 | static inline float 52 | fastlambertw (float x) 53 | { 54 | static const float threshold = 2.26445f; 55 | 56 | float c = (x < threshold) ? 1.546865557f : 1.0f; 57 | float d = (x < threshold) ? 2.250366841f : 0.0f; 58 | float a = (x < threshold) ? -0.737769969f : 0.0f; 59 | 60 | float logterm = fastlog (c * x + d); 61 | float loglogterm = fastlog (logterm); 62 | 63 | float minusw = -a - logterm + loglogterm - loglogterm / logterm; 64 | float expminusw = fastexp (minusw); 65 | float xexpminusw = x * expminusw; 66 | float pexpminusw = xexpminusw - minusw; 67 | 68 | return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) / 69 | (2.0f + pexpminusw * (2.0f - minusw)); 70 | } 71 | 72 | static inline float 73 | fasterlambertw (float x) 74 | { 75 | static const float threshold = 2.26445f; 76 | 77 | float c = (x < threshold) ? 1.546865557f : 1.0f; 78 | float d = (x < threshold) ? 2.250366841f : 0.0f; 79 | float a = (x < threshold) ? -0.737769969f : 0.0f; 80 | 81 | float logterm = fasterlog (c * x + d); 82 | float loglogterm = fasterlog (logterm); 83 | 84 | float w = a + logterm - loglogterm + loglogterm / logterm; 85 | float expw = fasterexp (-w); 86 | 87 | return (w * w + expw * x) / (1.0f + w); 88 | } 89 | 90 | static inline float 91 | fastlambertwexpx (float x) 92 | { 93 | static const float k = 1.1765631309f; 94 | static const float a = 0.94537622168f; 95 | 96 | float logarg = fmaxf (x, k); 97 | float powarg = (x < k) ? 
a * (x - k) : 0; 98 | 99 | float logterm = fastlog (logarg); 100 | float powterm = fasterpow2 (powarg); // don't need accuracy here 101 | 102 | float w = powterm * (logarg - logterm + logterm / logarg); 103 | float logw = fastlog (w); 104 | float p = x - logw; 105 | 106 | return w * (2.0f + p + w * (3.0f + 2.0f * p)) / 107 | (2.0f - p + w * (5.0f + 2.0f * w)); 108 | } 109 | 110 | static inline float 111 | fasterlambertwexpx (float x) 112 | { 113 | static const float k = 1.1765631309f; 114 | static const float a = 0.94537622168f; 115 | 116 | float logarg = fmaxf (x, k); 117 | float powarg = (x < k) ? a * (x - k) : 0; 118 | 119 | float logterm = fasterlog (logarg); 120 | float powterm = fasterpow2 (powarg); 121 | 122 | float w = powterm * (logarg - logterm + logterm / logarg); 123 | float logw = fasterlog (w); 124 | 125 | return w * (1.0f + x - logw) / (1.0f + w); 126 | } 127 | 128 | #ifdef __SSE2__ 129 | 130 | static inline v4sf 131 | vfastlambertw (v4sf x) 132 | { 133 | const v4sf threshold = v4sfl (2.26445f); 134 | 135 | v4sf under = _mm_cmplt_ps (x, threshold); 136 | v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), 137 | _mm_andnot_ps (under, v4sfl (1.0f))); 138 | v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); 139 | v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); 140 | 141 | v4sf logterm = vfastlog (c * x + d); 142 | v4sf loglogterm = vfastlog (logterm); 143 | 144 | v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm; 145 | v4sf expminusw = vfastexp (minusw); 146 | v4sf xexpminusw = x * expminusw; 147 | v4sf pexpminusw = xexpminusw - minusw; 148 | 149 | return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / 150 | (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw)); 151 | } 152 | 153 | static inline v4sf 154 | vfasterlambertw (v4sf x) 155 | { 156 | const v4sf threshold = v4sfl (2.26445f); 157 | 158 | v4sf under = _mm_cmplt_ps (x, threshold); 159 | v4sf c = _mm_or_ps (_mm_and_ps (under, 
v4sfl (1.546865557f)), 160 | _mm_andnot_ps (under, v4sfl (1.0f))); 161 | v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); 162 | v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); 163 | 164 | v4sf logterm = vfasterlog (c * x + d); 165 | v4sf loglogterm = vfasterlog (logterm); 166 | 167 | v4sf w = a + logterm - loglogterm + loglogterm / logterm; 168 | v4sf expw = vfasterexp (-w); 169 | 170 | return (w * w + expw * x) / (v4sfl (1.0f) + w); 171 | } 172 | 173 | static inline v4sf 174 | vfastlambertwexpx (v4sf x) 175 | { 176 | const v4sf k = v4sfl (1.1765631309f); 177 | const v4sf a = v4sfl (0.94537622168f); 178 | const v4sf two = v4sfl (2.0f); 179 | const v4sf three = v4sfl (3.0f); 180 | const v4sf five = v4sfl (5.0f); 181 | 182 | v4sf logarg = _mm_max_ps (x, k); 183 | v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); 184 | 185 | v4sf logterm = vfastlog (logarg); 186 | v4sf powterm = vfasterpow2 (powarg); // don't need accuracy here 187 | 188 | v4sf w = powterm * (logarg - logterm + logterm / logarg); 189 | v4sf logw = vfastlog (w); 190 | v4sf p = x - logw; 191 | 192 | return w * (two + p + w * (three + two * p)) / 193 | (two - p + w * (five + two * w)); 194 | } 195 | 196 | static inline v4sf 197 | vfasterlambertwexpx (v4sf x) 198 | { 199 | const v4sf k = v4sfl (1.1765631309f); 200 | const v4sf a = v4sfl (0.94537622168f); 201 | 202 | v4sf logarg = _mm_max_ps (x, k); 203 | v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); 204 | 205 | v4sf logterm = vfasterlog (logarg); 206 | v4sf powterm = vfasterpow2 (powarg); 207 | 208 | v4sf w = powterm * (logarg - logterm + logterm / logarg); 209 | v4sf logw = vfasterlog (w); 210 | 211 | return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w); 212 | } 213 | 214 | #endif // __SSE2__ 215 | 216 | #endif // __FAST_LAMBERT_W_H_ 217 | -------------------------------------------------------------------------------- /fastapprox/src/fasttrig.h: 
-------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2011 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. * 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. 
* 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __FAST_TRIG_H_ 42 | #define __FAST_TRIG_H_ 43 | 44 | #include 45 | #include "sse.h" 46 | 47 | // http://www.devmaster.net/forums/showthread.php?t=5784 48 | // fast sine variants are for x \in [ -\pi, pi ] 49 | // fast cosine variants are for x \in [ -\pi, pi ] 50 | // fast tangent variants are for x \in [ -\pi / 2, pi / 2 ] 51 | // "full" versions of functions handle the entire range of inputs 52 | // although the range reduction technique used here will be hopelessly 53 | // inaccurate for |x| >> 1000 54 | // 55 | // WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than 56 | // libc calls on older machines (!) and on newer machines are only 57 | // slighly faster. however: 58 | // * vectorized versions are competitive 59 | // * faster full versions are competitive 60 | 61 | static inline float 62 | fastsin (float x) 63 | { 64 | static const float fouroverpi = 1.2732395447351627f; 65 | static const float fouroverpisq = 0.40528473456935109f; 66 | static const float q = 0.78444488374548933f; 67 | union { float f; uint32_t i; } p = { 0.20363937680730309f }; 68 | union { float f; uint32_t i; } r = { 0.015124940802184233f }; 69 | union { float f; uint32_t i; } s = { -0.0032225901625579573f }; 70 | 71 | union { float f; uint32_t i; } vx = { x }; 72 | uint32_t sign = vx.i & 0x80000000; 73 | vx.i = vx.i & 0x7FFFFFFF; 74 | 75 | float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 76 | float qpproxsq = qpprox * qpprox; 77 | 78 | p.i |= sign; 79 | r.i |= sign; 80 | s.i ^= sign; 81 | 82 | return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f)); 83 | } 84 | 85 | static inline float 86 | fastersin (float x) 87 | { 88 | static const float fouroverpi = 1.2732395447351627f; 89 | static const float fouroverpisq = 0.40528473456935109f; 90 | static const float q = 0.77633023248007499f; 91 | union { float f; 
uint32_t i; } p = { 0.22308510060189463f }; 92 | 93 | union { float f; uint32_t i; } vx = { x }; 94 | uint32_t sign = vx.i & 0x80000000; 95 | vx.i &= 0x7FFFFFFF; 96 | 97 | float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 98 | 99 | p.i |= sign; 100 | 101 | return qpprox * (q + p.f * qpprox); 102 | } 103 | 104 | static inline float 105 | fastsinfull (float x) 106 | { 107 | static const float twopi = 6.2831853071795865f; 108 | static const float invtwopi = 0.15915494309189534f; 109 | 110 | int k = x * invtwopi; 111 | float half = (x < 0) ? -0.5f : 0.5f; 112 | return fastsin ((half + k) * twopi - x); 113 | } 114 | 115 | static inline float 116 | fastersinfull (float x) 117 | { 118 | static const float twopi = 6.2831853071795865f; 119 | static const float invtwopi = 0.15915494309189534f; 120 | 121 | int k = x * invtwopi; 122 | float half = (x < 0) ? -0.5f : 0.5f; 123 | return fastersin ((half + k) * twopi - x); 124 | } 125 | 126 | static inline float 127 | fastcos (float x) 128 | { 129 | static const float halfpi = 1.5707963267948966f; 130 | static const float halfpiminustwopi = -4.7123889803846899f; 131 | float offset = (x > halfpi) ? 
halfpiminustwopi : halfpi; 132 | return fastsin (x + offset); 133 | } 134 | 135 | static inline float 136 | fastercos (float x) 137 | { 138 | static const float twooverpi = 0.63661977236758134f; 139 | static const float p = 0.54641335845679634f; 140 | 141 | union { float f; uint32_t i; } vx = { x }; 142 | vx.i &= 0x7FFFFFFF; 143 | 144 | float qpprox = 1.0f - twooverpi * vx.f; 145 | 146 | return qpprox + p * qpprox * (1.0f - qpprox * qpprox); 147 | } 148 | 149 | static inline float 150 | fastcosfull (float x) 151 | { 152 | static const float halfpi = 1.5707963267948966f; 153 | return fastsinfull (x + halfpi); 154 | } 155 | 156 | static inline float 157 | fastercosfull (float x) 158 | { 159 | static const float halfpi = 1.5707963267948966f; 160 | return fastersinfull (x + halfpi); 161 | } 162 | 163 | static inline float 164 | fasttan (float x) 165 | { 166 | static const float halfpi = 1.5707963267948966f; 167 | return fastsin (x) / fastsin (x + halfpi); 168 | } 169 | 170 | static inline float 171 | fastertan (float x) 172 | { 173 | return fastersin (x) / fastercos (x); 174 | } 175 | 176 | static inline float 177 | fasttanfull (float x) 178 | { 179 | static const float twopi = 6.2831853071795865f; 180 | static const float invtwopi = 0.15915494309189534f; 181 | 182 | int k = x * invtwopi; 183 | float half = (x < 0) ? -0.5f : 0.5f; 184 | float xnew = x - (half + k) * twopi; 185 | 186 | return fastsin (xnew) / fastcos (xnew); 187 | } 188 | 189 | static inline float 190 | fastertanfull (float x) 191 | { 192 | static const float twopi = 6.2831853071795865f; 193 | static const float invtwopi = 0.15915494309189534f; 194 | 195 | int k = x * invtwopi; 196 | float half = (x < 0) ? 
-0.5f : 0.5f; 197 | float xnew = x - (half + k) * twopi; 198 | 199 | return fastersin (xnew) / fastercos (xnew); 200 | } 201 | 202 | #ifdef __SSE2__ 203 | 204 | static inline v4sf 205 | vfastsin (const v4sf x) 206 | { 207 | const v4sf fouroverpi = v4sfl (1.2732395447351627f); 208 | const v4sf fouroverpisq = v4sfl (0.40528473456935109f); 209 | const v4sf q = v4sfl (0.78444488374548933f); 210 | const v4sf p = v4sfl (0.20363937680730309f); 211 | const v4sf r = v4sfl (0.015124940802184233f); 212 | const v4sf s = v4sfl (-0.0032225901625579573f); 213 | 214 | union { v4sf f; v4si i; } vx = { x }; 215 | v4si sign = vx.i & v4sil (0x80000000); 216 | vx.i &= v4sil (0x7FFFFFFF); 217 | 218 | v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 219 | v4sf qpproxsq = qpprox * qpprox; 220 | union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s)); 221 | vy.i ^= sign; 222 | 223 | return q * qpprox + vy.f; 224 | } 225 | 226 | static inline v4sf 227 | vfastersin (const v4sf x) 228 | { 229 | const v4sf fouroverpi = v4sfl (1.2732395447351627f); 230 | const v4sf fouroverpisq = v4sfl (0.40528473456935109f); 231 | const v4sf q = v4sfl (0.77633023248007499f); 232 | const v4sf plit = v4sfl (0.22308510060189463f); 233 | union { v4sf f; v4si i; } p = { plit }; 234 | 235 | union { v4sf f; v4si i; } vx = { x }; 236 | v4si sign = vx.i & v4sil (0x80000000); 237 | vx.i &= v4sil (0x7FFFFFFF); 238 | 239 | v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 240 | 241 | p.i |= sign; 242 | 243 | return qpprox * (q + p.f * qpprox); 244 | } 245 | 246 | static inline v4sf 247 | vfastsinfull (const v4sf x) 248 | { 249 | const v4sf twopi = v4sfl (6.2831853071795865f); 250 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 251 | 252 | v4si k = v4sf_to_v4si (x * invtwopi); 253 | 254 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 255 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 256 | _mm_andnot_ps (ltzero, v4sfl (0.5f))); 257 | 258 | return vfastsin ((half 
+ v4si_to_v4sf (k)) * twopi - x); 259 | } 260 | 261 | static inline v4sf 262 | vfastersinfull (const v4sf x) 263 | { 264 | const v4sf twopi = v4sfl (6.2831853071795865f); 265 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 266 | 267 | v4si k = v4sf_to_v4si (x * invtwopi); 268 | 269 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 270 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 271 | _mm_andnot_ps (ltzero, v4sfl (0.5f))); 272 | 273 | return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x); 274 | } 275 | 276 | static inline v4sf 277 | vfastcos (const v4sf x) 278 | { 279 | const v4sf halfpi = v4sfl (1.5707963267948966f); 280 | const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f); 281 | v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi); 282 | v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi), 283 | _mm_andnot_ps (lthalfpi, halfpi)); 284 | return vfastsin (x + offset); 285 | } 286 | 287 | static inline v4sf 288 | vfastercos (v4sf x) 289 | { 290 | const v4sf twooverpi = v4sfl (0.63661977236758134f); 291 | const v4sf p = v4sfl (0.54641335845679634); 292 | 293 | v4sf vx = v4sf_fabs (x); 294 | v4sf qpprox = v4sfl (1.0f) - twooverpi * vx; 295 | 296 | return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox); 297 | } 298 | 299 | static inline v4sf 300 | vfastcosfull (const v4sf x) 301 | { 302 | const v4sf halfpi = v4sfl (1.5707963267948966f); 303 | return vfastsinfull (x + halfpi); 304 | } 305 | 306 | static inline v4sf 307 | vfastercosfull (const v4sf x) 308 | { 309 | const v4sf halfpi = v4sfl (1.5707963267948966f); 310 | return vfastersinfull (x + halfpi); 311 | } 312 | 313 | static inline v4sf 314 | vfasttan (const v4sf x) 315 | { 316 | const v4sf halfpi = v4sfl (1.5707963267948966f); 317 | return vfastsin (x) / vfastsin (x + halfpi); 318 | } 319 | 320 | static inline v4sf 321 | vfastertan (const v4sf x) 322 | { 323 | return vfastersin (x) / vfastercos (x); 324 | } 325 | 326 | static inline v4sf 327 | vfasttanfull (const v4sf x) 328 
| { 329 | const v4sf twopi = v4sfl (6.2831853071795865f); 330 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 331 | 332 | v4si k = v4sf_to_v4si (x * invtwopi); 333 | 334 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 335 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 336 | _mm_andnot_ps (ltzero, v4sfl (0.5f))); 337 | v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; 338 | 339 | return vfastsin (xnew) / vfastcos (xnew); 340 | } 341 | 342 | static inline v4sf 343 | vfastertanfull (const v4sf x) 344 | { 345 | const v4sf twopi = v4sfl (6.2831853071795865f); 346 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 347 | 348 | v4si k = v4sf_to_v4si (x * invtwopi); 349 | 350 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 351 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 352 | _mm_andnot_ps (ltzero, v4sfl (0.5f))); 353 | v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; 354 | 355 | return vfastersin (xnew) / vfastercos (xnew); 356 | } 357 | 358 | #endif //__SSE2__ 359 | 360 | #endif // __FAST_TRIG_H_ 361 | -------------------------------------------------------------------------------- /fastapprox/tests/testmacros.h: -------------------------------------------------------------------------------- 1 | #define test_scalar(estf, exactf, gen, maxerr, ntime) \ 2 | static void \ 3 | test_ ## estf ## _once (double* erracc, \ 4 | float* max, \ 5 | float* argmax) \ 6 | { \ 7 | float x = (gen); \ 8 | float exact = exactf (x); \ 9 | float est = estf (x); \ 10 | float err = fabsf (est - exact) / \ 11 | (fabsf (1e-4f) + fabsf (est) + fabsf (exact));\ 12 | \ 13 | if (err > *max) { *max = err; *argmax = x; } \ 14 | *erracc += err; \ 15 | } \ 16 | \ 17 | static void \ 18 | test_ ## estf (void) \ 19 | { \ 20 | unsigned int i; \ 21 | double err = 0; \ 22 | float argmax = 0; \ 23 | float max = 0; \ 24 | \ 25 | for (i = 0; i < 100000; ++i) \ 26 | { \ 27 | test_ ## estf ## _once (&err, &max, &argmax); \ 28 | } \ 29 | \ 30 | err /= i; \ 31 | \ 32 | 
fprintf (stderr, \ 33 | "%s average relative error = %g\n", \ 34 | #estf, \ 35 | err); \ 36 | fprintf (stderr, \ 37 | "%s max relative error (at %g) = %g\n", \ 38 | #estf, \ 39 | argmax, \ 40 | max); \ 41 | assert (err < maxerr); \ 42 | } \ 43 | \ 44 | static void \ 45 | time_ ## estf (void) \ 46 | { \ 47 | struct timeval start; \ 48 | struct timeval end; \ 49 | unsigned int i; \ 50 | float sum = 0; \ 51 | volatile float xd = 1.0f; \ 52 | \ 53 | gettimeofday (&start, NULL); \ 54 | \ 55 | for (i = 0; i < ntime; ++i) \ 56 | { \ 57 | sum += estf (xd); \ 58 | } \ 59 | \ 60 | gettimeofday (&end, NULL); \ 61 | \ 62 | fprintf (stderr, \ 63 | "%g\r%s million calls per second = %g\n", \ 64 | sum, \ 65 | #estf, \ 66 | ((double) i) / \ 67 | (1e+6 * (double) end.tv_sec \ 68 | - 1e+6 * (double) start.tv_sec \ 69 | + (double) end.tv_usec \ 70 | - (double) start.tv_usec)); \ 71 | } 72 | 73 | #define test_scalar2(estf, exactf, genx, geny, maxerr, ntime) \ 74 | static void \ 75 | test_ ## estf ## _once (double* erracc, \ 76 | float* max, \ 77 | float* argmaxx, \ 78 | float* argmaxy) \ 79 | { \ 80 | float x = (genx); /* ah ... the generation gap ... 
*/ \ 81 | float y = (geny); \ 82 | float exact = exactf (x, y); \ 83 | float est = estf (x, y); \ 84 | float err = fabsf (est - exact) / \ 85 | (fabsf (1e-4f) + fabsf (est) + fabsf (exact));\ 86 | \ 87 | if (err > *max) { *max = err; *argmaxx = x; *argmaxy = y; } \ 88 | *erracc += err; \ 89 | } \ 90 | \ 91 | static void \ 92 | test_ ## estf (void) \ 93 | { \ 94 | unsigned int i; \ 95 | double err = 0; \ 96 | float argmaxx = 0; \ 97 | float argmaxy = 0; \ 98 | float max = 0; \ 99 | \ 100 | for (i = 0; i < 100000; ++i) \ 101 | { \ 102 | test_ ## estf ## _once (&err, &max, &argmaxx, &argmaxy); \ 103 | } \ 104 | \ 105 | err /= i; \ 106 | \ 107 | fprintf (stderr, \ 108 | "%s average relative error = %g\n", \ 109 | #estf, \ 110 | err); \ 111 | fprintf (stderr, \ 112 | "%s max relative error (at %g, %g) = %g\n", \ 113 | #estf, \ 114 | argmaxx, \ 115 | argmaxy, \ 116 | max); \ 117 | assert (err < maxerr); \ 118 | } \ 119 | \ 120 | static void \ 121 | time_ ## estf (void) \ 122 | { \ 123 | struct timeval start; \ 124 | struct timeval end; \ 125 | unsigned int i; \ 126 | float sum = 0; \ 127 | volatile float xd = 1.0f; \ 128 | volatile float yd = 1.0f; \ 129 | \ 130 | gettimeofday (&start, NULL); \ 131 | \ 132 | for (i = 0; i < ntime; ++i) \ 133 | { \ 134 | sum += estf (xd, yd); \ 135 | } \ 136 | \ 137 | gettimeofday (&end, NULL); \ 138 | \ 139 | fprintf (stderr, \ 140 | "%g\r%s million calls per second = %g\n", \ 141 | sum, \ 142 | #estf, \ 143 | ((double) i) / \ 144 | (1e+6 * (double) end.tv_sec \ 145 | - 1e+6 * (double) start.tv_sec \ 146 | + (double) end.tv_usec \ 147 | - (double) start.tv_usec)); \ 148 | } 149 | 150 | #ifdef __SSE2__ 151 | 152 | #define test_vector(estf, exactf, gen, maxerr, ntime) \ 153 | static void \ 154 | test_ ## estf ## _once (double* erracc, \ 155 | float* max, \ 156 | float* argmax) \ 157 | { \ 158 | v4sf x = v4sfl ((float) (gen)); \ 159 | v4sf exact = (v4sf) { exactf (v4sf_index (x, 0)), \ 160 | exactf (v4sf_index (x, 1)), \ 161 | exactf 
(v4sf_index (x, 2)), \ 162 | exactf (v4sf_index (x, 3)) }; \ 163 | v4sf est = estf (x); \ 164 | v4sf err = v4sf_fabs (est - exact) / \ 165 | (v4sfl (1e-4) + v4sf_fabs (est) + v4sf_fabs (exact)); \ 166 | \ 167 | unsigned int k; \ 168 | for (k = 0; k < 4; ++k) \ 169 | { \ 170 | if (v4sf_index (err, k) > *max) \ 171 | { \ 172 | *max = v4sf_index (err, k); \ 173 | *argmax = v4sf_index (x, k); \ 174 | } \ 175 | \ 176 | *erracc += v4sf_index (err, k); \ 177 | } \ 178 | } \ 179 | \ 180 | static void \ 181 | test_ ## estf (void) \ 182 | { \ 183 | unsigned int i; \ 184 | double err = 0; \ 185 | float argmax = 0; \ 186 | float max = 0; \ 187 | \ 188 | for (i = 0; i < 100000; ++i) \ 189 | { \ 190 | test_ ## estf ## _once (&err, &max, &argmax); \ 191 | } \ 192 | \ 193 | err /= (4.0f * i); \ 194 | \ 195 | fprintf (stderr, \ 196 | "%s average relative error = %g\n", \ 197 | #estf, \ 198 | err); \ 199 | fprintf (stderr, \ 200 | "%s max relative error (at %g) = %g\n", \ 201 | #estf, \ 202 | argmax, \ 203 | max); \ 204 | assert (err < maxerr); \ 205 | } \ 206 | \ 207 | static void \ 208 | time_ ## estf (void) \ 209 | { \ 210 | struct timeval start; \ 211 | struct timeval end; \ 212 | unsigned int i; \ 213 | v4sf sum = v4sfl (0.0f); \ 214 | volatile v4sf xd = v4sfl (1.0f); \ 215 | \ 216 | gettimeofday (&start, NULL); \ 217 | \ 218 | for (i = 0; i < ntime; ++i) \ 219 | { \ 220 | sum += estf (xd); \ 221 | } \ 222 | \ 223 | gettimeofday (&end, NULL); \ 224 | \ 225 | fprintf (stderr, \ 226 | "%g\r%s million calls per second = %g\n", \ 227 | v4sf_index (sum, 0), \ 228 | #estf, \ 229 | ((double) i) / \ 230 | (1e+6 * (double) end.tv_sec \ 231 | - 1e+6 * (double) start.tv_sec \ 232 | + (double) end.tv_usec \ 233 | - (double) start.tv_usec)); \ 234 | } 235 | 236 | #define test_vector2(estf, exactf, genx, geny, maxerr, ntime) \ 237 | static void \ 238 | test_ ## estf ## _once (double* erracc, \ 239 | float* max, \ 240 | float* argmaxx, \ 241 | float* argmaxy) \ 242 | { \ 243 | v4sf x = v4sfl 
((float) (genx)); \ 244 | v4sf y = v4sfl ((float) (geny)); \ 245 | v4sf exact = (v4sf) { exactf (v4sf_index (x, 0), \ 246 | v4sf_index (y, 0)), \ 247 | exactf (v4sf_index (x, 1), \ 248 | v4sf_index (y, 1)), \ 249 | exactf (v4sf_index (x, 2), \ 250 | v4sf_index (y, 2)), \ 251 | exactf (v4sf_index (x, 3), \ 252 | v4sf_index (y, 3)) }; \ 253 | v4sf est = estf (x, y); \ 254 | v4sf err = v4sf_fabs (est - exact) / \ 255 | (v4sfl (1e-4) + v4sf_fabs (est) + v4sf_fabs (exact)); \ 256 | \ 257 | unsigned int k; \ 258 | for (k = 0; k < 4; ++k) \ 259 | { \ 260 | if (v4sf_index (err, k) > *max) \ 261 | { \ 262 | *max = v4sf_index (err, k); \ 263 | *argmaxx = v4sf_index (x, k); \ 264 | *argmaxy = v4sf_index (y, k); \ 265 | } \ 266 | \ 267 | *erracc += v4sf_index (err, k); \ 268 | } \ 269 | } \ 270 | \ 271 | static void \ 272 | test_ ## estf (void) \ 273 | { \ 274 | unsigned int i; \ 275 | double err = 0; \ 276 | float argmaxx = 0; \ 277 | float argmaxy = 0; \ 278 | float max = 0; \ 279 | \ 280 | for (i = 0; i < 100000; ++i) \ 281 | { \ 282 | test_ ## estf ## _once (&err, &max, &argmaxx, &argmaxy); \ 283 | } \ 284 | \ 285 | err /= (4.0f * i); \ 286 | \ 287 | fprintf (stderr, \ 288 | "%s average relative error = %g\n", \ 289 | #estf, \ 290 | err); \ 291 | fprintf (stderr, \ 292 | "%s max relative error (at %g, %g) = %g\n", \ 293 | #estf, \ 294 | argmaxx, \ 295 | argmaxy, \ 296 | max); \ 297 | assert (err < maxerr); \ 298 | } \ 299 | \ 300 | static void \ 301 | time_ ## estf (void) \ 302 | { \ 303 | struct timeval start; \ 304 | struct timeval end; \ 305 | unsigned int i; \ 306 | v4sf sum = v4sfl (0.0f); \ 307 | volatile v4sf xd = v4sfl (1.0f); \ 308 | volatile v4sf yd = v4sfl (1.0f); \ 309 | \ 310 | gettimeofday (&start, NULL); \ 311 | \ 312 | for (i = 0; i < ntime; ++i) \ 313 | { \ 314 | sum += estf (xd, yd); \ 315 | } \ 316 | \ 317 | gettimeofday (&end, NULL); \ 318 | \ 319 | fprintf (stderr, \ 320 | "%g\r%s million calls per second = %g\n", \ 321 | v4sf_index (sum, 0), \ 322 | 
#estf, \ 323 | ((double) i) / \ 324 | (1e+6 * (double) end.tv_sec \ 325 | - 1e+6 * (double) start.tv_sec \ 326 | + (double) end.tv_usec \ 327 | - (double) start.tv_usec)); \ 328 | } 329 | 330 | #else // __SSE2__ 331 | 332 | #define test_vector(estf, exactf, gen, maxerr, ntime) \ 333 | static void \ 334 | test_ ## estf (void) \ 335 | { \ 336 | fprintf (stderr, "%s test SKIPPED (no SSE support)\n", \ 337 | #estf); \ 338 | } \ 339 | \ 340 | static void \ 341 | time_ ## estf (void) \ 342 | { \ 343 | } 344 | 345 | #define test_vector2(estf, exactf, genx, geny, maxerr, ntime) \ 346 | static void \ 347 | test_ ## estf (void) \ 348 | { \ 349 | fprintf (stderr, "%s test SKIPPED (no SSE support)\n", \ 350 | #estf); \ 351 | } \ 352 | \ 353 | static void \ 354 | time_ ## estf (void) \ 355 | { \ 356 | } 357 | 358 | #endif // __SSE2__ 359 | -------------------------------------------------------------------------------- /fastapprox/src/fastonebigheader.h: -------------------------------------------------------------------------------- 1 | /*=====================================================================* 2 | * Copyright (C) 2012 Paul Mineiro * 3 | * All rights reserved. * 4 | * * 5 | * Redistribution and use in source and binary forms, with * 6 | * or without modification, are permitted provided that the * 7 | * following conditions are met: * 8 | * * 9 | * * Redistributions of source code must retain the * 10 | * above copyright notice, this list of conditions and * 11 | * the following disclaimer. * 12 | * * 13 | * * Redistributions in binary form must reproduce the * 14 | * above copyright notice, this list of conditions and * 15 | * the following disclaimer in the documentation and/or * 16 | * other materials provided with the distribution. * 17 | * * 18 | * * Neither the name of Paul Mineiro nor the names * 19 | * of other contributors may be used to endorse or promote * 20 | * products derived from this software without specific * 21 | * prior written permission. 
* 22 | * * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 24 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 25 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 26 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 28 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 29 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 30 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 32 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 33 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 34 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 36 | * POSSIBILITY OF SUCH DAMAGE. * 37 | * * 38 | * Contact: Paul Mineiro * 39 | *=====================================================================*/ 40 | 41 | #ifndef __CAST_H_ 42 | 43 | #ifdef __cplusplus 44 | #define cast_uint32_t static_cast 45 | #else 46 | #define cast_uint32_t (uint32_t) 47 | #endif 48 | 49 | #endif // __CAST_H_ 50 | /*=====================================================================* 51 | * Copyright (C) 2011 Paul Mineiro * 52 | * All rights reserved. * 53 | * * 54 | * Redistribution and use in source and binary forms, with * 55 | * or without modification, are permitted provided that the * 56 | * following conditions are met: * 57 | * * 58 | * * Redistributions of source code must retain the * 59 | * above copyright notice, this list of conditions and * 60 | * the following disclaimer. * 61 | * * 62 | * * Redistributions in binary form must reproduce the * 63 | * above copyright notice, this list of conditions and * 64 | * the following disclaimer in the documentation and/or * 65 | * other materials provided with the distribution. 
* 66 | * * 67 | * * Neither the name of Paul Mineiro nor the names * 68 | * of other contributors may be used to endorse or promote * 69 | * products derived from this software without specific * 70 | * prior written permission. * 71 | * * 72 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 73 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 74 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 75 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 76 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 77 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 78 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 79 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 80 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 81 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 82 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 83 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 84 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 85 | * POSSIBILITY OF SUCH DAMAGE. 
* 86 | * * 87 | * Contact: Paul Mineiro * 88 | *=====================================================================*/ 89 | 90 | #ifndef __SSE_H_ 91 | #define __SSE_H_ 92 | 93 | #ifdef __SSE2__ 94 | 95 | #include 96 | 97 | #ifdef __cplusplus 98 | namespace { 99 | #endif // __cplusplus 100 | 101 | typedef __m128 v4sf; 102 | typedef __m128i v4si; 103 | 104 | #define v4si_to_v4sf _mm_cvtepi32_ps 105 | #define v4sf_to_v4si _mm_cvttps_epi32 106 | 107 | #if _MSC_VER && !__INTEL_COMPILER 108 | template 109 | __forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; } 110 | 111 | #define AS_4CHARS(a) \ 112 | GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \ 113 | GetChar(int32_t(a), 2), GetChar(int32_t(a), 3) 114 | 115 | #define _MM_SETR_EPI32(a0, a1, a2, a3) \ 116 | { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) } 117 | 118 | #define v4sfl(x) (const v4sf { (x), (x), (x), (x) }) 119 | #define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x)) 120 | 121 | __forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); } 122 | __forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); } 123 | __forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); } 124 | __forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); } 125 | 126 | __forceinline const v4sf operator+(const v4sf& a) { return a; } 127 | __forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } 128 | 129 | __forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); } 130 | __forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); } 131 | __forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); } 132 | 133 | __forceinline const v4si operator&(const v4si& a, const v4si& b) { return 
_mm_and_si128(a,b); } 134 | __forceinline const v4si operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); } 135 | __forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); } 136 | 137 | __forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; } 138 | __forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; } 139 | __forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; } 140 | __forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; } 141 | 142 | __forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; } 143 | __forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; } 144 | __forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; } 145 | #else 146 | #define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) }) 147 | #define v2dil(x) ((const v4si) { (x), (x) }) 148 | #define v4sil(x) v2dil((((long long) (x)) << 32) | (long long) (x)) 149 | #endif 150 | 151 | typedef union { v4sf f; float array[4]; } v4sfindexer; 152 | #define v4sf_index(_findx, _findi) \ 153 | ({ \ 154 | v4sfindexer _findvx = { _findx } ; \ 155 | _findvx.array[_findi]; \ 156 | }) 157 | typedef union { v4si i; int array[4]; } v4siindexer; 158 | #define v4si_index(_iindx, _iindi) \ 159 | ({ \ 160 | v4siindexer _iindvx = { _iindx } ; \ 161 | _iindvx.array[_iindi]; \ 162 | }) 163 | 164 | typedef union { v4sf f; v4si i; } v4sfv4sipun; 165 | #if _MSC_VER && !__INTEL_COMPILER 166 | #define v4sf_fabs(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) 167 | #else 168 | #define v4sf_fabs(x) \ 169 | ({ \ 170 | v4sfv4sipun vx; \ 171 | vx.f = x; \ 172 | vx.i &= v4sil (0x7FFFFFFF); \ 173 | vx.f; \ 174 | }) 175 | #endif 176 | 177 | #ifdef __cplusplus 178 | } // end namespace 179 | #endif // __cplusplus 180 | 181 | #endif // __SSE2__ 182 | 183 | #endif // __SSE_H_ 184 | 
/*=====================================================================* 185 | * Copyright (C) 2011 Paul Mineiro * 186 | * All rights reserved. * 187 | * * 188 | * Redistribution and use in source and binary forms, with * 189 | * or without modification, are permitted provided that the * 190 | * following conditions are met: * 191 | * * 192 | * * Redistributions of source code must retain the * 193 | * above copyright notice, this list of conditions and * 194 | * the following disclaimer. * 195 | * * 196 | * * Redistributions in binary form must reproduce the * 197 | * above copyright notice, this list of conditions and * 198 | * the following disclaimer in the documentation and/or * 199 | * other materials provided with the distribution. * 200 | * * 201 | * * Neither the name of Paul Mineiro nor the names * 202 | * of other contributors may be used to endorse or promote * 203 | * products derived from this software without specific * 204 | * prior written permission. * 205 | * * 206 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 207 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 208 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 209 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 210 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 211 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 212 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 213 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 214 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 215 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 216 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 217 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 218 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 219 | * POSSIBILITY OF SUCH DAMAGE. 
* 220 | * * 221 | * Contact: Paul Mineiro * 222 | *=====================================================================*/ 223 | 224 | #ifndef __FAST_EXP_H_ 225 | #define __FAST_EXP_H_ 226 | 227 | #include 228 | 229 | // Underflow of exponential is common practice in numerical routines, 230 | // so handle it here. 231 | 232 | static inline float 233 | fastpow2 (float p) 234 | { 235 | float offset = (p < 0) ? 1.0f : 0.0f; 236 | float clipp = (p < -126) ? -126.0f : p; 237 | int w = clipp; 238 | float z = clipp - w + offset; 239 | union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) }; 240 | 241 | return v.f; 242 | } 243 | 244 | static inline float 245 | fastexp (float p) 246 | { 247 | return fastpow2 (1.442695040f * p); 248 | } 249 | 250 | static inline float 251 | fasterpow2 (float p) 252 | { 253 | float clipp = (p < -126) ? -126.0f : p; 254 | union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) }; 255 | return v.f; 256 | } 257 | 258 | static inline float 259 | fasterexp (float p) 260 | { 261 | return fasterpow2 (1.442695040f * p); 262 | } 263 | 264 | #ifdef __SSE2__ 265 | 266 | static inline v4sf 267 | vfastpow2 (const v4sf p) 268 | { 269 | v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f)); 270 | v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f)); 271 | v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); 272 | v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); 273 | v4si w = v4sf_to_v4si (clipp); 274 | v4sf z = clipp - v4si_to_v4sf (w) + offset; 275 | 276 | const v4sf c_121_2740838 = v4sfl (121.2740575f); 277 | const v4sf c_27_7280233 = v4sfl (27.7280233f); 278 | const v4sf c_4_84252568 = v4sfl (4.84252568f); 279 | const v4sf c_1_49012907 = v4sfl (1.49012907f); 280 | union { v4si i; v4sf f; } v = { 281 | v4sf_to_v4si ( 282 | v4sfl (1 << 23) * 283 | (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - 
c_1_49012907 * z) 284 | ) 285 | }; 286 | 287 | return v.f; 288 | } 289 | 290 | static inline v4sf 291 | vfastexp (const v4sf p) 292 | { 293 | const v4sf c_invlog_2 = v4sfl (1.442695040f); 294 | 295 | return vfastpow2 (c_invlog_2 * p); 296 | } 297 | 298 | static inline v4sf 299 | vfasterpow2 (const v4sf p) 300 | { 301 | const v4sf c_126_94269504 = v4sfl (126.94269504f); 302 | v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); 303 | v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); 304 | union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) }; 305 | return v.f; 306 | } 307 | 308 | static inline v4sf 309 | vfasterexp (const v4sf p) 310 | { 311 | const v4sf c_invlog_2 = v4sfl (1.442695040f); 312 | 313 | return vfasterpow2 (c_invlog_2 * p); 314 | } 315 | 316 | #endif //__SSE2__ 317 | 318 | #endif // __FAST_EXP_H_ 319 | /*=====================================================================* 320 | * Copyright (C) 2011 Paul Mineiro * 321 | * All rights reserved. * 322 | * * 323 | * Redistribution and use in source and binary forms, with * 324 | * or without modification, are permitted provided that the * 325 | * following conditions are met: * 326 | * * 327 | * * Redistributions of source code must retain the * 328 | * above copyright notice, this list of conditions and * 329 | * the following disclaimer. * 330 | * * 331 | * * Redistributions in binary form must reproduce the * 332 | * above copyright notice, this list of conditions and * 333 | * the following disclaimer in the documentation and/or * 334 | * other materials provided with the distribution. * 335 | * * 336 | * * Neither the name of Paul Mineiro nor the names * 337 | * of other contributors may be used to endorse or promote * 338 | * products derived from this software without specific * 339 | * prior written permission. 
* 340 | * * 341 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 342 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 343 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 344 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 345 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 346 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 347 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 348 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 349 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 350 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 351 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 352 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 353 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 354 | * POSSIBILITY OF SUCH DAMAGE. * 355 | * * 356 | * Contact: Paul Mineiro * 357 | *=====================================================================*/

#ifndef __FAST_LOG_H_
#define __FAST_LOG_H_

#include <stdint.h>

/*
 * Approximate log2 (x) from the float's bit pattern.  y is the raw bits
 * scaled by 2^-23; mx keeps the mantissa bits of x with the exponent field
 * replaced by 0x3f000000 and feeds a rational correction.
 * NOTE(review): there is no handling of x <= 0 here; presumably callers
 * pass positive, finite values only.
 */
static inline float
fastlog2 (float x)
{
  union { float f; uint32_t i; } vx = { x };
  union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
  float y = vx.i;
  y *= 1.1920928955078125e-7f;

  return y - 124.22551499f
           - 1.498030302f * mx.f
           - 1.72587999f / (0.3520887068f + mx.f);
}

/* Approximate natural log as ln 2 * fastlog2 (x). */
static inline float
fastlog (float x)
{
  return 0.69314718f * fastlog2 (x);
}

/* Cheaper log2 (x): scaled bit pattern minus a constant bias only. */
static inline float
fasterlog2 (float x)
{
  union { float f; uint32_t i; } vx = { x };
  float y = vx.i;
  y *= 1.1920928955078125e-7f;
  return y - 126.94269504f;
}

/*
 * Cheaper natural log.  Equivalent in form to the commented-out expression
 * below, with the factor 0.69314718f folded into the scale and the bias.
 */
static inline float
fasterlog (float x)
{
  //  return 0.69314718f * fasterlog2 (x);

  union { float f; uint32_t i; } vx = { x };
  float y = vx.i;
  y *= 8.2629582881927490e-8f;
  return y - 87.989971088f;
}

#ifdef __SSE2__

/*
 * Four-lane version of fastlog2.  The bit pattern is converted with the
 * signed v4si_to_v4sf, whereas the scalar code converts from uint32_t.
 */
static inline v4sf
vfastlog2 (v4sf x)
{
  union { v4sf f; v4si i; } vx = { x };
  union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000);
  v4sf y = v4si_to_v4sf (vx.i);
  y *= v4sfl (1.1920928955078125e-7f);

  const v4sf c_124_22551499 = v4sfl (124.22551499f);
  const v4sf c_1_498030302 = v4sfl (1.498030302f);
  const v4sf c_1_72587999 = v4sfl (1.72587999f);
  const v4sf c_0_3520887068 = v4sfl (0.3520887068f);

  return y - c_124_22551499
           - c_1_498030302 * mx.f
           - c_1_72587999 / (c_0_3520887068 + mx.f);
}

/* Four-lane version of fastlog. */
static inline v4sf
vfastlog (v4sf x)
{
  const v4sf c_0_69314718 = v4sfl (0.69314718f);

  return c_0_69314718 * vfastlog2 (x);
}

/* Four-lane version of fasterlog2. */
static inline v4sf
vfasterlog2 (v4sf x)
{
  union { v4sf f; v4si i; } vx = { x };
  v4sf y = v4si_to_v4sf (vx.i);
  y *= v4sfl (1.1920928955078125e-7f);

  const v4sf c_126_94269504 = v4sfl (126.94269504f);

  return y - c_126_94269504;
}

/* Four-lane version of fasterlog, with the ln 2 factor folded in. */
static inline v4sf
vfasterlog (v4sf x)
{
  //  const v4sf c_0_69314718 = v4sfl (0.69314718f);
  //
  //  return c_0_69314718 * vfasterlog2 (x);

  union { v4sf f; v4si i; } vx = { x };
  v4sf y = v4si_to_v4sf (vx.i);
  y *= v4sfl (8.2629582881927490e-8f);

  const v4sf c_87_989971088 = v4sfl (87.989971088f);

  return y - c_87_989971088;
}

#endif // __SSE2__

#endif // __FAST_LOG_H_
/*=====================================================================* 463 | * Copyright (C) 2011 Paul Mineiro * 464 | * All rights reserved.
* 465 | * * 466 | * Redistribution and use in source and binary forms, with * 467 | * or without modification, are permitted provided that the * 468 | * following conditions are met: * 469 | * * 470 | * * Redistributions of source code must retain the * 471 | * above copyright notice, this list of conditions and * 472 | * the following disclaimer. * 473 | * * 474 | * * Redistributions in binary form must reproduce the * 475 | * above copyright notice, this list of conditions and * 476 | * the following disclaimer in the documentation and/or * 477 | * other materials provided with the distribution. * 478 | * * 479 | * * Neither the name of Paul Mineiro nor the names * 480 | * of other contributors may be used to endorse or promote * 481 | * products derived from this software without specific * 482 | * prior written permission. * 483 | * * 484 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 485 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 486 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 487 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 488 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 489 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 490 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 491 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 492 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 493 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 494 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 495 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 496 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 497 | * POSSIBILITY OF SUCH DAMAGE. 
* 498 | * * 499 | * Contact: Paul Mineiro * 500 | *=====================================================================*/ 501 | 502 | #ifndef __FAST_ERF_H_ 503 | #define __FAST_ERF_H_ 504 | 505 | #include 506 | #include 507 | 508 | // fasterfc: not actually faster than erfcf(3) on newer machines! 509 | // ... although vectorized version is interesting 510 | // and fastererfc is very fast 511 | 512 | static inline float 513 | fasterfc (float x) 514 | { 515 | static const float k = 3.3509633149424609f; 516 | static const float a = 0.07219054755431126f; 517 | static const float b = 15.418191568719577f; 518 | static const float c = 5.609846028328545f; 519 | 520 | union { float f; uint32_t i; } vc = { c * x }; 521 | float xsq = x * x; 522 | float xquad = xsq * xsq; 523 | 524 | vc.i |= 0x80000000; 525 | 526 | return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f); 527 | } 528 | 529 | static inline float 530 | fastererfc (float x) 531 | { 532 | static const float k = 3.3509633149424609f; 533 | 534 | return 2.0f / (1.0f + fasterpow2 (k * x)); 535 | } 536 | 537 | // fasterf: not actually faster than erff(3) on newer machines! 538 | // ... 
although vectorized version is interesting 539 | // and fastererf is very fast 540 | 541 | static inline float 542 | fasterf (float x) 543 | { 544 | return 1.0f - fasterfc (x); 545 | } 546 | 547 | static inline float 548 | fastererf (float x) 549 | { 550 | return 1.0f - fastererfc (x); 551 | } 552 | 553 | static inline float 554 | fastinverseerf (float x) 555 | { 556 | static const float invk = 0.30004578719350504f; 557 | static const float a = 0.020287853348211326f; 558 | static const float b = 0.07236892874789555f; 559 | static const float c = 0.9913030456864257f; 560 | static const float d = 0.8059775923760193f; 561 | 562 | float xsq = x * x; 563 | 564 | return invk * fastlog2 ((1.0f + x) / (1.0f - x)) 565 | + x * (a - b * xsq) / (c - d * xsq); 566 | } 567 | 568 | static inline float 569 | fasterinverseerf (float x) 570 | { 571 | static const float invk = 0.30004578719350504f; 572 | 573 | return invk * fasterlog2 ((1.0f + x) / (1.0f - x)); 574 | } 575 | 576 | #ifdef __SSE2__ 577 | 578 | static inline v4sf 579 | vfasterfc (v4sf x) 580 | { 581 | const v4sf k = v4sfl (3.3509633149424609f); 582 | const v4sf a = v4sfl (0.07219054755431126f); 583 | const v4sf b = v4sfl (15.418191568719577f); 584 | const v4sf c = v4sfl (5.609846028328545f); 585 | 586 | union { v4sf f; v4si i; } vc; vc.f = c * x; 587 | vc.i |= v4sil (0x80000000); 588 | 589 | v4sf xsq = x * x; 590 | v4sf xquad = xsq * xsq; 591 | 592 | return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f); 593 | } 594 | 595 | static inline v4sf 596 | vfastererfc (const v4sf x) 597 | { 598 | const v4sf k = v4sfl (3.3509633149424609f); 599 | 600 | return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x)); 601 | } 602 | 603 | static inline v4sf 604 | vfasterf (v4sf x) 605 | { 606 | return v4sfl (1.0f) - vfasterfc (x); 607 | } 608 | 609 | static inline v4sf 610 | vfastererf (const v4sf x) 611 | { 612 | return v4sfl (1.0f) - vfastererfc (x); 613 | } 614 | 615 | 
static inline v4sf 616 | vfastinverseerf (v4sf x) 617 | { 618 | const v4sf invk = v4sfl (0.30004578719350504f); 619 | const v4sf a = v4sfl (0.020287853348211326f); 620 | const v4sf b = v4sfl (0.07236892874789555f); 621 | const v4sf c = v4sfl (0.9913030456864257f); 622 | const v4sf d = v4sfl (0.8059775923760193f); 623 | 624 | v4sf xsq = x * x; 625 | 626 | return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) 627 | + x * (a - b * xsq) / (c - d * xsq); 628 | } 629 | 630 | static inline v4sf 631 | vfasterinverseerf (v4sf x) 632 | { 633 | const v4sf invk = v4sfl (0.30004578719350504f); 634 | 635 | return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)); 636 | } 637 | 638 | #endif //__SSE2__ 639 | 640 | #endif // __FAST_ERF_H_ 641 | /*=====================================================================* 642 | * Copyright (C) 2011 Paul Mineiro * 643 | * All rights reserved. * 644 | * * 645 | * Redistribution and use in source and binary forms, with * 646 | * or without modification, are permitted provided that the * 647 | * following conditions are met: * 648 | * * 649 | * * Redistributions of source code must retain the * 650 | * above copyright notice, this list of conditions and * 651 | * the following disclaimer. * 652 | * * 653 | * * Redistributions in binary form must reproduce the * 654 | * above copyright notice, this list of conditions and * 655 | * the following disclaimer in the documentation and/or * 656 | * other materials provided with the distribution. * 657 | * * 658 | * * Neither the name of Paul Mineiro nor the names * 659 | * of other contributors may be used to endorse or promote * 660 | * products derived from this software without specific * 661 | * prior written permission. 
* 662 | * * 663 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 664 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 665 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 666 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 667 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 668 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 669 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 670 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 671 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 672 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 673 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 674 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 675 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 676 | * POSSIBILITY OF SUCH DAMAGE. * 677 | * * 678 | * Contact: Paul Mineiro * 679 | *=====================================================================*/ 680 | 681 | #ifndef __FAST_GAMMA_H_ 682 | #define __FAST_GAMMA_H_ 683 | 684 | #include 685 | 686 | /* gamma/digamma functions only work for positive inputs */ 687 | 688 | static inline float 689 | fastlgamma (float x) 690 | { 691 | float logterm = fastlog (x * (1.0f + x) * (2.0f + x)); 692 | float xp3 = 3.0f + x; 693 | 694 | return - 2.081061466f 695 | - x 696 | + 0.0833333f / xp3 697 | - logterm 698 | + (2.5f + x) * fastlog (xp3); 699 | } 700 | 701 | static inline float 702 | fasterlgamma (float x) 703 | { 704 | return - 0.0810614667f 705 | - x 706 | - fasterlog (x) 707 | + (0.5f + x) * fasterlog (1.0f + x); 708 | } 709 | 710 | static inline float 711 | fastdigamma (float x) 712 | { 713 | float twopx = 2.0f + x; 714 | float logterm = fastlog (twopx); 715 | 716 | return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) / 717 | (12.0f * x * (1.0f + x) * twopx * twopx) 718 | + logterm; 719 | } 720 | 721 | static inline float 722 | fasterdigamma (float x) 
723 | { 724 | float onepx = 1.0f + x; 725 | 726 | return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx); 727 | } 728 | 729 | #ifdef __SSE2__ 730 | 731 | static inline v4sf 732 | vfastlgamma (v4sf x) 733 | { 734 | const v4sf c_1_0 = v4sfl (1.0f); 735 | const v4sf c_2_0 = v4sfl (2.0f); 736 | const v4sf c_3_0 = v4sfl (3.0f); 737 | const v4sf c_2_081061466 = v4sfl (2.081061466f); 738 | const v4sf c_0_0833333 = v4sfl (0.0833333f); 739 | const v4sf c_2_5 = v4sfl (2.5f); 740 | 741 | v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x)); 742 | v4sf xp3 = c_3_0 + x; 743 | 744 | return - c_2_081061466 745 | - x 746 | + c_0_0833333 / xp3 747 | - logterm 748 | + (c_2_5 + x) * vfastlog (xp3); 749 | } 750 | 751 | static inline v4sf 752 | vfasterlgamma (v4sf x) 753 | { 754 | const v4sf c_0_0810614667 = v4sfl (0.0810614667f); 755 | const v4sf c_0_5 = v4sfl (0.5f); 756 | const v4sf c_1 = v4sfl (1.0f); 757 | 758 | return - c_0_0810614667 759 | - x 760 | - vfasterlog (x) 761 | + (c_0_5 + x) * vfasterlog (c_1 + x); 762 | } 763 | 764 | static inline v4sf 765 | vfastdigamma (v4sf x) 766 | { 767 | v4sf twopx = v4sfl (2.0f) + x; 768 | v4sf logterm = vfastlog (twopx); 769 | 770 | return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) / 771 | (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx) 772 | + logterm; 773 | } 774 | 775 | static inline v4sf 776 | vfasterdigamma (v4sf x) 777 | { 778 | const v4sf c_1_0 = v4sfl (1.0f); 779 | const v4sf c_2_0 = v4sfl (2.0f); 780 | v4sf onepx = c_1_0 + x; 781 | 782 | return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx); 783 | } 784 | 785 | #endif //__SSE2__ 786 | 787 | #endif // __FAST_GAMMA_H_ 788 | /*=====================================================================* 789 | * Copyright (C) 2011 Paul Mineiro * 790 | * All rights reserved. 
* 791 | * * 792 | * Redistribution and use in source and binary forms, with * 793 | * or without modification, are permitted provided that the * 794 | * following conditions are met: * 795 | * * 796 | * * Redistributions of source code must retain the * 797 | * above copyright notice, this list of conditions and * 798 | * the following disclaimer. * 799 | * * 800 | * * Redistributions in binary form must reproduce the * 801 | * above copyright notice, this list of conditions and * 802 | * the following disclaimer in the documentation and/or * 803 | * other materials provided with the distribution. * 804 | * * 805 | * * Neither the name of Paul Mineiro nor the names * 806 | * of other contributors may be used to endorse or promote * 807 | * products derived from this software without specific * 808 | * prior written permission. * 809 | * * 810 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 811 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 812 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 813 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 814 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 815 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 816 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 817 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 818 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 819 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 820 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 821 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 822 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 823 | * POSSIBILITY OF SUCH DAMAGE. 
* 824 | * * 825 | * Contact: Paul Mineiro * 826 | *=====================================================================*/ 827 | 828 | #ifndef __FAST_HYPERBOLIC_H_ 829 | #define __FAST_HYPERBOLIC_H_ 830 | 831 | #include 832 | 833 | static inline float 834 | fastsinh (float p) 835 | { 836 | return 0.5f * (fastexp (p) - fastexp (-p)); 837 | } 838 | 839 | static inline float 840 | fastersinh (float p) 841 | { 842 | return 0.5f * (fasterexp (p) - fasterexp (-p)); 843 | } 844 | 845 | static inline float 846 | fastcosh (float p) 847 | { 848 | return 0.5f * (fastexp (p) + fastexp (-p)); 849 | } 850 | 851 | static inline float 852 | fastercosh (float p) 853 | { 854 | return 0.5f * (fasterexp (p) + fasterexp (-p)); 855 | } 856 | 857 | static inline float 858 | fasttanh (float p) 859 | { 860 | return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p)); 861 | } 862 | 863 | static inline float 864 | fastertanh (float p) 865 | { 866 | return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p)); 867 | } 868 | 869 | #ifdef __SSE2__ 870 | 871 | static inline v4sf 872 | vfastsinh (const v4sf p) 873 | { 874 | const v4sf c_0_5 = v4sfl (0.5f); 875 | 876 | return c_0_5 * (vfastexp (p) - vfastexp (-p)); 877 | } 878 | 879 | static inline v4sf 880 | vfastersinh (const v4sf p) 881 | { 882 | const v4sf c_0_5 = v4sfl (0.5f); 883 | 884 | return c_0_5 * (vfasterexp (p) - vfasterexp (-p)); 885 | } 886 | 887 | static inline v4sf 888 | vfastcosh (const v4sf p) 889 | { 890 | const v4sf c_0_5 = v4sfl (0.5f); 891 | 892 | return c_0_5 * (vfastexp (p) + vfastexp (-p)); 893 | } 894 | 895 | static inline v4sf 896 | vfastercosh (const v4sf p) 897 | { 898 | const v4sf c_0_5 = v4sfl (0.5f); 899 | 900 | return c_0_5 * (vfasterexp (p) + vfasterexp (-p)); 901 | } 902 | 903 | static inline v4sf 904 | vfasttanh (const v4sf p) 905 | { 906 | const v4sf c_1 = v4sfl (1.0f); 907 | const v4sf c_2 = v4sfl (2.0f); 908 | 909 | return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p)); 910 | } 911 | 912 | static inline v4sf 913 | vfastertanh 
(const v4sf p) 914 | { 915 | const v4sf c_1 = v4sfl (1.0f); 916 | const v4sf c_2 = v4sfl (2.0f); 917 | 918 | return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p)); 919 | } 920 | 921 | #endif //__SSE2__ 922 | 923 | #endif // __FAST_HYPERBOLIC_H_ 924 | /*=====================================================================* 925 | * Copyright (C) 2011 Paul Mineiro * 926 | * All rights reserved. * 927 | * * 928 | * Redistribution and use in source and binary forms, with * 929 | * or without modification, are permitted provided that the * 930 | * following conditions are met: * 931 | * * 932 | * * Redistributions of source code must retain the * 933 | * above copyright notice, this list of conditions and * 934 | * the following disclaimer. * 935 | * * 936 | * * Redistributions in binary form must reproduce the * 937 | * above copyright notice, this list of conditions and * 938 | * the following disclaimer in the documentation and/or * 939 | * other materials provided with the distribution. * 940 | * * 941 | * * Neither the name of Paul Mineiro nor the names * 942 | * of other contributors may be used to endorse or promote * 943 | * products derived from this software without specific * 944 | * prior written permission. * 945 | * * 946 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 947 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 948 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 949 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 950 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * 951 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 952 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 953 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 954 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 955 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 956 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 957 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 958 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 959 | * POSSIBILITY OF SUCH DAMAGE. * 960 | * * 961 | * Contact: Paul Mineiro * 962 | *=====================================================================*/ 963 | 964 | #ifndef __FAST_LAMBERT_W_H_ 965 | #define __FAST_LAMBERT_W_H_ 966 | 967 | #include 968 | 969 | // these functions compute the upper branch aka W_0 970 | 971 | static inline float 972 | fastlambertw (float x) 973 | { 974 | static const float threshold = 2.26445f; 975 | 976 | float c = (x < threshold) ? 1.546865557f : 1.0f; 977 | float d = (x < threshold) ? 2.250366841f : 0.0f; 978 | float a = (x < threshold) ? -0.737769969f : 0.0f; 979 | 980 | float logterm = fastlog (c * x + d); 981 | float loglogterm = fastlog (logterm); 982 | 983 | float minusw = -a - logterm + loglogterm - loglogterm / logterm; 984 | float expminusw = fastexp (minusw); 985 | float xexpminusw = x * expminusw; 986 | float pexpminusw = xexpminusw - minusw; 987 | 988 | return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) / 989 | (2.0f + pexpminusw * (2.0f - minusw)); 990 | } 991 | 992 | static inline float 993 | fasterlambertw (float x) 994 | { 995 | static const float threshold = 2.26445f; 996 | 997 | float c = (x < threshold) ? 1.546865557f : 1.0f; 998 | float d = (x < threshold) ? 2.250366841f : 0.0f; 999 | float a = (x < threshold) ? 
-0.737769969f : 0.0f; 1000 | 1001 | float logterm = fasterlog (c * x + d); 1002 | float loglogterm = fasterlog (logterm); 1003 | 1004 | float w = a + logterm - loglogterm + loglogterm / logterm; 1005 | float expw = fasterexp (-w); 1006 | 1007 | return (w * w + expw * x) / (1.0f + w); 1008 | } 1009 | 1010 | static inline float 1011 | fastlambertwexpx (float x) 1012 | { 1013 | static const float k = 1.1765631309f; 1014 | static const float a = 0.94537622168f; 1015 | 1016 | float logarg = fmaxf (x, k); 1017 | float powarg = (x < k) ? a * (x - k) : 0; 1018 | 1019 | float logterm = fastlog (logarg); 1020 | float powterm = fasterpow2 (powarg); // don't need accuracy here 1021 | 1022 | float w = powterm * (logarg - logterm + logterm / logarg); 1023 | float logw = fastlog (w); 1024 | float p = x - logw; 1025 | 1026 | return w * (2.0f + p + w * (3.0f + 2.0f * p)) / 1027 | (2.0f - p + w * (5.0f + 2.0f * w)); 1028 | } 1029 | 1030 | static inline float 1031 | fasterlambertwexpx (float x) 1032 | { 1033 | static const float k = 1.1765631309f; 1034 | static const float a = 0.94537622168f; 1035 | 1036 | float logarg = fmaxf (x, k); 1037 | float powarg = (x < k) ? 
a * (x - k) : 0; 1038 | 1039 | float logterm = fasterlog (logarg); 1040 | float powterm = fasterpow2 (powarg); 1041 | 1042 | float w = powterm * (logarg - logterm + logterm / logarg); 1043 | float logw = fasterlog (w); 1044 | 1045 | return w * (1.0f + x - logw) / (1.0f + w); 1046 | } 1047 | 1048 | #ifdef __SSE2__ 1049 | 1050 | static inline v4sf 1051 | vfastlambertw (v4sf x) 1052 | { 1053 | const v4sf threshold = v4sfl (2.26445f); 1054 | 1055 | v4sf under = _mm_cmplt_ps (x, threshold); 1056 | v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), 1057 | _mm_andnot_ps (under, v4sfl (1.0f))); 1058 | v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); 1059 | v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); 1060 | 1061 | v4sf logterm = vfastlog (c * x + d); 1062 | v4sf loglogterm = vfastlog (logterm); 1063 | 1064 | v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm; 1065 | v4sf expminusw = vfastexp (minusw); 1066 | v4sf xexpminusw = x * expminusw; 1067 | v4sf pexpminusw = xexpminusw - minusw; 1068 | 1069 | return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / 1070 | (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw)); 1071 | } 1072 | 1073 | static inline v4sf 1074 | vfasterlambertw (v4sf x) 1075 | { 1076 | const v4sf threshold = v4sfl (2.26445f); 1077 | 1078 | v4sf under = _mm_cmplt_ps (x, threshold); 1079 | v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), 1080 | _mm_andnot_ps (under, v4sfl (1.0f))); 1081 | v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); 1082 | v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); 1083 | 1084 | v4sf logterm = vfasterlog (c * x + d); 1085 | v4sf loglogterm = vfasterlog (logterm); 1086 | 1087 | v4sf w = a + logterm - loglogterm + loglogterm / logterm; 1088 | v4sf expw = vfasterexp (-w); 1089 | 1090 | return (w * w + expw * x) / (v4sfl (1.0f) + w); 1091 | } 1092 | 1093 | static inline v4sf 1094 | vfastlambertwexpx (v4sf x) 1095 | { 1096 | const v4sf k = v4sfl 
(1.1765631309f); 1097 | const v4sf a = v4sfl (0.94537622168f); 1098 | const v4sf two = v4sfl (2.0f); 1099 | const v4sf three = v4sfl (3.0f); 1100 | const v4sf five = v4sfl (5.0f); 1101 | 1102 | v4sf logarg = _mm_max_ps (x, k); 1103 | v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); 1104 | 1105 | v4sf logterm = vfastlog (logarg); 1106 | v4sf powterm = vfasterpow2 (powarg); // don't need accuracy here 1107 | 1108 | v4sf w = powterm * (logarg - logterm + logterm / logarg); 1109 | v4sf logw = vfastlog (w); 1110 | v4sf p = x - logw; 1111 | 1112 | return w * (two + p + w * (three + two * p)) / 1113 | (two - p + w * (five + two * w)); 1114 | } 1115 | 1116 | static inline v4sf 1117 | vfasterlambertwexpx (v4sf x) 1118 | { 1119 | const v4sf k = v4sfl (1.1765631309f); 1120 | const v4sf a = v4sfl (0.94537622168f); 1121 | 1122 | v4sf logarg = _mm_max_ps (x, k); 1123 | v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); 1124 | 1125 | v4sf logterm = vfasterlog (logarg); 1126 | v4sf powterm = vfasterpow2 (powarg); 1127 | 1128 | v4sf w = powterm * (logarg - logterm + logterm / logarg); 1129 | v4sf logw = vfasterlog (w); 1130 | 1131 | return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w); 1132 | } 1133 | 1134 | #endif // __SSE2__ 1135 | 1136 | #endif // __FAST_LAMBERT_W_H_ 1137 | 1138 | /*=====================================================================* 1139 | * Copyright (C) 2011 Paul Mineiro * 1140 | * All rights reserved. * 1141 | * * 1142 | * Redistribution and use in source and binary forms, with * 1143 | * or without modification, are permitted provided that the * 1144 | * following conditions are met: * 1145 | * * 1146 | * * Redistributions of source code must retain the * 1147 | * above copyright notice, this list of conditions and * 1148 | * the following disclaimer. 
* 1149 | * * 1150 | * * Redistributions in binary form must reproduce the * 1151 | * above copyright notice, this list of conditions and * 1152 | * the following disclaimer in the documentation and/or * 1153 | * other materials provided with the distribution. * 1154 | * * 1155 | * * Neither the name of Paul Mineiro nor the names * 1156 | * of other contributors may be used to endorse or promote * 1157 | * products derived from this software without specific * 1158 | * prior written permission. * 1159 | * * 1160 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 1161 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 1162 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 1163 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 1164 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 1165 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 1166 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 1167 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 1168 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 1169 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 1170 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 1171 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 1172 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 1173 | * POSSIBILITY OF SUCH DAMAGE. 
* 1174 | * * 1175 | * Contact: Paul Mineiro * 1176 | *=====================================================================*/ 1177 | 1178 | #ifndef __FAST_POW_H_ 1179 | #define __FAST_POW_H_ 1180 | 1181 | #include 1182 | 1183 | static inline float 1184 | fastpow (float x, 1185 | float p) 1186 | { 1187 | return fastpow2 (p * fastlog2 (x)); 1188 | } 1189 | 1190 | static inline float 1191 | fasterpow (float x, 1192 | float p) 1193 | { 1194 | return fasterpow2 (p * fasterlog2 (x)); 1195 | } 1196 | 1197 | #ifdef __SSE2__ 1198 | 1199 | static inline v4sf 1200 | vfastpow (const v4sf x, 1201 | const v4sf p) 1202 | { 1203 | return vfastpow2 (p * vfastlog2 (x)); 1204 | } 1205 | 1206 | static inline v4sf 1207 | vfasterpow (const v4sf x, 1208 | const v4sf p) 1209 | { 1210 | return vfasterpow2 (p * vfasterlog2 (x)); 1211 | } 1212 | 1213 | #endif //__SSE2__ 1214 | 1215 | #endif // __FAST_POW_H_ 1216 | /*=====================================================================* 1217 | * Copyright (C) 2011 Paul Mineiro * 1218 | * All rights reserved. * 1219 | * * 1220 | * Redistribution and use in source and binary forms, with * 1221 | * or without modification, are permitted provided that the * 1222 | * following conditions are met: * 1223 | * * 1224 | * * Redistributions of source code must retain the * 1225 | * above copyright notice, this list of conditions and * 1226 | * the following disclaimer. * 1227 | * * 1228 | * * Redistributions in binary form must reproduce the * 1229 | * above copyright notice, this list of conditions and * 1230 | * the following disclaimer in the documentation and/or * 1231 | * other materials provided with the distribution. * 1232 | * * 1233 | * * Neither the name of Paul Mineiro nor the names * 1234 | * of other contributors may be used to endorse or promote * 1235 | * products derived from this software without specific * 1236 | * prior written permission. 
* 1237 | * * 1238 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 1239 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 1240 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 1241 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 1242 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 1243 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 1244 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 1245 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 1246 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 1247 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 1248 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 1249 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 1250 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 1251 | * POSSIBILITY OF SUCH DAMAGE. * 1252 | * * 1253 | * Contact: Paul Mineiro * 1254 | *=====================================================================*/ 1255 | 1256 | #ifndef __FAST_SIGMOID_H_ 1257 | #define __FAST_SIGMOID_H_ 1258 | 1259 | #include 1260 | 1261 | static inline float 1262 | fastsigmoid (float x) 1263 | { 1264 | return 1.0f / (1.0f + fastexp (-x)); 1265 | } 1266 | 1267 | static inline float 1268 | fastersigmoid (float x) 1269 | { 1270 | return 1.0f / (1.0f + fasterexp (-x)); 1271 | } 1272 | 1273 | #ifdef __SSE2__ 1274 | 1275 | static inline v4sf 1276 | vfastsigmoid (const v4sf x) 1277 | { 1278 | const v4sf c_1 = v4sfl (1.0f); 1279 | 1280 | return c_1 / (c_1 + vfastexp (-x)); 1281 | } 1282 | 1283 | static inline v4sf 1284 | vfastersigmoid (const v4sf x) 1285 | { 1286 | const v4sf c_1 = v4sfl (1.0f); 1287 | 1288 | return c_1 / (c_1 + vfasterexp (-x)); 1289 | } 1290 | 1291 | #endif //__SSE2__ 1292 | 1293 | #endif // __FAST_SIGMOID_H_ 1294 | /*=====================================================================* 1295 | * Copyright (C) 2011 Paul Mineiro * 1296 | * All rights 
reserved. * 1297 | * * 1298 | * Redistribution and use in source and binary forms, with * 1299 | * or without modification, are permitted provided that the * 1300 | * following conditions are met: * 1301 | * * 1302 | * * Redistributions of source code must retain the * 1303 | * above copyright notice, this list of conditions and * 1304 | * the following disclaimer. * 1305 | * * 1306 | * * Redistributions in binary form must reproduce the * 1307 | * above copyright notice, this list of conditions and * 1308 | * the following disclaimer in the documentation and/or * 1309 | * other materials provided with the distribution. * 1310 | * * 1311 | * * Neither the name of Paul Mineiro nor the names * 1312 | * of other contributors may be used to endorse or promote * 1313 | * products derived from this software without specific * 1314 | * prior written permission. * 1315 | * * 1316 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * 1317 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * 1318 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * 1319 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * 1320 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * 1321 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * 1322 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 1323 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * 1324 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * 1325 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 1326 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 1327 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * 1328 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * 1329 | * POSSIBILITY OF SUCH DAMAGE. 
* 1330 | * * 1331 | * Contact: Paul Mineiro * 1332 | *=====================================================================*/ 1333 | 1334 | #ifndef __FAST_TRIG_H_ 1335 | #define __FAST_TRIG_H_ 1336 | 1337 | #include 1338 | 1339 | // http://www.devmaster.net/forums/showthread.php?t=5784 1340 | // fast sine variants are for x \in [ -\pi, pi ] 1341 | // fast cosine variants are for x \in [ -\pi, pi ] 1342 | // fast tangent variants are for x \in [ -\pi / 2, pi / 2 ] 1343 | // "full" versions of functions handle the entire range of inputs 1344 | // although the range reduction technique used here will be hopelessly 1345 | // inaccurate for |x| >> 1000 1346 | // 1347 | // WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than 1348 | // libc calls on older machines (!) and on newer machines are only 1349 | // slighly faster. however: 1350 | // * vectorized versions are competitive 1351 | // * faster full versions are competitive 1352 | 1353 | static inline float 1354 | fastsin (float x) 1355 | { 1356 | static const float fouroverpi = 1.2732395447351627f; 1357 | static const float fouroverpisq = 0.40528473456935109f; 1358 | static const float q = 0.78444488374548933f; 1359 | union { float f; uint32_t i; } p = { 0.20363937680730309f }; 1360 | union { float f; uint32_t i; } r = { 0.015124940802184233f }; 1361 | union { float f; uint32_t i; } s = { -0.0032225901625579573f }; 1362 | 1363 | union { float f; uint32_t i; } vx = { x }; 1364 | uint32_t sign = vx.i & 0x80000000; 1365 | vx.i = vx.i & 0x7FFFFFFF; 1366 | 1367 | float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 1368 | float qpproxsq = qpprox * qpprox; 1369 | 1370 | p.i |= sign; 1371 | r.i |= sign; 1372 | s.i ^= sign; 1373 | 1374 | return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f)); 1375 | } 1376 | 1377 | static inline float 1378 | fastersin (float x) 1379 | { 1380 | static const float fouroverpi = 1.2732395447351627f; 1381 | static const float fouroverpisq = 
0.40528473456935109f; 1382 | static const float q = 0.77633023248007499f; 1383 | union { float f; uint32_t i; } p = { 0.22308510060189463f }; 1384 | 1385 | union { float f; uint32_t i; } vx = { x }; 1386 | uint32_t sign = vx.i & 0x80000000; 1387 | vx.i &= 0x7FFFFFFF; 1388 | 1389 | float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 1390 | 1391 | p.i |= sign; 1392 | 1393 | return qpprox * (q + p.f * qpprox); 1394 | } 1395 | 1396 | static inline float 1397 | fastsinfull (float x) 1398 | { 1399 | static const float twopi = 6.2831853071795865f; 1400 | static const float invtwopi = 0.15915494309189534f; 1401 | 1402 | int k = x * invtwopi; 1403 | float half = (x < 0) ? -0.5f : 0.5f; 1404 | return fastsin ((half + k) * twopi - x); 1405 | } 1406 | 1407 | static inline float 1408 | fastersinfull (float x) 1409 | { 1410 | static const float twopi = 6.2831853071795865f; 1411 | static const float invtwopi = 0.15915494309189534f; 1412 | 1413 | int k = x * invtwopi; 1414 | float half = (x < 0) ? -0.5f : 0.5f; 1415 | return fastersin ((half + k) * twopi - x); 1416 | } 1417 | 1418 | static inline float 1419 | fastcos (float x) 1420 | { 1421 | static const float halfpi = 1.5707963267948966f; 1422 | static const float halfpiminustwopi = -4.7123889803846899f; 1423 | float offset = (x > halfpi) ? 
halfpiminustwopi : halfpi; 1424 | return fastsin (x + offset); 1425 | } 1426 | 1427 | static inline float 1428 | fastercos (float x) 1429 | { 1430 | static const float twooverpi = 0.63661977236758134f; 1431 | static const float p = 0.54641335845679634f; 1432 | 1433 | union { float f; uint32_t i; } vx = { x }; 1434 | vx.i &= 0x7FFFFFFF; 1435 | 1436 | float qpprox = 1.0f - twooverpi * vx.f; 1437 | 1438 | return qpprox + p * qpprox * (1.0f - qpprox * qpprox); 1439 | } 1440 | 1441 | static inline float 1442 | fastcosfull (float x) 1443 | { 1444 | static const float halfpi = 1.5707963267948966f; 1445 | return fastsinfull (x + halfpi); 1446 | } 1447 | 1448 | static inline float 1449 | fastercosfull (float x) 1450 | { 1451 | static const float halfpi = 1.5707963267948966f; 1452 | return fastersinfull (x + halfpi); 1453 | } 1454 | 1455 | static inline float 1456 | fasttan (float x) 1457 | { 1458 | static const float halfpi = 1.5707963267948966f; 1459 | return fastsin (x) / fastsin (x + halfpi); 1460 | } 1461 | 1462 | static inline float 1463 | fastertan (float x) 1464 | { 1465 | return fastersin (x) / fastercos (x); 1466 | } 1467 | 1468 | static inline float 1469 | fasttanfull (float x) 1470 | { 1471 | static const float twopi = 6.2831853071795865f; 1472 | static const float invtwopi = 0.15915494309189534f; 1473 | 1474 | int k = x * invtwopi; 1475 | float half = (x < 0) ? -0.5f : 0.5f; 1476 | float xnew = x - (half + k) * twopi; 1477 | 1478 | return fastsin (xnew) / fastcos (xnew); 1479 | } 1480 | 1481 | static inline float 1482 | fastertanfull (float x) 1483 | { 1484 | static const float twopi = 6.2831853071795865f; 1485 | static const float invtwopi = 0.15915494309189534f; 1486 | 1487 | int k = x * invtwopi; 1488 | float half = (x < 0) ? 
-0.5f : 0.5f; 1489 | float xnew = x - (half + k) * twopi; 1490 | 1491 | return fastersin (xnew) / fastercos (xnew); 1492 | } 1493 | 1494 | #ifdef __SSE2__ 1495 | 1496 | static inline v4sf 1497 | vfastsin (const v4sf x) 1498 | { 1499 | const v4sf fouroverpi = v4sfl (1.2732395447351627f); 1500 | const v4sf fouroverpisq = v4sfl (0.40528473456935109f); 1501 | const v4sf q = v4sfl (0.78444488374548933f); 1502 | const v4sf p = v4sfl (0.20363937680730309f); 1503 | const v4sf r = v4sfl (0.015124940802184233f); 1504 | const v4sf s = v4sfl (-0.0032225901625579573f); 1505 | 1506 | union { v4sf f; v4si i; } vx = { x }; 1507 | v4si sign = vx.i & v4sil (0x80000000); 1508 | vx.i &= v4sil (0x7FFFFFFF); 1509 | 1510 | v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 1511 | v4sf qpproxsq = qpprox * qpprox; 1512 | union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s)); 1513 | vy.i ^= sign; 1514 | 1515 | return q * qpprox + vy.f; 1516 | } 1517 | 1518 | static inline v4sf 1519 | vfastersin (const v4sf x) 1520 | { 1521 | const v4sf fouroverpi = v4sfl (1.2732395447351627f); 1522 | const v4sf fouroverpisq = v4sfl (0.40528473456935109f); 1523 | const v4sf q = v4sfl (0.77633023248007499f); 1524 | const v4sf plit = v4sfl (0.22308510060189463f); 1525 | union { v4sf f; v4si i; } p = { plit }; 1526 | 1527 | union { v4sf f; v4si i; } vx = { x }; 1528 | v4si sign = vx.i & v4sil (0x80000000); 1529 | vx.i &= v4sil (0x7FFFFFFF); 1530 | 1531 | v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; 1532 | 1533 | p.i |= sign; 1534 | 1535 | return qpprox * (q + p.f * qpprox); 1536 | } 1537 | 1538 | static inline v4sf 1539 | vfastsinfull (const v4sf x) 1540 | { 1541 | const v4sf twopi = v4sfl (6.2831853071795865f); 1542 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 1543 | 1544 | v4si k = v4sf_to_v4si (x * invtwopi); 1545 | 1546 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 1547 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 1548 | _mm_andnot_ps 
(ltzero, v4sfl (0.5f))); 1549 | 1550 | return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x); 1551 | } 1552 | 1553 | static inline v4sf 1554 | vfastersinfull (const v4sf x) 1555 | { 1556 | const v4sf twopi = v4sfl (6.2831853071795865f); 1557 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 1558 | 1559 | v4si k = v4sf_to_v4si (x * invtwopi); 1560 | 1561 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 1562 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 1563 | _mm_andnot_ps (ltzero, v4sfl (0.5f))); 1564 | 1565 | return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x); 1566 | } 1567 | 1568 | static inline v4sf 1569 | vfastcos (const v4sf x) 1570 | { 1571 | const v4sf halfpi = v4sfl (1.5707963267948966f); 1572 | const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f); 1573 | v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi); 1574 | v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi), 1575 | _mm_andnot_ps (lthalfpi, halfpi)); 1576 | return vfastsin (x + offset); 1577 | } 1578 | 1579 | static inline v4sf 1580 | vfastercos (v4sf x) 1581 | { 1582 | const v4sf twooverpi = v4sfl (0.63661977236758134f); 1583 | const v4sf p = v4sfl (0.54641335845679634); 1584 | 1585 | v4sf vx = v4sf_fabs (x); 1586 | v4sf qpprox = v4sfl (1.0f) - twooverpi * vx; 1587 | 1588 | return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox); 1589 | } 1590 | 1591 | static inline v4sf 1592 | vfastcosfull (const v4sf x) 1593 | { 1594 | const v4sf halfpi = v4sfl (1.5707963267948966f); 1595 | return vfastsinfull (x + halfpi); 1596 | } 1597 | 1598 | static inline v4sf 1599 | vfastercosfull (const v4sf x) 1600 | { 1601 | const v4sf halfpi = v4sfl (1.5707963267948966f); 1602 | return vfastersinfull (x + halfpi); 1603 | } 1604 | 1605 | static inline v4sf 1606 | vfasttan (const v4sf x) 1607 | { 1608 | const v4sf halfpi = v4sfl (1.5707963267948966f); 1609 | return vfastsin (x) / vfastsin (x + halfpi); 1610 | } 1611 | 1612 | static inline v4sf 1613 | vfastertan (const v4sf x) 1614 
| { 1615 | return vfastersin (x) / vfastercos (x); 1616 | } 1617 | 1618 | static inline v4sf 1619 | vfasttanfull (const v4sf x) 1620 | { 1621 | const v4sf twopi = v4sfl (6.2831853071795865f); 1622 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 1623 | 1624 | v4si k = v4sf_to_v4si (x * invtwopi); 1625 | 1626 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 1627 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 1628 | _mm_andnot_ps (ltzero, v4sfl (0.5f))); 1629 | v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; 1630 | 1631 | return vfastsin (xnew) / vfastcos (xnew); 1632 | } 1633 | 1634 | static inline v4sf 1635 | vfastertanfull (const v4sf x) 1636 | { 1637 | const v4sf twopi = v4sfl (6.2831853071795865f); 1638 | const v4sf invtwopi = v4sfl (0.15915494309189534f); 1639 | 1640 | v4si k = v4sf_to_v4si (x * invtwopi); 1641 | 1642 | v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); 1643 | v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), 1644 | _mm_andnot_ps (ltzero, v4sfl (0.5f))); 1645 | v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; 1646 | 1647 | return vfastersin (xnew) / vfastercos (xnew); 1648 | } 1649 | 1650 | #endif //__SSE2__ 1651 | 1652 | #endif // __FAST_TRIG_H_ 1653 | --------------------------------------------------------------------------------