├── fastapprox
    ├── NEWS
    ├── AUTHORS
    ├── README
    ├── fw-pkgin
    │   ├── Makefile.am.local
    │   ├── stop
    │   ├── start
    │   ├── pre-install
    │   ├── post-install
    │   ├── pre-remove
    │   ├── post-remove
    │   └── config
    ├── Makefile.am.local
    ├── tests
    │   ├── testfastonebigheader.c
    │   ├── testfastlog.c
    │   ├── testfastpow.c
    │   ├── testfastsigmoid.c
    │   ├── testfastexp.c
    │   ├── testfasthyperbolic.c
    │   ├── testfastgamma.c
    │   ├── testfasterf.c
    │   ├── Makefile.am.local
    │   ├── testfastlambertw.c
    │   ├── testfasttrig.c
    │   └── testmacros.h
    ├── BUILD_HOWTO
    ├── configure.ac.local
    ├── bootstrap
    ├── src
    │   ├── Makefile.am.local
    │   ├── cast.h
    │   ├── fastsigmoid.h
    │   ├── fastpow.h
    │   ├── fasthyperbolic.h
    │   ├── fastlog.h
    │   ├── fastexp.h
    │   ├── fastgamma.h
    │   ├── fasterf.h
    │   ├── sse.h
    │   ├── fastlambertw.h
    │   ├── fasttrig.h
    │   └── fastonebigheader.h
    ├── ChangeLog
    ├── COPYING
    └── ax_check_compile_flag.m4
└── README.md


/fastapprox/NEWS:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/fastapprox/AUTHORS:
--------------------------------------------------------------------------------
1 | Paul Mineiro <paul@mineiro.com>
2 | 


--------------------------------------------------------------------------------
/fastapprox/README:
--------------------------------------------------------------------------------
1 | Fast approximate versions of certain functions that arise in machine learning.
2 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/Makefile.am.local:
--------------------------------------------------------------------------------
1 | # put whatever (auto)make commands here, they will be included from Makefile.am
2 | 


--------------------------------------------------------------------------------
/fastapprox/Makefile.am.local:
--------------------------------------------------------------------------------
1 | # put whatever (auto)make commands here, they will be included from Makefile.am
2 | 
3 | EXTRA_DIST += BUILD_HOWTO
4 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfastonebigheader.c:
--------------------------------------------------------------------------------
1 | #include "../src/fastonebigheader.h"
2 | 
3 | int
4 | main (void)
5 | {  /* smoke test: the combined header must compile and fastexp must be callable */
6 |   return fastexp (1) < 0;  /* exit status 0 unless fastexp (1) is negative */
7 | }
8 | 


--------------------------------------------------------------------------------
/fastapprox/BUILD_HOWTO:
--------------------------------------------------------------------------------
1 | The library consists entirely of header files, so there is no building per se.
2 | 
3 | You can run the tests via
4 | 
5 | ./configure && make check
6 | 
7 | After that you can make install if you want, or, you can grab 
8 | src/fastonebigheader.h and just drop it into whatever you're working on.
9 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/stop:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | #---------------------------------------------------------------------
 4 | #                                stop                                 
 5 | # 
 6 | # Executed when the package (service) is shut down.
 7 | # Not supported by all package formats.
 8 | #---------------------------------------------------------------------
 9 | 
10 | exit 0
11 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/start:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | #---------------------------------------------------------------------
 4 | #                                start                                
 5 | # 
 6 | # Executed when the package (service) is started up.  
 7 | # Not supported by all package formats.
 8 | #---------------------------------------------------------------------
 9 | 
10 | exit 0
11 | 


--------------------------------------------------------------------------------
/fastapprox/configure.ac.local:
--------------------------------------------------------------------------------
 1 | dnl -- include additional autoconf commands here
 2 | dnl -- do not include AC_OUTPUT, this is called for you
 3 | 
 4 | m4_include([ax_check_compile_flag.m4])
 5 | 
 6 | AX_CHECK_COMPILE_FLAG([-std=c++0x], 
 7 |                       [CXXFLAGS="$CXXFLAGS -std=c++0x"])
 8 | 
 9 | AC_CHECK_HEADERS([emmintrin.h boost/math/special_functions/digamma.hpp gsl/gsl_sf_lambert.h])
10 | 
11 | PKG_CHECK_MODULES([GSL], [gsl], [], [ printf "" ])
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # fastapprox
 2 | Approximate and vectorized versions of common mathematical functions
 3 | 
 4 | The easiest way to include this in your projects is via the one big standalone header file which works with both C and C++.
 5 | 
 6 | ### Current functions:
 7 | 
 8 |  - exponential, logarithm, and power
 9 |  - lgamma and digamma
10 |  - cosh, sinh, tanh
11 |  - cos, sin, tan
12 |  - sigmoid and erf
13 |  - Lambert W
14 | 
15 | There's a Mathematica notebook which helps explain the techniques.
16 | 


--------------------------------------------------------------------------------
/fastapprox/bootstrap:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | # Locate fw-bootstrap (preferring a local fw/bin) and run it for this project.
 3 | if test -d fw/bin
 4 |   then
 5 |     PATH="`pwd`/fw/bin:$PATH"
 6 |     export PATH
 7 |   fi
 8 | # command -v is the POSIX lookup; it prints nothing when the tool is missing
 9 | fwb=`command -v fw-bootstrap`
10 | 
11 | if test -z "$fwb"
12 |   then
13 |     echo "bootstrap: fatal: fw-bootstrap not installed or not in PATH" 1>&2
14 |     exit 1
15 |   fi
16 | # extra command-line arguments are forwarded unchanged via "$@"
17 | "$fwb" --fw_version "0.3.3" --name fastapprox --template Cxx --revision svn --svn_project_path https://fastapprox.googlecode.com/svn/trunk/fastapprox --svn_tag_root https://fastapprox.googlecode.com/svn/tags/fastapprox "$@"
18 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/pre-install:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | #---------------------------------------------------------------------
 6 | #                            pre-install                             
 7 | # 
 8 | # Executed before the package is installed.
 9 | #
10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks
11 | #---------------------------------------------------------------------
12 | 
13 | case "$1" in
14 |   install)
15 |     ;;
16 |   upgrade)
17 |     # old version is $2
18 |     ;;
19 |   *)
20 |     ;;
21 | esac
22 | 
23 | exit 0
24 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/post-install:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | #---------------------------------------------------------------------
 6 | #                            post-install                             
 7 | # 
 8 | # Executed after the package is installed.
 9 | #
10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks
11 | #---------------------------------------------------------------------
12 | 
13 | case "$1" in
14 |   configure)
15 |     # most recently configured version is $2 (possibly empty string)
16 |     ;;
17 |   *)
18 |     ;;
19 | esac
20 | 
21 | exit 0
22 | 


--------------------------------------------------------------------------------
/fastapprox/src/Makefile.am.local:
--------------------------------------------------------------------------------
 1 | # put whatever (auto)make commands here, they will be included from Makefile.am
 2 | #
 3 | 
 4 | fastonebigheader.h: $(filter-out config.h fastonebigheader.h, $(wildcard *.h))
 5 | 	cat 			\
 6 | 	cast.h			\
 7 | 	sse.h 			\
 8 |         fastexp.h		\
 9 |         fastlog.h		\
10 |         fasterf.h		\
11 |         fastgamma.h		\
12 |         fasthyperbolic.h	\
13 | 	fastlambertw.h		\
14 |         fastpow.h		\
15 |         fastsigmoid.h		\
16 |         fasttrig.h		\
17 | 	| grep -v '#include "' > "$@"
18 | 
19 | myinclude_HEADERS +=					\
20 |   fastonebigheader.h 					\
21 |   $(filter-out config.h fastonebigheader.h, $(wildcard *.h))
22 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/pre-remove:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | #---------------------------------------------------------------------
 6 | #                            pre-remove
 7 | # 
 8 | # Executed before the package is removed.
 9 | #
10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks
11 | #---------------------------------------------------------------------
12 | 
13 | case "$1" in
14 |   upgrade)
15 |     # defer to newer package's script
16 |     exit 1
17 |     ;;
18 |   failed-upgrade)
19 |     # actually handle the upgrade here
20 |     # old-version is $2
21 |     ;;
22 |   remove)
23 |     ;;
24 |   *)
25 |     ;;
26 | esac
27 | 
28 | exit 0
29 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/post-remove:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | #---------------------------------------------------------------------
 6 | #                            post-remove
 7 | # 
 8 | # Executed after the package is removed.
 9 | #
10 | # http://code.google.com/p/fwtemplates/wiki/PackageHooks
11 | #---------------------------------------------------------------------
12 | 
13 | case "$1" in
14 |   upgrade)
15 |     # defer to newer package's script
16 |     exit 1
17 |     ;;
18 |   failed-upgrade)
19 |     # actually handle the upgrade here
20 |     # old-version is $2
21 |     ;;
22 |   remove)
23 |     ;;
24 |   *)
25 |     ;;
26 | esac
27 | 
28 | exit 0
29 | 


--------------------------------------------------------------------------------
/fastapprox/ChangeLog:
--------------------------------------------------------------------------------
 1 | Version 0.3.2
 2 |   * fix some compile warnings
 3 | 
 4 | Version 0.3.1
 5 |   * somewhat faster fasterlog
 6 |   * Lambert W, use negative exponential in Halley's method
 7 | 
 8 | Version 0.3.0
 9 |   * Lambert W
10 | 
11 | Version 0.2.0
12 |   * 10% faster fastdigamma/vfastdigamma (same formula with terms rearranged)
13 |   * handle underflow in fastexp (25% speed penalty, but ubiquitous so necessary)
14 |   * purge last remaining __attribute__ (gcc specific)
15 |   * trade some overall pow2 accuracy to improve accuracy on integral powers
16 | 
17 | Version 0.1.0
18 |   * Inverse erf
19 |   * Trade some overall log accuracy to improve accuracy on exact powers of 2
20 |   * Whoops, left out power 
21 | 
22 | Version 0.0.0
23 |   * Fast logarithm, exponential.
24 |   * Fast trigonometrics and hyperbolics.
25 |   * Fast sigmoid and erf.
26 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfastlog.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include <sys/time.h>
 7 | 
 8 | #include "../src/fastlog.h"
 9 | 
10 | #include "testmacros.h"
11 | 
12 | test_scalar (fastlog, logf, 1e-10f + 10.0f * drand48 (), 1e-4f, 100000000)
13 | test_scalar (fasterlog, logf, 1e-10f + 10.0f * drand48 (), 2e-2f, 100000000)
14 | 
15 | test_vector (vfastlog, logf, 1e-10f + 10.0f * drand48 (), 1e-4f, 100000000)
16 | test_vector (vfasterlog, logf, 1e-10f + 10.0f * drand48 (), 2e-2f, 100000000)
17 | 
18 | int 
19 | main (int   argc,
20 |       char *argv[])
21 | {
22 |   char buf[4096];   /* receives "<argv[0]>.out" */
23 |   /* Driver: redirect stderr to "<argv[0]>.out", then call each test_* and time_* routine. */
24 |   (void) argc;
25 | 
26 |   srand48 (69);     /* fixed seed for drand48 () */
27 | 
28 |   /* "%.*s" bounds the copied name so ".out" and the NUL always fit in buf */
29 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
30 |   /* freopen () rebinds the stream; assigning to stderr is not portable C */
31 |   if (freopen (buf, "w", stderr) == NULL)
32 |     return 1;
33 | 
34 |   test_fastlog ();
35 |   test_fasterlog ();
36 |   test_vfastlog ();
37 |   test_vfasterlog ();
38 | 
39 |   time_fastlog ();
40 |   time_fasterlog ();
41 |   time_vfastlog ();
42 |   time_vfasterlog ();
43 | 
44 |   return 0;
45 | }
46 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfastpow.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include <sys/time.h>
 7 | 
 8 | #include "../src/fastpow.h"
 9 | 
10 | #include "testmacros.h"
11 | 
12 | test_scalar2 (fastpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 1e-4f, 100000000)
13 | test_scalar2 (fasterpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 2e-2f, 100000000)
14 | 
15 | test_vector2 (vfastpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 1e-4f, 100000000)
16 | test_vector2 (vfasterpow, powf, 3.0f * drand48 (), -3.0f + 6.0f * drand48 (), 2e-2f, 100000000)
17 | 
18 | int 
19 | main (int   argc,
20 |       char *argv[])
21 | {
22 |   char buf[4096];   /* receives "<argv[0]>.out" */
23 |   /* Driver: redirect stderr to "<argv[0]>.out", then call each test_* and time_* routine. */
24 |   (void) argc;
25 | 
26 |   srand48 (69);     /* fixed seed for drand48 () */
27 | 
28 |   /* "%.*s" bounds the copied name so ".out" and the NUL always fit in buf */
29 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
30 |   /* freopen () rebinds the stream; assigning to stderr is not portable C */
31 |   if (freopen (buf, "w", stderr) == NULL)
32 |     return 1;
33 | 
34 |   test_fastpow ();
35 |   test_fasterpow ();
36 |   test_vfastpow ();
37 |   test_vfasterpow ();
38 | 
39 |   time_fastpow ();
40 |   time_fasterpow ();
41 |   time_vfastpow ();
42 |   time_vfasterpow ();
43 | 
44 |   return 0;
45 | }
46 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfastsigmoid.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include <sys/time.h>
 7 | 
 8 | #include "../src/fastsigmoid.h"
 9 | 
10 | #include "testmacros.h"
11 | 
12 | static inline float
13 | sigmoidf (float x)
14 | {  /* logistic function 1 / (1 + e^-x); the double literals make the division run in double */
15 |   return 1.0 / (1.0 + expf (-x));
16 | }
17 | 
18 | test_scalar (fastsigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 1e-4f, 100000000)
19 | test_scalar (fastersigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 2e-2f, 100000000)
20 | 
21 | test_vector (vfastsigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 1e-4f, 100000000)
22 | test_vector (vfastersigmoid, sigmoidf, -50.0f + 100.0f * drand48 (), 2e-2f, 100000000)
23 | 
24 | int 
25 | main (int   argc,
26 |       char *argv[])
27 | {
28 |   char buf[4096];   /* receives "<argv[0]>.out" */
29 |   /* Driver: redirect stderr to "<argv[0]>.out", then call each test_* and time_* routine. */
30 |   (void) argc;
31 | 
32 |   srand48 (69);     /* fixed seed for drand48 () */
33 | 
34 |   /* "%.*s" bounds the copied name so ".out" and the NUL always fit in buf */
35 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
36 |   /* freopen () rebinds the stream; assigning to stderr is not portable C */
37 |   if (freopen (buf, "w", stderr) == NULL)
38 |     return 1;
39 | 
40 |   test_fastsigmoid ();
41 |   test_fastersigmoid ();
42 |   test_vfastsigmoid ();
43 |   test_vfastersigmoid ();
44 | 
45 |   time_fastsigmoid ();
46 |   time_fastersigmoid ();
47 |   time_vfastsigmoid ();
48 |   time_vfastersigmoid ();
49 | 
50 |   return 0;
51 | }
52 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfastexp.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include <sys/time.h>
 7 | 
 8 | #include "../src/fastexp.h"
 9 | 
10 | #include "testmacros.h"
11 | 
12 | test_scalar (fastexp, expf, -5.0f + 10.0f * drand48 (), 1e-4f, 100000000)
13 | test_scalar (fasterexp, expf, -5.0f + 10.0f * drand48 (), 2e-2f, 100000000)
14 | 
15 | test_vector (vfastexp, expf, -5.0f + 10.0f * drand48 (), 1e-4f, 100000000)
16 | test_vector (vfasterexp, expf, -5.0f + 10.0f * drand48 (), 2e-2f, 100000000)
17 | 
18 | int 
19 | main (int   argc,
20 |       char *argv[])
21 | {
22 |   char buf[4096];   /* receives "<argv[0]>.out" */
23 |   /* Driver: sign checks first, then redirect stderr and call each test_* and time_* routine. */
24 |   (void) argc;
25 |   /* very negative arguments (-50, -150, ... -950) must never give a negative result */
26 |   float x;
27 |   for (x = -50; x > -1000; x -= 100)
28 |     {
29 |       assert (fastexp (x) >= 0);
30 |       assert (fasterexp (x) >= 0);
31 | #ifdef __SSE2__
32 |       v4sf vx = v4sfl (x);
33 |       assert (v4sf_index (vfastexp (vx), 0) >= 0);
34 |       assert (v4sf_index (vfasterexp (vx), 0) >= 0);
35 | #endif
36 |     }
37 | 
38 |   srand48 (69);     /* fixed seed for drand48 () */
39 | 
40 |   /* "%.*s" bounds the copied name so ".out" and the NUL always fit in buf */
41 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
42 |   /* freopen () rebinds the stream; assigning to stderr is not portable C */
43 |   if (freopen (buf, "w", stderr) == NULL)
44 |     return 1;
45 | 
46 | 
47 |   test_fastexp ();
48 |   test_fasterexp ();
49 |   test_vfastexp ();
50 |   test_vfasterexp ();
51 | 
52 |   time_fastexp ();
53 |   time_fasterexp ();
54 |   time_vfastexp ();
55 |   time_vfasterexp ();
56 | 
57 |   return 0;
58 | }
59 | 


--------------------------------------------------------------------------------
/fastapprox/COPYING:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011, Paul Mineiro
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions
 6 | are met:
 7 | 
 8 |   * Redistributions of source code must retain the above copyright notice,
 9 |     this list of conditions and the following disclaimer.
10 |   * Redistributions in binary form must reproduce the above copyright notice,
11 |     this list of conditions and the following disclaimer in the documentation
12 |     and/or other materials provided with the distribution.
13 |   * Neither the name of Paul Mineiro nor the names of its contributors
14 |     may be used to endorse or promote products derived from this software
15 |     without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27 | THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfasthyperbolic.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include <sys/time.h>
 7 | 
 8 | #include "../src/fasthyperbolic.h"
 9 | 
10 | #include "testmacros.h"
11 | 
12 | test_scalar (fastsinh, sinhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000)
13 | test_scalar (fastersinh, sinhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000)
14 | 
15 | test_vector (vfastsinh, sinhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000)
16 | test_vector (vfastersinh, sinhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000)
17 | 
18 | test_scalar (fastcosh, coshf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000)
19 | test_scalar (fastercosh, coshf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000)
20 | 
21 | test_vector (vfastcosh, coshf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000)
22 | test_vector (vfastercosh, coshf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000)
23 | 
24 | test_scalar (fasttanh, tanhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000)
25 | test_scalar (fastertanh, tanhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000)
26 | 
27 | test_vector (vfasttanh, tanhf, -25.0f + 50.0f * drand48 (), 1e-4f, 100000000)
28 | test_vector (vfastertanh, tanhf, -25.0f + 50.0f * drand48 (), 2e-2f, 100000000)
29 | 
30 | int 
31 | main (int   argc,
32 |       char *argv[])
33 | {
34 |   char buf[4096];   /* receives "<argv[0]>.out" */
35 |   /* Driver: redirect stderr to "<argv[0]>.out", then call each test_* and time_* routine. */
36 |   (void) argc;
37 | 
38 |   srand48 (69);     /* fixed seed for drand48 () */
39 | 
40 |   /* "%.*s" bounds the copied name so ".out" and the NUL always fit in buf */
41 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
42 |   /* freopen () rebinds the stream; assigning to stderr is not portable C */
43 |   if (freopen (buf, "w", stderr) == NULL)
44 |     return 1;
45 | 
46 |   test_fastsinh ();
47 |   test_fastersinh ();
48 |   test_fastcosh ();
49 |   test_fastercosh ();
50 |   test_fasttanh ();
51 |   test_fastertanh ();
52 |   test_vfastsinh ();
53 |   test_vfastersinh ();
54 |   test_vfastcosh ();
55 |   test_vfastercosh ();
56 |   test_vfasttanh ();
57 |   test_vfastertanh ();
58 | 
59 |   time_fastsinh ();
60 |   time_fastersinh ();
61 |   time_fastcosh ();
62 |   time_fastercosh ();
63 |   time_fasttanh ();
64 |   time_fastertanh ();
65 |   time_vfastsinh ();
66 |   time_vfastersinh ();
67 |   time_vfastcosh ();
68 |   time_vfastercosh ();
69 |   time_vfasttanh ();
70 |   time_vfastertanh ();
71 | 
72 |   return 0;
73 | }
74 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfastgamma.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include <sys/time.h>
 7 | 
 8 | #include "../src/config.h"
 9 | 
10 | #ifdef __cplusplus
11 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP
12 | #include <boost/math/special_functions/digamma.hpp>
13 | #endif
14 | #endif
15 | 
16 | #include "../src/fastgamma.h"
17 | 
18 | #include "testmacros.h"
19 | 
20 | test_scalar (fastlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000)
21 | test_scalar (fasterlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000)
22 | 
23 | test_vector (vfastlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000)
24 | test_vector (vfasterlgamma, lgammaf, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000)
25 | 
26 | #ifdef __cplusplus
27 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP
28 | test_scalar (fastdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000)
29 | test_scalar (fasterdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000)
30 | 
31 | test_vector (vfastdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 5e-4f, 100000000)
32 | test_vector (vfasterdigamma, boost::math::digamma, 1e-2f + 10.0f * drand48 (), 1e-1f, 100000000)
33 | #endif
34 | #endif
35 | 
36 | int 
37 | main (int   argc,
38 |       char *argv[])
39 | {
40 |   char buf[4096];   /* receives "<argv[0]>.out" */
41 |   /* Driver: redirect stderr to "<argv[0]>.out", then call each test_* and time_* routine. */
42 |   (void) argc;
43 | 
44 |   srand48 (69);     /* fixed seed for drand48 () */
45 | 
46 |   /* "%.*s" bounds the copied name so ".out" and the NUL always fit in buf */
47 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
48 |   /* freopen () rebinds the stream; assigning to stderr is not portable C */
49 |   if (freopen (buf, "w", stderr) == NULL)
50 |     return 1;
51 | 
52 |   test_fastlgamma ();
53 |   test_fasterlgamma ();
54 |   test_vfastlgamma ();
55 |   test_vfasterlgamma ();
56 |   /* digamma checks exist only in C++ builds that found the Boost digamma header */
57 | #ifdef __cplusplus
58 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP
59 |   test_fastdigamma ();
60 |   test_fasterdigamma ();
61 |   test_vfastdigamma ();
62 |   test_vfasterdigamma ();
63 | #endif
64 | #endif
65 | 
66 |   time_fastlgamma ();
67 |   time_fasterlgamma ();
68 |   time_vfastlgamma ();
69 |   time_vfasterlgamma ();
70 | 
71 | #ifdef __cplusplus
72 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP
73 |   time_fastdigamma ();
74 |   time_fasterdigamma ();
75 |   time_vfastdigamma ();
76 |   time_vfasterdigamma ();
77 | #endif
78 | #endif
79 |   /* exit 77 when the Boost header is absent (presumably the harness "skipped" status) */
80 | #ifdef HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_DIGAMMA_HPP
81 |   return 0;
82 | #else
83 |   return 77;
84 | #endif
85 | }
86 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfasterf.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include <sys/time.h>
 7 | 
 8 | #include "../src/fasterf.h"
 9 | 
10 | #include "testmacros.h"
11 | 
12 | static float
13 | slowinverseerf (float x)
14 | {
15 |   float y0 = 0.886227f * x;  /* starting guess: (sqrt (pi) / 2) * x */
16 |   float sqrtpi = 1.7724538509055160f;
17 |   /* five Newton steps on erff (y) = x, using d/dy erf (y) = 2 exp (-y*y) / sqrt (pi) */
18 |   y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0));
19 |   y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0));
20 |   y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0));
21 |   y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0));
22 |   y0 += 0.5f * exp (y0 * y0) * sqrtpi * (x - erff (y0));
23 |   /* passed to test_scalar/test_vector below in the slot other tests give a libm function */
24 |   return y0;
25 | }
26 | 
27 | test_scalar (fasterf, erff, -6.0f + 12.0f * drand48 (), 1e-3f, 100000000)
28 | test_scalar (fastererf, erff, -6.0f + 12.0f * drand48 (), 2e-2f, 100000000)
29 | test_vector (vfasterf, erff, -6.0f + 12.0f * drand48 (), 1e-3f, 100000000)
30 | test_vector (vfastererf, erff, -6.0f + 12.0f * drand48 (), 2e-2f, 100000000)
31 | 
32 | test_scalar (fastinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 2e-3f, 100000000)
33 | test_vector (vfastinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 2e-3f, 100000000)
34 | test_scalar (fasterinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 4e-2f, 100000000)
35 | test_vector (vfasterinverseerf, slowinverseerf, -0.99f + 1.98f * drand48 (), 4e-2f, 100000000)
36 | 
37 | test_scalar (fasterfc, erfcf, -2.0f + 4.0f * drand48 (), 5e-3f, 100000000)
38 | test_scalar (fastererfc, erfcf, -2.0f + 4.0f * drand48 (), 8e-2f, 100000000)
39 | test_vector (vfasterfc, erfcf, -2.0f + 4.0f * drand48 (), 5e-3f, 100000000)
40 | test_vector (vfastererfc, erfcf, -2.0f + 4.0f * drand48 (), 8e-2f, 100000000)
41 | 
42 | int 
43 | main (int   argc,
44 |       char *argv[])
45 | {
46 |   char buf[4096];   /* receives "<argv[0]>.out" */
47 |   /* Driver: redirect stderr to "<argv[0]>.out", then call each test_* and time_* routine. */
48 |   (void) argc;
49 | 
50 |   srand48 (69);     /* fixed seed for drand48 () */
51 | 
52 |   /* "%.*s" bounds the copied name so ".out" and the NUL always fit in buf */
53 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
54 |   /* freopen () rebinds the stream; assigning to stderr is not portable C */
55 |   if (freopen (buf, "w", stderr) == NULL)
56 |     return 1;
57 | 
58 |   test_fasterf ();
59 |   test_fastererf ();
60 |   test_vfasterf ();
61 |   test_vfastererf ();
62 |   test_fasterfc ();
63 |   test_fastererfc ();
64 |   test_vfasterfc ();
65 |   test_vfastererfc ();
66 |   test_fastinverseerf ();
67 |   test_vfastinverseerf ();
68 |   test_fasterinverseerf ();
69 |   test_vfasterinverseerf ();
70 | 
71 |   time_fasterf ();
72 |   time_fastererf ();
73 |   time_vfasterf ();
74 |   time_vfastererf ();
75 |   time_fasterfc ();
76 |   time_fastererfc ();
77 |   time_vfasterfc ();
78 |   time_vfastererfc ();
79 |   time_fastinverseerf ();
80 |   time_vfastinverseerf ();
81 |   time_fasterinverseerf ();
82 |   time_vfasterinverseerf ();
83 | 
84 |   return 0;
85 | }
86 | 


--------------------------------------------------------------------------------
/fastapprox/fw-pkgin/config:
--------------------------------------------------------------------------------
 1 | # The FW_PACKAGE_MAINTAINER field is populated with the environment
 2 | # variable FW_PACKAGE_DEFAULT_MAINTAINER if non-empty at init time
 3 | 
 4 | FW_PACKAGE_NAME="fastapprox"
 5 | FW_PACKAGE_VERSION="0.3.2"
 6 | FW_PACKAGE_MAINTAINER="Paul Mineiro <paul-fw@mineiro.com>"
 7 | FW_PACKAGE_SHORT_DESCRIPTION="Fast approximate function library."
 8 | FW_PACKAGE_DESCRIPTION=`cat README`
 9 | FW_PACKAGE_ARCHITECTURE_DEPENDENT="0"
10 | 
11 | # Dependency information.  The native syntax corresponds to Debian,
12 | # http://www.debian.org/doc/debian-policy/ch-relationships.html
13 | # Section 7.1 "Syntax of Relationship Fields"
14 | # 
15 | # For other packaging systems, the syntax is translated for you.
16 | 
17 | FW_PACKAGE_DEPENDS=""
18 | FW_PACKAGE_CONFLICTS=""
19 | FW_PACKAGE_PROVIDES=""
20 | FW_PACKAGE_REPLACES=""
21 | FW_PACKAGE_SUGGESTS=""
22 | 
23 | FW_PACKAGE_BUILD_DEPENDS=""
24 | FW_PACKAGE_BUILD_CONFLICTS=""
25 | 
26 | # dupload is used for submitting debian packages to a package archive
27 | # The FW_DUPLOAD_ARGS field is populated with the environment variable
28 | # FW_DEFAULT_DUPLOAD_ARGS if non-empty at init time
29 | 
30 | FW_DUPLOAD_ARGS=${FW_DUPLOAD_ARGS-"-t dukeslucid"}
31 | 
32 | # scp+createrepo is used for submitting rpm packages to a package archive
33 | # The FW_RPM_REPO_USER, FW_RPM_REPO_HOST, FW_RPM_REPO_BASEDIR,
34 | # and FW_RPM_POSTCREATEREPO_COMMANDS variables are populated with 
35 | # FW_RPM_REPO_USER_DEFAULT, FW_RPM_REPO_HOST_DEFAULT, 
36 | # FW_RPM_REPO_BASEDIR_DEFAULT, and FW_RPM_POSTCREATEREPO_COMMANDS_DEFAULT
37 | # respectively if non-empty at init time
38 | 
39 | FW_RPM_REPO_USER=${FW_RPM_REPO_USER-"`whoami`"}
40 | FW_RPM_REPO_HOST=${FW_RPM_REPO_HOST-"ub32srvvmw"}
41 | FW_RPM_REPO_BASEDIR=${FW_RPM_REPO_BASEDIR-"/var/yum"}
42 | FW_RPM_CREATEREPO_ARGS=${FW_RPM_CREATEREPO_ARGS-"-q --database"}
43 | 
44 | # this variable controls whether createrepo is run incrementally (--update).
45 | # possible settings are yes (always do it), no (never do it), and 
46 | # auto (do it if the repository has been previously initialized)
47 | FW_RPM_CREATEREPO_INCREMENTAL=${FW_RPM_CREATEREPO_INCREMENTAL-"auto"}
48 | 
49 | # these commands will be run after a successful createrepo run
50 | FW_RPM_POSTCREATEREPO_COMMANDS=${FW_RPM_POSTCREATEREPO_COMMANDS-"{ cd /var; rsync -a yum /var/package/dukes; }"}
51 | # here's a suggestion:
52 | # FW_RPM_POSTCREATEREPO_COMMANDS="gpg --detach-sign --armor repodata/repomd.xml" 
53 | 
54 | # set to the directory in which version-named tags will be created
55 | FW_SUBVERSION_TAG_ROOT="https://fastapprox.googlecode.com/svn/tags/fastapprox"
56 | 
57 | # uncomment and set to specify additional pkg-config packages on the Requires:
58 | # line of the generated .pc file
59 | # FW_PKGCONFIG_REQUIRES_EXTRA=""
60 | 
61 | # uncomment and set to specify additional content for the Libs:
62 | # line of the generated .pc file
63 | # FW_PKGCONFIG_LIBS_EXTRA=""
64 | 
65 | # uncomment and set to specify additional content for the Cflags:
66 | # line of the generated .pc file
67 | # FW_PKGCONFIG_CFLAGS_EXTRA=""
68 | 
69 | # uncomment and set to add arbitrary additional content to the 
70 | # generated .pc file
71 | # FW_PKGCONFIG_EXTRA=""
72 | 


--------------------------------------------------------------------------------
/fastapprox/src/cast.h:
--------------------------------------------------------------------------------
 1 | /*=====================================================================*
 2 |  *                   Copyright (C) 2012 Paul Mineiro                   *
 3 |  * All rights reserved.                                                *
 4 |  *                                                                     *
 5 |  * Redistribution and use in source and binary forms, with             *
 6 |  * or without modification, are permitted provided that the            *
 7 |  * following conditions are met:                                       *
 8 |  *                                                                     *
 9 |  *     * Redistributions of source code must retain the                *
10 |  *     above copyright notice, this list of conditions and             *
11 |  *     the following disclaimer.                                       *
12 |  *                                                                     *
13 |  *     * Redistributions in binary form must reproduce the             *
14 |  *     above copyright notice, this list of conditions and             *
15 |  *     the following disclaimer in the documentation and/or            *
16 |  *     other materials provided with the distribution.                 *
17 |  *                                                                     *
18 |  *     * Neither the name of Paul Mineiro nor the names                *
19 |  *     of other contributors may be used to endorse or promote         *
20 |  *     products derived from this software without specific            *
21 |  *     prior written permission.                                       *
22 |  *                                                                     *
23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
37 |  *                                                                     *
38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
39 |  *=====================================================================*/
40 | 
41 | #ifndef __CAST_H_
42 | #define __CAST_H_
43 | 
44 | #include <stdint.h>
45 | 
46 | /*
47 |  * cast_uint32_t (expr) converts expr to uint32_t with the conversion syntax
48 |  * of the language being compiled: static_cast<uint32_t> (expr) in C++, a
49 |  * plain (uint32_t) (expr) cast in C.  Always parenthesize the operand so
50 |  * the same call site is valid in both languages.
51 |  */
52 | #ifdef __cplusplus
53 | #define cast_uint32_t static_cast<uint32_t>
54 | #else
55 | #define cast_uint32_t (uint32_t)
56 | #endif
57 | 
58 | #endif // __CAST_H_
59 | 


--------------------------------------------------------------------------------
/fastapprox/tests/Makefile.am.local:
--------------------------------------------------------------------------------
  1 | # put whatever (auto)make commands here, they will be included from Makefile.am
  2 | 
  3 | check_PROGRAMS =			\
  4 |   testfastexp				\
  5 |   testfastexpcpp			\
  6 |   testfasterf				\
  7 |   testfasterfcpp			\
  8 |   testfastlog				\
  9 |   testfastlogcpp			\
 10 |   testfastpow				\
 11 |   testfastpowcpp			\
 12 |   testfastsigmoid			\
 13 |   testfastsigmoidcpp			\
 14 |   testfasthyperbolic			\
 15 |   testfasthyperboliccpp			\
 16 |   testfasttrig				\
 17 |   testfasttrigcpp			\
 18 |   testfastgamma				\
 19 |   testfastgammacpp			\
 20 |   testfastlambertw			\
 21 |   testfastlambertwcpp			\
 22 |   testfastonebigheader			\
 23 |   testfastonebigheadercpp
 24 | 
 25 | testfasterf_SOURCES =			\
 26 |   testfasterf.c
 27 | 
 28 | testfasterfcpp_SOURCES =		\
 29 |   testfasterfcpp.cc
 30 | 
 31 | testfasterf_LDADD =			\
 32 |   -lm
 33 | 
 34 | testfasterfcpp_LDADD =			\
 35 |   -lm
 36 | 
 37 | testfastexp_SOURCES =			\
 38 |   testfastexp.c
 39 | 
 40 | testfastexpcpp_SOURCES =		\
 41 |   testfastexpcpp.cc
 42 | 
 43 | testfastexp_LDADD =			\
 44 |   -lm
 45 | 
 46 | testfastexpcpp_LDADD =			\
 47 |   -lm
 48 | 
 49 | testfastgamma_SOURCES =			\
 50 |   testfastgamma.c
 51 | 
 52 | testfastgammacpp_SOURCES =		\
 53 |   testfastgammacpp.cc
 54 | 
 55 | testfastgamma_LDADD =			\
 56 |   -lm
 57 | 
 58 | testfastgammacpp_LDADD =		\
 59 |   -lm
 60 | 
 61 | testfasthyperbolic_SOURCES =		\
 62 |   testfasthyperbolic.c
 63 | 
 64 | testfasthyperboliccpp_SOURCES =		\
 65 |   testfasthyperboliccpp.cc
 66 | 
 67 | testfasthyperbolic_LDADD =		\
 68 |   -lm
 69 | 
 70 | testfasthyperboliccpp_LDADD =		\
 71 |   -lm
 72 | 
 73 | testfastlambertw_SOURCES =		\
 74 |   testfastlambertw.c
 75 | 
 76 | testfastlambertwcpp_SOURCES =		\
 77 |   testfastlambertwcpp.cc
 78 | 
 79 | testfastlambertw_LDADD =		\
 80 |   -lm ${GSL_LIBS}
 81 | 
 82 | testfastlambertwcpp_LDADD =		\
 83 |   -lm ${GSL_LIBS}
 84 | 
 85 | testfastlog_SOURCES =			\
 86 |   testfastlog.c
 87 | 
 88 | testfastlogcpp_SOURCES =		\
 89 |   testfastlogcpp.cc
 90 | 
 91 | testfastlog_LDADD =			\
 92 |   -lm
 93 | 
 94 | testfastlogcpp_LDADD =			\
 95 |   -lm
 96 | 
 97 | testfastonebigheader_SOURCES =		\
 98 |   testfastonebigheader.c
 99 | 
100 | testfastonebigheadercpp_SOURCES =	\
101 |   testfastonebigheadercpp.cc
102 | 
103 | testfastonebigheader_LDADD =		\
104 |   -lm
105 | 
106 | testfastonebigheadercpp_LDADD =		\
107 |   -lm
108 | 
109 | testfastpow_SOURCES =			\
110 |   testfastpow.c
111 | 
112 | testfastpowcpp_SOURCES =		\
113 |   testfastpowcpp.cc
114 | 
115 | testfastpow_LDADD =			\
116 |   -lm
117 | 
118 | testfastpowcpp_LDADD =			\
119 |   -lm
120 | 
121 | testfastsigmoid_SOURCES =		\
122 |   testfastsigmoid.c
123 | 
124 | testfastsigmoidcpp_SOURCES =		\
125 |   testfastsigmoidcpp.cc
126 | 
127 | testfastsigmoid_LDADD =			\
128 |   -lm
129 | 
130 | testfastsigmoidcpp_LDADD =		\
131 |   -lm
132 | 
133 | testfasttrig_SOURCES =			\
134 |   testfasttrig.c
135 | 
136 | testfasttrigcpp_SOURCES =		\
137 |   testfasttrigcpp.cc
138 | 
139 | testfasttrig_LDADD =			\
140 |   -lm
141 | 
142 | testfasttrigcpp_LDADD =			\
143 |   -lm
144 | 
145 | %cpp.cc: %.c
146 | 	cat $< > $@
147 | 
148 | TESTS =					\
149 |   $(check_PROGRAMS)
150 | 
151 | CLEANFILES +=				\
152 |   $(wildcard *cpp.cc)			\
153 |   $(wildcard *.out)
154 | 
155 | EXTRA_DIST = testmacros.h fastapprox.nb
156 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfastlambertw.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <math.h>
  3 | #include <stdlib.h>
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <sys/time.h>
  7 | 
  8 | #include "../src/fastlambertw.h"
  9 | 
 10 | #include "testmacros.h"
 11 | 
 12 | #include "../src/config.h"
 13 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H 
 14 | #include <gsl/gsl_sf_lambert.h>
 15 | #endif
 16 | 
 17 | /* Draw a test input for W: a fair coin picks between [-1/e, 1) and
 18 |  * [0, 100); a second drand48 () draw places the point in that range. */
 19 | static inline float
 20 | lambertwrange (void)
 21 | {
 22 |   const double coin = drand48 ();
 23 | 
 24 |   if (coin >= 0.5)
 25 |     return 100.0f * drand48 ();
 26 | 
 27 |   return -0.36787944117144232f + 1.36787944117144232f * drand48 ();
 28 | }
 29 | 
 30 | static inline float
 31 | lambertwf (float x)
 32 | {
 33 |   /* Reference W (x): Newton's method on w * exp (w) = x, seeded with 0
 34 |    * or, unless x < 5, with a logarithmic estimate; always 20 steps. */
 35 |   float w = 0;
 36 |   unsigned int left = 20;
 37 |   if (! (x < 5))
 38 |     w = log (x) - log (log (x)) + log (log (x)) / log (x);
 39 |   while (left-- > 0)
 40 |     w = (w * w + exp (-w) * x) / (1.0 + w);
 41 |   return w;
 42 | }
 43 | 
 44 | static inline float
 45 | lambertwexpxf (float x)
 46 | {
 47 |   return lambertwf (expf (x));  /* reference value: lambertwf applied to expf (x) */
 48 | }
 49 | 
 50 | test_scalar (fastlambertw, lambertwf, lambertwrange (), 1e-4f, 100000000)
 51 | test_scalar (fasterlambertw, lambertwf, lambertwrange (), 1e-2f, 100000000)
 52 | test_scalar (fastlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-3f, 100000000)
 53 | test_scalar (fasterlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-2f, 100000000)
 54 | 
 55 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H 
 56 | test_scalar (gsl_sf_lambert_W0, lambertwf, lambertwrange (), 1e-2f, 1000000)
 57 | #endif /* HAVE_GSL_GSL_SF_LAMBERT_H */
 58 | /* NOTE(review): arguments look like (function, reference, input, tolerance, count); see testmacros.h -- confirm. */
 59 | test_vector (vfastlambertw, lambertwf, lambertwrange (), 1e-4f, 100000000)
 60 | test_vector (vfasterlambertw, lambertwf, lambertwrange (), 1e-2f, 100000000)
 61 | test_vector (vfastlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-3f, 100000000)
 62 | test_vector (vfasterlambertwexpx, lambertwexpxf, -3.0f + 6.0f * drand48 (), 1e-2f, 100000000)
 63 | 
 64 | /*
 65 |  * Test driver.  Seeds drand48 (), redirects stderr to "<argv[0]>.out",
 66 |  * then calls every test_* function followed by every time_* function
 67 |  * (presumably generated by the test_scalar / test_vector macros above).
 68 |  * Returns 0, or EXIT_FAILURE when the output file cannot be opened.
 69 |  */
 70 | int
 71 | main (int   argc,
 72 |       char *argv[])
 73 | {
 74 |   char buf[4096];
 75 | 
 76 |   (void) argc;
 77 | 
 78 |   srand48 (69);
 79 | 
 80 | //  fprintf (stderr, "fastlambertw (%g) = %g, "
 81 | //                   "fastlambertw (%g) = %g, "
 82 | //                   "fasterlambertwexpx (%g) = %g (%g)\n",
 83 | //           -0.36787944117144232f,
 84 | //           fastlambertw (-0.36787944117144232f),
 85 | //           -0.36787944117144232f + 0.01f,
 86 | //           fastlambertw (-0.36787944117144232f + 0.01f),
 87 | //           -5.0f,
 88 | //           fasterlambertwexpx (-5.0f),
 89 | //           v4sf_index (vfasterlambertwexpx (v4sfl (-5.0f)), 0));
 90 | 
 91 |   /* snprintf () always NUL-terminates.  The strncpy () it replaces left buf
 92 |    * unterminated for a path of sizeof (buf) - 5 or more characters, which
 93 |    * made the strncat () that followed undefined.  The precision caps the
 94 |    * copied path so that ".out" always fits. */
 95 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
 96 | 
 97 |   /* Retarget stderr with freopen (): assigning to stderr is not portable,
 98 |    * and a failed open must not go unnoticed. */
 99 |   if (freopen (buf, "w", stderr) == NULL)
100 |     {
101 |       printf ("%s: cannot open %s for writing\n", argv[0], buf);
102 |       return EXIT_FAILURE;
103 |     }
104 | 
105 |   test_fastlambertw ();
106 |   test_fasterlambertw ();
107 |   test_vfastlambertw ();
108 |   test_vfasterlambertw ();
109 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H
110 |   test_gsl_sf_lambert_W0 ();
111 | #endif
112 |   test_fastlambertwexpx ();
113 |   test_fasterlambertwexpx ();
114 |   test_vfastlambertwexpx ();
115 |   test_vfasterlambertwexpx ();
116 | 
117 |   time_fastlambertw ();
118 |   time_fasterlambertw ();
119 |   time_vfastlambertw ();
120 |   time_vfasterlambertw ();
121 | #ifdef HAVE_GSL_GSL_SF_LAMBERT_H
122 |   time_gsl_sf_lambert_W0 ();
123 | #endif
124 |   time_fastlambertwexpx ();
125 |   time_fasterlambertwexpx ();
126 |   time_vfastlambertwexpx ();
127 |   time_vfasterlambertwexpx ();
128 | 
129 |   return 0;
130 | }
131 | 


--------------------------------------------------------------------------------
/fastapprox/ax_check_compile_flag.m4:
--------------------------------------------------------------------------------
 1 | # ===========================================================================
 2 | #   http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
 3 | # ===========================================================================
 4 | #
 5 | # SYNOPSIS
 6 | #
 7 | #   AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
 8 | #
 9 | # DESCRIPTION
10 | #
11 | #   Check whether the given FLAG works with the current language's compiler
12 | #   or gives an error.  (Warnings, however, are ignored)
13 | #
14 | #   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
15 | #   success/failure.
16 | #
17 | #   If EXTRA-FLAGS is defined, it is added to the current language's default
18 | #   flags (e.g. CFLAGS) when the check is done.  The check is thus made with
19 | #   the flags: "CFLAGS EXTRA-FLAGS FLAG".  This can for example be used to
20 | #   force the compiler to issue an error when a bad flag is given.
21 | #
22 | #   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
23 | #   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
24 | #
25 | # LICENSE
26 | #
27 | #   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
28 | #   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
29 | #
30 | #   This program is free software: you can redistribute it and/or modify it
31 | #   under the terms of the GNU General Public License as published by the
32 | #   Free Software Foundation, either version 3 of the License, or (at your
33 | #   option) any later version.
34 | #
35 | #   This program is distributed in the hope that it will be useful, but
36 | #   WITHOUT ANY WARRANTY; without even the implied warranty of
37 | #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
38 | #   Public License for more details.
39 | #
40 | #   You should have received a copy of the GNU General Public License along
41 | #   with this program. If not, see <http://www.gnu.org/licenses/>.
42 | #
43 | #   As a special exception, the respective Autoconf Macro's copyright owner
44 | #   gives unlimited permission to copy, distribute and modify the configure
45 | #   scripts that are the output of Autoconf when processing the Macro. You
46 | #   need not follow the terms of the GNU General Public License when using
47 | #   or distributing such scripts, even though portions of the text of the
48 | #   Macro appear in them. The GNU General Public License (GPL) does govern
49 | #   all other use of the material that constitutes the Autoconf Macro.
50 | #
51 | #   This special exception to the GPL applies to versions of the Autoconf
52 | #   Macro released by the Autoconf Archive. When you make and distribute a
53 | #   modified version of the Autoconf Macro, you may extend this special
54 | #   exception to the GPL to apply to your modified version as well.
55 | 
56 | #serial 2
57 | 
58 | # Implementation notes:
59 | #   * The result is cached in ax_cv_check_<lang>flags_<EXTRA-FLAGS>_<FLAG>.
60 | #   * The test saves the language's FLAGS variable, appends "EXTRA-FLAGS
61 | #     FLAG", compiles an empty AC_LANG_PROGRAM and restores the variable.
62 | #   * ACTION-SUCCESS runs on a "yes" result, ACTION-FAILURE otherwise; an
63 | #     omitted action defaults to the shell no-op ":".
64 | 
65 | AC_DEFUN([AX_CHECK_COMPILE_FLAG],
66 | [AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
67 | AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
68 | AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
69 |   ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
70 |   _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
71 |   AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
72 |     [AS_VAR_SET(CACHEVAR,[yes])],
73 |     [AS_VAR_SET(CACHEVAR,[no])])
74 |   _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
75 | AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
76 |   [m4_default([$2], :)],
77 |   [m4_default([$3], :)])
78 | AS_VAR_POPDEF([CACHEVAR])dnl
79 | ])dnl AX_CHECK_COMPILE_FLAG
80 | 


--------------------------------------------------------------------------------
/fastapprox/src/fastsigmoid.h:
--------------------------------------------------------------------------------
 1 | /*=====================================================================*
 2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 3 |  * All rights reserved.                                                *
 4 |  *                                                                     *
 5 |  * Redistribution and use in source and binary forms, with             *
 6 |  * or without modification, are permitted provided that the            *
 7 |  * following conditions are met:                                       *
 8 |  *                                                                     *
 9 |  *     * Redistributions of source code must retain the                *
10 |  *     above copyright notice, this list of conditions and             *
11 |  *     the following disclaimer.                                       *
12 |  *                                                                     *
13 |  *     * Redistributions in binary form must reproduce the             *
14 |  *     above copyright notice, this list of conditions and             *
15 |  *     the following disclaimer in the documentation and/or            *
16 |  *     other materials provided with the distribution.                 *
17 |  *                                                                     *
18 |  *     * Neither the name of Paul Mineiro nor the names                *
19 |  *     of other contributors may be used to endorse or promote         *
20 |  *     products derived from this software without specific            *
21 |  *     prior written permission.                                       *
22 |  *                                                                     *
23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
37 |  *                                                                     *
38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
39 |  *=====================================================================*/
40 | 
41 | #ifndef __FAST_SIGMOID_H_
42 | #define __FAST_SIGMOID_H_
43 | 
44 | #include <stdint.h>
45 | #include "sse.h"
46 | #include "fastexp.h"
47 | 
48 | /* Sigmoid built on fastexp: 1 / (1 + fastexp (-x)). */
49 | static inline float
50 | fastsigmoid (float x)
51 | {
52 |   const float decay = fastexp (-x);
53 | 
54 |   return 1.0f / (1.0f + decay);
55 | }
56 | 
57 | /* Sigmoid built on fasterexp: 1 / (1 + fasterexp (-x)). */
58 | static inline float
59 | fastersigmoid (float x)
60 | {
61 |   const float decay = fasterexp (-x);
62 | 
63 |   return 1.0f / (1.0f + decay);
64 | }
65 | 
66 | #ifdef __SSE2__
67 | 
68 | /* v4sf form of fastsigmoid, using vfastexp. */
69 | static inline v4sf
70 | vfastsigmoid (const v4sf x)
71 | {
72 |   const v4sf one = v4sfl (1.0f);
73 |   const v4sf decay = vfastexp (-x);
74 | 
75 |   return one / (one + decay);
76 | }
77 | 
78 | /* v4sf form of fastersigmoid, using vfasterexp. */
79 | static inline v4sf
80 | vfastersigmoid (const v4sf x)
81 | {
82 |   const v4sf one = v4sfl (1.0f);
83 |   const v4sf decay = vfasterexp (-x);
84 | 
85 |   return one / (one + decay);
86 | }
87 | 
88 | #endif //__SSE2__
89 | 
90 | #endif // __FAST_SIGMOID_H_
91 | 


--------------------------------------------------------------------------------
/fastapprox/src/fastpow.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /*=====================================================================*
 3 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 4 |  * All rights reserved.                                                *
 5 |  *                                                                     *
 6 |  * Redistribution and use in source and binary forms, with             *
 7 |  * or without modification, are permitted provided that the            *
 8 |  * following conditions are met:                                       *
 9 |  *                                                                     *
10 |  *     * Redistributions of source code must retain the                *
11 |  *     above copyright notice, this list of conditions and             *
12 |  *     the following disclaimer.                                       *
13 |  *                                                                     *
14 |  *     * Redistributions in binary form must reproduce the             *
15 |  *     above copyright notice, this list of conditions and             *
16 |  *     the following disclaimer in the documentation and/or            *
17 |  *     other materials provided with the distribution.                 *
18 |  *                                                                     *
19 |  *     * Neither the name of Paul Mineiro nor the names                *
20 |  *     of other contributors may be used to endorse or promote         *
21 |  *     products derived from this software without specific            *
22 |  *     prior written permission.                                       *
23 |  *                                                                     *
24 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
25 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
26 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
27 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
28 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
29 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
30 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
31 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
32 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
33 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
34 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
35 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
36 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
37 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
38 |  *                                                                     *
39 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
40 |  *=====================================================================*/
41 | 
42 | #ifndef __FAST_POW_H_
43 | #define __FAST_POW_H_
44 | 
45 | #include <stdint.h>
46 | #include "sse.h"
47 | #include "fastexp.h"
48 | #include "fastlog.h"
49 | 
50 | /* x raised to p, evaluated as fastpow2 (p * fastlog2 (x)). */
51 | static inline float
52 | fastpow (float x,
53 |          float p)
54 | {
55 |   const float lg = fastlog2 (x);
56 | 
57 |   return fastpow2 (p * lg);
58 | }
59 | 
60 | /* x raised to p, evaluated as fasterpow2 (p * fasterlog2 (x)). */
61 | static inline float
62 | fasterpow (float x,
63 |            float p)
64 | {
65 |   const float lg = fasterlog2 (x);
66 | 
67 |   return fasterpow2 (p * lg);
68 | }
69 | 
70 | #ifdef __SSE2__
71 | 
72 | /* v4sf form of fastpow, using vfastpow2 and vfastlog2. */
73 | static inline v4sf
74 | vfastpow (const v4sf x,
75 |           const v4sf p)
76 | {
77 |   const v4sf lg = vfastlog2 (x);
78 | 
79 |   return vfastpow2 (p * lg);
80 | }
81 | 
82 | /* v4sf form of fasterpow, using vfasterpow2 and vfasterlog2. */
83 | static inline v4sf
84 | vfasterpow (const v4sf x,
85 |             const v4sf p)
86 | {
87 |   const v4sf lg = vfasterlog2 (x);
88 | 
89 |   return vfasterpow2 (p * lg);
90 | }
91 | 
92 | #endif //__SSE2__
93 | 
94 | #endif // __FAST_POW_H_
95 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testfasttrig.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <math.h>
  3 | #include <stdlib.h>
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <sys/time.h>
  7 | 
  8 | #include "../src/fasttrig.h"
  9 | 
 10 | #include "testmacros.h"
 11 | 
 12 | static const float pi = 3.1415926535897932f;      /* pi */
 13 | static const float twopi = 6.2831853071795865f;   /* 2 * pi */
 14 | static const float halfpi = 1.5707963267948966f;  /* pi / 2 */
 15 | 
 16 | /* Uniform sample of [-100, 100) lying at least 0.001 from every multiple
 17 |  * of pi/2.  remainderf () gives the signed distance to the NEAREST multiple;
 18 |  * the former truncating division missed points on the zero-ward side. */
 19 | static inline float
 20 | no_half_pi (void)
 21 | {
 22 |   float rv;
 23 | 
 24 |   do
 25 |     {
 26 |       rv = -100.0 + 200.0 * drand48 ();
 27 |     }
 28 |   while (fabsf (remainderf (rv, halfpi)) < 0.001f);
 29 | 
 30 |   return rv;
 31 | }
 32 | 
 33 | test_scalar (fastsin, sinf, -pi + twopi * drand48 (), 1e-4f, 100000000)
 34 | test_scalar (fastersin, sinf, -pi + twopi * drand48 (), 2e-2f, 100000000)
 35 | test_scalar (fastsinfull, sinf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000)
 36 | test_scalar (fastersinfull, sinf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000)
 37 | /* NOTE(review): arguments look like (function, reference, input, tolerance, count); see testmacros.h -- confirm. */
 38 | test_vector (vfastsin, sinf, -pi + twopi * drand48 (), 1e-4f, 100000000)
 39 | test_vector (vfastersin, sinf, -pi + twopi * drand48 (), 2e-2f, 100000000)
 40 | test_vector (vfastsinfull, sinf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000)
 41 | test_vector (vfastersinfull, sinf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000)
 42 | 
 43 | test_scalar (fastcos, cosf, -pi + twopi * drand48 (), 1e-4f, 100000000)
 44 | test_scalar (fastercos, cosf, -pi + twopi * drand48 (), 2e-2f, 100000000)
 45 | test_scalar (fastcosfull, cosf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000)
 46 | test_scalar (fastercosfull, cosf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000)
 47 | 
 48 | test_vector (vfastcos, cosf, -pi + twopi * drand48 (), 1e-4f, 100000000)
 49 | test_vector (vfastercos, cosf, -pi + twopi * drand48 (), 2e-2f, 100000000)
 50 | test_vector (vfastcosfull, cosf, -100.0 + 200.0 * drand48 (), 1e-4f, 100000000)
 51 | test_vector (vfastercosfull, cosf, -100.0 + 200.0 * drand48 (), 2e-2f, 100000000)
 52 | /* Tangent: the "full" variants take their inputs from no_half_pi () instead of a plain uniform draw. */
 53 | test_scalar (fasttan, tanf, -0.5f * pi + pi * drand48 (), 1e-4f, 100000000)
 54 | test_scalar (fastertan, tanf, -0.5f * pi + pi * drand48 (), 2e-2f, 100000000)
 55 | test_scalar (fasttanfull, tanf, no_half_pi (), 1e-4f, 100000000)
 56 | test_scalar (fastertanfull, tanf, no_half_pi (), 2e-2f, 100000000)
 57 | 
 58 | test_vector (vfasttan, tanf, -0.5f * pi + pi * drand48 (), 1e-4f, 100000000)
 59 | test_vector (vfastertan, tanf, -0.5f * pi + pi * drand48 (), 2e-2f, 100000000)
 60 | test_vector (vfasttanfull, tanf, no_half_pi (), 1e-4f, 100000000)
 61 | test_vector (vfastertanfull, tanf, no_half_pi (), 2e-2f, 100000000)
 62 | 
 63 | /*
 64 |  * Test driver.  Seeds drand48 (), redirects stderr to "<argv[0]>.out",
 65 |  * then calls every test_* function followed by every time_* function
 66 |  * (presumably generated by the test_scalar / test_vector macros above).
 67 |  * Returns 0, or EXIT_FAILURE when the output file cannot be opened.
 68 |  */
 69 | int
 70 | main (int   argc,
 71 |       char *argv[])
 72 | {
 73 |   char buf[4096];
 74 | 
 75 |   (void) argc;
 76 | 
 77 |   srand48 (69);
 78 | 
 79 |   /* snprintf () always NUL-terminates.  The strncpy () it replaces left buf
 80 |    * unterminated for a path of sizeof (buf) - 5 or more characters, which
 81 |    * made the strncat () that followed undefined.  The precision caps the
 82 |    * copied path so that ".out" always fits. */
 83 |   snprintf (buf, sizeof (buf), "%.*s.out", (int) (sizeof (buf) - 5), argv[0]);
 84 | 
 85 |   /* Retarget stderr with freopen (): assigning to stderr is not portable,
 86 |    * and a failed open must not go unnoticed. */
 87 |   if (freopen (buf, "w", stderr) == NULL)
 88 |     {
 89 |       printf ("%s: cannot open %s for writing\n", argv[0], buf);
 90 |       return EXIT_FAILURE;
 91 |     }
 92 | 
 93 |   test_fastsin ();
 94 |   test_fastersin ();
 95 |   test_fastsinfull ();
 96 |   test_fastersinfull ();
 97 |   test_fastcos ();
 98 |   test_fastercos ();
 99 |   test_fastcosfull ();
100 |   test_fastercosfull ();
101 |   test_fasttan ();
102 |   test_fastertan ();
103 |   test_fasttanfull ();
104 |   test_fastertanfull ();
105 |   test_vfastsin ();
106 |   test_vfastersin ();
107 |   test_vfastsinfull ();
108 |   test_vfastersinfull ();
109 |   test_vfastcos ();
110 |   test_vfastercos ();
111 |   test_vfastcosfull ();
112 |   test_vfastercosfull ();
113 |   test_vfasttan ();
114 |   test_vfastertan ();
115 |   test_vfasttanfull ();
116 |   test_vfastertanfull ();
117 | 
118 |   time_fastsin ();
119 |   time_fastersin ();
120 |   time_fastsinfull ();
121 |   time_fastersinfull ();
122 |   time_fastcos ();
123 |   time_fastercos ();
124 |   time_fastcosfull ();
125 |   time_fastercosfull ();
126 |   time_fasttan ();
127 |   time_fastertan ();
128 |   time_fasttanfull ();
129 |   time_fastertanfull ();
130 |   time_vfastsin ();
131 |   time_vfastersin ();
132 |   time_vfastsinfull ();
133 |   time_vfastersinfull ();
134 |   time_vfastcos ();
135 |   time_vfastercos ();
136 |   time_vfastcosfull ();
137 |   time_vfastercosfull ();
138 |   time_vfasttan ();
139 |   time_vfastertan ();
140 |   time_vfasttanfull ();
141 |   time_vfastertanfull ();
142 | 
143 |   return 0;
144 | }
145 | 


--------------------------------------------------------------------------------
/fastapprox/src/fasthyperbolic.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __FAST_HYPERBOLIC_H_
 42 | #define __FAST_HYPERBOLIC_H_
 43 | 
 44 | #include <stdint.h>
 45 | #include "sse.h"
 46 | #include "fastexp.h"
 47 | 
 48 | /*
 49 |  * Hyperbolic functions assembled from the exponential helpers: each fast*
 50 |  * routine calls fastexp, each faster* routine calls fasterexp, and the
 51 |  * v-prefixed routines apply the same formulas to v4sf values.
 52 |  */
 53 | 
 54 | /* sinh: half the difference of fastexp (p) and fastexp (-p). */
 55 | static inline float
 56 | fastsinh (float p)
 57 | {
 58 |   const float grow = fastexp (p);
 59 |   const float decay = fastexp (-p);
 60 | 
 61 |   return 0.5f * (grow - decay);
 62 | }
 63 | 
 64 | /* sinh: half the difference of fasterexp (p) and fasterexp (-p). */
 65 | static inline float
 66 | fastersinh (float p)
 67 | {
 68 |   const float grow = fasterexp (p);
 69 |   const float decay = fasterexp (-p);
 70 | 
 71 |   return 0.5f * (grow - decay);
 72 | }
 73 | 
 74 | /* cosh: half the sum of fastexp (p) and fastexp (-p). */
 75 | static inline float
 76 | fastcosh (float p)
 77 | {
 78 |   const float grow = fastexp (p);
 79 |   const float decay = fastexp (-p);
 80 | 
 81 |   return 0.5f * (grow + decay);
 82 | }
 83 | 
 84 | /* cosh: half the sum of fasterexp (p) and fasterexp (-p). */
 85 | static inline float
 86 | fastercosh (float p)
 87 | {
 88 |   const float grow = fasterexp (p);
 89 |   const float decay = fasterexp (-p);
 90 | 
 91 |   return 0.5f * (grow + decay);
 92 | }
 93 | 
 94 | /* tanh: -1 + 2 / (1 + fastexp (-2 p)). */
 95 | static inline float
 96 | fasttanh (float p)
 97 | {
 98 |   const float decay = fastexp (-2.0f * p);
 99 | 
100 |   return -1.0f + 2.0f / (1.0f + decay);
101 | }
102 | 
103 | /* tanh: -1 + 2 / (1 + fasterexp (-2 p)). */
104 | static inline float
105 | fastertanh (float p)
106 | {
107 |   const float decay = fasterexp (-2.0f * p);
108 | 
109 |   return -1.0f + 2.0f / (1.0f + decay);
110 | }
111 | 
112 | #ifdef __SSE2__
113 | 
114 | /* v4sf form of fastsinh, using vfastexp. */
115 | static inline v4sf
116 | vfastsinh (const v4sf p)
117 | {
118 |   const v4sf half = v4sfl (0.5f);
119 |   const v4sf grow = vfastexp (p);
120 |   const v4sf decay = vfastexp (-p);
121 | 
122 |   return half * (grow - decay);
123 | }
124 | 
125 | /* v4sf form of fastersinh, using vfasterexp. */
126 | static inline v4sf
127 | vfastersinh (const v4sf p)
128 | {
129 |   const v4sf half = v4sfl (0.5f);
130 |   const v4sf grow = vfasterexp (p);
131 |   const v4sf decay = vfasterexp (-p);
132 | 
133 |   return half * (grow - decay);
134 | }
135 | 
136 | /* v4sf form of fastcosh, using vfastexp. */
137 | static inline v4sf
138 | vfastcosh (const v4sf p)
139 | {
140 |   const v4sf half = v4sfl (0.5f);
141 |   const v4sf grow = vfastexp (p);
142 |   const v4sf decay = vfastexp (-p);
143 | 
144 |   return half * (grow + decay);
145 | }
146 | 
147 | /* v4sf form of fastercosh, using vfasterexp. */
148 | static inline v4sf
149 | vfastercosh (const v4sf p)
150 | {
151 |   const v4sf half = v4sfl (0.5f);
152 |   const v4sf grow = vfasterexp (p);
153 |   const v4sf decay = vfasterexp (-p);
154 | 
155 |   return half * (grow + decay);
156 | }
157 | 
158 | /* v4sf form of fasttanh, using vfastexp. */
159 | static inline v4sf
160 | vfasttanh (const v4sf p)
161 | {
162 |   const v4sf one = v4sfl (1.0f);
163 |   const v4sf two = v4sfl (2.0f);
164 |   const v4sf decay = vfastexp (-two * p);
165 | 
166 |   return -one + two / (one + decay);
167 | }
168 | 
169 | /* v4sf form of fastertanh, using vfasterexp. */
170 | static inline v4sf
171 | vfastertanh (const v4sf p)
172 | {
173 |   const v4sf one = v4sfl (1.0f);
174 |   const v4sf two = v4sfl (2.0f);
175 |   const v4sf decay = vfasterexp (-two * p);
176 | 
177 |   return -one + two / (one + decay);
178 | }
179 | 
180 | #endif //__SSE2__
181 | 
182 | #endif // __FAST_HYPERBOLIC_H_
183 | 


--------------------------------------------------------------------------------
/fastapprox/src/fastlog.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __FAST_LOG_H_
 42 | #define __FAST_LOG_H_
 43 | 
 44 | #include <stdint.h>
 45 | #include "sse.h"
 46 | 
 47 | static inline float 
 48 | fastlog2 (float x)
 49 | {
 50 |   union { float f; uint32_t i; } vx = { x };
 51 |   union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
 52 |   float y = vx.i;
 53 |   y *= 1.1920928955078125e-7f;
 54 | 
 55 |   return y - 124.22551499f
 56 |            - 1.498030302f * mx.f 
 57 |            - 1.72587999f / (0.3520887068f + mx.f);
 58 | }
 59 | 
 60 | static inline float
 61 | fastlog (float x)
 62 | {
 63 |   return 0.69314718f * fastlog2 (x);
 64 | }
 65 | 
 66 | static inline float 
 67 | fasterlog2 (float x)
 68 | {
 69 |   union { float f; uint32_t i; } vx = { x };
 70 |   float y = vx.i;
 71 |   y *= 1.1920928955078125e-7f;
 72 |   return y - 126.94269504f;
 73 | }
 74 | 
 75 | static inline float
 76 | fasterlog (float x)
 77 | {
 78 | //  return 0.69314718f * fasterlog2 (x);
 79 | 
 80 |   union { float f; uint32_t i; } vx = { x };
 81 |   float y = vx.i;
 82 |   y *= 8.2629582881927490e-8f;
 83 |   return y - 87.989971088f;
 84 | }
 85 | 
 86 | #ifdef __SSE2__
 87 | 
 88 | static inline v4sf
 89 | vfastlog2 (v4sf x)
 90 | {
 91 |   union { v4sf f; v4si i; } vx = { x };
 92 |   union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000);
 93 |   v4sf y = v4si_to_v4sf (vx.i);
 94 |   y *= v4sfl (1.1920928955078125e-7f);
 95 | 
 96 |   const v4sf c_124_22551499 = v4sfl (124.22551499f);
 97 |   const v4sf c_1_498030302 = v4sfl (1.498030302f);
 98 |   const v4sf c_1_725877999 = v4sfl (1.72587999f);
 99 |   const v4sf c_0_3520087068 = v4sfl (0.3520887068f);
100 | 
101 |   return y - c_124_22551499
102 |            - c_1_498030302 * mx.f 
103 |            - c_1_725877999 / (c_0_3520087068 + mx.f);
104 | }
105 | 
106 | static inline v4sf
107 | vfastlog (v4sf x)
108 | {
109 |   const v4sf c_0_69314718 = v4sfl (0.69314718f);
110 | 
111 |   return c_0_69314718 * vfastlog2 (x);
112 | }
113 | 
114 | static inline v4sf 
115 | vfasterlog2 (v4sf x)
116 | {
117 |   union { v4sf f; v4si i; } vx = { x };
118 |   v4sf y = v4si_to_v4sf (vx.i);
119 |   y *= v4sfl (1.1920928955078125e-7f);
120 | 
121 |   const v4sf c_126_94269504 = v4sfl (126.94269504f);
122 | 
123 |   return y - c_126_94269504;
124 | }
125 | 
126 | static inline v4sf
127 | vfasterlog (v4sf x)
128 | {
129 | //  const v4sf c_0_69314718 = v4sfl (0.69314718f);
130 | //
131 | //  return c_0_69314718 * vfasterlog2 (x);
132 | 
133 |   union { v4sf f; v4si i; } vx = { x };
134 |   v4sf y = v4si_to_v4sf (vx.i);
135 |   y *= v4sfl (8.2629582881927490e-8f);
136 | 
137 |   const v4sf c_87_989971088 = v4sfl (87.989971088f);
138 | 
139 |   return y - c_87_989971088;
140 | }
141 | 
142 | #endif // __SSE2__
143 | 
144 | #endif // __FAST_LOG_H_
145 | 


--------------------------------------------------------------------------------
/fastapprox/src/fastexp.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __FAST_EXP_H_
 42 | #define __FAST_EXP_H_
 43 | 
 44 | #include <stdint.h>
 45 | #include "cast.h"
 46 | #include "sse.h"
 47 | 
// Numerical routines commonly rely on the exponential underflowing for very
// negative arguments, so the input is clamped here to handle that case.
 50 | 
 51 | static inline float
 52 | fastpow2 (float p)
 53 | {
 54 |   float offset = (p < 0) ? 1.0f : 0.0f;
 55 |   float clipp = (p < -126) ? -126.0f : p;
 56 |   int w = clipp;
 57 |   float z = clipp - w + offset;
 58 |   union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };
 59 | 
 60 |   return v.f;
 61 | }
 62 | 
 63 | static inline float
 64 | fastexp (float p)
 65 | {
 66 |   return fastpow2 (1.442695040f * p);
 67 | }
 68 | 
 69 | static inline float
 70 | fasterpow2 (float p)
 71 | {
 72 |   float clipp = (p < -126) ? -126.0f : p;
 73 |   union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) };
 74 |   return v.f;
 75 | }
 76 | 
 77 | static inline float
 78 | fasterexp (float p)
 79 | {
 80 |   return fasterpow2 (1.442695040f * p);
 81 | }
 82 | 
 83 | #ifdef __SSE2__
 84 | 
 85 | static inline v4sf
 86 | vfastpow2 (const v4sf p)
 87 | {
 88 |   v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f));
 89 |   v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f));
 90 |   v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
 91 |   v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
 92 |   v4si w = v4sf_to_v4si (clipp);
 93 |   v4sf z = clipp - v4si_to_v4sf (w) + offset;
 94 | 
 95 |   const v4sf c_121_2740838 = v4sfl (121.2740575f);
 96 |   const v4sf c_27_7280233 = v4sfl (27.7280233f);
 97 |   const v4sf c_4_84252568 = v4sfl (4.84252568f);
 98 |   const v4sf c_1_49012907 = v4sfl (1.49012907f);
 99 |   union { v4si i; v4sf f; } v = {
100 |     v4sf_to_v4si (
101 |       v4sfl (1 << 23) * 
102 |       (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z)
103 |     )
104 |   };
105 | 
106 |   return v.f;
107 | }
108 | 
109 | static inline v4sf
110 | vfastexp (const v4sf p)
111 | {
112 |   const v4sf c_invlog_2 = v4sfl (1.442695040f);
113 | 
114 |   return vfastpow2 (c_invlog_2 * p);
115 | }
116 | 
117 | static inline v4sf
118 | vfasterpow2 (const v4sf p)
119 | {
120 |   const v4sf c_126_94269504 = v4sfl (126.94269504f);
121 |   v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
122 |   v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
123 |   union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) };
124 |   return v.f;
125 | }
126 | 
127 | static inline v4sf
128 | vfasterexp (const v4sf p)
129 | {
130 |   const v4sf c_invlog_2 = v4sfl (1.442695040f);
131 | 
132 |   return vfasterpow2 (c_invlog_2 * p);
133 | }
134 | 
135 | #endif //__SSE2__
136 | 
137 | #endif // __FAST_EXP_H_
138 | 


--------------------------------------------------------------------------------
/fastapprox/src/fastgamma.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __FAST_GAMMA_H_
 42 | #define __FAST_GAMMA_H_
 43 | 
 44 | #include <stdint.h>
 45 | #include "sse.h"
 46 | #include "fastlog.h"
 47 | 
 48 | /* gamma/digamma functions only work for positive inputs */
 49 | 
 50 | static inline float
 51 | fastlgamma (float x)
 52 | {
 53 |   float logterm = fastlog (x * (1.0f + x) * (2.0f + x));
 54 |   float xp3 = 3.0f + x;
 55 | 
 56 |   return - 2.081061466f 
 57 |          - x 
 58 |          + 0.0833333f / xp3 
 59 |          - logterm 
 60 |          + (2.5f + x) * fastlog (xp3);
 61 | }
 62 | 
 63 | static inline float
 64 | fasterlgamma (float x)
 65 | {
 66 |   return - 0.0810614667f 
 67 |          - x
 68 |          - fasterlog (x)
 69 |          + (0.5f + x) * fasterlog (1.0f + x);
 70 | }
 71 | 
 72 | static inline float
 73 | fastdigamma (float x)
 74 | {
 75 |   float twopx = 2.0f + x;
 76 |   float logterm = fastlog (twopx);
 77 | 
 78 |   return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) /
 79 |          (12.0f * x * (1.0f + x) * twopx * twopx)
 80 |          + logterm;
 81 | }
 82 | 
 83 | static inline float
 84 | fasterdigamma (float x)
 85 | {
 86 |   float onepx = 1.0f + x;
 87 | 
 88 |   return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx);
 89 | }
 90 | 
 91 | #ifdef __SSE2__
 92 | 
 93 | static inline v4sf
 94 | vfastlgamma (v4sf x)
 95 | {
 96 |   const v4sf c_1_0 = v4sfl (1.0f);
 97 |   const v4sf c_2_0 = v4sfl (2.0f);
 98 |   const v4sf c_3_0 = v4sfl (3.0f);
 99 |   const v4sf c_2_081061466 = v4sfl (2.081061466f);
100 |   const v4sf c_0_0833333 = v4sfl (0.0833333f);
101 |   const v4sf c_2_5 = v4sfl (2.5f);
102 | 
103 |   v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x));
104 |   v4sf xp3 = c_3_0 + x;
105 | 
106 |   return - c_2_081061466
107 |          - x 
108 |          + c_0_0833333 / xp3 
109 |          - logterm 
110 |          + (c_2_5 + x) * vfastlog (xp3);
111 | }
112 | 
113 | static inline v4sf
114 | vfasterlgamma (v4sf x)
115 | {
116 |   const v4sf c_0_0810614667 = v4sfl (0.0810614667f);
117 |   const v4sf c_0_5 = v4sfl (0.5f);
118 |   const v4sf c_1 = v4sfl (1.0f);
119 | 
120 |   return - c_0_0810614667
121 |          - x
122 |          - vfasterlog (x)
123 |          + (c_0_5 + x) * vfasterlog (c_1 + x);
124 | }
125 | 
126 | static inline v4sf
127 | vfastdigamma (v4sf x)
128 | {
129 |   v4sf twopx = v4sfl (2.0f) + x;
130 |   v4sf logterm = vfastlog (twopx);
131 | 
132 |   return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) /
133 |          (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx)
134 |          + logterm;
135 | }
136 | 
137 | static inline v4sf
138 | vfasterdigamma (v4sf x)
139 | {
140 |   const v4sf c_1_0 = v4sfl (1.0f);
141 |   const v4sf c_2_0 = v4sfl (2.0f);
142 |   v4sf onepx = c_1_0 + x;
143 | 
144 |   return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx);
145 | }
146 | 
147 | #endif //__SSE2__
148 | 
149 | #endif // __FAST_GAMMA_H_
150 | 


--------------------------------------------------------------------------------
/fastapprox/src/fasterf.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __FAST_ERF_H_
 42 | #define __FAST_ERF_H_
 43 | 
 44 | #include <math.h>
 45 | #include <stdint.h>
 46 | #include "sse.h"
 47 | #include "fastexp.h"
 48 | #include "fastlog.h"
 49 | 
 50 | // fasterfc: not actually faster than erfcf(3) on newer machines!
 51 | // ... although vectorized version is interesting
 52 | //     and fastererfc is very fast
 53 | 
 54 | static inline float
 55 | fasterfc (float x)
 56 | {
 57 |   static const float k = 3.3509633149424609f;
 58 |   static const float a = 0.07219054755431126f;
 59 |   static const float b = 15.418191568719577f;
 60 |   static const float c = 5.609846028328545f;
 61 | 
 62 |   union { float f; uint32_t i; } vc = { c * x };
 63 |   float xsq = x * x;
 64 |   float xquad = xsq * xsq;
 65 | 
 66 |   vc.i |= 0x80000000;
 67 | 
 68 |   return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f);
 69 | }
 70 | 
 71 | static inline float
 72 | fastererfc (float x)
 73 | {
 74 |   static const float k = 3.3509633149424609f;
 75 | 
 76 |   return 2.0f / (1.0f + fasterpow2 (k * x));
 77 | }
 78 | 
 79 | // fasterf: not actually faster than erff(3) on newer machines! 
 80 | // ... although vectorized version is interesting
 81 | //     and fastererf is very fast
 82 | 
 83 | static inline float
 84 | fasterf (float x)
 85 | {
 86 |   return 1.0f - fasterfc (x);
 87 | }
 88 | 
 89 | static inline float
 90 | fastererf (float x)
 91 | {
 92 |   return 1.0f - fastererfc (x);
 93 | }
 94 | 
 95 | static inline float
 96 | fastinverseerf (float x)
 97 | {
 98 |   static const float invk = 0.30004578719350504f;
 99 |   static const float a = 0.020287853348211326f;
100 |   static const float b = 0.07236892874789555f;
101 |   static const float c = 0.9913030456864257f;
102 |   static const float d = 0.8059775923760193f;
103 | 
104 |   float xsq = x * x;
105 | 
106 |   return invk * fastlog2 ((1.0f + x) / (1.0f - x)) 
107 |        + x * (a - b * xsq) / (c - d * xsq);
108 | }
109 | 
// Cruder inverse error function: only the scaled fasterlog2 of
// (1 + x) / (1 - x), with no rational correction.
static inline float
fasterinverseerf (float x)
{
  const float inv_rate = 0.30004578719350504f;
  const float odds = (1.0f + x) / (1.0f - x);

  return inv_rate * fasterlog2 (odds);
}
117 | 
118 | #ifdef __SSE2__
119 | 
120 | static inline v4sf
121 | vfasterfc (v4sf x)
122 | {
123 |   const v4sf k = v4sfl (3.3509633149424609f);
124 |   const v4sf a = v4sfl (0.07219054755431126f);
125 |   const v4sf b = v4sfl (15.418191568719577f);
126 |   const v4sf c = v4sfl (5.609846028328545f);
127 | 
128 |   union { v4sf f; v4si i; } vc; vc.f = c * x;
129 |   vc.i |= v4sil (0x80000000);
130 | 
131 |   v4sf xsq = x * x;
132 |   v4sf xquad = xsq * xsq;
133 | 
134 |   return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f);
135 | }
136 | 
137 | static inline v4sf
138 | vfastererfc (const v4sf x)
139 | {
140 |   const v4sf k = v4sfl (3.3509633149424609f);
141 | 
142 |   return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x));
143 | }
144 | 
145 | static inline v4sf
146 | vfasterf (v4sf x)
147 | {
148 |   return v4sfl (1.0f) - vfasterfc (x);
149 | }
150 | 
151 | static inline v4sf
152 | vfastererf (const v4sf x)
153 | {
154 |   return v4sfl (1.0f) - vfastererfc (x);
155 | }
156 | 
157 | static inline v4sf
158 | vfastinverseerf (v4sf x)
159 | {
160 |   const v4sf invk = v4sfl (0.30004578719350504f);
161 |   const v4sf a = v4sfl (0.020287853348211326f);
162 |   const v4sf b = v4sfl (0.07236892874789555f);
163 |   const v4sf c = v4sfl (0.9913030456864257f);
164 |   const v4sf d = v4sfl (0.8059775923760193f);
165 | 
166 |   v4sf xsq = x * x;
167 | 
168 |   return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) 
169 |        + x * (a - b * xsq) / (c - d * xsq);
170 | }
171 | 
172 | static inline v4sf
173 | vfasterinverseerf (v4sf x)
174 | {
175 |   const v4sf invk = v4sfl (0.30004578719350504f);
176 | 
177 |   return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x));
178 | }
179 | 
180 | #endif //__SSE2__
181 | 
182 | #endif // __FAST_ERF_H_
183 | 


--------------------------------------------------------------------------------
/fastapprox/src/sse.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __SSE_H_
 42 | #define __SSE_H_
 43 | 
 44 | #ifdef __SSE2__
 45 | 
 46 | #include <emmintrin.h>
 47 | 
 48 | #ifdef __cplusplus
 49 | namespace {
 50 | #endif // __cplusplus
 51 | 
 52 | typedef __m128 v4sf;
 53 | typedef __m128i v4si;
 54 | 
 55 | #define v4si_to_v4sf _mm_cvtepi32_ps
 56 | #define v4sf_to_v4si _mm_cvttps_epi32
 57 | 
 58 | #if _MSC_VER && !__INTEL_COMPILER
 59 |   template <class T>
 60 |   __forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; }
 61 | 
 62 |   #define AS_4CHARS(a) \
 63 |       GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \
 64 |       GetChar(int32_t(a), 2), GetChar(int32_t(a), 3)
 65 | 
 66 |   #define _MM_SETR_EPI32(a0, a1, a2, a3) \
 67 |       { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) }
 68 | 
 69 |   #define v4sfl(x) (const v4sf { (x), (x), (x), (x) })
 70 |   #define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x))
 71 | 
 72 |   __forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); }
 73 |   __forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); }
 74 |   __forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); }
 75 |   __forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); }
 76 | 
 77 |   __forceinline const v4sf operator+(const v4sf& a) { return a; }
 78 |   __forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
 79 | 
 80 |   __forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); }
 81 |   __forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); }
 82 |   __forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); }
 83 | 
 84 |   __forceinline const v4si operator&(const v4si& a, const v4si& b) { return _mm_and_si128(a,b); }
 85 |   __forceinline const v4si operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); }
 86 |   __forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); }
 87 | 
 88 |   __forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; }
 89 |   __forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; }
 90 |   __forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; }
 91 |   __forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; }
 92 | 
 93 |   __forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; }
 94 |   __forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; }
 95 |   __forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; }
 96 | #else
 97 |   #define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
 98 |   #define v2dil(x) ((const v4si) { (x), (x) })
 99 |   #define v4sil(x) v2dil((((long long) (x)) << 32) | (long long) (x))
100 | #endif
101 | 
102 | typedef union { v4sf f; float array[4]; } v4sfindexer;  // overlays a v4sf with four floats
103 | #define v4sf_index(_findx, _findi)      /* evaluates to element _findi of float vector _findx */ \
104 |   ({                                    /* GNU statement expression: its value is the last statement */ \
105 |      v4sfindexer _findvx = { _findx } ; \
106 |      _findvx.array[_findi];             \
107 |   })
108 | typedef union { v4si i; int array[4]; } v4siindexer;  // overlays a v4si with four ints
109 | #define v4si_index(_iindx, _iindi)      /* evaluates to element _iindi of integer vector _iindx */ \
110 |   ({                                    /* GNU statement expression: its value is the last statement */ \
111 |      v4siindexer _iindvx = { _iindx } ; \
112 |      _iindvx.array[_iindi];             \
113 |   })
114 | 
115 | typedef union { v4sf f; v4si i; } v4sfv4sipun;  // reinterprets the bits of a v4sf as a v4si
116 | #if _MSC_VER && !__INTEL_COMPILER
117 |   #define v4sf_fabs(x) _mm_and_ps((x), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
118 | #else  // clears the sign bit of every lane; the temporary is named _fabsvx so an argument mentioning vx is not captured
119 |   #define v4sf_fabs(x)                  \
120 |   ({                                    \
121 |      v4sfv4sipun _fabsvx;               \
122 |      _fabsvx.f = (x);                   \
123 |      _fabsvx.i &= v4sil (0x7FFFFFFF);   \
124 |      _fabsvx.f;                         \
125 |   })
126 | #endif
127 | 
128 | #ifdef __cplusplus
129 | } // end namespace
130 | #endif // __cplusplus
131 | 
132 | #endif // __SSE2__
133 | 
134 | #endif // __SSE_H_
135 | 


--------------------------------------------------------------------------------
/fastapprox/src/fastlambertw.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __FAST_LAMBERT_W_H_
 42 | #define __FAST_LAMBERT_W_H_
 43 | 
 44 | #include <math.h>
 45 | #include <stdint.h>
 46 | #include "fastexp.h"
 47 | #include "fastlog.h"
 48 | #include "sse.h"
 49 | // these functions compute the upper branch aka W_0
 50 | 
 51 | // Approximates W_0 (x), the upper branch of the Lambert W function.
 52 | static inline float fastlambertw (float x)
 53 | {
 54 |   // arguments below the threshold use a scaled and shifted logarithm plus a bias
 55 |   const int below = (x < 2.26445f);
 56 |   const float scale = below ? 1.546865557f : 1.0f;
 57 |   const float shift = below ? 2.250366841f : 0.0f;
 58 |   const float bias = below ? -0.737769969f : 0.0f;
 59 | 
 60 |   const float lg = fastlog (scale * x + shift);
 61 |   const float lglg = fastlog (lg);
 62 | 
 63 |   // negw is the initial estimate of -W; xe is x * exp (negw)
 64 |   const float negw = -bias - lg + lglg - lglg / lg;
 65 |   const float xe = x * fastexp (negw);
 66 |   const float pe = xe - negw;
 67 | 
 68 |   return (2.0f * xe - negw * (4.0f * xe - negw * pe)) /
 69 |          (2.0f + pe * (2.0f - negw));
 70 | }
 71 | 
 72 | // Variant of fastlambertw built on fasterlog / fasterexp with a shorter correction step.
 73 | static inline float fasterlambertw (float x)
 74 | {
 75 |   // arguments below the threshold use a scaled and shifted logarithm plus a bias
 76 |   const int below = (x < 2.26445f);
 77 |   const float scale = below ? 1.546865557f : 1.0f;
 78 |   const float shift = below ? 2.250366841f : 0.0f;
 79 |   const float bias = below ? -0.737769969f : 0.0f;
 80 | 
 81 |   const float lg = fasterlog (scale * x + shift);
 82 |   const float lglg = fasterlog (lg);
 83 | 
 84 |   // west is the initial estimate of W; the return expression corrects it once
 85 |   const float west = bias + lg - lglg + lglg / lg;
 86 | 
 87 |   return (west * west + fasterexp (-west) * x) / (1.0f + west);
 88 | }
 89 | 
 90 | // Judging by its name, approximates W_0 (exp (x)) -- TODO confirm against the project docs.
 91 | static inline float fastlambertwexpx (float x)
 92 | {
 93 |   const float knee = 1.1765631309f;
 94 |   const float slope = 0.94537622168f;
 95 | 
 96 |   // the log argument is clamped at knee; below knee a power-of-two factor scales the estimate
 97 |   const float clamped = fmaxf (x, knee);
 98 |   float expo = 0;
 99 |   if (x < knee) expo = slope * (x - knee);
100 | 
101 |   const float lg = fastlog (clamped);
102 |   const float w0 = fasterpow2 (expo) * (clamped - lg + lg / clamped);  // don't need accuracy in pow2
103 |   const float resid = x - fastlog (w0);
104 | 
105 |   // w0 is the initial estimate; resid = x - log (w0) drives the rational correction
106 |   return w0 * (2.0f + resid + w0 * (3.0f + 2.0f * resid)) /
107 |          (2.0f - resid + w0 * (5.0f + 2.0f * w0));
108 | }
109 | 
110 | // Variant of fastlambertwexpx that uses fasterlog throughout and a shorter correction.
111 | static inline float fasterlambertwexpx (float x)
112 | {
113 |   const float knee = 1.1765631309f;
114 |   const float slope = 0.94537622168f;
115 | 
116 |   // the log argument is clamped at knee; below knee a power-of-two factor scales the estimate
117 |   const float clamped = fmaxf (x, knee);
118 |   float expo = 0;
119 |   if (x < knee) expo = slope * (x - knee);
120 | 
121 |   const float lg = fasterlog (clamped);
122 |   const float w0 = fasterpow2 (expo) * (clamped - lg + lg / clamped);
123 | 
124 |   // w0 is the initial estimate, corrected once using x - log (w0)
125 |   return w0 * (1.0f + x - fasterlog (w0)) / (1.0f + w0);
126 | }
127 | 
128 | #ifdef __SSE2__
129 | 
130 | // Four-lane SSE2 counterpart of fastlambertw.
131 | static inline v4sf vfastlambertw (v4sf x)
132 | {
133 |   const v4sf two = v4sfl (2.0f);
134 |   const v4sf four = v4sfl (4.0f);
135 |   // comparison mask: set in the lanes where x is below the threshold
136 |   const v4sf below = _mm_cmplt_ps (x, v4sfl (2.26445f));
137 |   // per-lane select: the fitted constants where below, 1 / 0 / 0 elsewhere
138 |   const v4sf scale = _mm_or_ps (_mm_and_ps (below, v4sfl (1.546865557f)),
139 |                                 _mm_andnot_ps (below, v4sfl (1.0f)));
140 |   const v4sf shift = _mm_and_ps (below, v4sfl (2.250366841f));
141 |   const v4sf bias = _mm_and_ps (below, v4sfl (-0.737769969f));
142 | 
143 |   const v4sf lg = vfastlog (scale * x + shift);
144 |   const v4sf lglg = vfastlog (lg);
145 |   const v4sf negw = -bias - lg + lglg - lglg / lg;
146 |   const v4sf xe = x * vfastexp (negw);
147 |   const v4sf pe = xe - negw;
148 | 
149 |   return (two * xe - negw * (four * xe - negw * pe)) /
150 |          (two + pe * (two - negw));
151 | }
152 | 
153 | // Four-lane SSE2 counterpart of fasterlambertw.
154 | static inline v4sf vfasterlambertw (v4sf x)
155 | {
156 |   const v4sf one = v4sfl (1.0f);
157 |   // comparison mask: set in the lanes where x is below the threshold
158 |   const v4sf below = _mm_cmplt_ps (x, v4sfl (2.26445f));
159 |   const v4sf scale = _mm_or_ps (_mm_and_ps (below, v4sfl (1.546865557f)),
160 |                                 _mm_andnot_ps (below, one));
161 |   const v4sf shift = _mm_and_ps (below, v4sfl (2.250366841f));
162 |   const v4sf bias = _mm_and_ps (below, v4sfl (-0.737769969f));
163 | 
164 |   const v4sf lg = vfasterlog (scale * x + shift);
165 |   const v4sf lglg = vfasterlog (lg);
166 | 
167 |   // west is the initial estimate of W; the return expression corrects it once
168 |   const v4sf west = bias + lg - lglg + lglg / lg;
169 | 
170 |   return (west * west + vfasterexp (-west) * x) / (one + west);
171 | }
172 | 
173 | // Four-lane SSE2 counterpart of fastlambertwexpx.
174 | static inline v4sf vfastlambertwexpx (v4sf x)
175 | {
176 |   const v4sf knee = v4sfl (1.1765631309f);
177 |   const v4sf slope = v4sfl (0.94537622168f);
178 |   const v4sf two = v4sfl (2.0f);
179 |   const v4sf three = v4sfl (3.0f);
180 |   const v4sf five = v4sfl (5.0f);
181 | 
182 |   // clamp the log argument at knee; expo is slope * (x - knee) where x < knee and 0 elsewhere
183 |   const v4sf clamped = _mm_max_ps (x, knee);
184 |   const v4sf expo = _mm_and_ps (_mm_cmplt_ps (x, knee), slope * (x - knee));
185 | 
186 |   const v4sf lg = vfastlog (clamped);
187 |   const v4sf w0 = vfasterpow2 (expo) * (clamped - lg + lg / clamped);  // don't need accuracy in pow2
188 |   const v4sf resid = x - vfastlog (w0);
189 | 
190 |   // w0 is the initial estimate; resid = x - log (w0) drives the
191 |   // rational correction below
192 |   return w0 * (two + resid + w0 * (three + two * resid)) /
193 |          (two - resid + w0 * (five + two * w0));
194 | }
195 | 
196 | // Four-lane SSE2 counterpart of fasterlambertwexpx.
197 | static inline v4sf vfasterlambertwexpx (v4sf x)
198 | {
199 |   const v4sf knee = v4sfl (1.1765631309f);
200 |   const v4sf slope = v4sfl (0.94537622168f);
201 |   const v4sf one = v4sfl (1.0f);
202 | 
203 |   // clamp the log argument at knee; expo is slope * (x - knee) where x < knee and 0 elsewhere
204 |   const v4sf clamped = _mm_max_ps (x, knee);
205 |   const v4sf expo = _mm_and_ps (_mm_cmplt_ps (x, knee), slope * (x - knee));
206 | 
207 |   const v4sf lg = vfasterlog (clamped);
208 |   const v4sf w0 = vfasterpow2 (expo) * (clamped - lg + lg / clamped);
209 | 
210 |   // w0 is the initial estimate, corrected once using x - log (w0)
211 |   return w0 * (one + x - vfasterlog (w0)) / (one + w0);
212 | }
213 | 
214 | #endif // __SSE2__
215 | 
216 | #endif // __FAST_LAMBERT_W_H_
217 | 


--------------------------------------------------------------------------------
/fastapprox/src/fasttrig.h:
--------------------------------------------------------------------------------
  1 | /*=====================================================================*
  2 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  3 |  * All rights reserved.                                                *
  4 |  *                                                                     *
  5 |  * Redistribution and use in source and binary forms, with             *
  6 |  * or without modification, are permitted provided that the            *
  7 |  * following conditions are met:                                       *
  8 |  *                                                                     *
  9 |  *     * Redistributions of source code must retain the                *
 10 |  *     above copyright notice, this list of conditions and             *
 11 |  *     the following disclaimer.                                       *
 12 |  *                                                                     *
 13 |  *     * Redistributions in binary form must reproduce the             *
 14 |  *     above copyright notice, this list of conditions and             *
 15 |  *     the following disclaimer in the documentation and/or            *
 16 |  *     other materials provided with the distribution.                 *
 17 |  *                                                                     *
 18 |  *     * Neither the name of Paul Mineiro nor the names                *
 19 |  *     of other contributors may be used to endorse or promote         *
 20 |  *     products derived from this software without specific            *
 21 |  *     prior written permission.                                       *
 22 |  *                                                                     *
 23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 37 |  *                                                                     *
 38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 39 |  *=====================================================================*/
 40 | 
 41 | #ifndef __FAST_TRIG_H_
 42 | #define __FAST_TRIG_H_
 43 | 
 44 | #include <stdint.h>
 45 | #include "sse.h"
 46 | 
 47 | // http://www.devmaster.net/forums/showthread.php?t=5784
 48 | // fast sine variants are for x \in [ -\pi, \pi ]
 49 | // fast cosine variants are for x \in [ -\pi, \pi ]
 50 | // fast tangent variants are for x \in [ -\pi / 2, \pi / 2 ]
 51 | // "full" versions of functions handle the entire range of inputs
 52 | // although the range reduction technique used here will be hopelessly
 53 | // inaccurate for |x| >> 1000
 54 | //
 55 | // WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than
 56 | // libc calls on older machines (!) and on newer machines are only
 57 | // slightly faster.  however:
 58 | //   * vectorized versions are competitive
 59 | //   * faster full versions are competitive
 60 | 
 61 | // Polynomial sine approximation; the file's notes give [-pi, pi] as the intended range.
 62 | static inline float fastsin (float x)
 63 | {
 64 |   typedef union { float f; uint32_t i; } bits;
 65 |   bits ax = { x };
 66 |   bits c2 = { 0.20363937680730309f };
 67 |   bits c4 = { 0.015124940802184233f };
 68 |   bits c6 = { -0.0032225901625579573f };
 69 |   const uint32_t signbit = ax.i & 0x80000000;
 70 | 
 71 |   ax.i = ax.i & 0x7FFFFFFF;   // ax.f is now |x|
 72 |   // first approximation: (4/pi) x - (4/pi^2) x |x|
 73 |   const float y = 1.2732395447351627f * x - 0.40528473456935109f * x * ax.f;
 74 |   const float ysq = y * y;
 75 | 
 76 |   // the two positive coefficients take the sign of x; the negative one has its sign flipped for x < 0
 77 |   c2.i |= signbit;
 78 |   c4.i |= signbit;
 79 |   c6.i ^= signbit;
 80 | 
 81 |   // linear term in y plus a polynomial in y^2 with the sign-adjusted coefficients
 82 |   return 0.78444488374548933f * y + ysq * (c2.f + ysq * (c4.f + ysq * c6.f));
 83 | }
 84 | 
 85 | // Shorter sine approximation: one corrective term on top of the parabola-like first guess.
 86 | static inline float fastersin (float x)
 87 | {
 88 |   typedef union { float f; uint32_t i; } bits;
 89 |   bits ax = { x };
 90 |   bits coef = { 0.22308510060189463f };
 91 |   const uint32_t signbit = ax.i & 0x80000000;
 92 | 
 93 |   ax.i &= 0x7FFFFFFF;    // ax.f is now |x|
 94 |   coef.i |= signbit;     // the coefficient takes the sign of x
 95 | 
 96 |   // first approximation: (4/pi) x - (4/pi^2) x |x|
 97 |   const float y = 1.2732395447351627f * x
 98 |                   - 0.40528473456935109f * x * ax.f;
 99 | 
100 |   // blend y with the signed quadratic term
101 |   return y * (0.77633023248007499f + coef.f * y);
102 | }
103 | 
104 | // Sine for any x: evaluates fastsin at (k + 1/2) * 2 pi - x with k = (int) (x / 2 pi) (the half is negative for x < 0).
105 | static inline float fastsinfull (float x)
106 | {
107 |   const int turns = (int) (x * 0.15915494309189534f);   // x / (2 pi) converted to int
108 |   float offset = 0.5f;
109 | 
110 |   if (x < 0) offset = -0.5f;
111 | 
112 |   return fastsin ((offset + turns) * 6.2831853071795865f - x);
113 | }
114 | 
115 | // Sine for any x: evaluates fastersin at (k + 1/2) * 2 pi - x with k = (int) (x / 2 pi) (the half is negative for x < 0).
116 | static inline float fastersinfull (float x)
117 | {
118 |   const int turns = (int) (x * 0.15915494309189534f);   // x / (2 pi) converted to int
119 |   float offset = 0.5f;
120 | 
121 |   if (x < 0) offset = -0.5f;
122 | 
123 |   return fastersin ((offset + turns) * 6.2831853071795865f - x);
124 | }
125 | 
126 | // Cosine via fastsin: the argument is shifted by pi/2, or by pi/2 - 2 pi when x > pi/2.
127 | static inline float fastcos (float x)
128 | {
129 |   const float quarter = 1.5707963267948966f;    // pi / 2
130 |   if (x > quarter)
131 |     return fastsin (x + -4.7123889803846899f);  // pi / 2 - 2 pi
132 |   return fastsin (x + quarter);
133 | }
134 | 
135 | // Short cosine approximation that depends only on |x|.
136 | static inline float fastercos (float x)
137 | {
138 |   union { float f; uint32_t i; } ax = { x };
139 |   float y;
140 | 
141 |   ax.i &= 0x7FFFFFFF;                       // ax.f is now |x|
142 |   y = 1.0f - 0.63661977236758134f * ax.f;   // 1 - (2/pi) |x|
143 | 
144 |   // y is 1 at x = 0 and 0 at |x| = pi/2; the cubic term
145 |   // p * y * (1 - y^2) bends that straight line
146 |   return y + 0.54641335845679634f * y * (1.0f - y * y);
147 | }
148 | 
149 | // Cosine for any x, computed as fastsinfull (x + pi/2).
150 | static inline float fastcosfull (float x)
151 | {
152 |   const float shifted = x + 1.5707963267948966f;
153 |   return fastsinfull (shifted);
154 | }
155 | 
156 | // Cosine for any x, computed as fastersinfull (x + pi/2).
157 | static inline float fastercosfull (float x)
158 | {
159 |   const float shifted = x + 1.5707963267948966f;
160 |   return fastersinfull (shifted);
161 | }
162 | 
163 | // Tangent as fastsin (x) / fastsin (x + pi/2); the file's notes give [-pi/2, pi/2] as the range.
164 | static inline float fasttan (float x)
165 | {
166 |   const float cosine = fastsin (x + 1.5707963267948966f);
167 |   return fastsin (x) / cosine;
168 | }
169 | 
170 | // Tangent as the ratio fastersin (x) / fastercos (x).
171 | static inline float fastertan (float x)
172 | {
173 |   return fastersin (x) / fastercos (x);
174 | }
175 | 
176 | // Tangent for any x: folds x to x - (k + 1/2) * 2 pi, then divides fastsin by fastcos.
177 | static inline float fasttanfull (float x)
178 | {
179 |   const int turns = (int) (x * 0.15915494309189534f);   // x / (2 pi) converted to int
180 |   float offset = 0.5f;
181 |   float folded;
182 | 
183 |   if (x < 0) offset = -0.5f;
184 |   folded = x - (offset + turns) * 6.2831853071795865f;
185 | 
186 |   return fastsin (folded) / fastcos (folded);
187 | }
188 | 
189 | // Tangent for any x: folds x to x - (k + 1/2) * 2 pi, then divides fastersin by fastercos.
190 | static inline float fastertanfull (float x)
191 | {
192 |   const int turns = (int) (x * 0.15915494309189534f);   // x / (2 pi) converted to int
193 |   float offset = 0.5f;
194 |   float folded;
195 | 
196 |   if (x < 0) offset = -0.5f;
197 |   folded = x - (offset + turns) * 6.2831853071795865f;
198 | 
199 |   return fastersin (folded) / fastercos (folded);
200 | }
201 | 
202 | #ifdef __SSE2__
203 | 
204 | // Four-lane SSE2 counterpart of fastsin.
205 | static inline v4sf vfastsin (const v4sf x)
206 | {
207 |   union { v4sf f; v4si i; } ax = { x };
208 |   union { v4sf f; v4si i; } tail;
209 |   const v4si signbits = ax.i & v4sil (0x80000000);
210 | 
211 |   ax.i &= v4sil (0x7FFFFFFF);   // ax.f is now |x| in every lane
212 | 
213 |   // first approximation: (4/pi) x - (4/pi^2) x |x|
214 |   const v4sf y = v4sfl (1.2732395447351627f) * x - v4sfl (0.40528473456935109f) * x * ax.f;
215 |   const v4sf ysq = y * y;
216 | 
217 |   // higher-order terms use the unsigned coefficients; the sign of x is XORed onto their sum
218 |   tail.f = ysq * (v4sfl (0.20363937680730309f)
219 |                   + ysq * (v4sfl (0.015124940802184233f)
220 |                            + ysq * v4sfl (-0.0032225901625579573f)));
221 |   tail.i ^= signbits;
222 | 
223 |   return v4sfl (0.78444488374548933f) * y + tail.f;
224 | }
225 | 
226 | // Four-lane SSE2 counterpart of fastersin.
227 | static inline v4sf vfastersin (const v4sf x)
228 | {
229 |   union { v4sf f; v4si i; } ax = { x };
230 |   union { v4sf f; v4si i; } coef;
231 |   const v4si signbits = ax.i & v4sil (0x80000000);
232 | 
233 |   ax.i &= v4sil (0x7FFFFFFF);             // ax.f is now |x| in every lane
234 |   coef.f = v4sfl (0.22308510060189463f);
235 |   coef.i |= signbits;                     // the coefficient takes the sign of x
236 | 
237 |   // first approximation: (4/pi) x - (4/pi^2) x |x|
238 |   const v4sf y = v4sfl (1.2732395447351627f) * x
239 |                  - v4sfl (0.40528473456935109f) * x * ax.f;
240 | 
241 |   // blend y with the signed quadratic term, exactly as the
242 |   // scalar fastersin does
243 |   return y * (v4sfl (0.77633023248007499f) + coef.f * y);
244 | }
245 | 
246 | // Four-lane counterpart of fastsinfull: evaluates vfastsin at (k + 1/2) * 2 pi - x.
247 | static inline v4sf vfastsinfull (const v4sf x)
248 | {
249 |   const v4sf twopi = v4sfl (6.2831853071795865f);
250 |   // k: x / (2 pi) passed through the v4sf_to_v4si / v4si_to_v4sf conversions
251 |   const v4sf turns = v4si_to_v4sf (v4sf_to_v4si (x * v4sfl (0.15915494309189534f)));
252 | 
253 |   // -0.5 in lanes where x < 0, +0.5 elsewhere
254 |   const v4sf neg = _mm_cmplt_ps (x, v4sfl (0.0f));
255 |   const v4sf offset = _mm_or_ps (_mm_and_ps (neg, v4sfl (-0.5f)),
256 |                                  _mm_andnot_ps (neg, v4sfl (0.5f)));
257 | 
258 |   return vfastsin ((offset + turns) * twopi - x);
259 | }
260 | 
261 | // Four-lane counterpart of fastersinfull: evaluates vfastersin at (k + 1/2) * 2 pi - x.
262 | static inline v4sf vfastersinfull (const v4sf x)
263 | {
264 |   const v4sf twopi = v4sfl (6.2831853071795865f);
265 |   // k: x / (2 pi) passed through the v4sf_to_v4si / v4si_to_v4sf conversions
266 |   const v4sf turns = v4si_to_v4sf (v4sf_to_v4si (x * v4sfl (0.15915494309189534f)));
267 | 
268 |   // -0.5 in lanes where x < 0, +0.5 elsewhere
269 |   const v4sf neg = _mm_cmplt_ps (x, v4sfl (0.0f));
270 |   const v4sf offset = _mm_or_ps (_mm_and_ps (neg, v4sfl (-0.5f)),
271 |                                  _mm_andnot_ps (neg, v4sfl (0.5f)));
272 | 
273 |   return vfastersin ((offset + turns) * twopi - x);
274 | }
275 | 
276 | // Four-lane cosine via vfastsin: each lane is shifted by pi/2, or by pi/2 - 2 pi where x is not below pi/2.
277 | static inline v4sf vfastcos (const v4sf x)
278 | {
279 |   const v4sf quarter = v4sfl (1.5707963267948966f);   // pi / 2
280 |   // set in lanes where x is NOT less than pi/2 (so x == pi/2 also takes the wrapped shift)
281 |   const v4sf wrap = _mm_cmpnlt_ps (x, quarter);
282 |   const v4sf shift = _mm_or_ps (_mm_and_ps (wrap, v4sfl (-4.7123889803846899f)),
283 |                                 _mm_andnot_ps (wrap, quarter));
284 |   return vfastsin (x + shift);
285 | }
286 | 
287 | // Four-lane SSE2 counterpart of fastercos: with y = 1 - (2/pi) |x| it returns y + p y (1 - y^2).
288 | static inline v4sf vfastercos (v4sf x)
289 | {
290 |   const v4sf twooverpi = v4sfl (0.63661977236758134f);
291 |   // float literal (f suffix) like every other v4sfl constant here; it used to be a double literal
292 |   const v4sf p = v4sfl (0.54641335845679634f);
293 |   const v4sf one = v4sfl (1.0f);
294 |   const v4sf absx = v4sf_fabs (x);
295 |   const v4sf qpprox = one - twooverpi * absx;
296 |   return qpprox + p * qpprox * (one - qpprox * qpprox);
297 | }
298 | 
299 | // Four-lane cosine for any x, computed as vfastsinfull (x + pi/2).
300 | static inline v4sf vfastcosfull (const v4sf x)
301 | {
302 |   const v4sf shifted = x + v4sfl (1.5707963267948966f);
303 |   return vfastsinfull (shifted);
304 | }
305 | 
306 | // Four-lane cosine for any x, computed as vfastersinfull (x + pi/2).
307 | static inline v4sf vfastercosfull (const v4sf x)
308 | {
309 |   const v4sf shifted = x + v4sfl (1.5707963267948966f);
310 |   return vfastersinfull (shifted);
311 | }
312 | 
313 | // Four-lane tangent as vfastsin (x) / vfastsin (x + pi/2).
314 | static inline v4sf vfasttan (const v4sf x)
315 | {
316 |   const v4sf cosine = vfastsin (x + v4sfl (1.5707963267948966f));
317 |   return vfastsin (x) / cosine;
318 | }
319 | 
320 | // Four-lane tangent as the ratio vfastersin (x) / vfastercos (x).
321 | static inline v4sf vfastertan (const v4sf x)
322 | {
323 |   return vfastersin (x) / vfastercos (x);
324 | }
325 | 
326 | // Four-lane tangent for any x: folds x to x - (k + 1/2) * 2 pi, then divides vfastsin by vfastcos.
327 | static inline v4sf vfasttanfull (const v4sf x)
328 | {
329 |   const v4sf twopi = v4sfl (6.2831853071795865f);
330 |   // k: x / (2 pi) passed through the v4sf_to_v4si / v4si_to_v4sf conversions
331 |   const v4sf turns = v4si_to_v4sf (v4sf_to_v4si (x * v4sfl (0.15915494309189534f)));
332 | 
333 |   // -0.5 in lanes where x < 0, +0.5 elsewhere
334 |   const v4sf neg = _mm_cmplt_ps (x, v4sfl (0.0f));
335 |   const v4sf offset = _mm_or_ps (_mm_and_ps (neg, v4sfl (-0.5f)),
336 |                                  _mm_andnot_ps (neg, v4sfl (0.5f)));
337 |   const v4sf folded = x - (offset + turns) * twopi;
338 | 
339 |   return vfastsin (folded) / vfastcos (folded);
340 | }
341 | 
342 | // Four-lane tangent for any x: folds x to x - (k + 1/2) * 2 pi, then divides vfastersin by vfastercos.
343 | static inline v4sf vfastertanfull (const v4sf x)
344 | {
345 |   const v4sf twopi = v4sfl (6.2831853071795865f);
346 |   // k: x / (2 pi) passed through the v4sf_to_v4si / v4si_to_v4sf conversions
347 |   const v4sf turns = v4si_to_v4sf (v4sf_to_v4si (x * v4sfl (0.15915494309189534f)));
348 | 
349 |   // -0.5 in lanes where x < 0, +0.5 elsewhere
350 |   const v4sf neg = _mm_cmplt_ps (x, v4sfl (0.0f));
351 |   const v4sf offset = _mm_or_ps (_mm_and_ps (neg, v4sfl (-0.5f)),
352 |                                  _mm_andnot_ps (neg, v4sfl (0.5f)));
353 |   const v4sf folded = x - (offset + turns) * twopi;
354 | 
355 |   return vfastersin (folded) / vfastercos (folded);
356 | }
357 | 
358 | #endif //__SSE2__
359 | 
360 | #endif // __FAST_TRIG_H_
361 | 


--------------------------------------------------------------------------------
/fastapprox/tests/testmacros.h:
--------------------------------------------------------------------------------
  1 | #define test_scalar(estf, exactf, gen, maxerr, ntime)           \
  2 |   static void                                                   \
  3 |   test_ ## estf ## _once (double* erracc,                       \
  4 |                           float*  max,                          \
  5 |                           float*  argmax)                       \
  6 |     {                                                           \
  7 |       float x = (gen);                                          \
  8 |       float exact = exactf (x);                                 \
  9 |       float est = estf (x);                                     \
 10 |       float err = fabsf (est - exact) /                         \
 11 |                   (fabsf (1e-4f) + fabsf (est) + fabsf (exact));\
 12 |                                                                 \
 13 |       if (err > *max) { *max = err; *argmax = x; }              \
 14 |       *erracc += err;                                           \
 15 |     }                                                           \
 16 |                                                                 \
 17 |   static void                                                   \
 18 |   test_ ## estf (void)                                          \
 19 |     {                                                           \
 20 |       unsigned int i;                                           \
 21 |       double err = 0;                                           \
 22 |       float argmax = 0;                                         \
 23 |       float max = 0;                                            \
 24 |                                                                 \
 25 |       for (i = 0; i < 100000; ++i)                              \
 26 |         {                                                       \
 27 |           test_ ## estf ## _once (&err, &max, &argmax);         \
 28 |         }                                                       \
 29 |                                                                 \
 30 |       err /= i;                                                 \
 31 |                                                                 \
 32 |       fprintf (stderr,                                          \
 33 |                "%s average relative error = %g\n",              \
 34 |                #estf,                                           \
 35 |                err);                                            \
 36 |       fprintf (stderr,                                          \
 37 |                "%s max relative error (at %g) = %g\n",          \
 38 |                #estf,                                           \
 39 |                argmax,                                          \
 40 |                max);                                            \
 41 |       assert (err < maxerr);                                    \
 42 |     }                                                           \
 43 |                                                                 \
 44 |     static void                                                 \
 45 |     time_ ## estf (void)                                        \
 46 |       {                                                         \
 47 |         struct timeval start;                                   \
 48 |         struct timeval end;                                     \
 49 |         unsigned int i;                                         \
 50 |         float sum = 0;                                          \
 51 |         volatile float xd = 1.0f;                               \
 52 |                                                                 \
 53 |         gettimeofday (&start, NULL);                            \
 54 |                                                                 \
 55 |         for (i = 0; i < ntime; ++i)                             \
 56 |           {                                                     \
 57 |             sum += estf (xd);                                   \
 58 |           }                                                     \
 59 |                                                                 \
 60 |         gettimeofday (&end, NULL);                              \
 61 |                                                                 \
 62 |         fprintf (stderr,                                        \
 63 |                  "%g\r%s million calls per second = %g\n",      \
 64 |                  sum,                                           \
 65 |                  #estf,                                         \
 66 |                    ((double) i) /                               \
 67 |                     (1e+6 * (double) end.tv_sec                 \
 68 |                      - 1e+6 * (double) start.tv_sec             \
 69 |                      + (double) end.tv_usec                     \
 70 |                      - (double) start.tv_usec));                \
 71 |       }
 72 | 
 73 | #define test_scalar2(estf, exactf, genx, geny, maxerr, ntime)   \
 74 |   static void                                                   \
 75 |   test_ ## estf ## _once (double* erracc,                       \
 76 |                           float*  max,                          \
 77 |                           float*  argmaxx,                      \
 78 |                           float*  argmaxy)                      \
 79 |     {                                                           \
 80 |       float x = (genx); /* ah ... the generation gap ... */     \
 81 |       float y = (geny);                                         \
 82 |       float exact = exactf (x, y);                              \
 83 |       float est = estf (x, y);                                  \
 84 |       float err = fabsf (est - exact) /                         \
 85 |                   (fabsf (1e-4f) + fabsf (est) + fabsf (exact));\
 86 |                                                                 \
 87 |       if (err > *max) { *max = err; *argmaxx = x; *argmaxy = y; } \
 88 |       *erracc += err;                                           \
 89 |     }                                                           \
 90 |                                                                 \
 91 |   static void                                                   \
 92 |   test_ ## estf (void)                                          \
 93 |     {                                                           \
 94 |       unsigned int i;                                           \
 95 |       double err = 0;                                           \
 96 |       float argmaxx = 0;                                        \
 97 |       float argmaxy = 0;                                        \
 98 |       float max = 0;                                            \
 99 |                                                                 \
100 |       for (i = 0; i < 100000; ++i)                              \
101 |         {                                                       \
102 |           test_ ## estf ## _once (&err, &max, &argmaxx, &argmaxy); \
103 |         }                                                       \
104 |                                                                 \
105 |       err /= i;                                                 \
106 |                                                                 \
107 |       fprintf (stderr,                                          \
108 |                "%s average relative error = %g\n",              \
109 |                #estf,                                           \
110 |                err);                                            \
111 |       fprintf (stderr,                                          \
112 |                "%s max relative error (at %g, %g) = %g\n",      \
113 |                #estf,                                           \
114 |                argmaxx,                                         \
115 |                argmaxy,                                         \
116 |                max);                                            \
117 |       assert (err < maxerr);                                    \
118 |     }                                                           \
119 |                                                                 \
120 |     static void                                                 \
121 |     time_ ## estf (void)                                        \
122 |       {                                                         \
123 |         struct timeval start;                                   \
124 |         struct timeval end;                                     \
125 |         unsigned int i;                                         \
126 |         float sum = 0;                                          \
127 |         volatile float xd = 1.0f;                               \
128 |         volatile float yd = 1.0f;                               \
129 |                                                                 \
130 |         gettimeofday (&start, NULL);                            \
131 |                                                                 \
132 |         for (i = 0; i < ntime; ++i)                             \
133 |           {                                                     \
134 |             sum += estf (xd, yd);                               \
135 |           }                                                     \
136 |                                                                 \
137 |         gettimeofday (&end, NULL);                              \
138 |                                                                 \
139 |         fprintf (stderr,                                        \
140 |                  "%g\r%s million calls per second = %g\n",      \
141 |                  sum,                                           \
142 |                  #estf,                                         \
143 |                    ((double) i) /                               \
144 |                     (1e+6 * (double) end.tv_sec                 \
145 |                      - 1e+6 * (double) start.tv_sec             \
146 |                      + (double) end.tv_usec                     \
147 |                      - (double) start.tv_usec));                \
148 |       }
149 | 
150 | #ifdef __SSE2__
151 | /* test_vector (SSE2 build): expands to test_<estf>, which compares the v4sf routine estf lane by lane with the scalar exactf, and time_<estf>, which measures call throughput.  gen is expanded once per lane because v4sfl repeats its argument. */
152 | #define test_vector(estf, exactf, gen, maxerr, ntime)           \
153 |   static void                                                   \
154 |   test_ ## estf ## _once (double* erracc,                       \
155 |                           float*  max,                          \
156 |                           float*  argmax)                       \
157 |     {                                                           \
158 |       v4sf x = v4sfl ((float) (gen));                           \
159 |       v4sf exact = (v4sf) { exactf (v4sf_index (x, 0)),         \
160 |                             exactf (v4sf_index (x, 1)),         \
161 |                             exactf (v4sf_index (x, 2)),         \
162 |                             exactf (v4sf_index (x, 3)) };       \
163 |       v4sf est = estf (x);                                      \
164 |       v4sf err = v4sf_fabs (est - exact) /                      \
165 |                  (v4sfl (1e-4) + v4sf_fabs (est) + v4sf_fabs (exact));  \
166 |       /* fold the four lane errors into *max, *argmax, *erracc */ \
167 |       unsigned int k;                                           \
168 |       for (k = 0; k < 4; ++k)                                   \
169 |         {                                                       \
170 |           if (v4sf_index (err, k) > *max)                       \
171 |             {                                                   \
172 |               *max = v4sf_index (err, k);                       \
173 |               *argmax = v4sf_index (x, k);                      \
174 |             }                                                   \
175 |                                                                 \
176 |           *erracc += v4sf_index (err, k);                       \
177 |         }                                                       \
178 |     }                                                           \
179 |   /* test_<estf>: 100000 samples; asserts mean error < maxerr */ \
180 |   static void                                                   \
181 |   test_ ## estf (void)                                          \
182 |     {                                                           \
183 |       unsigned int i;                                           \
184 |       double err = 0;                                           \
185 |       float argmax = 0;                                         \
186 |       float max = 0;                                            \
187 |                                                                 \
188 |       for (i = 0; i < 100000; ++i)                              \
189 |         {                                                       \
190 |           test_ ## estf ## _once (&err, &max, &argmax);         \
191 |         }                                                       \
192 |       /* every iteration accumulated four lane errors */        \
193 |       err /= (4.0f * i);                                        \
194 |                                                                 \
195 |       fprintf (stderr,                                          \
196 |                "%s average relative error = %g\n",              \
197 |                #estf,                                           \
198 |                err);                                            \
199 |       fprintf (stderr,                                          \
200 |                "%s max relative error (at %g) = %g\n",          \
201 |                #estf,                                           \
202 |                argmax,                                          \
203 |                max);                                            \
204 |       assert (err < maxerr);                                    \
205 |     }                                                           \
206 |   /* time_<estf>: ntime calls of estf on a fixed volatile input */ \
207 |     static void                                                 \
208 |     time_ ## estf (void)                                        \
209 |       {                                                         \
210 |         struct timeval start;                                   \
211 |         struct timeval end;                                     \
212 |         unsigned int i;                                         \
213 |         v4sf sum = v4sfl (0.0f);                                \
214 |         volatile v4sf xd = v4sfl (1.0f);                        \
215 |                                                                 \
216 |         gettimeofday (&start, NULL);                            \
217 |                                                                 \
218 |         for (i = 0; i < ntime; ++i)                             \
219 |           {                                                     \
220 |             sum += estf (xd);                                   \
221 |           }                                                     \
222 |         /* sum is printed below, presumably so the calls stay live */ \
223 |         gettimeofday (&end, NULL);                              \
224 |         /* elapsed time is in microseconds: calls/us == 1e6 calls/s */ \
225 |         fprintf (stderr,                                        \
226 |                  "%g\r%s million calls per second = %g\n",      \
227 |                  v4sf_index (sum, 0),                           \
228 |                  #estf,                                         \
229 |                    ((double) i) /                               \
230 |                     (1e+6 * (double) end.tv_sec                 \
231 |                      - 1e+6 * (double) start.tv_sec             \
232 |                      + (double) end.tv_usec                     \
233 |                      - (double) start.tv_usec));                \
234 |       }
235 | /* test_vector2 (SSE2 build): two-argument form of test_vector.  Expands to test_<estf>, which compares estf (x, y) lane by lane with the scalar exactf, and time_<estf>, which measures call throughput.  genx and geny are each expanded once per lane because v4sfl repeats its argument. */
236 | #define test_vector2(estf, exactf, genx, geny, maxerr, ntime)   \
237 |   static void                                                   \
238 |   test_ ## estf ## _once (double* erracc,                       \
239 |                           float*  max,                          \
240 |                           float*  argmaxx,                      \
241 |                           float*  argmaxy)                      \
242 |     {                                                           \
243 |       v4sf x = v4sfl ((float) (genx));                          \
244 |       v4sf y = v4sfl ((float) (geny));                          \
245 |       v4sf exact = (v4sf) { exactf (v4sf_index (x, 0),          \
246 |                                     v4sf_index (y, 0)),         \
247 |                             exactf (v4sf_index (x, 1),          \
248 |                                     v4sf_index (y, 1)),         \
249 |                             exactf (v4sf_index (x, 2),          \
250 |                                     v4sf_index (y, 2)),         \
251 |                             exactf (v4sf_index (x, 3),          \
252 |                                     v4sf_index (y, 3)) };       \
253 |       v4sf est = estf (x, y);                                   \
254 |       v4sf err = v4sf_fabs (est - exact) /                      \
255 |                  (v4sfl (1e-4) + v4sf_fabs (est) + v4sf_fabs (exact));  \
256 |       /* fold the lane errors into *max, *argmaxx/y, *erracc */ \
257 |       unsigned int k;                                           \
258 |       for (k = 0; k < 4; ++k)                                   \
259 |         {                                                       \
260 |           if (v4sf_index (err, k) > *max)                       \
261 |             {                                                   \
262 |               *max = v4sf_index (err, k);                       \
263 |               *argmaxx = v4sf_index (x, k);                     \
264 |               *argmaxy = v4sf_index (y, k);                     \
265 |             }                                                   \
266 |                                                                 \
267 |           *erracc += v4sf_index (err, k);                       \
268 |         }                                                       \
269 |     }                                                           \
270 |   /* test_<estf>: 100000 samples; asserts mean error < maxerr */ \
271 |   static void                                                   \
272 |   test_ ## estf (void)                                          \
273 |     {                                                           \
274 |       unsigned int i;                                           \
275 |       double err = 0;                                           \
276 |       float argmaxx = 0;                                        \
277 |       float argmaxy = 0;                                        \
278 |       float max = 0;                                            \
279 |                                                                 \
280 |       for (i = 0; i < 100000; ++i)                              \
281 |         {                                                       \
282 |           test_ ## estf ## _once (&err, &max, &argmaxx, &argmaxy);   \
283 |         }                                                       \
284 |       /* every iteration accumulated four lane errors */        \
285 |       err /= (4.0f * i);                                        \
286 |                                                                 \
287 |       fprintf (stderr,                                          \
288 |                "%s average relative error = %g\n",              \
289 |                #estf,                                           \
290 |                err);                                            \
291 |       fprintf (stderr,                                          \
292 |                "%s max relative error (at %g, %g) = %g\n",      \
293 |                #estf,                                           \
294 |                argmaxx,                                         \
295 |                argmaxy,                                         \
296 |                max);                                            \
297 |       assert (err < maxerr);                                    \
298 |     }                                                           \
299 |   /* time_<estf>: ntime calls of estf on fixed volatile inputs */ \
300 |     static void                                                 \
301 |     time_ ## estf (void)                                        \
302 |       {                                                         \
303 |         struct timeval start;                                   \
304 |         struct timeval end;                                     \
305 |         unsigned int i;                                         \
306 |         v4sf sum = v4sfl (0.0f);                                \
307 |         volatile v4sf xd = v4sfl (1.0f);                        \
308 |         volatile v4sf yd = v4sfl (1.0f);                        \
309 |                                                                 \
310 |         gettimeofday (&start, NULL);                            \
311 |                                                                 \
312 |         for (i = 0; i < ntime; ++i)                             \
313 |           {                                                     \
314 |             sum += estf (xd, yd);                               \
315 |           }                                                     \
316 |         /* sum is printed below, presumably so the calls stay live */ \
317 |         gettimeofday (&end, NULL);                              \
318 |         /* elapsed time is in microseconds: calls/us == 1e6 calls/s */ \
319 |         fprintf (stderr,                                        \
320 |                  "%g\r%s million calls per second = %g\n",      \
321 |                  v4sf_index (sum, 0),                           \
322 |                  #estf,                                         \
323 |                    ((double) i) /                               \
324 |                     (1e+6 * (double) end.tv_sec                 \
325 |                      - 1e+6 * (double) start.tv_sec             \
326 |                      + (double) end.tv_usec                     \
327 |                      - (double) start.tv_usec));                \
328 |       }
329 | 
330 | #else // __SSE2__
331 | 
332 | #define test_vector(estf, exactf, gen, maxerr, ntime)           \
333 |   /* Build without SSE2: test_<estf> only reports that it was */ \
334 |   /* skipped, and time_<estf> is an empty placeholder.        */ \
335 |   static void test_ ## estf (void)                              \
336 |   {                                                             \
337 |     fprintf (stderr,                                            \
338 |              "%s test SKIPPED (no SSE support)\n",              \
339 |              #estf);                                            \
340 |   }                                                             \
341 |                                                                 \
342 |   static void time_ ## estf (void)                              \
343 |   { }
344 | 
345 | #define test_vector2(estf, exactf, genx, geny, maxerr, ntime)   \
346 |   /* Build without SSE2: test_<estf> only reports that it was */ \
347 |   /* skipped, and time_<estf> is an empty placeholder.        */ \
348 |   static void test_ ## estf (void)                              \
349 |   {                                                             \
350 |     fprintf (stderr,                                            \
351 |              "%s test SKIPPED (no SSE support)\n",              \
352 |              #estf);                                            \
353 |   }                                                             \
354 |                                                                 \
355 |   static void time_ ## estf (void)                              \
356 |   { }
357 | 
358 | #endif // __SSE2__
359 | 


--------------------------------------------------------------------------------
/fastapprox/src/fastonebigheader.h:
--------------------------------------------------------------------------------
   1 | /*=====================================================================*
   2 |  *                   Copyright (C) 2012 Paul Mineiro                   *
   3 |  * All rights reserved.                                                *
   4 |  *                                                                     *
   5 |  * Redistribution and use in source and binary forms, with             *
   6 |  * or without modification, are permitted provided that the            *
   7 |  * following conditions are met:                                       *
   8 |  *                                                                     *
   9 |  *     * Redistributions of source code must retain the                *
  10 |  *     above copyright notice, this list of conditions and             *
  11 |  *     the following disclaimer.                                       *
  12 |  *                                                                     *
  13 |  *     * Redistributions in binary form must reproduce the             *
  14 |  *     above copyright notice, this list of conditions and             *
  15 |  *     the following disclaimer in the documentation and/or            *
  16 |  *     other materials provided with the distribution.                 *
  17 |  *                                                                     *
  18 |  *     * Neither the name of Paul Mineiro nor the names                *
  19 |  *     of other contributors may be used to endorse or promote         *
  20 |  *     products derived from this software without specific            *
  21 |  *     prior written permission.                                       *
  22 |  *                                                                     *
  23 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
  24 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
  25 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
  26 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
  27 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
  28 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
  29 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
  30 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
  31 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
  32 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
  33 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
  34 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
  35 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
  36 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
  37 |  *                                                                     *
  38 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
  39 |  *=====================================================================*/
  40 | 
  41 | #ifndef __CAST_H_
  42 | #define __CAST_H_  /* completes the include guard opened on the previous line */
  43 | #ifdef __cplusplus
  44 | #define cast_uint32_t static_cast<uint32_t>
  45 | #else
  46 | #define cast_uint32_t (uint32_t)
  47 | #endif
  48 | /* cast_uint32_t (expr) converts expr to uint32_t with the cast syntax of the language being compiled; uint32_t itself must come from <stdint.h>, which this section does not include. */
  49 | #endif // __CAST_H_
  50 | /*=====================================================================*
  51 |  *                   Copyright (C) 2011 Paul Mineiro                   *
  52 |  * All rights reserved.                                                *
  53 |  *                                                                     *
  54 |  * Redistribution and use in source and binary forms, with             *
  55 |  * or without modification, are permitted provided that the            *
  56 |  * following conditions are met:                                       *
  57 |  *                                                                     *
  58 |  *     * Redistributions of source code must retain the                *
  59 |  *     above copyright notice, this list of conditions and             *
  60 |  *     the following disclaimer.                                       *
  61 |  *                                                                     *
  62 |  *     * Redistributions in binary form must reproduce the             *
  63 |  *     above copyright notice, this list of conditions and             *
  64 |  *     the following disclaimer in the documentation and/or            *
  65 |  *     other materials provided with the distribution.                 *
  66 |  *                                                                     *
  67 |  *     * Neither the name of Paul Mineiro nor the names                *
  68 |  *     of other contributors may be used to endorse or promote         *
  69 |  *     products derived from this software without specific            *
  70 |  *     prior written permission.                                       *
  71 |  *                                                                     *
  72 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
  73 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
  74 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
  75 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
  76 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
  77 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
  78 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
  79 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
  80 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
  81 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
  82 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
  83 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
  84 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
  85 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
  86 |  *                                                                     *
  87 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
  88 |  *=====================================================================*/
  89 | 
  90 | #ifndef __SSE_H_
  91 | #define __SSE_H_
  92 | 
  93 | #ifdef __SSE2__
  94 | 
  95 | #include <emmintrin.h>
  96 | 
  97 | #ifdef __cplusplus
  98 | namespace {
  99 | #endif // __cplusplus
 100 | 
 101 | typedef __m128 v4sf;   /* SSE vector of four single-precision floats */
 102 | typedef __m128i v4si;  /* SSE2 integer vector; this header treats it as four 32-bit ints */
 103 | /* Lane-wise int32 <-> float conversions; the float-to-int direction uses the truncating intrinsic. */
 104 | #define v4si_to_v4sf _mm_cvtepi32_ps
 105 | #define v4sf_to_v4si _mm_cvttps_epi32
 106 | 
 107 | #if _MSC_VER && !__INTEL_COMPILER
 108 |   template <class T>  // GetChar: byte number `index` of the object representation of (a copy of) `value`
 109 |   __forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; }
 110 |   // AS_4CHARS(a): the four bytes of int32_t(a), in memory order, as a comma-separated list.
 111 |   #define AS_4CHARS(a) \
 112 |       GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \
 113 |       GetChar(int32_t(a), 2), GetChar(int32_t(a), 3)
 114 |   // _MM_SETR_EPI32: brace initializer made of 16 chars (4 per argument); presumably matches how MSVC lays out __m128i -- TODO confirm.
 115 |   #define _MM_SETR_EPI32(a0, a1, a2, a3) \
 116 |       { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) }
 117 |   // Broadcast literals: x repeated in all four lanes (x is expanded four times).
 118 |   #define v4sfl(x) (const v4sf { (x), (x), (x), (x) })
 119 |   #define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x))
 120 |   // Infix arithmetic on v4sf, forwarded to the matching SSE intrinsics.
 121 |   __forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); }
 122 |   __forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); }
 123 |   __forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); }
 124 |   __forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); }
 125 |   // Unary plus is the identity; unary minus XORs every lane with 0x80000000 (flips the sign bit).
 126 |   __forceinline const v4sf operator+(const v4sf& a) { return a; }
 127 |   __forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
 128 |   // Bitwise operators on the float vector type.
 129 |   __forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); }
 130 |   __forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); }
 131 |   __forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); }
 132 |   // Bitwise operators on the integer vector type.
 133 |   __forceinline const v4si operator&(const v4si& a, const v4si& b) { return _mm_and_si128(a,b); }
 134 |   __forceinline const v4si operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); }
 135 |   __forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); }
 136 |   // Compound assignments, written in terms of the binary operators above; each returns the updated value by copy.
 137 |   __forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; }
 138 |   __forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; }
 139 |   __forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; }
 140 |   __forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; }
 141 |   // Same for the integer vector type.
 142 |   __forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; }
 143 |   __forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; }
 144 |   __forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; }
 145 | #else
 146 |   #define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })  /* v4sf with x in all four lanes; x is expanded four times */
 147 |   #define v2dil(x) ((const v4si) { (x), (x) })            /* v4si made of two copies of one 64-bit value */
 148 |   #define v4sil(x) v2dil((long long) ((((unsigned long long) (unsigned int) (x)) << 32) | (unsigned long long) (unsigned int) (x)))  /* 32-bit x in all four lanes; x is narrowed to unsigned first so a negative value cannot sign-extend into the upper half of each 64-bit element, and no signed value is shifted */
 149 | #endif
 150 | 
 151 | typedef union { v4sf f; float array[4]; } v4sfindexer;  /* a v4sf overlaid with its four float lanes */
 152 | /* v4sf_index (v, i): the float in lane i of v (GNU statement expression). */
 153 | #define v4sf_index(_findx, _findi) \
 154 |   ({ v4sfindexer _findvx;          \
 155 |      _findvx.f = (_findx);         \
 156 |      _findvx.array[_findi]; })
 157 | typedef union { v4si i; int array[4]; } v4siindexer;  /* a v4si overlaid with four ints */
 158 | /* v4si_index (v, i): the int in lane i of v (GNU statement expression). */
 159 | #define v4si_index(_iindx, _iindi) \
 160 |   ({ v4siindexer _iindvx;          \
 161 |      _iindvx.i = (_iindx);         \
 162 |      _iindvx.array[_iindi]; })
 163 | 
 164 | typedef union { v4sf f; v4si i; } v4sfv4sipun;  /* the same 128 bits viewed as floats or as integers */
 165 | #if _MSC_VER && !__INTEL_COMPILER  /* v4sf_fabs (x): lane-wise absolute value, obtained by masking with 0x7fffffff */
 166 |   #define v4sf_fabs(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
 167 | #else
 168 |   #define v4sf_fabs(x)                  \
 169 |   ({  /* GNU statement expression; its value is the final vx.f */ \
 170 |      v4sfv4sipun vx;  /* NOTE(review): an argument that mentions an identifier named vx would bind to this local */ \
 171 |      vx.f = x;                          \
 172 |      vx.i &= v4sil (0x7FFFFFFF);  /* clear bit 31 of every 32-bit lane */ \
 173 |      vx.f;                              \
 174 |   })
 175 | #endif
 176 | 
 177 | #ifdef __cplusplus
 178 | } // end namespace
 179 | #endif // __cplusplus
 180 | 
 181 | #endif // __SSE2__
 182 | 
 183 | #endif // __SSE_H_
 184 | /*=====================================================================*
 185 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 186 |  * All rights reserved.                                                *
 187 |  *                                                                     *
 188 |  * Redistribution and use in source and binary forms, with             *
 189 |  * or without modification, are permitted provided that the            *
 190 |  * following conditions are met:                                       *
 191 |  *                                                                     *
 192 |  *     * Redistributions of source code must retain the                *
 193 |  *     above copyright notice, this list of conditions and             *
 194 |  *     the following disclaimer.                                       *
 195 |  *                                                                     *
 196 |  *     * Redistributions in binary form must reproduce the             *
 197 |  *     above copyright notice, this list of conditions and             *
 198 |  *     the following disclaimer in the documentation and/or            *
 199 |  *     other materials provided with the distribution.                 *
 200 |  *                                                                     *
 201 |  *     * Neither the name of Paul Mineiro nor the names                *
 202 |  *     of other contributors may be used to endorse or promote         *
 203 |  *     products derived from this software without specific            *
 204 |  *     prior written permission.                                       *
 205 |  *                                                                     *
 206 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 207 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 208 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 209 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 210 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 211 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 212 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 213 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 214 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 215 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 216 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 217 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 218 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 219 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 220 |  *                                                                     *
 221 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 222 |  *=====================================================================*/
 223 | 
 224 | #ifndef __FAST_EXP_H_
 225 | #define __FAST_EXP_H_
 226 | 
 227 | #include <stdint.h>
 228 | 
 229 | // Exponentials routinely underflow in numerical code, so the routines
 230 | // below clamp their argument at -126 rather than leaving that to callers.
 231 | 
 232 | static inline float fastpow2 (float p)
 233 | {
 234 |   /* Approximates 2^p.  Arguments below -126 are treated as -126. */
 235 |   const float x = (p < -126) ? -126.0f : p;
 236 |   const float bump = (p < 0) ? 1.0f : 0.0f;
 237 |   const int whole = x;                  /* float-to-int conversion truncates toward zero */
 238 |   const float frac = x - whole + bump;  /* x minus its truncation, plus one when p is negative */
 239 |   union { uint32_t i; float f; } bits;  /* the integer written below is read back as a float */
 240 |   bits.i = cast_uint32_t ( (1 << 23) * (x + 121.2740575f + 27.7280233f / (4.84252568f - frac) - 1.49012907f * frac) );
 241 |   return bits.f;
 242 | }
 243 | 
 244 | static inline float fastexp (float p)  /* e^p evaluated as 2^(p / ln 2) through fastpow2 */
 245 | {
 246 |   const float base2_exponent = 1.442695040f * p;  /* 1.442695040f is 1 / ln 2 */
 247 |   return fastpow2 (base2_exponent);
 248 | }
 249 | 
 250 | static inline float fasterpow2 (float p)  /* cheaper variant of fastpow2: same clamp at -126, no correction terms in the fractional part */
 251 | {
 252 |   const float x = (p < -126) ? -126.0f : p;
 253 |   union { uint32_t i; float f; } bits;  /* the integer written below is read back as a float */
 254 |   bits.i = cast_uint32_t ( (1 << 23) * (x + 126.94269504f) );
 255 |   return bits.f;
 256 | }
 257 | 
 258 | static inline float fasterexp (float p)  /* e^p evaluated as 2^(p / ln 2) through fasterpow2 */
 259 | {
 260 |   const float base2_exponent = 1.442695040f * p;  /* 1.442695040f is 1 / ln 2 */
 261 |   return fasterpow2 (base2_exponent);
 262 | }
 263 | 
 264 | #ifdef __SSE2__
 265 | 
 266 | static inline v4sf
 267 | vfastpow2 (const v4sf p)
 268 | {
 269 |   v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f));
 270 |   v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f));
 271 |   v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
 272 |   v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
 273 |   v4si w = v4sf_to_v4si (clipp);
 274 |   v4sf z = clipp - v4si_to_v4sf (w) + offset;
 275 | 
 276 |   const v4sf c_121_2740838 = v4sfl (121.2740575f);
 277 |   const v4sf c_27_7280233 = v4sfl (27.7280233f);
 278 |   const v4sf c_4_84252568 = v4sfl (4.84252568f);
 279 |   const v4sf c_1_49012907 = v4sfl (1.49012907f);
 280 |   union { v4si i; v4sf f; } v = {
 281 |     v4sf_to_v4si (
 282 |       v4sfl (1 << 23) * 
 283 |       (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z)
 284 |     )
 285 |   };
 286 | 
 287 |   return v.f;
 288 | }
 289 | 
 290 | static inline v4sf
 291 | vfastexp (const v4sf p)
 292 | {
 293 |   const v4sf c_invlog_2 = v4sfl (1.442695040f);
 294 | 
 295 |   return vfastpow2 (c_invlog_2 * p);
 296 | }
 297 | 
 298 | static inline v4sf
 299 | vfasterpow2 (const v4sf p)
 300 | {
 301 |   const v4sf c_126_94269504 = v4sfl (126.94269504f);
 302 |   v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
 303 |   v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
 304 |   union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) };
 305 |   return v.f;
 306 | }
 307 | 
 308 | static inline v4sf
 309 | vfasterexp (const v4sf p)
 310 | {
 311 |   const v4sf c_invlog_2 = v4sfl (1.442695040f);
 312 | 
 313 |   return vfasterpow2 (c_invlog_2 * p);
 314 | }
 315 | 
 316 | #endif //__SSE2__
 317 | 
 318 | #endif // __FAST_EXP_H_
 319 | /*=====================================================================*
 320 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 321 |  * All rights reserved.                                                *
 322 |  *                                                                     *
 323 |  * Redistribution and use in source and binary forms, with             *
 324 |  * or without modification, are permitted provided that the            *
 325 |  * following conditions are met:                                       *
 326 |  *                                                                     *
 327 |  *     * Redistributions of source code must retain the                *
 328 |  *     above copyright notice, this list of conditions and             *
 329 |  *     the following disclaimer.                                       *
 330 |  *                                                                     *
 331 |  *     * Redistributions in binary form must reproduce the             *
 332 |  *     above copyright notice, this list of conditions and             *
 333 |  *     the following disclaimer in the documentation and/or            *
 334 |  *     other materials provided with the distribution.                 *
 335 |  *                                                                     *
 336 |  *     * Neither the name of Paul Mineiro nor the names                *
 337 |  *     of other contributors may be used to endorse or promote         *
 338 |  *     products derived from this software without specific            *
 339 |  *     prior written permission.                                       *
 340 |  *                                                                     *
 341 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 342 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 343 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 344 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 345 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 346 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 347 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 348 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 349 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 350 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 351 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 352 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 353 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 354 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 355 |  *                                                                     *
 356 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 357 |  *=====================================================================*/
 358 | 
 359 | #ifndef __FAST_LOG_H_
 360 | #define __FAST_LOG_H_
 361 | 
 362 | #include <stdint.h>
 363 | 
 364 | static inline float 
 365 | fastlog2 (float x)
 366 | {
 367 |   union { float f; uint32_t i; } vx = { x };
 368 |   union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
 369 |   float y = vx.i;
 370 |   y *= 1.1920928955078125e-7f;
 371 | 
 372 |   return y - 124.22551499f
 373 |            - 1.498030302f * mx.f 
 374 |            - 1.72587999f / (0.3520887068f + mx.f);
 375 | }
 376 | 
 377 | static inline float
 378 | fastlog (float x)
 379 | {
 380 |   return 0.69314718f * fastlog2 (x);
 381 | }
 382 | 
 383 | static inline float 
 384 | fasterlog2 (float x)
 385 | {
 386 |   union { float f; uint32_t i; } vx = { x };
 387 |   float y = vx.i;
 388 |   y *= 1.1920928955078125e-7f;
 389 |   return y - 126.94269504f;
 390 | }
 391 | 
 392 | static inline float
 393 | fasterlog (float x)
 394 | {
 395 | //  return 0.69314718f * fasterlog2 (x);
 396 | 
 397 |   union { float f; uint32_t i; } vx = { x };
 398 |   float y = vx.i;
 399 |   y *= 8.2629582881927490e-8f;
 400 |   return y - 87.989971088f;
 401 | }
 402 | 
 403 | #ifdef __SSE2__
 404 | 
 405 | static inline v4sf
 406 | vfastlog2 (v4sf x)
 407 | {
 408 |   union { v4sf f; v4si i; } vx = { x };
 409 |   union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000);
 410 |   v4sf y = v4si_to_v4sf (vx.i);
 411 |   y *= v4sfl (1.1920928955078125e-7f);
 412 | 
 413 |   const v4sf c_124_22551499 = v4sfl (124.22551499f);
 414 |   const v4sf c_1_498030302 = v4sfl (1.498030302f);
 415 |   const v4sf c_1_725877999 = v4sfl (1.72587999f);
 416 |   const v4sf c_0_3520087068 = v4sfl (0.3520887068f);
 417 | 
 418 |   return y - c_124_22551499
 419 |            - c_1_498030302 * mx.f 
 420 |            - c_1_725877999 / (c_0_3520087068 + mx.f);
 421 | }
 422 | 
 423 | static inline v4sf
 424 | vfastlog (v4sf x)
 425 | {
 426 |   const v4sf c_0_69314718 = v4sfl (0.69314718f);
 427 | 
 428 |   return c_0_69314718 * vfastlog2 (x);
 429 | }
 430 | 
 431 | static inline v4sf 
 432 | vfasterlog2 (v4sf x)
 433 | {
 434 |   union { v4sf f; v4si i; } vx = { x };
 435 |   v4sf y = v4si_to_v4sf (vx.i);
 436 |   y *= v4sfl (1.1920928955078125e-7f);
 437 | 
 438 |   const v4sf c_126_94269504 = v4sfl (126.94269504f);
 439 | 
 440 |   return y - c_126_94269504;
 441 | }
 442 | 
 443 | static inline v4sf
 444 | vfasterlog (v4sf x)
 445 | {
 446 | //  const v4sf c_0_69314718 = v4sfl (0.69314718f);
 447 | //
 448 | //  return c_0_69314718 * vfasterlog2 (x);
 449 | 
 450 |   union { v4sf f; v4si i; } vx = { x };
 451 |   v4sf y = v4si_to_v4sf (vx.i);
 452 |   y *= v4sfl (8.2629582881927490e-8f);
 453 | 
 454 |   const v4sf c_87_989971088 = v4sfl (87.989971088f);
 455 | 
 456 |   return y - c_87_989971088;
 457 | }
 458 | 
 459 | #endif // __SSE2__
 460 | 
 461 | #endif // __FAST_LOG_H_
 462 | /*=====================================================================*
 463 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 464 |  * All rights reserved.                                                *
 465 |  *                                                                     *
 466 |  * Redistribution and use in source and binary forms, with             *
 467 |  * or without modification, are permitted provided that the            *
 468 |  * following conditions are met:                                       *
 469 |  *                                                                     *
 470 |  *     * Redistributions of source code must retain the                *
 471 |  *     above copyright notice, this list of conditions and             *
 472 |  *     the following disclaimer.                                       *
 473 |  *                                                                     *
 474 |  *     * Redistributions in binary form must reproduce the             *
 475 |  *     above copyright notice, this list of conditions and             *
 476 |  *     the following disclaimer in the documentation and/or            *
 477 |  *     other materials provided with the distribution.                 *
 478 |  *                                                                     *
 479 |  *     * Neither the name of Paul Mineiro nor the names                *
 480 |  *     of other contributors may be used to endorse or promote         *
 481 |  *     products derived from this software without specific            *
 482 |  *     prior written permission.                                       *
 483 |  *                                                                     *
 484 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 485 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 486 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 487 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 488 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 489 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 490 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 491 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 492 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 493 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 494 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 495 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 496 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 497 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 498 |  *                                                                     *
 499 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 500 |  *=====================================================================*/
 501 | 
 502 | #ifndef __FAST_ERF_H_
 503 | #define __FAST_ERF_H_
 504 | 
 505 | #include <math.h>
 506 | #include <stdint.h>
 507 | 
 508 | // fasterfc: not actually faster than erfcf(3) on newer machines!
 509 | // ... although vectorized version is interesting
 510 | //     and fastererfc is very fast
 511 | 
 512 | static inline float
 513 | fasterfc (float x)
 514 | {
 515 |   static const float k = 3.3509633149424609f;
 516 |   static const float a = 0.07219054755431126f;
 517 |   static const float b = 15.418191568719577f;
 518 |   static const float c = 5.609846028328545f;
 519 | 
 520 |   union { float f; uint32_t i; } vc = { c * x };
 521 |   float xsq = x * x;
 522 |   float xquad = xsq * xsq;
 523 | 
 524 |   vc.i |= 0x80000000;
 525 | 
 526 |   return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f);
 527 | }
 528 | 
 529 | static inline float
 530 | fastererfc (float x)
 531 | {
 532 |   static const float k = 3.3509633149424609f;
 533 | 
 534 |   return 2.0f / (1.0f + fasterpow2 (k * x));
 535 | }
 536 | 
 537 | // fasterf: not actually faster than erff(3) on newer machines! 
 538 | // ... although vectorized version is interesting
 539 | //     and fastererf is very fast
 540 | 
 541 | static inline float
 542 | fasterf (float x)
 543 | {
 544 |   return 1.0f - fasterfc (x);
 545 | }
 546 | 
 547 | static inline float
 548 | fastererf (float x)
 549 | {
 550 |   return 1.0f - fastererfc (x);
 551 | }
 552 | 
 553 | static inline float
 554 | fastinverseerf (float x)
 555 | {
 556 |   static const float invk = 0.30004578719350504f;
 557 |   static const float a = 0.020287853348211326f;
 558 |   static const float b = 0.07236892874789555f;
 559 |   static const float c = 0.9913030456864257f;
 560 |   static const float d = 0.8059775923760193f;
 561 | 
 562 |   float xsq = x * x;
 563 | 
 564 |   return invk * fastlog2 ((1.0f + x) / (1.0f - x)) 
 565 |        + x * (a - b * xsq) / (c - d * xsq);
 566 | }
 567 | 
 568 | static inline float
 569 | fasterinverseerf (float x)
 570 | {
 571 |   static const float invk = 0.30004578719350504f;
 572 | 
 573 |   return invk * fasterlog2 ((1.0f + x) / (1.0f - x));
 574 | }
 575 | 
 576 | #ifdef __SSE2__
 577 | 
 578 | static inline v4sf
 579 | vfasterfc (v4sf x)
 580 | {
 581 |   const v4sf k = v4sfl (3.3509633149424609f);
 582 |   const v4sf a = v4sfl (0.07219054755431126f);
 583 |   const v4sf b = v4sfl (15.418191568719577f);
 584 |   const v4sf c = v4sfl (5.609846028328545f);
 585 | 
 586 |   union { v4sf f; v4si i; } vc; vc.f = c * x;
 587 |   vc.i |= v4sil (0x80000000);
 588 | 
 589 |   v4sf xsq = x * x;
 590 |   v4sf xquad = xsq * xsq;
 591 | 
 592 |   return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f);
 593 | }
 594 | 
 595 | static inline v4sf
 596 | vfastererfc (const v4sf x)
 597 | {
 598 |   const v4sf k = v4sfl (3.3509633149424609f);
 599 | 
 600 |   return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x));
 601 | }
 602 | 
 603 | static inline v4sf
 604 | vfasterf (v4sf x)
 605 | {
 606 |   return v4sfl (1.0f) - vfasterfc (x);
 607 | }
 608 | 
 609 | static inline v4sf
 610 | vfastererf (const v4sf x)
 611 | {
 612 |   return v4sfl (1.0f) - vfastererfc (x);
 613 | }
 614 | 
 615 | static inline v4sf
 616 | vfastinverseerf (v4sf x)
 617 | {
 618 |   const v4sf invk = v4sfl (0.30004578719350504f);
 619 |   const v4sf a = v4sfl (0.020287853348211326f);
 620 |   const v4sf b = v4sfl (0.07236892874789555f);
 621 |   const v4sf c = v4sfl (0.9913030456864257f);
 622 |   const v4sf d = v4sfl (0.8059775923760193f);
 623 | 
 624 |   v4sf xsq = x * x;
 625 | 
 626 |   return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) 
 627 |        + x * (a - b * xsq) / (c - d * xsq);
 628 | }
 629 | 
 630 | static inline v4sf
 631 | vfasterinverseerf (v4sf x)
 632 | {
 633 |   const v4sf invk = v4sfl (0.30004578719350504f);
 634 | 
 635 |   return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x));
 636 | }
 637 | 
 638 | #endif //__SSE2__
 639 | 
 640 | #endif // __FAST_ERF_H_
 641 | /*=====================================================================*
 642 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 643 |  * All rights reserved.                                                *
 644 |  *                                                                     *
 645 |  * Redistribution and use in source and binary forms, with             *
 646 |  * or without modification, are permitted provided that the            *
 647 |  * following conditions are met:                                       *
 648 |  *                                                                     *
 649 |  *     * Redistributions of source code must retain the                *
 650 |  *     above copyright notice, this list of conditions and             *
 651 |  *     the following disclaimer.                                       *
 652 |  *                                                                     *
 653 |  *     * Redistributions in binary form must reproduce the             *
 654 |  *     above copyright notice, this list of conditions and             *
 655 |  *     the following disclaimer in the documentation and/or            *
 656 |  *     other materials provided with the distribution.                 *
 657 |  *                                                                     *
 658 |  *     * Neither the name of Paul Mineiro nor the names                *
 659 |  *     of other contributors may be used to endorse or promote         *
 660 |  *     products derived from this software without specific            *
 661 |  *     prior written permission.                                       *
 662 |  *                                                                     *
 663 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 664 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 665 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 666 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 667 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 668 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 669 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 670 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 671 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 672 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 673 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 674 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 675 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 676 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 677 |  *                                                                     *
 678 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 679 |  *=====================================================================*/
 680 | 
 681 | #ifndef __FAST_GAMMA_H_
 682 | #define __FAST_GAMMA_H_
 683 | 
 684 | #include <stdint.h>
 685 | 
 686 | /* gamma/digamma functions only work for positive inputs */
 687 | 
 688 | static inline float
 689 | fastlgamma (float x)
 690 | {
 691 |   float logterm = fastlog (x * (1.0f + x) * (2.0f + x));
 692 |   float xp3 = 3.0f + x;
 693 | 
 694 |   return - 2.081061466f 
 695 |          - x 
 696 |          + 0.0833333f / xp3 
 697 |          - logterm 
 698 |          + (2.5f + x) * fastlog (xp3);
 699 | }
 700 | 
 701 | static inline float
 702 | fasterlgamma (float x)
 703 | {
 704 |   return - 0.0810614667f 
 705 |          - x
 706 |          - fasterlog (x)
 707 |          + (0.5f + x) * fasterlog (1.0f + x);
 708 | }
 709 | 
 710 | static inline float
 711 | fastdigamma (float x)
 712 | {
 713 |   float twopx = 2.0f + x;
 714 |   float logterm = fastlog (twopx);
 715 | 
 716 |   return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) /
 717 |          (12.0f * x * (1.0f + x) * twopx * twopx)
 718 |          + logterm;
 719 | }
 720 | 
 721 | static inline float
 722 | fasterdigamma (float x)
 723 | {
 724 |   float onepx = 1.0f + x;
 725 | 
 726 |   return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx);
 727 | }
 728 | 
 729 | #ifdef __SSE2__
 730 | 
 731 | static inline v4sf
 732 | vfastlgamma (v4sf x)
 733 | {
 734 |   const v4sf c_1_0 = v4sfl (1.0f);
 735 |   const v4sf c_2_0 = v4sfl (2.0f);
 736 |   const v4sf c_3_0 = v4sfl (3.0f);
 737 |   const v4sf c_2_081061466 = v4sfl (2.081061466f);
 738 |   const v4sf c_0_0833333 = v4sfl (0.0833333f);
 739 |   const v4sf c_2_5 = v4sfl (2.5f);
 740 | 
 741 |   v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x));
 742 |   v4sf xp3 = c_3_0 + x;
 743 | 
 744 |   return - c_2_081061466
 745 |          - x 
 746 |          + c_0_0833333 / xp3 
 747 |          - logterm 
 748 |          + (c_2_5 + x) * vfastlog (xp3);
 749 | }
 750 | 
 751 | static inline v4sf
 752 | vfasterlgamma (v4sf x)
 753 | {
 754 |   const v4sf c_0_0810614667 = v4sfl (0.0810614667f);
 755 |   const v4sf c_0_5 = v4sfl (0.5f);
 756 |   const v4sf c_1 = v4sfl (1.0f);
 757 | 
 758 |   return - c_0_0810614667
 759 |          - x
 760 |          - vfasterlog (x)
 761 |          + (c_0_5 + x) * vfasterlog (c_1 + x);
 762 | }
 763 | 
 764 | static inline v4sf
 765 | vfastdigamma (v4sf x)
 766 | {
 767 |   v4sf twopx = v4sfl (2.0f) + x;
 768 |   v4sf logterm = vfastlog (twopx);
 769 | 
 770 |   return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) /
 771 |          (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx)
 772 |          + logterm;
 773 | }
 774 | 
 775 | static inline v4sf
 776 | vfasterdigamma (v4sf x)
 777 | {
 778 |   const v4sf c_1_0 = v4sfl (1.0f);
 779 |   const v4sf c_2_0 = v4sfl (2.0f);
 780 |   v4sf onepx = c_1_0 + x;
 781 | 
 782 |   return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx);
 783 | }
 784 | 
 785 | #endif //__SSE2__
 786 | 
 787 | #endif // __FAST_GAMMA_H_
 788 | /*=====================================================================*
 789 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 790 |  * All rights reserved.                                                *
 791 |  *                                                                     *
 792 |  * Redistribution and use in source and binary forms, with             *
 793 |  * or without modification, are permitted provided that the            *
 794 |  * following conditions are met:                                       *
 795 |  *                                                                     *
 796 |  *     * Redistributions of source code must retain the                *
 797 |  *     above copyright notice, this list of conditions and             *
 798 |  *     the following disclaimer.                                       *
 799 |  *                                                                     *
 800 |  *     * Redistributions in binary form must reproduce the             *
 801 |  *     above copyright notice, this list of conditions and             *
 802 |  *     the following disclaimer in the documentation and/or            *
 803 |  *     other materials provided with the distribution.                 *
 804 |  *                                                                     *
 805 |  *     * Neither the name of Paul Mineiro nor the names                *
 806 |  *     of other contributors may be used to endorse or promote         *
 807 |  *     products derived from this software without specific            *
 808 |  *     prior written permission.                                       *
 809 |  *                                                                     *
 810 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 811 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 812 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 813 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 814 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 815 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 816 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 817 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 818 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 819 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 820 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 821 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 822 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 823 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 824 |  *                                                                     *
 825 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 826 |  *=====================================================================*/
 827 | 
 828 | #ifndef __FAST_HYPERBOLIC_H_
 829 | #define __FAST_HYPERBOLIC_H_
 830 | 
 831 | #include <stdint.h>
 832 | 
 833 | static inline float
 834 | fastsinh (float p)
 835 | {
 836 |   return 0.5f * (fastexp (p) - fastexp (-p));
 837 | }
 838 | 
 839 | static inline float
 840 | fastersinh (float p)
 841 | {
 842 |   return 0.5f * (fasterexp (p) - fasterexp (-p));
 843 | }
 844 | 
 845 | static inline float
 846 | fastcosh (float p)
 847 | {
 848 |   return 0.5f * (fastexp (p) + fastexp (-p));
 849 | }
 850 | 
 851 | static inline float
 852 | fastercosh (float p)
 853 | {
 854 |   return 0.5f * (fasterexp (p) + fasterexp (-p));
 855 | }
 856 | 
 857 | static inline float
 858 | fasttanh (float p)
 859 | {
 860 |   return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
 861 | }
 862 | 
 863 | static inline float
 864 | fastertanh (float p)
 865 | {
 866 |   return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p));
 867 | }
 868 | 
 869 | #ifdef __SSE2__
 870 | 
 871 | static inline v4sf
 872 | vfastsinh (const v4sf p)
 873 | {
 874 |   const v4sf c_0_5 = v4sfl (0.5f);
 875 | 
 876 |   return c_0_5 * (vfastexp (p) - vfastexp (-p));
 877 | }
 878 | 
 879 | static inline v4sf
 880 | vfastersinh (const v4sf p)
 881 | {
 882 |   const v4sf c_0_5 = v4sfl (0.5f);
 883 | 
 884 |   return c_0_5 * (vfasterexp (p) - vfasterexp (-p));
 885 | }
 886 | 
 887 | static inline v4sf
 888 | vfastcosh (const v4sf p)
 889 | {
 890 |   const v4sf c_0_5 = v4sfl (0.5f);
 891 | 
 892 |   return c_0_5 * (vfastexp (p) + vfastexp (-p));
 893 | }
 894 | 
 895 | static inline v4sf
 896 | vfastercosh (const v4sf p)
 897 | {
 898 |   const v4sf c_0_5 = v4sfl (0.5f);
 899 | 
 900 |   return c_0_5 * (vfasterexp (p) + vfasterexp (-p));
 901 | }
 902 | 
 903 | static inline v4sf
 904 | vfasttanh (const v4sf p)
 905 | {
 906 |   const v4sf c_1 = v4sfl (1.0f);
 907 |   const v4sf c_2 = v4sfl (2.0f);
 908 | 
 909 |   return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p));
 910 | }
 911 | 
 912 | static inline v4sf
 913 | vfastertanh (const v4sf p)
 914 | {
 915 |   const v4sf c_1 = v4sfl (1.0f);
 916 |   const v4sf c_2 = v4sfl (2.0f);
 917 | 
 918 |   return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p));
 919 | }
 920 | 
 921 | #endif //__SSE2__
 922 | 
 923 | #endif // __FAST_HYPERBOLIC_H_
 924 | /*=====================================================================*
 925 |  *                   Copyright (C) 2011 Paul Mineiro                   *
 926 |  * All rights reserved.                                                *
 927 |  *                                                                     *
 928 |  * Redistribution and use in source and binary forms, with             *
 929 |  * or without modification, are permitted provided that the            *
 930 |  * following conditions are met:                                       *
 931 |  *                                                                     *
 932 |  *     * Redistributions of source code must retain the                *
 933 |  *     above copyright notice, this list of conditions and             *
 934 |  *     the following disclaimer.                                       *
 935 |  *                                                                     *
 936 |  *     * Redistributions in binary form must reproduce the             *
 937 |  *     above copyright notice, this list of conditions and             *
 938 |  *     the following disclaimer in the documentation and/or            *
 939 |  *     other materials provided with the distribution.                 *
 940 |  *                                                                     *
 941 |  *     * Neither the name of Paul Mineiro nor the names                *
 942 |  *     of other contributors may be used to endorse or promote         *
 943 |  *     products derived from this software without specific            *
 944 |  *     prior written permission.                                       *
 945 |  *                                                                     *
 946 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 947 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 948 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 949 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 950 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 951 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 952 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 953 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 954 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 955 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 956 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 957 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 958 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 959 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
 960 |  *                                                                     *
 961 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
 962 |  *=====================================================================*/
 963 | 
 964 | #ifndef __FAST_LAMBERT_W_H_
 965 | #define __FAST_LAMBERT_W_H_
 966 | 
 967 | #include <stdint.h>
 968 | 
 969 | // these functions compute the upper branch aka W_0
 970 | 
 971 | static inline float
 972 | fastlambertw (float x)
 973 | {
 974 |   static const float threshold = 2.26445f;
 975 | 
 976 |   float c = (x < threshold) ? 1.546865557f : 1.0f;
 977 |   float d = (x < threshold) ? 2.250366841f : 0.0f;
 978 |   float a = (x < threshold) ? -0.737769969f : 0.0f;
 979 | 
 980 |   float logterm = fastlog (c * x + d);
 981 |   float loglogterm = fastlog (logterm);
 982 | 
 983 |   float minusw = -a - logterm + loglogterm - loglogterm / logterm;
 984 |   float expminusw = fastexp (minusw);
 985 |   float xexpminusw = x * expminusw;
 986 |   float pexpminusw = xexpminusw - minusw;
 987 | 
 988 |   return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) /
 989 |          (2.0f + pexpminusw * (2.0f - minusw));
 990 | }
 991 | 
 992 | static inline float
 993 | fasterlambertw (float x)
 994 | {
 995 |   static const float threshold = 2.26445f;
 996 | 
 997 |   float c = (x < threshold) ? 1.546865557f : 1.0f;
 998 |   float d = (x < threshold) ? 2.250366841f : 0.0f;
 999 |   float a = (x < threshold) ? -0.737769969f : 0.0f;
1000 | 
1001 |   float logterm = fasterlog (c * x + d);
1002 |   float loglogterm = fasterlog (logterm);
1003 | 
1004 |   float w = a + logterm - loglogterm + loglogterm / logterm;
1005 |   float expw = fasterexp (-w);
1006 | 
1007 |   return (w * w + expw * x) / (1.0f + w);
1008 | }
1009 | 
// Presumably approximates W_0 (e^x), going by the name -- confirm against
// the tests.  Builds an initial estimate w from fastlog and fasterpow2,
// then applies one rational refinement using x - log (w).
static inline float
fastlambertwexpx (float x)
{
  const float k = 1.1765631309f;
  const float a = 0.94537622168f;

  float log_arg = fmaxf (x, k);
  float pow_arg = 0;

  // Below k the estimate is damped by a power-of-two factor.
  if (x < k)
    pow_arg = a * (x - k);

  float log_term = fastlog (log_arg);
  float pow_term = fasterpow2 (pow_arg);  // don't need accuracy here

  float w = pow_term * (log_arg - log_term + log_term / log_arg);
  float log_w = fastlog (w);
  float p = x - log_w;

  return w * (2.0f + p + w * (3.0f + 2.0f * p)) /
         (2.0f - p + w * (5.0f + 2.0f * w));
}
1029 | 
// Cheaper approximation of W (exp (x)), the Lambert W function evaluated
// at exp (x).
//
// The first guess is built the same way as in fastlambertwexpx but from
// the "faster" log/pow2 primitives.  The refinement
// w * (1 + x - log (w)) / (1 + w) is algebraically one Newton step on
// w + log (w) = x.
static inline float
fasterlambertwexpx (float x)
{
  static const float k = 1.1765631309f;
  static const float a = 0.94537622168f;

  const float clamped = fmaxf (x, k);

  // Exponent of the damping factor; zero (factor 2^0) unless x < k.
  float exponent = 0;
  if (x < k)
    {
      exponent = a * (x - k);
    }

  const float lg = fasterlog (clamped);
  const float damp = fasterpow2 (exponent);

  const float guess = damp * (clamped - lg + lg / clamped);
  const float lgguess = fasterlog (guess);

  return guess * (1.0f + x - lgguess) / (1.0f + guess);
}
1047 | 
1048 | #ifdef __SSE2__
1049 | 
1050 | static inline v4sf
1051 | vfastlambertw (v4sf x)
1052 | {
1053 |   const v4sf threshold = v4sfl (2.26445f);
1054 | 
1055 |   v4sf under = _mm_cmplt_ps (x, threshold);
1056 |   v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
1057 |                       _mm_andnot_ps (under, v4sfl (1.0f)));
1058 |   v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
1059 |   v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));
1060 | 
1061 |   v4sf logterm = vfastlog (c * x + d);
1062 |   v4sf loglogterm = vfastlog (logterm);
1063 | 
1064 |   v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm;
1065 |   v4sf expminusw = vfastexp (minusw);
1066 |   v4sf xexpminusw = x * expminusw;
1067 |   v4sf pexpminusw = xexpminusw - minusw;
1068 | 
1069 |   return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / 
1070 |          (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw));
1071 | }
1072 | 
1073 | static inline v4sf
1074 | vfasterlambertw (v4sf x)
1075 | {
1076 |   const v4sf threshold = v4sfl (2.26445f);
1077 | 
1078 |   v4sf under = _mm_cmplt_ps (x, threshold);
1079 |   v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
1080 |                       _mm_andnot_ps (under, v4sfl (1.0f)));
1081 |   v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
1082 |   v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));
1083 | 
1084 |   v4sf logterm = vfasterlog (c * x + d);
1085 |   v4sf loglogterm = vfasterlog (logterm);
1086 | 
1087 |   v4sf w = a + logterm - loglogterm + loglogterm / logterm;
1088 |   v4sf expw = vfasterexp (-w);
1089 | 
1090 |   return (w * w + expw * x) / (v4sfl (1.0f) + w);
1091 | }
1092 | 
1093 | static inline v4sf
1094 | vfastlambertwexpx (v4sf x)
1095 | {
1096 |   const v4sf k = v4sfl (1.1765631309f);
1097 |   const v4sf a = v4sfl (0.94537622168f);
1098 |   const v4sf two = v4sfl (2.0f);
1099 |   const v4sf three = v4sfl (3.0f);
1100 |   const v4sf five = v4sfl (5.0f);
1101 | 
1102 |   v4sf logarg = _mm_max_ps (x, k);
1103 |   v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));
1104 | 
1105 |   v4sf logterm = vfastlog (logarg);
1106 |   v4sf powterm = vfasterpow2 (powarg);  // don't need accuracy here
1107 | 
1108 |   v4sf w = powterm * (logarg - logterm + logterm / logarg);
1109 |   v4sf logw = vfastlog (w);
1110 |   v4sf p = x - logw;
1111 | 
1112 |   return w * (two + p + w * (three + two * p)) /
1113 |          (two - p + w * (five + two * w));
1114 | }
1115 | 
1116 | static inline v4sf
1117 | vfasterlambertwexpx (v4sf x)
1118 | {
1119 |   const v4sf k = v4sfl (1.1765631309f);
1120 |   const v4sf a = v4sfl (0.94537622168f);
1121 | 
1122 |   v4sf logarg = _mm_max_ps (x, k);
1123 |   v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));
1124 | 
1125 |   v4sf logterm = vfasterlog (logarg);
1126 |   v4sf powterm = vfasterpow2 (powarg);
1127 | 
1128 |   v4sf w = powterm * (logarg - logterm + logterm / logarg);
1129 |   v4sf logw = vfasterlog (w);
1130 | 
1131 |   return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w);
1132 | }
1133 | 
1134 | #endif // __SSE2__
1135 | 
1136 | #endif // __FAST_LAMBERT_W_H_
1137 | 
1138 | /*=====================================================================*
1139 |  *                   Copyright (C) 2011 Paul Mineiro                   *
1140 |  * All rights reserved.                                                *
1141 |  *                                                                     *
1142 |  * Redistribution and use in source and binary forms, with             *
1143 |  * or without modification, are permitted provided that the            *
1144 |  * following conditions are met:                                       *
1145 |  *                                                                     *
1146 |  *     * Redistributions of source code must retain the                *
1147 |  *     above copyright notice, this list of conditions and             *
1148 |  *     the following disclaimer.                                       *
1149 |  *                                                                     *
1150 |  *     * Redistributions in binary form must reproduce the             *
1151 |  *     above copyright notice, this list of conditions and             *
1152 |  *     the following disclaimer in the documentation and/or            *
1153 |  *     other materials provided with the distribution.                 *
1154 |  *                                                                     *
1155 |  *     * Neither the name of Paul Mineiro nor the names                *
1156 |  *     of other contributors may be used to endorse or promote         *
1157 |  *     products derived from this software without specific            *
1158 |  *     prior written permission.                                       *
1159 |  *                                                                     *
1160 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
1161 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
1162 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
1163 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
1164 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
1165 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
1166 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
1167 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
1168 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
1169 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
1170 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
1171 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
1172 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
1173 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
1174 |  *                                                                     *
1175 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
1176 |  *=====================================================================*/
1177 | 
1178 | #ifndef __FAST_POW_H_
1179 | #define __FAST_POW_H_
1180 | 
1181 | #include <stdint.h>
1182 | 
// Approximates x raised to the power p as 2^(p * log2 (x)), using the
// fastlog2 / fastpow2 primitives.
static inline float
fastpow (float x,
         float p)
{
  const float lg2 = fastlog2 (x);

  return fastpow2 (p * lg2);
}
1189 | 
// Coarser approximation of x raised to the power p, computed as
// 2^(p * log2 (x)) with the fasterlog2 / fasterpow2 primitives.
static inline float
fasterpow (float x,
           float p)
{
  const float lg2 = fasterlog2 (x);

  return fasterpow2 (p * lg2);
}
1196 | 
1197 | #ifdef __SSE2__
1198 | 
1199 | static inline v4sf
1200 | vfastpow (const v4sf x,
1201 |           const v4sf p)
1202 | {
1203 |   return vfastpow2 (p * vfastlog2 (x));
1204 | }
1205 | 
1206 | static inline v4sf
1207 | vfasterpow (const v4sf x,
1208 |             const v4sf p)
1209 | {
1210 |   return vfasterpow2 (p * vfasterlog2 (x));
1211 | }
1212 | 
1213 | #endif //__SSE2__
1214 | 
1215 | #endif // __FAST_POW_H_
1216 | /*=====================================================================*
1217 |  *                   Copyright (C) 2011 Paul Mineiro                   *
1218 |  * All rights reserved.                                                *
1219 |  *                                                                     *
1220 |  * Redistribution and use in source and binary forms, with             *
1221 |  * or without modification, are permitted provided that the            *
1222 |  * following conditions are met:                                       *
1223 |  *                                                                     *
1224 |  *     * Redistributions of source code must retain the                *
1225 |  *     above copyright notice, this list of conditions and             *
1226 |  *     the following disclaimer.                                       *
1227 |  *                                                                     *
1228 |  *     * Redistributions in binary form must reproduce the             *
1229 |  *     above copyright notice, this list of conditions and             *
1230 |  *     the following disclaimer in the documentation and/or            *
1231 |  *     other materials provided with the distribution.                 *
1232 |  *                                                                     *
1233 |  *     * Neither the name of Paul Mineiro nor the names                *
1234 |  *     of other contributors may be used to endorse or promote         *
1235 |  *     products derived from this software without specific            *
1236 |  *     prior written permission.                                       *
1237 |  *                                                                     *
1238 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
1239 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
1240 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
1241 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
1242 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
1243 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
1244 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
1245 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
1246 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
1247 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
1248 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
1249 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
1250 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
1251 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
1252 |  *                                                                     *
1253 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
1254 |  *=====================================================================*/
1255 | 
1256 | #ifndef __FAST_SIGMOID_H_
1257 | #define __FAST_SIGMOID_H_
1258 | 
1259 | #include <stdint.h>
1260 | 
// Approximates the logistic sigmoid 1 / (1 + exp (-x)) using fastexp.
static inline float
fastsigmoid (float x)
{
  const float decay = fastexp (-x);

  return 1.0f / (1.0f + decay);
}
1266 | 
// Coarser approximation of the logistic sigmoid 1 / (1 + exp (-x)),
// using fasterexp.
static inline float
fastersigmoid (float x)
{
  const float decay = fasterexp (-x);

  return 1.0f / (1.0f + decay);
}
1272 | 
1273 | #ifdef __SSE2__
1274 | 
1275 | static inline v4sf
1276 | vfastsigmoid (const v4sf x)
1277 | {
1278 |   const v4sf c_1 = v4sfl (1.0f);
1279 | 
1280 |   return c_1 / (c_1 + vfastexp (-x));
1281 | }
1282 | 
1283 | static inline v4sf
1284 | vfastersigmoid (const v4sf x)
1285 | {
1286 |   const v4sf c_1 = v4sfl (1.0f);
1287 | 
1288 |   return c_1 / (c_1 + vfasterexp (-x));
1289 | }
1290 | 
1291 | #endif //__SSE2__
1292 | 
1293 | #endif // __FAST_SIGMOID_H_
1294 | /*=====================================================================*
1295 |  *                   Copyright (C) 2011 Paul Mineiro                   *
1296 |  * All rights reserved.                                                *
1297 |  *                                                                     *
1298 |  * Redistribution and use in source and binary forms, with             *
1299 |  * or without modification, are permitted provided that the            *
1300 |  * following conditions are met:                                       *
1301 |  *                                                                     *
1302 |  *     * Redistributions of source code must retain the                *
1303 |  *     above copyright notice, this list of conditions and             *
1304 |  *     the following disclaimer.                                       *
1305 |  *                                                                     *
1306 |  *     * Redistributions in binary form must reproduce the             *
1307 |  *     above copyright notice, this list of conditions and             *
1308 |  *     the following disclaimer in the documentation and/or            *
1309 |  *     other materials provided with the distribution.                 *
1310 |  *                                                                     *
1311 |  *     * Neither the name of Paul Mineiro nor the names                *
1312 |  *     of other contributors may be used to endorse or promote         *
1313 |  *     products derived from this software without specific            *
1314 |  *     prior written permission.                                       *
1315 |  *                                                                     *
1316 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
1317 |  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
1318 |  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
1319 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
1320 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
1321 |  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
1322 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
1323 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
1324 |  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
1325 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
1326 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
1327 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
1328 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
1329 |  * POSSIBILITY OF SUCH DAMAGE.                                         *
1330 |  *                                                                     *
1331 |  * Contact: Paul Mineiro <paul@mineiro.com>                            *
1332 |  *=====================================================================*/
1333 | 
1334 | #ifndef __FAST_TRIG_H_
1335 | #define __FAST_TRIG_H_
1336 | 
1337 | #include <stdint.h>
1338 | 
1339 | // http://www.devmaster.net/forums/showthread.php?t=5784
1340 | // fast sine variants are for x \in [ -\pi, \pi ]
1341 | // fast cosine variants are for x \in [ -\pi, \pi ]
1342 | // fast tangent variants are for x \in [ -\pi / 2, \pi / 2 ]
1343 | // "full" versions of functions handle the entire range of inputs
1344 | // although the range reduction technique used here will be hopelessly
1345 | // inaccurate for |x| >> 1000
1346 | //
1347 | // WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than
1348 | // libc calls on older machines (!) and on newer machines are only
1349 | // slightly faster.  however:
1350 | //   * vectorized versions are competitive
1351 | //   * faster full versions are competitive
1352 | 
// Approximates sin (x) for x in [-pi, pi].
//
// A parabola y = (4/pi) x - (4/pi^2) x |x| is evaluated first; it is
// zero at 0 and +-pi and reaches +-1 at +-pi/2.  The result is
// q * y + y^2 * (p + y^2 * (r + y^2 * s)), where the coefficients of the
// even powers take the sign of x so that the whole expression stays odd.
static inline float
fastsin (float x)
{
  static const float fouroverpi = 1.2732395447351627f;
  static const float fouroverpisq = 0.40528473456935109f;
  static const float q = 0.78444488374548933f;

  union { float f; uint32_t i; } c2 = { 0.20363937680730309f };
  union { float f; uint32_t i; } c4 = { 0.015124940802184233f };
  union { float f; uint32_t i; } c6 = { -0.0032225901625579573f };

  // Separate x into its sign bit and its magnitude |x|.
  union { float f; uint32_t i; } mag = { x };
  const uint32_t signbit = mag.i & 0x80000000;
  mag.i &= 0x7FFFFFFF;

  const float y = fouroverpi * x - fouroverpisq * x * mag.f;
  const float ysq = y * y;

  // c2 and c4 are positive, so OR-ing copies the sign of x onto them;
  // c6 is negative, so XOR flips it to positive when x is negative.
  c2.i |= signbit;
  c4.i |= signbit;
  c6.i ^= signbit;

  return q * y + ysq * (c2.f + ysq * (c4.f + ysq * c6.f));
}
1376 | 
// Coarser approximation of sin (x) for x in [-pi, pi].
//
// Evaluates the parabola y = (4/pi) x - (4/pi^2) x |x| and returns
// y * (q + p * y), where p carries the sign of x so the result is odd.
static inline float
fastersin (float x)
{
  static const float fouroverpi = 1.2732395447351627f;
  static const float fouroverpisq = 0.40528473456935109f;
  static const float q = 0.77633023248007499f;

  union { float f; uint32_t i; } blend = { 0.22308510060189463f };

  // Separate x into its sign bit and its magnitude |x|.
  union { float f; uint32_t i; } mag = { x };
  const uint32_t signbit = mag.i & 0x80000000;
  mag.i = mag.i & 0x7FFFFFFF;

  const float y = fouroverpi * x - fouroverpisq * x * mag.f;

  // blend is positive, so OR-ing copies the sign of x onto it.
  blend.i |= signbit;

  return y * (q + blend.f * y);
}
1395 | 
// Approximates sin (x) for inputs outside [-pi, pi] by range reduction
// followed by fastsin.
//
// With k = trunc (x / 2pi), the reduced argument is
// (k +- 1/2) * 2pi - x = +-pi + 2pi k - x, whose sine equals sin (x).
// The float-to-int conversion is only defined while x / 2pi fits in an
// int.
static inline float
fastsinfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  // Whole turns contained in x, truncated toward zero.
  const int turns = (int) (x * invtwopi);

  float half;
  if (x < 0)
    {
      half = -0.5f;
    }
  else
    {
      half = 0.5f;
    }

  return fastsin ((half + turns) * twopi - x);
}
1406 | 
// Coarser approximation of sin (x) for inputs outside [-pi, pi]: range
// reduction followed by fastersin.
//
// With k = trunc (x / 2pi), the reduced argument is
// (k +- 1/2) * 2pi - x = +-pi + 2pi k - x, whose sine equals sin (x).
// The float-to-int conversion is only defined while x / 2pi fits in an
// int.
static inline float
fastersinfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  // Whole turns contained in x, truncated toward zero.
  const int turns = (int) (x * invtwopi);

  float half;
  if (x < 0)
    {
      half = -0.5f;
    }
  else
    {
      half = 0.5f;
    }

  return fastersin ((half + turns) * twopi - x);
}
1417 | 
// Approximates cos (x) for x in [-pi, pi] as fastsin of a shifted
// argument.
//
// The shift is +pi/2, except for x above pi/2 where pi/2 - 2pi is used
// instead so that the argument handed to fastsin stays within [-pi, pi].
static inline float
fastcos (float x)
{
  static const float halfpi = 1.5707963267948966f;
  static const float halfpiminustwopi = -4.7123889803846899f;

  if (x > halfpi)
    {
      return fastsin (x + halfpiminustwopi);
    }

  return fastsin (x + halfpi);
}
1426 | 
// Coarser approximation of cos (x) for x in [-pi, pi].
//
// Starts from the line y = 1 - (2/pi) |x|, which is 1 at 0, 0 at +-pi/2
// and -1 at +-pi, and adds the cubic correction p * y * (1 - y^2).
static inline float
fastercos (float x)
{
  static const float twooverpi = 0.63661977236758134f;
  static const float p = 0.54641335845679634f;

  // Clear the sign bit to obtain |x|.
  union { float f; uint32_t i; } mag = { x };
  mag.i = mag.i & 0x7FFFFFFF;

  const float y = 1.0f - twooverpi * mag.f;

  return y + p * y * (1.0f - y * y);
}
1440 | 
// Approximates cos (x) for inputs outside [-pi, pi] as
// fastsinfull (x + pi/2).
static inline float
fastcosfull (float x)
{
  static const float halfpi = 1.5707963267948966f;
  const float shifted = x + halfpi;

  return fastsinfull (shifted);
}
1447 | 
// Coarser approximation of cos (x) for inputs outside [-pi, pi],
// computed as fastersinfull (x + pi/2).
static inline float
fastercosfull (float x)
{
  static const float halfpi = 1.5707963267948966f;
  const float shifted = x + halfpi;

  return fastersinfull (shifted);
}
1454 | 
// Approximates tan (x) for x in [-pi/2, pi/2] as the ratio
// fastsin (x) / fastsin (x + pi/2); the shifted sine serves as cosine.
static inline float
fasttan (float x)
{
  static const float halfpi = 1.5707963267948966f;

  const float sine = fastsin (x);
  const float cosine = fastsin (x + halfpi);

  return sine / cosine;
}
1461 | 
// Coarser approximation of tan (x) as fastersin (x) / fastercos (x).
static inline float
fastertan (float x)
{
  const float sine = fastersin (x);
  const float cosine = fastercos (x);

  return sine / cosine;
}
1467 | 
// Approximates tan (x) for inputs outside [-pi/2, pi/2].
//
// With k = trunc (x / 2pi), the argument is reduced to
// x - (k +- 1/2) * 2pi = x -+ pi - 2pi k, which has the same tangent as
// x, and the ratio fastsin / fastcos is taken there.  The float-to-int
// conversion is only defined while x / 2pi fits in an int.
static inline float
fasttanfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  // Whole turns contained in x, truncated toward zero.
  const int turns = (int) (x * invtwopi);

  float half;
  if (x < 0)
    {
      half = -0.5f;
    }
  else
    {
      half = 0.5f;
    }

  const float reduced = x - (half + turns) * twopi;

  return fastsin (reduced) / fastcos (reduced);
}
1480 | 
// Coarser approximation of tan (x) for inputs outside [-pi/2, pi/2].
//
// With k = trunc (x / 2pi), the argument is reduced to
// x - (k +- 1/2) * 2pi = x -+ pi - 2pi k, which has the same tangent as
// x, and the ratio fastersin / fastercos is taken there.  The
// float-to-int conversion is only defined while x / 2pi fits in an int.
static inline float
fastertanfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  // Whole turns contained in x, truncated toward zero.
  const int turns = (int) (x * invtwopi);

  float half;
  if (x < 0)
    {
      half = -0.5f;
    }
  else
    {
      half = 0.5f;
    }

  const float reduced = x - (half + turns) * twopi;

  return fastersin (reduced) / fastercos (reduced);
}
1493 | 
1494 | #ifdef __SSE2__
1495 | 
1496 | static inline v4sf
1497 | vfastsin (const v4sf x)
1498 | {
1499 |   const v4sf fouroverpi = v4sfl (1.2732395447351627f);
1500 |   const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
1501 |   const v4sf q = v4sfl (0.78444488374548933f);
1502 |   const v4sf p = v4sfl (0.20363937680730309f);
1503 |   const v4sf r = v4sfl (0.015124940802184233f);
1504 |   const v4sf s = v4sfl (-0.0032225901625579573f);
1505 | 
1506 |   union { v4sf f; v4si i; } vx = { x };
1507 |   v4si sign = vx.i & v4sil (0x80000000);
1508 |   vx.i &= v4sil (0x7FFFFFFF);
1509 | 
1510 |   v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
1511 |   v4sf qpproxsq = qpprox * qpprox;
1512 |   union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s));
1513 |   vy.i ^= sign;
1514 | 
1515 |   return q * qpprox + vy.f;
1516 | }
1517 | 
1518 | static inline v4sf
1519 | vfastersin (const v4sf x)
1520 | {
1521 |   const v4sf fouroverpi = v4sfl (1.2732395447351627f);
1522 |   const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
1523 |   const v4sf q = v4sfl (0.77633023248007499f);
1524 |   const v4sf plit = v4sfl (0.22308510060189463f);
1525 |   union { v4sf f; v4si i; } p = { plit };
1526 | 
1527 |   union { v4sf f; v4si i; } vx = { x };
1528 |   v4si sign = vx.i & v4sil (0x80000000);
1529 |   vx.i &= v4sil (0x7FFFFFFF);
1530 | 
1531 |   v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
1532 | 
1533 |   p.i |= sign;
1534 | 
1535 |   return qpprox * (q + p.f * qpprox);
1536 | }
1537 | 
1538 | static inline v4sf
1539 | vfastsinfull (const v4sf x)
1540 | {
1541 |   const v4sf twopi = v4sfl (6.2831853071795865f);
1542 |   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1543 | 
1544 |   v4si k = v4sf_to_v4si (x * invtwopi);
1545 | 
1546 |   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1547 |   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1548 |                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1549 | 
1550 |   return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x);
1551 | }
1552 | 
1553 | static inline v4sf
1554 | vfastersinfull (const v4sf x)
1555 | {
1556 |   const v4sf twopi = v4sfl (6.2831853071795865f);
1557 |   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1558 | 
1559 |   v4si k = v4sf_to_v4si (x * invtwopi);
1560 | 
1561 |   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1562 |   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1563 |                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1564 | 
1565 |   return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x);
1566 | }
1567 | 
1568 | static inline v4sf
1569 | vfastcos (const v4sf x)
1570 | {
1571 |   const v4sf halfpi = v4sfl (1.5707963267948966f);
1572 |   const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f);
1573 |   v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi);
1574 |   v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi),
1575 |                            _mm_andnot_ps (lthalfpi, halfpi));
1576 |   return vfastsin (x + offset);
1577 | }
1578 | 
1579 | static inline v4sf
1580 | vfastercos (v4sf x)
1581 | {
1582 |   const v4sf twooverpi = v4sfl (0.63661977236758134f);
1583 |   const v4sf p = v4sfl (0.54641335845679634);
1584 | 
1585 |   v4sf vx = v4sf_fabs (x);
1586 |   v4sf qpprox = v4sfl (1.0f) - twooverpi * vx;
1587 | 
1588 |   return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox);
1589 | }
1590 | 
1591 | static inline v4sf
1592 | vfastcosfull (const v4sf x)
1593 | {
1594 |   const v4sf halfpi = v4sfl (1.5707963267948966f);
1595 |   return vfastsinfull (x + halfpi);
1596 | }
1597 | 
1598 | static inline v4sf
1599 | vfastercosfull (const v4sf x)
1600 | {
1601 |   const v4sf halfpi = v4sfl (1.5707963267948966f);
1602 |   return vfastersinfull (x + halfpi);
1603 | }
1604 | 
1605 | static inline v4sf
1606 | vfasttan (const v4sf x)
1607 | {
1608 |   const v4sf halfpi = v4sfl (1.5707963267948966f);
1609 |   return vfastsin (x) / vfastsin (x + halfpi);
1610 | }
1611 | 
1612 | static inline v4sf
1613 | vfastertan (const v4sf x)
1614 | {
1615 |   return vfastersin (x) / vfastercos (x);
1616 | }
1617 | 
1618 | static inline v4sf
1619 | vfasttanfull (const v4sf x)
1620 | {
1621 |   const v4sf twopi = v4sfl (6.2831853071795865f);
1622 |   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1623 | 
1624 |   v4si k = v4sf_to_v4si (x * invtwopi);
1625 | 
1626 |   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1627 |   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1628 |                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1629 |   v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;
1630 | 
1631 |   return vfastsin (xnew) / vfastcos (xnew);
1632 | }
1633 | 
1634 | static inline v4sf
1635 | vfastertanfull (const v4sf x)
1636 | {
1637 |   const v4sf twopi = v4sfl (6.2831853071795865f);
1638 |   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1639 | 
1640 |   v4si k = v4sf_to_v4si (x * invtwopi);
1641 | 
1642 |   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1643 |   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1644 |                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1645 |   v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;
1646 | 
1647 |   return vfastersin (xnew) / vfastercos (xnew);
1648 | }
1649 | 
1650 | #endif //__SSE2__
1651 | 
1652 | #endif // __FAST_TRIG_H_
1653 | 


--------------------------------------------------------------------------------