├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── ford.yml
├── fpm.rsp
├── fpm.toml
├── src
    ├── fast_dotp.f90
    ├── fast_erf.f90
    ├── fast_log.f90
    ├── fast_math.f90
    ├── fast_rsqrt.f90
    ├── fast_sum.f90
    ├── fast_tanh.f90
    ├── fast_trigo.f90
    └── utilities
    │   ├── nvidia_shift.inc
    │   ├── nvidia_shift_interface.inc
    │   ├── vkahans.inc
    │   └── vkahans_m.inc
└── test
    └── test_fast_math.f90


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: ci
 2 | # Builds the package and runs `fpm test` for every OS / compiler combination below.
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   test:
 7 |     runs-on: ${{ matrix.os }}
 8 |     strategy:
 9 |       fail-fast: false  # let the remaining matrix jobs finish when one of them fails
10 |       matrix:
11 |         os: [ubuntu-latest, macos-latest, windows-latest]
12 |         toolchain:
13 |           - {compiler: gcc, version: 13, flags: ['-O3 -march=native -mtune=native -flto']}
14 |           - {compiler: intel, version: '2025.0', flags: ['-O3 -xhost']}
15 |           - {compiler: intel-classic, version: '2021.10', flags: ['-O3 -xhost']}
16 |           - {compiler: nvidia-hpc, version: '25.1', flags: ['-Mpreprocess -Ofast']}
17 |         include:
18 |           - os: ubuntu-latest
19 |             toolchain: {compiler: gcc, version: 12, flags: ['-O3 -march=native -mtune=native -flto']}
20 |         exclude:
21 |           - os: macos-latest
22 |             toolchain: {compiler: intel, version: '2025.0'}
23 |           - os: macos-latest
24 |             toolchain: {compiler: nvidia-hpc, version: '25.1'}
25 |           - os: windows-latest
26 |             toolchain: {compiler: nvidia-hpc, version: '25.1'}
27 | 
28 |     steps:
29 |       - name: Checkout code
30 |         uses: actions/checkout@v4  # maintained major release (replaces the long-superseded v1)
31 |       # Installs the compiler selected by the matrix entry; FC, CC and CXX are exported for later steps.
32 |       - uses: fortran-lang/setup-fortran@main
33 |         id: setup-fortran
34 |         with:
35 |           compiler: ${{ matrix.toolchain.compiler }}
36 |           version: ${{ matrix.toolchain.version }}
37 | 
38 |       - name: Setup Fortran Package Manager
39 |         uses: fortran-lang/setup-fpm@v5
40 |         with:
41 |           github-token: ${{ secrets.GITHUB_TOKEN }}
42 |       # macOS only: reinstall gcc@13 and link its gfortran/quadmath runtime libraries into /usr/local/lib.
43 |       - name: Setup Fortran on MacOS
44 |         if: contains( matrix.os, 'macos')
45 |         run: |
46 |           brew reinstall gcc@13
47 |           ln -s /usr/local/lib/gcc/13/libgfortran.5.dylib /usr/local/lib/
48 |           ln -s /usr/local/lib/gcc/13/libquadmath.0.dylib /usr/local/lib/
49 |       # Run the test-suite with the matrix compiler and its optimisation flags.
50 |       - run: |
51 |           fpm test --compiler ${{ env.FC }} --c-compiler ${{ env.CC }} --cxx-compiler ${{ env.CXX }} --flag "${{ join(matrix.toolchain.flags, ' ') }}"
52 |     #      ${{ env.FC }} ... # environment vars FC, CC, and CXX are set
53 |     #      ${{ steps.setup-fortran.outputs.fc }} ... # outputs work too
54 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | doc


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to contribute
 2 | 
 3 | This project is FOSS (free and open source software); therefore, anyone interested in using it or contributing to it is welcome.
 4 | 
 5 | * If you have a nice fast function that can be faster than an intrinsic within an "acceptable" tolerance.
 6 | * If you have an idea to make the code/documentation better looking and easier to read (we are striving for the [Fortran stdlib style guide](https://github.com/fortran-lang/stdlib/blob/HEAD/STYLE_GUIDE.md) but not there yet)
 7 | * If you have an idea to make the unit tests even more robust
 8 | 
 9 | Please go ahead and open a discussion or create a PR.
10 | 
11 | # Basic rules:
12 | * Remember, this is an open-source project! So be gentle and patient (we know, it is all about fast functions, but for the sake of our intellectual curiosity and helping our work).
13 | * Be polite, especially when disagreeing!
14 | * Enjoy number crunching coding with Modern Fortran :)


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | From Transvalor S.A.
 4 | Copyright (c) 2023-present José R. Alves Z.
 5 | 
 6 | From Federico Perini
 7 | Copyright (c) 2016-2022 Federico Perini <perini@wisc.edu>
 8 | 
 9 | Permission is hereby granted, free of charge, to any person obtaining a copy
10 | of this software and associated documentation files (the "Software"), to deal
11 | in the Software without restriction, including without limitation the rights
12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | copies of the Software, and to permit persons to whom the Software is
14 | furnished to do so, subject to the following conditions:
15 | 
16 | The above copyright notice and this permission notice shall be included in all
17 | copies or substantial portions of the Software.
18 | 
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![DOI](https://zenodo.org/badge/681533852.svg)](https://zenodo.org/badge/latestdoi/681533852)
  2 | # Fortran Fast math
  3 | A collection of functions for fast number crunching using Fortran.
  4 | 
  5 | In order to get the maximum performance out of this library, compile with "-O3 -march=native -flto" (or equivalent). Note: For the elemental functions, inlining is key to extracting maximum performance. It can be achieved either with the `-flto` (gcc) / `-ipo` (intel) flag or by using the `include` mechanism.
  6 | 
  7 | # Available functions
  8 | 
  9 | | function | name(s)               | shapes     | types            | 
 10 | |----------|-----------------------|------------|------------------|
 11 | | sum      | `fsum` `fsum_kahan`(1) |        `1d`|`real32` `real64` |
 12 | | dot      | `fprod` `fprod_kahan`(2)|        `1d`|`real32` `real64` |
 13 | | cos      | `fcos`                | `elemental`|`real32` `real64` |
 14 | | sin      | `fsin`                | `elemental`|`real32` `real64` |
 15 | | tan      | `ftan`                | `elemental`|`real32` `real64` |
 16 | | tanh     | `ftanh`               | `elemental`|`real32` `real64` |
 17 | | acos     | `facos`               | `elemental`|`real32` `real64` |
 18 | | atan     | `fatan`               | `elemental`|`real32` `real64` |
 19 | | erf      | `ferf`                | `elemental`|`real32` `real64` |
 20 | | log      | `flog_p3` `flog_p5`   | `elemental`|         `real64` |
 21 | | rsqrt(3) | `frsqrt`              | `elemental`|`real32` `real64` |
 22 | 
 23 | * (1) fast (and precise) sum for 1D arrays - possibility of including a mask.
 24 |     `fsum`: the fastest method and, at worst, as precise as or one order of magnitude more precise than the intrinsic sum. It groups chunks of values in a temporary working batch which is summed up once at the end.
 25 |     `fsum_kahan`: the highest precision. Its precision is close to that of a sum carried out in the next higher precision (real64 for real32 data, real128 for real64 data). It also uses the chunk principle, with an elemental Kahan operator applied on top.
 26 | 
 27 | * (2) fast (and precise) dot product for 1D arrays - possibility of including a 3rd weighting array.
 28 |     `fprod`: the fastest method and, at worst, one order of magnitude more precise than the intrinsic dot_product. It can run between 3X and 8X faster than the intrinsic. It groups chunks of products in a temporary working batch which is summed up once at the end (based on `fsum`).
 29 |     `fprod_kahan`: Same idea as `fsum_kahan` but on top of chunked products.
 30 | * (3) rsqrt: reciprocal square root $f(x)=1/\sqrt{x}$
 31 | # API documentation
 32 | 
 33 | To generate the API documentation for `fast_math` using
 34 | [ford](https://github.com/Fortran-FOSS-Programmers/ford) run the following
 35 | command:
 36 | 
 37 | ```shell
 38 | ford ford.yml
 39 | ```
 40 | 
 41 | # TODO
 42 | * Contribution guidelines
 43 | * Polish autodoc
 44 | 
 45 | # Elapsed time examples and precision
 46 | Warning: The following values are only references, meant to show how much the results can differ between compilers. Actual speed-ups (or slow-downs) should be measured under the true conditions of use to account for inlining (or the lack of it), etc.
 47 | <details>
 48 | <summary>(Click to unfold) Windows gfortran 14.1 > fpm test --flag "-O3 -march=native -mtune=native"</summary>
 49 | CPU: Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz   1.99 GHz
 50 | 
 51 | |      sum r32 | <time> [ns/eval] | Speed-Up | relative error  |
 52 | |--------------|------------------|----------|-----------------|
 53 | |    intrinsic |           1.2100 |     1.00 |      3.3794E-06 |
 54 | |        kahan |           0.1800 |     6.72 |      1.0425E-07 |
 55 | |        chunk |           0.1100 |    11.00 |      1.1265E-07 |
 56 |  
 57 | |      sum r64 | <time> [ns/eval] | Speed-Up | relative error  |
 58 | |--------------|------------------|----------|-----------------|
 59 | |    intrinsic |           1.3000 |     1.00 |      5.9269E-15 |
 60 | |        kahan |           0.3100 |     4.19 |      1.7286E-16 |
 61 | |        chunk |           0.1500 |     8.67 |      2.1416E-16 |
 62 |  
 63 | | sum r32 mask | <time> [ns/eval] | Speed-Up | relative error  |
 64 | |--------------|------------------|----------|-----------------|
 65 | |    intrinsic |           4.1250 |     1.00 |      1.5687E-06 |
 66 | |        kahan |           0.1600 |    25.78 |      9.1493E-08 |
 67 | |        chunk |           0.1600 |    25.78 |      8.8453E-08 |
 68 |  
 69 | | sum r64 mask | <time> [ns/eval] | Speed-Up | relative error  |
 70 | |--------------|------------------|----------|-----------------|
 71 | |    intrinsic |           4.0350 |     1.00 |      2.9428E-15 |
 72 | |        kahan |           0.3750 |    10.76 |      1.2179E-16 |
 73 | |        chunk |           0.2450 |    16.47 |      1.2768E-16 |
 74 |  
 75 | |      dot r32 | <time> [ns/eval] | Speed-Up | relative error  |
 76 | |--------------|------------------|----------|-----------------|
 77 | |    intrinsic |           1.0600 |     1.00 |      3.2735E-06 |
 78 | |        kahan |           0.1500 |     7.07 |      9.8348E-08 |
 79 | |        chunk |           0.1000 |    10.60 |      1.1587E-07 |
 80 | 
 81 | |      dot r64 | <time> [ns/eval] | Speed-Up | relative error  |
 82 | |--------------|------------------|----------|-----------------|
 83 | |    intrinsic |           1.2100 |     1.00 |      5.8091E-15 |
 84 | |        kahan |           0.3300 |     3.67 |      1.8407E-16 |
 85 | |        chunk |           0.2000 |     6.05 |      2.0528E-16 |
 86 | 
 87 | |        trigo | <time> [ns/eval] | Speed-Up | relative error  |
 88 | |--------------|------------------|----------|-----------------|
 89 | |     fsin r32 |           2.8840 |    13.82 |      3.4749E-07 |
 90 | |     fsin r64 |           3.1040 |    12.17 |      4.0784E-16 |
 91 | |    facos r32 |           1.6600 |    28.64 |      2.9135E-05 | 
 92 | |    facos r64 |           1.6800 |     6.89 |      2.9274E-14 | 
 93 | |    fatan r32 |           1.6720 |    23.36 |      1.7730E-06 | 
 94 | |    fatan r64 |           2.5120 |     3.94 |      6.6869E-06 | 
 95 | 
 96 | |       hyperb | <time> [ns/eval] | Speed-Up | relative error  |
 97 | |--------------|------------------|----------|-----------------|
 98 | |    ftanh r32 |           2.1640 |     8.61 |      5.9480E-08 | 
 99 | |    ftanh r64 |           2.3480 |     7.16 |      1.3282E-09 | 
100 | |     ferf r32 |           2.3600 |    27.21 |      7.9573E-08 | 
101 | |     ferf r64 |           4.1200 |    15.60 |      9.6298E-08 | 
102 | 
103 | |        rsqrt | <time> [ns/eval] | Speed-Up | relative error  |
104 | |--------------|------------------|----------|-----------------|
105 | |   frsqrt r32 |           1.7720 |     0.26 |      9.4039E-04 | 
106 | |   frsqrt r64 |           2.2280 |     0.64 |      8.9297E-04 | 
107 | </details>
108 | 
109 | <details>
110 | <summary>(Click to unfold) Windows ifx 2025.0.4 > fpm test --flag "/O3 /Qxhost"</summary>
111 | CPU: Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz   1.99 GHz
112 | 
113 | |      sum r32 | <time> [ns/eval] | Speed-Up | relative error  |
114 | |--------------|------------------|----------|-----------------|
115 | |    intrinsic |           0.4300 |     1.00 |      3.8308E-07 |
116 | |        kahan |           0.1700 |     2.53 |      6.0938E-08 |
117 | |        chunk |           0.0100 |    43.00 |      6.0938E-08 |
118 |  
119 | |      sum r64 | <time> [ns/eval] | Speed-Up | relative error  |
120 | |--------------|------------------|----------|-----------------|
121 | |    intrinsic |           0.3500 |     1.00 |      1.5061E-15 |
122 | |        kahan |           0.1800 |     1.94 |      1.3033E-16 |
123 | |        chunk |           0.0200 |    17.50 |      1.3886E-16 |
124 |  
125 | | sum r32 mask | <time> [ns/eval] | Speed-Up | relative error  |
126 | |--------------|------------------|----------|-----------------|
127 | |    intrinsic |           0.3000 |     1.00 |      2.0369E-07 |
128 | |        kahan |           0.2200 |     1.36 |      5.2360E-08 |
129 | |        chunk |           0.1750 |     1.71 |      5.2515E-08 |
130 |  
131 | | sum r64 mask | <time> [ns/eval] | Speed-Up | relative error  |
132 | |--------------|------------------|----------|-----------------|
133 | |    intrinsic |           0.3500 |     1.00 |      3.7423E-16 |
134 | |        kahan |           0.2900 |     1.21 |      8.3862E-17 |
135 | |        chunk |           0.2800 |     1.25 |      9.4422E-17 |
136 |  
137 | |      dot r32 | <time> [ns/eval] | Speed-Up | relative error  |
138 | |--------------|------------------|----------|-----------------|
139 | |    intrinsic |           0.3400 |     1.00 |      3.9539E-07 |
140 | |        kahan |           0.1600 |     2.12 |      6.7639E-08 |
141 | |        chunk |           0.1600 |     2.12 |      6.6906E-08 |
142 |  
143 | |      dot r64 | <time> [ns/eval] | Speed-Up | relative error  |
144 | |--------------|------------------|----------|-----------------|
145 | |    intrinsic |           0.7100 |     1.00 |      1.4730E-15 |
146 | |        kahan |           0.1500 |     4.73 |      1.2270E-16 |
147 | |        chunk |           0.1700 |     4.18 |      1.2459E-16 |
148 | 
149 | |        trigo | <time> [ns/eval] | Speed-Up | relative error  |
150 | |--------------|------------------|----------|-----------------|
151 | |     fsin r32 |           3.0960 |     0.26 |      2.0412E-08 | 
152 | |     fsin r64 |           2.7080 |     1.01 |      3.5190E-17 | 
153 | |    facos r32 |           1.6440 |     0.46 |      1.3946E-05 | 
154 | |    facos r64 |           1.7560 |     1.51 |      2.0708E-11 | 
155 | |    fatan r32 |           2.6880 |     0.28 |      4.4950E-06 | 
156 | |    fatan r64 |           1.9000 |     1.73 |      6.6869E-06 | 
157 | 
158 | |       hyperb | <time> [ns/eval] | Speed-Up | relative error  |
159 | |--------------|------------------|----------|-----------------|
160 | |    ftanh r32 |           2.3200 |     0.48 |      1.0284E-08 | 
161 | |    ftanh r64 |           2.3080 |     2.19 |      1.3282E-09 | 
162 | |     ferf r32 |           3.3160 |     0.23 |      7.5974E-07 | 
163 | |     ferf r64 |           2.9760 |     0.89 |      9.6298E-08 | 
164 | 
165 | |        rsqrt | <time> [ns/eval] | Speed-Up | relative error  |
166 | |--------------|------------------|----------|-----------------|
167 | |   frsqrt r32 |           1.7280 |     0.21 |      9.4033E-04 | 
168 | |   frsqrt r64 |           1.6520 |     0.90 |      8.7360E-04 |
169 | </details>
170 | 
171 | <details>
172 | <summary>(Click to unfold) WSL2 nvfortran 24.3 > fpm test --flag "-Mpreprocess -fast"</summary>
173 | CPU: Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz   1.99 GHz
174 | 
175 | |      sum r32 | <time> [ns/eval] | Speed-Up | relative error  |
176 | |--------------|------------------|----------|-----------------|
177 | |    intrinsic |           0.2100 |     1.00 |      1.1295E-07 |
178 | |        kahan |           0.3200 |     0.66 |      9.8169E-08 |
179 | |        chunk |           0.1400 |     1.50 |      7.1764E-08 |
180 |  
181 | |      sum r64 | <time> [ns/eval] | Speed-Up | relative error  |
182 | |--------------|------------------|----------|-----------------|
183 | |    intrinsic |           0.3300 |     1.00 |      3.8969E-16 |
184 | |        kahan |           0.3200 |     1.03 |      1.8086E-16 |
185 | |        chunk |           0.2200 |     1.50 |      9.0372E-17 |
186 |  
187 | | sum r32 mask | <time> [ns/eval] | Speed-Up | relative error  |
188 | |--------------|------------------|----------|-----------------|
189 | |    intrinsic |           0.2400 |     1.00 |      2.0742E-07 |
190 | |        kahan |           0.3050 |     0.79 |      8.9645E-08 |
191 | |        chunk |           0.1550 |     1.55 |      5.8651E-08 |
192 |  
193 | | sum r64 mask | <time> [ns/eval] | Speed-Up | relative error  |
194 | |--------------|------------------|----------|-----------------|
195 | |    intrinsic |           0.4150 |     1.00 |      3.8136E-16 |
196 | |        kahan |           0.5000 |     0.83 |      1.2734E-16 |
197 | |        chunk |           0.2850 |     1.46 |      2.4869E-17 |
198 |  
199 | |      dot r32 | <time> [ns/eval] | Speed-Up | relative error  |
200 | |--------------|------------------|----------|-----------------|
201 | |    intrinsic |           0.2500 |     1.00 |      1.1426E-07 |
202 | |        kahan |           0.2600 |     0.96 |      9.7811E-08 |
203 | |        chunk |           0.1400 |     1.79 |      7.2122E-08 |
204 |  
205 | |      dot r64 | <time> [ns/eval] | Speed-Up | relative error  |
206 | |--------------|------------------|----------|-----------------|
207 | |    intrinsic |           0.2600 |     1.00 |      3.9246E-16 |
208 | |        kahan |           0.3800 |     0.68 |      1.9229E-16 |
209 | |        chunk |           0.1900 |     1.37 |      9.0927E-17 |
210 | 
211 | |        trigo | <time> [ns/eval] | Speed-Up | relative error  |
212 | |--------------|------------------|----------|-----------------|
213 | |     fsin r32 |           0.0600 |   190.80 |      1.0325E-07 | 
214 | |     fsin r64 |           0.0320 |   357.25 |      5.0118E-17 | 
215 | |    facos r32 |           0.0280 |   221.43 |      1.0563E-06 | 
216 | |    facos r64 |           0.0160 |   546.75 |      3.7996E-15 | 
217 | |    fatan r32 |           0.0240 |   300.50 |      5.4993E-06 | 
218 | |    fatan r64 |           0.0400 |   244.40 |      6.6869E-06 | 
219 | 
220 | |       hyperb | <time> [ns/eval] | Speed-Up | relative error  |
221 | |--------------|------------------|----------|-----------------|
222 | |    ftanh r32 |           0.0280 |   510.71 |      5.5308E-08 | 
223 | |    ftanh r64 |           0.0360 |   348.56 |      1.3282E-09 | 
224 | |     ferf r32 |           0.0400 |   496.90 |      9.1205E-08 | 
225 | |     ferf r64 |           0.0360 |   532.44 |      9.6298E-08 | 
226 | 
227 | |        rsqrt | <time> [ns/eval] | Speed-Up | relative error  |
228 | |--------------|------------------|----------|-----------------|
229 | |   frsqrt r32 |          16.3120 |     0.03 |      9.4387E-04 | 
230 | |   frsqrt r64 |          16.7680 |     0.11 |      8.6745E-04 |
231 | </details>
232 | 
233 | # Acknowledgement
234 | 
235 | * Compilation of this library was possible thanks to [Transvalor S.A.](https://www.transvalor.com/en/homepage) research activities. 
236 | * Part of this library is based on the work of [Perini and Reitz](https://doi.org/10.1016/j.combustflame.2018.04.013), which was funded through the Sandia National Laboratories by the U.S. Department of Energy, Office of Vehicle Technologies, program managers Leo Breton and Gurpreet Singh.
237 | * The [fortran lang community](https://fortran-lang.discourse.group/) discussions such as [Some Intrinsic SUMS](https://fortran-lang.discourse.group/t/some-intrinsic-sums/5760) and [fastGPT](https://fortran-lang.discourse.group/t/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/5385)
238 | 
239 | Contribution of open-source developers:
240 | 
241 | [jalvesz](https://github.com/jalvesz)
242 | 
243 | [perazz](https://github.com/perazz)
244 | 


--------------------------------------------------------------------------------
/ford.yml:
--------------------------------------------------------------------------------
 1 | project:            Fast math collection library
 2 | summary:            Fast sum, dot products, trigonometric functions and more
 3 | author:             José Alves
 4 | date:               May 15, 2023
 5 | preprocess:         False
 6 | incl_src:           False
 7 | include:            ./src
 8 | src_dir:            ./src
 9 | output_dir:         doc
10 | 
11 | {!README.md!}


--------------------------------------------------------------------------------
/fpm.rsp:
--------------------------------------------------------------------------------
 1 | @testgfortran
 2 | option test --compiler gfortran --flag "-O3 -march=native -mtune=native"
 3 | 
 4 | @wtestifort
 5 | option test --compiler ifort --flag "/O3 /Qxhost"
 6 | 
 7 | @ltestifort
 8 | option test --compiler ifort --flag "-O3 -xhost"
 9 | 
10 | @wtestifx
11 | option test --compiler ifx --flag "/O3 /Qxhost"
12 | 
13 | @ltestifx
14 | option test --compiler ifx --flag "-O3 -xhost"
15 | 
16 | @testnvfortran
17 | option test --compiler nvfortran --flag "-Mpreprocess -fast"


--------------------------------------------------------------------------------
/fpm.toml:
--------------------------------------------------------------------------------
 1 | name = "fast_math"
 2 | version = "0.1.0"
 3 | license = "MIT" # SPDX identifier; matches LICENSE.md and the SPDX-License-Identifier headers in src/
 4 | author = "Jose Alves"
 5 | maintainer = "jose.alves@transvalor.com"
 6 | copyright = "Copyright 2023, Jose Alves"
 7 | 
 8 | [build]
 9 | auto-executables = true
10 | auto-tests = true
11 | auto-examples = true
12 | module-naming = false
13 | 
14 | [install]
15 | library = false
16 | 
17 | [fortran]
18 | implicit-typing = false
19 | implicit-external = false
20 | source-form = "free"
21 | 
22 | [preprocess]
23 | [preprocess.cpp]
24 | suffixes = [".f90"]
25 | 
26 | [dev-dependencies]
27 | test-drive.git = "https://github.com/fortran-lang/test-drive.git"
28 | test-drive.tag = "v0.4.0"


--------------------------------------------------------------------------------
/src/fast_dotp.f90:
--------------------------------------------------------------------------------
  1 | !
  2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A.
  3 | !
  4 | ! SPDX-License-Identifier: MIT
  5 | !
  6 | module fast_dotp
  7 |     !! A faster and more accurate implementation of the dot_product intrinsic.
  8 |     !! It uses the same principle as fsum_chunk, but the products are formed locally per chunk so that they can be vectorized for faster summation.
  9 |     use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
 10 |     implicit none
 11 |     private
 12 |     ! Only the two generic names below are exported; every specific procedure stays module-private.
 13 |     public :: fprod, fprod_kahan
 14 |     integer, parameter :: chunk64 = 64 ! batch length used by the real64 (dp) procedures
 15 |     integer, parameter :: chunk32 = 64 ! batch length used by the real32 (sp) procedures
 16 |     ! Chunked dot product; the *_weighted specifics take a third array `w` multiplied into every term.
 17 |     interface fprod
 18 |         module procedure fprod_sp
 19 |         module procedure fprod_sp_weighted
 20 |         module procedure fprod_dp
 21 |         module procedure fprod_dp_weighted
 22 |     end interface
 23 |     ! Same argument lists as fprod, but the terms are accumulated through the vkahans operator.
 24 |     interface fprod_kahan
 25 |         module procedure fprod_kahan_sp
 26 |         module procedure fprod_kahan_sp_weighted
 27 |         module procedure fprod_kahan_dp
 28 |         module procedure fprod_kahan_dp_weighted
 29 |     end interface
 30 |     ! Module-private generic over the two elemental subroutines built from utilities/vkahans.inc.
 31 |     interface vkahans
 32 |       module procedure vkahans_sp
 33 |       module procedure vkahans_dp
 34 |     end interface
 35 |     ! ---------------------------------------------------------------------------------------------
 36 |     contains
 37 | 
 38 |     pure function fprod_sp(a,b) result(p)
 39 |         !! Dot product of two real32 vectors, accumulated lane-wise in a fixed-length batch.
 40 |         integer, parameter :: wp = sp
 41 |         integer, parameter :: chunk = chunk32
 42 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
 43 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
 44 |         real(wp) :: p                !! accumulated sum of the products
 45 |         real(wp) :: lanes(chunk)     ! running partial sums, one per lane
 46 |         integer :: k, total, head
 47 |         total = size(a)
 48 |         head  = mod(total,chunk)
 49 |         ! The first `head` products seed the leading lanes; the other lanes start from zero.
 50 |         lanes(:head)   = a(:head)*b(:head)
 51 |         lanes(head+1:) = 0._wp
 52 |         ! What is left is a whole number of batches of `chunk` products.
 53 |         do k = head+1, total-head, chunk
 54 |             lanes = lanes + a(k:k+chunk-1)*b(k:k+chunk-1)
 55 |         end do
 56 |         ! Collapse the batch: lane k is added together with lane chunk/2+k.
 57 |         p = 0.0_wp
 58 |         do k = 1, chunk/2
 59 |             p = p + lanes(k)+lanes(chunk/2+k)
 60 |         end do
 61 |     end function
 62 |   
 63 |     pure function fprod_dp(a,b) result(p)
 64 |         !! Dot product of two real64 vectors, accumulated lane-wise in a fixed-length batch.
 65 |         integer, parameter :: wp = dp
 66 |         integer, parameter :: chunk = chunk64
 67 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
 68 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
 69 |         real(wp) :: p                !! accumulated sum of the products
 70 |         real(wp) :: lanes(chunk)     ! running partial sums, one per lane
 71 |         integer :: k, total, head
 72 |         total = size(a)
 73 |         head  = mod(total,chunk)
 74 |         ! The first `head` products seed the leading lanes; the other lanes start from zero.
 75 |         lanes(:head)   = a(:head)*b(:head)
 76 |         lanes(head+1:) = 0._wp
 77 |         ! What is left is a whole number of batches of `chunk` products.
 78 |         do k = head+1, total-head, chunk
 79 |             lanes = lanes + a(k:k+chunk-1)*b(k:k+chunk-1)
 80 |         end do
 81 |         ! Collapse the batch: lane k is added together with lane chunk/2+k.
 82 |         p = 0.0_wp
 83 |         do k = 1, chunk/2
 84 |             p = p + lanes(k)+lanes(chunk/2+k)
 85 |         end do
 86 |     end function
 87 |     
 88 |     pure function fprod_sp_weighted(a,b,w) result(p)
 89 |         !! Weighted real32 dot product: sum of a(i)*b(i)*w(i), accumulated lane-wise in a fixed-length batch.
 90 |         integer, parameter :: wp = sp
 91 |         integer, parameter :: chunk = chunk32
 92 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
 93 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
 94 |         real(wp), intent(in) :: w(:) !! weights, read over the same index range as `a`
 95 |         real(wp) :: p                !! accumulated sum of the weighted products
 96 |         real(wp) :: lanes(chunk)     ! running partial sums, one per lane
 97 |         integer :: k, total, head
 98 |         total = size(a)
 99 |         head  = mod(total,chunk)
100 |         ! The first `head` weighted products seed the leading lanes; the other lanes start from zero.
101 |         lanes(:head)   = a(:head)*b(:head)*w(:head)
102 |         lanes(head+1:) = 0._wp
103 |         ! What is left is a whole number of batches of `chunk` weighted products.
104 |         do k = head+1, total-head, chunk
105 |             lanes = lanes + a(k:k+chunk-1)*b(k:k+chunk-1)*w(k:k+chunk-1)
106 |         end do
107 |         ! Collapse the batch: lane k is added together with lane chunk/2+k.
108 |         p = 0.0_wp
109 |         do k = 1, chunk/2
110 |             p = p + lanes(k)+lanes(chunk/2+k)
111 |         end do
112 |     end function
113 |   
114 |     pure function fprod_dp_weighted(a,b,w) result(p)
115 |         !! Weighted real64 dot product: sum of a(i)*b(i)*w(i), accumulated lane-wise in a fixed-length batch.
116 |         integer, parameter :: wp = dp
117 |         integer, parameter :: chunk = chunk64
118 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
119 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
120 |         real(wp), intent(in) :: w(:) !! weights, read over the same index range as `a`
121 |         real(wp) :: p                !! accumulated sum of the weighted products
122 |         real(wp) :: lanes(chunk)     ! running partial sums, one per lane
123 |         integer :: k, total, head
124 |         total = size(a)
125 |         head  = mod(total,chunk)
126 |         ! The first `head` weighted products seed the leading lanes; the other lanes start from zero.
127 |         lanes(:head)   = a(:head)*b(:head)*w(:head)
128 |         lanes(head+1:) = 0._wp
129 |         ! What is left is a whole number of batches of `chunk` weighted products.
130 |         do k = head+1, total-head, chunk
131 |             lanes = lanes + a(k:k+chunk-1)*b(k:k+chunk-1)*w(k:k+chunk-1)
132 |         end do
133 |         ! Collapse the batch: lane k is added together with lane chunk/2+k.
134 |         p = 0.0_wp
135 |         do k = 1, chunk/2
136 |             p = p + lanes(k)+lanes(chunk/2+k)
137 |         end do
138 |     end function
139 | 
140 |     pure function fprod_kahan_sp(a,b) result(p)
141 |         !! real32 dot product whose terms are accumulated through the elemental `vkahans` update.
142 |         integer, parameter :: wp = sp
143 |         integer, parameter :: chunk = chunk32
144 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
145 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
146 |         real(wp) :: p                !! final accumulated value
147 |         real(wp) :: sums(chunk)      ! per-lane running values (second `vkahans` argument)
148 |         real(wp) :: comps(chunk)     ! per-lane companion values (third `vkahans` argument; presumably the compensation terms)
149 |         integer :: k, total, head
150 |         total = size(a)
151 |         head  = mod(total,chunk)
152 |         ! The first `head` products seed the leading lanes directly; all other state starts from zero.
153 |         sums(:head)   = a(:head)*b(:head)
154 |         sums(head+1:) = 0.0_wp
155 |         comps = 0.0_wp
156 |         ! What is left is a whole number of batches; each one is pushed through vkahans lane by lane.
157 |         do k = head+1, total-head, chunk
158 |             call vkahans( a(k:k+chunk-1)*b(k:k+chunk-1), sums, comps )
159 |         end do
160 |         ! Reduce the lanes into `p`, again through vkahans, handing over each lane's companion value.
161 |         p = 0.0_wp
162 |         do k = 1, chunk
163 |             call vkahans( sums(k), p, comps(k) )
164 |         end do
165 |     end function
166 | 
167 |     pure function fprod_kahan_dp(a,b) result(p)
168 |         !! real64 dot product whose terms are accumulated through the elemental `vkahans` update.
169 |         integer, parameter :: wp = dp
170 |         integer, parameter :: chunk = chunk64
171 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
172 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
173 |         real(wp) :: p                !! final accumulated value
174 |         real(wp) :: sums(chunk)      ! per-lane running values (second `vkahans` argument)
175 |         real(wp) :: comps(chunk)     ! per-lane companion values (third `vkahans` argument; presumably the compensation terms)
176 |         integer :: k, total, head
177 |         total = size(a)
178 |         head  = mod(total,chunk)
179 |         ! The first `head` products seed the leading lanes directly; all other state starts from zero.
180 |         sums(:head)   = a(:head)*b(:head)
181 |         sums(head+1:) = 0.0_wp
182 |         comps = 0.0_wp
183 |         ! What is left is a whole number of batches; each one is pushed through vkahans lane by lane.
184 |         do k = head+1, total-head, chunk
185 |             call vkahans( a(k:k+chunk-1)*b(k:k+chunk-1), sums, comps )
186 |         end do
187 |         ! Reduce the lanes into `p`, again through vkahans, handing over each lane's companion value.
188 |         p = 0.0_wp
189 |         do k = 1, chunk
190 |             call vkahans( sums(k), p, comps(k) )
191 |         end do
192 |     end function
193 | 
194 |     pure function fprod_kahan_sp_weighted(a,b,w) result(p)
195 |         !! Weighted real32 dot product (terms a(i)*b(i)*w(i)) accumulated through the elemental `vkahans` update.
196 |         integer, parameter :: wp = sp
197 |         integer, parameter :: chunk = chunk32
198 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
199 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
200 |         real(wp), intent(in) :: w(:) !! weights, read over the same index range as `a`
201 |         real(wp) :: p                !! final accumulated value
202 |         real(wp) :: sums(chunk)      ! per-lane running values (second `vkahans` argument)
203 |         real(wp) :: comps(chunk)     ! per-lane companion values (third `vkahans` argument; presumably the compensation terms)
204 |         integer :: k, total, head
205 |         total = size(a)
206 |         head  = mod(total,chunk)
207 |         ! The first `head` weighted products seed the leading lanes directly; all other state starts from zero.
208 |         sums(:head)   = a(:head)*b(:head)*w(:head)
209 |         sums(head+1:) = 0.0_wp
210 |         comps = 0.0_wp
211 |         ! What is left is a whole number of batches; each one is pushed through vkahans lane by lane.
212 |         do k = head+1, total-head, chunk
213 |             call vkahans( a(k:k+chunk-1)*b(k:k+chunk-1)*w(k:k+chunk-1), sums, comps )
214 |         end do
215 |         ! Reduce the lanes into `p`, again through vkahans, handing over each lane's companion value.
216 |         p = 0.0_wp
217 |         do k = 1, chunk
218 |             call vkahans( sums(k), p, comps(k) )
219 |         end do
220 |     end function
221 | 
222 |     pure function fprod_kahan_dp_weighted(a,b,w) result(p)
223 |         !! Weighted real64 dot product (terms a(i)*b(i)*w(i)) accumulated through the elemental `vkahans` update.
224 |         integer, parameter :: wp = dp
225 |         integer, parameter :: chunk = chunk64
226 |         real(wp), intent(in) :: a(:) !! first vector; its size sets the number of terms
227 |         real(wp), intent(in) :: b(:) !! second vector, read over the same index range as `a`
228 |         real(wp), intent(in) :: w(:) !! weights, read over the same index range as `a`
229 |         real(wp) :: p                !! final accumulated value
230 |         real(wp) :: sums(chunk)      ! per-lane running values (second `vkahans` argument)
231 |         real(wp) :: comps(chunk)     ! per-lane companion values (third `vkahans` argument; presumably the compensation terms)
232 |         integer :: k, total, head
233 |         total = size(a)
234 |         head  = mod(total,chunk)
235 |         ! The first `head` weighted products seed the leading lanes directly; all other state starts from zero.
236 |         sums(:head)   = a(:head)*b(:head)*w(:head)
237 |         sums(head+1:) = 0.0_wp
238 |         comps = 0.0_wp
239 |         ! What is left is a whole number of batches; each one is pushed through vkahans lane by lane.
240 |         do k = head+1, total-head, chunk
241 |             call vkahans( a(k:k+chunk-1)*b(k:k+chunk-1)*w(k:k+chunk-1), sums, comps )
242 |         end do
243 |         ! Reduce the lanes into `p`, again through vkahans, handing over each lane's companion value.
244 |         p = 0.0_wp
245 |         do k = 1, chunk
246 |             call vkahans( sums(k), p, comps(k) )
247 |         end do
248 |     end function
249 | 
250 |     elemental subroutine vkahans_sp(a,s,c) !! One Kahan-compensated accumulation step (single precision): adds a to s, carrying the rounding error in c
251 |     integer, parameter :: wp = sp ! kind used by the declarations inside the include file
252 |     include 'utilities/vkahans.inc'
253 |     end subroutine  
254 | 
255 |     elemental subroutine vkahans_dp(a,s,c) !! One Kahan-compensated accumulation step (double precision): adds a to s, carrying the rounding error in c
256 |     integer, parameter :: wp = dp ! kind used by the declarations inside the include file
257 |     include 'utilities/vkahans.inc'
258 |     end subroutine  
259 | 
260 |   end module fast_dotp


--------------------------------------------------------------------------------
/src/fast_erf.f90:
--------------------------------------------------------------------------------
 1 | !
 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A.
 3 | !
 4 | ! SPDX-License-Identifier: MIT
 5 | !
 6 | module fast_erf
 7 |     !! Source: https://fortran-lang.discourse.group/t/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/5385/31
 8 |     use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
 9 |     implicit none
10 |     private
11 |     
12 |     public :: ferf
13 |     
14 |     interface ferf
15 |         module procedure ferf_sp
16 |         module procedure ferf_dp
17 |     end interface
18 |     
19 |     contains
20 |     
21 |     elemental function ferf_sp( x ) result( y )
22 |         !! Fast single-precision approximation of erf(x): 1 - 1/q(|x|)**4 with a quartic q, sign taken from x.
23 |         integer, parameter :: wp = sp
24 |         real(wp), intent(in) :: x !! argument
25 |         real(wp) :: y !! approximate erf(x)
26 |         real(wp) :: ax, xx, q
27 |         !-------------------------------------------------
28 |         ax = abs(x)
29 |         xx = x**2
30 |         q = 1._wp+ 0.278393_wp*ax + 0.230389_wp*xx + 0.000972_wp*ax*xx + 0.078108_wp*xx*xx
31 |         y = sign( 1._wp - 1._wp / q**4 , x )
32 |     end function
33 | 
34 |     elemental function ferf_dp( x ) result( y )
35 |         !! Fast double-precision approximation of erf(x): 1 - 1/q(|x|)**4 with a quartic q, sign taken from x.
36 |         integer, parameter :: wp = dp
37 |         real(wp), intent(in) :: x !! argument
38 |         real(wp) :: y !! approximate erf(x)
39 |         real(wp) :: ax, xx, q
40 |         !-------------------------------------------------
41 |         ax = abs(x)
42 |         xx = x**2
43 |         q = 1._wp+ 0.278393_wp*ax + 0.230389_wp*xx + 0.000972_wp*ax*xx + 0.078108_wp*xx*xx
44 |         y = sign( 1._wp - 1._wp / q**4 , x )
45 |     end function
46 |     
47 | end module


--------------------------------------------------------------------------------
/src/fast_log.f90:
--------------------------------------------------------------------------------
  1 | !
  2 | ! SPDX-FileCopyrightText: 2016-2022 Federico Perini <perini@wisc.edu>
  3 | !
  4 | ! SPDX-License-Identifier: MIT
  5 | !
  6 | !   ***********************************************************************************************
  7 | !> @brief A module to compute FAST logarithm functions, based on Perini and Reitz, "Fast         **
  8 | !>        approximations of exponential and logarithm functions combined with efficient          **
  9 | !>        storage/retrieval for combustion kinetics calculations" Comb Flame 194(2018), 37-51.   **
 10 | !   ***********************************************************************************************
 11 | module fast_log
 12 |     use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
 13 |     implicit none
 14 |     private
 15 |     
 16 |     public :: flog_p3, flog_p5
 17 | 
 18 |     interface flog_p3
 19 |         module procedure flog_p3_dp
 20 |     end interface
 21 |     interface flog_p5
 22 |         module procedure flog_p5_dp
 23 |     end interface
 24 | 
 25 | #ifdef __NVCOMPILER
 26 |     include 'utilities/nvidia_shift_interface.inc'
 27 | #endif
 28 |     
 29 | contains
 30 | 
 31 |     elemental function flog_p3_dp(x) result(y)
 32 |         !! Fast natural logarithm (double precision): the exponent field of x gives the integer part of log2(x)
 33 |         !! and a cubic polynomial in the fraction f (x = 2**e * (1+f)) approximates log2(1+f).
 34 |         !! The argument is not checked: sign, zero, subnormal, inf and NaN are not special-cased.
 35 |         integer, parameter :: wp = dp
 36 |         real(wp), intent(in) :: x !! argument
 37 |         real(wp) :: y !! approximate log(x)
 38 |         !-- Bit-field constants (NOTE(review): integer(wp) reuses the real kind value as an integer kind)
 39 |         integer(wp), parameter :: bias      = 1023_wp
 40 |         integer(wp), parameter :: one_bits  = (2_wp**52)*bias ! bit pattern of 1.0
 41 |         integer(wp), parameter :: keep_frac = -9218868437227405313_wp ! not(shiftl(2047_wp,52)): clears the exponent bits
 42 |         !-- Polynomial data
 43 |         real(wp), parameter :: ln2     = log(2._wp)
 44 |         real(wp), parameter :: inv_ln2 = 1._wp/ln2
 45 |         real(wp), parameter :: coef(3) = [inv_ln2,3.0_wp-2.5_wp*inv_ln2,1.5_wp*inv_ln2-2.0_wp]
 46 |         !-- Work variables
 47 |         integer(wp) :: bits
 48 |         real(wp) :: expo, frac
 49 |         !-------------------------------------------------
 50 |         bits = transfer(x,bits)
 51 | 
 52 |         ! Unbiased exponent
 53 |         expo = shiftr(bits,52)-bias
 54 | 
 55 |         ! Swap in the exponent of 1.0 to get 1+f, then drop the leading 1
 56 |         frac = transfer(iand(bits,keep_frac)+one_bits,frac)-1._wp
 57 | 
 58 |         ! log2(x) ~ expo + cubic(frac); the multiplication by ln2 changes the base to e
 59 |         y = (frac*(coef(1)+frac*(coef(2)+frac*coef(3)))+expo)*ln2
 60 |     end function flog_p3_dp
 61 | 
 62 |     elemental function flog_p5_dp(x) result(y)
 63 |         !! Fast natural logarithm (double precision): the exponent field of x gives the integer part of log2(x)
 64 |         !! and a quintic polynomial in the fraction f (x = 2**e * (1+f)) approximates log2(1+f).
 65 |         !! The argument is not checked: sign, zero, subnormal, inf and NaN are not special-cased.
 66 |         integer, parameter :: wp = dp
 67 |         real(wp), intent(in) :: x !! argument
 68 |         real(wp) :: y !! approximate log(x)
 69 |         !-- Bit-field constants (NOTE(review): integer(wp) reuses the real kind value as an integer kind)
 70 |         integer(wp), parameter :: bias      = 1023_wp
 71 |         integer(wp), parameter :: one_bits  = (2_wp**52)*bias ! bit pattern of 1.0
 72 |         integer(wp), parameter :: keep_frac = -9218868437227405313_wp ! not(shiftl(2047_wp,52)): clears the exponent bits
 73 |         !-- Polynomial data
 74 |         real(wp), parameter :: ln2 = log(2._wp)
 75 |         real(wp), parameter :: coef(5) = [ 1.44269504088896e+0_wp,&
 76 |                                           -7.21347520444482e-1_wp,&
 77 |                                            4.42145354110618e-1_wp,&
 78 |                                           -2.12375830888126e-1_wp,&
 79 |                                            4.88829563330264e-2_wp]
 80 |         !-- Work variables
 81 |         integer(wp) :: bits
 82 |         real(wp) :: expo, frac
 83 |         !-------------------------------------------------
 84 |         bits = transfer(x,bits)
 85 | 
 86 |         ! Unbiased exponent
 87 |         expo = shiftr(bits,52)-bias
 88 | 
 89 |         ! Swap in the exponent of 1.0 to get 1+f, then drop the leading 1
 90 |         frac = transfer(iand(bits,keep_frac)+one_bits,frac)-1._wp
 91 | 
 92 |         ! log2(x) ~ expo + quintic(frac); the multiplication by ln2 changes the base to e
 93 |         y = (frac*(coef(1)+frac*(coef(2)+frac*(coef(3)+frac*(coef(4)+frac*coef(5)))))+expo)*ln2
 94 | 
 95 |     end function flog_p5_dp
 96 | 
 97 | #ifdef __NVCOMPILER
 98 |     include 'utilities/nvidia_shift.inc'
 99 | #endif
100 | end module fast_log


--------------------------------------------------------------------------------
/src/fast_math.f90:
--------------------------------------------------------------------------------
 1 | !
 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A.
 3 | !
 4 | ! SPDX-License-Identifier: MIT
 5 | !
 6 | module fast_math
 7 |   !! User API: All modules can be referenced from this module as entry point
 8 |   !-------------------------
 9 |   ! Basics
10 |   use fast_sum
11 |   use fast_dotp
12 |   use fast_rsqrt
13 |   !-------------------------
14 |   ! logarithmic
15 |   use fast_log
16 |   !-------------------------
17 |   ! Trigonometric
18 |   use fast_trigo
19 |   !-------------------------
20 |   ! Hyperbolic
21 |   use fast_tanh
22 |   use fast_erf
23 | 
24 | end module fast_math
25 | 


--------------------------------------------------------------------------------
/src/fast_rsqrt.f90:
--------------------------------------------------------------------------------
 1 | !
 2 | ! SPDX-FileCopyrightText: 2016-2022 Federico Perini <perini@wisc.edu>
 3 | !
 4 | ! SPDX-License-Identifier: MIT
 5 | !
 6 | !   ***********************************************************************************************
 7 | !> @brief A FAST reciprocal of a square root, 1/sqrt(x), based on Perini and Reitz, "Fast        **
 8 | !>        approximations of exponential and logarithm functions combined with efficient          **
 9 | !>        storage/retrieval for combustion kinetics calculations" Comb Flame 194(2018), 37-51.   **
10 | !   ***********************************************************************************************
11 | module fast_rsqrt
12 |     use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
13 |     implicit none
14 |     private
15 |     
16 |     public :: frsqrt
17 | 
18 |     interface frsqrt
19 |         !! Transcription of the original Quake III Arena fast inverse square root, see https://en.wikipedia.org/wiki/Fast_inverse_square_root
20 |         !! for reference
21 |         module procedure frsqrt_dp
22 |         module procedure frsqrt_sp
23 |     end interface
24 | 
25 | #ifdef __NVCOMPILER
26 |     include 'utilities/nvidia_shift_interface.inc'
27 | #endif
28 | 
29 | contains
30 | 
31 |     elemental function frsqrt_sp(x) result(y)
32 |         !! Fast single-precision 1/sqrt(x): integer "magic constant" initial guess refined by one Newton-Raphson step.
33 |         integer, parameter :: wp = sp
34 |         real(wp), intent(in) :: x !! argument (not checked for sign or zero)
35 |         real(wp) :: y !! approximate 1/sqrt(x)
36 |         !-- Internal Variables
37 |         integer(wp), parameter :: magic = int(Z'5f3759df',kind=wp)
38 |         integer(wp) :: bits
39 |         real(wp)    :: half_x, guess
40 |         !-------------------------------------------------
41 |         ! Initial guess: reinterpret x as an integer, shift it right by one bit and subtract from the magic constant
42 |         bits  = magic - shiftr(transfer(x,bits),1)
43 |         guess = transfer(bits,y)
44 | 
45 |         ! Perform one Newton-Raphson step
46 |         half_x = 0.5_wp*x
47 |         y = guess*(1.5_wp-half_x*guess*guess)
48 |     end function frsqrt_sp
49 | 
50 |     elemental function frsqrt_dp(x) result(y)
51 |         !! Double precision implementation of the Quake III Arena algorithm.
52 |         !! With an avx2 enabled machine you will have speed-ups compared to the intrinsic 1/sqrt(x)
53 |         !! even with 2 or 3 Newton-Raphson iterations (the count is the compile-time constant ninter):
54 |         !! 1 iter > precision at 1e-3
55 |         !! 2 iter > precision at 1e-6
56 |         !! 3 iter > precision at 1e-11
57 |         integer, parameter :: wp = dp
58 |         integer, parameter :: ninter = 1
59 |         real(wp), intent(in) :: x !! argument (not checked for sign or zero)
60 |         real(wp) :: y !! approximate 1/sqrt(x)
61 |         !-- Internal Variables
62 |         integer(wp), parameter :: magic = 6910469410427058089_wp
63 |         integer(wp) :: bits
64 |         real(wp)    :: half_x
65 |         integer     :: iter
66 |         !-------------------------------------------------
67 |         half_x = 0.5_wp*x
68 |         ! Initial guess: reinterpret x as an integer, shift it right by one bit and subtract from the magic constant
69 |         bits = magic - shiftr(transfer(x,bits),1)
70 |         y    = transfer(bits,y)
71 | 
72 |         ! Perform Newton-Raphson steps
73 |         do iter = 1, ninter
74 |             y = y*(1.5_wp-half_x*y*y)
75 |         end do
76 |     end function frsqrt_dp
77 | 
78 | #ifdef __NVCOMPILER
79 |     include 'utilities/nvidia_shift.inc'
80 | #endif
81 | end module fast_rsqrt


--------------------------------------------------------------------------------
/src/fast_sum.f90:
--------------------------------------------------------------------------------
  1 | !
  2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A.
  3 | !
  4 | ! SPDX-License-Identifier: MIT
  5 | !
  6 | module fast_sum
  7 |   !! Two fast and accurate sums are proposed for 1D arrays:
  8 |   !! By default, "fsum" uses the fsum_chunk approach. This method is, at worst, one order of magnitude more accurate than "sum" and between 1.5 and 10 times faster.
  9 |   !! A second approach is also proposed, "fsum_kahan", which is the most accurate approach. CPU time can vary from 2 times slower to sometimes faster than the intrinsic sum.
 10 |   use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
 11 |   implicit none
 12 |   private
 13 | 
 14 |   public :: fsum, fsum_kahan
 15 |   integer, parameter :: chunk64 = 64
 16 |   integer, parameter :: chunk32 = 64
 17 |   
 18 |   interface fsum
 19 |     !! Source: to the best of knowledge: Alves J. but heavily inspired by this paper https://epubs.siam.org/doi/10.1137/19M1257780
 20 |       module procedure fsum_chunk_1d_sp
 21 |       module procedure fsum_chunk_1d_sp_mask
 22 |       module procedure fsum_chunk_1d_dp
 23 |       module procedure fsum_chunk_1d_dp_mask
 24 |   end interface
 25 | 
 26 |   interface fsum_kahan
 27 |       module procedure fsum_kahan_1d_sp
 28 |       module procedure fsum_kahan_1d_sp_mask
 29 |       module procedure fsum_kahan_1d_dp
 30 |       module procedure fsum_kahan_1d_dp_mask
 31 |   end interface
 32 | 
 33 |   interface vkahans
 34 |       module procedure vkahans_sp
 35 |       module procedure vkahans_dp
 36 |   end interface
 37 |   interface vkahans_m
 38 |       module procedure vkahans_m_sp
 39 |       module procedure vkahans_m_dp
 40 |   end interface
 41 | 
 42 |   contains
 43 | 
 44 |   pure function fsum_chunk_1d_sp(a) result(sout)
 45 |       !! Single-precision sum of a 1D array through `chunk` independent partial sums folded at the end.
 46 |       integer, parameter :: wp = sp
 47 |       integer, parameter :: chunk = chunk32
 48 |       real(wp), intent(in) :: a(:) !! values to add
 49 |       real(wp) :: sout !! sum of the elements of a (0 for an empty array)
 50 |       real(wp) :: lanes(chunk)
 51 |       integer :: k, lo, tail
 52 |       ! -----------------------------
 53 |       tail = mod(size(a),chunk)
 54 |       ! The first mod(n,chunk) elements seed the lanes, so the rest splits into whole chunks
 55 |       lanes = 0._wp
 56 |       lanes(1:tail) = a(1:tail)
 57 |       do lo = tail+1, size(a)-tail, chunk
 58 |        lanes = lanes + a(lo:lo+chunk-1)
 59 |       end do
 60 |       
 61 |       ! Final fold: lane k and lane chunk/2+k are added to the total in the same pass
 62 |       sout = 0.0_wp
 63 |       do k = 1, chunk/2
 64 |         sout = sout + lanes(k)+lanes(chunk/2+k)
 65 |       end do
 66 |   end function
 67 | 
 68 |   pure function fsum_chunk_1d_dp(a) result(sout)
 69 |       !! Double-precision sum of a 1D array through `chunk` independent partial sums folded at the end.
 70 |       integer, parameter :: wp = dp
 71 |       integer, parameter :: chunk = chunk64
 72 |       real(wp), intent(in) :: a(:) !! values to add
 73 |       real(wp) :: sout !! sum of the elements of a (0 for an empty array)
 74 |       real(wp) :: lanes(chunk)
 75 |       integer :: k, lo, tail
 76 |       ! -----------------------------
 77 |       tail = mod(size(a),chunk)
 78 |       ! The first mod(n,chunk) elements seed the lanes, so the rest splits into whole chunks
 79 |       lanes = 0._wp
 80 |       lanes(1:tail) = a(1:tail)
 81 |       do lo = tail+1, size(a)-tail, chunk
 82 |        lanes = lanes + a(lo:lo+chunk-1)
 83 |       end do
 84 | 
 85 |       ! Final fold: lane k and lane chunk/2+k are added to the total in the same pass
 86 |       sout = 0.0_wp
 87 |       do k = 1, chunk/2
 88 |         sout = sout + lanes(k)+lanes(chunk/2+k)
 89 |       end do
 90 |   end function
 91 |   
 92 |   pure function fsum_chunk_1d_sp_mask(a,mask) result(sout)
 93 |       !! Single-precision masked sum of a 1D array through `chunk` independent partial sums.
 94 |       !! NOTE(review): elements where mask is .true. are replaced by 0, i.e. only the .false. entries are
 95 |       !! added - the opposite of the intrinsic sum(a,mask). Confirm this is intended before relying on it.
 96 |       integer, parameter :: wp = sp
 97 |       integer, parameter :: chunk = chunk32
 98 |       real(wp), intent(in) :: a(:) !! values to add
 99 |       logical, intent(in) :: mask(:) !! .true. drops the matching element of a (indexed like a, size not checked)
100 |       real(wp) :: sout !! sum of the elements of a whose mask entry is .false.
101 |       real(wp) :: lanes(chunk)
102 |       integer :: k, lo, tail
103 |       ! -----------------------------
104 |       tail = mod(size(a),chunk)
105 |       lanes = 0._wp
106 |       lanes(1:tail) = merge( 0.0_wp , a(1:tail) , mask(1:tail) )
107 |       do lo = tail+1, size(a)-tail, chunk
108 |        lanes = lanes + merge( 0.0_wp , a(lo:lo+chunk-1), mask(lo:lo+chunk-1) )
109 |       end do
110 |       
111 |       sout = 0.0_wp
112 |       do k = 1, chunk/2
113 |         sout = sout + lanes(k)+lanes(chunk/2+k)
114 |       end do
115 |   end function
116 | 
117 |   pure function fsum_chunk_1d_dp_mask(a,mask) result(sout)
118 |       !! Double-precision masked sum of a 1D array through `chunk` independent partial sums.
119 |       !! NOTE(review): elements where mask is .true. are replaced by 0, i.e. only the .false. entries are
120 |       !! added - the opposite of the intrinsic sum(a,mask). Confirm this is intended before relying on it.
121 |       integer, parameter :: wp = dp
122 |       integer, parameter :: chunk = chunk64
123 |       real(wp), intent(in) :: a(:) !! values to add
124 |       logical, intent(in) :: mask(:) !! .true. drops the matching element of a (indexed like a, size not checked)
125 |       real(wp) :: sout !! sum of the elements of a whose mask entry is .false.
126 |       real(wp) :: lanes(chunk)
127 |       integer :: k, lo, tail
128 |       ! -----------------------------
129 |       tail = mod(size(a),chunk)
130 |       lanes = 0._wp
131 |       lanes(1:tail) = merge( 0.0_wp , a(1:tail) , mask(1:tail) )
132 |       do lo = tail+1, size(a)-tail, chunk
133 |        lanes = lanes + merge( 0.0_wp , a(lo:lo+chunk-1), mask(lo:lo+chunk-1) )
134 |       end do
135 | 
136 |       sout = 0.0_wp
137 |       do k = 1, chunk/2
138 |         sout = sout + lanes(k)+lanes(chunk/2+k)
139 |       end do
140 |   end function
141 | 
142 |   pure function fsum_kahan_1d_sp(a) result(sout)
143 |       !! Single-precision sum of a 1D array: `chunk` lanes, each updated with a Kahan-compensated step.
144 |       integer, parameter :: wp = sp
145 |       integer, parameter :: chunk = chunk32
146 |       real(wp), intent(in) :: a(:) !! values to add
147 |       real(wp) :: sout !! compensated sum of the elements of a (0 for an empty array)
148 |       real(wp) :: lane_sum(chunk), lane_err(chunk) ! running sum and Kahan correction of each lane
149 |       integer :: k, lo, tail
150 |       ! -----------------------------
151 |       tail = mod(size(a),chunk)
152 |       lane_sum = 0.0_wp
153 |       lane_err = 0.0_wp
154 | 
155 |       ! The first mod(n,chunk) elements seed the lanes, so the rest splits into whole chunks
156 |       lane_sum(1:tail) = a(1:tail)
157 |       do lo = tail+1, size(a)-tail, chunk
158 |         call vkahans( a(lo:lo+chunk-1) , lane_sum , lane_err )
159 |       end do
160 |       
161 |       ! Fold the lanes into the scalar result, feeding each lane's pending correction into the step
162 |       sout = 0.0_wp
163 |       do k = 1,chunk
164 |         call vkahans( lane_sum(k) , sout , lane_err(k) )
165 |       end do
166 |   end function
167 | 
168 |   pure function fsum_kahan_1d_dp(a) result(sout)
169 |       !! Double-precision sum of a 1D array: `chunk` lanes, each updated with a Kahan-compensated step.
170 |       integer, parameter :: wp = dp
171 |       integer, parameter :: chunk = chunk64
172 |       real(wp), intent(in) :: a(:) !! values to add
173 |       real(wp) :: sout !! compensated sum of the elements of a (0 for an empty array)
174 |       real(wp) :: lane_sum(chunk), lane_err(chunk) ! running sum and Kahan correction of each lane
175 |       integer :: k, lo, tail
176 |       ! -----------------------------
177 |       tail = mod(size(a),chunk)
178 |       lane_sum = 0.0_wp
179 |       lane_err = 0.0_wp
180 | 
181 |       ! The first mod(n,chunk) elements seed the lanes, so the rest splits into whole chunks
182 |       lane_sum(1:tail) = a(1:tail)
183 |       do lo = tail+1, size(a)-tail, chunk
184 |         call vkahans( a(lo:lo+chunk-1) , lane_sum , lane_err )
185 |       end do
186 |       
187 |       ! Fold the lanes into the scalar result, feeding each lane's pending correction into the step
188 |       sout = 0.0_wp
189 |       do k = 1,chunk
190 |         call vkahans( lane_sum(k) , sout , lane_err(k) )
191 |       end do
192 |   end function
193 | 
194 |   pure function fsum_kahan_1d_sp_mask(a,mask) result(sout)
195 |       !! Single-precision masked sum of a 1D array with Kahan-compensated lanes.
196 |       !! NOTE(review): in the leading remainder, elements whose mask is .true. are replaced by 0 (the opposite
197 |       !! of the intrinsic sum(a,mask)); whole chunks hand the mask to vkahans_m - confirm both agree.
198 |       integer, parameter :: wp = sp
199 |       integer, parameter :: chunk = chunk32
200 |       real(wp), intent(in) :: a(:) !! values to add
201 |       logical, intent(in) :: mask(:) !! selection flags, indexed like a (size not checked)
202 |       real(wp) :: sout !! compensated masked sum
203 |       real(wp) :: lane_sum(chunk), lane_err(chunk) ! running sum and Kahan correction of each lane
204 |       integer :: k, lo, tail
205 |       ! -----------------------------
206 |       tail = mod(size(a),chunk)
207 |       lane_sum = 0.0_wp
208 |       lane_err = 0.0_wp
209 | 
210 |       lane_sum(1:tail) = merge( 0.0_wp , a(1:tail) , mask(1:tail) )
211 |       do lo = tail+1, size(a)-tail, chunk
212 |         call vkahans_m( a(lo:lo+chunk-1) , lane_sum , lane_err, mask(lo:lo+chunk-1) )
213 |       end do
214 | 
215 |       sout = 0.0_wp
216 |       do k = 1,chunk
217 |         call vkahans( lane_sum(k) , sout , lane_err(k) )
218 |       end do
219 |   end function
220 | 
221 |   pure function fsum_kahan_1d_dp_mask(a,mask) result(sout)
222 |       !! Double-precision masked sum of a 1D array with Kahan-compensated lanes.
223 |       !! NOTE(review): in the leading remainder, elements whose mask is .true. are replaced by 0 (the opposite
224 |       !! of the intrinsic sum(a,mask)); whole chunks hand the mask to vkahans_m - confirm both agree.
225 |       integer, parameter :: wp = dp
226 |       integer, parameter :: chunk = chunk64
227 |       real(wp), intent(in) :: a(:) !! values to add
228 |       logical, intent(in) :: mask(:) !! selection flags, indexed like a (size not checked)
229 |       real(wp) :: sout !! compensated masked sum
230 |       real(wp) :: lane_sum(chunk), lane_err(chunk) ! running sum and Kahan correction of each lane
231 |       integer :: k, lo, tail
232 |       ! -----------------------------
233 |       tail = mod(size(a),chunk)
234 |       lane_sum = 0.0_wp
235 |       lane_err = 0.0_wp
236 | 
237 |       lane_sum(1:tail) = merge( 0.0_wp , a(1:tail) , mask(1:tail) )
238 |       do lo = tail+1, size(a)-tail, chunk
239 |         call vkahans_m( a(lo:lo+chunk-1) , lane_sum , lane_err, mask(lo:lo+chunk-1) )
240 |       end do
241 | 
242 |       sout = 0.0_wp
243 |       do k = 1,chunk
244 |         call vkahans( lane_sum(k) , sout , lane_err(k) )
245 |       end do
246 |   end function
247 | 
248 |   elemental subroutine vkahans_sp(a,s,c) !! One Kahan-compensated accumulation step (single precision): adds a to s, carrying the rounding error in c
249 |   integer, parameter :: wp = sp ! kind used by the declarations inside the include file
250 |   include 'utilities/vkahans.inc'
251 |   end subroutine  
252 | 
253 |   elemental subroutine vkahans_dp(a,s,c) !! One Kahan-compensated accumulation step (double precision): adds a to s, carrying the rounding error in c
254 |   integer, parameter :: wp = dp ! kind used by the declarations inside the include file
255 |   include 'utilities/vkahans.inc'
256 |   end subroutine  
257 | 
258 |   elemental subroutine vkahans_m_sp(a,s,c,m) !! Single-precision accumulation step with an extra argument m; presumably the masked counterpart of vkahans (body lives in the include file)
259 |   integer, parameter :: wp = sp ! kind used by the declarations inside the include file
260 |   include 'utilities/vkahans_m.inc'
261 |   end subroutine  
262 | 
263 |   elemental subroutine vkahans_m_dp(a,s,c,m) !! Double-precision accumulation step with an extra argument m; presumably the masked counterpart of vkahans (body lives in the include file)
264 |   integer, parameter :: wp = dp ! kind used by the declarations inside the include file
265 |   include 'utilities/vkahans_m.inc'
266 |   end subroutine  
267 | 
268 | end module fast_sum


--------------------------------------------------------------------------------
/src/fast_tanh.f90:
--------------------------------------------------------------------------------
 1 | !
 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A.
 3 | !
 4 | ! SPDX-License-Identifier: MIT
 5 | !
 6 | module fast_tanh
 7 |     !! Source: https://fortran-lang.discourse.group/t/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/5385/31
 8 |     use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
 9 |     implicit none
10 |     private
11 |     
12 |     public :: ftanh
13 |     
14 |     interface ftanh
15 |         module procedure ftanh_sp
16 |         module procedure ftanh_dp
17 |     end interface
18 |     
19 |     contains
20 |     
21 |     elemental function ftanh_sp( x ) result( y )
22 |         !! Fast single-precision tanh(x): rational approximation while x*x <= 25, otherwise sign(1,x).
23 |         integer, parameter :: wp = sp
24 |         real(wp), intent(in) :: x !! argument
25 |         real(wp) :: y !! approximate tanh(x)
26 |         real(wp) :: xx
27 |         !---------------------------------------------
28 |         xx = x*x
29 |         y  = sign(1.0_wp,x) ! saturated value, kept unless x*x <= 25
30 |         if( xx <= 25._wp ) y = x * (135135.0_wp + xx * (17325.0_wp + xx * (378.0_wp + xx))) / &
31 |                                (135135.0_wp + xx * (62370.0_wp + xx * (3150.0_wp + xx * 28.0_wp)))
32 |     end function
33 | 
34 |     elemental function ftanh_dp( x ) result( y )
35 |         !! Fast double-precision tanh(x): rational approximation while x*x <= 25, otherwise sign(1,x).
36 |         integer, parameter :: wp = dp
37 |         real(wp), intent(in) :: x !! argument
38 |         real(wp) :: y !! approximate tanh(x)
39 |         real(wp) :: xx
40 |         !---------------------------------------------
41 |         xx = x*x
42 |         y  = sign(1.0_wp,x) ! saturated value, kept unless x*x <= 25
43 |         if( xx <= 25._wp ) y = x * (135135.0_wp + xx * (17325.0_wp + xx * (378.0_wp + xx))) / &
44 |                                (135135.0_wp + xx * (62370.0_wp + xx * (3150.0_wp + xx * 28.0_wp)))
45 |     end function
46 |     
47 | end module


--------------------------------------------------------------------------------
/src/fast_trigo.f90:
--------------------------------------------------------------------------------
  1 | !
  2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A.
  3 | !
  4 | ! SPDX-License-Identifier: MIT
  5 | !
  6 | module fast_trigo
  7 |     !! Source for fast sine cosine: http://web.archive.org/web/20141220225551/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
  8 |     use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
  9 |     implicit none
 10 |     private
 11 |     
 12 |     public :: fsin, fcos, ftan
 13 |     public :: facos, facos_nvidia, fatan
 14 |     
 15 |     interface fcos
 16 |         module procedure fcos_sp
 17 |         module procedure fcos_dp
 18 |     end interface
 19 | 
 20 |     interface fsin
 21 |         module procedure fsin_sp
 22 |         module procedure fsin_dp
 23 |     end interface
 24 | 
 25 |     interface ftan
 26 |         module procedure ftan_sp
 27 |         module procedure ftan_dp
 28 |     end interface
 29 | 
 30 |     interface facos
 31 |         module procedure facos_sp
 32 |         module procedure facos_dp
 33 |     end interface
 34 |     
 35 |     interface facos_nvidia
 36 |     !! Source : https://developer.download.nvidia.com/cg/acos.html
 37 |         module procedure facos_nvidia_sp
 38 |         module procedure facos_nvidia_dp
 39 |     end interface
 40 | 
 41 |     interface fatan
 42 |     !! Source : https://www.dsprelated.com/showarticle/1052.php
 43 |         module procedure fatan_sp
 44 |         module procedure fatan_dp
 45 |     end interface
 46 |     
 47 |     contains
 48 |     
 49 |     elemental function fcos_sp( x ) result( y )
 50 |         !! Fast single-precision cosine, evaluated as the fast sine of the complementary angle pi/2 - x.
 51 |         integer, parameter :: wp = sp
 52 |         real(wp), intent(in) :: x !! angle in radians
 53 |         real(wp) :: y !! approximate cos(x)
 54 |         real(wp), parameter :: quarter_turn = acos(-1.0_wp)/2
 55 |         !---------------------------------------------
 56 |         y = fsin_sp( quarter_turn - x )
 57 |     end function
 58 | 
 59 |     elemental function fcos_dp( x ) result( y )
 60 |         !! Fast double-precision cosine, evaluated as the fast sine of the complementary angle pi/2 - x.
 61 |         integer, parameter :: wp = dp
 62 |         real(wp), intent(in) :: x !! angle in radians
 63 |         real(wp) :: y !! approximate cos(x)
 64 |         real(wp), parameter :: quarter_turn = acos(-1.0_wp)/2
 65 |         !---------------------------------------------
 66 |         y = fsin_dp( quarter_turn - x )
 67 |     end function
 68 | 
 69 |     elemental function fsin_sp( x ) result( y )
 70 |         !! Fast single-precision sine: parabola fitted on [-pi,pi] followed by one correction pass.
 71 |         integer, parameter :: wp = sp
 72 |         real(wp), intent(in) :: x !! angle in radians
 73 |         real(wp) :: y !! approximate sin(x)
 74 |         real(wp), parameter :: twopi = 2*acos(-1.0_wp)
 75 |         real(wp), parameter :: invtwopi = 1.0_wp/twopi
 76 |         real(wp), parameter :: c1=4.0_wp/acos(-1.0_wp)
 77 |         real(wp), parameter :: c2=-4.0_wp/acos(-1.0_wp)**2
 78 |         real(wp), parameter :: c3=0.225_wp
 79 |         real(wp) :: x0
 80 |         ! Fold to the nearest period so x0 lies in [-pi,pi] (where the fit is valid); anint also avoids int8 overflow
 81 |         x0 = x - anint(x*invtwopi) * twopi
 82 |         y = c1*x0+c2*x0*abs(x0)
 83 |         y = c3*(y*abs(y)-y)+y
 84 |     end function
 85 | 
 86 |     elemental function fsin_dp( x ) result( y )
 87 |         !! Fast double-precision sine: parabola fitted on [-pi,pi] followed by one correction pass.
 88 |         integer, parameter :: wp = dp
 89 |         real(wp), intent(in) :: x !! angle in radians
 90 |         real(wp) :: y !! approximate sin(x)
 91 |         real(wp), parameter :: twopi = 2*acos(-1.0_wp)
 92 |         real(wp), parameter :: invtwopi = 1.0_wp/twopi
 93 |         real(wp), parameter :: c1=4.0_wp/acos(-1.0_wp)
 94 |         real(wp), parameter :: c2=-4.0_wp/acos(-1.0_wp)**2
 95 |         real(wp), parameter :: c3=0.225_wp
 96 |         real(wp) :: x0
 97 |         ! Fold to the nearest period so x0 lies in [-pi,pi] (where the fit is valid); anint also avoids int8 overflow
 98 |         x0 = x - anint(x*invtwopi) * twopi
 99 |         y = c1*x0+c2*x0*abs(x0)
100 |         y = c3*(y*abs(y)-y)+y
101 |     end function
102 | 
103 |     elemental function ftan_sp( x ) result( y )
104 |         !! Fast single-precision tangent: rational fit applied after folding x into [-pi/2,pi/2].
105 |         integer, parameter :: wp = sp
106 |         real(wp), intent(in) :: x !! angle in radians
107 |         real(wp) :: y !! approximate tan(x)
108 |         real(wp), parameter :: pi = acos(-1.0_wp)
109 |         real(wp), parameter :: invpi = 1.0_wp/acos(-1.0_wp)
110 |         real(wp) :: x0, xsq
111 |         ! Fold to the nearest multiple of pi (the fit's pole sits at x0**2 = pi**2/4); anint also avoids int8 overflow
112 |         x0 = x - anint(x*invpi) * pi
113 |         xsq = x0 * x0
114 |         y = x0 * (2.471688400562703_wp - 0.189759681063053_wp * xsq) / &
115 |                  (2.4674011002723397_wp - xsq)
116 |     end function
117 | 
118 |     elemental function ftan_dp( x ) result( y )
119 |         !! Fast double-precision tangent: rational fit applied after folding x into [-pi/2,pi/2].
120 |         integer, parameter :: wp = dp
121 |         real(wp), intent(in) :: x !! angle in radians
122 |         real(wp) :: y !! approximate tan(x)
123 |         real(wp), parameter :: pi = acos(-1.0_wp)
124 |         real(wp), parameter :: invpi = 1.0_wp/acos(-1.0_wp)
125 |         real(wp) :: x0, xsq
126 |         ! Fold to the nearest multiple of pi (the fit's pole sits at x0**2 = pi**2/4); anint also avoids int8 overflow
127 |         x0 = x - anint(x*invpi) * pi
128 |         xsq = x0 * x0
129 |         y = x0 * (2.471688400562703_wp - 0.189759681063053_wp * xsq) / & ! reduced angle x0, not the raw x
130 |                  (2.4674011002723397_wp - xsq)
131 |     end function
132 | 
133 |     !====================================================
134 |     ! Inverse
135 |     !====================================================
136 | 
137 |     elemental function facos_sp( x ) result( y )
138 |       !! Fast single-precision arccosine: pi/2 minus an odd cubic in x (the argument range is not checked).
139 |       integer, parameter :: wp = sp
140 |       real(wp), intent(in) :: x !! cosine value
141 |       real(wp) :: y !! approximate acos(x)
142 |       y = 1.5707963267948966_wp - x * (0.69813170079773212_wp * x * x + 0.87266462599716477_wp)
143 |     end function
144 | 
145 |     elemental function facos_dp( x ) result( y )
146 |       !! Fast double-precision arccosine: pi/2 minus an odd cubic in x (the argument range is not checked).
147 |       integer, parameter :: wp = dp
148 |       real(wp), intent(in) :: x !! cosine value
149 |       real(wp) :: y !! approximate acos(x)
150 |       y = 1.5707963267948966_wp - x * (0.69813170079773212_wp * x * x + 0.87266462599716477_wp)
151 |     end function
152 | 
153 |     elemental function  facos_nvidia_sp( x ) result( y )
154 |         !! Fast single-precision arccosine after the NVIDIA Cg reference formula: a cubic in |x| times
155 |         !! sqrt(1-|x|), reflected to pi - y when x is negative. The argument range is not checked.
156 |         integer, parameter :: wp = sp
157 |         real(wp), intent(in) :: x !! cosine value
158 |         real(wp) :: y !! approximate acos(x)
159 |         !-- Internal Variables
160 |         real(wp) :: ax, reflect
161 |         !---------------------------------------------
162 |         ax = abs(x)
163 |         reflect = merge( 1.0_wp , 0.0_wp , x < 0.0_wp ) ! 1 for negative x, 0 otherwise
164 |         y = ((-0.0187293_wp * ax + 0.0742610_wp) * ax - 0.2121144_wp) * ax + 1.5707288_wp
165 |         y = y * sqrt(1.0_wp-ax)
166 |         ! y + reflect*(pi - 2*y) leaves y untouched for x >= 0 and yields pi - y for x < 0
167 |         y = y + reflect * (- 2.0_wp * y + 3.14159265358979_wp)
168 |     end function
169 | 
170 |     elemental function  facos_nvidia_dp( x ) result( y )
171 |         !! Fast double-precision arccosine after the NVIDIA Cg reference formula: a cubic in |x| times
172 |         !! sqrt(1-|x|), reflected to pi - y when x is negative. The argument range is not checked.
173 |         integer, parameter :: wp = dp
174 |         real(wp), intent(in) :: x !! cosine value
175 |         real(wp) :: y !! approximate acos(x)
176 |         !-- Internal Variables
177 |         real(wp) :: ax, reflect
178 |         !---------------------------------------------
179 |         ax = abs(x)
180 |         reflect = merge( 1.0_wp , 0.0_wp , x < 0.0_wp ) ! 1 for negative x, 0 otherwise
181 |         y = ((-0.0187293_wp * ax + 0.0742610_wp) * ax - 0.2121144_wp) * ax + 1.5707288_wp
182 |         y = y * sqrt(1.0_wp-ax)
183 |         ! y + reflect*(pi - 2*y) leaves y untouched for x >= 0 and yields pi - y for x < 0
184 |         y = y + reflect * (- 2.0_wp * y + 3.14159265358979_wp)
185 |     end function
186 | 
187 |     elemental function fatan_sp( x ) result( y )
188 |         !! Fast single-precision arctangent: odd cubic for |x| < 1, else sign(pi/2,x) minus the cubic at 1/x.
189 |         integer, parameter :: wp = sp
190 |         real(wp), intent(in) :: x !! argument
191 |         real(wp) :: y !! approximate atan(x)
192 |         !-- Internal Variables
193 |         real(wp), parameter :: hpi = acos(-1.0_wp)/2._wp
194 |         !---------------------------------------------
195 |         if(abs(x)<1._wp)then
196 |           y = cubic( x )
197 |         else
198 |           y = sign(hpi,x)  - cubic( 1._wp / x )
199 |         end if
200 |     contains
201 |         real(wp) elemental function cubic( t ) result( p )
202 |             !! Odd cubic (n1 + n2*t*t)*t shared by both branches
203 |             real(wp), intent(in) :: t
204 |             real(wp), parameter :: n1 = 0.97239411_wp
205 |             real(wp), parameter :: n2 = -0.19194795_wp
206 |             p = (n1 + n2 * t * t) * t
207 |         end function
208 |     end function
209 | 
210 |     elemental function fatan_dp( x ) result( y )
211 |         !! Fast double-precision arctangent: odd cubic for |x| < 1, else sign(pi/2,x) minus the cubic at 1/x.
212 |         integer, parameter :: wp = dp
213 |         real(wp), intent(in) :: x !! argument
214 |         real(wp) :: y !! approximate atan(x)
215 |         !-- Internal Variables
216 |         real(wp), parameter :: hpi = acos(-1.0_wp)/2._wp
217 |         !---------------------------------------------
218 |         if(abs(x)<1._wp)then
219 |           y = cubic( x )
220 |         else
221 |           y = sign(hpi,x)  - cubic( 1._wp / x )
222 |         end if
223 |     contains
224 |         real(wp) elemental function cubic( t ) result( p )
225 |             !! Odd cubic (n1 + n2*t*t)*t shared by both branches
226 |             real(wp), intent(in) :: t
227 |             real(wp), parameter :: n1 = 0.97239411_wp
228 |             real(wp), parameter :: n2 = -0.19194795_wp
229 |             p = (n1 + n2 * t * t) * t
230 |         end function
231 |     end function
232 |     
233 | end module


--------------------------------------------------------------------------------
/src/utilities/nvidia_shift.inc:
--------------------------------------------------------------------------------
 1 | elemental integer(sp) function shiftr_sp( I , shift )
 2 |   integer(sp), intent(in) :: I 
 3 |   integer, intent(in) :: shift
 4 |   shiftr_sp = rshift( I, shift )
 5 | end function
 6 | 
 7 | elemental integer(dp) function shiftr_dp( I , shift )
 8 |   integer(dp), intent(in) :: I 
 9 |   integer, intent(in) :: shift
10 |   shiftr_dp = rshift( I, shift )
11 | end function
12 | 
elemental integer(sp) function shiftl_sp( I , shift )
  ! Left shift of I by `shift` bit positions with zero fill; stands in for the
  ! Fortran 2008 SHIFTL intrinsic.
  !
  ! Implemented with the standard ISHFT intrinsic (positive count = left shift)
  ! rather than the non-standard LSHIFT extension, so the wrapper compiles with
  ! any conforming compiler.
  !
  ! NOTE(review): `sp` is supplied by the including scope and is used here as an
  ! integer kind - confirm it maps to a 32-bit integer on every target compiler.
  integer(sp), intent(in) :: I     ! value to shift
  integer, intent(in) :: shift     ! bit count, 0 <= shift <= bit_size(I)
  shiftl_sp = ishft( I, shift )
end function
18 | 
elemental integer(dp) function shiftl_dp( I , shift )
  ! Left shift of I by `shift` bit positions with zero fill; stands in for the
  ! Fortran 2008 SHIFTL intrinsic.
  !
  ! Implemented with the standard ISHFT intrinsic (positive count = left shift)
  ! rather than the non-standard LSHIFT extension, so the wrapper compiles with
  ! any conforming compiler.
  !
  ! NOTE(review): `dp` is supplied by the including scope and is used here as an
  ! integer kind - confirm it maps to a 64-bit integer on every target compiler.
  integer(dp), intent(in) :: I     ! value to shift
  integer, intent(in) :: shift     ! bit count, 0 <= shift <= bit_size(I)
  shiftl_dp = ishft( I, shift )
end function


--------------------------------------------------------------------------------
/src/utilities/nvidia_shift_interface.inc:
--------------------------------------------------------------------------------
! Generic name `shiftl`, resolved by integer kind to the specific wrappers.
interface shiftl
  module procedure shiftl_sp, shiftl_dp
end interface
! Generic name `shiftr`, resolved by integer kind to the specific wrappers.
interface shiftr
  module procedure shiftr_sp, shiftr_dp
end interface


--------------------------------------------------------------------------------
/src/utilities/vkahans.inc:
--------------------------------------------------------------------------------
1 |   real(wp), intent(in) :: a
2 |   real(wp), intent(inout) :: s
3 |   real(wp), intent(inout) :: c
4 |   ! -- internal variables
5 |   real(wp) :: t, y    
6 |   y = a - c
7 |   t = s + y
8 |   c = (t - s) - y
9 |   s = t


--------------------------------------------------------------------------------
/src/utilities/vkahans_m.inc:
--------------------------------------------------------------------------------
 1 |   real(wp), intent(in) :: a
 2 |   real(wp), intent(inout) :: s
 3 |   real(wp), intent(inout) :: c
 4 |   logical, intent(in) :: m
 5 |   ! -- internal variables
 6 |   real(wp) :: t, y 
 7 |   y = a - c 
 8 |   t = s + y 
 9 |   c = (t - s) - y 
10 |   s = merge( s , t , m )


--------------------------------------------------------------------------------
/test/test_fast_math.f90:
--------------------------------------------------------------------------------
  1 | module test_fast_math
  2 |     use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
  3 |     use testdrive, only: new_unittest, unittest_type, error_type, check
  4 |     use fast_math
  5 |     implicit none
  6 | 
  7 |     logical :: verbose = .true. ! change me to .true. if you want to see the results
  8 |     interface scramble
  9 |         module procedure scramble_sp
 10 |         module procedure scramble_dp
 11 |     end interface
 12 |     interface scramble_l
 13 |         module procedure scramble_spl
 14 |         module procedure scramble_dpl
 15 |     end interface
 16 | 
 17 |     character(len=*), parameter :: fmt1 = "('| ',a12,' | <time> [ns/eval] | Speed-Up | relative error' ,'  |')"
 18 |     character(len=*), parameter :: fmt2 = "('|--------------|------------------|----------|-----------------| ')"
 19 |     character(len=*), parameter :: fmt3 = "('| ',a12,' |        ', f9.4,' | ',f8.2,' |',es16.4,' | ')"
 20 | 
 21 | contains
 22 | 
 23 | subroutine scramble_sp( x )
 24 |     integer, parameter :: wp = sp
 25 |     real(wp), intent(inout) :: x(:)
 26 |     real(wp) :: u, temp
 27 |     integer :: i, j, m
 28 | 
 29 |     m = size(x)
 30 |     do i = 1, m
 31 |         call random_number(u)
 32 |         j = 1 + FLOOR(m*u)
 33 |         temp = x(j)
 34 |         x(j) = x(i)
 35 |         x(i) = temp
 36 |     end do
 37 | end subroutine
 38 | 
 39 | subroutine scramble_dp( x )
 40 |     integer, parameter :: wp = dp
 41 |     real(wp), intent(inout) :: x(:)
 42 |     real(wp) :: u, temp
 43 |     integer :: i, j, m
 44 | 
 45 |     m = size(x)
 46 |     do i = 1, m
 47 |         call random_number(u)
 48 |         j = 1 + FLOOR(m*u)
 49 |         temp = x(j)
 50 |         x(j) = x(i)
 51 |         x(i) = temp
 52 |     end do
 53 | end subroutine
 54 | 
 55 | subroutine scramble_spl( x , l )
 56 |     integer, parameter :: wp = sp
 57 |     real(wp), intent(inout) :: x(:)
 58 |     logical , intent(inout) :: l(:)
 59 |     real(wp) :: u, temp
 60 |     logical :: ltemp
 61 |     integer :: i, j, m
 62 | 
 63 |     m = size(x)
 64 |     do i = 1, m
 65 |         call random_number(u)
 66 |         j = 1 + FLOOR(m*u)
 67 |         temp = x(j); ltemp = l(j)
 68 |         x(j) = x(i); l(j) = l(i)
 69 |         x(i) = temp; l(i) = ltemp
 70 |     end do
 71 | end subroutine
 72 | 
 73 | subroutine scramble_dpl( x , l )
 74 |     integer, parameter :: wp = dp
 75 |     real(wp), intent(inout) :: x(:)
 76 |     logical , intent(inout) :: l(:)
 77 |     real(wp) :: u, temp
 78 |     logical :: ltemp
 79 |     integer :: i, j, k, m
 80 | 
 81 |     m = size(x)
 82 |     do i = 1, m
 83 |         call random_number(u)
 84 |         j = 1 + FLOOR(m*u)
 85 |         temp = x(j); ltemp = l(j)
 86 |         x(j) = x(i); l(j) = l(i)
 87 |         x(i) = temp; l(i) = ltemp
 88 |     end do
 89 | end subroutine
 90 | 
 91 | real(dp) function timer() result(t)
 92 |     integer :: values(8)
 93 |     call date_and_time(VALUES=values)
 94 |     t = 60*values(6) + values(7) + values(8)*1.d-3
 95 | end function
 96 | 
 97 | !> Collect all exported unit tests
 98 | subroutine collect_suite(testsuite)
 99 |     !> Collection of tests
100 |     type(unittest_type), allocatable, intent(out) :: testsuite(:)
101 | 
102 |     testsuite = [ &
103 |         new_unittest('fast_sum', test_fast_sum) , &
104 |         new_unittest('fast_dotp', test_fast_dotproduct) , &
105 |         new_unittest('fast_trig', test_fast_trigonometry) , &
106 |         new_unittest('fast_hyper', test_fast_hyperbolic ) , & 
107 |         new_unittest('fast_rsqrt', test_fast_rsqrt ) & !! The Quake III rsqrt implementation here is not realy much faster that what compilers can do.
108 |     ]
109 | end subroutine
110 | 
subroutine test_fast_sum(error)
    !> Benchmarks and checks the intrinsic sum against fsum_kahan and fsum, with and
    !> without a mask, in single and double precision.
    !>
    !> The input holds n values that add up analytically to pi, so the relative
    !> error of each summation is |1 - s/pi|, averaged over niter shuffles.
    !> The test fails when any averaged error reaches 100*epsilon.

    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    !> Internal parameters and variables
    integer, parameter :: n = 100000, ncalc = 3, niter = 1000
    integer :: iter, i
    real(dp) :: times(0:ncalc), times_tot(ncalc)
    !====================================================================================
    plain_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = epsilon(1._wp)*100
        real(wp), allocatable :: x(:)
        real(wp) :: xsum(ncalc), err(ncalc)

        allocate(x(n))
        do i = 1, n
            x(i) = 8*atan(1._wp)*(real(i,kind=wp)-0.5_wp)/real(n,kind=wp)**2
        end do

        times_tot(:) = 0
        err(:) = 0._wp
        do iter=1,niter
            call scramble(x)
            times(0) = timer()
            xsum(1) = sum(x)       ; times(1) = timer()
            xsum(2) = fsum_kahan(x); times(2) = timer()
            xsum(3) = fsum(x)      ; times(3) = timer()
            times_tot(:) = times_tot(:) + times(1:ncalc) - times(0:ncalc-1)
            err(1:ncalc) = err(1:ncalc) + abs(1._wp-xsum(1:ncalc)/(4*atan(1._wp)))
        end do
        err(1:ncalc) = err(1:ncalc) / niter

        call report("sum r32", real(err, dp))

        call check(error, all(err(:)<tolerance) )
        if (allocated(error)) return
    end block plain_sp

    plain_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = epsilon(1._wp)*100
        real(wp), allocatable :: x(:)
        real(wp) :: xsum(ncalc), err(ncalc)

        allocate(x(n))
        do i = 1, n
            x(i) = 8*atan(1._wp)*(real(i,kind=wp)-0.5_wp)/real(n,kind=wp)**2
        end do

        times_tot(:) = 0
        err(:) = 0._wp
        do iter=1,niter
            call scramble(x)
            times(0) = timer()
            xsum(1) = sum(x)       ; times(1) = timer()
            xsum(2) = fsum_kahan(x); times(2) = timer()
            xsum(3) = fsum(x)      ; times(3) = timer()
            times_tot(:) = times_tot(:) + times(1:ncalc) - times(0:ncalc-1)
            err(1:ncalc) = err(1:ncalc) + abs(1._wp-xsum(1:ncalc)/(4*atan(1._wp)))
        end do
        err(1:ncalc) = err(1:ncalc) / niter

        call report("sum r64", err)

        call check(error, all(err(:)<tolerance) )
        if (allocated(error)) return
    end block plain_dp

    masked_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = epsilon(1._wp)*100
        real(wp), allocatable :: x(:)
        logical, allocatable :: mask(:), nmask(:)
        real(wp) :: xsum(ncalc), err(ncalc)

        allocate(x(n))
        do i = 1, n
            x(i) = 8*atan(1._wp)*(real(i,kind=wp)-0.5_wp)/real(n,kind=wp)**2
        end do
        ! every other element selected; nmask is the complement
        allocate(mask(n),source=.false.); mask(1:n:2) = .true.
        allocate(nmask(n))

        times_tot(:) = 0
        err(:) = 0._wp
        do iter=1,niter
            call scramble_l(x,mask); nmask(:) = .not.mask(:)
            times(0) = timer()
            ! masked part + complementary part must again add up to pi
            xsum(1) = sum(x,mask)        + sum(x,nmask)       ; times(1) = timer()
            xsum(2) = fsum_kahan(x,mask) + fsum_kahan(x,nmask); times(2) = timer()
            xsum(3) = fsum(x,mask)       + fsum(x,nmask)      ; times(3) = timer()
            times_tot(:) = times_tot(:) + times(1:ncalc) - times(0:ncalc-1)
            err(1:ncalc) = err(1:ncalc) + abs(1._wp-(xsum(1:ncalc)/(4*atan(1._wp))))
        end do
        times_tot(:) = times_tot(:) / 2   ! two reductions per timed statement
        err(1:ncalc) = err(1:ncalc) / niter

        call report("sum r32 mask", real(err, dp))

        call check(error, all(err(:)<tolerance) )
        if (allocated(error)) return
    end block masked_sp

    masked_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = epsilon(1._wp)*100
        real(wp), allocatable :: x(:)
        logical, allocatable :: mask(:), nmask(:)
        real(wp) :: xsum(ncalc), err(ncalc)

        allocate(x(n))
        do i = 1, n
            x(i) = 8*atan(1._wp)*(real(i,kind=wp)-0.5_wp)/real(n,kind=wp)**2
        end do
        ! every other element selected; nmask is the complement
        allocate(mask(n),source=.false.); mask(1:n:2) = .true.
        allocate(nmask(n))

        times_tot(:) = 0
        err(:) = 0._wp
        do iter=1,niter
            call scramble_l(x,mask); nmask(:) = .not.mask(:)
            times(0) = timer()
            ! masked part + complementary part must again add up to pi
            xsum(1) = sum(x,mask)        + sum(x,nmask)       ; times(1) = timer()
            xsum(2) = fsum_kahan(x,mask) + fsum_kahan(x,nmask); times(2) = timer()
            xsum(3) = fsum(x,mask)       + fsum(x,nmask)      ; times(3) = timer()
            times_tot(:) = times_tot(:) + times(1:ncalc) - times(0:ncalc-1)
            err(1:ncalc) = err(1:ncalc) + abs(1._wp-(xsum(1:ncalc)/(4*atan(1._wp))))
        end do
        times_tot(:) = times_tot(:) / 2   ! two reductions per timed statement
        err(1:ncalc) = err(1:ncalc) / niter

        call report("sum r64 mask", err)

        call check(error, all(err(:)<tolerance) )
        if (allocated(error)) return
    end block masked_dp

contains

    !> Prints one result table (only when verbose) from the accumulated times_tot
    !> and the averaged relative errors handed in by the caller.
    subroutine report(title, rel_err)
        character(len=*), intent(in) :: title
        real(dp), intent(in) :: rel_err(ncalc)
        character(len=9), parameter :: label(ncalc) = ["intrinsic", "    kahan", "    chunk"]
        integer :: k

        if (.not. verbose) return
        print *,""
        write(*,fmt1) title
        write(*,fmt2)
        do k = 1, ncalc
            ! time per element in ns, speed-up relative to the intrinsic, error
            write(*,fmt3) label(k) , 1e9*times_tot(k)/(niter*n) , times_tot(1)/times_tot(k), rel_err(k)
        end do
    end subroutine report

end subroutine
287 | 
subroutine test_fast_dotproduct(error)
    !> Benchmarks and checks the intrinsic dot_product against fprod_kahan and fprod
    !> in single and double precision.
    !>
    !> The input is the element-wise square root of n values that add up
    !> analytically to pi, so dot(x,x) should equal pi; the relative error
    !> |1 - s/pi| is averaged over niter shuffles and must stay below 100*epsilon.

    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    !> Internal parameters and variables
    integer, parameter :: n = 100000, ncalc = 3, niter = 1000
    integer :: iter, i
    real(dp) :: times(0:ncalc), times_tot(ncalc)
    !====================================================================================
    dot_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = epsilon(1._wp)*100
        real(wp), allocatable :: x(:)
        real(wp) :: xsum(ncalc), err(ncalc)

        allocate(x(n))
        do i = 1, n
            x(i) = 8*atan(1._wp)*(real(i,kind=wp)-0.5_wp)/real(n,kind=wp)**2
        end do
        x(:) = sqrt( x(:) )

        times_tot(:) = 0
        err(:) = 0._wp
        do iter=1,niter
            call scramble(x)
            times(0) = timer()
            xsum(1) = dot_product(x,x) ; times(1) = timer()
            xsum(2) = fprod_kahan(x,x) ; times(2) = timer()
            xsum(3) = fprod(x,x)       ; times(3) = timer()
            times_tot(:) = times_tot(:) + times(1:ncalc) - times(0:ncalc-1)
            err(1:ncalc) = err(1:ncalc) + abs(1._wp-xsum(1:ncalc)/(4*atan(1._wp)))
        end do
        err(1:ncalc) = err(1:ncalc) / niter

        call report("dot r32", real(err, dp))

        call check(error, all(err(:)<tolerance) )
        if (allocated(error)) return
    end block dot_sp

    dot_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = epsilon(1._wp)*100
        real(wp), allocatable :: x(:)
        real(wp) :: xsum(ncalc), err(ncalc)

        allocate(x(n))
        do i = 1, n
            x(i) = 8*atan(1._wp)*(real(i,kind=wp)-0.5_wp)/real(n,kind=wp)**2
        end do
        x(:) = sqrt( x(:) )

        times_tot(:) = 0
        err(:) = 0._wp
        do iter=1,niter
            call scramble(x)
            times(0) = timer()
            xsum(1) = dot_product(x,x) ; times(1) = timer()
            xsum(2) = fprod_kahan(x,x) ; times(2) = timer()
            xsum(3) = fprod(x,x)       ; times(3) = timer()
            times_tot(:) = times_tot(:) + times(1:ncalc) - times(0:ncalc-1)
            err(1:ncalc) = err(1:ncalc) + abs(1._wp-xsum(1:ncalc)/(4*atan(1._wp)))
        end do
        err(1:ncalc) = err(1:ncalc) / niter

        call report("dot r64", err)

        call check(error, all(err(:)<tolerance) )
        if (allocated(error)) return
    end block dot_dp

contains

    !> Prints one result table (only when verbose) from the accumulated times_tot
    !> and the averaged relative errors handed in by the caller.
    subroutine report(title, rel_err)
        character(len=*), intent(in) :: title
        real(dp), intent(in) :: rel_err(ncalc)
        character(len=9), parameter :: label(ncalc) = ["intrinsic", "    kahan", "    chunk"]
        integer :: k

        if (.not. verbose) return
        print *,""
        write(*,fmt1) title
        write(*,fmt2)
        do k = 1, ncalc
            ! time per element in ns, speed-up relative to the intrinsic, error
            write(*,fmt3) label(k) , 1e9*times_tot(k)/(niter*n) , times_tot(1)/times_tot(k), rel_err(k)
        end do
    end subroutine report

end subroutine
378 | 
subroutine test_fast_trigonometry(error)
    !> Times fsin, facos and fatan against the sin, acos and atan intrinsics on a
    !> uniform grid, in single and double precision, and checks an aggregate
    !> deviation against a per-function tolerance.
    !>
    !> NOTE(review): the deviation is sqrt( sum(d)**2 / n ), i.e. the *sum* of the
    !> signed differences is squared, not each difference, so errors of opposite
    !> sign cancel. A root-mean-square would be sqrt( sum(d**2) / n ). The
    !> tolerances below appear to be tuned to the current expression - confirm
    !> before changing either.

    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    !> Internal parameters and variables
    integer, parameter :: n = 500000, ncalc = 2, niter = 500
    integer :: i, iter
    real(dp) :: tick(0:ncalc), elapsed(ncalc)
    !====================================================================================
    if(verbose)then
        print *,""
        write(*,fmt1) "trigo"
        write(*,fmt2)
    end if

    sin_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = epsilon(1._wp)*500
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-pi,pi]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*acos(-1.0_wp)
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = sin(grid); tick(1) = timer()
            fast = fsin(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("fsin r32", real(dev, dp))
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block sin_sp

    sin_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = epsilon(1._wp)*500
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-pi,pi]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*acos(-1.0_wp)
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = sin(grid); tick(1) = timer()
            fast = fsin(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("fsin r64", dev)
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block sin_dp

    acos_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = 1e-4_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-1,1]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = (real(i,kind=wp) / n - 0.5_wp)*2
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = acos(grid); tick(1) = timer()
            fast = facos(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("facos r32", real(dev, dp))
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block acos_sp

    acos_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = 1e-4_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-1,1]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = (real(i,kind=wp) / n - 0.5_wp)*2
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = acos(grid); tick(1) = timer()
            fast = facos(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("facos r64", dev)
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block acos_dp

    atan_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = 2e-5_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-3,3]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*3._wp
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = atan(grid); tick(1) = timer()
            fast = fatan(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("fatan r32", real(dev, dp))
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block atan_sp

    atan_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = 2e-5_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-3,3]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*3._wp
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = atan(grid); tick(1) = timer()
            fast = fatan(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("fatan r64", dev)
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block atan_dp

contains

    !> Prints one table row (only when verbose): time per element of the fast
    !> routine in ns, speed-up over the intrinsic, and the aggregate deviation.
    subroutine report(label, deviation)
        character(len=*), intent(in) :: label
        real(dp), intent(in) :: deviation
        if(verbose) write(*,fmt3) label , 1e9*elapsed(2)/(niter*n) , elapsed(1)/elapsed(2), deviation
    end subroutine report

end subroutine
553 | 
subroutine test_fast_hyperbolic(error)
    !> Times ftanh and ferf against the tanh and erf intrinsics on a uniform grid
    !> over (-3,3], in single and double precision, and checks an aggregate
    !> deviation against a per-function tolerance.
    !>
    !> NOTE(review): the deviation is sqrt( sum(d)**2 / n ), i.e. the *sum* of the
    !> signed differences is squared, not each difference, so errors of opposite
    !> sign cancel. A root-mean-square would be sqrt( sum(d**2) / n ). The
    !> tolerances below appear to be tuned to the current expression - confirm
    !> before changing either.

    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    !> Internal parameters and variables
    integer, parameter :: n = 500000, ncalc = 2, niter = 500
    integer :: i, iter
    real(dp) :: tick(0:ncalc), elapsed(ncalc)
    !====================================================================================
    if(verbose)then
        print *,""
        write(*,fmt1) "hyperb"
        write(*,fmt2)
    end if

    tanh_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = 1e-5_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-3,3]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*3._wp
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = tanh(grid); tick(1) = timer()
            fast = ftanh(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("ftanh r32", real(dev, dp))
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block tanh_sp

    tanh_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = 1e-5_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-3,3]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*3._wp
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = tanh(grid); tick(1) = timer()
            fast = ftanh(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("ftanh r64", dev)
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block tanh_dp

    erf_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = 1e-2_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-3,3]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*3._wp
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = erf(grid); tick(1) = timer()
            fast = ferf(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("ferf r32", real(dev, dp))
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block erf_sp

    erf_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = 1e-2_wp
        real(wp), allocatable :: grid(:) , fast(:), exact(:)
        real(wp) :: dev
        !> uniform grid on (-3,3]
        allocate( grid(n) , fast(n), exact(n) )
        do i = 1, n
            grid(i) = 2*(real(i,kind=wp) / n - 0.5_wp)*3._wp
        end do

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = erf(grid); tick(1) = timer()
            fast = ferf(grid) ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / n )
        end do
        dev = dev / niter

        call report("ferf r64", dev)
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block erf_dp

contains

    !> Prints one table row (only when verbose): time per element of the fast
    !> routine in ns, speed-up over the intrinsic, and the aggregate deviation.
    subroutine report(label, deviation)
        character(len=*), intent(in) :: label
        real(dp), intent(in) :: deviation
        if(verbose) write(*,fmt3) label , 1e9*elapsed(2)/(niter*n) , elapsed(1)/elapsed(2), deviation
    end subroutine report

end subroutine
675 | 
subroutine test_fast_rsqrt(error)
    !> Times frsqrt against 1/sqrt(x) on log-uniformly distributed random inputs,
    !> in single and double precision, and checks an aggregate relative deviation
    !> against a tolerance of 1e-2.
    !>
    !> NOTE(review): the deviation is sqrt( sum(d)**2 / sum(ref)**2 ), i.e. the
    !> ratio |sum(d)| / |sum(ref)| of the summed signed differences to the summed
    !> reference values, not an element-wise relative error - confirm this is
    !> the intended metric.

    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    !> Internal parameters and variables
    integer, parameter :: n = 500000, ncalc = 2, niter = 500
    integer :: iter, i
    real(dp) :: tick(0:ncalc), elapsed(ncalc)
    !====================================================================================
    if(verbose)then
        print *,""
        write(*,fmt1) "rsqrt"
        write(*,fmt2)
    end if

    rsqrt_sp: block
        integer, parameter :: wp=sp
        real(wp), parameter :: tolerance = 1e-2_wp
        real(wp), allocatable :: sample(:) , fast(:), exact(:)
        real(wp) :: dev
        !> random samples with decimal exponents spread over [-10,10)
        allocate( sample(n) , fast(n), exact(n) )
        call random_number(sample)
        sample = 10._wp**(-10*(1._wp-sample) + 10*sample)

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = 1._wp/sqrt(sample); tick(1) = timer()
            fast = frsqrt(sample)       ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / sum( exact )**2 )
        end do
        dev = dev / niter

        call report("frsqrt r32", real(dev, dp))
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block rsqrt_sp

    rsqrt_dp: block
        integer, parameter :: wp=dp
        real(wp), parameter :: tolerance = 1e-2_wp
        real(wp), allocatable :: sample(:) , fast(:), exact(:)
        real(wp) :: dev
        !> random samples with decimal exponents spread over [-200,200)
        allocate( sample(n) , fast(n), exact(n) )
        call random_number(sample)
        sample = 10._wp**(-200*(1._wp-sample) + 200*sample)

        elapsed(:) = 0
        dev = 0._wp
        do iter=1,niter
            tick(0) = timer()
            exact = 1._wp/sqrt(sample); tick(1) = timer()
            fast = frsqrt(sample)       ; tick(2) = timer()
            elapsed(:) = elapsed(:) + tick(1:ncalc) - tick(0:ncalc-1)
            dev = dev + sqrt( sum( fast - exact )**2 / sum( exact )**2 )
        end do
        dev = dev / niter

        call report("frsqrt r64", dev)
        call check(error, dev < tolerance )
        if (allocated(error)) return
    end block rsqrt_dp

contains

    !> Prints one table row (only when verbose): time per element of the fast
    !> routine in ns, speed-up over the reference, and the aggregate deviation.
    subroutine report(label, deviation)
        character(len=*), intent(in) :: label
        real(dp), intent(in) :: deviation
        if(verbose) write(*,fmt3) label , 1e9*elapsed(2)/(niter*n) , elapsed(1)/elapsed(2), deviation
    end subroutine report

end subroutine
743 |     
744 | end module test_fast_math
745 | 
program tester
    !> Test driver: runs every registered suite, reporting on error_unit, and
    !> terminates with `error stop` when at least one test failed.
    use, intrinsic :: iso_fortran_env, only : error_unit
    use testdrive, only : run_testsuite, new_testsuite, testsuite_type
    use test_fast_math, only : collect_suite
    implicit none
    character(len=*), parameter :: fmt = '("#", *(1x, a))'
    type(testsuite_type), allocatable :: testsuites(:)
    integer :: nfailed, isuite

    ! failure counter, accumulated by run_testsuite across all suites
    nfailed = 0

    allocate(testsuites(1))
    testsuites(1) = new_testsuite("fast_math", collect_suite)

    do isuite = 1, size(testsuites)
      write(error_unit, fmt) "Testing:", testsuites(isuite)%name
      call run_testsuite(testsuites(isuite)%collect, error_unit, nfailed)
    end do

    if (nfailed > 0) then
      write(error_unit, '(i0, 1x, a)') nfailed, "test(s) failed!"
      error stop
    end if

end program tester
772 | 


--------------------------------------------------------------------------------