├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── ford.yml ├── fpm.rsp ├── fpm.toml ├── src ├── fast_dotp.f90 ├── fast_erf.f90 ├── fast_log.f90 ├── fast_math.f90 ├── fast_rsqrt.f90 ├── fast_sum.f90 ├── fast_tanh.f90 ├── fast_trigo.f90 └── utilities │ ├── nvidia_shift.inc │ ├── nvidia_shift_interface.inc │ ├── vkahans.inc │ └── vkahans_m.inc └── test └── test_fast_math.f90 /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | os: [ubuntu-latest, macos-latest, windows-latest] 12 | toolchain: 13 | - {compiler: gcc, version: 13, flags: ['-O3 -march=native -mtune=native -flto']} 14 | - {compiler: intel, version: '2025.0', flags: ['-O3 -xhost']} 15 | - {compiler: intel-classic, version: '2021.10', flags: ['-O3 -xhost']} 16 | - {compiler: nvidia-hpc, version: '25.1', flags: ['-Mpreprocess -Ofast']} 17 | include: 18 | - os: ubuntu-latest 19 | toolchain: {compiler: gcc, version: 12, flags: ['-O3 -march=native -mtune=native -flto']} 20 | exclude: 21 | - os: macos-latest 22 | toolchain: {compiler: intel, version: '2025.0'} 23 | - os: macos-latest 24 | toolchain: {compiler: nvidia-hpc, version: '25.1'} 25 | - os: windows-latest 26 | toolchain: {compiler: nvidia-hpc, version: '25.1'} 27 | 28 | steps: 29 | - name: Checkout code 30 | uses: actions/checkout@v1 31 | 32 | - uses: fortran-lang/setup-fortran@main 33 | id: setup-fortran 34 | with: 35 | compiler: ${{ matrix.toolchain.compiler }} 36 | version: ${{ matrix.toolchain.version }} 37 | 38 | - name: Setup Fortran Package Manager 39 | uses: fortran-lang/setup-fpm@v5 40 | with: 41 | github-token: ${{ secrets.GITHUB_TOKEN }} 42 | 43 | - name: Setup Fortran on MacOS 44 | if: contains( matrix.os, 'macos') 45 | run: | 46 | brew reinstall gcc@13 47 | ln -s /usr/local/lib/gcc/13/libgfortran.5.dylib /usr/local/lib/ 48 | ln -s /usr/local/lib/gcc/13/libquadmath.0.dylib /usr/local/lib/ 49 | 50 | - run: | 51 | fpm test --compiler ${{ env.FC }} --c-compiler ${{ env.CC }} --cxx-compiler ${{ env.CXX }} --flag "${{ join(matrix.toolchain.flags, ' ') }}" 52 | # ${{ env.FC }} ... # environment vars FC, CC, and CXX are set 53 | # ${{ steps.setup-fortran.outputs.fc }} ... # outputs work too 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | doc -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | This project is FOSS (free and open source software), therefore, anyone interested to use it or to contribute is welcome. 4 | 5 | * If you have a nice fast function that can be faster than an intrinsic within an "acceptable" tolerance. 6 | * If you have an idea to make the code/documentation better looking and easier to read (we are striving for the [Fortran stdlib style guide](https://github.com/fortran-lang/stdlib/blob/HEAD/STYLE_GUIDE.md) but not there yet) 7 | * If you have an idea to make the unit tests even more robust 8 | 9 | Please go ahead and open a discussion or create a PR. 10 | 11 | # Basic rules: 12 | * Remember, this is an open-source project! So be gentle and patient (we know, it is all about fast functions, but for the sake of our intellectual curiosity and helping our work). 13 | * Be polite, especially when disagreeing! 14 | * Enjoy number crunching coding with Modern Fortran :) -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | From Transvalor S.A. 4 | Copyright (c) 2023-present José R. Alves Z. 5 | 6 | From Federico Perini 7 | Copyright (c) 2016-2022 Federico Perini 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/681533852.svg)](https://zenodo.org/badge/latestdoi/681533852) 2 | # Fortran Fast math 3 | A collection of functions for fast number crunching using Fortran. 4 | 5 | In order to get the maximum performance of this library, compile with "-O3 -march=native -flto" (or equivalent). Note: For the elemental functions, inlinement is key to extract maximum performance. It can be achieved either by use of the `-flto`(gcc)/`-ipo`(intel) flag or using the `include` mechanism. 6 | 7 | # Available functions 8 | 9 | | function | name(s) | shapes | types | 10 | |----------|-----------------------|------------|------------------| 11 | | sum | `fsum` `fsum_kahan`(1) | `1d`|`real32` `real64` | 12 | | dot | `fprod` `fprod_kahan`(2)| `1d`|`real32` `real64` | 13 | | cos | `fcos` | `elemental`|`real32` `real64` | 14 | | sin | `fsin` | `elemental`|`real32` `real64` | 15 | | tan | `ftan` | `elemental`|`real32` `real64` | 16 | | tanh | `ftanh` | `elemental`|`real32` `real64` | 17 | | acos | `facos` | `elemental`|`real32` `real64` | 18 | | atan | `fatan` | `elemental`|`real32` `real64` | 19 | | erf | `ferf` | `elemental`|`real32` `real64` | 20 | | log | `flog_p3` `flog_p5` | `elemental`| `real64` | 21 | | rsqrt(3) | `frsqrt` | `elemental`|`real32` `real64` | 22 | 23 | * (1) fast (and precise) sum for 1D arrays - possibility of including a mask. 24 | `fsum`: fastest method and at worst, same or 1 order of magnitud more precise than the intrinsic sum. It groups chunks of values in a temporal working batch which is summed up once at the end. 25 | `fsum_kahan`: Highest precision. It has a precission close to a quadratic sum (for real32 summing with real64, and fo real64 summing with real128). It also uses the chunks principle with an elemental kahan operator applied on top. 26 | 27 | * (2) fast (and precise) dot product for 1D arrays - possibility of including a 3rd weighting array. 28 | `fprod`: fastest method and at worst, 1 order of magnitud more precise than the intrinsic dot_product. runtime can vary between 3X and 8X the intrinsic. It groups chunks of products in a temporal working batch which is summed up once at the end (based on `fsum`). 29 | `fprod_kahan`: Same idea as `fsum_kahan` but on top of chunked products. 30 | * (3) rsqrt: reciprocal square root $f(x)=1/sqrt(x)$ 31 | # API documentation 32 | 33 | To generate the API documentation for `fast_math` using 34 | [ford](https://github.com/Fortran-FOSS-Programmers/ford) run the following 35 | command: 36 | 37 | ```shell 38 | ford ford.yml 39 | ``` 40 | 41 | # TODO 42 | * Contribution guidelines 43 | * Polish autodoc 44 | 45 | # Elapsed time examples and precision 46 | Warning: The following values are just references as to see how different can they be between different compilers. Actual speed-ups(downs) should be measured under the true use conditions to account for (lack-off) inlinement, etc etc. 47 |
48 | (Click to unfold) Windows gfortran 14.1 > fpm test --flag "-O3 -march=native -mtune=native" 49 | CPU: Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz 1.99 GHz 50 | 51 | | sum r32 |
108 | 109 |
110 | (Click to unfold) Windows ifx 2025.0.4 > fpm test --flag "/O3 /Qxhost" 111 | CPU: Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz 1.99 GHz 112 | 113 | | sum r32 |
170 | 171 |
172 | (Click to unfold) WSL2 nvfortran 24.3 > fpm test --flag "-Mpreprocess -fast" 173 | CPU: Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz 1.99 GHz 174 | 175 | | sum r32 |
232 | 233 | # Acknowledgement 234 | 235 | * Compilation of this library was possible thanks to [Transvalor S.A.](https://www.transvalor.com/en/homepage) research activities. 236 | * Part of this library is based on the work of [Perini and Reitz](https://doi.org/10.1016/j.combustflame.2018.04.013), that was funded through the Sandia National Laboratories by the U.S. Department of Energy, Office of Vehicle Technologies, program managers Leo Breton, Gupreet Singh. 237 | * The [fortran lang community](https://fortran-lang.discourse.group/) discussions such as [Some Intrinsic SUMS](https://fortran-lang.discourse.group/t/some-intrinsic-sums/5760) and [fastGPT](https://fortran-lang.discourse.group/t/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/5385) 238 | 239 | Contribution of open-source developers: 240 | 241 | [jalvesz](https://github.com/jalvesz) 242 | 243 | [perazz](https://github.com/perazz) 244 | -------------------------------------------------------------------------------- /ford.yml: -------------------------------------------------------------------------------- 1 | project: Fast math collection library 2 | summary: Fast sum, dot products, trigonometric functions and more 3 | author: José Alves 4 | date: May 15, 2023 5 | preprocess: False 6 | incl_src: False 7 | include: ./src 8 | src_dir: ./src 9 | output_dir: doc 10 | 11 | {!README.md!} -------------------------------------------------------------------------------- /fpm.rsp: -------------------------------------------------------------------------------- 1 | @testgfortran 2 | option test --compiler gfortran --flag "-O3 -march=native -mtune=native" 3 | 4 | @wtestifort 5 | option test --compiler ifort --flag "/O3 /Qxhost" 6 | 7 | @ltestifort 8 | option test --compiler ifort --flag "-O3 -xhost" 9 | 10 | @wtestifx 11 | option test --compiler ifx --flag "/O3 /Qxhost" 12 | 13 | @ltestifx 14 | option test --compiler ifx --flag "-O3 -xhost" 15 | 16 | @testnvfortran 17 | option test --compiler nvfortran --flag "-Mpreprocess -fast" -------------------------------------------------------------------------------- /fpm.toml: -------------------------------------------------------------------------------- 1 | name = "fast_math" 2 | version = "0.1.0" 3 | license = "license" 4 | author = "Jose Alves" 5 | maintainer = "jose.alves@transvalor.com" 6 | copyright = "Copyright 2023, Jose Alves" 7 | 8 | [build] 9 | auto-executables = true 10 | auto-tests = true 11 | auto-examples = true 12 | module-naming = false 13 | 14 | [install] 15 | library = false 16 | 17 | [fortran] 18 | implicit-typing = false 19 | implicit-external = false 20 | source-form = "free" 21 | 22 | [preprocess] 23 | [preprocess.cpp] 24 | suffixes = [".f90"] 25 | 26 | [dev-dependencies] 27 | test-drive.git = "https://github.com/fortran-lang/test-drive.git" 28 | test-drive.tag = "v0.4.0" -------------------------------------------------------------------------------- /src/fast_dotp.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A. 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | module fast_dotp 7 | !! A faster and more accurate implementation of the dot_product intrinsic. 8 | !! It uses the same principle as fsum_chunk but considering local multiplications that can be vectorized for faster summation. 9 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 10 | implicit none 11 | private 12 | 13 | public :: fprod, fprod_kahan 14 | integer, parameter :: chunk64 = 64 15 | integer, parameter :: chunk32 = 64 16 | 17 | interface fprod 18 | module procedure fprod_sp 19 | module procedure fprod_sp_weighted 20 | module procedure fprod_dp 21 | module procedure fprod_dp_weighted 22 | end interface 23 | 24 | interface fprod_kahan 25 | module procedure fprod_kahan_sp 26 | module procedure fprod_kahan_sp_weighted 27 | module procedure fprod_kahan_dp 28 | module procedure fprod_kahan_dp_weighted 29 | end interface 30 | 31 | interface vkahans 32 | module procedure vkahans_sp 33 | module procedure vkahans_dp 34 | end interface 35 | 36 | contains 37 | 38 | pure function fprod_sp(a,b) result(p) 39 | integer, parameter :: wp = sp 40 | integer, parameter :: chunk = chunk32 41 | real(wp), intent(in) :: a(:) 42 | real(wp), intent(in) :: b(:) 43 | real(wp) :: p 44 | ! -- 45 | real(wp) :: abatch(chunk) 46 | integer :: i, n, r 47 | ! ----------------------------- 48 | n = size(a) 49 | r = mod(n,chunk) 50 | 51 | abatch(1:r) = a(1:r)*b(1:r) 52 | abatch(r+1:chunk) = 0._wp 53 | do i = r+1, n-r, chunk 54 | abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1)*b(i:i+chunk-1) 55 | end do 56 | 57 | p = 0.0_wp 58 | do i = 1, chunk/2 59 | p = p + abatch(i)+abatch(chunk/2+i) 60 | end do 61 | end function 62 | 63 | pure function fprod_dp(a,b) result(p) 64 | integer, parameter :: wp = dp 65 | integer, parameter :: chunk = chunk64 66 | real(wp), intent(in) :: a(:) 67 | real(wp), intent(in) :: b(:) 68 | real(wp) :: p 69 | ! -- 70 | real(wp) :: abatch(chunk) 71 | integer :: i, n, r 72 | ! ----------------------------- 73 | n = size(a) 74 | r = mod(n,chunk) 75 | 76 | abatch(1:r) = a(1:r)*b(1:r) 77 | abatch(r+1:chunk) = 0._wp 78 | do i = r+1, n-r, chunk 79 | abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1)*b(i:i+chunk-1) 80 | end do 81 | 82 | p = 0.0_wp 83 | do i = 1, chunk/2 84 | p = p + abatch(i)+abatch(chunk/2+i) 85 | end do 86 | end function 87 | 88 | pure function fprod_sp_weighted(a,b,w) result(p) 89 | integer, parameter :: wp = sp 90 | integer, parameter :: chunk = chunk32 91 | real(wp), intent(in) :: a(:) 92 | real(wp), intent(in) :: b(:) 93 | real(wp), intent(in) :: w(:) 94 | real(wp) :: p 95 | ! -- 96 | real(wp) :: abatch(chunk) 97 | integer :: i, n, r 98 | ! ----------------------------- 99 | n = size(a) 100 | r = mod(n,chunk) 101 | 102 | abatch(1:r) = a(1:r)*b(1:r)*w(1:r) 103 | abatch(r+1:chunk) = 0._wp 104 | do i = r+1, n-r, chunk 105 | abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1)*b(i:i+chunk-1)*w(i:i+chunk-1) 106 | end do 107 | 108 | p = 0.0_wp 109 | do i = 1, chunk/2 110 | p = p + abatch(i)+abatch(chunk/2+i) 111 | end do 112 | end function 113 | 114 | pure function fprod_dp_weighted(a,b,w) result(p) 115 | integer, parameter :: wp = dp 116 | integer, parameter :: chunk = chunk64 117 | real(wp), intent(in) :: a(:) 118 | real(wp), intent(in) :: b(:) 119 | real(wp), intent(in) :: w(:) 120 | real(wp) :: p 121 | ! -- 122 | real(wp) :: abatch(chunk) 123 | integer :: i, n, r 124 | ! ----------------------------- 125 | n = size(a) 126 | r = mod(n,chunk) 127 | 128 | abatch(1:r) = a(1:r)*b(1:r)*w(1:r) 129 | abatch(r+1:chunk) = 0._wp 130 | do i = r+1, n-r, chunk 131 | abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1)*b(i:i+chunk-1)*w(i:i+chunk-1) 132 | end do 133 | 134 | p = 0.0_wp 135 | do i = 1, chunk/2 136 | p = p + abatch(i)+abatch(chunk/2+i) 137 | end do 138 | end function 139 | 140 | pure function fprod_kahan_sp(a,b) result(p) 141 | integer, parameter :: wp = sp 142 | integer, parameter :: chunk = chunk32 143 | real(wp), intent(in) :: a(:) 144 | real(wp), intent(in) :: b(:) 145 | real(wp) :: p 146 | ! -- 147 | real(wp) :: sbatch(chunk) 148 | real(wp) :: cbatch(chunk) 149 | integer :: i, n, r 150 | ! ----------------------------- 151 | n = size(a) 152 | r = mod(n,chunk) 153 | 154 | sbatch(1:r) = a(1:r) * b(1:r) 155 | sbatch(r+1:chunk) = 0.0_wp 156 | cbatch = 0.0_wp 157 | do i = r+1, n-r, chunk 158 | call vkahans( a(i:i+chunk-1) * b(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk) ) 159 | end do 160 | 161 | p = 0.0_wp 162 | do i = 1,chunk 163 | call vkahans( sbatch(i) , p , cbatch(i) ) 164 | end do 165 | end function 166 | 167 | pure function fprod_kahan_dp(a,b) result(p) 168 | integer, parameter :: wp = dp 169 | integer, parameter :: chunk = chunk64 170 | real(wp), intent(in) :: a(:) 171 | real(wp), intent(in) :: b(:) 172 | real(wp) :: p 173 | ! -- 174 | real(wp) :: sbatch(chunk) 175 | real(wp) :: cbatch(chunk) 176 | integer :: i, n, r 177 | ! ----------------------------- 178 | n = size(a) 179 | r = mod(n,chunk) 180 | 181 | sbatch(1:r) = a(1:r) * b(1:r) 182 | sbatch(r+1:chunk) = 0.0_wp 183 | cbatch = 0.0_wp 184 | do i = r+1, n-r, chunk 185 | call vkahans( a(i:i+chunk-1) * b(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk) ) 186 | end do 187 | 188 | p = 0.0_wp 189 | do i = 1,chunk 190 | call vkahans( sbatch(i) , p , cbatch(i) ) 191 | end do 192 | end function 193 | 194 | pure function fprod_kahan_sp_weighted(a,b,w) result(p) 195 | integer, parameter :: wp = sp 196 | integer, parameter :: chunk = chunk32 197 | real(wp), intent(in) :: a(:) 198 | real(wp), intent(in) :: b(:) 199 | real(wp), intent(in) :: w(:) 200 | real(wp) :: p 201 | ! -- 202 | real(wp) :: sbatch(chunk) 203 | real(wp) :: cbatch(chunk) 204 | integer :: i, n, r 205 | ! ----------------------------- 206 | n = size(a) 207 | r = mod(n,chunk) 208 | 209 | sbatch(1:r) = a(1:r) * b(1:r) * w(1:r) 210 | sbatch(r+1:chunk) = 0.0_wp 211 | cbatch = 0.0_wp 212 | do i = r+1, n-r, chunk 213 | call vkahans( a(i:i+chunk-1) * b(i:i+chunk-1) * w(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk) ) 214 | end do 215 | 216 | p = 0.0_wp 217 | do i = 1,chunk 218 | call vkahans( sbatch(i) , p , cbatch(i) ) 219 | end do 220 | end function 221 | 222 | pure function fprod_kahan_dp_weighted(a,b,w) result(p) 223 | integer, parameter :: wp = dp 224 | integer, parameter :: chunk = chunk64 225 | real(wp), intent(in) :: a(:) 226 | real(wp), intent(in) :: b(:) 227 | real(wp), intent(in) :: w(:) 228 | real(wp) :: p 229 | ! -- 230 | real(wp) :: sbatch(chunk) 231 | real(wp) :: cbatch(chunk) 232 | integer :: i, n, r 233 | ! ----------------------------- 234 | n = size(a) 235 | r = mod(n,chunk) 236 | 237 | sbatch(1:r) = a(1:r) * b(1:r) * w(1:r) 238 | sbatch(r+1:chunk) = 0.0_wp 239 | cbatch = 0.0_wp 240 | do i = r+1, n-r, chunk 241 | call vkahans( a(i:i+chunk-1) * b(i:i+chunk-1) * w(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk) ) 242 | end do 243 | 244 | p = 0.0_wp 245 | do i = 1,chunk 246 | call vkahans( sbatch(i) , p , cbatch(i) ) 247 | end do 248 | end function 249 | 250 | elemental subroutine vkahans_sp(a,s,c) 251 | integer, parameter :: wp = sp 252 | include 'utilities/vkahans.inc' 253 | end subroutine 254 | 255 | elemental subroutine vkahans_dp(a,s,c) 256 | integer, parameter :: wp = dp 257 | include 'utilities/vkahans.inc' 258 | end subroutine 259 | 260 | end module fast_dotp -------------------------------------------------------------------------------- /src/fast_erf.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A. 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | module fast_erf 7 | !! Source: https://fortran-lang.discourse.group/t/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/5385/31 8 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 9 | implicit none 10 | private 11 | 12 | public :: ferf 13 | 14 | interface ferf 15 | module procedure ferf_sp 16 | module procedure ferf_dp 17 | end interface 18 | 19 | contains 20 | 21 | elemental function ferf_sp( x ) result( y ) 22 | integer, parameter :: wp = sp 23 | real(wp), intent(in) :: x 24 | real(wp) :: y 25 | !-- Internal Variables 26 | real(wp) :: abs_x, sqr_x 27 | !------------------------------------------------- 28 | abs_x = abs(x) 29 | sqr_x = x**2 30 | y = 1._wp - 1._wp / (1._wp+ 0.278393_wp*abs_x + 0.230389_wp*sqr_x + 0.000972_wp*abs_x*sqr_x + 0.078108_wp*sqr_x*sqr_x)**4 31 | y = sign(y,x) 32 | end function 33 | 34 | elemental function ferf_dp( x ) result( y ) 35 | integer, parameter :: wp = dp 36 | real(wp), intent(in) :: x 37 | real(wp) :: y 38 | !-- Internal Variables 39 | real(wp) :: abs_x, sqr_x 40 | !------------------------------------------------- 41 | abs_x = abs(x) 42 | sqr_x = x**2 43 | y = 1._wp - 1._wp / (1._wp+ 0.278393_wp*abs_x + 0.230389_wp*sqr_x + 0.000972_wp*abs_x*sqr_x + 0.078108_wp*sqr_x*sqr_x)**4 44 | y = sign(y,x) 45 | end function 46 | 47 | end module -------------------------------------------------------------------------------- /src/fast_log.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2016-2022 Federico Perini 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | ! *********************************************************************************************** 7 | !> @brief A module to compute FAST logarithm functions, based on Perini and Reitz, "Fast ** 8 | !> approximations of exponential and logarithm functions combined with efficient ** 9 | !> storage/retrieval for combustion kinetics calculations" Comb Flame 194(2018), 37-51. ** 10 | ! *********************************************************************************************** 11 | module fast_log 12 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 13 | implicit none 14 | private 15 | 16 | public :: flog_p3, flog_p5 17 | 18 | interface flog_p3 19 | module procedure flog_p3_dp 20 | end interface 21 | interface flog_p5 22 | module procedure flog_p5_dp 23 | end interface 24 | 25 | #ifdef __NVCOMPILER 26 | include 'utilities/nvidia_shift_interface.inc' 27 | #endif 28 | 29 | contains 30 | 31 | elemental function flog_p3_dp(x) result(y) 32 | integer, parameter :: wp = dp 33 | real(wp), intent(in) :: x 34 | real(wp) :: y 35 | !-- Internal Variables 36 | real(wp) :: xi,xf 37 | integer(wp) :: iwp 38 | integer(wp), parameter :: mantissa_left = 2_wp**52 39 | integer(wp), parameter :: mantissa = -9218868437227405313_wp ! not(shiftl(2047_wp,52)) 40 | integer(wp), parameter :: bias = 1023_wp 41 | integer(wp), parameter :: ishift = mantissa_left*bias 42 | 43 | real(wp), parameter :: log2 = log(2._wp) 44 | real(wp), parameter :: rlog2 = 1._wp/log2 45 | real(wp), parameter :: sqrt2 = sqrt(2._wp) 46 | real(wp), parameter :: s(3)= [rlog2,3.0_wp-2.5_wp*rlog2,1.5_wp*rlog2-2.0_wp] 47 | !------------------------------------------------- 48 | iwp = transfer(x,iwp) 49 | xi = shiftr(iwp,52)-bias 50 | 51 | ! Take mantissa part only 52 | xf = transfer(iand(iwp,mantissa)+ishift,xf)-1._wp 53 | 54 | ! Apply cubic polynomial 55 | xf = xf*(s(1)+xf*(s(2)+xf*s(3))) 56 | 57 | ! Compute log and Change of basis: log_2(x) -> log_e(x) = log2*log_2(x) 58 | y = (xf+xi)*log2 59 | 60 | end function flog_p3_dp 61 | 62 | elemental function flog_p5_dp(x) result(y) 63 | integer, parameter :: wp = dp 64 | real(wp), intent(in) :: x 65 | real(wp) :: y 66 | !-- Internal Variables 67 | real(wp) :: xi,xf 68 | integer(wp) :: iwp 69 | integer(wp), parameter :: mantissa_left = 2_wp**52 70 | integer(wp), parameter :: mantissa = -9218868437227405313_wp ! not(shiftl(2047_wp,52)) 71 | integer(wp), parameter :: bias = 1023_wp 72 | integer(wp), parameter :: ishift = mantissa_left*bias 73 | 74 | real(wp), parameter :: log2 = log(2._wp) 75 | real(wp), parameter :: rlog2 = 1._wp/log2 76 | real(wp), parameter :: sqrt2 = sqrt(2._wp) 77 | real(wp), parameter :: s(5)= [ 1.44269504088896e+0_wp,& 78 | -7.21347520444482e-1_wp,& 79 | 4.42145354110618e-1_wp,& 80 | -2.12375830888126e-1_wp,& 81 | 4.88829563330264e-2_wp] 82 | !------------------------------------------------- 83 | iwp = transfer(x,iwp) 84 | xi = shiftr(iwp,52)-bias 85 | 86 | ! Take mantissa part only 87 | xf = transfer(iand(iwp,mantissa)+ishift,xf)-1._wp 88 | 89 | ! Apply quintic polynomial 90 | xf = xf*(s(1)+xf*(s(2)+xf*(s(3)+xf*(s(4)+xf*s(5))))) 91 | 92 | ! Compute log and Change of basis: log_2(x) -> log_e(x) = log2*log_2(x) 93 | y = (xf+xi)*log2 94 | 95 | end function flog_p5_dp 96 | 97 | #ifdef __NVCOMPILER 98 | include 'utilities/nvidia_shift.inc' 99 | #endif 100 | end module fast_log -------------------------------------------------------------------------------- /src/fast_math.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A. 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | module fast_math 7 | !! User API: All modules can be referenced from this module as entry point 8 | !------------------------- 9 | ! Basics 10 | use fast_sum 11 | use fast_dotp 12 | use fast_rsqrt 13 | !------------------------- 14 | ! logarithmic 15 | use fast_log 16 | !------------------------- 17 | ! Trigonometric 18 | use fast_trigo 19 | !------------------------- 20 | ! Hyperbolic 21 | use fast_tanh 22 | use fast_erf 23 | 24 | end module fast_math 25 | -------------------------------------------------------------------------------- /src/fast_rsqrt.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2016-2022 Federico Perini 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | ! *********************************************************************************************** 7 | !> @brief A FAST reciprocal of a square root, 1/sqrt(x), based on Perini and Reitz, "Fast ** 8 | !> approximations of exponential and logarithm functions combined with efficient ** 9 | !> storage/retrieval for combustion kinetics calculations" Comb Flame 194(2018), 37-51. ** 10 | ! *********************************************************************************************** 11 | module fast_rsqrt 12 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 13 | implicit none 14 | private 15 | 16 | public :: frsqrt 17 | 18 | interface frsqrt 19 | !! Retranscript of the original Quake III Arena, see https://en.wikipedia.org/wiki/Fast_inverse_square_root 20 | !! for pure reference 21 | module procedure frsqrt_dp 22 | module procedure frsqrt_sp 23 | end interface 24 | 25 | #ifdef __NVCOMPILER 26 | include 'utilities/nvidia_shift_interface.inc' 27 | #endif 28 | 29 | contains 30 | 31 | elemental function frsqrt_sp(x) result(y) 32 | integer, parameter :: wp = sp 33 | real(wp), intent(in) :: x 34 | real(wp) :: y 35 | !-- Internal Variables 36 | real(wp) :: x2,y2 37 | integer(wp) :: i 38 | integer(wp), parameter :: magic = int(Z'5f3759df',kind=wp) 39 | !------------------------------------------------- 40 | x2 = 0.5_wp*x 41 | i = transfer(x,i) 42 | i = magic - shiftr(i,1) 43 | y2 = transfer(i,y) 44 | 45 | ! Perform one Newton-Raphson step 46 | y = y2*(1.5_wp-x2*y2*y2) 47 | 48 | end function frsqrt_sp 49 | 50 | elemental function frsqrt_dp(x) result(y) 51 | !! Double precision implementation of the Quake III arena algorithm 52 | !! With an avx2 enabled machine you will have speed-ups compared to the intrinsic 1/srqt(x) 53 | !! even with 2 or 3 Newton-Raphson iterations 54 | !! 1 iter > precision at 1e-3 55 | !! 2 iter > precision at 1e-6 56 | !! 3 iter > precision at 1e-11 57 | integer, parameter :: wp = dp 58 | integer, parameter :: ninter = 1 59 | real(wp), intent(in) :: x 60 | real(wp) :: y 61 | !-- Internal Variables 62 | real(wp) :: x2,y2 63 | integer(wp) :: i 64 | integer(wp), parameter :: magic = 6910469410427058089_wp 65 | !------------------------------------------------- 66 | x2 = 0.5_wp*x 67 | i = transfer(x,i) 68 | i = magic - shiftr(i,1) 69 | y2 = transfer(i,y) 70 | 71 | ! Perform Newton-Raphson steps 72 | do i = 1, ninter 73 | y2 = y2*(1.5_wp-x2*y2*y2) 74 | end do 75 | y = y2 76 | end function frsqrt_dp 77 | 78 | #ifdef __NVCOMPILER 79 | include 'utilities/nvidia_shift.inc' 80 | #endif 81 | end module fast_rsqrt -------------------------------------------------------------------------------- /src/fast_sum.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A. 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | module fast_sum 7 | !! Two fast & accurate sum are proposed for 1D arrays: 8 | !! By default, "fsum" will use the fsum_chunk approach. This method is at worst, one order of magnitud more accurate that "sum" and between 1.5 to 10 times faster 9 | !! A second approach is also proposed, "fsum_pair" which is the most accurate approach. cpu time can vary between x2 times slower or sometimes faster than intrinsic sum. 10 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 11 | implicit none 12 | private 13 | 14 | public :: fsum, fsum_kahan 15 | integer, parameter :: chunk64 = 64 16 | integer, parameter :: chunk32 = 64 17 | 18 | interface fsum 19 | !! Source: to the best of knowledge: Alves J. but heavily inspired by this paper https://epubs.siam.org/doi/10.1137/19M1257780 20 | module procedure fsum_chunk_1d_sp 21 | module procedure fsum_chunk_1d_sp_mask 22 | module procedure fsum_chunk_1d_dp 23 | module procedure fsum_chunk_1d_dp_mask 24 | end interface 25 | 26 | interface fsum_kahan 27 | module procedure fsum_kahan_1d_sp 28 | module procedure fsum_kahan_1d_sp_mask 29 | module procedure fsum_kahan_1d_dp 30 | module procedure fsum_kahan_1d_dp_mask 31 | end interface 32 | 33 | interface vkahans 34 | module procedure vkahans_sp 35 | module procedure vkahans_dp 36 | end interface 37 | interface vkahans_m 38 | module procedure vkahans_m_sp 39 | module procedure vkahans_m_dp 40 | end interface 41 | 42 | contains 43 | 44 | pure function fsum_chunk_1d_sp(a) result(sout) 45 | integer, parameter :: wp = sp 46 | integer, parameter :: chunk = chunk32 47 | real(wp), intent(in) :: a(:) 48 | real(wp) :: sout 49 | ! -- 50 | real(wp) :: abatch(chunk) 51 | integer :: i, n, r 52 | ! ----------------------------- 53 | n = size(a) 54 | r = mod(n,chunk) 55 | 56 | abatch(1:r) = a(1:r) 57 | abatch(r+1:chunk) = 0._wp 58 | do i = r+1, n-r, chunk 59 | abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1) 60 | end do 61 | 62 | sout = 0.0_wp 63 | do i = 1, chunk/2 64 | sout = sout + abatch(i)+abatch(chunk/2+i) 65 | end do 66 | end function 67 | 68 | pure function fsum_chunk_1d_dp(a) result(sout) 69 | integer, parameter :: wp = dp 70 | integer, parameter :: chunk = chunk64 71 | real(wp), intent(in) :: a(:) 72 | real(wp) :: sout 73 | ! -- 74 | real(wp) :: abatch(chunk) 75 | integer :: i, n, r 76 | ! ----------------------------- 77 | n = size(a) 78 | r = mod(n,chunk) 79 | 80 | abatch(1:r) = a(1:r) 81 | abatch(r+1:chunk) = 0._wp 82 | do i = r+1, n-r, chunk 83 | abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1) 84 | end do 85 | 86 | sout = 0.0_wp 87 | do i = 1, chunk/2 88 | sout = sout + abatch(i)+abatch(chunk/2+i) 89 | end do 90 | end function 91 | 92 | pure function fsum_chunk_1d_sp_mask(a,mask) result(sout) 93 | integer, parameter :: wp = sp 94 | integer, parameter :: chunk = chunk32 95 | real(wp), intent(in) :: a(:) 96 | logical, intent(in) :: mask(:) 97 | real(wp) :: sout 98 | ! -- 99 | real(wp) :: abatch(chunk) 100 | integer :: i, n, r 101 | ! ----------------------------- 102 | n = size(a) 103 | r = mod(n,chunk) 104 | 105 | abatch(1:r) = merge( 0.0_wp , a(1:r) , mask(1:r) ) 106 | abatch(r+1:chunk) = 0._wp 107 | do i = r+1, n-r, chunk 108 | abatch(1:chunk) = abatch(1:chunk) + merge( 0.0_wp , a(i:i+chunk-1), mask(i:i+chunk-1) ) 109 | end do 110 | 111 | sout = 0.0_wp 112 | do i = 1, chunk/2 113 | sout = sout + abatch(i)+abatch(chunk/2+i) 114 | end do 115 | end function 116 | 117 | pure function fsum_chunk_1d_dp_mask(a,mask) result(sout) 118 | integer, parameter :: wp = dp 119 | integer, parameter :: chunk = chunk64 120 | real(wp), intent(in) :: a(:) 121 | logical, intent(in) :: mask(:) 122 | real(wp) :: sout 123 | ! -- 124 | real(wp) :: abatch(chunk) 125 | integer :: i, n, r 126 | ! ----------------------------- 127 | n = size(a) 128 | r = mod(n,chunk) 129 | 130 | abatch(1:r) = merge( 0.0_wp , a(1:r) , mask(1:r) ) 131 | abatch(r+1:chunk) = 0._wp 132 | do i = r+1, n-r, chunk 133 | abatch(1:chunk) = abatch(1:chunk) + merge( 0.0_wp , a(i:i+chunk-1), mask(i:i+chunk-1) ) 134 | end do 135 | 136 | sout = 0.0_wp 137 | do i = 1, chunk/2 138 | sout = sout + abatch(i)+abatch(chunk/2+i) 139 | end do 140 | end function 141 | 142 | pure function fsum_kahan_1d_sp(a) result(sout) 143 | integer, parameter :: wp = sp 144 | integer, parameter :: chunk = chunk32 145 | real(wp), intent(in) :: a(:) 146 | real(wp) :: sout 147 | ! -- 148 | real(wp) :: sbatch(chunk) 149 | real(wp) :: cbatch(chunk) 150 | integer :: i, n, r 151 | ! ----------------------------- 152 | n = size(a) 153 | r = mod(n,chunk) 154 | 155 | sbatch(1:r) = a(1:r) 156 | sbatch(r+1:chunk) = 0.0_wp 157 | cbatch = 0.0_wp 158 | do i = r+1, n-r, chunk 159 | call vkahans( a(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk) ) 160 | end do 161 | 162 | sout = 0.0_wp 163 | do i = 1,chunk 164 | call vkahans( sbatch(i) , sout , cbatch(i) ) 165 | end do 166 | end function 167 | 168 | pure function fsum_kahan_1d_dp(a) result(sout) 169 | integer, parameter :: wp = dp 170 | integer, parameter :: chunk = chunk64 171 | real(wp), intent(in) :: a(:) 172 | real(wp) :: sout 173 | ! -- 174 | real(wp) :: sbatch(chunk) 175 | real(wp) :: cbatch(chunk) 176 | integer :: i, n, r 177 | ! ----------------------------- 178 | n = size(a) 179 | r = mod(n,chunk) 180 | 181 | sbatch(1:r) = a(1:r) 182 | sbatch(r+1:chunk) = 0.0_wp 183 | cbatch = 0.0_wp 184 | do i = r+1, n-r, chunk 185 | call vkahans( a(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk) ) 186 | end do 187 | 188 | sout = 0.0_wp 189 | do i = 1,chunk 190 | call vkahans( sbatch(i) , sout , cbatch(i) ) 191 | end do 192 | end function 193 | 194 | pure function fsum_kahan_1d_sp_mask(a,mask) result(sout) 195 | integer, parameter :: wp = sp 196 | integer, parameter :: chunk = chunk32 197 | real(wp), intent(in) :: a(:) 198 | logical, intent(in) :: mask(:) 199 | real(wp) :: sout 200 | ! -- 201 | real(wp) :: sbatch(chunk) 202 | real(wp) :: cbatch(chunk) 203 | integer :: i, n, r 204 | ! ----------------------------- 205 | n = size(a) 206 | r = mod(n,chunk) 207 | 208 | sbatch(1:r) = merge( 0.0_wp , a(1:r) , mask(1:r) ) 209 | sbatch(r+1:chunk) = 0.0_wp 210 | cbatch = 0.0_wp 211 | do i = r+1, n-r, chunk 212 | call vkahans_m( a(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk), mask(i:i+chunk-1) ) 213 | end do 214 | 215 | sout = 0.0_wp 216 | do i = 1,chunk 217 | call vkahans( sbatch(i) , sout , cbatch(i) ) 218 | end do 219 | end function 220 | 221 | pure function fsum_kahan_1d_dp_mask(a,mask) result(sout) 222 | integer, parameter :: wp = dp 223 | integer, parameter :: chunk = chunk64 224 | real(wp), intent(in) :: a(:) 225 | logical, intent(in) :: mask(:) 226 | real(wp) :: sout 227 | ! -- 228 | real(wp) :: sbatch(chunk) 229 | real(wp) :: cbatch(chunk) 230 | integer :: i, n, r 231 | ! ----------------------------- 232 | n = size(a) 233 | r = mod(n,chunk) 234 | 235 | sbatch(1:r) = merge( 0.0_wp , a(1:r) , mask(1:r) ) 236 | sbatch(r+1:chunk) = 0.0_wp 237 | cbatch = 0.0_wp 238 | do i = r+1, n-r, chunk 239 | call vkahans_m( a(i:i+chunk-1) , sbatch(1:chunk) , cbatch(1:chunk), mask(i:i+chunk-1) ) 240 | end do 241 | 242 | sout = 0.0_wp 243 | do i = 1,chunk 244 | call vkahans( sbatch(i) , sout , cbatch(i) ) 245 | end do 246 | end function 247 | 248 | elemental subroutine vkahans_sp(a,s,c) 249 | integer, parameter :: wp = sp 250 | include 'utilities/vkahans.inc' 251 | end subroutine 252 | 253 | elemental subroutine vkahans_dp(a,s,c) 254 | integer, parameter :: wp = dp 255 | include 'utilities/vkahans.inc' 256 | end subroutine 257 | 258 | elemental subroutine vkahans_m_sp(a,s,c,m) 259 | integer, parameter :: wp = sp 260 | include 'utilities/vkahans_m.inc' 261 | end subroutine 262 | 263 | elemental subroutine vkahans_m_dp(a,s,c,m) 264 | integer, parameter :: wp = dp 265 | include 'utilities/vkahans_m.inc' 266 | end subroutine 267 | 268 | end module fast_sum -------------------------------------------------------------------------------- /src/fast_tanh.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A. 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | module fast_tanh 7 | !! Source: https://fortran-lang.discourse.group/t/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/5385/31 8 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 9 | implicit none 10 | private 11 | 12 | public :: ftanh 13 | 14 | interface ftanh 15 | module procedure ftanh_sp 16 | module procedure ftanh_dp 17 | end interface 18 | 19 | contains 20 | 21 | elemental function ftanh_sp( x ) result( y ) 22 | integer, parameter :: wp = sp 23 | real(wp), intent(in) :: x 24 | real(wp) :: y 25 | !-- Internal Variables 26 | real(wp) :: x2, a, b 27 | !--------------------------------------------- 28 | x2 = x*x 29 | a = x * (135135.0_wp + x2 * (17325.0_wp + x2 * (378.0_wp + x2))) 30 | b = 135135.0_wp + x2 * (62370.0_wp + x2 * (3150.0_wp + x2 * 28.0_wp)) 31 | y = merge( a / b , sign(1.0_wp,x) , x2 <= 25._wp ) 32 | end function 33 | 34 | elemental function ftanh_dp( x ) result( y ) 35 | integer, parameter :: wp = dp 36 | real(wp), intent(in) :: x 37 | real(wp) :: y 38 | !-- Internal Variables 39 | real(wp) :: x2, a, b 40 | !--------------------------------------------- 41 | x2 = x*x 42 | a = x * (135135.0_wp + x2 * (17325.0_wp + x2 * (378.0_wp + x2))) 43 | b = 135135.0_wp + x2 * (62370.0_wp + x2 * (3150.0_wp + x2 * 28.0_wp)) 44 | y = merge( a / b , sign(1.0_wp,x) , x2 <= 25._wp ) 45 | end function 46 | 47 | end module -------------------------------------------------------------------------------- /src/fast_trigo.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! SPDX-FileCopyrightText: 2023 Transvalor S.A. 3 | ! 4 | ! SPDX-License-Identifier: MIT 5 | ! 6 | module fast_trigo 7 | !! Source for fast sine cosine: http://web.archive.org/web/20141220225551/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648 8 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 9 | implicit none 10 | private 11 | 12 | public :: fsin, fcos, ftan 13 | public :: facos, facos_nvidia, fatan 14 | 15 | interface fcos 16 | module procedure fcos_sp 17 | module procedure fcos_dp 18 | end interface 19 | 20 | interface fsin 21 | module procedure fsin_sp 22 | module procedure fsin_dp 23 | end interface 24 | 25 | interface ftan 26 | module procedure ftan_sp 27 | module procedure ftan_dp 28 | end interface 29 | 30 | interface facos 31 | module procedure facos_sp 32 | module procedure facos_dp 33 | end interface 34 | 35 | interface facos_nvidia 36 | !! Source : https://developer.download.nvidia.com/cg/acos.html 37 | module procedure facos_nvidia_sp 38 | module procedure facos_nvidia_dp 39 | end interface 40 | 41 | interface fatan 42 | !! Source : https://www.dsprelated.com/showarticle/1052.php 43 | module procedure fatan_sp 44 | module procedure fatan_dp 45 | end interface 46 | 47 | contains 48 | 49 | elemental function fcos_sp( x ) result( y ) 50 | integer, parameter :: wp = sp 51 | real(wp), intent(in) :: x 52 | real(wp) :: y 53 | !-- Internal Variables 54 | real(wp), parameter :: half_pi =acos(-1.0_wp)/2 55 | !--------------------------------------------- 56 | y = fsin_sp( half_pi - x ) 57 | end function 58 | 59 | elemental function fcos_dp( x ) result( y ) 60 | integer, parameter :: wp = dp 61 | real(wp), intent(in) :: x 62 | real(wp) :: y 63 | !-- Internal Variables 64 | real(wp), parameter :: half_pi =acos(-1.0_wp)/2 65 | !--------------------------------------------- 66 | y = fsin_dp( half_pi - x ) 67 | end function 68 | 69 | elemental function fsin_sp( x ) result( y ) 70 | integer, parameter :: wp = sp 71 | real(wp), intent(in) :: x 72 | real(wp) :: y 73 | !-- Internal Variables 74 | real(wp), parameter :: twopi = 2*acos(-1.0_wp) 75 | real(wp), parameter :: invtwopi = 1.0_wp/twopi 76 | real(wp), parameter :: c1=4.0_wp/acos(-1.0_wp) 77 | real(wp), parameter :: c2=-4.0_wp/acos(-1.0_wp)**2 78 | real(wp), parameter :: c3=0.225_wp 79 | real(wp) :: x0 80 | !--------------------------------------------- 81 | x0 = x - (int(x*invtwopi,kind=1) * twopi) 82 | y = c1*x0+c2*x0*abs(x0) 83 | y = c3*(y*abs(y)-y)+y 84 | end function 85 | 86 | elemental function fsin_dp( x ) result( y ) 87 | integer, parameter :: wp = dp 88 | real(wp), intent(in) :: x 89 | real(wp) :: y 90 | !-- Internal Variables 91 | real(wp), parameter :: twopi = 2*acos(-1.0_wp) 92 | real(wp), parameter :: invtwopi = 1.0_wp/twopi 93 | real(wp), parameter :: c1=4.0_wp/acos(-1.0_wp) 94 | real(wp), parameter :: c2=-4.0_wp/acos(-1.0_wp)**2 95 | real(wp), parameter :: c3=0.225_wp 96 | real(wp) :: x0 97 | !--------------------------------------------- 98 | x0 = x - (int(x*invtwopi,kind=1) * twopi) 99 | y = c1*x0+c2*x0*abs(x0) 100 | y = c3*(y*abs(y)-y)+y 101 | end function 102 | 103 | elemental function ftan_sp( x ) result( y ) 104 | integer, parameter :: wp = sp 105 | real(wp), intent(in) :: x 106 | real(wp) :: y 107 | !-- Internal Variables 108 | real(wp), parameter :: pi = acos(-1.0_wp) 109 | real(wp), parameter :: invpi = 1.0_wp/acos(-1.0_wp) 110 | real(wp) :: x0, xsq 111 | !------------------------------------------------- 112 | x0 = x - (int(x*invpi,kind=1) * pi) 113 | xsq = x0 * x0 114 | y = x0 * (2.471688400562703_wp - 0.189759681063053_wp * xsq) / & 115 | (2.4674011002723397_wp - xsq) 116 | end function 117 | 118 | elemental function ftan_dp( x ) result( y ) 119 | integer, parameter :: wp = dp 120 | real(wp), intent(in) :: x 121 | real(wp) :: y 122 | !-- Internal Variables 123 | real(wp), parameter :: pi = acos(-1.0_wp) 124 | real(wp), parameter :: invpi = 1.0_wp/acos(-1.0_wp) 125 | real(wp) :: x0, xsq 126 | !------------------------------------------------- 127 | x0 = x - (int(x*invpi,kind=1) * pi) 128 | xsq = x0 * x0 129 | y = x * (2.471688400562703_wp - 0.189759681063053_wp * xsq) / & 130 | (2.4674011002723397_wp - xsq) 131 | end function 132 | 133 | !==================================================== 134 | ! Inverse 135 | !==================================================== 136 | 137 | elemental function facos_sp( x ) result( y ) 138 | integer, parameter :: wp = sp 139 | real(wp), intent(in) :: x 140 | real(wp) :: y 141 | !--------------------------------------------- 142 | y = (-0.69813170079773212_wp * x * x - 0.87266462599716477_wp) * x + 1.5707963267948966_wp 143 | end function 144 | 145 | elemental function facos_dp( x ) result( y ) 146 | integer, parameter :: wp = dp 147 | real(wp), intent(in) :: x 148 | real(wp) :: y 149 | !--------------------------------------------- 150 | y = (-0.69813170079773212_wp * x * x - 0.87266462599716477_wp) * x + 1.5707963267948966_wp 151 | end function 152 | 153 | elemental function facos_nvidia_sp( x ) result( y ) 154 | integer, parameter :: wp = sp 155 | real(wp), intent(in) :: x 156 | real(wp) :: y 157 | !-- Internal Variables 158 | integer(1) :: negate 159 | real(wp) :: xp 160 | !--------------------------------------------- 161 | negate = merge( 1_1 , 0_1 , x < 0_wp ) 162 | xp = abs(x) 163 | y = -0.0187293_wp * xp + 0.0742610_wp 164 | y = y * xp - 0.2121144_wp 165 | y = y * xp + 1.5707288_wp 166 | y = y * sqrt(1_wp-xp) 167 | y = y + negate * (- 2.0_wp * y + 3.14159265358979_wp) 168 | end function 169 | 170 | elemental function facos_nvidia_dp( x ) result( y ) 171 | integer, parameter :: wp = dp 172 | real(wp), intent(in) :: x 173 | real(wp) :: y 174 | !-- Internal Variables 175 | integer(1) :: negate 176 | real(wp) :: xp 177 | !--------------------------------------------- 178 | negate = merge( 1_1 , 0_1 , x < 0_wp ) 179 | xp = abs(x) 180 | y = -0.0187293_wp * xp + 0.0742610_wp 181 | y = y * xp - 0.2121144_wp 182 | y = y * xp + 1.5707288_wp 183 | y = y * sqrt(1_wp-xp) 184 | y = y + negate * (- 2.0_wp * y + 3.14159265358979_wp) 185 | end function 186 | 187 | elemental function fatan_sp( x ) result( y ) 188 | integer, parameter :: wp = sp 189 | real(wp), intent(in) :: x 190 | real(wp) :: y 191 | !-- Internal Variables 192 | real(wp), parameter :: hpi = acos(-1.0_wp)/2._wp 193 | real(wp) :: inv_x 194 | !--------------------------------------------- 195 | if(abs(x)<1._wp)then 196 | y = base( x ) 197 | else 198 | inv_x = 1._wp / x 199 | y = sign(hpi,x) - base( inv_x ) 200 | end if 201 | contains 202 | real(wp) elemental function base( x ) result( y ) 203 | real(wp), intent(in) :: x 204 | real(wp), parameter :: n1 = 0.97239411_wp 205 | real(wp), parameter :: n2 = -0.19194795_wp 206 | y = (n1 + n2 * x * x) * x 207 | end function 208 | end function 209 | 210 | elemental function fatan_dp( x ) result( y ) 211 | integer, parameter :: wp = dp 212 | real(wp), intent(in) :: x 213 | real(wp) :: y 214 | !-- Internal Variables 215 | real(wp), parameter :: hpi = acos(-1.0_wp)/2._wp 216 | real(wp) :: inv_x 217 | !--------------------------------------------- 218 | if(abs(x)<1._wp)then 219 | y = base( x ) 220 | else 221 | inv_x = 1._wp / x 222 | y = sign(hpi,x) - base( inv_x ) 223 | end if 224 | contains 225 | real(wp) elemental function base( x ) result( y ) 226 | real(wp), intent(in) :: x 227 | real(wp), parameter :: n1 = 0.97239411_wp 228 | real(wp), parameter :: n2 = -0.19194795_wp 229 | y = (n1 + n2 * x * x) * x 230 | end function 231 | end function 232 | 233 | end module -------------------------------------------------------------------------------- /src/utilities/nvidia_shift.inc: -------------------------------------------------------------------------------- 1 | elemental integer(sp) function shiftr_sp( I , shift ) 2 | integer(sp), intent(in) :: I 3 | integer, intent(in) :: shift 4 | shiftr_sp = rshift( I, shift ) 5 | end function 6 | 7 | elemental integer(dp) function shiftr_dp( I , shift ) 8 | integer(dp), intent(in) :: I 9 | integer, intent(in) :: shift 10 | shiftr_dp = rshift( I, shift ) 11 | end function 12 | 13 | elemental integer(sp) function shiftl_sp( I , shift ) 14 | integer(sp), intent(in) :: I 15 | integer, intent(in) :: shift 16 | shiftl_sp = lshift( I, shift ) 17 | end function 18 | 19 | elemental integer(dp) function shiftl_dp( I , shift ) 20 | integer(dp), intent(in) :: I 21 | integer, intent(in) :: shift 22 | shiftl_dp = lshift( I, shift ) 23 | end function -------------------------------------------------------------------------------- /src/utilities/nvidia_shift_interface.inc: -------------------------------------------------------------------------------- 1 | interface shiftl 2 | module procedure shiftl_sp 3 | module procedure shiftl_dp 4 | end interface 5 | interface shiftr 6 | module procedure shiftr_sp 7 | module procedure shiftr_dp 8 | end interface -------------------------------------------------------------------------------- /src/utilities/vkahans.inc: -------------------------------------------------------------------------------- 1 | real(wp), intent(in) :: a 2 | real(wp), intent(inout) :: s 3 | real(wp), intent(inout) :: c 4 | ! -- internal variables 5 | real(wp) :: t, y 6 | y = a - c 7 | t = s + y 8 | c = (t - s) - y 9 | s = t -------------------------------------------------------------------------------- /src/utilities/vkahans_m.inc: -------------------------------------------------------------------------------- 1 | real(wp), intent(in) :: a 2 | real(wp), intent(inout) :: s 3 | real(wp), intent(inout) :: c 4 | logical, intent(in) :: m 5 | ! -- internal variables 6 | real(wp) :: t, y 7 | y = a - c 8 | t = s + y 9 | c = (t - s) - y 10 | s = merge( s , t , m ) -------------------------------------------------------------------------------- /test/test_fast_math.f90: -------------------------------------------------------------------------------- 1 | module test_fast_math 2 | use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64 3 | use testdrive, only: new_unittest, unittest_type, error_type, check 4 | use fast_math 5 | implicit none 6 | 7 | logical :: verbose = .true. ! change me to .true. if you want to see the results 8 | interface scramble 9 | module procedure scramble_sp 10 | module procedure scramble_dp 11 | end interface 12 | interface scramble_l 13 | module procedure scramble_spl 14 | module procedure scramble_dpl 15 | end interface 16 | 17 | character(len=*), parameter :: fmt1 = "('| ',a12,' |