├── .clang-format ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── extern └── amd-libm │ ├── LICENSE │ └── lib │ └── libalm.so ├── include ├── sf_benchmarks.hpp ├── sf_libraries.hpp └── sf_utils.hpp ├── misc ├── join_for_readme.sql └── sf_benchmarks.sql └── src ├── bessel.f ├── bind_af.cpp ├── bind_amdlibm.cpp ├── bind_baobzi.cpp ├── bind_boost.cpp ├── bind_eigen.cpp ├── bind_fort.cpp ├── bind_gsl.cpp ├── bind_misc.cpp ├── bind_sctl.cpp ├── bind_sleef.cpp ├── bind_stl.cpp ├── hank103.f ├── hank106.f ├── main.cpp └── utils.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | BasedOnStyle: LLVM 4 | TabWidth: 4 5 | ColumnLimit: 120 6 | IndentWidth: 4 7 | AlwaysBreakTemplateDeclarations: true 8 | ... 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extern/sleef"] 2 | path = extern/sleef 3 | url = https://github.com/shibatch/sleef 4 | [submodule "extern/baobzi"] 5 | path = extern/baobzi 6 | url = https://github.com/flatironinstitute/baobzi 7 | [submodule "extern/SCTL"] 8 | path = extern/SCTL 9 | url = https://github.com/dmalhotra/SCTL.git 10 | [submodule "extern/eigen"] 11 | path = extern/eigen 12 | url = https://gitlab.com/libeigen/eigen.git 13 | [submodule "extern/toml11"] 14 | path = extern/toml11 15 | url = https://github.com/ToruNiina/toml11 16 | [submodule "extern/vectorclass2"] 17 | path = extern/vectorclass2 18 | url = https://github.com/vectorclass/version2 19 | [submodule "extern/sqlite_orm"] 20 | path = extern/sqlite_orm 21 | url = https://github.com/fnc12/sqlite_orm 22 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | project( 3 | sf_benchmarks 4 | 
LANGUAGES C CXX Fortran 5 | ) 6 | include(ExternalProject) 7 | set(CMAKE_CXX_STANDARD 17) 8 | 9 | find_package(GSL REQUIRED) 10 | find_package(Boost) 11 | find_package(SQLite3) 12 | 13 | set (default_build_type "Release") 14 | if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 15 | message (STATUS "No build type specified. Setting build type to Release.") 16 | set (CMAKE_BUILD_TYPE "Release" CACHE STRING "Valid options: Debug, RelWithDebInfo, Release" FORCE) 17 | endif() 18 | 19 | set( 20 | SF_INCLUDES 21 | ${PROJECT_SOURCE_DIR}/include 22 | ) 23 | 24 | ExternalProject_Add( 25 | libsleef 26 | SOURCE_DIR ${PROJECT_SOURCE_DIR}/extern/sleef 27 | CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/contrib 28 | ) 29 | 30 | ExternalProject_Add(libbaobzi 31 | SOURCE_DIR ${PROJECT_SOURCE_DIR}/extern/baobzi 32 | CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/contrib -DBAOBZI_BUILD_TESTS=OFF 33 | -DBAOBZI_BUILD_SHARED=OFF -DBAOBZI_BUILD_EXAMPLES=OFF -DBAOBZI_BUILD_STATIC=ON 34 | ) 35 | 36 | include_directories( 37 | ${CMAKE_BINARY_DIR}/contrib/include 38 | ${PROJECT_SOURCE_DIR}/extern/baobzi/extern/msgpack-c/include 39 | ${PROJECT_SOURCE_DIR}/extern/SCTL/include 40 | ${PROJECT_SOURCE_DIR}/extern/eigen 41 | ${PROJECT_SOURCE_DIR}/extern/toml11 42 | ${PROJECT_SOURCE_DIR}/extern/vectorclass2 43 | ${PROJECT_SOURCE_DIR}/extern/sqlite_orm/include 44 | ) 45 | link_directories(${CMAKE_BINARY_DIR}/contrib/lib64 ${PROJECT_SOURCE_DIR}/extern/amd-libm/lib) 46 | 47 | file(GLOB SF_SOURCES "src/*.cpp" "src/*.f") 48 | add_executable(sf_benchmarks ${SF_SOURCES}) 49 | target_include_directories(sf_benchmarks PRIVATE ${SF_INCLUDES} ${GSL_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) 50 | target_link_libraries(sf_benchmarks sleef GSL::gsl baobzi dl SQLite::SQLite3) 51 | add_dependencies(sf_benchmarks libsleef libbaobzi) 52 | target_compile_options(sf_benchmarks PRIVATE -march=native -ftree-loop-vectorize -ffast-math -DSCTL_HAVE_LIBMVEC 53 | $<$<COMPILE_LANGUAGE:Fortran>:-fallow-argument-mismatch>) 54 | 55 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Robert Blackwell 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /extern/amd-libm/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2008-2021 Advanced Micro Devices, Inc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, 4 | are permitted provided that the following conditions are met: 5 | 1. Redistributions of source code must retain the above copyright notice, 6 | this list of conditions and the following disclaimer. 7 | 2. Redistributions in binary form must reproduce the above copyright notice, 8 | this list of conditions and the following disclaimer in the documentation 9 | and/or other materials provided with the distribution. 10 | 3. Neither the name of the copyright holder nor the names of its contributors 11 | may be used to endorse or promote products derived from this software without 12 | specific prior written permission. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 18 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 19 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 20 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 22 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 23 | POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /extern/amd-libm/lib/libalm.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatironinstitute/sf_benchmarks/02aea37608d5acef56330e4d1be972784e69b065/extern/amd-libm/lib/libalm.so -------------------------------------------------------------------------------- /include/sf_benchmarks.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SF_BENCHMARKS_HPP 2 | #define SF_BENCHMARKS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | // Attempt to force non-aliased pointers actually seems to slow things down... 
11 | //#define RESTRICT __restrict 12 | #define RESTRICT 13 | 14 | typedef std::complex cdouble; 15 | typedef sctl::Vec sctl_dx4; 16 | typedef sctl::Vec sctl_dx8; 17 | 18 | typedef sctl::Vec sctl_fx8; 19 | typedef sctl::Vec sctl_fx16; 20 | 21 | typedef std::function(cdouble)> fun_cdx1_x2; 22 | 23 | template 24 | using multi_eval_func = std::function; 25 | 26 | template 27 | std::function sctl_apply(const F &f) { 28 | static const auto fn = [f](const VAL_T *RESTRICT vals, VAL_T *RESTRICT res, size_t N) { 29 | using Vec = sctl::Vec; 30 | for (size_t i = 0; i < N; i += VecLen) { 31 | f(Vec::LoadAligned(vals + i)).StoreAligned(res + i); 32 | } 33 | }; 34 | return fn; 35 | } 36 | 37 | template 38 | std::function vec_func_apply(const F &f) { 39 | static const auto fn = [f](const VAL_T *RESTRICT vals, VAL_T *RESTRICT res, size_t N) { 40 | for (size_t i = 0; i < N; i += VEC_T::size()) { 41 | f(VEC_T().load_a(vals + i)).store_a(res + i); 42 | } 43 | }; 44 | return fn; 45 | } 46 | 47 | template 48 | std::function scalar_func_apply(const F &f) { 49 | static const auto fn = [f](const VAL_T *RESTRICT vals, VAL_T *RESTRICT res, size_t N) { 50 | for (size_t i = 0; i < N; i += 1) { 51 | res[i] = f(vals[i]); 52 | } 53 | }; 54 | return fn; 55 | } 56 | 57 | struct configuration_t { 58 | int id; 59 | std::string func; 60 | std::string ftype; 61 | double lbound = 0.0; 62 | double ubound = 1.0; 63 | double ilbound = 0.0; 64 | double iubound = 0.0; 65 | }; 66 | 67 | #undef RESTRICT 68 | #endif 69 | -------------------------------------------------------------------------------- /include/sf_libraries.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SF_LIBRARIES_HPP 2 | #define SF_LIBRARIES_HPP 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 
24 | 25 | extern "C" { 26 | void hank103_(double _Complex *, double _Complex *, double _Complex *, int *); 27 | void fort_bessel_jn_(int *, double *, double *); 28 | void fort_bessel_yn_(int *, double *, double *); 29 | } 30 | 31 | namespace sf::functions { 32 | namespace af { 33 | std::unordered_map> &get_funs_fx8(); 34 | std::unordered_map> &get_funs_dx4(); 35 | std::unordered_map> &get_funs_fx16(); 36 | std::unordered_map> &get_funs_dx8(); 37 | } // namespace af 38 | 39 | namespace amd { 40 | std::unordered_map> &get_funs_fx1(); 41 | std::unordered_map> &get_funs_fx8(); 42 | std::unordered_map> &get_funs_dx1(); 43 | std::unordered_map> &get_funs_dx4(); 44 | } // namespace amd 45 | 46 | namespace baobzi { 47 | std::unordered_map> & 48 | get_funs_dx1(std::set &keys_to_eval, std::unordered_map &configs); 49 | } 50 | 51 | namespace boost { 52 | std::unordered_map> &get_funs_fx1(); 53 | std::unordered_map> &get_funs_dx1(); 54 | } // namespace boost 55 | 56 | // https://eigen.tuxfamily.org/dox/group__CoeffwiseMathFunctions.html 57 | namespace eigen { 58 | enum OPS { 59 | cos, 60 | sin, 61 | tan, 62 | cosh, 63 | sinh, 64 | tanh, 65 | exp, 66 | log, 67 | log10, 68 | pow35, 69 | pow13, 70 | asin, 71 | acos, 72 | atan, 73 | asinh, 74 | acosh, 75 | atanh, 76 | erf, 77 | erfc, 78 | lgamma, 79 | digamma, 80 | ndtri, 81 | sqrt, 82 | rsqrt 83 | }; 84 | 85 | std::unordered_map &get_funs(); 86 | } // namespace eigen 87 | 88 | namespace fort { 89 | std::unordered_map> &get_funs_dx1(); 90 | } // namespace fort 91 | 92 | namespace gsl { 93 | std::unordered_map> &get_funs_dx1(); 94 | std::unordered_map> &get_funs_cdx1(); 95 | } // namespace gsl 96 | 97 | namespace misc { 98 | std::unordered_map &get_funs_cdx1_x2(); 99 | } // namespace misc 100 | 101 | namespace SCTL { 102 | std::unordered_map> &get_funs_fx8(); 103 | std::unordered_map> &get_funs_dx4(); 104 | std::unordered_map> &get_funs_fx16(); 105 | std::unordered_map> &get_funs_dx8(); 106 | } // namespace SCTL 107 | 108 | 
namespace sleef { 109 | std::unordered_map> &get_funs_fx1(); 110 | std::unordered_map> &get_funs_dx1(); 111 | std::unordered_map> &get_funs_fx8(); 112 | std::unordered_map> &get_funs_dx4(); 113 | std::unordered_map> &get_funs_fx16(); 114 | std::unordered_map> &get_funs_dx8(); 115 | } // namespace sleef 116 | 117 | namespace stl { 118 | std::unordered_map<::std::string, multi_eval_func> &get_funs_fx1(); 119 | std::unordered_map<::std::string, multi_eval_func> &get_funs_dx1(); 120 | } // namespace stl 121 | 122 | } // namespace sf::functions 123 | 124 | #endif 125 | -------------------------------------------------------------------------------- /include/sf_utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SF_UTILS_HPP 2 | #define SF_UTILS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace sf::utils { 11 | 12 | struct toolchain_info_t { 13 | int id; 14 | std::string compiler; 15 | std::string compilervers; 16 | std::string libcvers; 17 | 18 | toolchain_info_t(); 19 | }; 20 | 21 | struct host_info_t { 22 | int id; 23 | std::string cpuname; 24 | std::string cpuclock; 25 | std::string cpuclockmax; 26 | std::string memclock; 27 | std::string L1d; 28 | std::string L1i; 29 | std::string L2; 30 | std::string L3; 31 | 32 | host_info_t(); 33 | }; 34 | 35 | struct library_info_t { 36 | int id; 37 | std::string name; 38 | std::string version; 39 | }; 40 | 41 | struct timer { 42 | struct timespec ts; 43 | struct timespec tf; 44 | 45 | unsigned long long tscs; 46 | unsigned long long tscf; 47 | 48 | timer() { start(); } 49 | void start() { clock_gettime(CLOCK_MONOTONIC, &ts); tscs = __rdtsc(); } 50 | void stop() { clock_gettime(CLOCK_MONOTONIC, &tf); tscf = __rdtsc(); } 51 | double elapsed() { return (tf.tv_sec - ts.tv_sec) + (tf.tv_nsec - ts.tv_nsec) * 1E-9; } 52 | unsigned long long ticks_elapsed() { return tscf - tscs; } 53 | }; 54 | 55 | std::string exec(const char *cmd); 56 | std::string 
get_alm_version(); 57 | std::string get_sleef_version(); 58 | std::string get_af_version(); 59 | std::string get_boost_version(); 60 | std::string get_gsl_version(); 61 | std::string get_sctl_version(); 62 | std::string get_baobzi_version(); 63 | std::string get_eigen_version(); 64 | 65 | template 66 | Eigen::VectorX transform_domain(const Eigen::Ref> &vals, double lower, double upper) { 67 | VAL_T delta = upper - lower; 68 | return vals.array() * delta + lower; 69 | } 70 | 71 | } // namespace sf::utils 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /misc/join_for_readme.sql: -------------------------------------------------------------------------------- 1 | .mode html 2 | .headers on 3 | 4 | SELECT 5 | configurations.func, 6 | libraries.name, 7 | configurations.ftype, 8 | measurements.nelem, 9 | measurements.veclev, 10 | ROUND(configurations.lbound, 2), 11 | ROUND(configurations.ubound, 2), 12 | ROUND(measurements.megaevalspersec, 1), 13 | ROUND(measurements.cyclespereval, 1) 14 | FROM 15 | configurations 16 | JOIN measurements 17 | ON configurations.id=measurements.configuration 18 | JOIN libraries 19 | ON libraries.id=measurements.library 20 | WHERE 21 | (measurements.nelem=1024 OR measurements.nrepeat=1) AND 22 | measurements.run=(SELECT MIN(id) FROM runs) 23 | ORDER BY configurations.func, configurations.ftype, measurements.nelem, measurements.megaevalspersec DESC; 24 | -------------------------------------------------------------------------------- /misc/sf_benchmarks.sql: -------------------------------------------------------------------------------- 1 | create table hosts ( 2 | id integer primary key autoincrement, 3 | cpuname text not null unique, 4 | cpuclock text null, 5 | cpuclockmax text null, 6 | memclock text null, 7 | l1dcache text null, 8 | l1icache text null, 9 | l2cache text null, 10 | l3cache text null 11 | ); 12 | 13 | create table libraries ( 14 | id integer primary key autoincrement, 
15 | name text, 16 | version text, 17 | unique(name, version) 18 | ); 19 | 20 | create table toolchains ( 21 | id integer primary key autoincrement, 22 | compiler text, 23 | compilervers text, 24 | libcvers text, 25 | unique(compiler, compilervers, libcvers) 26 | ); 27 | 28 | create table configurations ( 29 | id integer primary key autoincrement, 30 | func text not null, 31 | ftype text not null, 32 | lbound real not null, 33 | ubound real not null, 34 | ilbound real null, 35 | iubound real null, 36 | /* Uniqueness covers only columns of this table; nelem, nrepeat and vectlev are columns of measurements. */ unique(func, ftype, lbound, ubound, ilbound, iubound) 37 | ); 38 | 39 | create table runs ( 40 | id integer primary key autoincrement, 41 | time timestamp not null default current_timestamp, 42 | host integer not null references hosts, 43 | toolchain integer not null references toolchains 44 | ); 45 | 46 | create table measurements ( 47 | id integer primary key autoincrement, 48 | run integer references runs, 49 | library integer not null references libraries, 50 | configuration integer not null references configurations, 51 | nelem integer not null, 52 | nrepeat integer not null, 53 | vectlev integer not null, 54 | evalspersec real not null, 55 | meanevaltime real not null, 56 | stddev real not null, 57 | istddev real not null, 58 | maxerr real, 59 | imaxerr 60 | ); 61 | -------------------------------------------------------------------------------- /src/bessel.f: -------------------------------------------------------------------------------- 1 | subroutine fort_bessel_jn(n, x, y) 2 | INTEGER*4 n 3 | REAL*8 x,y 4 | y = BESSEL_JN(n, x) 5 | end subroutine 6 | 7 | subroutine fort_bessel_yn(n, x, y) 8 | INTEGER*4 n 9 | REAL*8 x,y 10 | y = BESSEL_YN(n, x) 11 | end subroutine 12 | -------------------------------------------------------------------------------- /src/bind_af.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::af { 4 | std::unordered_map> funs_fx8 = { 5 |
{"sqrt", vec_func_apply([](Vec8f x) -> Vec8f { return sqrt(x); })}, 6 | {"sin", vec_func_apply([](Vec8f x) -> Vec8f { return sin(x); })}, 7 | {"cos", vec_func_apply([](Vec8f x) -> Vec8f { return cos(x); })}, 8 | {"tan", vec_func_apply([](Vec8f x) -> Vec8f { return tan(x); })}, 9 | {"sinh", vec_func_apply([](Vec8f x) -> Vec8f { return sinh(x); })}, 10 | {"cosh", vec_func_apply([](Vec8f x) -> Vec8f { return cosh(x); })}, 11 | {"tanh", vec_func_apply([](Vec8f x) -> Vec8f { return tanh(x); })}, 12 | {"asinh", vec_func_apply([](Vec8f x) -> Vec8f { return asinh(x); })}, 13 | {"acosh", vec_func_apply([](Vec8f x) -> Vec8f { return acosh(x); })}, 14 | {"atanh", vec_func_apply([](Vec8f x) -> Vec8f { return atanh(x); })}, 15 | {"asin", vec_func_apply([](Vec8f x) -> Vec8f { return asin(x); })}, 16 | {"acos", vec_func_apply([](Vec8f x) -> Vec8f { return acos(x); })}, 17 | {"atan", vec_func_apply([](Vec8f x) -> Vec8f { return atan(x); })}, 18 | {"exp", vec_func_apply([](Vec8f x) -> Vec8f { return exp(x); })}, 19 | {"exp2", vec_func_apply([](Vec8f x) -> Vec8f { return exp2(x); })}, 20 | {"exp10", vec_func_apply([](Vec8f x) -> Vec8f { return exp10(x); })}, 21 | {"log", vec_func_apply([](Vec8f x) -> Vec8f { return log(x); })}, 22 | {"log2", vec_func_apply([](Vec8f x) -> Vec8f { return log2(x); })}, 23 | {"log10", vec_func_apply([](Vec8f x) -> Vec8f { return log10(x); })}, 24 | {"pow3.5", vec_func_apply([](Vec8f x) -> Vec8f { return pow(x, 3.5); })}, 25 | {"pow13", vec_func_apply([](Vec8f x) -> Vec8f { return pow_const(x, 13); })}, 26 | }; 27 | 28 | std::unordered_map> funs_dx4 = { 29 | {"sqrt", vec_func_apply([](Vec4d x) -> Vec4d { return sqrt(x); })}, 30 | {"sin", vec_func_apply([](Vec4d x) -> Vec4d { return sin(x); })}, 31 | {"cos", vec_func_apply([](Vec4d x) -> Vec4d { return cos(x); })}, 32 | {"tan", vec_func_apply([](Vec4d x) -> Vec4d { return tan(x); })}, 33 | {"sinh", vec_func_apply([](Vec4d x) -> Vec4d { return sinh(x); })}, 34 | {"cosh", vec_func_apply([](Vec4d x) -> Vec4d 
{ return cosh(x); })}, 35 | {"tanh", vec_func_apply([](Vec4d x) -> Vec4d { return tanh(x); })}, 36 | {"asinh", vec_func_apply([](Vec4d x) -> Vec4d { return asinh(x); })}, 37 | {"acosh", vec_func_apply([](Vec4d x) -> Vec4d { return acosh(x); })}, 38 | {"atanh", vec_func_apply([](Vec4d x) -> Vec4d { return atanh(x); })}, 39 | {"asin", vec_func_apply([](Vec4d x) -> Vec4d { return asin(x); })}, 40 | {"acos", vec_func_apply([](Vec4d x) -> Vec4d { return acos(x); })}, 41 | {"atan", vec_func_apply([](Vec4d x) -> Vec4d { return atan(x); })}, 42 | {"exp", vec_func_apply([](Vec4d x) -> Vec4d { return exp(x); })}, 43 | {"exp2", vec_func_apply([](Vec4d x) -> Vec4d { return exp2(x); })}, 44 | {"exp10", vec_func_apply([](Vec4d x) -> Vec4d { return exp10(x); })}, 45 | {"log", vec_func_apply([](Vec4d x) -> Vec4d { return log(x); })}, 46 | {"log2", vec_func_apply([](Vec4d x) -> Vec4d { return log2(x); })}, 47 | {"log10", vec_func_apply([](Vec4d x) -> Vec4d { return log10(x); })}, 48 | {"pow3.5", vec_func_apply([](Vec4d x) -> Vec4d { return pow(x, 3.5); })}, 49 | {"pow13", vec_func_apply([](Vec4d x) -> Vec4d { return pow_const(x, 13); })}, 50 | }; 51 | 52 | #ifdef __AVX512F__ 53 | std::unordered_map> funs_fx16 = { 54 | {"memcpy", vec_func_apply([](Vec16f x) -> Vec16f { return x; })}, 55 | {"memset", vec_func_apply([](Vec16f x) -> Vec16f { return Vec16f{0.0}; })}, 56 | {"sqrt", vec_func_apply([](Vec16f x) -> Vec16f { return sqrt(x); })}, 57 | {"sin", vec_func_apply([](Vec16f x) -> Vec16f { return sin(x); })}, 58 | {"cos", vec_func_apply([](Vec16f x) -> Vec16f { return cos(x); })}, 59 | {"tan", vec_func_apply([](Vec16f x) -> Vec16f { return tan(x); })}, 60 | {"sinh", vec_func_apply([](Vec16f x) -> Vec16f { return sinh(x); })}, 61 | {"cosh", vec_func_apply([](Vec16f x) -> Vec16f { return cosh(x); })}, 62 | {"tanh", vec_func_apply([](Vec16f x) -> Vec16f { return tanh(x); })}, 63 | {"asinh", vec_func_apply([](Vec16f x) -> Vec16f { return asinh(x); })}, 64 | {"acosh", 
vec_func_apply([](Vec16f x) -> Vec16f { return acosh(x); })}, 65 | {"atanh", vec_func_apply([](Vec16f x) -> Vec16f { return atanh(x); })}, 66 | {"asin", vec_func_apply([](Vec16f x) -> Vec16f { return asin(x); })}, 67 | {"acos", vec_func_apply([](Vec16f x) -> Vec16f { return acos(x); })}, 68 | {"atan", vec_func_apply([](Vec16f x) -> Vec16f { return atan(x); })}, 69 | {"exp", vec_func_apply([](Vec16f x) -> Vec16f { return exp(x); })}, 70 | {"exp2", vec_func_apply([](Vec16f x) -> Vec16f { return exp2(x); })}, 71 | {"exp10", vec_func_apply([](Vec16f x) -> Vec16f { return exp10(x); })}, 72 | {"log", vec_func_apply([](Vec16f x) -> Vec16f { return log(x); })}, 73 | {"log2", vec_func_apply([](Vec16f x) -> Vec16f { return log2(x); })}, 74 | {"log10", vec_func_apply([](Vec16f x) -> Vec16f { return log10(x); })}, 75 | {"pow3.5", vec_func_apply([](Vec16f x) -> Vec16f { return pow(x, 3.5); })}, 76 | {"pow13", vec_func_apply([](Vec16f x) -> Vec16f { return pow_const(x, 13); })}, 77 | }; 78 | 79 | std::unordered_map> funs_dx8 = { 80 | {"memset", vec_func_apply([](Vec8d x) -> Vec8d { return Vec8d{0.0}; })}, 81 | {"memcpy", vec_func_apply([](Vec8d x) -> Vec8d { return x; })}, 82 | {"sqrt", vec_func_apply([](Vec8d x) -> Vec8d { return sqrt(x); })}, 83 | {"sin", vec_func_apply([](Vec8d x) -> Vec8d { return sin(x); })}, 84 | {"cos", vec_func_apply([](Vec8d x) -> Vec8d { return cos(x); })}, 85 | {"tan", vec_func_apply([](Vec8d x) -> Vec8d { return tan(x); })}, 86 | {"sinh", vec_func_apply([](Vec8d x) -> Vec8d { return sinh(x); })}, 87 | {"cosh", vec_func_apply([](Vec8d x) -> Vec8d { return cosh(x); })}, 88 | {"tanh", vec_func_apply([](Vec8d x) -> Vec8d { return tanh(x); })}, 89 | {"asinh", vec_func_apply([](Vec8d x) -> Vec8d { return asinh(x); })}, 90 | {"acosh", vec_func_apply([](Vec8d x) -> Vec8d { return acosh(x); })}, 91 | {"atanh", vec_func_apply([](Vec8d x) -> Vec8d { return atanh(x); })}, 92 | {"asin", vec_func_apply([](Vec8d x) -> Vec8d { return asin(x); })}, 93 | {"acos", 
vec_func_apply([](Vec8d x) -> Vec8d { return acos(x); })}, 94 | {"atan", vec_func_apply([](Vec8d x) -> Vec8d { return atan(x); })}, 95 | {"exp", vec_func_apply([](Vec8d x) -> Vec8d { return exp(x); })}, 96 | {"exp2", vec_func_apply([](Vec8d x) -> Vec8d { return exp2(x); })}, 97 | {"exp10", vec_func_apply([](Vec8d x) -> Vec8d { return exp10(x); })}, 98 | {"log", vec_func_apply([](Vec8d x) -> Vec8d { return log(x); })}, 99 | {"log2", vec_func_apply([](Vec8d x) -> Vec8d { return log2(x); })}, 100 | {"log10", vec_func_apply([](Vec8d x) -> Vec8d { return log10(x); })}, 101 | {"pow3.5", vec_func_apply([](Vec8d x) -> Vec8d { return pow(x, 3.5); })}, 102 | {"pow13", vec_func_apply([](Vec8d x) -> Vec8d { return pow_const(x, 13); })}, 103 | }; 104 | #else 105 | std::unordered_map> funs_fx16; 106 | std::unordered_map> funs_dx8; 107 | #endif 108 | 109 | std::unordered_map> &get_funs_fx8() { return funs_fx8; } 110 | std::unordered_map> &get_funs_dx4() { return funs_dx4; } 111 | std::unordered_map> &get_funs_fx16() { return funs_fx16; } 112 | std::unordered_map> &get_funs_dx8() { return funs_dx8; } 113 | 114 | } // namespace sf::functions::af 115 | -------------------------------------------------------------------------------- /src/bind_amdlibm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace sf::functions::amd { 5 | std::unordered_map> funs_fx1; 6 | std::unordered_map> funs_fx8; 7 | std::unordered_map> funs_dx1; 8 | std::unordered_map> funs_dx4; 9 | using C_FUN1F = float (*)(float); 10 | using C_FUN2F = float (*)(float, float); 11 | using C_FUN1D = double (*)(double); 12 | using C_FUN2D = double (*)(double, double); 13 | using C_FX8_FUN1F = Vec8f (*)(Vec8f); 14 | using C_FX8_FUN2F = Vec8f (*)(Vec8f, Vec8f); 15 | using C_DX4_FUN1D = Vec4d (*)(Vec4d); 16 | using C_DX4_FUN2D = Vec4d (*)(Vec4d, Vec4d); 17 | 18 | void *handle = NULL; 19 | 20 | C_FUN1F amd_sinf, amd_cosf, amd_tanf, amd_sinhf, amd_coshf, 
amd_tanhf, amd_asinf, amd_acosf, amd_atanf, amd_asinhf, 21 | amd_acoshf, amd_atanhf, amd_logf, amd_log2f, amd_log10f, amd_expf, amd_exp2f, amd_exp10f, amd_sqrtf; 22 | C_FUN2F amd_powf; 23 | 24 | C_FUN1D amd_sin, amd_cos, amd_tan, amd_sinh, amd_cosh, amd_tanh, amd_asin, amd_acos, amd_atan, amd_asinh, amd_acosh, 25 | amd_atanh, amd_log, amd_log2, amd_log10, amd_exp, amd_exp2, amd_exp10, amd_sqrt; 26 | C_FUN2D amd_pow; 27 | 28 | C_FX8_FUN1F amd_vrs8_sinf, amd_vrs8_cosf, amd_vrs8_tanf, amd_vrs8_logf, amd_vrs8_log2f, amd_vrs8_expf, amd_vrs8_exp2f; 29 | C_FX8_FUN2F amd_vrs8_powf; 30 | 31 | C_DX4_FUN1D amd_vrd4_sin, amd_vrd4_cos, amd_vrd4_tan, amd_vrd4_log, amd_vrd4_log2, amd_vrd4_exp, amd_vrd4_exp2; 32 | C_DX4_FUN2D amd_vrd4_pow; 33 |
/// Look up `name` in the opened library (namespace-scope `handle`) and store
/// the result in `fn`, cast to the pointer type of `fn`. `fn` receives
/// whatever dlsym returns, including a null pointer for an unknown symbol.
template <typename Fn>
static void load_symbol(Fn &fn, const char *name) {
    fn = reinterpret_cast<Fn>(dlsym(handle, name));
}

/// Open "libalm.so", resolve the amd_* entry points declared above, and fill
/// funs_fx1 / funs_dx1 / funs_dx4 / funs_fx8 with wrappers around them.
///
/// The work is done at most once: after a successful dlopen the
/// namespace-scope `handle` is non-null and later calls return immediately.
/// If the library cannot be opened the maps are left untouched and the next
/// call tries again.
void load_functions() {
    if (handle)
        return;

    // Assign the namespace-scope handle. Declaring a local `void *handle`
    // here would shadow it, leave the global null, and defeat the guard above.
    handle = dlopen("libalm.so", RTLD_NOW);
    if (!handle) {
        // Do not resolve symbols or register wrappers without a library:
        // the stored function pointers would be null and crash when called.
        return;
    }

    // Scalar single-precision entry points.
    load_symbol(amd_sinf, "amd_sinf");
    load_symbol(amd_cosf, "amd_cosf");
    load_symbol(amd_tanf, "amd_tanf");
    load_symbol(amd_sinhf, "amd_sinhf");
    load_symbol(amd_coshf, "amd_coshf");
    load_symbol(amd_tanhf, "amd_tanhf");
    load_symbol(amd_asinf, "amd_asinf");
    load_symbol(amd_acosf, "amd_acosf");
    load_symbol(amd_atanf, "amd_atanf");
    load_symbol(amd_asinhf, "amd_asinhf");
    load_symbol(amd_acoshf, "amd_acoshf");
    load_symbol(amd_atanhf, "amd_atanhf");
    load_symbol(amd_logf, "amd_logf");
    load_symbol(amd_log2f, "amd_log2f");
    load_symbol(amd_log10f, "amd_log10f");
    load_symbol(amd_expf, "amd_expf");
    load_symbol(amd_exp2f, "amd_exp2f");
    load_symbol(amd_exp10f, "amd_exp10f");
    load_symbol(amd_sqrtf, "amd_sqrtf");
    load_symbol(amd_powf, "amd_powf");

    // Scalar double-precision entry points.
    load_symbol(amd_sin, "amd_sin");
    load_symbol(amd_cos, "amd_cos");
    load_symbol(amd_tan, "amd_tan");
    load_symbol(amd_sinh, "amd_sinh");
    load_symbol(amd_cosh, "amd_cosh");
    load_symbol(amd_tanh, "amd_tanh");
    load_symbol(amd_asin, "amd_asin");
    load_symbol(amd_acos, "amd_acos");
    load_symbol(amd_atan, "amd_atan");
    load_symbol(amd_asinh, "amd_asinh");
    load_symbol(amd_acosh, "amd_acosh");
    load_symbol(amd_atanh, "amd_atanh");
    load_symbol(amd_log, "amd_log");
    load_symbol(amd_log2, "amd_log2");
    load_symbol(amd_log10, "amd_log10");
    load_symbol(amd_exp, "amd_exp");
    load_symbol(amd_exp2, "amd_exp2");
    load_symbol(amd_exp10, "amd_exp10");
    load_symbol(amd_sqrt, "amd_sqrt");
    load_symbol(amd_pow, "amd_pow");

    // Vec8f entry points.
    load_symbol(amd_vrs8_sinf, "amd_vrs8_sinf");
    load_symbol(amd_vrs8_cosf, "amd_vrs8_cosf");
    load_symbol(amd_vrs8_tanf, "amd_vrs8_tanf");
    load_symbol(amd_vrs8_logf, "amd_vrs8_logf");
    load_symbol(amd_vrs8_log2f, "amd_vrs8_log2f");
    load_symbol(amd_vrs8_expf, "amd_vrs8_expf");
    load_symbol(amd_vrs8_exp2f, "amd_vrs8_exp2f");
    load_symbol(amd_vrs8_powf, "amd_vrs8_powf");

    // Vec4d entry points.
    load_symbol(amd_vrd4_sin, "amd_vrd4_sin");
    load_symbol(amd_vrd4_cos, "amd_vrd4_cos");
    load_symbol(amd_vrd4_tan, "amd_vrd4_tan");
    load_symbol(amd_vrd4_log, "amd_vrd4_log");
    load_symbol(amd_vrd4_log2, "amd_vrd4_log2");
    load_symbol(amd_vrd4_exp, "amd_vrd4_exp");
    load_symbol(amd_vrd4_exp2, "amd_vrd4_exp2");
    load_symbol(amd_vrd4_pow, "amd_vrd4_pow");

    // The lambdas below read the function pointers at call time, so they are
    // registered only after the pointers above have been assigned.
    funs_fx1 = {
        {"sin", scalar_func_apply([](float x) -> float { return amd_sinf(x); })},
        {"cos", scalar_func_apply([](float x) -> float { return amd_cosf(x); })},
        {"tan", scalar_func_apply([](float x) -> float { return amd_tanf(x); })},
        {"sinh", scalar_func_apply([](float x) -> float { return amd_sinhf(x); })},
        {"cosh", scalar_func_apply([](float x) -> float { return amd_coshf(x); })},
        {"tanh", scalar_func_apply([](float x) -> float { return amd_tanhf(x); })},
        {"asin", scalar_func_apply([](float x) -> float { return amd_asinf(x); })},
        {"acos", scalar_func_apply([](float x) -> float { return amd_acosf(x); })},
        {"atan", scalar_func_apply([](float x) -> float { return amd_atanf(x); })},
        {"asinh", scalar_func_apply([](float x) -> float { return amd_asinhf(x); })},
        {"acosh", scalar_func_apply([](float x) -> float { return amd_acoshf(x); })},
        {"atanh", scalar_func_apply([](float x) -> float { return amd_atanhf(x); })},
        {"log", scalar_func_apply([](float x) -> float { return amd_logf(x); })},
        {"log2", scalar_func_apply([](float x) -> float { return amd_log2f(x); })},
        {"log10", scalar_func_apply([](float x) -> float { return amd_log10f(x); })},
        {"exp", scalar_func_apply([](float x) -> float { return amd_expf(x); })},
        {"exp2", scalar_func_apply([](float x) -> float { return amd_exp2f(x); })},
        {"exp10", scalar_func_apply([](float x) -> float { return amd_exp10f(x); })},
        {"sqrt", scalar_func_apply([](float x) -> float { return amd_sqrtf(x); })},
        {"pow3.5", scalar_func_apply([](float x) -> float { return amd_powf(x, 3.5f); })},
        {"pow13", scalar_func_apply([](float x) -> float { return amd_powf(x, 13.0f); })},
    };

    funs_dx1 = {
        {"sin", scalar_func_apply([](double x) -> double { return amd_sin(x); })},
        {"cos", scalar_func_apply([](double x) -> double { return amd_cos(x); })},
        {"tan", scalar_func_apply([](double x) -> double { return amd_tan(x); })},
        {"sinh", scalar_func_apply([](double x) -> double { return amd_sinh(x); })},
        {"cosh", scalar_func_apply([](double x) -> double { return amd_cosh(x); })},
        {"tanh", scalar_func_apply([](double x) -> double { return amd_tanh(x); })},
        {"asin", scalar_func_apply([](double x) -> double { return amd_asin(x); })},
        {"acos", scalar_func_apply([](double x) -> double { return amd_acos(x); })},
        {"atan", scalar_func_apply([](double x) -> double { return amd_atan(x); })},
        {"asinh", scalar_func_apply([](double x) -> double { return amd_asinh(x); })},
        {"acosh", scalar_func_apply([](double x) -> double { return amd_acosh(x); })},
        {"atanh", scalar_func_apply([](double x) -> double { return amd_atanh(x); })},
        {"log", scalar_func_apply([](double x) -> double { return amd_log(x); })},
        {"log2", scalar_func_apply([](double x) -> double { return amd_log2(x); })},
        {"log10", scalar_func_apply([](double x) -> double { return amd_log10(x); })},
        {"exp", scalar_func_apply([](double x) -> double { return amd_exp(x); })},
        {"exp2", scalar_func_apply([](double x) -> double { return amd_exp2(x); })},
        {"exp10", scalar_func_apply([](double x) -> double { return amd_exp10(x); })},
        {"sqrt", scalar_func_apply([](double x) -> double { return amd_sqrt(x); })},
        {"pow3.5", scalar_func_apply([](double x) -> double { return amd_pow(x, 3.5); })},
        {"pow13", scalar_func_apply([](double x) -> double { return amd_pow(x, 13); })},
    };

    funs_dx4 = {
        {"sin", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_sin(x); })},
        {"cos", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_cos(x); })},
        {"tan", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_tan(x); })},
        {"log", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_log(x); })},
        {"log2", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_log2(x); })},
        {"exp", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_exp(x); })},
        {"exp2", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_exp2(x); })},
        {"pow3.5", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_pow(x, Vec4d{3.5}); })},
        {"pow13", vec_func_apply([](Vec4d x) -> Vec4d { return amd_vrd4_pow(x, Vec4d{13}); })},
    };

    funs_fx8 = {
        {"sin", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_sinf(x); })},
        {"cos", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_cosf(x); })},
        {"tan", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_tanf(x); })},
        {"log", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_logf(x); })},
        {"log2", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_log2f(x); })},
        {"exp", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_expf(x); })},
        {"exp2", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_exp2f(x); })},
        {"pow3.5", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_powf(x, Vec8f{3.5}); })},
        {"pow13", vec_func_apply([](Vec8f x) -> Vec8f { return amd_vrs8_powf(x, Vec8f{13}); })},
    };
}
171 | 172 | std::unordered_map> &get_funs_fx1() { 173 | load_functions(); 174 | return funs_fx1; 175 | } 176 | std::unordered_map> &get_funs_fx8() { 177 | load_functions(); 178 | return funs_fx8; 179 | } 180 | std::unordered_map> &get_funs_dx1() { 181 | load_functions(); 182 | return funs_dx1; 183 | } 184 | std::unordered_map> &get_funs_dx4() { 185 | load_functions(); 186 | return funs_dx4; 187 | } 188 | } // namespace sf::functions::amd 189 | -------------------------------------------------------------------------------- /src/bind_baobzi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace sf::functions::baobzi { 5 | using ::baobzi::Baobzi; 6 | 7 | double baobzi_fun_wrapper(const double *x, const void 
*data) { 8 | auto *myfun = (std::function *)data; 9 | return (*myfun)(*x); 10 | } 11 | 12 | std::shared_ptr create_baobzi_func(void *infun, const std::pair &domain) { 13 | baobzi_input_t input = {.func = baobzi_fun_wrapper, 14 | .data = infun, 15 | .dim = 1, 16 | .order = 8, 17 | .tol = 1E-10, 18 | .minimum_leaf_fraction = 0.6, 19 | .split_multi_eval = 0}; 20 | double hl = 0.5 * (domain.second - domain.first); 21 | double center = domain.first + hl; 22 | 23 | return std::shared_ptr(new Baobzi(&input, &center, &hl)); 24 | } 25 | 26 | std::unordered_map> baobzi_funs; 27 | std::unordered_map> potential_baobzi_funs{ 28 | {"bessel_Y0", [](double x) -> double { return gsl_sf_bessel_Y0(x); }}, 29 | {"bessel_Y1", [](double x) -> double { return gsl_sf_bessel_Y1(x); }}, 30 | {"bessel_Y2", [](double x) -> double { return gsl_sf_bessel_Yn(2, x); }}, 31 | {"bessel_I0", [](double x) -> double { return gsl_sf_bessel_I0(x); }}, 32 | {"bessel_I1", [](double x) -> double { return gsl_sf_bessel_I1(x); }}, 33 | {"bessel_I2", [](double x) -> double { return gsl_sf_bessel_In(2, x); }}, 34 | {"bessel_J0", [](double x) -> double { return gsl_sf_bessel_J0(x); }}, 35 | {"bessel_J1", [](double x) -> double { return gsl_sf_bessel_J1(x); }}, 36 | {"bessel_J2", [](double x) -> double { return gsl_sf_bessel_Jn(2, x); }}, 37 | {"hermite_0", [](double x) -> double { return gsl_sf_hermite(0, x); }}, 38 | {"hermite_1", [](double x) -> double { return gsl_sf_hermite(1, x); }}, 39 | {"hermite_2", [](double x) -> double { return gsl_sf_hermite(2, x); }}, 40 | {"hermite_3", [](double x) -> double { return gsl_sf_hermite(3, x); }}, 41 | }; 42 | 43 | std::unordered_map> & 44 | get_funs_dx1(std::set &keys_to_eval, std::unordered_map &params) { 45 | for (auto &key : keys_to_eval) { 46 | if (potential_baobzi_funs.count(key) && !baobzi_funs.count(key)) { 47 | std::cerr << "Creating baobzi function '" + key + "'.\n"; 48 | auto &param = params[key]; 49 | std::pair domain = std::make_pair(param.lbound, param.ubound); 50 | 
baobzi_funs[key] = create_baobzi_func((void *)(&potential_baobzi_funs.at(key)), domain); 51 | } 52 | } 53 | 54 | return baobzi_funs; 55 | } 56 | 57 | } // namespace sf::functions::baobzi 58 | -------------------------------------------------------------------------------- /src/bind_boost.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::boost { 4 | std::unordered_map> funs_fx1 = { 5 | {"sin_pi", scalar_func_apply([](float x) -> float { return ::boost::math::sin_pi(x); })}, 6 | {"cos_pi", scalar_func_apply([](float x) -> float { return ::boost::math::cos_pi(x); })}, 7 | {"tgamma", scalar_func_apply([](float x) -> float { return ::boost::math::tgamma(x); })}, 8 | {"lgamma", scalar_func_apply([](float x) -> float { return ::boost::math::lgamma(x); })}, 9 | {"digamma", scalar_func_apply([](float x) -> float { return ::boost::math::digamma(x); })}, 10 | {"pow13", scalar_func_apply([](float x) -> float { return ::boost::math::pow<13>(x); })}, 11 | {"erf", scalar_func_apply([](float x) -> float { return ::boost::math::erf(x); })}, 12 | {"erfc", scalar_func_apply([](float x) -> float { return ::boost::math::erfc(x); })}, 13 | {"sinc_pi", scalar_func_apply([](float x) -> float { return ::boost::math::sinc_pi(x); })}, 14 | {"bessel_Y0", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_neumann(0, x); })}, 15 | {"bessel_Y1", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_neumann(1, x); })}, 16 | {"bessel_Y2", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_neumann(2, x); })}, 17 | {"bessel_I0", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_i(0, x); })}, 18 | {"bessel_I1", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_i(1, x); })}, 19 | {"bessel_I2", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_i(2, x); })}, 20 | {"bessel_J0", scalar_func_apply([](float x) 
-> float { return ::boost::math::cyl_bessel_j(0, x); })}, 21 | {"bessel_J1", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_j(1, x); })}, 22 | {"bessel_J2", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_j(2, x); })}, 23 | {"bessel_K0", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_k(0, x); })}, 24 | {"bessel_K1", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_k(1, x); })}, 25 | {"bessel_K2", scalar_func_apply([](float x) -> float { return ::boost::math::cyl_bessel_k(2, x); })}, 26 | {"bessel_j0", scalar_func_apply([](float x) -> float { return ::boost::math::sph_bessel(0, x); })}, 27 | {"bessel_j1", scalar_func_apply([](float x) -> float { return ::boost::math::sph_bessel(1, x); })}, 28 | {"bessel_j2", scalar_func_apply([](float x) -> float { return ::boost::math::sph_bessel(2, x); })}, 29 | {"bessel_y0", scalar_func_apply([](float x) -> float { return ::boost::math::sph_neumann(0, x); })}, 30 | {"bessel_y1", scalar_func_apply([](float x) -> float { return ::boost::math::sph_neumann(1, x); })}, 31 | {"bessel_y2", scalar_func_apply([](float x) -> float { return ::boost::math::sph_neumann(2, x); })}, 32 | {"hermite_0", scalar_func_apply([](float x) -> float { return ::boost::math::hermite(0, x); })}, 33 | {"hermite_1", scalar_func_apply([](float x) -> float { return ::boost::math::hermite(1, x); })}, 34 | {"hermite_2", scalar_func_apply([](float x) -> float { return ::boost::math::hermite(2, x); })}, 35 | {"hermite_3", scalar_func_apply([](float x) -> float { return ::boost::math::hermite(3, x); })}, 36 | {"riemann_zeta", scalar_func_apply([](float x) -> float { return ::boost::math::zeta(x); })}, 37 | }; 38 | 39 | std::unordered_map> funs_dx1 = { 40 | {"sin_pi", scalar_func_apply([](double x) -> double { return ::boost::math::sin_pi(x); })}, 41 | {"cos_pi", scalar_func_apply([](double x) -> double { return ::boost::math::cos_pi(x); })}, 42 | {"tgamma", 
scalar_func_apply([](double x) -> double { return ::boost::math::tgamma(x); })}, 43 | {"lgamma", scalar_func_apply([](double x) -> double { return ::boost::math::lgamma(x); })}, 44 | {"digamma", scalar_func_apply([](double x) -> double { return ::boost::math::digamma(x); })}, 45 | {"pow13", scalar_func_apply([](double x) -> double { return ::boost::math::pow<13>(x); })}, 46 | {"erf", scalar_func_apply([](double x) -> double { return ::boost::math::erf(x); })}, 47 | {"erfc", scalar_func_apply([](double x) -> double { return ::boost::math::erfc(x); })}, 48 | {"sinc_pi", scalar_func_apply([](double x) -> double { return ::boost::math::sinc_pi(x); })}, 49 | {"bessel_Y0", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_neumann(0, x); })}, 50 | {"bessel_Y1", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_neumann(1, x); })}, 51 | {"bessel_Y2", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_neumann(2, x); })}, 52 | {"bessel_I0", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_i(0, x); })}, 53 | {"bessel_I1", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_i(1, x); })}, 54 | {"bessel_I2", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_i(2, x); })}, 55 | {"bessel_J0", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_j(0, x); })}, 56 | {"bessel_J1", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_j(1, x); })}, 57 | {"bessel_J2", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_j(2, x); })}, 58 | {"bessel_K0", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_k(0, x); })}, 59 | {"bessel_K1", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_k(1, x); })}, 60 | {"bessel_K2", scalar_func_apply([](double x) -> double { return ::boost::math::cyl_bessel_k(2, x); })}, 61 | {"bessel_j0", 
scalar_func_apply([](double x) -> double { return ::boost::math::sph_bessel(0, x); })}, 62 | {"bessel_j1", scalar_func_apply([](double x) -> double { return ::boost::math::sph_bessel(1, x); })}, 63 | {"bessel_j2", scalar_func_apply([](double x) -> double { return ::boost::math::sph_bessel(2, x); })}, 64 | {"bessel_y0", scalar_func_apply([](double x) -> double { return ::boost::math::sph_neumann(0, x); })}, 65 | {"bessel_y1", scalar_func_apply([](double x) -> double { return ::boost::math::sph_neumann(1, x); })}, 66 | {"bessel_y2", scalar_func_apply([](double x) -> double { return ::boost::math::sph_neumann(2, x); })}, 67 | {"hermite_0", scalar_func_apply([](double x) -> double { return ::boost::math::hermite(0, x); })}, 68 | {"hermite_1", scalar_func_apply([](double x) -> double { return ::boost::math::hermite(1, x); })}, 69 | {"hermite_2", scalar_func_apply([](double x) -> double { return ::boost::math::hermite(2, x); })}, 70 | {"hermite_3", scalar_func_apply([](double x) -> double { return ::boost::math::hermite(3, x); })}, 71 | {"riemann_zeta", scalar_func_apply([](double x) -> double { return ::boost::math::zeta(x); })}, 72 | }; 73 | 74 | std::unordered_map> &get_funs_fx1() { return funs_fx1; } 75 | std::unordered_map> &get_funs_dx1() { return funs_dx1; } 76 | } // namespace sf::functions::boost 77 | -------------------------------------------------------------------------------- /src/bind_eigen.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::eigen { 4 | std::unordered_map funs = { 5 | {"sin", OPS::sin}, {"cos", OPS::cos}, {"tan", OPS::tan}, {"sinh", OPS::sinh}, 6 | {"cosh", OPS::cosh}, {"tanh", OPS::tanh}, {"exp", OPS::exp}, {"log", OPS::log}, 7 | {"log10", OPS::log10}, {"pow3.5", OPS::pow35}, {"pow13", OPS::pow13}, {"asin", OPS::asin}, 8 | {"acos", OPS::acos}, {"atan", OPS::atan}, {"asinh", OPS::asinh}, {"atanh", OPS::atanh}, 9 | {"acosh", OPS::acosh}, {"erf", OPS::erf}, 
{"erfc", OPS::erfc}, {"lgamma", OPS::lgamma}, 10 | {"digamma", OPS::digamma}, {"ndtri", OPS::ndtri}, {"sqrt", OPS::sqrt}, {"rsqrt", OPS::rsqrt}, 11 | }; 12 | 13 | std::unordered_map &get_funs() { return funs; } 14 | } // namespace sf::functions::eigen 15 | -------------------------------------------------------------------------------- /src/bind_fort.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::fort { 4 | std::unordered_map> funs_dx1 = { 5 | {"bessel_Y0", scalar_func_apply([](double x) -> double { 6 | int n = 0; 7 | double y; 8 | fort_bessel_yn_(&n, &x, &y); 9 | return y; 10 | })}, 11 | {"bessel_J0", scalar_func_apply([](double x) -> double { 12 | int n = 0; 13 | double y; 14 | fort_bessel_jn_(&n, &x, &y); 15 | return y; 16 | })}, 17 | }; 18 | 19 | std::unordered_map> &get_funs_dx1() { return funs_dx1; } 20 | } // namespace sf::functions::fort 21 | -------------------------------------------------------------------------------- /src/bind_gsl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::gsl { 4 | std::unordered_map> funs_dx1; 5 | std::unordered_map> funs_cdx1; 6 | bool initialized = false; 7 | 8 | inline cdouble gsl_complex_wrapper(cdouble z, int (*f)(double, double, gsl_sf_result *, gsl_sf_result *)) { 9 | gsl_sf_result re, im; 10 | f(z.real(), z.imag(), &re, &im); 11 | return cdouble{re.val, im.val}; 12 | } 13 | 14 | void load_functions() { 15 | if (initialized) 16 | return; 17 | initialized = true; 18 | 19 | funs_dx1 = { 20 | {"sin_pi", scalar_func_apply([](double x) -> double { return gsl_sf_sin_pi(x); })}, 21 | {"cos_pi", scalar_func_apply([](double x) -> double { return gsl_sf_cos_pi(x); })}, 22 | {"sin", scalar_func_apply([](double x) -> double { return gsl_sf_sin(x); })}, 23 | {"cos", scalar_func_apply([](double x) -> double { return gsl_sf_cos(x); })}, 24 | {"sinc", 
scalar_func_apply([](double x) -> double { return gsl_sf_sinc(x / M_PI); })}, 25 | {"sinc_pi", scalar_func_apply([](double x) -> double { return gsl_sf_sinc(x); })}, 26 | {"erf", scalar_func_apply([](double x) -> double { return gsl_sf_erf(x); })}, 27 | {"erfc", scalar_func_apply([](double x) -> double { return gsl_sf_erfc(x); })}, 28 | {"tgamma", scalar_func_apply([](double x) -> double { return gsl_sf_gamma(x); })}, 29 | {"lgamma", scalar_func_apply([](double x) -> double { return gsl_sf_lngamma(x); })}, 30 | {"log", scalar_func_apply([](double x) -> double { return gsl_sf_log(x); })}, 31 | {"exp", scalar_func_apply([](double x) -> double { return gsl_sf_exp(x); })}, 32 | {"pow13", scalar_func_apply([](double x) -> double { return gsl_sf_pow_int(x, 13); })}, 33 | {"bessel_Y0", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_Y0(x); })}, 34 | {"bessel_Y1", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_Y1(x); })}, 35 | {"bessel_Y2", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_Yn(2, x); })}, 36 | {"bessel_I0", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_I0(x); })}, 37 | {"bessel_I1", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_I1(x); })}, 38 | {"bessel_I2", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_In(2, x); })}, 39 | {"bessel_J0", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_J0(x); })}, 40 | {"bessel_J1", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_J1(x); })}, 41 | {"bessel_J2", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_Jn(2, x); })}, 42 | {"bessel_K0", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_K0(x); })}, 43 | {"bessel_K1", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_K1(x); })}, 44 | {"bessel_K2", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_Kn(2, x); })}, 45 | {"bessel_j0", scalar_func_apply([](double x) -> double { return 
gsl_sf_bessel_j0(x); })}, 46 | {"bessel_j1", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_j1(x); })}, 47 | {"bessel_j2", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_j2(x); })}, 48 | {"bessel_y0", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_y0(x); })}, 49 | {"bessel_y1", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_y1(x); })}, 50 | {"bessel_y2", scalar_func_apply([](double x) -> double { return gsl_sf_bessel_y2(x); })}, 51 | {"hermite_0", scalar_func_apply([](double x) -> double { return gsl_sf_hermite(0, x); })}, 52 | {"hermite_1", scalar_func_apply([](double x) -> double { return gsl_sf_hermite(1, x); })}, 53 | {"hermite_2", scalar_func_apply([](double x) -> double { return gsl_sf_hermite(2, x); })}, 54 | {"hermite_3", scalar_func_apply([](double x) -> double { return gsl_sf_hermite(3, x); })}, 55 | {"riemann_zeta", scalar_func_apply([](double x) -> double { return gsl_sf_zeta(x); })}, 56 | }; 57 | 58 | // FIXME: check accuracy of this and this+test_func 59 | funs_cdx1 = { 60 | {"sin", 61 | scalar_func_apply([](cdouble z) -> cdouble { return gsl_complex_wrapper(z, gsl_sf_complex_sin_e); })}, 62 | {"cos", 63 | scalar_func_apply([](cdouble z) -> cdouble { return gsl_complex_wrapper(z, gsl_sf_complex_cos_e); })}, 64 | {"log", 65 | scalar_func_apply([](cdouble z) -> cdouble { return gsl_complex_wrapper(z, gsl_sf_complex_log_e); })}, 66 | {"dilog", scalar_func_apply( 67 | [](cdouble z) -> cdouble { return gsl_complex_wrapper(z, gsl_sf_complex_dilog_e); })}, 68 | {"lgamma", scalar_func_apply( 69 | [](cdouble z) -> cdouble { return gsl_complex_wrapper(z, gsl_sf_lngamma_complex_e); })}, 70 | }; 71 | } 72 | 73 | std::unordered_map> &get_funs_dx1() { 74 | load_functions(); 75 | return funs_dx1; 76 | } 77 | 78 | std::unordered_map> &get_funs_cdx1() { 79 | load_functions(); 80 | return funs_cdx1; 81 | } 82 | 83 | } // namespace sf::functions::gsl 84 | 
-------------------------------------------------------------------------------- /src/bind_misc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::misc { 4 | std::unordered_map funs_cdx1_x2 = { 5 | {"hank103", [](cdouble z) -> std::pair { 6 | cdouble h0, h1; 7 | int ifexpon = 1; 8 | hank103_((double _Complex *)&z, (double _Complex *)&h0, (double _Complex *)&h1, &ifexpon); 9 | return {h0, h1}; 10 | }}}; 11 | 12 | std::unordered_map &get_funs_cdx1_x2() { return funs_cdx1_x2; } 13 | } // namespace sf::functions::misc 14 | -------------------------------------------------------------------------------- /src/bind_sctl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::SCTL { 4 | 5 | std::unordered_map> funs_fx8 = { 6 | {"memcpy", sctl_apply([](const sctl_fx8 &x) { return x; })}, 7 | {"memset", sctl_apply([](const sctl_fx8 &x) -> sctl_fx8 { return (sctl_fx8::VData)Vec8f{0.0}; })}, 8 | {"exp", sctl_apply([](const sctl_fx8 &x) { return exp(x); })}, 9 | {"log", sctl_apply([](const sctl_fx8 &x) { return log(x); })}, 10 | {"sin", sctl_apply([](const sctl_fx8 &x) { 11 | sctl_fx8 sinx, cosx; 12 | sincos(sinx, cosx, x); 13 | return sinx; 14 | })}, 15 | {"cos", sctl_apply([](const sctl_fx8 &x) { 16 | sctl_fx8 sinx, cosx; 17 | sincos(sinx, cosx, x); 18 | return cosx; 19 | })}, 20 | {"rsqrt", sctl_apply([](const sctl_fx8 &x) { return sctl::approx_rsqrt<7>(x); })}, 21 | }; 22 | 23 | std::unordered_map> funs_dx4 = { 24 | {"memcpy", sctl_apply([](const sctl_dx4 &x) { return x; })}, 25 | {"memset", sctl_apply([](const sctl_dx4 &x) -> sctl_dx4 { return (sctl_dx4::VData)Vec4d{0.0}; })}, 26 | {"exp", sctl_apply([](const sctl_dx4 &x) { return exp(x); })}, 27 | {"log", sctl_apply([](const sctl_dx4 &x) { return log(x); })}, 28 | {"sin", sctl_apply([](const sctl_dx4 &x) { 29 | sctl_dx4 sinx, cosx; 30 | sincos(sinx, cosx, x); 31 | 
return sinx; 32 | })}, 33 | {"cos", sctl_apply([](const sctl_dx4 &x) { 34 | sctl_dx4 sinx, cosx; 35 | sincos(sinx, cosx, x); 36 | return cosx; 37 | })}, 38 | {"rsqrt", sctl_apply([](const sctl_dx4 &x) { return sctl::approx_rsqrt<16>(x); })}, 39 | }; 40 | 41 | #ifdef __AVX512F__ 42 | std::unordered_map> funs_fx16 = { 43 | {"memcpy", sctl_apply([](const sctl_fx16 &x) { return x; })}, 44 | {"memset", sctl_apply([](const sctl_fx16 &x) -> sctl_fx16 { return (sctl_fx16::VData)Vec16f{0.0}; })}, 45 | {"exp", sctl_apply([](const sctl_fx16 &x) { return exp(x); })}, 46 | {"log", sctl_apply([](const sctl_fx16 &x) { return log(x); })}, 47 | {"sin", sctl_apply([](const sctl_fx16 &x) { 48 | sctl_fx16 sinx, cosx; 49 | sincos(sinx, cosx, x); 50 | return sinx; 51 | })}, 52 | {"cos", sctl_apply([](const sctl_fx16 &x) { 53 | sctl_fx16 sinx, cosx; 54 | sincos(sinx, cosx, x); 55 | return cosx; 56 | })}, 57 | {"rsqrt", sctl_apply([](const sctl_fx16 &x) { return sctl::approx_rsqrt<7>(x); })}, 58 | }; 59 | 60 | std::unordered_map> funs_dx8 = { 61 | {"memcpy", sctl_apply([](const sctl_dx8 &x) { return x; })}, 62 | {"memset", sctl_apply([](const sctl_dx8 &x) -> sctl_dx8 { return (sctl_dx8::VData)Vec8d{0.0}; })}, 63 | {"exp", sctl_apply([](const sctl_dx8 &x) { return exp(x); })}, 64 | {"log", sctl_apply([](const sctl_dx8 &x) { return log(x); })}, 65 | {"sin", sctl_apply([](const sctl_dx8 &x) { 66 | sctl_dx8 sinx, cosx; 67 | sincos(sinx, cosx, x); 68 | return sinx; 69 | })}, 70 | {"cos", sctl_apply([](const sctl_dx8 &x) { 71 | sctl_dx8 sinx, cosx; 72 | sincos(sinx, cosx, x); 73 | return cosx; 74 | })}, 75 | {"rsqrt", sctl_apply([](const sctl_dx8 &x) { return sctl::approx_rsqrt<16>(x); })}, 76 | }; 77 | #else 78 | std::unordered_map> funs_fx16; 79 | std::unordered_map> funs_dx8; 80 | #endif 81 | 82 | std::unordered_map> &get_funs_fx8() { return funs_fx8; } 83 | std::unordered_map> &get_funs_dx4() { return funs_dx4; } 84 | std::unordered_map> &get_funs_fx16() { return funs_fx16; } 85 | 
std::unordered_map> &get_funs_dx8() { return funs_dx8; } 86 | 87 | } // namespace sf::functions::SCTL 88 | -------------------------------------------------------------------------------- /src/bind_sleef.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::sleef { 4 | 5 | std::unordered_map> funs_fx1 = { 6 | {"sin_pi", scalar_func_apply([](float x) -> float { return Sleef_sinpif1_u05purecfma(x); })}, 7 | {"cos_pi", scalar_func_apply([](float x) -> float { return Sleef_cospif1_u05purecfma(x); })}, 8 | {"sin", scalar_func_apply([](float x) -> float { return Sleef_sinf1_u10purecfma(x); })}, 9 | {"cos", scalar_func_apply([](float x) -> float { return Sleef_cosf1_u10purecfma(x); })}, 10 | {"tan", scalar_func_apply([](float x) -> float { return Sleef_tanf1_u10purecfma(x); })}, 11 | {"sinh", scalar_func_apply([](float x) -> float { return Sleef_sinhf1_u10purecfma(x); })}, 12 | {"cosh", scalar_func_apply([](float x) -> float { return Sleef_coshf1_u10purecfma(x); })}, 13 | {"tanh", scalar_func_apply([](float x) -> float { return Sleef_tanhf1_u10purecfma(x); })}, 14 | {"asin", scalar_func_apply([](float x) -> float { return Sleef_asinf1_u10purecfma(x); })}, 15 | {"acos", scalar_func_apply([](float x) -> float { return Sleef_acosf1_u10purecfma(x); })}, 16 | {"atan", scalar_func_apply([](float x) -> float { return Sleef_atanf1_u10purecfma(x); })}, 17 | {"asinh", scalar_func_apply([](float x) -> float { return Sleef_asinhf1_u10purecfma(x); })}, 18 | {"acosh", scalar_func_apply([](float x) -> float { return Sleef_acoshf1_u10purecfma(x); })}, 19 | {"atanh", scalar_func_apply([](float x) -> float { return Sleef_atanhf1_u10purecfma(x); })}, 20 | {"log", scalar_func_apply([](float x) -> float { return Sleef_logf1_u10purecfma(x); })}, 21 | {"log2", scalar_func_apply([](float x) -> float { return Sleef_log2f1_u10purecfma(x); })}, 22 | {"log10", scalar_func_apply([](float x) -> float { return 
Sleef_log10f1_u10purecfma(x); })}, 23 | {"exp", scalar_func_apply([](float x) -> float { return Sleef_expf1_u10purecfma(x); })}, 24 | {"exp2", scalar_func_apply([](float x) -> float { return Sleef_exp2f1_u10purecfma(x); })}, 25 | {"exp10", scalar_func_apply([](float x) -> float { return Sleef_exp10f1_u10purecfma(x); })}, 26 | {"erf", scalar_func_apply([](float x) -> float { return Sleef_erff1_u10purecfma(x); })}, 27 | {"erfc", scalar_func_apply([](float x) -> float { return Sleef_erfcf1_u15purecfma(x); })}, 28 | {"lgamma", scalar_func_apply([](float x) -> float { return Sleef_lgammaf1_u10purecfma(x); })}, 29 | {"tgamma", scalar_func_apply([](float x) -> float { return Sleef_tgammaf1_u10purecfma(x); })}, 30 | {"sqrt", scalar_func_apply([](float x) -> float { return Sleef_sqrtf1_u05purecfma(x); })}, 31 | {"pow3.5", scalar_func_apply([](float x) -> float { return Sleef_powf1_u10purecfma(x, 3.5); })}, 32 | {"pow13", scalar_func_apply([](float x) -> float { return Sleef_powf1_u10purecfma(x, 13); })}, 33 | }; 34 | 35 | std::unordered_map> funs_dx1 = { 36 | {"sin_pi", scalar_func_apply([](double x) -> double { return Sleef_sinpid1_u05purecfma(x); })}, 37 | {"cos_pi", scalar_func_apply([](double x) -> double { return Sleef_cospid1_u05purecfma(x); })}, 38 | {"sin", scalar_func_apply([](double x) -> double { return Sleef_sind1_u10purecfma(x); })}, 39 | {"cos", scalar_func_apply([](double x) -> double { return Sleef_cosd1_u10purecfma(x); })}, 40 | {"tan", scalar_func_apply([](double x) -> double { return Sleef_tand1_u10purecfma(x); })}, 41 | {"sinh", scalar_func_apply([](double x) -> double { return Sleef_sinhd1_u10purecfma(x); })}, 42 | {"cosh", scalar_func_apply([](double x) -> double { return Sleef_coshd1_u10purecfma(x); })}, 43 | {"tanh", scalar_func_apply([](double x) -> double { return Sleef_tanhd1_u10purecfma(x); })}, 44 | {"asin", scalar_func_apply([](double x) -> double { return Sleef_asind1_u10purecfma(x); })}, 45 | {"acos", scalar_func_apply([](double x) -> double 
{ return Sleef_acosd1_u10purecfma(x); })}, 46 | {"atan", scalar_func_apply([](double x) -> double { return Sleef_atand1_u10purecfma(x); })}, 47 | {"asinh", scalar_func_apply([](double x) -> double { return Sleef_asinhd1_u10purecfma(x); })}, 48 | {"acosh", scalar_func_apply([](double x) -> double { return Sleef_acoshd1_u10purecfma(x); })}, 49 | {"atanh", scalar_func_apply([](double x) -> double { return Sleef_atanhd1_u10purecfma(x); })}, 50 | {"log", scalar_func_apply([](double x) -> double { return Sleef_logd1_u10purecfma(x); })}, 51 | {"log2", scalar_func_apply([](double x) -> double { return Sleef_log2d1_u10purecfma(x); })}, 52 | {"log10", scalar_func_apply([](double x) -> double { return Sleef_log10d1_u10purecfma(x); })}, 53 | {"exp", scalar_func_apply([](double x) -> double { return Sleef_expd1_u10purecfma(x); })}, 54 | {"exp2", scalar_func_apply([](double x) -> double { return Sleef_exp2d1_u10purecfma(x); })}, 55 | {"exp10", scalar_func_apply([](double x) -> double { return Sleef_exp10d1_u10purecfma(x); })}, 56 | {"erf", scalar_func_apply([](double x) -> double { return Sleef_erfd1_u10purecfma(x); })}, 57 | {"erfc", scalar_func_apply([](double x) -> double { return Sleef_erfcd1_u15purecfma(x); })}, 58 | {"lgamma", scalar_func_apply([](double x) -> double { return Sleef_lgammad1_u10purecfma(x); })}, 59 | {"tgamma", scalar_func_apply([](double x) -> double { return Sleef_tgammad1_u10purecfma(x); })}, 60 | {"sqrt", scalar_func_apply([](double x) -> double { return Sleef_sqrtd1_u05purecfma(x); })}, 61 | {"pow3.5", scalar_func_apply([](double x) -> double { return Sleef_powd1_u10purecfma(x, 3.5); })}, 62 | {"pow13", scalar_func_apply([](double x) -> double { return Sleef_powd1_u10purecfma(x, 13); })}, 63 | }; 64 | 65 | std::unordered_map> funs_fx8 = { 66 | {"sin_pi", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_sinpif8_u05avx2(x); })}, 67 | {"cos_pi", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_cospif8_u05avx2(x); })}, 68 | {"sin", 
vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_sinf8_u10avx2(x); })}, 69 | {"cos", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_cosf8_u10avx2(x); })}, 70 | {"tan", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_tanf8_u10avx2(x); })}, 71 | {"sinh", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_sinhf8_u10avx2(x); })}, 72 | {"cosh", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_coshf8_u10avx2(x); })}, 73 | {"tanh", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_tanhf8_u10avx2(x); })}, 74 | {"asin", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_asinf8_u10avx2(x); })}, 75 | {"acos", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_acosf8_u10avx2(x); })}, 76 | {"atan", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_atanf8_u10avx2(x); })}, 77 | {"asinh", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_asinhf8_u10avx2(x); })}, 78 | {"acosh", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_acoshf8_u10avx2(x); })}, 79 | {"atanh", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_atanhf8_u10avx2(x); })}, 80 | {"log", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_logf8_u10avx2(x); })}, 81 | {"log2", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_log2f8_u10avx2(x); })}, 82 | {"log10", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_log10f8_u10avx2(x); })}, 83 | {"exp", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_expf8_u10avx2(x); })}, 84 | {"exp2", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_exp2f8_u10avx2(x); })}, 85 | {"exp10", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_exp10f8_u10avx2(x); })}, 86 | {"erf", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_erff8_u10avx2(x); })}, 87 | {"erfc", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_erfcf8_u15avx2(x); })}, 88 | {"lgamma", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_lgammaf8_u10avx2(x); })}, 89 | {"tgamma", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_tgammaf8_u10avx2(x); })}, /* "tgamma": same key as the scalar funs_fx1/funs_dx1 maps */ 90 | {"sqrt", vec_func_apply([](Vec8f x) -> 
Vec8f { return Sleef_sqrtf8_u05avx2(x); })}, 91 | {"pow3.5", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_powf8_u10avx2(x, Vec8f{3.5}); })}, 92 | {"pow13", vec_func_apply([](Vec8f x) -> Vec8f { return Sleef_powf8_u10avx2(x, Vec8f{13}); })}, 93 | }; 94 | 95 | std::unordered_map> funs_dx4 = { 96 | {"sin_pi", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_sinpid4_u05avx2(x); })}, 97 | {"cos_pi", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_cospid4_u05avx2(x); })}, 98 | {"sin", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_sind4_u10avx2(x); })}, 99 | {"cos", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_cosd4_u10avx2(x); })}, 100 | {"tan", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_tand4_u10avx2(x); })}, 101 | {"sinh", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_sinhd4_u10avx2(x); })}, 102 | {"cosh", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_coshd4_u10avx2(x); })}, 103 | {"tanh", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_tanhd4_u10avx2(x); })}, 104 | {"asin", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_asind4_u10avx2(x); })}, 105 | {"acos", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_acosd4_u10avx2(x); })}, 106 | {"atan", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_atand4_u10avx2(x); })}, 107 | {"asinh", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_asinhd4_u10avx2(x); })}, 108 | {"acosh", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_acoshd4_u10avx2(x); })}, 109 | {"atanh", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_atanhd4_u10avx2(x); })}, 110 | {"log", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_logd4_u10avx2(x); })}, 111 | {"log2", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_log2d4_u10avx2(x); })}, 112 | {"log10", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_log10d4_u10avx2(x); })}, 113 | {"exp", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_expd4_u10avx2(x); })}, 114 | {"exp2", vec_func_apply([](Vec4d x) -> Vec4d { return 
Sleef_exp2d4_u10avx2(x); })}, 115 | {"exp10", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_exp10d4_u10avx2(x); })}, 116 | {"erf", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_erfd4_u10avx2(x); })}, 117 | {"erfc", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_erfcd4_u15avx2(x); })}, 118 | {"lgamma", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_lgammad4_u10avx2(x); })}, 119 | {"tgamma", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_tgammad4_u10avx2(x); })}, /* "tgamma": same key as the scalar funs_fx1/funs_dx1 maps */ 120 | {"sqrt", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_sqrtd4_u05avx2(x); })}, 121 | {"pow3.5", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_powd4_u10avx2(x, Vec4d{3.5}); })}, 122 | {"pow13", vec_func_apply([](Vec4d x) -> Vec4d { return Sleef_powd4_u10avx2(x, Vec4d{13}); })}, 123 | }; 124 | 125 | #ifdef __AVX512F__ 126 | std::unordered_map> funs_fx16 = { 127 | {"sin_pi", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_sinpif16_u05avx512f(x); })}, 128 | {"cos_pi", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_cospif16_u05avx512f(x); })}, 129 | {"sin", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_sinf16_u10avx512f(x); })}, 130 | {"cos", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_cosf16_u10avx512f(x); })}, 131 | {"tan", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_tanf16_u10avx512f(x); })}, 132 | {"sinh", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_sinhf16_u10avx512f(x); })}, 133 | {"cosh", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_coshf16_u10avx512f(x); })}, 134 | {"tanh", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_tanhf16_u10avx512f(x); })}, 135 | {"asin", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_asinf16_u10avx512f(x); })}, 136 | {"acos", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_acosf16_u10avx512f(x); })}, 137 | {"atan", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_atanf16_u10avx512f(x); })}, 138 | {"asinh", vec_func_apply([](Vec16f x) -> Vec16f { return 
Sleef_asinhf16_u10avx512f(x); })}, 139 | {"acosh", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_acoshf16_u10avx512f(x); })}, 140 | {"atanh", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_atanhf16_u10avx512f(x); })}, 141 | {"log", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_logf16_u10avx512f(x); })}, 142 | {"log2", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_log2f16_u10avx512f(x); })}, 143 | {"log10", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_log10f16_u10avx512f(x); })}, 144 | {"exp", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_expf16_u10avx512f(x); })}, 145 | {"exp2", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_exp2f16_u10avx512f(x); })}, 146 | {"exp10", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_exp10f16_u10avx512f(x); })}, 147 | {"erf", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_erff16_u10avx512f(x); })}, 148 | {"erfc", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_erfcf16_u15avx512f(x); })}, 149 | {"lgamma", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_lgammaf16_u10avx512f(x); })}, 150 | {"tgamma", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_tgammaf16_u10avx512f(x); })}, /* "tgamma": same key as the scalar funs_fx1/funs_dx1 maps */ 151 | {"sqrt", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_sqrtf16_u05avx512f(x); })}, 152 | {"pow3.5", 153 | vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_powf16_u10avx512f(x, Vec16f{3.5}); })}, 154 | {"pow13", vec_func_apply([](Vec16f x) -> Vec16f { return Sleef_powf16_u10avx512f(x, Vec16f{13}); })}, 155 | }; 156 | 157 | std::unordered_map> funs_dx8 = { 158 | {"sin_pi", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_sinpid8_u05avx512f(x); })}, 159 | {"cos_pi", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_cospid8_u05avx512f(x); })}, 160 | {"sin", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_sind8_u10avx512f(x); })}, 161 | {"cos", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_cosd8_u10avx512f(x); })}, 162 | {"tan", vec_func_apply([](Vec8d x) -> Vec8d { 
return Sleef_tand8_u10avx512f(x); })}, 163 | {"sinh", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_sinhd8_u10avx512f(x); })}, 164 | {"cosh", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_coshd8_u10avx512f(x); })}, 165 | {"tanh", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_tanhd8_u10avx512f(x); })}, 166 | {"asin", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_asind8_u10avx512f(x); })}, 167 | {"acos", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_acosd8_u10avx512f(x); })}, 168 | {"atan", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_atand8_u10avx512f(x); })}, 169 | {"asinh", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_asinhd8_u10avx512f(x); })}, 170 | {"acosh", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_acoshd8_u10avx512f(x); })}, 171 | {"atanh", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_atanhd8_u10avx512f(x); })}, 172 | {"log", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_logd8_u10avx512f(x); })}, 173 | {"log2", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_log2d8_u10avx512f(x); })}, 174 | {"log10", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_log10d8_u10avx512f(x); })}, 175 | {"exp", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_expd8_u10avx512f(x); })}, 176 | {"exp2", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_exp2d8_u10avx512f(x); })}, 177 | {"exp10", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_exp10d8_u10avx512f(x); })}, 178 | {"erf", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_erfd8_u10avx512f(x); })}, 179 | {"erfc", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_erfcd8_u15avx512f(x); })}, 180 | {"lgamma", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_lgammad8_u10avx512f(x); })}, 181 | {"tgamma", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_tgammad8_u10avx512f(x); })}, /* "tgamma": same key as the scalar funs_fx1/funs_dx1 maps */ 182 | {"sqrt", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_sqrtd8_u05avx512f(x); })}, 183 | {"pow3.5", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_powd8_u10avx512f(x, 
Vec8d{3.5}); })}, 184 | {"pow13", vec_func_apply([](Vec8d x) -> Vec8d { return Sleef_powd8_u10avx512f(x, Vec8d{13}); })}, 185 | }; 186 | #else 187 | std::unordered_map> funs_fx16; 188 | std::unordered_map> funs_dx8; 189 | #endif 190 | 191 | std::unordered_map> &get_funs_fx1() { return funs_fx1; } 192 | std::unordered_map> &get_funs_dx1() { return funs_dx1; } 193 | std::unordered_map> &get_funs_fx8() { return funs_fx8; } 194 | std::unordered_map> &get_funs_dx4() { return funs_dx4; } 195 | std::unordered_map> &get_funs_fx16() { return funs_fx16; } 196 | std::unordered_map> &get_funs_dx8() { return funs_dx8; } 197 | 198 | } // namespace sf::functions::sleef 199 | -------------------------------------------------------------------------------- /src/bind_stl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace sf::functions::stl { 4 | std::unordered_map> funs_fx1 = { 5 | {"memcpy", [](const float *src, float *dst, size_t N) { std::memcpy(dst, src, N * sizeof(float)); }}, 6 | {"memset", [](const float *src, float *dst, size_t N) { std::memset(dst, 0, N * sizeof(float)); }}, 7 | {"tgamma", scalar_func_apply([](float x) -> float { return std::tgamma(x); })}, 8 | {"lgamma", scalar_func_apply([](float x) -> float { return std::lgamma(x); })}, 9 | {"sin", scalar_func_apply([](float x) -> float { return std::sin(x); })}, 10 | {"cos", scalar_func_apply([](float x) -> float { return std::cos(x); })}, 11 | {"tan", scalar_func_apply([](float x) -> float { return std::tan(x); })}, 12 | {"asin", scalar_func_apply([](float x) -> float { return std::asin(x); })}, 13 | {"acos", scalar_func_apply([](float x) -> float { return std::acos(x); })}, 14 | {"atan", scalar_func_apply([](float x) -> float { return std::atan(x); })}, 15 | {"asin", scalar_func_apply([](float x) -> float { return std::asin(x); })}, 16 | {"acos", scalar_func_apply([](float x) -> float { return std::acos(x); })}, 17 | {"atan", scalar_func_apply([](float 
x) -> float { return std::atan(x); })}, 18 | {"sinh", scalar_func_apply([](float x) -> float { return std::sinh(x); })}, 19 | {"cosh", scalar_func_apply([](float x) -> float { return std::cosh(x); })}, 20 | {"tanh", scalar_func_apply([](float x) -> float { return std::tanh(x); })}, 21 | {"asinh", scalar_func_apply([](float x) -> float { return std::asinh(x); })}, 22 | {"acosh", scalar_func_apply([](float x) -> float { return std::acosh(x); })}, 23 | {"atanh", scalar_func_apply([](float x) -> float { return std::atanh(x); })}, /* entries below use float-typed operands so the float overloads are the ones evaluated (a double operand would promote the whole call to double) */ 24 | {"sin_pi", scalar_func_apply([](float x) -> float { return std::sin(static_cast<float>(M_PI) * x); })}, 25 | {"cos_pi", scalar_func_apply([](float x) -> float { return std::cos(static_cast<float>(M_PI) * x); })}, 26 | {"erf", scalar_func_apply([](float x) -> float { return std::erf(x); })}, 27 | {"erfc", scalar_func_apply([](float x) -> float { return std::erfc(x); })}, 28 | {"log", scalar_func_apply([](float x) -> float { return std::log(x); })}, 29 | {"log2", scalar_func_apply([](float x) -> float { return std::log2(x); })}, 30 | {"log10", scalar_func_apply([](float x) -> float { return std::log10(x); })}, 31 | {"exp", scalar_func_apply([](float x) -> float { return std::exp(x); })}, 32 | {"exp2", scalar_func_apply([](float x) -> float { return std::exp2(x); })}, 33 | {"exp10", scalar_func_apply([](float x) -> float { return exp10f(x); })}, 34 | {"sqrt", scalar_func_apply([](float x) -> float { return std::sqrt(x); })}, 35 | {"rsqrt", scalar_func_apply([](float x) -> float { return 1.0f / std::sqrt(x); })}, 36 | {"pow3.5", scalar_func_apply([](float x) -> float { return std::pow(x, 3.5f); })}, 37 | {"pow13", scalar_func_apply([](float x) -> float { return std::pow(x, 13.0f); })}, 38 | }; 39 | 40 | std::unordered_map> funs_dx1 = { 41 | {"memcpy", [](const double *src, double *dst, size_t N) { std::memcpy(dst, src, N * sizeof(double)); }}, 42 | {"memset", [](const double *src, double *dst, size_t N) { std::memset(dst, 0, N * sizeof(double)); }}, 43 | {"tgamma", 
scalar_func_apply([](double x) -> double { return std::tgamma(x); })}, 44 | {"lgamma", scalar_func_apply([](double x) -> double { return std::lgamma(x); })}, 45 | {"sin", scalar_func_apply([](double x) -> double { return std::sin(x); })}, 46 | {"cos", scalar_func_apply([](double x) -> double { return std::cos(x); })}, 47 | {"tan", scalar_func_apply([](double x) -> double { return std::tan(x); })}, 48 | {"asin", scalar_func_apply([](double x) -> double { return std::asin(x); })}, 49 | {"acos", scalar_func_apply([](double x) -> double { return std::acos(x); })}, 50 | {"atan", scalar_func_apply([](double x) -> double { return std::atan(x); })}, 51 | {"asin", scalar_func_apply([](double x) -> double { return std::asin(x); })}, 52 | {"acos", scalar_func_apply([](double x) -> double { return std::acos(x); })}, 53 | {"atan", scalar_func_apply([](double x) -> double { return std::atan(x); })}, 54 | {"sinh", scalar_func_apply([](double x) -> double { return std::sinh(x); })}, 55 | {"cosh", scalar_func_apply([](double x) -> double { return std::cosh(x); })}, 56 | {"tanh", scalar_func_apply([](double x) -> double { return std::tanh(x); })}, 57 | {"asinh", scalar_func_apply([](double x) -> double { return std::asinh(x); })}, 58 | {"acosh", scalar_func_apply([](double x) -> double { return std::acosh(x); })}, 59 | {"atanh", scalar_func_apply([](double x) -> double { return std::atanh(x); })}, 60 | {"sin_pi", scalar_func_apply([](double x) -> double { return std::sin(M_PI * x); })}, 61 | {"cos_pi", scalar_func_apply([](double x) -> double { return std::cos(M_PI * x); })}, 62 | {"erf", scalar_func_apply([](double x) -> double { return std::erf(x); })}, 63 | {"erfc", scalar_func_apply([](double x) -> double { return std::erfc(x); })}, 64 | {"log", scalar_func_apply([](double x) -> double { return std::log(x); })}, 65 | {"log2", scalar_func_apply([](double x) -> double { return std::log2(x); })}, 66 | {"log10", scalar_func_apply([](double x) -> double { return std::log10(x); })}, 
67 | {"exp", scalar_func_apply([](double x) -> double { return std::exp(x); })}, 68 | {"exp2", scalar_func_apply([](double x) -> double { return std::exp2(x); })}, 69 | {"exp10", scalar_func_apply([](double x) -> double { return exp10(x); })}, 70 | {"sqrt", scalar_func_apply([](double x) -> double { return std::sqrt(x); })}, 71 | {"rsqrt", scalar_func_apply([](double x) -> double { return 1.0 / std::sqrt(x); })}, 72 | {"pow3.5", scalar_func_apply([](double x) -> double { return std::pow(x, 3.5); })}, 73 | {"pow13", scalar_func_apply([](double x) -> double { return std::pow(x, 13); })}, 74 | }; 75 | 76 | std::unordered_map> &get_funs_fx1() { return funs_fx1; } 77 | std::unordered_map> &get_funs_dx1() { return funs_dx1; } 78 | } // namespace sf::functions::stl 79 | -------------------------------------------------------------------------------- /src/hank103.f: -------------------------------------------------------------------------------- 1 | ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc 2 | c 3 | c 4 | c this is the end of the debugging code and the beginning of the 5 | c hankel function code proper. 6 | c 7 | c 8 | ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc 9 | c 10 | c 11 | c 12 | c 13 | c code yanked with author's permission from 14 | c https://github.com/ahbarnett/mpspack 15 | subroutine hanks103(z,hanks,n,ifexpon) 16 | implicit real *8 (a-h,o-z) 17 | complex *16 z,hanks(1),cd,cdd 18 | c 19 | c This subroutine evaluates the first n+1 Hankel functions of the 20 | c argument z. The user also has the option of evaluating the 21 | c functions H_m(z) scaled by the (complex) coefficient e^{-i \cdot z}. 
22 | c This option is provided via the parameter ifexpon (see below) 23 | c 24 | c 25 | c input parameters: 26 | c 27 | c z - the complex number for which the hankel functions 28 | c H_0, H_1 are to be evaluated 29 | c n - the highest order of any Hankel function to be evaluated 30 | c ifexpon - the integer parameter telling the subroutine whether 31 | c to calculate the actual values of the hankel functions, 32 | c or the values of Hankel functions scaled by e^{-i \cdot z}. 33 | c Permitted values: 0 and 1. 34 | c ifexpon = 1 will cause the subroutine to evaluate the Hankel functions 35 | c honestly 36 | c ifexpon = 0 will cause the subroutine to scale the Hankel functions 37 | c by e^{-i \cdot z}. 38 | c 39 | c output parameters: 40 | c 41 | c hanks - the first n+1 Hankel functions of the (complex) argument z. 42 | c Please note that hanks(1) is the Hankel function of order 0, 43 | c hanks(2) is the Hankel function of order 1, ..., hanks(n+1) 44 | c is the Hankel function of order n 45 | c 46 | c . . . evaluate the functions h0,h1 47 | c 48 | call hank103(z,hanks(1),hanks(2),ifexpon) 49 | c 50 | c 51 | c conduct recursion 52 | c 53 | cd=2/z 54 | cdd=cd 55 | do 1200 i1=2,n 56 | c 57 | i=i1-1 58 | c 59 | cccc hanks(i1+1)=(2*i)/z*hanks(i1)-hanks(i1-1) 60 | hanks(i1+1)=cdd*hanks(i1)-hanks(i1-1) 61 | c 62 | cdd=cdd+cd 63 | 1200 continue 64 | c 65 | return 66 | end 67 | c 68 | c 69 | c 70 | c 71 | c 72 | subroutine hank103(z,h0,h1,ifexpon) 73 | implicit real *8 (a-h,o-z) 74 | complex *16 z,h0,h1,h0u,h0r,h1u,h1r, 75 | 1 fj0,fj1,y0,y1,com,zu,zr,ima,ser2,ser3,z2, 76 | 2 cclog,cd 77 | real *8 rea(2) 78 | equivalence (rea(1),com) 79 | data ima/(0.0d0,1.0d0)/,pi/0.31415926535897932D+01/ 80 | c 81 | c this subroutine evaluates the hankel functions H_0^1, H_1^1 82 | c for an arbitrary user-specified complex number z. The user 83 | c also has the option of evaluating the functions h0, h1 84 | c scaled by the (complex) coefficient e^{-i \cdot z}. 
This 85 | c subroutine is a modification of the subroutine hank102 86 | c (see), different from the latter by having the parameter 87 | c ifexpon. Please note that the subroutine hank102 is in 88 | c turn a slightly accelerated version of the old hank101 89 | c (see). The principal claim to fame of all three is that 90 | c they are valid on the whole complex plane, and are 91 | c reasonably accurate (14-digit relative accuracy) and 92 | c reasonably fast. Also, please note that all three have not 93 | c been carefully tested in the third quadrant (both x and y 94 | c negative); some sort of numerical trouble is possible 95 | c (though has not been observed) for LARGE z in the third 96 | c quadrant. 97 | c 98 | c input parameters: 99 | c 100 | c z - the complex number for which the hankel functions 101 | c H_0, H_1 are to be evaluated 102 | c ifexpon - the integer parameter telling the subroutine whether 103 | c to calculate the actual values of the hankel functions, 104 | c or the values of Hankel functions scaled by e^{-i \cdot z}. 105 | c Permitted values: 0 and 1. 106 | c ifexpon = 1 will cause the subroutine to evaluate the Hankel functions 107 | c honestly 108 | c ifexpon = 0 will cause the subroutine to scale the Hankel functions 109 | c by e^{-i \cdot z}. 110 | c 111 | c output parameters: 112 | c 113 | c h0, h1 - the said Hankel functions 114 | c 115 | c 116 | c . . . if z in the upper half-plane - act accordingly 117 | c 118 | com=z 119 | if(rea(2) .lt. 0) goto 1400 120 | call hank103u(z,ier,h0,h1,ifexpon) 121 | return 122 | 1400 continue 123 | c 124 | c if z is in the right lower quadrant - act accordingly 125 | c 126 | if(rea(1) .lt. 0) goto 2000 127 | call hank103r(z,ier,h0,h1,ifexpon) 128 | return 129 | 2000 continue 130 | c 131 | c z is in the left lower quadrant. 
compute 132 | c h0, h1 at the points zu, zr obtained from z by reflection 133 | c in the x and y axis, respectively 134 | c 135 | zu=dconjg(z) 136 | zr=-zu 137 | c 138 | call hank103u(zu,ier,h0u,h1u,ifexpon) 139 | call hank103r(zr,ier,h0r,h1r,ifexpon) 140 | 141 | if(ifexpon .eq. 1) goto 3000 142 | 143 | com=zu 144 | subt=abs(rea(2)) 145 | 146 | cd=exp(ima*zu-subt) 147 | h0u=h0u*cd 148 | h1u=h1u*cd 149 | 150 | cd=exp(ima*zr-subt) 151 | h0r=h0r*cd 152 | h1r=h1r*cd 153 | 3000 continue 154 | c 155 | c compute the functions j0, j1, y0, y1 156 | c at the point zr 157 | c 158 | half=1 159 | half=half/2 160 | y0=(h0u+h0r)*half/ima 161 | fj0=-(h0u-h0r)*half 162 | c 163 | y1=-(h1u-h1r)*half/ima 164 | fj1=(h1u+h1r)*half 165 | c 166 | c finally, compute h0, h1 167 | c 168 | c . . . calculate ser2, ser3 169 | c 170 | z2=-dconjg(z) 171 | cclog=cdlog(z2) 172 | ser2=y0-fj0*2/pi*cclog 173 | ser3=y1-fj1*2/pi*cclog 174 | c 175 | c reflect all of these in the imaginary axis 176 | c 177 | fj0=dconjg(fj0) 178 | fj1=-dconjg(fj1) 179 | c 180 | ser2=dconjg(ser2) 181 | ser3=-dconjg(ser3) 182 | c 183 | c reconstitute y0, y1 184 | c 185 | cclog=cdlog(z) 186 | y0=ser2+fj0*2/pi*cclog 187 | y1=ser3+fj1*2/pi*cclog 188 | c 189 | h0=fj0+ima*y0 190 | h1=fj1+ima*y1 191 | 192 | if(ifexpon .eq. 
1) return 193 | 194 | cd=exp(-ima*z+subt) 195 | h0=h0*cd 196 | h1=h1*cd 197 | 198 | return 199 | end 200 | c 201 | c 202 | c 203 | c 204 | c 205 | subroutine hank103u(z,ier,h0,h1,ifexpon) 206 | implicit real *8 (a-h,o-z) 207 | complex *16 z,com,ima,cd,h0,h1,ccex,zzz9 208 | dimension rea(2) 209 | real *8 c0p1(34),c0p1b(36),buf01(2) 210 | equivalence (c0p1(34),buf01(1)), 211 | 1 (c0p1b(1),buf01(2)),(rea(1),com) 212 | real *8 c1p1(34),c1p1b(36),buf11(2) 213 | equivalence (c1p1(34),buf11(1)), 214 | 1 (c1p1b(1),buf11(2)) 215 | real *8 c0p2(34),c0p2b(28),buf02(2) 216 | equivalence (c0p2(34),buf02(1)), 217 | 1 (c0p2b(1),buf02(2)) 218 | real *8 c1p2(34),c1p2b(28),buf12(2) 219 | equivalence (c1p2(34),buf12(1)), 220 | 1 (c1p2b(1),buf12(2)) 221 | data ima/(0.0d0,1.0d0)/ 222 | c 223 | c this subroutine evaluates the hankel functions H_0^1, H_1^1 224 | c for a user-specified complex number z in the upper half-plane. 225 | c it is reasonably accurate (14-digit relative accuracy) 226 | c and reasonably fast. 227 | c 228 | c 229 | c input parameters: 230 | c 231 | c z - the complex number for which the hankel functions 232 | c H_0, H_1 are to be evaluated 233 | c 234 | c output parameters: 235 | c 236 | c ier - error return code. 
237 | c ier=0 means successful conclusion 238 | c ier=4 means that z is not in the upper half-plane 239 | c h0, h1 - the said Hankel functions 240 | c 241 | data c0p1/ 242 | 1 -.6619836118357782D-12, -.6619836118612709D-12, 243 | 2 -.7307514264754200D-21, 0.3928160926261892D-10, 244 | 3 0.5712712520172854D-09, -.5712712519967086D-09, 245 | 4 -.1083820384008718D-07, -.1894529309455499D-18, 246 | 5 0.7528123700585197D-07, 0.7528123700841491D-07, 247 | 6 0.1356544045548053D-16, -.8147940452202855D-06, 248 | 7 -.3568198575016769D-05, 0.3568198574899888D-05, 249 | 8 0.2592083111345422D-04, 0.4209074870019400D-15, 250 | 9 -.7935843289157352D-04, -.7935843289415642D-04, 251 | a -.6848330800445365D-14, 0.4136028298630129D-03, 252 | 1 0.9210433149997867D-03, -.9210433149680665D-03, 253 | 2 -.3495306809056563D-02, -.6469844672213905D-13, 254 | 3 0.5573890502766937D-02, 0.5573890503000873D-02, 255 | 4 0.3767341857978150D-12, -.1439178509436339D-01, 256 | 5 -.1342403524448708D-01, 0.1342403524340215D-01, 257 | 6 0.8733016209933828D-02, 0.1400653553627576D-11, 258 | 7 0.2987361261932706D-01, 0.2987361261607835D-01/ 259 | data c0p1b/ 260 | 8 -.3388096836339433D-11, -.1690673895793793D+00, 261 | 9 0.2838366762606121D+00, -.2838366762542546D+00, 262 | a 0.7045107746587499D+00, -.5363893133864181D-11, 263 | 1 -.7788044738211666D+00, -.7788044738130360D+00, 264 | 2 0.5524779104964783D-11, 0.1146003459721775D+01, 265 | 3 0.6930697486173089D+00, -.6930697486240221D+00, 266 | 4 -.7218270272305891D+00, 0.3633022466839301D-11, 267 | 5 0.3280924142354455D+00, 0.3280924142319602D+00, 268 | 6 -.1472323059106612D-11, -.2608421334424268D+00, 269 | 7 -.9031397649230536D-01, 0.9031397649339185D-01, 270 | 8 0.5401342784296321D-01, -.3464095071668884D-12, 271 | 9 -.1377057052946721D-01, -.1377057052927901D-01, 272 | a 0.4273263742980154D-13, 0.5877224130705015D-02, 273 | 1 0.1022508471962664D-02, -.1022508471978459D-02, 274 | 2 -.2789107903871137D-03, 0.2283984571396129D-14, 275 | 3 
0.2799719727019427D-04, 0.2799719726970900D-04, 276 | 4 -.3371218242141487D-16, -.3682310515545645D-05, 277 | 5 -.1191412910090512D-06, 0.1191412910113518D-06/ 278 | c 279 | data c1p1/ 280 | 281 | 1 0.4428361927253983D-12, -.4428361927153559D-12, 282 | 2 -.2575693161635231D-10, -.2878656317479645D-21, 283 | 3 0.3658696304107867D-09, 0.3658696304188925D-09, 284 | 4 0.7463138750413651D-19, -.6748894854135266D-08, 285 | 5 -.4530098210372099D-07, 0.4530098210271137D-07, 286 | 6 0.4698787882823243D-06, 0.5343848349451927D-17, 287 | 7 -.1948662942158171D-05, -.1948662942204214D-05, 288 | 8 -.1658085463182409D-15, 0.1316906100496570D-04, 289 | 9 0.3645368564036497D-04, -.3645368563934748D-04, 290 | a -.1633458547818390D-03, -.2697770638600506D-14, 291 | 1 0.2816784976551660D-03, 0.2816784976676616D-03, 292 | 2 0.2548673351180060D-13, -.6106478245116582D-03, 293 | 3 0.2054057459296899D-03, -.2054057460218446D-03, 294 | 4 -.6254962367291260D-02, 0.1484073406594994D-12, 295 | 5 0.1952900562500057D-01, 0.1952900562457318D-01, 296 | 6 -.5517611343746895D-12, -.8528074392467523D-01, 297 | 7 -.1495138141086974D+00, 0.1495138141099772D+00/ 298 | c 299 | data c1p1b/ 300 | 8 0.4394907314508377D+00, -.1334677126491326D-11, 301 | 9 -.1113740586940341D+01, -.1113740586937837D+01, 302 | a 0.2113005088866033D-11, 0.1170212831401968D+01, 303 | 1 0.1262152242318805D+01, -.1262152242322008D+01, 304 | 2 -.1557810619605511D+01, 0.2176383208521897D-11, 305 | 3 0.8560741701626648D+00, 0.8560741701600203D+00, 306 | 4 -.1431161194996653D-11, -.8386735092525187D+00, 307 | 5 -.3651819176599290D+00, 0.3651819176613019D+00, 308 | 6 0.2811692367666517D+00, -.5799941348040361D-12, 309 | 7 -.9494630182937280D-01, -.9494630182894480D-01, 310 | 8 0.1364615527772751D-12, 0.5564896498129176D-01, 311 | 9 0.1395239688792536D-01, -.1395239688799950D-01, 312 | a -.5871314703753967D-02, 0.1683372473682212D-13, 313 | 1 0.1009157100083457D-02, 0.1009157100077235D-02, 314 | 2 -.8997331160162008D-15, 
-.2723724213360371D-03, 315 | 3 -.2708696587599713D-04, 0.2708696587618830D-04, 316 | 4 0.3533092798326666D-05, -.1328028586935163D-16, 317 | 5 -.1134616446885126D-06, -.1134616446876064D-06/ 318 | c 319 | data c0p2/ 320 | 1 0.5641895835516786D+00, -.5641895835516010D+00, 321 | 2 -.3902447089770041D-09, -.3334441074447365D-11, 322 | 3 -.7052368835911731D-01, -.7052368821797083D-01, 323 | 4 0.1957299315085370D-08, -.3126801711815631D-06, 324 | 5 -.3967331737107949D-01, 0.3967327747706934D-01, 325 | 6 0.6902866639752817D-04, 0.3178420816292497D-06, 326 | 7 0.4080457166061280D-01, 0.4080045784614144D-01, 327 | 8 -.2218731025620065D-04, 0.6518438331871517D-02, 328 | 9 0.9798339748600499D-01, -.9778028374972253D-01, 329 | a -.3151825524811773D+00, -.7995603166188139D-03, 330 | 1 0.1111323666639636D+01, 0.1116791178994330D+01, 331 | 2 0.1635711249533488D-01, -.8527067497983841D+01, 332 | 3 -.2595553689471247D+02, 0.2586942834408207D+02, 333 | 4 0.1345583522428299D+03, 0.2002017907999571D+00, 334 | 5 -.3086364384881525D+03, -.3094609382885628D+03, 335 | 6 -.1505974589617013D+01, 0.1250150715797207D+04, 336 | 7 0.2205210257679573D+04, -.2200328091885836D+04/ 337 | data c0p2b/ 338 | 8 -.6724941072552172D+04, -.7018887749450317D+01, 339 | 9 0.8873498980910335D+04, 0.8891369384353965D+04, 340 | a 0.2008805099643591D+02, -.2030681426035686D+05, 341 | 1 -.2010017782384992D+05, 0.2006046282661137D+05, 342 | 2 0.3427941581102808D+05, 0.3432892927181724D+02, 343 | 3 -.2511417407338804D+05, -.2516567363193558D+05, 344 | 4 -.3318253740485142D+02, 0.3143940826027085D+05, 345 | 5 0.1658466564673543D+05, -.1654843151976437D+05, 346 | 6 -.1446345041326510D+05, -.1645433213663233D+02, 347 | 7 0.5094709396573681D+04, 0.5106816671258367D+04, 348 | 8 0.3470692471612145D+01, -.2797902324245621D+04, 349 | 9 -.5615581955514127D+03, 0.5601021281020627D+03, 350 | a 0.1463856702925587D+03, 0.1990076422327786D+00, 351 | 1 -.9334741618922085D+01, -.9361368967669095D+01/ 352 | c 353 | data c1p2/ 354 
| 1 -.5641895835446003D+00, -.5641895835437973D+00, 355 | 2 0.3473016376419171D-10, -.3710264617214559D-09, 356 | 3 0.2115710836381847D+00, -.2115710851180242D+00, 357 | 4 0.3132928887334847D-06, 0.2064187785625558D-07, 358 | 5 -.6611954881267806D-01, -.6611997176900310D-01, 359 | 6 -.3386004893181560D-05, 0.7146557892862998D-04, 360 | 7 -.5728505088320786D-01, 0.5732906930408979D-01, 361 | 8 -.6884187195973806D-02, -.2383737409286457D-03, 362 | 9 0.1170452203794729D+00, 0.1192356405185651D+00, 363 | a 0.8652871239920498D-02, -.3366165876561572D+00, 364 | 1 -.1203989383538728D+01, 0.1144625888281483D+01, 365 | 2 0.9153684260534125D+01, 0.1781426600949249D+00, 366 | 3 -.2740411284066946D+02, -.2834461441294877D+02, 367 | 4 -.2192611071606340D+01, 0.1445470231392735D+03, 368 | 5 0.3361116314072906D+03, -.3270584743216529D+03, 369 | 6 -.1339254798224146D+04, -.1657618537130453D+02, 370 | 7 0.2327097844591252D+04, 0.2380960024514808D+04/ 371 | data c1p2b/ 372 | 8 0.7760611776965994D+02, -.7162513471480693D+04, 373 | 9 -.9520608696419367D+04, 0.9322604506839242D+04, 374 | a 0.2144033447577134D+05, 0.2230232555182369D+03, 375 | 1 -.2087584364240919D+05, -.2131762020653283D+05, 376 | 2 -.3825699231499171D+03, 0.3582976792594737D+05, 377 | 3 0.2642632405857713D+05, -.2585137938787267D+05, 378 | 4 -.3251446505037506D+05, -.3710875194432116D+03, 379 | 5 0.1683805377643986D+05, 0.1724393921722052D+05, 380 | 6 0.1846128226280221D+03, -.1479735877145448D+05, 381 | 7 -.5258288893282565D+04, 0.5122237462705988D+04, 382 | 8 0.2831540486197358D+04, 0.3905972651440027D+02, 383 | 9 -.5562781548969544D+03, -.5726891190727206D+03, 384 | a -.2246192560136119D+01, 0.1465347141877978D+03, 385 | 1 0.9456733342595993D+01, -.9155767836700837D+01/ 386 | c 387 | c if the user-specified z is in the lower half-plane 388 | c - bomb out 389 | c 390 | ier=0 391 | com=z 392 | if(rea(2) .ge. 
0) goto 1200 393 | ier=4 394 | return 395 | 1200 continue 396 | c 397 | done=1 398 | thresh1=1**2 399 | thresh2=3.7**2 400 | thresh3=20**2 401 | c 402 | c check if if the user-specified z is in one of the 403 | c intermediate regimes 404 | c 405 | d=z*dconjg(z) 406 | if( (d .lt. thresh1) .or. (d .gt. thresh3) ) goto 3000 407 | c 408 | c the user-specified z is in one of the intermediate regimes. 409 | c act accordingly 410 | c 411 | c 412 | if(d .gt. thresh2) goto 2000 413 | c 414 | c z is in the first intermediate regime: its absolute value is 415 | c between 1 and 3.7. act accordingly 416 | c 417 | c . . . evaluate the expansion 418 | c 419 | cd=done/cdsqrt(z) 420 | c 421 | ccex=cd 422 | if(ifexpon .eq. 1) ccex=ccex*cdexp(ima*z) 423 | c 424 | zzz9=z**9 425 | m=35 426 | call hank103p(c0p1,m,cd,h0) 427 | h0=h0*ccex * zzz9 428 | c 429 | call hank103p(c1p1,m,cd,h1) 430 | h1=h1*ccex * zzz9 431 | return 432 | 2000 continue 433 | c 434 | c z is in the second intermediate regime: its absolute value is 435 | c between 3.7 and 20. act accordingly. 436 | c 437 | cd=done/cdsqrt(z) 438 | c 439 | ccex=cd 440 | if(ifexpon .eq. 1) ccex=ccex*cdexp(ima*z) 441 | 442 | m=31 443 | call hank103p(c0p2,m,cd,h0) 444 | h0=h0*ccex 445 | c 446 | m=31 447 | call hank103p(c1p2,m,cd,h1) 448 | h1=h1*ccex 449 | return 450 | 3000 continue 451 | c 452 | c z is either in the local regime or the asymptotic one. 453 | c if it is in the local regime - act accordingly. 454 | c 455 | if(d .gt. 50.d0) goto 4000 456 | call hank103l(z,h0,h1,ifexpon) 457 | return 458 | c 459 | c z is in the asymptotic regime. act accordingly. 
460 | c 461 | 4000 continue 462 | call hank103a(z,h0,h1,ifexpon) 463 | return 464 | end 465 | c 466 | c 467 | c 468 | c 469 | subroutine hank103p(p,m,z,f) 470 | implicit real *8 (a-h,o-z) 471 | complex *16 p(1),z,f 472 | c 473 | c evaluate a polynomial at a point 474 | c 475 | f=p(m) 476 | do 1200 i=m-1,1,-1 477 | f=f*z+p(i) 478 | 1200 continue 479 | return 480 | end 481 | 482 | 483 | 484 | 485 | c 486 | c 487 | c 488 | c 489 | c 490 | subroutine hank103a(z,h0,h1,ifexpon) 491 | implicit real *8 (a-h,o-z) 492 | dimension p(18),q(18),p1(18),q1(18),rea(2) 493 | complex *16 z,zinv,pp,qq,ima,h0,h1,pp1,qq1, 494 | 1 com,cccexp,cdd,cdumb,zinv22 495 | equivalence (rea(1),com) 496 | data ima/(0.0d0,1.0d0)/,pi/0.31415926535897932D+01/, 497 | 1 done/1.0d0/,cdumb/ 498 | 2 (0.70710678118654757D+00,-.70710678118654746D+00)/ 499 | c 500 | data p/ 501 | 1 0.1000000000000000D+01, -.7031250000000000D-01, 502 | 2 0.1121520996093750D+00, -.5725014209747314D+00, 503 | 3 0.6074042001273483D+01, -.1100171402692467D+03, 504 | 4 0.3038090510922384D+04, -.1188384262567833D+06, 505 | 5 0.6252951493434797D+07, -.4259392165047669D+09, 506 | 6 0.3646840080706556D+11, -.3833534661393944D+13, 507 | 7 0.4854014686852901D+15, -.7286857349377657D+17, 508 | 8 0.1279721941975975D+20, -.2599382102726235D+22, 509 | 9 0.6046711487532401D+24, -.1597065525294211D+27/ 510 | c 511 | data q/ 512 | 1 -.1250000000000000D+00, 0.7324218750000000D-01, 513 | 2 -.2271080017089844D+00, 0.1727727502584457D+01, 514 | 3 -.2438052969955606D+02, 0.5513358961220206D+03, 515 | 4 -.1825775547429317D+05, 0.8328593040162893D+06, 516 | 5 -.5006958953198893D+08, 0.3836255180230434D+10, 517 | 6 -.3649010818849834D+12, 0.4218971570284096D+14, 518 | 7 -.5827244631566907D+16, 0.9476288099260110D+18, 519 | 8 -.1792162323051699D+21, 0.3900121292034000D+23, 520 | 9 -.9677028801069847D+25, 0.2715581773544907D+28/ 521 | 522 | data p1/ 523 | 1 0.1000000000000000D+01, 0.1171875000000000D+00, 524 | 2 -.1441955566406250D+00, 
0.6765925884246826D+00, 525 | 3 -.6883914268109947D+01, 0.1215978918765359D+03, 526 | 4 -.3302272294480852D+04, 0.1276412726461746D+06, 527 | 5 -.6656367718817687D+07, 0.4502786003050393D+09, 528 | 6 -.3833857520742789D+11, 0.4011838599133198D+13, 529 | 7 -.5060568503314726D+15, 0.7572616461117957D+17, 530 | 8 -.1326257285320556D+20, 0.2687496750276277D+22, 531 | 9 -.6238670582374700D+24, 0.1644739123064188D+27/ 532 | c 533 | data q1/ 534 | 1 0.3750000000000000D+00, -.1025390625000000D+00, 535 | 2 0.2775764465332031D+00, -.1993531733751297D+01, 536 | 3 0.2724882731126854D+02, -.6038440767050702D+03, 537 | 4 0.1971837591223663D+05, -.8902978767070679D+06, 538 | 5 0.5310411010968522D+08, -.4043620325107754D+10, 539 | 6 0.3827011346598606D+12, -.4406481417852279D+14, 540 | 7 0.6065091351222699D+16, -.9833883876590680D+18, 541 | 8 0.1855045211579829D+21, -.4027994121281017D+23, 542 | 9 0.9974783533410457D+25, -.2794294288720121D+28/ 543 | c 544 | c evaluate the asymptotic expansion for h0,h1 at 545 | c the user-supplied point z, provided it is not 546 | c in the fourth quadrant 547 | c 548 | m=10 549 | zinv=done/z 550 | c 551 | pp=p(m) 552 | pp1=p1(m) 553 | zinv22=zinv**2 554 | c 555 | qq=q(m) 556 | qq1=q1(m) 557 | c 558 | do 1600 i=m-1,1,-1 559 | 560 | pp=pp* zinv22+p(i) 561 | pp1=pp1* zinv22+p1(i) 562 | 563 | qq=qq* zinv22+q(i) 564 | qq1=qq1* zinv22+q1(i) 565 | 1600 continue 566 | c 567 | qq=qq*zinv 568 | qq1=qq1*zinv 569 | c 570 | cccexp=1 571 | if(ifexpon .eq. 
1) cccexp=cdexp(ima*z) 572 | c 573 | cdd=cdsqrt(2/pi*zinv) 574 | c 575 | h0=pp+ima*qq 576 | h0=cdd*cdumb*cccexp * h0 577 | c 578 | h1=pp1+ima*qq1 579 | h1=-cdd*cccexp*cdumb* h1*ima 580 | c 581 | return 582 | end 583 | c 584 | c 585 | c 586 | c 587 | c 588 | subroutine hank103l(z,h0,h1,ifexpon) 589 | implicit real *8 (a-h,o-z) 590 | dimension cj0(16),cj1(16),ser2(16),ser2der(16) 591 | complex *16 z,fj0,fj1,y0,y1,h0,h1,z2,cd,ima,cdddlog 592 | c 593 | data gamma/0.5772156649015328606d+00/ 594 | data ima/(0.0d0,1.0d0)/,pi/0.31415926535897932D+01/, 595 | 1 two/2.0d0/ 596 | c 597 | c this subroutine evaluates the hankel functions H_0^1, H_1^1 598 | c for a user-specified complex number z in the local regime, 599 | c i. e. for cdabs(z) < 1 in the upper half-plane, 600 | c and for cdabs(z) < 4 in the lower half-plane, 601 | c it is reasonably accurate (14-digit relative accuracy) and 602 | c reasonably fast. 603 | c 604 | c input parameters: 605 | c 606 | c z - the complex number for which the hankel functions 607 | c H_0, H_1 are to be evaluated 608 | c 609 | c output parameters: 610 | c 611 | c h0, h1 - the said Hankel functions 612 | c 613 | data cj0/ 614 | 1 0.1000000000000000D+01, -.2500000000000000D+00, 615 | 2 0.1562500000000000D-01, -.4340277777777778D-03, 616 | 3 0.6781684027777778D-05, -.6781684027777778D-07, 617 | 4 0.4709502797067901D-09, -.2402807549524439D-11, 618 | 5 0.9385966990329841D-14, -.2896903392077112D-16, 619 | 6 0.7242258480192779D-19, -.1496334396734045D-21, 620 | 7 0.2597802772107717D-24, -.3842903509035085D-27, 621 | 8 0.4901662639075363D-30, -.5446291821194848D-33/ 622 | data cj1/ 623 | 1 -.5000000000000000D+00, 0.6250000000000000D-01, 624 | 2 -.2604166666666667D-02, 0.5425347222222222D-04, 625 | 3 -.6781684027777778D-06, 0.5651403356481481D-08, 626 | 4 -.3363930569334215D-10, 0.1501754718452775D-12, 627 | 5 -.5214426105738801D-15, 0.1448451696038556D-17, 628 | 6 -.3291935672814899D-20, 0.6234726653058522D-23, 629 | 7 -.9991549123491221D-26, 
0.1372465538941102D-28, 630 | 8 -.1633887546358454D-31, 0.1701966194123390D-34/ 631 | data ser2/ 632 | 1 0.2500000000000000D+00, -.2343750000000000D-01, 633 | 2 0.7957175925925926D-03, -.1412850839120370D-04, 634 | 3 0.1548484519675926D-06, -.1153828185281636D-08, 635 | 4 0.6230136717695511D-11, -.2550971742728932D-13, 636 | 5 0.8195247730999099D-16, -.2121234517551702D-18, 637 | 6 0.4518746345057852D-21, -.8061529302289970D-24, 638 | 7 0.1222094716680443D-26, -.1593806157473552D-29, 639 | 8 0.1807204342667468D-32, -.1798089518115172D-35/ 640 | data ser2der/ 641 | 1 0.5000000000000000D+00, -.9375000000000000D-01, 642 | 2 0.4774305555555556D-02, -.1130280671296296D-03, 643 | 3 0.1548484519675926D-05, -.1384593822337963D-07, 644 | 4 0.8722191404773715D-10, -.4081554788366291D-12, 645 | 5 0.1475144591579838D-14, -.4242469035103405D-17, 646 | 6 0.9941241959127275D-20, -.1934767032549593D-22, 647 | 7 0.3177446263369152D-25, -.4462657240925946D-28, 648 | 8 0.5421613028002404D-31, -.5753886457968550D-34/ 649 | c 650 | c evaluate j0, j1 651 | c 652 | m=16 653 | fj0=0 654 | fj1=0 655 | y0=0 656 | y1=0 657 | z2=z**2 658 | cd=1 659 | c 660 | do 1800 i=1,m 661 | fj0=fj0+cj0(i)*cd 662 | fj1=fj1+cj1(i)*cd 663 | y1=y1+ser2der(i)*cd 664 | cd=cd*z2 665 | y0=y0+ser2(i)*cd 666 | 1800 continue 667 | fj1=-fj1*z 668 | c 669 | cdddlog=cdlog(z/two)+gamma 670 | y0=cdddlog*fj0+y0 671 | y0=two/pi*y0 672 | c 673 | y1=y1*z 674 | c 675 | y1=-cdddlog*fj1+fj0/z+y1 676 | y1=-y1*two/pi 677 | c 678 | h0=fj0+ima*y0 679 | h1=fj1+ima*y1 680 | c 681 | if(ifexpon .eq. 
1) return 682 | c 683 | cd=exp(-ima*z) 684 | h0=h0*cd 685 | h1=h1*cd 686 | c 687 | return 688 | end 689 | c 690 | c 691 | c 692 | c 693 | c 694 | subroutine hank103r(z,ier,h0,h1,ifexpon) 695 | implicit real *8 (a-h,o-z) 696 | complex *16 z,com,ima,cd,h0,h1,cccexp,cdd,zz18 697 | dimension rea(2) 698 | real *8 c0p1(34),c0p1b(36),buf01(2) 699 | equivalence (c0p1(34),buf01(1)), 700 | 1 (c0p1b(1),buf01(2)),(rea(1),com) 701 | real *8 c1p1(34),c1p1b(36),buf11(2) 702 | equivalence (c1p1(34),buf11(1)), 703 | 1 (c1p1b(1),buf11(2)) 704 | real *8 c0p2(34),c0p2b(20),buf02(2) 705 | equivalence (c0p2(34),buf02(1)), 706 | 1 (c0p2b(1),buf02(2)) 707 | real *8 c1p2(34),c1p2b(28),buf12(2) 708 | equivalence (c1p2(34),buf12(1)), 709 | 1 (c1p2b(1),buf12(2)) 710 | data ima/(0.0d0,1.0d0)/ 711 | c 712 | c this subroutine evaluates the hankel functions H_0^1, H_1^1 713 | c for a user-specified complex number z in the right lower 714 | c quadrant. it is reasonably accurate (14-digit relative 715 | c accuracy) and reasonably fast. 716 | c 717 | c 718 | c input parameters: 719 | c 720 | c z - the complex number for which the hankel functions 721 | c H_0, H_1 are to be evaluated 722 | c 723 | c output parameters: 724 | c 725 | c ier - error return code. 
726 | c ier=0 means successful conclusion 727 | c ier=4 means that z is not in the right lower quadrant 728 | c h0, h1 - the said Hankel functions 729 | c 730 | data c0p1/ 731 | 1 -.4268441995428495D-23, 0.4374027848105921D-23, 732 | 2 0.9876152216238049D-23, -.1065264808278614D-20, 733 | 3 0.6240598085551175D-19, 0.6658529985490110D-19, 734 | 4 -.5107210870050163D-17, -.2931746613593983D-18, 735 | 5 0.1611018217758854D-15, -.1359809022054077D-15, 736 | 6 -.7718746693707326D-15, 0.6759496139812828D-14, 737 | 7 -.1067620915195442D-12, -.1434699000145826D-12, 738 | 8 0.3868453040754264D-11, 0.7061853392585180D-12, 739 | 9 -.6220133527871203D-10, 0.3957226744337817D-10, 740 | a 0.3080863675628417D-09, -.1154618431281900D-08, 741 | 1 0.7793319486868695D-08, 0.1502570745460228D-07, 742 | 2 -.1978090852638430D-06, -.7396691873499030D-07, 743 | 3 0.2175857247417038D-05, -.8473534855334919D-06, 744 | 4 -.1053381327609720D-04, 0.2042555121261223D-04, 745 | 5 -.4812568848956982D-04, -.1961519090873697D-03, 746 | 6 0.1291714391689374D-02, 0.9234422384950050D-03, 747 | 7 -.1113890671502769D-01, 0.9053687375483149D-03/ 748 | data c0p1b/ 749 | 8 0.5030666896877862D-01, -.4923119348218356D-01, 750 | 9 0.5202355973926321D+00, -.1705244841954454D+00, 751 | a -.1134990486611273D+01, -.1747542851820576D+01, 752 | 1 0.8308174484970718D+01, 0.2952358687641577D+01, 753 | 2 -.3286074510100263D+02, 0.1126542966971545D+02, 754 | 3 0.6576015458463394D+02, -.1006116996293757D+03, 755 | 4 0.3216834899377392D+02, 0.3614005342307463D+03, 756 | 5 -.6653878500833375D+03, -.6883582242804924D+03, 757 | 6 0.2193362007156572D+04, 0.2423724600546293D+03, 758 | 7 -.3665925878308203D+04, 0.2474933189642588D+04, 759 | 8 0.1987663383445796D+04, -.7382586600895061D+04, 760 | 9 0.4991253411017503D+04, 0.1008505017740918D+05, 761 | a -.1285284928905621D+05, -.5153674821668470D+04, 762 | 1 0.1301656757246985D+05, -.4821250366504323D+04, 763 | 2 -.4982112643422311D+04, 0.9694070195648748D+04, 764 | 3 
-.1685723189234701D+04, -.6065143678129265D+04, 765 | 4 0.2029510635584355D+04, 0.1244402339119502D+04, 766 | 5 -.4336682903961364D+03, 0.8923209875101459D+02/ 767 | c 768 | data c1p1/ 769 | 1 -.4019450270734195D-23, -.4819240943285824D-23, 770 | 2 0.1087220822839791D-20, 0.1219058342725899D-21, 771 | 3 -.7458149572694168D-19, 0.5677825613414602D-19, 772 | 4 0.8351815799518541D-18, -.5188585543982425D-17, 773 | 5 0.1221075065755962D-15, 0.1789261470637227D-15, 774 | 6 -.6829972121890858D-14, -.1497462301804588D-14, 775 | 7 0.1579028042950957D-12, -.9414960303758800D-13, 776 | 8 -.1127570848999746D-11, 0.3883137940932639D-11, 777 | 9 -.3397569083776586D-10, -.6779059427459179D-10, 778 | a 0.1149529442506273D-08, 0.4363087909873751D-09, 779 | 1 -.1620182360840298D-07, 0.6404695607668289D-08, 780 | 2 0.9651461037419628D-07, -.1948572160668177D-06, 781 | 3 0.6397881896749446D-06, 0.2318661930507743D-05, 782 | 4 -.1983192412396578D-04, -.1294811208715315D-04, 783 | 5 0.2062663873080766D-03, -.2867633324735777D-04, 784 | 6 -.1084309075952914D-02, 0.1227880935969686D-02, 785 | 7 0.2538406015667726D-03, -.1153316815955356D-01/ 786 | c 787 | data c1p1b/ 788 | 8 0.4520140008266983D-01, 0.5693944718258218D-01, 789 | 9 -.9640790976658534D+00, -.6517135574036008D+00, 790 | a 0.2051491829570049D+01, -.1124151010077572D+01, 791 | 1 -.3977380460328048D+01, 0.8200665483661009D+01, 792 | 2 -.7950131652215817D+01, -.3503037697046647D+02, 793 | 3 0.9607320812492044D+02, 0.7894079689858070D+02, 794 | 4 -.3749002890488298D+03, -.8153831134140778D+01, 795 | 5 0.7824282518763973D+03, -.6035276543352174D+03, 796 | 6 -.5004685759675768D+03, 0.2219009060854551D+04, 797 | 7 -.2111301101664672D+04, -.4035632271617418D+04, 798 | 8 0.7319737262526823D+04, 0.2878734389521922D+04, 799 | 9 -.1087404934318719D+05, 0.3945740567322783D+04, 800 | a 0.6727823761148537D+04, -.1253555346597302D+05, 801 | 1 0.3440468371829973D+04, 0.1383240926370073D+05, 802 | 2 -.9324927373036743D+04, 
-.6181580304530313D+04, 803 | 3 0.6376198146666679D+04, -.1033615527971958D+04, 804 | 4 -.1497604891055181D+04, 0.1929025541588262D+04, 805 | 5 -.4219760183545219D+02, -.4521162915353207D+03/ 806 | c 807 | data c0p2/ 808 | 1 0.5641895835569398D+00, -.5641895835321127D+00, 809 | 2 -.7052370223565544D-01, -.7052369923405479D-01, 810 | 3 -.3966909368581382D-01, 0.3966934297088857D-01, 811 | 4 0.4130698137268744D-01, 0.4136196771522681D-01, 812 | 5 0.6240742346896508D-01, -.6553556513852438D-01, 813 | 6 -.3258849904760676D-01, -.7998036854222177D-01, 814 | 7 -.3988006311955270D+01, 0.1327373751674479D+01, 815 | 8 0.6121789346915312D+02, -.9251865216627577D+02, 816 | 9 0.4247064992018806D+03, 0.2692553333489150D+04, 817 | a -.4374691601489926D+05, -.3625248208112831D+05, 818 | 1 0.1010975818048476D+07, -.2859360062580096D+05, 819 | 2 -.1138970241206912D+08, 0.1051097979526042D+08, 820 | 3 0.2284038899211195D+08, -.2038012515235694D+09, 821 | 4 0.1325194353842857D+10, 0.1937443530361381D+10, 822 | 5 -.2245999018652171D+11, -.5998903865344352D+10, 823 | 6 0.1793237054876609D+12, -.8625159882306147D+11, 824 | 7 -.5887763042735203D+12, 0.1345331284205280D+13/ 825 | c 826 | data c0p2b/ 827 | 8 -.2743432269370813D+13, -.8894942160272255D+13, 828 | 9 0.4276463113794564D+14, 0.2665019886647781D+14, 829 | a -.2280727423955498D+15, 0.3686908790553973D+14, 830 | 1 0.5639846318168615D+15, -.6841529051615703D+15, 831 | 2 0.9901426799966038D+14, 0.2798406605978152D+16, 832 | 3 -.4910062244008171D+16, -.5126937967581805D+16, 833 | 4 0.1387292951936756D+17, 0.1043295727224325D+16, 834 | 5 -.1565204120687265D+17, 0.1215262806973577D+17, 835 | 6 0.3133802397107054D+16, -.1801394550807078D+17, 836 | 7 0.4427598668012807D+16, 0.6923499968336864D+16/ 837 | c 838 | c 839 | data c1p2/ 840 | 1 -.5641895835431980D+00, -.5641895835508094D+00, 841 | 2 0.2115710934750869D+00, -.2115710923186134D+00, 842 | 3 -.6611607335011594D-01, -.6611615414079688D-01, 843 | 4 -.5783289433408652D-01, 
0.5785737744023628D-01, 844 | 5 0.8018419623822896D-01, 0.8189816020440689D-01, 845 | 6 0.1821045296781145D+00, -.2179738973008740D+00, 846 | 7 0.5544705668143094D+00, 0.2224466316444440D+01, 847 | 8 -.8563271248520645D+02, -.4394325758429441D+02, 848 | 9 0.2720627547071340D+04, -.6705390850875292D+03, 849 | a -.3936221960600770D+05, 0.5791730432605451D+05, 850 | 1 -.1976787738827811D+06, -.1502498631245144D+07, 851 | 2 0.2155317823990686D+08, 0.1870953796705298D+08, 852 | 3 -.4703995711098311D+09, 0.3716595906453190D+07, 853 | 4 0.5080557859012385D+10, -.4534199223888966D+10, 854 | 5 -.1064438211647413D+11, 0.8612243893745942D+11, 855 | 6 -.5466017687785078D+12, -.8070950386640701D+12, 856 | 7 0.9337074941225827D+13, 0.2458379240643264D+13/ 857 | c 858 | data c1p2b/ 859 | 8 -.7548692171244579D+14, 0.3751093169954336D+14, 860 | 9 0.2460677431350039D+15, -.5991919372881911D+15, 861 | a 0.1425679408434606D+16, 0.4132221939781502D+16, 862 | 1 -.2247506469468969D+17, -.1269771078165026D+17, 863 | 2 0.1297336292749026D+18, -.2802626909791308D+17, 864 | 3 -.3467137222813017D+18, 0.4773955215582192D+18, 865 | 4 -.2347165776580206D+18, -.2233638097535785D+19, 866 | 5 0.5382350866778548D+19, 0.4820328886922998D+19, 867 | 6 -.1928978948099345D+20, 0.1575498747750907D+18, 868 | 7 0.3049162180215152D+20, -.2837046201123502D+20, 869 | 8 -.5429391644354291D+19, 0.6974653380104308D+20, 870 | 9 -.5322120857794536D+20, -.6739879079691706D+20, 871 | a 0.6780343087166473D+20, 0.1053455984204666D+20, 872 | 1 -.2218784058435737D+20, 0.1505391868530062D+20/ 873 | c 874 | c if z is not in the right lower quadrant - bomb out 875 | c 876 | ier=0 877 | com=z 878 | if( (rea(1) .ge. 0) .and. (rea(2) .le. 0) ) goto 1400 879 | ier=4 880 | return 881 | 1400 continue 882 | c 883 | done=1 884 | thresh1=4**2 885 | thresh2=8**2 886 | thresh3=20**2 887 | c 888 | c check if if the user-specified z is in one of the 889 | c intermediate regimes 890 | c 891 | d=z*dconjg(z) 892 | if( (d .lt. thresh1) .or. 
(d .gt. thresh3) ) goto 3000 893 | c 894 | c if the user-specified z is in the first intermediate regime 895 | c (i.e. if its absolute value is between 4 and 8), act accordingly 896 | c 897 | if(d .gt. thresh2) goto 2000 898 | c 899 | cccexp=1 900 | if(ifexpon .eq. 1) cccexp=cdexp(ima*z) 901 | cdd=done/cdsqrt(z) 902 | cd=done/z 903 | zz18=z**18 904 | m=35 905 | call hank103p(c0p1,m,cd,h0) 906 | h0=h0*cdd*cccexp*zz18 907 | c 908 | call hank103p(c1p1,m,cd,h1) 909 | h1=h1*cdd*cccexp*zz18 910 | return 911 | 2000 continue 912 | c 913 | c z is in the second intermediate regime (i.e. its 914 | c absolute value is between 8 and 20). act accordingly. 915 | c 916 | cd=done/z 917 | cdd=sqrt(cd) 918 | 919 | cccexp=1 920 | if(ifexpon .eq. 1) cccexp=cdexp(ima*z) 921 | 922 | m=27 923 | c 924 | call hank103p(c0p2,m,cd,h0) 925 | h0=h0*cccexp*cdd 926 | c 927 | m=31 928 | call hank103p(c1p2,m,cd,h1) 929 | h1=h1*cccexp*cdd 930 | return 931 | 3000 continue 932 | c 933 | c 934 | c z is either in the local regime or the asymptotic one. 935 | c if it is in the local regime - act accordingly. 936 | c 937 | if(d .gt. 50.d0) goto 4000 938 | call hank103l(z,h0,h1,ifexpon) 939 | return 940 | c 941 | c z is in the asymptotic regime. act accordingly. 
942 | c 943 | 4000 continue 944 | call hank103a(z,h0,h1,ifexpon) 945 | return 946 | end 947 | -------------------------------------------------------------------------------- /src/hank106.f: -------------------------------------------------------------------------------- 1 | cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc 2 | c 3 | c This is the end of the debugging code, and the beginning of 4 | c the Hankel function code proper 5 | c 6 | cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc 7 | c 8 | c 9 | ccc subroutine hank106a(r,h0,h1,w) 10 | subroutine hank106a(r,h0,h1,w,ninterv) 11 | implicit real *8 (a-h,o-z) 12 | integer *4 ninterv 13 | c 14 | c ninterv is number of intervals used in equispaced 15 | c subdivision of current interval 16 | c 17 | complex *16 rk,h0,h1 18 | dimension w(1) 19 | save 20 | c 21 | ccc call hank106b(r,h0,h1,w,w(11) ) 22 | call hank106b(r,h0,h1,w,w(11),ninterv) 23 | c 24 | return 25 | end 26 | c 27 | c 28 | c 29 | c 30 | c 31 | ccc subroutine hank106init(rk7,rmin,rmax,w,keep) 32 | subroutine hank106init(rk7,rmin,rmax,w,keep,ninterv) 33 | implicit real *8 (a-h,o-z) 34 | complex *16 rk,h0,h1,ima,z,rk7,u0,u07,com 35 | dimension w(1),rea(2),rws(10) 36 | integer ijw(1),iws(20) 37 | 38 | c 39 | equivalence (rea(1),com),(rws(1),iws(1)) 40 | save 41 | c 42 | data ima/(0.0d0,1.0d0)/ 43 | c 44 | c This subroutine evaluates the Hankel functions 45 | c H_0^1, H^1_1 of a complex argument, the argument 46 | c living on a ray. The subroutine you are looking at is 47 | c the initialization subroutine; the evaluation 48 | c subroutine is hank106 (see). 49 | c 50 | c 51 | c 8/28/02 - added ninterv to calling sequence (LG) 52 | c 53 | c 54 | c PLEASE NOTE THAT THE USE OF THIS SUBROUTIONE IS NOT COMPLETELY 55 | C STRAIGHTFORWARD: A STRAIGHTFORWARD SUBROUTINE TO USE IS HANK103 56 | C (SEE). HOWEVER, THIS SUBROUTINE IS ABOUT 4 TIMES FASTER THAN 57 | C HANK103. 
58 | c 59 | c Recommended pairs rmin,rmax (assuming that rk \sim 1): 60 | c 61 | c 62 | c 63 | c rmin rmax 64 | c 65 | c 0.062 0.125 66 | c 0.031 0.062 67 | c 0.062 0.125 68 | c 0.125 0.25 69 | c 0.25 0.5 70 | c 0.5 1 71 | c 1 2 72 | c 2 5 73 | c 5 10 74 | c 10 100 75 | c 10 200 76 | c 77 | c GENERALLY, WITH ABS(RMIN*RK) > 10, THERE IS NO SHARP LIMIT ON 78 | C RMAX. HOWEVER, FOR SUFFICIENTLY LARGE (RMAX-RMIN)*RK, THE 79 | C code loses speed due to caching problems. THE DETERIORATION 80 | C BECOMES NOTICEABLE AT SOME POINT AFTER |(RMAX-RMIN)*RK| > 100 81 | C (ON THE PENTIUM-IV DESKTOP). 82 | C 83 | c 84 | c Input parameters: 85 | c 86 | c rk7 - the Helmholtz coefficient 87 | c rmin - the minimum r for which this subroutine will evaluate 88 | c the Hankel functions 89 | c rmax - the maximum r for which this subroutine will evaluate 90 | c the Hankel functions 91 | c 92 | c Output parameters: 93 | c 94 | c w - contains various data to be used by the entry hank106 (see) 95 | c keep - the first keep elements of the array w should not be 96 | c changed between the call to this entry, and the subsequent 97 | c calls to the entry hank106. 98 | c ninterv - number of intervals used in equispaced 99 | c subdivision of current interval 100 | c 101 | c 102 | rk=rk7 103 | c dfool gets the real part of rk/|rk|; if it vanishes, rk is nudged 104 | dfool=rk/abs(rk) 105 | if(abs(dfool) .lt. 1.0d-30) rk=rk+2*1.0d-30*abs(rk) 106 | c d is real *8, so it receives the real part of rk 107 | d=rk 108 | c 109 | x1=d*rmin 110 | x2=d*rmax 111 | c 112 | c initialize the evaluation of H_0, H_1 113 | c 114 | n=11 115 | ddd=(abs(rmax*rk)-abs(rmin*rk))*2 116 | i=ddd 117 | c i is ddd truncated to an integer by the assignment above 118 | 119 | ccc call prin2('ddd=*',ddd,1) 120 | if(i .lt. 
10) i=10 121 | ninterv=i 122 | c 123 | ccc call prinf('ninterv as calculated*',ninterv,1) 124 | c 125 | d=rk 126 | d2=-ima*rk 127 | coef=d2/d 128 | c 129 | c allocate memory for the initialization 130 | c 131 | icenters=21 132 | lcenters=ninterv*2+4 133 | c 134 | ih0s=icenters+lcenters 135 | lh0s=ninterv+4 136 | lh0s=lh0s*2 137 | c 138 | ih1s=ih0s+lh0s 139 | lh1s=ninterv+4 140 | lh1s=lh1s*2 141 | c 142 | ih0derss=ih1s+lh1s 143 | lh0derss=(ninterv*n+4)*2 144 | c 145 | ih1derss=ih0derss+lh0derss 146 | lh1derss=(ninterv*n+4)*2 147 | c 148 | keep=ih1derss+lh1derss 149 | c 150 | call hank106ini(coef,x1,x2,ninterv,n, 151 | 1 w(icenters),w(ih0derss),w(ih0s),w(ih1s),h, 152 | 2 w(ih1derss),u07) 153 | c 154 | u0=1/u07 155 | c 156 | c store in the beginning of the array w various types of data 157 | c 158 | ix1=1 159 | ih=2 160 | iu0=3 161 | in=5 162 | c 163 | irk=6 164 | c 165 | w(ix1)=x1 166 | w(ih)=h 167 | w(ih)=1/h 168 | c 169 | com=u0 170 | w(iu0)=rea(1) 171 | w(iu0+1)=rea(2) 172 | c 173 | w(in)=n+0.1 174 | c 175 | com=rk 176 | c 177 | w(irk)=rea(1) 178 | w(irk+1)=rea(2) 179 | c 180 | c store integer data in the array w 181 | c 182 | iws(1)=ih0derss 183 | iws(2)=ih0s 184 | iws(3)=ih1derss 185 | iws(4)=ih1s 186 | c 187 | do 3200 j=1,8 188 | c 189 | w(10+j)=rws(j) 190 | 3200 continue 191 | c 192 | return 193 | c 194 | c 195 | c 196 | c 197 | ccc entry hank106b(r,h0,h1,w,ijw) 198 | entry hank106b(r,h0,h1,w,ijw,ninterv) 199 | c 200 | call hank106eva(r,w(ix1),n,w(icenters),w(ijw(1)), 201 | 1 w(ijw(2)),w(ijw(4)),h0,h1,w(ih),w(ijw(3)), 202 | ccc 2 w(iu0),w(irk) ) 203 | 2 w(iu0),w(irk),ninterv) 204 | c 205 | return 206 | end 207 | c 208 | c 209 | c 210 | c 211 | c 212 | subroutine hank106eva(r,x1,n,centers,h0derss, 213 | ccc 1 h0s,h1s,h0,h1,h,h1derss,u0,rk) 214 | 1 h0s,h1s,h0,h1,h,h1derss,u0,rk,ninterv) 215 | implicit real *8 (a-h,o-z) 216 | complex *16 h0s(1),h0derss(n,1),h0,h1, 217 | 1 h1derss(n,1),h1s(1),u0,rk,zcom,centers(1),zh,z 218 | c 219 | save 220 | c 221 | c input: 222 
| c ninterv - number of intervals used in equispaced 223 | c subdivision of current interval 224 | c 225 | c----------------------------------- 226 | c 227 | c find the subinterval where the point z lives 228 | c 229 | z=rk*r 230 | c 231 | ccccc d = (z-x1)*h 232 | ccc if (d.lt.0) d = 1.0d-12 233 | ccccc if (d.lt.0) d = 0 234 | ccccc i = d+1 235 | c i is clamped to 1..ninterv; i=0 would index before the tables 236 | i=(z-x1)*h +1 237 | ccc call prinf(' i = *',i,1) 238 | if (i.lt.1) then 239 | i = 1 240 | else if (i.gt.ninterv) then 241 | i = ninterv 242 | endif 243 | c 244 | c evaluate the functions h0 and h1 at the point z 245 | c 246 | zh=z-centers(i) 247 | t=zh*u0 248 | ccc call prin2(' zh = *',zh,2) 249 | ccc call prin2(' t = *',t,1) 250 | c t is real *8 (real part of zh*u0); degree-10 Horner sums follow 251 | h0=(((((((((h0derss(10,i)*t+h0derss(9,i))*t+h0derss(8,i) ) 252 | 1 * t+h0derss(7,i))*t+h0derss(6,i))*t+h0derss(5,i))*t 253 | 2 +h0derss(4,i))*t+h0derss(3,i))*t+h0derss(2,i)) 254 | 3 *t+h0derss(1,i) ) * t + h0s(i) 255 | c 256 | h1=(((((((((h1derss(10,i)*t+h1derss(9,i))*t+h1derss(8,i) ) 257 | a * t+h1derss(7,i))*t+h1derss(6,i))*t+h1derss(5,i))*t 258 | 2 +h1derss(4,i) ) *t + h1derss(3,i) )* t +h1derss(2,i)) 259 | 3 *t+h1derss(1,i) ) * t +h1s(i) 260 | c 261 | return 262 | end 263 | c 264 | c 265 | c 266 | c 267 | c 268 | subroutine hank106ini(coef,x1,x2,ninterv,n, 269 | 1 centers,h0derss,h0s,h1s,h,h1derss,u0) 270 | implicit real *8 (a-h,o-z) 271 | complex *16 h0s(1),h0derss(n,1),h1derss(n,1),ima,u0, 272 | 1 us(22),h1s(1) 273 | dimension centers(2,1) 274 | c 275 | data ima/(0.0d0,1.0d0)/ 276 | c 277 | c construct the subintervals 278 | c 279 | h=(x2-x1)/ninterv 280 | c 281 | do 1200 i=1,ninterv 282 | c 283 | ab1i=(i-1)*h+x1 284 | ab2i=(i-1)*h+x1 +h 285 | centers(1,i)=(ab2i+ab1i)/2 286 | centers(2,i)=coef*centers(1,i) 287 | 1200 continue 288 | c 289 | c construct the values of Hankel functions and their 290 | c derivatives at the centers 291 | c 292 | do 1400 i=1,ninterv 293 | c 294 | call hank0ders(centers(1,i),n,h0s(i),h1s(i), 295 | 1 h0derss(1,i),h1derss(1,i) ) 296 | 1400 continue 
297 | c 298 | c scale them things by factorials and by complex powers 299 | c 300 | u0=1+ima*coef 301 | u0=u0/abs(u0) 302 | c 303 | us(1)=u0 304 | do 1500 i=1,20 305 | c 306 | us(i+1)=us(i)*u0 307 | 1500 continue 308 | c 309 | do 1800 i=1,ninterv 310 | fact=1 311 | do 1600 j=1,n-1 312 | h0derss(j,i)=h0derss(j,i)*fact * us(j) 313 | h1derss(j,i)=h1derss(j,i)*fact * us(j) 314 | c 315 | fact=fact/(j+1) 316 | 1600 continue 317 | c 318 | 1800 continue 319 | c 320 | return 321 | end 322 | c 323 | c 324 | c 325 | c 326 | c 327 | subroutine hank0ders(z,n,h0,h1,h0ders,h1ders) 328 | implicit real *8 (a-h,o-z) 329 | complex *16 z,h0,h0ders(1),h1,h1ders(1) 330 | c 331 | data ifexpon/1/ 332 | c 333 | c evaluate h0 and h1 334 | c 335 | call hank103(z,h0,h1,ifexpon) 336 | c 337 | h0ders(1)=-h1 338 | h0ders(2)=-(h0ders(1)/z+h0) 339 | h0ders(3)=-(2*h0ders(2)+h0ders(1)*z+h0)/z 340 | h0ders(4)=-(3*h0ders(3)+h0ders(2)*z+2*h0ders(1))/z 341 | c 342 | if(n .le. 4) return 343 | c 344 | do 1400 m=2,n-2 345 | c 346 | h0ders(m+2)=-( (m+1)*h0ders(m+1)+z*h0ders(m)+ 347 | 1 m*h0ders(m-1) )/z 348 | 1400 continue 349 | c 350 | do 1600 i=1,n-1 351 | c 352 | h1ders(i)=-h0ders(i+1) 353 | 1600 continue 354 | c 355 | return 356 | end 357 | 358 | 359 | ccc subroutine hank106datagen(rk,rmin,rmax,ab,nab,ninterval, 360 | ccc 1 w,lw,istart,ier) 361 | subroutine hank106datagen(rk,ier) 362 | implicit real *8 (a-h,o-z) 363 | ccc integer *4 istart(1) 364 | integer *4 istart(29),nintervec(28) 365 | ccc dimension w(1),ab(2,nab) 366 | dimension w(50000),ab(2,28) 367 | complex *16 rk,h0,h1,z,rksav 368 | data nab/28/ 369 | data lw/50000/ 370 | data rmin/1.0d-6/ 371 | data rmax/200/ 372 | save nab,ninterval,lw,istart,nintervec 373 | save w,ab,rmin,rmax,rksav 374 | c 375 | c INPUT PARAMETERS: ----> Now hidden as local vars..... 
376 | c 377 | c create top level (dyadic) intervals for hank106init, which 378 | c then uses equisized subintervals to precompute interpolation 379 | c polynomials 380 | c 381 | c rk (complex *16) frequency parameter 382 | c 383 | c rmin, rmax (real *8) desired range of argument to hank106 384 | c [rmin*rk,...,rmax*rk] 385 | c 386 | c ab(2,nab) (real *8) blank array of length 2*nab 387 | c w(lw) (real *8) work array of length lw 388 | c 389 | c OUTPUT PARAMETERS: 390 | c 391 | c ninterval (integer *4) number of subintervals created 392 | c ab(2,ninterval) (real *8) boundary of ith interval is 393 | c (ab(1,i),ab(2,i)) 394 | c nintervec(ninterval) nuomber of equispaced subintervals 395 | c used for ith interval 396 | c istart (integer *4) istart(i) is pointer into workspace for 397 | c data pertaining to ith interval 398 | c ier (integer *4) error flag 399 | c ier = 0 upon normal execution. 400 | c ier = 1 if length (nab) of array ab is of 401 | c insufficient length 402 | c ier = 2 if length (lw) of workspace w is of 403 | c insufficient length 404 | c----------------------------------------------------------------- 405 | c 406 | c 407 | ier = 0 408 | ninterval = 1 409 | istart(1) = 1 410 | rmaxloc = rmin 411 | ccc rksav = rk 412 | rksav = rk/cdabs(rk) 413 | rminsav = rmin 414 | do i = 1,1000 415 | rminloc = rmaxloc 416 | rmaxloc = 2*rminloc 417 | if (rminloc.gt.100) rmaxloc = rminloc+100 418 | ab(1,i) = rminloc 419 | ab(2,i) = rmaxloc 420 | ccc call prinf(' i = *',i,1) 421 | ccc call prin2(' rminloc = *',rminloc,1) 422 | ccc call prin2(' rmaxloc = *',rmaxloc,1) 423 | ccc call prinf(' istart(i) = *',istart(i),1) 424 | ccc call hank106init(rk,rminloc,rmaxloc,w(istart(i)),keep) 425 | ccc call hank106init(rksav,rminloc,rmaxloc,w(istart(i)),keep) 426 | call hank106init(rksav,rminloc,rmaxloc,w(istart(i)), 427 | 1 keep,ninterv) 428 | nintervec(i) = ninterv 429 | ccc call prinf(' keep = *',keep,1) 430 | istart(i+1) = istart(i) + keep + 1 431 | if (rmaxloc.ge.rmax) goto 
1111 432 | if (i.gt.nab) then 433 | ier = 1 434 | return 435 | endif 436 | if (istart(i+1).gt.lw) then 437 | ier = 2 438 | return 439 | endif 440 | ninterval = ninterval + 1 441 | enddo 442 | 1111 continue 443 | ccc call prinf(' istart(i+1) = *',istart(i+1),1) 444 | ccc call prinf(' i+1 = *',i+1,1) 445 | ccc call prinf(' ninterval = *',ninterval,1) 446 | ccc call prinf(' istart(ninterval) = *',istart(ninterval),1) 447 | return 448 | c 449 | ccc entry hank106(z,h0,h1,ab,ninterval,w,istart) 450 | entry hank106(z,h0,h1,ifexpon) 451 | c 452 | c 453 | c z (complex *16) argument for Hankel function evaluation. 454 | c h0,h1 (complex *16) H_0(z) and H_1(z) where frequency 455 | c parameter is ASSUMED TO BE that from 456 | c previous call to hank106datagen. 457 | c 458 | c ab,ninterval,w,istart defined above. 459 | c 460 | c------------------------------------------------------------- 461 | c 462 | c determine subinterval and call hank106. 463 | c 464 | ccc call prinf(' ninterval = *',ninterval,1) 465 | ccc call prin2(' rminsav = *',rminsav,1) 466 | ccc call prin2(' rksav = *',rksav,2) 467 | ccc x = dreal(z/(rksav*rminsav)) 468 | ccc call prin2(' x = *',x,1) 469 | ccc call prinf(' ifexpon = *',ifexpon,1) 470 | if (ifexpon.eq.0) then 471 | call hank103(z,h0,h1,ifexpon) 472 | return 473 | endif 474 | r = dreal(z/rksav) 475 | call findinte(r,ab,ninterval,i) 476 | ccc call prin2(' z = *',z,2) 477 | cc call prin2(' rksav = *',rksav,2) 478 | ccc call prin2(' r = *',r,1) 479 | ccc call prinf(' i = *',i,1) 480 | if (i.le.ninterval) then 481 | cc call prin2(' w is = *',w(istart(i)),10) 482 | cc call prinf(' int part is = *',w(istart(i+10)),10) 483 | ccc call hank106a(r,h0,h1,w(istart(i))) 484 | call hank106a(r,h0,h1,w(istart(i)),nintervec(i)) 485 | ccc call prin2(' 106 gives h0 is = *',h0,2) 486 | ccc call hank103(z,h0,h1,ifexpon) 487 | ccc call prin2(' 103 gives h0 is = *',h0,2) 488 | else 489 | call hank103(z,h0,h1,ifexpon) 490 | endif 491 | return 492 | end 493 | c 494 | c 495 | 
c
        subroutine findinte(x,ab,nn,intnum)
        implicit real *8 (a-h,o-z)
        integer *4 intold
        dimension ab(2,nn)
c
c       Finds the interval (ab(1,j),ab(2,j)), j=1,...,nn, that
c       contains x.  The intervals must be sorted by right end.
c
c       input:
c         x       - the point to locate
c         ab      - ab(1,j), ab(2,j) are the ends of interval j
c         nn      - number of intervals
c
c       output:
c         intnum  - the first j with x .le. ab(2,j), or a value
c                   greater than nn if x lies outside
c                   [ab(1,1),ab(2,nn)] (777 whenever nn < 777, as
c                   before; nn+1 otherwise, so that the test
c                   intnum .le. nn stays valid for large tables)
c
c       The interval found by the preceding call is remembered in
c       intold and tried first; it has to survive between calls,
c       hence the explicit save.
c
        data intold/-10/
        save intold
c
c       an empty table contains nothing
c
        if(nn .lt. 1) then
           intnum=777
           return
        endif
c
c       check if the point is on the same subinterval as the
c       preceding one
c
        if( (intold .ge. 1) .and. (intold .le. nn) ) then
           if( (x .ge. ab(1,intold)) .and.
     1         (x .le. ab(2,intold)) ) then
              intnum=intold
              return
           endif
        endif
c
c       points outside the whole range are flagged, not located
c
        if( (x .lt. ab(1,1)) .or. (x .gt. ab(2,nn)) ) then
           intnum=max(777,nn+1)
           return
        endif
c
c       bisection for the first interval whose right end is not
c       to the left of x.  the bracket shrinks on every pass, so
c       the loop ends after about log2(nn) steps and needs neither
c       an iteration cap nor a correction step afterwards.
c
        ilo=1
        ihi=nn
 3000 continue
        if(ilo .ge. ihi) goto 3600
        imid=(ilo+ihi)/2
        if(x .gt. ab(2,imid)) then
           ilo=imid+1
        else
           ihi=imid
        endif
        goto 3000
c
 3600 continue
c
        intnum=ilo
        intold=intnum
c
        return
        end
c
// NOTE(review): in this listing everything that was written between angle brackets (template
// parameter/argument lists, #include targets) appears to have been stripped. Code tokens below
// are reproduced exactly as found; restore the missing arguments from the real file before building.

// One row of the "runs" table (see init_storage): a benchmark session with its timestamp and
// references to the host and toolchain rows it was executed on.
struct run_t {
    int id;
    std::string time;
    std::unique_ptr host;
    std::unique_ptr toolchain;
};

// Process-wide description of the current run, toolchain and host; their ids are filled in by
// init_storage().
run_t run_info;
sf::utils::toolchain_info_t toolchain_info;
sf::utils::host_info_t host_info;
// Library name -> library record. Versions are resolved by the sf::utils::get_*_version() helpers
// during static initialization; libraries without a version query carry the literal "NA".
std::unordered_map libraries_info = {
    {"agnerfog", {.name = "agnerfog", .version = sf::utils::get_af_version()}},
    {"amdlibm", {.name = "amdlibm", .version = sf::utils::get_alm_version()}},
    {"baobzi", {.name = "baobzi", .version = sf::utils::get_baobzi_version()}},
    {"boost", {.name = "boost", .version = sf::utils::get_boost_version()}},
    {"eigen", {.name = "eigen", .version = sf::utils::get_eigen_version()}},
    {"gsl", {.name = "gsl", .version = sf::utils::get_gsl_version()}},
    {"fort", {.name = "fort", .version = "NA"}},
    {"misc", {.name = "misc", .version = "NA"}},
    {"sctl", {.name = "sctl", .version = sf::utils::get_sctl_version()}},
    {"sleef", {.name = "sleef", .version = sf::utils::get_sleef_version()}},
    {"stl", {.name = "stl", .version = "NA"}},
};

// Result of benchmarking one function of one library at one vector width.
// run/library/configuration hold the ids of the referenced rows; library_copy and config_copy
// are only read by operator<< for the printed label and bounds.
struct measurement_t {
    int id;
    std::unique_ptr run;
    std::unique_ptr library;
    std::unique_ptr configuration;
    sf::utils::library_info_t library_copy;
    configuration_t config_copy;
    int nelem = 0;
    int nrepeat = 0;
    int veclev = 0;
    double megaevalspersec = 0;
    double cyclespereval = 0;
    double meanevaltime = 0;
    double stddev = 0;
    double maxerr = 0;
    double maxrelerr = 0;

    // A default-constructed measurement (nrepeat == 0) is "empty" and converts to false.
    explicit operator bool() const { return nrepeat; }
    friend std::ostream &operator<<(std::ostream &, const measurement_t &);
};

// Prints one line "<func>_<library>_<ftype>x<veclev>: <Mevals/s> <cycles/eval> [lbound, ubound]".
// Empty measurements print nothing.
std::ostream &operator<<(std::ostream &os, const measurement_t &meas) {

    using std::left;
    using std::setw;

    if (meas) {
        std::string label = meas.config_copy.func + "_" + meas.library_copy.name + "_" + meas.config_copy.ftype + "x" +
                            std::to_string(meas.veclev);

        // The precision is changed per column; it is left at 5 on `os` when this returns.
        os.precision(6);
        os << left << setw(25) << label + ": " << left << setw(15) << meas.megaevalspersec;
        os.precision(15);
        os << left << setw(15) << meas.cyclespereval << left << setw(5) << " ";
        os.precision(5);
        os << "[" << meas.config_copy.lbound << ", " << meas.config_copy.ubound << "]" << std::endl;
    }
    return os;
}

// Expands to one `case` of the switch in test_func: applies the Eigen array method named OP to
// `x` and stores the result in `res`.
#define EIGEN_CASE(OP)                                                                                                 \
    case sf::functions::eigen::OPS::OP: {                                                                              \
        res = x.array().OP();                                                                                          \
        break;                                                                                                         \
    }

// Times `n_repeat` evaluations of `f` over the inputs `x_in` and returns the measurement.
//
//  f            function under test; a falsy `f` yields an empty measurement_t
//  veclev       vector width recorded in the result (not used for the computation)
//  library_info library record; its id and a copy are stored in the result
//  config       configuration record; lbound/ubound are handed to sf::utils::transform_domain
//  x_in         input samples
//  y_ref        reference values; when empty, or when VAL_T is not one of the types tested in
//               the (stripped) condition below, the error fields are set to -1
//  n_repeat     number of passes over the whole input
//
// Error fields that evaluate to NaN are reported as -2.
// NOTE(review): the template header and every std::is_same_v condition lost their arguments in
// this listing, so which branch serves which function type cannot be read off here.
template
measurement_t test_func(const FUN_T &f, int veclev, sf::utils::library_info_t &library_info, configuration_t &config,
                        const Eigen::Ref> &x_in,
                        const Eigen::Ref &y_ref, int n_repeat) {
    if (!f)
        return measurement_t();
    const std::string label = library_info.name + "_" + config.func;

    Eigen::VectorX x = sf::utils::transform_domain(x_in, config.lbound, config.ubound);

    size_t res_size = x.size();
    size_t n_evals = x.size() * n_repeat;
    // One function kind produces two outputs per input (see the std::tie branch below).
    if constexpr (std::is_same_v)
        res_size *= 2;

    Eigen::VectorX res(res_size);
    // Force virtual memory to RAM (to force malloc to do its thing)
    mlock(res.data(), res_size * sizeof(VAL_T));
    VAL_T *resptr = res.data();

    // presumably the timer starts on construction -- only stop() is called explicitly; confirm in sf_utils.hpp
    sf::utils::timer timer;

    for (long k = 0; k < n_repeat; k++) {
        if constexpr (std::is_same_v) {
            // Scalar function returning a pair: results are interleaved in `res`.
            for (std::size_t i = 0; i < x.size(); ++i) {
                std::tie(resptr[i * 2], resptr[i * 2 + 1]) = f(x[i]);
            }
        } else if constexpr (std::is_same_v>) {
            // `f` is dereferenced before the call, i.e. it is pointer-like here.
            (*f)(x.data(), resptr, x.size());
        } else if constexpr (std::is_same_v) {
            // `f` is an sf::functions::eigen::OPS enumerator; dispatch to the matching Eigen array op.
            switch (f) {
                EIGEN_CASE(cos)
                EIGEN_CASE(sin)
                EIGEN_CASE(tan)
                EIGEN_CASE(cosh)
                EIGEN_CASE(sinh)
                EIGEN_CASE(tanh)
                EIGEN_CASE(exp)
                EIGEN_CASE(log)
                EIGEN_CASE(log10)
                EIGEN_CASE(asin)
                EIGEN_CASE(acos)
                EIGEN_CASE(atan)
                EIGEN_CASE(asinh)
                EIGEN_CASE(acosh)
                EIGEN_CASE(atanh)
                EIGEN_CASE(erf)
                EIGEN_CASE(erfc)
                EIGEN_CASE(lgamma)
                EIGEN_CASE(digamma)
                EIGEN_CASE(ndtri)
                EIGEN_CASE(sqrt)
                EIGEN_CASE(rsqrt)
            case sf::functions::eigen::OPS::pow35: {
                res = x.array().pow(3.5);
                break;
            }
            case sf::functions::eigen::OPS::pow13: {
                res = x.array().pow(13);
                break;
            }
            }
        } else {
            // Default: array-in / array-out callable.
            f(x.data(), resptr, x.size());
        }
    }
    timer.stop();

    measurement_t meas;
    meas.config_copy = config;
    meas.library_copy = library_info;

    meas.run = std::make_unique(run_info.id);
    meas.configuration = std::make_unique(config.id);
    meas.library = std::make_unique(library_info.id);
    meas.nelem = x.size();
    meas.nrepeat = n_repeat;
    meas.cyclespereval = timer.ticks_elapsed() / (double)n_evals;
    meas.megaevalspersec = n_evals / timer.elapsed() / 1E6;
    // elapsed() divided by 1E-9: nanoseconds per evaluation if elapsed() is in seconds -- TODO confirm
    meas.meanevaltime = timer.elapsed() / n_evals / 1E-9;
    meas.veclev = veclev;

    if (y_ref.size() && (std::is_same_v || std::is_same_v)) {
        Eigen::VectorXd delta = res.template cast() - y_ref;
        meas.maxerr = delta.array().abs().maxCoeff();
        meas.maxrelerr = (delta.array().abs() / y_ref.array().abs()).maxCoeff();
        // Sample standard deviation (n - 1 denominator) of the error.
        meas.stddev = std::sqrt((delta.array() - delta.mean()).square().sum() / (delta.size() - 1));

        // -2 marks "computed, but NaN".
        meas.maxerr = std::isnan(meas.maxerr) ? -2.0 : meas.maxerr;
        meas.maxrelerr = std::isnan(meas.maxrelerr) ? -2.0 : meas.maxrelerr;
        meas.stddev = std::isnan(meas.stddev) ? -2.0 : meas.stddev;
    } else {
        // -1 marks "no error statistics available".
        meas.stddev = -1.0;
        meas.maxerr = -1.0;
        meas.maxrelerr = -1.0;
    }

    munlock(res.data(), res_size * sizeof(VAL_T));
    return meas;
}
#undef EIGEN_CASE

// Collects the given argument strings into a set (duplicates collapse, order becomes sorted).
// The caller decides whether argv[0] is included; main() passes argc - 1 / argv + 1.
std::set parse_args(int argc, char *argv[]) {
    // lol: "parse"
    std::set res;
    for (int i = 0; i < argc; ++i)
        res.insert(argv[i]);

    return res;
}
make_column("compiler", &toolchain_info_t::compiler), 222 | make_column("compilervers", &toolchain_info_t::compilervers), 223 | make_column("libcvers", &toolchain_info_t::libcvers), 224 | sqlite_orm::unique(&toolchain_info_t::compiler, &toolchain_info_t::compilervers, 225 | &toolchain_info_t::libcvers)), 226 | make_table("libraries", make_column("id", &library_info_t::id, autoincrement(), primary_key()), 227 | make_column("name", &library_info_t::name), make_column("version", &library_info_t::version), 228 | sqlite_orm::unique(&library_info_t::name, &library_info_t::version)), 229 | make_table("runs", make_column("id", &run_t::id, autoincrement(), primary_key()), 230 | make_column("time", &run_t::time), make_column("host", &run_t::host), 231 | make_column("toolchain", &run_t::toolchain), foreign_key(&run_t::host).references(&host_info_t::id), 232 | foreign_key(&run_t::toolchain).references(&toolchain_info_t::id)), 233 | make_table( 234 | "measurements", make_column("id", &measurement_t::id, autoincrement(), primary_key()), 235 | make_column("run", &measurement_t::run), make_column("library", &measurement_t::library), 236 | make_column("configuration", &measurement_t::configuration), make_column("nelem", &measurement_t::nelem), 237 | make_column("nrepeat", &measurement_t::nrepeat), make_column("veclev", &measurement_t::veclev), 238 | make_column("megaevalspersec", &measurement_t::megaevalspersec), 239 | make_column("cyclespereval", &measurement_t::cyclespereval), 240 | make_column("meanevaltime", &measurement_t::meanevaltime), make_column("stddev", &measurement_t::stddev), 241 | make_column("maxrelerr", &measurement_t::maxrelerr), make_column("maxerr", &measurement_t::maxerr), 242 | foreign_key(&measurement_t::run).references(&run_t::id), 243 | foreign_key(&measurement_t::library).references(&library_info_t::id), 244 | foreign_key(&measurement_t::configuration).references(&configuration_t::id))); 245 | 246 | storage.sync_schema(); 247 | auto host_ids = 248 | 
// Concrete storage type produced by init_storage(); it can only be named via decltype.
using Storage = decltype(init_storage(""));

// Benchmark driver.
//
// Opens the database "db.sqlite", prints host/toolchain/library information, selects the
// functions to run (all known functions, or those named on the command line), then for every
// run set and every selected function times each library implementation with test_func(),
// prints the non-empty measurements and inserts them into the database.
//
// NOTE(review): template arguments (`<...>`) are missing throughout this listing (std::set,
// std::vector, std::unordered_map, cast, test_func, std::get, ...); tokens are kept as found.
int main(int argc, char *argv[]) {
    Storage storage = init_storage("db.sqlite");

    std::cout << host_info.cpuname << std::endl;
    std::cout << " " + toolchain_info.compiler + ": " + toolchain_info.compilervers << std::endl;
    std::cout << " libc: " + toolchain_info.libcvers << std::endl;
    for (auto &[key, lib] : libraries_info)
        std::cout << " " + lib.name + ": " + lib.version << std::endl;

    // argv[0] is skipped; every remaining argument is treated as a function name.
    std::set input_keys = parse_args(argc - 1, argv + 1);

    // Function tables of every backend, one per precision / vector width.
    auto &af_funs_dx4 = sf::functions::af::get_funs_dx4();
    auto &af_funs_dx8 = sf::functions::af::get_funs_dx8();
    auto &af_funs_fx8 = sf::functions::af::get_funs_fx8();
    auto &af_funs_fx16 = sf::functions::af::get_funs_fx16();

    auto &amdlibm_funs_dx1 = sf::functions::amd::get_funs_dx1();
    auto &amdlibm_funs_dx4 = sf::functions::amd::get_funs_dx4();
    auto &amdlibm_funs_fx1 = sf::functions::amd::get_funs_fx1();
    auto &amdlibm_funs_fx8 = sf::functions::amd::get_funs_fx8();

    auto &boost_funs_fx1 = sf::functions::boost::get_funs_fx1();
    auto &boost_funs_dx1 = sf::functions::boost::get_funs_dx1();

    auto &eigen_funs = sf::functions::eigen::get_funs();

    auto &fort_funs = sf::functions::fort::get_funs_dx1();

    auto &gsl_funs = sf::functions::gsl::get_funs_dx1();
    // gsl_complex_funs is only referenced by the commented-out calls at the end of the key loop.
    auto &gsl_complex_funs = sf::functions::gsl::get_funs_cdx1();

    auto &misc_funs_cdx1_x2 = sf::functions::misc::get_funs_cdx1_x2();

    auto &sctl_funs_dx4 = sf::functions::SCTL::get_funs_dx4();
    auto &sctl_funs_dx8 = sf::functions::SCTL::get_funs_dx8();
    auto &sctl_funs_fx8 = sf::functions::SCTL::get_funs_fx8();
    auto &sctl_funs_fx16 = sf::functions::SCTL::get_funs_fx16();

    auto &sleef_funs_dx1 = sf::functions::sleef::get_funs_dx1();
    auto &sleef_funs_dx4 = sf::functions::sleef::get_funs_dx4();
    auto &sleef_funs_dx8 = sf::functions::sleef::get_funs_dx8();
    auto &sleef_funs_fx1 = sf::functions::sleef::get_funs_fx1();
    auto &sleef_funs_fx8 = sf::functions::sleef::get_funs_fx8();
    auto &sleef_funs_fx16 = sf::functions::sleef::get_funs_fx16();

    auto &stl_funs_fx1 = sf::functions::stl::get_funs_fx1();
    auto &stl_funs_dx1 = sf::functions::stl::get_funs_dx1();

    // Union of all function names; one table per backend is merged
    // (presumably the other widths of a backend expose the same names -- verify).
    std::set fun_union;
#define merge_into_set(FUNS)                                                                                           \
    for (auto kv : FUNS)                                                                                               \
        fun_union.insert(kv.first);

    merge_into_set(af_funs_fx8);
    merge_into_set(amdlibm_funs_fx1);
    merge_into_set(boost_funs_fx1);
    merge_into_set(eigen_funs);
    merge_into_set(fort_funs);
    merge_into_set(gsl_funs);
    merge_into_set(misc_funs_cdx1_x2);
    merge_into_set(sctl_funs_fx8);
    merge_into_set(sleef_funs_fx1);
    merge_into_set(stl_funs_fx1);
#undef merge_into_set

    // With arguments: only the known names that were requested; without: everything.
    std::set keys_to_eval;
    if (input_keys.size() > 0)
        std::set_intersection(fun_union.begin(), fun_union.end(), input_keys.begin(), input_keys.end(),
                              std::inserter(keys_to_eval, keys_to_eval.end()));
    else
        keys_to_eval = fun_union;

    // Two run sets (first, second) = (vector length, repeats), see the structured binding below:
    // shift = 0 -> (1 << 11, 1 << 14), shift = 14 -> (1 << 25, 1 << 0).
    std::vector> run_sets;
    for (uint8_t shift = 0; shift <= 14; shift += 14)
        run_sets.push_back({1 << (11 + shift), 1 << (14 - shift)});

    // Evaluation domain per function name. Fields not listed keep configuration_t's defaults.
    std::unordered_map base_configurations = {
        {"acos", {.lbound = -1.0, .ubound = 1.0}},
        {"acosh", {.lbound = 1.0, .ubound = 1000.0}},
        {"asin", {.lbound = -1.0, .ubound = 1.0}},
        {"asinh", {.lbound = -100.0, .ubound = 100.0}},
        {"atan", {.lbound = -100.0, .ubound = 100.0}},
        {"atanh", {.lbound = -0.9, .ubound = 0.9}},
        {"bessel_I0", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_I1", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_I2", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_J0", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_J1", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_J2", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_K0", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_K1", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_K2", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_Y0", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_Y1", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_Y2", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_j0", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_j1", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_j2", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_y0", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_y1", {.lbound = 0.1, .ubound = 30.0}},
        {"bessel_y2", {.lbound = 0.1, .ubound = 30.0}},
        {"cos", {.lbound = 0.0, .ubound = 2 * M_PI, .ilbound = 0.0, .iubound = 2 * M_PI}},
        {"cos_pi", {.lbound = 0.0, .ubound = 2.0}},
        {"cosh", {.lbound = 0.0, .ubound = 1.0}},
        {"digamma", {.lbound = 0.0, .ubound = 1.0}},
        {"erf", {.lbound = -1.0, .ubound = 1.0}},
        {"erfc", {.lbound = -1.0, .ubound = 1.0}},
        {"exp", {.lbound = -1.0, .ubound = 1.0}},
        {"exp10", {.lbound = -1.0, .ubound = 1.0}},
        {"exp2", {.lbound = -1.0, .ubound = 1.0}},
        {"hank103", {.lbound = 0.0, .ubound = 10.0, .ilbound = 0.0, .iubound = 10.0}},
        {"hermite_0", {.lbound = 0.0, .ubound = 10.0}},
        {"hermite_1", {.lbound = 0.0, .ubound = 10.0}},
        {"hermite_2", {.lbound = 0.0, .ubound = 10.0}},
        {"hermite_3", {.lbound = 0.0, .ubound = 10.0}},
        {"lgamma", {.lbound = 0.0, .ubound = 10.0}},
        {"log", {.lbound = 0.0, .ubound = 10.0}},
        {"log10", {.lbound = 0.0, .ubound = 10.0}},
        {"log2", {.lbound = 0.0, .ubound = 10.0}},
        {"memcpy", {.lbound = 0.0, .ubound = 1.0}},
        {"memset", {.lbound = 0.0, .ubound = 1.0}},
        {"ndtri", {.lbound = 0.0, .ubound = 1.0}},
        {"pow13", {.lbound = 0.0, .ubound = 1.0}},
        {"pow3.5", {.lbound = 0.0, .ubound = 1.0}},
        {"riemann_zeta", {.lbound = 0.0, .ubound = 10.0}},
        {"rsqrt", {.lbound = 0.0, .ubound = 10.0}},
        {"sin", {.lbound = 0.0, .ubound = 2 * M_PI, .ilbound = 0.0, .iubound = 2 * M_PI}},
        {"sin_pi", {.lbound = 0.0, .ubound = 2.0}},
        {"sinc", {.lbound = 0.0, .ubound = 2 * M_PI, .ilbound = 0.0, .iubound = 2 * M_PI}},
        {"sinc_pi", {.lbound = 0.0, .ubound = 2.0}},
        {"sinh", {.lbound = 0.0, .ubound = 2.0}},
        {"sqrt", {.lbound = 0.0, .ubound = 10.0}},
        {"tan", {.lbound = 0.0, .ubound = 2 * M_PI}},
        {"tanh", {.lbound = -1.0, .ubound = 1.0}},
        {"tgamma", {.lbound = -0.0, .ubound = 1.0}},
    };

    // Double-precision implementation used to produce reference values for each function.
    // NOTE(review): the entries are fetched with operator[] on the backend tables -- if those are
    // map types, a name missing from a backend would be default-inserted there; confirm.
    std::unordered_map> double_refs = {
        {"acos", stl_funs_dx1["acos"]},
        {"acosh", stl_funs_dx1["acosh"]},
        {"asin", stl_funs_dx1["asin"]},
        {"asinh", stl_funs_dx1["asinh"]},
        {"atan", stl_funs_dx1["atan"]},
        {"atanh", stl_funs_dx1["atanh"]},
        {"bessel_I0", gsl_funs["bessel_I0"]},
        {"bessel_I1", gsl_funs["bessel_I1"]},
        {"bessel_I2", gsl_funs["bessel_I2"]},
        {"bessel_J0", gsl_funs["bessel_J0"]},
        {"bessel_J1", gsl_funs["bessel_J1"]},
        {"bessel_J2", gsl_funs["bessel_J2"]},
        {"bessel_K0", gsl_funs["bessel_K0"]},
        {"bessel_K1", gsl_funs["bessel_K1"]},
        {"bessel_K2", gsl_funs["bessel_K2"]},
        {"bessel_Y0", gsl_funs["bessel_Y0"]},
        {"bessel_Y1", gsl_funs["bessel_Y1"]},
        {"bessel_Y2", gsl_funs["bessel_Y2"]},
        {"bessel_j0", gsl_funs["bessel_j0"]},
        {"bessel_j1", gsl_funs["bessel_j1"]},
        {"bessel_j2", gsl_funs["bessel_j2"]},
        {"bessel_y0", gsl_funs["bessel_y0"]},
        {"bessel_y1", gsl_funs["bessel_y1"]},
        {"bessel_y2", gsl_funs["bessel_y2"]},
        {"memcpy", sctl_funs_dx4["memcpy"]},
        {"cos", stl_funs_dx1["cos"]},
        {"cos_pi", boost_funs_dx1["cos_pi"]},
        {"cosh", stl_funs_dx1["cosh"]},
        {"digamma", boost_funs_dx1["digamma"]},
        {"erf", stl_funs_dx1["erf"]},
        {"erfc", stl_funs_dx1["erfc"]},
        {"exp", stl_funs_dx1["exp"]},
        {"exp10", stl_funs_dx1["exp10"]},
        {"exp2", stl_funs_dx1["exp2"]},
        {"hermite_0", boost_funs_dx1["hermite_0"]},
        {"hermite_1", boost_funs_dx1["hermite_1"]},
        {"hermite_2", boost_funs_dx1["hermite_2"]},
        {"hermite_3", boost_funs_dx1["hermite_3"]},
        {"lgamma", gsl_funs["lgamma"]},
        {"log", stl_funs_dx1["log"]},
        {"log10", stl_funs_dx1["log10"]},
        {"log2", stl_funs_dx1["log2"]},
        {"pow13", stl_funs_dx1["pow13"]},
        {"pow3.5", stl_funs_dx1["pow3.5"]},
        {"riemann_zeta", gsl_funs["riemann_zeta"]},
        {"rsqrt", stl_funs_dx1["rsqrt"]},
        {"sin", stl_funs_dx1["sin"]},
        {"sin_pi", boost_funs_dx1["sin_pi"]},
        {"sinc", gsl_funs["sinc"]},
        {"sinc_pi", gsl_funs["sinc_pi"]},
        {"sinh", stl_funs_dx1["sinh"]},
        {"sqrt", stl_funs_dx1["sqrt"]},
        {"tan", stl_funs_dx1["tan"]},
        {"tanh", stl_funs_dx1["tanh"]},
        {"tgamma", stl_funs_dx1["tgamma"]},
    };

    // Echo the selected function names.
    for (auto key : keys_to_eval)
        std::cout << key << std::endl;

    // The baobzi table is built from the selected keys and their domains, so it has to come
    // after keys_to_eval and base_configurations.
    auto &baobzi_funs = sf::functions::baobzi::get_funs_dx1(keys_to_eval, base_configurations);

    for (auto &run_set : run_sets) {
        const auto &[n_eval, n_repeat] = run_set;
        std::cerr << "Running benchmark with input vector of length " << n_eval << " and " << n_repeat << " repeats.\n";
        // Random() + 1, halved: samples in the unit interval, later mapped onto each function's
        // domain. The same samples are reused in single precision.
        Eigen::VectorXd vals = 0.5 * (Eigen::ArrayXd::Random(n_eval) + 1.0);
        Eigen::VectorXf fvals = vals.cast();
        // cvals is only referenced by the commented-out complex benchmarks below.
        Eigen::VectorX cvals = 0.5 * (Eigen::ArrayX::Random(n_eval) + std::complex{1.0, 1.0});

        for (auto key : keys_to_eval) {
            // NOTE(review): insert_measurement is never called; the loop over `ms` below
            // inserts directly.
            auto insert_measurement = [&storage](measurement_t &meas) -> void {
                if (meas)
                    storage.insert(meas);
            };

            // Returns the configuration for (name, ftype) with its database id filled in,
            // inserting a new row when no identical configuration exists yet.
            auto get_conf_data = [&storage, &base_configurations](const std::string &name,
                                                                  const std::string &ftype) -> configuration_t {
                configuration_t config = base_configurations[name];
                config.func = name;
                config.ftype = ftype;

                using namespace sqlite_orm;
                auto conf_ids = storage.select(columns(&configuration_t::id),
                                               where(is_equal(&configuration_t::ftype, config.ftype) and
                                                     is_equal(&configuration_t::func, config.func) and
                                                     is_equal(&configuration_t::lbound, config.lbound) and
                                                     is_equal(&configuration_t::ubound, config.ubound) and
                                                     is_equal(&configuration_t::ilbound, config.ilbound) and
                                                     is_equal(&configuration_t::iubound, config.iubound)));
                config.id = conf_ids.size() ? std::get(conf_ids[0]) : storage.insert(config);
                return config;
            };

            // Reference inputs: the samples run through transform_domain with this function's bounds.
            Eigen::VectorXd vals_ref = sf::utils::transform_domain(vals, base_configurations[key].lbound,
                                                                   base_configurations[key].ubound);

            // Reference outputs; dref stays empty when no reference implementation is registered,
            // which makes test_func report -1 for the error fields.
            Eigen::VectorXd dref;
            if (double_refs.count(key)) {
                dref.resize(vals_ref.size());
                double_refs[key](vals_ref.data(), dref.data(), vals_ref.size());
            }

            std::vector ms;
            auto &libs = libraries_info;

            // Single precision. The integer argument is the vector width stored with the
            // measurement (0 is used for Eigen).
            auto conf_f = get_conf_data(key, "f");
            ms.push_back(test_func(amdlibm_funs_fx1[key], 1, libs["amdlibm"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(amdlibm_funs_fx8[key], 8, libs["amdlibm"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(af_funs_fx8[key], 8, libs["agnerfog"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(af_funs_fx16[key], 16, libs["agnerfog"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(boost_funs_fx1[key], 1, libs["boost"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(eigen_funs[key], 0, libs["eigen"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(sleef_funs_fx1[key], 1, libs["sleef"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(sleef_funs_fx8[key], 8, libs["sleef"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(sleef_funs_fx16[key], 16, libs["sleef"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(sctl_funs_fx8[key], 8, libs["sctl"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(sctl_funs_fx16[key], 16, libs["sctl"], conf_f, fvals, dref, n_repeat));
            ms.push_back(test_func(stl_funs_fx1[key], 1, libs["stl"], conf_f, fvals, dref, n_repeat));

            // Double precision.
            auto conf_d = get_conf_data(key, "d");
            ms.push_back(test_func(af_funs_dx4[key], 4, libs["agnerfog"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(af_funs_dx8[key], 8, libs["agnerfog"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(amdlibm_funs_dx1[key], 1, libs["amdlibm"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(amdlibm_funs_dx4[key], 4, libs["amdlibm"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(baobzi_funs[key], 1, libs["baobzi"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(boost_funs_dx1[key], 1, libs["boost"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(eigen_funs[key], 0, libs["eigen"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(fort_funs[key], 1, libs["fort"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(gsl_funs[key], 1, libs["gsl"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(sctl_funs_dx4[key], 4, libs["sctl"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(sctl_funs_dx8[key], 8, libs["sctl"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(sleef_funs_dx1[key], 1, libs["sleef"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(sleef_funs_dx4[key], 4, libs["sleef"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(sleef_funs_dx8[key], 8, libs["sleef"], conf_d, vals, dref, n_repeat));
            ms.push_back(test_func(stl_funs_dx1[key], 1, libs["stl"], conf_d, vals, dref, n_repeat));

            // Empty measurements (backend has no such function) are skipped.
            for (auto &meas : ms) {
                if (!meas)
                    continue;
                std::cout << meas;
                storage.insert(meas);
            }
            // Complex-valued benchmarks are currently disabled:
            // test_func(gsl_complex_funs, [key], "gsl_cdx1", params, cvals, n_repeat);
            // test_func(misc_funs_cdx1_x2[key], "misc_cdx1_x2", params, cvals, n_repeat);

            std::cout << "\n";
        }
    }
    return 0;
}
namespace sf::utils {

/// Runs `cmd` through popen() and returns everything it wrote to stdout.
///
/// A single trailing newline is removed, so one-line outputs compare equal to the bare text.
/// Output that is empty or does not end in a newline is returned untouched (the previous
/// unconditional pop_back() was undefined behaviour on empty output and ate a real character
/// otherwise).
///
/// @param cmd  shell command line; it is executed as-is, so it must come from trusted code.
/// @return captured stdout, empty if the command printed nothing.
/// @throws std::runtime_error if the pipe cannot be opened.
///
/// NOTE(review): the template arguments of `buffer` and `pipe` were missing from the reviewed
/// listing; they are restored after the Stack Overflow answer referenced below.
std::string exec(const char *cmd) {
    // https://stackoverflow.com/a/478960
    std::array<char, 128> buffer;
    std::string result;
    // Plain function-pointer deleter: the pipe is closed on every exit path, including throws.
    std::unique_ptr<FILE, int (*)(FILE *)> pipe(popen(cmd, "r"), pclose);
    if (!pipe) {
        throw std::runtime_error("popen() failed!");
    }
    while (fgets(buffer.data(), static_cast<int>(buffer.size()), pipe.get()) != nullptr) {
        result += buffer.data();
    }
    if (!result.empty() && result.back() == '\n')
        result.pop_back();
    return result;
}

/// Reads the AMD LibM version string directly out of the shared library.
///
/// The file offset of the ALM_VERSION_STRING symbol is taken from `objdump -t` (first column,
/// hexadecimal) and at most 16 bytes are read from that position.
///
/// @return the version text, or "NA" when the symbol cannot be located or the library cannot be
///         read (previously these cases dereferenced a null FILE* or returned an unterminated
///         buffer).
std::string get_alm_version() {
    const std::string symbol_offset = exec("objdump -t ../extern/amd-libm/lib/libalm.so --section=.rodata | grep -m1 "
                                           "ALM_VERSION_STRING | cut -d' ' -f 1");
    if (symbol_offset.empty())
        return "NA";

    const std::string offset_str = "0x" + symbol_offset;
    const long offset = strtol(offset_str.c_str(), nullptr, 0);

    std::unique_ptr<FILE, int (*)(FILE *)> obj(fopen("../extern/amd-libm/lib/libalm.so", "r"), fclose);
    if (!obj || fseek(obj.get(), offset, SEEK_SET) != 0)
        return "NA";

    // One extra zeroed byte guarantees termination even if all 16 bytes read are non-zero.
    char buf[17] = {0};
    if (fread(buf, sizeof(char), 16, obj.get()) == 0)
        return "NA";
    return buf;
}

} // namespace sf::utils
+ 63 | std::to_string(BOOST_VERSION % 100); 64 | } 65 | 66 | std::string get_gsl_version() { return std::to_string(GSL_MAJOR_VERSION) + "." + std::to_string(GSL_MINOR_VERSION); } 67 | 68 | std::string get_sctl_version() { return exec("cd ../extern/SCTL; git describe --tags"); } 69 | 70 | std::string get_baobzi_version() { return exec("cd ../extern/baobzi; git describe --tags").substr(1); } 71 | 72 | std::string get_eigen_version() { 73 | return std::to_string(EIGEN_WORLD_VERSION) + "." + std::to_string(EIGEN_MAJOR_VERSION) + "." + 74 | std::to_string(EIGEN_MINOR_VERSION); 75 | } 76 | 77 | } // namespace sf::utils 78 | --------------------------------------------------------------------------------