├── .gitignore ├── AUTHORS.md ├── CMakeLists.txt ├── LICENSE ├── README.md ├── examples ├── CMakeLists.txt ├── DemoNewton.cpp ├── common_type.cpp ├── example.cpp └── example2.cpp ├── floatx.png ├── src └── floatx.hpp ├── test ├── CMakeLists.txt ├── IEEEHelper.cpp ├── IEEEHelper.h ├── NanInf.cpp ├── arithmetic.cpp ├── assignment.cpp ├── conversion.cpp ├── rel_ops.cpp ├── round_nearest.cpp ├── std_integration.cpp ├── stream.cpp ├── value_representation.cpp ├── value_representation_bits.cpp └── value_representation_half.cpp ├── testx ├── CMakeLists.txt ├── add_000.cpp ├── div_000.cpp ├── mul_000.cpp └── sub_000.cpp └── third_party ├── CMakeLists.txt ├── DownloadCMakeLists.txt.in ├── git-cmake-format └── CMakeLists.txt ├── gtest └── CMakeLists.txt └── package_loader.cmake /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Goran Flegar, Universitat Jaume I, 2 | Florian Scheidegger, IBM Research GmbH, 3 | Vedran Novakovic, Universitat Jaume I, 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | project(FloatX) 3 | 4 | option(BUILD_TESTS "Generate build files for unit tests" ON) 5 | option(BUILD_EXHAUSTIVE_TESTS "Generate build files for exhaustive tests" OFF) 6 | 
option(DEVEL_TOOLS "Include development tools in build system" ON) 7 | option(BUILD_EXAMPLES "Build examples in the examples/ directory" ON) 8 | 9 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 10 | set(CMAKE_CXX_STANDARD 11) 11 | 12 | add_subdirectory(third_party) # third party tools and libraries 13 | 14 | add_library(floatx INTERFACE) # INTERFACE target: nothing is compiled, consumers only inherit usage requirements 15 | target_include_directories(floatx INTERFACE src/) 16 | 17 | if(BUILD_TESTS OR BUILD_EXHAUSTIVE_TESTS) # enable CTest once if either test suite is requested 18 | enable_testing() 19 | endif() 20 | 21 | if(BUILD_TESTS) 22 | add_subdirectory(test) 23 | endif() 24 | 25 | if(BUILD_EXHAUSTIVE_TESTS) 26 | add_subdirectory(testx) 27 | endif() 28 | 29 | if(BUILD_EXAMPLES) 30 | add_subdirectory(examples) 31 | endif() 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 - The OPRECOMP Project Consortium, IBM Research GmbH, University Jaume I. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 
24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. 
For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. 
If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. 
You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 
177 | 178 | END OF TERMS AND CONDITIONS 179 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | FloatX (Float eXtended) 2 | ======================= 3 | 4 | FloatX is a header-only C++ library which extends floating point types beyond 5 | the native single and double (and on some hardware half) precision types. It 6 | provides template types which allow the user to select the number of bits used 7 | for the exponent and significand parts of the floating point number. 8 | The idea of FloatX is based on the FlexFloat library, but, instead of 9 | implementing the functionality in C and providing C++ wrappers, FloatX is 10 | written completely in C++, which makes it more natural to the end user. 11 | In addition, FloatX provides a superset of FlexFloat's functionalities. 12 | 13 | 14 | ![](./floatx.png) 15 | 16 | 17 | Features 18 | -------- 19 | 20 | This section lists the functionalities provided by FloatX. Functionalities that 21 | are also provided by FlexFloat have (_flexfloat_) appended to the description. 22 | 23 | * header-only library, without a compiled component, and heavy inlining, 24 | resulting in relatively high performance 25 | * `floatx<exp_bits, sig_bits, backend_float>` class template, which allows 26 | emulation of non-native types with `exp_bits` exponent bits and `sig_bits` 27 | significand bits using a natively supported `backend_float` type to perform 28 | arithmetic operations (_flexfloat_ - provides a similar functionality in 29 | the C++ wrapper, but the memory consumption of the flexfloat C++ class was 30 | suboptimal). 31 | * `floatxr<backend_float>` class template, which provides the same 32 | functionality as `floatx`, but allows changing the precision of the type 33 | at runtime. This class is easier to experiment with, but is not as 34 | efficient as `floatx` in both the performance, as well as the memory 35 | consumption. 
(_flexfloat_ - provides a type that has a comparable memory 36 | consumption with the precision selectable at runtime in the C library only) 37 | * conversions between builtin types and `floatx` 38 | (_flexfloat_ - had a bug where NaN can be cast to Inf during conversion) 39 | * assignments on `floatx` and `floatxr` types (_flexfloat_) 40 | * relational operations on `floatx` and `floatxr` types 41 | (_flexfloat_ - did not handle NaN properly) 42 | * relational operations between different types 43 | * arithmetic operations on `floatx` and `floatxr` types (_flexfloat_) 44 | * arithmetic operations between different types with implicit type promotion 45 | * `std::ostream& operator <<(std::ostream&, floatx[r])` (_flexfloat_) 46 | * `std::istream& operator >>(std::istream&, floatx[r])` 47 | * CUDA support 48 | 49 | 50 | What FloatX is NOT 51 | ------------------ 52 | 53 | FloatX does not implement arbitrary floating point types. The only supported 54 | types are "subtypes" of those natively supported by the hardware. 55 | In case you need implementations of larger types, consider using the SoftFloat 56 | library. 57 | 58 | FloatX __emulates__ the types of custom precision, subject to the constraints 59 | above, and, while trying to achieve as high performance as possible, it is 60 | __not__ capable of magically delivering better performance than natively 61 | supported types. Thus, do not expect `floatx<3, 3>` to consume less memory, or 62 | be faster than e.g. `float`, though `floatx<11, 52>` should deliver similar 63 | performance as `double`. 64 | 65 | That being said, it is not likely that FloatX will be useful in production 66 | codes. On the other hand, it can be handy in research projects which aim to 67 | study the effects of using different precisions. 68 | 69 | Installation 70 | ------------ 71 | 72 | To use the library, just make sure that a directory containing `floatx.hpp` is 73 | in your include path (here, it is in `src/` subdirectory). 
74 | 75 | Alternatively, if you are using CMake, a `CMakeLists.txt` file is provided. 76 | You can download the repository into your project and use the following code to 77 | depend on the floatx target: 78 | 79 | ``` 80 | add_subdirectory(floatx) 81 | target_link_libraries(my_target PRIVATE floatx) 82 | ``` 83 | 84 | ### Building the examples / unit tests 85 | 86 | A standard CMake command line sequence should do: 87 | 88 | ``` 89 | mkdir build && cd build && cmake .. && make 90 | ``` 91 | 92 | To run all the tests: 93 | 94 | ``` 95 | make test 96 | ``` 97 | 98 | This will (hopefully) output a summary of the form: 99 | 100 | ``` 101 | test_............ Passed 102 | ``` 103 | 104 | To run only one of the tests (and see more detailed output): 105 | 106 | ``` 107 | ./test/<test_name> 108 | ``` 109 | 110 | 111 | Examples 112 | -------- 113 | 114 | Some sample code using floatx: 115 | ``` 116 | 1: flx::floatx<7, 12> a = 1.2; // 7 exponent bits , 12 sign . bits 117 | 2: flx::floatx<7, 12> b = 3; // 7 exponent bits , 12 sign . bits 118 | 3: flx::floatx<10, 9> c; // 10 exponent bits , 9 sign . bits 119 | 4: float d = 3.2; 120 | 5: double e = 5.2; 121 | 6: 122 | 7: std :: cin >> c; 123 | 8: c = a + b; // decltype (a + b) == floatx <7, 12> 124 | 9: bool t = a < b; 125 | 10: a += c; 126 | 11: d = a / c; // decltype (a / c) == floatx <10 , 12> 127 | 12: e = c - d; // decltype (c - d) == floatx <10 , 23> 128 | 13: c = a * e; // decltype (a * e) == floatx <11 , 52> 129 | 14: std :: cout << c; 130 | ``` 131 | 132 | Lines 1, 2, and 3 show how floatx numbers can be constructed 133 | from built-in types (floating-point numbers and integers) and read 134 | from C++ streams. Lines 8 and 9 show how these objects are used 135 | to perform basic arithmetic and relational operations. Lines 10-13 136 | demonstrate the interoperability between different floatx and built-in 137 | types. The comments on the right specify the return type of the 138 | operation. 
Note that T == U, where T and U are types, is used to 139 | convey that these two types are the same, i.e., that std::is_same<T, 140 | U>::value evaluates to true. Lines 8 and 11-13 also show that floatx 141 | types can be implicitly converted to other floatx types or built-in 142 | types. Finally, line 14 shows how floatx types can be written to an 143 | output stream. 144 | 145 | 146 | ## Authors and contacts 147 | - Goran Flegar, Departamento de Ingeniería y Ciencia de Computadores, Universidad Jaime I, Spain, flegar@uji.es 148 | - Florian Scheidegger, IBM Research - Zurich, eid@zurich.ibm.com 149 | - Vedran Novakovic, Departamento de Ingeniería y Ciencia de Computadores, Universidad Jaime I, Spain 150 | - Giovani Mariani, IBM Research - Zurich, 151 | - Andres E. Tomas, Departamento de Ingeniería y Ciencia de Computadores, Universidad Jaime I, Spain, tomasan@uji.es 152 | - A. Cristiano I. Malossi, IBM Research - Zurich, acm@zurich.ibm.com 153 | - Enrique S. Quintana-Orti, Departamento de Informática de Sistemas y Computadores, Universitat Politècnica de València, Spain, quintana@icc.uji.es 154 | 155 | 156 | ## Reference 157 | 158 | The full text of our paper explaining floatx in detail is available under the following link: https://dl.acm.org/doi/pdf/10.1145/3368086?download=true. 
159 | 160 | Please, if you like and use our work, cite our paper as follows: 161 | 162 | ``` 163 | @article{flegar2019floatx, 164 | author={Flegar, Goran and Scheidegger, Florian and Novakovi{\'c}, Vedran and Mariani, Giovani and Tom{\'{}} s, Andr{\'e}s E and Malossi, A Cristiano I and Quintana-Ort{\'\i}, Enrique S}, 165 | title = {FloatX: A C++ Library for Customized Floating-Point Arithmetic}, 166 | year = {2019}, 167 | issue_date = {December 2019}, 168 | publisher = {Association for Computing Machinery}, 169 | address = {New York, NY, USA}, 170 | volume = {45}, 171 | number = {4}, 172 | issn = {0098-3500}, 173 | url = {https://doi.org/10.1145/3368086}, 174 | doi = {10.1145/3368086}, 175 | journal={ACM Transactions on Mathematical Software (TOMS)}, 176 | month = dec, 177 | articleno = {Article 40}, 178 | numpages = {23}, 179 | } 180 | ``` 181 | 182 | ## Acknowledgments 183 | 184 | This work was funded by the European Union’s H2020 research and innovation program under grant 185 | agreement No 732631, project OPRECOMP. 186 | 187 | For details visit http://oprecomp.eu/. 188 | 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(example example.cpp) 2 | target_link_libraries(example PRIVATE floatx) 3 | 4 | add_executable(example2 example2.cpp) 5 | target_link_libraries(example2 PRIVATE floatx) 6 | 7 | add_executable(common_type common_type.cpp) 8 | target_link_libraries(common_type PRIVATE floatx) 9 | 10 | add_executable(DemoNewton DemoNewton.cpp) 11 | target_link_libraries(DemoNewton PRIVATE floatx) -------------------------------------------------------------------------------- /examples/DemoNewton.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. 
All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | /* 19 | * Compile this file from the examples/ directory with: 20 | * g++ -std=c++11 -Wall -o DemoNewton -I ../src DemoNewton.cpp 21 | */ 22 | 23 | #include 24 | #include 25 | 26 | // Babylonian method: 27 | // Derived from Newton's method 28 | // See: https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Example 29 | // Baseline version based on the float type; iterates until |x - xnext| <= tol 30 | float myroot(float a, float a0, float tol) 31 | { 32 | float x = a0; 33 | float xnext; 34 | float err; 35 | 36 | int k = 0; 37 | do { 38 | xnext = 0.5 * (x + a / x); 39 | err = fabs(x - xnext); 40 | printf("[k=%i]: %f %e\n", k++, xnext, err); 41 | x = xnext; 42 | } while (err > tol); 43 | return xnext; 44 | } 45 | 46 | // General routine based on type T. 47 | // Note: if T is float, this routine is the same as the one above. 48 | template 49 | T myroot_general(T a, T a0, T tol) 50 | { 51 | T x = a0; 52 | T xnext; 53 | T err; 54 | 55 | int k = 0; 56 | do { 57 | xnext = 0.5 * (x + a / x); 58 | // for example fabs(...) 
is not defined for the floatx type 59 | // hence, we use a cast to double and back to our type 60 | err = (T)fabs(double(x - xnext)); 61 | printf("[k=%i]: %f %e\n", k++, double(xnext), double(err)); 62 | x = xnext; 63 | } while (err > tol); 64 | return xnext; 65 | } 66 | 67 | int main(int argc, char** argv) 68 | { 69 | printf( 70 | "floatx working " 71 | "example\n==============================================\n"); 72 | printf("Iteratively compute the square root of a\n"); 73 | 74 | if (argc != 3) { 75 | printf("Usage: %s \n computes root(a) by Newton Iterations.\n", 76 | argv[0]); 77 | printf("Example: \n %s 2 1\n", argv[0]); 78 | exit(-1); 79 | } 80 | 81 | float a = atof(argv[1]); 82 | float a0 = atof(argv[2]); 83 | 84 | float res = myroot(a, a0, 1e-6); 85 | 86 | printf("\n\nBaseline version (float)\n\n"); 87 | 88 | printf("==============================================\n"); 89 | printf("Result Computed (float): %.20f\n", res); 90 | float ref = sqrt(a); 91 | printf("Reference: %.20f\n", ref); 92 | printf("==============================================\n"); 93 | printf("Error: %e\n", ref - res); 94 | printf("==============================================\n"); 95 | 96 | 97 | printf("\n\nFloatx Version IEEE 16bit, e.g., floatx<5,10>\n\n"); 98 | res = myroot_general>(a, a0, 1e-6); 99 | printf("==============================================\n"); 100 | printf("Result Computed (floatx: %.20f\n", res); 101 | printf("Reference: %.20f\n", ref); 102 | printf("==============================================\n"); 103 | printf("Error: %e\n", ref - res); 104 | printf("==============================================\n"); 105 | 106 | return 0; 107 | } -------------------------------------------------------------------------------- /examples/common_type.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 
4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | 20 | 21 | // Uncoment to disable common type resolution. 22 | // #define FLOATX_NO_TYPE_RESOLUTION 23 | #include 24 | 25 | 26 | int main() 27 | { 28 | using float1 = flx::floatx<5, 7>; 29 | using float2 = flx::floatx<4, 8>; 30 | std::cout << float1(2.6) + float1(6.2) << std::endl; // always works 31 | std::cout << float1(2.6) + float2(6.2) << std::endl; // fails with flag 32 | } 33 | -------------------------------------------------------------------------------- /examples/example.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | /* 19 | * Example of using the FloatX library. 
Compile with 20 | * g++ -std=c++11 -I ../src test.cpp 21 | */ 22 | #include 23 | 24 | 25 | #include 26 | 27 | 28 | int main() 29 | { 30 | std::cout << "sizeof(floatx) = " << sizeof(flx::floatx<11, 52>) 31 | << "\nsizeof(floatxr) = " << sizeof(flx::floatxr<>) << std::endl; 32 | // compile-time types 33 | flx::floatx<11, 52> f; // double 34 | flx::floatx<7, 22> g(5.3); // float with 7 exp and 22 significand bits 35 | 36 | // runtime types 37 | flx::floatxr<> fr(11, 52); 38 | flx::floatxr<> gr(7, 22, 5.3); 39 | 40 | std::cout << std::scientific; 41 | 42 | // conversion to double 43 | std::cout << double(f) << std::endl 44 | << double(g) << std::endl 45 | << double(fr) << std::endl 46 | << double(gr) << std::endl; 47 | 48 | // conversion to flexfloat 49 | flx::floatx<3, 2> lg(g); 50 | flx::floatx<3, 2> lgr(gr); 51 | 52 | std::cout << double(lg) << ", precision = " 53 | << "(" << get_exp_bits(lg) << ", " << get_sig_bits(lg) << ")\n" 54 | << double(lgr) << ", precision = " 55 | << "(" << get_exp_bits(lgr) << ", " << get_sig_bits(lgr) << ")" 56 | << std::endl; 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /examples/example2.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | */ 17 | 18 | /* 19 | * Example of using the FloatX library. Compile with 20 | * g++ -std=c++11 -I ../src test.cpp 21 | */ 22 | #include 23 | 24 | 25 | #include 26 | 27 | template 28 | void foo(T* a, int n) 29 | { 30 | printf("HI FOO ROUTINE\n"); 31 | for (unsigned i = 0; i < n; ++i) { 32 | if (i == 0 || i == 1) { 33 | a[i] = 1; 34 | } else { 35 | a[i] = a[i - 1] + a[i - 2]; 36 | } 37 | } 38 | } 39 | 40 | void compileExample() 41 | { 42 | typedef flx::floatx<11, 48> T; 43 | T res = 0; 44 | 45 | double a = 3.1; 46 | double b = 5.2; 47 | 48 | res = (T)a * b; // ERROR 49 | 50 | std::cout << "[cout] res = " << res << std::endl; 51 | } 52 | 53 | void compileExample2() 54 | { 55 | typedef flx::floatx<11, 48> T; 56 | T res = 33.45; 57 | int i = 4; 58 | 59 | std::cout << "[cout] res = " << res << " and i = " << i << std::endl; 60 | 61 | // TODO (withouth double does not yet work) 62 | if (res == (double)i) { 63 | std::cout << "TRUE\n"; 64 | } else { 65 | std::cout << "FALSE\n"; 66 | } 67 | 68 | if (res == 3.0) { 69 | std::cout << "TRUE 2\n"; 70 | } else { 71 | std::cout << "FALSE 2\n"; 72 | } 73 | } 74 | 75 | int main() 76 | { 77 | compileExample(); 78 | compileExample2(); 79 | printf("--------------------------\n"); 80 | // Double-precision variables 81 | 82 | // simple use case 83 | // flx::floatx<11, 52> ff_a, ff_b, ff_c; 84 | // flx::floatx<5, 30> ff_a, ff_b, ff_c; 85 | 86 | // other use case 87 | flx::floatx<11, 48> ff_a; 88 | flx::floatx<5, 5> ff_b; 89 | flx::floatx<11, 30> ff_c; 90 | 91 | // Assigment with cast (from double literal) 92 | ff_a = 10.4; 93 | ff_b = 11.5; 94 | // Overloaded operators 95 | // ff_b += 2; // DOES NOT WORK (cast from int not defined) 96 | ff_b += 2.0; // WORKS. 
97 | 98 | // ff_b = ff_b + 2; // DOES NOT WORK (cast from int not defined) 99 | ff_b = ff_b + 2.0; // DOES NOT WORK (except flex is as double) 100 | // ff_b = double( ff_b + flx::floatx<11, 32>(2)); //WORKS 101 | 102 | ff_c = ff_a + ff_b; 103 | 104 | // C++ output stream 105 | // Explicit output as double. 106 | std::cout << "output after double cast:\n"; 107 | std::cout << "[cout] ff_a = " << double(ff_a) << std::endl; 108 | std::cout << "[cout] ff_b = " << double(ff_b) << std::endl; 109 | std::cout << "[cout] ff_c = " << double(ff_c) << std::endl; 110 | 111 | // Implicit output works as well. 112 | std::cout << "Output:\n"; 113 | std::cout << "[cout] ff_a = " << ff_a << std::endl; 114 | std::cout << "[cout] ff_b = " << ff_b << std::endl; 115 | std::cout << "[cout] ff_c = " << ff_c << std::endl; 116 | 117 | std::cout << "Get information about type:\n"; 118 | std::cout << "[cout] ff_a = " << ff_a << " <" << get_exp_bits(ff_a) << "," 119 | << get_sig_bits(ff_a) << ">" << std::endl; 120 | std::cout << "[cout] ff_b = " << ff_b << " <" << get_exp_bits(ff_b) << "," 121 | << get_sig_bits(ff_b) << ">" << std::endl; 122 | std::cout << "[cout] ff_c = " << ff_c << " <" << get_exp_bits(ff_c) << "," 123 | << get_sig_bits(ff_c) << ">" << std::endl; 124 | 125 | std::cout << "Sizeof Results (it is the static case):\n"; 126 | std::cout << "sizeof( ff_a ) = " << sizeof(ff_a) << "\n"; 127 | std::cout << "sizeof( ff_b ) = " << sizeof(ff_b) << "\n"; 128 | std::cout << "sizeof( ff_c ) = " << sizeof(ff_c) << "\n"; 129 | 130 | // get_exp_bits() 131 | // Binary output. 
132 | // std::cout << "[cout] ff_a = " << ff_a << " (" << flexfloat_as_bits << 133 | // ff_a << flexfloat_as_double << ")" << std::endl; std::cout << "[cout] 134 | // ff_b = " << ff_b << " (" << flexfloat_as_bits << ff_b << 135 | // flexfloat_as_double << ")" << std::endl; std::cout << "[cout] ff_c = " << 136 | // ff_c << " (" << flexfloat_as_bits << ff_c << flexfloat_as_double << ")" 137 | // << std::endl; 138 | 139 | // generate arrays of data 140 | // flexfloat<11, 52> ff_a 141 | int n = 100; 142 | // double* a = new double[n]; 143 | flx::floatx<5, 12>* a = new flx::floatx<5, 12>[n]; 144 | 145 | for (unsigned i = 0; i < n; ++i) { 146 | a[i] = i; 147 | } 148 | 149 | // foo< flx::floatx<5, 12> >( a, n); // OK 150 | // foo< flx::floatx<5, 52> >( a, n); //wrong type 151 | foo(a, n); // infers type, ok 152 | 153 | for (unsigned i = 0; i < n; ++i) { 154 | // std::cout << i << ":\t" << a[i] << " (" << flexfloat_as_bits << a[i] 155 | // << flexfloat_as_double << ")" << std::endl; 156 | std::cout << i << ":\t" << double(a[i]) << std::endl; 157 | } 158 | delete[] a; 159 | 160 | // std::cout << "sizeof(floatx) = " << sizeof(flx::floatx<11, 52>) 161 | // << "\nsizeof(floatxr) = " << sizeof(flx::floatxr<>) 162 | // << std::endl; 163 | // // compile-time types 164 | // flx::floatx<11, 52> f; // double 165 | // flx::floatx<7, 22> g(5.3); // float with 7 exp and 22 significand bits 166 | 167 | // // runtime types 168 | // flx::floatxr<> fr(11, 52); 169 | // flx::floatxr<> gr(7, 22, 5.3); 170 | 171 | // std::cout << std::scientific; 172 | 173 | // // conversion to double 174 | // std::cout << double(f) << std::endl 175 | // << double(g) << std::endl 176 | // << double(fr) << std::endl 177 | // << double(gr) << std::endl; 178 | 179 | // // conversion to flexfloat 180 | // flx::floatx<3, 2> lg(g); 181 | // flx::floatx<3, 2> lgr(gr); 182 | 183 | // std::cout << double(lg) << ", precision = " 184 | // << "(" << get_exp_bits(lg) << ", " << get_sig_bits(lg) << ")\n" 185 | // << 
double(lgr) << ", precision = " 186 | // << "(" << get_exp_bits(lgr) << ", " << get_sig_bits(lgr) << ")" 187 | // << std::endl; 188 | // return 0; 189 | } 190 | -------------------------------------------------------------------------------- /floatx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oprecomp/FloatX/a67318fbedf0cebd5da277f3633275b905a5c12a/floatx.png -------------------------------------------------------------------------------- /src/floatx.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #ifndef FLOATX_FLOATX_HPP_ 19 | #define FLOATX_FLOATX_HPP_ 20 | 21 | 22 | #ifdef __CUDA_ARCH__ 23 | #include "cuda_runtime.h" 24 | #endif // __CUDA_ARCH__ 25 | 26 | #include 27 | 28 | #if CHAR_BIT != 8 29 | #error Expecting 8 bits in a char! 
30 | #endif // ?CHAR_BIT 31 | 32 | #include 33 | #include 34 | #include 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | 43 | #ifdef __CUDA_ARCH__ 44 | #define FLOATX_ATTRIBUTES __host__ __device__ 45 | #define FLOATX_INLINE __forceinline__ 46 | #else // __CUDA_ARCH__ 47 | #define FLOATX_ATTRIBUTES 48 | #define FLOATX_INLINE inline 49 | #endif // __CUDA_ARCH__ 50 | 51 | 52 | #define USE_BUILTINS 53 | 54 | 55 | namespace flx { 56 | 57 | 58 | namespace detail { 59 | 60 | 61 | static constexpr int bits_in_byte = CHAR_BIT; 62 | 63 | 64 | template 65 | class floatx_base; 66 | 67 | 68 | template 69 | struct bits_type {}; 70 | 71 | 72 | #define ENABLE_STANDARD_BITS_TYPE(_size) \ 73 | template <> \ 74 | struct bits_type<_size / bits_in_byte> { \ 75 | using type = std::uint##_size##_t; \ 76 | } 77 | 78 | ENABLE_STANDARD_BITS_TYPE(8); 79 | ENABLE_STANDARD_BITS_TYPE(16); 80 | ENABLE_STANDARD_BITS_TYPE(32); 81 | ENABLE_STANDARD_BITS_TYPE(64); 82 | 83 | #undef ENABLE_STANDARD_BITS_TYPE 84 | 85 | 86 | } // namespace detail 87 | 88 | 89 | #define FLOATX_USE_DEFAULT_TRAITS(_type) \ 90 | static const auto sig_pos = 0; \ 91 | static const auto exp_pos = float_traits<_type>::sig_bits; \ 92 | static const auto sgn_pos = exp_pos + float_traits<_type>::exp_bits; \ 93 | static const auto sig_mask = \ 94 | (UINT64_C(1) << float_traits<_type>::sig_bits) - UINT64_C(1); \ 95 | static const auto exp_mask = \ 96 | (UINT64_C(1) << float_traits<_type>::exp_bits) - UINT64_C(1); \ 97 | static const auto sgn_mask = UINT64_C(1); \ 98 | static const auto bias = exp_mask >> 1; \ 99 | using bits_type = typename detail::bits_type::type 100 | 101 | 102 | template 103 | struct float_traits {}; 104 | 105 | template 106 | struct float_traits::value>::type> { 108 | static const bool is_floatx = false; 109 | static const bool is_runtime = false; 110 | static const int exp_bits = 0; 111 | static const int sig_bits = 0; 112 | using backend_float = T; 113 | 114 | 
FLOATX_USE_DEFAULT_TRAITS(T); 115 | }; 116 | 117 | template <> 118 | struct float_traits { 119 | static const bool is_floatx = false; 120 | static const bool is_runtime = false; 121 | static const int exp_bits = 8; 122 | static const int sig_bits = 23; 123 | using backend_float = float; 124 | 125 | FLOATX_USE_DEFAULT_TRAITS(float); 126 | }; 127 | 128 | template <> 129 | struct float_traits { 130 | static const bool is_floatx = false; 131 | static const bool is_runtime = false; 132 | static const int exp_bits = 11; 133 | static const int sig_bits = 52; 134 | using backend_float = double; 135 | 136 | FLOATX_USE_DEFAULT_TRAITS(double); 137 | }; 138 | 139 | 140 | #define ENABLE_PROPERTY(_prop) \ 141 | template \ 142 | FLOATX_ATTRIBUTES FLOATX_INLINE constexpr auto get_##_prop( \ 143 | const Float&) noexcept-> \ 144 | typename std::enable_if::is_runtime, \ 145 | decltype(float_traits::_prop)>::type \ 146 | { \ 147 | return float_traits::_prop; \ 148 | } \ 149 | template \ 150 | FLOATX_ATTRIBUTES FLOATX_INLINE constexpr auto get_##_prop( \ 151 | const RuntimeFloat& f) noexcept-> \ 152 | typename std::enable_if::is_runtime, \ 153 | decltype(f.get_##_prop())>::type \ 154 | { \ 155 | return f.get_##_prop(); \ 156 | } 157 | 158 | ENABLE_PROPERTY(exp_bits); // get_exp_bits(f) 159 | ENABLE_PROPERTY(sig_bits); // get_sig_bits(f) 160 | 161 | #undef ENABLE_PROPERTY 162 | 163 | 164 | template 165 | class floatx 166 | : public detail::floatx_base> { 167 | private: 168 | using backend_float = typename float_traits::backend_float; 169 | 170 | public: 171 | FLOATX_ATTRIBUTES floatx() noexcept 172 | : detail::floatx_base(backend_float(0.0)) 173 | { 174 | this->initialize(); 175 | } 176 | 177 | template 178 | FLOATX_ATTRIBUTES floatx(const T& other) noexcept 179 | : detail::floatx_base(backend_float(other)) 180 | { 181 | this->initialize(); 182 | } 183 | 184 | // Default copy/move constructors/assignment operators are OK here 185 | 186 | template 187 | FLOATX_ATTRIBUTES floatx& 
operator=(const T& other) noexcept 188 | { 189 | return *this = floatx(other); 190 | } 191 | }; 192 | 193 | 194 | template 195 | struct float_traits, void> { 196 | static const bool is_floatx = true; 197 | static const bool is_runtime = false; 198 | static const int exp_bits = ExpBits; 199 | static const int sig_bits = SigBits; 200 | using backend_float = BackendFloat; 201 | 202 | FLOATX_USE_DEFAULT_TRAITS(backend_float); 203 | }; 204 | 205 | 206 | template 207 | class floatxr 208 | : public detail::floatx_base> { 209 | private: 210 | using backend_float = typename float_traits::backend_float; 211 | 212 | public: 213 | using metadata_type = MetadataType; 214 | 215 | FLOATX_ATTRIBUTES 216 | floatxr(metadata_type exp_bits, metadata_type sig_bits) noexcept 217 | : detail::floatx_base(backend_float(0.0)), 218 | exp_bits_(exp_bits), 219 | sig_bits_(sig_bits) 220 | { 221 | this->initialize(); 222 | } 223 | 224 | // Default copy/move constructors are OK 225 | 226 | template 227 | FLOATX_ATTRIBUTES floatxr(metadata_type exp_bits, metadata_type sig_bits, 228 | const T& other) noexcept 229 | : detail::floatx_base(backend_float(other)), 230 | exp_bits_(exp_bits), 231 | sig_bits_(sig_bits) 232 | { 233 | this->initialize(); 234 | } 235 | 236 | template 237 | FLOATX_ATTRIBUTES floatxr(const T& other) noexcept 238 | : detail::floatx_base(backend_float(other)), 239 | exp_bits_(flx::get_exp_bits(other)), 240 | sig_bits_(flx::get_sig_bits(other)) 241 | { 242 | /* already initialized */ 243 | } 244 | 245 | // Assignment needs to preserve the format of the result 246 | template 247 | FLOATX_ATTRIBUTES floatxr& operator=(const T& other) noexcept 248 | { 249 | return *this = floatxr(flx::get_exp_bits(*this), 250 | flx::get_sig_bits(*this), backend_float(other)); 251 | } 252 | 253 | FLOATX_ATTRIBUTES void set_precision(metadata_type exp_bits, 254 | metadata_type sig_bits) 255 | { 256 | exp_bits_ = exp_bits; 257 | sig_bits_ = sig_bits; 258 | this->initialize(); 259 | } 260 | 261 | 
FLOATX_ATTRIBUTES constexpr metadata_type get_exp_bits() const noexcept 262 | { 263 | return exp_bits_; 264 | } 265 | 266 | FLOATX_ATTRIBUTES constexpr metadata_type get_sig_bits() const noexcept 267 | { 268 | return sig_bits_; 269 | } 270 | 271 | private: 272 | metadata_type exp_bits_; 273 | metadata_type sig_bits_; 274 | }; 275 | 276 | 277 | template 278 | struct float_traits, void> { 279 | static const bool is_floatx = true; 280 | static const bool is_runtime = true; 281 | static const int exp_bits = float_traits::exp_bits; 282 | static const int sig_bits = float_traits::sig_bits; 283 | using backend_float = BackendFloat; 284 | 285 | FLOATX_USE_DEFAULT_TRAITS(backend_float); 286 | }; 287 | 288 | 289 | template 290 | struct supertype { 291 | private: 292 | static constexpr int max(int x, int y) { return (x > y) ? x : y; } 293 | 294 | public: 295 | #ifdef FLOATX_NO_TYPE_RESOLUTION 296 | static_assert(std::is_same::value, 297 | "Common type detection is disabled by the user" 298 | " [FLOATX_NO_TYPE_RESOLUTION]"); 299 | #endif // FLOATX_NO_TYPE_RESOLUTION 300 | 301 | using type = typename std::enable_if< 302 | float_traits::is_floatx || float_traits::is_floatx, 303 | typename std::conditional::is_runtime || 304 | float_traits::is_runtime, 305 | floatxr, 306 | floatx::exp_bits, 307 | float_traits::exp_bits), 308 | max(float_traits::sig_bits, 309 | float_traits::sig_bits), 310 | BackendFloat>>::type>::type; 311 | static constexpr int max_exp_bits(FloatX1 x, FloatX2 y) 312 | { 313 | return max(get_exp_bits(x), get_exp_bits(y)); 314 | } 315 | static constexpr int max_sig_bits(FloatX1 x, FloatX2 y) 316 | { 317 | return max(get_sig_bits(x), get_sig_bits(y)); 318 | } 319 | }; 320 | 321 | 322 | #define ENABLE_RELATIONAL_OPERATOR(_op) \ 323 | template \ 324 | FLOATX_ATTRIBUTES FLOATX_INLINE \ 325 | typename std::enable_if::is_floatx || \ 326 | float_traits::is_floatx, \ 327 | bool>::type \ 328 | operator _op(const Float1& x, const Float2& y) \ 329 | { \ 330 | return typename 
float_traits::backend_float(x) _op \ 331 | typename float_traits::backend_float(y); \ 332 | } 333 | 334 | ENABLE_RELATIONAL_OPERATOR(==) 335 | ENABLE_RELATIONAL_OPERATOR(!=) 336 | ENABLE_RELATIONAL_OPERATOR(<) 337 | ENABLE_RELATIONAL_OPERATOR(>) 338 | ENABLE_RELATIONAL_OPERATOR(<=) 339 | ENABLE_RELATIONAL_OPERATOR(>=) 340 | 341 | #undef ENABLE_RELATIONAL_OPERATOR 342 | 343 | 344 | #define ENABLE_ARITHMETIC_OPERATOR(_op) \ 345 | template \ 346 | FLOATX_ATTRIBUTES FLOATX_INLINE typename std::enable_if< \ 347 | (float_traits::is_floatx || \ 348 | float_traits::is_floatx) && \ 349 | !float_traits::is_runtime && \ 350 | !float_traits::is_runtime, \ 351 | typename supertype< \ 352 | Float1, Float2, \ 353 | decltype(typename float_traits::backend_float() _op \ 354 | typename float_traits::backend_float())>::type>:: \ 355 | type \ 356 | operator _op(const Float1& x, const Float2& y) \ 357 | { \ 358 | using bf = decltype(typename float_traits::backend_float( \ 359 | x) _op typename float_traits::backend_float(y)); \ 360 | using st = typename supertype::type; \ 361 | return st(bf(x) _op bf(y)); \ 362 | } \ 363 | \ 364 | template \ 365 | FLOATX_ATTRIBUTES FLOATX_INLINE typename std::enable_if< \ 366 | float_traits::is_runtime || float_traits::is_runtime, \ 367 | typename supertype< \ 368 | Float1, Float2, \ 369 | decltype(typename float_traits::backend_float() _op \ 370 | typename float_traits::backend_float())>::type>:: \ 371 | type \ 372 | operator _op(const Float1& x, const Float2& y) \ 373 | { \ 374 | using bf = decltype(typename float_traits::backend_float( \ 375 | x) _op typename float_traits::backend_float(y)); \ 376 | using st = supertype; \ 377 | return typename st::type(st::max_exp_bits(x, y), \ 378 | st::max_sig_bits(x, y), bf(x) _op bf(y)); \ 379 | } \ 380 | \ 381 | template \ 382 | FLOATX_ATTRIBUTES FLOATX_INLINE \ 383 | typename std::enable_if::is_floatx || \ 384 | float_traits::is_floatx, \ 385 | Float1&>::type \ 386 | operator _op##=(Float1& x, const Float2& 
y) \ 387 | { \ 388 | return x = Float1(x _op y); \ 389 | } 390 | 391 | ENABLE_ARITHMETIC_OPERATOR(+) 392 | ENABLE_ARITHMETIC_OPERATOR(-) 393 | ENABLE_ARITHMETIC_OPERATOR(*) 394 | ENABLE_ARITHMETIC_OPERATOR(/) 395 | 396 | #undef ENABLE_ARITHMETIC_OPERATOR 397 | 398 | 399 | template 400 | FLOATX_INLINE typename std::enable_if::is_floatx, 401 | std::ostream&>::type& 402 | operator<<(std::ostream& os, const FloatX& f) noexcept 403 | { 404 | return os << typename float_traits::backend_float(f); 405 | } 406 | 407 | 408 | template 409 | FLOATX_INLINE typename std::enable_if::is_floatx, 410 | std::istream&>::type 411 | operator>>(std::istream& is, FloatX& f) noexcept 412 | { 413 | typename float_traits::backend_float tmp; 414 | is >> tmp; 415 | f = tmp; 416 | return is; 417 | } 418 | 419 | 420 | template 421 | FLOATX_ATTRIBUTES FLOATX_INLINE 422 | std::bitset::backend_float)> 423 | bits(const Float& x) noexcept 424 | { 425 | using bf = typename float_traits::backend_float; 426 | using bitset = std::bitset; 427 | bf val = bf(x); 428 | return *reinterpret_cast(&val); 429 | } 430 | 431 | 432 | namespace detail { 433 | 434 | 435 | template 436 | constexpr FLOATX_ATTRIBUTES FLOATX_INLINE 437 | typename float_traits::bits_type 438 | reinterpret_as_bits(Float val) 439 | { 440 | return *reinterpret_cast::bits_type*>( 441 | &val); 442 | } 443 | 444 | 445 | template 446 | constexpr FLOATX_ATTRIBUTES FLOATX_INLINE Float 447 | reinterpret_bits_as(typename float_traits::bits_type bits) 448 | { 449 | return *reinterpret_cast(&bits); 450 | } 451 | 452 | 453 | template 454 | constexpr FLOATX_ATTRIBUTES FLOATX_INLINE SignificandType 455 | get_round_nearest_correction(SignificandType sig, SignificandType lsb_mask, 456 | SignificandType after_lsb_mask, 457 | SignificandType rest_mask) 458 | { 459 | return (sig & after_lsb_mask) && ((sig & rest_mask) || (sig & lsb_mask)); 460 | } 461 | 462 | 463 | FLOATX_ATTRIBUTES FLOATX_INLINE constexpr uint64_t 464 | 
generate_rest_mask_fast_shift_less64(uint64_t MASK_AFTER_LSB) 465 | { 466 | return (MASK_AFTER_LSB >= 1) ? (MASK_AFTER_LSB - UINT64_C(0x1)) 467 | : UINT64_C(0x0000000000000000); 468 | } 469 | 470 | 471 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t round_nearest(uint64_t mant, 472 | uint16_t SHIFT) 473 | { 474 | if (SHIFT >= 64) 475 | SHIFT = 63; // that works to cover the case of down-shifts if the bit 476 | // number 63 is never set. (since DATA >> 64 is all zero 477 | // which is in that case equivalent to DATA >> 63) 478 | assert(!(mant & (UINT64_C(0x1) << 63))); 479 | 480 | // fast, no additional cases and simpler MASK generation. 481 | const uint64_t MASK_LSB = UINT64_C(0x0000000000000001) << SHIFT; 482 | const uint64_t MASK_AFTER_LSB = UINT64_C(0x0000000000000001) << (SHIFT - 1); 483 | const uint64_t MASK_REST = 484 | generate_rest_mask_fast_shift_less64(MASK_AFTER_LSB); 485 | 486 | uint64_t mant_res = mant >> SHIFT; 487 | 488 | if ((mant & MASK_AFTER_LSB) && ((mant & MASK_REST) || (mant & MASK_LSB))) { 489 | // round up if the bit after the lsb is set (>=0.5) and the number is 490 | // indeed bigger than >0.5 or if it is =0.5 and the TiesToEven rule 491 | // requires to round up. 492 | mant_res += 0x1; 493 | } 494 | 495 | mant_res = mant_res << SHIFT; 496 | 497 | return mant_res; 498 | } 499 | 500 | 501 | // CONSTANTS USED FOR BACKEND = DOUBLE 502 | const uint64_t MASK_MANTISSA = UINT64_C(0x000FFFFFFFFFFFFF); 503 | const uint64_t MASK_EXPONENT = UINT64_C(0x7FF0000000000000); 504 | const uint64_t MASK_SIGN = UINT64_C(0x8000000000000000); 505 | const uint64_t MASK_MANTISSA_OVERFLOW = UINT64_C(0x0010000000000000); 506 | const uint64_t POS_INF_PATTERN = UINT64_C(0x7ff0000000000000); 507 | const uint64_t NEG_INF_PATTERN = UINT64_C(0xfff0000000000000); 508 | const uint64_t BACKEND_BIAS = 509 | UINT64_C(1023); // that value is 2^(BACKEND_E-1)-1. 
510 | const int BACKEND_E = 11; 511 | const int BACKEND_M = 52; 512 | 513 | 514 | template 515 | constexpr FLOATX_ATTRIBUTES FLOATX_INLINE bool is_nan_or_inf(BitsType number) 516 | { 517 | return (number & MASK_EXPONENT) == MASK_EXPONENT; 518 | } 519 | 520 | 521 | template 522 | constexpr FLOATX_ATTRIBUTES FLOATX_INLINE bool is_small(BitsType e, 523 | ExpType emin) 524 | { 525 | return e < emin; 526 | } 527 | 528 | 529 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t 530 | convert_nan_or_inf_to_backend(const uint64_t number, const uint8_t M) 531 | { 532 | // The following line delets any additional information that might be coded 533 | // in NAN bits. NAN bits towards the MSB of the mantissa that fit into the 534 | // target type are preserved. 535 | const uint64_t MASK_DELETE = UINT64_C(0xFFFFFFFFFFFFFFFF) 536 | << (BACKEND_M - M); 537 | 538 | // fix the nan (note that the following line does not affects +INF and -INF 539 | // by construction) 540 | return (number & MASK_DELETE); 541 | } 542 | 543 | FLOATX_ATTRIBUTES FLOATX_INLINE void convert_subnormal_mantissa_and_exp( 544 | const uint64_t number, const uint8_t M, const int16_t emin, const int e, 545 | uint64_t& mant, uint64_t& exp) 546 | { 547 | int t = emin - e; 548 | 549 | // the hidden one might have a influence in rounding, hence add the hidden 550 | // one to the mantissa. 551 | mant = mant | MASK_MANTISSA_OVERFLOW; 552 | 553 | // Perform IEEE 754 rounding with TiesToEven. 
554 | mant = round_nearest(mant, BACKEND_M - M + t); 555 | 556 | // Handle the case where the number is rounded to exact 0 557 | // since it is smaller (after rounding) than the smallest Subnormal / 2 558 | if (mant == 0x0) { 559 | exp = 0x0; 560 | } 561 | 562 | // remove the hidden one from the mantissa 563 | mant = mant & ~MASK_MANTISSA_OVERFLOW; 564 | } 565 | 566 | FLOATX_ATTRIBUTES FLOATX_INLINE void fix_too_large_mantissa(const int M, int& e, 567 | uint64_t& mant, 568 | uint64_t& exp) 569 | { 570 | e += 1; 571 | // The following is the formula for the new exponent in the case the 572 | // mantissa was rounded up to a value that does not fit into the MANTISSA 573 | // field. 574 | exp = ((uint64_t)e + BACKEND_BIAS) << BACKEND_M; 575 | mant = UINT64_C(0x0000000000000000); 576 | } 577 | 578 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t assemble_regular_number( 579 | const uint64_t sign_pattern, const uint64_t mant, const uint64_t exp) 580 | { 581 | // ensure that the mantissa and exp fields to not contain bits at wrong 582 | // locations. 583 | assert((mant & ~MASK_MANTISSA) == 0x0); 584 | assert((exp & ~MASK_EXPONENT) == 0x0); 585 | 586 | // Assemble the number from the original sign and the current exp and mant 587 | // field. 588 | return (sign_pattern | exp | mant); 589 | } 590 | 591 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t 592 | assemble_inf_number(const uint64_t sign_pattern) 593 | { 594 | // The code of rounding operates on the magnitude, here we still need to at 595 | // the right sign for the final number 596 | return sign_pattern | POS_INF_PATTERN; 597 | } 598 | 599 | // // functionality to get bit representations. 600 | // bool get_sign_from_backend(const double d); 601 | 602 | // // That functions return the bit representation embedded in a data word. A 603 | // backend representation of type 604 | // // will return the full representation of 1+E+M bits in the LSB : LSB+1+E+M 605 | // bit positions of the embedding dataword (e.g. uint64_t). 
uint16_t 606 | // get_exponent_from_backend(const double d, const uint8_t E, const uint8_t M); 607 | // uint64_t get_mantissa_from_backend(const double d, const uint8_t E, const 608 | // uint8_t M); uint64_t get_fullbit_representation(const double d, const uint8_t 609 | // E, const uint8_t M); 610 | 611 | // // The reverse operation generates constructs a given number of exponent and 612 | // mantissa bits. 613 | // // Note, that the input is encoded into the embedding type as follows: 614 | // // exp: bits (E-1) downto 0 615 | // // mant: bits (M-1) downto 0 616 | // // -> bits at higher positions are required to be 0. (?) or neglected? 617 | // double construct_number(bool sign, uint16_t exp, uint64_t mant, const uint8_t 618 | // E, const uint8_t M); 619 | 620 | // functionality to get bit representations. 621 | FLOATX_ATTRIBUTES FLOATX_INLINE bool get_sign_from_backend(const double d) 622 | { 623 | uint64_t number = flx::detail::reinterpret_as_bits(d); 624 | return (number & MASK_SIGN); 625 | } 626 | 627 | FLOATX_ATTRIBUTES FLOATX_INLINE constexpr bool is_zero_or_nan_or_inf_exp( 628 | const uint64_t exp) 629 | { 630 | return ((exp == 0x0) || (exp == MASK_EXPONENT)); 631 | } 632 | 633 | FLOATX_ATTRIBUTES FLOATX_INLINE uint16_t 634 | get_exponent_zero_or_nan_or_inf_exp(const uint64_t exp, const uint8_t E) 635 | { 636 | uint16_t target_exp = (uint16_t)(exp >> BACKEND_M); 637 | // if it is an inf or nan delete any additional ones in the format. 638 | // (exponent requires E 1's) 639 | target_exp = target_exp & ((0x1 << E) - 1); 640 | 641 | // assert no bits are set at positions 15:E. 642 | // information is encoded only at positons E-1:0. 643 | assert(target_exp < (0x1 << E)); 644 | return target_exp; 645 | } 646 | 647 | FLOATX_ATTRIBUTES FLOATX_INLINE uint16_t 648 | get_exponent_regular_backend_exp(const uint64_t exp, const uint8_t E) 649 | { 650 | // That is the double exponent. 651 | // Interpret the exponent. 
652 | uint16_t target_exp = 0x0; 653 | int e = (exp >> BACKEND_M) - BACKEND_BIAS; 654 | 655 | // TARGET FORMAT (emax and emin depends on E) 656 | // IEEE 754 STANDARD 657 | int16_t emax = (0x1 << (E - 1)) - 1; 658 | int16_t emin = 1 - emax; 659 | 660 | // Target bias is the same as emax. 661 | if (e < emin) { 662 | // a regular case in the backend, but a subnormal in the target format. 663 | target_exp = 0x0; // subnormals have a zero exponent. 664 | } else { 665 | // Encode the exponent in target format. 666 | target_exp = (uint16_t)(e + emax); 667 | } 668 | 669 | // assert no bits are set at positions 15:E. 670 | // information is encoded only at positons E-1:0. 671 | assert(target_exp < (0x1 << E)); 672 | return target_exp; 673 | } 674 | 675 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t 676 | get_mantissa_zero_or_nan_or_inf_exp(const uint64_t mant, const uint8_t M) 677 | { 678 | uint64_t ret = mant >> (BACKEND_M - M); 679 | 680 | assert(ret < (UINT64_C(0x1) << M)); 681 | return ret; 682 | } 683 | 684 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t get_mantissa_regular_backend_exp( 685 | const uint64_t exp, const uint64_t mant, const uint8_t E, const uint8_t M) 686 | { 687 | // That is the double exponent. 688 | // Interpret the exponent. 689 | int e = (exp >> BACKEND_M) - BACKEND_BIAS; 690 | 691 | // TARGET FORMAT (emax and emin depends on E) 692 | // IEEE 754 STANDARD 693 | int16_t emax = (0x1 << (E - 1)) - 1; 694 | int16_t emin = 1 - emax; 695 | // Target bias is the same as emax. 696 | 697 | uint64_t ret; 698 | 699 | if (e < emin) { 700 | int t = emin - e; 701 | // Subnormal. The backend mantissa needs the hidden 1 that is visible in 702 | // the subnormal representation of the target format. 
703 | ret = (mant | MASK_MANTISSA_OVERFLOW) >> (BACKEND_M - M + t); 704 | } else { 705 | ret = mant >> (BACKEND_M - M); 706 | } 707 | 708 | assert(ret < (UINT64_C(0x1) << M)); 709 | return ret; 710 | } 711 | 712 | FLOATX_ATTRIBUTES FLOATX_INLINE uint16_t 713 | get_exponent_from_backend(const double d, const uint8_t E) 714 | { 715 | uint64_t number = flx::detail::reinterpret_as_bits(d); 716 | uint64_t exp = number & MASK_EXPONENT; 717 | 718 | // detects, zero, denormals, infs and nans in the backend double. 719 | if (is_zero_or_nan_or_inf_exp(exp)) { 720 | return get_exponent_zero_or_nan_or_inf_exp(exp, E); 721 | } else { 722 | return get_exponent_regular_backend_exp(exp, E); 723 | } 724 | } 725 | 726 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t 727 | get_mantissa_from_backend(const double d, const uint8_t E, const uint8_t M) 728 | { 729 | uint64_t number = flx::detail::reinterpret_as_bits(d); 730 | uint64_t exp = number & MASK_EXPONENT; 731 | uint64_t mant = number & MASK_MANTISSA; 732 | 733 | if (is_zero_or_nan_or_inf_exp(exp)) { 734 | return get_mantissa_zero_or_nan_or_inf_exp(mant, M); 735 | } else { 736 | return get_mantissa_regular_backend_exp(exp, mant, E, M); 737 | } 738 | } 739 | 740 | FLOATX_ATTRIBUTES FLOATX_INLINE uint8_t 741 | count_leading_zeros(const uint64_t data) noexcept 742 | { 743 | #ifdef USE_BUILTINS 744 | #ifdef __CUDA_ARCH__ 745 | return __clzll(data); 746 | #else // !__CUDA_ARCH__ 747 | return __builtin_clzl(data); 748 | #endif // ?__CUDA_ARCH__ 749 | #else // !USE_BUILTINS 750 | uint8_t t = 0u; // t will be the number of zero bits on the left 751 | for (t = 0u; t < 64u; ++t) { 752 | if (data & (UINT64_C(0x1) << (63u - t))) { 753 | break; 754 | } 755 | } 756 | return t; 757 | #endif // ?USE_BUILTINS 758 | } 759 | 760 | 761 | FLOATX_ATTRIBUTES FLOATX_INLINE void construct_number_subormal( 762 | uint64_t& backend_exp, uint64_t& mant, const int16_t emin, const uint8_t M) 763 | { 764 | // Zero and Subnormal. 
765 | if (mant == UINT64_C(0x0)) { 766 | // real zero. 767 | backend_exp = 0x0; 768 | mant = 0x0; 769 | } else { 770 | // a subnormal in the target fromat, but result in a regular number in 771 | // the backend fromat. 772 | uint8_t t = count_leading_zeros(mant); 773 | t = t - (63 - M); 774 | 775 | // interpret exponent in the format. 776 | 777 | int e = emin - t; 778 | 779 | // rewrite the exponent in the backend format. 780 | backend_exp = ((uint64_t)e + BACKEND_BIAS) << BACKEND_M; 781 | 782 | // mantissa. 783 | mant = mant << (BACKEND_M - M + t); 784 | mant = mant & ~MASK_MANTISSA_OVERFLOW; 785 | } 786 | } 787 | 788 | FLOATX_ATTRIBUTES FLOATX_INLINE void construct_number_nan_or_inf( 789 | uint64_t& backend_exp, uint64_t& mant, const uint8_t M) 790 | { 791 | if (mant == 0x0) { 792 | // Inf 793 | backend_exp = MASK_EXPONENT; // encode a backend inf. 794 | } else { 795 | // Nan 796 | backend_exp = MASK_EXPONENT; // encode nan 797 | mant = mant << (BACKEND_M - M); 798 | } 799 | } 800 | 801 | FLOATX_ATTRIBUTES FLOATX_INLINE void construct_number_regular( 802 | uint64_t& backend_exp, uint64_t& mant, uint16_t const exp, int16_t emax, 803 | const uint8_t M) 804 | { 805 | mant = mant << (BACKEND_M - M); 806 | 807 | // interpret exponent in the format. 808 | int e = exp - emax; 809 | 810 | // rewrite the exponent in the backend format. 811 | backend_exp = ((uint64_t)e + BACKEND_BIAS) << BACKEND_M; 812 | } 813 | 814 | FLOATX_ATTRIBUTES FLOATX_INLINE double construct_number(bool sign, uint16_t exp, 815 | uint64_t mant, 816 | const uint8_t E, 817 | const uint8_t M) 818 | { 819 | uint64_t backend_exp = 0x0; 820 | 821 | // use emax as bias for the format. 
822 | int16_t emax = (0x1 << (E - 1)) - 1; 823 | int16_t emin = 1 - emax; 824 | 825 | if (exp == 0x0) { 826 | construct_number_subormal(backend_exp, mant, emin, M); 827 | } else if (exp == ((0x1 << E) - 0x1)) { 828 | construct_number_nan_or_inf(backend_exp, mant, M); 829 | } else { 830 | construct_number_regular(backend_exp, mant, exp, emax, M); 831 | } 832 | 833 | uint64_t sign_bit = MASK_SIGN; 834 | sign_bit *= sign; 835 | 836 | uint64_t number = sign_bit | backend_exp | mant; 837 | double res = reinterpret_bits_as(number); 838 | return res; 839 | } 840 | 841 | FLOATX_ATTRIBUTES FLOATX_INLINE double construct_number( 842 | uint64_t fullbit_representation, const uint8_t E, const uint8_t M) 843 | { 844 | bool sign = (fullbit_representation & (UINT64_C(0x1) << (E + M))); 845 | uint64_t exp = 846 | fullbit_representation & (((UINT64_C(0x1) << E) - UINT64_C(1)) << M); 847 | exp = exp >> M; 848 | uint64_t mant = 849 | fullbit_representation & ((UINT64_C(0x1) << M) - UINT64_C(1)); 850 | return construct_number(sign, (uint16_t)exp, mant, E, M); 851 | } 852 | 853 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t construct_fullbit_representation( 854 | bool sign, uint16_t exp, uint64_t mant, const uint8_t E, const uint8_t M) 855 | { 856 | assert(exp < (0x1 << E)); 857 | assert(mant < (UINT64_C(0x1) << M)); 858 | 859 | uint64_t sign_bit = UINT64_C(0x1) << (E + M); 860 | sign_bit *= sign; 861 | 862 | uint64_t target_exp = (uint64_t)exp; 863 | target_exp = target_exp << M; 864 | 865 | // Note that the words have information encoded at different positions 866 | // [63:E+M+1] free 867 | // E+M sign_bit 868 | // E+M-1:M target_exp 869 | // M-1:0 mantissa 870 | mant = sign_bit | target_exp | mant; 871 | 872 | return mant; 873 | } 874 | 875 | // That functions return the bit representation embedded in a data word. 
A 876 | // backend representation of type will return the full representation of 877 | // 1+E+M bits in the LSB : LSB+1+E+M bit positions of the embedding dataword 878 | // (e.g. uint64_t). Encoding of the result: [63:E+M+1] free E+M 879 | // sign_bit E+M-1:M target_exp M-1:0 mantissa 880 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t 881 | get_fullbit_representation(const double d, const uint8_t E, const uint8_t M) 882 | { 883 | return construct_fullbit_representation( 884 | get_sign_from_backend(d), get_exponent_from_backend(d, E), 885 | get_mantissa_from_backend(d, E, M), E, M); 886 | } 887 | 888 | // Bitset wrappers. 889 | template 890 | FLOATX_ATTRIBUTES FLOATX_INLINE std::bitset get_exponent_from_backend_BS( 891 | const double d) 892 | { 893 | return std::bitset(get_exponent_from_backend(d, E)); 894 | } 895 | 896 | template 897 | FLOATX_ATTRIBUTES FLOATX_INLINE std::bitset get_mantissa_from_backend_BS( 898 | const double d) 899 | { 900 | return std::bitset(get_mantissa_from_backend(d, E, M)); 901 | } 902 | 903 | template 904 | FLOATX_ATTRIBUTES FLOATX_INLINE std::bitset<1 + E + M> 905 | get_fullbit_representation_BS(const double d) 906 | { 907 | return std::bitset<1 + E + M>(get_fullbit_representation(d, E, M)); 908 | } 909 | 910 | template 911 | FLOATX_ATTRIBUTES FLOATX_INLINE double construct_number(bool sign, 912 | std::bitset exp, 913 | std::bitset mant) 914 | { 915 | return construct_number(sign, exp.to_ulong(), mant.to_ulong(), E, M); 916 | } 917 | 918 | template 919 | FLOATX_ATTRIBUTES FLOATX_INLINE double construct_number( 920 | std::bitset<1 + E + M> fullbit_representation) 921 | { 922 | return construct_number(fullbit_representation.to_ulong(), E, M); 923 | } 924 | 925 | #define ENABLE_EXTRACT_PART(_part) \ 926 | template \ 927 | FLOATX_ATTRIBUTES FLOATX_INLINE uint64_t extract_##_part##_bits( \ 928 | const T& val) noexcept \ 929 | { \ 930 | return (*reinterpret_cast(&val) >> \ 931 | float_traits::_part##_pos) & \ 932 | float_traits::_part##_mask; \ 933 
| } 934 | 935 | ENABLE_EXTRACT_PART(sgn); 936 | ENABLE_EXTRACT_PART(exp); 937 | ENABLE_EXTRACT_PART(sig); 938 | 939 | #undef ENABLE_EXTRACT_PART 940 | 941 | template 942 | class floatx_base { 943 | private: 944 | using backend_float = typename float_traits::backend_float; 945 | using bits_type = typename float_traits::bits_type; 946 | 947 | public: 948 | FLOATX_ATTRIBUTES floatx_base(const backend_float& value) noexcept 949 | : value_(value) 950 | {} 951 | 952 | FLOATX_ATTRIBUTES void initialize() noexcept 953 | { 954 | value_ = this->enforce_rounding(value_); 955 | } 956 | 957 | FLOATX_ATTRIBUTES constexpr operator backend_float() const noexcept 958 | { 959 | return value_; 960 | } 961 | 962 | template 963 | FLOATX_ATTRIBUTES constexpr operator T() const noexcept 964 | { 965 | return T(value_); 966 | } 967 | 968 | private: 969 | FLOATX_ATTRIBUTES const ConcreteFloatX& self() const noexcept 970 | { 971 | return *static_cast(this); 972 | } 973 | 974 | FLOATX_ATTRIBUTES ConcreteFloatX& self() noexcept 975 | { 976 | return *static_cast(this); 977 | } 978 | 979 | static constexpr auto backend_sig_pos = 980 | float_traits::sig_pos; 981 | static constexpr auto backend_exp_pos = 982 | float_traits::exp_pos; 983 | static constexpr auto backend_sgn_pos = 984 | float_traits::sgn_pos; 985 | static constexpr auto backend_sig_mask = 986 | float_traits::sig_mask << backend_sig_pos; 987 | static constexpr auto backend_exp_mask = 988 | float_traits::exp_mask << backend_exp_pos; 989 | static constexpr auto backend_sig_bits = 990 | float_traits::sig_bits; 991 | static constexpr auto backend_exp_bits = 992 | float_traits::exp_bits; 993 | static constexpr auto backend_bias = float_traits::bias; 994 | static constexpr auto backend_sig_overflow_mask = 995 | (float_traits::sig_mask + 1) << backend_sig_pos; 996 | static constexpr auto backend_sgn_mask = 997 | float_traits::sgn_mask << backend_sgn_pos; 998 | 999 | FLOATX_ATTRIBUTES 1000 | backend_float enforce_rounding(backend_float value) 
const noexcept 1001 | { 1002 | const auto exp_bits = get_exp_bits(self()); 1003 | const auto sig_bits = get_sig_bits(self()); 1004 | bits_type bits = reinterpret_as_bits(value); 1005 | auto sig = (bits & backend_sig_mask) >> backend_sig_pos; 1006 | auto raw_exp = bits & backend_exp_mask; 1007 | const auto sgn = bits & backend_sgn_mask; 1008 | 1009 | int exp = (raw_exp >> backend_exp_pos) - backend_bias; 1010 | 1011 | const int emax = (1 << (exp_bits - 1)) - 1; 1012 | const int emin = 1 - emax; 1013 | 1014 | if (is_nan_or_inf(bits)) { 1015 | bits = convert_nan_or_inf_to_backend(bits, sig_bits); 1016 | } else { 1017 | if (is_small(exp, emin)) { 1018 | convert_subnormal_mantissa_and_exp(bits, sig_bits, emin, exp, 1019 | sig, raw_exp); 1020 | } else { 1021 | sig = round_nearest(sig, backend_sig_bits - sig_bits); 1022 | } 1023 | if (significand_is_out_of_range(sig)) { 1024 | fix_too_large_mantissa(sig_bits, exp, sig, raw_exp); 1025 | } 1026 | if (exponent_is_out_of_range(exp, emax)) { 1027 | bits = assemble_inf_number(sgn); 1028 | } else { 1029 | bits = assemble_regular_number(sgn, sig, raw_exp); 1030 | } 1031 | } 1032 | 1033 | return reinterpret_bits_as(bits); 1034 | } 1035 | 1036 | static constexpr FLOATX_ATTRIBUTES bits_type 1037 | reinterpret_as_bits(backend_float val) 1038 | { 1039 | return *reinterpret_cast(&val); 1040 | } 1041 | 1042 | static constexpr FLOATX_ATTRIBUTES bool significand_is_out_of_range( 1043 | bits_type sig) 1044 | { 1045 | return sig >= backend_sig_overflow_mask; 1046 | } 1047 | 1048 | static constexpr FLOATX_ATTRIBUTES bool exponent_is_out_of_range(int exp, 1049 | int emax) 1050 | { 1051 | return exp > emax; 1052 | } 1053 | 1054 | protected: 1055 | backend_float value_; 1056 | }; 1057 | 1058 | 1059 | } // namespace detail 1060 | 1061 | 1062 | template 1063 | FLOATX_ATTRIBUTES FLOATX_INLINE std::string bitstring(const Float& x) noexcept 1064 | { 1065 | using bf = typename float_traits::backend_float; 1066 | const uint64_t one = UINT64_C(1); 1067 
| const char map[] = {'0', '1'}; 1068 | const int eb = get_exp_bits(x); 1069 | const int sb = get_sig_bits(x); 1070 | const int beb = get_exp_bits(bf(x)); 1071 | const int bsb = get_sig_bits(bf(x)); 1072 | 1073 | std::string s(sb + eb + 3, '-'); 1074 | auto sgn = detail::extract_sgn_bits(bf(x)); 1075 | auto exp = detail::extract_exp_bits(bf(x)); 1076 | auto sig = detail::extract_sig_bits(bf(x)); 1077 | 1078 | int i = 0; 1079 | s[i++] = map[bool(sgn & UINT64_C(1))]; // sign bit 1080 | ++i; // leave '-' between sign and exponent parts 1081 | s[i++] = map[bool(exp & (one << (beb - 1)))]; // bias bit 1082 | for (auto mask = (one << (eb - 2)); mask > 0; mask >>= 1) { 1083 | s[i++] = map[bool(exp & mask)]; 1084 | } 1085 | ++i; // leave '-' between exponent and significand parts 1086 | for (auto mask = (one << (bsb - 1)); i < s.size(); mask >>= 1) { 1087 | s[i++] = map[bool(sig & mask)]; 1088 | } 1089 | return s; 1090 | } 1091 | 1092 | 1093 | }; // namespace flx 1094 | 1095 | 1096 | #undef FLOATX_ATTRIBUTES 1097 | #undef FLOATX_INLINE 1098 | #undef USE_BUILTINS 1099 | 1100 | 1101 | #endif // FLOATX_FLOATX_HPP_ 1102 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | function(create_test test_name) # builds <test_name>.cpp into an executable linked against floatx and gtest_main, then registers it as a test 2 | add_executable(${test_name} ${test_name}.cpp) 3 | target_link_libraries(${test_name} PRIVATE floatx gtest_main) 4 | file(RELATIVE_PATH REL_BINARY_DIR 5 | "${PROJECT_BINARY_DIR}" "${CMAKE_CURRENT_BINARY_DIR}") 6 | add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${test_name}) # test name is prefixed with this directory's path relative to the project binary dir 7 | endfunction() 8 | 9 | add_library(ieee_helper STATIC IEEEHelper.cpp) 10 | 11 | create_test(conversion) 12 | create_test(assignment) 13 | create_test(rel_ops) 14 | create_test(arithmetic) 15 | create_test(stream) 16 | create_test(std_integration) 17 | create_test(NanInf) 18 | create_test(round_nearest) 19 | create_test(value_representation) 20 |
target_link_libraries(value_representation PRIVATE ieee_helper) 21 | create_test(value_representation_half) 22 | target_link_libraries(value_representation_half PRIVATE ieee_helper) 23 | create_test(value_representation_bits) 24 | target_link_libraries(value_representation_bits PRIVATE ieee_helper) 25 | -------------------------------------------------------------------------------- /test/IEEEHelper.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | */ 17 | 18 | #include "IEEEHelper.h" 19 | 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | void IEEEHelper::showConfig() 30 | { 31 | printf("Configuration of format<%u,%u>:\n", _e, _m); 32 | printf(" p = %u\n", _m + 1); 33 | printf(" emax = %i\n", getEmax()); 34 | printf(" emin = %i\n", getEmin()); 35 | printf(" bias = %i\n", getBias()); 36 | printf("Limits:\n"); 37 | printf(" max: %5e \t = %.20f\n", maxValue(), maxValue()); 38 | printf(" sNr: %5e \t = %.20f\n", smallestNormalValue(), 39 | smallestNormalValue()); 40 | printf(" lSN: %5e \t = %.20f\n", maxSubnormalValue(), maxSubnormalValue()); 41 | printf(" sSN: %5e \t = %.20f\n", minSubnormalValue(), minSubnormalValue()); 42 | printf("Cases (one-sided):\n"); 43 | printf(" Normal: %i \t = %u*%u\n", countNormalRange(), countExpRange(), 44 | countSubnormalRange()); 45 | printf(" Subnormal: %i\n", countSubnormalRange()); 46 | printf(" NAN/INFs: %i\n", countSubnormalRange()); 47 | printf(" 2^(E+M) = %i = (sum over #cases one side) = %u\n", 48 | (int)pow(2, _e + _m), 49 | countNormalRange() + 2 * countSubnormalRange()); 50 | printf("\n"); 51 | } 52 | 53 | double IEEEHelper::iterateNormalRange(int ie, int im) 54 | { 55 | assert(ie >= 0); 56 | assert(ie < _NnormalExp); 57 | assert(im >= 0); 58 | assert(im < _Nsubnormal); 59 | 60 | double m = 1.0 + im * pow(2.0, -_m); 61 | // printf("im = %i, m = %f\n", im, m ); 62 | return pow(2.0, ie + getEmin()) * m; 63 | } 64 | 65 | double IEEEHelper::iterateSubnormalRange(int im) 66 | { 67 | assert(im >= 0); 68 | assert(im < _Nsubnormal); 69 | 70 | double m = 0.0 + im * pow(2.0, -_m); 71 | return pow(2.0, getEmin()) * m; 72 | } 73 | 74 | void show(uint64_t u) 75 | { 76 | printf("%016llx\t", u); 77 | std::cout << std::bitset<64>(u) << std::endl; 78 | } 79 | 80 | #define CAST_DOUBLE_TO_UINT64(d) (*((uint64_t*)(&(d)))) 81 | #define CAST_UINT64_TO_DOUBLE(d) (*((double*)(&(d)))) 82 | 83 | void show(double d) 84 | { 85 | 
printf("%.20e\t", d); 86 | uint64_t u = CAST_DOUBLE_TO_UINT64(d); 87 | printf("0x%016llx\t", u); 88 | std::cout << std::bitset<64>(u) << std::endl; 89 | } 90 | 91 | void showTable(IEEEHelper& h) 92 | { 93 | int ne = h.countExpRange(); 94 | int nm = h.countSubnormalRange(); 95 | 96 | printf("Subnormal Range:\n"); 97 | 98 | for (int im = 0; im < nm; ++im) { 99 | double d = h.iterateSubnormalRange(im); 100 | printf("%5i/%5i: \t %.20e \t %f \n", im, nm, d, d); 101 | } 102 | 103 | printf("Normal Range:\n"); 104 | for (int ie = 0; ie < ne; ++ie) { 105 | for (int im = 0; im < nm; ++im) { 106 | double d = h.iterateNormalRange(ie, im); 107 | printf("(%5i,%5i)/(%5i,%5i): \t %.20e \t %f \n", ie, im, ne, nm, d, 108 | d); 109 | } 110 | } 111 | } 112 | 113 | 114 | // int main(int argc, char **argv) 115 | // { 116 | // if( argc != 2+1 ) 117 | // { 118 | // printf("Usage: %s \n", argv[0]); 119 | // exit(1); 120 | // } 121 | 122 | // int e = atoi( argv[1]); 123 | // int m = atoi( argv[2]); 124 | 125 | // IEEEHelper h = IEEEHelper(e,m); 126 | 127 | // h.showConfig(); 128 | // show( h ); 129 | 130 | // return 0; 131 | // } 132 | -------------------------------------------------------------------------------- /test/IEEEHelper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | // ### IEEE 754 HELPER FUNCTIONS 6 | 7 | class IEEEHelper { 8 | private: 9 | int _e; 10 | int _m; 11 | int _emax; 12 | int _emin; 13 | 14 | long long int _Nnormal; 15 | long long int _NnormalExp; 16 | long long int _Nsubnormal; 17 | 18 | public: 19 | IEEEHelper(int e, int m) 20 | { 21 | _e = e; 22 | _m = m; 23 | 24 | _Nnormal = (getEmax() - getEmin() + 1) * pow(2.0, _m); 25 | _NnormalExp = (getEmax() - getEmin() + 1); 26 | _Nsubnormal = pow(2.0, _m); 27 | } 28 | 29 | inline int getEmax() { return pow(2.0, _e - 1) - 1; } 30 | inline int getEmin() { return 1 - getEmax(); } 31 | inline int getBias() { return getEmax(); } 32 | 33 | inline
double getMmin() { return pow(2.0, -_m); } // use p = m+1 34 | inline double getMmaxNormal() { return 2 - pow(2.0, -_m); } // use p = m+1 35 | inline double getMmaxSubnormal() 36 | { 37 | return 1 - pow(2.0, -_m); 38 | } // use p = m+1 39 | 40 | inline double maxValue() { return pow(2.0, getEmax()) * getMmaxNormal(); } 41 | inline double smallestNormalValue() { return pow(2.0, getEmin()); } 42 | inline double maxSubnormalValue() 43 | { 44 | return pow(2.0, getEmin()) * getMmaxSubnormal(); 45 | } 46 | inline double minSubnormalValue() 47 | { 48 | return pow(2.0, getEmin()) * getMmin(); 49 | } 50 | 51 | void showConfig(); 52 | 53 | int countNormalRange() { return _Nnormal; } 54 | int countExpRange() { return _NnormalExp; } 55 | int countSubnormalRange() { return _Nsubnormal; } 56 | 57 | double iterateNormalRange(int ie, int im); 58 | double iterateSubnormalRange(int i); 59 | }; 60 | 61 | void show(double d); 62 | void showTable(IEEEHelper& h); 63 | -------------------------------------------------------------------------------- /test/NanInf.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | */ 17 | 18 | #include 19 | #include 20 | 21 | // #include // defines NAN 22 | // #define nan NAN 23 | #define nan double(0.0 / 0.0) 24 | #define inf double(1.0 / 0.0) 25 | 26 | namespace { 27 | 28 | void show(double d) 29 | { 30 | printf("%.20e\t", d); 31 | uint64_t u = flx::detail::reinterpret_as_bits(d); 32 | printf("0x%016llx\t", u); 33 | std::cout << std::bitset<64>(u) << std::endl; 34 | } 35 | 36 | 37 | // System representation of nan's. 38 | TEST(FloatxNanInfTest, system_nans) 39 | { 40 | double constnan = 0.0 / 0.0; 41 | printf("constnan: "); 42 | show((double)constnan); 43 | 44 | double zero; 45 | // try to prevent the compiler to figure out what 46 | // dynamicnan should be in the compile time 47 | *(double*)memset(&zero, ~0, sizeof(zero)) = 0.0; 48 | double dynamicnan = zero / zero; 49 | printf("dynamicnan: "); 50 | show((double)dynamicnan); 51 | 52 | EXPECT_NE(constnan, dynamicnan); // holds only for NANs 53 | EXPECT_NE(constnan, nan); // holds only for NANs 54 | EXPECT_NE(dynamicnan, nan); // holds only for NANs 55 | } 56 | 57 | // See Intel 64 and IA-32 Architectures Software Developer's Manual 58 | // Vol. 1, Appendix E Sect. 4.2.2 Table E-1 for a discussion of a 59 | // type of NaN returned for an invalid operation (e.g., 0/0). It 60 | // seems that always a particular encoding ("QNaN indefinite") is 61 | // used in such cases, but what happens generally (see TODOs below)? 62 | 63 | // A NAN CASE 64 | TEST(FloatxNanInfTest, cast_nans) 65 | { 66 | using T1 = flx::floatx<2, 3>; 67 | using T2 = flx::floatx<10, 50>; 68 | T1 a = 0.0 / 0.0; 69 | T2 b = 0.0; 70 | b = a; 71 | 72 | double constnan = 73 | nan; // note, the way how that nan is generated is relevant! 74 | 75 | EXPECT_NE(a, a); // holds only for NANs 76 | EXPECT_NE(a, nan); // holds only for NANs 77 | // TODO: is the following expectation true generally? 
78 | EXPECT_EQ(*reinterpret_cast(&a), 79 | *reinterpret_cast(&constnan)); 80 | 81 | EXPECT_NE(b, b); // holds only for NANs 82 | EXPECT_NE(b, nan); // holds only for NANs 83 | // TODO: is the following expectation true generally? 84 | EXPECT_EQ(*reinterpret_cast(&b), 85 | *reinterpret_cast(&constnan)); 86 | 87 | // Different bit representations for nans 88 | // TODO: is the following expectation true generally? 89 | EXPECT_EQ(*reinterpret_cast(&b), 90 | *reinterpret_cast(&a)); 91 | } 92 | 93 | // A NAN CASE 94 | TEST(FloatxNanInfTest, DIV_2_47_simple) 95 | { 96 | using T = flx::floatx<2, 47>; 97 | T a = -( 98 | 7.105427e-15 / 2 - 99 | 1e-17); // a bit smaller than half of the smallest subnormal in <2,47> 100 | T b = -(7.105427e-15 / 12.0); 101 | T c = 0; 102 | c = a / b; 103 | EXPECT_EQ(double(a), 0.00000000000000000000); 104 | EXPECT_EQ(double(b), 0.00000000000000000000); 105 | EXPECT_NE(c, c); // holds only for NANs 106 | EXPECT_NE(c, nan); // holds only for NANs 107 | 108 | double zero; 109 | // try to prevent the compiler from figuring out what 110 | // dynamicnan should be at compile time 111 | *(double*)memset(&zero, ~0, sizeof(zero)) = 0.0; 112 | double dynamicnan = zero / zero; 113 | 114 | EXPECT_NE(c, dynamicnan); // holds only for NANs 115 | // TODO: is the following expectation true generally? 116 | EXPECT_EQ(*reinterpret_cast(&c), 117 | *reinterpret_cast(&dynamicnan)); 118 | } 119 | 120 | // A REGULAR CASE (fixing in subnormal does not cause the inf case here) 121 | TEST(FloatxNanInfTest, DIV_3_3_simple) 122 | { 123 | using T = flx::floatx<3, 3>; 124 | T a = 0.33333333333333331483; 125 | T b = 0.11111111111111110494; 126 | T c = 0; 127 | c = a / b; 128 | EXPECT_EQ(double(a), 3.43750000000000000000e-01); 129 | EXPECT_EQ(double(b), 1.25000000000000000000e-01); 130 | EXPECT_EQ(double(c), 2.7500000000000000000e-00); 131 | } 132 | 133 | // An INF CASE.
134 | TEST(FloatxNanInfTest, DIV_3_3_simple_inf) 135 | { 136 | using T = flx::floatx<3, 3>; 137 | T a = 0.33333333333333331483; 138 | T b = 139 | (0.03125000000000000000 / 2 - 140 | 1e-17); // a bit smaller than half of the smallest subnormal in <3,3> 141 | T c = 0; 142 | c = a / b; 143 | 144 | // printf("a: "); show((double)a); 145 | // printf("b: "); show((double)b); 146 | // printf("c: "); show((double)c); 147 | // printf("inf: "); show((double)inf); 148 | 149 | EXPECT_EQ(double(a), 3.43750000000000000000e-01); 150 | EXPECT_EQ(double(b), 0.00000000000000000000); // b is expected to round to zero; floating literal, not an octal int 151 | EXPECT_EQ(double(c), inf); // dividing by the zero-rounded b is expected to give +inf 152 | } 153 | 154 | } // namespace 155 | -------------------------------------------------------------------------------- /test/arithmetic.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License.
16 | */ 17 | 18 | #include 19 | #include 20 | 21 | 22 | namespace { 23 | 24 | 25 | TEST(FloatxArithmeticTest, ResultHasCorrectType) 26 | { 27 | using doublex = flx::floatx<11, 52>; 28 | using floatx = flx::floatx<8, 23>; 29 | 30 | ::testing::StaticAssertTypeEq(); 31 | ::testing::StaticAssertTypeEq(); 32 | ::testing::StaticAssertTypeEq(); 33 | ::testing::StaticAssertTypeEq(); 34 | 35 | ::testing::StaticAssertTypeEq(); 36 | ::testing::StaticAssertTypeEq(); 37 | ::testing::StaticAssertTypeEq(); 38 | ::testing::StaticAssertTypeEq(); 39 | 40 | doublex dlhs; 41 | ::testing::StaticAssertTypeEq(); 42 | ::testing::StaticAssertTypeEq(); 43 | ::testing::StaticAssertTypeEq(); 44 | ::testing::StaticAssertTypeEq(); 45 | floatx flhs; 46 | ::testing::StaticAssertTypeEq(); 47 | ::testing::StaticAssertTypeEq(); 48 | ::testing::StaticAssertTypeEq(); 49 | ::testing::StaticAssertTypeEq(); 50 | } 51 | 52 | 53 | TEST(FloatxArithmeticTest, PromotesTypes) 54 | { 55 | using flx1 = flx::floatx<9, 7>; 56 | using flx2 = flx::floatx<6, 13>; 57 | using supertype = flx::floatx<9, 13>; 58 | ::testing::StaticAssertTypeEq(); 59 | ::testing::StaticAssertTypeEq(); 60 | ::testing::StaticAssertTypeEq(); 61 | ::testing::StaticAssertTypeEq(); 62 | 63 | flx1 flhs; 64 | ::testing::StaticAssertTypeEq(); 65 | ::testing::StaticAssertTypeEq(); 66 | ::testing::StaticAssertTypeEq(); 67 | ::testing::StaticAssertTypeEq(); 68 | 69 | using flx3 = flx::floatx<9, 23>; 70 | ::testing::StaticAssertTypeEq(); 71 | ::testing::StaticAssertTypeEq(); 72 | ::testing::StaticAssertTypeEq(); 73 | ::testing::StaticAssertTypeEq(); 74 | 75 | ::testing::StaticAssertTypeEq(); 76 | ::testing::StaticAssertTypeEq(); 77 | ::testing::StaticAssertTypeEq(); 78 | ::testing::StaticAssertTypeEq(); 79 | 80 | using doublex = flx::floatx<11, 52>; 81 | ::testing::StaticAssertTypeEq(); 82 | ::testing::StaticAssertTypeEq(); 83 | ::testing::StaticAssertTypeEq(); 84 | ::testing::StaticAssertTypeEq(); 85 | 86 | ::testing::StaticAssertTypeEq(); 87 | 
::testing::StaticAssertTypeEq(); 88 | ::testing::StaticAssertTypeEq(); 89 | ::testing::StaticAssertTypeEq(); 90 | 91 | ::testing::StaticAssertTypeEq(); 92 | ::testing::StaticAssertTypeEq(); 93 | ::testing::StaticAssertTypeEq(); 94 | ::testing::StaticAssertTypeEq(); 95 | 96 | ::testing::StaticAssertTypeEq(); 97 | ::testing::StaticAssertTypeEq(); 98 | ::testing::StaticAssertTypeEq(); 99 | ::testing::StaticAssertTypeEq(); 100 | } 101 | 102 | 103 | TEST(FloatxrArithmeticTest, ResultHasCorrectType) 104 | { 105 | auto fxr = []() { return flx::floatxr<>(8, 23); }; 106 | ::testing::StaticAssertTypeEq, decltype(fxr() + fxr())>(); 107 | ::testing::StaticAssertTypeEq, decltype(fxr() - fxr())>(); 108 | ::testing::StaticAssertTypeEq, decltype(fxr() * fxr())>(); 109 | ::testing::StaticAssertTypeEq, decltype(fxr() / fxr())>(); 110 | 111 | flx::floatxr<> dlhs(8, 23); 112 | ::testing::StaticAssertTypeEq&, decltype(dlhs += fxr())>(); 113 | ::testing::StaticAssertTypeEq&, decltype(dlhs -= fxr())>(); 114 | ::testing::StaticAssertTypeEq&, decltype(dlhs *= fxr())>(); 115 | ::testing::StaticAssertTypeEq&, decltype(dlhs /= fxr())>(); 116 | } 117 | 118 | 119 | TEST(FloatxrArithmeticTest, PromotesTypes) 120 | { 121 | auto fxr = []() { return flx::floatxr<>(8, 23); }; 122 | using floatx = flx::floatx<9, 12>; 123 | ::testing::StaticAssertTypeEq, decltype(fxr() + floatx())>(); 124 | ::testing::StaticAssertTypeEq, decltype(fxr() - floatx())>(); 125 | ::testing::StaticAssertTypeEq, decltype(fxr() * floatx())>(); 126 | ::testing::StaticAssertTypeEq, decltype(fxr() / floatx())>(); 127 | 128 | flx::floatxr<> dlhs(8, 23); 129 | ::testing::StaticAssertTypeEq&, 130 | decltype(dlhs += floatx())>(); 131 | ::testing::StaticAssertTypeEq&, 132 | decltype(dlhs -= floatx())>(); 133 | ::testing::StaticAssertTypeEq&, 134 | decltype(dlhs *= floatx())>(); 135 | ::testing::StaticAssertTypeEq&, 136 | decltype(dlhs /= floatx())>(); 137 | 138 | ::testing::StaticAssertTypeEq, decltype(fxr() + double())>(); 139 | 
::testing::StaticAssertTypeEq, decltype(fxr() - double())>(); 140 | ::testing::StaticAssertTypeEq, decltype(fxr() * double())>(); 141 | ::testing::StaticAssertTypeEq, decltype(fxr() / double())>(); 142 | 143 | ::testing::StaticAssertTypeEq&, 144 | decltype(dlhs += double())>(); 145 | ::testing::StaticAssertTypeEq&, 146 | decltype(dlhs -= double())>(); 147 | ::testing::StaticAssertTypeEq&, 148 | decltype(dlhs *= double())>(); 149 | ::testing::StaticAssertTypeEq&, 150 | decltype(dlhs /= double())>(); 151 | 152 | ::testing::StaticAssertTypeEq, decltype(fxr() + int())>(); 153 | ::testing::StaticAssertTypeEq, decltype(fxr() - int())>(); 154 | ::testing::StaticAssertTypeEq, decltype(fxr() * int())>(); 155 | ::testing::StaticAssertTypeEq, decltype(fxr() / int())>(); 156 | 157 | ::testing::StaticAssertTypeEq&, decltype(dlhs += int())>(); 158 | ::testing::StaticAssertTypeEq&, decltype(dlhs -= int())>(); 159 | ::testing::StaticAssertTypeEq&, decltype(dlhs *= int())>(); 160 | ::testing::StaticAssertTypeEq&, decltype(dlhs /= int())>(); 161 | } 162 | 163 | 164 | } // namespace 165 | -------------------------------------------------------------------------------- /test/assignment.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | */ 17 | 18 | #include 19 | #include 20 | 21 | 22 | namespace { 23 | 24 | 25 | TEST(FloatxAssignmentTest, PreservesPrecision) 26 | { 27 | const double val = 1.0 + 1e-15; 28 | flx::floatx<11, 52> fx_val; 29 | fx_val = val; 30 | EXPECT_EQ(val, double(fx_val)); 31 | } 32 | 33 | TEST(FloatxAssignmentTest, LowersPrecision) 34 | { 35 | const double val = 1.0 + 1e-15; 36 | flx::floatx<8, 23> fx_val; 37 | fx_val = val; 38 | EXPECT_NE(val, double(fx_val)); // round to float 39 | } 40 | 41 | 42 | TEST(FloatxAssignmentTest, AssignsBetweenFormats) 43 | { 44 | const double val = 1.0 + 1e-15; 45 | flx::floatx<11, 52> d_val(val); 46 | flx::floatx<8, 23> s_val; 47 | s_val = d_val; 48 | EXPECT_NE(val, double(s_val)); 49 | EXPECT_EQ(float(val), float(s_val)); 50 | } 51 | 52 | 53 | TEST(FloatxrAssignmentTest, PreservesPrecision) 54 | { 55 | const double val = 1.0 + 1e-15; 56 | flx::floatxr<> fx_val(11, 52); 57 | fx_val = val; 58 | EXPECT_EQ(val, double(fx_val)); 59 | } 60 | 61 | TEST(FloatxrAssignmentTest, LowersPrecision) 62 | { 63 | const double val = 1.0 + 1e-15; 64 | flx::floatxr<> fx_val(8, 23); 65 | fx_val = val; 66 | EXPECT_NE(val, double(fx_val)); // round to float 67 | } 68 | 69 | 70 | TEST(FloatxrAssignmentTest, AssignsBetweenFormats) 71 | { 72 | const double val = 1.0 + 1e-15; 73 | flx::floatx<11, 52> d_val(val); 74 | flx::floatxr<> s_val(8, 23); 75 | s_val = d_val; 76 | EXPECT_NE(val, double(s_val)); 77 | EXPECT_EQ(float(val), float(s_val)); 78 | } 79 | 80 | 81 | } // namespace 82 | -------------------------------------------------------------------------------- /test/conversion.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | 20 | 21 | #include 22 | #include 23 | 24 | 25 | namespace { 26 | 27 | 28 | TEST(FloatxConversionTest, PreservesDoublePrecision) 29 | { 30 | const double val = 1.0 + 1e-15; 31 | EXPECT_EQ(val, double(flx::floatx<11, 52>(val))); 32 | } 33 | 34 | TEST(FloatxConversionTest, LowersPrecision) 35 | { 36 | const double val = 1.0 + 1e-15; 37 | EXPECT_NE(val, double(flx::floatx<8, 23>(val))); // round to float 38 | } 39 | 40 | 41 | TEST(FloatxConversionTest, HandlesDenormals) 42 | { 43 | EXPECT_EQ(0.25, double(flx::floatx<2, 3>(0.25))); 44 | EXPECT_EQ(0.75, double(flx::floatx<2, 3>(0.75))); 45 | } 46 | 47 | 48 | TEST(FloatxConversionTest, ConvertsBetweenFloatX) 49 | { 50 | const double val = 1.0 + 1e-15; 51 | flx::floatx<11, 52> d_val(val); 52 | flx::floatx<8, 23> s_val(d_val); 53 | EXPECT_NE(val, double(s_val)); 54 | EXPECT_EQ(float(val), float(s_val)); 55 | } 56 | 57 | TEST(FloatxConversionTest, ConvertsToBits) 58 | { 59 | const double val = 1.0 + 1e-15; 60 | ::testing::StaticAssertTypeEq, 61 | decltype(bits(flx::floatx<11, 52>(val)))>(); 62 | ::testing::StaticAssertTypeEq, 63 | decltype(bits(flx::floatx<8, 23>(val)))>(); 64 | EXPECT_EQ(flx::bits(val), bits(flx::floatx<11, 52>(val))); 65 | EXPECT_NE(flx::bits(val), bits(flx::floatx<8, 23>(val))); 66 | } 67 | 68 | TEST(FloatxConversionTest, ConvertsToString) 69 | { 70 | flx::floatx<4, 5> val1 = 1.0; 71 | EXPECT_EQ("0-0111-00000", bitstring(val1)); 72 | flx::floatx<3, 2> val2 = 1.75; 73 | EXPECT_EQ("0-011-11", bitstring(val2)); 74 | flx::floatx<5, 7> val3 = 0.0; 75 | 
EXPECT_EQ("0-00000-0000000", bitstring(val3)); 76 | } 77 | 78 | TEST(FloatxrConversionTest, PreservesDoublePrecision) 79 | { 80 | const double val = 1.0 + 1e-15; 81 | EXPECT_EQ(val, double(flx::floatxr<>(11, 52, val))); 82 | } 83 | 84 | TEST(FloatxrConversionTest, LowersPrecision) 85 | { 86 | const double val = 1.0 + 1e-15; 87 | EXPECT_NE(val, double(flx::floatxr<>(8, 23, val))); // round to float 88 | } 89 | 90 | TEST(FloatxrConversionTest, InheritsPrecision) 91 | { 92 | const double val = 1.0 + 1e-15; 93 | EXPECT_EQ(val, double(flx::floatxr<>(val))); 94 | } 95 | 96 | TEST(FloatxrConversionTest, ChangesPrecision) 97 | { 98 | const double val = 1.0 + 1e-15; 99 | flx::floatxr<> fxr_val(val); 100 | fxr_val.set_precision(8, 23); 101 | EXPECT_NE(val, double(fxr_val)); 102 | EXPECT_EQ(float(val), float(fxr_val)); 103 | } 104 | 105 | TEST(FloatxrConversionTest, ConvertsBetweenFloatX) 106 | { 107 | const double val = 1.0 + 1e-15; 108 | flx::floatx<11, 52> d_val(val); 109 | flx::floatxr<> s_val(8, 23, d_val); 110 | EXPECT_NE(val, double(s_val)); 111 | EXPECT_EQ(float(val), float(s_val)); 112 | } 113 | 114 | TEST(FloatxrConversionTest, ConvertsToBits) 115 | { 116 | const double val = 1.0 + 1e-15; 117 | ::testing::StaticAssertTypeEq, 118 | decltype(bits(flx::floatxr<>(val)))>(); 119 | EXPECT_EQ(flx::bits(val), bits(flx::floatxr<>(val))); 120 | EXPECT_NE(flx::bits(val), bits(flx::floatxr<>(8, 23, val))); 121 | } 122 | 123 | 124 | TEST(FloatxrConversionTest, ConvertsToString) 125 | { 126 | flx::floatxr<> val1(4, 5, 1.0); 127 | EXPECT_EQ("0-0111-00000", bitstring(val1)); 128 | flx::floatxr<> val2(3, 2, 1.75); 129 | EXPECT_EQ("0-011-11", bitstring(val2)); 130 | flx::floatxr<> val3(5, 7, 0.0); 131 | EXPECT_EQ("0-00000-0000000", bitstring(val3)); 132 | } 133 | 134 | } // namespace 135 | -------------------------------------------------------------------------------- /test/rel_ops.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 
Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | 21 | 22 | namespace { 23 | 24 | 25 | TEST(FloatxRelOpsTest, Equal) 26 | { 27 | using doublex = flx::floatx<11, 52>; 28 | using floatx = flx::floatx<8, 23>; 29 | const double val1 = 1.0 + 1e-15; 30 | const double val2 = 1.0 + 2e-15; 31 | EXPECT_TRUE(doublex(val1) == doublex(val1)); 32 | EXPECT_FALSE(doublex(val1) == doublex(val2)); 33 | EXPECT_TRUE(floatx(val1) == floatx(val1)); 34 | EXPECT_TRUE(floatx(val1) == floatx(val2)); // due to rounding 35 | EXPECT_FALSE(floatx(val1) == doublex(val1)); // due to rounding 36 | EXPECT_FALSE(floatx(val1) == doublex(val2)); // due to rounding 37 | } 38 | 39 | 40 | TEST(FloatxRelOpsTest, NotEqual) 41 | { 42 | using doublex = flx::floatx<11, 52>; 43 | using floatx = flx::floatx<8, 23>; 44 | const double val1 = 1.0 + 1e-15; 45 | const double val2 = 1.0 + 2e-15; 46 | EXPECT_FALSE(doublex(val1) != doublex(val1)); 47 | EXPECT_TRUE(doublex(val1) != doublex(val2)); 48 | EXPECT_FALSE(floatx(val1) != floatx(val1)); 49 | EXPECT_FALSE(floatx(val1) != floatx(val2)); // due to rounding 50 | EXPECT_TRUE(floatx(val1) != doublex(val1)); // due to rounding 51 | EXPECT_TRUE(floatx(val1) != doublex(val2)); // due to rounding 52 | } 53 | 54 | 55 | TEST(FloatxRelOpsTest, LessThan) 56 | { 57 | using doublex = flx::floatx<11, 52>; 58 
| using floatx = flx::floatx<8, 23>; 59 | const double val1 = 1.0 + 1e-15; 60 | const double val2 = 1.0 + 2e-15; 61 | EXPECT_FALSE(doublex(val1) < doublex(val1)); 62 | EXPECT_FALSE(doublex(val2) < doublex(val1)); 63 | EXPECT_TRUE(doublex(val1) < doublex(val2)); 64 | EXPECT_FALSE(floatx(val1) < floatx(val1)); 65 | EXPECT_FALSE(floatx(val2) < floatx(val1)); 66 | EXPECT_FALSE(floatx(val1) < floatx(val2)); // due to rounding 67 | EXPECT_TRUE(floatx(val1) < doublex(val1)); // due to rounding 68 | EXPECT_TRUE(floatx(val2) < doublex(val1)); // due to rounding 69 | EXPECT_TRUE(floatx(val1) < doublex(val2)); // due to rounding 70 | } 71 | 72 | 73 | TEST(FloatxRelOpsTest, LessOrEqual) 74 | { 75 | using doublex = flx::floatx<11, 52>; 76 | using floatx = flx::floatx<8, 23>; 77 | const double val1 = 1.0 + 1e-15; 78 | const double val2 = 1.0 + 2e-15; 79 | EXPECT_TRUE(doublex(val1) <= doublex(val1)); 80 | EXPECT_FALSE(doublex(val2) <= doublex(val1)); 81 | EXPECT_TRUE(doublex(val1) <= doublex(val2)); 82 | EXPECT_TRUE(floatx(val1) <= floatx(val1)); 83 | EXPECT_TRUE(floatx(val2) <= floatx(val1)); // due to rounding 84 | EXPECT_TRUE(floatx(val1) <= floatx(val2)); 85 | EXPECT_TRUE(floatx(val1) <= doublex(val1)); 86 | EXPECT_TRUE(floatx(val2) <= doublex(val1)); // due to rounding 87 | EXPECT_TRUE(floatx(val1) <= doublex(val2)); 88 | } 89 | 90 | 91 | TEST(FloatxRelOpsTest, GreaterThan) 92 | { 93 | using doublex = flx::floatx<11, 52>; 94 | using floatx = flx::floatx<8, 23>; 95 | const double val1 = 1.0 + 1e-15; 96 | const double val2 = 1.0 + 2e-15; 97 | EXPECT_FALSE(doublex(val1) > doublex(val1)); 98 | EXPECT_TRUE(doublex(val2) > doublex(val1)); 99 | EXPECT_FALSE(doublex(val1) > doublex(val2)); 100 | EXPECT_FALSE(floatx(val1) > floatx(val1)); 101 | EXPECT_FALSE(floatx(val2) > floatx(val1)); // due to rounding 102 | EXPECT_FALSE(floatx(val1) > floatx(val2)); 103 | EXPECT_FALSE(floatx(val1) > doublex(val1)); 104 | EXPECT_FALSE(floatx(val2) > doublex(val1)); // due to rounding 105 | 
EXPECT_FALSE(floatx(val1) > doublex(val2)); 106 | } 107 | 108 | 109 | TEST(FloatxRelOpsTest, GreaterOrEqual) 110 | { 111 | using doublex = flx::floatx<11, 52>; 112 | using floatx = flx::floatx<8, 23>; 113 | const double val1 = 1.0 + 1e-15; 114 | const double val2 = 1.0 + 2e-15; 115 | EXPECT_TRUE(doublex(val1) >= doublex(val1)); 116 | EXPECT_TRUE(doublex(val2) >= doublex(val1)); 117 | EXPECT_FALSE(doublex(val1) >= doublex(val2)); 118 | EXPECT_TRUE(floatx(val1) >= floatx(val1)); 119 | EXPECT_TRUE(floatx(val2) >= floatx(val1)); // due to rounding 120 | EXPECT_TRUE(floatx(val1) >= floatx(val2)); // due to rounding 121 | EXPECT_FALSE(floatx(val1) >= doublex(val1)); // due to rounding 122 | EXPECT_FALSE(floatx(val2) >= doublex(val1)); // due to rounding 123 | EXPECT_FALSE(floatx(val1) >= doublex(val2)); 124 | } 125 | 126 | 127 | TEST(FloatxrRelOpsTest, Equal) 128 | { 129 | auto doublex = [](double a) { return flx::floatxr<>(11, 52, a); }; 130 | auto floatx = [](double a) { return flx::floatxr<>(8, 23, a); }; 131 | const double val1 = 1.0 + 1e-15; 132 | const double val2 = 1.0 + 2e-15; 133 | EXPECT_TRUE(doublex(val1) == doublex(val1)); 134 | EXPECT_FALSE(doublex(val1) == doublex(val2)); 135 | EXPECT_TRUE(floatx(val1) == floatx(val1)); 136 | EXPECT_TRUE(floatx(val1) == floatx(val2)); // due to rounding 137 | EXPECT_FALSE(floatx(val1) == doublex(val1)); // due to rounding 138 | EXPECT_FALSE(floatx(val1) == doublex(val2)); // due to rounding 139 | } 140 | 141 | 142 | TEST(FloatxrRelOpsTest, NotEqual) 143 | { 144 | auto doublex = [](double a) { return flx::floatxr<>(11, 52, a); }; 145 | auto floatx = [](double a) { return flx::floatxr<>(8, 23, a); }; 146 | const double val1 = 1.0 + 1e-15; 147 | const double val2 = 1.0 + 2e-15; 148 | EXPECT_FALSE(doublex(val1) != doublex(val1)); 149 | EXPECT_TRUE(doublex(val1) != doublex(val2)); 150 | EXPECT_FALSE(floatx(val1) != floatx(val1)); 151 | EXPECT_FALSE(floatx(val1) != floatx(val2)); // due to rounding 152 | EXPECT_TRUE(floatx(val1) 
!= doublex(val1)); // due to rounding 153 | EXPECT_TRUE(floatx(val1) != doublex(val2)); // due to rounding 154 | } 155 | 156 | 157 | TEST(FloatxrRelOpsTest, LessThan) 158 | { 159 | auto doublex = [](double a) { return flx::floatxr<>(11, 52, a); }; 160 | auto floatx = [](double a) { return flx::floatxr<>(8, 23, a); }; 161 | const double val1 = 1.0 + 1e-15; 162 | const double val2 = 1.0 + 2e-15; 163 | EXPECT_FALSE(doublex(val1) < doublex(val1)); 164 | EXPECT_FALSE(doublex(val2) < doublex(val1)); 165 | EXPECT_TRUE(doublex(val1) < doublex(val2)); 166 | EXPECT_FALSE(floatx(val1) < floatx(val1)); 167 | EXPECT_FALSE(floatx(val2) < floatx(val1)); 168 | EXPECT_FALSE(floatx(val1) < floatx(val2)); // due to rounding 169 | EXPECT_TRUE(floatx(val1) < doublex(val1)); // due to rounding 170 | EXPECT_TRUE(floatx(val2) < doublex(val1)); // due to rounding 171 | EXPECT_TRUE(floatx(val1) < doublex(val2)); // due to rounding 172 | } 173 | 174 | 175 | TEST(FloatxrRelOpsTest, LessOrEqual) 176 | { 177 | auto doublex = [](double a) { return flx::floatxr<>(11, 52, a); }; 178 | auto floatx = [](double a) { return flx::floatxr<>(8, 23, a); }; 179 | const double val1 = 1.0 + 1e-15; 180 | const double val2 = 1.0 + 2e-15; 181 | EXPECT_TRUE(doublex(val1) <= doublex(val1)); 182 | EXPECT_FALSE(doublex(val2) <= doublex(val1)); 183 | EXPECT_TRUE(doublex(val1) <= doublex(val2)); 184 | EXPECT_TRUE(floatx(val1) <= floatx(val1)); 185 | EXPECT_TRUE(floatx(val2) <= floatx(val1)); // due to rounding 186 | EXPECT_TRUE(floatx(val1) <= floatx(val2)); 187 | EXPECT_TRUE(floatx(val1) <= doublex(val1)); 188 | EXPECT_TRUE(floatx(val2) <= doublex(val1)); // due to rounding 189 | EXPECT_TRUE(floatx(val1) <= doublex(val2)); 190 | } 191 | 192 | 193 | TEST(FloatxrRelOpsTest, GreaterThan) 194 | { 195 | auto doublex = [](double a) { return flx::floatxr<>(11, 52, a); }; 196 | auto floatx = [](double a) { return flx::floatxr<>(8, 23, a); }; 197 | const double val1 = 1.0 + 1e-15; 198 | const double val2 = 1.0 + 2e-15; 199 | 
EXPECT_FALSE(doublex(val1) > doublex(val1)); 200 | EXPECT_TRUE(doublex(val2) > doublex(val1)); 201 | EXPECT_FALSE(doublex(val1) > doublex(val2)); 202 | EXPECT_FALSE(floatx(val1) > floatx(val1)); 203 | EXPECT_FALSE(floatx(val2) > floatx(val1)); // due to rounding 204 | EXPECT_FALSE(floatx(val1) > floatx(val2)); 205 | EXPECT_FALSE(floatx(val1) > doublex(val1)); 206 | EXPECT_FALSE(floatx(val2) > doublex(val1)); // due to rounding 207 | EXPECT_FALSE(floatx(val1) > doublex(val2)); 208 | } 209 | 210 | 211 | TEST(FloatxrRelOpsTest, GreaterOrEqual) 212 | { 213 | auto doublex = [](double a) { return flx::floatxr<>(11, 52, a); }; 214 | auto floatx = [](double a) { return flx::floatxr<>(8, 23, a); }; 215 | const double val1 = 1.0 + 1e-15; 216 | const double val2 = 1.0 + 2e-15; 217 | EXPECT_TRUE(doublex(val1) >= doublex(val1)); 218 | EXPECT_TRUE(doublex(val2) >= doublex(val1)); 219 | EXPECT_FALSE(doublex(val1) >= doublex(val2)); 220 | EXPECT_TRUE(floatx(val1) >= floatx(val1)); 221 | EXPECT_TRUE(floatx(val2) >= floatx(val1)); // due to rounding 222 | EXPECT_TRUE(floatx(val1) >= floatx(val2)); // due to rounding 223 | EXPECT_FALSE(floatx(val1) >= doublex(val1)); // due to rounding 224 | EXPECT_FALSE(floatx(val2) >= doublex(val1)); // due to rounding 225 | EXPECT_FALSE(floatx(val1) >= doublex(val2)); 226 | } 227 | 228 | 229 | } // namespace 230 | -------------------------------------------------------------------------------- /test/round_nearest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | 21 | // Check internal functions. 22 | // Rounding of a uint64_t type value. 23 | // Checks the following routine: 24 | // inline uint64_t SHIFT_RIGHT_ROUND_NEAREST(uint64_t mant, uint16_t SHIFT) 25 | // IMPLEMENTS rounding according the IEEE 754 standard with a NEAREST policy and 26 | // ties are resolved to even. 27 | 28 | TEST(RoundNearest, down) 29 | { 30 | // RRRRRRRRRR 31 | // number: 0x3e6999999999999a 32 | // 0011111001101001100110011001100110011001100110011001100110011010 33 | // >> 10 : 0x000f9a6666666666 34 | // 0000000000001111100110100110011001100110011001100110011001100110 RND 10: 35 | // 0x000f9a6666666666 36 | // 0000000000001111100110100110011001100110011001100110011001100110 37 | uint64_t number = 0x3e6999999999999a; 38 | int shift_amount = 10; 39 | uint64_t expected = 0x000f9a6666666666 << shift_amount; 40 | 41 | EXPECT_EQ(expected, flx::detail::round_nearest(number, shift_amount)); 42 | } 43 | 44 | TEST(RoundNearest, up) 45 | { 46 | // RRRRRRRRRRRR 47 | // number: 0x3e6999999999999a 48 | // 0011111001101001100110011001100110011001100110011001100110011010 49 | // >> 12 : 0x0003e69999999999 50 | // 0000000000000011111001101001100110011001100110011001100110011001 RND 12: 51 | // 0x0003e6999999999a 52 | // 0000000000000011111001101001100110011001100110011001100110011010 53 | uint64_t number = 0x3e6999999999999a; 54 | int shift_amount = 12; 55 | uint64_t expected = 0x0003e6999999999a << shift_amount; 56 | 57 | EXPECT_EQ(expected, flx::detail::round_nearest(number, shift_amount)); 58 
| } 59 | 60 | TEST(RoundNearest, RoundNearestWithTiesToEvenRoundsUp) 61 | { 62 | // RRRR 63 | // number: 0x0ffffffff00000f8 64 | // 0000111111111111111111111111111111110000000000000000000011111000 65 | // >> 4 : 0x00ffffffff00000f 66 | // 0000000011111111111111111111111111111111000000000000000000001111 RND 4: 67 | // 0x00ffffffff000010 68 | // 0000000011111111111111111111111111111111000000000000000000010000 69 | uint64_t number = 0x0ffffffff00000f8; 70 | int shift_amount = 4; 71 | uint64_t expected = 0x00ffffffff000010 << shift_amount; 72 | 73 | EXPECT_EQ(expected, flx::detail::round_nearest(number, shift_amount)); 74 | } 75 | 76 | TEST(RoundNearest, RoundNearestWithTiesToEvenRoundsDown) 77 | { 78 | // RRRR 79 | // number: 0x0ffffffff00000e8 80 | // 0000111111111111111111111111111111110000000000000000000011101000 81 | // >> 4 : 0x00ffffffff00000e 82 | // 0000000011111111111111111111111111111111000000000000000000001110 RND 4: 83 | // 0x00ffffffff00000e 84 | // 0000000011111111111111111111111111111111000000000000000000001110 85 | uint64_t number = 0x0ffffffff00000e8; 86 | int shift_amount = 4; 87 | uint64_t expected = 0x00ffffffff00000e << shift_amount; 88 | 89 | EXPECT_EQ(expected, flx::detail::round_nearest(number, shift_amount)); 90 | } 91 | -------------------------------------------------------------------------------- /test/std_integration.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | 28 | namespace { 29 | 30 | 31 | using doublex = flx::floatx<11, 52>; 32 | using floatx = flx::floatx<8, 23>; 33 | 34 | 35 | TEST(Tuple, CanCreateFloatXTuple) 36 | { 37 | auto tpl = std::make_tuple(doublex{3.2}, floatx{5.2}); 38 | 39 | ASSERT_NEAR(std::get<0>(tpl), 3.2, 1e-15); 40 | ASSERT_NEAR(std::get<1>(tpl), 5.2, 1e-6); 41 | } 42 | 43 | 44 | TEST(TupleVector, CanCreateVectorOfTuples) 45 | { 46 | std::vector> vec{ 47 | std::make_tuple(doublex{3.2}, floatx{5.2}), 48 | std::make_tuple(doublex{0.5}, floatx{1.2})}; 49 | 50 | ASSERT_NEAR(std::get<0>(vec[0]), 3.2, 1e-15); 51 | ASSERT_NEAR(std::get<1>(vec[0]), 5.2, 1e-6); 52 | ASSERT_NEAR(std::get<0>(vec[1]), 0.5, 1e-15); 53 | ASSERT_NEAR(std::get<1>(vec[1]), 1.2, 1e-6); 54 | } 55 | 56 | 57 | TEST(TupleVector, CanIterateThroughVector) 58 | { 59 | std::vector> vec{ 60 | std::make_tuple(doublex{3.2}, floatx{5.2}), 61 | std::make_tuple(doublex{0.5}, floatx{1.2})}; 62 | 63 | for (auto& elem : vec) { 64 | std::get<0>(elem) += 1; 65 | } 66 | 67 | ASSERT_NEAR(std::get<0>(vec[0]), 4.2, 1e-15); 68 | ASSERT_NEAR(std::get<1>(vec[0]), 5.2, 1e-6); 69 | ASSERT_NEAR(std::get<0>(vec[1]), 1.5, 1e-15); 70 | ASSERT_NEAR(std::get<1>(vec[1]), 1.2, 1e-6); 71 | } 72 | 73 | 74 | TEST(TupleVector, CanSortTupleVector) 75 | { 76 | std::vector> vec{ 77 | std::make_tuple(doublex{3.2}, floatx{5.2}), 78 | std::make_tuple(doublex{0.5}, floatx{1.2})}; 79 | 80 | std::sort(begin(vec), end(vec)); 81 | 82 | 
ASSERT_NEAR(std::get<0>(vec[0]), 0.5, 1e-15); 83 | ASSERT_NEAR(std::get<1>(vec[0]), 1.2, 1e-6); 84 | ASSERT_NEAR(std::get<0>(vec[1]), 3.2, 1e-15); 85 | ASSERT_NEAR(std::get<1>(vec[1]), 5.2, 1e-6); 86 | } 87 | 88 | 89 | // NOTE: this is non-standard behavior, a conformant implementation is allowed 90 | // to have undefined behavior for std::complex> 91 | TEST(Complex, CanCreateComplexFloatX) 92 | { 93 | std::complex a(3.2, 2.5); 94 | 95 | ASSERT_NEAR(a.real(), 3.2, 1e-7); 96 | ASSERT_NEAR(a.imag(), 2.5, 1e-7); 97 | } 98 | 99 | 100 | TEST(Complex, CanAddComplexFloatX) 101 | { 102 | std::complex a(3.2, 2.5); 103 | std::complex b(2.3, 1.4); 104 | 105 | auto res = a + b; 106 | 107 | ASSERT_NEAR(res.real(), 5.5, 1e-7); 108 | ASSERT_NEAR(res.imag(), 3.9, 1e-7); 109 | } 110 | 111 | 112 | TEST(Complex, CanSubstractComplexFloatX) 113 | { 114 | std::complex a(3.2, 2.5); 115 | std::complex b(2.3, 1.4); 116 | 117 | auto res = a - b; 118 | 119 | ASSERT_NEAR(res.real(), 0.9, 1e-7); 120 | ASSERT_NEAR(res.imag(), 1.1, 1e-7); 121 | } 122 | 123 | 124 | TEST(Complex, CanMultiplyComplexFloatX) 125 | { 126 | std::complex a(3.0, 2.0); 127 | std::complex b(2.0, 1.0); 128 | 129 | auto res = a * b; 130 | 131 | ASSERT_NEAR(res.real(), 4.0, 1e-7); 132 | ASSERT_NEAR(res.imag(), 7.0, 1e-7); 133 | } 134 | 135 | 136 | TEST(Complex, CanDivideComplexFloatX) 137 | { 138 | std::complex a(3.0, 2.0); 139 | std::complex b(2.0, 1.0); 140 | 141 | auto res = a / b; 142 | 143 | ASSERT_NEAR(res.real(), 1.6, 1e-7); 144 | ASSERT_NEAR(res.imag(), 0.2, 1e-7); 145 | } 146 | 147 | 148 | } // namespace 149 | -------------------------------------------------------------------------------- /test/stream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 
4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | 20 | 21 | #include 22 | #include 23 | 24 | 25 | namespace { 26 | 27 | 28 | TEST(FloatxStreamTest, WritesToOutputStream) 29 | { 30 | flx::floatx<8, 23> val = 1.0 + 1e-15; 31 | std::stringstream os; 32 | os << val; 33 | EXPECT_EQ("1", os.str()); 34 | } 35 | 36 | 37 | TEST(FloatxStreamTest, ReadsFromOutputStream) 38 | { 39 | flx::floatx<8, 23> val; 40 | std::stringstream is("1.00000000000001"); 41 | is >> val; 42 | EXPECT_EQ(1.0, val); 43 | } 44 | 45 | 46 | TEST(FloatxrStreamTest, WritesToOutputStream) 47 | { 48 | flx::floatxr<> val = 1.0 + 1e-15; 49 | val.set_precision(8, 23); 50 | std::stringstream os; 51 | os << val; 52 | EXPECT_EQ("1", os.str()); 53 | } 54 | 55 | 56 | TEST(FloatxrStreamTest, ReadsFromOutputStream) 57 | { 58 | flx::floatxr<> val(8, 23); 59 | std::stringstream is("1.00000000000001"); 60 | is >> val; 61 | EXPECT_EQ(1.0, val); 62 | } 63 | 64 | 65 | } // namespace 66 | -------------------------------------------------------------------------------- /test/value_representation.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | 21 | #include "IEEEHelper.h" 22 | 23 | // TEST(IEEE_helper, demonstrate ) { 24 | // uint16_t e = 3; 25 | // uint16_t m = 5; 26 | // IEEEHelper h = IEEEHelper(e,m); 27 | // h.showConfig(); 28 | // EXPECT_EQ(1, 1); 29 | // } 30 | 31 | class CheckValidRepresentationOfFloatx5_10 32 | : public ::testing::TestWithParam> {}; 33 | 34 | class CheckValidRepresentationOfFloatx4_8 35 | : public ::testing::TestWithParam> {}; 36 | 37 | template 38 | void test(double d) 39 | { 40 | // Test value d. 41 | T fx = d; 42 | double recoverd = (double)(fx); 43 | 44 | // the recoverd and the original value are required to be the same. 
45 | ASSERT_EQ(recoverd, d); 46 | } 47 | 48 | TEST_P(CheckValidRepresentationOfFloatx5_10, subnormal) 49 | { 50 | uint16_t const e = std::get<0>(GetParam()); 51 | uint16_t const m = std::get<1>(GetParam()); 52 | 53 | IEEEHelper h = IEEEHelper(e, m); 54 | // h.showConfig(); 55 | // printf("Subnormal Range:\n"); 56 | 57 | int nm = h.countSubnormalRange(); 58 | for (int im = 0; im < nm; ++im) { 59 | double d = h.iterateSubnormalRange(im); 60 | test>(d); 61 | } 62 | } 63 | 64 | TEST_P(CheckValidRepresentationOfFloatx5_10, regular) 65 | { 66 | uint16_t const e = std::get<0>(GetParam()); 67 | uint16_t const m = std::get<1>(GetParam()); 68 | 69 | IEEEHelper h = IEEEHelper(e, m); 70 | // h.showConfig(); 71 | // printf("Normal Range:\n"); 72 | 73 | int ne = h.countExpRange(); 74 | int nm = h.countSubnormalRange(); 75 | 76 | for (int ie = 0; ie < ne; ++ie) { 77 | for (int im = 0; im < nm; ++im) { 78 | double d = h.iterateNormalRange(ie, im); 79 | test>(d); 80 | } 81 | } 82 | } 83 | 84 | TEST_P(CheckValidRepresentationOfFloatx4_8, subnormal) 85 | { 86 | uint16_t const e = std::get<0>(GetParam()); 87 | uint16_t const m = std::get<1>(GetParam()); 88 | 89 | IEEEHelper h = IEEEHelper(e, m); 90 | // h.showConfig(); 91 | // printf("Subnormal Range:\n"); 92 | 93 | int nm = h.countSubnormalRange(); 94 | for (int im = 0; im < nm; ++im) { 95 | double d = h.iterateSubnormalRange(im); 96 | test>(d); 97 | } 98 | } 99 | 100 | TEST_P(CheckValidRepresentationOfFloatx4_8, regular) 101 | { 102 | uint16_t const e = std::get<0>(GetParam()); 103 | uint16_t const m = std::get<1>(GetParam()); 104 | 105 | IEEEHelper h = IEEEHelper(e, m); 106 | // h.showConfig(); 107 | // printf("Normal Range:\n"); 108 | 109 | int ne = h.countExpRange(); 110 | int nm = h.countSubnormalRange(); 111 | 112 | for (int ie = 0; ie < ne; ++ie) { 113 | for (int im = 0; im < nm; ++im) { 114 | double d = h.iterateNormalRange(ie, im); 115 | test>(d); 116 | } 117 | } 118 | } 119 | 120 | // SUBSET TEST ON FLOATX TYPE HALF <5,10> 
121 | INSTANTIATE_TEST_CASE_P(TestParams_full_subnormal_range, 122 | CheckValidRepresentationOfFloatx5_10, 123 | testing::Values(::testing::make_tuple(5, 10))); 124 | 125 | INSTANTIATE_TEST_CASE_P( 126 | TestParams_subset_subnormal_range, CheckValidRepresentationOfFloatx5_10, 127 | testing::Values(::testing::make_tuple(2, 3), ::testing::make_tuple(3, 4), 128 | ::testing::make_tuple(4, 2), ::testing::make_tuple(4, 8), 129 | ::testing::make_tuple(5, 8), ::testing::make_tuple(2, 10))); 130 | 131 | // SUBSET TEST ON FLOATX TYPE HALF <4,8> 132 | INSTANTIATE_TEST_CASE_P(TestParams_full_subnormal_range, 133 | CheckValidRepresentationOfFloatx4_8, 134 | testing::Values(::testing::make_tuple(4, 8))); 135 | 136 | INSTANTIATE_TEST_CASE_P(TestParams_subset_subnormal_range, 137 | CheckValidRepresentationOfFloatx4_8, 138 | testing::Values(::testing::make_tuple(2, 3), 139 | ::testing::make_tuple(3, 8), 140 | ::testing::make_tuple(4, 3))); 141 | -------------------------------------------------------------------------------- /test/value_representation_bits.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | */ 17 | 18 | #include 19 | #include 20 | 21 | TEST(MyTestBF, BF_3_5) 22 | { 23 | const uint8_t E = 3; 24 | const uint8_t M = 3; 25 | 26 | uint64_t mx = ((uint64_t)0x1) << (1 + E + M); 27 | for (uint64_t cnt = 0x0; cnt < mx; ++cnt) { 28 | // define input pattern 29 | std::bitset<1 + E + M> pattern(cnt); 30 | 31 | // get a backend number 32 | double bd = flx::detail::construct_number(pattern); 33 | 34 | // check that the backend number is valid. 35 | // e.g. cast to fx and back to double (that should not change its value) 36 | flx::floatx fx = bd; 37 | double r = double(fx); 38 | EXPECT_EQ(*reinterpret_cast(&r), 39 | *reinterpret_cast(&bd)); 40 | 41 | // get the reverse functionallity 42 | std::bitset<1 + E + M> out = 43 | flx::detail::get_fullbit_representation_BS(r); 44 | 45 | // printf("value: %.20e\n", r ); 46 | // std::cout << "IN: " << pattern << std::endl; 47 | // std::cout << "OUT: " << out << std::endl; 48 | 49 | EXPECT_EQ(pattern, out); 50 | } 51 | } 52 | 53 | TEST(MyTestBF, BF_5_2) 54 | { 55 | const uint8_t E = 5; 56 | const uint8_t M = 2; 57 | 58 | uint64_t mx = ((uint64_t)0x1) << (1 + E + M); 59 | for (uint64_t cnt = 0x0; cnt < mx; ++cnt) { 60 | // define input pattern 61 | std::bitset<1 + E + M> pattern(cnt); 62 | 63 | // get a backend number 64 | double bd = flx::detail::construct_number(pattern); 65 | 66 | // check that the backend number is valid. 67 | // e.g. 
cast to fx and back to double (that should not change its value) 68 | flx::floatx fx = bd; 69 | double r = double(fx); 70 | EXPECT_EQ(*reinterpret_cast(&r), 71 | *reinterpret_cast(&bd)); 72 | 73 | // get the reverse functionallity 74 | std::bitset<1 + E + M> out = 75 | flx::detail::get_fullbit_representation_BS(r); 76 | 77 | // printf("value: %.20e\n", r ); 78 | // std::cout << "IN: " << pattern << std::endl; 79 | // std::cout << "OUT: " << out << std::endl; 80 | 81 | EXPECT_EQ(pattern, out); 82 | } 83 | } 84 | 85 | TEST(MyTestBF, BF_5_10) 86 | { 87 | const uint8_t E = 5; 88 | const uint8_t M = 10; 89 | 90 | uint64_t mx = ((uint64_t)0x1) << (1 + E + M); 91 | for (uint64_t cnt = 0x0; cnt < mx; ++cnt) { 92 | // define input pattern 93 | std::bitset<1 + E + M> pattern(cnt); 94 | 95 | // get a backend number 96 | double bd = flx::detail::construct_number(pattern); 97 | 98 | // check that the backend number is valid. 99 | // e.g. cast to fx and back to double (that should not change its value) 100 | flx::floatx fx = bd; 101 | double r = double(fx); 102 | EXPECT_EQ(*reinterpret_cast(&r), 103 | *reinterpret_cast(&bd)); 104 | 105 | // get the reverse functionallity 106 | std::bitset<1 + E + M> out = 107 | flx::detail::get_fullbit_representation_BS(r); 108 | 109 | // printf("value: %.20e\n", r ); 110 | // std::cout << "IN: " << pattern << std::endl; 111 | // std::cout << "OUT: " << out << std::endl; 112 | 113 | EXPECT_EQ(pattern, out); 114 | } 115 | } -------------------------------------------------------------------------------- /test/value_representation_half.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2018 - The OPRECOMP Project Consortium, Universitat Jaume I, 3 | IBM Research GmbH. All rights reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | #include 19 | #include 20 | 21 | void show(double d) 22 | { 23 | printf("%.20e\t", d); 24 | uint64_t u = flx::detail::reinterpret_as_bits(d); 25 | printf("0x%016llx\t", u); 26 | std::cout << std::bitset<64>(u) << std::endl; 27 | } 28 | 29 | class MyTest : public ::testing::TestWithParam> { 30 | }; 31 | 32 | TEST_P(MyTest, TestFormula) 33 | { 34 | uint64_t const in_number = std::get<0>(GetParam()); 35 | uint64_t const out_number = std::get<1>(GetParam()); 36 | 37 | double d = flx::detail::reinterpret_bits_as(in_number); 38 | double expected = flx::detail::reinterpret_bits_as(out_number); 39 | 40 | // IEEE half. 41 | flx::floatx<5, 10> fx = d; 42 | double recoverd_fx = (double)(fx); 43 | 44 | // printf("number: \t"); show(d); 45 | // printf("recoverd_fx:\t"); show(recoverd_fx); 46 | // printf("expected: \t"); show(expected); 47 | 48 | // Enforces the same reprsentation for nan's. 49 | uint64_t out = flx::detail::reinterpret_as_bits(recoverd_fx); 50 | ASSERT_EQ(out, out_number); 51 | // ASSERT_EQ(recoverd_fx, expected); // does not work for NAN's. 
52 | } 53 | 54 | INSTANTIATE_TEST_CASE_P( 55 | TestWithParameters_manual_sampels, MyTest, 56 | testing::Values( 57 | 58 | // Case triggering >> 64 (does generates wrong masks) 59 | // number: 6.09397888183593736447e-05 0x3f0ff33333333333 60 | // 0011111100001111111100110011001100110011001100110011001100110011 61 | // recoverd_sf: 6.09159469604492187500e-05 0x3f0ff00000000000 62 | // 0011111100001111111100000000000000000000000000000000000000000000 63 | ::testing::make_tuple(0x3f0ff33333333333, 0x3f0ff00000000000), 64 | 65 | // Case, triggering rounding twice after each other not the same ase 66 | // rounding once (sticky bit) 67 | // number: 6.09517097473144544803e-05 0x3f0ff4cccccccccd 68 | // 0011111100001111111101001100110011001100110011001100110011001101 69 | // recoverd_sf: 6.09755516052246093750e-05 0x3f0ff80000000000 70 | // 0011111100001111111110000000000000000000000000000000000000000000 71 | ::testing::make_tuple(0x3f0ff4cccccccccd, 0x3f0ff80000000000), 72 | 73 | // Case that triggers a MANTISSA overflow due rounding, requires the 74 | // exponent to be changed. 75 | // number: 3.05056571960449225526e-05 0x3efffccccccccccd 76 | // 0011111011111111111111001100110011001100110011001100110011001101 77 | // recoverd_sf: 3.05175781250000000000e-05 0x3f00000000000000 78 | // 0011111100000000000000000000000000000000000000000000000000000000 79 | ::testing::make_tuple(0x3efffccccccccccd, 0x3f00000000000000), 80 | 81 | // Super ugly (and rare) case! A very small sub-normal number requires 82 | // the mantissa bits almost to be moved out of the storage range. 83 | // Henceforth, the correct rounding should go either up / or down. In 84 | // that case, the rounding depends on the hidden 1, not explicitly 85 | // stored in the mantissa. (BUGFIX) add the hidden one, perform correct 86 | // rounding and go furhter in the routine. That requires to change the 87 | // exponent due rounding at a later point. 
88 | // number: 4.76837158203125026470e-08 0x3e6999999999999a 89 | // 0011111001101001100110011001100110011001100110011001100110011010 90 | // recoverd_sf: 5.96046447753906250000e-08 0x3e70000000000000 91 | // 0011111001110000000000000000000000000000000000000000000000000000 92 | ::testing::make_tuple(0x3e6999999999999a, 0x3e70000000000000), 93 | 94 | // A number smaller than the smallest subnormal, requires a full flush 95 | // to 0. 96 | // number: 2.38418579101562513235e-08 0x3e5999999999999a 97 | // 0011111001011001100110011001100110011001100110011001100110011010 98 | // recoverd_sf: 0.00000000000000000000e+00 0x0000000000000000 99 | // 0000000000000000000000000000000000000000000000000000000000000000 100 | ::testing::make_tuple(0x3e5999999999999a, 0x0000000000000000))); 101 | 102 | 103 | INSTANTIATE_TEST_CASE_P( 104 | // Cases showing failuers during a float - Half - float brute force test! 105 | TestWithParameters_manual_sampels_002, MyTest, 106 | testing::Values( 107 | // number: 1.58346726468704329014e-43 0x370c400000000000 108 | // 0011011100001100010000000000000000000000000000000000000000000000 109 | // recoverd_sf: 0.00000000000000000000e+00 0x0000000000000000 110 | // 0000000000000000000000000000000000000000000000000000000000000000 111 | ::testing::make_tuple(0x370c400000000000, 0x0000000000000000), 112 | // number: 4.88279038108885288239e-04 0x3f3ffff680000000 113 | // 0011111100111111111111111111011010000000000000000000000000000000 114 | // recoverd_sf: 4.88281250000000000000e-04 0x3f40000000000000 115 | // 0011111101000000000000000000000000000000000000000000000000000000 116 | ::testing::make_tuple(0x3f3ffff680000000, 0x3f40000000000000), 117 | // number: 6.60882968750000000000e+04 0x40f02284c0000000 118 | // 0100000011110000001000101000010011000000000000000000000000000000 119 | // recoverd_sf: inf 0x7ff0000000000000 120 | // 0111111111110000000000000000000000000000000000000000000000000000 121 | ::testing::make_tuple(0x40f02284c0000000, 
0x7ff0000000000000), 122 | // number: nan 0x7ff80e1780000000 123 | // 0111111111111000000011100001011110000000000000000000000000000000 124 | // recoverd_sf: nan 0x7ff80c0000000000 125 | // 0111111111111000000011000000000000000000000000000000000000000000 126 | ::testing::make_tuple(0x7ff80e1780000000, 0x7ff80c0000000000), 127 | // number: nan 0x7ff80eeeeeeeeeee 128 | // 0111111111111000000011101110111011101110111011101110111011101110 129 | // recoverd_sf: nan 0x7ff80c0000000000 130 | // 0111111111111000000011000000000000000000000000000000000000000000 131 | ::testing::make_tuple(0x7ff80eeeeeeeeeee, 0x7ff80c0000000000), 132 | // number: -2.98454239100465201773e-08 0xbe6005ec80000000 133 | // 1011111001100000000001011110110010000000000000000000000000000000 134 | // recoverd_sf: -5.96046447753906250000e-08 0xbe70000000000000 135 | // 1011111001110000000000000000000000000000000000000000000000000000 136 | ::testing::make_tuple(0xbe6005ec80000000, 0xbe70000000000000))); 137 | 138 | 139 | INSTANTIATE_TEST_CASE_P( 140 | TestWithParametersC1, MyTest, 141 | testing::Values( 142 | // number: 1.00000000000000000000e+00 0x3ff0000000000000 143 | // 0011111111110000000000000000000000000000000000000000000000000000 144 | // recoverd_sf: 1.00000000000000000000e+00 0x3ff0000000000000 145 | // 0011111111110000000000000000000000000000000000000000000000000000 146 | ::testing::make_tuple(0x3ff0000000000000, 0x3ff0000000000000), 147 | // number: 5.00000000000000000000e-01 0x3fe0000000000000 148 | // 0011111111100000000000000000000000000000000000000000000000000000 149 | // recoverd_sf: 5.00000000000000000000e-01 0x3fe0000000000000 150 | // 0011111111100000000000000000000000000000000000000000000000000000 151 | ::testing::make_tuple(0x3fe0000000000000, 0x3fe0000000000000), 152 | // number: 3.33333333333333314830e-01 0x3fd5555555555555 153 | // 0011111111010101010101010101010101010101010101010101010101010101 154 | // recoverd_sf: 3.33251953125000000000e-01 0x3fd5540000000000 155 | // 
0011111111010101010101000000000000000000000000000000000000000000 156 | ::testing::make_tuple(0x3fd5555555555555, 0x3fd5540000000000), 157 | // number: 2.50000000000000000000e-01 0x3fd0000000000000 158 | // 0011111111010000000000000000000000000000000000000000000000000000 159 | // recoverd_sf: 2.50000000000000000000e-01 0x3fd0000000000000 160 | // 0011111111010000000000000000000000000000000000000000000000000000 161 | ::testing::make_tuple(0x3fd0000000000000, 0x3fd0000000000000), 162 | // number: 2.00000000000000011102e-01 0x3fc999999999999a 163 | // 0011111111001001100110011001100110011001100110011001100110011010 164 | // recoverd_sf: 1.99951171875000000000e-01 0x3fc9980000000000 165 | // 0011111111001001100110000000000000000000000000000000000000000000 166 | ::testing::make_tuple(0x3fc999999999999a, 0x3fc9980000000000), 167 | // number: 1.66666666666666657415e-01 0x3fc5555555555555 168 | // 0011111111000101010101010101010101010101010101010101010101010101 169 | // recoverd_sf: 1.66625976562500000000e-01 0x3fc5540000000000 170 | // 0011111111000101010101000000000000000000000000000000000000000000 171 | ::testing::make_tuple(0x3fc5555555555555, 0x3fc5540000000000), 172 | // number: 1.42857142857142849213e-01 0x3fc2492492492492 173 | // 0011111111000010010010010010010010010010010010010010010010010010 174 | // recoverd_sf: 1.42822265625000000000e-01 0x3fc2480000000000 175 | // 0011111111000010010010000000000000000000000000000000000000000000 176 | ::testing::make_tuple(0x3fc2492492492492, 0x3fc2480000000000), 177 | // number: 1.25000000000000000000e-01 0x3fc0000000000000 178 | // 0011111111000000000000000000000000000000000000000000000000000000 179 | // recoverd_sf: 1.25000000000000000000e-01 0x3fc0000000000000 180 | // 0011111111000000000000000000000000000000000000000000000000000000 181 | ::testing::make_tuple(0x3fc0000000000000, 0x3fc0000000000000), 182 | // number: 1.11111111111111104943e-01 0x3fbc71c71c71c71c 183 | // 
0011111110111100011100011100011100011100011100011100011100011100 184 | // recoverd_sf: 1.11083984375000000000e-01 0x3fbc700000000000 185 | // 0011111110111100011100000000000000000000000000000000000000000000 186 | ::testing::make_tuple(0x3fbc71c71c71c71c, 0x3fbc700000000000))); 187 | 188 | // Brute-force snippets extracted from softlow 189 | INSTANTIATE_TEST_CASE_P( 190 | TestWithParameters_BF_001, MyTest, 191 | testing::Values( 192 | // start: 0 193 | // stop: 50 194 | // inc: 1 195 | ::testing::make_tuple(0x0000000000000000, 0x0000000000000000), 196 | ::testing::make_tuple(0x36a0000000000000, 0x0000000000000000), 197 | ::testing::make_tuple(0x36b0000000000000, 0x0000000000000000), 198 | ::testing::make_tuple(0x36b8000000000000, 0x0000000000000000), 199 | ::testing::make_tuple(0x36c0000000000000, 0x0000000000000000), 200 | ::testing::make_tuple(0x36c4000000000000, 0x0000000000000000), 201 | ::testing::make_tuple(0x36c8000000000000, 0x0000000000000000), 202 | ::testing::make_tuple(0x36cc000000000000, 0x0000000000000000), 203 | ::testing::make_tuple(0x36d0000000000000, 0x0000000000000000), 204 | ::testing::make_tuple(0x36d2000000000000, 0x0000000000000000), 205 | ::testing::make_tuple(0x36d4000000000000, 0x0000000000000000), 206 | ::testing::make_tuple(0x36d6000000000000, 0x0000000000000000), 207 | ::testing::make_tuple(0x36d8000000000000, 0x0000000000000000), 208 | ::testing::make_tuple(0x36da000000000000, 0x0000000000000000), 209 | ::testing::make_tuple(0x36dc000000000000, 0x0000000000000000), 210 | ::testing::make_tuple(0x36de000000000000, 0x0000000000000000), 211 | ::testing::make_tuple(0x36e0000000000000, 0x0000000000000000), 212 | ::testing::make_tuple(0x36e1000000000000, 0x0000000000000000), 213 | ::testing::make_tuple(0x36e2000000000000, 0x0000000000000000), 214 | ::testing::make_tuple(0x36e3000000000000, 0x0000000000000000), 215 | ::testing::make_tuple(0x36e4000000000000, 0x0000000000000000), 216 | ::testing::make_tuple(0x36e5000000000000,
0x0000000000000000), 217 | ::testing::make_tuple(0x36e6000000000000, 0x0000000000000000), 218 | ::testing::make_tuple(0x36e7000000000000, 0x0000000000000000), 219 | ::testing::make_tuple(0x36e8000000000000, 0x0000000000000000), 220 | ::testing::make_tuple(0x36e9000000000000, 0x0000000000000000), 221 | ::testing::make_tuple(0x36ea000000000000, 0x0000000000000000), 222 | ::testing::make_tuple(0x36eb000000000000, 0x0000000000000000), 223 | ::testing::make_tuple(0x36ec000000000000, 0x0000000000000000), 224 | ::testing::make_tuple(0x36ed000000000000, 0x0000000000000000), 225 | ::testing::make_tuple(0x36ee000000000000, 0x0000000000000000), 226 | ::testing::make_tuple(0x36ef000000000000, 0x0000000000000000), 227 | ::testing::make_tuple(0x36f0000000000000, 0x0000000000000000), 228 | ::testing::make_tuple(0x36f0800000000000, 0x0000000000000000), 229 | ::testing::make_tuple(0x36f1000000000000, 0x0000000000000000), 230 | ::testing::make_tuple(0x36f1800000000000, 0x0000000000000000), 231 | ::testing::make_tuple(0x36f2000000000000, 0x0000000000000000), 232 | ::testing::make_tuple(0x36f2800000000000, 0x0000000000000000), 233 | ::testing::make_tuple(0x36f3000000000000, 0x0000000000000000), 234 | ::testing::make_tuple(0x36f3800000000000, 0x0000000000000000), 235 | ::testing::make_tuple(0x36f4000000000000, 0x0000000000000000), 236 | ::testing::make_tuple(0x36f4800000000000, 0x0000000000000000), 237 | ::testing::make_tuple(0x36f5000000000000, 0x0000000000000000), 238 | ::testing::make_tuple(0x36f5800000000000, 0x0000000000000000), 239 | ::testing::make_tuple(0x36f6000000000000, 0x0000000000000000), 240 | ::testing::make_tuple(0x36f6800000000000, 0x0000000000000000), 241 | ::testing::make_tuple(0x36f7000000000000, 0x0000000000000000), 242 | ::testing::make_tuple(0x36f7800000000000, 0x0000000000000000), 243 | ::testing::make_tuple(0x36f8000000000000, 0x0000000000000000), 244 | ::testing::make_tuple(0x36f8800000000000, 0x0000000000000000))); 245 | 246 | INSTANTIATE_TEST_CASE_P( 247 | 
TestWithParameters_BF_002, MyTest, 248 | testing::Values( 249 | // start: 97495757619 250 | // stop: 97495757669 251 | // inc: 1 252 | ::testing::make_tuple(0xbe66666660000000, 0xbe70000000000000), 253 | ::testing::make_tuple(0xbe66666680000000, 0xbe70000000000000), 254 | ::testing::make_tuple(0xbe666666a0000000, 0xbe70000000000000), 255 | ::testing::make_tuple(0xbe666666c0000000, 0xbe70000000000000), 256 | ::testing::make_tuple(0xbe666666e0000000, 0xbe70000000000000), 257 | ::testing::make_tuple(0xbe66666700000000, 0xbe70000000000000), 258 | ::testing::make_tuple(0xbe66666720000000, 0xbe70000000000000), 259 | ::testing::make_tuple(0xbe66666740000000, 0xbe70000000000000), 260 | ::testing::make_tuple(0xbe66666760000000, 0xbe70000000000000), 261 | ::testing::make_tuple(0xbe66666780000000, 0xbe70000000000000), 262 | ::testing::make_tuple(0xbe666667a0000000, 0xbe70000000000000), 263 | ::testing::make_tuple(0xbe666667c0000000, 0xbe70000000000000), 264 | ::testing::make_tuple(0xbe666667e0000000, 0xbe70000000000000), 265 | ::testing::make_tuple(0xbe66666800000000, 0xbe70000000000000), 266 | ::testing::make_tuple(0xbe66666820000000, 0xbe70000000000000), 267 | ::testing::make_tuple(0xbe66666840000000, 0xbe70000000000000), 268 | ::testing::make_tuple(0xbe66666860000000, 0xbe70000000000000), 269 | ::testing::make_tuple(0xbe66666880000000, 0xbe70000000000000), 270 | ::testing::make_tuple(0xbe666668a0000000, 0xbe70000000000000), 271 | ::testing::make_tuple(0xbe666668c0000000, 0xbe70000000000000), 272 | ::testing::make_tuple(0xbe666668e0000000, 0xbe70000000000000), 273 | ::testing::make_tuple(0xbe66666900000000, 0xbe70000000000000), 274 | ::testing::make_tuple(0xbe66666920000000, 0xbe70000000000000), 275 | ::testing::make_tuple(0xbe66666940000000, 0xbe70000000000000), 276 | ::testing::make_tuple(0xbe66666960000000, 0xbe70000000000000), 277 | ::testing::make_tuple(0xbe66666980000000, 0xbe70000000000000), 278 | ::testing::make_tuple(0xbe666669a0000000, 0xbe70000000000000), 279 | 
::testing::make_tuple(0xbe666669c0000000, 0xbe70000000000000), 280 | ::testing::make_tuple(0xbe666669e0000000, 0xbe70000000000000), 281 | ::testing::make_tuple(0xbe66666a00000000, 0xbe70000000000000), 282 | ::testing::make_tuple(0xbe66666a20000000, 0xbe70000000000000), 283 | ::testing::make_tuple(0xbe66666a40000000, 0xbe70000000000000), 284 | ::testing::make_tuple(0xbe66666a60000000, 0xbe70000000000000), 285 | ::testing::make_tuple(0xbe66666a80000000, 0xbe70000000000000), 286 | ::testing::make_tuple(0xbe66666aa0000000, 0xbe70000000000000), 287 | ::testing::make_tuple(0xbe66666ac0000000, 0xbe70000000000000), 288 | ::testing::make_tuple(0xbe66666ae0000000, 0xbe70000000000000), 289 | ::testing::make_tuple(0xbe66666b00000000, 0xbe70000000000000), 290 | ::testing::make_tuple(0xbe66666b20000000, 0xbe70000000000000), 291 | ::testing::make_tuple(0xbe66666b40000000, 0xbe70000000000000), 292 | ::testing::make_tuple(0xbe66666b60000000, 0xbe70000000000000), 293 | ::testing::make_tuple(0xbe66666b80000000, 0xbe70000000000000), 294 | ::testing::make_tuple(0xbe66666ba0000000, 0xbe70000000000000), 295 | ::testing::make_tuple(0xbe66666bc0000000, 0xbe70000000000000), 296 | ::testing::make_tuple(0xbe66666be0000000, 0xbe70000000000000), 297 | ::testing::make_tuple(0xbe66666c00000000, 0xbe70000000000000), 298 | ::testing::make_tuple(0xbe66666c20000000, 0xbe70000000000000), 299 | ::testing::make_tuple(0xbe66666c40000000, 0xbe70000000000000), 300 | ::testing::make_tuple(0xbe66666c60000000, 0xbe70000000000000), 301 | ::testing::make_tuple(0xbe66666c80000000, 0xbe70000000000000))); 302 | 303 | INSTANTIATE_TEST_CASE_P( 304 | TestWithParameters_BF_003, MyTest, 305 | testing::Values( 306 | // start: 214318868070 307 | // stop: 214318868120 308 | // inc: 1 309 | ::testing::make_tuple(0xc4ccccccc0000000, 0xfff0000000000000), 310 | ::testing::make_tuple(0xc4cccccce0000000, 0xfff0000000000000), 311 | ::testing::make_tuple(0xc4cccccd00000000, 0xfff0000000000000), 312 | 
::testing::make_tuple(0xc4cccccd20000000, 0xfff0000000000000), 313 | ::testing::make_tuple(0xc4cccccd40000000, 0xfff0000000000000), 314 | ::testing::make_tuple(0xc4cccccd60000000, 0xfff0000000000000), 315 | ::testing::make_tuple(0xc4cccccd80000000, 0xfff0000000000000), 316 | ::testing::make_tuple(0xc4cccccda0000000, 0xfff0000000000000), 317 | ::testing::make_tuple(0xc4cccccdc0000000, 0xfff0000000000000), 318 | ::testing::make_tuple(0xc4cccccde0000000, 0xfff0000000000000), 319 | ::testing::make_tuple(0xc4ccccce00000000, 0xfff0000000000000), 320 | ::testing::make_tuple(0xc4ccccce20000000, 0xfff0000000000000), 321 | ::testing::make_tuple(0xc4ccccce40000000, 0xfff0000000000000), 322 | ::testing::make_tuple(0xc4ccccce60000000, 0xfff0000000000000), 323 | ::testing::make_tuple(0xc4ccccce80000000, 0xfff0000000000000), 324 | ::testing::make_tuple(0xc4cccccea0000000, 0xfff0000000000000), 325 | ::testing::make_tuple(0xc4cccccec0000000, 0xfff0000000000000), 326 | ::testing::make_tuple(0xc4cccccee0000000, 0xfff0000000000000), 327 | ::testing::make_tuple(0xc4cccccf00000000, 0xfff0000000000000), 328 | ::testing::make_tuple(0xc4cccccf20000000, 0xfff0000000000000), 329 | ::testing::make_tuple(0xc4cccccf40000000, 0xfff0000000000000), 330 | ::testing::make_tuple(0xc4cccccf60000000, 0xfff0000000000000), 331 | ::testing::make_tuple(0xc4cccccf80000000, 0xfff0000000000000), 332 | ::testing::make_tuple(0xc4cccccfa0000000, 0xfff0000000000000), 333 | ::testing::make_tuple(0xc4cccccfc0000000, 0xfff0000000000000), 334 | ::testing::make_tuple(0xc4cccccfe0000000, 0xfff0000000000000), 335 | ::testing::make_tuple(0xc4ccccd000000000, 0xfff0000000000000), 336 | ::testing::make_tuple(0xc4ccccd020000000, 0xfff0000000000000), 337 | ::testing::make_tuple(0xc4ccccd040000000, 0xfff0000000000000), 338 | ::testing::make_tuple(0xc4ccccd060000000, 0xfff0000000000000), 339 | ::testing::make_tuple(0xc4ccccd080000000, 0xfff0000000000000), 340 | ::testing::make_tuple(0xc4ccccd0a0000000, 0xfff0000000000000), 341 
| ::testing::make_tuple(0xc4ccccd0c0000000, 0xfff0000000000000), 342 | ::testing::make_tuple(0xc4ccccd0e0000000, 0xfff0000000000000), 343 | ::testing::make_tuple(0xc4ccccd100000000, 0xfff0000000000000), 344 | ::testing::make_tuple(0xc4ccccd120000000, 0xfff0000000000000), 345 | ::testing::make_tuple(0xc4ccccd140000000, 0xfff0000000000000), 346 | ::testing::make_tuple(0xc4ccccd160000000, 0xfff0000000000000), 347 | ::testing::make_tuple(0xc4ccccd180000000, 0xfff0000000000000), 348 | ::testing::make_tuple(0xc4ccccd1a0000000, 0xfff0000000000000), 349 | ::testing::make_tuple(0xc4ccccd1c0000000, 0xfff0000000000000), 350 | ::testing::make_tuple(0xc4ccccd1e0000000, 0xfff0000000000000), 351 | ::testing::make_tuple(0xc4ccccd200000000, 0xfff0000000000000), 352 | ::testing::make_tuple(0xc4ccccd220000000, 0xfff0000000000000), 353 | ::testing::make_tuple(0xc4ccccd240000000, 0xfff0000000000000), 354 | ::testing::make_tuple(0xc4ccccd260000000, 0xfff0000000000000), 355 | ::testing::make_tuple(0xc4ccccd280000000, 0xfff0000000000000), 356 | ::testing::make_tuple(0xc4ccccd2a0000000, 0xfff0000000000000), 357 | ::testing::make_tuple(0xc4ccccd2c0000000, 0xfff0000000000000), 358 | ::testing::make_tuple(0xc4ccccd2e0000000, 0xfff0000000000000))); 359 | 360 | INSTANTIATE_TEST_CASE_P( 361 | TestWithParameters_BF_004, MyTest, 362 | testing::Values( 363 | // start: 429492434632 364 | // stop: 429492434682 365 | // inc: 1 366 | ::testing::make_tuple(0xffffced900000000, 0xffffcc0000000000), 367 | ::testing::make_tuple(0xffffced920000000, 0xffffcc0000000000), 368 | ::testing::make_tuple(0xffffced940000000, 0xffffcc0000000000), 369 | ::testing::make_tuple(0xffffced960000000, 0xffffcc0000000000), 370 | ::testing::make_tuple(0xffffced980000000, 0xffffcc0000000000), 371 | ::testing::make_tuple(0xffffced9a0000000, 0xffffcc0000000000), 372 | ::testing::make_tuple(0xffffced9c0000000, 0xffffcc0000000000), 373 | ::testing::make_tuple(0xffffced9e0000000, 0xffffcc0000000000), 374 | 
::testing::make_tuple(0xffffceda00000000, 0xffffcc0000000000), 375 | ::testing::make_tuple(0xffffceda20000000, 0xffffcc0000000000), 376 | ::testing::make_tuple(0xffffceda40000000, 0xffffcc0000000000), 377 | ::testing::make_tuple(0xffffceda60000000, 0xffffcc0000000000), 378 | ::testing::make_tuple(0xffffceda80000000, 0xffffcc0000000000), 379 | ::testing::make_tuple(0xffffcedaa0000000, 0xffffcc0000000000), 380 | ::testing::make_tuple(0xffffcedac0000000, 0xffffcc0000000000), 381 | ::testing::make_tuple(0xffffcedae0000000, 0xffffcc0000000000), 382 | ::testing::make_tuple(0xffffcedb00000000, 0xffffcc0000000000), 383 | ::testing::make_tuple(0xffffcedb20000000, 0xffffcc0000000000), 384 | ::testing::make_tuple(0xffffcedb40000000, 0xffffcc0000000000), 385 | ::testing::make_tuple(0xffffcedb60000000, 0xffffcc0000000000), 386 | ::testing::make_tuple(0xffffcedb80000000, 0xffffcc0000000000), 387 | ::testing::make_tuple(0xffffcedba0000000, 0xffffcc0000000000), 388 | ::testing::make_tuple(0xffffcedbc0000000, 0xffffcc0000000000), 389 | ::testing::make_tuple(0xffffcedbe0000000, 0xffffcc0000000000), 390 | ::testing::make_tuple(0xffffcedc00000000, 0xffffcc0000000000), 391 | ::testing::make_tuple(0xffffcedc20000000, 0xffffcc0000000000), 392 | ::testing::make_tuple(0xffffcedc40000000, 0xffffcc0000000000), 393 | ::testing::make_tuple(0xffffcedc60000000, 0xffffcc0000000000), 394 | ::testing::make_tuple(0xffffcedc80000000, 0xffffcc0000000000), 395 | ::testing::make_tuple(0xffffcedca0000000, 0xffffcc0000000000), 396 | ::testing::make_tuple(0xffffcedcc0000000, 0xffffcc0000000000), 397 | ::testing::make_tuple(0xffffcedce0000000, 0xffffcc0000000000), 398 | ::testing::make_tuple(0xffffcedd00000000, 0xffffcc0000000000), 399 | ::testing::make_tuple(0xffffcedd20000000, 0xffffcc0000000000), 400 | ::testing::make_tuple(0xffffcedd40000000, 0xffffcc0000000000), 401 | ::testing::make_tuple(0xffffcedd60000000, 0xffffcc0000000000), 402 | ::testing::make_tuple(0xffffcedd80000000, 0xffffcc0000000000), 403 
| ::testing::make_tuple(0xffffcedda0000000, 0xffffcc0000000000), 404 | ::testing::make_tuple(0xffffceddc0000000, 0xffffcc0000000000), 405 | ::testing::make_tuple(0xffffcedde0000000, 0xffffcc0000000000), 406 | ::testing::make_tuple(0xffffcede00000000, 0xffffcc0000000000), 407 | ::testing::make_tuple(0xffffcede20000000, 0xffffcc0000000000), 408 | ::testing::make_tuple(0xffffcede40000000, 0xffffcc0000000000), 409 | ::testing::make_tuple(0xffffcede60000000, 0xffffcc0000000000), 410 | ::testing::make_tuple(0xffffcede80000000, 0xffffcc0000000000), 411 | ::testing::make_tuple(0xffffcedea0000000, 0xffffcc0000000000), 412 | ::testing::make_tuple(0xffffcedec0000000, 0xffffcc0000000000), 413 | ::testing::make_tuple(0xffffcedee0000000, 0xffffcc0000000000), 414 | ::testing::make_tuple(0xffffcedf00000000, 0xffffcc0000000000), 415 | ::testing::make_tuple(0xffffcedf20000000, 0xffffcc0000000000))); 416 | 417 | 418 | INSTANTIATE_TEST_CASE_P( 419 | TestWithParameters_BF_005, MyTest, 420 | testing::Values( 421 | // start: 97495757619 422 | // stop: 97548186419 423 | // inc: 1048576 424 | ::testing::make_tuple(0xbe66666660000000, 0xbe70000000000000), 425 | ::testing::make_tuple(0xbe68666660000000, 0xbe70000000000000), 426 | ::testing::make_tuple(0xbe6a666660000000, 0xbe70000000000000), 427 | ::testing::make_tuple(0xbe6c666660000000, 0xbe70000000000000), 428 | ::testing::make_tuple(0xbe6e666660000000, 0xbe70000000000000), 429 | ::testing::make_tuple(0xbe70666660000000, 0xbe70000000000000), 430 | ::testing::make_tuple(0xbe72666660000000, 0xbe70000000000000), 431 | ::testing::make_tuple(0xbe74666660000000, 0xbe70000000000000), 432 | ::testing::make_tuple(0xbe76666660000000, 0xbe70000000000000), 433 | ::testing::make_tuple(0xbe78666660000000, 0xbe80000000000000), 434 | ::testing::make_tuple(0xbe7a666660000000, 0xbe80000000000000), 435 | ::testing::make_tuple(0xbe7c666660000000, 0xbe80000000000000), 436 | ::testing::make_tuple(0xbe7e666660000000, 0xbe80000000000000), 437 | 
::testing::make_tuple(0xbe80666660000000, 0xbe80000000000000), 438 | ::testing::make_tuple(0xbe82666660000000, 0xbe80000000000000), 439 | ::testing::make_tuple(0xbe84666660000000, 0xbe88000000000000), 440 | ::testing::make_tuple(0xbe86666660000000, 0xbe88000000000000), 441 | ::testing::make_tuple(0xbe88666660000000, 0xbe88000000000000), 442 | ::testing::make_tuple(0xbe8a666660000000, 0xbe88000000000000), 443 | ::testing::make_tuple(0xbe8c666660000000, 0xbe90000000000000), 444 | ::testing::make_tuple(0xbe8e666660000000, 0xbe90000000000000), 445 | ::testing::make_tuple(0xbe90666660000000, 0xbe90000000000000), 446 | ::testing::make_tuple(0xbe92666660000000, 0xbe94000000000000), 447 | ::testing::make_tuple(0xbe94666660000000, 0xbe94000000000000), 448 | ::testing::make_tuple(0xbe96666660000000, 0xbe98000000000000), 449 | ::testing::make_tuple(0xbe98666660000000, 0xbe98000000000000), 450 | ::testing::make_tuple(0xbe9a666660000000, 0xbe9c000000000000), 451 | ::testing::make_tuple(0xbe9c666660000000, 0xbe9c000000000000), 452 | ::testing::make_tuple(0xbe9e666660000000, 0xbea0000000000000), 453 | ::testing::make_tuple(0xbea0666660000000, 0xbea0000000000000), 454 | ::testing::make_tuple(0xbea2666660000000, 0xbea2000000000000), 455 | ::testing::make_tuple(0xbea4666660000000, 0xbea4000000000000), 456 | ::testing::make_tuple(0xbea6666660000000, 0xbea6000000000000), 457 | ::testing::make_tuple(0xbea8666660000000, 0xbea8000000000000), 458 | ::testing::make_tuple(0xbeaa666660000000, 0xbeaa000000000000), 459 | ::testing::make_tuple(0xbeac666660000000, 0xbeac000000000000), 460 | ::testing::make_tuple(0xbeae666660000000, 0xbeae000000000000), 461 | ::testing::make_tuple(0xbeb0666660000000, 0xbeb0000000000000), 462 | ::testing::make_tuple(0xbeb2666660000000, 0xbeb2000000000000), 463 | ::testing::make_tuple(0xbeb4666660000000, 0xbeb4000000000000), 464 | ::testing::make_tuple(0xbeb6666660000000, 0xbeb6000000000000), 465 | ::testing::make_tuple(0xbeb8666660000000, 0xbeb8000000000000), 466 
|         ::testing::make_tuple(0xbeba666660000000, 0xbeba000000000000),
467 |         ::testing::make_tuple(0xbebc666660000000, 0xbebc000000000000),
468 |         ::testing::make_tuple(0xbebe666660000000, 0xbebe000000000000),
469 |         ::testing::make_tuple(0xbec0666660000000, 0xbec0800000000000),
470 |         ::testing::make_tuple(0xbec2666660000000, 0xbec2800000000000),
471 |         ::testing::make_tuple(0xbec4666660000000, 0xbec4800000000000),
472 |         ::testing::make_tuple(0xbec6666660000000, 0xbec6800000000000),
473 |         ::testing::make_tuple(0xbec8666660000000, 0xbec8800000000000)));
474 |
--------------------------------------------------------------------------------
/testx/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # create_test(<test_name>)
2 | #
3 | # Builds the executable <test_name> from <test_name>.cpp, links it against
4 | # the floatx and gtest_main targets, and registers it with CTest under the
5 | # name "<this directory relative to PROJECT_BINARY_DIR>/<test_name>".
6 | #
7 | # Example: create_test(add_000)
8 | function(create_test test_name)
9 |   add_executable(${test_name} ${test_name}.cpp)
10 |   target_link_libraries(${test_name} PRIVATE floatx gtest_main)
11 |   # Prefix the CTest name with this directory's location in the build tree.
12 |   file(RELATIVE_PATH REL_BINARY_DIR
13 |     "${PROJECT_BINARY_DIR}" "${CMAKE_CURRENT_BINARY_DIR}")
14 |   add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${test_name})
15 | endfunction()
16 |
17 | create_test(add_000)
18 | create_test(sub_000)
19 | create_test(mul_000)
20 | create_test(div_000)
21 |
--------------------------------------------------------------------------------
/third_party/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | include(package_loader.cmake)
2 |
3 | # testx/ (enabled by BUILD_EXHAUSTIVE_TESTS) links against gtest_main, so
4 | # googletest has to be loaded for it as well, not only for BUILD_TESTS.
5 | if(BUILD_TESTS OR BUILD_EXHAUSTIVE_TESTS)
6 |   add_subdirectory(gtest)
7 | endif()
8 |
9 | if(DEVEL_TOOLS)
10 |   add_subdirectory(git-cmake-format)
11 | endif()
12 |
--------------------------------------------------------------------------------
/third_party/DownloadCMakeLists.txt.in:
--------------------------------------------------------------------------------
1 | # Template of a throw-away project whose only job is to clone one git
2 | # repository. load_git_package() instantiates it with configure_file(),
3 | # which substitutes the package name, URL, tag and the binary directory of
4 | # the calling CMakeLists.txt.
5 | #
6 | # The minimum version matches the top-level project (3.1); the upper bound
7 | # only opts in to newer policy defaults where they are available.
8 | cmake_minimum_required(VERSION 3.1...3.10)
9 |
10 | # Cloning needs no compiler, so do not enable any language.
11 | project(${package_name}-download NONE)
12 |
13 | include(ExternalProject)
14 | ExternalProject_Add(${package_name}
15 |   GIT_REPOSITORY "${package_url}"
16 |   GIT_TAG "${package_tag}"
17 |   SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src"
18 |   BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/build"
19 |   # Only fetch the sources; the main build adds them via add_subdirectory().
20 |   CONFIGURE_COMMAND ""
21 |   BUILD_COMMAND ""
22 |   INSTALL_COMMAND ""
23 |   TEST_COMMAND ""
24 | )
25 |
--------------------------------------------------------------------------------
/third_party/git-cmake-format/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Fetch git-cmake-format at configure time and add it to the build.
2 | # NOTE(review): "master" is a moving branch - consider pinning a tag or
3 | # commit so configures are reproducible.
4 | load_git_package(git-cmake-format
5 |   "https://github.com/kbenzie/git-cmake-format.git"
6 |   "master")
7 |
8 |
--------------------------------------------------------------------------------
/third_party/gtest/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Prevent overriding the parent project's compiler/linker
2 | # settings on Windows
3 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
4 |
5 | # Fetch googletest at configure time and add it to the build.
6 | # NOTE(review): "master" is a moving branch, while the top-level project
7 | # builds as C++11 - confirm the branch still exists and still supports that
8 | # standard, or pin a release tag.
9 | load_git_package(gtest
10 |   "https://github.com/google/googletest.git"
11 |   "master")
12 |
13 |
--------------------------------------------------------------------------------
/third_party/package_loader.cmake:
--------------------------------------------------------------------------------
1 | # Template that load_git_package() turns into the download project.
2 | set(PACKAGE_DOWNLOADER_SCRIPT
3 |   "${CMAKE_CURRENT_LIST_DIR}/DownloadCMakeLists.txt.in")
4 |
5 | # load_git_package(<package_name> <package_url> <package_tag>)
6 | #
7 | # Clones <package_url> at <package_tag> while the calling directory is being
8 | # configured, then adds the checkout to the build with add_subdirectory().
9 | # Everything lives below the caller's CMAKE_CURRENT_BINARY_DIR:
10 | #   download/  generated download project
11 | #   src/       checked-out package sources
12 | #   build/     binary directory used for the added package
13 | # Configuration aborts with FATAL_ERROR if generating or building the
14 | # download project fails.
15 | #
16 | # Example:
17 | #   load_git_package(gtest "https://github.com/google/googletest.git" "master")
18 | function(load_git_package package_name package_url package_tag)
19 |   # Download and unpack package at configure time
20 |   configure_file("${PACKAGE_DOWNLOADER_SCRIPT}"
21 |     "${CMAKE_CURRENT_BINARY_DIR}/download/CMakeLists.txt")
22 |   execute_process(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
23 |     RESULT_VARIABLE result
24 |     WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/download")
25 |   if(result)
26 |     message(FATAL_ERROR
27 |       "CMake step for ${package_name}/download failed: ${result}")
28 |   endif()
29 |   execute_process(COMMAND "${CMAKE_COMMAND}" --build .
30 |     RESULT_VARIABLE result
31 |     WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/download")
32 |   if(result)
33 |     message(FATAL_ERROR
34 |       "Build step for ${package_name}/download failed: ${result}")
35 |   endif()
36 |
37 |   # Add package to the build
38 |   add_subdirectory("${CMAKE_CURRENT_BINARY_DIR}/src"
39 |     "${CMAKE_CURRENT_BINARY_DIR}/build")
40 | endfunction()
41 |
--------------------------------------------------------------------------------