├── .github └── workflows │ └── ci.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── arpfloat └── __init__.py ├── benches └── main_benchmark.rs ├── examples ├── calc_pi.rs ├── fma.py ├── print_e.rs └── softmax.py ├── pyproject.toml ├── requirements.txt ├── rustfmt.toml ├── setup.py └── src ├── arithmetic.rs ├── bigint.rs ├── cast.rs ├── float.rs ├── lib.rs ├── operations ├── constants.rs ├── exp.rs ├── frac.rs ├── functions.rs ├── mod.rs └── trig.rs ├── py.rs ├── string.rs └── utils.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - '**' 8 | 9 | name: CI 10 | 11 | jobs: 12 | audit: 13 | name: Audit 14 | runs-on: ubuntu-latest 15 | timeout-minutes: 10 16 | steps: 17 | - uses: actions/checkout@v1 18 | - uses: actions-rs/audit-check@v1 19 | with: 20 | token: ${{ secrets.GITHUB_TOKEN }} 21 | 22 | fmt: 23 | name: Rustfmt 24 | runs-on: ubuntu-latest 25 | timeout-minutes: 10 26 | steps: 27 | - uses: actions/checkout@v2 28 | - uses: actions-rs/toolchain@v1 29 | with: 30 | profile: minimal 31 | toolchain: nightly 32 | override: true 33 | components: rustfmt 34 | 35 | - uses: actions-rs/cargo@v1 36 | with: 37 | command: fmt 38 | args: --all -- --check 39 | 40 | build_and_test_linux: 41 | name: Build and Test (Linux) 42 | runs-on: ubuntu-latest 43 | timeout-minutes: 10 44 | steps: 45 | - uses: actions/checkout@v2 46 | - uses: actions-rs/toolchain@v1 47 | with: 48 | profile: minimal 49 | toolchain: stable 50 | override: true 51 | 52 | - uses: actions-rs/cargo@v1 53 | with: 54 | command: test 55 | args: --workspace 56 | 57 | build_and_test_windows: 58 | name: Build and Test (Windows) 59 | runs-on: windows-latest 60 | timeout-minutes: 10 61 | steps: 62 | - name: Prepare symlink configuration 63 | run: git config --global core.symlinks true 64 | 65 | - uses: actions/checkout@v2 66 | - uses: actions-rs/toolchain@v1 67 | with: 
68 | profile: minimal 69 | toolchain: stable 70 | override: true 71 | 72 | - uses: actions-rs/cargo@v1 73 | with: 74 | command: test 75 | args: --workspace 76 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | .vscode 13 | 14 | .env/ 15 | 16 | *.egg-info 17 | __pycache__ 18 | *.so 19 | 20 | build/ 21 | 22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arpfloat" 3 | version = "0.1.11" 4 | authors = ["Nadav Rotem "] 5 | categories = ["mathematics", "algorithms", "no-std"] 6 | description = "Arbitrary-precision floating point library" 7 | documentation = "https://docs.rs/arpfloat/" 8 | edition = "2021" 9 | keywords = ["float"] 10 | license = "Apache-2.0" 11 | readme = "README.md" 12 | repository = "https://github.com/nadavrot/arpfloat" 13 | 14 | [dependencies] 15 | pyo3 = { version = "0.24.1", optional = true } 16 | 17 | [dev-dependencies] 18 | criterion = "0.5" 19 | 20 | [[bench]] 21 | name = "main_benchmark" 22 | harness = false 23 | 24 | [features] 25 | default = ["std", "python"] 26 | std = [] 27 | python=["pyo3", "std"] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, 
REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Arbitrary-Precision Floating-Point Library   3 | [![Latest Version]][crates.io] [![Docs Badge]][docs] 4 | 5 | [Latest Version]: https://img.shields.io/crates/v/arpfloat.svg 6 | [crates.io]: https://crates.io/crates/arpfloat 7 | [Docs Badge]: https://docs.rs/arpfloat/badge.svg 8 | [docs]: https://docs.rs/arpfloat 9 | 10 | ARPFloat is an implementation of arbitrary precision 11 | [floating point](https://en.wikipedia.org/wiki/IEEE_754) data 12 | structures and utilities. The library can be used to emulate existing floating 13 | point types, such as FP16, and create new floating point types. Floating point 14 | types can scale to hundreds of digits, and perform very accurate calculations. 15 | In ARPFloat the rounding mode is a part of the type-system, and this defines 16 | away a number of problems that show up when using fenv.h. 17 | 18 | `no_std` environments are supported by disabling the `std` feature. 
19 | `python` bindings are supported by enabling the `python` feature. 20 | 21 | ### Example 22 | ```rust 23 | use arpfloat::Float; 24 | use arpfloat::FP128; 25 | 26 | // Create the number '5' in FP128 format. 27 | let n = Float::from_f64(5.).cast(FP128); 28 | 29 | // Use Newton-Raphson to find the square root of 5. 30 | let mut x = n.clone(); 31 | for _ in 0..20 { 32 | x += (&n / &x)/2; 33 | } 34 | 35 | println!("fp128: {}", x); 36 | println!("fp64: {}", x.as_f64()); 37 | ``` 38 | 39 | 40 | The program above will print this output: 41 | ```console 42 | fp128: 2.2360679774997896964091736687312763 43 | fp64: 2.23606797749979 44 | ``` 45 | 46 | The library also provides an API that exposes rounding modes and low-level 47 | operations. 48 | 49 | ```rust 50 | use arpfloat::FP128; 51 | use arpfloat::RoundingMode::NearestTiesToEven; 52 | use arpfloat::Float; 53 | 54 | let x = Float::from_u64(FP128, 1<<53); 55 | let y = Float::from_f64(1000.0).cast(FP128); 56 | 57 | let val = Float::mul_with_rm(&x, &y, NearestTiesToEven); 58 | ``` 59 | 60 | View the internal representation of numbers: 61 | 62 | ```rust 63 | use arpfloat::Float; 64 | use arpfloat::FP16; 65 | 66 | let fp = Float::from_i64(FP16, 15); 67 | 68 | fp.dump(); // Prints FP[+ E=+3 M=11110000000] 69 | 70 | let m = fp.get_mantissa(); 71 | m.dump(); // Prints 11110000000 72 | ``` 73 | 74 | Control the rounding mode for type conversion: 75 | 76 | ```rust 77 | use arpfloat::{FP16, FP32, RoundingMode, Float}; 78 | 79 | let x = Float::from_u64(FP32, 2649); 80 | let b = x.cast_with_rm(FP16, RoundingMode::Zero); 81 | println!("{}", b); // Prints 2648! 82 | ``` 83 | 84 | Define new float formats and use high-precision transcendental functions: 85 | 86 | ```rust 87 | use arpfloat::{Float, Semantics}; 88 | // Define a new float format with 120 bits of accuracy, and 89 | // dynamic range of 2^10. 
90 | let sem = Semantics::new(10, 120); 91 | 92 | let pi = Float::pi(sem); 93 | let x = Float::exp(&pi); 94 | println!("e^pi = {}", x); // Prints 23.1406926327792.... 95 | ``` 96 | 97 | Floating point numbers can be converted to 98 | [Continued Fractions](https://en.wikipedia.org/wiki/Continued_fraction) that 99 | approximate the value. 100 | 101 | ```rust 102 | use arpfloat::{Float, FP256, RoundingMode}; 103 | 104 | let ln = Float::ln2(FP256); 105 | println!("ln(2) = {}", ln); 106 | for i in 1..20 { 107 | let (p,q) = ln.as_fraction(i); 108 | println!("{}/{}", p.as_decimal(), q.as_decimal()); 109 | } 110 | ``` 111 | The program above will print this output: 112 | ```console 113 | ln(2) = .6931471805599453094172321214581765680755001343602552..... 114 | 0/1 115 | 1/1 116 | 2/3 117 | 7/10 118 | 9/13 119 | 61/88 120 | 192/277 121 | 253/365 122 | 445/642 123 | 1143/1649 124 | 1588/2291 125 | 2731/3940 126 | .... 127 | ``` 128 | 129 | The [examples](examples) directory contains a few programs that demonstrate the use of this library. 130 | 131 | ### Python Bindings 132 | 133 | The library has Python bindings that can be installed with 'pip install -e .' 134 | 135 | ```python 136 | >>> from arpfloat import Float, Semantics, FP16, BF16, FP32, fp64, pi 137 | 138 | >>> x = fp64(2.5).cast(FP16) 139 | >>> y = fp64(1.5).cast(FP16) 140 | >>> x + y 141 | 4. 142 | 143 | >>> sem = Semantics(10, 10, "NearestTiesToEven") 144 | >>> sem 145 | Semantics { exponent: 10, precision: 10, mode: NearestTiesToEven } 146 | >>> Float(sem, False, 0b1000000001, 0b1100101) 147 | 4.789062 148 | 149 | >>> pi(FP32) 150 | 3.1415927 151 | >>> pi(FP16) 152 | 3.140625 153 | >>> pi(BF16) 154 | 3.140625 155 | ``` 156 | 157 | Arpfloat allows you to experiment with new floating point formats. 
For example, 158 | Nvidia's new [FP8](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html) 159 | format can be defined as: 160 | 161 | ```python 162 | import numpy as np 163 | from arpfloat import FP32, fp64, Semantics, zero 164 | 165 | # Create two random numpy arrays in the range [0,1) 166 | A0 = np.random.rand(1000000) 167 | A1 = np.random.rand(1000000) 168 | 169 | # Calculate the numpy dot product of the two arrays 170 | print("Using fp32 arithmetic : ", np.dot(A0, A1)) 171 | 172 | # Create the fp8 format (4 exponent bits, 3 mantissa bits + 1 implicit bit) 173 | FP8 = Semantics(4, 3 + 1, "NearestTiesToEven") 174 | 175 | # Convert the arrays to fp8 176 | A0 = [fp64(x).cast(FP8) for x in A0] 177 | A1 = [fp64(x).cast(FP8) for x in A1] 178 | 179 | dot = sum([x.cast(FP32)*y.cast(FP32) for x, y in zip(A0, A1)]) 180 | print("Using fp8/fp32 arithmetic: ", dot) 181 | ``` 182 | 183 | ### Resources 184 | 185 | There are excellent resources out there, some of which are referenced in the code: 186 | 187 | * Books: 188 | * Handbook of Floating-Point Arithmetic 2010th by Jean-Michel Muller et al. 189 | * Elementary Functions: Algorithms and Implementation by Jean-Michel Muller. 190 | * Modern Computer Arithmetic by Brent and Zimmermann. 191 | * Papers: 192 | * An Accurate Elementary Mathematical Library for the IEEE Floating Point Standard, by Gal and Bachels. 193 | * How to print floating-point numbers accurately by Steele, White. 194 | * What Every Computer Scientist Should Know About Floating-Point Arithmetic by David Goldberg. 195 | * Fast Multiple-Precision Evaluation of Elementary Functions by Richard Brent. 196 | * Fast Trigonometric functions for Arbitrary Precision number by Henrik Vestermark. 197 | * Other excellent software implementations: APFloat, RYU, libBF, newlib, musl, etc. 
198 | 199 | ### License 200 | 201 | Licensed under Apache-2.0 202 | -------------------------------------------------------------------------------- /arpfloat/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | ARPFloat: Arbitrary Precision Floating-Point Library 4 | 5 | This library provides arbitrary precision floating-point arithmetic with 6 | configurable precision and rounding modes. It implements IEEE 754 7 | semantics and supports standard arithmetic operations. 8 | 9 | Examples: 10 | >>> from arpfloat import Float, Semantics, FP16, BF16, FP32, fp64, pi 11 | >>> x = fp64(2.5).cast(FP16) 12 | >>> y = fp64(1.5).cast(FP16) 13 | >>> x + y 14 | 4 15 | 16 | >>> sem = Semantics(10, 10, "Zero") 17 | >>> sem 18 | Semantics { exponent: 10, precision: 10, mode: Zero } 19 | >>> Float(sem, False, 1, 13) 20 | .0507 21 | 22 | >>> pi(FP32) 23 | 3.1415927 24 | >>> pi(FP16) 25 | 3.14 26 | >>> pi(BF16) 27 | 3.15 28 | 29 | Constants: 30 | BF16, FP16, FP32, FP64, FP128, FP256: Standard floating-point formats 31 | pi, e, ln2, zero: Mathematical constants 32 | Float, Semantics: Classes for representing floating-point numbers and their semantics 33 | i64, fp64: Constructors for creating Float objects from integers and floats 34 | """ 35 | 36 | from ._arpfloat import PyFloat as Float 37 | from ._arpfloat import PySemantics as Semantics 38 | from ._arpfloat import pi, e, ln2, zero, fma 39 | from ._arpfloat import from_fp64 as fp64 40 | from ._arpfloat import from_i64 as i64 41 | 42 | # Add __radd__ method to Float class for sum() compatibility 43 | 44 | 45 | def _float_radd(self, other): 46 | if isinstance(other, (int, float)) and other == 0: 47 | return self 48 | return self.__add__(other) 49 | 50 | Float.__radd__ = _float_radd 51 | 52 | # Define standard floating-point types 53 | # Parameters match IEEE 754 standard formats 54 | BF16 = Semantics(8, 8, "NearestTiesToEven") # BFloat16 55 | 
FP16 = Semantics(5, 11, "NearestTiesToEven") # Half precision 56 | FP32 = Semantics(8, 24, "NearestTiesToEven") # Single precision 57 | FP64 = Semantics(11, 53, "NearestTiesToEven") # Double precision 58 | FP128 = Semantics(15, 113, "NearestTiesToEven") # Quadruple precision 59 | FP256 = Semantics(19, 237, "NearestTiesToEven") # Octuple precision 60 | 61 | version = "0.1.11" 62 | -------------------------------------------------------------------------------- /benches/main_benchmark.rs: -------------------------------------------------------------------------------- 1 | use arpfloat::{BigInt, Float, RoundingMode, Semantics}; 2 | 3 | use RoundingMode::NearestTiesToEven as rme; 4 | 5 | fn test_e() { 6 | let sem = Semantics::new(32, 2000, rme); 7 | black_box(Float::e(sem)); 8 | } 9 | 10 | fn test_sqrt() { 11 | let sem = Semantics::new(32, 10000, rme); 12 | black_box(Float::one(sem, false).scale(1, rme).sqrt()); 13 | } 14 | 15 | fn test_pi() { 16 | let sem = Semantics::new(32, 2000, rme); 17 | black_box(Float::pi(sem)); 18 | } 19 | 20 | fn test_powi() { 21 | let a = BigInt::from_u64(1275563424); 22 | black_box(a.powi(11000)); 23 | } 24 | 25 | fn test_bigint_as_dec() { 26 | let a = BigInt::from_u64(197123); 27 | black_box(a.powi(100).as_decimal()); 28 | } 29 | 30 | fn test_bigint_div() { 31 | let a = BigInt::pseudorandom(1000, 12345); 32 | let b = BigInt::pseudorandom(500, 67890); 33 | black_box(a / b); 34 | } 35 | 36 | fn test_cos() { 37 | let sem = Semantics::new(32, 90, rme); 38 | for i in 0..100 { 39 | let a = Float::from_u64(sem, i).cos(); 40 | black_box(a); 41 | } 42 | } 43 | 44 | fn test_sin() { 45 | let sem = Semantics::new(32, 90, rme); 46 | for i in 0..100 { 47 | let a = Float::from_u64(sem, i).sin(); 48 | black_box(a); 49 | } 50 | } 51 | 52 | fn test_log() { 53 | let sem = Semantics::new(32, 100, rme); 54 | for i in 0..100 { 55 | let a = Float::from_u64(sem, i).log(); 56 | black_box(a); 57 | } 58 | } 59 | 60 | fn test_exp() { 61 | let sem = Semantics::new(32, 
100, rme); 62 | for i in 0..1000 { 63 | let a = Float::from_u64(sem, 100 - i).exp(); 64 | let b = Float::from_u64(sem, i).exp(); 65 | black_box(a + b); 66 | } 67 | } 68 | 69 | fn test_bigint_mul_1() { 70 | let a = BigInt::pseudorandom(1000, 98765); 71 | let b = BigInt::pseudorandom(1000, 43210); 72 | black_box(a * b); 73 | } 74 | 75 | fn test_bigint_mul_2() { 76 | let a = BigInt::pseudorandom(10, 98765); 77 | let b = BigInt::pseudorandom(10, 43210); 78 | black_box(a * b); 79 | } 80 | 81 | fn test_bigint_mul_3() { 82 | let a = BigInt::pseudorandom(100, 98765); 83 | let b = BigInt::pseudorandom(100, 43210); 84 | black_box(a * b); 85 | } 86 | 87 | fn test_bigint_mul_4() { 88 | let a = BigInt::pseudorandom(5000, 98765); 89 | let b = BigInt::pseudorandom(1, 43210); 90 | black_box(a * b); 91 | } 92 | 93 | fn test_bigint_div_1() { 94 | let a = BigInt::pseudorandom(1000, 98765); 95 | let b = BigInt::pseudorandom(1000, 43210); 96 | black_box(a / b); 97 | } 98 | 99 | fn test_bigint_div_2() { 100 | let a = BigInt::pseudorandom(1000, 98765); 101 | let b = BigInt::pseudorandom(1, 43210); 102 | black_box(a / b); 103 | } 104 | 105 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 106 | 107 | pub fn criterion_benchmark(c: &mut Criterion) { 108 | c.bench_function("pi", |b| b.iter(test_pi)); 109 | c.bench_function("e", |b| b.iter(test_e)); 110 | c.bench_function("sqrt", |b| b.iter(test_sqrt)); 111 | c.bench_function("powi", |b| b.iter(test_powi)); 112 | c.bench_function("bigint_as_dec", |b| b.iter(test_bigint_as_dec)); 113 | c.bench_function("bigint_div", |b| b.iter(test_bigint_div)); 114 | c.bench_function("cos", |b| b.iter(test_cos)); 115 | c.bench_function("sin", |b| b.iter(test_sin)); 116 | c.bench_function("exp", |b| b.iter(test_exp)); 117 | c.bench_function("log", |b| b.iter(test_log)); 118 | c.bench_function("bigint_mul_1", |b| b.iter(test_bigint_mul_1)); 119 | c.bench_function("bigint_mul_2", |b| b.iter(test_bigint_mul_2)); 120 | 
c.bench_function("bigint_mul_3", |b| b.iter(test_bigint_mul_3)); 121 | c.bench_function("bigint_mul_4", |b| b.iter(test_bigint_mul_4)); 122 | c.bench_function("bigint_div_1", |b| b.iter(test_bigint_div_1)); 123 | c.bench_function("bigint_div_2", |b| b.iter(test_bigint_div_2)); 124 | } 125 | 126 | criterion_group!(benches, criterion_benchmark); 127 | criterion_main!(benches); 128 | -------------------------------------------------------------------------------- /examples/calc_pi.rs: -------------------------------------------------------------------------------- 1 | //! Calculate the value of PI using the Chudnovsky_algorithm. 2 | //! cargo run --example calc_pi --release 3 | 4 | use arpfloat::{Float, FP256}; 5 | 6 | fn main() { 7 | // https://en.wikipedia.org/wiki/Chudnovsky_algorithm 8 | let iterations = 5; 9 | 10 | // Constants: 11 | let c1 = Float::from_u64(FP256, 10005).sqrt(); 12 | let c2 = Float::from_u64(FP256, 545140134); 13 | let c3 = Float::from_i64(FP256, -262537412640768000); 14 | let c16 = Float::from_u64(FP256, 16); 15 | let c12 = Float::from_u64(FP256, 12); 16 | 17 | // Initial state. 
18 | let mut kc = Float::from_u64(FP256, 6); 19 | let mut m = Float::from_u64(FP256, 1); 20 | let mut l = Float::from_u64(FP256, 13591409); 21 | let mut x = Float::from_u64(FP256, 1); 22 | let mut s = Float::from_u64(FP256, 13591409); 23 | 24 | for q in 1..iterations + 1 { 25 | let q3 = Float::from_u64(FP256, q * q * q); 26 | let k3 = &kc * &(&kc * &kc); 27 | m = (k3 - (&kc * &c16)) * m / q3; 28 | l += &c2; 29 | x *= &c3; 30 | s += &(&m * &l) / &x; 31 | kc += &c12; 32 | } 33 | let pi = Float::from_u64(FP256, 426880) * (c1 / s); 34 | println!("pi = {}", pi); 35 | assert_eq!(pi.as_f64(), std::f64::consts::PI); 36 | } 37 | -------------------------------------------------------------------------------- /examples/fma.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from arpfloat import FP32, fp64, Semantics, zero, fma 3 | 4 | # Create two random numpy arrays in the range [0,1) 5 | A0 = np.random.rand(1024) 6 | A1 = np.random.rand(1024) 7 | 8 | # Create the fp8 format (4 exponent bits, 3 mantissa bits + 1 implicit bit) 9 | FP8 = Semantics(4, 3 + 1, "NearestTiesToEven") 10 | 11 | # Convert the arrays to FP8 12 | B0 = [fp64(x).cast(FP8) for x in A0] 13 | B1 = [fp64(x).cast(FP8) for x in A1] 14 | 15 | acc = zero(FP32) 16 | for x, y in zip(B0, B1): 17 | acc = fma(x.cast(FP32), y.cast(FP32), acc) 18 | 19 | print("Using fp8/fp32 arithmetic: ", acc) 20 | print("Using fp32 arithmetic : ", np.dot(A0, A1)) -------------------------------------------------------------------------------- /examples/print_e.rs: -------------------------------------------------------------------------------- 1 | //! Calculates long numbers and prints them. 2 | //! 
cargo run --example print_e --release 3 | 4 | use arpfloat::{Float, RoundingMode, Semantics}; 5 | 6 | fn main() { 7 | let sem = Semantics::new(32, 5000, RoundingMode::NearestTiesToEven); 8 | let val = Float::e(sem); 9 | println!("F64: {}", val.as_f64()); 10 | println!("FP*: {}", val); 11 | } 12 | -------------------------------------------------------------------------------- /examples/softmax.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from arpfloat import FP32, BF16, fp64, zero 3 | 4 | dtype = BF16 5 | 6 | A0 = np.random.rand(6) # Random array in the range [0,1) 7 | B0 = [fp64(x).cast(dtype) for x in A0] # Convert to the emulated format. 8 | 9 | # Find the max value. 10 | max_val = max(B0) 11 | 12 | # calculate exp(x-max) for each value. 13 | shifted_exp = [(x - max_val).exp() for x in B0] 14 | exp_sum = sum(shifted_exp) 15 | 16 | # calculate the softmax: [exp(x-max) / sum(exp(x-max))] 17 | result = [x / exp_sum for x in shifted_exp] 18 | print("Calculated = ", result) 19 | 20 | # NumPy's softmax. 
21 | np_softmax = np.exp(A0 - np.max(A0)) / np.exp(A0 - np.max(A0)).sum() 22 | print("Reference = ", np_softmax) 23 | 24 | 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64.0.0", "wheel", "setuptools-rust>=1.5.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "arpfloat" 7 | version = "0.1.11" 8 | description = "Arbitrary-precision floating point library" 9 | authors = [ 10 | {name = "Nadav Rotem", email = "nadav256@gmail.com"}, 11 | ] 12 | readme = "README.md" 13 | requires-python = ">=3.6" 14 | 15 | [project.urls] 16 | Homepage = "https://github.com/nadavrot/arpfloat" 17 | Documentation = "https://docs.rs/arpfloat/" 18 | Repository = "https://github.com/nadavrot/arpfloat" 19 | 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==2.2.4 2 | semantic-version==2.10.0 3 | setuptools==78.1.0 4 | setuptools-rust==1.11.1 5 | wheel==0.45.1 6 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 80 2 | 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools_rust import Binding, RustExtension 3 | 4 | setup( 5 | name="arpfloat", 6 | version="0.1.11", # Match the version in Cargo.toml 7 | description="Arbitrary-precision floating point library", 8 | author="Nadav Rotem", 9 | author_email="nadav256@gmail.com", 10 | url="https://github.com/nadavrot/arpfloat", 11 | rust_extensions=[ 12 | RustExtension( 13 | 
"arpfloat._arpfloat", 14 | binding=Binding.PyO3, 15 | debug=False, 16 | features=["python"], 17 | ) 18 | ], 19 | package_data={"arpfloat": ["py.typed"]}, 20 | packages=["arpfloat"], 21 | zip_safe=False, 22 | python_requires=">=3.6", 23 | ) 24 | -------------------------------------------------------------------------------- /src/arithmetic.rs: -------------------------------------------------------------------------------- 1 | //! This module contains the implementation of the basic arithmetic operations: 2 | //! Addition, Subtraction, Multiplication, Division. 3 | extern crate alloc; 4 | use crate::bigint::BigInt; 5 | 6 | use super::bigint::LossFraction; 7 | use super::float::{Category, Float, RoundingMode}; 8 | use core::cmp::Ordering; 9 | use core::ops::{ 10 | Add, AddAssign, Div, DivAssign, Mul, MulAssign, Sub, SubAssign, 11 | }; 12 | 13 | impl Float { 14 | /// An inner function that performs the addition and subtraction of normal 15 | /// numbers (no NaN, Inf, Zeros). 16 | /// See Pg 247. Chapter 8. Algorithms for the Five Basic Operations. 17 | /// This implementation follows the APFloat implementation, that does not 18 | /// swap the operands. 19 | fn add_or_sub_normals( 20 | a: &Self, 21 | b: &Self, 22 | subtract: bool, 23 | ) -> (Self, LossFraction) { 24 | debug_assert_eq!(a.get_semantics(), b.get_semantics()); 25 | let sem = a.get_semantics(); 26 | let loss; 27 | let mut a = a.clone(); 28 | let mut b = b.clone(); 29 | 30 | // Align the input numbers on the same exponent. 31 | let bits = a.get_exp() - b.get_exp(); 32 | 33 | // Can transform (a-b) to (a + -b), either way, there are cases where 34 | // subtraction needs to happen. 35 | let subtract = subtract ^ (a.get_sign() ^ b.get_sign()); 36 | if subtract { 37 | // Align the input numbers. We shift LHS one bit to the left to 38 | // allow carry/borrow in case of underflow as result of subtraction. 
39 | match bits.cmp(&0) { 40 | Ordering::Equal => { 41 | loss = LossFraction::ExactlyZero; 42 | } 43 | Ordering::Greater => { 44 | loss = b.shift_significand_right((bits - 1) as u64); 45 | a.shift_significand_left(1); 46 | } 47 | Ordering::Less => { 48 | loss = a.shift_significand_right((-bits - 1) as u64); 49 | b.shift_significand_left(1); 50 | } 51 | } 52 | 53 | let a_mantissa = a.get_mantissa(); 54 | let b_mantissa = b.get_mantissa(); 55 | let ab_mantissa; 56 | let mut sign = a.get_sign(); 57 | 58 | // Figure out the carry from the shifting operations that dropped 59 | // bits. 60 | let c = !loss.is_exactly_zero() as u64; 61 | let c = BigInt::from_u64(c); 62 | 63 | // Figure out which mantissa is larger, to make sure that we don't 64 | // overflow the subtraction. 65 | if a_mantissa < b_mantissa { 66 | // A < B 67 | ab_mantissa = b_mantissa - a_mantissa - c; 68 | sign = !sign; 69 | } else { 70 | // A >= B 71 | ab_mantissa = a_mantissa - b_mantissa - c; 72 | } 73 | ( 74 | Self::from_parts(sem, sign, a.get_exp(), ab_mantissa), 75 | loss.invert(), 76 | ) 77 | } else { 78 | // Handle the easy case of Add: 79 | let mut b = b.clone(); 80 | let mut a = a.clone(); 81 | if bits > 0 { 82 | loss = b.shift_significand_right(bits as u64); 83 | } else { 84 | loss = a.shift_significand_right(-bits as u64); 85 | } 86 | debug_assert!(a.get_exp() == b.get_exp()); 87 | let ab_mantissa = a.get_mantissa() + b.get_mantissa(); 88 | ( 89 | Self::from_parts(sem, a.get_sign(), a.get_exp(), ab_mantissa), 90 | loss, 91 | ) 92 | } 93 | } 94 | 95 | /// Computes a+b using the rounding mode `rm`. 96 | pub fn add_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self { 97 | Self::add_sub(a, b, false, rm) 98 | } 99 | /// Computes a-b using the rounding mode `rm`. 
pub fn sub_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self {
    Self::add_sub(a, b, true, rm)
}

/// Shared implementation of addition and subtraction.
///
/// Computes `a + b`, or `a - b` when `subtract` is set, dispatching on the
/// categories of the operands. Only the Normal/Normal case performs real
/// arithmetic (followed by rounding with `rm`); every other combination is
/// resolved from the special-value table.
fn add_sub(a: &Self, b: &Self, subtract: bool, rm: RoundingMode) -> Self {
    let sem = a.get_semantics();
    // Table 8.2: Specification of addition for positive floating-point
    // data. Pg 247.
    match (a.get_category(), b.get_category()) {
        (Category::NaN, Category::Infinity)
        | (Category::NaN, Category::NaN)
        | (Category::NaN, Category::Normal)
        | (Category::NaN, Category::Zero)
        | (Category::Normal, Category::Zero)
        | (Category::Infinity, Category::Normal)
        | (Category::Infinity, Category::Zero) => a.clone(),

        (Category::Zero, Category::NaN)
        | (Category::Normal, Category::NaN)
        | (Category::Infinity, Category::NaN) => {
            Self::nan(sem, b.get_sign())
        }

        (Category::Normal, Category::Infinity)
        | (Category::Zero, Category::Infinity) => {
            Self::inf(sem, b.get_sign() ^ subtract)
        }

        (Category::Zero, Category::Normal) => Self::from_parts(
            sem,
            b.get_sign() ^ subtract,
            b.get_exp(),
            b.get_mantissa(),
        ),

        (Category::Zero, Category::Zero) => {
            // Subtraction is addition of the negated right-hand side, so
            // flip the sign of `b` first. IEEE 754 (section 6.3): zeros of
            // the same sign keep that sign; zeros of opposite signs give
            // +0, except under the negative rounding mode, which gives -0.
            let a_sign = a.get_sign();
            let b_sign = b.get_sign() ^ subtract;
            let sign = if a_sign == b_sign {
                a_sign
            } else {
                RoundingMode::Negative == rm
            };
            Self::zero(sem, sign)
        }

        (Category::Infinity, Category::Infinity) => {
            if a.get_sign() ^ b.get_sign() ^ subtract {
                return Self::nan(sem, a.get_sign() ^ b.get_sign());
            }
            Self::inf(sem, a.get_sign())
        }

        (Category::Normal, Category::Normal) => {
            // The IEEE 754 spec (section 6.3) states that cancellation
            // results in a positive zero, except for the case of the
            // negative rounding mode.
            let cancellation = subtract == (a.get_sign() == b.get_sign());
            let same_absolute_number = a.same_absolute_value(b);
            if cancellation && same_absolute_number {
                let is_negative = RoundingMode::Negative == rm;
                return Self::zero(sem, is_negative);
            }

            let mut res = Self::add_or_sub_normals(a, b, subtract);
            res.0.normalize(rm, res.1);
            res.0
        }
    }
}
} // Closes the `impl Float` block that contains the methods above.

#[test]
fn test_add() {
    use super::float::FP64;
    let a = Float::from_u64(FP64, 1);
    let b = Float::from_u64(FP64, 2);
    let _ = Float::add(a, b);
}

#[test]
fn test_addition() {
    fn add_helper(a: f64, b: f64) -> f64 {
        let a = Float::from_f64(a);
        let b = Float::from_f64(b);
        let c = Float::add(a, b);
        c.as_f64()
    }

    assert_eq!(add_helper(0., -4.), -4.);
    assert_eq!(add_helper(-4., 0.), -4.);
    assert_eq!(add_helper(1., 1.), 2.);
    assert_eq!(add_helper(8., 4.), 12.);
    assert_eq!(add_helper(128., 2.), 130.);
    assert_eq!(add_helper(128., -8.), 120.);
    assert_eq!(add_helper(64., -60.), 4.);
    assert_eq!(add_helper(69., -65.), 4.);
    assert_eq!(add_helper(69., 69.), 138.);
    assert_eq!(add_helper(69., 1.), 70.);
    assert_eq!(add_helper(-128., -8.), -136.);
    assert_eq!(add_helper(64., -65.), -1.);
    assert_eq!(add_helper(-64., -65.), -129.);
    assert_eq!(add_helper(-15., -15.), -30.);

    assert_eq!(add_helper(-15., 15.), 0.);

    for i in -4..15 {
        for j in i..15 {
            assert_eq!(
                add_helper(f64::from(j), f64::from(i)),
                f64::from(i) + f64::from(j)
            );
        }
    }

    // Check that adding a negative and positive results in a positive zero for
    // the default rounding mode.
    let a = Float::from_f64(4.0);
    let b = Float::from_f64(-4.0);
    let c = Float::add(a.clone(), b);
    let d = Float::sub(a.clone(), a);
    assert!(c.is_zero());
    assert!(!c.is_negative());
    assert!(d.is_zero());
    assert!(!d.is_negative());
}

// Check the sign of zero +/- zero (IEEE 754, section 6.3).
#[test]
fn test_add_sub_signed_zeros() {
    let rm = RoundingMode::NearestTiesToEven;
    let pos = Float::from_f64(0.0);
    let neg = Float::from_f64(-0.0);
    let pos_bits = 0.0_f64.to_bits();
    let neg_bits = (-0.0_f64).to_bits();

    fn bits(x: Float) -> u64 {
        x.as_f64().to_bits()
    }

    // Addition.
    assert_eq!(bits(Float::add_with_rm(&pos, &pos, rm)), pos_bits);
    assert_eq!(bits(Float::add_with_rm(&neg, &neg, rm)), neg_bits);
    assert_eq!(bits(Float::add_with_rm(&pos, &neg, rm)), pos_bits);
    assert_eq!(bits(Float::add_with_rm(&neg, &pos, rm)), pos_bits);

    // Subtraction is the addition of the negated right-hand side.
    assert_eq!(bits(Float::sub_with_rm(&pos, &pos, rm)), pos_bits);
    assert_eq!(bits(Float::sub_with_rm(&pos, &neg, rm)), pos_bits);
    assert_eq!(bits(Float::sub_with_rm(&neg, &pos, rm)), neg_bits);
    assert_eq!(bits(Float::sub_with_rm(&neg, &neg, rm)), pos_bits);

    // Opposite-signed zeros produce a negative zero when rounding down.
    let down = RoundingMode::Negative;
    assert_eq!(bits(Float::add_with_rm(&pos, &neg, down)), neg_bits);
    assert_eq!(bits(Float::sub_with_rm(&pos, &pos, down)), neg_bits);
}

// Pg 120. Chapter 4. Basic Properties and Algorithms.
#[test]
fn test_addition_large_numbers() {
    use super::float::FP64;
    let rm = RoundingMode::NearestTiesToEven;

    let one = Float::from_i64(FP64, 1);
    let mut a = Float::from_i64(FP64, 1);

    while Float::sub_with_rm(&Float::add_with_rm(&a, &one, rm), &a, rm) == one {
        a = Float::add_with_rm(&a, &a, rm);
    }

    let mut b = one.clone();
    while Float::sub_with_rm(&Float::add_with_rm(&a, &b, rm), &a, rm) != b {
        b = Float::add_with_rm(&b, &one, rm);
    }

    assert_eq!(a.as_f64(), 9007199254740992.);
    assert_eq!(b.as_f64(), 2.);
}

#[test]
fn add_denormals() {
    let v0 = f64::from_bits(0x0000_0000_0010_0010);
    let v1 = f64::from_bits(0x0000_0000_1001_0010);
    let v2 = f64::from_bits(0x1000_0000_0001_0010);
    assert_eq!(add_f64(v2, -v1), v2 - v1);

    let a0 = Float::from_f64(v0);
    assert_eq!(a0.as_f64(), v0);

    fn add_f64(a: f64, b: f64) -> f64 {
        let a0 = Float::from_f64(a);
        let b0 = Float::from_f64(b);
        assert_eq!(a0.as_f64(), a);
        Float::add(a0, b0).as_f64()
    }

    // Add and subtract denormals.
    assert_eq!(add_f64(v0, v1), v0 + v1);
    assert_eq!(add_f64(v0, -v0), v0 - v0);
    assert_eq!(add_f64(v0, v2), v0 + v2);
    assert_eq!(add_f64(v2, v1), v2 + v1);
    assert_eq!(add_f64(v2, -v1), v2 - v1);

    // Add and subtract denormals and normal numbers.
    assert_eq!(add_f64(v0, 10.), v0 + 10.);
    assert_eq!(add_f64(v0, -10.), v0 - 10.);
    assert_eq!(add_f64(10000., v0), 10000. + v0);
}

#[cfg(feature = "std")]
#[test]
fn add_special_values() {
    use crate::utils;

    // Test the addition of various irregular values.
    let values = utils::get_special_test_values();

    fn add_f64(a: f64, b: f64) -> f64 {
        let a = Float::from_f64(a);
        let b = Float::from_f64(b);
        Float::add(a, b).as_f64()
    }

    for v0 in values {
        for v1 in values {
            let r0 = add_f64(v0, v1);
            let r1 = v0 + v1;
            let r0_bits = r0.to_bits();
            let r1_bits = r1.to_bits();
            assert_eq!(r0.is_finite(), r1.is_finite());
            assert_eq!(r0.is_nan(), r1.is_nan());
            assert_eq!(r0.is_infinite(), r1.is_infinite());
            assert_eq!(r0.is_normal(), r1.is_normal());
            // Check that normal results are bit identical.
            assert!(!r0.is_normal() || r0_bits == r1_bits);
        }
    }
}

#[test]
fn test_add_random_vals() {
    use crate::utils;

    let mut lfsr = utils::Lfsr::new();

    let v0: u64 = 0x645e91f69778bad3;
    let v1: u64 = 0xe4d91b16be9ae0c5;

    fn add_f64(a: f64, b: f64) -> f64 {
        let a = Float::from_f64(a);
        let b = Float::from_f64(b);
        let k = Float::add(a, b);
        k.as_f64()
    }

    let f0 = f64::from_bits(v0);
    let f1 = f64::from_bits(v1);

    let r0 = add_f64(f0, f1);
    let r1 = f0 + f1;

    assert_eq!(r0.is_finite(), r1.is_finite());
    assert_eq!(r0.is_nan(), r1.is_nan());
    assert_eq!(r0.is_infinite(), r1.is_infinite());
    let r0_bits = r0.to_bits();
    let r1_bits = r1.to_bits();
    // Check that the results are bit identical, or are both NaN.
    assert!(r1.is_nan() || r0_bits == r1_bits);

    for _ in 0..50000 {
        let v0 = lfsr.get64();
        let v1 = lfsr.get64();

        let f0 = f64::from_bits(v0);
        let f1 = f64::from_bits(v1);

        let r0 = add_f64(f0, f1);
        let r1 = f0 + f1;

        assert_eq!(r0.is_finite(), r1.is_finite());
        assert_eq!(r0.is_nan(), r1.is_nan());
        assert_eq!(r0.is_infinite(), r1.is_infinite());
        let r0_bits = r0.to_bits();
        let r1_bits = r1.to_bits();
        // Check that the results are bit identical, or are both NaN.
        assert!(r1.is_nan() || r0_bits == r1_bits);
    }
}

impl Float {
    /// Compute a*b using the rounding mode `rm`.
    pub fn mul_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self {
        let sem = a.get_semantics();
        let sign = a.get_sign() ^ b.get_sign();

        // Table 8.4: Specification of multiplication for floating-point data of
        // positive sign. Page 251.
        match (a.get_category(), b.get_category()) {
            (Category::Zero, Category::NaN)
            | (Category::Normal, Category::NaN)
            | (Category::Infinity, Category::NaN) => {
                Self::nan(sem, b.get_sign())
            }
            (Category::NaN, Category::Infinity)
            | (Category::NaN, Category::NaN)
            | (Category::NaN, Category::Normal)
            | (Category::NaN, Category::Zero) => Self::nan(sem, a.get_sign()),
            (Category::Normal, Category::Infinity)
            | (Category::Infinity, Category::Normal)
            | (Category::Infinity, Category::Infinity) => Self::inf(sem, sign),
            (Category::Normal, Category::Zero)
            | (Category::Zero, Category::Normal)
            | (Category::Zero, Category::Zero) => Self::zero(sem, sign),

            (Category::Zero, Category::Infinity)
            | (Category::Infinity, Category::Zero) => Self::nan(sem, sign),

            (Category::Normal, Category::Normal) => {
                let (mut res, loss) = Self::mul_normals(a, b, sign);
                res.normalize(rm, loss);
                res
            }
        }
    }

    /// See Pg 251. 8.4 Floating-Point Multiplication
    ///
    /// Multiplies two normal numbers without rounding. The full-width
    /// product of the mantissas is kept, so the reported loss is always
    /// `ExactlyZero`; the caller rounds the result with `normalize`.
    fn mul_normals(a: &Self, b: &Self, sign: bool) -> (Self, LossFraction) {
        debug_assert_eq!(a.get_semantics(), b.get_semantics());
        let sem = a.get_semantics();
        // We multiply digits in the format 1.xx * 2^(e), or mantissa * 2^(e+1).
        // When we multiply two 2^(e+1) numbers, we get:
        // log(2^(e_a+1)*2^(e_b+1)) = e_a + e_b + 2.
        let mut exp = a.get_exp() + b.get_exp();

        let a_significand = a.get_mantissa();
        let b_significand = b.get_mantissa();
        let ab_significand = a_significand * b_significand;

        // The exponent is correct, but the bits are not in the right place.
        // Set the right exponent for where the bits are placed, and fix the
        // exponent below.
        exp -= sem.get_mantissa_len() as i64;

        let loss = LossFraction::ExactlyZero;
        (Self::from_parts(sem, sign, exp, ab_significand), loss)
    }
}

#[test]
fn test_mul_simple() {
    let a: f64 = -24.0;
    let b: f64 = 0.1;

    let af = Float::from_f64(a);
    let bf = Float::from_f64(b);
    let cf = Float::mul(af, bf);

    let r0 = cf.as_f64();
    let r1: f64 = a * b;
    assert_eq!(r0, r1);
}

#[test]
fn mul_regular_values() {
    // Test the multiplication of regular values.
    let values = [-5.0, 0., -0., 24., 1., 11., 10000., 256., 0.1, 3., 17.5];

    fn mul_f64(a: f64, b: f64) -> f64 {
        let a = Float::from_f64(a);
        let b = Float::from_f64(b);
        Float::mul(a, b).as_f64()
    }

    for v0 in values {
        for v1 in values {
            let r0 = mul_f64(v0, v1);
            let r1 = v0 * v1;
            let r0_bits = r0.to_bits();
            let r1_bits = r1.to_bits();
            // Check that the results are bit identical.
            assert_eq!(r0_bits, r1_bits);
        }
    }
}

#[cfg(feature = "std")]
#[test]
fn test_mul_special_values() {
    use super::utils;

    // Test the multiplication of various irregular values.
    let values = utils::get_special_test_values();

    fn mul_f64(a: f64, b: f64) -> f64 {
        let a = Float::from_f64(a);
        let b = Float::from_f64(b);
        Float::mul(a, b).as_f64()
    }

    for v0 in values {
        for v1 in values {
            let r0 = mul_f64(v0, v1);
            let r1 = v0 * v1;
            assert_eq!(r0.is_finite(), r1.is_finite());
            assert_eq!(r0.is_nan(), r1.is_nan());
            assert_eq!(r0.is_infinite(), r1.is_infinite());
            let r0_bits = r0.to_bits();
            let r1_bits = r1.to_bits();
            // Check that the results are bit identical, or are both NaN.
            assert!(r1.is_nan() || r0_bits == r1_bits);
        }
    }
}

#[test]
fn test_mul_random_vals() {
    use super::utils;

    let mut lfsr = utils::Lfsr::new();

    fn mul_f64(a: f64, b: f64) -> f64 {
        let a = Float::from_f64(a);
        let b = Float::from_f64(b);
        let k = Float::mul(a, b);
        k.as_f64()
    }

    for _ in 0..50000 {
        let v0 = lfsr.get64();
        let v1 = lfsr.get64();

        let f0 = f64::from_bits(v0);
        let f1 = f64::from_bits(v1);

        let r0 = mul_f64(f0, f1);
        let r1 = f0 * f1;
        assert_eq!(r0.is_finite(), r1.is_finite());
        assert_eq!(r0.is_nan(), r1.is_nan());
        assert_eq!(r0.is_infinite(), r1.is_infinite());
        let r0_bits = r0.to_bits();
        let r1_bits = r1.to_bits();
        // Check that the results are bit identical, or are both NaN.
        assert!(r1.is_nan() || r0_bits == r1_bits);
    }
}

impl Float {
    /// Compute a/b, with the rounding mode `rm`.
512 | pub fn div_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self { 513 | let sem = a.get_semantics(); 514 | let sign = a.get_sign() ^ b.get_sign(); 515 | // Table 8.5: Special values for x/y - Page 263. 516 | match (a.get_category(), b.get_category()) { 517 | (Category::NaN, _) 518 | | (_, Category::NaN) 519 | | (Category::Zero, Category::Zero) 520 | | (Category::Infinity, Category::Infinity) => Self::nan(sem, sign), 521 | 522 | (_, Category::Infinity) => Self::zero(sem, sign), 523 | (Category::Zero, _) => Self::zero(sem, sign), 524 | (_, Category::Zero) => Self::inf(sem, sign), 525 | (Category::Infinity, _) => Self::inf(sem, sign), 526 | (Category::Normal, Category::Normal) => { 527 | let (mut res, loss) = Self::div_normals(a, b); 528 | res.normalize(rm, loss); 529 | res 530 | } 531 | } 532 | } 533 | 534 | /// Compute a/b, where both `a` and `b` are normals. 535 | /// Page 262 8.6. Floating-Point Division. 536 | /// This implementation uses a regular integer division for the mantissa. 537 | fn div_normals(a: &Self, b: &Self) -> (Self, LossFraction) { 538 | debug_assert_eq!(a.get_semantics(), b.get_semantics()); 539 | let sem = a.get_semantics(); 540 | 541 | let mut a = a.clone(); 542 | let mut b = b.clone(); 543 | // Start by normalizing the dividend and divisor to the MSB. 544 | a.align_mantissa(); // Normalize the dividend. 545 | b.align_mantissa(); // Normalize the divisor. 546 | 547 | let mut a_mantissa = a.get_mantissa(); 548 | let b_mantissa = b.get_mantissa(); 549 | 550 | // Calculate the sign and exponent. 551 | let mut exp = a.get_exp() - b.get_exp(); 552 | let sign = a.get_sign() ^ b.get_sign(); 553 | 554 | // Make sure that A >= B, to allow the integer division to generate all 555 | // of the bits of the result. 556 | if a_mantissa < b_mantissa { 557 | a_mantissa.shift_left(1); 558 | exp -= 1; 559 | } 560 | 561 | // The bits are now aligned to the MSB of the mantissa. The 562 | // semantics need to be 1.xxxxx, but we perform integer division. 
563 | // Shift the dividend to make sure that we generate the bits after 564 | // the period. 565 | a_mantissa.shift_left(sem.get_mantissa_len()); 566 | let reminder = a_mantissa.inplace_div(&b_mantissa); 567 | 568 | // Find 2 x reminder, to be able to compare to the reminder and figure 569 | // out the kind of loss that we have. 570 | let mut reminder_2x = reminder; 571 | reminder_2x.shift_left(1); 572 | 573 | let reminder = reminder_2x.cmp(&b_mantissa); 574 | let is_zero = reminder_2x.is_zero(); 575 | let loss = match reminder { 576 | Ordering::Less => { 577 | if is_zero { 578 | LossFraction::ExactlyZero 579 | } else { 580 | LossFraction::LessThanHalf 581 | } 582 | } 583 | Ordering::Equal => LossFraction::ExactlyHalf, 584 | Ordering::Greater => LossFraction::MoreThanHalf, 585 | }; 586 | 587 | let x = Self::from_parts(sem, sign, exp, a_mantissa); 588 | (x, loss) 589 | } 590 | } 591 | 592 | #[test] 593 | fn test_div_simple() { 594 | let a: f64 = 1.0; 595 | let b: f64 = 7.0; 596 | 597 | let af = Float::from_f64(a); 598 | let bf = Float::from_f64(b); 599 | let cf = Float::div_with_rm(&af, &bf, RoundingMode::NearestTiesToEven); 600 | 601 | let r0 = cf.as_f64(); 602 | let r1: f64 = a / b; 603 | assert_eq!(r0, r1); 604 | } 605 | 606 | #[cfg(feature = "std")] 607 | #[test] 608 | fn test_div_special_values() { 609 | use super::utils; 610 | 611 | // Test the multiplication of various irregular values. 
612 | let values = utils::get_special_test_values(); 613 | 614 | fn div_f64(a: f64, b: f64) -> f64 { 615 | let a = Float::from_f64(a); 616 | let b = Float::from_f64(b); 617 | Float::div_with_rm(&a, &b, RoundingMode::NearestTiesToEven).as_f64() 618 | } 619 | 620 | for v0 in values { 621 | for v1 in values { 622 | let r0 = div_f64(v0, v1); 623 | let r1 = v0 / v1; 624 | assert_eq!(r0.is_finite(), r1.is_finite()); 625 | assert_eq!(r0.is_nan(), r1.is_nan()); 626 | assert_eq!(r0.is_infinite(), r1.is_infinite()); 627 | let r0_bits = r0.to_bits(); 628 | let r1_bits = r1.to_bits(); 629 | // Check that the results are bit identical, or are both NaN. 630 | assert!(r1.is_nan() || r0_bits == r1_bits); 631 | } 632 | } 633 | } 634 | 635 | macro_rules! declare_operator { 636 | ($trait_name:ident, 637 | $func_name:ident, 638 | $func_impl_name:ident) => { 639 | // Self + Self 640 | impl $trait_name for Float { 641 | type Output = Self; 642 | fn $func_name(self, rhs: Self) -> Self { 643 | let sem = self.get_semantics(); 644 | Self::$func_impl_name(&self, &rhs, sem.get_rounding_mode()) 645 | } 646 | } 647 | 648 | // Self + u64 649 | impl $trait_name for Float { 650 | type Output = Self; 651 | fn $func_name(self, rhs: u64) -> Self { 652 | let sem = self.get_semantics(); 653 | Self::$func_impl_name( 654 | &self, 655 | &Self::Output::from_u64(sem, rhs), 656 | sem.get_rounding_mode(), 657 | ) 658 | } 659 | } 660 | // &Self + &Self 661 | impl $trait_name for &Float { 662 | type Output = Float; 663 | fn $func_name(self, rhs: Self) -> Self::Output { 664 | let sem = self.get_semantics(); 665 | Self::Output::$func_impl_name( 666 | &self, 667 | rhs, 668 | sem.get_rounding_mode(), 669 | ) 670 | } 671 | } 672 | // &Self + u64 673 | impl $trait_name for &Float { 674 | type Output = Float; 675 | fn $func_name(self, rhs: u64) -> Self::Output { 676 | let sem = self.get_semantics(); 677 | Self::Output::$func_impl_name( 678 | &self, 679 | &Self::Output::from_u64(self.get_semantics(), rhs), 680 | 
sem.get_rounding_mode(), 681 | ) 682 | } 683 | } 684 | 685 | // &Self + Self 686 | impl $trait_name for &Float { 687 | type Output = Float; 688 | fn $func_name(self, rhs: Float) -> Self::Output { 689 | let sem = self.get_semantics(); 690 | Self::Output::$func_impl_name( 691 | &self, 692 | &rhs, 693 | sem.get_rounding_mode(), 694 | ) 695 | } 696 | } 697 | }; 698 | } 699 | 700 | declare_operator!(Add, add, add_with_rm); 701 | declare_operator!(Sub, sub, sub_with_rm); 702 | declare_operator!(Mul, mul, mul_with_rm); 703 | declare_operator!(Div, div, div_with_rm); 704 | 705 | macro_rules! declare_assign_operator { 706 | ($trait_name:ident, 707 | $func_name:ident, 708 | $func_impl_name:ident) => { 709 | impl $trait_name for Float { 710 | fn $func_name(&mut self, rhs: Self) { 711 | let sem = self.get_semantics(); 712 | *self = 713 | Self::$func_impl_name(self, &rhs, sem.get_rounding_mode()); 714 | } 715 | } 716 | 717 | impl $trait_name<&Float> for Float { 718 | fn $func_name(&mut self, rhs: &Self) { 719 | let sem = self.get_semantics(); 720 | *self = 721 | Self::$func_impl_name(self, rhs, sem.get_rounding_mode()); 722 | } 723 | } 724 | }; 725 | } 726 | 727 | declare_assign_operator!(AddAssign, add_assign, add_with_rm); 728 | declare_assign_operator!(SubAssign, sub_assign, sub_with_rm); 729 | declare_assign_operator!(MulAssign, mul_assign, mul_with_rm); 730 | declare_assign_operator!(DivAssign, div_assign, div_with_rm); 731 | 732 | #[test] 733 | fn test_operators() { 734 | use crate::FP64; 735 | let a = Float::from_f32(8.0).cast(FP64); 736 | let b = Float::from_f32(2.0).cast(FP64); 737 | let c = &a + &b; 738 | let d = &a - &b; 739 | let e = &a * &b; 740 | let f = &a / &b; 741 | assert_eq!(c.as_f64(), 10.0); 742 | assert_eq!(d.as_f64(), 6.0); 743 | assert_eq!(e.as_f64(), 16.0); 744 | assert_eq!(f.as_f64(), 4.0); 745 | } 746 | 747 | #[test] 748 | fn test_slow_sqrt_2_test() { 749 | use crate::FP128; 750 | use crate::FP64; 751 | 752 | // Find sqrt using a binary search. 
753 | let two = Float::from_f64(2.0).cast(FP128); 754 | let mut high = Float::from_f64(2.0).cast(FP128); 755 | let mut low = Float::from_f64(1.0).cast(FP128); 756 | 757 | for _ in 0..25 { 758 | let mid = (&high + &low) / 2; 759 | if (&mid * &mid) < two { 760 | low = mid; 761 | } else { 762 | high = mid; 763 | } 764 | } 765 | 766 | let res = low.cast(FP64); 767 | assert!(res.as_f64() < 1.4142137_f64); 768 | assert!(res.as_f64() > 1.4142134_f64); 769 | } 770 | 771 | #[cfg(feature = "std")] 772 | #[test] 773 | fn test_famous_pentium4_bug() { 774 | use crate::std::string::ToString; 775 | // https://en.wikipedia.org/wiki/Pentium_FDIV_bug 776 | use crate::FP128; 777 | 778 | let a = Float::from_u64(FP128, 4_195_835); 779 | let b = Float::from_u64(FP128, 3_145_727); 780 | let res = a / b; 781 | let result = res.to_string(); 782 | assert!(result.starts_with("1.333820449136241002")); 783 | } 784 | 785 | impl Float { 786 | // Perform a fused multiply-add of normal numbers, without rounding. 787 | fn fused_mul_add_normals( 788 | a: &Self, 789 | b: &Self, 790 | c: &Self, 791 | ) -> (Self, LossFraction) { 792 | debug_assert_eq!(a.get_semantics(), b.get_semantics()); 793 | let sem = a.get_semantics(); 794 | 795 | // Multiply a and b, without rounding. 796 | let sign = a.get_sign() ^ b.get_sign(); 797 | let mut ab = Self::mul_normals(a, b, sign).0; 798 | 799 | // Shift the product, to allow enough precision for the addition. 800 | // Notice that this can be implemented more efficiently with 3 extra 801 | // bits and sticky bits. 802 | // See 8.5. Floating-Point Fused Multiply-Add, Page 255. 803 | let mut c = c.clone(); 804 | let extra_bits = sem.get_precision() + 1; 805 | ab.shift_significand_left(extra_bits as u64); 806 | c.shift_significand_left(extra_bits as u64); 807 | 808 | // Perform the addition, without rounding. 809 | Self::add_or_sub_normals(&ab, &c, false) 810 | } 811 | 812 | /// Compute a*b + c, with the rounding mode `rm`. 
813 | pub fn fused_mul_add_with_rm( 814 | a: &Self, 815 | b: &Self, 816 | c: &Self, 817 | rm: RoundingMode, 818 | ) -> Self { 819 | if a.is_normal() && b.is_normal() && c.is_normal() { 820 | let (mut res, loss) = Self::fused_mul_add_normals(a, b, c); 821 | res.normalize(rm, loss); // Finally, round the result. 822 | res 823 | } else { 824 | // Perform two operations. First, handle non-normal values. 825 | 826 | // NaN anything = NaN 827 | if a.is_nan() || b.is_nan() || c.is_nan() { 828 | return Self::nan(a.get_semantics(), a.get_sign()); 829 | } 830 | // (infinity * 0) + c = NaN 831 | if (a.is_inf() && b.is_zero()) || (a.is_zero() && b.is_inf()) { 832 | return Self::nan(a.get_semantics(), a.get_sign()); 833 | } 834 | // (normal * normal) + infinity = infinity 835 | if a.is_normal() && b.is_normal() && c.is_inf() { 836 | return c.clone(); 837 | } 838 | // (normal * 0) + c = c 839 | if a.is_zero() || b.is_zero() { 840 | return c.clone(); 841 | } 842 | 843 | // Multiply (with rounding), and add (with rounding). 844 | let ab = Self::mul_with_rm(a, b, rm); 845 | Self::add_with_rm(&ab, c, rm) 846 | } 847 | } 848 | 849 | /// Compute a*b + c. 850 | pub fn fma(a: &Self, b: &Self, c: &Self) -> Self { 851 | Self::fused_mul_add_with_rm(a, b, c, c.get_rounding_mode()) 852 | } 853 | } 854 | 855 | #[test] 856 | fn test_fma() { 857 | let v0 = -10.; 858 | let v1 = -1.1; 859 | let v2 = 0.000000000000000000000000000000000000001; 860 | let af = Float::from_f64(v0); 861 | let bf = Float::from_f64(v1); 862 | let cf = Float::from_f64(v2); 863 | 864 | let r = Float::fused_mul_add_with_rm( 865 | &af, 866 | &bf, 867 | &cf, 868 | RoundingMode::NearestTiesToEven, 869 | ); 870 | 871 | assert_eq!(f64::mul_add(v0, v1, v2), r.as_f64()); 872 | } 873 | 874 | #[cfg(feature = "std")] 875 | #[test] 876 | fn test_fma_simple() { 877 | use super::utils; 878 | // Test the multiplication of various irregular values. 
879 | let values = utils::get_special_test_values(); 880 | for a in values { 881 | for b in values { 882 | for c in values { 883 | let af = Float::from_f64(a); 884 | let bf = Float::from_f64(b); 885 | let cf = Float::from_f64(c); 886 | 887 | let rf = Float::fused_mul_add_with_rm( 888 | &af, 889 | &bf, 890 | &cf, 891 | RoundingMode::NearestTiesToEven, 892 | ); 893 | 894 | let r0 = rf.as_f64(); 895 | let r1: f64 = a.mul_add(b, c); 896 | assert_eq!(r0.is_finite(), r1.is_finite()); 897 | assert_eq!(r0.is_nan(), r1.is_nan()); 898 | assert_eq!(r0.is_infinite(), r1.is_infinite()); 899 | // Check that the results are bit identical, or are both NaN. 900 | assert!(r1.is_nan() || r1.is_infinite() || r0 == r1); 901 | } 902 | } 903 | } 904 | } 905 | 906 | #[test] 907 | fn test_fma_random_vals() { 908 | use super::utils; 909 | 910 | let mut lfsr = utils::Lfsr::new(); 911 | 912 | fn mul_f32(a: f32, b: f32, c: f32) -> f32 { 913 | let a = Float::from_f32(a); 914 | let b = Float::from_f32(b); 915 | let c = Float::from_f32(c); 916 | let k = Float::fused_mul_add_with_rm( 917 | &a, 918 | &b, 919 | &c, 920 | RoundingMode::NearestTiesToEven, 921 | ); 922 | k.as_f32() 923 | } 924 | 925 | for _ in 0..50000 { 926 | let v0 = lfsr.get64() as u32; 927 | let v1 = lfsr.get64() as u32; 928 | let v2 = lfsr.get64() as u32; 929 | 930 | let f0 = f32::from_bits(v0); 931 | let f1 = f32::from_bits(v1); 932 | let f2 = f32::from_bits(v2); 933 | 934 | let r0 = mul_f32(f0, f1, f2); 935 | let r1 = f32::mul_add(f0, f1, f2); 936 | assert_eq!(r0.is_finite(), r1.is_finite()); 937 | assert_eq!(r0.is_nan(), r1.is_nan()); 938 | assert_eq!(r0.is_infinite(), r1.is_infinite()); 939 | let r0_bits = r0.to_bits(); 940 | let r1_bits = r1.to_bits(); 941 | // Check that the results are bit identical, or are both NaN. 
942 | assert!(r1.is_nan() || r0_bits == r1_bits); 943 | } 944 | } 945 | -------------------------------------------------------------------------------- /src/cast.rs: -------------------------------------------------------------------------------- 1 | //! This module contains the implementation of casting-related methods. 2 | 3 | use crate::float::Semantics; 4 | use crate::FP128; 5 | 6 | use super::bigint::BigInt; 7 | use super::bigint::LossFraction; 8 | use super::float::{self, Category}; 9 | use super::float::{Float, RoundingMode, FP32, FP64}; 10 | use super::utils; 11 | use super::utils::mask; 12 | 13 | impl Float { 14 | /// Load the integer `val` into the float. Notice that the number may 15 | /// overflow, or rounded to the nearest even integer. 16 | pub fn from_u64(sem: Semantics, val: u64) -> Self { 17 | Self::from_bigint(FP128, BigInt::from_u64(val)).cast(sem) 18 | } 19 | 20 | /// Load the big int `val` into the float. Notice that the number may 21 | /// overflow, or rounded to the nearest even integer. 22 | pub fn from_bigint(sem: Semantics, val: BigInt) -> Self { 23 | let mut a = 24 | Self::from_parts(sem, false, sem.get_mantissa_len() as i64, val); 25 | a.normalize(sem.get_rounding_mode(), LossFraction::ExactlyZero); 26 | a 27 | } 28 | 29 | /// Load the integer `val` into the float. Notice that the number may 30 | /// overflow or rounded. 31 | pub fn from_i64(sem: Semantics, val: i64) -> Self { 32 | if val < 0 { 33 | let mut a = Self::from_u64(sem, -val as u64); 34 | a.set_sign(true); 35 | return a; 36 | } 37 | 38 | Self::from_u64(sem, val as u64) 39 | } 40 | 41 | /// Converts and returns the rounded integral part. 
pub fn to_i64(&self) -> i64 {
    // NaN and zero both convert to 0.
    if self.is_nan() || self.is_zero() {
        return 0;
    }

    // Infinities saturate to the extreme `i64` values.
    if self.is_inf() {
        if self.get_sign() {
            return i64::MIN;
        } else {
            return i64::MAX;
        }
    }
    // Round with the rounding mode carried by this float's semantics.
    let rm = self.get_rounding_mode();
    let val = self.convert_normal_to_integer(rm);
    // NOTE(review): finite values are not range-checked here; what happens
    // for magnitudes that do not fit in `i64` depends on `BigInt::as_u64`,
    // which is not visible from this file — confirm.
    if self.get_sign() {
        -(val.as_u64() as i64)
    } else {
        val.as_u64() as i64
    }
}

/// Returns a value that is rounded to the nearest integer that's not larger
/// in magnitude than this float.
pub fn trunc(&self) -> Self {
    // Only handle normal numbers (don't do anything to NaN, Inf, Zero).
    if !self.is_normal() {
        return self.clone();
    }

    let exp = self.get_exp();

    if exp > self.get_mantissa_len() as i64 {
        // Already an integer.
        return self.clone();
    }

    // Early exit for small magnitudes: return a zero that keeps the sign.
    // An exponent of exactly -1 takes the general path below.
    if exp < -1 {
        return Self::zero(self.get_semantics(), self.get_sign());
    }

    // This is a fraction. Figure out which bits represent values over one
    // and clear out the values that represent the fraction.
    let trim = (self.get_mantissa_len() as i64 - exp) as usize;
    let mut m = self.get_mantissa();
    // Shifting right and back left zeroes the low `trim` bits.
    m.shift_right(trim);
    m.shift_left(trim);
    // `from_parts` returns a zero float if the mantissa ended up empty.
    Self::from_parts(
        self.get_semantics(),
        self.get_sign(),
        self.get_exp(),
        m,
    )
}

/// Returns the nearest integer, with halfway cases rounded away from zero.
pub fn round(&self) -> Self {
    use crate::float::shift_right_with_loss;
    let sem = self.get_semantics();

    // Only handle normal numbers (don't do anything to NaN, Inf, Zero).
    if !self.is_normal() {
        return self.clone();
    }

    let exp = self.get_exp();

    if exp > self.get_mantissa_len() as i64 {
        // Already an integer.
        return self.clone();
    }

    // Numbers that are between 0.5 and 1.0 are rounded to 1.0.
    if exp == -1 {
        return Self::one(sem, self.get_sign());
    }

    // Early exit for small magnitudes: return a zero that keeps the sign.
    // An exponent of exactly -2 takes the general path below.
    if exp < -2 {
        return Self::zero(sem, self.get_sign());
    }

    // This is a fraction. Figure out which bits represent values over one
    // and clear out the values that represent the fraction.
    let trim = (self.get_mantissa_len() as i64 - exp) as usize;
    let (mut m, loss) = shift_right_with_loss(&self.get_mantissa(), trim);
    m.shift_left(trim);
    // `t` is the truncated value; `loss` describes the discarded fraction.
    let t = Self::from_parts(sem, self.get_sign(), self.get_exp(), m);

    // Less than half was dropped: keep the truncated value. Otherwise move
    // one unit further away from zero.
    if loss.is_lt_half() {
        t
    } else if self.get_sign() {
        t - Self::one(sem, false)
    } else {
        t + Self::one(sem, false)
    }
}

/// Converts this value to an integer magnitude (the sign is not applied),
/// rounding with `rm` when fractional bits are shifted out.
pub(crate) fn convert_normal_to_integer(&self, rm: RoundingMode) -> BigInt {
    // We are converting to integer, so set the center point of the exponent
    // to the lsb instead of the msb.
    let i_exp = self.get_exp() - self.get_mantissa_len() as i64;
    if i_exp < 0 {
        // Some mantissa bits are fractional: drop them and record the loss.
        let (mut m, loss) = float::shift_right_with_loss(
            &self.get_mantissa(),
            -i_exp as usize,
        );
        if self.need_round_away_from_zero(rm, loss) {
            m.inplace_add(&BigInt::one());
        }
        m
    } else {
        // The value is a whole number; scale the mantissa up.
        let mut m = self.get_mantissa();
        m.shift_left(i_exp as usize);
        m
    }
}

/// Decodes the bit pattern `float`, laid out as sign, exponent and mantissa
/// fields with the widths given by `sem`, into a `Float`.
fn from_bits(sem: Semantics, float: u64) -> Self {
    // Extract the biased exponent (wipe the sign and mantissa).
    let biased_exp = ((float >> sem.get_mantissa_len())
        & mask(sem.get_exponent_len()) as u64)
        as i64;
    // Extract the sign bit (wipe the exponent and mantissa).
    let sign =
        (float >> (sem.get_exponent_len() + sem.get_mantissa_len())) & 1;
    // Wipe the sign and exponent.
    let mut mantissa = float & mask(sem.get_mantissa_len()) as u64;

    let sign = sign == 1;

    // Check for NaN/Inf: an all-ones exponent field with a zero mantissa is
    // infinity, and with a non-zero mantissa it is NaN.
    if biased_exp == mask(sem.get_exponent_len()) as i64 {
        if mantissa == 0 {
            return Self::inf(sem, sign);
        }
        return Self::nan(sem, sign);
    }

    let mut exp = biased_exp - sem.get_bias();

    // Add the implicit bit for normal numbers.
    if biased_exp != 0 {
        mantissa += 1u64 << sem.get_mantissa_len();
    } else {
        // Handle denormals, adjust the exponent to the legal range.
        exp += 1;
    }

    // `from_parts` turns an all-zero mantissa into a zero float.
    let mantissa = BigInt::from_u64(mantissa);
    Self::from_parts(sem, sign, exp, mantissa)
}

/// Cast to another float using the non-default rounding mode `rm`.
pub fn cast_with_rm(&self, to: Semantics, rm: RoundingMode) -> Float {
    let mut loss = LossFraction::ExactlyZero;
    // Positive when the destination has a shorter mantissa than the source.
    let exp_delta =
        self.get_mantissa_len() as i64 - to.get_mantissa_len() as i64;
    let mut temp = self.clone();
    // If we are casting to a narrow type then we need to shift the bits
    // to the new-mantissa part of the word. This will adjust the exponent,
    // and if we lose bits then we'll need to round the number.
    if exp_delta > 0 {
        loss = temp.shift_significand_right(exp_delta as u64);
    }

    // Rebuild the value with the destination semantics, keeping the
    // category so that NaN, Inf and Zero stay what they are.
    let mut x = Float::raw(
        to,
        temp.get_sign(),
        temp.get_exp() - exp_delta,
        temp.get_mantissa(),
        temp.get_category(),
    );
    // Don't normalize if this is a nop conversion.
    if to.get_exponent_len() != self.get_exponent_len()
        || to.get_mantissa_len() != self.get_mantissa_len()
    {
        x.normalize(rm, loss);
    }
    x
}
/// Convert from one float format to another.
224 | pub fn cast(&self, to: Semantics) -> Float { 225 | self.cast_with_rm(to, self.get_rounding_mode()) 226 | } 227 | 228 | fn as_native_float(&self) -> u64 { 229 | // https://en.wikipedia.org/wiki/IEEE_754 230 | let mantissa: u64; 231 | let mut exp: u64; 232 | match self.get_category() { 233 | Category::Infinity => { 234 | mantissa = 0; 235 | exp = mask(self.get_exponent_len()) as u64; 236 | } 237 | Category::NaN => { 238 | mantissa = 1 << (self.get_mantissa_len() - 1); 239 | exp = mask(self.get_exponent_len()) as u64; 240 | } 241 | Category::Zero => { 242 | mantissa = 0; 243 | exp = 0; 244 | } 245 | Category::Normal => { 246 | exp = (self.get_exp() + self.get_bias()) as u64; 247 | debug_assert!(exp > 0); 248 | let m = self.get_mantissa().as_u64(); 249 | // Encode denormals. If the exponent is the minimum value and we 250 | // don't have a leading integer bit (in the form 1.mmmm) then 251 | // this is a denormal value and we need to encode it as such. 252 | if (exp == 1) && ((m >> self.get_mantissa_len()) == 0) { 253 | exp = 0; 254 | } 255 | mantissa = m & utils::mask(self.get_mantissa_len()) as u64; 256 | } 257 | } 258 | 259 | let mut bits: u64 = self.get_sign() as u64; 260 | bits <<= self.get_exponent_len(); 261 | bits |= exp; 262 | bits <<= self.get_mantissa_len(); 263 | debug_assert!(mantissa <= 1 << self.get_mantissa_len()); 264 | bits |= mantissa; 265 | bits 266 | } 267 | /// Convert this float to fp32. Notice that the number may overflow or 268 | /// rounded to the nearest even (see cast and cast_with_rm). 269 | pub fn as_f32(&self) -> f32 { 270 | let b = self.cast(FP32); 271 | let bits = b.as_native_float(); 272 | f32::from_bits(bits as u32) 273 | } 274 | /// Convert this float to fp64. Notice that the number may overflow or 275 | /// rounded to the nearest even (see cast and cast_with_rm). 
pub fn as_f64(&self) -> f64 {
    let b = self.cast(FP64);
    let bits = b.as_native_float();
    f64::from_bits(bits)
}

/// Loads a native fp32 value. The bit pattern is decoded directly into a
/// float with FP32 semantics.
pub fn from_f32(float: f32) -> Self {
    Float::from_bits(FP32, float.to_bits() as u64)
}

/// Loads a native fp64 value. The bit pattern is decoded directly into a
/// float with FP64 semantics.
pub fn from_f64(float: f64) -> Self {
    Float::from_bits(FP64, float.to_bits())
}
}

// Checks `to_i64` under several rounding modes and for special values.
#[test]
fn test_rounding_to_integer() {
    // Test the low integers with round-to-zero.
    for i in 0..100 {
        let z64 = FP64.with_rm(RoundingMode::Zero);
        let r = Float::from_f64(i as f64 + 0.1).cast(z64).to_i64();
        assert_eq!(i, r);
    }

    // Test the high integers with round_to_zero.
    for i in 0..100 {
        let z64 = FP64.with_rm(RoundingMode::Zero);
        let val = (i as i64) << 54;
        let r = Float::from_i64(FP64, val).cast(z64).to_i64();
        assert_eq!(val, r);
    }

    // Nearest, ties away from zero.
    let nta64 = FP64.with_rm(RoundingMode::NearestTiesToAway);
    assert_eq!(1, Float::from_f64(0.5).cast(nta64).to_i64());
    assert_eq!(0, Float::from_f64(0.49).cast(nta64).to_i64());
    assert_eq!(199999, Float::from_f64(199999.49).cast(nta64).to_i64());
    assert_eq!(0, Float::from_f64(-0.49).cast(nta64).to_i64());
    assert_eq!(-1, Float::from_f64(-0.5).cast(nta64).to_i64());

    // Toward zero.
    let z64 = FP64.with_rm(RoundingMode::Zero);
    assert_eq!(0, Float::from_f64(0.9).cast(z64).to_i64());
    assert_eq!(1, Float::from_f64(1.1).cast(z64).to_i64());
    assert_eq!(99, Float::from_f64(99.999).cast(z64).to_i64());
    assert_eq!(0, Float::from_f64(-0.99).cast(z64).to_i64());
    assert_eq!(0, Float::from_f64(-0.5).cast(z64).to_i64());

    // Toward positive infinity. Only non-integral inputs are covered here.
    let p64 = FP64.with_rm(RoundingMode::Positive);
    assert_eq!(1, Float::from_f64(0.9).cast(p64).to_i64());
    assert_eq!(2, Float::from_f64(1.1).cast(p64).to_i64());
    assert_eq!(100, Float::from_f64(99.999).cast(p64).to_i64());
    assert_eq!(0, Float::from_f64(-0.99).cast(p64).to_i64());
    assert_eq!(0, Float::from_f64(-0.5).cast(p64).to_i64());

    // Special values
    let n_inf = f64::NEG_INFINITY;
    let inf = f64::INFINITY;
    assert_eq!(0, Float::from_f64(f64::NAN).to_i64());
    assert_eq!(i64::MIN, Float::from_f64(n_inf).to_i64());
    assert_eq!(i64::MAX, Float::from_f64(inf).to_i64());
}

// Checks that native floats survive a load followed by a store.
#[test]
fn test_round_trip_native_float_cast() {
    let f = f32::from_bits(0x41700000);
    let a = Float::from_f32(f);
    assert_eq!(f, a.as_f32());

    let pi = 355. / 113.;
    let a = Float::from_f64(pi);
    assert_eq!(pi, a.as_f64());

    assert!(Float::from_f64(f64::NAN).is_nan());
    assert!(!Float::from_f64(f64::NAN).is_inf());
    assert!(Float::from_f64(f64::INFINITY).is_inf());
    assert!(!Float::from_f64(f64::INFINITY).is_nan());
    assert!(Float::from_f64(f64::NEG_INFINITY).is_inf());

    let a_float = f32::from_bits(0x3f8fffff);
    let a = Float::from_f32(a_float);
    let b = a.cast(FP32);
    assert_eq!(a.as_f32(), a_float);
    assert_eq!(b.as_f32(), a_float);

    // The all-zero bit pattern is +0.0, which is not a normal number.
    let f = f32::from_bits(0x000000);
    let a = Float::from_f32(f);
    assert!(!a.is_normal());
    assert_eq!(f, a.as_f32());
}

// Checks that widening to FP64 and narrowing back to FP32 keeps the value.
#[test]
fn test_cast_easy_ctor() {
    let values = [0x3f8fffff, 0x40800000, 0x3f000000, 0xc60b40ec, 0xbc675793];

    for v in values {
        let output = f32::from_bits(v);
        let a = Float::from_f32(output).cast(FP64);
        let b = a.cast(FP32);
        assert_eq!(a.as_f32(), output);
        assert_eq!(b.as_f32(), output);
    }
}

// Checks integer loading against the native integer-to-float conversions.
#[test]
fn test_cast_from_integers() {
    use super::float::FP16;

    // NOTE(review): 355/133 is not the 355/113 approximation of pi used in
    // the round-trip test above. It is harmless here because the value only
    // has to narrow to f32 the same way as the native cast.
    let pi = 355. / 133.;
    let e = 193. / 71.;

    assert_eq!(Float::from_i64(FP32, 1 << 32).as_f32(), (1u64 << 32) as f32);
    assert_eq!(Float::from_i64(FP32, 1 << 34).as_f32(), (1u64 << 34) as f32);
    assert_eq!(Float::from_f64(pi).as_f32(), (pi) as f32);
    assert_eq!(Float::from_f64(e).as_f32(), (e) as f32);
    assert_eq!(Float::from_u64(FP32, 8388610).as_f32(), 8388610 as f32);

    for i in 0..(1 << 16) {
        assert_eq!(Float::from_u64(FP32, i << 12).as_f32(), (i << 12) as f32);
    }

    // FP16: values around the largest finite number (65504) and overflow.
    assert_eq!(Float::from_i64(FP16, 0).as_f64(), 0.);
    assert_eq!(Float::from_i64(FP16, 65500).as_f64(), 65504.0);
    assert_eq!(Float::from_i64(FP16, 65504).as_f64(), 65504.0);
    assert_eq!(Float::from_i64(FP16, 65519).as_f64(), 65504.0);
    assert_eq!(Float::from_i64(FP16, 65520).as_f64(), f64::INFINITY);
    assert_eq!(Float::from_i64(FP16, 65536).as_f64(), f64::INFINITY);

    for i in -100..100 {
        let a = Float::from_i64(FP32, i);
        let b = Float::from_f64(i as f64).cast(FP32);
        assert_eq!(a.as_f32(), b.as_f32());
    }
}

// Checks the category and sign of zero, NaN and infinity after load and cast.
#[test]
fn test_cast_zero_nan_inf() {
    assert!(Float::nan(FP64, true).as_f64().is_nan());
    assert_eq!(Float::zero(FP64, false).as_f64(), 0.0);
    // NOTE(review): `-0.0 == 0.0` for native floats, so this comparison does
    // not check the sign of the zero.
    assert_eq!(Float::zero(FP64, true).as_f64(), -0.0);

    assert!(Float::nan(FP64, true).is_nan());
    assert!(Float::inf(FP64, true).is_inf());
    {
        let a = Float::from_f32(f32::from_bits(0x3f8fffff));
        assert!(!a.is_inf());
        assert!(!a.is_nan());
        assert!(!a.is_negative());
    }
    {
        let a = Float::from_f32(f32::from_bits(0xf48fffff));
        assert!(!a.is_inf());
        assert!(!a.is_nan());
        assert!(a.is_negative());
    }
    {
        let a = Float::from_f32(f32::from_bits(0xff800000)); // -Inf
        assert!(a.is_inf());
        assert!(!a.is_nan());
        assert!(a.is_negative());
    }
    {
        let a = Float::from_f32(f32::from_bits(0xffc00000)); // -Nan.
        assert!(!a.is_inf());
        assert!(a.is_nan());
        assert!(a.is_negative());
    }

    {
        let a = Float::from_f64(f64::from_bits((mask(32) << 32) as u64));
        assert!(!a.is_inf());
        assert!(a.is_nan());
    }
    {
        // Check that casting propagates inf/nan.
        let a = Float::from_f32(f32::from_bits(0xff800000)); // -Inf
        let b = a.cast(FP64);
        assert!(b.is_inf());
        assert!(!b.is_nan());
        assert!(b.is_negative());
    }
}

#[test]
fn test_cast_down_easy() {
    // Check that we can cast the numbers down, matching the hardware casting.
    for v in [0.3, 0.1, 14151241515., 14151215., 0.0000000001, 1000000000.] {
        let res = Float::from_f64(v).as_f32();
        assert_eq!(Float::from_f64(v).as_f64().to_bits(), v.to_bits());
        assert!(res == v as f32);
    }
}

#[test]
fn test_load_store_all_f32() {
    // Try to load and store normals and denormals.
    // NOTE(review): `i << 10` with `i < 1 << 16` stays below `1 << 26`, so
    // only the lowest exponent values are exercised, not every f32.
    for i in 0..(1u64 << 16) {
        let in_f = f32::from_bits((i << 10) as u32);
        let fp_f = Float::from_f32(in_f);
        let out_f = fp_f.as_f32();
        assert_eq!(in_f.is_nan(), out_f.is_nan());
        assert_eq!(in_f.is_infinite(), out_f.is_infinite());
        assert!(in_f.is_nan() || (in_f.to_bits() == out_f.to_bits()));
    }
}

#[cfg(feature = "std")]
#[test]
fn test_cast_down_complex() {
    // Try casting a bunch of difficult values such as inf, nan, denormals, etc.
    for v in utils::get_special_test_values() {
        let res = Float::from_f64(v).as_f32();
        assert_eq!(Float::from_f64(v).as_f64().to_bits(), v.to_bits());
        assert_eq!(v.is_nan(), res.is_nan());
        assert!(v.is_nan() || res == v as f32);
    }
}

// Compares `Float::trunc` with the native `f64::trunc`.
#[cfg(feature = "std")]
#[test]
fn test_trunc() {
    use super::utils::Lfsr;

    let large_integer = (1u64 << 52) as f64;
    assert_eq!(Float::from_f64(0.4).trunc().as_f64(), 0.);
    assert_eq!(Float::from_f64(1.4).trunc().as_f64(), 1.);
    assert_eq!(Float::from_f64(1.99).trunc().as_f64(), 1.);
    assert_eq!(Float::from_f64(2.0).trunc().as_f64(), 2.0);
    assert_eq!(Float::from_f64(-2.4).trunc().as_f64(), -2.0);
    assert_eq!(Float::from_f64(1999999.).trunc().as_f64(), 1999999.);
    assert_eq!(
        Float::from_f64(large_integer).trunc().as_f64(),
        large_integer
    );
    assert_eq!(Float::from_f64(0.001).trunc().as_f64(), 0.);

    // Test random values.
    let mut lfsr = Lfsr::new();
    for _ in 0..5000 {
        let v0 = f64::from_bits(lfsr.get64());
        let t0 = Float::from_f64(v0).trunc().as_f64();
        let t1 = v0.trunc();
        assert_eq!(t0.is_nan(), t1.is_nan());
        if !t1.is_nan() {
            assert_eq!(t0, t1);
        }
    }

    // Test special values.
    for val in utils::get_special_test_values() {
        let t0 = Float::from_f64(val).trunc().as_f64();
        let t1 = val.trunc();
        assert_eq!(t0.is_nan(), t1.is_nan());
        if !t1.is_nan() {
            assert_eq!(t0, t1);
        }
    }
}

// Compares `Float::round` with the native `f64::round`.
#[cfg(feature = "std")]
#[test]
fn test_round() {
    use super::utils::Lfsr;
    assert_eq!(Float::from_f64(2.0).round().as_f64(), 2.0);
    assert_eq!(Float::from_f64(2.5).round().as_f64(), 3.0);
    assert_eq!(Float::from_f64(-2.5).round().as_f64(), -3.0);

    let big_num = (1u64 << 52) as f64;
    assert_eq!(Float::from_f64(0.4).round().as_f64(), 0.);
    assert_eq!(Float::from_f64(1.4).round().as_f64(), 1.);
    assert_eq!(Float::from_f64(1.99).round().as_f64(), 2.);
    assert_eq!(Float::from_f64(2.0).round().as_f64(), 2.0);
    assert_eq!(Float::from_f64(2.1).round().as_f64(), 2.0);
    assert_eq!(Float::from_f64(-2.4).round().as_f64(), -2.0);
    assert_eq!(Float::from_f64(1999999.).round().as_f64(), 1999999.);
    assert_eq!(Float::from_f64(big_num).round().as_f64(), big_num);
    assert_eq!(Float::from_f64(0.001).round().as_f64(), 0.);

    // Test random values.
    let mut lfsr = Lfsr::new();
    for _ in 0..5000 {
        let v0 = f64::from_bits(lfsr.get64());
        let t0 = Float::from_f64(v0).round().as_f64();
        let t1 = v0.round();
        assert_eq!(t0.is_nan(), t1.is_nan());
        if !t1.is_nan() {
            assert_eq!(t0, t1);
        }
    }

    // Test special values.
    for val in utils::get_special_test_values() {
        let t0 = Float::from_f64(val).round().as_f64();
        let t1 = val.round();
        assert_eq!(t0.is_nan(), t1.is_nan());
        if !t1.is_nan() {
            assert_eq!(t0, t1);
        }
    }
}

// Checks casts between formats of very different widths.
#[cfg(feature = "std")]
#[test]
fn test_cast_sizes() {
    use crate::FP16;
    use crate::FP256;
    let e = std::f64::consts::E;
    {
        let wide = Float::from_f64(e).cast(FP256);
        let narrow = wide.cast(FP64);
        assert_eq!(narrow.as_f64(), e);
    }

    {
        let narrow = Float::from_f64(e);
        let wide = narrow.cast(FP256);
        assert_eq!(wide.as_f64(), e);
    }

    {
        let wide = Float::from_u64(FP256, 1 << 50);
        let narrow = wide.cast(FP16);
        assert!(narrow.is_inf());
    }

    {
        let narrow = Float::from_u64(FP16, 1 << 50);
        let wide = narrow.cast(FP256);
        assert!(wide.is_inf());
    }

    {
        let narrow = Float::from_u64(FP16, 50);
        let wide = narrow.cast(FP256);
        assert_eq!(wide.as_f64(), narrow.as_f64());
        assert_eq!(wide.to_i64(), 50);
    }
}

-------------------------------------------------------------------------------- /src/float.rs: --------------------------------------------------------------------------------
//! This module contains the Float data structure and basic methods.

extern crate alloc;
use super::bigint::BigInt;
use super::bigint::LossFraction;
use core::cmp::Ordering;

/// Defines the supported rounding modes.
/// See IEEE754-2019 Section 4.3 Rounding-direction attributes.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum RoundingMode {
    None,
    NearestTiesToEven,
    NearestTiesToAway,
    Zero,
    Positive,
    Negative,
}

impl RoundingMode {
    /// Create a rounding mode from a string, if valid, or return none.
pub fn from_string(s: &str) -> Option<Self> {
    // The accepted names are the exact variant names. `RoundingMode::None`
    // has no string form here, and any other input yields `None`.
    match s {
        "NearestTiesToEven" => Some(RoundingMode::NearestTiesToEven),
        "NearestTiesToAway" => Some(RoundingMode::NearestTiesToAway),
        "Zero" => Some(RoundingMode::Zero),
        "Positive" => Some(RoundingMode::Positive),
        "Negative" => Some(RoundingMode::Negative),
        _ => None,
    }
}
}

/// Controls the semantics of a floating point number with:
/// 'precision', that determines the number of bits, 'exponent' that controls
/// the dynamic range of the number, and rounding mode that controls how
/// rounding is done after arithmetic operations.
///
/// # Example
///
/// ```
/// use arpfloat::{Float, RoundingMode, Semantics};
///
/// // Create a new floating point semantics.
/// let sem = Semantics::new(10, 100, RoundingMode::Positive);
/// // Create the number 1.0 with the new semantics.
/// let x = Float::one(sem, false);
///
/// // Check that the value is correct when casting to `double`.
/// assert_eq!(x.as_f64(), 1.0);
/// ```
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Semantics {
    /// The number of bits that define the range of the exponent.
    pub exponent: usize,
    /// The number of bits in the significand (mantissa + 1).
    pub precision: usize,
    /// The rounding mode used when performing operations on this type.
    pub mode: RoundingMode,
}

impl Semantics {
    /// Creates new semantics with `exponent` exponent bits, `precision`
    /// significand bits, and the rounding mode `mode`.
    pub const fn new(
        exponent: usize,
        precision: usize,
        mode: RoundingMode,
    ) -> Self {
        Semantics {
            exponent,
            precision,
            mode,
        }
    }
    /// Returns the precision in bits.
    pub fn get_precision(&self) -> usize {
        self.precision
    }
    /// Returns the length of the mantissa in bits (precision - 1).
pub fn get_mantissa_len(&self) -> usize {
    self.precision - 1
}
/// Returns the length of the exponent in bits, which defines the valid
/// range.
pub fn get_exponent_len(&self) -> usize {
    self.exponent
}

/// Returns the rounding mode of the type.
pub fn get_rounding_mode(&self) -> RoundingMode {
    self.mode
}

/// Create a new float semantics whose precision is increased by `more`
/// additional bits. The exponent width and rounding mode are kept.
pub fn increase_precision(&self, more: usize) -> Semantics {
    Semantics::new(self.exponent, self.precision + more, self.mode)
}
/// Create a new float semantics whose precision is increased by `more`
/// additional bits, plus the value returned by `log_precision`.
pub fn grow_log(&self, more: usize) -> Semantics {
    let log2 = self.log_precision();
    Semantics::new(self.exponent, self.precision + more + log2, self.mode)
}

/// Return a log2 approximation for the precision value: the number of bits
/// needed to represent it, i.e. floor(log2(precision)) + 1, or 0 when the
/// precision is 0.
pub fn log_precision(&self) -> usize {
    // This is ~Log2(precision)
    64 - (self.precision as u64).leading_zeros() as usize
}

/// Create a new float semantics whose exponent is widened by `more`
/// additional bits. The precision and rounding mode are kept.
pub fn increase_exponent(&self, more: usize) -> Semantics {
    Semantics::new(self.exponent + more, self.precision, self.mode)
}
/// Create a new float semantics with a different rounding mode `rm`.
pub fn with_rm(&self, rm: RoundingMode) -> Semantics {
    Semantics::new(self.exponent, self.precision, rm)
}

/// Returns the exponent bias for the number, as a positive number:
/// 2^(exponent_len - 1) - 1.
/// https://en.wikipedia.org/wiki/IEEE_754#Basic_and_interchange_formats
pub(crate) fn get_bias(&self) -> i64 {
    let e = self.get_exponent_len();
    ((1u64 << (e - 1)) - 1) as i64
}
/// Returns the lower and upper bounds of the exponent, as
/// `(exp_min, exp_max)`.
pub fn get_exp_bounds(&self) -> (i64, i64) {
    // The smallest exponent is 1 - bias.
    let exp_min: i64 = -self.get_bias() + 1;
    // The largest biased exponent is the all-ones pattern minus one,
    // because the all-ones pattern is reserved for Inf and NaN (for
    // example 0xFFFE rather than 0xFFFF for a 16-bit field).
    let exp_max: i64 = (1 << self.get_exponent_len()) - self.get_bias() - 2;
    (exp_min, exp_max)
}
}

/// Declare the different categories of the floating point number. These
/// categories are internal to the float, and can be accessed through the
/// accessors: is_inf, is_zero, is_nan, is_normal.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Category {
    Infinity,
    NaN,
    Normal,
    Zero,
}

/// This is the main data structure of this library. It represents an
/// arbitrary-precision floating-point number.
#[derive(Debug, Clone)]
pub struct Float {
    // The semantics of the float (precision, exponent range).
    sem: Semantics,
    // The Sign bit. True means negative.
    sign: bool,
    // The Exponent.
    exp: i64,
    // The significand, including the implicit bit, aligned to the right.
    // Format [00000001xxxxxxx].
    mantissa: BigInt,
    // The kind of number this float represents.
    category: Category,
}

impl Float {
    /// Returns the length of the mantissa in bits, as defined by the
    /// semantics of this float.
    pub(crate) fn get_mantissa_len(&self) -> usize {
        self.sem.get_mantissa_len()
    }
    /// Returns the length of the exponent in bits, as defined by the
    /// semantics of this float.
    pub(crate) fn get_exponent_len(&self) -> usize {
        self.sem.get_exponent_len()
    }

    /// Create a new normal floating point number from its parts. A zero
    /// `mantissa` produces a zero float with the given sign instead.
    pub fn from_parts(
        sem: Semantics,
        sign: bool,
        exp: i64,
        mantissa: BigInt,
    ) -> Self {
        if mantissa.is_zero() {
            return Float::zero(sem, sign);
        }
        Float {
            sem,
            sign,
            exp,
            mantissa,
            category: Category::Normal,
        }
    }

    /// Create a float directly from its raw fields, including an explicit
    /// `category`. The fields are stored as given.
193 | pub(crate) fn raw( 194 | sem: Semantics, 195 | sign: bool, 196 | exp: i64, 197 | mantissa: BigInt, 198 | category: Category, 199 | ) -> Self { 200 | Float { 201 | sem, 202 | sign, 203 | exp, 204 | mantissa, 205 | category, 206 | } 207 | } 208 | 209 | /// Returns a new zero float. 210 | pub fn zero(sem: Semantics, sign: bool) -> Self { 211 | Float { 212 | sem, 213 | sign, 214 | exp: 0, 215 | mantissa: BigInt::zero(), 216 | category: Category::Zero, 217 | } 218 | } 219 | 220 | /// Returns a new float with the value one. 221 | pub fn one(sem: Semantics, sign: bool) -> Self { 222 | let mut one = BigInt::one(); 223 | one.shift_left(sem.get_mantissa_len()); 224 | Float { 225 | sem, 226 | sign, 227 | exp: 0, 228 | mantissa: one, 229 | category: Category::Normal, 230 | } 231 | } 232 | 233 | /// Returns a new infinity float. 234 | pub fn inf(sem: Semantics, sign: bool) -> Self { 235 | Float { 236 | sem, 237 | sign, 238 | exp: 0, 239 | mantissa: BigInt::zero(), 240 | category: Category::Infinity, 241 | } 242 | } 243 | 244 | /// Returns a new NaN float. 245 | pub fn nan(sem: Semantics, sign: bool) -> Self { 246 | Float { 247 | sem, 248 | sign, 249 | exp: 0, 250 | mantissa: BigInt::zero(), 251 | category: Category::NaN, 252 | } 253 | } 254 | /// Returns true if the Float is negative 255 | pub fn is_negative(&self) -> bool { 256 | self.sign 257 | } 258 | 259 | /// Returns true if the Float is +-inf. 260 | pub fn is_inf(&self) -> bool { 261 | if let Category::Infinity = self.category { 262 | return true; 263 | } 264 | false 265 | } 266 | 267 | /// Returns true if the Float is a +- NaN. 268 | pub fn is_nan(&self) -> bool { 269 | if let Category::NaN = self.category { 270 | return true; 271 | } 272 | false 273 | } 274 | 275 | /// Returns true if the Float is a +- zero. 276 | pub fn is_zero(&self) -> bool { 277 | if let Category::Zero = self.category { 278 | return true; 279 | } 280 | false 281 | } 282 | 283 | /// Returns true if this number is normal (not Zero, Nan, Inf). 
pub fn is_normal(&self) -> bool {
    if let Category::Normal = self.category {
        return true;
    }
    false
}

/// Returns the semantics of the number.
pub fn get_semantics(&self) -> Semantics {
    self.sem
}

/// Returns the rounding mode of the number.
pub fn get_rounding_mode(&self) -> RoundingMode {
    self.sem.get_rounding_mode()
}

/// Update the sign of the float to `sign`. True means negative.
pub fn set_sign(&mut self, sign: bool) {
    self.sign = sign
}

/// Returns the sign of the float. True means negative.
pub fn get_sign(&self) -> bool {
    self.sign
}

/// Returns a copy of the mantissa of the float.
pub fn get_mantissa(&self) -> BigInt {
    self.mantissa.clone()
}

/// Returns the exponent of the float.
pub fn get_exp(&self) -> i64 {
    self.exp
}

/// Returns the category of the float.
pub fn get_category(&self) -> Category {
    self.category
}

/// Returns a new float which has a flipped sign (negated value). The
/// category is kept, so this also flips the sign of zero, Inf and NaN.
pub fn neg(&self) -> Self {
    Self::raw(
        self.sem,
        !self.sign,
        self.exp,
        self.mantissa.clone(),
        self.category,
    )
}

/// Shift the mantissa to the left to ensure that the MSB of the mantissa
/// is set to the precision. The method also updates the exponent.
// NOTE(review): the exponent is increased here while the mantissa is
// shifted left, whereas `shift_significand_left` decreases it — confirm
// that the callers expect this convention.
pub(super) fn align_mantissa(&mut self) {
    // Number of positions between the current MSB and the precision.
    let bits =
        self.sem.get_precision() as i64 - self.mantissa.msb_index() as i64;
    if bits > 0 {
        self.exp += bits;
        self.mantissa.shift_left(bits as usize);
    }
}

/// Prints the number using the internal representation.
#[cfg(feature = "std")]
pub fn dump(&self) {
    use std::println;
    let sign = if self.sign { "-" } else { "+" };
    match self.category {
        Category::NaN => {
            println!("[{}NaN]", sign);
        }
        Category::Infinity => {
            println!("[{}Inf]", sign);
        }
        Category::Zero => {
            println!("[{}0.0]", sign);
        }
        Category::Normal => {
            // Normal numbers print the sign, exponent and mantissa in binary.
            let m = self.mantissa.as_binary();
            println!("FP[{} E={:4} M = {}]", sign, self.exp, m);
        }
    }
}

/// Prints nothing: without the `std` feature there is no output to write to.
#[cfg(not(feature = "std"))]
pub fn dump(&self) {
    // No-op in no_std environments
}

/// Returns the exponent bias for the number, as a positive number.
/// https://en.wikipedia.org/wiki/IEEE_754#Basic_and_interchange_formats
pub(crate) fn get_bias(&self) -> i64 {
    self.sem.get_bias()
}

/// Returns the lower and upper bounds of the exponent, as
/// `(exp_min, exp_max)`.
pub fn get_exp_bounds(&self) -> (i64, i64) {
    self.sem.get_exp_bounds()
}
}

// IEEE 754-2019
// Table 3.5 — Binary interchange format parameters.
// The second argument of `Semantics::new` is the precision, which is the
// mantissa length plus one.
use RoundingMode::NearestTiesToEven as nte;

/// Predefined BF16 float with 8 exponent bits, and 7 mantissa bits.
pub const BF16: Semantics = Semantics::new(8, 8, nte);
/// Predefined FP16 float with 5 exponent bits, and 10 mantissa bits.
pub const FP16: Semantics = Semantics::new(5, 11, nte);
/// Predefined FP32 float with 8 exponent bits, and 23 mantissa bits.
pub const FP32: Semantics = Semantics::new(8, 24, nte);
/// Predefined FP64 float with 11 exponent bits, and 52 mantissa bits.
pub const FP64: Semantics = Semantics::new(11, 53, nte);
/// Predefined FP128 float with 15 exponent bits, and 112 mantissa bits.
pub const FP128: Semantics = Semantics::new(15, 113, nte);
/// Predefined FP256 float with 19 exponent bits, and 236 mantissa bits.
403 | pub const FP256: Semantics = Semantics::new(19, 237, nte); 404 | 405 | /// Shift `val` by `bits`, and report the loss. 406 | pub(crate) fn shift_right_with_loss( 407 | val: &BigInt, 408 | bits: usize, 409 | ) -> (BigInt, LossFraction) { 410 | let mut val = val.clone(); 411 | let loss = val.get_loss_kind_for_bit(bits); 412 | val.shift_right(bits); 413 | (val, loss) 414 | } 415 | 416 | /// Combine the loss of accuracy with `msb` more significant and `lsb` 417 | /// less significant. 418 | fn combine_loss_fraction(msb: LossFraction, lsb: LossFraction) -> LossFraction { 419 | if !lsb.is_exactly_zero() { 420 | if msb.is_exactly_zero() { 421 | return LossFraction::LessThanHalf; 422 | } else if msb.is_exactly_half() { 423 | return LossFraction::MoreThanHalf; 424 | } 425 | } 426 | msb 427 | } 428 | 429 | #[test] 430 | fn shift_right_fraction() { 431 | let x: BigInt = BigInt::from_u64(0b10000000); 432 | let res = shift_right_with_loss(&x, 3); 433 | assert!(res.1.is_exactly_zero()); 434 | 435 | let x: BigInt = BigInt::from_u64(0b10000111); 436 | let res = shift_right_with_loss(&x, 3); 437 | assert!(res.1.is_mt_half()); 438 | 439 | let x: BigInt = BigInt::from_u64(0b10000100); 440 | let res = shift_right_with_loss(&x, 3); 441 | assert!(res.1.is_exactly_half()); 442 | 443 | let x: BigInt = BigInt::from_u64(0b10000001); 444 | let res = shift_right_with_loss(&x, 3); 445 | assert!(res.1.is_lt_half()); 446 | } 447 | 448 | impl Float { 449 | /// The number overflowed, set the right value based on the rounding mode 450 | /// and sign. 
451 | fn overflow(&mut self, rm: RoundingMode) { 452 | let bounds = self.get_exp_bounds(); 453 | let inf = Self::inf(self.sem, self.sign); 454 | let max = Self::from_parts( 455 | self.sem, 456 | self.sign, 457 | bounds.1, 458 | BigInt::all1s(self.get_mantissa_len()), 459 | ); 460 | 461 | *self = match rm { 462 | RoundingMode::None => inf, 463 | RoundingMode::NearestTiesToEven => inf, 464 | RoundingMode::NearestTiesToAway => inf, 465 | RoundingMode::Zero => max, 466 | RoundingMode::Positive => { 467 | if self.sign { 468 | max 469 | } else { 470 | inf 471 | } 472 | } 473 | RoundingMode::Negative => { 474 | if self.sign { 475 | inf 476 | } else { 477 | max 478 | } 479 | } 480 | } 481 | } 482 | 483 | /// Verify that the exponent is legal. 484 | pub(crate) fn check_bounds(&self) { 485 | let bounds = self.get_exp_bounds(); 486 | debug_assert!(self.exp >= bounds.0); 487 | debug_assert!(self.exp <= bounds.1); 488 | let max_mantissa = BigInt::one_hot(self.sem.get_precision()); 489 | debug_assert!(self.mantissa.lt(&max_mantissa)); 490 | } 491 | 492 | pub(crate) fn shift_significand_left(&mut self, amt: u64) { 493 | self.exp -= amt as i64; 494 | self.mantissa.shift_left(amt as usize); 495 | } 496 | 497 | pub(crate) fn shift_significand_right(&mut self, amt: u64) -> LossFraction { 498 | self.exp += amt as i64; 499 | let res = shift_right_with_loss(&self.mantissa, amt as usize); 500 | self.mantissa = res.0; 501 | res.1 502 | } 503 | 504 | /// Returns true if we need to round away from zero (increment the mantissa). 
505 | pub(crate) fn need_round_away_from_zero( 506 | &self, 507 | rm: RoundingMode, 508 | loss: LossFraction, 509 | ) -> bool { 510 | debug_assert!(self.is_normal() || self.is_zero()); 511 | match rm { 512 | RoundingMode::Positive => !self.sign, 513 | RoundingMode::Negative => self.sign, 514 | RoundingMode::Zero => false, 515 | RoundingMode::None => false, 516 | RoundingMode::NearestTiesToAway => loss.is_gte_half(), 517 | RoundingMode::NearestTiesToEven => { 518 | if loss.is_mt_half() { 519 | return true; 520 | } 521 | 522 | loss.is_exactly_half() && self.mantissa.is_odd() 523 | } 524 | } 525 | } 526 | 527 | /// Returns true if the absolute value of the two numbers are the same. 528 | pub(crate) fn same_absolute_value(&self, other: &Self) -> bool { 529 | if self.category != other.category { 530 | return false; 531 | } 532 | match self.category { 533 | Category::Infinity => true, 534 | Category::NaN => true, 535 | Category::Zero => true, 536 | Category::Normal => { 537 | self.exp == other.exp && self.mantissa == other.mantissa 538 | } 539 | } 540 | } 541 | 542 | /// Normalize the number by adjusting the exponent to the legal range, shift 543 | /// the mantissa to the msb, and round the number if bits are lost. This is 544 | /// based on Neil Booth' implementation in APFloat. 545 | pub(crate) fn normalize(&mut self, rm: RoundingMode, loss: LossFraction) { 546 | if !self.is_normal() { 547 | return; 548 | } 549 | let mut loss = loss; 550 | let bounds = self.get_exp_bounds(); 551 | 552 | let nmsb = self.mantissa.msb_index() as i64; 553 | 554 | // Step I - adjust the exponent. 555 | if nmsb > 0 { 556 | // Align the number so that the MSB bit will be MANTISSA + 1. 557 | let mut exp_change = nmsb - self.sem.get_precision() as i64; 558 | 559 | // Handle overflowing exponents. 560 | if self.exp + exp_change > bounds.1 { 561 | self.overflow(rm); 562 | self.check_bounds(); 563 | return; 564 | } 565 | 566 | // Handle underflowing low exponents. 
Don't allow to go below the 567 | // legal exponent range. 568 | if self.exp + exp_change < bounds.0 { 569 | exp_change = bounds.0 - self.exp; 570 | } 571 | 572 | if exp_change < 0 { 573 | // Handle reducing the exponent. 574 | debug_assert!(loss.is_exactly_zero(), "losing information"); 575 | self.shift_significand_left(-exp_change as u64); 576 | return; 577 | } 578 | 579 | if exp_change > 0 { 580 | // Handle increasing the exponent. 581 | let loss2 = self.shift_significand_right(exp_change as u64); 582 | loss = combine_loss_fraction(loss2, loss); 583 | } 584 | } 585 | 586 | //Step II - round the number. 587 | 588 | // If nothing moved or the shift didn't mess things up then we're done. 589 | if loss.is_exactly_zero() { 590 | // Canonicalize to zero. 591 | if self.mantissa.is_zero() { 592 | *self = Self::zero(self.sem, self.sign); 593 | return; 594 | } 595 | return; 596 | } 597 | 598 | // Check if we need to round away from zero. 599 | if self.need_round_away_from_zero(rm, loss) { 600 | if self.mantissa.is_zero() { 601 | self.exp = bounds.0 602 | } 603 | 604 | let one = BigInt::one(); 605 | self.mantissa = self.mantissa.clone() + one; 606 | // Did the mantissa overflow? 607 | let mut m = self.mantissa.clone(); 608 | m.shift_right(self.sem.get_precision()); 609 | if !m.is_zero() { 610 | // Can we fix the exponent? 611 | if self.exp < bounds.1 { 612 | self.shift_significand_right(1); 613 | } else { 614 | *self = Self::inf(self.sem, self.sign); 615 | return; 616 | } 617 | } 618 | } 619 | 620 | // Canonicalize. 621 | if self.mantissa.is_zero() { 622 | *self = Self::zero(self.sem, self.sign); 623 | } 624 | } // round. 
}

impl PartialEq for Float {
    /// Compares two floats for equality: zeros are equal regardless of sign,
    /// NaN is never equal to anything, and Infinity and Normal values must
    /// match in sign, exponent, mantissa and category.
    fn eq(&self, other: &Self) -> bool {
        let bitwise = self.sign == other.sign
            && self.exp == other.exp
            && self.mantissa == other.mantissa
            && self.category == other.category;

        match self.category {
            Category::Infinity | Category::Normal => bitwise,
            Category::Zero => other.is_zero(),
            Category::NaN => false,
        }
    }
}

/// Page 66. Chapter 3. Floating-Point Formats and Environment
/// Table 3.8: Comparison predicates and the four relations.
/// and
/// IEEE 754-2019 section 5.10 - totalOrder.
impl PartialOrd for Float {
    /// Orders two floats of the same semantics. Returns `None` when either
    /// operand is NaN.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        debug_assert_eq!(self.get_semantics(), other.get_semantics());
        // Maps "self is the smaller one" to the matching ordering.
        let bool_to_ord = |ord: bool| -> Option<Ordering> {
            if ord {
                Some(Ordering::Less)
            } else {
                Some(Ordering::Greater)
            }
        };

        match (self.category, other.category) {
            (Category::NaN, _) | (_, Category::NaN) => None,
            (Category::Zero, Category::Zero) => Some(Ordering::Equal),
            (Category::Infinity, Category::Infinity) => {
                if self.sign == other.sign {
                    Some(Ordering::Equal)
                } else {
                    bool_to_ord(self.sign)
                }
            }
            // `self` has the larger magnitude class: it is smaller exactly
            // when it is negative.
            (Category::Infinity, Category::Normal)
            | (Category::Infinity, Category::Zero)
            | (Category::Normal, Category::Zero) => bool_to_ord(self.sign),

            // `other` has the larger magnitude class: `self` is smaller
            // exactly when `other` is positive.
            (Category::Normal, Category::Infinity)
            | (Category::Zero, Category::Infinity)
            | (Category::Zero, Category::Normal) => bool_to_ord(!other.sign),

            (Category::Normal, Category::Normal) => {
                if self.sign != other.sign {
                    bool_to_ord(self.sign)
                } else if self.exp < other.exp {
                    bool_to_ord(!self.sign)
                } else if self.exp > other.exp {
                    bool_to_ord(self.sign)
                } else {
                    // Same sign and exponent: the mantissa decides, with
                    // the result flipped for negative numbers.
                    match self.mantissa.cmp(&other.mantissa) {
                        Ordering::Less => bool_to_ord(!self.sign),
                        Ordering::Equal => Some(Ordering::Equal),
                        Ordering::Greater => bool_to_ord(self.sign),
                    }
                }
            }
        }
    }
}

#[cfg(feature = "std")]
#[test]
fn test_comparisons() {
    use super::utils;

    // Compare a bunch of special values, using the <,>,== operators and check
    // that they match the comparison on doubles.
    for first in utils::get_special_test_values() {
        for second in utils::get_special_test_values() {
            let is_less = first < second;
            let is_eq = first == second;
            let is_gt = first > second;
            let first = Float::from_f64(first);
            let second = Float::from_f64(second);
            assert_eq!(is_less, first < second, "<");
            assert_eq!(is_eq, first == second, "==");
            assert_eq!(is_gt, first > second, ">");
        }
    }
}

#[test]
fn test_one_imm() {
    let sem = Semantics::new(10, 12, nte);
    let x = Float::one(sem, false);
    assert_eq!(x.as_f64(), 1.0);
}

#[test]
pub fn test_bigint_ctor() {
    // Make sure that we can load numbers of the highest border of the FP16
    // number.
    let bi = BigInt::from_u64(65519);
    assert_eq!(Float::from_bigint(FP16, bi).cast(FP32).to_i64(), 65504);
    assert_eq!(Float::from_f64(65519.).cast(FP16).to_i64(), 65504);

    // Make sure that we can load numbers that are greater than the precision
    // and that normalization fixes and moves things to the right place.
    let sem = Semantics::new(40, 10, nte);
    let bi = BigInt::from_u64(1 << 14);
    let num = Float::from_bigint(sem, bi);
    assert_eq!(num.to_i64(), 1 << 14);
}

#[test]
pub fn test_semantics_size() {
    assert_eq!(FP16.log_precision(), 4);
    assert_eq!(FP32.log_precision(), 5);
    assert_eq!(FP64.log_precision(), 6);
    assert_eq!(FP128.log_precision(), 7);
}

impl Semantics {
    /// Returns the maximum value of the number.
    pub fn get_max_positive_value(&self) -> Float {
        let exp = self.get_exp_bounds().1;
        let mantissa = BigInt::all1s(self.get_precision());
        Float::from_parts(*self, false, exp, mantissa)
    }

    /// Returns the minimum positive value of the number (subnormal).
    /// See https://en.wikipedia.org/wiki/IEEE_754
    pub fn get_min_positive_value(&self) -> Float {
        let exp = self.get_exp_bounds().0;
        let mantissa = BigInt::one();
        Float::from_parts(*self, false, exp, mantissa)
    }

    /// Returns true if the number can be represented exactly in this format.
    /// A number can be represented exactly if the exponent is in the range, and
    /// the mantissa is not too large. In other words, the number 'val' can be
    /// converted to this format without any loss of accuracy.
    pub fn can_represent_exactly(&self, val: &Float) -> bool {
        // Can always represent Inf, NaN, Zero.
        if !val.is_normal() {
            return true;
        }

        // Check the semantics of the other value.
        let other_sem = val.get_semantics();
        if other_sem.get_precision() <= self.get_precision()
            && other_sem.get_exponent_len() <= self.get_exponent_len()
        {
            return true;
        }

        // Check the exponent value.
        let exp = val.get_exp();
        let bounds = self.get_exp_bounds();
        if exp < bounds.0 || exp > bounds.1 {
            return false;
        }

        // Check if the mantissa is zero.
        if val.get_mantissa().is_zero() {
            return true;
        }

        // Check how much we can shift-right the number without losing bits.
        let last = val.get_mantissa().trailing_zeros();
        let first = val.get_mantissa().msb_index();
        // Notice that msb_index is 1-based, but this is okay because we want to
        // count the number of bits including the last.
        let used_bits = first - last;
        used_bits <= self.get_precision()
    }
}

#[test]
fn test_min_max_val() {
    assert_eq!(FP16.get_max_positive_value().as_f64(), 65504.0);
    assert_eq!(FP32.get_max_positive_value().as_f64(), f32::MAX as f64);
    assert_eq!(FP64.get_max_positive_value().as_f64(), f64::MAX);
    assert_eq!(FP32.get_min_positive_value().as_f32(), f32::from_bits(0b01));
    assert_eq!(FP64.get_min_positive_value().as_f64(), f64::from_bits(0b01));
}

#[test]
fn test_can_represent_exactly() {
    assert!(FP16.can_represent_exactly(&Float::from_f64(1.0)));
    assert!(FP16.can_represent_exactly(&Float::from_f64(65504.0)));
    assert!(!FP16.can_represent_exactly(&Float::from_f64(65504.1)));
    assert!(!FP16.can_represent_exactly(&Float::from_f64(0.0001)));

    let m10 = BigInt::from_u64(0b1000000001);
    let m11 = BigInt::from_u64(0b10000000001);
    let m12 = BigInt::from_u64(0b100000000001);

    let val10bits = Float::from_parts(FP32, false, 0, m10);
    let val11bits = Float::from_parts(FP32, false, 0, m11);
    let val12bits = Float::from_parts(FP32, false, 0, m12);

    assert!(FP16.can_represent_exactly(&val10bits));
    assert!(FP16.can_represent_exactly(&val11bits));
    assert!(!FP16.can_represent_exactly(&val12bits));

    assert!(FP32.can_represent_exactly(&Float::pi(FP32)));
    assert!(!FP32.can_represent_exactly(&Float::pi(FP64)));
    assert!(FP64.can_represent_exactly(&Float::pi(FP32)));
}

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//!
//! ARPFloat is an implementation of arbitrary precision
//! [floating point](https://en.wikipedia.org/wiki/IEEE_754) data
//! structures and utilities. The library can be used to emulate existing
//! floating point types, such as FP16, FP32 or FP128, and create new
//! floating-point types that scale to hundreds of digits, and perform very
//! accurate calculations. The library contains mathematical functions such as
//! `log`, `exp`, `sin`, `cos`, `tan`, and constants such as `pi` and `e`.
//!
//! In ARPFloat the rounding mode is a part of the type-system, and this solves
//! a number of problems that show up when using the global rounding flag that's
//! defined in `fenv.h`.
//!
//! ## no_std
//! The library can be built without the standard library.
//!
//!### Example
//!```
//! use arpfloat::Float;
//! use arpfloat::FP128;
//!
//! // Create the number '5' in FP128 format.
//! let n = Float::from_f64(5.).cast(FP128);
//!
//! // Use Newton-Raphson to find the square root of 5.
//! let mut x = n.clone();
//! for _ in 0..20 {
//!     x += (&n / &x)/2;
//! }
//!
//! println!("fp128: {}", x);
//! println!("fp64: {}", x.as_f64());
//! ```
//!
//!The program above will print this output:
//!```console
//!fp128: 2.2360679774997896964091736687312763
//!fp64: 2.23606797749979
//!```
//!
//!The library also provides API that exposes rounding modes, and low-level
//!operations.
//!
//!```
//! use arpfloat::FP128;
//! use arpfloat::RoundingMode::NearestTiesToEven;
//! use arpfloat::Float;
//!
//! let x = Float::from_u64(FP128, 1<<53);
//! let y = Float::from_f64(1000.0).cast(FP128);
//!
//! let val = Float::mul_with_rm(&x, &y, NearestTiesToEven);
//! ```
//!
//! View the internal representation of floating point numbers:
//! ```
//! use arpfloat::Float;
//! use arpfloat::FP16;
//!
//! let fp = Float::from_i64(FP16, 15);
//!
//! fp.dump(); // Prints FP[+ E=+3 M=11110000000]
//!
//! let m = fp.get_mantissa();
//! m.dump(); // Prints 11110000000
//!```
//!
//! Control the rounding mode for type conversion:
//!```
//! use arpfloat::{FP16, FP32, RoundingMode, Float};
//!
//! let x = Float::from_u64(FP32, 2649); // Load an FP32 Value.
//! let b = x.cast_with_rm(FP16, RoundingMode::Zero); // Convert to FP16.
//! println!("{}", b); // Prints 2648!
//!```
//!
//! Define new float formats and use high-precision transcendental functions:
//!```
//! use arpfloat::{Float, Semantics, RoundingMode};
//! // Define a new float format with 120 bits of accuracy, and dynamic range
//! // of 2^10.
//! let sem = Semantics::new(10, 120, RoundingMode::NearestTiesToEven);
//!
//! let pi = Float::pi(sem);
//! let x = Float::exp(&pi);
//! println!("e^pi = {}", x); // Prints 23.1406926327792....
//!```
//!
//! Floating point numbers can be converted to
//! [Continued Fractions](https://en.wikipedia.org/wiki/Continued_fraction) that
//! approximate the value.
//!
//! ```rust
//! use arpfloat::{Float, FP256, RoundingMode};
//!
//! let ln = Float::ln2(FP256);
//! println!("ln(2) = {}", ln);
//! for i in 1..20 {
//!     let (p,q) = ln.as_fraction(i);
//!     println!("{}/{}", p.as_decimal(), q.as_decimal());
//! }
//! ```
//!The program above will print this output:
//!```console
//! ln(2) = .6931471805599453094172321214581765680755001343.....
//! 0/1
//! 1/1
//! 2/3
//! 7/10
//! 9/13
//! 61/88
//! 192/277
//! 253/365
//! 445/642
//! 1143/1649
//! 1588/2291
//! 2731/3940
//! ....
//!```

#![no_std]

#[cfg(feature = "std")]
extern crate std;

mod arithmetic;
mod bigint;
mod cast;
mod float;
mod operations;
mod string;
mod utils;

pub use self::bigint::BigInt;
pub use self::float::Float;
pub use self::float::RoundingMode;
pub use self::float::Semantics;
pub use self::float::{BF16, FP128, FP16, FP256, FP32, FP64};

// Conditionally include a module based on feature flag
#[cfg(feature = "python")]
pub mod py;

--------------------------------------------------------------------------------
/src/operations/constants.rs:
--------------------------------------------------------------------------------
//! This module contains the implementation of methods that compute mathematical
//! constants.
//!
use crate::RoundingMode;
use crate::{Float, Semantics};

impl Float {
    /// Computes pi.
    pub fn pi(sem: Semantics) -> Self {
        // Algorithm description in Pg 246:
        // Fast Multiple-Precision Evaluation of Elementary Functions
        // by Richard P. Brent.

        // Increase the precision, because the arithmetic operations below
        // require rounding, so if we want to get the accurate results we need
        // to operate with increased precision.
        let orig_sem = sem;
        let sem = sem.grow_log(4);

        use RoundingMode::NearestTiesToEven as rm;

        let one = Self::from_i64(sem, 1);
        let two = Self::from_i64(sem, 2);
        let four = Self::from_i64(sem, 4);

        let mut a = one.clone();
        let mut b = one.clone() / two.sqrt();
        let mut t = one.clone() / four;
        let mut x = one;

        // Iterate until the two means `a` and `b` are equal.
        while a != b {
            let y = a.clone();
            a = (&a + &b).scale(-1, rm);
            b = (&b * &y).sqrt();
            t -= &x * (&a - &y).sqr();
            x = x.scale(1, rm);
        }
        (a.sqr() / t).cast(orig_sem)
    }

    /// Computes e.
    pub fn e(sem: Semantics) -> Self {
        let orig_sem = sem;
        let sem = sem.increase_precision(1);

        let one = Self::one(sem, false);
        let mut term = one.clone();

        // Use Euler's continued fraction, which is a simple series.
        let iterations: i64 = (sem.get_exponent_len() * 2) as i64;
        for i in (1..iterations).rev() {
            let v = Self::from_i64(sem, i);
            term = &v + &v / &term;
        }

        (one / term + 2).cast(orig_sem)
    }

    /// Compute log(2).
60 | pub fn ln2(sem: Semantics) -> Self { 61 | use RoundingMode::None as rm; 62 | let sem2 = sem.increase_precision(8); 63 | 64 | // Represent log(2) using the sum 1/k*2^k 65 | let one = Self::one(sem2, false); 66 | let mut sum = Self::zero(sem2, false); 67 | let mut prev = Self::inf(sem2, true); 68 | for k in 1..500 { 69 | let k2 = Self::from_u64(sem2, 1).scale(k, rm); 70 | let k = Self::from_u64(sem2, k as u64); 71 | let kk2 = &Float::mul_with_rm(&k, &k2, rm); 72 | let term = Float::div_with_rm(&one, kk2, rm); 73 | sum = Float::add_with_rm(&sum, &term, rm); 74 | if prev == sum { 75 | break; 76 | } 77 | prev = sum.clone(); 78 | } 79 | sum.cast(sem) 80 | } 81 | } 82 | 83 | #[cfg(feature = "std")] 84 | #[test] 85 | fn test_pi() { 86 | use crate::FP32; 87 | use crate::FP64; 88 | assert_eq!(Float::pi(FP64).as_f64(), std::f64::consts::PI); 89 | assert_eq!(Float::pi(FP32).as_f32(), std::f32::consts::PI); 90 | } 91 | 92 | #[cfg(feature = "std")] 93 | #[test] 94 | fn test_e() { 95 | use crate::FP32; 96 | use crate::FP64; 97 | assert_eq!(Float::e(FP64).as_f64(), std::f64::consts::E); 98 | assert_eq!(Float::e(FP32).as_f32(), std::f32::consts::E); 99 | } 100 | 101 | #[cfg(feature = "std")] 102 | #[test] 103 | fn test_ln2() { 104 | use crate::FP64; 105 | assert_eq!(Float::ln2(FP64).as_f64(), std::f64::consts::LN_2); 106 | } 107 | -------------------------------------------------------------------------------- /src/operations/exp.rs: -------------------------------------------------------------------------------- 1 | //! This module contains the implementation of log- and exp-related methods. 2 | //! 3 | 4 | use crate::RoundingMode; 5 | 6 | use crate::float::Float; 7 | 8 | impl Float { 9 | /// Computes the taylor series, centered around 1, and valid in [0..2]. 10 | /// z = (x - 1)/(x + 1) 11 | /// log(x) = 2 (z + z^3/3 + z^5/5 + z^7/7 ... 
) 12 | fn log_taylor(x: &Self) -> Self { 13 | use RoundingMode::None as rm; 14 | let sem = x.get_semantics(); 15 | let one = Self::one(sem, false); 16 | let up = Float::sub_with_rm(x, &one, rm); 17 | let down = Float::add_with_rm(x, &one, rm); 18 | let z = Float::div_with_rm(&up, &down, rm); 19 | let z2 = z.sqr(); 20 | 21 | let mut top = z; 22 | let mut sum = Self::zero(sem, false); 23 | let mut prev = Self::one(sem, true); 24 | for i in 0..50 { 25 | if prev == sum { 26 | break; // Stop if we are not making progress. 27 | } 28 | prev = sum.clone(); 29 | 30 | let bottom = &Self::from_u64(sem, i * 2 + 1); 31 | let elem = Float::div_with_rm(&top, bottom, rm); 32 | sum = Float::add_with_rm(&sum, &elem, rm); 33 | 34 | // Prepare the next iteration. 35 | top = Float::mul_with_rm(&top, &z2, rm); 36 | } 37 | 38 | sum.scale(1, RoundingMode::Zero) 39 | } 40 | 41 | /// Reduce the range of 'x' with the identity: 42 | /// ln(x) = ln(sqrt(x)^2) = 2 * ln(sqrt(x)) and 43 | /// ln(x) = -ln(1/x) 44 | fn log_range_reduce(x: &Self) -> Self { 45 | use RoundingMode::NearestTiesToEven as even; 46 | let sem = x.get_semantics(); 47 | let up = Self::from_f64(1.001).cast(sem); 48 | let one = Self::from_u64(sem, 1); 49 | 50 | if x > &up { 51 | let sx = x.sqrt(); 52 | return Self::log_range_reduce(&sx).scale(1, even); 53 | } 54 | 55 | if x < &one { 56 | let re = Float::div_with_rm(&one, x, RoundingMode::None); 57 | return Self::log_range_reduce(&re).neg(); 58 | } 59 | 60 | Self::log_taylor(x) 61 | } 62 | 63 | /// Computes logarithm of 'x'. 64 | pub fn log(&self) -> Self { 65 | use RoundingMode::None as rm; 66 | let sem = self.get_semantics(); 67 | 68 | //Fast Logarithm function for Arbitrary Precision number, 69 | // by Henrik Vestermark. 
70 | 71 | // Handle all of the special cases: 72 | if !self.is_normal() || self.is_negative() { 73 | return Self::nan(sem, self.get_sign()); 74 | } 75 | 76 | let orig_sem = self.get_semantics(); 77 | let sem = orig_sem.grow_log(10).increase_exponent(10); 78 | 79 | let x = &self.cast_with_rm(sem, rm); 80 | Self::log_range_reduce(x).cast_with_rm(orig_sem, rm) 81 | } 82 | } 83 | 84 | #[test] 85 | fn test_log() { 86 | use crate::FP128; 87 | let x = Float::from_f64(0.1).cast(FP128).log(); 88 | assert_eq!(x.as_f64(), -2.3025850929940455); 89 | 90 | for x in [ 91 | 0.1, 0.5, 2.3, 4.5, 9.8, 11.2, 15.2, 91.2, 102.2, 192.4, 1024.2, 92 | 90210.2, 93 | ] { 94 | let lhs = Float::from_f64(x).cast(FP128).log().as_f64(); 95 | let rhs = x.ln(); 96 | assert_eq!(lhs, rhs); 97 | } 98 | } 99 | 100 | impl Float { 101 | /// Computes the taylor series: 102 | /// exp(x) = 1 + x/1! + x^2/2! + x^3/3! ... 103 | fn exp_taylor(x: &Self) -> Self { 104 | let sem = x.get_semantics(); 105 | use crate::bigint::BigInt; 106 | let mut top = Self::one(sem, false); 107 | let mut bottom = BigInt::one(); 108 | 109 | let mut sum = Self::zero(sem, false); 110 | let mut prev = Self::one(sem, true); 111 | for k in 1..50 { 112 | if prev == sum { 113 | break; // Stop if we are not making progress. 114 | } 115 | prev = sum.clone(); 116 | 117 | let elem = &top / &Self::from_bigint(sem, bottom.clone()); 118 | sum += elem; 119 | 120 | // Prepare the next iteration. 
121 | bottom *= BigInt::from_u64(k); 122 | top = &top * x; 123 | } 124 | 125 | sum 126 | } 127 | 128 | /// Reduce the range of 'x' with the identity: 129 | /// e^x = (e^(x/2))^2 130 | fn exp_range_reduce(x: &Self) -> Self { 131 | let sem = x.get_semantics(); 132 | 133 | let one = Self::from_u64(sem, 1); 134 | 135 | if x > &one { 136 | let sx = x.scale(-3, RoundingMode::Zero); 137 | let esx = Self::exp_range_reduce(&sx); 138 | return esx.sqr().sqr().sqr(); 139 | } 140 | 141 | Self::exp_taylor(x) 142 | } 143 | 144 | /// Computes exponential function `e^self`. 145 | pub fn exp(&self) -> Self { 146 | let sem = self.get_semantics(); 147 | 148 | // Handle all of the special cases: 149 | if self.is_zero() { 150 | return Self::one(sem, false); 151 | } else if !self.is_normal() { 152 | return Self::nan(sem, self.get_sign()); 153 | } 154 | 155 | let orig_sem = self.get_semantics(); 156 | let sem = orig_sem.grow_log(10).increase_exponent(10); 157 | 158 | // Handle the negative values. 159 | if self.is_negative() { 160 | let one = Self::one(sem, false); 161 | return (one / self.cast(sem).neg().exp()).cast(orig_sem); 162 | } 163 | 164 | Self::exp_range_reduce(&self.cast(sem)).cast(orig_sem) 165 | } 166 | } 167 | 168 | #[test] 169 | fn test_exp() { 170 | assert_eq!(Float::from_f64(2.51).exp().as_f64(), 12.30493006051041); 171 | 172 | for x in [ 173 | 0.000003, 0.001, 0.12, 0.13, 0.5, 1.2, 2.3, 4.5, 9.8, 5.0, 11.2, 15.2, 174 | 25.0, 34.001, 54., 89.1, 91.2, 102.2, 150., 192.4, 212., 256., 102.3, 175 | ] { 176 | let lhs = Float::from_f64(x).exp().as_f64(); 177 | let rhs = x.exp(); 178 | assert_eq!(lhs, rhs); 179 | } 180 | } 181 | 182 | impl Float { 183 | /// Computes the sigmoid function of this number. 184 | /// Defined as ( 1 / 1 + e(-x)). 
185 | pub fn sigmoid(&self) -> Self { 186 | // https://en.wikipedia.org/wiki/Sigmoid_function 187 | let one = Self::one(self.get_semantics(), false); 188 | 189 | if self.is_inf() { 190 | return Self::one(self.get_semantics(), self.get_sign()); 191 | } else if self.is_zero() { 192 | use RoundingMode::Zero as rm; 193 | return one.scale(-1, rm); 194 | } else if self.is_nan() { 195 | return self.clone(); 196 | } 197 | 198 | let ex = self.exp(); 199 | &ex / (&ex + &one) 200 | } 201 | } 202 | 203 | #[test] 204 | pub fn test_sigmoid() { 205 | // Generate a test vector using the python program: 206 | // 207 | // import numpy as np 208 | // array = np.array([-0.5, 0, 0.5, 0.99, 1.0, 2.3, 100.0]) 209 | // def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) 210 | let inp = [-0.5, 0., 0.5, 0.99, 1., 2.3, 100.]; 211 | let out = [ 212 | 0.37754067, 0.5, 0.62245933, 0.72908792, 0.73105858, 0.90887704, 1., 213 | ]; 214 | 215 | for (x, o) in inp.iter().zip(out.iter()) { 216 | let x = Float::from_f64(*x); 217 | let o = Float::from_f64(*o); 218 | let res = x.sigmoid(); 219 | assert_eq!(o.as_f32(), res.as_f32()) 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/operations/frac.rs: -------------------------------------------------------------------------------- 1 | //! This module contains the implementation of methods that compute continued 2 | //! fraction. 3 | 4 | use crate::{bigint::BigInt, Float}; 5 | 6 | impl Float { 7 | /// Convert the number to a Continued Fraction of two integers. 8 | /// The fraction is computed using 'n' iterations of the form: 9 | /// a0 + 1/(a1 + 1/(a2 + 1/( ... ))). 10 | /// This method discards the sign, and returns (0, 0) for Inf and NaN. 11 | pub fn as_fraction(&self, n: usize) -> (BigInt, BigInt) { 12 | if self.is_zero() { 13 | return (BigInt::zero(), BigInt::one()); // Zero. 14 | } else if self.is_inf() || self.is_nan() { 15 | return (BigInt::zero(), BigInt::zero()); // Invalid. 
16 | } 17 | 18 | // Algorithm from: 19 | // Elementary Functions: Algorithms and Implementation 20 | // 9.3.1 A few basic notions on continued fractions - Page 180. 21 | extern crate alloc; 22 | use alloc::vec::Vec; 23 | let sem = self.get_semantics(); 24 | let rm = sem.get_rounding_mode(); 25 | 26 | let one = Self::one(sem, false); 27 | let mut real = self.clone(); 28 | let mut a: Vec = Vec::new(); 29 | 30 | for _ in 0..n.max(2) { 31 | let int = real.trunc(); 32 | a.push(int.convert_normal_to_integer(rm)); 33 | let denominator = real - int; 34 | if denominator.is_zero() { 35 | break; 36 | } 37 | real = &one / (denominator); 38 | } 39 | 40 | if a.len() < 2 { 41 | return (a[0].clone(), BigInt::one()); // Found an exact value. 42 | } 43 | 44 | let one = BigInt::one(); 45 | let mut p = (&one + &(&a[0] * &a[1]), a[0].clone()); 46 | let mut q = (a[1].clone(), one); 47 | 48 | if n < 2 { 49 | return (p.1, q.1); 50 | } 51 | 52 | for elem in a.iter().skip(2) { 53 | p = (&p.1 + &(elem * &p.0), p.0); 54 | q = (&q.1 + &(elem * &q.0), q.0); 55 | } 56 | 57 | (p.0, q.0) 58 | } 59 | } 60 | 61 | #[cfg(feature = "std")] 62 | #[test] 63 | fn test_frac() { 64 | use crate::FP128; 65 | let x = Float::pi(FP128); 66 | 67 | // Verified with https://oeis.org/A001203. 
68 | let (p, q) = x.as_fraction(1); 69 | assert_eq!((3, 1), (p.as_u64(), q.as_u64())); 70 | let (p, q) = x.as_fraction(2); 71 | assert_eq!((22, 7), (p.as_u64(), q.as_u64())); 72 | let (p, q) = x.as_fraction(3); 73 | assert_eq!((333, 106), (p.as_u64(), q.as_u64())); 74 | let (p, q) = x.as_fraction(4); 75 | assert_eq!((355, 113), (p.as_u64(), q.as_u64())); 76 | } 77 | 78 | #[cfg(feature = "std")] 79 | #[test] 80 | fn fix_loop_bug() { 81 | let (p, q) = Float::from_f64(5.).as_fraction(3); 82 | assert_eq!((5, 1), (p.as_u64(), q.as_u64())); 83 | 84 | let (p, q) = Float::from_f64(0.5).as_fraction(3); 85 | assert_eq!((1, 2), (p.as_u64(), q.as_u64())); 86 | } 87 | -------------------------------------------------------------------------------- /src/operations/functions.rs: -------------------------------------------------------------------------------- 1 | //! This module contains the implementation of several arithmetic operations. 2 | 3 | use crate::RoundingMode; 4 | 5 | use crate::float::Float; 6 | 7 | impl Float { 8 | /// Return this number raised to the power of 'n'. 9 | pub fn powi(&self, mut n: u64) -> Self { 10 | let sem = self.get_semantics().increase_precision(2); 11 | let mut elem = Self::one(sem, false); 12 | // This algorithm is similar to binary conversion. Each bit in 'n' 13 | // represents a power-of-two number, like 1,2,4,8 ... We know how to 14 | // generate numbers to the power of an even number by squaring the 15 | // number log2 times. So, we just multiply all of the numbers together 16 | // to get the result. This is like converting a binary number to integer 17 | // except that instead of adding we multiply the values. 18 | let mut val = self.cast(sem); 19 | while n > 0 { 20 | if n & 1 == 1 { 21 | elem *= &val; 22 | } 23 | val *= &val.clone(); 24 | n >>= 1; 25 | } 26 | elem.cast(self.get_semantics()) 27 | } 28 | 29 | /// Calculates the power of two. 
30 | pub fn sqr(&self) -> Self { 31 | self.powi(2) 32 | } 33 | /// Calculates the square root of the number. 34 | pub fn sqrt(&self) -> Self { 35 | let sem = self.get_semantics(); 36 | if self.is_zero() { 37 | return self.clone(); // (+/-) zero 38 | } else if self.is_nan() || self.is_negative() { 39 | return Self::nan(sem, self.get_sign()); // (-/+)Nan, -Number. 40 | } else if self.is_inf() { 41 | return self.clone(); // Inf+. 42 | } 43 | 44 | let target = self.clone(); 45 | let two = Self::from_u64(sem, 2); 46 | 47 | // Start the search at max(2, x). 48 | let mut x = if target < two { two } else { target.clone() }; 49 | let mut prev = x.clone(); 50 | 51 | // Use the Newton Raphson method. 52 | loop { 53 | x += &target / &x; 54 | x = x.scale(-1, RoundingMode::NearestTiesToEven); 55 | // Stop when value did not change or regressed. 56 | if prev < x || x == prev { 57 | return x; 58 | } 59 | prev = x.clone(); 60 | } 61 | } 62 | 63 | /// Returns the absolute value of this float. 64 | pub fn abs(&self) -> Self { 65 | let mut x = self.clone(); 66 | x.set_sign(false); 67 | x 68 | } 69 | 70 | /// Returns the greater of self and `other`. 71 | pub fn max(&self, other: &Self) -> Self { 72 | if self.is_nan() { 73 | return other.clone(); 74 | } else if other.is_nan() { 75 | return self.clone(); 76 | } else if self.get_sign() != other.get_sign() { 77 | return if self.get_sign() { 78 | other.clone() 79 | } else { 80 | self.clone() 81 | }; // Handle (+-)0. 82 | } 83 | if self > other { 84 | self.clone() 85 | } else { 86 | other.clone() 87 | } 88 | } 89 | 90 | /// Returns the smaller of self and `other`. 91 | pub fn min(&self, other: &Self) -> Self { 92 | if self.is_nan() { 93 | return other.clone(); 94 | } else if other.is_nan() { 95 | return self.clone(); 96 | } else if self.get_sign() != other.get_sign() { 97 | return if self.get_sign() { 98 | self.clone() 99 | } else { 100 | other.clone() 101 | }; // Handle (+-)0. 
102 | } 103 | if self > other { 104 | other.clone() 105 | } else { 106 | self.clone() 107 | } 108 | } 109 | } 110 | 111 | #[cfg(feature = "std")] 112 | #[test] 113 | fn test_sqrt() { 114 | use crate::utils; 115 | use crate::FP64; 116 | 117 | // Try a few power-of-two values. 118 | for i in 0..256 { 119 | let v16 = Float::from_u64(FP64, i * i); 120 | assert_eq!(v16.sqrt().as_f64(), (i) as f64); 121 | } 122 | 123 | // Test the category and value of the different special values (inf, zero, 124 | // correct sign, etc). 125 | for v_f64 in utils::get_special_test_values() { 126 | let vf = Float::from_f64(v_f64); 127 | assert_eq!(vf.sqrt().is_inf(), v_f64.sqrt().is_infinite()); 128 | assert_eq!(vf.sqrt().is_nan(), v_f64.sqrt().is_nan()); 129 | assert_eq!(vf.sqrt().is_negative(), v_f64.sqrt().is_sign_negative()); 130 | } 131 | 132 | // Test precomputed values. 133 | fn check(inp: f64, res: f64) { 134 | assert_eq!(Float::from_f64(inp).sqrt().as_f64(), res); 135 | } 136 | check(1.5, 1.224744871391589); 137 | check(2.3, 1.51657508881031); 138 | check(6.7, 2.588435821108957); 139 | check(7.9, 2.8106938645110393); 140 | check(11.45, 3.383784863137726); 141 | check(1049.3, 32.39290045673589); 142 | check(90210.7, 300.35096137685326); 143 | check(199120056003.73413, 446228.70369770494); 144 | check(0.6666666666666666, 0.816496580927726); 145 | check(0.4347826086956522, 0.6593804733957871); 146 | check(0.14925373134328357, 0.3863337046431279); 147 | check(0.12658227848101264, 0.35578403348241); 148 | check(0.08733624454148473, 0.29552706228277087); 149 | check(0.0009530162965786716, 0.030870962028719993); 150 | check(1.1085159520988087e-5, 0.00332943831914455); 151 | check(5.0120298432056786e-8, 0.0002238756316173263); 152 | } 153 | 154 | #[cfg(feature = "std")] 155 | #[test] 156 | fn test_min_max() { 157 | use crate::utils; 158 | 159 | fn check(v0: f64, v1: f64) { 160 | // Min. 
161 | let correct = v0.min(v1); 162 | let test = Float::from_f64(v0).min(&Float::from_f64(v1)).as_f64(); 163 | assert_eq!(test.is_nan(), correct.is_nan()); 164 | if !correct.is_nan() { 165 | assert_eq!(correct, test); 166 | } 167 | // Max. 168 | let correct = v0.max(v1); 169 | let test = Float::from_f64(v0).max(&Float::from_f64(v1)).as_f64(); 170 | assert_eq!(test.is_nan(), correct.is_nan()); 171 | if !correct.is_nan() { 172 | assert_eq!(correct, test); 173 | } 174 | } 175 | 176 | // Test a bunch of special values (Inf, Epsilon, Nan, (+-)Zeros). 177 | for v0 in utils::get_special_test_values() { 178 | for v1 in utils::get_special_test_values() { 179 | check(v0, v1); 180 | } 181 | } 182 | 183 | let mut lfsr = utils::Lfsr::new(); 184 | 185 | for _ in 0..100 { 186 | let v0 = f64::from_bits(lfsr.get64()); 187 | let v1 = f64::from_bits(lfsr.get64()); 188 | check(v0, v1); 189 | } 190 | } 191 | 192 | #[cfg(feature = "std")] 193 | #[test] 194 | fn test_abs() { 195 | use crate::utils; 196 | 197 | for v in utils::get_special_test_values() { 198 | if !v.is_nan() { 199 | assert_eq!(Float::from_f64(v).abs().as_f64(), v.abs()); 200 | } 201 | } 202 | } 203 | 204 | // Compute basic constants. 205 | 206 | impl Float { 207 | /// Similar to 'scalbln'. Adds or subtracts to the exponent of the number, 208 | /// and scaling it by 2^exp. 209 | pub fn scale(&self, scale: i64, rm: RoundingMode) -> Self { 210 | use crate::bigint::LossFraction; 211 | if !self.is_normal() { 212 | return self.clone(); 213 | } 214 | 215 | let mut r = Self::from_parts( 216 | self.get_semantics(), 217 | self.get_sign(), 218 | self.get_exp() + scale, 219 | self.get_mantissa(), 220 | ); 221 | r.normalize(rm, LossFraction::ExactlyZero); 222 | r 223 | } 224 | 225 | /// Returns the remainder from a division of two floats. This is equivalent 226 | /// to rust 'rem' or c 'fmod'. 227 | pub fn rem(&self, rhs: &Self) -> Self { 228 | use core::ops::Sub; 229 | // Handle NaNs. 
230 | if self.is_nan() || rhs.is_nan() || self.is_inf() || rhs.is_zero() { 231 | return Self::nan(self.get_semantics(), self.get_sign()); 232 | } 233 | // Handle values that are obviously zero or self. 234 | if self.is_zero() || rhs.is_inf() { 235 | return self.clone(); 236 | } 237 | 238 | // Operate on integers. 239 | let mut lhs = self.abs(); 240 | let rhs = if rhs.is_negative() { 241 | rhs.neg() 242 | } else { 243 | rhs.clone() 244 | }; 245 | debug_assert!(lhs.is_normal() && rhs.is_normal()); 246 | 247 | // This is a clever algorithm. Subtracting the RHS from LHS in a loop 248 | // would be slow, but we perform a divide-like algorithm where we shift 249 | // 'rhs' by higher powers of two, and subtract it from LHS, until LHS is 250 | // lower than RHS. 251 | while lhs >= rhs && lhs.is_normal() { 252 | let scale = lhs.get_exp() - rhs.get_exp(); 253 | 254 | // Scale RHS by a power of two. If we overshoot, take a step back. 255 | let mut diff = rhs.scale(scale, RoundingMode::None); 256 | if diff > lhs { 257 | diff = rhs.scale(scale - 1, RoundingMode::None); 258 | } 259 | 260 | lhs = lhs.sub(diff); 261 | } 262 | 263 | // Set the original sign. 
264 | lhs.set_sign(self.get_sign()); 265 | lhs 266 | } 267 | } 268 | 269 | #[test] 270 | fn test_scale() { 271 | use crate::FP64; 272 | let x = Float::from_u64(FP64, 1); 273 | let y = x.scale(1, RoundingMode::None); 274 | assert_eq!(y.as_f64(), 2.0); 275 | let z = x.scale(-1, RoundingMode::None); 276 | assert_eq!(z.as_f64(), 0.5); 277 | } 278 | 279 | #[cfg(feature = "std")] 280 | #[test] 281 | fn test_rem() { 282 | use crate::utils; 283 | use crate::utils::Lfsr; 284 | 285 | use core::ops::Rem; 286 | 287 | fn check_two_numbers(v0: f64, v1: f64) { 288 | let f0 = Float::from_f64(v0); 289 | let f1 = Float::from_f64(v1); 290 | let r0 = v0.rem(v1); 291 | let r1 = f0.rem(&f1).as_f64(); 292 | assert_eq!(r0.is_nan(), r1.is_nan()); 293 | if !r0.is_nan() { 294 | assert_eq!(r0, r1); 295 | } 296 | } 297 | 298 | // Test addition, multiplication, subtraction with random values. 299 | check_two_numbers(1.4, 2.5); 300 | check_two_numbers(2.4, 1.5); 301 | check_two_numbers(1000., std::f64::consts::PI); 302 | check_two_numbers(10000000000000000000., std::f64::consts::PI / 1000.); 303 | check_two_numbers(10000000000000000000., std::f64::consts::PI); 304 | check_two_numbers(100., std::f64::consts::PI); 305 | check_two_numbers(100., -std::f64::consts::PI); 306 | check_two_numbers(0., 10.); 307 | check_two_numbers(std::f64::consts::PI, 10.0); 308 | 309 | // Test a bunch of random values: 310 | let mut lfsr = Lfsr::new(); 311 | for _ in 0..5000 { 312 | let v0 = f64::from_bits(lfsr.get64()); 313 | let v1 = f64::from_bits(lfsr.get64()); 314 | check_two_numbers(v0, v1); 315 | } 316 | 317 | // Test the hard cases: 318 | for v0 in utils::get_special_test_values() { 319 | for v1 in utils::get_special_test_values() { 320 | check_two_numbers(v0, v1); 321 | } 322 | } 323 | } 324 | 325 | #[test] 326 | fn test_powi() { 327 | assert_eq!(Float::from_f64(2.).powi(0).as_f64(), 1.); 328 | assert_eq!(Float::from_f64(2.).powi(1).as_f64(), 2.); 329 | assert_eq!(Float::from_f64(2.).powi(3).as_f64(), 8.); 330 
| assert_eq!(Float::from_f64(2.).powi(5).as_f64(), 32.); 331 | assert_eq!(Float::from_f64(2.).powi(10).as_f64(), 1024.); 332 | assert_eq!(Float::from_f64(0.3).powi(3).as_f64(), 0.026999999999999996); 333 | } 334 | 335 | impl Float { 336 | /// Return this number raised to the power of 'n'. 337 | /// Computed using e^(n * log(self)) 338 | pub fn pow(&self, n: &Float) -> Self { 339 | let orig_sem = self.get_semantics(); 340 | let one = Self::one(orig_sem, false); 341 | let sign = self.get_sign(); 342 | 343 | assert_eq!(orig_sem, n.get_semantics()); 344 | 345 | if *self == one { 346 | return self.clone(); 347 | } else if n.is_inf() || n.is_nan() { 348 | return Self::nan(orig_sem, sign); 349 | } else if n.is_zero() { 350 | return Self::one(orig_sem, sign); 351 | } else if self.is_zero() { 352 | return if n.is_negative() { 353 | Self::inf(orig_sem, sign) 354 | } else { 355 | Self::zero(orig_sem, sign) 356 | }; 357 | } else if self.is_negative() || self.is_inf() || self.is_nan() { 358 | return Self::nan(orig_sem, sign); 359 | } 360 | 361 | let sem = orig_sem.grow_log(10).increase_exponent(10); 362 | (n.cast(sem) * self.cast(sem).log()).exp().cast(orig_sem) 363 | } 364 | } 365 | 366 | #[test] 367 | fn test_pow() { 368 | fn my_pow(a: f32, b: f32) -> f32 { 369 | Float::from_f32(a).pow(&Float::from_f32(b)).as_f32() 370 | } 371 | 372 | assert_eq!(my_pow(1.24, 1.2), 1.2945118); 373 | assert_eq!(my_pow(0.94, 13.), 0.44736509); 374 | assert_eq!(my_pow(0.11, -8.), 46650738.02097334); 375 | assert_eq!(my_pow(40.0, 3.1), 92552.0); 376 | 377 | for i in 0..30 { 378 | for j in -10..10 { 379 | let i = i as f64; 380 | let j = j as f64; 381 | let res = i.powf(j); 382 | let res2 = Float::from_f64(i).pow(&Float::from_f64(j)); 383 | assert_eq!(res, res2.as_f64()); 384 | } 385 | } 386 | } 387 | -------------------------------------------------------------------------------- /src/operations/mod.rs: -------------------------------------------------------------------------------- 1 | //! 
Contains the implementations of various mathematical functions and 2 | //! constants. 3 | 4 | #[cfg(feature = "std")] 5 | extern crate std; 6 | 7 | mod constants; 8 | mod exp; 9 | mod frac; 10 | mod functions; 11 | mod trig; 12 | -------------------------------------------------------------------------------- /src/operations/trig.rs: -------------------------------------------------------------------------------- 1 | //! This module contains the implementation of trigonometric functions. 2 | //! 3 | use crate::float::Float; 4 | use crate::RoundingMode; 5 | 6 | impl Float { 7 | /// sin(x) = x - x^3 / 3! + x^5 / 5! - x^7/7! .... 8 | fn sin_taylor(x: &Self) -> Self { 9 | use crate::bigint::BigInt; 10 | let sem = x.get_semantics(); 11 | 12 | let mut neg = false; 13 | let mut top = x.clone(); 14 | let mut bottom = BigInt::one(); 15 | let mut sum = Self::zero(sem, false); 16 | let x2 = x.sqr(); 17 | let mut prev = Self::one(sem, true); 18 | for i in 1..50 { 19 | if prev == sum { 20 | break; // Stop if we are not making progress. 21 | } 22 | prev = sum.clone(); 23 | // Update sum. 24 | let elem = &top / &Self::from_bigint(sem, bottom.clone()); 25 | sum = if neg { sum - elem } else { sum + elem }; 26 | 27 | // Prepare the next element. 
28 | top = &top * &x2; 29 | let next_term = BigInt::from_u64((i * 2) * (i * 2 + 1)); 30 | bottom *= next_term; 31 | neg ^= true; 32 | } 33 | 34 | sum 35 | } 36 | 37 | /// Reduce sin(x) in the range 0..pi/2, using the identity: 38 | /// sin(3x) = 3sin(x)-4(sin(x)^3) 39 | fn sin_step4_reduction(x: &Self, steps: usize) -> Self { 40 | use RoundingMode::None as rm; 41 | if steps == 0 { 42 | return Self::sin_taylor(x); 43 | } 44 | let i3 = Float::from_u64(x.get_semantics(), 3); 45 | let x3 = Float::div_with_rm(x, &i3, rm); 46 | let sx = Float::sin_step4_reduction(&x3, steps - 1); 47 | let sx3 = Float::mul_with_rm(&sx, &i3, rm); 48 | Float::sub_with_rm(&sx3, &sx.powi(3).scale(2, rm), rm) 49 | } 50 | 51 | /// Computes the sine of the number (in radians). 52 | pub fn sin(&self) -> Self { 53 | use RoundingMode::None as rm; 54 | // Fast Trigonometric functions for Arbitrary Precision number 55 | // by Henrik Vestermark. 56 | 57 | if self.is_zero() || self.is_nan() { 58 | return self.clone(); 59 | } 60 | 61 | if self.is_inf() { 62 | return Self::nan(self.get_semantics(), self.get_sign()); 63 | } 64 | 65 | let orig_sem = self.get_semantics(); 66 | let sem = orig_sem.grow_log(12).increase_exponent(4); 67 | 68 | assert!(self.is_normal()); 69 | 70 | let mut neg = false; 71 | 72 | let mut val = self.cast_with_rm(sem, rm); 73 | 74 | // Handle the negatives. 75 | if val.is_negative() { 76 | val = val.neg(); 77 | neg ^= true; 78 | } 79 | 80 | // Range reductions. 81 | let is_small = self.get_exp() < 0; 82 | 83 | if !is_small { 84 | let pi = Self::pi(sem); 85 | let pi2 = pi.scale(1, rm); 86 | let pi_half = pi.scale(-1, rm); 87 | 88 | // Step 1 89 | if val > pi2 { 90 | val = val.rem(&pi2); 91 | } 92 | 93 | debug_assert!(val <= pi2); 94 | // Step 2. 95 | if val > pi { 96 | val = Float::sub_with_rm(&val, &pi, rm); 97 | neg ^= true; 98 | } 99 | 100 | debug_assert!(val <= pi); 101 | // Step 3. 
102 | if val > pi_half { 103 | val = Float::sub_with_rm(&pi, &val, rm); 104 | } 105 | debug_assert!(val <= pi_half); 106 | } 107 | 108 | // Calculate the number of needed reduction: 8[2/3 * log(2) * log(p)]; 109 | let k = orig_sem.log_precision() * 4; 110 | 111 | let res = Self::sin_step4_reduction(&val, k); 112 | let res = if neg { res.neg() } else { res }; 113 | res.cast(orig_sem) 114 | } 115 | } 116 | 117 | #[cfg(feature = "std")] 118 | #[test] 119 | fn test_sin_known_value() { 120 | use crate::std::string::ToString; 121 | // Verify the results with: 122 | // from mpmath import mp 123 | // mp.dps = 1000 124 | // mp.sin(801./10000) 125 | let res = Float::from_f64(801. / 10000.).sin().to_string(); 126 | assert_eq!(res, ".08001437374006335"); 127 | let res = Float::from_f64(90210. / 10000.).sin().to_string(); 128 | assert_eq!(res, ".3928952872542333"); 129 | let res = Float::from_f64(95051.).sin().to_string(); 130 | assert_eq!(res, "-.8559198239971502"); 131 | } 132 | 133 | #[cfg(feature = "std")] 134 | #[test] 135 | fn test_sin() { 136 | use crate::utils; 137 | 138 | for i in -100..100 { 139 | let f0 = i as f64; 140 | let r0 = f0.sin(); 141 | let r1 = Float::from_f64(f0).sin().as_f64(); 142 | assert_eq!(r0, r1); 143 | } 144 | 145 | for i in -300..300 { 146 | let f0 = (i as f64) / 100.; 147 | let r0 = f0.sin(); 148 | let r1 = Float::from_f64(f0).sin().as_f64(); 149 | assert_eq!(r0, r1); 150 | } 151 | 152 | // Test non-normal values. 153 | for v in utils::get_special_test_values() { 154 | if v.is_normal() { 155 | continue; 156 | } 157 | let r0 = v.sin(); 158 | let r1 = Float::from_f64(v).sin().as_f64(); 159 | assert_eq!(r0.is_nan(), r1.is_nan()); 160 | if !r0.is_nan() { 161 | assert_eq!(r0, r1); 162 | } 163 | } 164 | } 165 | 166 | impl Float { 167 | /// cos(x) = 1 - x^2 / 2! + x^4 / 4! - x^6/6! .... 
168 | fn cos_taylor(x: &Self) -> Self { 169 | use crate::bigint::BigInt; 170 | let sem = x.get_semantics(); 171 | 172 | let mut neg = false; 173 | let mut top = Self::one(sem, false); 174 | let mut bottom = BigInt::one(); 175 | let mut sum = Self::zero(sem, false); 176 | let x2 = x.sqr(); 177 | let mut prev = Self::one(sem, true); 178 | for i in 1..50 { 179 | if prev == sum { 180 | break; // Stop if we are not making progress. 181 | } 182 | prev = sum.clone(); 183 | 184 | // Update sum. 185 | let elem = &top / &Self::from_bigint(sem, bottom.clone()); 186 | sum = if neg { sum - elem } else { sum + elem }; 187 | 188 | // Prepare the next element. 189 | top = &top * &x2; 190 | let next_term = BigInt::from_u64((i * 2 - 1) * (i * 2)); 191 | bottom *= next_term; 192 | 193 | neg ^= true; 194 | } 195 | 196 | sum 197 | } 198 | 199 | /// Reduce cos(x) in the range 0..pi/2, using the identity: 200 | /// cos(2x) = 2cos(x)^2 - 1 201 | fn cos_step4_reduction(x: &Self, steps: usize) -> Self { 202 | use RoundingMode::None as rm; 203 | if steps == 0 { 204 | return Self::cos_taylor(x); 205 | } 206 | let sem = x.get_semantics(); 207 | let one = Float::one(sem, false); 208 | let half_x = x.scale(-1, rm); 209 | let sx = Float::cos_step4_reduction(&half_x, steps - 1); 210 | Float::sub_with_rm(&sx.sqr().scale(1, rm), &one, rm) 211 | } 212 | 213 | /// Computes the cosine of the number (in radians). 214 | pub fn cos(&self) -> Self { 215 | use RoundingMode::None as rm; 216 | // Fast Trigonometric functions for Arbitrary Precision number 217 | // by Henrik Vestermark. 
218 | 219 | if self.is_nan() { 220 | return self.clone(); 221 | } 222 | 223 | if self.is_zero() { 224 | return Self::one(self.get_semantics(), false); 225 | } 226 | 227 | if self.is_inf() { 228 | return Self::nan(self.get_semantics(), self.get_sign()); 229 | } 230 | 231 | let orig_sem = self.get_semantics(); 232 | let sem = orig_sem.grow_log(14).increase_exponent(4); 233 | 234 | assert!(self.is_normal()); 235 | 236 | let mut neg = false; 237 | 238 | let mut val = self.cast_with_rm(sem, rm); 239 | 240 | // Handle the negatives. 241 | if val.is_negative() { 242 | val = val.neg(); 243 | } 244 | 245 | // Range reductions. 246 | let is_small = self.get_exp() < 0; // X < 1. 247 | 248 | if !is_small { 249 | let pi = Self::pi(sem); 250 | let pi2 = pi.scale(1, rm); 251 | let pi_half = pi.scale(-1, rm); 252 | 253 | // Step 1 254 | if val > pi2 { 255 | val = val.rem(&pi2); 256 | } 257 | debug_assert!(val <= pi2); 258 | 259 | // Step 2. 260 | if val > pi { 261 | val = Float::sub_with_rm(&pi2, &val, rm); 262 | } 263 | 264 | debug_assert!(val <= pi); 265 | // Step 3. 266 | if val > pi_half { 267 | val = Float::sub_with_rm(&pi, &val, rm); 268 | neg ^= true; 269 | } 270 | debug_assert!(val <= pi_half); 271 | } 272 | 273 | // Calculate the number of needed reduction: 2[log(2) * log(p)]; 274 | let k = (sem.log_precision() * 8) / 10; 275 | 276 | let res = Self::cos_step4_reduction(&val, k); 277 | let res = if neg { res.neg() } else { res }; 278 | res.cast(orig_sem) 279 | } 280 | } 281 | 282 | #[cfg(feature = "std")] 283 | #[test] 284 | fn test_cos_known_value() { 285 | use crate::std::string::ToString; 286 | 287 | // Verify the results with: 288 | // from mpmath import mp 289 | // mp.dps = 100 290 | // mp.cos(801./10000) 291 | let res = Float::from_f64(801. / 10000.).cos().to_string(); 292 | assert_eq!(res, ".9967937098492272"); 293 | let res = Float::from_f64(2.3).cos().to_string(); 294 | assert_eq!(res, "-.6662760212798241"); 295 | let res = Float::from_f64(90210. 
/ 10000.).cos().to_string(); 296 | assert_eq!(res, "-.9195832171442742"); 297 | let res = Float::from_f64(95051.).cos().to_string(); 298 | assert_eq!(res, ".5171085523259959"); 299 | } 300 | 301 | #[cfg(feature = "std")] 302 | #[test] 303 | fn test_cos() { 304 | use crate::utils; 305 | 306 | for i in -100..100 { 307 | let f0 = i as f64; 308 | let r0 = f0.cos(); 309 | let r1 = Float::from_f64(f0).cos().as_f64(); 310 | assert_eq!(r0, r1); 311 | } 312 | 313 | // The native implementation of sin is not accurate to all 64 bits, so 314 | // we just pick a few values where we happen to get lucky and native sin 315 | // matches the arbitrary precision implementation. 316 | for i in -100..100 { 317 | let f0 = (i as f64) / 100.; 318 | let r0 = f0.cos(); 319 | let r1 = Float::from_f64(f0).cos().as_f64(); 320 | assert_eq!(r0, r1); 321 | } 322 | 323 | // Test non-normal values. 324 | for v in utils::get_special_test_values() { 325 | if v.is_normal() { 326 | continue; 327 | } 328 | let r0 = v.cos(); 329 | let r1 = Float::from_f64(v).cos().as_f64(); 330 | assert_eq!(r0.is_nan(), r1.is_nan()); 331 | if !r0.is_nan() { 332 | assert_eq!(r0, r1); 333 | } 334 | } 335 | } 336 | 337 | impl Float { 338 | /// Computes the tangent of the number (in radians). 339 | pub fn tan(&self) -> Self { 340 | use RoundingMode::None as rm; 341 | // Fast Trigonometric functions for Arbitrary Precision number 342 | // by Henrik Vestermark. 343 | 344 | if self.is_zero() || self.is_nan() { 345 | return self.clone(); 346 | } 347 | 348 | if self.is_inf() { 349 | return Self::nan(self.get_semantics(), self.get_sign()); 350 | } 351 | 352 | let orig_sem = self.get_semantics(); 353 | let sem = orig_sem.grow_log(12).increase_exponent(4); 354 | 355 | assert!(self.is_normal()); 356 | 357 | let mut neg = false; 358 | 359 | let mut val = self.cast_with_rm(sem, rm); 360 | 361 | // Handle the negatives. 362 | if val.is_negative() { 363 | val = val.neg(); 364 | neg ^= true; 365 | } 366 | 367 | // Range reductions. 
368 | let is_small = self.get_exp() < 0; 369 | 370 | if !is_small { 371 | let pi = Self::pi(sem); 372 | let half_pi = pi.scale(-1, rm); 373 | 374 | // Wrap around pi. 375 | if val > pi { 376 | val = val.rem(&pi); 377 | } 378 | debug_assert!(val <= pi); 379 | 380 | // Reduce to 0..pi/2. 381 | if val > half_pi { 382 | val = pi - val; 383 | neg ^= true; 384 | } 385 | debug_assert!(val <= half_pi); 386 | } 387 | 388 | // Tan(x) = sin(x)/sqrt(1-sin(x)^2). 389 | let sinx = val.sin(); 390 | let one = Float::one(sem, false); 391 | let bottom = (one - sinx.sqr()).sqrt(); 392 | let res = sinx / bottom; 393 | let res = if neg { res.neg() } else { res }; 394 | res.cast(orig_sem) 395 | } 396 | } 397 | 398 | #[cfg(feature = "std")] 399 | #[test] 400 | fn test_tan_known_value() { 401 | use crate::std::string::ToString; 402 | 403 | // Verify the results with: 404 | // from mpmath import mp 405 | // mp.dps = 100 406 | // mp.tan(801./10000) 407 | let res = Float::from_f64(801. / 10000.).tan().to_string(); 408 | assert_eq!(res, ".08027174825588148"); 409 | let res = Float::from_f64(2.3).tan().to_string(); 410 | assert_eq!(res, "-1.1192136417341325"); 411 | let res = Float::from_f64(90210. / 10000.).tan().to_string(); 412 | assert_eq!(res, "-.4272536513599634"); 413 | let res = Float::from_f64(95051.).tan().to_string(); 414 | assert_eq!(res, "-1.6552033806966715"); 415 | } 416 | -------------------------------------------------------------------------------- /src/py.rs: -------------------------------------------------------------------------------- 1 | use crate::{BigInt, Float, RoundingMode, Semantics}; 2 | use core::ops::{Add, Div, Mul, Sub}; 3 | use pyo3::prelude::*; 4 | use std::format; 5 | use std::string::String; 6 | use std::string::ToString; 7 | 8 | /// Semantics class defining precision and rounding behavior. 9 | /// 10 | /// This class encapsulates the parameters that define the precision and 11 | /// rounding behavior of floating-point operations. 
12 | #[pyclass] 13 | struct PySemantics { 14 | inner: Semantics, 15 | } 16 | 17 | #[pymethods] 18 | impl PySemantics { 19 | /// Create a new semantics object. 20 | /// 21 | /// Args: 22 | /// exp_size: The size of the exponent in bits 23 | /// mantissa_size: The size of the mantissa, including the implicit bit 24 | /// rounding_mode: The rounding mode to use: 25 | /// "NearestTiesToEven", "NearestTiesToAway", 26 | /// "Zero", "Positive", "Negative" 27 | #[new] 28 | fn new(exp_size: i64, mantissa_size: u64, rounding_mode_str: &str) -> Self { 29 | let rm = RoundingMode::from_string(rounding_mode_str); 30 | assert!(rm.is_some(), "Invalid rounding mode"); 31 | let sem = Semantics::new( 32 | exp_size as usize, 33 | mantissa_size as usize, 34 | rm.unwrap(), 35 | ); 36 | PySemantics { inner: sem } 37 | } 38 | /// Returns the length of the exponent in bits. 39 | fn get_exponent_len(&self) -> usize { 40 | self.inner.get_exponent_len() 41 | } 42 | /// Returns the length of the mantissa in bits. 43 | fn get_mantissa_len(&self) -> usize { 44 | self.inner.get_mantissa_len() 45 | } 46 | /// Returns the rounding mode as a string. 47 | fn get_rounding_mode(&self) -> String { 48 | self.inner.get_rounding_mode().as_string().to_string() 49 | } 50 | fn __str__(&self) -> String { 51 | format!("{:?}", self.inner) 52 | } 53 | fn __repr__(&self) -> String { 54 | self.__str__() 55 | } 56 | /// Returns the maximum positive value of the number. 57 | fn get_max_positive_value(&self) -> PyFloat { 58 | PyFloat { 59 | inner: self.inner.get_max_positive_value(), 60 | } 61 | } 62 | /// Returns the minimum positive value of the number. 63 | fn get_min_positive_value(&self) -> PyFloat { 64 | PyFloat { 65 | inner: self.inner.get_min_positive_value(), 66 | } 67 | } 68 | /// Returns true if the number can be represented exactly in this format. 69 | /// A number can be represented exactly if the exponent is in the range, and 70 | /// the mantissa is not too large. 
In other words, the number 'val' can be 71 | /// converted to this format without any loss of accuracy. 72 | fn can_represent_exactly(&self, val: &PyFloat) -> bool { 73 | self.inner.can_represent_exactly(&val.inner) 74 | } 75 | } 76 | 77 | /// A class representing arbitrary precision floating-point numbers. 78 | /// 79 | /// This class implements IEEE 754-like floating-point arithmetic with 80 | /// configurable precision and rounding modes. 81 | #[pyclass] 82 | struct PyFloat { 83 | inner: Float, 84 | } 85 | 86 | #[pymethods] 87 | impl PyFloat { 88 | /// Create a new floating-point number. 89 | /// 90 | /// Args: 91 | /// sem: The semantics (precision and rounding mode) for this number 92 | /// is_negative: Whether the number is negative (sign bit) 93 | /// exp: The biased exponent value (integer) 94 | /// mantissa: The mantissa value (integer) 95 | #[new] 96 | fn new( 97 | sem: &Bound<'_, PyAny>, 98 | is_negative: bool, 99 | exp: i64, 100 | mantissa: u64, 101 | ) -> Self { 102 | let sem: PyRef = sem.extract().unwrap(); 103 | let mut man = BigInt::from_u64(mantissa); 104 | man.flip_bit(sem.inner.get_mantissa_len()); // Add the implicit bit. 105 | let bias = sem.inner.get_bias(); 106 | PyFloat { 107 | inner: Float::from_parts(sem.inner, is_negative, exp - bias, man), 108 | } 109 | } 110 | 111 | fn __str__(&self) -> String { 112 | self.inner.to_string() 113 | } 114 | fn __repr__(&self) -> String { 115 | self.__str__() 116 | } 117 | /// Returns the mantissa of the float. 118 | fn get_mantissa(&self) -> u64 { 119 | self.inner.get_mantissa().as_u64() 120 | } 121 | /// Returns the exponent of the float. 122 | fn get_exponent(&self) -> i64 { 123 | self.inner.get_exp() 124 | } 125 | /// Returns the category of the float. 126 | fn get_category(&self) -> String { 127 | format!("{:?}", self.inner.get_category()) 128 | } 129 | /// Returns the semantics of the float. 
130 | fn get_semantics(&self) -> PySemantics { 131 | PySemantics { 132 | inner: self.inner.get_semantics(), 133 | } 134 | } 135 | /// Get rounding mode of the number. 136 | fn get_rounding_mode(&self) -> String { 137 | self.inner.get_rounding_mode().as_string().to_string() 138 | } 139 | /// Returns true if the Float is negative 140 | fn is_negative(&self) -> bool { 141 | self.inner.is_negative() 142 | } 143 | /// Returns true if the Float is +-inf. 144 | fn is_inf(&self) -> bool { 145 | self.inner.is_inf() 146 | } 147 | /// Returns true if the Float is a +- NaN. 148 | fn is_nan(&self) -> bool { 149 | self.inner.is_nan() 150 | } 151 | /// Returns true if the Float is a +- zero. 152 | fn is_zero(&self) -> bool { 153 | self.inner.is_zero() 154 | } 155 | 156 | /// Returns true if this number is normal (not Zero, Nan, Inf). 157 | fn is_normal(&self) -> bool { 158 | self.inner.is_normal() 159 | } 160 | 161 | fn __add__(&self, other: &PyFloat) -> PyFloat { 162 | self.add(other) 163 | } 164 | 165 | fn __sub__(&self, other: &PyFloat) -> PyFloat { 166 | self.sub(other) 167 | } 168 | 169 | fn __mul__(&self, other: &PyFloat) -> PyFloat { 170 | self.mul(other) 171 | } 172 | fn __truediv__(&self, other: &PyFloat) -> PyFloat { 173 | self.div(other) 174 | } 175 | fn add(&self, other: &PyFloat) -> PyFloat { 176 | let val = self.inner.clone().add(other.inner.clone()); 177 | PyFloat { inner: val } 178 | } 179 | fn mul(&self, other: &PyFloat) -> PyFloat { 180 | let val = self.inner.clone().mul(other.inner.clone()); 181 | PyFloat { inner: val } 182 | } 183 | fn sub(&self, other: &PyFloat) -> PyFloat { 184 | let val = self.inner.clone().sub(other.inner.clone()); 185 | PyFloat { inner: val } 186 | } 187 | fn div(&self, other: &PyFloat) -> PyFloat { 188 | let val = self.inner.clone().div(other.inner.clone()); 189 | PyFloat { inner: val } 190 | } 191 | /// Returns the number raised to the power of `exp` which is an integer. 
192 | fn powi(&self, exp: u64) -> PyFloat { 193 | PyFloat { 194 | inner: self.inner.powi(exp), 195 | } 196 | } 197 | /// Returns the number raised to the power of `exp` which is a float. 198 | fn pow(&self, exp: &PyFloat) -> PyFloat { 199 | PyFloat { 200 | inner: self.inner.pow(&exp.inner), 201 | } 202 | } 203 | /// Returns the exponential of the number. 204 | fn exp(&self) -> PyFloat { 205 | PyFloat { 206 | inner: self.inner.exp(), 207 | } 208 | } 209 | /// Returns the natural logarithm of the number. 210 | fn log(&self) -> PyFloat { 211 | PyFloat { 212 | inner: self.inner.log(), 213 | } 214 | } 215 | /// Returns the sigmoid of the number. 216 | fn sigmoid(&self) -> PyFloat { 217 | PyFloat { 218 | inner: self.inner.sigmoid(), 219 | } 220 | } 221 | /// Returns the absolute value of the number. 222 | fn abs(&self) -> PyFloat { 223 | PyFloat { 224 | inner: self.inner.abs(), 225 | } 226 | } 227 | /// Returns the maximum of two numbers (as defined by IEEE 754). 228 | fn max(&self, other: &PyFloat) -> PyFloat { 229 | PyFloat { 230 | inner: self.inner.max(&other.inner), 231 | } 232 | } 233 | /// Returns the minimum of two numbers (as defined by IEEE 754). 234 | fn min(&self, other: &PyFloat) -> PyFloat { 235 | PyFloat { 236 | inner: self.inner.min(&other.inner), 237 | } 238 | } 239 | /// Returns the remainder of the division of two numbers. 240 | fn rem(&self, other: &PyFloat) -> PyFloat { 241 | PyFloat { 242 | inner: self.inner.rem(&other.inner), 243 | } 244 | } 245 | /// Cast the number to another semantics. 246 | fn cast(&self, sem: &Bound<'_, PyAny>) -> PyFloat { 247 | let sem: PyRef = sem.extract().unwrap(); 248 | PyFloat { 249 | inner: self.inner.cast(sem.inner), 250 | } 251 | } 252 | /// Cast the number to another semantics with a specific rounding mode. 
253 | fn cast_with_rm(&self, sem: &Bound<'_, PyAny>, rm: &str) -> PyFloat { 254 | let sem: PyRef = sem.extract().unwrap(); 255 | let rm = RoundingMode::from_string(rm); 256 | assert!(rm.is_some(), "Invalid rounding mode"); 257 | PyFloat { 258 | inner: self.inner.cast_with_rm(sem.inner, rm.unwrap()), 259 | } 260 | } 261 | 262 | /// Returns the number with the sign flipped. 263 | fn neg(&self) -> PyFloat { 264 | PyFloat { 265 | inner: self.inner.neg(), 266 | } 267 | } 268 | /// Returns the number with the sign flipped. 269 | fn __neg__(&self) -> PyFloat { 270 | self.neg() 271 | } 272 | /// Returns true if the number is less than the other number. 273 | fn __lt__(&self, other: &PyFloat) -> bool { 274 | self.inner < other.inner 275 | } 276 | /// Returns true if the number is less than or equal to the other number. 277 | fn __le__(&self, other: &PyFloat) -> bool { 278 | self.inner <= other.inner 279 | } 280 | /// Returns true if the number is equal to the other number. 281 | fn __eq__(&self, other: &PyFloat) -> bool { 282 | self.inner == other.inner 283 | } 284 | /// Returns true if the number is not equal to the other number. 285 | fn __ne__(&self, other: &PyFloat) -> bool { 286 | self.inner != other.inner 287 | } 288 | /// Returns true if the number is greater than the other number. 289 | fn __gt__(&self, other: &PyFloat) -> bool { 290 | self.inner > other.inner 291 | } 292 | /// Returns true if the number is greater than or equal to the other number. 293 | fn __ge__(&self, other: &PyFloat) -> bool { 294 | self.inner >= other.inner 295 | } 296 | /// Returns the sine of the number. 297 | fn sin(&self) -> PyFloat { 298 | PyFloat { 299 | inner: self.inner.sin(), 300 | } 301 | } 302 | /// Returns the cosine of the number. 303 | fn cos(&self) -> PyFloat { 304 | PyFloat { 305 | inner: self.inner.cos(), 306 | } 307 | } 308 | /// Returns the tangent of the number. 
309 | fn tan(&self) -> PyFloat { 310 | PyFloat { 311 | inner: self.inner.tan(), 312 | } 313 | } 314 | /// convert to f64. 315 | fn to_float64(&self) -> f64 { 316 | self.inner.as_f64() 317 | } 318 | /// Convert the number to a Continued Fraction of two integers. 319 | /// Take 'n' iterations. 320 | fn as_fraction(&self, n: usize) -> (u64, u64) { 321 | let (a, b) = self.inner.as_fraction(n); 322 | (a.as_u64(), b.as_u64()) 323 | } 324 | /// Prints the number using the internal representation. 325 | fn dump(&self) { 326 | self.inner.dump(); 327 | } 328 | } // impl PyFloat 329 | 330 | /// Returns the mathematical constant pi with the given semantics. 331 | /// 332 | /// Args: 333 | /// sem: The semantics to use for representing pi 334 | #[pyfunction] 335 | fn pi(sem: &Bound<'_, PyAny>) -> PyResult { 336 | let sem: PyRef = sem.extract()?; 337 | Ok(PyFloat { 338 | inner: Float::pi(sem.inner), 339 | }) 340 | } 341 | 342 | /// Returns the fused multiply-add operation of three numbers. 343 | /// 344 | /// Args: (a * b) + c 345 | #[pyfunction] 346 | fn fma(a: &PyFloat, b: &PyFloat, c: &PyFloat) -> PyResult { 347 | Ok(PyFloat { 348 | inner: Float::fma(&a.inner, &b.inner, &c.inner), 349 | }) 350 | } 351 | 352 | /// Returns the mathematical constant e (Euler's number) with the given semantics. 353 | /// 354 | /// Args: 355 | /// sem: The semantics to use for representing e 356 | #[pyfunction] 357 | fn e(sem: &Bound<'_, PyAny>) -> PyResult { 358 | let sem: PyRef = sem.extract()?; 359 | Ok(PyFloat { 360 | inner: Float::e(sem.inner), 361 | }) 362 | } 363 | 364 | /// Returns the natural logarithm of 2 (ln(2)) with the given semantics. 365 | /// 366 | /// Args: 367 | /// sem: The semantics to use for representing ln(2) 368 | #[pyfunction] 369 | fn ln2(sem: &Bound<'_, PyAny>) -> PyResult { 370 | let sem: PyRef = sem.extract()?; 371 | Ok(PyFloat { 372 | inner: Float::ln2(sem.inner), 373 | }) 374 | } 375 | 376 | /// Returns the number zero with the given semantics. 
377 | /// 378 | /// Args: 379 | /// sem: The semantics to use for representing e 380 | #[pyfunction] 381 | fn zero(sem: &Bound<'_, PyAny>) -> PyResult { 382 | let sem: PyRef = sem.extract()?; 383 | Ok(PyFloat { 384 | inner: Float::zero(sem.inner, false), 385 | }) 386 | } 387 | 388 | /// Returns a new float with the integer value 'val' with the given semantics. 389 | /// 390 | /// Args: 391 | /// sem: The semantics to use 392 | /// val: The integer value 393 | #[pyfunction] 394 | fn from_i64(sem: &Bound<'_, PyAny>, val: i64) -> PyResult { 395 | let sem: PyRef = sem.extract()?; 396 | Ok(PyFloat { 397 | inner: Float::from_i64(sem.inner, val), 398 | }) 399 | } 400 | 401 | /// Returns a new float with the fp64 value 'val'. 402 | /// 403 | /// Args: 404 | /// val: The f64 value 405 | #[pyfunction] 406 | fn from_fp64(val: f64) -> PyResult { 407 | Ok(PyFloat { 408 | inner: Float::from_f64(val), 409 | }) 410 | } 411 | 412 | #[pymodule] 413 | fn _arpfloat(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 414 | m.add_class::()?; 415 | m.add_class::()?; 416 | 417 | // Add the functions to the module 418 | m.add_function(wrap_pyfunction!(pi, m)?)?; 419 | m.add_function(wrap_pyfunction!(e, m)?)?; 420 | m.add_function(wrap_pyfunction!(ln2, m)?)?; 421 | m.add_function(wrap_pyfunction!(zero, m)?)?; 422 | m.add_function(wrap_pyfunction!(fma, m)?)?; 423 | m.add_function(wrap_pyfunction!(from_i64, m)?)?; 424 | m.add_function(wrap_pyfunction!(from_fp64, m)?)?; 425 | Ok(()) 426 | } 427 | -------------------------------------------------------------------------------- /src/string.rs: -------------------------------------------------------------------------------- 1 | //! This module contains the implementation of string conversion. 
2 | 3 | extern crate alloc; 4 | 5 | use super::bigint::BigInt; 6 | use super::float::Float; 7 | use super::RoundingMode; 8 | use super::Semantics; 9 | use alloc::string::{String, ToString}; 10 | use alloc::vec::Vec; 11 | use core::cmp::Ordering; 12 | use core::fmt::Display; 13 | 14 | impl Float { 15 | /// Convert the number into a large integer, and a base-10 exponent. 16 | fn convert_to_integer(&self) -> (BigInt, i64) { 17 | // The natural representation of numbers is 1.mmmmmmm, where the 18 | // mantissa is aligned to the MSB. In this method we convert the numbers 19 | // into integers, that start at bit zero, so we use an exponent that refers 20 | // to bit zero. 21 | // See Ryu: Fast Float-to-String Conversion -- Ulf Adams. 22 | // https://youtu.be/kw-U6smcLzk?t=681 23 | let mut exp = self.get_exp() - self.get_mantissa_len() as i64; 24 | let mut mantissa: BigInt = self.get_mantissa(); 25 | 26 | match exp.cmp(&0) { 27 | Ordering::Less => { 28 | // The number is not yet an integer, we need to convert it using 29 | // the method: 30 | // mmmmm * 2^(-e) == (mmmmm * 5^e) * 10^(-e); 31 | // where (5^e) * (10^-e) == (2^-e) 32 | // And the left hand side is how we represent our binary number 33 | // 1.mmmm * 2^-e, and the right-hand-side is how we represent 34 | // our decimal number: nnnnnnn * 10^-e. 35 | let five = BigInt::from_u64(5); 36 | let e5 = five.powi((-exp) as u64); 37 | mantissa.inplace_mul(&e5); 38 | exp = -exp; 39 | } 40 | Ordering::Equal | Ordering::Greater => { 41 | // The number is already an integer, just align it. 42 | // In this case, E - M >= 0, so we are aligning the larger 43 | // integers, for example [1.mmmm * 2^15], in FP16 (where M=10). 44 | mantissa.shift_left(exp as usize); 45 | exp = 0; 46 | } 47 | } 48 | 49 | (mantissa, exp) 50 | } 51 | 52 | /// Returns the highest number of decimal digits that are needed for 53 | /// representing this type accurately. 54 | pub fn get_decimal_accuracy(&self) -> usize { 55 | // Matula, David W.
“A Formalization of Floating-Point Numeric Base Conversion.” 56 | // N = 2 + floor(n / log_b(B)) = 2 + floor(n / log(10, 2)) 57 | // We convert from bits to base-10 digits: log(2)/log(10) ==> 59/196. 58 | // A continued fraction expansion with 5 iterations gives the ratio. 59 | 2 + (self.get_mantissa_len() * 59) / 196 60 | } 61 | 62 | /// Reduce a number in the representation mmmmm * 10^(-e), to fewer bits in 63 | /// 'm', based on the max possible digits in the mantissa. 64 | fn reduce_printed_integer_length( 65 | &self, 66 | integer: &mut BigInt, 67 | exp: &mut i64, 68 | ) { 69 | let bits = integer.msb_index(); 70 | if bits <= self.get_mantissa_len() { 71 | return; 72 | }; 73 | let needed_bits = bits - self.get_mantissa_len(); 74 | // We convert from bits to base-10 digits: log(2)/log(10) ==> 59/196. 75 | // A continued fraction expansion with 5 iterations gives the ratio. 76 | let mut digits_to_remove = ((needed_bits * 59) / 196) as i64; 77 | 78 | // Only remove digits after the decimal point. 79 | if digits_to_remove > *exp { 80 | digits_to_remove = *exp; 81 | } 82 | *exp -= digits_to_remove; 83 | let ten = BigInt::from_u64(10); 84 | let divisor = ten.powi(digits_to_remove as u64); 85 | integer.inplace_div(&divisor); 86 | } 87 | 88 | fn convert_normal_to_string(&self) -> String { 89 | // Convert the integer to base-10 integer, and e, the exponent in 90 | // base 10 (scientific notation). 91 | let (mut integer, mut e) = self.convert_to_integer(); 92 | 93 | // Try to shorten the number. 94 | self.reduce_printed_integer_length(&mut integer, &mut e); 95 | 96 | // Extract the digits: Div10-Mod10-Div10-Mod10 .... 97 | let mut buff = Vec::new(); 98 | let digits = integer.to_digits::<10>(); 99 | for d in digits { 100 | buff.push(char::from_digit(d as u32, 10).unwrap()) 101 | } 102 | 103 | debug_assert!(e >= 0); 104 | // Add the leading zeros, and make room to place the point.
105 | while buff.len() < e as usize { 106 | buff.insert(0, '0'); 107 | } 108 | 109 | buff.insert(buff.len() - e as usize, '.'); 110 | while !buff.is_empty() && buff[buff.len() - 1] == '0' { 111 | buff.pop(); 112 | } 113 | String::from_iter(buff) 114 | } 115 | 116 | /// Convert the number to a string. This is a simple implementation 117 | /// that does not take into account rounding during the round-trip of 118 | /// parsing-printing of the value, or scientific notation, and the minimal 119 | /// representation of numbers. For all of that, check out the paper: 120 | /// "How to Print Floating-Point Numbers Accurately" by Steele and White. 121 | fn convert_to_string(&self) -> String { 122 | // In order to print decimal digits we need a minimum number of mantissa 123 | // bits for the conversion. Small floats (such as BF16) don't have 124 | // enough bits, so we cast to a larger type (FP32) and print that. 125 | if self.get_semantics().get_mantissa_len() < 16 { 126 | use crate::FP32; 127 | return self.cast(FP32).to_string(); 128 | } 129 | 130 | let result = if self.get_sign() { "-" } else { "" }; 131 | let mut result: String = result.to_string(); 132 | 133 | let body: String = match self.get_category() { 134 | super::float::Category::Infinity => "Inf".to_string(), 135 | super::float::Category::NaN => "NaN".to_string(), 136 | super::float::Category::Normal => self.convert_normal_to_string(), 137 | super::float::Category::Zero => "0.0".to_string(), 138 | }; 139 | 140 | result.push_str(&body); 141 | result 142 | } 143 | } 144 | impl Display for Float { 145 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 146 | write!(f, "{}", self.convert_to_string()) 147 | } 148 | } 149 | 150 | impl Display for BigInt { 151 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 152 | write!(f, "{}", self.as_binary()) 153 | } 154 | } 155 | 156 | impl RoundingMode { 157 | pub fn as_string(&self) -> &str { 158 | match self { 159 | RoundingMode::None => "None", 160
| RoundingMode::NearestTiesToEven => "NearestTiesToEven", 161 | RoundingMode::NearestTiesToAway => "NearestTiesToAway", 162 | RoundingMode::Zero => "Zero", 163 | RoundingMode::Positive => "Positive", 164 | RoundingMode::Negative => "Negative", 165 | } 166 | } 167 | } 168 | 169 | impl Display for Semantics { 170 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 171 | write!( 172 | f, 173 | "(exponent:{} precision:{} rm:{})", 174 | self.get_exponent_len(), 175 | self.get_precision(), 176 | self.get_rounding_mode().as_string() 177 | ) 178 | } 179 | } 180 | 181 | #[cfg(feature = "std")] 182 | mod from { 183 | use core::fmt::{Debug, Display}; 184 | use std::error::Error; 185 | 186 | use crate::{BigInt, Float, Semantics, FP64}; 187 | 188 | impl Float { 189 | /// Try to construct a Float instance with semantics 'sem' from the 190 | /// string 'value'. Note that the operation of conversion might lose 191 | /// precision. If you care about precision you might want to use a 192 | /// higher precision float and downcast. 193 | pub fn try_from_str( 194 | value: &str, 195 | sem: Semantics, 196 | ) -> Result { 197 | // Handle the empty case. 198 | if value.is_empty() { 199 | return Err(ParseError(ParseErrorKind::InputEmpty)); 200 | } 201 | 202 | // Handle the plus or minus in front of the number. 203 | let chars = value.as_bytes(); 204 | let (sign, skip) = if chars[0] == b'-' || chars[0] == b'+' { 205 | (chars[0] == b'-', 1) 206 | } else { 207 | (false, 0) 208 | }; 209 | let value = &value[skip..]; 210 | 211 | // Handle Nan. 212 | if value.eq_ignore_ascii_case("nan") { 213 | return Ok(Self::nan(sem, sign)); 214 | } 215 | 216 | // Handle Inf. 217 | if value.eq_ignore_ascii_case("inf") { 218 | return Ok(Self::inf(sem, sign)); 219 | } 220 | 221 | // Start handling the non trivial cases. 222 | 223 | let l_r = value.split_once('.'); 224 | // Handle cases where we have no `.` and as such no mantissa. 225 | if l_r.is_none() { 226 | // No period. 
Just parse the integer. 227 | let ((num, _), exp_num) = parse_with_exp(value)?; 228 | let mut num = Float::from_bigint(sem, num); 229 | 230 | // Shift the number according to the exponent (in decimal). 231 | if let Some(exp) = exp_num { 232 | if exp >= 0 { 233 | num *= Float::from_bigint( 234 | sem, 235 | BigInt::from_u64(10).powi(exp as u64), 236 | ); 237 | } else { 238 | num /= Float::from_bigint( 239 | sem, 240 | BigInt::from_u64(10).powi((-exp) as u64), 241 | ); 242 | } 243 | } 244 | num.set_sign(sign); 245 | return Ok(num); 246 | } 247 | 248 | // Handle cases where we have a period in the number: 249 | let (left, right) = l_r.unwrap(); 250 | 251 | // Try parsing decimal value of 0. 252 | if right.chars().all(|chr| chr == '0') { 253 | return parse_whole_num(left, sign, sem).map(Ok).unwrap_or( 254 | Err(ParseError(ParseErrorKind::ParsingNumberFailed)), 255 | ); 256 | } 257 | 258 | // Parse the integer part. 259 | let left_num = parse_big_int(left).map(Ok).unwrap_or(Err( 260 | ParseError(ParseErrorKind::ParsingNumberFailed), 261 | ))?; 262 | 263 | // Parse the mantissa and an optional exponent part 264 | let ((right_num, right_num_digits), explicit_exp) = 265 | parse_with_exp(right)?; 266 | 267 | // Construct the integral and fractional parts, without the exp. 268 | // This is one of the places where we might lose precision. 269 | let dec_shift = BigInt::from_u64(10).powi(right_num_digits as u64); 270 | 271 | let integral = Float::from_bigint(sem, left_num); 272 | let fraction = Float::from_bigint(sem, right_num) 273 | / Float::from_bigint(sem, dec_shift); 274 | 275 | // Construct the whole number, move the fractional part into place. 276 | let mut ret = integral + fraction; 277 | 278 | // Handle the explicit exponent. (Example: e+1). 
279 | if let Some(exp_num) = explicit_exp { 280 | if exp_num >= 0 { 281 | let e = BigInt::from_u64(10).powi(exp_num as u64); 282 | ret *= Float::from_bigint(sem, e) 283 | } else { 284 | let e = BigInt::from_u64(10).powi((-exp_num) as u64); 285 | ret /= Float::from_bigint(sem, e) 286 | } 287 | } 288 | ret.set_sign(sign); 289 | Ok(ret) 290 | } 291 | } 292 | 293 | impl TryFrom<&str> for Float { 294 | type Error = ParseError; 295 | 296 | fn try_from(value: &str) -> Result { 297 | const DEFAULT_SEM: Semantics = FP64; 298 | // TODO: autodetect required semantics 299 | Self::try_from_str(value, DEFAULT_SEM) 300 | } 301 | } 302 | 303 | /// Parse a number that may contain an 'e' or 'E' exponent marker. 304 | /// Example: 565e+1 305 | /// Returns the number, the number of decimal digits, and an optional 306 | /// exponent value. 307 | fn parse_with_exp( 308 | value: &str, 309 | ) -> Result<((BigInt, usize), Option), ParseError> { 310 | let idx = value.find(['e', 'E']); 311 | // Split the number into the digits and the exponent. 312 | let (num_raw, exp) = if let Some(idx) = idx { 313 | let (l, r) = value.split_at(idx); 314 | (l, Some(&r[1..])) 315 | } else { 316 | (value, None) 317 | }; 318 | 319 | // Parse the left side of the expression (the number). 320 | let num = parse_big_int(num_raw) 321 | .map(|num| Ok((num, num_raw.len()))) 322 | .unwrap_or(Err(ParseError(ParseErrorKind::ParsingNumberFailed)))?; 323 | 324 | // Parse the right side (the exponent expression). 325 | if let Some(exp) = exp { 326 | match exp.parse::() { 327 | Ok(exp) => { 328 | // Found a valid expression that has exponent. 329 | return Ok((num, Some(exp))); 330 | } 331 | Err(_) => { 332 | return Err(ParseError(ParseErrorKind::ExponentParseFailed)) 333 | } 334 | } 335 | } 336 | // Found a valid number without exponent marker. 337 | Ok((num, None)) 338 | } 339 | 340 | /// Try to parse a number from the string 'value'.
341 | fn parse_big_int(value: &str) -> Option { 342 | let chars = value.as_bytes(); 343 | let ten = BigInt::from_u64(10); 344 | let mut num = BigInt::from_u64(0); 345 | for digit in chars.iter() { 346 | if *digit > b'9' || *digit < b'0' { 347 | return None; 348 | } 349 | let part = [*digit as u64 - '0' as u64]; 350 | num.inplace_mul(&ten); 351 | num.inplace_add_slice(&part); 352 | } 353 | Some(num) 354 | } 355 | 356 | /// Parse one long integer and apply a sign. 357 | fn parse_whole_num( 358 | value: &str, 359 | sign: bool, 360 | sem: Semantics, 361 | ) -> Option { 362 | let chars = value.as_bytes(); 363 | // Handle the special case of '0'. 364 | if value.len() == 1 && chars[0] == b'0' { 365 | return Some(Float::zero(sem, sign)); 366 | } 367 | 368 | // Parse the digits. 369 | let num = parse_big_int(value)?; 370 | // And construct the Float number. 371 | let mut ret = Float::from_bigint(sem, num); 372 | ret.set_sign(sign); 373 | 374 | Some(ret) 375 | } 376 | 377 | enum ParseErrorKind { 378 | InputEmpty, 379 | ParsingNumberFailed, 380 | ExponentParseFailed, 381 | } 382 | 383 | pub struct ParseError(ParseErrorKind); 384 | 385 | impl Error for ParseError {} 386 | 387 | impl Display for ParseError { 388 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 389 | match self.0 { 390 | ParseErrorKind::ParsingNumberFailed => f.write_str( 391 | "Failed parsing number part of floating point number", 392 | ), 393 | ParseErrorKind::ExponentParseFailed => { 394 | f.write_str("Failed parsing exponent of float number") 395 | } 396 | ParseErrorKind::InputEmpty => { 397 | f.write_str("The input provided was empty") 398 | } 399 | } 400 | } 401 | } 402 | 403 | impl Debug for ParseError { 404 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 405 | Display::fmt(&self, f) 406 | } 407 | } 408 | } 409 | 410 | #[cfg(feature = "std")] 411 | #[test] 412 | fn test_convert_to_string() { 413 | use crate::FP16; 414 | use crate::FP64; 415 | use core::f64; 
416 | use std::format; 417 | 418 | fn to_str_w_fp16(val: f64) -> String { 419 | format!("{}", Float::from_f64(val).cast(FP16)) 420 | } 421 | 422 | fn to_str_w_bf16(val: f64) -> String { 423 | use crate::BF16; 424 | format!("{}", Float::from_f64(val).cast(BF16)) 425 | } 426 | 427 | fn to_str_w_fp64(val: f64) -> String { 428 | format!("{}", Float::from_f64(val).cast(FP64)) 429 | } 430 | 431 | assert_eq!("-0.0", to_str_w_fp16(-0.)); 432 | assert_eq!(".30004882", to_str_w_fp16(0.3)); 433 | assert_eq!("4.5", to_str_w_fp16(4.5)); 434 | assert_eq!("256.", to_str_w_fp16(256.)); 435 | assert_eq!("Inf", to_str_w_fp16(65534.)); 436 | assert_eq!("-Inf", to_str_w_fp16(-65534.)); 437 | assert_eq!(".09997558", to_str_w_fp16(0.1)); 438 | assert_eq!(".1", to_str_w_fp64(0.1)); 439 | assert_eq!(".29999999999999998", to_str_w_fp64(0.3)); 440 | assert_eq!("2251799813685248.", to_str_w_fp64((1u64 << 51) as f64)); 441 | assert_eq!("1995.1994999999999", to_str_w_fp64(1995.1995)); 442 | assert_eq!("3.140625", to_str_w_bf16(f64::consts::PI)); 443 | } 444 | 445 | #[cfg(feature = "std")] 446 | #[test] 447 | fn test_from_string() { 448 | assert_eq!("-3.", Float::try_from("-3.0").unwrap().to_string()); 449 | assert_eq!("-3.", Float::try_from("-3.00").unwrap().to_string()); 450 | assert_eq!("30.", Float::try_from("30").unwrap().to_string()); 451 | assert_eq!("430.56", Float::try_from("430.56").unwrap().to_string()); 452 | assert_eq!("5.2", Float::try_from("5.2").unwrap().to_string()); 453 | assert_eq!("Inf", Float::try_from("inf").unwrap().to_string()); 454 | assert_eq!("NaN", Float::try_from("nan").unwrap().to_string()); 455 | assert_eq!("32.", Float::try_from("3.2e1").unwrap().to_string()); 456 | assert_eq!("4.4", Float::try_from("44.e-1").unwrap().to_string()); 457 | assert_eq!("5.4", Float::try_from("54e-1").unwrap().to_string()); 458 | assert_eq!("-5.485", Float::try_from("-54.85e-1").unwrap().to_string()); 459 | assert!(Float::try_from("abc.de").is_err()); 460 | 
assert!(Float::try_from("e.-21").is_err()); 461 | assert!(Float::try_from("-rlp.").is_err()); 462 | assert!(Float::try_from("").is_err()); 463 | } 464 | 465 | #[test] 466 | fn test_fuzz_printing() { 467 | use crate::utils; 468 | 469 | let mut lfsr = utils::Lfsr::new(); 470 | 471 | for _ in 0..500 { 472 | let v0 = lfsr.get64(); 473 | let f0 = f64::from_bits(v0); 474 | let fp0 = Float::from_f64(f0); 475 | fp0.to_string(); 476 | } 477 | } 478 | 479 | #[cfg(feature = "std")] 480 | #[test] 481 | fn test_print_sqrt() { 482 | use crate::FP64; 483 | use std::println; 484 | 485 | // Use Newton-Raphson to find the square root of 5. 486 | let n = Float::from_u64(FP64, 5); 487 | 488 | let mut x = n.clone(); 489 | 490 | for _ in 0..100 { 491 | x = (&x + (&n / &x)) / 2; 492 | } 493 | println!("{}", x); 494 | } 495 | 496 | #[test] 497 | #[cfg(feature = "std")] 498 | fn test_readme_example() { 499 | use std::println; 500 | // Create a new type: 15 bits exponent, 112 significand. 501 | 502 | // Use Newton-Raphson to find the square root of 5. 503 | let n = Float::from_u64(FP128, 5); 504 | let mut x = n.clone(); 505 | 506 | for _ in 0..1000 { 507 | x = (&x + &n / &x) / 2; 508 | } 509 | println!("fp128: {}", x); 510 | println!("fp64: {}", x.as_f64()); 511 | 512 | use crate::{FP128, FP16}; 513 | let fp = Float::from_i64(FP16, 15); 514 | fp.dump(); 515 | } 516 | 517 | #[test] 518 | fn test_decimal_accuracy_for_type() { 519 | use crate::{FP128, FP16, FP256, FP32, FP64}; 520 | assert_eq!(Float::zero(FP16, false).get_decimal_accuracy(), 5); 521 | assert_eq!(Float::zero(FP32, false).get_decimal_accuracy(), 8); 522 | assert_eq!(Float::zero(FP64, false).get_decimal_accuracy(), 17); 523 | assert_eq!(Float::zero(FP128, false).get_decimal_accuracy(), 35); 524 | assert_eq!(Float::zero(FP256, false).get_decimal_accuracy(), 73); 525 | } 526 | 527 | impl BigInt { 528 | /// Prints the bigint as a decimal number. 
529 | pub fn as_decimal(&self) -> String { 530 | if self.is_zero() { 531 | return "0".to_string(); 532 | } 533 | 534 | let mut buff = Vec::new(); 535 | let digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']; 536 | let ten = Self::from_u64(10); 537 | let mut val = self.clone(); 538 | while !val.is_zero() { 539 | let rem = val.inplace_div(&ten); 540 | buff.insert(0, digits[rem.as_u64() as usize]); 541 | } 542 | 543 | String::from_iter(buff) 544 | } 545 | /// Prints the bigint as a sequence of bits. 546 | pub fn as_binary(&self) -> String { 547 | let mut sb = String::new(); 548 | 549 | if self.is_empty() || self.is_zero() { 550 | return String::from("0"); 551 | } 552 | let mut top_non_zero = 0; 553 | for i in (0..self.len()).rev() { 554 | if self.get_part(i) != 0 { 555 | top_non_zero = i; 556 | break; 557 | } 558 | } 559 | 560 | for i in 0..=top_non_zero { 561 | let mut part = self.get_part(i); 562 | // Don't print leading zeros for the first word. 563 | if i == top_non_zero { 564 | while part > 0 { 565 | let last = if part & 0x1 == 1 { '1' } else { '0' }; 566 | sb.insert(0, last); 567 | part /= 2; 568 | } 569 | continue; 570 | } 571 | 572 | // Print leading zeros for the rest of the words. 
573 | for _ in 0..64 { 574 | let last = if part & 0x1 == 1 { '1' } else { '0' }; 575 | sb.insert(0, last); 576 | part /= 2; 577 | } 578 | } 579 | if sb.is_empty() { 580 | sb.push('0'); 581 | } 582 | sb 583 | } 584 | } 585 | 586 | #[cfg(feature = "std")] 587 | #[test] 588 | fn test_bigint_to_string() { 589 | let val = 0b101110011010011111010101011110000000101011110101; 590 | let mut bi = BigInt::from_u64(val); 591 | bi.shift_left(32); 592 | assert_eq!( 593 | bi.as_binary(), 594 | "10111001101001111101010101111000\ 595 | 000010101111010100000000000000000\ 596 | 000000000000000" 597 | ); 598 | 599 | let mut bi = BigInt::from_u64(val); 600 | bi.shift_left(64); 601 | bi = bi + val; 602 | assert_eq!( 603 | bi.as_binary(), 604 | "101110011010011111010101011110000000101011110101\ 605 | 0000000000000000\ 606 | 101110011010011111010101011110000000101011110101" 607 | ); 608 | } 609 | 610 | #[cfg(feature = "std")] 611 | #[test] 612 | fn test_bigint_to_decimal() { 613 | let mut num = BigInt::one(); 614 | for i in 1..41 { 615 | let term = BigInt::from_u64(i); 616 | num.inplace_mul(&term); 617 | } 618 | 619 | assert_eq!( 620 | num.as_decimal(), 621 | "815915283247897734345611269596115894272000000000" 622 | ); 623 | } 624 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | //! This file contains simple helper functions and test helpers. 2 | 3 | /// Returns a mask full of 1s, of `b` bits. 4 | pub fn mask(b: usize) -> usize { 5 | (1 << (b)) - 1 6 | } 7 | 8 | #[test] 9 | fn test_masking() { 10 | assert_eq!(mask(0), 0x0); 11 | assert_eq!(mask(1), 0x1); 12 | assert_eq!(mask(8), 255); 13 | } 14 | 15 | #[cfg(feature = "std")] 16 | #[allow(dead_code)] 17 | /// Returns list of interesting values that various tests use to catch edge cases. 
pub fn get_special_test_values() -> [f64; 22] {
    [
        // NaN with both signs, and both infinities.
        -f64::NAN,
        f64::NAN,
        f64::INFINITY,
        f64::NEG_INFINITY,
        // Very small magnitudes.
        f64::EPSILON,
        -f64::EPSILON,
        0.000000000000000000000000000000000000001,
        // The extremes of the finite range.
        f64::MIN,
        f64::MAX,
        // Well-known irrational constants.
        std::f64::consts::PI,
        std::f64::consts::LN_2,
        std::f64::consts::SQRT_2,
        std::f64::consts::E,
        // Signed zeros and ordinary values.
        0.0,
        -0.0,
        10.,
        -10.,
        -0.00001,
        0.1,
        355. / 113.,
        -1.0,
        -1.1,
    ]
}

/// Linear-feedback shift register. We use this as a random number generator
/// for tests.
pub struct Lfsr {
    state: u32,
}

impl Default for Lfsr {
    /// Same as [`Lfsr::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Lfsr {
    /// Generate a new LFSR number generator.
    pub fn new() -> Lfsr {
        Lfsr { state: 0x13371337 }
    }

    /// Generate a new LFSR number generator that starts with a specific state.
    ///
    /// The seed is XOR-ed into the default state, so a seed of zero behaves
    /// exactly like [`Lfsr::new`].
    ///
    /// Note: a state whose low 25 bits are all ones is a fixed point of
    /// [`Lfsr::next`] (every tap is 1, so the feedback bit is 1 again); a seed
    /// that produces such a state yields a constant stream.
    pub fn new_with_seed(seed: u32) -> Lfsr {
        Lfsr {
            state: 0x13371337 ^ seed,
        }
    }

    /// Advance the register by a single bit.
    ///
    /// This inherent method takes precedence over [`Iterator::next`] when
    /// called with method syntax (`lfsr.next()`); use `Iterator::next(&mut
    /// lfsr)` or an iterator adaptor to pull 64-bit values instead.
    pub fn next(&mut self) {
        // Feedback taps.
        let a = (self.state >> 24) & 1;
        let b = (self.state >> 23) & 1;
        let c = (self.state >> 22) & 1;
        let d = (self.state >> 17) & 1;
        // XNOR feedback: the trailing `^ 1` inverts the XOR of the taps.
        let n = a ^ b ^ c ^ d ^ 1;
        self.state <<= 1;
        self.state |= n;
    }

    /// Produce 32 bits, one register step per bit, most significant first.
    fn get(&mut self) -> u32 {
        let mut res: u32 = 0;
        for _ in 0..32 {
            // Resolves to the inherent single-bit step, not `Iterator::next`.
            self.next();
            res <<= 1;
            res ^= self.state & 0x1;
        }
        res
    }

    /// Produce 64 bits: the first 32-bit draw forms the high half, the second
    /// draw forms the low half.
    pub fn get64(&mut self) -> u64 {
        ((self.get() as u64) << 32) | self.get() as u64
    }
}

// Implement `Iterator` for `Lfsr`.
impl Iterator for Lfsr {
    type Item = u64;

    /// Yields the next 64-bit value; the stream never ends.
    fn next(&mut self) -> Option<Self::Item> {
        Some(self.get64())
    }
}

#[test]
fn test_lfsr_balance() {
    let mut lfsr = Lfsr::new();

    // Count the number of items, and the number of 1s.
    let mut items = 0;
    let mut ones = 0;

    for _ in 0..10000 {
        let mut u = lfsr.get();
        for _ in 0..32 {
            items += 1;
            ones += u & 1;
            u >>= 1;
        }
    }
    // Make sure that we have around 50% 1s and 50% zeros.
    assert!((ones as f64) < (0.55 * items as f64));
    assert!((ones as f64) > (0.45 * items as f64));
}

#[test]
fn test_repetition() {
    let mut lfsr = Lfsr::new();
    let first = lfsr.get();
    let second = lfsr.get();

    // Make sure that the items don't repeat themselves too frequently.
    for _ in 0..30000 {
        assert_ne!(first, lfsr.get());
        assert_ne!(second, lfsr.get());
    }
}

/// Multiply a and b, and return the (low, high) 64-bit parts of the 128-bit
/// product, using only 64-bit arithmetic on 32-bit halves.
#[allow(dead_code)]
fn mul_part(a: u64, b: u64) -> (u64, u64) {
    let half_bits = u64::BITS / 2;
    let half_mask = (1 << half_bits) - 1;

    let a_lo = a & half_mask;
    let a_hi = a >> half_bits;
    let b_lo = b & half_mask;
    let b_hi = b >> half_bits;

    // Four partial products; each fits in 64 bits because every operand is
    // below 2^32.
    let ab_hi = a_hi * b_hi;
    let ab_mid = a_hi * b_lo;
    let ba_mid = b_hi * a_lo;
    let ab_low = a_lo * b_lo;

    // Carry out of the low word: add everything that lands in bits 32..64
    // and keep what spills past bit 64.
    let carry =
        ((ab_mid & half_mask) + (ba_mid & half_mask) + (ab_low >> half_bits))
            >> half_bits;
    // The low word is the sum modulo 2^64, so wrapping is intended here.
    let low = (ab_mid << half_bits)
        .wrapping_add(ba_mid << half_bits)
        .wrapping_add(ab_low);

    let high = (ab_hi + (ab_mid >> half_bits) + (ba_mid >> half_bits)) + carry;
    (low, high)
}

#[test]
fn test_mul_parts() {
    let mut lfsr = Lfsr::new();

    for _ in 0..500 {
        let v0 = lfsr.get64();
        let v1 = lfsr.get64();
        let res = mul_part(v0, v1);
        let full = v0 as u128 * v1 as u128;
        assert_eq!(full as u64, res.0);
        assert_eq!((full >> 64) as u64, res.1);
    }
}
--------------------------------------------------------------------------------