├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── arpfloat
    └── __init__.py
├── benches
    └── main_benchmark.rs
├── examples
    ├── calc_pi.rs
    ├── fma.py
    ├── print_e.rs
    └── softmax.py
├── pyproject.toml
├── requirements.txt
├── rustfmt.toml
├── setup.py
└── src
    ├── arithmetic.rs
    ├── bigint.rs
    ├── cast.rs
    ├── float.rs
    ├── lib.rs
    ├── operations
        ├── constants.rs
        ├── exp.rs
        ├── frac.rs
        ├── functions.rs
        ├── mod.rs
        └── trig.rs
    ├── py.rs
    ├── string.rs
    └── utils.rs


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - master
 5 |   pull_request:
 6 |     branches:
 7 |       - '**'
 8 | 
 9 | name: CI
10 | 
11 | jobs:
12 |   audit:
13 |     name: Audit
14 |     runs-on: ubuntu-latest
15 |     timeout-minutes: 10
16 |     steps:
17 |       - uses: actions/checkout@v1
18 |       - uses: actions-rs/audit-check@v1
19 |         with:
20 |           token: ${{ secrets.GITHUB_TOKEN }}
21 | 
22 |   fmt:
23 |     name: Rustfmt
24 |     runs-on: ubuntu-latest
25 |     timeout-minutes: 10
26 |     steps:
27 |       - uses: actions/checkout@v2
28 |       - uses: actions-rs/toolchain@v1
29 |         with:
30 |           profile: minimal
31 |           toolchain: nightly
32 |           override: true
33 |           components: rustfmt
34 | 
35 |       - uses: actions-rs/cargo@v1
36 |         with:
37 |           command: fmt
38 |           args: --all -- --check
39 | 
40 |   build_and_test_linux:
41 |     name: Build and Test (Linux)
42 |     runs-on: ubuntu-latest
43 |     timeout-minutes: 10
44 |     steps:
45 |       - uses: actions/checkout@v2
46 |       - uses: actions-rs/toolchain@v1
47 |         with:
48 |           profile: minimal
49 |           toolchain: stable
50 |           override: true
51 | 
52 |       - uses: actions-rs/cargo@v1
53 |         with:
54 |           command: test
55 |           args: --workspace
56 | 
57 |   build_and_test_windows:
58 |     name: Build and Test (Windows)
59 |     runs-on: windows-latest
60 |     timeout-minutes: 10
61 |     steps:
62 |       - name: Prepare symlink configuration
63 |         run: git config --global core.symlinks true
64 | 
65 |       - uses: actions/checkout@v2
66 |       - uses: actions-rs/toolchain@v1
67 |         with:
68 |           profile: minimal
69 |           toolchain: stable
70 |           override: true
71 | 
72 |       - uses: actions-rs/cargo@v1
73 |         with:
74 |           command: test
75 |           args: --workspace
76 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | target/
 4 | 
 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 7 | Cargo.lock
 8 | 
 9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 | 
12 | .vscode
13 | 
14 | .env/
15 | 
16 | *.egg-info
17 | __pycache__
18 | *.so
19 | 
20 | build/
21 | 
22 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "arpfloat"
 3 | version = "0.1.11"
 4 | authors = ["Nadav Rotem <nadav256@gmail.com>"]
 5 | categories = ["mathematics", "algorithms", "no-std"]
 6 | description = "Arbitrary-precision floating point library"
 7 | documentation = "https://docs.rs/arpfloat/"
 8 | edition = "2021"
 9 | keywords = ["float"]
10 | license = "Apache-2.0"
11 | readme = "README.md"
12 | repository = "https://github.com/nadavrot/arpfloat"
13 | 
14 | [dependencies]
15 | pyo3 = { version = "0.24.1", optional = true }
16 | 
17 | [dev-dependencies]
18 | criterion = "0.5"
19 | 
20 | [[bench]]
21 | name = "main_benchmark"
22 | harness = false
23 | 
24 | [features]
25 | default = ["std", "python"]
26 | std = []
27 | python=["pyo3", "std"]
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Arbitrary-Precision Floating-Point Library &emsp; 
  3 | [![Latest Version]][crates.io] [![Docs Badge]][docs]
  4 | 
  5 | [Latest Version]: https://img.shields.io/crates/v/arpfloat.svg
  6 | [crates.io]: https://crates.io/crates/arpfloat
  7 | [Docs Badge]: https://docs.rs/arpfloat/badge.svg
  8 | [docs]: https://docs.rs/arpfloat
  9 | 
 10 | ARPFloat is an implementation of arbitrary precision
 11 | [floating point](https://en.wikipedia.org/wiki/IEEE_754) data
 12 | structures and utilities. The library can be used to emulate existing floating
 13 | point types, such as FP16, and create new floating point types. Floating point
 14 | types can scale to hundreds of digits, and perform very accurate calculations.
 15 | In ARPFloat the rounding mode is a part of the type-system, and this defines
 16 | away a number of problems that show up when using fenv.h.
 17 | 
 18 | `no_std` environments are supported by disabling the `std` feature. 
 19 | `python` bindings are supported by enabling the `python` feature.
 20 | 
 21 | ### Example
 22 | ```rust
 23 |   use arpfloat::Float;
 24 |   use arpfloat::FP128;
 25 | 
 26 |   // Create the number '5' in FP128 format.
 27 |   let n = Float::from_f64(5.).cast(FP128);
 28 | 
 29 |   // Use Newton-Raphson to find the square root of 5.
 30 |   let mut x = n.clone();
 31 |   for _ in 0..20 {
 32 |       x = (x.clone() + (&n / &x))/2;
 33 |   }
 34 | 
 35 |   println!("fp128: {}", x);
 36 |   println!("fp64:  {}", x.as_f64());
 37 |  ```
 38 | 
 39 | 
 40 | The program above will print this output:
 41 | ```console
 42 | fp128: 2.2360679774997896964091736687312763
 43 | fp64:  2.23606797749979
 44 | ```
 45 | 
 46 | The library also provides an API that exposes rounding modes and low-level
 47 | operations.
 48 | 
 49 | ```rust
 50 |     use arpfloat::FP128;
 51 |     use arpfloat::RoundingMode::NearestTiesToEven;
 52 |     use arpfloat::Float;
 53 | 
 54 |     let x = Float::from_u64(FP128, 1<<53);
 55 |     let y = Float::from_f64(1000.0).cast(FP128);
 56 | 
 57 |     let val = Float::mul_with_rm(&x, &y, NearestTiesToEven);
 58 |  ```
 59 | 
 60 |  View the internal representation of numbers:
 61 | 
 62 |  ```rust
 63 |     use arpfloat::Float;
 64 |     use arpfloat::FP16;
 65 | 
 66 |     let fp = Float::from_i64(FP16, 15);
 67 | 
 68 |     fp.dump(); // Prints FP[+ E=+3 M=11110000000]
 69 | 
 70 |     let m = fp.get_mantissa();
 71 |      m.dump(); // Prints 11110000000
 72 | ```
 73 | 
 74 |  Control the rounding mode for type conversion:
 75 | 
 76 | ```rust
 77 |     use arpfloat::{FP16, FP32, RoundingMode, Float};
 78 | 
 79 |     let x = Float::from_u64(FP32, 2649);
 80 |     let b = x.cast_with_rm(FP16, RoundingMode::Zero);
 81 |     println!("{}", b); // Prints 2648!
 82 | ```
 83 | 
 84 |  Define new float formats and use high-precision transcendental functions:
 85 | 
 86 | ```rust
 87 |   use arpfloat::{Float, RoundingMode, Semantics};
 88 |   // Define a new float format with 120 bits of accuracy, and
 89 |   // dynamic range of 2^10.
 90 |   let sem = Semantics::new(10, 120, RoundingMode::NearestTiesToEven);
 91 | 
 92 |   let pi = Float::pi(sem);
 93 |   let x = Float::exp(&pi);
 94 |   println!("e^pi = {}", x); // Prints 23.1406926327792....
 95 | ```
 96 | 
 97 |  Floating point numbers can be converted to
 98 |  [Continued Fractions](https://en.wikipedia.org/wiki/Continued_fraction) that
 99 |  approximate the value.
100 | 
101 |  ```rust
102 |   use arpfloat::{Float, FP256, RoundingMode};
103 | 
104 |   let ln = Float::ln2(FP256);
105 |   println!("ln(2) = {}", ln);
106 |   for i in 1..20 {
107 |     let (p,q) = ln.as_fraction(i);
108 |     println!("{}/{}", p.as_decimal(), q.as_decimal());
109 |   }
110 |  ```
111 | The program above will print this output:
112 | ```console
113 |   ln(2) = .6931471805599453094172321214581765680755001343602552.....
114 |   0/1
115 |   1/1
116 |   2/3
117 |   7/10
118 |   9/13
119 |   61/88
120 |   192/277
121 |   253/365
122 |   445/642
123 |   1143/1649
124 |   1588/2291
125 |   2731/3940
126 |   ....
127 | ```
128 | 
129 | The [examples](examples) directory contains a few programs that demonstrate the use of this library.
130 | 
131 | ### Python Bindings
132 | 
133 | The library has Python bindings that can be installed with `pip install -e .`.
134 | 
135 | ```python
136 |     >>> from arpfloat import Float, Semantics, FP16, BF16, FP32, fp64, pi
137 | 
138 |     >>> x = fp64(2.5).cast(FP16)
139 |     >>> y = fp64(1.5).cast(FP16)
140 |     >>> x + y
141 |     4.
142 | 
143 |     >>> sem = Semantics(10, 10, "NearestTiesToEven")
144 |     >>> sem
145 |     Semantics { exponent: 10, precision: 10, mode: NearestTiesToEven }
146 |     >>> Float(sem, False, 0b1000000001, 0b1100101)
147 |     4.789062
148 | 
149 |     >>> pi(FP32)
150 |     3.1415927
151 |     >>> pi(FP16)
152 |     3.140625
153 |     >>> pi(BF16)
154 |     3.140625
155 | ```
156 | 
157 | Arpfloat allows you to experiment with new floating point formats. For example,
158 | Nvidia's new [FP8](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html)
159 | format can be defined as:
160 | 
161 | ```python
162 |     import numpy as np
163 |     from arpfloat import FP32, fp64, Semantics, zero
164 | 
165 |     # Create two random numpy arrays in the range [0,1)
166 |     A0 = np.random.rand(1000000)
167 |     A1 = np.random.rand(1000000)
168 | 
169 |     # Calculate the numpy dot product of the two arrays
170 |     print("Using fp32 arithmetic    : ", np.dot(A0, A1))
171 | 
172 |     # Create the fp8 format (4 exponent bits, 3 mantissa bits + 1 implicit bit)
173 |     FP8 = Semantics(4, 3 + 1, "NearestTiesToEven")
174 | 
175 |     # Convert the arrays to fp8
176 |     A0 = [fp64(x).cast(FP8) for x in A0]
177 |     A1 = [fp64(x).cast(FP8) for x in A1]
178 | 
179 |     dot = sum([x.cast(FP32)*y.cast(FP32) for x, y in zip(A0, A1)])
180 |     print("Using fp8/fp32 arithmetic: ", dot)
181 | ```
182 | 
183 | ### Resources
184 | 
185 | There are excellent resources out there, some of which are referenced in the code:
186 | 
187 | * Books:
188 |     * Handbook of Floating-Point Arithmetic (2010 edition) by Jean-Michel Muller et al.
189 |     * Elementary Functions: Algorithms and Implementation by Jean-Michel Muller.
190 |     * Modern Computer Arithmetic by Brent and Zimmermann.
191 | * Papers:
192 |     * An Accurate Elementary Mathematical Library for the IEEE Floating Point Standard, by Gal and Bachels.
193 |     * How to print floating-point numbers accurately by Steele, White.
194 |     * What Every Computer Scientist Should Know About Floating-Point Arithmetic by David Goldberg.
195 |     * Fast Multiple-Precision Evaluation of Elementary Functions by Richard Brent.
196 |     * Fast Trigonometric functions for Arbitrary Precision number by Henrik Vestermark.
197 | * Other excellent software implementations: APFloat, RYU, libBF, newlib, musl, etc.
198 | 
199 | ### License
200 | 
201 | Licensed under Apache-2.0
202 | 


--------------------------------------------------------------------------------
/arpfloat/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | ARPFloat: Arbitrary Precision Floating-Point Library
 4 | 
 5 | This library provides arbitrary precision floating-point arithmetic with
 6 | configurable precision and rounding modes. It implements IEEE 754
 7 | semantics and supports standard arithmetic operations.
 8 | 
 9 | Examples:
10 |     >>> from arpfloat import Float, Semantics, FP16, BF16, FP32, fp64, pi
11 |     >>> x = fp64(2.5).cast(FP16)
12 |     >>> y = fp64(1.5).cast(FP16)
13 |     >>> x + y
14 |     4
15 | 
16 |     >>> sem = Semantics(10, 10, "Zero")
17 |     >>> sem
18 |     Semantics { exponent: 10, precision: 10, mode: Zero }
19 |     >>> Float(sem, False, 1, 13)
20 |     .0507
21 | 
22 |     >>> pi(FP32)
23 |     3.1415927
24 |     >>> pi(FP16)
25 |     3.14
26 |     >>> pi(BF16)
27 |     3.15
28 | 
29 | Constants:
30 |     BF16, FP16, FP32, FP64, FP128, FP256: Standard floating-point formats
31 |     pi, e, ln2, zero, fma: Functions from the native module (e.g. pi(FP32), zero(FP32), fma(a, b, c))
32 |     Float, Semantics: Classes for representing floating-point numbers and their semantics
33 |     i64, fp64: Constructors for creating Float objects from integers and floats
34 | """
35 | 
36 | from ._arpfloat import PyFloat as Float
37 | from ._arpfloat import PySemantics as Semantics
38 | from ._arpfloat import pi, e, ln2, zero, fma
39 | from ._arpfloat import from_fp64 as fp64
40 | from ._arpfloat import from_i64 as i64
41 | 
42 | # Add __radd__ method to Float class for sum() compatibility
43 | 
44 | 
45 | def _float_radd(self, other):  # Reflected add (other + self), installed as Float.__radd__.
46 |     if isinstance(other, (int, float)) and other == 0:  # A plain numeric zero on the left (sum()'s default start).
47 |         return self  # 0 + self is self; no new Float is built.
48 |     return self.__add__(other)  # Any other left operand is forwarded to the regular __add__.
49 | 
50 | Float.__radd__ = _float_radd  # Patch the class so that `0 + Float` (as produced by sum()) works.
51 | 
52 | # Define standard floating-point types
53 | # Arguments: exponent bits, precision bits (including the implicit bit), rounding mode.
54 | BF16 = Semantics(8, 8, "NearestTiesToEven")  # BFloat16
55 | FP16 = Semantics(5, 11, "NearestTiesToEven")  # Half precision
56 | FP32 = Semantics(8, 24, "NearestTiesToEven")  # Single precision
57 | FP64 = Semantics(11, 53, "NearestTiesToEven")  # Double precision
58 | FP128 = Semantics(15, 113, "NearestTiesToEven")  # Quadruple precision
59 | FP256 = Semantics(19, 237, "NearestTiesToEven")  # Octuple precision
60 | 
61 | version = "0.1.11"  # Same value as `version` in Cargo.toml.
62 | 


--------------------------------------------------------------------------------
/benches/main_benchmark.rs:
--------------------------------------------------------------------------------
  1 | use arpfloat::{BigInt, Float, RoundingMode, Semantics};
  2 | 
  3 | use RoundingMode::NearestTiesToEven as rme;
  4 | 
  5 | fn test_e() { // Benchmark body: evaluate Float::e in a wide custom format.
  6 |     let sem = Semantics::new(32, 2000, rme); // 32 exponent bits, 2000 bits of precision.
  7 |     black_box(Float::e(sem)); // black_box keeps the result observable to the optimizer.
  8 | }
  9 | 
 10 | fn test_sqrt() { // Benchmark body: one square root at 10000 bits of precision.
 11 |     let sem = Semantics::new(32, 10000, rme); // 32 exponent bits, 10000 bits of precision.
 12 |     black_box(Float::one(sem, false).scale(1, rme).sqrt()); // NOTE(review): presumably +1 scaled by 2^1, i.e. sqrt(2) - confirm scale() semantics.
 13 | }
 14 | 
 15 | fn test_pi() { // Benchmark body: evaluate Float::pi in a wide custom format.
 16 |     let sem = Semantics::new(32, 2000, rme); // 32 exponent bits, 2000 bits of precision.
 17 |     black_box(Float::pi(sem)); // black_box keeps the result observable to the optimizer.
 18 | }
 19 | 
 20 | fn test_powi() { // Benchmark body: raise a fixed BigInt to a large integer power.
 21 |     let a = BigInt::from_u64(1275563424); // Fixed base, so every run does the same work.
 22 |     black_box(a.powi(11000)); // 1275563424 ^ 11000.
 23 | }
 24 | 
 25 | fn test_bigint_as_dec() { // Benchmark body: decimal conversion of a large BigInt.
 26 |     let a = BigInt::from_u64(197123); // Fixed base, so every run does the same work.
 27 |     black_box(a.powi(100).as_decimal()); // The timed region includes both powi(100) and as_decimal().
 28 | }
 29 | 
 30 | fn test_bigint_div() { // Benchmark body: divide two BigInts built with different size arguments.
 31 |     let a = BigInt::pseudorandom(1000, 12345); // NOTE(review): arguments are presumably (size, seed) - confirm against BigInt::pseudorandom.
 32 |     let b = BigInt::pseudorandom(500, 67890); // Same constructor with a smaller first argument.
 33 |     black_box(a / b); // The timed region also includes building both operands.
 34 | }
 35 | 
 36 | fn test_cos() { // Benchmark body: cos() of the integers 0..=99.
 37 |     let sem = Semantics::new(32, 90, rme); // 32 exponent bits, 90 bits of precision.
 38 |     for i in 0..100 {
 39 |         let a = Float::from_u64(sem, i).cos(); // Convert the loop counter to a Float, then take its cosine.
 40 |         black_box(a); // Keep every result observable to the optimizer.
 41 |     }
 42 | }
 43 | 
 44 | fn test_sin() { // Benchmark body: sin() of the integers 0..=99.
 45 |     let sem = Semantics::new(32, 90, rme); // 32 exponent bits, 90 bits of precision.
 46 |     for i in 0..100 {
 47 |         let a = Float::from_u64(sem, i).sin(); // Convert the loop counter to a Float, then take its sine.
 48 |         black_box(a); // Keep every result observable to the optimizer.
 49 |     }
 50 | }
 51 | 
 52 | fn test_log() { // Benchmark body: log() of the integers 0..=99.
 53 |     let sem = Semantics::new(32, 100, rme); // 32 exponent bits, 100 bits of precision.
 54 |     for i in 0..100 {
 55 |         let a = Float::from_u64(sem, i).log(); // The first iteration evaluates log(0).
 56 |         black_box(a); // Keep every result observable to the optimizer.
 57 |     }
 58 | }
 59 | 
 60 | fn test_exp() { // Benchmark body: exp() over descending and ascending integer inputs.
 61 |     let sem = Semantics::new(32, 100, rme); // 32 exponent bits, 100 bits of precision.
 62 |     for i in 0..1000 {
 63 |         let a = Float::from_u64(sem, 1000 - i).exp(); // Counts down from the loop bound; the old `100 - i` underflowed u64 once i > 100.
 64 |         let b = Float::from_u64(sem, i).exp(); // Counts up alongside `a`.
 65 |         black_box(a + b); // Keep the combined result observable to the optimizer.
 66 |     }
 67 | }
 68 | 
 69 | fn test_bigint_mul_1() { // Benchmark body: multiply two BigInts built with the same size argument (1000).
 70 |     let a = BigInt::pseudorandom(1000, 98765); // NOTE(review): arguments are presumably (size, seed) - confirm against BigInt::pseudorandom.
 71 |     let b = BigInt::pseudorandom(1000, 43210);
 72 |     black_box(a * b); // The timed region also includes building both operands.
 73 | }
 74 | 
 75 | fn test_bigint_mul_2() { // Benchmark body: multiply two BigInts built with the same size argument (10).
 76 |     let a = BigInt::pseudorandom(10, 98765); // NOTE(review): arguments are presumably (size, seed) - confirm against BigInt::pseudorandom.
 77 |     let b = BigInt::pseudorandom(10, 43210);
 78 |     black_box(a * b); // The timed region also includes building both operands.
 79 | }
 80 | 
 81 | fn test_bigint_mul_3() { // Benchmark body: multiply two BigInts built with the same size argument (100).
 82 |     let a = BigInt::pseudorandom(100, 98765); // NOTE(review): arguments are presumably (size, seed) - confirm against BigInt::pseudorandom.
 83 |     let b = BigInt::pseudorandom(100, 43210);
 84 |     black_box(a * b); // The timed region also includes building both operands.
 85 | }
 86 | 
 87 | fn test_bigint_mul_4() { // Benchmark body: multiply BigInts built with very different size arguments (5000 vs 1).
 88 |     let a = BigInt::pseudorandom(5000, 98765); // NOTE(review): arguments are presumably (size, seed) - confirm against BigInt::pseudorandom.
 89 |     let b = BigInt::pseudorandom(1, 43210);
 90 |     black_box(a * b); // The timed region also includes building both operands.
 91 | }
 92 | 
 93 | fn test_bigint_div_1() { // Benchmark body: divide two BigInts built with the same size argument (1000).
 94 |     let a = BigInt::pseudorandom(1000, 98765); // NOTE(review): arguments are presumably (size, seed) - confirm against BigInt::pseudorandom.
 95 |     let b = BigInt::pseudorandom(1000, 43210);
 96 |     black_box(a / b); // The timed region also includes building both operands.
 97 | }
 98 | 
 99 | fn test_bigint_div_2() { // Benchmark body: divide BigInts built with very different size arguments (1000 vs 1).
100 |     let a = BigInt::pseudorandom(1000, 98765); // NOTE(review): arguments are presumably (size, seed) - confirm against BigInt::pseudorandom.
101 |     let b = BigInt::pseudorandom(1, 43210);
102 |     black_box(a / b); // The timed region also includes building both operands.
103 | }
104 | 
105 | use criterion::{black_box, criterion_group, criterion_main, Criterion};
106 | 
107 | pub fn criterion_benchmark(c: &mut Criterion) { // Registers every benchmark body in this file under a short name.
108 |     c.bench_function("pi", |b| b.iter(test_pi)); // Each entry hands the plain fn to Bencher::iter, which runs it repeatedly.
109 |     c.bench_function("e", |b| b.iter(test_e));
110 |     c.bench_function("sqrt", |b| b.iter(test_sqrt));
111 |     c.bench_function("powi", |b| b.iter(test_powi));
112 |     c.bench_function("bigint_as_dec", |b| b.iter(test_bigint_as_dec));
113 |     c.bench_function("bigint_div", |b| b.iter(test_bigint_div));
114 |     c.bench_function("cos", |b| b.iter(test_cos));
115 |     c.bench_function("sin", |b| b.iter(test_sin));
116 |     c.bench_function("exp", |b| b.iter(test_exp));
117 |     c.bench_function("log", |b| b.iter(test_log));
118 |     c.bench_function("bigint_mul_1", |b| b.iter(test_bigint_mul_1));
119 |     c.bench_function("bigint_mul_2", |b| b.iter(test_bigint_mul_2));
120 |     c.bench_function("bigint_mul_3", |b| b.iter(test_bigint_mul_3));
121 |     c.bench_function("bigint_mul_4", |b| b.iter(test_bigint_mul_4));
122 |     c.bench_function("bigint_div_1", |b| b.iter(test_bigint_div_1));
123 |     c.bench_function("bigint_div_2", |b| b.iter(test_bigint_div_2));
124 | }
125 | 
126 | criterion_group!(benches, criterion_benchmark);
127 | criterion_main!(benches);
128 | 


--------------------------------------------------------------------------------
/examples/calc_pi.rs:
--------------------------------------------------------------------------------
 1 | //! Calculate the value of PI using the Chudnovsky_algorithm.
 2 | //!  cargo run --example calc_pi --release
 3 | 
 4 | use arpfloat::{Float, FP256};
 5 | 
 6 | fn main() { // Sums a few terms of the Chudnovsky series in FP256 and checks the result against f64 PI.
 7 |     // https://en.wikipedia.org/wiki/Chudnovsky_algorithm
 8 |     let iterations = 5; // Number of series terms added on top of the initial q = 0 term.
 9 | 
10 |     // Constants:
11 |     let c1 = Float::from_u64(FP256, 10005).sqrt(); // sqrt(10005), used in the final scaling.
12 |     let c2 = Float::from_u64(FP256, 545140134); // Per-term increment of `l`.
13 |     let c3 = Float::from_i64(FP256, -262537412640768000); // Per-term multiplier of `x`.
14 |     let c16 = Float::from_u64(FP256, 16);
15 |     let c12 = Float::from_u64(FP256, 12); // Per-term increment of `kc`.
16 | 
17 |     // Initial state.
18 |     let mut kc = Float::from_u64(FP256, 6);
19 |     let mut m = Float::from_u64(FP256, 1);
20 |     let mut l = Float::from_u64(FP256, 13591409);
21 |     let mut x = Float::from_u64(FP256, 1);
22 |     let mut s = Float::from_u64(FP256, 13591409); // Running sum, seeded with the q = 0 term (m * l / x).
23 | 
24 |     for q in 1..iterations + 1 { // q runs over 1..=iterations.
25 |         let q3 = Float::from_u64(FP256, q * q * q); // q^3, computed in integer arithmetic first.
26 |         let k3 = &kc * &(&kc * &kc); // kc^3.
27 |         m = (k3 - (&kc * &c16)) * m / q3; // m <- m * (kc^3 - 16 * kc) / q^3.
28 |         l += &c2;
29 |         x *= &c3;
30 |         s += &(&m * &l) / &x; // Add this term: m * l / x.
31 |         kc += &c12;
32 |     }
33 |     let pi = Float::from_u64(FP256, 426880) * (c1 / s); // pi = 426880 * sqrt(10005) / s.
34 |     println!("pi = {}", pi);
35 |     assert_eq!(pi.as_f64(), std::f64::consts::PI); // The FP256 result converted to f64 must equal the f64 constant exactly.
36 | }
37 | 


--------------------------------------------------------------------------------
/examples/fma.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from arpfloat import FP32, fp64, Semantics, zero, fma
 3 | 
 4 | # Create two random numpy arrays in the range [0,1)
 5 | A0 = np.random.rand(1024)
 6 | A1 = np.random.rand(1024)
 7 | 
 8 | # Create the fp8 format (4 exponent bits, 3 mantissa bits + 1 implicit bit)
 9 | FP8 = Semantics(4, 3 + 1, "NearestTiesToEven")
10 | 
11 | # Convert the arrays to FP8
12 | B0 = [fp64(x).cast(FP8) for x in A0]
13 | B1 = [fp64(x).cast(FP8) for x in A1]
14 | 
15 | acc = zero(FP32)
16 | for x, y in zip(B0, B1):
17 |     acc = fma(x.cast(FP32), y.cast(FP32), acc)
18 | 
19 | print("Using fp8/fp32 arithmetic: ", acc)
20 | print("Using fp32 arithmetic    : ", np.dot(A0, A1))


--------------------------------------------------------------------------------
/examples/print_e.rs:
--------------------------------------------------------------------------------
 1 | //! Calculates long numbers and prints them.
 2 | //!  cargo run --example print_e --release
 3 | 
 4 | use arpfloat::{Float, RoundingMode, Semantics};
 5 | 
 6 | fn main() {
 7 |     let sem = Semantics::new(32, 5000, RoundingMode::NearestTiesToEven);
 8 |     let val = Float::e(sem);
 9 |     println!("F64: {}", val.as_f64());
10 |     println!("FP*: {}", val);
11 | }
12 | 


--------------------------------------------------------------------------------
/examples/softmax.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from arpfloat import FP32, BF16, fp64, zero
 3 | 
 4 | dtype = BF16
 5 | 
 6 | A0 = np.random.rand(6) # Random array in the range [0,1)
 7 | B0 = [fp64(x).cast(dtype) for x in A0] # Convert to the emulated format.
 8 | 
 9 | # Find the max value.
10 | max_val = max(B0)
11 | 
12 | # calculate exp(x-max) for each value.
13 | shifted_exp = [(x - max_val).exp() for x in B0]
14 | exp_sum = sum(shifted_exp)
15 | 
16 | # calculate the softmax: [exp(x-max) / sum(exp(x-max))]
17 | result = [x / exp_sum for x in shifted_exp]
18 | print("Calculated = ", result)
19 | 
20 | # NumPy's softmax.
21 | np_softmax  = np.exp(A0 - np.max(A0)) / np.exp(A0 - np.max(A0)).sum()
22 | print("Reference = ", np_softmax)
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
# Build configuration: setuptools is the backend, and setuptools-rust is
# required at build time.
[build-system]
requires = ["setuptools>=64.0.0", "wheel", "setuptools-rust>=1.5.2"]
build-backend = "setuptools.build_meta"

# Package metadata.
[project]
name = "arpfloat"
# NOTE(review): setup.py declares the same version string and asks for it to
# match Cargo.toml -- presumably all three must be bumped together.
version = "0.1.11"
description = "Arbitrary-precision floating point library"
authors = [
    {name = "Nadav Rotem", email = "nadav256@gmail.com"},
]
readme = "README.md"
requires-python = ">=3.6"

[project.urls]
Homepage = "https://github.com/nadavrot/arpfloat"
Documentation = "https://docs.rs/arpfloat/"
Repository = "https://github.com/nadavrot/arpfloat"
19 | 
20 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==2.2.4
2 | semantic-version==2.10.0
3 | setuptools==78.1.0
4 | setuptools-rust==1.11.1
5 | wheel==0.45.1
6 | 


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 80
2 | 
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | from setuptools_rust import Binding, RustExtension
 3 | 
 4 | setup(
 5 |     name="arpfloat",
 6 |     version="0.1.11",  # Match the version in Cargo.toml
 7 |     description="Arbitrary-precision floating point library",
 8 |     author="Nadav Rotem",
 9 |     author_email="nadav256@gmail.com",
10 |     url="https://github.com/nadavrot/arpfloat",
11 |     rust_extensions=[
12 |         RustExtension(
13 |             "arpfloat._arpfloat",
14 |             binding=Binding.PyO3,
15 |             debug=False,
16 |             features=["python"],
17 |         )
18 |     ],
19 |     package_data={"arpfloat": ["py.typed"]},
20 |     packages=["arpfloat"],
21 |     zip_safe=False,
22 |     python_requires=">=3.6",
23 | )
24 | 


--------------------------------------------------------------------------------
/src/arithmetic.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the implementation of the basic arithmetic operations:
  2 | //! Addition, Subtraction, Multiplication, Division.
  3 | extern crate alloc;
  4 | use crate::bigint::BigInt;
  5 | 
  6 | use super::bigint::LossFraction;
  7 | use super::float::{Category, Float, RoundingMode};
  8 | use core::cmp::Ordering;
  9 | use core::ops::{
 10 |     Add, AddAssign, Div, DivAssign, Mul, MulAssign, Sub, SubAssign,
 11 | };
 12 | 
 13 | impl Float {
 14 |     /// An inner function that performs the addition and subtraction of normal
 15 |     /// numbers (no NaN, Inf, Zeros).
 16 |     /// See Pg 247.  Chapter 8. Algorithms for the Five Basic Operations.
 17 |     /// This implementation follows the APFloat implementation, that does not
 18 |     /// swap the operands.
 19 |     fn add_or_sub_normals(
 20 |         a: &Self,
 21 |         b: &Self,
 22 |         subtract: bool,
 23 |     ) -> (Self, LossFraction) {
 24 |         debug_assert_eq!(a.get_semantics(), b.get_semantics());
 25 |         let sem = a.get_semantics();
 26 |         let loss;
 27 |         let mut a = a.clone();
 28 |         let mut b = b.clone();
 29 | 
 30 |         // Align the input numbers on the same exponent.
 31 |         let bits = a.get_exp() - b.get_exp();
 32 | 
 33 |         // Can transform (a-b) to (a + -b), either way, there are cases where
 34 |         // subtraction needs to happen.
 35 |         let subtract = subtract ^ (a.get_sign() ^ b.get_sign());
 36 |         if subtract {
 37 |             // Align the input numbers. We shift LHS one bit to the left to
 38 |             // allow carry/borrow in case of underflow as result of subtraction.
 39 |             match bits.cmp(&0) {
 40 |                 Ordering::Equal => {
 41 |                     loss = LossFraction::ExactlyZero;
 42 |                 }
 43 |                 Ordering::Greater => {
 44 |                     loss = b.shift_significand_right((bits - 1) as u64);
 45 |                     a.shift_significand_left(1);
 46 |                 }
 47 |                 Ordering::Less => {
 48 |                     loss = a.shift_significand_right((-bits - 1) as u64);
 49 |                     b.shift_significand_left(1);
 50 |                 }
 51 |             }
 52 | 
 53 |             let a_mantissa = a.get_mantissa();
 54 |             let b_mantissa = b.get_mantissa();
 55 |             let ab_mantissa;
 56 |             let mut sign = a.get_sign();
 57 | 
 58 |             // Figure out the carry from the shifting operations that dropped
 59 |             // bits.
 60 |             let c = !loss.is_exactly_zero() as u64;
 61 |             let c = BigInt::from_u64(c);
 62 | 
 63 |             // Figure out which mantissa is larger, to make sure that we don't
 64 |             // overflow the subtraction.
 65 |             if a_mantissa < b_mantissa {
 66 |                 // A < B
 67 |                 ab_mantissa = b_mantissa - a_mantissa - c;
 68 |                 sign = !sign;
 69 |             } else {
 70 |                 // A >= B
 71 |                 ab_mantissa = a_mantissa - b_mantissa - c;
 72 |             }
 73 |             (
 74 |                 Self::from_parts(sem, sign, a.get_exp(), ab_mantissa),
 75 |                 loss.invert(),
 76 |             )
 77 |         } else {
 78 |             // Handle the easy case of Add:
 79 |             let mut b = b.clone();
 80 |             let mut a = a.clone();
 81 |             if bits > 0 {
 82 |                 loss = b.shift_significand_right(bits as u64);
 83 |             } else {
 84 |                 loss = a.shift_significand_right(-bits as u64);
 85 |             }
 86 |             debug_assert!(a.get_exp() == b.get_exp());
 87 |             let ab_mantissa = a.get_mantissa() + b.get_mantissa();
 88 |             (
 89 |                 Self::from_parts(sem, a.get_sign(), a.get_exp(), ab_mantissa),
 90 |                 loss,
 91 |             )
 92 |         }
 93 |     }
 94 | 
 95 |     /// Computes a+b using the rounding mode `rm`.
 96 |     pub fn add_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self {
 97 |         Self::add_sub(a, b, false, rm)
 98 |     }
 99 |     /// Computes a-b using the rounding mode `rm`.
100 |     pub fn sub_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self {
101 |         Self::add_sub(a, b, true, rm)
102 |     }
103 | 
104 |     fn add_sub(a: &Self, b: &Self, subtract: bool, rm: RoundingMode) -> Self {
105 |         let sem = a.get_semantics();
106 |         // Table 8.2: Specification of addition for positive floating-point
107 |         // data. Pg 247.
108 |         match (a.get_category(), b.get_category()) {
109 |             (Category::NaN, Category::Infinity)
110 |             | (Category::NaN, Category::NaN)
111 |             | (Category::NaN, Category::Normal)
112 |             | (Category::NaN, Category::Zero)
113 |             | (Category::Normal, Category::Zero)
114 |             | (Category::Infinity, Category::Normal)
115 |             | (Category::Infinity, Category::Zero) => a.clone(),
116 | 
117 |             (Category::Zero, Category::NaN)
118 |             | (Category::Normal, Category::NaN)
119 |             | (Category::Infinity, Category::NaN) => {
120 |                 Self::nan(sem, b.get_sign())
121 |             }
122 | 
123 |             (Category::Normal, Category::Infinity)
124 |             | (Category::Zero, Category::Infinity) => {
125 |                 Self::inf(sem, b.get_sign() ^ subtract)
126 |             }
127 | 
128 |             (Category::Zero, Category::Normal) => Self::from_parts(
129 |                 sem,
130 |                 b.get_sign() ^ subtract,
131 |                 b.get_exp(),
132 |                 b.get_mantissa(),
133 |             ),
134 | 
135 |             (Category::Zero, Category::Zero) => {
136 |                 Self::zero(sem, a.get_sign() && b.get_sign())
137 |             }
138 | 
139 |             (Category::Infinity, Category::Infinity) => {
140 |                 if a.get_sign() ^ b.get_sign() ^ subtract {
141 |                     return Self::nan(sem, a.get_sign() ^ b.get_sign());
142 |                 }
143 |                 Self::inf(sem, a.get_sign())
144 |             }
145 | 
146 |             (Category::Normal, Category::Normal) => {
147 |                 // The IEEE 754 spec (section 6.3) states that cancellation
148 |                 // results in a positive zero, except for the case of the
149 |                 // negative rounding mode.
150 |                 let cancellation = subtract == (a.get_sign() == b.get_sign());
151 |                 let same_absolute_number = a.same_absolute_value(b);
152 |                 if cancellation && same_absolute_number {
153 |                     let is_negative = RoundingMode::Negative == rm;
154 |                     return Self::zero(sem, is_negative);
155 |                 }
156 | 
157 |                 let mut res = Self::add_or_sub_normals(a, b, subtract);
158 |                 res.0.normalize(rm, res.1);
159 |                 res.0
160 |             }
161 |         }
162 |     }
163 | }
164 | 
#[test]
fn test_add() {
    use super::float::FP64;
    // Smoke test: adding two small integers must not panic. The result is
    // deliberately not inspected.
    let one = Float::from_u64(FP64, 1);
    let two = Float::from_u64(FP64, 2);
    let _ = Float::add(one, two);
}
172 | 
#[test]
fn test_addition() {
    /// Adds two doubles through `Float` and converts the sum back to f64.
    fn add_helper(a: f64, b: f64) -> f64 {
        Float::add(Float::from_f64(a), Float::from_f64(b)).as_f64()
    }

    // (lhs, rhs, expected sum)
    let cases = [
        (0., -4., -4.),
        (-4., 0., -4.),
        (1., 1., 2.),
        (8., 4., 12.),
        (128., 2., 130.),
        (128., -8., 120.),
        (64., -60., 4.),
        (69., -65., 4.),
        (69., 69., 138.),
        (69., 1., 70.),
        (-128., -8., -136.),
        (64., -65., -1.),
        (-64., -65., -129.),
        (-15., -15., -30.),
        (-15., 15., 0.),
    ];
    for (lhs, rhs, expected) in cases {
        assert_eq!(add_helper(lhs, rhs), expected);
    }

    // Every pair of small integers must agree with the native f64 addition.
    for i in -4..15 {
        for j in i..15 {
            let (x, y) = (f64::from(i), f64::from(j));
            assert_eq!(add_helper(y, x), x + y);
        }
    }

    // Check that adding a negative and positive results in a positive zero for
    // the default rounding mode.
    let four = Float::from_f64(4.0);
    let minus_four = Float::from_f64(-4.0);
    let sum = Float::add(four.clone(), minus_four);
    let difference = Float::sub(four.clone(), four);
    assert!(sum.is_zero());
    assert!(!sum.is_negative());
    assert!(difference.is_zero());
    assert!(!difference.is_negative());
}
220 | 
// Pg 120.  Chapter 4. Basic Properties and Algorithms.
#[test]
fn test_addition_large_numbers() {
    use super::float::FP64;
    let rm = RoundingMode::NearestTiesToEven;

    // Computes (base + delta) - base, rounding after each step.
    let add_then_sub = |base: &Float, delta: &Float| {
        Float::sub_with_rm(&Float::add_with_rm(base, delta, rm), base, rm)
    };

    let one = Float::from_i64(FP64, 1);

    // Keep doubling `a` for as long as adding one to it is still exact.
    let mut a = Float::from_i64(FP64, 1);
    while add_then_sub(&a, &one) == one {
        a = Float::add_with_rm(&a, &a, rm);
    }

    // Find the smallest integer increment that `a` can absorb exactly.
    let mut b = one.clone();
    while add_then_sub(&a, &b) != b {
        b = Float::add_with_rm(&b, &one, rm);
    }

    assert_eq!(a.as_f64(), 9007199254740992.);
    assert_eq!(b.as_f64(), 2.);
}
242 | 
#[test]
fn add_denormals() {
    /// Adds two doubles through `Float`, also checking that the first
    /// operand survives the f64 -> Float -> f64 round trip.
    fn add_f64(a: f64, b: f64) -> f64 {
        let lhs = Float::from_f64(a);
        let rhs = Float::from_f64(b);
        assert_eq!(lhs.as_f64(), a);
        Float::add(lhs, rhs).as_f64()
    }

    // Bit patterns with a zero exponent field (v0, v1) and a small
    // non-zero exponent field (v2).
    let v0 = f64::from_bits(0x0000_0000_0010_0010);
    let v1 = f64::from_bits(0x0000_0000_1001_0010);
    let v2 = f64::from_bits(0x1000_0000_0001_0010);
    assert_eq!(add_f64(v2, -v1), v2 - v1);

    // Converting to `Float` and back must be lossless.
    let round_trip = Float::from_f64(v0);
    assert_eq!(round_trip.as_f64(), v0);

    // Add and subtract denormals.
    assert_eq!(add_f64(v0, v1), v0 + v1);
    assert_eq!(add_f64(v0, -v0), v0 - v0);
    assert_eq!(add_f64(v0, v2), v0 + v2);
    assert_eq!(add_f64(v2, v1), v2 + v1);
    assert_eq!(add_f64(v2, -v1), v2 - v1);

    // Add and subtract denormals and normal numbers.
    assert_eq!(add_f64(v0, 10.), v0 + 10.);
    assert_eq!(add_f64(v0, -10.), v0 - 10.);
    assert_eq!(add_f64(10000., v0), 10000. + v0);
}
272 | 
#[cfg(feature = "std")]
#[test]
fn add_special_values() {
    use crate::utils;

    /// Adds two doubles through `Float` and converts the sum back to f64.
    fn add_f64(a: f64, b: f64) -> f64 {
        Float::add(Float::from_f64(a), Float::from_f64(b)).as_f64()
    }

    // Test the addition of various irregular values, in every combination.
    let values = utils::get_special_test_values();

    for v0 in values {
        for v1 in values {
            let ours = add_f64(v0, v1);
            let native = v0 + v1;
            // Both results must fall into the same class of numbers.
            assert_eq!(ours.is_finite(), native.is_finite());
            assert_eq!(ours.is_nan(), native.is_nan());
            assert_eq!(ours.is_infinite(), native.is_infinite());
            assert_eq!(ours.is_normal(), native.is_normal());
            // Results that are normal numbers must also be bit identical;
            // other results are only compared by class above.
            assert!(!ours.is_normal() || ours.to_bits() == native.to_bits());
        }
    }
}
302 | 
#[test]
fn test_add_random_vals() {
    use crate::utils;

    /// Adds two doubles through `Float` and converts the sum back to f64.
    fn add_f64(a: f64, b: f64) -> f64 {
        Float::add(Float::from_f64(a), Float::from_f64(b)).as_f64()
    }

    /// Interprets both bit patterns as f64 and checks that `Float` addition
    /// agrees with the native addition.
    fn check_pair(bits0: u64, bits1: u64) {
        let f0 = f64::from_bits(bits0);
        let f1 = f64::from_bits(bits1);

        let ours = add_f64(f0, f1);
        let native = f0 + f1;

        assert_eq!(ours.is_finite(), native.is_finite());
        assert_eq!(ours.is_nan(), native.is_nan());
        assert_eq!(ours.is_infinite(), native.is_infinite());
        // Check that the results are bit identical, or are both NaN.
        assert!(native.is_nan() || ours.to_bits() == native.to_bits());
    }

    // One fixed pair of operands first.
    check_pair(0x645e91f69778bad3, 0xe4d91b16be9ae0c5);

    // Then a long run of pseudo-random bit patterns.
    let mut lfsr = utils::Lfsr::new();
    for _ in 0..50000 {
        let v0 = lfsr.get64();
        let v1 = lfsr.get64();
        check_pair(v0, v1);
    }
}
352 | 
353 | impl Float {
354 |     /// Compute a*b using the rounding mode `rm`.
355 |     pub fn mul_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self {
356 |         let sem = a.get_semantics();
357 |         let sign = a.get_sign() ^ b.get_sign();
358 | 
359 |         // Table 8.4: Specification of multiplication for floating-point data of
360 |         // positive sign. Page 251.
361 |         match (a.get_category(), b.get_category()) {
362 |             (Category::Zero, Category::NaN)
363 |             | (Category::Normal, Category::NaN)
364 |             | (Category::Infinity, Category::NaN) => {
365 |                 Self::nan(sem, b.get_sign())
366 |             }
367 |             (Category::NaN, Category::Infinity)
368 |             | (Category::NaN, Category::NaN)
369 |             | (Category::NaN, Category::Normal)
370 |             | (Category::NaN, Category::Zero) => Self::nan(sem, a.get_sign()),
371 |             (Category::Normal, Category::Infinity)
372 |             | (Category::Infinity, Category::Normal)
373 |             | (Category::Infinity, Category::Infinity) => Self::inf(sem, sign),
374 |             (Category::Normal, Category::Zero)
375 |             | (Category::Zero, Category::Normal)
376 |             | (Category::Zero, Category::Zero) => Self::zero(sem, sign),
377 | 
378 |             (Category::Zero, Category::Infinity)
379 |             | (Category::Infinity, Category::Zero) => Self::nan(sem, sign),
380 | 
381 |             (Category::Normal, Category::Normal) => {
382 |                 let (mut res, loss) = Self::mul_normals(a, b, sign);
383 |                 res.normalize(rm, loss);
384 |                 res
385 |             }
386 |         }
387 |     }
388 | 
389 |     /// See Pg 251. 8.4 Floating-Point Multiplication
390 |     fn mul_normals(a: &Self, b: &Self, sign: bool) -> (Self, LossFraction) {
391 |         debug_assert_eq!(a.get_semantics(), b.get_semantics());
392 |         let sem = a.get_semantics();
393 |         // We multiply digits in the format 1.xx * 2^(e), or mantissa * 2^(e+1).
394 |         // When we multiply two 2^(e+1) numbers, we get:
395 |         // log(2^(e_a+1)*2^(e_b+1)) = e_a + e_b + 2.
396 |         let mut exp = a.get_exp() + b.get_exp();
397 | 
398 |         let a_significand = a.get_mantissa();
399 |         let b_significand = b.get_mantissa();
400 |         let ab_significand = a_significand * b_significand;
401 | 
402 |         // The exponent is correct, but the bits are not in the right place.
403 |         // Set the right exponent for where the bits are placed, and fix the
404 |         // exponent below.
405 |         exp -= sem.get_mantissa_len() as i64;
406 | 
407 |         let loss = LossFraction::ExactlyZero;
408 |         (Self::from_parts(sem, sign, exp, ab_significand), loss)
409 |     }
410 | }
411 | 
#[test]
fn test_mul_simple() {
    // A single product must match the native f64 multiplication.
    let (lhs, rhs): (f64, f64) = (-24.0, 0.1);

    let product = Float::mul(Float::from_f64(lhs), Float::from_f64(rhs));

    let expected: f64 = lhs * rhs;
    assert_eq!(product.as_f64(), expected);
}
425 | 
#[test]
fn mul_regular_values() {
    /// Multiplies two doubles through `Float` and converts back to f64.
    fn mul_f64(a: f64, b: f64) -> f64 {
        Float::mul(Float::from_f64(a), Float::from_f64(b)).as_f64()
    }

    // Test the multiplication of regular values, in every combination.
    let values = [-5.0, 0., -0., 24., 1., 11., 10000., 256., 0.1, 3., 17.5];

    for &lhs in values.iter() {
        for &rhs in values.iter() {
            let ours = mul_f64(lhs, rhs);
            let native = lhs * rhs;
            // Check that the results are bit identical.
            assert_eq!(ours.to_bits(), native.to_bits());
        }
    }
}
448 | 
#[cfg(feature = "std")]
#[test]
fn test_mul_special_values() {
    use super::utils;

    /// Multiplies two doubles through `Float` and converts back to f64.
    fn mul_f64(a: f64, b: f64) -> f64 {
        Float::mul(Float::from_f64(a), Float::from_f64(b)).as_f64()
    }

    // Test the multiplication of various irregular values, in every
    // combination.
    let values = utils::get_special_test_values();

    for v0 in values {
        for v1 in values {
            let ours = mul_f64(v0, v1);
            let native = v0 * v1;
            // Both results must fall into the same class of numbers.
            assert_eq!(ours.is_finite(), native.is_finite());
            assert_eq!(ours.is_nan(), native.is_nan());
            assert_eq!(ours.is_infinite(), native.is_infinite());
            // Check that the results are bit identical, or are both NaN.
            assert!(native.is_nan() || ours.to_bits() == native.to_bits());
        }
    }
}
477 | 
#[test]
fn test_mul_random_vals() {
    use super::utils;

    /// Multiplies two doubles through `Float` and converts back to f64.
    fn mul_f64(a: f64, b: f64) -> f64 {
        Float::mul(Float::from_f64(a), Float::from_f64(b)).as_f64()
    }

    // Compare against the native multiplication on a long run of
    // pseudo-random bit patterns.
    let mut lfsr = utils::Lfsr::new();
    for _ in 0..50000 {
        let bits0 = lfsr.get64();
        let bits1 = lfsr.get64();

        let f0 = f64::from_bits(bits0);
        let f1 = f64::from_bits(bits1);

        let ours = mul_f64(f0, f1);
        let native = f0 * f1;
        assert_eq!(ours.is_finite(), native.is_finite());
        assert_eq!(ours.is_nan(), native.is_nan());
        assert_eq!(ours.is_infinite(), native.is_infinite());
        // Check that the results are bit identical, or are both NaN.
        assert!(native.is_nan() || ours.to_bits() == native.to_bits());
    }
}
509 | 
510 | impl Float {
511 |     /// Compute a/b, with the rounding mode `rm`.
512 |     pub fn div_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self {
513 |         let sem = a.get_semantics();
514 |         let sign = a.get_sign() ^ b.get_sign();
515 |         // Table 8.5: Special values for x/y - Page 263.
516 |         match (a.get_category(), b.get_category()) {
517 |             (Category::NaN, _)
518 |             | (_, Category::NaN)
519 |             | (Category::Zero, Category::Zero)
520 |             | (Category::Infinity, Category::Infinity) => Self::nan(sem, sign),
521 | 
522 |             (_, Category::Infinity) => Self::zero(sem, sign),
523 |             (Category::Zero, _) => Self::zero(sem, sign),
524 |             (_, Category::Zero) => Self::inf(sem, sign),
525 |             (Category::Infinity, _) => Self::inf(sem, sign),
526 |             (Category::Normal, Category::Normal) => {
527 |                 let (mut res, loss) = Self::div_normals(a, b);
528 |                 res.normalize(rm, loss);
529 |                 res
530 |             }
531 |         }
532 |     }
533 | 
534 |     /// Compute a/b, where both `a` and `b` are normals.
535 |     /// Page 262 8.6. Floating-Point Division.
536 |     /// This implementation uses a regular integer division for the mantissa.
537 |     fn div_normals(a: &Self, b: &Self) -> (Self, LossFraction) {
538 |         debug_assert_eq!(a.get_semantics(), b.get_semantics());
539 |         let sem = a.get_semantics();
540 | 
541 |         let mut a = a.clone();
542 |         let mut b = b.clone();
543 |         // Start by normalizing the dividend and divisor to the MSB.
544 |         a.align_mantissa(); // Normalize the dividend.
545 |         b.align_mantissa(); // Normalize the divisor.
546 | 
547 |         let mut a_mantissa = a.get_mantissa();
548 |         let b_mantissa = b.get_mantissa();
549 | 
550 |         // Calculate the sign and exponent.
551 |         let mut exp = a.get_exp() - b.get_exp();
552 |         let sign = a.get_sign() ^ b.get_sign();
553 | 
554 |         // Make sure that A >= B, to allow the integer division to generate all
555 |         // of the bits of the result.
556 |         if a_mantissa < b_mantissa {
557 |             a_mantissa.shift_left(1);
558 |             exp -= 1;
559 |         }
560 | 
561 |         // The bits are now aligned to the MSB of the mantissa. The
562 |         // semantics need to be 1.xxxxx, but we perform integer division.
563 |         // Shift the dividend to make sure that we generate the bits after
564 |         // the period.
565 |         a_mantissa.shift_left(sem.get_mantissa_len());
566 |         let reminder = a_mantissa.inplace_div(&b_mantissa);
567 | 
568 |         // Find 2 x reminder, to be able to compare to the reminder and figure
569 |         // out the kind of loss that we have.
570 |         let mut reminder_2x = reminder;
571 |         reminder_2x.shift_left(1);
572 | 
573 |         let reminder = reminder_2x.cmp(&b_mantissa);
574 |         let is_zero = reminder_2x.is_zero();
575 |         let loss = match reminder {
576 |             Ordering::Less => {
577 |                 if is_zero {
578 |                     LossFraction::ExactlyZero
579 |                 } else {
580 |                     LossFraction::LessThanHalf
581 |                 }
582 |             }
583 |             Ordering::Equal => LossFraction::ExactlyHalf,
584 |             Ordering::Greater => LossFraction::MoreThanHalf,
585 |         };
586 | 
587 |         let x = Self::from_parts(sem, sign, exp, a_mantissa);
588 |         (x, loss)
589 |     }
590 | }
591 | 
#[test]
fn test_div_simple() {
    // A single quotient must match the native f64 division.
    let (lhs, rhs): (f64, f64) = (1.0, 7.0);

    let quotient = Float::div_with_rm(
        &Float::from_f64(lhs),
        &Float::from_f64(rhs),
        RoundingMode::NearestTiesToEven,
    );

    let expected: f64 = lhs / rhs;
    assert_eq!(quotient.as_f64(), expected);
}
605 | 
#[cfg(feature = "std")]
#[test]
fn test_div_special_values() {
    use super::utils;

    /// Divides two doubles through `Float` and converts back to f64.
    fn div_f64(a: f64, b: f64) -> f64 {
        let lhs = Float::from_f64(a);
        let rhs = Float::from_f64(b);
        Float::div_with_rm(&lhs, &rhs, RoundingMode::NearestTiesToEven)
            .as_f64()
    }

    // Test the division of various irregular values, in every combination.
    let values = utils::get_special_test_values();

    for v0 in values {
        for v1 in values {
            let ours = div_f64(v0, v1);
            let native = v0 / v1;
            // Both results must fall into the same class of numbers.
            assert_eq!(ours.is_finite(), native.is_finite());
            assert_eq!(ours.is_nan(), native.is_nan());
            assert_eq!(ours.is_infinite(), native.is_infinite());
            // Check that the results are bit identical, or are both NaN.
            assert!(native.is_nan() || ours.to_bits() == native.to_bits());
        }
    }
}
634 | 
// Implements one binary operator trait for `Float`.
//
// Arguments:
//   $trait_name     - the operator trait to implement (e.g. `Add`).
//   $func_name      - the method of that trait (e.g. `add`).
//   $func_impl_name - the `Float` function taking `(&lhs, &rhs, rounding
//                     mode)` that performs the operation (e.g. `add_with_rm`).
//
// Every generated impl takes the rounding mode from the semantics of the
// left-hand side operand. `u64` right-hand sides are first converted to a
// `Float` with the semantics of the left-hand side.
macro_rules! declare_operator {
    ($trait_name:ident,
     $func_name:ident,
     $func_impl_name:ident) => {
        // Float <op> Float
        impl $trait_name for Float {
            type Output = Self;
            fn $func_name(self, rhs: Self) -> Self {
                let sem = self.get_semantics();
                Self::$func_impl_name(&self, &rhs, sem.get_rounding_mode())
            }
        }

        // Float <op> u64
        impl $trait_name<u64> for Float {
            type Output = Self;
            fn $func_name(self, rhs: u64) -> Self {
                let sem = self.get_semantics();
                Self::$func_impl_name(
                    &self,
                    &Self::Output::from_u64(sem, rhs),
                    sem.get_rounding_mode(),
                )
            }
        }
        // &Float <op> &Float (`Self` is `&Float` inside this impl)
        impl $trait_name<Self> for &Float {
            type Output = Float;
            fn $func_name(self, rhs: Self) -> Self::Output {
                let sem = self.get_semantics();
                Self::Output::$func_impl_name(
                    &self,
                    rhs,
                    sem.get_rounding_mode(),
                )
            }
        }
        // &Float <op> u64
        impl $trait_name<u64> for &Float {
            type Output = Float;
            fn $func_name(self, rhs: u64) -> Self::Output {
                let sem = self.get_semantics();
                Self::Output::$func_impl_name(
                    &self,
                    &Self::Output::from_u64(self.get_semantics(), rhs),
                    sem.get_rounding_mode(),
                )
            }
        }

        // &Float <op> Float
        impl $trait_name<Float> for &Float {
            type Output = Float;
            fn $func_name(self, rhs: Float) -> Self::Output {
                let sem = self.get_semantics();
                Self::Output::$func_impl_name(
                    &self,
                    &rhs,
                    sem.get_rounding_mode(),
                )
            }
        }
    };
}

// Generate the `+`, `-`, `*` and `/` operators.
declare_operator!(Add, add, add_with_rm);
declare_operator!(Sub, sub, sub_with_rm);
declare_operator!(Mul, mul, mul_with_rm);
declare_operator!(Div, div, div_with_rm);
704 | 
705 | macro_rules! declare_assign_operator {
706 |     ($trait_name:ident,
707 |      $func_name:ident,
708 |      $func_impl_name:ident) => {
709 |         impl $trait_name for Float {
710 |             fn $func_name(&mut self, rhs: Self) {
711 |                 let sem = self.get_semantics();
712 |                 *self =
713 |                     Self::$func_impl_name(self, &rhs, sem.get_rounding_mode());
714 |             }
715 |         }
716 | 
717 |         impl $trait_name<&Float> for Float {
718 |             fn $func_name(&mut self, rhs: &Self) {
719 |                 let sem = self.get_semantics();
720 |                 *self =
721 |                     Self::$func_impl_name(self, rhs, sem.get_rounding_mode());
722 |             }
723 |         }
724 |     };
725 | }
726 | 
727 | declare_assign_operator!(AddAssign, add_assign, add_with_rm);
728 | declare_assign_operator!(SubAssign, sub_assign, sub_with_rm);
729 | declare_assign_operator!(MulAssign, mul_assign, mul_with_rm);
730 | declare_assign_operator!(DivAssign, div_assign, div_with_rm);
731 | 
#[test]
fn test_operators() {
    use crate::FP64;
    // Exercise the by-reference `+`, `-`, `*` and `/` operators on 8 and 2.
    let lhs = Float::from_f32(8.0).cast(FP64);
    let rhs = Float::from_f32(2.0).cast(FP64);

    let sum = &lhs + &rhs;
    let difference = &lhs - &rhs;
    let product = &lhs * &rhs;
    let quotient = &lhs / &rhs;

    assert_eq!(sum.as_f64(), 10.0);
    assert_eq!(difference.as_f64(), 6.0);
    assert_eq!(product.as_f64(), 16.0);
    assert_eq!(quotient.as_f64(), 4.0);
}
746 | 
#[test]
fn test_slow_sqrt_2_test() {
    use crate::{FP128, FP64};

    // Approximate sqrt(2) by bisecting the interval [1, 2]: keep the half
    // whose squared midpoint stays on the correct side of 2.
    let target = Float::from_f64(2.0).cast(FP128);
    let mut upper = Float::from_f64(2.0).cast(FP128);
    let mut lower = Float::from_f64(1.0).cast(FP128);

    for _ in 0..25 {
        let midpoint = (&upper + &lower) / 2;
        let squared = &midpoint * &midpoint;
        if squared < target {
            lower = midpoint;
        } else {
            upper = midpoint;
        }
    }

    // The lower bound must agree with sqrt(2) to about six decimal places.
    let root = lower.cast(FP64).as_f64();
    assert!(root < 1.4142137_f64);
    assert!(root > 1.4142134_f64);
}
770 | 
#[cfg(feature = "std")]
#[test]
fn test_famous_pentium4_bug() {
    use crate::std::string::ToString;
    use crate::FP128;

    // Divide the operand pair that is used to demonstrate the Pentium FDIV
    // bug and check the leading digits of the quotient.
    // https://en.wikipedia.org/wiki/Pentium_FDIV_bug
    let dividend = Float::from_u64(FP128, 4_195_835);
    let divisor = Float::from_u64(FP128, 3_145_727);
    let quotient = (dividend / divisor).to_string();
    assert!(quotient.starts_with("1.333820449136241002"));
}
784 | 
785 | impl Float {
786 |     // Perform a fused multiply-add of normal numbers, without rounding.
787 |     fn fused_mul_add_normals(
788 |         a: &Self,
789 |         b: &Self,
790 |         c: &Self,
791 |     ) -> (Self, LossFraction) {
792 |         debug_assert_eq!(a.get_semantics(), b.get_semantics());
793 |         let sem = a.get_semantics();
794 | 
795 |         // Multiply a and b, without rounding.
796 |         let sign = a.get_sign() ^ b.get_sign();
797 |         let mut ab = Self::mul_normals(a, b, sign).0;
798 | 
799 |         // Shift the product, to allow enough precision for the addition.
800 |         // Notice that this can be implemented more efficiently with 3 extra
801 |         // bits and sticky bits.
802 |         // See 8.5. Floating-Point Fused Multiply-Add, Page 255.
803 |         let mut c = c.clone();
804 |         let extra_bits = sem.get_precision() + 1;
805 |         ab.shift_significand_left(extra_bits as u64);
806 |         c.shift_significand_left(extra_bits as u64);
807 | 
808 |         // Perform the addition, without rounding.
809 |         Self::add_or_sub_normals(&ab, &c, false)
810 |     }
811 | 
812 |     /// Compute a*b + c, with the rounding mode `rm`.
813 |     pub fn fused_mul_add_with_rm(
814 |         a: &Self,
815 |         b: &Self,
816 |         c: &Self,
817 |         rm: RoundingMode,
818 |     ) -> Self {
819 |         if a.is_normal() && b.is_normal() && c.is_normal() {
820 |             let (mut res, loss) = Self::fused_mul_add_normals(a, b, c);
821 |             res.normalize(rm, loss); // Finally, round the result.
822 |             res
823 |         } else {
824 |             // Perform two operations. First, handle non-normal values.
825 | 
826 |             // NaN anything = NaN
827 |             if a.is_nan() || b.is_nan() || c.is_nan() {
828 |                 return Self::nan(a.get_semantics(), a.get_sign());
829 |             }
830 |             // (infinity * 0) + c = NaN
831 |             if (a.is_inf() && b.is_zero()) || (a.is_zero() && b.is_inf()) {
832 |                 return Self::nan(a.get_semantics(), a.get_sign());
833 |             }
834 |             // (normal * normal) + infinity = infinity
835 |             if a.is_normal() && b.is_normal() && c.is_inf() {
836 |                 return c.clone();
837 |             }
838 |             // (normal * 0) + c = c
839 |             if a.is_zero() || b.is_zero() {
840 |                 return c.clone();
841 |             }
842 | 
843 |             // Multiply (with rounding), and add (with rounding).
844 |             let ab = Self::mul_with_rm(a, b, rm);
845 |             Self::add_with_rm(&ab, c, rm)
846 |         }
847 |     }
848 | 
849 |     /// Compute a*b + c.
850 |     pub fn fma(a: &Self, b: &Self, c: &Self) -> Self {
851 |         Self::fused_mul_add_with_rm(a, b, c, c.get_rounding_mode())
852 |     }
853 | }
854 | 
#[test]
fn test_fma() {
    // The addend is many orders of magnitude smaller than the product, so
    // the result depends on the sum being rounded only once.
    let x: f64 = -10.;
    let y: f64 = -1.1;
    let z: f64 = 0.000000000000000000000000000000000000001;

    let fused = Float::fused_mul_add_with_rm(
        &Float::from_f64(x),
        &Float::from_f64(y),
        &Float::from_f64(z),
        RoundingMode::NearestTiesToEven,
    );

    assert_eq!(f64::mul_add(x, y, z), fused.as_f64());
}
873 | 
#[cfg(feature = "std")]
#[test]
fn test_fma_simple() {
    use super::utils;
    // Run the fused multiply-add over every triple of special values and
    // compare the outcome with the native `f64::mul_add`.
    let values = utils::get_special_test_values();
    for a in values {
        for b in values {
            for c in values {
                let expected: f64 = a.mul_add(b, c);
                let actual = Float::fused_mul_add_with_rm(
                    &Float::from_f64(a),
                    &Float::from_f64(b),
                    &Float::from_f64(c),
                    RoundingMode::NearestTiesToEven,
                )
                .as_f64();

                // Both results must fall in the same class of values.
                assert_eq!(actual.is_finite(), expected.is_finite());
                assert_eq!(actual.is_nan(), expected.is_nan());
                assert_eq!(actual.is_infinite(), expected.is_infinite());
                // Finite results must compare equal. NaN and infinite
                // results are only checked by the class tests above.
                assert!(
                    expected.is_nan()
                        || expected.is_infinite()
                        || actual == expected
                );
            }
        }
    }
}
905 | 
#[test]
fn test_fma_random_vals() {
    use super::utils;

    // Software fused multiply-add on f32 operands, rounding to nearest even.
    fn soft_fma(a: f32, b: f32, c: f32) -> f32 {
        Float::fused_mul_add_with_rm(
            &Float::from_f32(a),
            &Float::from_f32(b),
            &Float::from_f32(c),
            RoundingMode::NearestTiesToEven,
        )
        .as_f32()
    }

    let mut lfsr = utils::Lfsr::new();

    for _ in 0..50000 {
        // Reinterpret pseudo-random bit patterns as f32 operands.
        let x = f32::from_bits(lfsr.get64() as u32);
        let y = f32::from_bits(lfsr.get64() as u32);
        let z = f32::from_bits(lfsr.get64() as u32);

        let actual = soft_fma(x, y, z);
        let expected = f32::mul_add(x, y, z);
        assert_eq!(actual.is_finite(), expected.is_finite());
        assert_eq!(actual.is_nan(), expected.is_nan());
        assert_eq!(actual.is_infinite(), expected.is_infinite());
        // Check that the results are bit identical, or are both NaN.
        assert!(expected.is_nan() || actual.to_bits() == expected.to_bits());
    }
}
945 | 


--------------------------------------------------------------------------------
/src/cast.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the implementation of casting-related methods.
  2 | 
  3 | use crate::float::Semantics;
  4 | use crate::FP128;
  5 | 
  6 | use super::bigint::BigInt;
  7 | use super::bigint::LossFraction;
  8 | use super::float::{self, Category};
  9 | use super::float::{Float, RoundingMode, FP32, FP64};
 10 | use super::utils;
 11 | use super::utils::mask;
 12 | 
 13 | impl Float {
 14 |     /// Load the integer `val` into the float. Notice that the number may
 15 |     /// overflow, or rounded to the nearest even integer.
 16 |     pub fn from_u64(sem: Semantics, val: u64) -> Self {
 17 |         Self::from_bigint(FP128, BigInt::from_u64(val)).cast(sem)
 18 |     }
 19 | 
 20 |     /// Load the big int `val` into the float. Notice that the number may
 21 |     /// overflow, or rounded to the nearest even integer.
 22 |     pub fn from_bigint(sem: Semantics, val: BigInt) -> Self {
 23 |         let mut a =
 24 |             Self::from_parts(sem, false, sem.get_mantissa_len() as i64, val);
 25 |         a.normalize(sem.get_rounding_mode(), LossFraction::ExactlyZero);
 26 |         a
 27 |     }
 28 | 
 29 |     /// Load the integer `val` into the float. Notice that the number may
 30 |     /// overflow or rounded.
 31 |     pub fn from_i64(sem: Semantics, val: i64) -> Self {
 32 |         if val < 0 {
 33 |             let mut a = Self::from_u64(sem, -val as u64);
 34 |             a.set_sign(true);
 35 |             return a;
 36 |         }
 37 | 
 38 |         Self::from_u64(sem, val as u64)
 39 |     }
 40 | 
 41 |     /// Converts and returns the rounded integral part.
 42 |     pub fn to_i64(&self) -> i64 {
 43 |         if self.is_nan() || self.is_zero() {
 44 |             return 0;
 45 |         }
 46 | 
 47 |         if self.is_inf() {
 48 |             if self.get_sign() {
 49 |                 return i64::MIN;
 50 |             } else {
 51 |                 return i64::MAX;
 52 |             }
 53 |         }
 54 |         let rm = self.get_rounding_mode();
 55 |         let val = self.convert_normal_to_integer(rm);
 56 |         if self.get_sign() {
 57 |             -(val.as_u64() as i64)
 58 |         } else {
 59 |             val.as_u64() as i64
 60 |         }
 61 |     }
 62 | 
 63 |     /// Returns a value that is rounded to the nearest integer that's not larger
 64 |     /// in magnitude than this float.
 65 |     pub fn trunc(&self) -> Self {
 66 |         // Only handle normal numbers (don't do anything to NaN, Inf, Zero).
 67 |         if !self.is_normal() {
 68 |             return self.clone();
 69 |         }
 70 | 
 71 |         let exp = self.get_exp();
 72 | 
 73 |         if exp > self.get_mantissa_len() as i64 {
 74 |             // Already an integer.
 75 |             return self.clone();
 76 |         }
 77 | 
 78 |         // Numbers that are smaller than 1 are rounded to zero.
 79 |         if exp < -1 {
 80 |             return Self::zero(self.get_semantics(), self.get_sign());
 81 |         }
 82 | 
 83 |         // This is a fraction. Figure out which bits represent values over one
 84 |         // and clear out the values that represent the fraction.
 85 |         let trim = (self.get_mantissa_len() as i64 - exp) as usize;
 86 |         let mut m = self.get_mantissa();
 87 |         m.shift_right(trim);
 88 |         m.shift_left(trim);
 89 |         Self::from_parts(
 90 |             self.get_semantics(),
 91 |             self.get_sign(),
 92 |             self.get_exp(),
 93 |             m,
 94 |         )
 95 |     }
 96 | 
 97 |     /// Returns a number rounded to nearest integer, away from zero.
 98 |     pub fn round(&self) -> Self {
 99 |         use crate::float::shift_right_with_loss;
100 |         let sem = self.get_semantics();
101 | 
102 |         // Only handle normal numbers (don't do anything to NaN, Inf, Zero).
103 |         if !self.is_normal() {
104 |             return self.clone();
105 |         }
106 | 
107 |         let exp = self.get_exp();
108 | 
109 |         if exp > self.get_mantissa_len() as i64 {
110 |             // Already an integer.
111 |             return self.clone();
112 |         }
113 | 
114 |         // Numbers that are between 0.5 and 1.0 are rounded to 1.0.
115 |         if exp == -1 {
116 |             return Self::one(sem, self.get_sign());
117 |         }
118 | 
119 |         // Numbers below 0.5 are rounded to zero.
120 |         if exp < -2 {
121 |             return Self::zero(sem, self.get_sign());
122 |         }
123 | 
124 |         // This is a fraction. Figure out which bits represent values over one
125 |         // and clear out the values that represent the fraction.
126 |         let trim = (self.get_mantissa_len() as i64 - exp) as usize;
127 |         let (mut m, loss) = shift_right_with_loss(&self.get_mantissa(), trim);
128 |         m.shift_left(trim);
129 |         let t = Self::from_parts(sem, self.get_sign(), self.get_exp(), m);
130 | 
131 |         if loss.is_lt_half() {
132 |             t
133 |         } else if self.get_sign() {
134 |             t - Self::one(sem, false)
135 |         } else {
136 |             t + Self::one(sem, false)
137 |         }
138 |     }
139 | 
140 |     pub(crate) fn convert_normal_to_integer(&self, rm: RoundingMode) -> BigInt {
141 |         // We are converting to integer, so set the center point of the exponent
142 |         // to the lsb instead of the msb.
143 |         let i_exp = self.get_exp() - self.get_mantissa_len() as i64;
144 |         if i_exp < 0 {
145 |             let (mut m, loss) = float::shift_right_with_loss(
146 |                 &self.get_mantissa(),
147 |                 -i_exp as usize,
148 |             );
149 |             if self.need_round_away_from_zero(rm, loss) {
150 |                 m.inplace_add(&BigInt::one());
151 |             }
152 |             m
153 |         } else {
154 |             let mut m = self.get_mantissa();
155 |             m.shift_left(i_exp as usize);
156 |             m
157 |         }
158 |     }
159 | 
160 |     fn from_bits(sem: Semantics, float: u64) -> Self {
161 |         // Extract the biased exponent (wipe the sign and mantissa).
162 |         let biased_exp = ((float >> sem.get_mantissa_len())
163 |             & mask(sem.get_exponent_len()) as u64)
164 |             as i64;
165 |         // Wipe the original exponent and mantissa.
166 |         let sign =
167 |             (float >> (sem.get_exponent_len() + sem.get_mantissa_len())) & 1;
168 |         // Wipe the sign and exponent.
169 |         let mut mantissa = float & mask(sem.get_mantissa_len()) as u64;
170 | 
171 |         let sign = sign == 1;
172 | 
173 |         // Check for NaN/Inf
174 |         if biased_exp == mask(sem.get_exponent_len()) as i64 {
175 |             if mantissa == 0 {
176 |                 return Self::inf(sem, sign);
177 |             }
178 |             return Self::nan(sem, sign);
179 |         }
180 | 
181 |         let mut exp = biased_exp - sem.get_bias();
182 | 
183 |         // Add the implicit bit for normal numbers.
184 |         if biased_exp != 0 {
185 |             mantissa += 1u64 << sem.get_mantissa_len();
186 |         } else {
187 |             // Handle denormals, adjust the exponent to the legal range.
188 |             exp += 1;
189 |         }
190 | 
191 |         let mantissa = BigInt::from_u64(mantissa);
192 |         Self::from_parts(sem, sign, exp, mantissa)
193 |     }
194 | 
195 |     /// Cast to another float using the non-default rounding mode `rm`.
196 |     pub fn cast_with_rm(&self, to: Semantics, rm: RoundingMode) -> Float {
197 |         let mut loss = LossFraction::ExactlyZero;
198 |         let exp_delta =
199 |             self.get_mantissa_len() as i64 - to.get_mantissa_len() as i64;
200 |         let mut temp = self.clone();
201 |         // If we are casting to a narrow type then we need to shift the bits
202 |         // to the new-mantissa part of the word. This will adjust the exponent,
203 |         // and if we lose bits then we'll need to round the number.
204 |         if exp_delta > 0 {
205 |             loss = temp.shift_significand_right(exp_delta as u64);
206 |         }
207 | 
208 |         let mut x = Float::raw(
209 |             to,
210 |             temp.get_sign(),
211 |             temp.get_exp() - exp_delta,
212 |             temp.get_mantissa(),
213 |             temp.get_category(),
214 |         );
215 |         // Don't normalize if this is a nop conversion.
216 |         if to.get_exponent_len() != self.get_exponent_len()
217 |             || to.get_mantissa_len() != self.get_mantissa_len()
218 |         {
219 |             x.normalize(rm, loss);
220 |         }
221 |         x
222 |     }
223 |     /// Convert from one float format to another.
224 |     pub fn cast(&self, to: Semantics) -> Float {
225 |         self.cast_with_rm(to, self.get_rounding_mode())
226 |     }
227 | 
228 |     fn as_native_float(&self) -> u64 {
229 |         // https://en.wikipedia.org/wiki/IEEE_754
230 |         let mantissa: u64;
231 |         let mut exp: u64;
232 |         match self.get_category() {
233 |             Category::Infinity => {
234 |                 mantissa = 0;
235 |                 exp = mask(self.get_exponent_len()) as u64;
236 |             }
237 |             Category::NaN => {
238 |                 mantissa = 1 << (self.get_mantissa_len() - 1);
239 |                 exp = mask(self.get_exponent_len()) as u64;
240 |             }
241 |             Category::Zero => {
242 |                 mantissa = 0;
243 |                 exp = 0;
244 |             }
245 |             Category::Normal => {
246 |                 exp = (self.get_exp() + self.get_bias()) as u64;
247 |                 debug_assert!(exp > 0);
248 |                 let m = self.get_mantissa().as_u64();
249 |                 // Encode denormals. If the exponent is the minimum value and we
250 |                 // don't have a leading integer bit (in the form 1.mmmm) then
251 |                 // this is a denormal value and we need to encode it as such.
252 |                 if (exp == 1) && ((m >> self.get_mantissa_len()) == 0) {
253 |                     exp = 0;
254 |                 }
255 |                 mantissa = m & utils::mask(self.get_mantissa_len()) as u64;
256 |             }
257 |         }
258 | 
259 |         let mut bits: u64 = self.get_sign() as u64;
260 |         bits <<= self.get_exponent_len();
261 |         bits |= exp;
262 |         bits <<= self.get_mantissa_len();
263 |         debug_assert!(mantissa <= 1 << self.get_mantissa_len());
264 |         bits |= mantissa;
265 |         bits
266 |     }
267 |     /// Convert this float to fp32. Notice that the number may overflow or
268 |     /// rounded to the nearest even (see cast and cast_with_rm).
269 |     pub fn as_f32(&self) -> f32 {
270 |         let b = self.cast(FP32);
271 |         let bits = b.as_native_float();
272 |         f32::from_bits(bits as u32)
273 |     }
274 |     /// Convert this float to fp64. Notice that the number may overflow or
275 |     /// rounded to the nearest even (see cast and cast_with_rm).
276 |     pub fn as_f64(&self) -> f64 {
277 |         let b = self.cast(FP64);
278 |         let bits = b.as_native_float();
279 |         f64::from_bits(bits)
280 |     }
281 | 
282 |     /// Loads and converts a native fp32 value. Notice that the number may
283 |     /// overflow or rounded (see cast and cast_with_rm).
284 |     pub fn from_f32(float: f32) -> Self {
285 |         Float::from_bits(FP32, float.to_bits() as u64)
286 |     }
287 | 
288 |     /// Loads and converts a native fp64 value. Notice that the number may
289 |     /// overflow or rounded (see cast and cast_with_rm).
290 |     pub fn from_f64(float: f64) -> Self {
291 |         Float::from_bits(FP64, float.to_bits())
292 |     }
293 | }
294 | 
#[test]
fn test_rounding_to_integer() {
    // Checks that every `(expected, input)` pair converts to the expected
    // integer when the input is cast to `sem` before the conversion.
    fn assert_rounds(sem: Semantics, cases: &[(i64, f64)]) {
        for &(expected, input) in cases {
            assert_eq!(expected, Float::from_f64(input).cast(sem).to_i64());
        }
    }

    let z64 = FP64.with_rm(RoundingMode::Zero);

    // Test the low integers with round-to-zero.
    for i in 0..100 {
        let r = Float::from_f64(i as f64 + 0.1).cast(z64).to_i64();
        assert_eq!(i, r);
    }

    // Test the high integers with round_to_zero.
    for i in 0..100 {
        let val = (i as i64) << 54;
        let r = Float::from_i64(FP64, val).cast(z64).to_i64();
        assert_eq!(val, r);
    }

    // Nearest, with ties rounded away from zero.
    let nta64 = FP64.with_rm(RoundingMode::NearestTiesToAway);
    assert_rounds(
        nta64,
        &[(1, 0.5), (0, 0.49), (199999, 199999.49), (0, -0.49), (-1, -0.5)],
    );

    // Toward zero.
    assert_rounds(
        z64,
        &[(0, 0.9), (1, 1.1), (99, 99.999), (0, -0.99), (0, -0.5)],
    );

    // Toward positive infinity.
    let p64 = FP64.with_rm(RoundingMode::Positive);
    assert_rounds(
        p64,
        &[(1, 0.9), (2, 1.1), (100, 99.999), (0, -0.99), (0, -0.5)],
    );

    // Special values
    assert_eq!(0, Float::from_f64(f64::NAN).to_i64());
    assert_eq!(i64::MIN, Float::from_f64(f64::NEG_INFINITY).to_i64());
    assert_eq!(i64::MAX, Float::from_f64(f64::INFINITY).to_i64());
}
340 | 
#[test]
fn test_round_trip_native_float_cast() {
    // An f32 survives a load/store round trip.
    let single = f32::from_bits(0x41700000);
    assert_eq!(single, Float::from_f32(single).as_f32());

    // So does an f64.
    let approx_pi = 355. / 113.;
    assert_eq!(approx_pi, Float::from_f64(approx_pi).as_f64());

    // NaN and the infinities are loaded into the matching category.
    let nan = Float::from_f64(f64::NAN);
    assert!(nan.is_nan());
    assert!(!nan.is_inf());
    let inf = Float::from_f64(f64::INFINITY);
    assert!(inf.is_inf());
    assert!(!inf.is_nan());
    assert!(Float::from_f64(f64::NEG_INFINITY).is_inf());

    // A cast to the same semantics keeps the value.
    let a_float = f32::from_bits(0x3f8fffff);
    let loaded = Float::from_f32(a_float);
    let recast = loaded.cast(FP32);
    assert_eq!(loaded.as_f32(), a_float);
    assert_eq!(recast.as_f32(), a_float);

    // Zero is not a normal number, and it round-trips as well.
    let zero = f32::from_bits(0x000000);
    let loaded_zero = Float::from_f32(zero);
    assert!(!loaded_zero.is_normal());
    assert_eq!(zero, loaded_zero.as_f32());
}
368 | 
#[test]
fn test_cast_easy_ctor() {
    // Widening an f32 to FP64 and narrowing it back must keep the value.
    let patterns =
        [0x3f8fffff, 0x40800000, 0x3f000000, 0xc60b40ec, 0xbc675793];

    for bits in patterns {
        let expected = f32::from_bits(bits);
        let wide = Float::from_f32(expected).cast(FP64);
        let narrow = wide.cast(FP32);
        assert_eq!(wide.as_f32(), expected);
        assert_eq!(narrow.as_f32(), expected);
    }
}
381 | 
#[test]
fn test_cast_from_integers() {
    use super::float::FP16;

    // Rational approximations of pi and e, used as non-trivial f64 inputs
    // for the narrowing conversion to f32.
    let pi = 355. / 113.;
    let e = 193. / 71.;

    assert_eq!(Float::from_i64(FP32, 1 << 32).as_f32(), (1u64 << 32) as f32);
    assert_eq!(Float::from_i64(FP32, 1 << 34).as_f32(), (1u64 << 34) as f32);
    assert_eq!(Float::from_f64(pi).as_f32(), (pi) as f32);
    assert_eq!(Float::from_f64(e).as_f32(), (e) as f32);
    assert_eq!(Float::from_u64(FP32, 8388610).as_f32(), 8388610 as f32);

    // Integers loaded into FP32 must match the native integer-to-float
    // conversion.
    for i in 0..(1 << 16) {
        assert_eq!(Float::from_u64(FP32, i << 12).as_f32(), (i << 12) as f32);
    }

    // Integers around the top of the FP16 range: values up to 65519 end up
    // at 65504, and values from 65520 become infinity.
    assert_eq!(Float::from_i64(FP16, 0).as_f64(), 0.);
    assert_eq!(Float::from_i64(FP16, 65500).as_f64(), 65504.0);
    assert_eq!(Float::from_i64(FP16, 65504).as_f64(), 65504.0);
    assert_eq!(Float::from_i64(FP16, 65519).as_f64(), 65504.0);
    assert_eq!(Float::from_i64(FP16, 65520).as_f64(), f64::INFINITY);
    assert_eq!(Float::from_i64(FP16, 65536).as_f64(), f64::INFINITY);

    // Small signed integers must agree with the same value loaded from f64.
    for i in -100..100 {
        let a = Float::from_i64(FP32, i);
        let b = Float::from_f64(i as f64).cast(FP32);
        assert_eq!(a.as_f32(), b.as_f32());
    }
}
412 | 
#[test]
fn test_cast_zero_nan_inf() {
    assert!(Float::nan(FP64, true).as_f64().is_nan());
    assert_eq!(Float::zero(FP64, false).as_f64(), 0.0);
    assert_eq!(Float::zero(FP64, true).as_f64(), -0.0);
    // `0.0 == -0.0` holds for native floats, so the comparisons above cannot
    // detect a lost sign. Check the sign of the zeros explicitly.
    assert!(Float::zero(FP64, false).as_f64().is_sign_positive());
    assert!(Float::zero(FP64, true).as_f64().is_sign_negative());

    assert!(Float::nan(FP64, true).is_nan());
    assert!(Float::inf(FP64, true).is_inf());
    {
        // A positive normal number.
        let a = Float::from_f32(f32::from_bits(0x3f8fffff));
        assert!(!a.is_inf());
        assert!(!a.is_nan());
        assert!(!a.is_negative());
    }
    {
        // A negative normal number.
        let a = Float::from_f32(f32::from_bits(0xf48fffff));
        assert!(!a.is_inf());
        assert!(!a.is_nan());
        assert!(a.is_negative());
    }
    {
        let a = Float::from_f32(f32::from_bits(0xff800000)); // -Inf
        assert!(a.is_inf());
        assert!(!a.is_nan());
        assert!(a.is_negative());
    }
    {
        let a = Float::from_f32(f32::from_bits(0xffc00000)); // -Nan.
        assert!(!a.is_inf());
        assert!(a.is_nan());
        assert!(a.is_negative());
    }

    {
        // All-ones upper half: the exponent field is saturated and the
        // mantissa is not empty, so this is a NaN.
        let a = Float::from_f64(f64::from_bits((mask(32) << 32) as u64));
        assert!(!a.is_inf());
        assert!(a.is_nan());
    }
    {
        // Check that casting propagates inf/nan.
        let a = Float::from_f32(f32::from_bits(0xff800000)); // -Inf
        let b = a.cast(FP64);
        assert!(b.is_inf());
        assert!(!b.is_nan());
        assert!(b.is_negative());
    }
}
460 | 
#[test]
fn test_cast_down_easy() {
    // Narrowing to f32 must match the native cast, and storing back as f64
    // must reproduce the input bits.
    let inputs: [f64; 6] =
        [0.3, 0.1, 14151241515., 14151215., 0.0000000001, 1000000000.];
    for v in inputs {
        let narrowed = Float::from_f64(v).as_f32();
        assert_eq!(Float::from_f64(v).as_f64().to_bits(), v.to_bits());
        assert!(narrowed == v as f32);
    }
}
470 | 
#[test]
fn test_load_store_all_f32() {
    // Load and store a sweep of f32 bit patterns (every 1024th pattern),
    // which covers normals and denormals.
    for i in 0..(1u64 << 16) {
        let original = f32::from_bits((i << 10) as u32);
        let restored = Float::from_f32(original).as_f32();
        assert_eq!(original.is_nan(), restored.is_nan());
        assert_eq!(original.is_infinite(), restored.is_infinite());
        // Everything except NaN must come back bit for bit.
        assert!(original.is_nan() || original.to_bits() == restored.to_bits());
    }
}
483 | 
#[cfg(feature = "std")]
#[test]
fn test_cast_down_complex() {
    // Narrow the special test values to f32 and compare with the native
    // cast.
    for v in utils::get_special_test_values() {
        let loaded = Float::from_f64(v);
        let narrowed = loaded.as_f32();
        // Storing back as f64 must reproduce the input bits.
        assert_eq!(loaded.as_f64().to_bits(), v.to_bits());
        assert_eq!(v.is_nan(), narrowed.is_nan());
        assert!(v.is_nan() || narrowed == v as f32);
    }
}
495 | 
#[cfg(feature = "std")]
#[test]
fn test_trunc() {
    use super::utils::Lfsr;

    // Compares `Float::trunc` with the native `f64::trunc` for one input.
    fn assert_matches_native(v: f64) {
        let ours = Float::from_f64(v).trunc().as_f64();
        let native = v.trunc();
        assert_eq!(ours.is_nan(), native.is_nan());
        if !native.is_nan() {
            assert_eq!(ours, native);
        }
    }

    // Hand-picked `(input, expected)` pairs.
    let large_integer = (1u64 << 52) as f64;
    let cases = [
        (0.4, 0.),
        (1.4, 1.),
        (1.99, 1.),
        (2.0, 2.0),
        (-2.4, -2.0),
        (1999999., 1999999.),
        (large_integer, large_integer),
        (0.001, 0.),
    ];
    for (input, expected) in cases {
        assert_eq!(Float::from_f64(input).trunc().as_f64(), expected);
    }

    // Test random values.
    let mut lfsr = Lfsr::new();
    for _ in 0..5000 {
        assert_matches_native(f64::from_bits(lfsr.get64()));
    }

    // Test special values.
    for val in utils::get_special_test_values() {
        assert_matches_native(val);
    }
}
536 | 
#[cfg(feature = "std")]
#[test]
fn test_round() {
    use super::utils::Lfsr;

    // Compares `Float::round` with the native `f64::round` for one input.
    fn assert_matches_native(v: f64) {
        let ours = Float::from_f64(v).round().as_f64();
        let native = v.round();
        assert_eq!(ours.is_nan(), native.is_nan());
        if !native.is_nan() {
            assert_eq!(ours, native);
        }
    }

    // Hand-picked `(input, expected)` pairs, including the halfway cases
    // that must round away from zero.
    let big_num = (1u64 << 52) as f64;
    let cases = [
        (2.0, 2.0),
        (2.5, 3.0),
        (-2.5, -3.0),
        (0.4, 0.),
        (1.4, 1.),
        (1.99, 2.),
        (2.0, 2.0),
        (2.1, 2.0),
        (-2.4, -2.0),
        (1999999., 1999999.),
        (big_num, big_num),
        (0.001, 0.),
    ];
    for (input, expected) in cases {
        assert_eq!(Float::from_f64(input).round().as_f64(), expected);
    }

    // Test random values.
    let mut lfsr = Lfsr::new();
    for _ in 0..5000 {
        assert_matches_native(f64::from_bits(lfsr.get64()));
    }

    // Test special values.
    for val in utils::get_special_test_values() {
        assert_matches_native(val);
    }
}
578 | 
#[cfg(feature = "std")]
#[test]
fn test_cast_sizes() {
    use crate::FP16;
    use crate::FP256;
    let e = std::f64::consts::E;

    // Widening and then narrowing again must give back the original value.
    let round_trip = Float::from_f64(e).cast(FP256).cast(FP64);
    assert_eq!(round_trip.as_f64(), e);

    // A widened value still reads back as the same double.
    let widened = Float::from_f64(e).cast(FP256);
    assert_eq!(widened.as_f64(), e);

    // 2^50 does not fit in FP16, so narrowing saturates to infinity.
    let too_big_for_fp16 = Float::from_u64(FP256, 1 << 50).cast(FP16);
    assert!(too_big_for_fp16.is_inf());

    // An FP16 infinity stays an infinity after widening.
    let widened_inf = Float::from_u64(FP16, 1 << 50).cast(FP256);
    assert!(widened_inf.is_inf());

    // Small integers survive the widening cast exactly.
    let fifty = Float::from_u64(FP16, 50);
    let wide_fifty = fifty.cast(FP256);
    assert_eq!(wide_fifty.as_f64(), fifty.as_f64());
    assert_eq!(wide_fifty.to_i64(), 50);
}
616 | 


--------------------------------------------------------------------------------
/src/float.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the Float data structure and basic methods.
  2 | 
  3 | extern crate alloc;
  4 | use super::bigint::BigInt;
  5 | use super::bigint::LossFraction;
  6 | use core::cmp::Ordering;
  7 | 
  8 | /// Defines the supported rounding modes.
  9 | /// See IEEE754-2019 Section 4.3 Rounding-direction attributes.
 10 | #[derive(Debug, Clone, Copy, PartialEq)]
 11 | pub enum RoundingMode {
 12 |     None,
 13 |     NearestTiesToEven,
 14 |     NearestTiesToAway,
 15 |     Zero,
 16 |     Positive,
 17 |     Negative,
 18 | }
 19 | 
 20 | impl RoundingMode {
 21 |     /// Create a rounding mode from a string, if valid, or return none.
 22 |     pub fn from_string(s: &str) -> Option<Self> {
 23 |         match s {
 24 |             "NearestTiesToEven" => Some(RoundingMode::NearestTiesToEven),
 25 |             "NearestTiesToAway" => Some(RoundingMode::NearestTiesToAway),
 26 |             "Zero" => Some(RoundingMode::Zero),
 27 |             "Positive" => Some(RoundingMode::Positive),
 28 |             "Negative" => Some(RoundingMode::Negative),
 29 |             _ => None,
 30 |         }
 31 |     }
 32 | }
 33 | 
/// Controls the semantics of a floating point number with:
/// 'precision', which determines the number of bits in the significand,
/// 'exponent', which controls the dynamic range of the number, and the
/// rounding mode, which controls how rounding is done after arithmetic
/// operations.
///
/// # Example
///
/// ```
///     use arpfloat::{Float, RoundingMode, Semantics};
///
///     // Create a new floating point semantics: 10 exponent bits and a
///     // precision of 100 bits.
///     let sem = Semantics::new(10, 100, RoundingMode::Positive);
///     // Create the number 1.0 with the new semantics.
///     let x = Float::one(sem, false);
///
///     // Check that the value is correct when casting to `double`.
///     assert_eq!(x.as_f64(), 1.0);
/// ```
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Semantics {
    /// The number of bits that define the range of the exponent.
    pub exponent: usize,
    /// The number of bits in the significand (mantissa + 1).
    pub precision: usize,
    /// The rounding mode used when performing operations on this type.
    pub mode: RoundingMode,
}
 62 | 
 63 | impl Semantics {
 64 |     pub const fn new(
 65 |         exponent: usize,
 66 |         precision: usize,
 67 |         mode: RoundingMode,
 68 |     ) -> Self {
 69 |         Semantics {
 70 |             exponent,
 71 |             precision,
 72 |             mode,
 73 |         }
 74 |     }
 75 |     /// Returns the precision in bits.
 76 |     pub fn get_precision(&self) -> usize {
 77 |         self.precision
 78 |     }
 79 |     /// Returns the length of the mantissa in bits (precision - 1).
 80 |     pub fn get_mantissa_len(&self) -> usize {
 81 |         self.precision - 1
 82 |     }
 83 |     /// Returns the length of the exponent in bits, which defines the valid
 84 |     /// range.
 85 |     pub fn get_exponent_len(&self) -> usize {
 86 |         self.exponent
 87 |     }
 88 | 
 89 |     /// Returns the rounding mode of the type.
 90 |     pub fn get_rounding_mode(&self) -> RoundingMode {
 91 |         self.mode
 92 |     }
 93 | 
 94 |     /// Create a new float semantics with increased precision with 'add'
 95 |     /// additional digits.
 96 |     pub fn increase_precision(&self, more: usize) -> Semantics {
 97 |         Semantics::new(self.exponent, self.precision + more, self.mode)
 98 |     }
 99 |     /// Create a new float semantics with increased precision with 'add'
100 |     /// additional digits, plus ceil(log2) of the number.
101 |     pub fn grow_log(&self, more: usize) -> Semantics {
102 |         let log2 = self.log_precision();
103 |         Semantics::new(self.exponent, self.precision + more + log2, self.mode)
104 |     }
105 | 
106 |     /// Return a log2 approximation for the precision value.
107 |     pub fn log_precision(&self) -> usize {
108 |         // This is ~Log2(precision)
109 |         64 - (self.precision as u64).leading_zeros() as usize
110 |     }
111 | 
112 |     /// Create a new float semantics with increased exponent with 'more'
113 |     /// additional digits.
114 |     pub fn increase_exponent(&self, more: usize) -> Semantics {
115 |         Semantics::new(self.exponent + more, self.precision, self.mode)
116 |     }
117 |     /// Create a new float semantics with a different rounding mode 'mode'.
118 |     pub fn with_rm(&self, rm: RoundingMode) -> Semantics {
119 |         Semantics::new(self.exponent, self.precision, rm)
120 |     }
121 | 
122 |     /// Returns the exponent bias for the number, as a positive number.
123 |     /// https://en.wikipedia.org/wiki/IEEE_754#Basic_and_interchange_formats
124 |     pub(crate) fn get_bias(&self) -> i64 {
125 |         let e = self.get_exponent_len();
126 |         ((1u64 << (e - 1)) - 1) as i64
127 |     }
128 |     /// Returns the upper and lower bounds of the exponent.
129 |     pub fn get_exp_bounds(&self) -> (i64, i64) {
130 |         let exp_min: i64 = -self.get_bias() + 1;
131 |         // The highest value is 0xFFFE, because 0xFFFF is used for signaling.
132 |         let exp_max: i64 = (1 << self.get_exponent_len()) - self.get_bias() - 2;
133 |         (exp_min, exp_max)
134 |     }
135 | }
136 | 
/// Declares the different categories of the floating point number. These
/// categories are internal to the float, and can be queried with the
/// accessors: is_inf, is_zero, is_nan, is_normal.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Category {
    /// Positive or negative infinity.
    Infinity,
    /// Not a number.
    NaN,
    /// Any value that is not a zero, an infinity or a NaN.
    Normal,
    /// Positive or negative zero.
    Zero,
}
147 | 
/// This is the main data structure of this library. It represents an
/// arbitrary-precision floating-point number.
///
/// The value is described by a sign, an exponent and a significand, and a
/// category that marks the values that are not regular numbers (zero,
/// infinity and NaN).
#[derive(Debug, Clone)]
pub struct Float {
    // The semantics of the float (precision, exponent range).
    sem: Semantics,
    // The Sign bit. True means negative.
    sign: bool,
    // The Exponent. It is stored without a bias: the number one has an
    // exponent of zero.
    exp: i64,
    // The significand, including the implicit bit, aligned to the right.
    // Format [00000001xxxxxxx].
    mantissa: BigInt,
    // The kind of number this float represents.
    category: Category,
}
164 | 
165 | impl Float {
166 |     pub(crate) fn get_mantissa_len(&self) -> usize {
167 |         self.sem.get_mantissa_len()
168 |     }
169 |     pub(crate) fn get_exponent_len(&self) -> usize {
170 |         self.sem.get_exponent_len()
171 |     }
172 | 
173 |     /// Create a new normal floating point number.
174 |     pub fn from_parts(
175 |         sem: Semantics,
176 |         sign: bool,
177 |         exp: i64,
178 |         mantissa: BigInt,
179 |     ) -> Self {
180 |         if mantissa.is_zero() {
181 |             return Float::zero(sem, sign);
182 |         }
183 |         Float {
184 |             sem,
185 |             sign,
186 |             exp,
187 |             mantissa,
188 |             category: Category::Normal,
189 |         }
190 |     }
191 | 
192 |     /// Create a new normal floating point number.
193 |     pub(crate) fn raw(
194 |         sem: Semantics,
195 |         sign: bool,
196 |         exp: i64,
197 |         mantissa: BigInt,
198 |         category: Category,
199 |     ) -> Self {
200 |         Float {
201 |             sem,
202 |             sign,
203 |             exp,
204 |             mantissa,
205 |             category,
206 |         }
207 |     }
208 | 
209 |     /// Returns a new zero float.
210 |     pub fn zero(sem: Semantics, sign: bool) -> Self {
211 |         Float {
212 |             sem,
213 |             sign,
214 |             exp: 0,
215 |             mantissa: BigInt::zero(),
216 |             category: Category::Zero,
217 |         }
218 |     }
219 | 
220 |     /// Returns a new float with the value one.
221 |     pub fn one(sem: Semantics, sign: bool) -> Self {
222 |         let mut one = BigInt::one();
223 |         one.shift_left(sem.get_mantissa_len());
224 |         Float {
225 |             sem,
226 |             sign,
227 |             exp: 0,
228 |             mantissa: one,
229 |             category: Category::Normal,
230 |         }
231 |     }
232 | 
233 |     /// Returns a new infinity float.
234 |     pub fn inf(sem: Semantics, sign: bool) -> Self {
235 |         Float {
236 |             sem,
237 |             sign,
238 |             exp: 0,
239 |             mantissa: BigInt::zero(),
240 |             category: Category::Infinity,
241 |         }
242 |     }
243 | 
244 |     /// Returns a new NaN float.
245 |     pub fn nan(sem: Semantics, sign: bool) -> Self {
246 |         Float {
247 |             sem,
248 |             sign,
249 |             exp: 0,
250 |             mantissa: BigInt::zero(),
251 |             category: Category::NaN,
252 |         }
253 |     }
254 |     /// Returns true if the Float is negative
255 |     pub fn is_negative(&self) -> bool {
256 |         self.sign
257 |     }
258 | 
259 |     /// Returns true if the Float is +-inf.
260 |     pub fn is_inf(&self) -> bool {
261 |         if let Category::Infinity = self.category {
262 |             return true;
263 |         }
264 |         false
265 |     }
266 | 
267 |     /// Returns true if the Float is a +- NaN.
268 |     pub fn is_nan(&self) -> bool {
269 |         if let Category::NaN = self.category {
270 |             return true;
271 |         }
272 |         false
273 |     }
274 | 
275 |     /// Returns true if the Float is a +- zero.
276 |     pub fn is_zero(&self) -> bool {
277 |         if let Category::Zero = self.category {
278 |             return true;
279 |         }
280 |         false
281 |     }
282 | 
283 |     /// Returns true if this number is normal (not Zero, Nan, Inf).
284 |     pub fn is_normal(&self) -> bool {
285 |         if let Category::Normal = self.category {
286 |             return true;
287 |         }
288 |         false
289 |     }
290 | 
291 |     /// Return the semantics of the number
292 |     pub fn get_semantics(&self) -> Semantics {
293 |         self.sem
294 |     }
295 | 
296 |     /// Returns the rounding mode of the number.
297 |     pub fn get_rounding_mode(&self) -> RoundingMode {
298 |         self.sem.get_rounding_mode()
299 |     }
300 | 
301 |     /// Update the sign of the float to `sign`. True means negative.
302 |     pub fn set_sign(&mut self, sign: bool) {
303 |         self.sign = sign
304 |     }
305 | 
306 |     /// Returns the sign of the float. True means negative.
307 |     pub fn get_sign(&self) -> bool {
308 |         self.sign
309 |     }
310 | 
311 |     /// Returns the mantissa of the float.
312 |     pub fn get_mantissa(&self) -> BigInt {
313 |         self.mantissa.clone()
314 |     }
315 | 
316 |     /// Returns the exponent of the float.
317 |     pub fn get_exp(&self) -> i64 {
318 |         self.exp
319 |     }
320 | 
321 |     /// Returns the category of the float.
322 |     pub fn get_category(&self) -> Category {
323 |         self.category
324 |     }
325 | 
326 |     /// Returns a new float which has a flipped sign (negated value).
327 |     pub fn neg(&self) -> Self {
328 |         Self::raw(
329 |             self.sem,
330 |             !self.sign,
331 |             self.exp,
332 |             self.mantissa.clone(),
333 |             self.category,
334 |         )
335 |     }
336 | 
337 |     /// Shift the mantissa to the left to ensure that the MSB if the mantissa
338 |     /// is set to the precision. The method updates the exponent to keep the
339 |     /// number correct.
340 |     pub(super) fn align_mantissa(&mut self) {
341 |         let bits =
342 |             self.sem.get_precision() as i64 - self.mantissa.msb_index() as i64;
343 |         if bits > 0 {
344 |             self.exp += bits;
345 |             self.mantissa.shift_left(bits as usize);
346 |         }
347 |     }
348 | 
349 |     /// Prints the number using the internal representation.
350 |     #[cfg(feature = "std")]
351 |     pub fn dump(&self) {
352 |         use std::println;
353 |         let sign = if self.sign { "-" } else { "+" };
354 |         match self.category {
355 |             Category::NaN => {
356 |                 println!("[{}NaN]", sign);
357 |             }
358 |             Category::Infinity => {
359 |                 println!("[{}Inf]", sign);
360 |             }
361 |             Category::Zero => {
362 |                 println!("[{}0.0]", sign);
363 |             }
364 |             Category::Normal => {
365 |                 let m = self.mantissa.as_binary();
366 |                 println!("FP[{} E={:4} M = {}]", sign, self.exp, m);
367 |             }
368 |         }
369 |     }
370 | 
371 |     #[cfg(not(feature = "std"))]
372 |     pub fn dump(&self) {
373 |         // No-op in no_std environments
374 |     }
375 | 
376 |     /// Returns the exponent bias for the number, as a positive number.
377 |     /// https://en.wikipedia.org/wiki/IEEE_754#Basic_and_interchange_formats
378 |     pub(crate) fn get_bias(&self) -> i64 {
379 |         self.sem.get_bias()
380 |     }
381 | 
382 |     /// Returns the upper and lower bounds of the exponent.
383 |     pub fn get_exp_bounds(&self) -> (i64, i64) {
384 |         self.sem.get_exp_bounds()
385 |     }
386 | }
387 | 
388 | // IEEE 754-2019
389 | // Table 3.5 — Binary interchange format parameters.
390 | use RoundingMode::NearestTiesToEven as nte;
391 | 
392 | /// Predefined BF16 float with 8 exponent bits, and 7 mantissa bits.
393 | pub const BF16: Semantics = Semantics::new(8, 8, nte);
394 | /// Predefined FP16 float with 5 exponent bits, and 10 mantissa bits.
395 | pub const FP16: Semantics = Semantics::new(5, 11, nte);
396 | /// Predefined FP32 float with 8 exponent bits, and 23 mantissa bits.
397 | pub const FP32: Semantics = Semantics::new(8, 24, nte);
398 | /// Predefined FP64 float with 11 exponent bits, and 52 mantissa bits.
399 | pub const FP64: Semantics = Semantics::new(11, 53, nte);
400 | /// Predefined FP128 float with 15 exponent bits, and 112 mantissa bits.
401 | pub const FP128: Semantics = Semantics::new(15, 113, nte);
402 | /// Predefined FP256 float with 19 exponent bits, and 236 mantissa bits.
403 | pub const FP256: Semantics = Semantics::new(19, 237, nte);
404 | 
405 | /// Shift `val` by `bits`, and report the loss.
406 | pub(crate) fn shift_right_with_loss(
407 |     val: &BigInt,
408 |     bits: usize,
409 | ) -> (BigInt, LossFraction) {
410 |     let mut val = val.clone();
411 |     let loss = val.get_loss_kind_for_bit(bits);
412 |     val.shift_right(bits);
413 |     (val, loss)
414 | }
415 | 
416 | /// Combine the loss of accuracy with `msb` more significant and `lsb`
417 | /// less significant.
418 | fn combine_loss_fraction(msb: LossFraction, lsb: LossFraction) -> LossFraction {
419 |     if !lsb.is_exactly_zero() {
420 |         if msb.is_exactly_zero() {
421 |             return LossFraction::LessThanHalf;
422 |         } else if msb.is_exactly_half() {
423 |             return LossFraction::MoreThanHalf;
424 |         }
425 |     }
426 |     msb
427 | }
428 | 
#[test]
fn shift_right_fraction() {
    // Drops the three lowest bits of `bits` and reports the loss.
    let loss_of = |bits: u64| {
        let value: BigInt = BigInt::from_u64(bits);
        shift_right_with_loss(&value, 3).1
    };

    // The dropped bits are 000: nothing is lost.
    assert!(loss_of(0b10000000).is_exactly_zero());
    // The dropped bits are 111: more than half.
    assert!(loss_of(0b10000111).is_mt_half());
    // The dropped bits are 100: exactly half.
    assert!(loss_of(0b10000100).is_exactly_half());
    // The dropped bits are 001: less than half.
    assert!(loss_of(0b10000001).is_lt_half());
}
447 | 
448 | impl Float {
449 |     /// The number overflowed, set the right value based on the rounding mode
450 |     /// and sign.
451 |     fn overflow(&mut self, rm: RoundingMode) {
452 |         let bounds = self.get_exp_bounds();
453 |         let inf = Self::inf(self.sem, self.sign);
454 |         let max = Self::from_parts(
455 |             self.sem,
456 |             self.sign,
457 |             bounds.1,
458 |             BigInt::all1s(self.get_mantissa_len()),
459 |         );
460 | 
461 |         *self = match rm {
462 |             RoundingMode::None => inf,
463 |             RoundingMode::NearestTiesToEven => inf,
464 |             RoundingMode::NearestTiesToAway => inf,
465 |             RoundingMode::Zero => max,
466 |             RoundingMode::Positive => {
467 |                 if self.sign {
468 |                     max
469 |                 } else {
470 |                     inf
471 |                 }
472 |             }
473 |             RoundingMode::Negative => {
474 |                 if self.sign {
475 |                     inf
476 |                 } else {
477 |                     max
478 |                 }
479 |             }
480 |         }
481 |     }
482 | 
483 |     /// Verify that the exponent is legal.
484 |     pub(crate) fn check_bounds(&self) {
485 |         let bounds = self.get_exp_bounds();
486 |         debug_assert!(self.exp >= bounds.0);
487 |         debug_assert!(self.exp <= bounds.1);
488 |         let max_mantissa = BigInt::one_hot(self.sem.get_precision());
489 |         debug_assert!(self.mantissa.lt(&max_mantissa));
490 |     }
491 | 
492 |     pub(crate) fn shift_significand_left(&mut self, amt: u64) {
493 |         self.exp -= amt as i64;
494 |         self.mantissa.shift_left(amt as usize);
495 |     }
496 | 
497 |     pub(crate) fn shift_significand_right(&mut self, amt: u64) -> LossFraction {
498 |         self.exp += amt as i64;
499 |         let res = shift_right_with_loss(&self.mantissa, amt as usize);
500 |         self.mantissa = res.0;
501 |         res.1
502 |     }
503 | 
504 |     /// Returns true if we need to round away from zero (increment the mantissa).
505 |     pub(crate) fn need_round_away_from_zero(
506 |         &self,
507 |         rm: RoundingMode,
508 |         loss: LossFraction,
509 |     ) -> bool {
510 |         debug_assert!(self.is_normal() || self.is_zero());
511 |         match rm {
512 |             RoundingMode::Positive => !self.sign,
513 |             RoundingMode::Negative => self.sign,
514 |             RoundingMode::Zero => false,
515 |             RoundingMode::None => false,
516 |             RoundingMode::NearestTiesToAway => loss.is_gte_half(),
517 |             RoundingMode::NearestTiesToEven => {
518 |                 if loss.is_mt_half() {
519 |                     return true;
520 |                 }
521 | 
522 |                 loss.is_exactly_half() && self.mantissa.is_odd()
523 |             }
524 |         }
525 |     }
526 | 
527 |     /// Returns true if the absolute value of the two numbers are the same.
528 |     pub(crate) fn same_absolute_value(&self, other: &Self) -> bool {
529 |         if self.category != other.category {
530 |             return false;
531 |         }
532 |         match self.category {
533 |             Category::Infinity => true,
534 |             Category::NaN => true,
535 |             Category::Zero => true,
536 |             Category::Normal => {
537 |                 self.exp == other.exp && self.mantissa == other.mantissa
538 |             }
539 |         }
540 |     }
541 | 
542 |     /// Normalize the number by adjusting the exponent to the legal range, shift
543 |     /// the mantissa to the msb, and round the number if bits are lost. This is
544 |     /// based on Neil Booth' implementation in APFloat.
545 |     pub(crate) fn normalize(&mut self, rm: RoundingMode, loss: LossFraction) {
546 |         if !self.is_normal() {
547 |             return;
548 |         }
549 |         let mut loss = loss;
550 |         let bounds = self.get_exp_bounds();
551 | 
552 |         let nmsb = self.mantissa.msb_index() as i64;
553 | 
554 |         // Step I - adjust the exponent.
555 |         if nmsb > 0 {
556 |             // Align the number so that the MSB bit will be MANTISSA + 1.
557 |             let mut exp_change = nmsb - self.sem.get_precision() as i64;
558 | 
559 |             // Handle overflowing exponents.
560 |             if self.exp + exp_change > bounds.1 {
561 |                 self.overflow(rm);
562 |                 self.check_bounds();
563 |                 return;
564 |             }
565 | 
566 |             // Handle underflowing low exponents. Don't allow to go below the
567 |             // legal exponent range.
568 |             if self.exp + exp_change < bounds.0 {
569 |                 exp_change = bounds.0 - self.exp;
570 |             }
571 | 
572 |             if exp_change < 0 {
573 |                 // Handle reducing the exponent.
574 |                 debug_assert!(loss.is_exactly_zero(), "losing information");
575 |                 self.shift_significand_left(-exp_change as u64);
576 |                 return;
577 |             }
578 | 
579 |             if exp_change > 0 {
580 |                 // Handle increasing the exponent.
581 |                 let loss2 = self.shift_significand_right(exp_change as u64);
582 |                 loss = combine_loss_fraction(loss2, loss);
583 |             }
584 |         }
585 | 
586 |         //Step II - round the number.
587 | 
588 |         // If nothing moved or the shift didn't mess things up then we're done.
589 |         if loss.is_exactly_zero() {
590 |             // Canonicalize to zero.
591 |             if self.mantissa.is_zero() {
592 |                 *self = Self::zero(self.sem, self.sign);
593 |                 return;
594 |             }
595 |             return;
596 |         }
597 | 
598 |         // Check if we need to round away from zero.
599 |         if self.need_round_away_from_zero(rm, loss) {
600 |             if self.mantissa.is_zero() {
601 |                 self.exp = bounds.0
602 |             }
603 | 
604 |             let one = BigInt::one();
605 |             self.mantissa = self.mantissa.clone() + one;
606 |             // Did the mantissa overflow?
607 |             let mut m = self.mantissa.clone();
608 |             m.shift_right(self.sem.get_precision());
609 |             if !m.is_zero() {
610 |                 // Can we fix the exponent?
611 |                 if self.exp < bounds.1 {
612 |                     self.shift_significand_right(1);
613 |                 } else {
614 |                     *self = Self::inf(self.sem, self.sign);
615 |                     return;
616 |                 }
617 |             }
618 |         }
619 | 
620 |         // Canonicalize.
621 |         if self.mantissa.is_zero() {
622 |             *self = Self::zero(self.sem, self.sign);
623 |         }
624 |     } // round.
625 | }
626 | 
627 | impl PartialEq for Float {
628 |     fn eq(&self, other: &Self) -> bool {
629 |         let bitwise = self.sign == other.sign
630 |             && self.exp == other.exp
631 |             && self.mantissa == other.mantissa
632 |             && self.category == other.category;
633 | 
634 |         match self.category {
635 |             Category::Infinity | Category::Normal => bitwise,
636 |             Category::Zero => other.is_zero(),
637 |             Category::NaN => false,
638 |         }
639 |     }
640 | }
641 | 
642 | /// Page 66. Chapter 3. Floating-Point Formats and Environment
643 | /// Table 3.8: Comparison predicates and the four relations.
644 | ///   and
645 | /// IEEE 754-2019 section 5.10 - totalOrder.
646 | impl PartialOrd for Float {
647 |     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
648 |         debug_assert_eq!(self.get_semantics(), other.get_semantics());
649 |         let bool_to_ord = |ord: bool| -> Option<Ordering> {
650 |             if ord {
651 |                 Some(Ordering::Less)
652 |             } else {
653 |                 Some(Ordering::Greater)
654 |             }
655 |         };
656 | 
657 |         match (self.category, other.category) {
658 |             (Category::NaN, _) | (_, Category::NaN) => None,
659 |             (Category::Zero, Category::Zero) => Some(Ordering::Equal),
660 |             (Category::Infinity, Category::Infinity) => {
661 |                 if self.sign == other.sign {
662 |                     Some(Ordering::Equal)
663 |                 } else {
664 |                     bool_to_ord(self.sign)
665 |                 }
666 |             }
667 |             (Category::Infinity, Category::Normal)
668 |             | (Category::Infinity, Category::Zero)
669 |             | (Category::Normal, Category::Zero) => bool_to_ord(self.sign),
670 | 
671 |             (Category::Normal, Category::Infinity)
672 |             | (Category::Zero, Category::Infinity)
673 |             | (Category::Zero, Category::Normal) => bool_to_ord(!other.sign),
674 | 
675 |             (Category::Normal, Category::Normal) => {
676 |                 if self.sign != other.sign {
677 |                     bool_to_ord(self.sign)
678 |                 } else if self.exp < other.exp {
679 |                     bool_to_ord(!self.sign)
680 |                 } else if self.exp > other.exp {
681 |                     bool_to_ord(self.sign)
682 |                 } else {
683 |                     match self.mantissa.cmp(&other.mantissa) {
684 |                         Ordering::Less => bool_to_ord(!self.sign),
685 |                         Ordering::Equal => Some(Ordering::Equal),
686 |                         Ordering::Greater => bool_to_ord(self.sign),
687 |                     }
688 |                 }
689 |             }
690 |         }
691 |     }
692 | }
693 | 
#[cfg(feature = "std")]
#[test]
fn test_comparisons() {
    use super::utils;

    // Go over every pair of special values, and check that the <,>,==
    // operators on Float give the same answers as the ones on doubles.
    for lhs in utils::get_special_test_values() {
        for rhs in utils::get_special_test_values() {
            let (want_lt, want_eq, want_gt) = (lhs < rhs, lhs == rhs, lhs > rhs);
            let lhs = Float::from_f64(lhs);
            let rhs = Float::from_f64(rhs);
            assert_eq!(want_lt, lhs < rhs, "<");
            assert_eq!(want_eq, lhs == rhs, "==");
            assert_eq!(want_gt, lhs > rhs, ">");
        }
    }
}
714 | 
#[test]
fn test_one_imm() {
    // The number one must read back as 1.0 for a non-standard format too.
    let custom = Semantics::new(10, 12, nte);
    let one = Float::one(custom, false);
    assert_eq!(one.as_f64(), 1.0);
}
721 | 
#[test]
pub fn test_bigint_ctor() {
    // Make sure that we can load numbers of the highest border of the FP16
    // number.
    let from_int = Float::from_bigint(FP16, BigInt::from_u64(65519));
    assert_eq!(from_int.cast(FP32).to_i64(), 65504);
    let from_double = Float::from_f64(65519.).cast(FP16);
    assert_eq!(from_double.to_i64(), 65504);

    // Make sure that we can load numbers that are greater than the precision
    // and that normalization fixes and moves things to the right place.
    let narrow = Semantics::new(40, 10, nte);
    let wide_int = BigInt::from_u64(1 << 14);
    assert_eq!(Float::from_bigint(narrow, wide_int).to_i64(), 1 << 14);
}
737 | 
#[test]
pub fn test_semantics_size() {
    // Each format, next to the expected log2 approximation of its precision.
    let expected = [(FP16, 4), (FP32, 5), (FP64, 6), (FP128, 7)];
    for &(sem, log2) in expected.iter() {
        assert_eq!(sem.log_precision(), log2);
    }
}
745 | 
746 | impl Semantics {
747 |     /// Returns the maximum value of the number.
748 |     pub fn get_max_positive_value(&self) -> Float {
749 |         let exp = self.get_exp_bounds().1;
750 |         let mantissa = BigInt::all1s(self.get_precision());
751 |         Float::from_parts(*self, false, exp, mantissa)
752 |     }
753 | 
754 |     /// Returns the minimum positive value of the number (subnormal).
755 |     /// See https://en.wikipedia.org/wiki/IEEE_754
756 |     pub fn get_min_positive_value(&self) -> Float {
757 |         let exp = self.get_exp_bounds().0;
758 |         let mantissa = BigInt::one();
759 |         Float::from_parts(*self, false, exp, mantissa)
760 |     }
761 | 
762 |     /// Reports whether 'val' can be converted to this format without any
763 |     /// loss of accuracy. That holds when the exponent of 'val' lies within
764 |     /// the exponent bounds of this format and the significant bits of its
765 |     /// mantissa fit in the precision of this format.
766 |     pub fn can_represent_exactly(&self, val: &Float) -> bool {
767 |         // Inf, NaN and Zero exist in every format.
768 |         if !val.is_normal() {
769 |             return true;
770 |         }
771 | 
772 |         // A value whose own format is no wider than this one, in both
773 |         // precision and exponent length, always fits.
774 |         let src = val.get_semantics();
775 |         let precision_fits = src.get_precision() <= self.get_precision();
776 |         let exponent_fits = src.get_exponent_len() <= self.get_exponent_len();
777 |         if precision_fits && exponent_fits {
778 |             return true;
779 |         }
780 | 
781 |         // The exponent has to fall inside the bounds of this format.
782 |         let (lowest, highest) = self.get_exp_bounds();
783 |         if !(lowest..=highest).contains(&val.get_exp()) {
784 |             return false;
785 |         }
786 | 
787 |         // A zero mantissa carries no bits that could be lost.
788 |         let mantissa = val.get_mantissa();
789 |         if mantissa.is_zero() {
790 |             return true;
791 |         }
792 | 
793 |         // Count the bits between the most significant set bit and the lowest
794 |         // set bit. msb_index is 1-based, which makes the difference an
795 |         // inclusive count.
796 |         let used_bits = mantissa.msb_index() - mantissa.trailing_zeros();
797 |         used_bits <= self.get_precision()
798 |     }
800 | }
801 | 
802 | #[test]
803 | fn test_min_max_val() {
804 |     // Largest finite value of each format, compared as f64.
805 |     let largest = [
806 |         (FP16, 65504.0),
807 |         (FP32, f32::MAX as f64),
808 |         (FP64, f64::MAX),
809 |     ];
810 |     for (sem, expected) in largest {
811 |         assert_eq!(sem.get_max_positive_value().as_f64(), expected);
812 |     }
813 | 
814 |     // Smallest positive value: the bit pattern with only the lowest bit set.
815 |     let tiny32 = FP32.get_min_positive_value();
816 |     assert_eq!(tiny32.as_f32(), f32::from_bits(0b01));
817 |     let tiny64 = FP64.get_min_positive_value();
818 |     assert_eq!(tiny64.as_f64(), f64::from_bits(0b01));
819 | }
810 | 
811 | #[test]
812 | fn test_can_represent_exactly() {
813 |     // Plain f64 inputs checked against the FP16 format.
814 |     let fits_fp16 = |v: f64| FP16.can_represent_exactly(&Float::from_f64(v));
815 |     assert!(fits_fp16(1.0));
816 |     assert!(fits_fp16(65504.0));
817 |     assert!(!fits_fp16(65504.1));
818 |     assert!(!fits_fp16(0.0001));
819 | 
820 |     // FP32 values whose mantissas are 10, 11 and 12 bits wide.
821 |     let fp32_with = |bits: u64| {
822 |         Float::from_parts(FP32, false, 0, BigInt::from_u64(bits))
823 |     };
824 |     assert!(FP16.can_represent_exactly(&fp32_with(0b1000000001)));
825 |     assert!(FP16.can_represent_exactly(&fp32_with(0b10000000001)));
826 |     assert!(!FP16.can_represent_exactly(&fp32_with(0b100000000001)));
827 | 
828 |     // The same constant computed in a narrower and in a wider format.
829 |     let pi32 = Float::pi(FP32);
830 |     let pi64 = Float::pi(FP64);
831 |     assert!(FP32.can_represent_exactly(&pi32));
832 |     assert!(!FP32.can_represent_exactly(&pi64));
833 |     assert!(FP64.can_represent_exactly(&pi32));
834 | }
834 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //!
  2 | //! ARPFloat is an implementation of arbitrary precision
  3 | //! [floating point](https://en.wikipedia.org/wiki/IEEE_754) data
  4 | //! structures and utilities. The library can be used to emulate existing
  5 | //! floating point types, such as FP16, FP32 or FP128, and create new
  6 | //! floating-point types that scale to hundreds of digits, and perform very
  7 | //! accurate calculations. The library contains mathematical functions such as
  8 | //! `log`, `exp`, `sin`, `cos`, `tan`, and constants such as `pi` and `e`.
  9 | //!
 10 | //! In ARPFloat the rounding mode is a part of the type-system, and this solves
 11 | //! a number of problems that show up when using the global rounding flag
 12 | //! that's defined in `fenv.h`.
 13 | //!
 14 | //! ## no_std
 15 | //! The library can be built without the standard library.
 16 | //!
 17 | //! ### Example
 18 | //!```
 19 | //!  use arpfloat::Float;
 20 | //!  use arpfloat::FP128;
 21 | //!
 22 | //!  // Create the number '5' in FP128 format.
 23 | //!  let n = Float::from_f64(5.).cast(FP128);
 24 | //!
 25 | //!  // Use Newton-Raphson to find the square root of 5.
 26 | //!  let mut x = n.clone();
 27 | //!  for _ in 0..20 {
 28 | //!      x += (&n / &x)/2;
 29 | //!  }
 30 | //!
 31 | //!  println!("fp128: {}", x);
 32 | //!  println!("fp64:  {}", x.as_f64());
 33 | //! ```
 34 | //!
 35 | //!The program above will print this output:
 36 | //!```console
 37 | //!fp128: 2.2360679774997896964091736687312763
 38 | //!fp64:  2.23606797749979
 39 | //!```
 40 | //!
 41 | //!The library also provides an API that exposes rounding modes and low-level
 42 | //!operations.
 43 | //!
 44 | //!```
 45 | //!    use arpfloat::FP128;
 46 | //!    use arpfloat::RoundingMode::NearestTiesToEven;
 47 | //!    use arpfloat::Float;
 48 | //!
 49 | //!    let x = Float::from_u64(FP128, 1<<53);
 50 | //!    let y = Float::from_f64(1000.0).cast(FP128);
 51 | //!
 52 | //!    let val = Float::mul_with_rm(&x, &y, NearestTiesToEven);
 53 | //! ```
 54 | //!
 55 | //! View the internal representation of floating point numbers:
 56 | //! ```
 57 | //!    use arpfloat::Float;
 58 | //!    use arpfloat::FP16;
 59 | //!
 60 | //!    let fp = Float::from_i64(FP16, 15);
 61 | //!
 62 | //!    fp.dump(); // Prints FP[+ E=+3 M=11110000000]
 63 | //!
 64 | //!    let m = fp.get_mantissa();
 65 | //!    m.dump(); // Prints 11110000000
 66 | //!```
 67 | //!
 68 | //! Control the rounding mode for type conversion:
 69 | //!```
 70 | //!    use arpfloat::{FP16, FP32, RoundingMode, Float};
 71 | //!
 72 | //!    let x = Float::from_u64(FP32, 2649);              // Load an FP32 Value.
 73 | //!    let b = x.cast_with_rm(FP16, RoundingMode::Zero); // Convert to FP16.
 74 | //!    println!("{}", b);                                // Prints 2648!
 75 | //!```
 76 | //!
 77 | //! Define new float formats and use high-precision transcendental functions:
 78 | //!```
 79 | //!  use arpfloat::{Float, Semantics, RoundingMode};
 80 | //!  // Define a new float format with 120 bits of accuracy, and dynamic range
 81 | //!  // of 2^10.
 82 | //!  let sem = Semantics::new(10, 120, RoundingMode::NearestTiesToEven);
 83 | //!
 84 | //!  let pi = Float::pi(sem);
 85 | //!  let x = Float::exp(&pi);
 86 | //!  println!("e^pi = {}", x); // Prints 23.1406926327792....
 87 | //!```
 88 | //!
 89 | //! Floating point numbers can be converted to
 90 | //! [Continued Fractions](https://en.wikipedia.org/wiki/Continued_fraction) that
 91 | //! approximate the value.
 92 | //!
 93 | //! ```rust
 94 | //!  use arpfloat::{Float, FP256, RoundingMode};
 95 | //!
 96 | //!  let ln = Float::ln2(FP256);
 97 | //!  println!("ln(2) = {}", ln);
 98 | //!  for i in 1..20 {
 99 | //!    let (p,q) = ln.as_fraction(i);
100 | //!    println!("{}/{}", p.as_decimal(), q.as_decimal());
101 | //!  }
102 | //! ```
103 | //!The program above will print this output:
104 | //!```console
105 | //!  ln(2) = .6931471805599453094172321214581765680755001343.....
106 | //!  0/1
107 | //!  1/1
108 | //!  2/3
109 | //!  7/10
110 | //!  9/13
111 | //!  61/88
112 | //!  192/277
113 | //!  253/365
114 | //!  445/642
115 | //!  1143/1649
116 | //!  1588/2291
117 | //!  2731/3940
118 | //!  ....
119 | //!```
120 | 
121 | #![no_std]
122 | 
123 | #[cfg(feature = "std")]
124 | extern crate std;
125 | 
126 | mod arithmetic;
127 | mod bigint;
128 | mod cast;
129 | mod float;
130 | mod operations;
131 | mod string;
132 | mod utils;
133 | 
134 | pub use self::bigint::BigInt;
135 | pub use self::float::Float;
136 | pub use self::float::RoundingMode;
137 | pub use self::float::Semantics;
138 | pub use self::float::{BF16, FP128, FP16, FP256, FP32, FP64};
139 | 
140 | // Conditionally include a module based on feature flag
141 | #[cfg(feature = "python")]
142 | pub mod py;
143 | 


--------------------------------------------------------------------------------
/src/operations/constants.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the implementation of methods that compute mathematical
  2 | //! constants.
  3 | //!
  4 | use crate::RoundingMode;
  5 | use crate::{Float, Semantics};
  6 | 
  7 | impl Float {
  8 |     /// Computes pi in the format described by `sem`.
  9 |     ///
 10 |     /// The iteration runs in a widened format (`sem.grow_log(4)`) and the
 11 |     /// result is cast back to `sem` at the end.
 12 |     pub fn pi(sem: Semantics) -> Self {
 13 |         // Algorithm description in Pg 246:
 14 |         // Fast Multiple-Precision Evaluation of Elementary Functions
 15 |         // by Richard P. Brent.
 16 | 
 17 |         // Increase the precision, because the arithmetic operations below
 18 |         // require rounding, so if we want to get the accurate results we need
 19 |         // to operate with increased precision.
 20 |         let orig_sem = sem;
 21 |         let sem = sem.grow_log(4);
 22 | 
 23 |         use RoundingMode::NearestTiesToEven as rm;
 24 | 
 25 |         let one = Self::from_i64(sem, 1);
 26 |         let two = Self::from_i64(sem, 2);
 27 |         let four = Self::from_i64(sem, 4);
 28 | 
 29 |         let mut a = one.clone();
 30 |         let mut b = one.clone() / two.sqrt();
 31 |         let mut t = one.clone() / four;
 32 |         let mut x = one;
 33 | 
 34 |         // Iterate until 'a' and 'b' agree exactly in the widened format.
 35 |         while a != b {
 36 |             let y = a.clone();
 37 |             a = (&a + &b).scale(-1, rm); // (a + b) / 2
 38 |             b = (&b * &y).sqrt();
 39 |             t -= &x * (&a - &y).sqr();
 40 |             x = x.scale(1, rm); // x * 2
 41 |         }
 42 |         (a.sqr() / t).cast(orig_sem)
 43 |     }
 44 | 
 45 |     /// Computes e in the format described by `sem`.
 46 |     ///
 47 |     /// The value is evaluated with one extra bit of precision and then cast
 48 |     /// back to `sem`.
 49 |     pub fn e(sem: Semantics) -> Self {
 50 |         let orig_sem = sem;
 51 |         let sem = sem.increase_precision(1);
 52 | 
 53 |         let one = Self::one(sem, false);
 54 |         let mut term = one.clone();
 55 | 
 56 |         // Use Euler's continued fraction, which is a simple series.
 57 |         // NOTE(review): the number of levels is derived from the exponent
 58 |         // length rather than from the precision; confirm that this is enough
 59 |         // for formats that pair a short exponent with a long mantissa.
 60 |         let iterations: i64 = (sem.get_exponent_len() * 2) as i64;
 61 |         // Evaluate the fraction from the innermost level outwards.
 62 |         for i in (1..iterations).rev() {
 63 |             let v = Self::from_i64(sem, i);
 64 |             term = &v + &v / &term;
 65 |         }
 66 | 
 67 |         (one / term + 2).cast(orig_sem)
 68 |     }
 69 | 
 70 |     /// Computes ln(2) in the format described by `sem`.
 71 |     ///
 72 |     /// The sum of 1 / (k * 2^k) is accumulated with 8 extra bits of precision
 73 |     /// until adding another term no longer changes it, and is then cast back
 74 |     /// to `sem`.
 75 |     pub fn ln2(sem: Semantics) -> Self {
 76 |         use RoundingMode::None as rm;
 77 |         let sem2 = sem.increase_precision(8);
 78 | 
 79 |         // Term 'k' is smaller than 2^-k, so summing past the working
 80 |         // precision cannot change the result. A fixed cap of 500 terms would
 81 |         // stop wide formats before the series converges, so the cap follows
 82 |         // the precision and 500 is only kept as the lower bound.
 83 |         let max_terms = (sem2.get_precision() as i64 + 2).max(500);
 84 | 
 85 |         // Represent log(2) using the sum 1/k*2^k
 86 |         let one = Self::one(sem2, false);
 87 |         let mut sum = Self::zero(sem2, false);
 88 |         // Start from a value the sum can never equal, so that the first
 89 |         // comparison below does not end the loop.
 90 |         let mut prev = Self::inf(sem2, true);
 91 |         for k in 1..max_terms {
 92 |             let k2 = Self::from_u64(sem2, 1).scale(k, rm); // 2^k
 93 |             let k = Self::from_u64(sem2, k as u64);
 94 |             let kk2 = &Float::mul_with_rm(&k, &k2, rm);
 95 |             let term = Float::div_with_rm(&one, kk2, rm);
 96 |             sum = Float::add_with_rm(&sum, &term, rm);
 97 |             if prev == sum {
 98 |                 break; // Stop if we are not making progress.
 99 |             }
100 |             prev = sum.clone();
101 |         }
102 |         sum.cast(sem)
103 |     }
104 | }
 82 | 
 83 | #[cfg(feature = "std")]
 84 | #[test]
 85 | fn test_pi() {
 86 |     use crate::{FP32, FP64};
 87 |     // The computed constant must match the standard library bit for bit.
 88 |     let pi64 = Float::pi(FP64).as_f64();
 89 |     let pi32 = Float::pi(FP32).as_f32();
 90 |     assert_eq!(pi64, std::f64::consts::PI);
 91 |     assert_eq!(pi32, std::f32::consts::PI);
 92 | }
 91 | 
 92 | #[cfg(feature = "std")]
 93 | #[test]
 94 | fn test_e() {
 95 |     use crate::{FP32, FP64};
 96 |     // The computed constant must match the standard library bit for bit.
 97 |     let e64 = Float::e(FP64).as_f64();
 98 |     let e32 = Float::e(FP32).as_f32();
 99 |     assert_eq!(e64, std::f64::consts::E);
100 |     assert_eq!(e32, std::f32::consts::E);
101 | }
100 | 
101 | #[cfg(feature = "std")]
102 | #[test]
103 | fn test_ln2() {
104 |     use crate::FP64;
105 |     // The computed constant must match the standard library bit for bit.
106 |     let ln2 = Float::ln2(FP64).as_f64();
107 |     assert_eq!(ln2, std::f64::consts::LN_2);
108 | }
107 | 


--------------------------------------------------------------------------------
/src/operations/exp.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the implementation of log- and exp-related methods.
  2 | //!
  3 | 
  4 | use crate::RoundingMode;
  5 | 
  6 | use crate::float::Float;
  7 | 
  8 | impl Float {
  9 |     /// Computes the taylor series, centered around 1, and valid in [0..2].
 10 |     /// z = (x - 1)/(x + 1)
 11 |     /// log(x) = 2 (z + z^3/3 + z^5/5 + z^7/7 ... )
 12 |     ///
 13 |     /// Terms are added until the sum stops changing. The number of terms is
 14 |     /// capped by the precision of 'x' (and never below 50), so that wide
 15 |     /// formats are not cut off before the series has converged.
 16 |     fn log_taylor(x: &Self) -> Self {
 17 |         use RoundingMode::None as rm;
 18 |         let sem = x.get_semantics();
 19 |         let one = Self::one(sem, false);
 20 |         let up = Float::sub_with_rm(x, &one, rm);
 21 |         let down = Float::add_with_rm(x, &one, rm);
 22 |         let z = Float::div_with_rm(&up, &down, rm);
 23 |         let z2 = z.sqr();
 24 | 
 25 |         // For the reduced inputs handed over by log_range_reduce every term
 26 |         // is far smaller than the one before it, so 'precision' terms are
 27 |         // more than enough. The loop normally exits much earlier.
 28 |         let max_terms = (sem.get_precision() as u64).max(50);
 29 | 
 30 |         let mut top = z;
 31 |         let mut sum = Self::zero(sem, false);
 32 |         // Start from -1, which differs from the initial sum, so that the
 33 |         // first comparison below does not end the loop.
 34 |         let mut prev = Self::one(sem, true);
 35 |         for i in 0..max_terms {
 36 |             if prev == sum {
 37 |                 break; // Stop if we are not making progress.
 38 |             }
 39 |             prev = sum.clone();
 40 | 
 41 |             let bottom = &Self::from_u64(sem, i * 2 + 1);
 42 |             let elem = Float::div_with_rm(&top, bottom, rm);
 43 |             sum = Float::add_with_rm(&sum, &elem, rm);
 44 | 
 45 |             // Prepare the next iteration.
 46 |             top = Float::mul_with_rm(&top, &z2, rm);
 47 |         }
 48 | 
 49 |         // Apply the leading factor of two from the formula above.
 50 |         sum.scale(1, RoundingMode::Zero)
 51 |     }
 52 | 
 53 |     /// Reduce the range of 'x' with the identity:
 54 |     /// ln(x) = ln(sqrt(x)^2) = 2 * ln(sqrt(x)) and
 55 |     /// ln(x) = -ln(1/x)
 56 |     fn log_range_reduce(x: &Self) -> Self {
 57 |         use RoundingMode::NearestTiesToEven as even;
 58 |         let sem = x.get_semantics();
 59 |         let up = Self::from_f64(1.001).cast(sem);
 60 |         let one = Self::from_u64(sem, 1);
 61 | 
 62 |         // Above the upper limit: take the square root and double the result.
 63 |         if x > &up {
 64 |             let sx = x.sqrt();
 65 |             return Self::log_range_reduce(&sx).scale(1, even);
 66 |         }
 67 | 
 68 |         // Below one: work on the reciprocal and negate the result.
 69 |         if x < &one {
 70 |             let re = Float::div_with_rm(&one, x, RoundingMode::None);
 71 |             return Self::log_range_reduce(&re).neg();
 72 |         }
 73 | 
 74 |         Self::log_taylor(x)
 75 |     }
 76 | 
 77 |     /// Computes the natural logarithm of this number.
 78 |     ///
 79 |     /// Special values: NaN and negative inputs produce NaN, zero (of either
 80 |     /// sign) produces -Inf, and +Inf produces +Inf.
 81 |     pub fn log(&self) -> Self {
 82 |         use RoundingMode::None as rm;
 83 |         let orig_sem = self.get_semantics();
 84 | 
 85 |         //Fast Logarithm function for Arbitrary Precision number,
 86 |         // by Henrik Vestermark.
 87 | 
 88 |         // Handle all of the special cases:
 89 |         if self.is_nan() {
 90 |             return Self::nan(orig_sem, self.get_sign());
 91 |         } else if self.is_zero() {
 92 |             return Self::inf(orig_sem, true); // log(0) = -Inf.
 93 |         } else if self.is_negative() {
 94 |             return Self::nan(orig_sem, self.get_sign());
 95 |         } else if self.is_inf() {
 96 |             return Self::inf(orig_sem, false); // log(+Inf) = +Inf.
 97 |         } else if !self.is_normal() {
 98 |             // Any other non-normal category keeps producing NaN.
 99 |             return Self::nan(orig_sem, self.get_sign());
100 |         }
101 | 
102 |         // Work in a wider format and narrow the result at the end.
103 |         let sem = orig_sem.grow_log(10).increase_exponent(10);
104 | 
105 |         let x = &self.cast_with_rm(sem, rm);
106 |         Self::log_range_reduce(x).cast_with_rm(orig_sem, rm)
107 |     }
108 | }
 83 | 
 84 | #[test]
 85 | fn test_log() {
 86 |     use crate::FP128;
 87 |     // Evaluates the logarithm in FP128 and narrows the result to f64.
 88 |     let log_of = |v: f64| Float::from_f64(v).cast(FP128).log().as_f64();
 89 |     assert_eq!(log_of(0.1), -2.3025850929940455);
 90 | 
 91 |     // The library result must match the native f64 implementation.
 92 |     let samples: &[f64] = &[
 93 |         0.1, 0.5, 2.3, 4.5, 9.8, 11.2, 15.2, 91.2, 102.2, 192.4, 1024.2,
 94 |         90210.2,
 95 |     ];
 96 |     for &v in samples {
 97 |         assert_eq!(log_of(v), v.ln());
 98 |     }
 99 | }
 99 | 
100 | impl Float {
101 |     /// Computes the taylor series:
102 |     /// exp(x) = 1 + x/1! + x^2/2! + x^3/3! ...
103 |     ///
104 |     /// Terms are added until the sum stops changing. The number of terms is
105 |     /// capped by the precision of 'x' (and never below 50). A fixed cap of
106 |     /// 50 terms leaves an error of roughly 1/49! for x close to one, which
107 |     /// is too coarse for formats wider than about 200 bits.
108 |     fn exp_taylor(x: &Self) -> Self {
109 |         let sem = x.get_semantics();
110 |         use crate::bigint::BigInt;
111 |         let mut top = Self::one(sem, false);
112 |         let mut bottom = BigInt::one();
113 | 
114 |         // With x <= 1, as arranged by exp_range_reduce, term k is at most
115 |         // 1/k!, so 'precision' terms always suffice. The loop normally exits
116 |         // much earlier.
117 |         let max_terms = (sem.get_precision() as u64).max(50);
118 | 
119 |         let mut sum = Self::zero(sem, false);
120 |         // Start from -1, which differs from the initial sum, so that the
121 |         // first comparison below does not end the loop.
122 |         let mut prev = Self::one(sem, true);
123 |         for k in 1..max_terms {
124 |             if prev == sum {
125 |                 break; // Stop if we are not making progress.
126 |             }
127 |             prev = sum.clone();
128 | 
129 |             let elem = &top / &Self::from_bigint(sem, bottom.clone());
130 |             sum += elem;
131 | 
132 |             // Prepare the next iteration.
133 |             bottom *= BigInt::from_u64(k);
134 |             top = &top * x;
135 |         }
136 | 
137 |         sum
138 |     }
139 | 
140 |     /// Reduce the range of 'x' with the identity:
141 |     /// e^x = (e^(x/2))^2
142 |     /// The identity is applied three times per step: 'x' is divided by
143 |     /// eight and the result is squared three times.
144 |     fn exp_range_reduce(x: &Self) -> Self {
145 |         let sem = x.get_semantics();
146 | 
147 |         let one = Self::from_u64(sem, 1);
148 | 
149 |         if x > &one {
150 |             let sx = x.scale(-3, RoundingMode::Zero);
151 |             let esx = Self::exp_range_reduce(&sx);
152 |             return esx.sqr().sqr().sqr();
153 |         }
154 | 
155 |         Self::exp_taylor(x)
156 |     }
157 | 
158 |     /// Computes exponential function `e^self`.
159 |     ///
160 |     /// Special values: zero produces one, NaN produces NaN, -Inf produces
161 |     /// zero and +Inf produces +Inf.
162 |     pub fn exp(&self) -> Self {
163 |         let orig_sem = self.get_semantics();
164 | 
165 |         // Handle all of the special cases:
166 |         if self.is_zero() {
167 |             return Self::one(orig_sem, false);
168 |         } else if self.is_nan() {
169 |             return Self::nan(orig_sem, self.get_sign());
170 |         } else if self.is_inf() {
171 |             // e^-Inf = 0 and e^+Inf = +Inf.
172 |             return if self.get_sign() {
173 |                 Self::zero(orig_sem, false)
174 |             } else {
175 |                 Self::inf(orig_sem, false)
176 |             };
177 |         } else if !self.is_normal() {
178 |             // Any other non-normal category keeps producing NaN.
179 |             return Self::nan(orig_sem, self.get_sign());
180 |         }
181 | 
182 |         // Work in a wider format and narrow the result at the end.
183 |         let sem = orig_sem.grow_log(10).increase_exponent(10);
184 | 
185 |         // Handle the negative values: e^-x = 1 / e^x.
186 |         if self.is_negative() {
187 |             let one = Self::one(sem, false);
188 |             return (one / self.cast(sem).neg().exp()).cast(orig_sem);
189 |         }
190 | 
191 |         Self::exp_range_reduce(&self.cast(sem)).cast(orig_sem)
192 |     }
193 | }
167 | 
168 | #[test]
169 | fn test_exp() {
170 |     // Evaluates e^v through the library and narrows the result to f64.
171 |     let exp_of = |v: f64| Float::from_f64(v).exp().as_f64();
172 |     assert_eq!(exp_of(2.51), 12.30493006051041);
173 | 
174 |     // The library result must match the native f64 implementation.
175 |     let samples: &[f64] = &[
176 |         0.000003, 0.001, 0.12, 0.13, 0.5, 1.2, 2.3, 4.5, 9.8, 5.0, 11.2, 15.2,
177 |         25.0, 34.001, 54., 89.1, 91.2, 102.2, 150., 192.4, 212., 256., 102.3,
178 |     ];
179 |     for &v in samples {
180 |         assert_eq!(exp_of(v), v.exp());
181 |     }
182 | }
181 | 
182 | impl Float {
183 |     /// Computes the sigmoid function of this number.
184 |     /// Defined as 1 / (1 + e^(-x)), evaluated here as e^x / (e^x + 1).
185 |     ///
186 |     /// Special values: NaN is returned unchanged, -Inf produces zero, +Inf
187 |     /// produces one and zero produces one half.
188 |     pub fn sigmoid(&self) -> Self {
189 |         // https://en.wikipedia.org/wiki/Sigmoid_function
190 |         let sem = self.get_semantics();
191 |         let one = Self::one(sem, false);
192 | 
193 |         if self.is_nan() {
194 |             return self.clone();
195 |         } else if self.is_inf() {
196 |             // The limits are 0 at -Inf and 1 at +Inf. Reusing the sign of the
197 |             // input on 'one' would produce -1 for -Inf.
198 |             return if self.get_sign() {
199 |                 Self::zero(sem, false)
200 |             } else {
201 |                 one
202 |             };
203 |         } else if self.is_zero() {
204 |             use RoundingMode::Zero as rm;
205 |             return one.scale(-1, rm); // 1/2
206 |         }
207 | 
208 |         let ex = self.exp();
209 |         // When e^x overflows, the quotient below would be Inf / Inf (NaN).
210 |         // The function has saturated at one by then.
211 |         if ex.is_inf() {
212 |             return one;
213 |         }
214 |         &ex / (&ex + &one)
215 |     }
216 | }
202 | 
203 | #[test]
204 | pub fn test_sigmoid() {
205 |     // Each pair holds an input and the expected output. The expected values
206 |     // were generated with the python program:
207 |     //
208 |     // import numpy as np
209 |     // array = np.array([-0.5, 0, 0.5, 0.99, 1.0, 2.3, 100.0])
210 |     // def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))
211 |     let cases = [
212 |         (-0.5, 0.37754067),
213 |         (0., 0.5),
214 |         (0.5, 0.62245933),
215 |         (0.99, 0.72908792),
216 |         (1., 0.73105858),
217 |         (2.3, 0.90887704),
218 |         (100., 1.),
219 |     ];
220 | 
221 |     // Compare at f32 accuracy, which is what the reference values carry.
222 |     for (input, expected) in cases {
223 |         let expected = Float::from_f64(expected);
224 |         let actual = Float::from_f64(input).sigmoid();
225 |         assert_eq!(expected.as_f32(), actual.as_f32())
226 |     }
227 | }
222 | 


--------------------------------------------------------------------------------
/src/operations/frac.rs:
--------------------------------------------------------------------------------
 1 | //! This module contains the implementation of methods that compute continued
 2 | //! fraction.
 3 | 
 4 | use crate::{bigint::BigInt, Float};
 5 | 
 6 | impl Float {
 7 |     /// Approximates the number with a fraction of two integers, built from
 8 |     /// the first 'n' coefficients of its continued fraction:
 9 |     /// a0 + 1/(a1 + 1/(a2 + 1/( ... ))).
10 |     /// The sign is discarded. Zero yields (0, 1); Inf and NaN yield (0, 0).
11 |     pub fn as_fraction(&self, n: usize) -> (BigInt, BigInt) {
12 |         if self.is_zero() {
13 |             return (BigInt::zero(), BigInt::one()); // Zero.
14 |         }
15 |         if self.is_inf() || self.is_nan() {
16 |             return (BigInt::zero(), BigInt::zero()); // Invalid.
17 |         }
18 | 
19 |         // Algorithm from:
20 |         // Elementary Functions: Algorithms and Implementation
21 |         // 9.3.1 A few basic notions on continued fractions - Page 180.
22 |         extern crate alloc;
23 |         use alloc::vec::Vec;
24 |         let sem = self.get_semantics();
25 |         let rm = sem.get_rounding_mode();
26 |         let unit = Self::one(sem, false);
27 | 
28 |         // Collect the coefficients: peel off the integer part, then carry on
29 |         // with the reciprocal of what is left. At least two rounds are run.
30 |         let mut coeffs: Vec<BigInt> = Vec::new();
31 |         let mut real = self.clone();
32 |         for _ in 0..n.max(2) {
33 |             let whole = real.trunc();
34 |             coeffs.push(whole.convert_normal_to_integer(rm));
35 |             let rest = real - whole;
36 |             if rest.is_zero() {
37 |                 break;
38 |             }
39 |             real = &unit / (rest);
40 |         }
41 | 
42 |         if coeffs.len() < 2 {
43 |             // Found an exact value.
44 |             return (coeffs[0].clone(), BigInt::one());
45 |         }
46 | 
47 |         // Seed the recurrence with the first two convergents.
48 |         let unit = BigInt::one();
49 |         let mut num = &unit + &(&coeffs[0] * &coeffs[1]);
50 |         let mut num_prev = coeffs[0].clone();
51 |         let mut den = coeffs[1].clone();
52 |         let mut den_prev = unit;
53 | 
54 |         if n < 2 {
55 |             return (num_prev, den_prev);
56 |         }
57 | 
58 |         // Every further coefficient advances both recurrences by one step.
59 |         for c in &coeffs[2..] {
60 |             let num_next = &num_prev + &(c * &num);
61 |             let den_next = &den_prev + &(c * &den);
62 |             num_prev = num;
63 |             num = num_next;
64 |             den_prev = den;
65 |             den = den_next;
66 |         }
67 | 
68 |         (num, den)
69 |     }
70 | }
60 | 
61 | #[cfg(feature = "std")]
62 | #[test]
63 | fn test_frac() {
64 |     use crate::FP128;
65 |     let pi = Float::pi(FP128);
66 | 
67 |     // Convergents of pi, verified with https://oeis.org/A001203.
68 |     let expected = [(1, (3, 1)), (2, (22, 7)), (3, (333, 106)), (4, (355, 113))];
69 |     for (n, fraction) in expected {
70 |         let (p, q) = pi.as_fraction(n);
71 |         assert_eq!(fraction, (p.as_u64(), q.as_u64()));
72 |     }
73 | }
77 | 
78 | #[cfg(feature = "std")]
79 | #[test]
80 | fn fix_loop_bug() {
81 |     // Inputs whose expansion ends before the requested number of rounds.
82 |     for (input, fraction) in [(5., (5, 1)), (0.5, (1, 2))] {
83 |         let (p, q) = Float::from_f64(input).as_fraction(3);
84 |         assert_eq!(fraction, (p.as_u64(), q.as_u64()));
85 |     }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/operations/functions.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the implementation of several arithmetic operations.
  2 | 
  3 | use crate::RoundingMode;
  4 | 
  5 | use crate::float::Float;
  6 | 
  7 | impl Float {
  8 |     /// Raises this number to the integer power 'n'.
  9 |     pub fn powi(&self, mut n: u64) -> Self {
 10 |         // Work with two extra bits of precision and narrow at the end.
 11 |         let wide = self.get_semantics().increase_precision(2);
 12 |         let mut acc = Self::one(wide, false);
 13 |         let mut base = self.cast(wide);
 14 |         // Square-and-multiply: walk the bits of 'n' from the lowest one up.
 15 |         // 'base' holds self^(2^i) at step i, and it is multiplied into the
 16 |         // accumulator whenever bit i of 'n' is set.
 17 |         while n != 0 {
 18 |             if n % 2 == 1 {
 19 |                 acc *= &base;
 20 |             }
 21 |             let copy = base.clone();
 22 |             base *= &copy;
 23 |             n /= 2;
 24 |         }
 25 |         acc.cast(self.get_semantics())
 26 |     }
 27 | 
 28 |     /// Returns the square of this number.
 29 |     pub fn sqr(&self) -> Self {
 30 |         self.powi(2)
 31 |     }
 32 | 
 33 |     /// Calculates the square root of the number.
 34 |     pub fn sqrt(&self) -> Self {
 35 |         let sem = self.get_semantics();
 36 |         if self.is_zero() {
 37 |             return self.clone(); // (+/-) zero
 38 |         }
 39 |         if self.is_nan() || self.is_negative() {
 40 |             return Self::nan(sem, self.get_sign()); // (-/+)Nan, -Number.
 41 |         }
 42 |         if self.is_inf() {
 43 |             return self.clone(); // Inf+.
 44 |         }
 45 | 
 46 |         // Start the search at max(2, x).
 47 |         let two = Self::from_u64(sem, 2);
 48 |         let mut guess = if *self < two { two } else { self.clone() };
 49 |         let mut last = guess.clone();
 50 | 
 51 |         // Newton Raphson: replace the guess with the mean of the guess and
 52 |         // self / guess.
 53 |         loop {
 54 |             guess += self / &guess;
 55 |             guess = guess.scale(-1, RoundingMode::NearestTiesToEven);
 56 |             // Stop when value did not change or regressed.
 57 |             if guess == last || last < guess {
 58 |                 return guess;
 59 |             }
 60 |             last = guess.clone();
 61 |         }
 62 |     }
 63 | 
 64 |     /// Returns a copy of this float with the sign cleared.
 65 |     pub fn abs(&self) -> Self {
 66 |         let mut out = self.clone();
 67 |         out.set_sign(false);
 68 |         out
 69 |     }
 70 | 
 71 |     /// Returns the greater of self and `other`. If one of the two is NaN
 72 |     /// then the other one is returned.
 73 |     pub fn max(&self, other: &Self) -> Self {
 74 |         if self.is_nan() {
 75 |             return other.clone();
 76 |         }
 77 |         if other.is_nan() {
 78 |             return self.clone();
 79 |         }
 80 | 
 81 |         let greater = if self.get_sign() != other.get_sign() {
 82 |             // Signs differ: the non-negative one wins. Handles (+-)0.
 83 |             if self.get_sign() {
 84 |                 other
 85 |             } else {
 86 |                 self
 87 |             }
 88 |         } else if self > other {
 89 |             self
 90 |         } else {
 91 |             other
 92 |         };
 93 |         greater.clone()
 94 |     }
 95 | 
 96 |     /// Returns the smaller of self and `other`. If one of the two is NaN
 97 |     /// then the other one is returned.
 98 |     pub fn min(&self, other: &Self) -> Self {
 99 |         if self.is_nan() {
100 |             return other.clone();
101 |         }
102 |         if other.is_nan() {
103 |             return self.clone();
104 |         }
105 | 
106 |         let smaller = if self.get_sign() != other.get_sign() {
107 |             // Signs differ: the negative one wins. Handles (+-)0.
108 |             if self.get_sign() {
109 |                 self
110 |             } else {
111 |                 other
112 |             }
113 |         } else if self > other {
114 |             other
115 |         } else {
116 |             self
117 |         };
118 |         smaller.clone()
119 |     }
120 | }
110 | 
111 | #[cfg(feature = "std")]
112 | #[test]
113 | fn test_sqrt() {
114 |     use crate::utils;
115 |     use crate::FP64;
116 | 
117 |     // Perfect squares must give back the exact root.
118 |     for root in 0..256 {
119 |         let square = Float::from_u64(FP64, root * root);
120 |         assert_eq!(square.sqrt().as_f64(), (root) as f64);
121 |     }
122 | 
123 |     // Test the category and value of the different special values (inf, zero,
124 |     // correct sign, etc).
125 |     for v_f64 in utils::get_special_test_values() {
126 |         let actual = Float::from_f64(v_f64).sqrt();
127 |         let expected = v_f64.sqrt();
128 |         assert_eq!(actual.is_inf(), expected.is_infinite());
129 |         assert_eq!(actual.is_nan(), expected.is_nan());
130 |         assert_eq!(actual.is_negative(), expected.is_sign_negative());
131 |     }
132 | 
133 |     // Precomputed (input, square root) pairs.
134 |     let precomputed = [
135 |         (1.5, 1.224744871391589),
136 |         (2.3, 1.51657508881031),
137 |         (6.7, 2.588435821108957),
138 |         (7.9, 2.8106938645110393),
139 |         (11.45, 3.383784863137726),
140 |         (1049.3, 32.39290045673589),
141 |         (90210.7, 300.35096137685326),
142 |         (199120056003.73413, 446228.70369770494),
143 |         (0.6666666666666666, 0.816496580927726),
144 |         (0.4347826086956522, 0.6593804733957871),
145 |         (0.14925373134328357, 0.3863337046431279),
146 |         (0.12658227848101264, 0.35578403348241),
147 |         (0.08733624454148473, 0.29552706228277087),
148 |         (0.0009530162965786716, 0.030870962028719993),
149 |         (1.1085159520988087e-5, 0.00332943831914455),
150 |         (5.0120298432056786e-8, 0.0002238756316173263),
151 |     ];
152 |     for (inp, res) in precomputed {
153 |         assert_eq!(Float::from_f64(inp).sqrt().as_f64(), res);
154 |     }
155 | }
153 | 
154 | #[cfg(feature = "std")]
155 | #[test]
156 | fn test_min_max() {
157 |     use crate::utils;
158 | 
159 |     // Compares min and max of the two values with the native f64 results.
160 |     fn check(v0: f64, v1: f64) {
161 |         let (f0, f1) = (Float::from_f64(v0), Float::from_f64(v1));
162 |         let results = [
163 |             (v0.min(v1), f0.min(&f1).as_f64()), // Min.
164 |             (v0.max(v1), f0.max(&f1).as_f64()), // Max.
165 |         ];
166 |         for (correct, test) in results {
167 |             assert_eq!(test.is_nan(), correct.is_nan());
168 |             if !correct.is_nan() {
169 |                 assert_eq!(correct, test);
170 |             }
171 |         }
172 |     }
173 | 
174 |     // Test a bunch of special values (Inf, Epsilon, Nan, (+-)Zeros).
175 |     for lhs in utils::get_special_test_values() {
176 |         for rhs in utils::get_special_test_values() {
177 |             check(lhs, rhs);
178 |         }
179 |     }
180 | 
181 |     // Pseudo-random bit patterns reinterpreted as f64 values.
182 |     let mut lfsr = utils::Lfsr::new();
183 |     for _ in 0..100 {
184 |         let lhs = f64::from_bits(lfsr.get64());
185 |         let rhs = f64::from_bits(lfsr.get64());
186 |         check(lhs, rhs);
187 |     }
188 | }
191 | 
192 | #[cfg(feature = "std")]
193 | #[test]
194 | fn test_abs() {
195 |     use crate::utils;
196 | 
197 |     // NaN never compares equal, so it is left out of the comparison.
198 |     let values = utils::get_special_test_values();
199 |     for v in values.into_iter().filter(|v| !v.is_nan()) {
200 |         assert_eq!(Float::from_f64(v).abs().as_f64(), v.abs());
201 |     }
202 | }
203 | 
204 | // Scaling and remainder operations.
205 | 
206 | impl Float {
207 |     /// Similar to 'scalbln'. Multiplies the number by 2^scale by adding
208 |     /// 'scale' to the exponent, then normalizes the result with the rounding
209 |     /// mode 'rm'. Values that are not normal are returned unchanged.
210 |     pub fn scale(&self, scale: i64, rm: RoundingMode) -> Self {
211 |         use crate::bigint::LossFraction;
212 |         if !self.is_normal() {
213 |             return self.clone();
214 |         }
215 | 
216 |         let shifted_exp = self.get_exp() + scale;
217 |         let mut out = Self::from_parts(
218 |             self.get_semantics(),
219 |             self.get_sign(),
220 |             shifted_exp,
221 |             self.get_mantissa(),
222 |         );
223 |         out.normalize(rm, LossFraction::ExactlyZero);
224 |         out
225 |     }
226 | 
227 |     /// Returns the remainder from a division of two floats. This is equivalent
228 |     /// to rust 'rem' or c 'fmod'. The result carries the sign of 'self'.
229 |     pub fn rem(&self, rhs: &Self) -> Self {
230 |         use core::ops::Sub;
231 |         // Operands for which the remainder is not defined produce NaN.
232 |         let undefined =
233 |             self.is_nan() || rhs.is_nan() || self.is_inf() || rhs.is_zero();
234 |         if undefined {
235 |             return Self::nan(self.get_semantics(), self.get_sign());
236 |         }
237 |         // Handle values that are obviously zero or self.
238 |         if self.is_zero() || rhs.is_inf() {
239 |             return self.clone();
240 |         }
241 | 
242 |         // Work on the magnitudes; the sign is restored at the end.
243 |         let mut remainder = self.abs();
244 |         let divisor = if rhs.is_negative() {
245 |             rhs.neg()
246 |         } else {
247 |             rhs.clone()
248 |         };
249 |         debug_assert!(remainder.is_normal() && divisor.is_normal());
250 | 
251 |         // Subtracting the divisor one at a time would be slow. Instead, like
252 |         // in long division, subtract the divisor scaled by the largest power
253 |         // of two that still fits, until the remainder drops below it.
254 |         while remainder >= divisor && remainder.is_normal() {
255 |             let shift = remainder.get_exp() - divisor.get_exp();
256 | 
257 |             // Align the exponents. If that overshoots, take a step back.
258 |             let aligned = divisor.scale(shift, RoundingMode::None);
259 |             let step = if aligned > remainder {
260 |                 divisor.scale(shift - 1, RoundingMode::None)
261 |             } else {
262 |                 aligned
263 |             };
264 | 
265 |             remainder = remainder.sub(step);
266 |         }
267 | 
268 |         // Set the original sign.
269 |         remainder.set_sign(self.get_sign());
270 |         remainder
271 |     }
272 | }
268 | 
269 | #[test]
270 | fn test_scale() {
271 |     use crate::FP64;
272 |     let one = Float::from_u64(FP64, 1);
273 |     let doubled = one.scale(1, RoundingMode::None); // 1 * 2^1
274 |     assert_eq!(doubled.as_f64(), 2.0);
275 |     let halved = one.scale(-1, RoundingMode::None); // 1 * 2^-1
276 |     assert_eq!(halved.as_f64(), 0.5);
277 | }
278 | 
279 | #[cfg(feature = "std")]
280 | #[test]
281 | fn test_rem() {
282 |     use crate::utils;
283 |     use crate::utils::Lfsr;
284 | 
285 |     use core::ops::Rem;
286 | 
287 |     fn check_two_numbers(v0: f64, v1: f64) {
288 |         let expected = v0.rem(v1);
289 |         let actual = Float::from_f64(v0).rem(&Float::from_f64(v1)).as_f64();
290 |         // NaN never equals NaN, so compare NaN-ness before the values.
291 |         assert_eq!(expected.is_nan(), actual.is_nan());
292 |         if expected.is_nan() {
293 |             return;
294 |         }
295 |         assert_eq!(expected, actual);
296 |     }
297 | 
298 |     // Hand-picked pairs, including a negative divisor and a zero dividend.
299 |     check_two_numbers(1.4, 2.5);
300 |     check_two_numbers(2.4, 1.5);
301 |     check_two_numbers(1000., std::f64::consts::PI);
302 |     check_two_numbers(10000000000000000000., std::f64::consts::PI / 1000.);
303 |     check_two_numbers(10000000000000000000., std::f64::consts::PI);
304 |     check_two_numbers(100., std::f64::consts::PI);
305 |     check_two_numbers(100., -std::f64::consts::PI);
306 |     check_two_numbers(0., 10.);
307 |     check_two_numbers(std::f64::consts::PI, 10.0);
308 | 
309 |     // Pseudo-random bit patterns reinterpreted as f64 operands.
310 |     let mut rng = Lfsr::new();
311 |     for _ in 0..5000 {
312 |         let dividend = f64::from_bits(rng.get64());
313 |         let divisor = f64::from_bits(rng.get64());
314 |         check_two_numbers(dividend, divisor);
315 |     }
316 | 
317 |     // Every ordered pair of the special test values.
318 |     for lhs in utils::get_special_test_values() {
319 |         for rhs in utils::get_special_test_values() {
320 |             check_two_numbers(lhs, rhs);
321 |         }
322 |     }
323 | }
324 | 
325 | #[test]
326 | fn test_powi() {
327 |     // (exponent, expected value of 2^exponent).
328 |     let cases = [(0, 1.), (1, 2.), (3, 8.), (5, 32.), (10, 1024.)];
329 |     for (n, expected) in cases {
330 |         assert_eq!(Float::from_f64(2.).powi(n).as_f64(), expected);
331 |     }
332 |     assert_eq!(Float::from_f64(0.3).powi(3).as_f64(), 0.026999999999999996);
333 | }
334 | 
335 | impl Float {
336 |     /// Return this number raised to the power of 'n', computed as
337 |     /// e^(n * log(self)). Panics if the semantics differ. x^0 is always +1.
338 |     pub fn pow(&self, n: &Float) -> Self {
339 |         let orig_sem = self.get_semantics();
340 |         let one = Self::one(orig_sem, false);
341 |         let sign = self.get_sign();
342 | 
343 |         assert_eq!(orig_sem, n.get_semantics());
344 | 
345 |         if *self == one {
346 |             return self.clone(); // 1^n is 1, even for NaN or infinite n.
347 |         } else if n.is_inf() || n.is_nan() {
348 |             return Self::nan(orig_sem, sign);
349 |         } else if n.is_zero() {
350 |             return one; // x^0 is +1 for every x; never copy the sign of x.
351 |         } else if self.is_zero() {
352 |             return if n.is_negative() {
353 |                 Self::inf(orig_sem, sign)
354 |             } else {
355 |                 Self::zero(orig_sem, sign)
356 |             };
357 |         } else if self.is_negative() || self.is_inf() || self.is_nan() {
358 |             return Self::nan(orig_sem, sign); // Unsupported bases.
359 |         }
360 |         // Evaluate in the grown working semantics, then cast back.
361 |         let sem = orig_sem.grow_log(10).increase_exponent(10);
362 |         (n.cast(sem) * self.cast(sem).log()).exp().cast(orig_sem)
363 |     }
364 | }
365 | 
366 | #[test]
367 | fn test_pow() {
368 |     fn pow_f32(base: f32, exp: f32) -> f32 {
369 |         Float::from_f32(base).pow(&Float::from_f32(exp)).as_f32()
370 |     }
371 | 
372 |     assert_eq!(pow_f32(1.24, 1.2), 1.2945118);
373 |     assert_eq!(pow_f32(0.94, 13.), 0.44736509);
374 |     assert_eq!(pow_f32(0.11, -8.), 46650738.02097334);
375 |     assert_eq!(pow_f32(40.0, 3.1), 92552.0);
376 | 
377 |     for i in 0..30 {
378 |         for j in -10..10 {
379 |             // Small integer bases and exponents against native powf.
380 |             let (base, exp) = (i as f64, j as f64);
381 |             let expected = base.powf(exp);
382 |             let got = Float::from_f64(base).pow(&Float::from_f64(exp));
383 |             assert_eq!(expected, got.as_f64());
384 |         }
385 |     }
386 | }
387 | 


--------------------------------------------------------------------------------
/src/operations/mod.rs:
--------------------------------------------------------------------------------
 1 | //! Contains the implementations of various mathematical functions and
 2 | //! constants.
 3 | 
 4 | #[cfg(feature = "std")]
 5 | extern crate std;
 6 | 
 7 | mod constants;
 8 | mod exp;
 9 | mod frac;
10 | mod functions;
11 | mod trig;
12 | 


--------------------------------------------------------------------------------
/src/operations/trig.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the implementation of trigonometric functions.
  2 | //!
  3 | use crate::float::Float;
  4 | use crate::RoundingMode;
  5 | 
  6 | impl Float {
  7 |     /// Taylor series: sin(x) = x - x^3 / 3! + x^5 / 5! - x^7/7! ....
  8 |     fn sin_taylor(x: &Self) -> Self {
  9 |         use crate::bigint::BigInt;
 10 |         let sem = x.get_semantics();
 11 | 
 12 |         let mut neg = false; // Sign of the next term.
 13 |         let mut top = x.clone(); // Numerator: odd powers of x.
 14 |         let mut bottom = BigInt::one(); // Denominator: odd factorials.
 15 |         let mut sum = Self::zero(sem, false);
 16 |         let x2 = x.sqr();
 17 |         let mut prev = Self::one(sem, true); // Differs from the initial sum.
 18 |         for i in 1..50 {
 19 |             if prev == sum {
 20 |                 break; // Stop if we are not making progress.
 21 |             }
 22 |             prev = sum.clone();
 23 |             // Add or subtract the current term, top / bottom.
 24 |             let elem = &top / &Self::from_bigint(sem, bottom.clone());
 25 |             sum = if neg { sum - elem } else { sum + elem };
 26 | 
 27 |             // Prepare the next element: two more powers, two more factors.
 28 |             top = &top * &x2;
 29 |             let next_term = BigInt::from_u64((i * 2) * (i * 2 + 1));
 30 |             bottom *= next_term;
 31 |             neg ^= true;
 32 |         }
 33 | 
 34 |         sum
 35 |     }
 36 | 
 37 |     /// Computes sin(x) by dividing x by 3 `steps` times, running sin_taylor,
 38 |     /// and undoing each division with sin(3x) = 3sin(x)-4(sin(x)^3).
 39 |     fn sin_step4_reduction(x: &Self, steps: usize) -> Self {
 40 |         use RoundingMode::None as rm;
 41 |         if steps == 0 {
 42 |             return Self::sin_taylor(x);
 43 |         }
 44 |         let i3 = Float::from_u64(x.get_semantics(), 3);
 45 |         let x3 = Float::div_with_rm(x, &i3, rm); // x / 3
 46 |         let sx = Float::sin_step4_reduction(&x3, steps - 1);
 47 |         let sx3 = Float::mul_with_rm(&sx, &i3, rm); // 3 * sin(x / 3)
 48 |         Float::sub_with_rm(&sx3, &sx.powi(3).scale(2, rm), rm)
 49 |     }
 50 | 
 51 |     /// Computes the sine of the number (in radians); infinities yield NaN.
 52 |     pub fn sin(&self) -> Self {
 53 |         use RoundingMode::None as rm;
 54 |         // Fast Trigonometric functions for Arbitrary Precision number
 55 |         // by Henrik Vestermark.
 56 | 
 57 |         if self.is_zero() || self.is_nan() {
 58 |             return self.clone();
 59 |         }
 60 | 
 61 |         if self.is_inf() {
 62 |             return Self::nan(self.get_semantics(), self.get_sign());
 63 |         }
 64 | 
 65 |         let orig_sem = self.get_semantics();
 66 |         let sem = orig_sem.grow_log(12).increase_exponent(4);
 67 | 
 68 |         assert!(self.is_normal());
 69 | 
 70 |         let mut neg = false; // True when the final result must be negated.
 71 | 
 72 |         let mut val = self.cast_with_rm(sem, rm);
 73 | 
 74 |         // sin(-x) = -sin(x): work on |x| and remember the sign.
 75 |         if val.is_negative() {
 76 |             val = val.neg();
 77 |             neg ^= true;
 78 |         }
 79 | 
 80 |         // Range reductions, skipped when the exponent is negative.
 81 |         let is_small = self.get_exp() < 0;
 82 | 
 83 |         if !is_small {
 84 |             let pi = Self::pi(sem);
 85 |             let pi2 = pi.scale(1, rm); // 2 * pi
 86 |             let pi_half = pi.scale(-1, rm); // pi / 2
 87 | 
 88 |             // Step 1: wrap into 0..2pi.
 89 |             if val > pi2 {
 90 |                 val = val.rem(&pi2);
 91 |             }
 92 | 
 93 |             debug_assert!(val <= pi2);
 94 |             // Step 2: sin(x) = -sin(x - pi) folds pi..2pi onto 0..pi.
 95 |             if val > pi {
 96 |                 val = Float::sub_with_rm(&val, &pi, rm);
 97 |                 neg ^= true;
 98 |             }
 99 | 
100 |             debug_assert!(val <= pi);
101 |             // Step 3: sin(x) = sin(pi - x) folds pi/2..pi onto 0..pi/2.
102 |             if val > pi_half {
103 |                 val = Float::sub_with_rm(&pi, &val, rm);
104 |             }
105 |             debug_assert!(val <= pi_half);
106 |         }
107 | 
108 |         // Calculate the number of needed reduction: 8[2/3 * log(2) * log(p)];
109 |         let k = orig_sem.log_precision() * 4;
110 | 
111 |         let res = Self::sin_step4_reduction(&val, k);
112 |         let res = if neg { res.neg() } else { res };
113 |         res.cast(orig_sem)
114 |     }
115 | }
116 | 
117 | #[cfg(feature = "std")]
118 | #[test]
119 | fn test_sin_known_value() {
120 |     use crate::std::string::ToString;
121 |     // The reference digits can be reproduced with mpmath:
122 |     //   from mpmath import mp
123 |     //   mp.dps = 1000
124 |     //   mp.sin(801./10000)
125 |     let sin_str = |x: f64| Float::from_f64(x).sin().to_string();
126 | 
127 |     // The printed form has no zero before the decimal point.
128 |     assert_eq!(sin_str(801. / 10000.), ".08001437374006335");
129 |     assert_eq!(sin_str(90210. / 10000.), ".3928952872542333");
130 |     assert_eq!(sin_str(95051.), "-.8559198239971502");
131 | }
132 | 
133 | #[cfg(feature = "std")]
134 | #[test]
135 | fn test_sin() {
136 |     use crate::utils;
137 | 
138 |     for i in -100..100 {
139 |         // Whole numbers of radians.
140 |         let x = i as f64;
141 |         let computed = Float::from_f64(x).sin().as_f64();
142 |         assert_eq!(x.sin(), computed);
143 |     }
144 | 
145 |     for i in -300..300 {
146 |         // Hundredths of a radian.
147 |         let x = (i as f64) / 100.;
148 |         let computed = Float::from_f64(x).sin().as_f64();
149 |         assert_eq!(x.sin(), computed);
150 |     }
151 | 
152 |     // Only inputs that f64 classifies as non-normal are compared here.
153 |     for v in utils::get_special_test_values() {
154 |         if v.is_normal() {
155 |             continue;
156 |         }
157 |         let native = v.sin();
158 |         let computed = Float::from_f64(v).sin().as_f64();
159 |         assert_eq!(native.is_nan(), computed.is_nan());
160 |         if !native.is_nan() {
161 |             assert_eq!(native, computed);
162 |         }
163 |     }
164 | }
165 | 
166 | impl Float {
167 |     /// Taylor series: cos(x) = 1 - x^2 / 2! + x^4 / 4! - x^6/6! ....
168 |     fn cos_taylor(x: &Self) -> Self {
169 |         use crate::bigint::BigInt;
170 |         let sem = x.get_semantics();
171 | 
172 |         let mut neg = false; // Sign of the next term.
173 |         let mut top = Self::one(sem, false); // Numerator: even powers of x.
174 |         let mut bottom = BigInt::one(); // Denominator: even factorials.
175 |         let mut sum = Self::zero(sem, false);
176 |         let x2 = x.sqr();
177 |         let mut prev = Self::one(sem, true); // Differs from the initial sum.
178 |         for i in 1..50 {
179 |             if prev == sum {
180 |                 break; // Stop if we are not making progress.
181 |             }
182 |             prev = sum.clone();
183 | 
184 |             // Add or subtract the current term, top / bottom.
185 |             let elem = &top / &Self::from_bigint(sem, bottom.clone());
186 |             sum = if neg { sum - elem } else { sum + elem };
187 | 
188 |             // Prepare the next element: two more powers, two more factors.
189 |             top = &top * &x2;
190 |             let next_term = BigInt::from_u64((i * 2 - 1) * (i * 2));
191 |             bottom *= next_term;
192 | 
193 |             neg ^= true;
194 |         }
195 | 
196 |         sum
197 |     }
198 | 
199 |     /// Computes cos(x) by halving x `steps` times, running cos_taylor, and
200 |     /// undoing each halving with the identity cos(2x) = 2cos(x)^2 - 1
201 |     fn cos_step4_reduction(x: &Self, steps: usize) -> Self {
202 |         use RoundingMode::None as rm;
203 |         if steps == 0 {
204 |             return Self::cos_taylor(x);
205 |         }
206 |         let sem = x.get_semantics();
207 |         let one = Float::one(sem, false);
208 |         let half_x = x.scale(-1, rm); // x / 2
209 |         let sx = Float::cos_step4_reduction(&half_x, steps - 1);
210 |         Float::sub_with_rm(&sx.sqr().scale(1, rm), &one, rm)
211 |     }
212 | 
213 |     /// Computes the cosine of the number (in radians); infinities yield NaN.
214 |     pub fn cos(&self) -> Self {
215 |         use RoundingMode::None as rm;
216 |         // Fast Trigonometric functions for Arbitrary Precision number
217 |         // by Henrik Vestermark.
218 | 
219 |         if self.is_nan() {
220 |             return self.clone();
221 |         }
222 | 
223 |         if self.is_zero() {
224 |             return Self::one(self.get_semantics(), false);
225 |         }
226 | 
227 |         if self.is_inf() {
228 |             return Self::nan(self.get_semantics(), self.get_sign());
229 |         }
230 | 
231 |         let orig_sem = self.get_semantics();
232 |         let sem = orig_sem.grow_log(14).increase_exponent(4);
233 | 
234 |         assert!(self.is_normal());
235 | 
236 |         let mut neg = false; // True when the final result must be negated.
237 | 
238 |         let mut val = self.cast_with_rm(sem, rm);
239 | 
240 |         // cos(-x) = cos(x): drop the sign without touching `neg`.
241 |         if val.is_negative() {
242 |             val = val.neg();
243 |         }
244 | 
245 |         // Range reductions, skipped when the exponent is negative.
246 |         let is_small = self.get_exp() < 0; // X < 1.
247 | 
248 |         if !is_small {
249 |             let pi = Self::pi(sem);
250 |             let pi2 = pi.scale(1, rm); // 2 * pi
251 |             let pi_half = pi.scale(-1, rm); // pi / 2
252 | 
253 |             // Step 1: wrap into 0..2pi.
254 |             if val > pi2 {
255 |                 val = val.rem(&pi2);
256 |             }
257 |             debug_assert!(val <= pi2);
258 | 
259 |             // Step 2: cos(x) = cos(2pi - x) folds pi..2pi onto 0..pi.
260 |             if val > pi {
261 |                 val = Float::sub_with_rm(&pi2, &val, rm);
262 |             }
263 | 
264 |             debug_assert!(val <= pi);
265 |             // Step 3: cos(x) = -cos(pi - x) folds pi/2..pi onto 0..pi/2.
266 |             if val > pi_half {
267 |                 val = Float::sub_with_rm(&pi, &val, rm);
268 |                 neg ^= true;
269 |             }
270 |             debug_assert!(val <= pi_half);
271 |         }
272 | 
273 |         // Calculate the number of needed reduction: 2[log(2) * log(p)];
274 |         let k = (sem.log_precision() * 8) / 10;
275 |         // NOTE(review): sin() derives its count from orig_sem, not sem - confirm.
276 |         let res = Self::cos_step4_reduction(&val, k);
277 |         let res = if neg { res.neg() } else { res };
278 |         res.cast(orig_sem)
279 |     }
280 | }
281 | 
282 | #[cfg(feature = "std")]
283 | #[test]
284 | fn test_cos_known_value() {
285 |     use crate::std::string::ToString;
286 |     // The reference digits can be reproduced with mpmath:
287 |     //   from mpmath import mp
288 |     //   mp.dps = 100
289 |     //   mp.cos(801./10000)
290 |     let cases = [
291 |         (801. / 10000., ".9967937098492272"),
292 |         (2.3, "-.6662760212798241"),
293 |         (90210. / 10000., "-.9195832171442742"),
294 |         (95051., ".5171085523259959"),
295 |     ];
296 |     for (x, expected) in cases {
297 |         assert_eq!(Float::from_f64(x).cos().to_string(), expected);
298 |     }
299 | }
300 | 
301 | #[cfg(feature = "std")]
302 | #[test]
303 | fn test_cos() {
304 |     use crate::utils;
305 | 
306 |     for i in -100..100 {
307 |         // Whole numbers of radians.
308 |         let x = i as f64;
309 |         let computed = Float::from_f64(x).cos().as_f64();
310 |         assert_eq!(x.cos(), computed);
311 |     }
312 | 
313 |     // The native implementation of cos is not accurate to all 64 bits, so
314 |     // we just pick a few values where we happen to get lucky and native cos
315 |     // matches the arbitrary precision implementation.
316 |     for i in -100..100 {
317 |         // Hundredths of a radian.
318 |         let x = (i as f64) / 100.;
319 |         let computed = Float::from_f64(x).cos().as_f64();
320 |         assert_eq!(x.cos(), computed);
321 |     }
322 | 
323 |     // Only inputs that f64 classifies as non-normal are compared here.
324 |     for v in utils::get_special_test_values() {
325 |         if v.is_normal() {
326 |             continue;
327 |         }
328 |         let native = v.cos();
329 |         let computed = Float::from_f64(v).cos().as_f64();
330 |         assert_eq!(native.is_nan(), computed.is_nan());
331 |         if !native.is_nan() {
332 |             assert_eq!(native, computed);
333 |         }
334 |     }
335 | }
336 | 
337 | impl Float {
338 |     /// Computes the tangent of the number (in radians); infinities yield NaN.
339 |     pub fn tan(&self) -> Self {
340 |         use RoundingMode::None as rm;
341 |         // Fast Trigonometric functions for Arbitrary Precision number
342 |         // by Henrik Vestermark.
343 | 
344 |         if self.is_zero() || self.is_nan() {
345 |             return self.clone();
346 |         }
347 | 
348 |         if self.is_inf() {
349 |             return Self::nan(self.get_semantics(), self.get_sign());
350 |         }
351 | 
352 |         let orig_sem = self.get_semantics();
353 |         let sem = orig_sem.grow_log(12).increase_exponent(4);
354 | 
355 |         assert!(self.is_normal());
356 | 
357 |         let mut neg = false; // True when the final result must be negated.
358 | 
359 |         let mut val = self.cast_with_rm(sem, rm);
360 | 
361 |         // tan(-x) = -tan(x): work on |x| and remember the sign.
362 |         if val.is_negative() {
363 |             val = val.neg();
364 |             neg ^= true;
365 |         }
366 | 
367 |         // Range reductions, skipped when the exponent is negative.
368 |         let is_small = self.get_exp() < 0;
369 | 
370 |         if !is_small {
371 |             let pi = Self::pi(sem);
372 |             let half_pi = pi.scale(-1, rm); // pi / 2
373 | 
374 |             // Wrap around pi (tan repeats every pi).
375 |             if val > pi {
376 |                 val = val.rem(&pi);
377 |             }
378 |             debug_assert!(val <= pi);
379 | 
380 |             // Reduce to 0..pi/2 using tan(pi - x) = -tan(x).
381 |             if val > half_pi {
382 |                 val = pi - val;
383 |                 neg ^= true;
384 |             }
385 |             debug_assert!(val <= half_pi);
386 |         }
387 | 
388 |         // Tan(x) = sin(x)/sqrt(1-sin(x)^2), computed in the working semantics.
389 |         let sinx = val.sin();
390 |         let one = Float::one(sem, false);
391 |         let bottom = (one - sinx.sqr()).sqrt();
392 |         let res = sinx / bottom;
393 |         let res = if neg { res.neg() } else { res };
394 |         res.cast(orig_sem)
395 |     }
396 | }
397 | 
398 | #[cfg(feature = "std")]
399 | #[test]
400 | fn test_tan_known_value() {
401 |     use crate::std::string::ToString;
402 |     // The reference digits can be reproduced with mpmath:
403 |     //   from mpmath import mp
404 |     //   mp.dps = 100
405 |     //   mp.tan(801./10000)
406 |     let cases = [
407 |         (801. / 10000., ".08027174825588148"),
408 |         (2.3, "-1.1192136417341325"),
409 |         (90210. / 10000., "-.4272536513599634"),
410 |         (95051., "-1.6552033806966715"),
411 |     ];
412 |     for (x, expected) in cases {
413 |         assert_eq!(Float::from_f64(x).tan().to_string(), expected);
414 |     }
415 | }
416 | 


--------------------------------------------------------------------------------
/src/py.rs:
--------------------------------------------------------------------------------
  1 | use crate::{BigInt, Float, RoundingMode, Semantics};
  2 | use core::ops::{Add, Div, Mul, Sub};
  3 | use pyo3::prelude::*;
  4 | use std::format;
  5 | use std::string::String;
  6 | use std::string::ToString;
  7 | 
  8 | /// Semantics class defining precision and rounding behavior.
  9 | ///
 10 | /// This class encapsulates the parameters that define the precision and
 11 | /// rounding behavior of floating-point operations.
 12 | #[pyclass]
 13 | struct PySemantics {
 14 |     inner: Semantics, // The wrapped native semantics.
 15 | }
 16 | 
 17 | #[pymethods]
 18 | impl PySemantics {
 19 |     /// Create a new semantics object.
 20 |     ///
 21 |     /// Args:
 22 |     ///     exp_size: The size of the exponent in bits
 23 |     ///     mantissa_size: The size of the mantissa, including the implicit bit
 24 |     ///     rounding_mode: The rounding mode to use:
 25 |     ///         "NearestTiesToEven", "NearestTiesToAway",
 26 |     ///         "Zero", "Positive", "Negative"
 27 |     #[new]
 28 |     fn new(exp_size: i64, mantissa_size: u64, rounding_mode_str: &str) -> Self {
 29 |         // Panics with "Invalid rounding mode" on an unknown mode name.
 30 |         let rm = RoundingMode::from_string(rounding_mode_str)
 31 |             .expect("Invalid rounding mode");
 32 |         let exp_len = exp_size as usize;
 33 |         let mantissa_len = mantissa_size as usize;
 34 |         Self {
 35 |             inner: Semantics::new(exp_len, mantissa_len, rm),
 36 |         }
 37 |     }
 38 |     /// Returns the length of the exponent in bits.
 39 |     fn get_exponent_len(&self) -> usize {
 40 |         self.inner.get_exponent_len()
 41 |     }
 42 |     /// Returns the length of the mantissa in bits.
 43 |     fn get_mantissa_len(&self) -> usize {
 44 |         self.inner.get_mantissa_len()
 45 |     }
 46 |     /// Returns the rounding mode as a string.
 47 |     fn get_rounding_mode(&self) -> String {
 48 |         self.inner.get_rounding_mode().as_string().to_string()
 49 |     }
 50 |     fn __str__(&self) -> String {
 51 |         format!("{:?}", self.inner)
 52 |     }
 53 |     fn __repr__(&self) -> String {
 54 |         format!("{:?}", self.inner) // Same text as __str__.
 55 |     }
 56 |     /// Returns the maximum positive value of the number.
 57 |     fn get_max_positive_value(&self) -> PyFloat {
 58 |         // Wrap the native value for Python.
 59 |         let inner = self.inner.get_max_positive_value();
 60 |         PyFloat { inner }
 61 |     }
 62 |     /// Returns the minimum positive value of the number.
 63 |     fn get_min_positive_value(&self) -> PyFloat {
 64 |         // Wrap the native value for Python.
 65 |         let inner = self.inner.get_min_positive_value();
 66 |         PyFloat { inner }
 67 |     }
 68 |     /// Returns true if the number can be represented exactly in this format.
 69 |     /// A number can be represented exactly if the exponent is in the range, and
 70 |     /// the mantissa is not too large. In other words, the number 'val' can be
 71 |     /// converted to this format without any loss of accuracy.
 72 |     fn can_represent_exactly(&self, val: &PyFloat) -> bool {
 73 |         self.inner.can_represent_exactly(&val.inner)
 74 |     }
 75 | }
 76 | 
 77 | /// A class representing arbitrary precision floating-point numbers.
 78 | ///
 79 | /// This class implements IEEE 754-like floating-point arithmetic with
 80 | ///  configurable precision and rounding modes.
 81 | #[pyclass]
 82 | struct PyFloat {
 83 |     inner: Float, // The wrapped native value.
 84 | }
 85 | 
 86 | #[pymethods]
 87 | impl PyFloat {
 88 |     /// Create a new floating-point number.
 89 |     ///
 90 |     /// Args:
 91 |     ///     sem: The semantics (precision and rounding mode) for this number
 92 |     ///     is_negative: Whether the number is negative (sign bit)
 93 |     ///     exp: The biased exponent value (integer)
 94 |     ///     mantissa: The mantissa value (integer)
 95 |     #[new]
 96 |     fn new(
 97 |         sem: &Bound<'_, PyAny>,
 98 |         is_negative: bool,
 99 |         exp: i64,
100 |         mantissa: u64,
101 |     ) -> Self {
102 |         let sem = sem.extract::<PyRef<PySemantics>>().unwrap();
103 |         let mut bits = BigInt::from_u64(mantissa);
104 |         // Add the implicit bit.
105 |         bits.flip_bit(sem.inner.get_mantissa_len());
106 |         let exp = exp - sem.inner.get_bias(); // Biased -> unbiased.
107 |         let inner = Float::from_parts(sem.inner, is_negative, exp, bits);
108 |         Self { inner }
109 |     }
110 | 
111 |     fn __str__(&self) -> String {
112 |         self.inner.to_string()
113 |     }
114 |     fn __repr__(&self) -> String {
115 |         self.inner.to_string() // Same text as __str__.
116 |     }
117 |     /// Returns the mantissa of the float.
118 |     fn get_mantissa(&self) -> u64 {
119 |         self.inner.get_mantissa().as_u64()
120 |     }
121 |     /// Returns the exponent of the float.
122 |     fn get_exponent(&self) -> i64 {
123 |         self.inner.get_exp()
124 |     }
125 |     /// Returns the category of the float.
126 |     fn get_category(&self) -> String {
127 |         format!("{:?}", self.inner.get_category())
128 |     }
129 |     /// Returns the semantics of the float.
130 |     fn get_semantics(&self) -> PySemantics {
131 |         // Wrap the native semantics for Python.
132 |         let inner = self.inner.get_semantics();
133 |         PySemantics { inner }
134 |     }
135 |     /// Get rounding mode of the number.
136 |     fn get_rounding_mode(&self) -> String {
137 |         self.inner.get_rounding_mode().as_string().to_string()
138 |     }
139 |     /// Returns true if the Float is negative
140 |     fn is_negative(&self) -> bool {
141 |         self.inner.is_negative()
142 |     }
143 |     /// Returns true if the Float is +-inf.
144 |     fn is_inf(&self) -> bool {
145 |         self.inner.is_inf()
146 |     }
147 |     /// Returns true if the Float is a +- NaN.
148 |     fn is_nan(&self) -> bool {
149 |         self.inner.is_nan()
150 |     }
151 |     /// Returns true if the Float is a +- zero.
152 |     fn is_zero(&self) -> bool {
153 |         self.inner.is_zero()
154 |     }
155 | 
156 |     /// Returns true if this number is normal (not Zero, Nan, Inf).
157 |     fn is_normal(&self) -> bool {
158 |         self.inner.is_normal()
159 |     }
160 | 
161 |     fn __add__(&self, other: &PyFloat) -> PyFloat {
162 |         Self::add(self, other)
163 |     }
164 | 
165 |     fn __sub__(&self, other: &PyFloat) -> PyFloat {
166 |         Self::sub(self, other)
167 |     }
168 | 
169 |     fn __mul__(&self, other: &PyFloat) -> PyFloat {
170 |         Self::mul(self, other)
171 |     }
172 |     fn __truediv__(&self, other: &PyFloat) -> PyFloat {
173 |         Self::div(self, other)
174 |     }
175 |     fn add(&self, other: &PyFloat) -> PyFloat {
176 |         let inner = self.inner.clone() + other.inner.clone();
177 |         Self { inner }
178 |     }
179 |     fn mul(&self, other: &PyFloat) -> PyFloat {
180 |         let inner = self.inner.clone() * other.inner.clone();
181 |         Self { inner }
182 |     }
183 |     fn sub(&self, other: &PyFloat) -> PyFloat {
184 |         let inner = self.inner.clone() - other.inner.clone();
185 |         Self { inner }
186 |     }
187 |     fn div(&self, other: &PyFloat) -> PyFloat {
188 |         let inner = self.inner.clone() / other.inner.clone();
189 |         Self { inner }
190 |     }
191 |     /// Returns the number raised to the power of `exp` which is an integer.
192 |     fn powi(&self, exp: u64) -> PyFloat {
193 |         // Delegates to Float::powi.
194 |         let inner = self.inner.powi(exp);
195 |         Self { inner }
196 |     }
197 |     /// Returns the number raised to the power of `exp` which is a float.
198 |     fn pow(&self, exp: &PyFloat) -> PyFloat {
199 |         // Delegates to Float::pow.
200 |         let inner = self.inner.pow(&exp.inner);
201 |         Self { inner }
202 |     }
203 |     /// Returns the exponential of the number.
204 |     fn exp(&self) -> PyFloat {
205 |         // Delegates to Float::exp.
206 |         let inner = self.inner.exp();
207 |         Self { inner }
208 |     }
209 |     /// Returns the natural logarithm of the number.
210 |     fn log(&self) -> PyFloat {
211 |         // Delegates to Float::log.
212 |         let inner = self.inner.log();
213 |         Self { inner }
214 |     }
215 |     /// Returns the sigmoid of the number.
216 |     fn sigmoid(&self) -> PyFloat {
217 |         // Delegates to Float::sigmoid.
218 |         let inner = self.inner.sigmoid();
219 |         Self { inner }
220 |     }
221 |     /// Returns the absolute value of the number.
222 |     fn abs(&self) -> PyFloat {
223 |         // Delegates to Float::abs.
224 |         let inner = self.inner.abs();
225 |         Self { inner }
226 |     }
227 |     /// Returns the maximum of two numbers (as defined by IEEE 754).
228 |     fn max(&self, other: &PyFloat) -> PyFloat {
229 |         // Delegates to Float::max.
230 |         let inner = self.inner.max(&other.inner);
231 |         Self { inner }
232 |     }
233 |     /// Returns the minimum of two numbers (as defined by IEEE 754).
234 |     fn min(&self, other: &PyFloat) -> PyFloat {
235 |         // Delegates to Float::min.
236 |         let inner = self.inner.min(&other.inner);
237 |         Self { inner }
238 |     }
239 |     /// Returns the remainder of the division of two numbers.
240 |     fn rem(&self, other: &PyFloat) -> PyFloat {
241 |         // Delegates to Float::rem.
242 |         let inner = self.inner.rem(&other.inner);
243 |         Self { inner }
244 |     }
245 |     /// Cast the number to another semantics.
246 |     fn cast(&self, sem: &Bound<'_, PyAny>) -> PyFloat {
247 |         // Panics when `sem` is not a PySemantics instance.
248 |         let target = sem.extract::<PyRef<PySemantics>>().unwrap();
249 |         let inner = self.inner.cast(target.inner);
250 |         Self { inner }
251 |     }
252 |     /// Cast the number to another semantics with a specific rounding mode.
253 |     fn cast_with_rm(&self, sem: &Bound<'_, PyAny>, rm: &str) -> PyFloat {
254 |         let target = sem.extract::<PyRef<PySemantics>>().unwrap();
255 |         // Panics with "Invalid rounding mode" on an unknown mode name.
256 |         let rounding = RoundingMode::from_string(rm)
257 |             .expect("Invalid rounding mode");
258 |         let inner = self.inner.cast_with_rm(target.inner, rounding);
259 |         Self { inner }
260 |     }
261 | 
262 |     /// Returns the number with the sign flipped.
263 |     fn neg(&self) -> PyFloat {
264 |         // Delegates to Float::neg.
265 |         let inner = self.inner.neg();
266 |         Self { inner }
267 |     }
268 |     /// Returns the number with the sign flipped.
269 |     fn __neg__(&self) -> PyFloat {
270 |         Self::neg(self)
271 |     }
272 |     /// Returns true if the number is less than the other number.
273 |     fn __lt__(&self, other: &PyFloat) -> bool {
274 |         self.inner < other.inner
275 |     }
276 |     /// Returns true if the number is less than or equal to the other number.
277 |     fn __le__(&self, other: &PyFloat) -> bool {
278 |         self.inner <= other.inner
279 |     }
280 |     /// Returns true if the number is equal to the other number.
281 |     fn __eq__(&self, other: &PyFloat) -> bool {
282 |         self.inner == other.inner
283 |     }
284 |     /// Returns true if the number is not equal to the other number.
285 |     fn __ne__(&self, other: &PyFloat) -> bool {
286 |         self.inner != other.inner
287 |     }
288 |     /// Returns true if the number is greater than the other number.
289 |     fn __gt__(&self, other: &PyFloat) -> bool {
290 |         self.inner > other.inner
291 |     }
292 |     /// Returns true if the number is greater than or equal to the other number.
293 |     fn __ge__(&self, other: &PyFloat) -> bool {
294 |         self.inner >= other.inner
295 |     }
296 |     /// Returns the sine of the number.
297 |     fn sin(&self) -> PyFloat {
298 |         // Delegates to Float::sin.
299 |         let inner = self.inner.sin();
300 |         Self { inner }
301 |     }
302 |     /// Returns the cosine of the number.
303 |     fn cos(&self) -> PyFloat {
304 |         // Delegates to Float::cos.
305 |         let inner = self.inner.cos();
306 |         Self { inner }
307 |     }
308 |     /// Returns the tangent of the number.
309 |     fn tan(&self) -> PyFloat {
310 |         // Delegates to Float::tan.
311 |         let inner = self.inner.tan();
312 |         Self { inner }
313 |     }
314 |     /// convert to f64.
315 |     fn to_float64(&self) -> f64 {
316 |         self.inner.as_f64()
317 |     }
318 |     /// Convert the number to a Continued Fraction of two integers.
319 |     /// Take 'n' iterations.
320 |     fn as_fraction(&self, n: usize) -> (u64, u64) {
321 |         let (first, second) = self.inner.as_fraction(n);
322 |         (first.as_u64(), second.as_u64())
323 |     }
324 |     /// Prints the number using the internal representation.
325 |     fn dump(&self) {
326 |         self.inner.dump();
327 |     }
328 | } // impl PyFloat
329 | 
330 | /// Returns the mathematical constant pi with the given semantics.
331 | ///
332 | /// Args:
333 | ///     sem: The semantics to use for representing pi
334 | #[pyfunction]
335 | fn pi(sem: &Bound<'_, PyAny>) -> PyResult<PyFloat> {
336 |     // A failed extraction is propagated to the caller as an error.
337 |     let sem = sem.extract::<PyRef<PySemantics>>()?;
338 |     let inner = Float::pi(sem.inner);
339 |     Ok(PyFloat { inner })
340 | }
341 | 
342 | /// Returns the fused multiply-add operation of three numbers.
343 | ///
344 | /// Args: (a * b) + c
345 | #[pyfunction]
346 | fn fma(a: &PyFloat, b: &PyFloat, c: &PyFloat) -> PyResult<PyFloat> {
347 |     Ok(PyFloat {
348 |         inner: Float::fma(&a.inner, &b.inner, &c.inner),
349 |     })
350 | }
351 | 
352 | /// Returns the mathematical constant e (Euler's number) with the given semantics.
353 | ///
354 | /// Args:
355 | ///     sem: The semantics to use for representing e
356 | #[pyfunction]
357 | fn e(sem: &Bound<'_, PyAny>) -> PyResult<PyFloat> {
358 |     let sem: PyRef<PySemantics> = sem.extract()?;
359 |     Ok(PyFloat {
360 |         inner: Float::e(sem.inner),
361 |     })
362 | }
363 | 
364 | /// Returns the natural logarithm of 2 (ln(2)) with the given semantics.
365 | ///
366 | /// Args:
367 | ///     sem: The semantics to use for representing ln(2)
368 | #[pyfunction]
369 | fn ln2(sem: &Bound<'_, PyAny>) -> PyResult<PyFloat> {
370 |     let sem: PyRef<PySemantics> = sem.extract()?;
371 |     Ok(PyFloat {
372 |         inner: Float::ln2(sem.inner),
373 |     })
374 | }
375 | 
376 | /// Returns the number zero with the given semantics.
377 | ///
378 | /// Args:
379 | ///     sem: The semantics to use for representing e
380 | #[pyfunction]
381 | fn zero(sem: &Bound<'_, PyAny>) -> PyResult<PyFloat> {
382 |     let sem: PyRef<PySemantics> = sem.extract()?;
383 |     Ok(PyFloat {
384 |         inner: Float::zero(sem.inner, false),
385 |     })
386 | }
387 | 
388 | /// Returns a new float with the integer value 'val' with the given semantics.
389 | ///
390 | /// Args:
391 | ///     sem: The semantics to use
392 | ///     val: The integer value
393 | #[pyfunction]
394 | fn from_i64(sem: &Bound<'_, PyAny>, val: i64) -> PyResult<PyFloat> {
395 |     let sem: PyRef<PySemantics> = sem.extract()?;
396 |     Ok(PyFloat {
397 |         inner: Float::from_i64(sem.inner, val),
398 |     })
399 | }
400 | 
401 | /// Returns a new float with the fp64 value 'val'.
402 | ///
403 | /// Args:
404 | ///     val: The f64 value
405 | #[pyfunction]
406 | fn from_fp64(val: f64) -> PyResult<PyFloat> {
407 |     Ok(PyFloat {
408 |         inner: Float::from_f64(val),
409 |     })
410 | }
411 | 
/// Module initializer for `_arpfloat`.
///
/// Registers the `PyFloat` and `PySemantics` classes and then the
/// module-level functions `pi`, `e`, `ln2`, `zero`, `fma`, `from_i64` and
/// `from_fp64`. Any registration error is propagated to the caller.
#[pymodule]
fn _arpfloat(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    // Expose the wrapper classes.
    m.add_class::<PyFloat>()?;
    m.add_class::<PySemantics>()?;

    // Add the functions to the module
    m.add_function(wrap_pyfunction!(pi, m)?)?;
    m.add_function(wrap_pyfunction!(e, m)?)?;
    m.add_function(wrap_pyfunction!(ln2, m)?)?;
    m.add_function(wrap_pyfunction!(zero, m)?)?;
    m.add_function(wrap_pyfunction!(fma, m)?)?;
    m.add_function(wrap_pyfunction!(from_i64, m)?)?;
    m.add_function(wrap_pyfunction!(from_fp64, m)?)?;
    Ok(())
}


--------------------------------------------------------------------------------
/src/string.rs:
--------------------------------------------------------------------------------
  1 | //! This module contains the implementation of string conversion.
  2 | 
  3 | extern crate alloc;
  4 | 
  5 | use super::bigint::BigInt;
  6 | use super::float::Float;
  7 | use super::RoundingMode;
  8 | use super::Semantics;
  9 | use alloc::string::{String, ToString};
 10 | use alloc::vec::Vec;
 11 | use core::cmp::Ordering;
 12 | use core::fmt::Display;
 13 | 
 14 | impl Float {
 15 |     /// Convert the number into a large integer, and a base-10 exponent.
 16 |     fn convert_to_integer(&self) -> (BigInt, i64) {
 17 |         // The natural representation of numbers is 1.mmmmmmm, where the
 18 |         // mantissa is aligned to the MSB. In this method we convert the numbers
 19 |         // into integers, that start at bit zero, so we use exponent that refers
 20 |         //  to bit zero.
 21 |         // See Ryu: Fast Float-to-String Conversion -- Ulf Adams.
 22 |         // https://youtu.be/kw-U6smcLzk?t=681
 23 |         let mut exp = self.get_exp() - self.get_mantissa_len() as i64;
 24 |         let mut mantissa: BigInt = self.get_mantissa();
 25 | 
 26 |         match exp.cmp(&0) {
 27 |             Ordering::Less => {
 28 |                 // The number is not yet an integer, we need to convert it using
 29 |                 // the method:
 30 |                 // mmmmm * 5^(e) * 10 ^(-e) == mmmmm * 10 ^ (-e);
 31 |                 // where (5^e) * (10^-e) == (2^-e)
 32 |                 // And the left hand side is how we represent our binary number
 33 |                 // 1.mmmm * 2^-e, and the right-hand-side is how we represent
 34 |                 // our decimal number: nnnnnnn * 10^-e.
 35 |                 let five = BigInt::from_u64(5);
 36 |                 let e5 = five.powi((-exp) as u64);
 37 |                 mantissa.inplace_mul(&e5);
 38 |                 exp = -exp;
 39 |             }
 40 |             Ordering::Equal | Ordering::Greater => {
 41 |                 // The number is already an integer, just align it.
 42 |                 // In this case, E - M > 0, so we are aligning the larger
 43 |                 // integers, for example [1.mmmm * e^15], in FP16 (where M=10).
 44 |                 mantissa.shift_left(exp as usize);
 45 |                 exp = 0;
 46 |             }
 47 |         }
 48 | 
 49 |         (mantissa, exp)
 50 |     }
 51 | 
 52 |     /// Returns the highest number of decimal digits that are needed for
 53 |     /// representing this type accurately.
 54 |     pub fn get_decimal_accuracy(&self) -> usize {
 55 |         // Matula, David W. “A Formalization of Floating-Point Numeric Base
 56 |         // N = 2 + floor(n / log_b(B)) = 2 + floor(n / log(10, 2))
 57 |         // We convert from bits to base-10 digits: log(2)/log(10) ==> 59/196.
 58 |         // A continuous fraction of 5 iteration gives the ratio.
 59 |         2 + (self.get_mantissa_len() * 59) / 196
 60 |     }
 61 | 
 62 |     /// Reduce a number in the representation mmmmm * e^10, to fewer bits in
 63 |     /// 'm', based on the max possible digits in the mantissa.
 64 |     fn reduce_printed_integer_length(
 65 |         &self,
 66 |         integer: &mut BigInt,
 67 |         exp: &mut i64,
 68 |     ) {
 69 |         let bits = integer.msb_index();
 70 |         if bits <= self.get_mantissa_len() {
 71 |             return;
 72 |         };
 73 |         let needed_bits = bits - self.get_mantissa_len();
 74 |         // We convert from bits to base-10 digits: log(2)/log(10) ==> 59/196.
 75 |         // A continuous fraction of 5 iteration gives the ratio.
 76 |         let mut digits_to_remove = ((needed_bits * 59) / 196) as i64;
 77 | 
 78 |         // Only remove digits after the decimal points.
 79 |         if digits_to_remove > *exp {
 80 |             digits_to_remove = *exp;
 81 |         }
 82 |         *exp -= digits_to_remove;
 83 |         let ten = BigInt::from_u64(10);
 84 |         let divisor = ten.powi(digits_to_remove as u64);
 85 |         integer.inplace_div(&divisor);
 86 |     }
 87 | 
 88 |     fn convert_normal_to_string(&self) -> String {
 89 |         // Convert the integer to base-10 integer, and e, the exponent in
 90 |         // base 10 (scientific notation).
 91 |         let (mut integer, mut e) = self.convert_to_integer();
 92 | 
 93 |         // Try to shorten the number.
 94 |         self.reduce_printed_integer_length(&mut integer, &mut e);
 95 | 
 96 |         // Extract the digits: Div10-Mod10-Div10-Mod10 ....
 97 |         let mut buff = Vec::new();
 98 |         let digits = integer.to_digits::<10>();
 99 |         for d in digits {
100 |             buff.push(char::from_digit(d as u32, 10).unwrap())
101 |         }
102 | 
103 |         debug_assert!(e >= 0);
104 |         // Add the trailing zeros, and make room to place the point.
105 |         while buff.len() < e as usize {
106 |             buff.insert(0, '0');
107 |         }
108 | 
109 |         buff.insert(buff.len() - e as usize, '.');
110 |         while !buff.is_empty() && buff[buff.len() - 1] == '0' {
111 |             buff.pop();
112 |         }
113 |         String::from_iter(buff)
114 |     }
115 | 
116 |     /// Convert the number to a string. This is a simple implementation
117 |     /// that does not take into account rounding during the round-trip of
118 |     /// parsing-printing of the value, or scientific notation, and the minimal
119 |     /// representation of numbers. For all of that that check out the paper:
120 |     /// "How to Print Floating-Point Numbers Accurately" by Steele and White.
121 |     fn convert_to_string(&self) -> String {
122 |         // In order to print decimal digits we need a minimum number of mantissa
123 |         // bits for the conversion. Small floats (such as BF16) don't have
124 |         // enough bits, so we cast to a larger number.
125 |         if self.get_semantics().get_mantissa_len() < 16 {
126 |             use crate::FP32;
127 |             return self.cast(FP32).to_string();
128 |         }
129 | 
130 |         let result = if self.get_sign() { "-" } else { "" };
131 |         let mut result: String = result.to_string();
132 | 
133 |         let body: String = match self.get_category() {
134 |             super::float::Category::Infinity => "Inf".to_string(),
135 |             super::float::Category::NaN => "NaN".to_string(),
136 |             super::float::Category::Normal => self.convert_normal_to_string(),
137 |             super::float::Category::Zero => "0.0".to_string(),
138 |         };
139 | 
140 |         result.push_str(&body);
141 |         result
142 |     }
143 | }
144 | impl Display for Float {
145 |     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
146 |         write!(f, "{}", self.convert_to_string())
147 |     }
148 | }
149 | 
150 | impl Display for BigInt {
151 |     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
152 |         write!(f, "{}", self.as_binary())
153 |     }
154 | }
155 | 
impl RoundingMode {
    /// Returns the name of the rounding mode as a static-text label. Each
    /// label is spelled exactly like its enum variant.
    pub fn as_string(&self) -> &str {
        match self {
            RoundingMode::None => "None",
            RoundingMode::NearestTiesToEven => "NearestTiesToEven",
            RoundingMode::NearestTiesToAway => "NearestTiesToAway",
            RoundingMode::Zero => "Zero",
            RoundingMode::Positive => "Positive",
            RoundingMode::Negative => "Negative",
        }
    }
}
168 | 
169 | impl Display for Semantics {
170 |     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
171 |         write!(
172 |             f,
173 |             "(exponent:{} precision:{} rm:{})",
174 |             self.get_exponent_len(),
175 |             self.get_precision(),
176 |             self.get_rounding_mode().as_string()
177 |         )
178 |     }
179 | }
180 | 
181 | #[cfg(feature = "std")]
182 | mod from {
183 |     use core::fmt::{Debug, Display};
184 |     use std::error::Error;
185 | 
186 |     use crate::{BigInt, Float, Semantics, FP64};
187 | 
188 |     impl Float {
189 |         /// Try to construct a Float instance with semantics 'sem' from the
190 |         /// string 'value'. Note that the operation of conversion might lose
191 |         /// precision. If you care about precision you might want to use a
192 |         /// higher precision float and downcast.
193 |         pub fn try_from_str(
194 |             value: &str,
195 |             sem: Semantics,
196 |         ) -> Result<Self, ParseError> {
197 |             // Handle the empty case.
198 |             if value.is_empty() {
199 |                 return Err(ParseError(ParseErrorKind::InputEmpty));
200 |             }
201 | 
202 |             // Handle the plus or minus in front of the number.
203 |             let chars = value.as_bytes();
204 |             let (sign, skip) = if chars[0] == b'-' || chars[0] == b'+' {
205 |                 (chars[0] == b'-', 1)
206 |             } else {
207 |                 (false, 0)
208 |             };
209 |             let value = &value[skip..];
210 | 
211 |             // Handle Nan.
212 |             if value.eq_ignore_ascii_case("nan") {
213 |                 return Ok(Self::nan(sem, sign));
214 |             }
215 | 
216 |             // Handle Inf.
217 |             if value.eq_ignore_ascii_case("inf") {
218 |                 return Ok(Self::inf(sem, sign));
219 |             }
220 | 
221 |             // Start handling the non trivial cases.
222 | 
223 |             let l_r = value.split_once('.');
224 |             // Handle cases where we have no `.` and as such no mantissa.
225 |             if l_r.is_none() {
226 |                 // No period. Just parse the integer.
227 |                 let ((num, _), exp_num) = parse_with_exp(value)?;
228 |                 let mut num = Float::from_bigint(sem, num);
229 | 
230 |                 // Shift the number according to the exponent (in decimal).
231 |                 if let Some(exp) = exp_num {
232 |                     if exp >= 0 {
233 |                         num *= Float::from_bigint(
234 |                             sem,
235 |                             BigInt::from_u64(10).powi(exp as u64),
236 |                         );
237 |                     } else {
238 |                         num /= Float::from_bigint(
239 |                             sem,
240 |                             BigInt::from_u64(10).powi((-exp) as u64),
241 |                         );
242 |                     }
243 |                 }
244 |                 num.set_sign(sign);
245 |                 return Ok(num);
246 |             }
247 | 
248 |             // Handle cases where we have a period in the number:
249 |             let (left, right) = l_r.unwrap();
250 | 
251 |             // Try parsing decimal value of 0.
252 |             if right.chars().all(|chr| chr == '0') {
253 |                 return parse_whole_num(left, sign, sem).map(Ok).unwrap_or(
254 |                     Err(ParseError(ParseErrorKind::ParsingNumberFailed)),
255 |                 );
256 |             }
257 | 
258 |             // Parse the integer part.
259 |             let left_num = parse_big_int(left).map(Ok).unwrap_or(Err(
260 |                 ParseError(ParseErrorKind::ParsingNumberFailed),
261 |             ))?;
262 | 
263 |             // Parse the mantissa and an optional exponent part
264 |             let ((right_num, right_num_digits), explicit_exp) =
265 |                 parse_with_exp(right)?;
266 | 
267 |             // Construct the integral and fractional parts, without the exp.
268 |             // This is one of the places where we might lose precision.
269 |             let dec_shift = BigInt::from_u64(10).powi(right_num_digits as u64);
270 | 
271 |             let integral = Float::from_bigint(sem, left_num);
272 |             let fraction = Float::from_bigint(sem, right_num)
273 |                 / Float::from_bigint(sem, dec_shift);
274 | 
275 |             // Construct the whole number, move the fractional part into place.
276 |             let mut ret = integral + fraction;
277 | 
278 |             // Handle the explicit exponent. (Example: e+1).
279 |             if let Some(exp_num) = explicit_exp {
280 |                 if exp_num >= 0 {
281 |                     let e = BigInt::from_u64(10).powi(exp_num as u64);
282 |                     ret *= Float::from_bigint(sem, e)
283 |                 } else {
284 |                     let e = BigInt::from_u64(10).powi((-exp_num) as u64);
285 |                     ret /= Float::from_bigint(sem, e)
286 |                 }
287 |             }
288 |             ret.set_sign(sign);
289 |             Ok(ret)
290 |         }
291 |     }
292 | 
293 |     impl TryFrom<&str> for Float {
294 |         type Error = ParseError;
295 | 
296 |         fn try_from(value: &str) -> Result<Self, Self::Error> {
297 |             const DEFAULT_SEM: Semantics = FP64;
298 |             // TODO: autodetect required semantics
299 |             Self::try_from_str(value, DEFAULT_SEM)
300 |         }
301 |     }
302 | 
303 |     /// Parse a number that contains the 'e' marker for exponent.
304 |     /// Example: 565e+1
305 |     /// Returns the number, the number of decimal digits, and an optional
306 |     /// exponent value.
307 |     fn parse_with_exp(
308 |         value: &str,
309 |     ) -> Result<((BigInt, usize), Option<i64>), ParseError> {
310 |         let idx = value.find(['e', 'E']);
311 |         // Split the number to the digits and the exponent.
312 |         let (num_raw, exp) = if let Some(idx) = idx {
313 |             let (l, r) = value.split_at(idx);
314 |             (l, Some(&r[1..]))
315 |         } else {
316 |             (value, None)
317 |         };
318 | 
319 |         // Parse the left size of the expression (the number).
320 |         let num = parse_big_int(num_raw)
321 |             .map(|num| Ok((num, num_raw.len())))
322 |             .unwrap_or(Err(ParseError(ParseErrorKind::ParsingNumberFailed)))?;
323 | 
324 |         // Parse the right side (the exponent expression).
325 |         if let Some(exp) = exp {
326 |             match exp.parse::<i64>() {
327 |                 Ok(exp) => {
328 |                     // Found a valid expression that has exponent.
329 |                     return Ok((num, Some(exp)));
330 |                 }
331 |                 Err(_) => {
332 |                     return Err(ParseError(ParseErrorKind::ExponentParseFailed))
333 |                 }
334 |             }
335 |         }
336 |         // Found a valid number without exponent marker.
337 |         Ok((num, None))
338 |     }
339 | 
340 |     /// Try to parse a number from the string 'value'.
341 |     fn parse_big_int(value: &str) -> Option<BigInt> {
342 |         let chars = value.as_bytes();
343 |         let ten = BigInt::from_u64(10);
344 |         let mut num = BigInt::from_u64(0);
345 |         for digit in chars.iter() {
346 |             if *digit > b'9' || *digit < b'0' {
347 |                 return None;
348 |             }
349 |             let part = [*digit as u64 - '0' as u64];
350 |             num.inplace_mul(&ten);
351 |             num.inplace_add_slice(&part);
352 |         }
353 |         Some(num)
354 |     }
355 | 
356 |     /// Parse one long integer and apply a sign.
357 |     fn parse_whole_num(
358 |         value: &str,
359 |         sign: bool,
360 |         sem: Semantics,
361 |     ) -> Option<Float> {
362 |         let chars = value.as_bytes();
363 |         // Handle the special case of '0'.
364 |         if value.len() == 1 && chars[0] == b'0' {
365 |             return Some(Float::zero(sem, sign));
366 |         }
367 | 
368 |         // Parse the digits.
369 |         let num = parse_big_int(value)?;
370 |         // And construct the Float number.
371 |         let mut ret = Float::from_bigint(sem, num);
372 |         ret.set_sign(sign);
373 | 
374 |         Some(ret)
375 |     }
376 | 
    /// The reasons why parsing a float from a string can fail.
    enum ParseErrorKind {
        /// The input string was empty.
        InputEmpty,
        /// The digits of the number could not be parsed.
        ParsingNumberFailed,
        /// The text after the exponent marker is not a valid `i64`.
        ExponentParseFailed,
    }
382 | 
    /// Error returned when a string cannot be parsed into a `Float`. The
    /// wrapped kind is private; the message is available through `Display`.
    pub struct ParseError(ParseErrorKind);

    impl Error for ParseError {}
386 | 
387 |     impl Display for ParseError {
388 |         fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
389 |             match self.0 {
390 |                 ParseErrorKind::ParsingNumberFailed => f.write_str(
391 |                     "Failed parsing number part of floating point number",
392 |                 ),
393 |                 ParseErrorKind::ExponentParseFailed => {
394 |                     f.write_str("Failed parsing exponent of float number")
395 |                 }
396 |                 ParseErrorKind::InputEmpty => {
397 |                     f.write_str("The input provided was empty")
398 |                 }
399 |             }
400 |         }
401 |     }
402 | 
403 |     impl Debug for ParseError {
404 |         fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
405 |             Display::fmt(&self, f)
406 |         }
407 |     }
408 | }
409 | 
#[cfg(feature = "std")]
#[test]
fn test_convert_to_string() {
    use crate::{BF16, FP16, FP64};
    use core::f64::consts::PI;

    // Build the value from an f64, cast it to `sem`, and print it.
    fn render(val: f64, sem: Semantics) -> String {
        Float::from_f64(val).cast(sem).to_string()
    }

    // (expected text, printed value)
    let cases = [
        ("-0.0", render(-0., FP16)),
        (".30004882", render(0.3, FP16)),
        ("4.5", render(4.5, FP16)),
        ("256.", render(256., FP16)),
        ("Inf", render(65534., FP16)),
        ("-Inf", render(-65534., FP16)),
        (".09997558", render(0.1, FP16)),
        (".1", render(0.1, FP64)),
        (".29999999999999998", render(0.3, FP64)),
        ("2251799813685248.", render((1u64 << 51) as f64, FP64)),
        ("1995.1994999999999", render(1995.1995, FP64)),
        ("3.140625", render(PI, BF16)),
    ];
    for (expected, actual) in cases.iter() {
        assert_eq!(*expected, actual.as_str());
    }
}
444 | 
#[cfg(feature = "std")]
#[test]
fn test_from_string() {
    // (expected printed form, input text)
    let accepted = [
        ("-3.", "-3.0"),
        ("-3.", "-3.00"),
        ("30.", "30"),
        ("430.56", "430.56"),
        ("5.2", "5.2"),
        ("Inf", "inf"),
        ("NaN", "nan"),
        ("32.", "3.2e1"),
        ("4.4", "44.e-1"),
        ("5.4", "54e-1"),
        ("-5.485", "-54.85e-1"),
    ];
    for (expected, input) in accepted.iter() {
        assert_eq!(*expected, Float::try_from(*input).unwrap().to_string());
    }

    // Malformed inputs must be rejected.
    let rejected = ["abc.de", "e.-21", "-rlp.", ""];
    for input in rejected.iter() {
        assert!(Float::try_from(*input).is_err());
    }
}
464 | 
#[test]
fn test_fuzz_printing() {
    use crate::utils;

    // Print 500 pseudo-random bit patterns; the test only checks that
    // printing does not panic.
    let mut generator = utils::Lfsr::new();

    for _ in 0..500 {
        let bits = generator.get64();
        let value = Float::from_f64(f64::from_bits(bits));
        let _ = value.to_string();
    }
}
478 | 
#[cfg(feature = "std")]
#[test]
fn test_print_sqrt() {
    use crate::FP64;
    use std::println;

    // Use Newton-Raphson to find the square root of 5:
    // x <- (x + n / x) / 2, starting from x = n.
    let n = Float::from_u64(FP64, 5);
    let mut estimate = n.clone();

    for _ in 0..100 {
        let quotient = &n / &estimate;
        estimate = (&estimate + quotient) / 2;
    }
    println!("{}", estimate);
}
495 | 
#[test]
#[cfg(feature = "std")]
fn test_readme_example() {
    use crate::{FP128, FP16};
    use std::println;

    // Use Newton-Raphson to find the square root of 5, in FP128.
    let n = Float::from_u64(FP128, 5);
    let mut root = n.clone();

    for _ in 0..1000 {
        root = (&root + &n / &root) / 2;
    }
    println!("fp128: {}", root);
    println!("fp64:  {}", root.as_f64());

    // Show the internal representation of a small FP16 value.
    let small = Float::from_i64(FP16, 15);
    small.dump();
}
516 | 
#[test]
fn test_decimal_accuracy_for_type() {
    use crate::{FP128, FP16, FP256, FP32, FP64};

    // (semantics, expected number of decimal digits)
    let expectations =
        [(FP16, 5), (FP32, 8), (FP64, 17), (FP128, 35), (FP256, 73)];
    for &(sem, digits) in expectations.iter() {
        assert_eq!(Float::zero(sem, false).get_decimal_accuracy(), digits);
    }
}
526 | 
527 | impl BigInt {
528 |     /// Prints the bigint as a decimal number.
529 |     pub fn as_decimal(&self) -> String {
530 |         if self.is_zero() {
531 |             return "0".to_string();
532 |         }
533 | 
534 |         let mut buff = Vec::new();
535 |         let digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
536 |         let ten = Self::from_u64(10);
537 |         let mut val = self.clone();
538 |         while !val.is_zero() {
539 |             let rem = val.inplace_div(&ten);
540 |             buff.insert(0, digits[rem.as_u64() as usize]);
541 |         }
542 | 
543 |         String::from_iter(buff)
544 |     }
545 |     /// Prints the bigint as a sequence of bits.
546 |     pub fn as_binary(&self) -> String {
547 |         let mut sb = String::new();
548 | 
549 |         if self.is_empty() || self.is_zero() {
550 |             return String::from("0");
551 |         }
552 |         let mut top_non_zero = 0;
553 |         for i in (0..self.len()).rev() {
554 |             if self.get_part(i) != 0 {
555 |                 top_non_zero = i;
556 |                 break;
557 |             }
558 |         }
559 | 
560 |         for i in 0..=top_non_zero {
561 |             let mut part = self.get_part(i);
562 |             // Don't print leading zeros for the first word.
563 |             if i == top_non_zero {
564 |                 while part > 0 {
565 |                     let last = if part & 0x1 == 1 { '1' } else { '0' };
566 |                     sb.insert(0, last);
567 |                     part /= 2;
568 |                 }
569 |                 continue;
570 |             }
571 | 
572 |             // Print leading zeros for the rest of the words.
573 |             for _ in 0..64 {
574 |                 let last = if part & 0x1 == 1 { '1' } else { '0' };
575 |                 sb.insert(0, last);
576 |                 part /= 2;
577 |             }
578 |         }
579 |         if sb.is_empty() {
580 |             sb.push('0');
581 |         }
582 |         sb
583 |     }
584 | }
585 | 
#[cfg(feature = "std")]
#[test]
fn test_bigint_to_string() {
    let val = 0b101110011010011111010101011110000000101011110101;

    // Builds `val << bits` as a bigint.
    let shifted = |bits: usize| {
        let mut bi = BigInt::from_u64(val);
        bi.shift_left(bits);
        bi
    };

    // A shift that stays within two words.
    assert_eq!(
        shifted(32).as_binary(),
        "10111001101001111101010101111000\
        000010101111010100000000000000000\
        000000000000000"
    );

    // The value in the high word plus the same value in the low word.
    let both_words = shifted(64) + val;
    assert_eq!(
        both_words.as_binary(),
        "101110011010011111010101011110000000101011110101\
         0000000000000000\
         101110011010011111010101011110000000101011110101"
    );
}
609 | 
#[cfg(feature = "std")]
#[test]
fn test_bigint_to_decimal() {
    // Compute 40! and compare it with its known decimal expansion.
    let mut factorial = BigInt::one();
    (1..=40).for_each(|i| factorial.inplace_mul(&BigInt::from_u64(i)));

    assert_eq!(
        factorial.as_decimal(),
        "815915283247897734345611269596115894272000000000"
    );
}
624 | 


--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
  1 | //! This file contains simple helper functions and test helpers.
  2 | 
  3 | /// Returns a mask full of 1s, of `b` bits.
  4 | pub fn mask(b: usize) -> usize {
  5 |     (1 << (b)) - 1
  6 | }
  7 | 
  8 | #[test]
  9 | fn test_masking() {
 10 |     assert_eq!(mask(0), 0x0);
 11 |     assert_eq!(mask(1), 0x1);
 12 |     assert_eq!(mask(8), 255);
 13 | }
 14 | 
#[cfg(feature = "std")]
#[allow(dead_code)]
/// Returns list of interesting values that various tests use to catch edge cases.
///
/// The 22 values cover both NaN signs, both infinities, the machine epsilon
/// with both signs, a very small positive number, the f64 extremes, common
/// mathematical constants, both zeros, and a few ordinary positive and
/// negative numbers.
pub fn get_special_test_values() -> [f64; 22] {
    [
        -f64::NAN,
        f64::NAN,
        f64::INFINITY,
        f64::NEG_INFINITY,
        f64::EPSILON,
        -f64::EPSILON,
        0.000000000000000000000000000000000000001,
        f64::MIN,
        f64::MAX,
        std::f64::consts::PI,
        std::f64::consts::LN_2,
        std::f64::consts::SQRT_2,
        std::f64::consts::E,
        0.0,
        -0.0,
        10.,
        -10.,
        -0.00001,
        0.1,
        // A rational approximation of pi.
        355. / 113.,
        -1.0,
        -1.1,
    ]
}
 44 | 
 45 | // Linear-feedback shift register. We use this as a random number generator for
 46 | // tests.
 47 | pub struct Lfsr {
 48 |     state: u32,
 49 | }
 50 | 
 51 | impl Default for Lfsr {
 52 |     fn default() -> Self {
 53 |         Self::new()
 54 |     }
 55 | }
 56 | 
 57 | impl Lfsr {
 58 |     /// Generate a new LFSR number generator.
 59 |     pub fn new() -> Lfsr {
 60 |         Lfsr { state: 0x13371337 }
 61 |     }
 62 | 
 63 |     /// Generate a new LFSR number generator that starts with a specific state.
 64 |     pub fn new_with_seed(seed: u32) -> Lfsr {
 65 |         Lfsr {
 66 |             state: 0x13371337 ^ seed,
 67 |         }
 68 |     }
 69 | 
 70 |     pub fn next(&mut self) {
 71 |         let a = (self.state >> 24) & 1;
 72 |         let b = (self.state >> 23) & 1;
 73 |         let c = (self.state >> 22) & 1;
 74 |         let d = (self.state >> 17) & 1;
 75 |         let n = a ^ b ^ c ^ d ^ 1;
 76 |         self.state <<= 1;
 77 |         self.state |= n;
 78 |     }
 79 | 
 80 |     fn get(&mut self) -> u32 {
 81 |         let mut res: u32 = 0;
 82 |         for _ in 0..32 {
 83 |             self.next();
 84 |             res <<= 1;
 85 |             res ^= self.state & 0x1;
 86 |         }
 87 |         res
 88 |     }
 89 | 
 90 |     pub fn get64(&mut self) -> u64 {
 91 |         ((self.get() as u64) << 32) | self.get() as u64
 92 |     }
 93 | }
 94 | 
 95 | // Implement `Iterator` for `Lfsr`.
 96 | impl Iterator for Lfsr {
 97 |     type Item = u64;
 98 |     fn next(&mut self) -> Option<Self::Item> {
 99 |         Some(self.get64())
100 |     }
101 | }
102 | 
#[test]
fn test_lfsr_balance() {
    let mut lfsr = Lfsr::new();

    // Count the number of bits produced, and how many of them are 1s.
    let mut total_bits = 0;
    let mut set_bits = 0;

    for _ in 0..10000 {
        let word = lfsr.get();
        total_bits += 32;
        set_bits += word.count_ones();
    }

    // Make sure that we have around 50% 1s and 50% zeros.
    let ratio_low = 0.45 * total_bits as f64;
    let ratio_high = 0.55 * total_bits as f64;
    assert!((set_bits as f64) < ratio_high);
    assert!((set_bits as f64) > ratio_low);
}
#[test]
fn test_repetition() {
    let mut lfsr = Lfsr::new();
    let (first, second) = {
        let a = lfsr.get();
        let b = lfsr.get();
        (a, b)
    };

    // Make sure that the items don't repeat themselves too frequently. Each
    // round draws two fresh values and checks one against each reference.
    for _ in 0..30000 {
        let x = lfsr.get();
        assert_ne!(first, x);
        let y = lfsr.get();
        assert_ne!(second, y);
    }
}
135 | 
// Multiply a and b, and return the (low, high) parts of the 128-bit product.
// Schoolbook multiplication on 32-bit halves, using only 64-bit arithmetic.
#[allow(dead_code)]
fn mul_part(a: u64, b: u64) -> (u64, u64) {
    const HALF: u32 = u64::BITS / 2;
    const LOW_MASK: u64 = (1u64 << HALF) - 1;

    let (a_hi, a_lo) = (a >> HALF, a & LOW_MASK);
    let (b_hi, b_lo) = (b >> HALF, b & LOW_MASK);

    // The four partial products; each fits in 64 bits.
    let hi_hi = a_hi * b_hi;
    let hi_lo = a_hi * b_lo;
    let lo_hi = b_hi * a_lo;
    let lo_lo = a_lo * b_lo;

    // Carry out of the low word: sum the pieces that land in bits 32..64.
    let middle = (hi_lo & LOW_MASK) + (lo_hi & LOW_MASK) + (lo_lo >> HALF);
    let carry = middle >> HALF;

    // The low word wraps by design; the overflow is accounted for in `carry`.
    let low = (hi_lo << HALF)
        .wrapping_add(lo_hi << HALF)
        .wrapping_add(lo_lo);

    let high = hi_hi + (hi_lo >> HALF) + (lo_hi >> HALF) + carry;
    (low, high)
}
164 | 
#[test]
fn test_mul_parts() {
    // Compare `mul_part` with native 128-bit multiplication on 500
    // pseudo-random pairs.
    let mut lfsr = Lfsr::new();

    for _ in 0..500 {
        let lhs = lfsr.get64();
        let rhs = lfsr.get64();
        let (low, high) = mul_part(lhs, rhs);
        let expected = lhs as u128 * rhs as u128;
        assert_eq!(expected as u64, low);
        assert_eq!((expected >> 64) as u64, high);
    }
}
180 | 


--------------------------------------------------------------------------------