├── .golangci.yml ├── .travis.yml ├── README.md ├── UNLICENSE ├── bfloat ├── bfloat.go └── bfloat_test.go ├── binary128 ├── binary128.go ├── binary128_test.go ├── extra_test.go ├── extra_test.tmpl └── gen.go ├── binary16 ├── binary16.go ├── binary16_test.go ├── extra_test.go ├── extra_test.tmpl ├── gen.go └── testdata │ ├── Makefile │ ├── binary16.c │ └── binary16.ll ├── float.go ├── float128ppc ├── float128ppc.go └── float128ppc_test.go ├── float80x86 ├── float80x86.go └── float80x86_test.go ├── go.mod ├── go.sum └── internal └── strconv ├── decimal.go ├── extfloat.go ├── ftoa.go └── itoa.go /.golangci.yml: -------------------------------------------------------------------------------- 1 | linters: 2 | enable-all: true 3 | disable: 4 | - dupl 5 | - maligned 6 | - lll 7 | - gochecknoglobals 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - "1.12" 5 | - tip 6 | 7 | notifications: 8 | email: false 9 | 10 | env: 11 | global: 12 | # Coveralls.io token. 13 | - secure: "gIMQehPQgokFi1b2weJcxY3t5n17Idzr4ONULNylyERInyFg7RtuAPbqi1O2QXqroVDSI+lCYy/hmxvuHIlg0ps6lA4lUiDkr1ScC/p05vGTjEamlC2AacCbP1Oa8OSBhxsKwfu8Z713q/ppfvVlUrOpI9XS1pdQh6QAu3vdOpFPFEewbgQDUZj0K21raj8DROUFo0W548eHTj4CQbgSIkKtbysXrvwvR7fEvqzRnq/7HDqH+6JMahrHQRIrFIHdL8SZxtkKiR9/1QdmXVmY/ZjQUgKJWzGfFBPd5IfrrLNGupZIUFsOd5S2oUmFXiwYXdJ3HtyEJVHEM8M1UjJp/XLDmPFXeu4o1C3FpL8fOmFPda6iANUyMOGnquW+jPNNXAWfhMF06vjtPFDe5XpsemaXrwhmweMGsauBVWMfI9tGzbyko+bSYSlPjjGpcaEamGH1ioUULFMgtHz/cPm+mbvqqG/7Ccrhu8j1bLuZy3893IL/8miOmKrGu+8U6vUYO5PD+edCTx36uIR211mDzakchjttZkT7QR/Zox7QHW3GGvfpKeuPOCcFoG2ufEuRIWGNxH9c3hLfIq/xklLPIG+Oykuh7aNxGfplqaNVfRIdrwVis/BsW/N5Er7+qCePAvzxbIKfVcy77en20N5LUgdxe2CuuLOqRF5tXAqPzTE=" 14 | 15 | install: 16 | - go get -t ./... 17 | 18 | before_script: 19 | - wget https://github.com/mewmew/ci/raw/master/get_tools.sh 20 | - chmod +x get_tools.sh 21 | - ./get_tools.sh 22 | - wget https://github.com/mewmew/ci/raw/master/ci_checks.sh 23 | - chmod +x ci_checks.sh 24 | 25 | script: 26 | - GOTEST_RACE=0 ./ci_checks.sh 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # float 2 | 3 | [![Build Status](https://travis-ci.org/mewmew/float.svg?branch=master)](https://travis-ci.org/mewmew/float) 4 | [![Coverage Status](https://coveralls.io/repos/github/mewmew/float/badge.svg?branch=master)](https://coveralls.io/github/mewmew/float?branch=master) 5 | [![go.dev reference](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white&style=flat-square)](https://pkg.go.dev/github.com/mewmew/float) 6 | 7 | Floating-point formats. 8 | 9 | * [binary16](https://pkg.go.dev/github.com/mewmew/float/binary16) (IEEE 754 [half precision](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) floating-point format) 10 | * [binary128](https://pkg.go.dev/github.com/mewmew/float/binary128) (IEEE 754 [quadruple precision](https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format) floating-point format) 11 | * [float80x86](https://pkg.go.dev/github.com/mewmew/float/float80x86) ([x86 extended precision](https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format) floating-point format) 12 | * [float128ppc](https://pkg.go.dev/github.com/mewmew/float/float128ppc) ([PowerPC double-double arithmetic](https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Double-double_arithmetic) floating-point format) 13 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /bfloat/bfloat.go: -------------------------------------------------------------------------------- 1 | package bfloat 2 | 3 | import ( 4 | "fmt" 5 | "math/big" 6 | ) 7 | 8 | const ( 9 | // precision specifies the number of bits in the mantissa (including the 10 | // implicit lead bit). 11 | precision = 8 12 | // exponent bias. 13 | bias = 127 14 | ) 15 | 16 | // Float is a floating-point number in bfloat16 floating-point format. 17 | type Float struct { 18 | // Sign, exponent and fraction. 19 | // 20 | // 1 bit: sign 21 | // 8 bits: exponent 22 | // 7 bits: fraction 23 | bits uint16 24 | } 25 | 26 | func NewFromBits(bits uint16) Float { 27 | return Float{bits: bits} 28 | } 29 | 30 | func (f Float) Big() (x *big.Float, nan bool) { 31 | signbit := f.Signbit() 32 | exp := f.Exp() 33 | frac := f.Frac() 34 | x = big.NewFloat(0) 35 | x.SetPrec(precision) 36 | x.SetMode(big.ToNearestEven) 37 | 38 | // ref: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Contrast_with_bfloat16_and_single_precision 39 | // 40 | // 0b00001 - 0b11110 41 | // Normalized number. 42 | // 43 | // (-1)^signbit * 2^(exp-127) * 1.mant_2 44 | lead := 1 45 | exponent := exp - bias 46 | 47 | switch exp { 48 | case 0xFF: 49 | // Inf or NaN 50 | if frac == 0 { 51 | // +-Inf 52 | x.SetInf(signbit) 53 | return x, false 54 | } 55 | // +-NaN 56 | if signbit { 57 | x.Neg(x) 58 | } 59 | return x, true 60 | case 0x00: 61 | if frac == 0 { 62 | // +-Zero 63 | if signbit { 64 | x.Neg(x) 65 | } 66 | return x, false 67 | } 68 | // Denormalized number. 69 | // 70 | // (-1)^signbit * 2^(-126) * 0.mant_2 71 | lead = 0 72 | exponent = -126 73 | } 74 | 75 | // number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity . 76 | sign := "+" 77 | if signbit { 78 | sign = "-" 79 | } 80 | s := fmt.Sprintf("%s0b%d.%07bp%d", sign, lead, frac, exponent) 81 | if _, _, err := x.Parse(s, 0); err != nil { 82 | panic(err) 83 | } 84 | return x, false 85 | } 86 | 87 | // Signbit reports whether f is negative or negative 0. 88 | func (f Float) Signbit() bool { 89 | // first bit is sign bit: 0b1000000000000000 90 | return f.bits&0x8000 != 0 91 | } 92 | 93 | // Exp returns the exponent of f. 94 | func (f Float) Exp() int { 95 | // 8 bit exponent: 0b0111111110000000 96 | return int(f.bits & 0x7F80 >> 7) 97 | } 98 | 99 | // Frac returns the fraction of f. 100 | func (f Float) Frac() uint16 { 101 | // 7 bit mantissa: 0b0000000001111111 102 | return f.bits & 0x7F 103 | } 104 | -------------------------------------------------------------------------------- /bfloat/bfloat_test.go: -------------------------------------------------------------------------------- 1 | package bfloat 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | func TestNewFromBits(t *testing.T) { 9 | golden := []struct { 10 | bits uint16 11 | want float64 12 | }{ 13 | // Special numbers. 14 | // 0 00000000 0000000 = 0 15 | {bits: 0, want: 0}, 16 | // 1 00000000 0000000 = -0 17 | {bits: 0x8000, want: 1. / math.Inf(-1)}, 18 | // 0 11111111 0000000 = +Inf 19 | {bits: 0x7f80, want: math.Inf(1)}, 20 | // 1 11111111 0000000 = -Inf 21 | {bits: 0xff80, want: math.Inf(-1)}, 22 | 23 | // 0 11111111 0000001 = +NaN 24 | {bits: 0x7f81, want: math.NaN()}, 25 | // 1 11111111 0000001 = -NaN 26 | {bits: 0xff81, want: -math.NaN()}, 27 | 28 | // from: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Examples 29 | {bits: 0x3f80, want: 1}, 30 | {bits: 0xc000, want: -2}, 31 | {bits: 0x4049, want: 3.140625}, 32 | {bits: 0x3eab, want: 0.333984375}, 33 | } 34 | for _, g := range golden { 35 | f := NewFromBits(g.bits) 36 | b, isNan := f.Big() 37 | got, _ := b.Float64() 38 | if isNan { 39 | got = g.want 40 | } 41 | wantBits := math.Float64bits(g.want) 42 | gotBits := math.Float64bits(got) 43 | if wantBits != gotBits { 44 | t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got) 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /binary128/binary128.go: -------------------------------------------------------------------------------- 1 | //go:generate go run gen.go -o extra_test.go 2 | 3 | // Package binary128 implements encoding and decoding of IEEE 754 quadruple 4 | // precision floating-point numbers. 5 | // 6 | // https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format 7 | package binary128 8 | 9 | import ( 10 | "fmt" 11 | "math" 12 | "math/big" 13 | ) 14 | 15 | const ( 16 | // precision specifies the number of bits in the mantissa (including the 17 | // implicit lead bit). 18 | precision = 113 19 | // exponent bias. 20 | bias = 16383 21 | ) 22 | 23 | // Positive and negative Not-a-Number, infinity and zero. 24 | var ( 25 | // +NaN 26 | NaN = Float{a: 0x7FFF800000000000, b: 0} 27 | // -NaN 28 | NegNaN = Float{a: 0xFFFF800000000000, b: 0} 29 | // +Inf 30 | Inf = Float{a: 0x7FFF000000000000, b: 0} 31 | // -Inf 32 | NegInf = Float{a: 0xFFFF000000000000, b: 0} 33 | // +zero 34 | Zero = Float{a: 0x0000000000000000, b: 0} 35 | // -zero 36 | NegZero = Float{a: 0x8000000000000000, b: 0} 37 | ) 38 | 39 | // Float is a floating-point number in IEEE 754 quadruple precision format. 40 | type Float struct { 41 | // Sign, exponent and fraction. 42 | // 43 | // 1 bit: sign 44 | // 15 bits: exponent 45 | // 112 bits: fraction 46 | a uint64 47 | b uint64 48 | } 49 | 50 | // NewFromBits returns the floating-point number corresponding to the IEEE 754 51 | // quadruple precision binary representation. 52 | func NewFromBits(a, b uint64) Float { 53 | return Float{a: a, b: b} 54 | } 55 | 56 | // NewFromFloat32 returns the nearest quadruple precision floating-point number 57 | // for x and the accuracy of the conversion. 58 | func NewFromFloat32(x float32) (Float, big.Accuracy) { 59 | f, acc := NewFromFloat64(float64(x)) 60 | if acc == big.Exact { 61 | _, acc = f.Float32() 62 | } 63 | return f, acc 64 | } 65 | 66 | // NewFromFloat64 returns the nearest quadruple precision floating-point number 67 | // for x and the accuracy of the conversion. 68 | func NewFromFloat64(x float64) (Float, big.Accuracy) { 69 | // +-NaN 70 | switch { 71 | case math.IsNaN(x): 72 | if math.Signbit(x) { 73 | // -NaN 74 | return NegNaN, big.Exact 75 | } 76 | // +NaN 77 | return NaN, big.Exact 78 | } 79 | y := big.NewFloat(x) 80 | y.SetPrec(precision) 81 | y.SetMode(big.ToNearestEven) 82 | // TODO: check accuracy after setting precision? 83 | return NewFromBig(y) 84 | } 85 | 86 | // NewFromBig returns the nearest quadruple precision floating-point number for 87 | // x and the accuracy of the conversion. 88 | func NewFromBig(x *big.Float) (Float, big.Accuracy) { 89 | // +-Inf 90 | zero := big.NewFloat(0).SetPrec(precision) 91 | switch { 92 | case x.IsInf(): 93 | if x.Signbit() { 94 | // -Inf 95 | return NegInf, big.Exact 96 | } 97 | // +Inf 98 | return Inf, big.Exact 99 | // +-zero 100 | case x.Cmp(zero) == 0: 101 | if x.Signbit() { 102 | // -zero 103 | return NegZero, big.Exact 104 | } 105 | // +zero 106 | return Zero, big.Exact 107 | } 108 | 109 | // Sign 110 | var a, b uint64 111 | if x.Signbit() { 112 | a |= 0x8000000000000000 113 | } 114 | 115 | // Exponent and mantissa. 116 | mant := new(big.Float).SetPrec(precision) 117 | exponent := x.MantExp(mant) 118 | // Remove 1 from the exponent as big.Float has an no lead bit. 119 | exp := exponent - 1 + bias 120 | 121 | // Handle denormalized values. 122 | // TODO: validate implementation of denormalized values. 123 | if exp <= 0 { 124 | acc := big.Exact 125 | if exp <= -(precision - 1) { 126 | exp = precision - 1 127 | acc = big.Below 128 | } 129 | mant.SetMantExp(mant, exp+precision-1) 130 | if mant.Signbit() { 131 | mant.Neg(mant) 132 | } 133 | mantissa, _ := mant.Int(nil) 134 | maskA := big.NewInt(0) 135 | for i := 64; i < 112; i++ { 136 | maskA.SetBit(maskA, i, 1) 137 | } 138 | maskB := big.NewInt(0) 139 | for i := 0; i < 64; i++ { 140 | maskB.SetBit(maskB, i, 1) 141 | } 142 | bigA := new(big.Int).And(mantissa, maskA) // a = (mantissa & maskA) >> 64 143 | bigA.Rsh(bigA, 64) 144 | bigB := new(big.Int).And(mantissa, maskB) // b = mantissa & maskB 145 | // TODO: calculate acc based on if mantissa&^maskA != 0 {} 146 | a |= bigA.Uint64() & 0x0000FFFFFFFFFFFF 147 | b = bigB.Uint64() 148 | return Float{a: a, b: b}, acc 149 | } 150 | 151 | // exponent mask (15 bits): 0b111111111111111 152 | acc := big.Exact 153 | if (exp &^ 0x7FFF) != 0 { 154 | acc = big.Above 155 | } 156 | a |= uint64(exp&0x7FFF) << 48 157 | 158 | if mant.Signbit() { 159 | mant.Neg(mant) 160 | } 161 | mant.SetMantExp(mant, precision) 162 | if !mant.IsInt() { 163 | acc = big.Below 164 | } 165 | mantissa, _ := mant.Int(nil) 166 | mantissa.SetBit(mantissa, 112, 0) // clear implicit lead bit; 2^112 167 | 168 | // mantissa mask (113 bits, including implicit lead bit): 0x1FFFFFFFFFFFFFFFFFFFFFFFFFFFF 169 | maskA := big.NewInt(0) 170 | for i := 64; i < 112; i++ { 171 | maskA.SetBit(maskA, i, 1) 172 | } 173 | maskB := big.NewInt(0) 174 | for i := 0; i < 64; i++ { 175 | maskB.SetBit(maskB, i, 1) 176 | } 177 | bigA := new(big.Int).And(mantissa, maskA) // a = (mantissa & maskA) >> 64 178 | bigA.Rsh(bigA, 64) 179 | bigB := new(big.Int).And(mantissa, maskB) // b = mantissa & maskB 180 | if acc == big.Exact && (bigA.Uint64()&^0x0000FFFFFFFFFFFF) != 0 { 181 | acc = big.Below 182 | } 183 | a |= bigA.Uint64() & 0x0000FFFFFFFFFFFF 184 | b = bigB.Uint64() 185 | return Float{a: a, b: b}, acc 186 | } 187 | 188 | // Bits returns the IEEE 754 quadruple precision binary representation of f. 189 | func (f Float) Bits() (a, b uint64) { 190 | return f.a, f.b 191 | } 192 | 193 | // Float32 returns the float32 value nearest to f. If f is too small to be 194 | // represented by a float32 (|f| < math.SmallestNonzeroFloat32), the result is 195 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is 196 | // too large to be represented by a float32 (|f| > math.MaxFloat32), the result 197 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f. 198 | func (f Float) Float32() (float32, big.Accuracy) { 199 | x, nan := f.Big() 200 | if nan { 201 | if x.Signbit() { 202 | return float32(-math.NaN()), big.Exact 203 | } 204 | return float32(math.NaN()), big.Exact 205 | } 206 | return x.Float32() 207 | } 208 | 209 | // Float64 returns the float64 value nearest to f. If f is too small to be 210 | // represented by a float64 (|f| < math.SmallestNonzeroFloat64), the result is 211 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is 212 | // too large to be represented by a float64 (|f| > math.MaxFloat64), the result 213 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f. 214 | func (f Float) Float64() (float64, big.Accuracy) { 215 | x, nan := f.Big() 216 | if nan { 217 | if x.Signbit() { 218 | return -math.NaN(), big.Exact 219 | } 220 | return math.NaN(), big.Exact 221 | } 222 | return x.Float64() 223 | } 224 | 225 | // Big returns the multi-precision floating-point number representation of f and 226 | // a boolean indicating whether f is Not-a-Number. 227 | func (f Float) Big() (x *big.Float, nan bool) { 228 | signbit := f.Signbit() 229 | exp := f.Exp() 230 | frac1, frac2 := f.Frac() 231 | x = big.NewFloat(0) 232 | x.SetPrec(precision) 233 | x.SetMode(big.ToNearestEven) 234 | 235 | lead := 1 236 | exponent := exp - bias 237 | 238 | switch exp { 239 | // 0b111111111111111 240 | case 0x7FFF: 241 | // Inf or NaN 242 | if frac1 == 0 && frac2 == 0 { 243 | // +-Inf 244 | x.SetInf(signbit) 245 | return x, false 246 | } 247 | // +-NaN 248 | if signbit { 249 | x.Neg(x) 250 | } 251 | return x, true 252 | // 0b000000000000000 253 | case 0x0000: 254 | if frac1 == 0 && frac2 == 0 { 255 | // +-Zero 256 | if signbit { 257 | x.Neg(x) 258 | } 259 | return x, false 260 | } 261 | // Denormalized number. 262 | // 263 | // (-1)^signbit * 2^(-16382) * 0.mant_2 264 | lead = 0 265 | exponent = -16382 266 | } 267 | 268 | // number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity . 269 | sign := "+" 270 | if signbit { 271 | sign = "-" 272 | } 273 | // first part cut the sign and exponent which only contains 48 bits 274 | fracStr := fmt.Sprintf("%048b%064b", frac1, frac2) 275 | s := fmt.Sprintf("%s0b%d.%sp%d", sign, lead, fracStr, exponent) 276 | if _, _, err := x.Parse(s, 0); err != nil { 277 | panic(err) 278 | } 279 | return x, false 280 | } 281 | 282 | // Signbit reports whether f is negative or negative 0. 283 | func (f Float) Signbit() bool { 284 | // first bit is sign bit 285 | return f.a&0x8000000000000000 != 0 286 | } 287 | 288 | // Exp returns the exponent of f. 289 | func (f Float) Exp() int { 290 | // 15 bit exponent 291 | return int(f.a&0x7FFF000000000000) >> 48 292 | } 293 | 294 | // Frac returns the fraction of f. 295 | func (f Float) Frac() (uint64, uint64) { 296 | // 0x0000FFFFFFFFFFFF removes the sign and exponent part (total 16 bits) from 297 | // our floating-point number. Now we can say it contains 48 bits of fraction, 298 | // and `f.b` part has the rest of fraction. 299 | return (f.a & 0x0000FFFFFFFFFFFF), f.b 300 | } 301 | -------------------------------------------------------------------------------- /binary128/binary128_test.go: -------------------------------------------------------------------------------- 1 | package binary128 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/big" 7 | "testing" 8 | ) 9 | 10 | func TestNewFromBits(t *testing.T) { 11 | const rawpi = "3.1415926535897932384626433832795028" 12 | pi, ok := newFloat(0).SetString(rawpi) 13 | if !ok { 14 | panic(fmt.Errorf("unable to create arbitrary floating-point value of pi (%q)", rawpi)) 15 | } 16 | golden := []struct { 17 | a, b uint64 18 | want *big.Float 19 | nan bool 20 | }{ 21 | // Special numbers. 22 | // +NaN 23 | // 0x7FFF 8000000000000000000000000000 = +NaN 24 | {a: 0x7FFF800000000000, b: 0x0000000000000001, want: newFloat(0), nan: true}, 25 | // -NaN 26 | // 0xFFFF 8000000000000000000000000000 = -NaN 27 | {a: 0xFFFF800000000000, b: 0x0000000000000000, want: newFloat(math.Copysign(0, -1)), nan: true}, 28 | // +inf 29 | // 0x7FFF0000000000000000000000000000 = +inf 30 | {a: 0x7FFF000000000000, b: 0x0000000000000000, want: newFloat(0).SetInf(false)}, 31 | // -inf 32 | // 0xFFFF0000000000000000000000000000 = -inf 33 | {a: 0xFFFF000000000000, b: 0x0000000000000000, want: newFloat(0).SetInf(true)}, 34 | // +0 35 | // 0x00000000000000000000000000000000 = +0 36 | {a: 0x0000000000000000, b: 0x0000000000000000, want: newFloat(+0)}, 37 | // -0 38 | // 0x80000000000000000000000000000000 = -0 39 | {a: 0x8000000000000000, b: 0x0000000000000000, want: newFloat(math.Copysign(0, -1))}, 40 | 41 | // from: https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Quadruple_precision_examples 42 | 43 | // smallest positive subnormal number 44 | // 0x00000000000000000000000000000001 = 2^{-16382} * 2^{-112} = 2^{-16494} 45 | {a: 0x0000000000000000, b: 0x0000000000000001, want: pow(newFloat(2), -16494)}, 46 | // largest subnormal number 47 | // 0x0000FFFFFFFFFFFFFFFFFFFFFFFFFFFF = 2^{-16382} * (1 - 2^{-112}) 48 | {a: 0x0000FFFFFFFFFFFF, b: 0xFFFFFFFFFFFFFFFF, want: mul(pow(newFloat(2), -16382), sub(newFloat(1), pow(newFloat(2), -112)))}, 49 | // smallest positive normal number 50 | // 0x00010000000000000000000000000000 = 2^{-16382} 51 | {a: 0x0001000000000000, b: 0x0000000000000000, want: pow(newFloat(2), -16382)}, 52 | // largest normal number 53 | // 0x7FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF = 2^16383 * (2 - 2^{-112}) 54 | {a: 0x7FFEFFFFFFFFFFFF, b: 0xFFFFFFFFFFFFFFFF, want: mul(pow(newFloat(2), 16383), sub(newFloat(2), pow(newFloat(2), -112)))}, 55 | // largest number less than one 56 | // 0x3FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF = 1 - 2^{-113} 57 | {a: 0x3FFEFFFFFFFFFFFF, b: 0xFFFFFFFFFFFFFFFF, want: sub(newFloat(1), pow(newFloat(2), -113))}, 58 | // one 59 | // 0x3FFF0000000000000000000000000000 = 1 60 | {a: 0x3FFF000000000000, b: 0x0000000000000000, want: newFloat(1)}, 61 | // smallest number larger than one 62 | // 0x3FFF0000000000000000000000000001 = 1 + 2^{-112} 63 | {a: 0x3FFF000000000000, b: 0x0000000000000001, want: add(newFloat(1), pow(newFloat(2), -112))}, 64 | // -2 65 | // 0xC0000000000000000000000000000000 = -2 66 | {a: 0xC000000000000000, b: 0x0000000000000000, want: newFloat(-2)}, 67 | // pi 68 | // 0x4000921FB54442D18469898CC51701B8 = pi 69 | {a: 0x4000921FB54442D1, b: 0x8469898CC51701B8, want: pi}, 70 | // 1/3 71 | // 0x3FFD5555555555555555555555555555 = 1/3 72 | {a: 0x3FFD555555555555, b: 0x5555555555555555, want: newFloat(0).SetRat(big.NewRat(1, 3))}, 73 | } 74 | for _, g := range golden { 75 | f := NewFromBits(g.a, g.b) 76 | got, nan := f.Big() 77 | if g.want.Cmp(got) != 0 { 78 | t.Errorf("0x%016X%016X: floating-point number mismatch; expected %v, got %v", g.a, g.b, g.want, got) 79 | } 80 | if g.nan != nan { 81 | t.Errorf("0x%016X%016X: floating-point Not-a-Number indicator mismatch; expected %v, got %v", g.a, g.b, g.nan, nan) 82 | } 83 | } 84 | } 85 | 86 | func TestNewFromFloat64(t *testing.T) { 87 | golden := []struct { 88 | in float64 89 | a, b uint64 90 | acc big.Accuracy 91 | }{ 92 | // Special numbers. 93 | // 0x7FFF 8000000000000000000000000000 = +NaN 94 | {in: math.NaN(), a: 0x7FFF800000000000, b: 0x0000000000000000, acc: big.Exact}, 95 | // -NaN 96 | // 0xFFFF 8000000000000000000000000000 = -NaN 97 | {in: -math.NaN(), a: 0xFFFF800000000000, b: 0x0000000000000000, acc: big.Exact}, 98 | // +inf 99 | // 0x7FFF0000000000000000000000000000 = +inf 100 | {in: math.Inf(+1), a: 0x7FFF000000000000, b: 0x0000000000000000, acc: big.Exact}, 101 | // -inf 102 | // 0xFFFF0000000000000000000000000000 = -inf 103 | {in: math.Inf(-1), a: 0xFFFF000000000000, b: 0x0000000000000000, acc: big.Exact}, 104 | // +0 105 | // 0x00000000000000000000000000000000 = +0 106 | {in: +0, a: 0x0000000000000000, b: 0x0000000000000000, acc: big.Exact}, 107 | // -0 108 | // 0x80000000000000000000000000000000 = -0 109 | {in: math.Copysign(0, -1), a: 0x8000000000000000, b: 0x0000000000000000, acc: big.Exact}, 110 | 111 | // from: https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Quadruple_precision_examples 112 | 113 | // one 114 | // 0x3FFF0000000000000000000000000000 = 1 115 | {in: 1, a: 0x3FFF000000000000, b: 0x0000000000000000, acc: big.Exact}, 116 | // -2 117 | // 0xC0000000000000000000000000000000 = -2 118 | {in: -2, a: 0xC000000000000000, b: 0x0000000000000000, acc: big.Exact}, 119 | // pi 120 | // 0x4000921FB54442D18469898CC51701B8 = pi 121 | {in: math.Pi, a: 0x4000921FB54442D1, b: 0x8469898CC51701B8, acc: big.Exact}, 122 | // 1/3 123 | // 0x3FFD5555555555555555555555555555 = 1/3 124 | {in: 1.0 / 3.0, a: 0x3FFD555555555555, b: 0x5555555555555555, acc: big.Exact}, 125 | } 126 | for _, g := range golden { 127 | f, acc := NewFromFloat64(g.in) 128 | a, b := f.Bits() 129 | x, _ := f.Float64() 130 | // mask last 60 bits, as float64 only has 53 bits precision (as compared to 131 | // 113 of binary128). 132 | const mask = 0xF000000000000000 133 | wantBMask := g.b & mask 134 | gotBMask := b & mask 135 | if g.a != a || wantBMask != gotBMask { 136 | t.Errorf("bits mismatch; expected 0x%016X%016X (%v), got 0x%016X%016X (%v)", g.a, wantBMask, g.in, a, gotBMask, x) 137 | } 138 | if g.acc != acc { 139 | t.Errorf("accuracy mismatch; expected %v (%v), got %v (%v)", g.acc, g.in, acc, x) 140 | } 141 | } 142 | } 143 | 144 | // ### [ Helper functions ] #################################################### 145 | 146 | // pow returns x**y, the base-x exponential of y. 147 | func pow(x *big.Float, y int64) *big.Float { 148 | switch { 149 | // x^{-42} 150 | case y < 0: 151 | z := newFloat(1) 152 | for i := int64(0); i < -y; i++ { 153 | z = div(z, x) 154 | } 155 | return z 156 | // x^42 157 | case y > 0: 158 | z := newFloat(1) 159 | for i := int64(0); i < y; i++ { 160 | z = mul(z, x) 161 | } 162 | return z 163 | // x^0 164 | default: // y == 0 165 | return newFloat(1) 166 | } 167 | } 168 | 169 | // add returns the sum x+y. 170 | func add(x, y *big.Float) *big.Float { 171 | return newFloat(0).Add(x, y) 172 | } 173 | 174 | // add returns the difference x-y. 175 | func sub(x, y *big.Float) *big.Float { 176 | return newFloat(0).Sub(x, y) 177 | } 178 | 179 | // add returns the product x*y. 180 | func mul(x, y *big.Float) *big.Float { 181 | return newFloat(0).Mul(x, y) 182 | } 183 | 184 | // add returns the quotient x/y. 185 | func div(x, y *big.Float) *big.Float { 186 | return newFloat(0).Quo(x, y) 187 | } 188 | 189 | // newFloat returns a new floating-point value based on x with precision 113. 190 | func newFloat(x float64) *big.Float { 191 | return big.NewFloat(0).SetPrec(precision).SetFloat64(x) 192 | } 193 | -------------------------------------------------------------------------------- /binary128/extra_test.tmpl: -------------------------------------------------------------------------------- 1 | // Code generated by go run gen.go; DO NOT EDIT. 2 | 3 | package binary128 4 | 5 | import ( 6 | "math" 7 | "math/big" 8 | "testing" 9 | ) 10 | 11 | func TestNewFromBitsNormalized(t *testing.T) { 12 | testNewFromBits(t, goldenNormalized) 13 | } 14 | 15 | func TestNewFromBitsDenormalized(t *testing.T) { 16 | testNewFromBits(t, goldenDenormalized) 17 | } 18 | 19 | func testNewFromBits(t *testing.T, golden []Golden) { 20 | for _, g := range golden { 21 | f := NewFromBits(g.a, g.b) 22 | // Check arbitrary precision floating-point value. 23 | got, _ := f.Big() 24 | gotStr := got.Text('g', 35) 25 | if g.want != gotStr { 26 | t.Errorf("0x%016X%016X: floating-point number mismatch; expected %v, got %v", g.a, g.b, g.want, gotStr) 27 | } 28 | // Check 64-bit floating-point value. 29 | got64, acc64 := f.Float64() 30 | want64Bits := math.Float64bits(g.want64) 31 | got64Bits := math.Float64bits(got64) 32 | if want64Bits != got64Bits { 33 | t.Errorf("0x%016X%016X: floating-point number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.a, g.b, want64Bits, g.want64, got64Bits, got64) 34 | } 35 | // Check accuracy of 64-bit floating-point value, as compared to arbitrary 36 | // precision floating-point value. 37 | if g.acc64 != acc64 { 38 | t.Errorf("0x%016X%016X: floating-point accuracy mismatch for float64 %v of big %v; expected %v, got %v", g.a, g.b, g.want64, g.want, g.acc64, acc64) 39 | } 40 | // Validate 64-bit floating-point accuracy. 41 | wantBig, _ := big.NewFloat(0).SetPrec(precision).SetString(g.want) 42 | got64Big := big.NewFloat(got64) 43 | switch acc64 { 44 | case big.Below: 45 | // got is below want 46 | if got64Big.Cmp(wantBig) != -1 { 47 | t.Errorf("%v: floating-point value not below; expected %v < %v, got %v >= %v", g.want, got64, g.want, got64, g.want) 48 | } 49 | case big.Exact: 50 | // got is want 51 | if got64Big.Cmp(wantBig) != 0 { 52 | t.Errorf("%v: floating-point value not equal; expected %v == %v, got %v != %v", g.want, got64, g.want, got64, g.want) 53 | } 54 | case big.Above: 55 | // got is above want 56 | if got64Big.Cmp(wantBig) != 1 { 57 | t.Errorf("%v: floating-point value not above; expected %v > %v, got %v <= %v", g.want, got64, g.want, got64, g.want) 58 | } 59 | } 60 | // Validate 32-bit floating-point accuracy. 61 | got32, acc32 := f.Float32() 62 | got32Big := big.NewFloat(float64(got32)) 63 | switch acc32 { 64 | case big.Below: 65 | // got is below want 66 | if got32Big.Cmp(wantBig) != -1 { 67 | t.Errorf("%v: floating-point value not below; expected %v < %v, got %v >= %v", g.want, got32, g.want, got32, g.want) 68 | } 69 | case big.Exact: 70 | // got is want 71 | if got32Big.Cmp(wantBig) != 0 { 72 | t.Errorf("%v: floating-point value not equal; expected %v == %v, got %v != %v", g.want, got32, g.want, got32, g.want) 73 | } 74 | case big.Above: 75 | // got is above want 76 | if got32Big.Cmp(wantBig) != 1 { 77 | t.Errorf("%v: floating-point value not above; expected %v > %v, got %v <= %v", g.want, got32, g.want, got32, g.want) 78 | } 79 | } 80 | } 81 | } 82 | 83 | func TestNewFromFloat32Normalized(t *testing.T) { 84 | for _, g := range goldenNormalized { 85 | want32 := float32(g.want64) 86 | f, _ := NewFromFloat32(want32) 87 | got32, _ := f.Float32() 88 | if want32 != got32 { 89 | t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, want32, got32) 90 | } 91 | } 92 | } 93 | 94 | func TestNewFromFloat64Normalized(t *testing.T) { 95 | for _, g := range goldenNormalized { 96 | f, _ := NewFromFloat64(g.want64) 97 | got64, _ := f.Float64() 98 | if g.want64 != got64 { 99 | t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, g.want64, got64) 100 | } 101 | } 102 | } 103 | 104 | func TestNewFromFloat32Denormalized(t *testing.T) { 105 | for _, g := range goldenDenormalized { 106 | want32 := float32(g.want64) 107 | f, _ := NewFromFloat32(want32) 108 | got32, _ := f.Float32() 109 | if want32 != got32 { 110 | t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, want32, got32) 111 | } 112 | } 113 | } 114 | 115 | func TestNewFromFloat64Denormalized(t *testing.T) { 116 | for _, g := range goldenDenormalized { 117 | f, _ := NewFromFloat64(g.want64) 118 | got64, _ := f.Float64() 119 | if g.want64 != got64 { 120 | t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, g.want64, got64) 121 | } 122 | } 123 | } 124 | 125 | type Golden struct { 126 | a, b uint64 127 | want string 128 | want64 float64 129 | acc64 big.Accuracy 130 | } 131 | 132 | var goldenNormalized = []Golden{ 133 | // Normalized values. 134 | {{- range .normalized }} 135 | {{ . }} 136 | {{- end }} 137 | } 138 | 139 | var goldenDenormalized = []Golden{ 140 | // Denormalized values. 141 | {{- range .denormalized }} 142 | {{ . }} 143 | {{- end }} 144 | } 145 | -------------------------------------------------------------------------------- /binary128/gen.go: -------------------------------------------------------------------------------- 1 | //+build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "flag" 7 | "fmt" 8 | "log" 9 | "math" 10 | "math/big" 11 | "math/rand" 12 | "os" 13 | "sort" 14 | "text/template" 15 | 16 | "github.com/pkg/errors" 17 | ) 18 | 19 | func main() { 20 | var out string 21 | flag.StringVar(&out, "o", "extra_test.go", "test cases output path") 22 | flag.Parse() 23 | if err := dumpTest(out); err != nil { 24 | log.Fatalf("%+v", err) 25 | } 26 | } 27 | 28 | func dumpTest(path string) error { 29 | f, err := os.Create(path) 30 | if err != nil { 31 | return errors.WithStack(err) 32 | } 33 | defer f.Close() 34 | t, err := template.ParseFiles("extra_test.tmpl") 35 | if err != nil { 36 | return errors.WithStack(err) 37 | } 38 | // Use deterministic source for pseudo-random nunmbers. 39 | rand.Seed(1234) 40 | // Randomize the exponent since the number of exponent, mantissa combinations 41 | // otherwise become huge. 42 | var exps []int 43 | const nrandExps = 64 44 | for i := 0; i < nrandExps; i++ { 45 | // exponent bits: 0x0001 - 0x7FFE 46 | exp := rand.Intn(0x7FFE) + 1 47 | exps = append(exps, exp) 48 | } 49 | sort.Ints(exps) 50 | // Randomize the mantissa since we cannot check 112 (48 + 64) bits 51 | // exhaustively. 52 | var mants []MantBits 53 | const nrandMants = 512 54 | for i := 0; i < nrandMants; i++ { 55 | // 48 bits. 56 | a := rand.Uint64() & 0xFFFFFFFFFFFF 57 | // 64 bits. 58 | b := rand.Uint64() 59 | mant := MantBits{a: a, b: b} 60 | mants = append(mants, mant) 61 | } 62 | sort.Slice(mants, func(i, j int) bool { 63 | if mants[i].a < mants[j].a { 64 | return true 65 | } 66 | return mants[i].b < mants[j].b 67 | }) 68 | data := map[string][]string{ 69 | "normalized": getNormalized(exps, mants), 70 | "denormalized": getDenormalized(mants), 71 | } 72 | if err := t.Execute(f, data); err != nil { 73 | return errors.WithStack(err) 74 | } 75 | return nil 76 | } 77 | 78 | const ( 79 | // precision specifies the number of bits in the mantissa (including the 80 | // implicit lead bit). 81 | precision = 113 82 | // exponent bias. 83 | bias = 16383 84 | ) 85 | 86 | type MantBits struct { 87 | // 48 bits. 88 | a uint64 89 | // 64 bits. 90 | b uint64 91 | } 92 | 93 | func getNormalized(exps []int, mants []MantBits) []string { 94 | var ns []string 95 | // normalized 96 | // 97 | // exponent bits: 0x0001 - 0x7FFE 98 | // 99 | // (-1)^signbit * 2^(exp-16383) * 1.mant_2 100 | const lead = 1 101 | for signbit := 0; signbit <= 1; signbit++ { 102 | sign := "+" 103 | if signbit == 1 { 104 | sign = "-" 105 | } 106 | for _, exp := range exps { 107 | exponent := exp - bias 108 | // mantissa bits: 112 (48 + 64) bits 109 | for _, mantBits := range mants { 110 | mant := fmt.Sprintf("%048b%064b", mantBits.a, mantBits.b) 111 | s := fmt.Sprintf("%s0b%d.%sp%d", sign, lead, mant, exponent) 112 | m, _, err := big.ParseFloat(s, 0, precision, big.ToNearestEven) 113 | if err != nil { 114 | panic(err) 115 | } 116 | want := m.Text('g', 35) 117 | want64, acc64 := m.Float64() 118 | a := uint64(signbit) << 63 119 | a |= uint64(exp) << 48 120 | a |= mantBits.a 121 | b := mantBits.b 122 | var n string 123 | switch { 124 | // Compare floating-point bits of want64, as otherwise +0 == -0 125 | case math.Float64bits(want64) == math.Float64bits(math.Copysign(0, -1)): 126 | // -zero 127 | n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Copysign(0, -1), acc64: big.%v}, // %s", a, b, want, acc64, s) 128 | case want64 == math.Inf(+1): 129 | // +inf 130 | n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(+1), acc64: big.%v}, // %s", a, b, want, acc64, s) 131 | case want64 == math.Inf(-1): 132 | // -inf 133 | n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(-1), acc64: big.%v}, // %s", a, b, want, acc64, s) 134 | default: 135 | n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: %v, acc64: big.%v}, // %s", a, b, want, want64, acc64, s) 136 | } 137 | ns = append(ns, n) 138 | } 139 | } 140 | } 141 | return ns 142 | } 143 | 144 | func getDenormalized(mants []MantBits) []string { 145 | var ds []string 146 | // denormalized 147 | // 148 | // exponent bits: 0x0000 149 | // 150 | // (-1)^signbit * 2^(-14) * 0.mant_2 151 | const lead = 0 152 | for signbit := 0; signbit <= 1; signbit++ { 153 | sign := "+" 154 | if signbit == 1 { 155 | sign = "-" 156 | } 157 | const exp = 0x0000 158 | exponent := exp - bias + 1 159 | // mantissa bits: 112 (48 + 64) bits 160 | for _, mantBits := range mants { 161 | mant := fmt.Sprintf("%048b%064b", mantBits.a, mantBits.b) 162 | s := fmt.Sprintf("%s0b%d.%sp%d", sign, lead, mant, exponent) 163 | m, _, err := big.ParseFloat(s, 0, precision, big.ToNearestEven) 164 | if err != nil { 165 | panic(err) 166 | } 167 | want := m.Text('g', 35) 168 | want64, acc64 := m.Float64() 169 | a := uint64(signbit) << 63 170 | a |= uint64(exp) << 48 171 | a |= mantBits.a 172 | b := mantBits.b 173 | var d string 174 | switch { 175 | // Compare floating-point bits of want64, as otherwise +0 == -0 176 | case math.Float64bits(want64) == math.Float64bits(math.Copysign(0, -1)): 177 | // -zero 178 | d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Copysign(0, -1), acc64: big.%v}, // %s", a, b, want, acc64, s) 179 | case want64 == math.Inf(+1): 180 | // +inf 181 | d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(+1), acc64: big.%v}, // %s", a, b, want, acc64, s) 182 | case want64 == math.Inf(-1): 183 | // -inf 184 | d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(-1), acc64: big.%v}, // %s", a, b, want, acc64, s) 185 | default: 186 | d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: %v, acc64: big.%v}, // %s", a, b, want, want64, acc64, s) 187 | } 188 | ds = append(ds, d) 189 | } 190 | } 191 | return ds 192 | } 193 | -------------------------------------------------------------------------------- /binary16/binary16.go: -------------------------------------------------------------------------------- 1 | //go:generate go run gen.go -o extra_test.go 2 | 3 | // Package binary16 implements encoding and decoding of IEEE 754 half precision 4 | // floating-point numbers. 5 | // 6 | // https://en.wikipedia.org/wiki/Half-precision_floating-point_format 7 | package binary16 8 | 9 | import ( 10 | "fmt" 11 | "math" 12 | "math/big" 13 | ) 14 | 15 | const ( 16 | // precision specifies the number of bits in the mantissa (including the 17 | // implicit lead bit). 18 | precision = 11 19 | // exponent bias. 20 | bias = 15 21 | ) 22 | 23 | // Positive and negative Not-a-Number, infinity and zero. 24 | var ( 25 | // +NaN 26 | NaN = Float{bits: 0x7E00} 27 | // -NaN 28 | NegNaN = Float{bits: 0xFE00} 29 | // +Inf 30 | Inf = Float{bits: 0x7C00} 31 | // -Inf 32 | NegInf = Float{bits: 0xFC00} 33 | // +zero 34 | Zero = Float{bits: 0x0000} 35 | // -zero 36 | NegZero = Float{bits: 0x8000} 37 | ) 38 | 39 | // Float is a floating-point number in IEEE 754 half precision format. 40 | type Float struct { 41 | // Sign, exponent and fraction. 42 | // 43 | // 1 bit: sign 44 | // 5 bits: exponent 45 | // 10 bits: fraction 46 | bits uint16 47 | } 48 | 49 | // NewFromBits returns the floating-point number corresponding to the IEEE 754 50 | // half precision binary representation. 51 | func NewFromBits(bits uint16) Float { 52 | return Float{bits: bits} 53 | } 54 | 55 | // NewFromFloat32 returns the nearest half precision floating-point number for x 56 | // and the accuracy of the conversion. 57 | func NewFromFloat32(x float32) (Float, big.Accuracy) { 58 | f, acc := NewFromFloat64(float64(x)) 59 | if acc == big.Exact { 60 | _, acc = f.Float32() 61 | } 62 | return f, acc 63 | } 64 | 65 | // NewFromFloat64 returns the nearest half precision floating-point number for x 66 | // and the accuracy of the conversion. 67 | func NewFromFloat64(x float64) (Float, big.Accuracy) { 68 | // +-NaN 69 | switch { 70 | case math.IsNaN(x): 71 | if math.Signbit(x) { 72 | // -NaN 73 | return NegNaN, big.Exact 74 | } 75 | // +NaN 76 | return NaN, big.Exact 77 | } 78 | y := big.NewFloat(x) 79 | y.SetPrec(precision) 80 | y.SetMode(big.ToNearestEven) 81 | // TODO: check accuracy after setting precision? 82 | return NewFromBig(y) 83 | } 84 | 85 | // NewFromBig returns the nearest half precision floating-point number for x and 86 | // the accuracy of the conversion. 87 | func NewFromBig(x *big.Float) (Float, big.Accuracy) { 88 | // +-Inf 89 | zero := big.NewFloat(0) 90 | switch { 91 | case x.IsInf(): 92 | if x.Signbit() { 93 | // -Inf 94 | return NegInf, big.Exact 95 | } 96 | // +Inf 97 | return Inf, big.Exact 98 | // +-zero 99 | case x.Cmp(zero) == 0: 100 | if x.Signbit() { 101 | // -zero 102 | return NegZero, big.Exact 103 | } 104 | // +zero 105 | return Zero, big.Exact 106 | } 107 | 108 | // Sign 109 | var bits uint16 110 | if x.Signbit() { 111 | bits |= 0x8000 112 | } 113 | 114 | // Exponent and mantissa. 115 | mant := new(big.Float) 116 | exponent := x.MantExp(mant) 117 | // Remove 1 from the exponent as big.Float has an no lead bit. 118 | exp := exponent - 1 + bias 119 | 120 | // Handle denormalized values. 121 | // TODO: validate implementation of denormalized values. 122 | if exp <= 0 { 123 | acc := big.Exact 124 | if exp <= -(precision - 1) { 125 | exp = precision - 1 126 | acc = big.Below 127 | } 128 | mant.SetMantExp(mant, exp+precision-1) 129 | if mant.Signbit() { 130 | mant.Neg(mant) 131 | } 132 | mantissa, _ := mant.Uint64() 133 | // TODO: calculate acc based on if mantissa&^0x7FF != 0 {} 134 | bits |= uint16(mantissa & 0x7FF) 135 | return Float{bits: bits}, acc 136 | } 137 | 138 | // exponent mask (5 bits): 0b11111 139 | acc := big.Exact 140 | if (exp &^ 0x1F) != 0 { 141 | acc = big.Above 142 | } 143 | bits |= uint16(exp&0x1F) << 10 144 | 145 | if mant.Signbit() { 146 | mant.Neg(mant) 147 | } 148 | mant.SetMantExp(mant, precision) 149 | if !mant.IsInt() { 150 | acc = big.Below 151 | } 152 | mantissa, _ := mant.Uint64() 153 | mantissa &^= 0x400 // clear implicit lead bit; 2^10 154 | 155 | // mantissa mask (11 bits, including implicit lead bit): 0b11111111111 156 | if acc == big.Exact && (mantissa&^0x7FF) != 0 { 157 | acc = big.Below 158 | } 159 | mantissa &= 0x7FF 160 | bits |= uint16(mantissa) 161 | return Float{bits: bits}, acc 162 | } 163 | 164 | // Bits returns the IEEE 754 half precision binary representation of f. 165 | func (f Float) Bits() uint16 { 166 | return f.bits 167 | } 168 | 169 | // Float32 returns the float32 value nearest to f. If f is too small to be 170 | // represented by a float32 (|f| < math.SmallestNonzeroFloat32), the result is 171 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is 172 | // too large to be represented by a float32 (|f| > math.MaxFloat32), the result 173 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f. 174 | func (f Float) Float32() (float32, big.Accuracy) { 175 | x, nan := f.Big() 176 | if nan { 177 | if x.Signbit() { 178 | return float32(-math.NaN()), big.Exact 179 | } 180 | return float32(math.NaN()), big.Exact 181 | } 182 | return x.Float32() 183 | } 184 | 185 | // Float64 returns the float64 value nearest to f. If f is too small to be 186 | // represented by a float64 (|f| < math.SmallestNonzeroFloat64), the result is 187 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is 188 | // too large to be represented by a float64 (|f| > math.MaxFloat64), the result 189 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f. 190 | func (f Float) Float64() (float64, big.Accuracy) { 191 | x, nan := f.Big() 192 | if nan { 193 | if x.Signbit() { 194 | return -math.NaN(), big.Exact 195 | } 196 | return math.NaN(), big.Exact 197 | } 198 | return x.Float64() 199 | } 200 | 201 | // Big returns the multi-precision floating-point number representation of f and 202 | // a boolean indicating whether f is Not-a-Number. 203 | func (f Float) Big() (x *big.Float, nan bool) { 204 | signbit := f.Signbit() 205 | exp := f.Exp() 206 | frac := f.Frac() 207 | x = big.NewFloat(0) 208 | x.SetPrec(precision) 209 | x.SetMode(big.ToNearestEven) 210 | 211 | // ref: https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Exponent_encoding 212 | // 213 | // 0b00001 - 0b11110 214 | // Normalized number. 215 | // 216 | // (-1)^signbit * 2^(exp-15) * 1.mant_2 217 | lead := 1 218 | exponent := exp - bias 219 | 220 | switch exp { 221 | // 0b11111 222 | case 0x1F: 223 | // Inf or NaN 224 | if frac == 0 { 225 | // +-Inf 226 | x.SetInf(signbit) 227 | return x, false 228 | } 229 | // +-NaN 230 | if signbit { 231 | x.Neg(x) 232 | } 233 | return x, true 234 | // 0b00000 235 | case 0x00: 236 | if frac == 0 { 237 | // +-Zero 238 | if signbit { 239 | x.Neg(x) 240 | } 241 | return x, false 242 | } 243 | // Denormalized number. 244 | // 245 | // (-1)^signbit * 2^(-14) * 0.mant_2 246 | lead = 0 247 | exponent = -14 248 | } 249 | 250 | // number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity . 251 | sign := "+" 252 | if signbit { 253 | sign = "-" 254 | } 255 | s := fmt.Sprintf("%s0b%d.%010bp%d", sign, lead, frac, exponent) 256 | if _, _, err := x.Parse(s, 0); err != nil { 257 | panic(err) 258 | } 259 | return x, false 260 | } 261 | 262 | // Signbit reports whether f is negative or negative 0. 263 | func (f Float) Signbit() bool { 264 | // first bit is sign bit: 0b1000000000000000 265 | return f.bits&0x8000 != 0 266 | } 267 | 268 | // Exp returns the exponent of f. 269 | func (f Float) Exp() int { 270 | // 5 bit exponent: 0b0111110000000000 271 | return int(f.bits & 0x7C00 >> 10) 272 | } 273 | 274 | // Frac returns the fraction of f. 275 | func (f Float) Frac() uint16 { 276 | // 10 bit mantissa: 0b0000001111111111 277 | return f.bits & 0x03FF 278 | } 279 | -------------------------------------------------------------------------------- /binary16/binary16_test.go: -------------------------------------------------------------------------------- 1 | package binary16 2 | 3 | import ( 4 | "math" 5 | "math/big" 6 | "testing" 7 | ) 8 | 9 | func TestNewFromBits(t *testing.T) { 10 | golden := []struct { 11 | bits uint16 12 | want float64 13 | }{ 14 | // Special numbers. 15 | // 0 11111 1000000000 = +NaN 16 | {bits: 0x7E00, want: math.NaN()}, 17 | // -NaN 18 | // 1 11111 1000000000 = -NaN 19 | {bits: 0xFE00, want: -math.NaN()}, 20 | 21 | // from: https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Half_precision_examples 22 | 23 | // 0 01111 0000000000 = 1 24 | {bits: 0x3C00, want: 1}, 25 | // 0 01111 0000000001 = 1 + 2^(-10) = 1.0009765625 (next smallest float after 1) 26 | {bits: 0x3C01, want: 1.0009765625}, 27 | // 1 10000 0000000000 = -2 28 | {bits: 0xC000, want: -2}, 29 | // 0 11110 1111111111 = 65504 (max half precision) 30 | {bits: 0x7BFF, want: 65504}, 31 | // 0 00001 0000000000 = 2^(-14) ~= 6.10352 * 10^(-5) (minimum positive normal) 32 | {bits: 0x0400, want: math.Pow(2, -14)}, 33 | // 0 00000 0000000001 = 2^(-24) ~= 5.96046 * 10^(-8) (minimum positive subnormal) 34 | {bits: 0x0001, want: math.Pow(2, -24)}, 35 | // 0 00000 0000000000 = 0 36 | {bits: 0x0000, want: 0}, 37 | // 1 00000 0000000000 = −0 38 | {bits: 0x8000, want: math.Copysign(0, -1)}, 39 | // 0 11111 0000000000 = infinity 40 | {bits: 0x7C00, want: math.Inf(1)}, 41 | // 1 11111 0000000000 = -infinity 42 | {bits: 0xFC00, want: math.Inf(-1)}, 43 | // 0 01101 0101010101 = 0.333251953125 ~= 1/3 44 | {bits: 0x3555, want: 0.333251953125}, 45 | 46 | // from: https://reviews.llvm.org/rL237161 47 | 48 | // Normalized numbers. 49 | // 0 01110 0000000000 = 0.5 50 | {bits: 0x3800, want: 0.5}, 51 | // 1 01110 0000000000 = -0.5 52 | {bits: 0xB800, want: -0.5}, 53 | // 0 01111 1000000000 = 1.5 54 | {bits: 0x3E00, want: 1.5}, 55 | // 1 01111 1000000000 = -1.5 56 | {bits: 0xBE00, want: -1.5}, 57 | // 0 10000 0100000000 = 2.5 58 | {bits: 0x4100, want: 2.5}, 59 | // 1 10000 0100000000 = -2.5 60 | {bits: 0xC100, want: -2.5}, 61 | // Denormalized numbers. 62 | // 0 00000 0000010000 = 2^(-20) 63 | {bits: 0x0010, want: math.Pow(2, -20)}, 64 | // 1 00000 0000000001 = -2^(-24) 65 | {bits: 0x8001, want: -math.Pow(2, -24)}, 66 | 67 | // 2^i 68 | {bits: 0x0001, want: math.Pow(2, -24)}, // 2^(-24) 69 | {bits: 0x0002, want: math.Pow(2, -23)}, // 2^(-23) 70 | {bits: 0x0004, want: math.Pow(2, -22)}, // 2^(-22) 71 | {bits: 0x0008, want: math.Pow(2, -21)}, // 2^(-21) 72 | {bits: 0x0010, want: math.Pow(2, -20)}, // 2^(-20) 73 | {bits: 0x0020, want: math.Pow(2, -19)}, // 2^(-19) 74 | {bits: 0x0040, want: math.Pow(2, -18)}, // 2^(-18) 75 | {bits: 0x0080, want: math.Pow(2, -17)}, // 2^(-17) 76 | {bits: 0x0100, want: math.Pow(2, -16)}, // 2^(-16) 77 | {bits: 0x0200, want: math.Pow(2, -15)}, // 2^(-15) 78 | {bits: 0x0400, want: math.Pow(2, -14)}, // 2^(-14) 79 | {bits: 0x0800, want: math.Pow(2, -13)}, // 2^(-13) 80 | {bits: 0x0C00, want: math.Pow(2, -12)}, // 2^(-12) 81 | {bits: 0x1000, want: math.Pow(2, -11)}, // 2^(-11) 82 | {bits: 0x1400, want: math.Pow(2, -10)}, // 2^(-10) 83 | {bits: 0x1800, want: math.Pow(2, -9)}, // 2^(-9) 84 | {bits: 0x1C00, want: math.Pow(2, -8)}, // 2^(-8) 85 | {bits: 0x2000, want: math.Pow(2, -7)}, // 2^(-7) 86 | {bits: 0x2400, want: math.Pow(2, -6)}, // 2^(-6) 87 | {bits: 0x2800, want: math.Pow(2, -5)}, // 2^(-5) 88 | {bits: 0x2C00, want: math.Pow(2, -4)}, // 2^(-4) 89 | {bits: 0x3000, want: math.Pow(2, -3)}, // 2^(-3) 90 | {bits: 0x3400, want: math.Pow(2, -2)}, // 2^(-2) 91 | {bits: 0x3800, want: math.Pow(2, -1)}, // 2^(-1) 92 | {bits: 0x3C00, want: math.Pow(2, 0)}, // 2^0 93 | {bits: 0x4000, want: math.Pow(2, 1)}, // 2^1 94 | {bits: 0x4400, want: math.Pow(2, 2)}, // 2^2 95 | {bits: 0x4800, want: math.Pow(2, 3)}, // 2^3 96 | {bits: 0x4C00, want: math.Pow(2, 4)}, // 2^4 97 | {bits: 0x5000, want: math.Pow(2, 5)}, // 2^5 98 | {bits: 0x5400, want: math.Pow(2, 6)}, // 2^6 99 | {bits: 0x5800, want: math.Pow(2, 7)}, // 2^7 100 | {bits: 0x5C00, want: math.Pow(2, 8)}, // 2^8 101 | {bits: 0x6000, want: math.Pow(2, 9)}, // 2^9 102 | {bits: 0x6400, want: math.Pow(2, 10)}, // 2^10 103 | {bits: 0x6800, want: math.Pow(2, 11)}, // 2^11 104 | {bits: 0x6C00, want: math.Pow(2, 12)}, // 2^12 105 | {bits: 0x7000, want: math.Pow(2, 13)}, // 2^13 106 | {bits: 0x7400, want: math.Pow(2, 14)}, // 2^14 107 | {bits: 0x7800, want: math.Pow(2, 15)}, // 2^15 108 | } 109 | for _, g := range golden { 110 | f := NewFromBits(g.bits) 111 | got, _ := f.Float64() 112 | wantBits := math.Float64bits(g.want) 113 | gotBits := math.Float64bits(got) 114 | //fmt.Printf("bits: 0x%04X (%v)\n", g.bits, g.want) 115 | if wantBits != gotBits { 116 | t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got) 117 | } 118 | } 119 | } 120 | 121 | func TestNewFromFloat64(t *testing.T) { 122 | golden := []struct { 123 | in float64 124 | want uint16 125 | acc big.Accuracy 126 | }{ 127 | // Special numbers. 128 | // 0 11111 1000000000 = +NaN 129 | {in: math.NaN(), want: 0x7E00, acc: big.Exact}, 130 | // -NaN 131 | // 1 11111 1000000000 = -NaN 132 | {in: -math.NaN(), want: 0xFE00, acc: big.Exact}, 133 | 134 | // from: https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Half_precision_examples 135 | 136 | // 0 01111 0000000000 = 1 137 | {in: 1, want: 0x3C00, acc: big.Exact}, 138 | // 0 01111 0000000001 = 1 + 2^(-10) = 1.0009765625 (next smallest float after 1) 139 | {in: 1.0009765625, want: 0x3C01, acc: big.Exact}, 140 | // 1 10000 0000000000 = -2 141 | {in: -2, want: 0xC000, acc: big.Exact}, 142 | // 0 11110 1111111111 = 65504 (max half precision) 143 | {in: 65504, want: 0x7BFF, acc: big.Exact}, 144 | // 0 00001 0000000000 = 2^(-14) ~= 6.10352 * 10^(-5) (minimum positive normal) 145 | {in: math.Pow(2, -14), want: 0x0400, acc: big.Exact}, 146 | // 0 00000 0000000001 = 2^(-24) ~= 5.96046 * 10^(-8) (minimum positive subnormal) 147 | {in: math.Pow(2, -24), want: 0x0001, acc: big.Exact}, 148 | // 0 00000 0000000000 = 0 149 | {in: 0, want: 0x0000, acc: big.Exact}, 150 | // 1 00000 0000000000 = −0 151 | {in: math.Copysign(0, -1), want: 0x8000, acc: big.Exact}, 152 | // 0 11111 0000000000 = infinity 153 | {in: math.Inf(1), want: 0x7C00, acc: big.Exact}, 154 | // 1 11111 0000000000 = -infinity 155 | {in: math.Inf(-1), want: 0xFC00, acc: big.Exact}, 156 | // 0 01101 0101010101 = 0.333251953125 ~= 1/3 157 | {in: 0.333251953125, want: 0x3555, acc: big.Exact}, 158 | 159 | // from: https://reviews.llvm.org/rL237161 160 | 161 | // Normalized numbers. 162 | // 0 01110 0000000000 = 0.5 163 | {in: 0.5, want: 0x3800, acc: big.Exact}, 164 | // 1 01110 0000000000 = -0.5 165 | {in: -0.5, want: 0xB800, acc: big.Exact}, 166 | // 0 01111 1000000000 = 1.5 167 | {in: 1.5, want: 0x3E00, acc: big.Exact}, 168 | // 1 01111 1000000000 = -1.5 169 | {in: -1.5, want: 0xBE00, acc: big.Exact}, 170 | // 0 10000 0100000000 = 2.5 171 | {in: 2.5, want: 0x4100, acc: big.Exact}, 172 | // 1 10000 0100000000 = -2.5 173 | {in: -2.5, want: 0xC100, acc: big.Exact}, 174 | // Denormalized numbers. 175 | // 0 00000 0000010000 = 2^(-20) 176 | {in: math.Pow(2, -20), want: 0x0010, acc: big.Exact}, 177 | // 1 00000 0000000001 = -2^(-24) 178 | {in: -math.Pow(2, -24), want: 0x8001, acc: big.Exact}, 179 | 180 | // 2^i 181 | {in: math.Pow(2, -25), want: 0x0000, acc: big.Below}, // 2^(-25) 182 | {in: math.Pow(2, -24), want: 0x0001, acc: big.Exact}, // 2^(-24) 183 | {in: math.Pow(2, -23), want: 0x0002, acc: big.Exact}, // 2^(-23) 184 | {in: math.Pow(2, -22), want: 0x0004, acc: big.Exact}, // 2^(-22) 185 | {in: math.Pow(2, -21), want: 0x0008, acc: big.Exact}, // 2^(-21) 186 | {in: math.Pow(2, -20), want: 0x0010, acc: big.Exact}, // 2^(-20) 187 | {in: math.Pow(2, -19), want: 0x0020, acc: big.Exact}, // 2^(-19) 188 | {in: math.Pow(2, -18), want: 0x0040, acc: big.Exact}, // 2^(-18) 189 | {in: math.Pow(2, -17), want: 0x0080, acc: big.Exact}, // 2^(-17) 190 | {in: math.Pow(2, -16), want: 0x0100, acc: big.Exact}, // 2^(-16) 191 | {in: math.Pow(2, -15), want: 0x0200, acc: big.Exact}, // 2^(-15) 192 | {in: math.Pow(2, -14), want: 0x0400, acc: big.Exact}, // 2^(-14) 193 | {in: math.Pow(2, -13), want: 0x0800, acc: big.Exact}, // 2^(-13) 194 | {in: math.Pow(2, -12), want: 0x0C00, acc: big.Exact}, // 2^(-12) 195 | {in: math.Pow(2, -11), want: 0x1000, acc: big.Exact}, // 2^(-11) 196 | {in: math.Pow(2, -10), want: 0x1400, acc: big.Exact}, // 2^(-10) 197 | {in: math.Pow(2, -9), want: 0x1800, acc: big.Exact}, // 2^(-9) 198 | {in: math.Pow(2, -8), want: 0x1C00, acc: big.Exact}, // 2^(-8) 199 | {in: math.Pow(2, -7), want: 0x2000, acc: big.Exact}, // 2^(-7) 200 | {in: math.Pow(2, -6), want: 0x2400, acc: big.Exact}, // 2^(-6) 201 | {in: math.Pow(2, -5), want: 0x2800, acc: big.Exact}, // 2^(-5) 202 | {in: math.Pow(2, -4), want: 0x2C00, acc: big.Exact}, // 2^(-4) 203 | {in: math.Pow(2, -3), want: 0x3000, acc: big.Exact}, // 2^(-3) 204 | {in: math.Pow(2, -2), want: 0x3400, acc: big.Exact}, // 2^(-2) 205 | {in: math.Pow(2, -1), want: 0x3800, acc: big.Exact}, // 2^(-1) 206 | {in: math.Pow(2, 0), want: 0x3C00, acc: big.Exact}, // 2^0 207 | {in: math.Pow(2, 1), want: 0x4000, acc: big.Exact}, // 2^1 208 | {in: math.Pow(2, 2), want: 0x4400, acc: big.Exact}, // 2^2 209 | {in: math.Pow(2, 3), want: 0x4800, acc: big.Exact}, // 2^3 210 | {in: math.Pow(2, 4), want: 0x4C00, acc: big.Exact}, // 2^4 211 | {in: math.Pow(2, 5), want: 0x5000, acc: big.Exact}, // 2^5 212 | {in: math.Pow(2, 6), want: 0x5400, acc: big.Exact}, // 2^6 213 | {in: math.Pow(2, 7), want: 0x5800, acc: big.Exact}, // 2^7 214 | {in: math.Pow(2, 8), want: 0x5C00, acc: big.Exact}, // 2^8 215 | {in: math.Pow(2, 9), want: 0x6000, acc: big.Exact}, // 2^9 216 | {in: math.Pow(2, 10), want: 0x6400, acc: big.Exact}, // 2^10 217 | {in: math.Pow(2, 11), want: 0x6800, acc: big.Exact}, // 2^11 218 | {in: math.Pow(2, 12), want: 0x6C00, acc: big.Exact}, // 2^12 219 | {in: math.Pow(2, 13), want: 0x7000, acc: big.Exact}, // 2^13 220 | {in: math.Pow(2, 14), want: 0x7400, acc: big.Exact}, // 2^14 221 | {in: math.Pow(2, 15), want: 0x7800, acc: big.Exact}, // 2^15 222 | } 223 | for _, g := range golden { 224 | f, acc := NewFromFloat64(g.in) 225 | got := f.Bits() 226 | x, _ := f.Float64() 227 | if g.want != got { 228 | t.Errorf("bits mismatch; expected 0x%04X (%v), got 0x%04X (%v)", g.want, g.in, got, x) 229 | } 230 | if g.acc != acc { 231 | t.Errorf("accuracy mismatch; expected %v (%v), got %v (%v)", g.acc, g.in, acc, x) 232 | } 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /binary16/extra_test.tmpl: -------------------------------------------------------------------------------- 1 | // Code generated by go run gen.go; DO NOT EDIT. 2 | 3 | package binary16 4 | 5 | import ( 6 | "math" 7 | "testing" 8 | ) 9 | 10 | func TestNewFromBitsNormalized(t *testing.T) { 11 | for _, g := range goldenNormalized { 12 | f := NewFromBits(g.bits) 13 | got, _ := f.Float64() 14 | wantBits := math.Float64bits(g.want) 15 | gotBits := math.Float64bits(got) 16 | if wantBits != gotBits { 17 | t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got) 18 | } 19 | } 20 | } 21 | 22 | func TestNewFromBitsDenormalized(t *testing.T) { 23 | for _, g := range goldenDenormalized { 24 | f := NewFromBits(g.bits) 25 | got, _ := f.Float64() 26 | wantBits := math.Float64bits(g.want) 27 | gotBits := math.Float64bits(got) 28 | if wantBits != gotBits { 29 | t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got) 30 | } 31 | } 32 | } 33 | 34 | func TestNewFromFloat32Normalized(t *testing.T) { 35 | for _, g := range goldenNormalized { 36 | in := float32(g.want) 37 | f, acc := NewFromFloat32(in) 38 | _ = acc 39 | got := f.Bits() 40 | if g.bits != got { 41 | t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", in, g.bits, got) 42 | } 43 | } 44 | } 45 | 46 | func TestNewFromFloat64Normalized(t *testing.T) { 47 | for _, g := range goldenNormalized { 48 | f, acc := NewFromFloat64(g.want) 49 | _ = acc 50 | got := f.Bits() 51 | if g.bits != got { 52 | t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", g.want, g.bits, got) 53 | } 54 | } 55 | } 56 | 57 | func TestNewFromFloat32Denormalized(t *testing.T) { 58 | for _, g := range goldenDenormalized { 59 | in := float32(g.want) 60 | f, acc := NewFromFloat32(in) 61 | _ = acc 62 | got := f.Bits() 63 | if g.bits != got { 64 | t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", in, g.bits, got) 65 | } 66 | } 67 | } 68 | 69 | func TestNewFromFloat64Denormalized(t *testing.T) { 70 | for _, g := range goldenDenormalized { 71 | f, acc := NewFromFloat64(g.want) 72 | _ = acc 73 | got := f.Bits() 74 | if g.bits != got { 75 | t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", g.want, g.bits, got) 76 | } 77 | } 78 | } 79 | 80 | var goldenNormalized = []struct { 81 | bits uint16 82 | want float64 83 | }{ 84 | // Normalized values. 85 | {{- range .normalized }} 86 | {{ . }} 87 | {{- end }} 88 | } 89 | 90 | var goldenDenormalized = []struct { 91 | bits uint16 92 | want float64 93 | }{ 94 | // Denormalized values. 95 | {{- range .denormalized }} 96 | {{ . }} 97 | {{- end }} 98 | } 99 | -------------------------------------------------------------------------------- /binary16/gen.go: -------------------------------------------------------------------------------- 1 | //+build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "flag" 7 | "fmt" 8 | "log" 9 | "math" 10 | "math/big" 11 | "os" 12 | "text/template" 13 | 14 | "github.com/pkg/errors" 15 | ) 16 | 17 | func main() { 18 | var out string 19 | flag.StringVar(&out, "o", "extra_test.go", "test cases output path") 20 | flag.Parse() 21 | if err := dumpTest(out); err != nil { 22 | log.Fatalf("%+v", err) 23 | } 24 | } 25 | 26 | func dumpTest(path string) error { 27 | f, err := os.Create(path) 28 | if err != nil { 29 | return errors.WithStack(err) 30 | } 31 | defer f.Close() 32 | t, err := template.ParseFiles("extra_test.tmpl") 33 | if err != nil { 34 | return errors.WithStack(err) 35 | } 36 | data := map[string][]string{ 37 | "normalized": getNormalized(), 38 | "denormalized": getDenormalized(), 39 | } 40 | if err := t.Execute(f, data); err != nil { 41 | return errors.WithStack(err) 42 | } 43 | return nil 44 | } 45 | 46 | // exponent bias. 47 | const bias = 15 48 | 49 | func getNormalized() []string { 50 | var ns []string 51 | // normalized 52 | // 53 | // exponent bits: 0b00001 - 0b11110 54 | // 55 | // (-1)^signbit * 2^(exp-15) * 1.mant_2 56 | const lead = 1 57 | for signbit := 0; signbit <= 1; signbit++ { 58 | for exp := 1; exp <= 0x1E; exp++ { 59 | // mantissa bits: 0b0000000000 - 0b1111111111 60 | for mant := 0; mant <= 0x3FF; mant++ { 61 | s := fmt.Sprintf("%s0b%d.%010bp0", "+", lead, mant) 62 | m, _, err := big.ParseFloat(s, 0, 53, big.ToNearestEven) 63 | if err != nil { 64 | panic(err) 65 | } 66 | mantissa, acc := m.Float64() 67 | if acc != big.Exact { 68 | panic("not exact") 69 | } 70 | want := math.Pow(-1, float64(signbit)) * math.Pow(2, float64(exp)-bias) * mantissa 71 | bits := uint16(signbit) << 15 72 | bits |= uint16(exp) << 10 73 | bits |= uint16(mant) 74 | n := fmt.Sprintf("{bits: 0x%04X, want: %v}, // %s", bits, want, s) 75 | ns = append(ns, n) 76 | } 77 | } 78 | } 79 | return ns 80 | } 81 | 82 | func getDenormalized() []string { 83 | var ds []string 84 | // denormalized 85 | // 86 | // exponent bits: 0b00000 87 | // 88 | // (-1)^signbit * 2^(-14) * 0.mant_2 89 | const lead = 0 90 | for signbit := 0; signbit <= 1; signbit++ { 91 | // mantissa bits: 0b0000000000 - 0b1111111111 92 | const exp = 0 93 | for mant := 0; mant <= 0x3FF; mant++ { 94 | s := fmt.Sprintf("%s0b%d.%010bp0", "+", lead, mant) 95 | m, _, err := big.ParseFloat(s, 0, 53, big.ToNearestEven) 96 | if err != nil { 97 | panic(err) 98 | } 99 | mantissa, acc := m.Float64() 100 | if acc != big.Exact { 101 | panic("not exact") 102 | } 103 | want := math.Pow(-1, float64(signbit)) * math.Pow(2, exp-bias+1) * mantissa 104 | bits := uint16(signbit) << 15 105 | bits |= uint16(exp) << 10 106 | bits |= uint16(mant) 107 | if bits == 0x8000 { 108 | // -zero 109 | d := fmt.Sprintf("{bits: 0x%04X, want: math.Copysign(0, -1)}, // %s", bits, s) 110 | ds = append(ds, d) 111 | } else { 112 | d := fmt.Sprintf("{bits: 0x%04X, want: %v}, // %s", bits, want, s) 113 | ds = append(ds, d) 114 | } 115 | } 116 | } 117 | return ds 118 | } 119 | -------------------------------------------------------------------------------- /binary16/testdata/Makefile: -------------------------------------------------------------------------------- 1 | C=$(wildcard *.c) 2 | LL=$(C:.c=.ll) 3 | 4 | all: $(LL) 5 | 6 | %.ll: %.c sar 7 | clang -S -emit-llvm -o $@ $< 8 | sar -i "(?m:^[^@\n][^\n]*[\n])" "" $@ 9 | sar -i ", align[^\n]*" "" $@ 10 | sar -i "[\n]+" "\n" $@ 11 | sar -i "^[\n]+" "" $@ 12 | 13 | sar: 14 | @if ! which $@ &> /dev/null ; then \ 15 | echo "Please install the \"sar\" tool"; \ 16 | echo; \ 17 | echo " go get -u github.com/mewkiz/cmd/sar"; \ 18 | echo; \ 19 | exit 1; \ 20 | fi 21 | 22 | .PHONY: all 23 | -------------------------------------------------------------------------------- /binary16/testdata/binary16.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | _Float16 f_pos_nan = +NAN; 4 | _Float16 f_neg_nan = -NAN; 5 | _Float16 f_pos_inf = +INFINITY; 6 | _Float16 f_neg_inf = -INFINITY; 7 | _Float16 f_pos_0 = +0.0; 8 | _Float16 f_neg_0 = -0.0; 9 | _Float16 f_pos_0_dot_5 = +0.5; 10 | _Float16 f_neg_0_dot_5 = -0.5; 11 | _Float16 f_pos_1_dot_5 = +1.5; 12 | _Float16 f_neg_1_dot_5 = -1.5; 13 | _Float16 f_pos_2_dot_5 = +2.5; 14 | _Float16 f_neg_2_dot_5 = -2.5; 15 | _Float16 f_pos_3_dot_14 = +3.14; 16 | _Float16 f_neg_3_dot_14 = -3.14; 17 | -------------------------------------------------------------------------------- /binary16/testdata/binary16.ll: -------------------------------------------------------------------------------- 1 | @f_pos_nan = global half 0xH7E00 2 | @f_neg_nan = global half 0xHFE00 3 | @f_pos_inf = global half 0xH7C00 4 | @f_neg_inf = global half 0xHFC00 5 | @f_pos_0 = global half 0xH0000 6 | @f_neg_0 = global half 0xH8000 7 | @f_pos_0_dot_5 = global half 0xH3800 8 | @f_neg_0_dot_5 = global half 0xHB800 9 | @f_pos_1_dot_5 = global half 0xH3E00 10 | @f_neg_1_dot_5 = global half 0xHBE00 11 | @f_pos_2_dot_5 = global half 0xH4100 12 | @f_neg_2_dot_5 = global half 0xHC100 13 | @f_pos_3_dot_14 = global half 0xH4248 14 | @f_neg_3_dot_14 = global half 0xHC248 15 | -------------------------------------------------------------------------------- /float.go: -------------------------------------------------------------------------------- 1 | // Package float implements floating-point representation utility functions. 2 | package float 3 | 4 | import ( 5 | "fmt" 6 | "math/big" 7 | "strings" 8 | 9 | "github.com/mewmew/float/internal/strconv" 10 | ) 11 | 12 | // IsExact16 reports whether x may be represented exactly as a 16-bit 13 | // floating-point value. 14 | func IsExact16(x *big.Float) bool { 15 | f, acc := x.Float64() 16 | if acc != big.Exact { 17 | return false 18 | } 19 | s1 := strconv.FormatFloat(f, 'e', -1, 16) 20 | s2 := trimZeros(x.Text('e', 100)) 21 | return s1 == s2 22 | } 23 | 24 | // IsExact32 reports whether x may be represented exactly as a 32-bit 25 | // floating-point value. 26 | func IsExact32(x *big.Float) bool { 27 | f, acc := x.Float32() 28 | if acc != big.Exact { 29 | return false 30 | } 31 | s1 := strconv.FormatFloat(float64(f), 'e', -1, 32) 32 | s2 := trimZeros(x.Text('e', 100)) 33 | return s1 == s2 34 | } 35 | 36 | // IsExact64 reports whether x may be represented exactly as a 64-bit 37 | // floating-point value. 38 | func IsExact64(x *big.Float) bool { 39 | f, acc := x.Float64() 40 | if acc != big.Exact { 41 | return false 42 | } 43 | s1 := strconv.FormatFloat(f, 'e', -1, 64) 44 | s2 := trimZeros(x.Text('e', 100)) 45 | return s1 == s2 46 | } 47 | 48 | // trimZeros trims trailing zeroes after the decimal point in the given 49 | // floating-point value (represented in scientific notation). If all digits 50 | // after the decimal point are trimmed this way, the decimal point is also 51 | // trimmed. 52 | func trimZeros(s string) string { 53 | epos := strings.Index(s, "e") 54 | if epos == -1 { 55 | panic(fmt.Errorf("unable to locate position of exponent (e.g. e+02) in %q", s)) 56 | } 57 | pos := epos - 1 58 | for ; pos > 0; pos-- { 59 | if s[pos] != '0' { 60 | break 61 | } 62 | } 63 | if s[pos] != '.' { 64 | pos++ 65 | } 66 | return fmt.Sprintf("%s%s", s[:pos], s[epos:]) 67 | } 68 | -------------------------------------------------------------------------------- /float128ppc/float128ppc.go: -------------------------------------------------------------------------------- 1 | // Package float128ppc implements encoding and decoding of double-double 2 | // floating-point numbers. 3 | // 4 | // https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Double-double_arithmetic 5 | package float128ppc 6 | 7 | import ( 8 | "math" 9 | "math/big" 10 | ) 11 | 12 | const ( 13 | // precision specifies the number of bits in the mantissa (including the 14 | // implicit lead bit). 15 | precision = 106 16 | ) 17 | 18 | // Positive and negative Not-a-Number, infinity and zero. 19 | var ( 20 | // +NaN 21 | NaN = Float{high: math.NaN(), low: 0} 22 | // -NaN 23 | NegNaN = Float{high: -math.NaN(), low: 0} 24 | // +Inf 25 | Inf = Float{high: math.Inf(1), low: 0} 26 | // -Inf 27 | NegInf = Float{high: -math.Inf(-1), low: 0} 28 | // +zero 29 | Zero = Float{high: 0, low: 0} 30 | // -zero 31 | NegZero = Float{high: math.Copysign(0, -1), low: 0} 32 | ) 33 | 34 | // Float is a floating-point number in double-double format. 35 | type Float struct { 36 | // where a long double value is regarded as the exact sum of two double-precision values, giving at least a 106-bit precision 37 | high float64 38 | low float64 39 | } 40 | 41 | // NewFromBits returns the floating-point number corresponding to the 42 | // double-double representation. 43 | func NewFromBits(a, b uint64) Float { 44 | high := math.Float64frombits(a) 45 | low := math.Float64frombits(b) 46 | return Float{ 47 | high: high, 48 | low: low, 49 | } 50 | } 51 | 52 | // NewFromFloat32 returns the nearest double-double precision floating-point 53 | // number for x and the accuracy of the conversion. 54 | func NewFromFloat32(x float32) (Float, big.Accuracy) { 55 | f, acc := NewFromFloat64(float64(x)) 56 | if acc == big.Exact { 57 | _, acc = f.Float32() 58 | } 59 | return f, acc 60 | } 61 | 62 | // NewFromFloat64 returns the nearest double-double precision floating-point 63 | // number for x and the accuracy of the conversion. 64 | func NewFromFloat64(x float64) (Float, big.Accuracy) { 65 | // +-NaN 66 | switch { 67 | case math.IsNaN(x): 68 | if math.Signbit(x) { 69 | // -NaN 70 | return NegNaN, big.Exact 71 | } 72 | // +NaN 73 | return NaN, big.Exact 74 | } 75 | r := Float{high: x, low: 0} 76 | br, _ := r.Big() 77 | return r, br.Acc() 78 | } 79 | 80 | // NewFromBig returns the nearest double-double floating-point number for x and 81 | // the accuracy of the conversion. 82 | func NewFromBig(x *big.Float) (Float, big.Accuracy) { 83 | // +-Inf 84 | zero := big.NewFloat(0).SetPrec(precision) 85 | switch { 86 | case x.IsInf(): 87 | if x.Signbit() { 88 | // -Inf 89 | return NegInf, big.Exact 90 | } 91 | // +Inf 92 | return Inf, big.Exact 93 | // +-zero 94 | case x.Cmp(zero) == 0: 95 | if x.Signbit() { 96 | // -zero 97 | return NegZero, big.Exact 98 | } 99 | // +zero 100 | return Zero, big.Exact 101 | } 102 | 103 | // set precision of x. 104 | x.SetPrec(precision).SetMode(big.ToNearestEven) 105 | 106 | // get high part of the double-double floating-point value. 107 | high, _ := x.Float64() 108 | h := big.NewFloat(high).SetPrec(precision).SetMode(big.ToNearestEven) 109 | 110 | // compute low part by subtracting high from x. 111 | l := big.NewFloat(0).SetPrec(precision).SetMode(big.ToNearestEven) 112 | l.Sub(x, h) 113 | low, _ := l.Float64() 114 | 115 | // check accuracy of results. 116 | result := big.NewFloat(0).SetPrec(precision).SetMode(big.ToNearestEven) 117 | result.Add(h, l) 118 | acc := big.Accuracy(x.Cmp(result)) 119 | 120 | return Float{high: high, low: low}, acc 121 | } 122 | 123 | // Bits returns the double-double binary representation of f. 124 | func (f Float) Bits() (a, b uint64) { 125 | return math.Float64bits(f.high), math.Float64bits(f.low) 126 | } 127 | 128 | // Float32 returns the float32 representation of f. 129 | func (f Float) Float32() (float32, big.Accuracy) { 130 | x, nan := f.Big() 131 | if nan { 132 | if x.Signbit() { 133 | return float32(-math.NaN()), big.Exact 134 | } 135 | return float32(math.NaN()), big.Exact 136 | } 137 | return x.Float32() 138 | } 139 | 140 | // Float64 returns the float64 representation of f. 141 | func (f Float) Float64() (float64, big.Accuracy) { 142 | x, nan := f.Big() 143 | if nan { 144 | if x.Signbit() { 145 | return -math.NaN(), big.Exact 146 | } 147 | return math.NaN(), big.Exact 148 | } 149 | return x.Float64() 150 | } 151 | 152 | // Big returns the multi-precision floating-point number representation of f and 153 | // a boolean indicating whether f is Not-a-Number. 154 | func (f Float) Big() (x *big.Float, nan bool) { 155 | x = big.NewFloat(0) 156 | x.SetPrec(precision) 157 | x.SetMode(big.ToNearestEven) 158 | if f.IsNaN() { 159 | return x, true 160 | } 161 | h := big.NewFloat(f.high).SetPrec(precision) 162 | l := big.NewFloat(f.low).SetPrec(precision) 163 | x.Add(h, l) 164 | 165 | zero := big.NewFloat(0).SetPrec(precision) 166 | if x.Cmp(zero) == 0 && math.Signbit(f.high) { 167 | // -zero 168 | if !x.Signbit() { 169 | x.Neg(x) 170 | } 171 | } 172 | 173 | return x, false 174 | } 175 | 176 | // IsNaN returns true if the Float is NaN 177 | func (f Float) IsNaN() bool { 178 | // NaN + NaN should be NaN in consideration 179 | return math.IsNaN(f.high) || math.IsNaN(f.low) 180 | } 181 | -------------------------------------------------------------------------------- /float128ppc/float128ppc_test.go: -------------------------------------------------------------------------------- 1 | package float128ppc 2 | 3 | import ( 4 | "math/big" 5 | "testing" 6 | ) 7 | 8 | func TestRoundTrip(t *testing.T) { 9 | golden := []struct { 10 | h, l uint64 11 | }{ 12 | {h: 0x0000000000000000, l: 0x0000000000000000}, // "0xM00000000000000000000000000000000" 13 | {h: 0x3DF0000000000000, l: 0x0000000000000000}, // "0xM3DF00000000000000000000000000000" 14 | {h: 0x3FF0000000000000, l: 0x0000000000000000}, // "0xM3FF00000000000000000000000000000" 15 | {h: 0x4000000000000000, l: 0x0000000000000000}, // "0xM40000000000000000000000000000000" 16 | {h: 0x400C000000000030, l: 0x0000000010000000}, // "0xM400C0000000000300000000010000000" 17 | {h: 0x400F000000000000, l: 0xBCB0000000000000}, // "0xM400F000000000000BCB0000000000000" 18 | {h: 0x403B000000000000, l: 0x0000000000000000}, // "0xM403B0000000000000000000000000000" 19 | {h: 0x405EDA5E353F7CEE, l: 0x0000000000000000}, // "0xM405EDA5E353F7CEE0000000000000000" 20 | {h: 0x4093B40000000000, l: 0x0000000000000000}, // "0xM4093B400000000000000000000000000" 21 | {h: 0x41F0000000000000, l: 0x0000000000000000}, // "0xM41F00000000000000000000000000000" 22 | {h: 0x4D436562A0416DE0, l: 0x0000000000000000}, // "0xM4D436562A0416DE00000000000000000" 23 | {h: 0x8000000000000000, l: 0x0000000000000000}, // "0xM80000000000000000000000000000000" 24 | {h: 0x818F2887B9295809, l: 0x800000000032D000}, // "0xM818F2887B9295809800000000032D000" 25 | {h: 0xC00547AE147AE148, l: 0x3CA47AE147AE147A}, // "0xMC00547AE147AE1483CA47AE147AE147A" 26 | } 27 | for _, g := range golden { 28 | f1 := NewFromBits(g.h, g.l) 29 | fbig, nan := f1.Big() 30 | _ = nan 31 | f2, acc := NewFromBig(fbig) 32 | _ = acc 33 | h, l := f2.Bits() 34 | if g.h != h { 35 | t.Errorf("0xM%016X%016X; high mismatch; expected 0x%016X, got 0x%016X", g.h, g.l, g.h, h) 36 | } 37 | if g.l != l { 38 | t.Errorf("0xM%016X%016X; low mismatch; expected 0x%016X, got 0x%016X", g.h, g.l, g.l, l) 39 | } 40 | if acc != big.Exact { 41 | t.Errorf("0xM%016X%016X; round-trip result accuracy inexact; expected %v, got %v", g.h, g.l, big.Exact, acc) 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /float80x86/float80x86.go: -------------------------------------------------------------------------------- 1 | //go:generate go run gen.go -o extra_test.go 2 | 3 | // Package float80x86 implements encoding and decoding of x86 extended precision 4 | // floating-point numbers. 5 | // 6 | // https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format 7 | package float80x86 8 | 9 | import ( 10 | "fmt" 11 | "math" 12 | "math/big" 13 | ) 14 | 15 | const ( 16 | // precision specifies the number of bits in the mantissa (including the 17 | // explicit lead bit). 18 | precision = 64 19 | // exponent bias. 20 | bias = 16383 21 | ) 22 | 23 | // Positive and negative Not-a-Number, infinity and zero. 24 | var ( 25 | // +NaN 26 | NaN = Float{se: 0x7FFF, m: 0xBFFFFFFFFFFFFFFF} 27 | // -NaN 28 | NegNaN = Float{se: 0xFFFF, m: 0xBFFFFFFFFFFFFFFF} 29 | // +Inf 30 | Inf = Float{se: 0x7FFF, m: 0x8000000000000000} 31 | // -Inf 32 | NegInf = Float{se: 0xFFFF, m: 0x8000000000000000} 33 | // +zero 34 | Zero = Float{se: 0x0000, m: 0x0000000000000000} 35 | // -zero 36 | NegZero = Float{se: 0x8000, m: 0x0000000000000000} 37 | ) 38 | 39 | // Float is a floating-point number in x86 extended precision format. 40 | type Float struct { 41 | // Sign and exponent. 42 | // 43 | // 1 bit: sign 44 | // 15 bits: exponent 45 | se uint16 46 | // Integer part and fraction. 47 | // 48 | // 1 bit: integer part 49 | // 63 bits: fraction 50 | m uint64 51 | } 52 | 53 | // NewFromBits returns the floating-point number corresponding to the x86 54 | // extended precision representation. 55 | func NewFromBits(se uint16, m uint64) Float { 56 | return Float{se: se, m: m} 57 | } 58 | 59 | // NewFromFloat32 returns the nearest x86 extended precision floating-point 60 | // number for x and the accuracy of the conversion. 61 | func NewFromFloat32(x float32) (Float, big.Accuracy) { 62 | f, acc := NewFromFloat64(float64(x)) 63 | if acc == big.Exact { 64 | _, acc = f.Float32() 65 | } 66 | return f, acc 67 | } 68 | 69 | // NewFromFloat64 returns the nearest x86 extended precision floating-point 70 | // number for x and the accuracy of the conversion. 71 | func NewFromFloat64(x float64) (Float, big.Accuracy) { 72 | // +-NaN 73 | switch { 74 | case math.IsNaN(x): 75 | if math.Signbit(x) { 76 | // -NaN 77 | // sign: 1 78 | // exp: all ones 79 | // mant: 10 non-zero 80 | return NegNaN, big.Exact 81 | } 82 | // +NaN 83 | // sign: 0 84 | // exp: all ones 85 | // mant: 10 non-zero 86 | return NaN, big.Exact 87 | } 88 | y := big.NewFloat(x) 89 | y.SetPrec(precision) 90 | y.SetMode(big.ToNearestEven) 91 | // TODO: check accuracy after setting precision? 92 | return NewFromBig(y) 93 | } 94 | 95 | // NewFromBig returns the nearest x86 extended precision floating-point number 96 | // for x and the accuracy of the conversion. 97 | func NewFromBig(x *big.Float) (Float, big.Accuracy) { 98 | // +-Inf 99 | zero := big.NewFloat(0) 100 | switch { 101 | case x.IsInf(): 102 | if x.Signbit() { 103 | // -Inf 104 | // sign: 1 105 | // exp: all ones 106 | // mant: 10 zero 107 | return NegInf, big.Exact 108 | } 109 | // +Inf 110 | // sign: 0 111 | // exp: all ones 112 | // mant: 10 zero 113 | return Inf, big.Exact 114 | // +-zero 115 | case x.Cmp(zero) == 0: 116 | if x.Signbit() { 117 | // -zero 118 | // sign: 1 119 | // exp: zero 120 | // mant: zero 121 | return NegZero, big.Exact 122 | } 123 | // +zero 124 | // sign: 0 125 | // exp: zero 126 | // mant: zero 127 | return Zero, big.Exact 128 | } 129 | 130 | // Sign 131 | var se uint16 132 | if x.Signbit() { 133 | se |= 0x8000 134 | } 135 | 136 | // Exponent and mantissa. 137 | var m uint64 138 | mant := &big.Float{} 139 | exponent := x.MantExp(mant) 140 | // TODO: verify, as float80x86 also has an explicit lead bit. 141 | // Remove 1 from the exponent as big.Float has an no lead bit. 142 | exp := exponent - 1 + bias 143 | 144 | // Handle denormalized values. 145 | // TODO: validate implementation of denormalized values. 146 | if exp <= 0 { 147 | acc := big.Exact 148 | if exp <= -(precision - 1) { 149 | exp = precision - 1 150 | acc = big.Below 151 | } 152 | mant.SetMantExp(mant, exp+precision-1) 153 | if mant.Signbit() { 154 | mant.Neg(mant) 155 | } 156 | v, _ := mant.Uint64() 157 | // TODO: calculate acc based on if v&^0x7FFFFFFFFFFFFFFF != 0 {} 158 | m |= v & 0x7FFFFFFFFFFFFFFF 159 | return Float{se: se, m: m}, acc 160 | } 161 | 162 | // 0b111111111111111 163 | acc := big.Exact 164 | if (exp &^ 0x7FFF) != 0 { 165 | acc = big.Above 166 | } 167 | se |= uint16(exp & 0x7FFF) 168 | 169 | if mant.Signbit() { 170 | mant.Neg(mant) 171 | } 172 | mant.SetMantExp(mant, precision) 173 | if !mant.IsInt() { 174 | acc = big.Below 175 | } 176 | // mantissa, including explicit lead bit 177 | mantissa, acc2 := mant.Uint64() 178 | if acc == big.Exact { 179 | acc = acc2 180 | } 181 | m |= mantissa 182 | return Float{se: se, m: m}, acc 183 | } 184 | 185 | // Bits returns the x86 extended precision binary representation of f. 186 | func (f Float) Bits() (se uint16, m uint64) { 187 | return f.se, f.m 188 | } 189 | 190 | // Float32 returns the float32 value nearest to f. If f is too small to be 191 | // represented by a float32 (|f| < math.SmallestNonzeroFloat32), the result is 192 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is 193 | // too large to be represented by a float32 (|f| > math.MaxFloat32), the result 194 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f. 195 | func (f Float) Float32() (float32, big.Accuracy) { 196 | x, nan := f.Big() 197 | if nan { 198 | if x.Signbit() { 199 | return float32(-math.NaN()), big.Exact 200 | } 201 | return float32(math.NaN()), big.Exact 202 | } 203 | return x.Float32() 204 | } 205 | 206 | // Float64 returns the float64 value nearest to f. If f is too small to be 207 | // represented by a float64 (|f| < math.SmallestNonzeroFloat64), the result is 208 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is 209 | // too large to be represented by a float64 (|f| > math.MaxFloat64), the result 210 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f. 211 | func (f Float) Float64() (float64, big.Accuracy) { 212 | x, nan := f.Big() 213 | if nan { 214 | if x.Signbit() { 215 | return -math.NaN(), big.Exact 216 | } 217 | return math.NaN(), big.Exact 218 | } 219 | return x.Float64() 220 | } 221 | 222 | // Big returns the multi-precision floating-point number representation of f and 223 | // a boolean indicating whether f is Not-a-Number. 224 | func (f Float) Big() (x *big.Float, nan bool) { 225 | signbit := f.Signbit() 226 | exp := f.Exp() 227 | x = big.NewFloat(0) 228 | x.SetPrec(precision) 229 | x.SetMode(big.ToNearestEven) 230 | 231 | // ref: https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format 232 | // 233 | // 0b000000000000001 - 0b111111111111110 234 | // Normalized number. 235 | // 236 | // (-1)^signbit * 2^(exp-16383) * 1.mant_2 237 | exponent := exp - bias 238 | 239 | switch exp { 240 | // 0b111111111111111 241 | case 0x7FFF: 242 | // Inf or NaN 243 | if f.m == 0x8000000000000000 { 244 | // +-Inf 245 | // 10 zero 246 | x.SetInf(signbit) 247 | return x, false 248 | } 249 | // +-NaN 250 | // 10 non-zero 251 | if signbit { 252 | x.Neg(x) 253 | } 254 | return x, true 255 | // 0b000000000000000 256 | case 0x0000: 257 | if f.m == 0 { 258 | // +-Zero 259 | if signbit { 260 | x.Neg(x) 261 | } 262 | return x, false 263 | } 264 | // Denormalized number. 265 | // 266 | // (-1)^signbit * 2^(-16382) * 0.mant_2 267 | exponent = -16382 268 | } 269 | 270 | // number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity . 271 | sign := "+" 272 | if signbit { 273 | sign = "-" 274 | } 275 | lead := f.Lead() 276 | frac := f.Frac() 277 | s := fmt.Sprintf("%s0b%d.%063bp%d", sign, lead, frac, exponent) 278 | if _, _, err := x.Parse(s, 0); err != nil { 279 | panic(err) 280 | } 281 | return x, false 282 | } 283 | 284 | // Signbit reports whether f is negative or negative 0. 285 | func (f Float) Signbit() bool { 286 | // 0b1000000000000000 287 | return f.se&0x8000 != 0 288 | } 289 | 290 | // Exp returns the exponent of f. 291 | func (f Float) Exp() int { 292 | // 0b0111111111111111 293 | return int(f.se & 0x7FFF) 294 | } 295 | 296 | // Lead returns the explicit lead bit of f. 297 | func (f Float) Lead() int { 298 | return int(f.m >> 63) 299 | } 300 | 301 | // Frac returns the fraction of f. 302 | func (f Float) Frac() uint64 { 303 | return f.m & 0x7FFFFFFFFFFFFFFF 304 | } 305 | -------------------------------------------------------------------------------- /float80x86/float80x86_test.go: -------------------------------------------------------------------------------- 1 | package float80x86 2 | 3 | import ( 4 | "math" 5 | "math/big" 6 | "testing" 7 | ) 8 | 9 | func TestNewFromBits(t *testing.T) { 10 | golden := []struct { 11 | se uint16 12 | m uint64 13 | want float64 14 | }{ 15 | // Special numbers. 16 | // 0 111111111111111 10 non-zero = +NaN 17 | {se: 0x7FFF, m: 0xBFFFFFFFFFFFFFFF, want: math.NaN()}, 18 | // -NaN 19 | // 1 111111111111111 10 non-zero = -NaN 20 | {se: 0xFFFF, m: 0xBFFFFFFFFFFFFFFF, want: -math.NaN()}, 21 | 22 | // from: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_math.html#960 23 | 24 | // 0000 00000000 00000000 = 0.0 25 | {se: 0x0000, m: 0x0000000000000000, want: 0.0}, 26 | // 8000 00000000 00000000 = -0.0 27 | {se: 0x8000, m: 0x0000000000000000, want: math.Copysign(0, -1)}, 28 | // 3FFF 80000000 00000000 = 1.0 29 | {se: 0x3FFF, m: 0x8000000000000000, want: 1.0}, 30 | // 4000 80000000 00000000 = 2.0 31 | {se: 0x4000, m: 0x8000000000000000, want: 2.0}, 32 | // 7FFE FFFFFFFF FFFFFFFF = 1.18973149535723176505e+4932 (max normal) 33 | //{se: 0x7FFE, m: 0xFFFFFFFFFFFFFFFF, want: 1.18973149535723176505e+4932}, 34 | // 0001 80000000 00000000 = 3.36210314311209350626e-4932 (min positive normal) 35 | //{se: 0x0001, m: 0x8000000000000000, want: 3.36210314311209350626e-4932}, 36 | // 0000 7FFFFFFF FFFFFFFF = 3.36210314311209350608e-4932 (max subnormal) 37 | //{se: 0x0000, m: 0x7FFFFFFFFFFFFFFF, want: 3.36210314311209350608e-4932}, 38 | // 0000 00000000 00000001 = 3.64519953188247460253e-4951 (min positive subnormal) 39 | //{se: 0x0000, m: 0x0000000000000001, want: 3.64519953188247460253e-4951}, 40 | // 7FFF 80000000 00000000 = infinity 41 | {se: 0x7FFF, m: 0x8000000000000000, want: math.Inf(1)}, 42 | // FFFF 80000000 00000000 = -infinity 43 | {se: 0xFFFF, m: 0x8000000000000000, want: math.Inf(-1)}, 44 | 45 | // 2^i 46 | // TODO: add test cases for 2^i 47 | } 48 | for _, g := range golden { 49 | f := NewFromBits(g.se, g.m) 50 | got, _ := f.Float64() 51 | wantBits := math.Float64bits(g.want) 52 | gotBits := math.Float64bits(got) 53 | //fmt.Printf("bits: 0x%04X (%v)\n", g.bits, g.want) 54 | if wantBits != gotBits { 55 | t.Errorf("0x%04X %016X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.se, g.m, wantBits, g.want, gotBits, got) 56 | } 57 | } 58 | } 59 | 60 | func TestNewFromFloat64(t *testing.T) { 61 | golden := []struct { 62 | in float64 63 | se uint16 64 | m uint64 65 | acc big.Accuracy 66 | }{ 67 | // Special numbers. 68 | // 0 111111111111111 10 non-zero = +NaN 69 | {in: math.NaN(), se: 0x7FFF, m: 0xBFFFFFFFFFFFFFFF, acc: big.Exact}, 70 | // -NaN 71 | // 1 111111111111111 10 non-zero = -NaN 72 | {in: -math.NaN(), se: 0xFFFF, m: 0xBFFFFFFFFFFFFFFF, acc: big.Exact}, 73 | 74 | // from: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_math.html#960 75 | 76 | // 0000 00000000 00000000 = 0.0 77 | {in: 0.0, se: 0x0000, m: 0x0000000000000000, acc: big.Exact}, 78 | // 8000 00000000 00000000 = -0.0 79 | {in: math.Copysign(0, -1), se: 0x8000, m: 0x0000000000000000, acc: big.Exact}, 80 | // 3FFF 80000000 00000000 = 1.0 81 | {in: 1.0, se: 0x3FFF, m: 0x8000000000000000, acc: big.Exact}, 82 | // 4000 80000000 00000000 = 2.0 83 | {in: 2.0, se: 0x4000, m: 0x8000000000000000, acc: big.Exact}, 84 | // 7FFE FFFFFFFF FFFFFFFF = 1.18973149535723176505e+4932 (max normal) 85 | //{in: 1.18973149535723176505e+4932, se: 0x7FFE, m: 0xFFFFFFFFFFFFFFFF, acc: big.Exact}, 86 | // 0001 80000000 00000000 = 3.36210314311209350626e-4932 (min positive normal) 87 | //{in: 3.36210314311209350626e-4932, se: 0x0001, m: 0x8000000000000000, acc: big.Exact}, 88 | // 0000 7FFFFFFF FFFFFFFF = 3.36210314311209350608e-4932 (max subnormal) 89 | //{in: 3.36210314311209350608e-4932, se: 0x0000, m: 0x7FFFFFFFFFFFFFFF, acc: big.Exact}, 90 | // 0000 00000000 00000001 = 3.64519953188247460253e-4951 (min positive subnormal) 91 | //{in: 3.64519953188247460253e-4951, se: 0x0000, m: 0x0000000000000001, acc: big.Exact}, 92 | // 7FFF 80000000 00000000 = infinity 93 | {in: math.Inf(1), se: 0x7FFF, m: 0x8000000000000000, acc: big.Exact}, 94 | // FFFF 80000000 00000000 = -infinity 95 | {in: math.Inf(-1), se: 0xFFFF, m: 0x8000000000000000, acc: big.Exact}, 96 | 97 | // 2^i 98 | // TODO: add test cases for 2^i 99 | } 100 | for _, g := range golden { 101 | f, acc := NewFromFloat64(g.in) 102 | se, m := f.Bits() 103 | if g.se != se || g.m != m { 104 | x, _ := f.Float64() 105 | t.Errorf("bits mismatch; expected 0x%04X %016X (%v), got 0x%04X %016X (%v)", g.se, g.m, g.in, se, m, x) 106 | } 107 | if g.acc != acc { 108 | x, _ := f.Float64() 109 | t.Errorf("accuracy mismatch; expected %v (%v), got %v (%v)", g.acc, g.in, acc, x) 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/mewmew/float 2 | 3 | go 1.13 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mewmew/float/4fe539893335ae74f3d0fd5e782763a8e3e565d2/go.sum -------------------------------------------------------------------------------- /internal/strconv/decimal.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Multiprecision decimal numbers. 6 | // For floating-point formatting only; not general purpose. 7 | // Only operations are assign and (binary) left/right shift. 8 | // Can do binary floating point in multiprecision decimal precisely 9 | // because 2 divides 10; cannot do decimal floating point 10 | // in multiprecision binary precisely. 11 | 12 | package strconv 13 | 14 | type decimal struct { 15 | d [800]byte // digits, big-endian representation 16 | nd int // number of digits used 17 | dp int // decimal point 18 | neg bool // negative flag 19 | trunc bool // discarded nonzero digits beyond d[:nd] 20 | } 21 | 22 | func (a *decimal) String() string { 23 | n := 10 + a.nd 24 | if a.dp > 0 { 25 | n += a.dp 26 | } 27 | if a.dp < 0 { 28 | n += -a.dp 29 | } 30 | 31 | buf := make([]byte, n) 32 | w := 0 33 | switch { 34 | case a.nd == 0: 35 | return "0" 36 | 37 | case a.dp <= 0: 38 | // zeros fill space between decimal point and digits 39 | buf[w] = '0' 40 | w++ 41 | buf[w] = '.' 42 | w++ 43 | w += digitZero(buf[w : w+-a.dp]) 44 | w += copy(buf[w:], a.d[0:a.nd]) 45 | 46 | case a.dp < a.nd: 47 | // decimal point in middle of digits 48 | w += copy(buf[w:], a.d[0:a.dp]) 49 | buf[w] = '.' 50 | w++ 51 | w += copy(buf[w:], a.d[a.dp:a.nd]) 52 | 53 | default: 54 | // zeros fill space between digits and decimal point 55 | w += copy(buf[w:], a.d[0:a.nd]) 56 | w += digitZero(buf[w : w+a.dp-a.nd]) 57 | } 58 | return string(buf[0:w]) 59 | } 60 | 61 | func digitZero(dst []byte) int { 62 | for i := range dst { 63 | dst[i] = '0' 64 | } 65 | return len(dst) 66 | } 67 | 68 | // trim trailing zeros from number. 69 | // (They are meaningless; the decimal point is tracked 70 | // independent of the number of digits.) 71 | func trim(a *decimal) { 72 | for a.nd > 0 && a.d[a.nd-1] == '0' { 73 | a.nd-- 74 | } 75 | if a.nd == 0 { 76 | a.dp = 0 77 | } 78 | } 79 | 80 | // Assign v to a. 81 | func (a *decimal) Assign(v uint64) { 82 | var buf [24]byte 83 | 84 | // Write reversed decimal in buf. 85 | n := 0 86 | for v > 0 { 87 | v1 := v / 10 88 | v -= 10 * v1 89 | buf[n] = byte(v + '0') 90 | n++ 91 | v = v1 92 | } 93 | 94 | // Reverse again to produce forward decimal in a.d. 95 | a.nd = 0 96 | for n--; n >= 0; n-- { 97 | a.d[a.nd] = buf[n] 98 | a.nd++ 99 | } 100 | a.dp = a.nd 101 | trim(a) 102 | } 103 | 104 | // Maximum shift that we can do in one pass without overflow. 105 | // A uint has 32 or 64 bits, and we have to be able to accommodate 9<> 63) 107 | const maxShift = uintSize - 4 108 | 109 | // Binary shift right (/ 2) by k bits. k <= maxShift to avoid overflow. 110 | func rightShift(a *decimal, k uint) { 111 | r := 0 // read pointer 112 | w := 0 // write pointer 113 | 114 | // Pick up enough leading digits to cover first shift. 115 | var n uint 116 | for ; n>>k == 0; r++ { 117 | if r >= a.nd { 118 | if n == 0 { 119 | // a == 0; shouldn't get here, but handle anyway. 120 | a.nd = 0 121 | return 122 | } 123 | for n>>k == 0 { 124 | n *= 10 125 | r++ 126 | } 127 | break 128 | } 129 | c := uint(a.d[r]) 130 | n = n*10 + c - '0' 131 | } 132 | a.dp -= r - 1 133 | 134 | var mask uint = (1 << k) - 1 135 | 136 | // Pick up a digit, put down a digit. 137 | for ; r < a.nd; r++ { 138 | c := uint(a.d[r]) 139 | dig := n >> k 140 | n &= mask 141 | a.d[w] = byte(dig + '0') 142 | w++ 143 | n = n*10 + c - '0' 144 | } 145 | 146 | // Put down extra digits. 147 | for n > 0 { 148 | dig := n >> k 149 | n &= mask 150 | if w < len(a.d) { 151 | a.d[w] = byte(dig + '0') 152 | w++ 153 | } else if dig > 0 { 154 | a.trunc = true 155 | } 156 | n *= 10 157 | } 158 | 159 | a.nd = w 160 | trim(a) 161 | } 162 | 163 | // Cheat sheet for left shift: table indexed by shift count giving 164 | // number of new digits that will be introduced by that shift. 165 | // 166 | // For example, leftcheats[4] = {2, "625"}. That means that 167 | // if we are shifting by 4 (multiplying by 16), it will add 2 digits 168 | // when the string prefix is "625" through "999", and one fewer digit 169 | // if the string prefix is "000" through "624". 170 | // 171 | // Credit for this trick goes to Ken. 172 | 173 | type leftCheat struct { 174 | delta int // number of new digits 175 | cutoff string // minus one digit if original < a. 176 | } 177 | 178 | var leftcheats = []leftCheat{ 179 | // Leading digits of 1/2^i = 5^i. 180 | // 5^23 is not an exact 64-bit floating point number, 181 | // so have to use bc for the math. 182 | // Go up to 60 to be large enough for 32bit and 64bit platforms. 183 | /* 184 | seq 60 | sed 's/^/5^/' | bc | 185 | awk 'BEGIN{ print "\t{ 0, \"\" }," } 186 | { 187 | log2 = log(2)/log(10) 188 | printf("\t{ %d, \"%s\" },\t// * %d\n", 189 | int(log2*NR+1), $0, 2**NR) 190 | }' 191 | */ 192 | {0, ""}, 193 | {1, "5"}, // * 2 194 | {1, "25"}, // * 4 195 | {1, "125"}, // * 8 196 | {2, "625"}, // * 16 197 | {2, "3125"}, // * 32 198 | {2, "15625"}, // * 64 199 | {3, "78125"}, // * 128 200 | {3, "390625"}, // * 256 201 | {3, "1953125"}, // * 512 202 | {4, "9765625"}, // * 1024 203 | {4, "48828125"}, // * 2048 204 | {4, "244140625"}, // * 4096 205 | {4, "1220703125"}, // * 8192 206 | {5, "6103515625"}, // * 16384 207 | {5, "30517578125"}, // * 32768 208 | {5, "152587890625"}, // * 65536 209 | {6, "762939453125"}, // * 131072 210 | {6, "3814697265625"}, // * 262144 211 | {6, "19073486328125"}, // * 524288 212 | {7, "95367431640625"}, // * 1048576 213 | {7, "476837158203125"}, // * 2097152 214 | {7, "2384185791015625"}, // * 4194304 215 | {7, "11920928955078125"}, // * 8388608 216 | {8, "59604644775390625"}, // * 16777216 217 | {8, "298023223876953125"}, // * 33554432 218 | {8, "1490116119384765625"}, // * 67108864 219 | {9, "7450580596923828125"}, // * 134217728 220 | {9, "37252902984619140625"}, // * 268435456 221 | {9, "186264514923095703125"}, // * 536870912 222 | {10, "931322574615478515625"}, // * 1073741824 223 | {10, "4656612873077392578125"}, // * 2147483648 224 | {10, "23283064365386962890625"}, // * 4294967296 225 | {10, "116415321826934814453125"}, // * 8589934592 226 | {11, "582076609134674072265625"}, // * 17179869184 227 | {11, "2910383045673370361328125"}, // * 34359738368 228 | {11, "14551915228366851806640625"}, // * 68719476736 229 | {12, "72759576141834259033203125"}, // * 137438953472 230 | {12, "363797880709171295166015625"}, // * 274877906944 231 | {12, "1818989403545856475830078125"}, // * 549755813888 232 | {13, "9094947017729282379150390625"}, // * 1099511627776 233 | {13, "45474735088646411895751953125"}, // * 2199023255552 234 | {13, "227373675443232059478759765625"}, // * 4398046511104 235 | {13, "1136868377216160297393798828125"}, // * 8796093022208 236 | {14, "5684341886080801486968994140625"}, // * 17592186044416 237 | {14, "28421709430404007434844970703125"}, // * 35184372088832 238 | {14, "142108547152020037174224853515625"}, // * 70368744177664 239 | {15, "710542735760100185871124267578125"}, // * 140737488355328 240 | {15, "3552713678800500929355621337890625"}, // * 281474976710656 241 | {15, "17763568394002504646778106689453125"}, // * 562949953421312 242 | {16, "88817841970012523233890533447265625"}, // * 1125899906842624 243 | {16, "444089209850062616169452667236328125"}, // * 2251799813685248 244 | {16, "2220446049250313080847263336181640625"}, // * 4503599627370496 245 | {16, "11102230246251565404236316680908203125"}, // * 9007199254740992 246 | {17, "55511151231257827021181583404541015625"}, // * 18014398509481984 247 | {17, "277555756156289135105907917022705078125"}, // * 36028797018963968 248 | {17, "1387778780781445675529539585113525390625"}, // * 72057594037927936 249 | {18, "6938893903907228377647697925567626953125"}, // * 144115188075855872 250 | {18, "34694469519536141888238489627838134765625"}, // * 288230376151711744 251 | {18, "173472347597680709441192448139190673828125"}, // * 576460752303423488 252 | {19, "867361737988403547205962240695953369140625"}, // * 1152921504606846976 253 | } 254 | 255 | // Is the leading prefix of b lexicographically less than s? 256 | func prefixIsLessThan(b []byte, s string) bool { 257 | for i := 0; i < len(s); i++ { 258 | if i >= len(b) { 259 | return true 260 | } 261 | if b[i] != s[i] { 262 | return b[i] < s[i] 263 | } 264 | } 265 | return false 266 | } 267 | 268 | // Binary shift left (* 2) by k bits. k <= maxShift to avoid overflow. 269 | func leftShift(a *decimal, k uint) { 270 | delta := leftcheats[k].delta 271 | if prefixIsLessThan(a.d[0:a.nd], leftcheats[k].cutoff) { 272 | delta-- 273 | } 274 | 275 | r := a.nd // read index 276 | w := a.nd + delta // write index 277 | 278 | // Pick up a digit, put down a digit. 279 | var n uint 280 | for r--; r >= 0; r-- { 281 | n += (uint(a.d[r]) - '0') << k 282 | quo := n / 10 283 | rem := n - 10*quo 284 | w-- 285 | if w < len(a.d) { 286 | a.d[w] = byte(rem + '0') 287 | } else if rem != 0 { 288 | a.trunc = true 289 | } 290 | n = quo 291 | } 292 | 293 | // Put down extra digits. 294 | for n > 0 { 295 | quo := n / 10 296 | rem := n - 10*quo 297 | w-- 298 | if w < len(a.d) { 299 | a.d[w] = byte(rem + '0') 300 | } else if rem != 0 { 301 | a.trunc = true 302 | } 303 | n = quo 304 | } 305 | 306 | a.nd += delta 307 | if a.nd >= len(a.d) { 308 | a.nd = len(a.d) 309 | } 310 | a.dp += delta 311 | trim(a) 312 | } 313 | 314 | // Binary shift left (k > 0) or right (k < 0). 315 | func (a *decimal) Shift(k int) { 316 | switch { 317 | case a.nd == 0: 318 | // nothing to do: a == 0 319 | case k > 0: 320 | for k > maxShift { 321 | leftShift(a, maxShift) 322 | k -= maxShift 323 | } 324 | leftShift(a, uint(k)) 325 | case k < 0: 326 | for k < -maxShift { 327 | rightShift(a, maxShift) 328 | k += maxShift 329 | } 330 | rightShift(a, uint(-k)) 331 | } 332 | } 333 | 334 | // If we chop a at nd digits, should we round up? 335 | func shouldRoundUp(a *decimal, nd int) bool { 336 | if nd < 0 || nd >= a.nd { 337 | return false 338 | } 339 | if a.d[nd] == '5' && nd+1 == a.nd { // exactly halfway - round to even 340 | // if we truncated, a little higher than what's recorded - always round up 341 | if a.trunc { 342 | return true 343 | } 344 | return nd > 0 && (a.d[nd-1]-'0')%2 != 0 345 | } 346 | // not halfway - digit tells all 347 | return a.d[nd] >= '5' 348 | } 349 | 350 | // Round a to nd digits (or fewer). 351 | // If nd is zero, it means we're rounding 352 | // just to the left of the digits, as in 353 | // 0.09 -> 0.1. 354 | func (a *decimal) Round(nd int) { 355 | if nd < 0 || nd >= a.nd { 356 | return 357 | } 358 | if shouldRoundUp(a, nd) { 359 | a.RoundUp(nd) 360 | } else { 361 | a.RoundDown(nd) 362 | } 363 | } 364 | 365 | // Round a down to nd digits (or fewer). 366 | func (a *decimal) RoundDown(nd int) { 367 | if nd < 0 || nd >= a.nd { 368 | return 369 | } 370 | a.nd = nd 371 | trim(a) 372 | } 373 | 374 | // Round a up to nd digits (or fewer). 375 | func (a *decimal) RoundUp(nd int) { 376 | if nd < 0 || nd >= a.nd { 377 | return 378 | } 379 | 380 | // round up 381 | for i := nd - 1; i >= 0; i-- { 382 | c := a.d[i] 383 | if c < '9' { // can stop after this digit 384 | a.d[i]++ 385 | a.nd = i + 1 386 | return 387 | } 388 | } 389 | 390 | // Number is all 9s. 391 | // Change to single 1 with adjusted decimal point. 392 | a.d[0] = '1' 393 | a.nd = 1 394 | a.dp++ 395 | } 396 | 397 | // Extract integer part, rounded appropriately. 398 | // No guarantees about overflow. 399 | func (a *decimal) RoundedInteger() uint64 { 400 | if a.dp > 20 { 401 | return 0xFFFFFFFFFFFFFFFF 402 | } 403 | var i int 404 | n := uint64(0) 405 | for i = 0; i < a.dp && i < a.nd; i++ { 406 | n = n*10 + uint64(a.d[i]-'0') 407 | } 408 | for ; i < a.dp; i++ { 409 | n *= 10 410 | } 411 | if shouldRoundUp(a, a.dp) { 412 | n++ 413 | } 414 | return n 415 | } 416 | -------------------------------------------------------------------------------- /internal/strconv/extfloat.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package strconv 6 | 7 | import ( 8 | "math/bits" 9 | ) 10 | 11 | // An extFloat represents an extended floating-point number, with more 12 | // precision than a float64. It does not try to save bits: the 13 | // number represented by the structure is mant*(2^exp), with a negative 14 | // sign if neg is true. 15 | type extFloat struct { 16 | mant uint64 17 | exp int 18 | neg bool 19 | } 20 | 21 | // Powers of ten taken from double-conversion library. 22 | // https://code.google.com/p/double-conversion/ 23 | const ( 24 | firstPowerOfTen = -348 25 | stepPowerOfTen = 8 26 | ) 27 | 28 | var smallPowersOfTen = [...]extFloat{ 29 | {1 << 63, -63, false}, // 1 30 | {0xa << 60, -60, false}, // 1e1 31 | {0x64 << 57, -57, false}, // 1e2 32 | {0x3e8 << 54, -54, false}, // 1e3 33 | {0x2710 << 50, -50, false}, // 1e4 34 | {0x186a0 << 47, -47, false}, // 1e5 35 | {0xf4240 << 44, -44, false}, // 1e6 36 | {0x989680 << 40, -40, false}, // 1e7 37 | } 38 | 39 | var powersOfTen = [...]extFloat{ 40 | {0xfa8fd5a0081c0288, -1220, false}, // 10^-348 41 | {0xbaaee17fa23ebf76, -1193, false}, // 10^-340 42 | {0x8b16fb203055ac76, -1166, false}, // 10^-332 43 | {0xcf42894a5dce35ea, -1140, false}, // 10^-324 44 | {0x9a6bb0aa55653b2d, -1113, false}, // 10^-316 45 | {0xe61acf033d1a45df, -1087, false}, // 10^-308 46 | {0xab70fe17c79ac6ca, -1060, false}, // 10^-300 47 | {0xff77b1fcbebcdc4f, -1034, false}, // 10^-292 48 | {0xbe5691ef416bd60c, -1007, false}, // 10^-284 49 | {0x8dd01fad907ffc3c, -980, false}, // 10^-276 50 | {0xd3515c2831559a83, -954, false}, // 10^-268 51 | {0x9d71ac8fada6c9b5, -927, false}, // 10^-260 52 | {0xea9c227723ee8bcb, -901, false}, // 10^-252 53 | {0xaecc49914078536d, -874, false}, // 10^-244 54 | {0x823c12795db6ce57, -847, false}, // 10^-236 55 | {0xc21094364dfb5637, -821, false}, // 10^-228 56 | {0x9096ea6f3848984f, -794, false}, // 10^-220 57 | {0xd77485cb25823ac7, -768, false}, // 10^-212 58 | {0xa086cfcd97bf97f4, -741, false}, // 10^-204 59 | {0xef340a98172aace5, -715, false}, // 10^-196 60 | {0xb23867fb2a35b28e, -688, false}, // 10^-188 61 | {0x84c8d4dfd2c63f3b, -661, false}, // 10^-180 62 | {0xc5dd44271ad3cdba, -635, false}, // 10^-172 63 | {0x936b9fcebb25c996, -608, false}, // 10^-164 64 | {0xdbac6c247d62a584, -582, false}, // 10^-156 65 | {0xa3ab66580d5fdaf6, -555, false}, // 10^-148 66 | {0xf3e2f893dec3f126, -529, false}, // 10^-140 67 | {0xb5b5ada8aaff80b8, -502, false}, // 10^-132 68 | {0x87625f056c7c4a8b, -475, false}, // 10^-124 69 | {0xc9bcff6034c13053, -449, false}, // 10^-116 70 | {0x964e858c91ba2655, -422, false}, // 10^-108 71 | {0xdff9772470297ebd, -396, false}, // 10^-100 72 | {0xa6dfbd9fb8e5b88f, -369, false}, // 10^-92 73 | {0xf8a95fcf88747d94, -343, false}, // 10^-84 74 | {0xb94470938fa89bcf, -316, false}, // 10^-76 75 | {0x8a08f0f8bf0f156b, -289, false}, // 10^-68 76 | {0xcdb02555653131b6, -263, false}, // 10^-60 77 | {0x993fe2c6d07b7fac, -236, false}, // 10^-52 78 | {0xe45c10c42a2b3b06, -210, false}, // 10^-44 79 | {0xaa242499697392d3, -183, false}, // 10^-36 80 | {0xfd87b5f28300ca0e, -157, false}, // 10^-28 81 | {0xbce5086492111aeb, -130, false}, // 10^-20 82 | {0x8cbccc096f5088cc, -103, false}, // 10^-12 83 | {0xd1b71758e219652c, -77, false}, // 10^-4 84 | {0x9c40000000000000, -50, false}, // 10^4 85 | {0xe8d4a51000000000, -24, false}, // 10^12 86 | {0xad78ebc5ac620000, 3, false}, // 10^20 87 | {0x813f3978f8940984, 30, false}, // 10^28 88 | {0xc097ce7bc90715b3, 56, false}, // 10^36 89 | {0x8f7e32ce7bea5c70, 83, false}, // 10^44 90 | {0xd5d238a4abe98068, 109, false}, // 10^52 91 | {0x9f4f2726179a2245, 136, false}, // 10^60 92 | {0xed63a231d4c4fb27, 162, false}, // 10^68 93 | {0xb0de65388cc8ada8, 189, false}, // 10^76 94 | {0x83c7088e1aab65db, 216, false}, // 10^84 95 | {0xc45d1df942711d9a, 242, false}, // 10^92 96 | {0x924d692ca61be758, 269, false}, // 10^100 97 | {0xda01ee641a708dea, 295, false}, // 10^108 98 | {0xa26da3999aef774a, 322, false}, // 10^116 99 | {0xf209787bb47d6b85, 348, false}, // 10^124 100 | {0xb454e4a179dd1877, 375, false}, // 10^132 101 | {0x865b86925b9bc5c2, 402, false}, // 10^140 102 | {0xc83553c5c8965d3d, 428, false}, // 10^148 103 | {0x952ab45cfa97a0b3, 455, false}, // 10^156 104 | {0xde469fbd99a05fe3, 481, false}, // 10^164 105 | {0xa59bc234db398c25, 508, false}, // 10^172 106 | {0xf6c69a72a3989f5c, 534, false}, // 10^180 107 | {0xb7dcbf5354e9bece, 561, false}, // 10^188 108 | {0x88fcf317f22241e2, 588, false}, // 10^196 109 | {0xcc20ce9bd35c78a5, 614, false}, // 10^204 110 | {0x98165af37b2153df, 641, false}, // 10^212 111 | {0xe2a0b5dc971f303a, 667, false}, // 10^220 112 | {0xa8d9d1535ce3b396, 694, false}, // 10^228 113 | {0xfb9b7cd9a4a7443c, 720, false}, // 10^236 114 | {0xbb764c4ca7a44410, 747, false}, // 10^244 115 | {0x8bab8eefb6409c1a, 774, false}, // 10^252 116 | {0xd01fef10a657842c, 800, false}, // 10^260 117 | {0x9b10a4e5e9913129, 827, false}, // 10^268 118 | {0xe7109bfba19c0c9d, 853, false}, // 10^276 119 | {0xac2820d9623bf429, 880, false}, // 10^284 120 | {0x80444b5e7aa7cf85, 907, false}, // 10^292 121 | {0xbf21e44003acdd2d, 933, false}, // 10^300 122 | {0x8e679c2f5e44ff8f, 960, false}, // 10^308 123 | {0xd433179d9c8cb841, 986, false}, // 10^316 124 | {0x9e19db92b4e31ba9, 1013, false}, // 10^324 125 | {0xeb96bf6ebadf77d9, 1039, false}, // 10^332 126 | {0xaf87023b9bf0ee6b, 1066, false}, // 10^340 127 | } 128 | 129 | // floatBits returns the bits of the float64 that best approximates 130 | // the extFloat passed as receiver. Overflow is set to true if 131 | // the resulting float64 is ±Inf. 132 | func (f *extFloat) floatBits(flt *floatInfo) (bits uint64, overflow bool) { 133 | f.Normalize() 134 | 135 | exp := f.exp + 63 136 | 137 | // Exponent too small. 138 | if exp < flt.bias+1 { 139 | n := flt.bias + 1 - exp 140 | f.mant >>= uint(n) 141 | exp += n 142 | } 143 | 144 | // Extract 1+flt.mantbits bits from the 64-bit mantissa. 145 | mant := f.mant >> (63 - flt.mantbits) 146 | if f.mant&(1<<(62-flt.mantbits)) != 0 { 147 | // Round up. 148 | mant++ 149 | } 150 | 151 | // Rounding might have added a bit; shift down. 152 | if mant == 2<>= 1 154 | exp++ 155 | } 156 | 157 | // Infinities. 158 | if exp-flt.bias >= 1<>uint(-f.exp))<>= uint(-f.exp) 187 | f.exp = 0 188 | return *f, *f 189 | } 190 | expBiased := exp - flt.bias 191 | 192 | upper = extFloat{mant: 2*f.mant + 1, exp: f.exp - 1, neg: f.neg} 193 | if mant != 1<>32, uint64(uint32(f.mant)) 218 | ghi, glo := g.mant>>32, uint64(uint32(g.mant)) 219 | 220 | // Cross products. 221 | cross1 := fhi * glo 222 | cross2 := flo * ghi 223 | 224 | // f.mant*g.mant is fhi*ghi << 64 + (cross1+cross2) << 32 + flo*glo 225 | f.mant = fhi*ghi + (cross1 >> 32) + (cross2 >> 32) 226 | rem := uint64(uint32(cross1)) + uint64(uint32(cross2)) + ((flo * glo) >> 32) 227 | // Round up. 228 | rem += (1 << 31) 229 | 230 | f.mant += (rem >> 32) 231 | f.exp = f.exp + g.exp + 64 232 | } 233 | 234 | var uint64pow10 = [...]uint64{ 235 | 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 236 | 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 237 | } 238 | 239 | // AssignDecimal sets f to an approximate value mantissa*10^exp. It 240 | // reports whether the value represented by f is guaranteed to be the 241 | // best approximation of d after being rounded to a float64 or 242 | // float32 depending on flt. 243 | func (f *extFloat) AssignDecimal(mantissa uint64, exp10 int, neg bool, trunc bool, flt *floatInfo) (ok bool) { 244 | const uint64digits = 19 245 | const errorscale = 8 246 | errors := 0 // An upper bound for error, computed in errorscale*ulp. 247 | if trunc { 248 | // the decimal number was truncated. 249 | errors += errorscale / 2 250 | } 251 | 252 | f.mant = mantissa 253 | f.exp = 0 254 | f.neg = neg 255 | 256 | // Multiply by powers of ten. 257 | i := (exp10 - firstPowerOfTen) / stepPowerOfTen 258 | if exp10 < firstPowerOfTen || i >= len(powersOfTen) { 259 | return false 260 | } 261 | adjExp := (exp10 - firstPowerOfTen) % stepPowerOfTen 262 | 263 | // We multiply by exp%step 264 | if adjExp < uint64digits && mantissa < uint64pow10[uint64digits-adjExp] { 265 | // We can multiply the mantissa exactly. 266 | f.mant *= uint64pow10[adjExp] 267 | f.Normalize() 268 | } else { 269 | f.Normalize() 270 | f.Multiply(smallPowersOfTen[adjExp]) 271 | errors += errorscale / 2 272 | } 273 | 274 | // We multiply by 10 to the exp - exp%step. 275 | f.Multiply(powersOfTen[i]) 276 | if errors > 0 { 277 | errors++ 278 | } 279 | errors += errorscale / 2 280 | 281 | // Normalize 282 | shift := f.Normalize() 283 | errors <<= shift 284 | 285 | // Now f is a good approximation of the decimal. 286 | // Check whether the error is too large: that is, if the mantissa 287 | // is perturbated by the error, the resulting float64 will change. 288 | // The 64 bits mantissa is 1 + 52 bits for float64 + 11 extra bits. 289 | // 290 | // In many cases the approximation will be good enough. 291 | denormalExp := flt.bias - 63 292 | var extrabits uint 293 | if f.exp <= denormalExp { 294 | // f.mant * 2^f.exp is smaller than 2^(flt.bias+1). 295 | extrabits = 63 - flt.mantbits + 1 + uint(denormalExp-f.exp) 296 | } else { 297 | extrabits = 63 - flt.mantbits 298 | } 299 | 300 | halfway := uint64(1) << (extrabits - 1) 301 | mantExtra := f.mant & (1< expMax: 336 | i-- 337 | default: 338 | break Loop 339 | } 340 | } 341 | // Apply the desired decimal shift on f. It will have exponent 342 | // in the desired range. This is multiplication by 10^-exp10. 343 | f.Multiply(powersOfTen[i]) 344 | 345 | return -(firstPowerOfTen + i*stepPowerOfTen), i 346 | } 347 | 348 | // frexp10Many applies a common shift by a power of ten to a, b, c. 349 | func frexp10Many(a, b, c *extFloat) (exp10 int) { 350 | exp10, i := c.frexp10() 351 | a.Multiply(powersOfTen[i]) 352 | b.Multiply(powersOfTen[i]) 353 | return 354 | } 355 | 356 | // FixedDecimal stores in d the first n significant digits 357 | // of the decimal representation of f. It returns false 358 | // if it cannot be sure of the answer. 359 | func (f *extFloat) FixedDecimal(d *decimalSlice, n int) bool { 360 | if f.mant == 0 { 361 | d.nd = 0 362 | d.dp = 0 363 | d.neg = f.neg 364 | return true 365 | } 366 | if n == 0 { 367 | panic("strconv: internal error: extFloat.FixedDecimal called with n == 0") 368 | } 369 | // Multiply by an appropriate power of ten to have a reasonable 370 | // number to process. 371 | f.Normalize() 372 | exp10, _ := f.frexp10() 373 | 374 | shift := uint(-f.exp) 375 | integer := uint32(f.mant >> shift) 376 | fraction := f.mant - (uint64(integer) << shift) 377 | ε := uint64(1) // ε is the uncertainty we have on the mantissa of f. 378 | 379 | // Write exactly n digits to d. 380 | needed := n // how many digits are left to write. 381 | integerDigits := 0 // the number of decimal digits of integer. 382 | pow10 := uint64(1) // the power of ten by which f was scaled. 383 | for i, pow := 0, uint64(1); i < 20; i++ { 384 | if pow > uint64(integer) { 385 | integerDigits = i 386 | break 387 | } 388 | pow *= 10 389 | } 390 | rest := integer 391 | if integerDigits > needed { 392 | // the integral part is already large, trim the last digits. 393 | pow10 = uint64pow10[integerDigits-needed] 394 | integer /= uint32(pow10) 395 | rest -= integer * uint32(pow10) 396 | } else { 397 | rest = 0 398 | } 399 | 400 | // Write the digits of integer: the digits of rest are omitted. 401 | var buf [32]byte 402 | pos := len(buf) 403 | for v := integer; v > 0; { 404 | v1 := v / 10 405 | v -= 10 * v1 406 | pos-- 407 | buf[pos] = byte(v + '0') 408 | v = v1 409 | } 410 | for i := pos; i < len(buf); i++ { 411 | d.d[i-pos] = buf[i] 412 | } 413 | nd := len(buf) - pos 414 | d.nd = nd 415 | d.dp = integerDigits + exp10 416 | needed -= nd 417 | 418 | if needed > 0 { 419 | if rest != 0 || pow10 != 1 { 420 | panic("strconv: internal error, rest != 0 but needed > 0") 421 | } 422 | // Emit digits for the fractional part. Each time, 10*fraction 423 | // fits in a uint64 without overflow. 424 | for needed > 0 { 425 | fraction *= 10 426 | ε *= 10 // the uncertainty scales as we multiply by ten. 427 | if 2*ε > 1<> shift 432 | d.d[nd] = byte(digit + '0') 433 | fraction -= digit << shift 434 | nd++ 435 | needed-- 436 | } 437 | d.nd = nd 438 | } 439 | 440 | // We have written a truncation of f (a numerator / 10^d.dp). The remaining part 441 | // can be interpreted as a small number (< 1) to be added to the last digit of the 442 | // numerator. 443 | // 444 | // If rest > 0, the amount is: 445 | // (rest< 0 guarantees that pow10 << shift does not overflow a uint64. 448 | // 449 | // If rest = 0, pow10 == 1 and the amount is 450 | // fraction / (1 << shift) 451 | // fraction being known with a ±ε uncertainty. 452 | // 453 | // We pass this information to the rounding routine for adjustment. 454 | 455 | ok := adjustLastDigitFixed(d, uint64(rest)<= 0; i-- { 461 | if d.d[i] != '0' { 462 | d.nd = i + 1 463 | break 464 | } 465 | } 466 | return true 467 | } 468 | 469 | // adjustLastDigitFixed assumes d contains the representation of the integral part 470 | // of some number, whose fractional part is num / (den << shift). The numerator 471 | // num is only known up to an uncertainty of size ε, assumed to be less than 472 | // (den << shift)/2. 473 | // 474 | // It will increase the last digit by one to account for correct rounding, typically 475 | // when the fractional part is greater than 1/2, and will return false if ε is such 476 | // that no correct answer can be given. 477 | func adjustLastDigitFixed(d *decimalSlice, num, den uint64, shift uint, ε uint64) bool { 478 | if num > den< den< den< (den< den<= 0; i-- { 491 | if d.d[i] == '9' { 492 | d.nd-- 493 | } else { 494 | break 495 | } 496 | } 497 | if i < 0 { 498 | d.d[0] = '1' 499 | d.nd = 1 500 | d.dp++ 501 | } else { 502 | d.d[i]++ 503 | } 504 | return true 505 | } 506 | return false 507 | } 508 | 509 | // ShortestDecimal stores in d the shortest decimal representation of f 510 | // which belongs to the open interval (lower, upper), where f is supposed 511 | // to lie. It returns false whenever the result is unsure. The implementation 512 | // uses the Grisu3 algorithm. 513 | func (f *extFloat) ShortestDecimal(d *decimalSlice, lower, upper *extFloat) bool { 514 | if f.mant == 0 { 515 | d.nd = 0 516 | d.dp = 0 517 | d.neg = f.neg 518 | return true 519 | } 520 | if f.exp == 0 && *lower == *f && *lower == *upper { 521 | // an exact integer. 522 | var buf [24]byte 523 | n := len(buf) - 1 524 | for v := f.mant; v > 0; { 525 | v1 := v / 10 526 | v -= 10 * v1 527 | buf[n] = byte(v + '0') 528 | n-- 529 | v = v1 530 | } 531 | nd := len(buf) - n - 1 532 | for i := 0; i < nd; i++ { 533 | d.d[i] = buf[n+1+i] 534 | } 535 | d.nd, d.dp = nd, nd 536 | for d.nd > 0 && d.d[d.nd-1] == '0' { 537 | d.nd-- 538 | } 539 | if d.nd == 0 { 540 | d.dp = 0 541 | } 542 | d.neg = f.neg 543 | return true 544 | } 545 | upper.Normalize() 546 | // Uniformize exponents. 547 | if f.exp > upper.exp { 548 | f.mant <<= uint(f.exp - upper.exp) 549 | f.exp = upper.exp 550 | } 551 | if lower.exp > upper.exp { 552 | lower.mant <<= uint(lower.exp - upper.exp) 553 | lower.exp = upper.exp 554 | } 555 | 556 | exp10 := frexp10Many(lower, f, upper) 557 | // Take a safety margin due to rounding in frexp10Many, but we lose precision. 558 | upper.mant++ 559 | lower.mant-- 560 | 561 | // The shortest representation of f is either rounded up or down, but 562 | // in any case, it is a truncation of upper. 563 | shift := uint(-upper.exp) 564 | integer := uint32(upper.mant >> shift) 565 | fraction := upper.mant - (uint64(integer) << shift) 566 | 567 | // How far we can go down from upper until the result is wrong. 568 | allowance := upper.mant - lower.mant 569 | // How far we should go to get a very precise result. 570 | targetDiff := upper.mant - f.mant 571 | 572 | // Count integral digits: there are at most 10. 573 | var integerDigits int 574 | for i, pow := 0, uint64(1); i < 20; i++ { 575 | if pow > uint64(integer) { 576 | integerDigits = i 577 | break 578 | } 579 | pow *= 10 580 | } 581 | for i := 0; i < integerDigits; i++ { 582 | pow := uint64pow10[integerDigits-i-1] 583 | digit := integer / uint32(pow) 584 | d.d[i] = byte(digit + '0') 585 | integer -= digit * uint32(pow) 586 | // evaluate whether we should stop. 587 | if currentDiff := uint64(integer)<> shift) 608 | d.d[d.nd] = byte(digit + '0') 609 | d.nd++ 610 | fraction -= uint64(digit) << shift 611 | if fraction < allowance*multiplier { 612 | // We are in the admissible range. Note that if allowance is about to 613 | // overflow, that is, allowance > 2^64/10, the condition is automatically 614 | // true due to the limited range of fraction. 615 | return adjustLastDigit(d, 616 | fraction, targetDiff*multiplier, allowance*multiplier, 617 | 1< maxDiff-ulpBinary { 640 | // we went too far 641 | return false 642 | } 643 | if d.nd == 1 && d.d[0] == '0' { 644 | // the number has actually reached zero. 645 | d.nd = 0 646 | d.dp = 0 647 | } 648 | return true 649 | } 650 | -------------------------------------------------------------------------------- /internal/strconv/ftoa.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Binary to decimal floating point conversion. 6 | // Algorithm: 7 | // 1) store mantissa in multiprecision decimal 8 | // 2) shift decimal by exponent 9 | // 3) read digits out & format 10 | 11 | package strconv 12 | 13 | import ( 14 | "math" 15 | 16 | "github.com/mewmew/float/binary16" 17 | ) 18 | 19 | var optimize = true // can change for testing 20 | 21 | // TODO: move elsewhere? 22 | type floatInfo struct { 23 | mantbits uint 24 | expbits uint 25 | bias int 26 | } 27 | 28 | var float16info = floatInfo{10, 5, -15} 29 | var float32info = floatInfo{23, 8, -127} 30 | var float64info = floatInfo{52, 11, -1023} 31 | 32 | // FormatFloat converts the floating-point number f to a string, 33 | // according to the format fmt and precision prec. It rounds the 34 | // result assuming that the original was obtained from a floating-point 35 | // value of bitSize bits (32 for float32, 64 for float64). 36 | // 37 | // The format fmt is one of 38 | // 'b' (-ddddp±ddd, a binary exponent), 39 | // 'e' (-d.dddde±dd, a decimal exponent), 40 | // 'E' (-d.ddddE±dd, a decimal exponent), 41 | // 'f' (-ddd.dddd, no exponent), 42 | // 'g' ('e' for large exponents, 'f' otherwise), or 43 | // 'G' ('E' for large exponents, 'f' otherwise). 44 | // 45 | // The precision prec controls the number of digits (excluding the exponent) 46 | // printed by the 'e', 'E', 'f', 'g', and 'G' formats. 47 | // For 'e', 'E', and 'f' it is the number of digits after the decimal point. 48 | // For 'g' and 'G' it is the maximum number of significant digits (trailing 49 | // zeros are removed). 50 | // The special precision -1 uses the smallest number of digits 51 | // necessary such that ParseFloat will return f exactly. 52 | func FormatFloat(f float64, fmt byte, prec, bitSize int) string { 53 | return string(genericFtoa(make([]byte, 0, max(prec+4, 24)), f, fmt, prec, bitSize)) 54 | } 55 | 56 | // AppendFloat appends the string form of the floating-point number f, 57 | // as generated by FormatFloat, to dst and returns the extended buffer. 58 | func AppendFloat(dst []byte, f float64, fmt byte, prec, bitSize int) []byte { 59 | return genericFtoa(dst, f, fmt, prec, bitSize) 60 | } 61 | 62 | func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte { 63 | var bits uint64 64 | var flt *floatInfo 65 | switch bitSize { 66 | case 16: 67 | f, acc := binary16.NewFromFloat64(val) 68 | _ = acc 69 | bits = uint64(f.Bits()) 70 | flt = &float16info 71 | case 32: 72 | bits = uint64(math.Float32bits(float32(val))) 73 | flt = &float32info 74 | case 64: 75 | bits = math.Float64bits(val) 76 | flt = &float64info 77 | default: 78 | panic("strconv: illegal AppendFloat/FormatFloat bitSize") 79 | } 80 | 81 | neg := bits>>(flt.expbits+flt.mantbits) != 0 82 | exp := int(bits>>flt.mantbits) & (1< digs.nd && digs.nd >= digs.dp { 214 | eprec = digs.nd 215 | } 216 | // %e is used if the exponent from the conversion 217 | // is less than -4 or greater than or equal to the precision. 218 | // if precision was the shortest possible, use precision 6 for this decision. 219 | if shortest { 220 | eprec = 6 221 | } 222 | exp := digs.dp - 1 223 | if exp < -4 || exp >= eprec { 224 | if prec > digs.nd { 225 | prec = digs.nd 226 | } 227 | return fmtE(dst, neg, digs, prec-1, fmt+'e'-'g') 228 | } 229 | if prec > digs.dp { 230 | prec = digs.nd 231 | } 232 | return fmtF(dst, neg, digs, max(prec-digs.dp, 0)) 233 | } 234 | 235 | // unknown format 236 | return append(dst, '%', fmt) 237 | } 238 | 239 | // roundShortest rounds d (= mant * 2^exp) to the shortest number of digits 240 | // that will let the original floating point value be precisely reconstructed. 241 | func roundShortest(d *decimal, mant uint64, exp int, flt *floatInfo) { 242 | // If mantissa is zero, the number is zero; stop now. 243 | if mant == 0 { 244 | d.nd = 0 245 | return 246 | } 247 | 248 | // Compute upper and lower such that any decimal number 249 | // between upper and lower (possibly inclusive) 250 | // will round to the original floating point number. 251 | 252 | // We may see at once that the number is already shortest. 253 | // 254 | // Suppose d is not denormal, so that 2^exp <= d < 10^dp. 255 | // The closest shorter number is at least 10^(dp-nd) away. 256 | // The lower/upper bounds computed below are at distance 257 | // at most 2^(exp-mantbits). 258 | // 259 | // So the number is already shortest if 10^(dp-nd) > 2^(exp-mantbits), 260 | // or equivalently log2(10)*(dp-nd) > exp-mantbits. 261 | // It is true if 332/100*(dp-nd) >= exp-mantbits (log2(10) > 3.32). 262 | minexp := flt.bias + 1 // minimum possible exponent 263 | if exp > minexp && 332*(d.dp-d.nd) >= 100*(exp-int(flt.mantbits)) { 264 | // The number is already shortest. 265 | return 266 | } 267 | 268 | // d = mant << (exp - mantbits) 269 | // Next highest floating point number is mant+1 << exp-mantbits. 270 | // Our upper bound is halfway between, mant*2+1 << exp-mantbits-1. 271 | upper := new(decimal) 272 | upper.Assign(mant*2 + 1) 273 | upper.Shift(exp - int(flt.mantbits) - 1) 274 | 275 | // d = mant << (exp - mantbits) 276 | // Next lowest floating point number is mant-1 << exp-mantbits, 277 | // unless mant-1 drops the significant bit and exp is not the minimum exp, 278 | // in which case the next lowest is mant*2-1 << exp-mantbits-1. 279 | // Either way, call it mantlo << explo-mantbits. 280 | // Our lower bound is halfway between, mantlo*2+1 << explo-mantbits-1. 281 | var mantlo uint64 282 | var explo int 283 | if mant > 1< 0 { 359 | dst = append(dst, '.') 360 | i := 1 361 | m := min(d.nd, prec+1) 362 | if i < m { 363 | dst = append(dst, d.d[i:m]...) 364 | i = m 365 | } 366 | for ; i <= prec; i++ { 367 | dst = append(dst, '0') 368 | } 369 | } 370 | 371 | // e± 372 | dst = append(dst, fmt) 373 | exp := d.dp - 1 374 | if d.nd == 0 { // special case: 0 has exponent 0 375 | exp = 0 376 | } 377 | if exp < 0 { 378 | ch = '-' 379 | exp = -exp 380 | } else { 381 | ch = '+' 382 | } 383 | dst = append(dst, ch) 384 | 385 | // dd or ddd 386 | switch { 387 | case exp < 10: 388 | dst = append(dst, '0', byte(exp)+'0') 389 | case exp < 100: 390 | dst = append(dst, byte(exp/10)+'0', byte(exp%10)+'0') 391 | default: 392 | dst = append(dst, byte(exp/100)+'0', byte(exp/10)%10+'0', byte(exp%10)+'0') 393 | } 394 | 395 | return dst 396 | } 397 | 398 | // %f: -ddddddd.ddddd 399 | func fmtF(dst []byte, neg bool, d decimalSlice, prec int) []byte { 400 | // sign 401 | if neg { 402 | dst = append(dst, '-') 403 | } 404 | 405 | // integer, padded with zeros as needed. 406 | if d.dp > 0 { 407 | m := min(d.nd, d.dp) 408 | dst = append(dst, d.d[:m]...) 409 | for ; m < d.dp; m++ { 410 | dst = append(dst, '0') 411 | } 412 | } else { 413 | dst = append(dst, '0') 414 | } 415 | 416 | // fraction 417 | if prec > 0 { 418 | dst = append(dst, '.') 419 | for i := 0; i < prec; i++ { 420 | ch := byte('0') 421 | if j := d.dp + i; 0 <= j && j < d.nd { 422 | ch = d.d[j] 423 | } 424 | dst = append(dst, ch) 425 | } 426 | } 427 | 428 | return dst 429 | } 430 | 431 | // %b: -ddddddddp±ddd 432 | func fmtB(dst []byte, neg bool, mant uint64, exp int, flt *floatInfo) []byte { 433 | // sign 434 | if neg { 435 | dst = append(dst, '-') 436 | } 437 | 438 | // mantissa 439 | dst, _ = formatBits(dst, mant, 10, false, true) 440 | 441 | // p 442 | dst = append(dst, 'p') 443 | 444 | // ±exponent 445 | exp -= int(flt.mantbits) 446 | if exp >= 0 { 447 | dst = append(dst, '+') 448 | } 449 | dst, _ = formatBits(dst, uint64(exp), 10, exp < 0, true) 450 | 451 | return dst 452 | } 453 | 454 | func min(a, b int) int { 455 | if a < b { 456 | return a 457 | } 458 | return b 459 | } 460 | 461 | func max(a, b int) int { 462 | if a > b { 463 | return a 464 | } 465 | return b 466 | } 467 | -------------------------------------------------------------------------------- /internal/strconv/itoa.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package strconv 6 | 7 | import "math/bits" 8 | 9 | const fastSmalls = true // enable fast path for small integers 10 | 11 | // FormatUint returns the string representation of i in the given base, 12 | // for 2 <= base <= 36. The result uses the lower-case letters 'a' to 'z' 13 | // for digit values >= 10. 14 | func FormatUint(i uint64, base int) string { 15 | if fastSmalls && i < nSmalls && base == 10 { 16 | return small(int(i)) 17 | } 18 | _, s := formatBits(nil, i, base, false, false) 19 | return s 20 | } 21 | 22 | // FormatInt returns the string representation of i in the given base, 23 | // for 2 <= base <= 36. The result uses the lower-case letters 'a' to 'z' 24 | // for digit values >= 10. 25 | func FormatInt(i int64, base int) string { 26 | if fastSmalls && 0 <= i && i < nSmalls && base == 10 { 27 | return small(int(i)) 28 | } 29 | _, s := formatBits(nil, uint64(i), base, i < 0, false) 30 | return s 31 | } 32 | 33 | // Itoa is shorthand for FormatInt(int64(i), 10). 34 | func Itoa(i int) string { 35 | return FormatInt(int64(i), 10) 36 | } 37 | 38 | // AppendInt appends the string form of the integer i, 39 | // as generated by FormatInt, to dst and returns the extended buffer. 40 | func AppendInt(dst []byte, i int64, base int) []byte { 41 | if fastSmalls && 0 <= i && i < nSmalls && base == 10 { 42 | return append(dst, small(int(i))...) 43 | } 44 | dst, _ = formatBits(dst, uint64(i), base, i < 0, true) 45 | return dst 46 | } 47 | 48 | // AppendUint appends the string form of the unsigned integer i, 49 | // as generated by FormatUint, to dst and returns the extended buffer. 50 | func AppendUint(dst []byte, i uint64, base int) []byte { 51 | if fastSmalls && i < nSmalls && base == 10 { 52 | return append(dst, small(int(i))...) 53 | } 54 | dst, _ = formatBits(dst, i, base, false, true) 55 | return dst 56 | } 57 | 58 | // small returns the string for an i with 0 <= i < nSmalls. 59 | func small(i int) string { 60 | if i < 10 { 61 | return digits[i : i+1] 62 | } 63 | return smallsString[i*2 : i*2+2] 64 | } 65 | 66 | const nSmalls = 100 67 | 68 | const smallsString = "00010203040506070809" + 69 | "10111213141516171819" + 70 | "20212223242526272829" + 71 | "30313233343536373839" + 72 | "40414243444546474849" + 73 | "50515253545556575859" + 74 | "60616263646566676869" + 75 | "70717273747576777879" + 76 | "80818283848586878889" + 77 | "90919293949596979899" 78 | 79 | const host32bit = ^uint(0)>>32 == 0 80 | 81 | const digits = "0123456789abcdefghijklmnopqrstuvwxyz" 82 | 83 | // formatBits computes the string representation of u in the given base. 84 | // If neg is set, u is treated as negative int64 value. If doAppend is 85 | // set, the string is appended to dst and the resulting byte slice is 86 | // returned as the first result value; otherwise the string is returned 87 | // as the second result value. 88 | // 89 | func formatBits(dst []byte, u uint64, base int, neg, doAppend bool) (d []byte, s string) { 90 | if base < 2 || base > len(digits) { 91 | panic("strconv: illegal AppendInt/FormatInt base") 92 | } 93 | // 2 <= base && base <= len(digits) 94 | 95 | var a [64 + 1]byte // +1 for sign of 64bit value in base 2 96 | i := len(a) 97 | 98 | if neg { 99 | u = -u 100 | } 101 | 102 | // convert bits 103 | // We use uint values where we can because those will 104 | // fit into a single register even on a 32bit machine. 105 | if base == 10 { 106 | // common case: use constants for / because 107 | // the compiler can optimize it into a multiply+shift 108 | 109 | if host32bit { 110 | // convert the lower digits using 32bit operations 111 | for u >= 1e9 { 112 | // Avoid using r = a%b in addition to q = a/b 113 | // since 64bit division and modulo operations 114 | // are calculated by runtime functions on 32bit machines. 115 | q := u / 1e9 116 | us := uint(u - q*1e9) // u % 1e9 fits into a uint 117 | for j := 4; j > 0; j-- { 118 | is := us % 100 * 2 119 | us /= 100 120 | i -= 2 121 | a[i+1] = smallsString[is+1] 122 | a[i+0] = smallsString[is+0] 123 | } 124 | 125 | // us < 10, since it contains the last digit 126 | // from the initial 9-digit us. 127 | i-- 128 | a[i] = smallsString[us*2+1] 129 | 130 | u = q 131 | } 132 | // u < 1e9 133 | } 134 | 135 | // u guaranteed to fit into a uint 136 | us := uint(u) 137 | for us >= 100 { 138 | is := us % 100 * 2 139 | us /= 100 140 | i -= 2 141 | a[i+1] = smallsString[is+1] 142 | a[i+0] = smallsString[is+0] 143 | } 144 | 145 | // us < 100 146 | is := us * 2 147 | i-- 148 | a[i] = smallsString[is+1] 149 | if us >= 10 { 150 | i-- 151 | a[i] = smallsString[is] 152 | } 153 | 154 | } else if isPowerOfTwo(base) { 155 | // Use shifts and masks instead of / and %. 156 | // Base is a power of 2 and 2 <= base <= len(digits) where len(digits) is 36. 157 | // The largest power of 2 below or equal to 36 is 32, which is 1 << 5; 158 | // i.e., the largest possible shift count is 5. By &-ind that value with 159 | // the constant 7 we tell the compiler that the shift count is always 160 | // less than 8 which is smaller than any register width. This allows 161 | // the compiler to generate better code for the shift operation. 162 | shift := uint(bits.TrailingZeros(uint(base))) & 7 163 | b := uint64(base) 164 | m := uint(base) - 1 // == 1<= b { 166 | i-- 167 | a[i] = digits[uint(u)&m] 168 | u >>= shift 169 | } 170 | // u < base 171 | i-- 172 | a[i] = digits[uint(u)] 173 | } else { 174 | // general case 175 | b := uint64(base) 176 | for u >= b { 177 | i-- 178 | // Avoid using r = a%b in addition to q = a/b 179 | // since 64bit division and modulo operations 180 | // are calculated by runtime functions on 32bit machines. 181 | q := u / b 182 | a[i] = digits[uint(u-q*b)] 183 | u = q 184 | } 185 | // u < base 186 | i-- 187 | a[i] = digits[uint(u)] 188 | } 189 | 190 | // add sign, if any 191 | if neg { 192 | i-- 193 | a[i] = '-' 194 | } 195 | 196 | if doAppend { 197 | d = append(dst, a[i:]...) 198 | return 199 | } 200 | s = string(a[i:]) 201 | return 202 | } 203 | 204 | func isPowerOfTwo(x int) bool { 205 | return x&(x-1) == 0 206 | } 207 | --------------------------------------------------------------------------------