├── .golangci.yml
├── .travis.yml
├── README.md
├── UNLICENSE
├── bfloat
    ├── bfloat.go
    └── bfloat_test.go
├── binary128
    ├── binary128.go
    ├── binary128_test.go
    ├── extra_test.go
    ├── extra_test.tmpl
    └── gen.go
├── binary16
    ├── binary16.go
    ├── binary16_test.go
    ├── extra_test.go
    ├── extra_test.tmpl
    ├── gen.go
    └── testdata
    │   ├── Makefile
    │   ├── binary16.c
    │   └── binary16.ll
├── float.go
├── float128ppc
    ├── float128ppc.go
    └── float128ppc_test.go
├── float80x86
    ├── float80x86.go
    └── float80x86_test.go
├── go.mod
├── go.sum
└── internal
    └── strconv
        ├── decimal.go
        ├── extfloat.go
        ├── ftoa.go
        └── itoa.go


/.golangci.yml:
--------------------------------------------------------------------------------
1 | linters:
2 |   enable-all: true
3 |   disable:
4 |     - dupl
5 |     - maligned
6 |     - lll
7 |     - gochecknoglobals
8 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: go
 2 | 
 3 | go:
 4 |   - "1.12"
 5 |   - tip
 6 | 
 7 | notifications:
 8 |   email: false
 9 | 
10 | env:
11 |   global:
12 |     # Coveralls.io token.
13 |     - secure: "gIMQehPQgokFi1b2weJcxY3t5n17Idzr4ONULNylyERInyFg7RtuAPbqi1O2QXqroVDSI+lCYy/hmxvuHIlg0ps6lA4lUiDkr1ScC/p05vGTjEamlC2AacCbP1Oa8OSBhxsKwfu8Z713q/ppfvVlUrOpI9XS1pdQh6QAu3vdOpFPFEewbgQDUZj0K21raj8DROUFo0W548eHTj4CQbgSIkKtbysXrvwvR7fEvqzRnq/7HDqH+6JMahrHQRIrFIHdL8SZxtkKiR9/1QdmXVmY/ZjQUgKJWzGfFBPd5IfrrLNGupZIUFsOd5S2oUmFXiwYXdJ3HtyEJVHEM8M1UjJp/XLDmPFXeu4o1C3FpL8fOmFPda6iANUyMOGnquW+jPNNXAWfhMF06vjtPFDe5XpsemaXrwhmweMGsauBVWMfI9tGzbyko+bSYSlPjjGpcaEamGH1ioUULFMgtHz/cPm+mbvqqG/7Ccrhu8j1bLuZy3893IL/8miOmKrGu+8U6vUYO5PD+edCTx36uIR211mDzakchjttZkT7QR/Zox7QHW3GGvfpKeuPOCcFoG2ufEuRIWGNxH9c3hLfIq/xklLPIG+Oykuh7aNxGfplqaNVfRIdrwVis/BsW/N5Er7+qCePAvzxbIKfVcy77en20N5LUgdxe2CuuLOqRF5tXAqPzTE="
14 | 
15 | install:
16 |   - go get -t ./...
17 | 
18 | before_script:
19 |   - wget https://github.com/mewmew/ci/raw/master/get_tools.sh
20 |   - chmod +x get_tools.sh
21 |   - ./get_tools.sh
22 |   - wget https://github.com/mewmew/ci/raw/master/ci_checks.sh
23 |   - chmod +x ci_checks.sh
24 | 
25 | script:
26 |   - GOTEST_RACE=0 ./ci_checks.sh
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # float
 2 | 
 3 | [![Build Status](https://travis-ci.org/mewmew/float.svg?branch=master)](https://travis-ci.org/mewmew/float)
 4 | [![Coverage Status](https://coveralls.io/repos/github/mewmew/float/badge.svg?branch=master)](https://coveralls.io/github/mewmew/float?branch=master)
 5 | [![go.dev reference](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white&style=flat-square)](https://pkg.go.dev/github.com/mewmew/float)
 6 | 
 7 | Floating-point formats.
 8 | 
 9 | * [binary16](https://pkg.go.dev/github.com/mewmew/float/binary16) (IEEE 754 [half precision](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) floating-point format)
10 | * [binary128](https://pkg.go.dev/github.com/mewmew/float/binary128) (IEEE 754 [quadruple precision](https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format) floating-point format)
11 | * [float80x86](https://pkg.go.dev/github.com/mewmew/float/float80x86) ([x86 extended precision](https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format) floating-point format)
12 | * [float128ppc](https://pkg.go.dev/github.com/mewmew/float/float128ppc) ([PowerPC double-double arithmetic](https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Double-double_arithmetic) floating-point format)
13 | 


--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/bfloat/bfloat.go:
--------------------------------------------------------------------------------
  1 | package bfloat
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"math/big"
  6 | )
  7 | 
const (
	// precision specifies the number of bits in the mantissa (including the
	// implicit lead bit): 7 stored fraction bits plus the lead bit.
	precision = 8
	// bias is the exponent bias; it is subtracted from the stored exponent
	// field to obtain the power of two of a normalized number.
	bias = 127
)
 15 | 
// Float is a floating-point number in bfloat16 floating-point format.
type Float struct {
	// Sign, exponent and fraction, laid out from the most significant bit
	// down to the least significant bit.
	//
	//    1 bit:   sign
	//    8 bits:  exponent
	//    7 bits:  fraction
	bits uint16
}
 25 | 
 26 | func NewFromBits(bits uint16) Float {
 27 | 	return Float{bits: bits}
 28 | }
 29 | 
 30 | func (f Float) Big() (x *big.Float, nan bool) {
 31 | 	signbit := f.Signbit()
 32 | 	exp := f.Exp()
 33 | 	frac := f.Frac()
 34 | 	x = big.NewFloat(0)
 35 | 	x.SetPrec(precision)
 36 | 	x.SetMode(big.ToNearestEven)
 37 | 
 38 | 	// ref: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Contrast_with_bfloat16_and_single_precision
 39 | 	//
 40 | 	// 0b00001 - 0b11110
 41 | 	// Normalized number.
 42 | 	//
 43 | 	//    (-1)^signbit * 2^(exp-127) * 1.mant_2
 44 | 	lead := 1
 45 | 	exponent := exp - bias
 46 | 
 47 | 	switch exp {
 48 | 	case 0xFF:
 49 | 		// Inf or NaN
 50 | 		if frac == 0 {
 51 | 			// +-Inf
 52 | 			x.SetInf(signbit)
 53 | 			return x, false
 54 | 		}
 55 | 		// +-NaN
 56 | 		if signbit {
 57 | 			x.Neg(x)
 58 | 		}
 59 | 		return x, true
 60 | 	case 0x00:
 61 | 		if frac == 0 {
 62 | 			// +-Zero
 63 | 			if signbit {
 64 | 				x.Neg(x)
 65 | 			}
 66 | 			return x, false
 67 | 		}
 68 | 		// Denormalized number.
 69 | 		//
 70 | 		//    (-1)^signbit * 2^(-126) * 0.mant_2
 71 | 		lead = 0
 72 | 		exponent = -126
 73 | 	}
 74 | 
 75 | 	// number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity .
 76 | 	sign := "+"
 77 | 	if signbit {
 78 | 		sign = "-"
 79 | 	}
 80 | 	s := fmt.Sprintf("%s0b%d.%07bp%d", sign, lead, frac, exponent)
 81 | 	if _, _, err := x.Parse(s, 0); err != nil {
 82 | 		panic(err)
 83 | 	}
 84 | 	return x, false
 85 | }
 86 | 
 87 | // Signbit reports whether f is negative or negative 0.
 88 | func (f Float) Signbit() bool {
 89 | 	// first bit is sign bit: 0b1000000000000000
 90 | 	return f.bits&0x8000 != 0
 91 | }
 92 | 
 93 | // Exp returns the exponent of f.
 94 | func (f Float) Exp() int {
 95 | 	// 8 bit exponent: 0b0111111110000000
 96 | 	return int(f.bits & 0x7F80 >> 7)
 97 | }
 98 | 
 99 | // Frac returns the fraction of f.
100 | func (f Float) Frac() uint16 {
101 | 	// 7 bit mantissa: 0b0000000001111111
102 | 	return f.bits & 0x7F
103 | }
104 | 


--------------------------------------------------------------------------------
/bfloat/bfloat_test.go:
--------------------------------------------------------------------------------
 1 | package bfloat
 2 | 
 3 | import (
 4 | 	"math"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestNewFromBits(t *testing.T) {
 9 | 	golden := []struct {
10 | 		bits uint16
11 | 		want float64
12 | 	}{
13 | 		// Special numbers.
14 | 		// 0 00000000 0000000 = 0
15 | 		{bits: 0, want: 0},
16 | 		// 1 00000000 0000000 = -0
17 | 		{bits: 0x8000, want: 1. / math.Inf(-1)},
18 | 		// 0 11111111 0000000 = +Inf
19 | 		{bits: 0x7f80, want: math.Inf(1)},
20 | 		// 1 11111111 0000000 = -Inf
21 | 		{bits: 0xff80, want: math.Inf(-1)},
22 | 
23 | 		// 0 11111111 0000001 = +NaN
24 | 		{bits: 0x7f81, want: math.NaN()},
25 | 		// 1 11111111 0000001 = -NaN
26 | 		{bits: 0xff81, want: -math.NaN()},
27 | 
28 | 		// from: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Examples
29 | 		{bits: 0x3f80, want: 1},
30 | 		{bits: 0xc000, want: -2},
31 | 		{bits: 0x4049, want: 3.140625},
32 | 		{bits: 0x3eab, want: 0.333984375},
33 | 	}
34 | 	for _, g := range golden {
35 | 		f := NewFromBits(g.bits)
36 | 		b, isNan := f.Big()
37 | 		got, _ := b.Float64()
38 | 		if isNan {
39 | 			got = g.want
40 | 		}
41 | 		wantBits := math.Float64bits(g.want)
42 | 		gotBits := math.Float64bits(got)
43 | 		if wantBits != gotBits {
44 | 			t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got)
45 | 		}
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/binary128/binary128.go:
--------------------------------------------------------------------------------
  1 | //go:generate go run gen.go -o extra_test.go
  2 | 
  3 | // Package binary128 implements encoding and decoding of IEEE 754 quadruple
  4 | // precision floating-point numbers.
  5 | //
  6 | // https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format
  7 | package binary128
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"math"
 12 | 	"math/big"
 13 | )
 14 | 
const (
	// precision specifies the number of bits in the mantissa (including the
	// implicit lead bit): 112 stored fraction bits plus the lead bit.
	precision = 113
	// bias is the exponent bias; it is subtracted from the stored exponent
	// field to obtain the power of two of a normalized number.
	bias = 16383
)
 22 | 
// Positive and negative Not-a-Number, infinity and zero.
var (
	// NaN is the positive Not-a-Number: exponent field all ones, most
	// significant fraction bit set.
	NaN = Float{a: 0x7FFF800000000000, b: 0}
	// NegNaN is the negative Not-a-Number: NaN with the sign bit set.
	NegNaN = Float{a: 0xFFFF800000000000, b: 0}
	// Inf is positive infinity: exponent field all ones, fraction zero.
	Inf = Float{a: 0x7FFF000000000000, b: 0}
	// NegInf is negative infinity: Inf with the sign bit set.
	NegInf = Float{a: 0xFFFF000000000000, b: 0}
	// Zero is positive zero: all bits clear.
	Zero = Float{a: 0x0000000000000000, b: 0}
	// NegZero is negative zero: only the sign bit set.
	NegZero = Float{a: 0x8000000000000000, b: 0}
)
 38 | 
// Float is a floating-point number in IEEE 754 quadruple precision format.
type Float struct {
	// Sign, exponent and fraction.
	//
	//    1 bit:    sign
	//    15 bits:  exponent
	//    112 bits: fraction
	//
	// a holds the sign, the exponent and the upper 48 bits of the fraction;
	// b holds the lower 64 bits of the fraction.
	a uint64
	b uint64
}
 49 | 
 50 | // NewFromBits returns the floating-point number corresponding to the IEEE 754
 51 | // quadruple precision binary representation.
 52 | func NewFromBits(a, b uint64) Float {
 53 | 	return Float{a: a, b: b}
 54 | }
 55 | 
 56 | // NewFromFloat32 returns the nearest quadruple precision floating-point number
 57 | // for x and the accuracy of the conversion.
 58 | func NewFromFloat32(x float32) (Float, big.Accuracy) {
 59 | 	f, acc := NewFromFloat64(float64(x))
 60 | 	if acc == big.Exact {
 61 | 		_, acc = f.Float32()
 62 | 	}
 63 | 	return f, acc
 64 | }
 65 | 
 66 | // NewFromFloat64 returns the nearest quadruple precision floating-point number
 67 | // for x and the accuracy of the conversion.
 68 | func NewFromFloat64(x float64) (Float, big.Accuracy) {
 69 | 	// +-NaN
 70 | 	switch {
 71 | 	case math.IsNaN(x):
 72 | 		if math.Signbit(x) {
 73 | 			// -NaN
 74 | 			return NegNaN, big.Exact
 75 | 		}
 76 | 		// +NaN
 77 | 		return NaN, big.Exact
 78 | 	}
 79 | 	y := big.NewFloat(x)
 80 | 	y.SetPrec(precision)
 81 | 	y.SetMode(big.ToNearestEven)
 82 | 	// TODO: check accuracy after setting precision?
 83 | 	return NewFromBig(y)
 84 | }
 85 | 
 86 | // NewFromBig returns the nearest quadruple precision floating-point number for
 87 | // x and the accuracy of the conversion.
 88 | func NewFromBig(x *big.Float) (Float, big.Accuracy) {
 89 | 	// +-Inf
 90 | 	zero := big.NewFloat(0).SetPrec(precision)
 91 | 	switch {
 92 | 	case x.IsInf():
 93 | 		if x.Signbit() {
 94 | 			// -Inf
 95 | 			return NegInf, big.Exact
 96 | 		}
 97 | 		// +Inf
 98 | 		return Inf, big.Exact
 99 | 	// +-zero
100 | 	case x.Cmp(zero) == 0:
101 | 		if x.Signbit() {
102 | 			// -zero
103 | 			return NegZero, big.Exact
104 | 		}
105 | 		// +zero
106 | 		return Zero, big.Exact
107 | 	}
108 | 
109 | 	// Sign
110 | 	var a, b uint64
111 | 	if x.Signbit() {
112 | 		a |= 0x8000000000000000
113 | 	}
114 | 
115 | 	// Exponent and mantissa.
116 | 	mant := new(big.Float).SetPrec(precision)
117 | 	exponent := x.MantExp(mant)
118 | 	// Remove 1 from the exponent as big.Float has an no lead bit.
119 | 	exp := exponent - 1 + bias
120 | 
121 | 	// Handle denormalized values.
122 | 	// TODO: validate implementation of denormalized values.
123 | 	if exp <= 0 {
124 | 		acc := big.Exact
125 | 		if exp <= -(precision - 1) {
126 | 			exp = precision - 1
127 | 			acc = big.Below
128 | 		}
129 | 		mant.SetMantExp(mant, exp+precision-1)
130 | 		if mant.Signbit() {
131 | 			mant.Neg(mant)
132 | 		}
133 | 		mantissa, _ := mant.Int(nil)
134 | 		maskA := big.NewInt(0)
135 | 		for i := 64; i < 112; i++ {
136 | 			maskA.SetBit(maskA, i, 1)
137 | 		}
138 | 		maskB := big.NewInt(0)
139 | 		for i := 0; i < 64; i++ {
140 | 			maskB.SetBit(maskB, i, 1)
141 | 		}
142 | 		bigA := new(big.Int).And(mantissa, maskA) // a = (mantissa & maskA) >> 64
143 | 		bigA.Rsh(bigA, 64)
144 | 		bigB := new(big.Int).And(mantissa, maskB) // b = mantissa & maskB
145 | 		// TODO: calculate acc based on if mantissa&^maskA != 0 {}
146 | 		a |= bigA.Uint64() & 0x0000FFFFFFFFFFFF
147 | 		b = bigB.Uint64()
148 | 		return Float{a: a, b: b}, acc
149 | 	}
150 | 
151 | 	// exponent mask (15 bits): 0b111111111111111
152 | 	acc := big.Exact
153 | 	if (exp &^ 0x7FFF) != 0 {
154 | 		acc = big.Above
155 | 	}
156 | 	a |= uint64(exp&0x7FFF) << 48
157 | 
158 | 	if mant.Signbit() {
159 | 		mant.Neg(mant)
160 | 	}
161 | 	mant.SetMantExp(mant, precision)
162 | 	if !mant.IsInt() {
163 | 		acc = big.Below
164 | 	}
165 | 	mantissa, _ := mant.Int(nil)
166 | 	mantissa.SetBit(mantissa, 112, 0) // clear implicit lead bit; 2^112
167 | 
168 | 	// mantissa mask (113 bits, including implicit lead bit): 0x1FFFFFFFFFFFFFFFFFFFFFFFFFFFF
169 | 	maskA := big.NewInt(0)
170 | 	for i := 64; i < 112; i++ {
171 | 		maskA.SetBit(maskA, i, 1)
172 | 	}
173 | 	maskB := big.NewInt(0)
174 | 	for i := 0; i < 64; i++ {
175 | 		maskB.SetBit(maskB, i, 1)
176 | 	}
177 | 	bigA := new(big.Int).And(mantissa, maskA) // a = (mantissa & maskA) >> 64
178 | 	bigA.Rsh(bigA, 64)
179 | 	bigB := new(big.Int).And(mantissa, maskB) // b = mantissa & maskB
180 | 	if acc == big.Exact && (bigA.Uint64()&^0x0000FFFFFFFFFFFF) != 0 {
181 | 		acc = big.Below
182 | 	}
183 | 	a |= bigA.Uint64() & 0x0000FFFFFFFFFFFF
184 | 	b = bigB.Uint64()
185 | 	return Float{a: a, b: b}, acc
186 | }
187 | 
188 | // Bits returns the IEEE 754 quadruple precision binary representation of f.
189 | func (f Float) Bits() (a, b uint64) {
190 | 	return f.a, f.b
191 | }
192 | 
193 | // Float32 returns the float32 value nearest to f. If f is too small to be
194 | // represented by a float32 (|f| < math.SmallestNonzeroFloat32), the result is
195 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is
196 | // too large to be represented by a float32 (|f| > math.MaxFloat32), the result
197 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f.
198 | func (f Float) Float32() (float32, big.Accuracy) {
199 | 	x, nan := f.Big()
200 | 	if nan {
201 | 		if x.Signbit() {
202 | 			return float32(-math.NaN()), big.Exact
203 | 		}
204 | 		return float32(math.NaN()), big.Exact
205 | 	}
206 | 	return x.Float32()
207 | }
208 | 
209 | // Float64 returns the float64 value nearest to f. If f is too small to be
210 | // represented by a float64 (|f| < math.SmallestNonzeroFloat64), the result is
211 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is
212 | // too large to be represented by a float64 (|f| > math.MaxFloat64), the result
213 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f.
214 | func (f Float) Float64() (float64, big.Accuracy) {
215 | 	x, nan := f.Big()
216 | 	if nan {
217 | 		if x.Signbit() {
218 | 			return -math.NaN(), big.Exact
219 | 		}
220 | 		return math.NaN(), big.Exact
221 | 	}
222 | 	return x.Float64()
223 | }
224 | 
225 | // Big returns the multi-precision floating-point number representation of f and
226 | // a boolean indicating whether f is Not-a-Number.
227 | func (f Float) Big() (x *big.Float, nan bool) {
228 | 	signbit := f.Signbit()
229 | 	exp := f.Exp()
230 | 	frac1, frac2 := f.Frac()
231 | 	x = big.NewFloat(0)
232 | 	x.SetPrec(precision)
233 | 	x.SetMode(big.ToNearestEven)
234 | 
235 | 	lead := 1
236 | 	exponent := exp - bias
237 | 
238 | 	switch exp {
239 | 	// 0b111111111111111
240 | 	case 0x7FFF:
241 | 		// Inf or NaN
242 | 		if frac1 == 0 && frac2 == 0 {
243 | 			// +-Inf
244 | 			x.SetInf(signbit)
245 | 			return x, false
246 | 		}
247 | 		// +-NaN
248 | 		if signbit {
249 | 			x.Neg(x)
250 | 		}
251 | 		return x, true
252 | 	// 0b000000000000000
253 | 	case 0x0000:
254 | 		if frac1 == 0 && frac2 == 0 {
255 | 			// +-Zero
256 | 			if signbit {
257 | 				x.Neg(x)
258 | 			}
259 | 			return x, false
260 | 		}
261 | 		// Denormalized number.
262 | 		//
263 | 		//    (-1)^signbit * 2^(-16382) * 0.mant_2
264 | 		lead = 0
265 | 		exponent = -16382
266 | 	}
267 | 
268 | 	// number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity .
269 | 	sign := "+"
270 | 	if signbit {
271 | 		sign = "-"
272 | 	}
273 | 	// first part cut the sign and exponent which only contains 48 bits
274 | 	fracStr := fmt.Sprintf("%048b%064b", frac1, frac2)
275 | 	s := fmt.Sprintf("%s0b%d.%sp%d", sign, lead, fracStr, exponent)
276 | 	if _, _, err := x.Parse(s, 0); err != nil {
277 | 		panic(err)
278 | 	}
279 | 	return x, false
280 | }
281 | 
282 | // Signbit reports whether f is negative or negative 0.
283 | func (f Float) Signbit() bool {
284 | 	// first bit is sign bit
285 | 	return f.a&0x8000000000000000 != 0
286 | }
287 | 
288 | // Exp returns the exponent of f.
289 | func (f Float) Exp() int {
290 | 	// 15 bit exponent
291 | 	return int(f.a&0x7FFF000000000000) >> 48
292 | }
293 | 
294 | // Frac returns the fraction of f.
295 | func (f Float) Frac() (uint64, uint64) {
296 | 	// 0x0000FFFFFFFFFFFF removes the sign and exponent part (total 16 bits) from
297 | 	// our floating-point number. Now we can say it contains 48 bits of fraction,
298 | 	// and `f.b` part has the rest of fraction.
299 | 	return (f.a & 0x0000FFFFFFFFFFFF), f.b
300 | }
301 | 


--------------------------------------------------------------------------------
/binary128/binary128_test.go:
--------------------------------------------------------------------------------
  1 | package binary128
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"math"
  6 | 	"math/big"
  7 | 	"testing"
  8 | )
  9 | 
 10 | func TestNewFromBits(t *testing.T) {
 11 | 	const rawpi = "3.1415926535897932384626433832795028"
 12 | 	pi, ok := newFloat(0).SetString(rawpi)
 13 | 	if !ok {
 14 | 		panic(fmt.Errorf("unable to create arbitrary floating-point value of pi (%q)", rawpi))
 15 | 	}
 16 | 	golden := []struct {
 17 | 		a, b uint64
 18 | 		want *big.Float
 19 | 		nan  bool
 20 | 	}{
 21 | 		// Special numbers.
 22 | 		// +NaN
 23 | 		// 0x7FFF 8000000000000000000000000000 = +NaN
 24 | 		{a: 0x7FFF800000000000, b: 0x0000000000000001, want: newFloat(0), nan: true},
 25 | 		// -NaN
 26 | 		// 0xFFFF 8000000000000000000000000000 = -NaN
 27 | 		{a: 0xFFFF800000000000, b: 0x0000000000000000, want: newFloat(math.Copysign(0, -1)), nan: true},
 28 | 		// +inf
 29 | 		// 0x7FFF0000000000000000000000000000 = +inf
 30 | 		{a: 0x7FFF000000000000, b: 0x0000000000000000, want: newFloat(0).SetInf(false)},
 31 | 		// -inf
 32 | 		// 0xFFFF0000000000000000000000000000 = -inf
 33 | 		{a: 0xFFFF000000000000, b: 0x0000000000000000, want: newFloat(0).SetInf(true)},
 34 | 		// +0
 35 | 		// 0x00000000000000000000000000000000 = +0
 36 | 		{a: 0x0000000000000000, b: 0x0000000000000000, want: newFloat(+0)},
 37 | 		// -0
 38 | 		// 0x80000000000000000000000000000000 = -0
 39 | 		{a: 0x8000000000000000, b: 0x0000000000000000, want: newFloat(math.Copysign(0, -1))},
 40 | 
 41 | 		// from: https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Quadruple_precision_examples
 42 | 
 43 | 		// smallest positive subnormal number
 44 | 		// 0x00000000000000000000000000000001 = 2^{-16382} * 2^{-112} =  2^{-16494}
 45 | 		{a: 0x0000000000000000, b: 0x0000000000000001, want: pow(newFloat(2), -16494)},
 46 | 		// largest subnormal number
 47 | 		// 0x0000FFFFFFFFFFFFFFFFFFFFFFFFFFFF = 2^{-16382} * (1 - 2^{-112})
 48 | 		{a: 0x0000FFFFFFFFFFFF, b: 0xFFFFFFFFFFFFFFFF, want: mul(pow(newFloat(2), -16382), sub(newFloat(1), pow(newFloat(2), -112)))},
 49 | 		// smallest positive normal number
 50 | 		// 0x00010000000000000000000000000000 = 2^{-16382}
 51 | 		{a: 0x0001000000000000, b: 0x0000000000000000, want: pow(newFloat(2), -16382)},
 52 | 		// largest normal number
 53 | 		// 0x7FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF = 2^16383 * (2 - 2^{-112})
 54 | 		{a: 0x7FFEFFFFFFFFFFFF, b: 0xFFFFFFFFFFFFFFFF, want: mul(pow(newFloat(2), 16383), sub(newFloat(2), pow(newFloat(2), -112)))},
 55 | 		// largest number less than one
 56 | 		// 0x3FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF = 1 - 2^{-113}
 57 | 		{a: 0x3FFEFFFFFFFFFFFF, b: 0xFFFFFFFFFFFFFFFF, want: sub(newFloat(1), pow(newFloat(2), -113))},
 58 | 		// one
 59 | 		// 0x3FFF0000000000000000000000000000 = 1
 60 | 		{a: 0x3FFF000000000000, b: 0x0000000000000000, want: newFloat(1)},
 61 | 		// smallest number larger than one
 62 | 		// 0x3FFF0000000000000000000000000001 = 1 + 2^{-112}
 63 | 		{a: 0x3FFF000000000000, b: 0x0000000000000001, want: add(newFloat(1), pow(newFloat(2), -112))},
 64 | 		// -2
 65 | 		// 0xC0000000000000000000000000000000 = -2
 66 | 		{a: 0xC000000000000000, b: 0x0000000000000000, want: newFloat(-2)},
 67 | 		// pi
 68 | 		// 0x4000921FB54442D18469898CC51701B8 = pi
 69 | 		{a: 0x4000921FB54442D1, b: 0x8469898CC51701B8, want: pi},
 70 | 		// 1/3
 71 | 		// 0x3FFD5555555555555555555555555555 = 1/3
 72 | 		{a: 0x3FFD555555555555, b: 0x5555555555555555, want: newFloat(0).SetRat(big.NewRat(1, 3))},
 73 | 	}
 74 | 	for _, g := range golden {
 75 | 		f := NewFromBits(g.a, g.b)
 76 | 		got, nan := f.Big()
 77 | 		if g.want.Cmp(got) != 0 {
 78 | 			t.Errorf("0x%016X%016X: floating-point number mismatch; expected %v, got %v", g.a, g.b, g.want, got)
 79 | 		}
 80 | 		if g.nan != nan {
 81 | 			t.Errorf("0x%016X%016X: floating-point Not-a-Number indicator mismatch; expected %v, got %v", g.a, g.b, g.nan, nan)
 82 | 		}
 83 | 	}
 84 | }
 85 | 
 86 | func TestNewFromFloat64(t *testing.T) {
 87 | 	golden := []struct {
 88 | 		in   float64
 89 | 		a, b uint64
 90 | 		acc  big.Accuracy
 91 | 	}{
 92 | 		// Special numbers.
 93 | 		// 0x7FFF 8000000000000000000000000000 = +NaN
 94 | 		{in: math.NaN(), a: 0x7FFF800000000000, b: 0x0000000000000000, acc: big.Exact},
 95 | 		// -NaN
 96 | 		// 0xFFFF 8000000000000000000000000000 = -NaN
 97 | 		{in: -math.NaN(), a: 0xFFFF800000000000, b: 0x0000000000000000, acc: big.Exact},
 98 | 		// +inf
 99 | 		// 0x7FFF0000000000000000000000000000 = +inf
100 | 		{in: math.Inf(+1), a: 0x7FFF000000000000, b: 0x0000000000000000, acc: big.Exact},
101 | 		// -inf
102 | 		// 0xFFFF0000000000000000000000000000 = -inf
103 | 		{in: math.Inf(-1), a: 0xFFFF000000000000, b: 0x0000000000000000, acc: big.Exact},
104 | 		// +0
105 | 		// 0x00000000000000000000000000000000 = +0
106 | 		{in: +0, a: 0x0000000000000000, b: 0x0000000000000000, acc: big.Exact},
107 | 		// -0
108 | 		// 0x80000000000000000000000000000000 = -0
109 | 		{in: math.Copysign(0, -1), a: 0x8000000000000000, b: 0x0000000000000000, acc: big.Exact},
110 | 
111 | 		// from: https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Quadruple_precision_examples
112 | 
113 | 		// one
114 | 		// 0x3FFF0000000000000000000000000000 = 1
115 | 		{in: 1, a: 0x3FFF000000000000, b: 0x0000000000000000, acc: big.Exact},
116 | 		// -2
117 | 		// 0xC0000000000000000000000000000000 = -2
118 | 		{in: -2, a: 0xC000000000000000, b: 0x0000000000000000, acc: big.Exact},
119 | 		// pi
120 | 		// 0x4000921FB54442D18469898CC51701B8 = pi
121 | 		{in: math.Pi, a: 0x4000921FB54442D1, b: 0x8469898CC51701B8, acc: big.Exact},
122 | 		// 1/3
123 | 		// 0x3FFD5555555555555555555555555555 = 1/3
124 | 		{in: 1.0 / 3.0, a: 0x3FFD555555555555, b: 0x5555555555555555, acc: big.Exact},
125 | 	}
126 | 	for _, g := range golden {
127 | 		f, acc := NewFromFloat64(g.in)
128 | 		a, b := f.Bits()
129 | 		x, _ := f.Float64()
130 | 		// mask last 60 bits, as float64 only has 53 bits precision (as compared to
131 | 		// 113 of binary128).
132 | 		const mask = 0xF000000000000000
133 | 		wantBMask := g.b & mask
134 | 		gotBMask := b & mask
135 | 		if g.a != a || wantBMask != gotBMask {
136 | 			t.Errorf("bits mismatch; expected 0x%016X%016X (%v), got 0x%016X%016X (%v)", g.a, wantBMask, g.in, a, gotBMask, x)
137 | 		}
138 | 		if g.acc != acc {
139 | 			t.Errorf("accuracy mismatch; expected %v (%v), got %v (%v)", g.acc, g.in, acc, x)
140 | 		}
141 | 	}
142 | }
143 | 
144 | // ### [ Helper functions ] ####################################################
145 | 
146 | // pow returns x**y, the base-x exponential of y.
147 | func pow(x *big.Float, y int64) *big.Float {
148 | 	switch {
149 | 	// x^{-42}
150 | 	case y < 0:
151 | 		z := newFloat(1)
152 | 		for i := int64(0); i < -y; i++ {
153 | 			z = div(z, x)
154 | 		}
155 | 		return z
156 | 	// x^42
157 | 	case y > 0:
158 | 		z := newFloat(1)
159 | 		for i := int64(0); i < y; i++ {
160 | 			z = mul(z, x)
161 | 		}
162 | 		return z
163 | 	// x^0
164 | 	default: // y == 0
165 | 		return newFloat(1)
166 | 	}
167 | }
168 | 
169 | // add returns the sum x+y.
170 | func add(x, y *big.Float) *big.Float {
171 | 	return newFloat(0).Add(x, y)
172 | }
173 | 
174 | // add returns the difference x-y.
175 | func sub(x, y *big.Float) *big.Float {
176 | 	return newFloat(0).Sub(x, y)
177 | }
178 | 
179 | // add returns the product x*y.
180 | func mul(x, y *big.Float) *big.Float {
181 | 	return newFloat(0).Mul(x, y)
182 | }
183 | 
184 | // add returns the quotient x/y.
185 | func div(x, y *big.Float) *big.Float {
186 | 	return newFloat(0).Quo(x, y)
187 | }
188 | 
189 | // newFloat returns a new floating-point value based on x with precision 113.
190 | func newFloat(x float64) *big.Float {
191 | 	return big.NewFloat(0).SetPrec(precision).SetFloat64(x)
192 | }
193 | 


--------------------------------------------------------------------------------
/binary128/extra_test.tmpl:
--------------------------------------------------------------------------------
  1 | // Code generated by go run gen.go; DO NOT EDIT.
  2 | 
  3 | package binary128
  4 | 
  5 | import (
  6 | 	"math"
  7 | 	"math/big"
  8 | 	"testing"
  9 | )
 10 | 
// TestNewFromBitsNormalized runs the testNewFromBits checks over the generated
// goldenNormalized values.
func TestNewFromBitsNormalized(t *testing.T) {
	testNewFromBits(t, goldenNormalized)
}
 14 | 
// TestNewFromBitsDenormalized runs the testNewFromBits checks over the
// generated goldenDenormalized values.
func TestNewFromBitsDenormalized(t *testing.T) {
	testNewFromBits(t, goldenDenormalized)
}
 18 | 
// testNewFromBits decodes each golden bit pattern with NewFromBits and checks:
//
//   - the textual form (35 significant digits) of the value returned by Big;
//   - the bits and the accuracy of the float64 conversion;
//   - that the accuracy reported by the float64 and float32 conversions agrees
//     with how the converted value actually compares to the golden value.
func testNewFromBits(t *testing.T, golden []Golden) {
	for _, g := range golden {
		f := NewFromBits(g.a, g.b)
		// Check arbitrary precision floating-point value.
		got, _ := f.Big()
		gotStr := got.Text('g', 35)
		if g.want != gotStr {
			t.Errorf("0x%016X%016X: floating-point number mismatch; expected %v, got %v", g.a, g.b, g.want, gotStr)
		}
		// Check 64-bit floating-point value.
		got64, acc64 := f.Float64()
		want64Bits := math.Float64bits(g.want64)
		got64Bits := math.Float64bits(got64)
		if want64Bits != got64Bits {
			t.Errorf("0x%016X%016X: floating-point number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.a, g.b, want64Bits, g.want64, got64Bits, got64)
		}
		// Check accuracy of 64-bit floating-point value, as compared to arbitrary
		// precision floating-point value.
		if g.acc64 != acc64 {
			t.Errorf("0x%016X%016X: floating-point accuracy mismatch for float64 %v of big %v; expected %v, got %v", g.a, g.b, g.want64, g.want, g.acc64, acc64)
		}
		// Validate 64-bit floating-point accuracy.
		//
		// The golden text is parsed at binary128 precision and serves as the
		// reference for the comparisons below; the parse result flag is
		// ignored.
		wantBig, _ := big.NewFloat(0).SetPrec(precision).SetString(g.want)
		got64Big := big.NewFloat(got64)
		switch acc64 {
		case big.Below:
			// got is below want
			if got64Big.Cmp(wantBig) != -1 {
				t.Errorf("%v: floating-point value not below; expected %v < %v, got %v >= %v", g.want, got64, g.want, got64, g.want)
			}
		case big.Exact:
			// got is want
			if got64Big.Cmp(wantBig) != 0 {
				t.Errorf("%v: floating-point value not equal; expected %v == %v, got %v != %v", g.want, got64, g.want, got64, g.want)
			}
		case big.Above:
			// got is above want
			if got64Big.Cmp(wantBig) != 1 {
				t.Errorf("%v: floating-point value not above; expected %v > %v, got %v <= %v", g.want, got64, g.want, got64, g.want)
			}
		}
		// Validate 32-bit floating-point accuracy.
		got32, acc32 := f.Float32()
		got32Big := big.NewFloat(float64(got32))
		switch acc32 {
		case big.Below:
			// got is below want
			if got32Big.Cmp(wantBig) != -1 {
				t.Errorf("%v: floating-point value not below; expected %v < %v, got %v >= %v", g.want, got32, g.want, got32, g.want)
			}
		case big.Exact:
			// got is want
			if got32Big.Cmp(wantBig) != 0 {
				t.Errorf("%v: floating-point value not equal; expected %v == %v, got %v != %v", g.want, got32, g.want, got32, g.want)
			}
		case big.Above:
			// got is above want
			if got32Big.Cmp(wantBig) != 1 {
				t.Errorf("%v: floating-point value not above; expected %v > %v, got %v <= %v", g.want, got32, g.want, got32, g.want)
			}
		}
	}
}
 82 | 
 83 | func TestNewFromFloat32Normalized(t *testing.T) {
 84 | 	for _, g := range goldenNormalized {
 85 | 		want32 := float32(g.want64)
 86 | 		f, _ := NewFromFloat32(want32)
 87 | 		got32, _ := f.Float32()
 88 | 		if want32 != got32 {
 89 | 			t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, want32, got32)
 90 | 		}
 91 | 	}
 92 | }
 93 | 
 94 | func TestNewFromFloat64Normalized(t *testing.T) {
 95 | 	for _, g := range goldenNormalized {
 96 | 		f, _ := NewFromFloat64(g.want64)
 97 | 		got64, _ := f.Float64()
 98 | 		if g.want64 != got64 {
 99 | 			t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, g.want64, got64)
100 | 		}
101 | 	}
102 | }
103 | 
104 | func TestNewFromFloat32Denormalized(t *testing.T) {
105 | 	for _, g := range goldenDenormalized {
106 | 		want32 := float32(g.want64)
107 | 		f, _ := NewFromFloat32(want32)
108 | 		got32, _ := f.Float32()
109 | 		if want32 != got32 {
110 | 			t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, want32, got32)
111 | 		}
112 | 	}
113 | }
114 | 
115 | func TestNewFromFloat64Denormalized(t *testing.T) {
116 | 	for _, g := range goldenDenormalized {
117 | 		f, _ := NewFromFloat64(g.want64)
118 | 		got64, _ := f.Float64()
119 | 		if g.want64 != got64 {
120 | 			t.Errorf("%v: floating-point value mismatch; expected %v, got %v", g.want, g.want64, got64)
121 | 		}
122 | 	}
123 | }
124 | 
// Golden is one generated test case for a 128-bit floating-point value.
type Golden struct {
	// a is the upper 64-bit word: sign bit, 15 exponent bits and the upper 48
	// mantissa bits.
	a uint64
	// b is the lower 64-bit word: the lower 64 mantissa bits.
	b uint64
	// want is the expected decimal text of the value.
	want string
	// want64 is the value converted to float64.
	want64 float64
	// acc64 is the accuracy of the conversion that produced want64.
	acc64 big.Accuracy
}
131 | 
// goldenNormalized holds the generated test cases for normalized values. The
// entries are emitted by the template from the "normalized" data key, one
// pre-formatted Golden literal per line.
var goldenNormalized = []Golden{
	// Normalized values.
	{{- range .normalized }}
	{{ . }}
	{{- end }}
}
138 | 
// goldenDenormalized holds the generated test cases for denormalized values.
// The entries are emitted by the template from the "denormalized" data key,
// one pre-formatted Golden literal per line.
var goldenDenormalized = []Golden{
	// Denormalized values.
	{{- range .denormalized }}
	{{ . }}
	{{- end }}
}
145 | 


--------------------------------------------------------------------------------
/binary128/gen.go:
--------------------------------------------------------------------------------
  1 | //+build ignore
  2 | 
  3 | package main
  4 | 
  5 | import (
  6 | 	"flag"
  7 | 	"fmt"
  8 | 	"log"
  9 | 	"math"
 10 | 	"math/big"
 11 | 	"math/rand"
 12 | 	"os"
 13 | 	"sort"
 14 | 	"text/template"
 15 | 
 16 | 	"github.com/pkg/errors"
 17 | )
 18 | 
 19 | func main() {
 20 | 	var out string
 21 | 	flag.StringVar(&out, "o", "extra_test.go", "test cases output path")
 22 | 	flag.Parse()
 23 | 	if err := dumpTest(out); err != nil {
 24 | 		log.Fatalf("%+v", err)
 25 | 	}
 26 | }
 27 | 
 28 | func dumpTest(path string) error {
 29 | 	f, err := os.Create(path)
 30 | 	if err != nil {
 31 | 		return errors.WithStack(err)
 32 | 	}
 33 | 	defer f.Close()
 34 | 	t, err := template.ParseFiles("extra_test.tmpl")
 35 | 	if err != nil {
 36 | 		return errors.WithStack(err)
 37 | 	}
 38 | 	// Use deterministic source for pseudo-random nunmbers.
 39 | 	rand.Seed(1234)
 40 | 	// Randomize the exponent since the number of exponent, mantissa combinations
 41 | 	// otherwise become huge.
 42 | 	var exps []int
 43 | 	const nrandExps = 64
 44 | 	for i := 0; i < nrandExps; i++ {
 45 | 		// exponent bits: 0x0001 - 0x7FFE
 46 | 		exp := rand.Intn(0x7FFE) + 1
 47 | 		exps = append(exps, exp)
 48 | 	}
 49 | 	sort.Ints(exps)
 50 | 	// Randomize the mantissa since we cannot check 112 (48 + 64) bits
 51 | 	// exhaustively.
 52 | 	var mants []MantBits
 53 | 	const nrandMants = 512
 54 | 	for i := 0; i < nrandMants; i++ {
 55 | 		// 48 bits.
 56 | 		a := rand.Uint64() & 0xFFFFFFFFFFFF
 57 | 		// 64 bits.
 58 | 		b := rand.Uint64()
 59 | 		mant := MantBits{a: a, b: b}
 60 | 		mants = append(mants, mant)
 61 | 	}
 62 | 	sort.Slice(mants, func(i, j int) bool {
 63 | 		if mants[i].a < mants[j].a {
 64 | 			return true
 65 | 		}
 66 | 		return mants[i].b < mants[j].b
 67 | 	})
 68 | 	data := map[string][]string{
 69 | 		"normalized":   getNormalized(exps, mants),
 70 | 		"denormalized": getDenormalized(mants),
 71 | 	}
 72 | 	if err := t.Execute(f, data); err != nil {
 73 | 		return errors.WithStack(err)
 74 | 	}
 75 | 	return nil
 76 | }
 77 | 
// Parameters of the 128-bit floating-point format used by the generator.
const (
	// precision specifies the number of bits in the mantissa (including the
	// implicit lead bit): 112 stored bits plus the lead bit.
	precision = 113
	// bias is the exponent bias; it is subtracted from the stored exponent
	// bits to obtain the binary exponent.
	bias = 16383
)
 85 | 
// MantBits holds the 112 mantissa bits of a test value, split into two words.
type MantBits struct {
	// a holds the upper 48 bits of the mantissa.
	a uint64
	// b holds the lower 64 bits of the mantissa.
	b uint64
}
 92 | 
 93 | func getNormalized(exps []int, mants []MantBits) []string {
 94 | 	var ns []string
 95 | 	// normalized
 96 | 	//
 97 | 	// exponent bits: 0x0001 - 0x7FFE
 98 | 	//
 99 | 	//    (-1)^signbit * 2^(exp-16383) * 1.mant_2
100 | 	const lead = 1
101 | 	for signbit := 0; signbit <= 1; signbit++ {
102 | 		sign := "+"
103 | 		if signbit == 1 {
104 | 			sign = "-"
105 | 		}
106 | 		for _, exp := range exps {
107 | 			exponent := exp - bias
108 | 			// mantissa bits: 112 (48 + 64) bits
109 | 			for _, mantBits := range mants {
110 | 				mant := fmt.Sprintf("%048b%064b", mantBits.a, mantBits.b)
111 | 				s := fmt.Sprintf("%s0b%d.%sp%d", sign, lead, mant, exponent)
112 | 				m, _, err := big.ParseFloat(s, 0, precision, big.ToNearestEven)
113 | 				if err != nil {
114 | 					panic(err)
115 | 				}
116 | 				want := m.Text('g', 35)
117 | 				want64, acc64 := m.Float64()
118 | 				a := uint64(signbit) << 63
119 | 				a |= uint64(exp) << 48
120 | 				a |= mantBits.a
121 | 				b := mantBits.b
122 | 				var n string
123 | 				switch {
124 | 				// Compare floating-point bits of want64, as otherwise +0 == -0
125 | 				case math.Float64bits(want64) == math.Float64bits(math.Copysign(0, -1)):
126 | 					// -zero
127 | 					n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Copysign(0, -1), acc64: big.%v}, // %s", a, b, want, acc64, s)
128 | 				case want64 == math.Inf(+1):
129 | 					// +inf
130 | 					n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(+1), acc64: big.%v}, // %s", a, b, want, acc64, s)
131 | 				case want64 == math.Inf(-1):
132 | 					// -inf
133 | 					n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(-1), acc64: big.%v}, // %s", a, b, want, acc64, s)
134 | 				default:
135 | 					n = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: %v, acc64: big.%v}, // %s", a, b, want, want64, acc64, s)
136 | 				}
137 | 				ns = append(ns, n)
138 | 			}
139 | 		}
140 | 	}
141 | 	return ns
142 | }
143 | 
144 | func getDenormalized(mants []MantBits) []string {
145 | 	var ds []string
146 | 	// denormalized
147 | 	//
148 | 	// exponent bits: 0x0000
149 | 	//
150 | 	//    (-1)^signbit * 2^(-14) * 0.mant_2
151 | 	const lead = 0
152 | 	for signbit := 0; signbit <= 1; signbit++ {
153 | 		sign := "+"
154 | 		if signbit == 1 {
155 | 			sign = "-"
156 | 		}
157 | 		const exp = 0x0000
158 | 		exponent := exp - bias + 1
159 | 		// mantissa bits: 112 (48 + 64) bits
160 | 		for _, mantBits := range mants {
161 | 			mant := fmt.Sprintf("%048b%064b", mantBits.a, mantBits.b)
162 | 			s := fmt.Sprintf("%s0b%d.%sp%d", sign, lead, mant, exponent)
163 | 			m, _, err := big.ParseFloat(s, 0, precision, big.ToNearestEven)
164 | 			if err != nil {
165 | 				panic(err)
166 | 			}
167 | 			want := m.Text('g', 35)
168 | 			want64, acc64 := m.Float64()
169 | 			a := uint64(signbit) << 63
170 | 			a |= uint64(exp) << 48
171 | 			a |= mantBits.a
172 | 			b := mantBits.b
173 | 			var d string
174 | 			switch {
175 | 			// Compare floating-point bits of want64, as otherwise +0 == -0
176 | 			case math.Float64bits(want64) == math.Float64bits(math.Copysign(0, -1)):
177 | 				// -zero
178 | 				d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Copysign(0, -1), acc64: big.%v}, // %s", a, b, want, acc64, s)
179 | 			case want64 == math.Inf(+1):
180 | 				// +inf
181 | 				d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(+1), acc64: big.%v}, // %s", a, b, want, acc64, s)
182 | 			case want64 == math.Inf(-1):
183 | 				// -inf
184 | 				d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: math.Inf(-1), acc64: big.%v}, // %s", a, b, want, acc64, s)
185 | 			default:
186 | 				d = fmt.Sprintf("{a: 0x%016X, b: 0x%016X, want: %q, want64: %v, acc64: big.%v}, // %s", a, b, want, want64, acc64, s)
187 | 			}
188 | 			ds = append(ds, d)
189 | 		}
190 | 	}
191 | 	return ds
192 | }
193 | 


--------------------------------------------------------------------------------
/binary16/binary16.go:
--------------------------------------------------------------------------------
  1 | //go:generate go run gen.go -o extra_test.go
  2 | 
  3 | // Package binary16 implements encoding and decoding of IEEE 754 half precision
  4 | // floating-point numbers.
  5 | //
  6 | // https://en.wikipedia.org/wiki/Half-precision_floating-point_format
  7 | package binary16
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"math"
 12 | 	"math/big"
 13 | )
 14 | 
// Parameters of the IEEE 754 half precision format.
const (
	// precision specifies the number of bits in the mantissa (including the
	// implicit lead bit): 10 stored fraction bits plus the lead bit.
	precision = 11
	// bias is the exponent bias; it is subtracted from the stored exponent
	// bits to obtain the binary exponent.
	bias = 15
)
 22 | 
// Positive and negative Not-a-Number, infinity and zero, given by their half
// precision bit patterns (1 sign bit, 5 exponent bits, 10 fraction bits).
var (
	// +NaN: exponent all ones, most significant fraction bit set.
	NaN = Float{bits: 0x7E00}
	// -NaN: as NaN, with the sign bit set.
	NegNaN = Float{bits: 0xFE00}
	// +Inf: exponent all ones, fraction zero.
	Inf = Float{bits: 0x7C00}
	// -Inf: as Inf, with the sign bit set.
	NegInf = Float{bits: 0xFC00}
	// +zero: all bits clear.
	Zero = Float{bits: 0x0000}
	// -zero: only the sign bit set.
	NegZero = Float{bits: 0x8000}
)
 38 | 
// Float is a floating-point number in IEEE 754 half precision format.
type Float struct {
	// Sign, exponent and fraction, from most to least significant bit.
	//
	//    1 bit:   sign
	//    5 bits:  exponent
	//    10 bits: fraction
	bits uint16
}
 48 | 
 49 | // NewFromBits returns the floating-point number corresponding to the IEEE 754
 50 | // half precision binary representation.
 51 | func NewFromBits(bits uint16) Float {
 52 | 	return Float{bits: bits}
 53 | }
 54 | 
 55 | // NewFromFloat32 returns the nearest half precision floating-point number for x
 56 | // and the accuracy of the conversion.
 57 | func NewFromFloat32(x float32) (Float, big.Accuracy) {
 58 | 	f, acc := NewFromFloat64(float64(x))
 59 | 	if acc == big.Exact {
 60 | 		_, acc = f.Float32()
 61 | 	}
 62 | 	return f, acc
 63 | }
 64 | 
 65 | // NewFromFloat64 returns the nearest half precision floating-point number for x
 66 | // and the accuracy of the conversion.
 67 | func NewFromFloat64(x float64) (Float, big.Accuracy) {
 68 | 	// +-NaN
 69 | 	switch {
 70 | 	case math.IsNaN(x):
 71 | 		if math.Signbit(x) {
 72 | 			// -NaN
 73 | 			return NegNaN, big.Exact
 74 | 		}
 75 | 		// +NaN
 76 | 		return NaN, big.Exact
 77 | 	}
 78 | 	y := big.NewFloat(x)
 79 | 	y.SetPrec(precision)
 80 | 	y.SetMode(big.ToNearestEven)
 81 | 	// TODO: check accuracy after setting precision?
 82 | 	return NewFromBig(y)
 83 | }
 84 | 
 85 | // NewFromBig returns the nearest half precision floating-point number for x and
 86 | // the accuracy of the conversion.
 87 | func NewFromBig(x *big.Float) (Float, big.Accuracy) {
 88 | 	// +-Inf
 89 | 	zero := big.NewFloat(0)
 90 | 	switch {
 91 | 	case x.IsInf():
 92 | 		if x.Signbit() {
 93 | 			// -Inf
 94 | 			return NegInf, big.Exact
 95 | 		}
 96 | 		// +Inf
 97 | 		return Inf, big.Exact
 98 | 	// +-zero
 99 | 	case x.Cmp(zero) == 0:
100 | 		if x.Signbit() {
101 | 			// -zero
102 | 			return NegZero, big.Exact
103 | 		}
104 | 		// +zero
105 | 		return Zero, big.Exact
106 | 	}
107 | 
108 | 	// Sign
109 | 	var bits uint16
110 | 	if x.Signbit() {
111 | 		bits |= 0x8000
112 | 	}
113 | 
114 | 	// Exponent and mantissa.
115 | 	mant := new(big.Float)
116 | 	exponent := x.MantExp(mant)
117 | 	// Remove 1 from the exponent as big.Float has an no lead bit.
118 | 	exp := exponent - 1 + bias
119 | 
120 | 	// Handle denormalized values.
121 | 	// TODO: validate implementation of denormalized values.
122 | 	if exp <= 0 {
123 | 		acc := big.Exact
124 | 		if exp <= -(precision - 1) {
125 | 			exp = precision - 1
126 | 			acc = big.Below
127 | 		}
128 | 		mant.SetMantExp(mant, exp+precision-1)
129 | 		if mant.Signbit() {
130 | 			mant.Neg(mant)
131 | 		}
132 | 		mantissa, _ := mant.Uint64()
133 | 		// TODO: calculate acc based on if mantissa&^0x7FF != 0 {}
134 | 		bits |= uint16(mantissa & 0x7FF)
135 | 		return Float{bits: bits}, acc
136 | 	}
137 | 
138 | 	// exponent mask (5 bits): 0b11111
139 | 	acc := big.Exact
140 | 	if (exp &^ 0x1F) != 0 {
141 | 		acc = big.Above
142 | 	}
143 | 	bits |= uint16(exp&0x1F) << 10
144 | 
145 | 	if mant.Signbit() {
146 | 		mant.Neg(mant)
147 | 	}
148 | 	mant.SetMantExp(mant, precision)
149 | 	if !mant.IsInt() {
150 | 		acc = big.Below
151 | 	}
152 | 	mantissa, _ := mant.Uint64()
153 | 	mantissa &^= 0x400 // clear implicit lead bit; 2^10
154 | 
155 | 	// mantissa mask (11 bits, including implicit lead bit): 0b11111111111
156 | 	if acc == big.Exact && (mantissa&^0x7FF) != 0 {
157 | 		acc = big.Below
158 | 	}
159 | 	mantissa &= 0x7FF
160 | 	bits |= uint16(mantissa)
161 | 	return Float{bits: bits}, acc
162 | }
163 | 
164 | // Bits returns the IEEE 754 half precision binary representation of f.
165 | func (f Float) Bits() uint16 {
166 | 	return f.bits
167 | }
168 | 
169 | // Float32 returns the float32 value nearest to f. If f is too small to be
170 | // represented by a float32 (|f| < math.SmallestNonzeroFloat32), the result is
171 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is
172 | // too large to be represented by a float32 (|f| > math.MaxFloat32), the result
173 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f.
174 | func (f Float) Float32() (float32, big.Accuracy) {
175 | 	x, nan := f.Big()
176 | 	if nan {
177 | 		if x.Signbit() {
178 | 			return float32(-math.NaN()), big.Exact
179 | 		}
180 | 		return float32(math.NaN()), big.Exact
181 | 	}
182 | 	return x.Float32()
183 | }
184 | 
185 | // Float64 returns the float64 value nearest to f. If f is too small to be
186 | // represented by a float64 (|f| < math.SmallestNonzeroFloat64), the result is
187 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is
188 | // too large to be represented by a float64 (|f| > math.MaxFloat64), the result
189 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f.
190 | func (f Float) Float64() (float64, big.Accuracy) {
191 | 	x, nan := f.Big()
192 | 	if nan {
193 | 		if x.Signbit() {
194 | 			return -math.NaN(), big.Exact
195 | 		}
196 | 		return math.NaN(), big.Exact
197 | 	}
198 | 	return x.Float64()
199 | }
200 | 
201 | // Big returns the multi-precision floating-point number representation of f and
202 | // a boolean indicating whether f is Not-a-Number.
203 | func (f Float) Big() (x *big.Float, nan bool) {
204 | 	signbit := f.Signbit()
205 | 	exp := f.Exp()
206 | 	frac := f.Frac()
207 | 	x = big.NewFloat(0)
208 | 	x.SetPrec(precision)
209 | 	x.SetMode(big.ToNearestEven)
210 | 
211 | 	// ref: https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Exponent_encoding
212 | 	//
213 | 	// 0b00001 - 0b11110
214 | 	// Normalized number.
215 | 	//
216 | 	//    (-1)^signbit * 2^(exp-15) * 1.mant_2
217 | 	lead := 1
218 | 	exponent := exp - bias
219 | 
220 | 	switch exp {
221 | 	// 0b11111
222 | 	case 0x1F:
223 | 		// Inf or NaN
224 | 		if frac == 0 {
225 | 			// +-Inf
226 | 			x.SetInf(signbit)
227 | 			return x, false
228 | 		}
229 | 		// +-NaN
230 | 		if signbit {
231 | 			x.Neg(x)
232 | 		}
233 | 		return x, true
234 | 	// 0b00000
235 | 	case 0x00:
236 | 		if frac == 0 {
237 | 			// +-Zero
238 | 			if signbit {
239 | 				x.Neg(x)
240 | 			}
241 | 			return x, false
242 | 		}
243 | 		// Denormalized number.
244 | 		//
245 | 		//    (-1)^signbit * 2^(-14) * 0.mant_2
246 | 		lead = 0
247 | 		exponent = -14
248 | 	}
249 | 
250 | 	// number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity .
251 | 	sign := "+"
252 | 	if signbit {
253 | 		sign = "-"
254 | 	}
255 | 	s := fmt.Sprintf("%s0b%d.%010bp%d", sign, lead, frac, exponent)
256 | 	if _, _, err := x.Parse(s, 0); err != nil {
257 | 		panic(err)
258 | 	}
259 | 	return x, false
260 | }
261 | 
262 | // Signbit reports whether f is negative or negative 0.
263 | func (f Float) Signbit() bool {
264 | 	// first bit is sign bit: 0b1000000000000000
265 | 	return f.bits&0x8000 != 0
266 | }
267 | 
268 | // Exp returns the exponent of f.
269 | func (f Float) Exp() int {
270 | 	// 5 bit exponent: 0b0111110000000000
271 | 	return int(f.bits & 0x7C00 >> 10)
272 | }
273 | 
274 | // Frac returns the fraction of f.
275 | func (f Float) Frac() uint16 {
276 | 	// 10 bit mantissa: 0b0000001111111111
277 | 	return f.bits & 0x03FF
278 | }
279 | 


--------------------------------------------------------------------------------
/binary16/binary16_test.go:
--------------------------------------------------------------------------------
  1 | package binary16
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"math/big"
  6 | 	"testing"
  7 | )
  8 | 
// TestNewFromBits decodes half precision bit patterns with NewFromBits and
// compares the float64 value, bit for bit, against the expected number.
// Comparing the bits rather than the values distinguishes +0 from -0 and
// allows NaN to be compared at all.
func TestNewFromBits(t *testing.T) {
	golden := []struct {
		bits uint16
		want float64
	}{
		// Special numbers.
		// 0 11111 1000000000 = +NaN
		{bits: 0x7E00, want: math.NaN()},
		// 1 11111 1000000000 = -NaN
		{bits: 0xFE00, want: -math.NaN()},

		// from: https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Half_precision_examples

		// 0 01111 0000000000 = 1
		{bits: 0x3C00, want: 1},
		// 0 01111 0000000001 = 1 + 2^(-10) = 1.0009765625 (next smallest float after 1)
		{bits: 0x3C01, want: 1.0009765625},
		// 1 10000 0000000000 = -2
		{bits: 0xC000, want: -2},
		// 0 11110 1111111111 = 65504 (max half precision)
		{bits: 0x7BFF, want: 65504},
		// 0 00001 0000000000 = 2^(-14) ~= 6.10352 * 10^(-5) (minimum positive normal)
		{bits: 0x0400, want: math.Pow(2, -14)},
		// 0 00000 0000000001 = 2^(-24) ~= 5.96046 * 10^(-8) (minimum positive subnormal)
		{bits: 0x0001, want: math.Pow(2, -24)},
		// 0 00000 0000000000 = 0
		{bits: 0x0000, want: 0},
		// 1 00000 0000000000 = −0
		{bits: 0x8000, want: math.Copysign(0, -1)},
		// 0 11111 0000000000 = infinity
		{bits: 0x7C00, want: math.Inf(1)},
		// 1 11111 0000000000 = -infinity
		{bits: 0xFC00, want: math.Inf(-1)},
		// 0 01101 0101010101 = 0.333251953125 ~= 1/3
		{bits: 0x3555, want: 0.333251953125},

		// from: https://reviews.llvm.org/rL237161

		// Normalized numbers.
		// 0 01110 0000000000 = 0.5
		{bits: 0x3800, want: 0.5},
		// 1 01110 0000000000 = -0.5
		{bits: 0xB800, want: -0.5},
		// 0 01111 1000000000 = 1.5
		{bits: 0x3E00, want: 1.5},
		// 1 01111 1000000000 = -1.5
		{bits: 0xBE00, want: -1.5},
		// 0 10000 0100000000 = 2.5
		{bits: 0x4100, want: 2.5},
		// 1 10000 0100000000 = -2.5
		{bits: 0xC100, want: -2.5},
		// Denormalized numbers.
		// 0 00000 0000010000 = 2^(-20)
		{bits: 0x0010, want: math.Pow(2, -20)},
		// 1 00000 0000000001 = -2^(-24)
		{bits: 0x8001, want: -math.Pow(2, -24)},

		// 2^i, for every power of two from the smallest denormalized number
		// (2^-24) up to 2^15.
		{bits: 0x0001, want: math.Pow(2, -24)}, // 2^(-24)
		{bits: 0x0002, want: math.Pow(2, -23)}, // 2^(-23)
		{bits: 0x0004, want: math.Pow(2, -22)}, // 2^(-22)
		{bits: 0x0008, want: math.Pow(2, -21)}, // 2^(-21)
		{bits: 0x0010, want: math.Pow(2, -20)}, // 2^(-20)
		{bits: 0x0020, want: math.Pow(2, -19)}, // 2^(-19)
		{bits: 0x0040, want: math.Pow(2, -18)}, // 2^(-18)
		{bits: 0x0080, want: math.Pow(2, -17)}, // 2^(-17)
		{bits: 0x0100, want: math.Pow(2, -16)}, // 2^(-16)
		{bits: 0x0200, want: math.Pow(2, -15)}, // 2^(-15)
		{bits: 0x0400, want: math.Pow(2, -14)}, // 2^(-14)
		{bits: 0x0800, want: math.Pow(2, -13)}, // 2^(-13)
		{bits: 0x0C00, want: math.Pow(2, -12)}, // 2^(-12)
		{bits: 0x1000, want: math.Pow(2, -11)}, // 2^(-11)
		{bits: 0x1400, want: math.Pow(2, -10)}, // 2^(-10)
		{bits: 0x1800, want: math.Pow(2, -9)},  // 2^(-9)
		{bits: 0x1C00, want: math.Pow(2, -8)},  // 2^(-8)
		{bits: 0x2000, want: math.Pow(2, -7)},  // 2^(-7)
		{bits: 0x2400, want: math.Pow(2, -6)},  // 2^(-6)
		{bits: 0x2800, want: math.Pow(2, -5)},  // 2^(-5)
		{bits: 0x2C00, want: math.Pow(2, -4)},  // 2^(-4)
		{bits: 0x3000, want: math.Pow(2, -3)},  // 2^(-3)
		{bits: 0x3400, want: math.Pow(2, -2)},  // 2^(-2)
		{bits: 0x3800, want: math.Pow(2, -1)},  // 2^(-1)
		{bits: 0x3C00, want: math.Pow(2, 0)},   // 2^0
		{bits: 0x4000, want: math.Pow(2, 1)},   // 2^1
		{bits: 0x4400, want: math.Pow(2, 2)},   // 2^2
		{bits: 0x4800, want: math.Pow(2, 3)},   // 2^3
		{bits: 0x4C00, want: math.Pow(2, 4)},   // 2^4
		{bits: 0x5000, want: math.Pow(2, 5)},   // 2^5
		{bits: 0x5400, want: math.Pow(2, 6)},   // 2^6
		{bits: 0x5800, want: math.Pow(2, 7)},   // 2^7
		{bits: 0x5C00, want: math.Pow(2, 8)},   // 2^8
		{bits: 0x6000, want: math.Pow(2, 9)},   // 2^9
		{bits: 0x6400, want: math.Pow(2, 10)},  // 2^10
		{bits: 0x6800, want: math.Pow(2, 11)},  // 2^11
		{bits: 0x6C00, want: math.Pow(2, 12)},  // 2^12
		{bits: 0x7000, want: math.Pow(2, 13)},  // 2^13
		{bits: 0x7400, want: math.Pow(2, 14)},  // 2^14
		{bits: 0x7800, want: math.Pow(2, 15)},  // 2^15
	}
	for _, g := range golden {
		f := NewFromBits(g.bits)
		got, _ := f.Float64()
		// Compare the IEEE 754 bit patterns instead of the values.
		wantBits := math.Float64bits(g.want)
		gotBits := math.Float64bits(got)
		//fmt.Printf("bits: 0x%04X (%v)\n", g.bits, g.want)
		if wantBits != gotBits {
			t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got)
		}
	}
}
120 | 
// TestNewFromFloat64 converts float64 values with NewFromFloat64 and checks
// both the resulting half precision bit pattern and the reported accuracy.
func TestNewFromFloat64(t *testing.T) {
	golden := []struct {
		in   float64
		want uint16
		acc  big.Accuracy
	}{
		// Special numbers.
		// 0 11111 1000000000 = +NaN
		{in: math.NaN(), want: 0x7E00, acc: big.Exact},
		// 1 11111 1000000000 = -NaN
		{in: -math.NaN(), want: 0xFE00, acc: big.Exact},

		// from: https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Half_precision_examples

		// 0 01111 0000000000 = 1
		{in: 1, want: 0x3C00, acc: big.Exact},
		// 0 01111 0000000001 = 1 + 2^(-10) = 1.0009765625 (next smallest float after 1)
		{in: 1.0009765625, want: 0x3C01, acc: big.Exact},
		// 1 10000 0000000000 = -2
		{in: -2, want: 0xC000, acc: big.Exact},
		// 0 11110 1111111111 = 65504 (max half precision)
		{in: 65504, want: 0x7BFF, acc: big.Exact},
		// 0 00001 0000000000 = 2^(-14) ~= 6.10352 * 10^(-5) (minimum positive normal)
		{in: math.Pow(2, -14), want: 0x0400, acc: big.Exact},
		// 0 00000 0000000001 = 2^(-24) ~= 5.96046 * 10^(-8) (minimum positive subnormal)
		{in: math.Pow(2, -24), want: 0x0001, acc: big.Exact},
		// 0 00000 0000000000 = 0
		{in: 0, want: 0x0000, acc: big.Exact},
		// 1 00000 0000000000 = −0
		{in: math.Copysign(0, -1), want: 0x8000, acc: big.Exact},
		// 0 11111 0000000000 = infinity
		{in: math.Inf(1), want: 0x7C00, acc: big.Exact},
		// 1 11111 0000000000 = -infinity
		{in: math.Inf(-1), want: 0xFC00, acc: big.Exact},
		// 0 01101 0101010101 = 0.333251953125 ~= 1/3
		{in: 0.333251953125, want: 0x3555, acc: big.Exact},

		// from: https://reviews.llvm.org/rL237161

		// Normalized numbers.
		// 0 01110 0000000000 = 0.5
		{in: 0.5, want: 0x3800, acc: big.Exact},
		// 1 01110 0000000000 = -0.5
		{in: -0.5, want: 0xB800, acc: big.Exact},
		// 0 01111 1000000000 = 1.5
		{in: 1.5, want: 0x3E00, acc: big.Exact},
		// 1 01111 1000000000 = -1.5
		{in: -1.5, want: 0xBE00, acc: big.Exact},
		// 0 10000 0100000000 = 2.5
		{in: 2.5, want: 0x4100, acc: big.Exact},
		// 1 10000 0100000000 = -2.5
		{in: -2.5, want: 0xC100, acc: big.Exact},
		// Denormalized numbers.
		// 0 00000 0000010000 = 2^(-20)
		{in: math.Pow(2, -20), want: 0x0010, acc: big.Exact},
		// 1 00000 0000000001 = -2^(-24)
		{in: -math.Pow(2, -24), want: 0x8001, acc: big.Exact},

		// 2^i; 2^(-25) is half of the smallest denormalized number and is
		// expected to become zero, with the result below the input.
		{in: math.Pow(2, -25), want: 0x0000, acc: big.Below}, // 2^(-25)
		{in: math.Pow(2, -24), want: 0x0001, acc: big.Exact}, // 2^(-24)
		{in: math.Pow(2, -23), want: 0x0002, acc: big.Exact}, // 2^(-23)
		{in: math.Pow(2, -22), want: 0x0004, acc: big.Exact}, // 2^(-22)
		{in: math.Pow(2, -21), want: 0x0008, acc: big.Exact}, // 2^(-21)
		{in: math.Pow(2, -20), want: 0x0010, acc: big.Exact}, // 2^(-20)
		{in: math.Pow(2, -19), want: 0x0020, acc: big.Exact}, // 2^(-19)
		{in: math.Pow(2, -18), want: 0x0040, acc: big.Exact}, // 2^(-18)
		{in: math.Pow(2, -17), want: 0x0080, acc: big.Exact}, // 2^(-17)
		{in: math.Pow(2, -16), want: 0x0100, acc: big.Exact}, // 2^(-16)
		{in: math.Pow(2, -15), want: 0x0200, acc: big.Exact}, // 2^(-15)
		{in: math.Pow(2, -14), want: 0x0400, acc: big.Exact}, // 2^(-14)
		{in: math.Pow(2, -13), want: 0x0800, acc: big.Exact}, // 2^(-13)
		{in: math.Pow(2, -12), want: 0x0C00, acc: big.Exact}, // 2^(-12)
		{in: math.Pow(2, -11), want: 0x1000, acc: big.Exact}, // 2^(-11)
		{in: math.Pow(2, -10), want: 0x1400, acc: big.Exact}, // 2^(-10)
		{in: math.Pow(2, -9), want: 0x1800, acc: big.Exact},  // 2^(-9)
		{in: math.Pow(2, -8), want: 0x1C00, acc: big.Exact},  // 2^(-8)
		{in: math.Pow(2, -7), want: 0x2000, acc: big.Exact},  // 2^(-7)
		{in: math.Pow(2, -6), want: 0x2400, acc: big.Exact},  // 2^(-6)
		{in: math.Pow(2, -5), want: 0x2800, acc: big.Exact},  // 2^(-5)
		{in: math.Pow(2, -4), want: 0x2C00, acc: big.Exact},  // 2^(-4)
		{in: math.Pow(2, -3), want: 0x3000, acc: big.Exact},  // 2^(-3)
		{in: math.Pow(2, -2), want: 0x3400, acc: big.Exact},  // 2^(-2)
		{in: math.Pow(2, -1), want: 0x3800, acc: big.Exact},  // 2^(-1)
		{in: math.Pow(2, 0), want: 0x3C00, acc: big.Exact},   // 2^0
		{in: math.Pow(2, 1), want: 0x4000, acc: big.Exact},   // 2^1
		{in: math.Pow(2, 2), want: 0x4400, acc: big.Exact},   // 2^2
		{in: math.Pow(2, 3), want: 0x4800, acc: big.Exact},   // 2^3
		{in: math.Pow(2, 4), want: 0x4C00, acc: big.Exact},   // 2^4
		{in: math.Pow(2, 5), want: 0x5000, acc: big.Exact},   // 2^5
		{in: math.Pow(2, 6), want: 0x5400, acc: big.Exact},   // 2^6
		{in: math.Pow(2, 7), want: 0x5800, acc: big.Exact},   // 2^7
		{in: math.Pow(2, 8), want: 0x5C00, acc: big.Exact},   // 2^8
		{in: math.Pow(2, 9), want: 0x6000, acc: big.Exact},   // 2^9
		{in: math.Pow(2, 10), want: 0x6400, acc: big.Exact},  // 2^10
		{in: math.Pow(2, 11), want: 0x6800, acc: big.Exact},  // 2^11
		{in: math.Pow(2, 12), want: 0x6C00, acc: big.Exact},  // 2^12
		{in: math.Pow(2, 13), want: 0x7000, acc: big.Exact},  // 2^13
		{in: math.Pow(2, 14), want: 0x7400, acc: big.Exact},  // 2^14
		{in: math.Pow(2, 15), want: 0x7800, acc: big.Exact},  // 2^15
	}
	for _, g := range golden {
		f, acc := NewFromFloat64(g.in)
		got := f.Bits()
		// x is only used to make the failure messages readable.
		x, _ := f.Float64()
		if g.want != got {
			t.Errorf("bits mismatch; expected 0x%04X (%v), got 0x%04X (%v)", g.want, g.in, got, x)
		}
		if g.acc != acc {
			t.Errorf("accuracy mismatch; expected %v (%v), got %v (%v)", g.acc, g.in, acc, x)
		}
	}
}
235 | 


--------------------------------------------------------------------------------
/binary16/extra_test.tmpl:
--------------------------------------------------------------------------------
 1 | // Code generated by go run gen.go; DO NOT EDIT.
 2 | 
 3 | package binary16
 4 | 
 5 | import (
 6 | 	"math"
 7 | 	"testing"
 8 | )
 9 | 
10 | func TestNewFromBitsNormalized(t *testing.T) {
11 | 	for _, g := range goldenNormalized {
12 | 		f := NewFromBits(g.bits)
13 | 		got, _ := f.Float64()
14 | 		wantBits := math.Float64bits(g.want)
15 | 		gotBits := math.Float64bits(got)
16 | 		if wantBits != gotBits {
17 | 			t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got)
18 | 		}
19 | 	}
20 | }
21 | 
22 | func TestNewFromBitsDenormalized(t *testing.T) {
23 | 	for _, g := range goldenDenormalized {
24 | 		f := NewFromBits(g.bits)
25 | 		got, _ := f.Float64()
26 | 		wantBits := math.Float64bits(g.want)
27 | 		gotBits := math.Float64bits(got)
28 | 		if wantBits != gotBits {
29 | 			t.Errorf("0x%04X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.bits, wantBits, g.want, gotBits, got)
30 | 		}
31 | 	}
32 | }
33 | 
34 | func TestNewFromFloat32Normalized(t *testing.T) {
35 | 	for _, g := range goldenNormalized {
36 | 		in := float32(g.want)
37 | 		f, acc := NewFromFloat32(in)
38 | 		_ = acc
39 | 		got := f.Bits()
40 | 		if g.bits != got {
41 | 			t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", in, g.bits, got)
42 | 		}
43 | 	}
44 | }
45 | 
46 | func TestNewFromFloat64Normalized(t *testing.T) {
47 | 	for _, g := range goldenNormalized {
48 | 		f, acc := NewFromFloat64(g.want)
49 | 		_ = acc
50 | 		got := f.Bits()
51 | 		if g.bits != got {
52 | 			t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", g.want, g.bits, got)
53 | 		}
54 | 	}
55 | }
56 | 
57 | func TestNewFromFloat32Denormalized(t *testing.T) {
58 | 	for _, g := range goldenDenormalized {
59 | 		in := float32(g.want)
60 | 		f, acc := NewFromFloat32(in)
61 | 		_ = acc
62 | 		got := f.Bits()
63 | 		if g.bits != got {
64 | 			t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", in, g.bits, got)
65 | 		}
66 | 	}
67 | }
68 | 
69 | func TestNewFromFloat64Denormalized(t *testing.T) {
70 | 	for _, g := range goldenDenormalized {
71 | 		f, acc := NewFromFloat64(g.want)
72 | 		_ = acc
73 | 		got := f.Bits()
74 | 		if g.bits != got {
75 | 			t.Errorf("%v: bits mismatch; expected 0x%04X, got 0x%04X", g.want, g.bits, got)
76 | 		}
77 | 	}
78 | }
79 | 
// goldenNormalized holds the generated test cases for normalized values. The
// entries are emitted by the template from the "normalized" data key, one
// pre-formatted struct literal per line.
var goldenNormalized = []struct {
	// bits is the half precision bit pattern.
	bits uint16
	// want is the value encoded by bits.
	want float64
}{
	// Normalized values.
	{{- range .normalized }}
	{{ . }}
	{{- end }}
}
89 | 
// goldenDenormalized holds the generated test cases for denormalized values.
// The entries are emitted by the template from the "denormalized" data key,
// one pre-formatted struct literal per line.
var goldenDenormalized = []struct {
	// bits is the half precision bit pattern.
	bits uint16
	// want is the value encoded by bits.
	want float64
}{
	// Denormalized values.
	{{- range .denormalized }}
	{{ . }}
	{{- end }}
}
99 | 


--------------------------------------------------------------------------------
/binary16/gen.go:
--------------------------------------------------------------------------------
  1 | //+build ignore
  2 | 
  3 | package main
  4 | 
  5 | import (
  6 | 	"flag"
  7 | 	"fmt"
  8 | 	"log"
  9 | 	"math"
 10 | 	"math/big"
 11 | 	"os"
 12 | 	"text/template"
 13 | 
 14 | 	"github.com/pkg/errors"
 15 | )
 16 | 
 17 | func main() {
 18 | 	var out string
 19 | 	flag.StringVar(&out, "o", "extra_test.go", "test cases output path")
 20 | 	flag.Parse()
 21 | 	if err := dumpTest(out); err != nil {
 22 | 		log.Fatalf("%+v", err)
 23 | 	}
 24 | }
 25 | 
 26 | func dumpTest(path string) error {
 27 | 	f, err := os.Create(path)
 28 | 	if err != nil {
 29 | 		return errors.WithStack(err)
 30 | 	}
 31 | 	defer f.Close()
 32 | 	t, err := template.ParseFiles("extra_test.tmpl")
 33 | 	if err != nil {
 34 | 		return errors.WithStack(err)
 35 | 	}
 36 | 	data := map[string][]string{
 37 | 		"normalized":   getNormalized(),
 38 | 		"denormalized": getDenormalized(),
 39 | 	}
 40 | 	if err := t.Execute(f, data); err != nil {
 41 | 		return errors.WithStack(err)
 42 | 	}
 43 | 	return nil
 44 | }
 45 | 
// bias is the exponent bias of the half precision format; it is subtracted
// from the stored exponent bits to obtain the binary exponent.
const bias = 15
 48 | 
 49 | func getNormalized() []string {
 50 | 	var ns []string
 51 | 	// normalized
 52 | 	//
 53 | 	// exponent bits: 0b00001 - 0b11110
 54 | 	//
 55 | 	//    (-1)^signbit * 2^(exp-15) * 1.mant_2
 56 | 	const lead = 1
 57 | 	for signbit := 0; signbit <= 1; signbit++ {
 58 | 		for exp := 1; exp <= 0x1E; exp++ {
 59 | 			// mantissa bits: 0b0000000000 - 0b1111111111
 60 | 			for mant := 0; mant <= 0x3FF; mant++ {
 61 | 				s := fmt.Sprintf("%s0b%d.%010bp0", "+", lead, mant)
 62 | 				m, _, err := big.ParseFloat(s, 0, 53, big.ToNearestEven)
 63 | 				if err != nil {
 64 | 					panic(err)
 65 | 				}
 66 | 				mantissa, acc := m.Float64()
 67 | 				if acc != big.Exact {
 68 | 					panic("not exact")
 69 | 				}
 70 | 				want := math.Pow(-1, float64(signbit)) * math.Pow(2, float64(exp)-bias) * mantissa
 71 | 				bits := uint16(signbit) << 15
 72 | 				bits |= uint16(exp) << 10
 73 | 				bits |= uint16(mant)
 74 | 				n := fmt.Sprintf("{bits: 0x%04X, want: %v}, // %s", bits, want, s)
 75 | 				ns = append(ns, n)
 76 | 			}
 77 | 		}
 78 | 	}
 79 | 	return ns
 80 | }
 81 | 
 82 | func getDenormalized() []string {
 83 | 	var ds []string
 84 | 	// denormalized
 85 | 	//
 86 | 	// exponent bits: 0b00000
 87 | 	//
 88 | 	//    (-1)^signbit * 2^(-14) * 0.mant_2
 89 | 	const lead = 0
 90 | 	for signbit := 0; signbit <= 1; signbit++ {
 91 | 		// mantissa bits: 0b0000000000 - 0b1111111111
 92 | 		const exp = 0
 93 | 		for mant := 0; mant <= 0x3FF; mant++ {
 94 | 			s := fmt.Sprintf("%s0b%d.%010bp0", "+", lead, mant)
 95 | 			m, _, err := big.ParseFloat(s, 0, 53, big.ToNearestEven)
 96 | 			if err != nil {
 97 | 				panic(err)
 98 | 			}
 99 | 			mantissa, acc := m.Float64()
100 | 			if acc != big.Exact {
101 | 				panic("not exact")
102 | 			}
103 | 			want := math.Pow(-1, float64(signbit)) * math.Pow(2, exp-bias+1) * mantissa
104 | 			bits := uint16(signbit) << 15
105 | 			bits |= uint16(exp) << 10
106 | 			bits |= uint16(mant)
107 | 			if bits == 0x8000 {
108 | 				// -zero
109 | 				d := fmt.Sprintf("{bits: 0x%04X, want: math.Copysign(0, -1)}, // %s", bits, s)
110 | 				ds = append(ds, d)
111 | 			} else {
112 | 				d := fmt.Sprintf("{bits: 0x%04X, want: %v}, // %s", bits, want, s)
113 | 				ds = append(ds, d)
114 | 			}
115 | 		}
116 | 	}
117 | 	return ds
118 | }
119 | 


--------------------------------------------------------------------------------
/binary16/testdata/Makefile:
--------------------------------------------------------------------------------
 1 | C=$(wildcard *.c)
 2 | LL=$(C:.c=.ll)
 3 | 
 4 | all: $(LL)
 5 | 
 6 | %.ll: %.c sar
 7 | 	clang -S -emit-llvm -o $@ $<
 8 | 	sar -i "(?m:^[^@\n][^\n]*[\n])" "" $@
 9 | 	sar -i ", align[^\n]*" "" $@
10 | 	sar -i "[\n]+" "\n" $@
11 | 	sar -i "^[\n]+" "" $@
12 | 
13 | sar:
14 | 	@if ! which $@ &> /dev/null ; then \
15 | 		echo "Please install the \"sar\" tool"; \
16 | 		echo; \
17 | 		echo "   go get -u github.com/mewkiz/cmd/sar"; \
18 | 		echo; \
19 | 		exit 1; \
20 | 	fi
21 | 
22 | .PHONY: all
23 | 


--------------------------------------------------------------------------------
/binary16/testdata/binary16.c:
--------------------------------------------------------------------------------
/*
 * Half-precision (_Float16) global constants used as test data: NaN,
 * infinity, zero and a few finite values, each with both signs.
 *
 * The Makefile in this directory compiles this file to LLVM IR
 * (binary16.ll) with clang.
 */
#include <math.h>

/* Special values. */
_Float16 f_pos_nan = +NAN;
_Float16 f_neg_nan = -NAN;
_Float16 f_pos_inf = +INFINITY;
_Float16 f_neg_inf = -INFINITY;
_Float16 f_pos_0 = +0.0;
_Float16 f_neg_0 = -0.0;
/* Finite values. */
_Float16 f_pos_0_dot_5 = +0.5;
_Float16 f_neg_0_dot_5 = -0.5;
_Float16 f_pos_1_dot_5 = +1.5;
_Float16 f_neg_1_dot_5 = -1.5;
_Float16 f_pos_2_dot_5 = +2.5;
_Float16 f_neg_2_dot_5 = -2.5;
/* Rounded to half precision (0xH4248 / 0xHC248 in binary16.ll). */
_Float16 f_pos_3_dot_14 = +3.14;
_Float16 f_neg_3_dot_14 = -3.14;


--------------------------------------------------------------------------------
/binary16/testdata/binary16.ll:
--------------------------------------------------------------------------------
 1 | @f_pos_nan = global half 0xH7E00
 2 | @f_neg_nan = global half 0xHFE00
 3 | @f_pos_inf = global half 0xH7C00
 4 | @f_neg_inf = global half 0xHFC00
 5 | @f_pos_0 = global half 0xH0000
 6 | @f_neg_0 = global half 0xH8000
 7 | @f_pos_0_dot_5 = global half 0xH3800
 8 | @f_neg_0_dot_5 = global half 0xHB800
 9 | @f_pos_1_dot_5 = global half 0xH3E00
10 | @f_neg_1_dot_5 = global half 0xHBE00
11 | @f_pos_2_dot_5 = global half 0xH4100
12 | @f_neg_2_dot_5 = global half 0xHC100
13 | @f_pos_3_dot_14 = global half 0xH4248
14 | @f_neg_3_dot_14 = global half 0xHC248
15 | 


--------------------------------------------------------------------------------
/float.go:
--------------------------------------------------------------------------------
 1 | // Package float implements floating-point representation utility functions.
 2 | package float
 3 | 
 4 | import (
 5 | 	"fmt"
 6 | 	"math/big"
 7 | 	"strings"
 8 | 
 9 | 	"github.com/mewmew/float/internal/strconv"
10 | )
11 | 
12 | // IsExact16 reports whether x may be represented exactly as a 16-bit
13 | // floating-point value.
14 | func IsExact16(x *big.Float) bool {
15 | 	f, acc := x.Float64()
16 | 	if acc != big.Exact {
17 | 		return false
18 | 	}
19 | 	s1 := strconv.FormatFloat(f, 'e', -1, 16)
20 | 	s2 := trimZeros(x.Text('e', 100))
21 | 	return s1 == s2
22 | }
23 | 
24 | // IsExact32 reports whether x may be represented exactly as a 32-bit
25 | // floating-point value.
26 | func IsExact32(x *big.Float) bool {
27 | 	f, acc := x.Float32()
28 | 	if acc != big.Exact {
29 | 		return false
30 | 	}
31 | 	s1 := strconv.FormatFloat(float64(f), 'e', -1, 32)
32 | 	s2 := trimZeros(x.Text('e', 100))
33 | 	return s1 == s2
34 | }
35 | 
36 | // IsExact64 reports whether x may be represented exactly as a 64-bit
37 | // floating-point value.
38 | func IsExact64(x *big.Float) bool {
39 | 	f, acc := x.Float64()
40 | 	if acc != big.Exact {
41 | 		return false
42 | 	}
43 | 	s1 := strconv.FormatFloat(f, 'e', -1, 64)
44 | 	s2 := trimZeros(x.Text('e', 100))
45 | 	return s1 == s2
46 | }
47 | 
// trimZeros trims trailing zeroes after the decimal point in the given
// floating-point value (represented in scientific notation). If all digits
// after the decimal point are trimmed this way, the decimal point is also
// trimmed.
//
// For example "1.5000e+00" becomes "1.5e+00" and "1.000e+00" becomes "1e+00".
//
// Strings that contain no exponent (such as the "+Inf" and "-Inf" produced by
// big.Float.Text for infinities) and mantissas without a decimal point have
// nothing to trim and are returned unchanged; in particular digits of the
// integer part are never removed.
func trimZeros(s string) string {
	epos := strings.Index(s, "e")
	if epos == -1 {
		// Not in scientific notation, e.g. "+Inf".
		return s
	}
	mantissa := s[:epos]
	if !strings.Contains(mantissa, ".") {
		// No fractional part; zeros here belong to the integer part.
		return s
	}
	// The decimal point stops TrimRight, so only fractional zeros are cut.
	mantissa = strings.TrimRight(mantissa, "0")
	mantissa = strings.TrimSuffix(mantissa, ".")
	return fmt.Sprintf("%s%s", mantissa, s[epos:])
}
68 | 


--------------------------------------------------------------------------------
/float128ppc/float128ppc.go:
--------------------------------------------------------------------------------
  1 | // Package float128ppc implements encoding and decoding of double-double
  2 | // floating-point numbers.
  3 | //
  4 | // https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Double-double_arithmetic
  5 | package float128ppc
  6 | 
  7 | import (
  8 | 	"math"
  9 | 	"math/big"
 10 | )
 11 | 
const (
	// precision specifies the number of bits in the mantissa (including the
	// implicit lead bit).
	//
	// It is used as the precision of every big.Float created by this package
	// when converting to and from the double-double representation.
	precision = 106
)
 17 | 
 18 | // Positive and negative Not-a-Number, infinity and zero.
 19 | var (
 20 | 	// +NaN
 21 | 	NaN = Float{high: math.NaN(), low: 0}
 22 | 	// -NaN
 23 | 	NegNaN = Float{high: -math.NaN(), low: 0}
 24 | 	// +Inf
 25 | 	Inf = Float{high: math.Inf(1), low: 0}
 26 | 	// -Inf
 27 | 	NegInf = Float{high: -math.Inf(-1), low: 0}
 28 | 	// +zero
 29 | 	Zero = Float{high: 0, low: 0}
 30 | 	// -zero
 31 | 	NegZero = Float{high: math.Copysign(0, -1), low: 0}
 32 | )
 33 | 
 34 | // Float is a floating-point number in double-double format.
 35 | type Float struct {
 36 | 	// where a long double value is regarded as the exact sum of two double-precision values, giving at least a 106-bit precision
 37 | 	high float64
 38 | 	low  float64
 39 | }
 40 | 
 41 | // NewFromBits returns the floating-point number corresponding to the
 42 | // double-double representation.
 43 | func NewFromBits(a, b uint64) Float {
 44 | 	high := math.Float64frombits(a)
 45 | 	low := math.Float64frombits(b)
 46 | 	return Float{
 47 | 		high: high,
 48 | 		low:  low,
 49 | 	}
 50 | }
 51 | 
 52 | // NewFromFloat32 returns the nearest double-double precision floating-point
 53 | // number for x and the accuracy of the conversion.
 54 | func NewFromFloat32(x float32) (Float, big.Accuracy) {
 55 | 	f, acc := NewFromFloat64(float64(x))
 56 | 	if acc == big.Exact {
 57 | 		_, acc = f.Float32()
 58 | 	}
 59 | 	return f, acc
 60 | }
 61 | 
 62 | // NewFromFloat64 returns the nearest double-double precision floating-point
 63 | // number for x and the accuracy of the conversion.
 64 | func NewFromFloat64(x float64) (Float, big.Accuracy) {
 65 | 	// +-NaN
 66 | 	switch {
 67 | 	case math.IsNaN(x):
 68 | 		if math.Signbit(x) {
 69 | 			// -NaN
 70 | 			return NegNaN, big.Exact
 71 | 		}
 72 | 		// +NaN
 73 | 		return NaN, big.Exact
 74 | 	}
 75 | 	r := Float{high: x, low: 0}
 76 | 	br, _ := r.Big()
 77 | 	return r, br.Acc()
 78 | }
 79 | 
 80 | // NewFromBig returns the nearest double-double floating-point number for x and
 81 | // the accuracy of the conversion.
 82 | func NewFromBig(x *big.Float) (Float, big.Accuracy) {
 83 | 	// +-Inf
 84 | 	zero := big.NewFloat(0).SetPrec(precision)
 85 | 	switch {
 86 | 	case x.IsInf():
 87 | 		if x.Signbit() {
 88 | 			// -Inf
 89 | 			return NegInf, big.Exact
 90 | 		}
 91 | 		// +Inf
 92 | 		return Inf, big.Exact
 93 | 	// +-zero
 94 | 	case x.Cmp(zero) == 0:
 95 | 		if x.Signbit() {
 96 | 			// -zero
 97 | 			return NegZero, big.Exact
 98 | 		}
 99 | 		// +zero
100 | 		return Zero, big.Exact
101 | 	}
102 | 
103 | 	// set precision of x.
104 | 	x.SetPrec(precision).SetMode(big.ToNearestEven)
105 | 
106 | 	// get high part of the double-double floating-point value.
107 | 	high, _ := x.Float64()
108 | 	h := big.NewFloat(high).SetPrec(precision).SetMode(big.ToNearestEven)
109 | 
110 | 	// compute low part by subtracting high from x.
111 | 	l := big.NewFloat(0).SetPrec(precision).SetMode(big.ToNearestEven)
112 | 	l.Sub(x, h)
113 | 	low, _ := l.Float64()
114 | 
115 | 	// check accuracy of results.
116 | 	result := big.NewFloat(0).SetPrec(precision).SetMode(big.ToNearestEven)
117 | 	result.Add(h, l)
118 | 	acc := big.Accuracy(x.Cmp(result))
119 | 
120 | 	return Float{high: high, low: low}, acc
121 | }
122 | 
123 | // Bits returns the double-double binary representation of f.
124 | func (f Float) Bits() (a, b uint64) {
125 | 	return math.Float64bits(f.high), math.Float64bits(f.low)
126 | }
127 | 
128 | // Float32 returns the float32 representation of f.
129 | func (f Float) Float32() (float32, big.Accuracy) {
130 | 	x, nan := f.Big()
131 | 	if nan {
132 | 		if x.Signbit() {
133 | 			return float32(-math.NaN()), big.Exact
134 | 		}
135 | 		return float32(math.NaN()), big.Exact
136 | 	}
137 | 	return x.Float32()
138 | }
139 | 
140 | // Float64 returns the float64 representation of f.
141 | func (f Float) Float64() (float64, big.Accuracy) {
142 | 	x, nan := f.Big()
143 | 	if nan {
144 | 		if x.Signbit() {
145 | 			return -math.NaN(), big.Exact
146 | 		}
147 | 		return math.NaN(), big.Exact
148 | 	}
149 | 	return x.Float64()
150 | }
151 | 
152 | // Big returns the multi-precision floating-point number representation of f and
153 | // a boolean indicating whether f is Not-a-Number.
154 | func (f Float) Big() (x *big.Float, nan bool) {
155 | 	x = big.NewFloat(0)
156 | 	x.SetPrec(precision)
157 | 	x.SetMode(big.ToNearestEven)
158 | 	if f.IsNaN() {
159 | 		return x, true
160 | 	}
161 | 	h := big.NewFloat(f.high).SetPrec(precision)
162 | 	l := big.NewFloat(f.low).SetPrec(precision)
163 | 	x.Add(h, l)
164 | 
165 | 	zero := big.NewFloat(0).SetPrec(precision)
166 | 	if x.Cmp(zero) == 0 && math.Signbit(f.high) {
167 | 		// -zero
168 | 		if !x.Signbit() {
169 | 			x.Neg(x)
170 | 		}
171 | 	}
172 | 
173 | 	return x, false
174 | }
175 | 
176 | // IsNaN returns true if the Float is NaN
177 | func (f Float) IsNaN() bool {
178 | 	// NaN + NaN should be NaN in consideration
179 | 	return math.IsNaN(f.high) || math.IsNaN(f.low)
180 | }
181 | 


--------------------------------------------------------------------------------
/float128ppc/float128ppc_test.go:
--------------------------------------------------------------------------------
 1 | package float128ppc
 2 | 
 3 | import (
 4 | 	"math/big"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestRoundTrip(t *testing.T) {
 9 | 	golden := []struct {
10 | 		h, l uint64
11 | 	}{
12 | 		{h: 0x0000000000000000, l: 0x0000000000000000}, // "0xM00000000000000000000000000000000"
13 | 		{h: 0x3DF0000000000000, l: 0x0000000000000000}, // "0xM3DF00000000000000000000000000000"
14 | 		{h: 0x3FF0000000000000, l: 0x0000000000000000}, // "0xM3FF00000000000000000000000000000"
15 | 		{h: 0x4000000000000000, l: 0x0000000000000000}, // "0xM40000000000000000000000000000000"
16 | 		{h: 0x400C000000000030, l: 0x0000000010000000}, // "0xM400C0000000000300000000010000000"
17 | 		{h: 0x400F000000000000, l: 0xBCB0000000000000}, // "0xM400F000000000000BCB0000000000000"
18 | 		{h: 0x403B000000000000, l: 0x0000000000000000}, // "0xM403B0000000000000000000000000000"
19 | 		{h: 0x405EDA5E353F7CEE, l: 0x0000000000000000}, // "0xM405EDA5E353F7CEE0000000000000000"
20 | 		{h: 0x4093B40000000000, l: 0x0000000000000000}, // "0xM4093B400000000000000000000000000"
21 | 		{h: 0x41F0000000000000, l: 0x0000000000000000}, // "0xM41F00000000000000000000000000000"
22 | 		{h: 0x4D436562A0416DE0, l: 0x0000000000000000}, // "0xM4D436562A0416DE00000000000000000"
23 | 		{h: 0x8000000000000000, l: 0x0000000000000000}, // "0xM80000000000000000000000000000000"
24 | 		{h: 0x818F2887B9295809, l: 0x800000000032D000}, // "0xM818F2887B9295809800000000032D000"
25 | 		{h: 0xC00547AE147AE148, l: 0x3CA47AE147AE147A}, // "0xMC00547AE147AE1483CA47AE147AE147A"
26 | 	}
27 | 	for _, g := range golden {
28 | 		f1 := NewFromBits(g.h, g.l)
29 | 		fbig, nan := f1.Big()
30 | 		_ = nan
31 | 		f2, acc := NewFromBig(fbig)
32 | 		_ = acc
33 | 		h, l := f2.Bits()
34 | 		if g.h != h {
35 | 			t.Errorf("0xM%016X%016X; high mismatch; expected 0x%016X, got 0x%016X", g.h, g.l, g.h, h)
36 | 		}
37 | 		if g.l != l {
38 | 			t.Errorf("0xM%016X%016X; low mismatch; expected 0x%016X, got 0x%016X", g.h, g.l, g.l, l)
39 | 		}
40 | 		if acc != big.Exact {
41 | 			t.Errorf("0xM%016X%016X; round-trip result accuracy inexact; expected %v, got %v", g.h, g.l, big.Exact, acc)
42 | 		}
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/float80x86/float80x86.go:
--------------------------------------------------------------------------------
  1 | //go:generate go run gen.go -o extra_test.go
  2 | 
  3 | // Package float80x86 implements encoding and decoding of x86 extended precision
  4 | // floating-point numbers.
  5 | //
  6 | // https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format
  7 | package float80x86
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"math"
 12 | 	"math/big"
 13 | )
 14 | 
const (
	// precision specifies the number of bits in the mantissa (including the
	// explicit lead bit).
	//
	// That is, 1 integer bit plus 63 fraction bits.
	precision = 64
	// exponent bias.
	//
	// The stored exponent is the true exponent plus bias; a normalized number
	// has the value (-1)^signbit * 2^(exp-bias) * 1.mant_2.
	bias = 16383
)
 22 | 
// Positive and negative Not-a-Number, infinity and zero.
//
// In each value, se holds the sign bit and the 15-bit exponent, and m holds
// the explicit integer bit followed by the 63-bit fraction.
var (
	// +NaN
	//
	// Exponent all ones, integer bit set, fraction non-zero.
	NaN = Float{se: 0x7FFF, m: 0xBFFFFFFFFFFFFFFF}
	// -NaN
	//
	// Same as NaN with the sign bit set.
	NegNaN = Float{se: 0xFFFF, m: 0xBFFFFFFFFFFFFFFF}
	// +Inf
	//
	// Exponent all ones, integer bit set, fraction zero.
	Inf = Float{se: 0x7FFF, m: 0x8000000000000000}
	// -Inf
	//
	// Same as Inf with the sign bit set.
	NegInf = Float{se: 0xFFFF, m: 0x8000000000000000}
	// +zero
	//
	// All bits zero.
	Zero = Float{se: 0x0000, m: 0x0000000000000000}
	// -zero
	//
	// Only the sign bit set.
	NegZero = Float{se: 0x8000, m: 0x0000000000000000}
)
 38 | 
// Float is a floating-point number in x86 extended precision format.
//
// The 80 bits of the format are split over two fields: the upper 16 bits
// (sign and exponent) and the lower 64 bits (integer part and fraction).
type Float struct {
	// Sign and exponent.
	//
	//    1 bit:   sign
	//    15 bits: exponent
	//
	// The sign is the most significant bit (mask 0x8000); the exponent
	// occupies the remaining bits (mask 0x7FFF).
	se uint16
	// Integer part and fraction.
	//
	//    1 bit:   integer part
	//    63 bits: fraction
	//
	// The integer part is the most significant bit; the fraction occupies the
	// remaining bits (mask 0x7FFFFFFFFFFFFFFF).
	m uint64
}
 52 | 
 53 | // NewFromBits returns the floating-point number corresponding to the x86
 54 | // extended precision representation.
 55 | func NewFromBits(se uint16, m uint64) Float {
 56 | 	return Float{se: se, m: m}
 57 | }
 58 | 
 59 | // NewFromFloat32 returns the nearest x86 extended precision floating-point
 60 | // number for x and the accuracy of the conversion.
 61 | func NewFromFloat32(x float32) (Float, big.Accuracy) {
 62 | 	f, acc := NewFromFloat64(float64(x))
 63 | 	if acc == big.Exact {
 64 | 		_, acc = f.Float32()
 65 | 	}
 66 | 	return f, acc
 67 | }
 68 | 
 69 | // NewFromFloat64 returns the nearest x86 extended precision floating-point
 70 | // number for x and the accuracy of the conversion.
 71 | func NewFromFloat64(x float64) (Float, big.Accuracy) {
 72 | 	// +-NaN
 73 | 	switch {
 74 | 	case math.IsNaN(x):
 75 | 		if math.Signbit(x) {
 76 | 			// -NaN
 77 | 			//    sign: 1
 78 | 			//    exp:  all ones
 79 | 			//    mant: 10 non-zero
 80 | 			return NegNaN, big.Exact
 81 | 		}
 82 | 		// +NaN
 83 | 		//    sign: 0
 84 | 		//    exp:  all ones
 85 | 		//    mant: 10 non-zero
 86 | 		return NaN, big.Exact
 87 | 	}
 88 | 	y := big.NewFloat(x)
 89 | 	y.SetPrec(precision)
 90 | 	y.SetMode(big.ToNearestEven)
 91 | 	// TODO: check accuracy after setting precision?
 92 | 	return NewFromBig(y)
 93 | }
 94 | 
 95 | // NewFromBig returns the nearest x86 extended precision floating-point number
 96 | // for x and the accuracy of the conversion.
 97 | func NewFromBig(x *big.Float) (Float, big.Accuracy) {
 98 | 	// +-Inf
 99 | 	zero := big.NewFloat(0)
100 | 	switch {
101 | 	case x.IsInf():
102 | 		if x.Signbit() {
103 | 			// -Inf
104 | 			//    sign: 1
105 | 			//    exp:  all ones
106 | 			//    mant: 10 zero
107 | 			return NegInf, big.Exact
108 | 		}
109 | 		// +Inf
110 | 		//    sign: 0
111 | 		//    exp:  all ones
112 | 		//    mant: 10 zero
113 | 		return Inf, big.Exact
114 | 	// +-zero
115 | 	case x.Cmp(zero) == 0:
116 | 		if x.Signbit() {
117 | 			// -zero
118 | 			//    sign: 1
119 | 			//    exp:  zero
120 | 			//    mant: zero
121 | 			return NegZero, big.Exact
122 | 		}
123 | 		// +zero
124 | 		//    sign: 0
125 | 		//    exp:  zero
126 | 		//    mant: zero
127 | 		return Zero, big.Exact
128 | 	}
129 | 
130 | 	// Sign
131 | 	var se uint16
132 | 	if x.Signbit() {
133 | 		se |= 0x8000
134 | 	}
135 | 
136 | 	// Exponent and mantissa.
137 | 	var m uint64
138 | 	mant := &big.Float{}
139 | 	exponent := x.MantExp(mant)
140 | 	// TODO: verify, as float80x86 also has an explicit lead bit.
141 | 	// Remove 1 from the exponent as big.Float has an no lead bit.
142 | 	exp := exponent - 1 + bias
143 | 
144 | 	// Handle denormalized values.
145 | 	// TODO: validate implementation of denormalized values.
146 | 	if exp <= 0 {
147 | 		acc := big.Exact
148 | 		if exp <= -(precision - 1) {
149 | 			exp = precision - 1
150 | 			acc = big.Below
151 | 		}
152 | 		mant.SetMantExp(mant, exp+precision-1)
153 | 		if mant.Signbit() {
154 | 			mant.Neg(mant)
155 | 		}
156 | 		v, _ := mant.Uint64()
157 | 		// TODO: calculate acc based on if v&^0x7FFFFFFFFFFFFFFF != 0 {}
158 | 		m |= v & 0x7FFFFFFFFFFFFFFF
159 | 		return Float{se: se, m: m}, acc
160 | 	}
161 | 
162 | 	// 0b111111111111111
163 | 	acc := big.Exact
164 | 	if (exp &^ 0x7FFF) != 0 {
165 | 		acc = big.Above
166 | 	}
167 | 	se |= uint16(exp & 0x7FFF)
168 | 
169 | 	if mant.Signbit() {
170 | 		mant.Neg(mant)
171 | 	}
172 | 	mant.SetMantExp(mant, precision)
173 | 	if !mant.IsInt() {
174 | 		acc = big.Below
175 | 	}
176 | 	// mantissa, including explicit lead bit
177 | 	mantissa, acc2 := mant.Uint64()
178 | 	if acc == big.Exact {
179 | 		acc = acc2
180 | 	}
181 | 	m |= mantissa
182 | 	return Float{se: se, m: m}, acc
183 | }
184 | 
185 | // Bits returns the x86 extended precision binary representation of f.
186 | func (f Float) Bits() (se uint16, m uint64) {
187 | 	return f.se, f.m
188 | }
189 | 
190 | // Float32 returns the float32 value nearest to f. If f is too small to be
191 | // represented by a float32 (|f| < math.SmallestNonzeroFloat32), the result is
192 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is
193 | // too large to be represented by a float32 (|f| > math.MaxFloat32), the result
194 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f.
195 | func (f Float) Float32() (float32, big.Accuracy) {
196 | 	x, nan := f.Big()
197 | 	if nan {
198 | 		if x.Signbit() {
199 | 			return float32(-math.NaN()), big.Exact
200 | 		}
201 | 		return float32(math.NaN()), big.Exact
202 | 	}
203 | 	return x.Float32()
204 | }
205 | 
206 | // Float64 returns the float64 value nearest to f. If f is too small to be
207 | // represented by a float64 (|f| < math.SmallestNonzeroFloat64), the result is
208 | // (0, Below) or (-0, Above), respectively, depending on the sign of f. If f is
209 | // too large to be represented by a float64 (|f| > math.MaxFloat64), the result
210 | // is (+Inf, Above) or (-Inf, Below), depending on the sign of f.
211 | func (f Float) Float64() (float64, big.Accuracy) {
212 | 	x, nan := f.Big()
213 | 	if nan {
214 | 		if x.Signbit() {
215 | 			return -math.NaN(), big.Exact
216 | 		}
217 | 		return math.NaN(), big.Exact
218 | 	}
219 | 	return x.Float64()
220 | }
221 | 
222 | // Big returns the multi-precision floating-point number representation of f and
223 | // a boolean indicating whether f is Not-a-Number.
224 | func (f Float) Big() (x *big.Float, nan bool) {
225 | 	signbit := f.Signbit()
226 | 	exp := f.Exp()
227 | 	x = big.NewFloat(0)
228 | 	x.SetPrec(precision)
229 | 	x.SetMode(big.ToNearestEven)
230 | 
231 | 	// ref: https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format
232 | 	//
233 | 	// 0b000000000000001 - 0b111111111111110
234 | 	// Normalized number.
235 | 	//
236 | 	//    (-1)^signbit * 2^(exp-16383) * 1.mant_2
237 | 	exponent := exp - bias
238 | 
239 | 	switch exp {
240 | 	// 0b111111111111111
241 | 	case 0x7FFF:
242 | 		// Inf or NaN
243 | 		if f.m == 0x8000000000000000 {
244 | 			// +-Inf
245 | 			//    10 zero
246 | 			x.SetInf(signbit)
247 | 			return x, false
248 | 		}
249 | 		// +-NaN
250 | 		//    10 non-zero
251 | 		if signbit {
252 | 			x.Neg(x)
253 | 		}
254 | 		return x, true
255 | 	// 0b000000000000000
256 | 	case 0x0000:
257 | 		if f.m == 0 {
258 | 			// +-Zero
259 | 			if signbit {
260 | 				x.Neg(x)
261 | 			}
262 | 			return x, false
263 | 		}
264 | 		// Denormalized number.
265 | 		//
266 | 		//    (-1)^signbit * 2^(-16382) * 0.mant_2
267 | 		exponent = -16382
268 | 	}
269 | 
270 | 	// number = [ sign ] [ prefix ] mantissa [ exponent ] | infinity .
271 | 	sign := "+"
272 | 	if signbit {
273 | 		sign = "-"
274 | 	}
275 | 	lead := f.Lead()
276 | 	frac := f.Frac()
277 | 	s := fmt.Sprintf("%s0b%d.%063bp%d", sign, lead, frac, exponent)
278 | 	if _, _, err := x.Parse(s, 0); err != nil {
279 | 		panic(err)
280 | 	}
281 | 	return x, false
282 | }
283 | 
284 | // Signbit reports whether f is negative or negative 0.
285 | func (f Float) Signbit() bool {
286 | 	// 0b1000000000000000
287 | 	return f.se&0x8000 != 0
288 | }
289 | 
290 | // Exp returns the exponent of f.
291 | func (f Float) Exp() int {
292 | 	// 0b0111111111111111
293 | 	return int(f.se & 0x7FFF)
294 | }
295 | 
296 | // Lead returns the explicit lead bit of f.
297 | func (f Float) Lead() int {
298 | 	return int(f.m >> 63)
299 | }
300 | 
301 | // Frac returns the fraction of f.
302 | func (f Float) Frac() uint64 {
303 | 	return f.m & 0x7FFFFFFFFFFFFFFF
304 | }
305 | 


--------------------------------------------------------------------------------
/float80x86/float80x86_test.go:
--------------------------------------------------------------------------------
  1 | package float80x86
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"math/big"
  6 | 	"testing"
  7 | )
  8 | 
  9 | func TestNewFromBits(t *testing.T) {
 10 | 	golden := []struct {
 11 | 		se   uint16
 12 | 		m    uint64
 13 | 		want float64
 14 | 	}{
 15 | 		// Special numbers.
 16 | 		// 0 111111111111111 10 non-zero = +NaN
 17 | 		{se: 0x7FFF, m: 0xBFFFFFFFFFFFFFFF, want: math.NaN()},
 18 | 		// -NaN
 19 | 		// 1 111111111111111 10 non-zero = -NaN
 20 | 		{se: 0xFFFF, m: 0xBFFFFFFFFFFFFFFF, want: -math.NaN()},
 21 | 
 22 | 		// from: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_math.html#960
 23 | 
 24 | 		// 0000 00000000 00000000 = 0.0
 25 | 		{se: 0x0000, m: 0x0000000000000000, want: 0.0},
 26 | 		// 8000 00000000 00000000 = -0.0
 27 | 		{se: 0x8000, m: 0x0000000000000000, want: math.Copysign(0, -1)},
 28 | 		// 3FFF 80000000 00000000 = 1.0
 29 | 		{se: 0x3FFF, m: 0x8000000000000000, want: 1.0},
 30 | 		// 4000 80000000 00000000 = 2.0
 31 | 		{se: 0x4000, m: 0x8000000000000000, want: 2.0},
 32 | 		// 7FFE FFFFFFFF FFFFFFFF = 1.18973149535723176505e+4932 (max normal)
 33 | 		//{se: 0x7FFE, m: 0xFFFFFFFFFFFFFFFF, want: 1.18973149535723176505e+4932},
 34 | 		// 0001 80000000 00000000 = 3.36210314311209350626e-4932 (min positive normal)
 35 | 		//{se: 0x0001, m: 0x8000000000000000, want: 3.36210314311209350626e-4932},
 36 | 		// 0000 7FFFFFFF FFFFFFFF = 3.36210314311209350608e-4932 (max subnormal)
 37 | 		//{se: 0x0000, m: 0x7FFFFFFFFFFFFFFF, want: 3.36210314311209350608e-4932},
 38 | 		// 0000 00000000 00000001 = 3.64519953188247460253e-4951 (min positive subnormal)
 39 | 		//{se: 0x0000, m: 0x0000000000000001, want: 3.64519953188247460253e-4951},
 40 | 		// 7FFF 80000000 00000000 = infinity
 41 | 		{se: 0x7FFF, m: 0x8000000000000000, want: math.Inf(1)},
 42 | 		// FFFF 80000000 00000000 = -infinity
 43 | 		{se: 0xFFFF, m: 0x8000000000000000, want: math.Inf(-1)},
 44 | 
 45 | 		// 2^i
 46 | 		// TODO: add test cases for 2^i
 47 | 	}
 48 | 	for _, g := range golden {
 49 | 		f := NewFromBits(g.se, g.m)
 50 | 		got, _ := f.Float64()
 51 | 		wantBits := math.Float64bits(g.want)
 52 | 		gotBits := math.Float64bits(got)
 53 | 		//fmt.Printf("bits: 0x%04X (%v)\n", g.bits, g.want)
 54 | 		if wantBits != gotBits {
 55 | 			t.Errorf("0x%04X %016X: number mismatch; expected 0x%016X (%v), got 0x%016X (%v)", g.se, g.m, wantBits, g.want, gotBits, got)
 56 | 		}
 57 | 	}
 58 | }
 59 | 
 60 | func TestNewFromFloat64(t *testing.T) {
 61 | 	golden := []struct {
 62 | 		in  float64
 63 | 		se  uint16
 64 | 		m   uint64
 65 | 		acc big.Accuracy
 66 | 	}{
 67 | 		// Special numbers.
 68 | 		// 0 111111111111111 10 non-zero = +NaN
 69 | 		{in: math.NaN(), se: 0x7FFF, m: 0xBFFFFFFFFFFFFFFF, acc: big.Exact},
 70 | 		// -NaN
 71 | 		// 1 111111111111111 10 non-zero = -NaN
 72 | 		{in: -math.NaN(), se: 0xFFFF, m: 0xBFFFFFFFFFFFFFFF, acc: big.Exact},
 73 | 
 74 | 		// from: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_math.html#960
 75 | 
 76 | 		// 0000 00000000 00000000 = 0.0
 77 | 		{in: 0.0, se: 0x0000, m: 0x0000000000000000, acc: big.Exact},
 78 | 		// 8000 00000000 00000000 = -0.0
 79 | 		{in: math.Copysign(0, -1), se: 0x8000, m: 0x0000000000000000, acc: big.Exact},
 80 | 		// 3FFF 80000000 00000000 = 1.0
 81 | 		{in: 1.0, se: 0x3FFF, m: 0x8000000000000000, acc: big.Exact},
 82 | 		// 4000 80000000 00000000 = 2.0
 83 | 		{in: 2.0, se: 0x4000, m: 0x8000000000000000, acc: big.Exact},
 84 | 		// 7FFE FFFFFFFF FFFFFFFF = 1.18973149535723176505e+4932 (max normal)
 85 | 		//{in: 1.18973149535723176505e+4932, se: 0x7FFE, m: 0xFFFFFFFFFFFFFFFF, acc: big.Exact},
 86 | 		// 0001 80000000 00000000 = 3.36210314311209350626e-4932 (min positive normal)
 87 | 		//{in: 3.36210314311209350626e-4932, se: 0x0001, m: 0x8000000000000000, acc: big.Exact},
 88 | 		// 0000 7FFFFFFF FFFFFFFF = 3.36210314311209350608e-4932 (max subnormal)
 89 | 		//{in: 3.36210314311209350608e-4932, se: 0x0000, m: 0x7FFFFFFFFFFFFFFF, acc: big.Exact},
 90 | 		// 0000 00000000 00000001 = 3.64519953188247460253e-4951 (min positive subnormal)
 91 | 		//{in: 3.64519953188247460253e-4951, se: 0x0000, m: 0x0000000000000001, acc: big.Exact},
 92 | 		// 7FFF 80000000 00000000 = infinity
 93 | 		{in: math.Inf(1), se: 0x7FFF, m: 0x8000000000000000, acc: big.Exact},
 94 | 		// FFFF 80000000 00000000 = -infinity
 95 | 		{in: math.Inf(-1), se: 0xFFFF, m: 0x8000000000000000, acc: big.Exact},
 96 | 
 97 | 		// 2^i
 98 | 		// TODO: add test cases for 2^i
 99 | 	}
100 | 	for _, g := range golden {
101 | 		f, acc := NewFromFloat64(g.in)
102 | 		se, m := f.Bits()
103 | 		if g.se != se || g.m != m {
104 | 			x, _ := f.Float64()
105 | 			t.Errorf("bits mismatch; expected 0x%04X %016X (%v), got 0x%04X %016X (%v)", g.se, g.m, g.in, se, m, x)
106 | 		}
107 | 		if g.acc != acc {
108 | 			x, _ := f.Float64()
109 | 			t.Errorf("accuracy mismatch; expected %v (%v), got %v (%v)", g.acc, g.in, acc, x)
110 | 		}
111 | 	}
112 | }
113 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/mewmew/float
2 | 
3 | go 1.13
4 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mewmew/float/4fe539893335ae74f3d0fd5e782763a8e3e565d2/go.sum


--------------------------------------------------------------------------------
/internal/strconv/decimal.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // Multiprecision decimal numbers.
  6 | // For floating-point formatting only; not general purpose.
  7 | // Only operations are assign and (binary) left/right shift.
  8 | // Can do binary floating point in multiprecision decimal precisely
  9 | // because 2 divides 10; cannot do decimal floating point
 10 | // in multiprecision binary precisely.
 11 | 
 12 | package strconv
 13 | 
// decimal is a multiprecision decimal number, intended for floating-point
// formatting only. Digits are stored as ASCII characters ('0' - '9').
type decimal struct {
	d     [800]byte // digits, big-endian representation
	nd    int       // number of digits used
	dp    int       // decimal point
	neg   bool      // negative flag
	trunc bool      // discarded nonzero digits beyond d[:nd]
}
 21 | 
 22 | func (a *decimal) String() string {
 23 | 	n := 10 + a.nd
 24 | 	if a.dp > 0 {
 25 | 		n += a.dp
 26 | 	}
 27 | 	if a.dp < 0 {
 28 | 		n += -a.dp
 29 | 	}
 30 | 
 31 | 	buf := make([]byte, n)
 32 | 	w := 0
 33 | 	switch {
 34 | 	case a.nd == 0:
 35 | 		return "0"
 36 | 
 37 | 	case a.dp <= 0:
 38 | 		// zeros fill space between decimal point and digits
 39 | 		buf[w] = '0'
 40 | 		w++
 41 | 		buf[w] = '.'
 42 | 		w++
 43 | 		w += digitZero(buf[w : w+-a.dp])
 44 | 		w += copy(buf[w:], a.d[0:a.nd])
 45 | 
 46 | 	case a.dp < a.nd:
 47 | 		// decimal point in middle of digits
 48 | 		w += copy(buf[w:], a.d[0:a.dp])
 49 | 		buf[w] = '.'
 50 | 		w++
 51 | 		w += copy(buf[w:], a.d[a.dp:a.nd])
 52 | 
 53 | 	default:
 54 | 		// zeros fill space between digits and decimal point
 55 | 		w += copy(buf[w:], a.d[0:a.nd])
 56 | 		w += digitZero(buf[w : w+a.dp-a.nd])
 57 | 	}
 58 | 	return string(buf[0:w])
 59 | }
 60 | 
 61 | func digitZero(dst []byte) int {
 62 | 	for i := range dst {
 63 | 		dst[i] = '0'
 64 | 	}
 65 | 	return len(dst)
 66 | }
 67 | 
 68 | // trim trailing zeros from number.
 69 | // (They are meaningless; the decimal point is tracked
 70 | // independent of the number of digits.)
 71 | func trim(a *decimal) {
 72 | 	for a.nd > 0 && a.d[a.nd-1] == '0' {
 73 | 		a.nd--
 74 | 	}
 75 | 	if a.nd == 0 {
 76 | 		a.dp = 0
 77 | 	}
 78 | }
 79 | 
 80 | // Assign v to a.
 81 | func (a *decimal) Assign(v uint64) {
 82 | 	var buf [24]byte
 83 | 
 84 | 	// Write reversed decimal in buf.
 85 | 	n := 0
 86 | 	for v > 0 {
 87 | 		v1 := v / 10
 88 | 		v -= 10 * v1
 89 | 		buf[n] = byte(v + '0')
 90 | 		n++
 91 | 		v = v1
 92 | 	}
 93 | 
 94 | 	// Reverse again to produce forward decimal in a.d.
 95 | 	a.nd = 0
 96 | 	for n--; n >= 0; n-- {
 97 | 		a.d[a.nd] = buf[n]
 98 | 		a.nd++
 99 | 	}
100 | 	a.dp = a.nd
101 | 	trim(a)
102 | }
103 | 
// Maximum shift that we can do in one pass without overflow.
// A uint has 32 or 64 bits, and we have to be able to accommodate 9<<k.

// uintSize is the size of a uint in bits: ^uint(0) >> 63 is 1 when uint has
// 64 bits and 0 when it has 32 bits, giving 32<<1 or 32<<0.
const uintSize = 32 << (^uint(0) >> 63)

// maxShift leaves 4 bits of headroom, enough for the digit 9 shifted left by
// maxShift to still fit in a uint.
const maxShift = uintSize - 4
108 | 
// Binary shift right (/ 2) by k bits.  k <= maxShift to avoid overflow.
//
// The digits of a are rewritten in place: digits are read at index r,
// accumulated in n, and the quotient digits n>>k are written at index w. The
// decimal point, digit count and truncation flag of a are updated, and
// trailing zeros are trimmed.
func rightShift(a *decimal, k uint) {
	r := 0 // read pointer
	w := 0 // write pointer

	// Pick up enough leading digits to cover first shift.
	// That is, accumulate digits in n until n>>k is non-zero.
	var n uint
	for ; n>>k == 0; r++ {
		if r >= a.nd {
			// Ran out of digits before n became large enough.
			if n == 0 {
				// a == 0; shouldn't get here, but handle anyway.
				a.nd = 0
				return
			}
			// Continue as if the number were followed by zero digits.
			for n>>k == 0 {
				n *= 10
				r++
			}
			break
		}
		c := uint(a.d[r])
		n = n*10 + c - '0'
	}
	// r digits were consumed to produce the first output digit; move the
	// decimal point accordingly.
	a.dp -= r - 1

	// mask selects the low k bits of n, i.e. the remainder after emitting the
	// quotient digit n>>k.
	var mask uint = (1 << k) - 1

	// Pick up a digit, put down a digit.
	for ; r < a.nd; r++ {
		c := uint(a.d[r])
		dig := n >> k
		n &= mask
		a.d[w] = byte(dig + '0')
		w++
		n = n*10 + c - '0'
	}

	// Put down extra digits.
	// The input is exhausted; keep emitting digits until the remainder is
	// zero.
	for n > 0 {
		dig := n >> k
		n &= mask
		if w < len(a.d) {
			a.d[w] = byte(dig + '0')
			w++
		} else if dig > 0 {
			// No room left in a.d; record that a nonzero digit was dropped.
			a.trunc = true
		}
		n *= 10
	}

	a.nd = w
	trim(a)
}
162 | 
// Cheat sheet for left shift: table indexed by shift count giving
// number of new digits that will be introduced by that shift.
//
// For example, leftcheats[4] = {2, "625"}.  That means that
// if we are shifting by 4 (multiplying by 16), it will add 2 digits
// when the string prefix is "625" through "999", and one fewer digit
// if the string prefix is "000" through "624".
//
// Credit for this trick goes to Ken.

// leftCheat is one entry of the leftcheats table.
type leftCheat struct {
	delta  int    // number of new digits
	cutoff string // one fewer new digit if the leading digits of the number compare less than cutoff
}
177 | 
// leftcheats is indexed by the shift count k: leftcheats[k] describes a
// multiplication by 2^k. The table has entries for k = 0 through 60;
// the trailing comment on each line gives the multiplier 2^k.
var leftcheats = []leftCheat{
	// Leading digits of 1/2^i = 5^i.
	// 5^23 is not an exact 64-bit floating point number,
	// so have to use bc for the math.
	// Go up to 60 to be large enough for 32bit and 64bit platforms.
	/*
		seq 60 | sed 's/^/5^/' | bc |
		awk 'BEGIN{ print "\t{ 0, \"\" }," }
		{
			log2 = log(2)/log(10)
			printf("\t{ %d, \"%s\" },\t// * %d\n",
				int(log2*NR+1), $0, 2**NR)
		}'
	*/
	{0, ""},
	{1, "5"},                                           // * 2
	{1, "25"},                                          // * 4
	{1, "125"},                                         // * 8
	{2, "625"},                                         // * 16
	{2, "3125"},                                        // * 32
	{2, "15625"},                                       // * 64
	{3, "78125"},                                       // * 128
	{3, "390625"},                                      // * 256
	{3, "1953125"},                                     // * 512
	{4, "9765625"},                                     // * 1024
	{4, "48828125"},                                    // * 2048
	{4, "244140625"},                                   // * 4096
	{4, "1220703125"},                                  // * 8192
	{5, "6103515625"},                                  // * 16384
	{5, "30517578125"},                                 // * 32768
	{5, "152587890625"},                                // * 65536
	{6, "762939453125"},                                // * 131072
	{6, "3814697265625"},                               // * 262144
	{6, "19073486328125"},                              // * 524288
	{7, "95367431640625"},                              // * 1048576
	{7, "476837158203125"},                             // * 2097152
	{7, "2384185791015625"},                            // * 4194304
	{7, "11920928955078125"},                           // * 8388608
	{8, "59604644775390625"},                           // * 16777216
	{8, "298023223876953125"},                          // * 33554432
	{8, "1490116119384765625"},                         // * 67108864
	{9, "7450580596923828125"},                         // * 134217728
	{9, "37252902984619140625"},                        // * 268435456
	{9, "186264514923095703125"},                       // * 536870912
	{10, "931322574615478515625"},                      // * 1073741824
	{10, "4656612873077392578125"},                     // * 2147483648
	{10, "23283064365386962890625"},                    // * 4294967296
	{10, "116415321826934814453125"},                   // * 8589934592
	{11, "582076609134674072265625"},                   // * 17179869184
	{11, "2910383045673370361328125"},                  // * 34359738368
	{11, "14551915228366851806640625"},                 // * 68719476736
	{12, "72759576141834259033203125"},                 // * 137438953472
	{12, "363797880709171295166015625"},                // * 274877906944
	{12, "1818989403545856475830078125"},               // * 549755813888
	{13, "9094947017729282379150390625"},               // * 1099511627776
	{13, "45474735088646411895751953125"},              // * 2199023255552
	{13, "227373675443232059478759765625"},             // * 4398046511104
	{13, "1136868377216160297393798828125"},            // * 8796093022208
	{14, "5684341886080801486968994140625"},            // * 17592186044416
	{14, "28421709430404007434844970703125"},           // * 35184372088832
	{14, "142108547152020037174224853515625"},          // * 70368744177664
	{15, "710542735760100185871124267578125"},          // * 140737488355328
	{15, "3552713678800500929355621337890625"},         // * 281474976710656
	{15, "17763568394002504646778106689453125"},        // * 562949953421312
	{16, "88817841970012523233890533447265625"},        // * 1125899906842624
	{16, "444089209850062616169452667236328125"},       // * 2251799813685248
	{16, "2220446049250313080847263336181640625"},      // * 4503599627370496
	{16, "11102230246251565404236316680908203125"},     // * 9007199254740992
	{17, "55511151231257827021181583404541015625"},     // * 18014398509481984
	{17, "277555756156289135105907917022705078125"},    // * 36028797018963968
	{17, "1387778780781445675529539585113525390625"},   // * 72057594037927936
	{18, "6938893903907228377647697925567626953125"},   // * 144115188075855872
	{18, "34694469519536141888238489627838134765625"},  // * 288230376151711744
	{18, "173472347597680709441192448139190673828125"}, // * 576460752303423488
	{19, "867361737988403547205962240695953369140625"}, // * 1152921504606846976
}
254 | 
// prefixIsLessThan reports whether b is lexicographically less than s,
// comparing byte by byte over at most len(s) bytes. If b runs out before
// a difference is found, b counts as less; if the first len(s) bytes of b
// equal s, the result is false.
func prefixIsLessThan(b []byte, s string) bool {
	common := len(s)
	if len(b) < common {
		common = len(b)
	}
	for i := 0; i < common; i++ {
		if b[i] != s[i] {
			return b[i] < s[i]
		}
	}
	// No difference within the shared length: b is less only if it is shorter.
	return len(b) < len(s)
}
267 | 
268 | // Binary shift left (* 2) by k bits.  k <= maxShift to avoid overflow.
269 | func leftShift(a *decimal, k uint) {
270 | 	delta := leftcheats[k].delta
271 | 	if prefixIsLessThan(a.d[0:a.nd], leftcheats[k].cutoff) {
272 | 		delta--
273 | 	}
274 | 
275 | 	r := a.nd         // read index
276 | 	w := a.nd + delta // write index
277 | 
278 | 	// Pick up a digit, put down a digit.
279 | 	var n uint
280 | 	for r--; r >= 0; r-- {
281 | 		n += (uint(a.d[r]) - '0') << k
282 | 		quo := n / 10
283 | 		rem := n - 10*quo
284 | 		w--
285 | 		if w < len(a.d) {
286 | 			a.d[w] = byte(rem + '0')
287 | 		} else if rem != 0 {
288 | 			a.trunc = true
289 | 		}
290 | 		n = quo
291 | 	}
292 | 
293 | 	// Put down extra digits.
294 | 	for n > 0 {
295 | 		quo := n / 10
296 | 		rem := n - 10*quo
297 | 		w--
298 | 		if w < len(a.d) {
299 | 			a.d[w] = byte(rem + '0')
300 | 		} else if rem != 0 {
301 | 			a.trunc = true
302 | 		}
303 | 		n = quo
304 | 	}
305 | 
306 | 	a.nd += delta
307 | 	if a.nd >= len(a.d) {
308 | 		a.nd = len(a.d)
309 | 	}
310 | 	a.dp += delta
311 | 	trim(a)
312 | }
313 | 
314 | // Binary shift left (k > 0) or right (k < 0).
315 | func (a *decimal) Shift(k int) {
316 | 	switch {
317 | 	case a.nd == 0:
318 | 		// nothing to do: a == 0
319 | 	case k > 0:
320 | 		for k > maxShift {
321 | 			leftShift(a, maxShift)
322 | 			k -= maxShift
323 | 		}
324 | 		leftShift(a, uint(k))
325 | 	case k < 0:
326 | 		for k < -maxShift {
327 | 			rightShift(a, maxShift)
328 | 			k += maxShift
329 | 		}
330 | 		rightShift(a, uint(-k))
331 | 	}
332 | }
333 | 
334 | // If we chop a at nd digits, should we round up?
335 | func shouldRoundUp(a *decimal, nd int) bool {
336 | 	if nd < 0 || nd >= a.nd {
337 | 		return false
338 | 	}
339 | 	if a.d[nd] == '5' && nd+1 == a.nd { // exactly halfway - round to even
340 | 		// if we truncated, a little higher than what's recorded - always round up
341 | 		if a.trunc {
342 | 			return true
343 | 		}
344 | 		return nd > 0 && (a.d[nd-1]-'0')%2 != 0
345 | 	}
346 | 	// not halfway - digit tells all
347 | 	return a.d[nd] >= '5'
348 | }
349 | 
350 | // Round a to nd digits (or fewer).
351 | // If nd is zero, it means we're rounding
352 | // just to the left of the digits, as in
353 | // 0.09 -> 0.1.
354 | func (a *decimal) Round(nd int) {
355 | 	if nd < 0 || nd >= a.nd {
356 | 		return
357 | 	}
358 | 	if shouldRoundUp(a, nd) {
359 | 		a.RoundUp(nd)
360 | 	} else {
361 | 		a.RoundDown(nd)
362 | 	}
363 | }
364 | 
365 | // Round a down to nd digits (or fewer).
366 | func (a *decimal) RoundDown(nd int) {
367 | 	if nd < 0 || nd >= a.nd {
368 | 		return
369 | 	}
370 | 	a.nd = nd
371 | 	trim(a)
372 | }
373 | 
374 | // Round a up to nd digits (or fewer).
375 | func (a *decimal) RoundUp(nd int) {
376 | 	if nd < 0 || nd >= a.nd {
377 | 		return
378 | 	}
379 | 
380 | 	// round up
381 | 	for i := nd - 1; i >= 0; i-- {
382 | 		c := a.d[i]
383 | 		if c < '9' { // can stop after this digit
384 | 			a.d[i]++
385 | 			a.nd = i + 1
386 | 			return
387 | 		}
388 | 	}
389 | 
390 | 	// Number is all 9s.
391 | 	// Change to single 1 with adjusted decimal point.
392 | 	a.d[0] = '1'
393 | 	a.nd = 1
394 | 	a.dp++
395 | }
396 | 
397 | // Extract integer part, rounded appropriately.
398 | // No guarantees about overflow.
399 | func (a *decimal) RoundedInteger() uint64 {
400 | 	if a.dp > 20 {
401 | 		return 0xFFFFFFFFFFFFFFFF
402 | 	}
403 | 	var i int
404 | 	n := uint64(0)
405 | 	for i = 0; i < a.dp && i < a.nd; i++ {
406 | 		n = n*10 + uint64(a.d[i]-'0')
407 | 	}
408 | 	for ; i < a.dp; i++ {
409 | 		n *= 10
410 | 	}
411 | 	if shouldRoundUp(a, a.dp) {
412 | 		n++
413 | 	}
414 | 	return n
415 | }
416 | 


--------------------------------------------------------------------------------
/internal/strconv/extfloat.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package strconv
  6 | 
  7 | import (
  8 | 	"math/bits"
  9 | )
 10 | 
// An extFloat represents an extended floating-point number, with more
// precision than a float64. It does not try to save bits: the
// number represented by the structure is mant*(2^exp), with a negative
// sign if neg is true.
type extFloat struct {
	mant uint64 // mantissa; see Normalize for the normalized form (highest bit set)
	exp  int    // binary exponent applied to mant
	neg  bool   // true if the value is negative
}
 20 | 
// Powers of ten taken from double-conversion library.
// https://code.google.com/p/double-conversion/
const (
	firstPowerOfTen = -348 // decimal exponent of the first powersOfTen entry
	stepPowerOfTen  = 8    // decimal exponent step between consecutive powersOfTen entries
)
 27 | 
// smallPowersOfTen holds the powers of ten 1 through 1e7, one for each
// exponent below stepPowerOfTen. Every mantissa is shifted so that its
// highest bit is set, with the exponent compensating for the shift.
var smallPowersOfTen = [...]extFloat{
	{1 << 63, -63, false},        // 1
	{0xa << 60, -60, false},      // 1e1
	{0x64 << 57, -57, false},     // 1e2
	{0x3e8 << 54, -54, false},    // 1e3
	{0x2710 << 50, -50, false},   // 1e4
	{0x186a0 << 47, -47, false},  // 1e5
	{0xf4240 << 44, -44, false},  // 1e6
	{0x989680 << 40, -40, false}, // 1e7
}
 38 | 
// powersOfTen holds 64-bit approximations of the powers of ten from
// 10^-348 to 10^340 in steps of 8: entry i approximates
// 10^(firstPowerOfTen + i*stepPowerOfTen). Every mantissa has its
// highest bit set.
var powersOfTen = [...]extFloat{
	{0xfa8fd5a0081c0288, -1220, false}, // 10^-348
	{0xbaaee17fa23ebf76, -1193, false}, // 10^-340
	{0x8b16fb203055ac76, -1166, false}, // 10^-332
	{0xcf42894a5dce35ea, -1140, false}, // 10^-324
	{0x9a6bb0aa55653b2d, -1113, false}, // 10^-316
	{0xe61acf033d1a45df, -1087, false}, // 10^-308
	{0xab70fe17c79ac6ca, -1060, false}, // 10^-300
	{0xff77b1fcbebcdc4f, -1034, false}, // 10^-292
	{0xbe5691ef416bd60c, -1007, false}, // 10^-284
	{0x8dd01fad907ffc3c, -980, false},  // 10^-276
	{0xd3515c2831559a83, -954, false},  // 10^-268
	{0x9d71ac8fada6c9b5, -927, false},  // 10^-260
	{0xea9c227723ee8bcb, -901, false},  // 10^-252
	{0xaecc49914078536d, -874, false},  // 10^-244
	{0x823c12795db6ce57, -847, false},  // 10^-236
	{0xc21094364dfb5637, -821, false},  // 10^-228
	{0x9096ea6f3848984f, -794, false},  // 10^-220
	{0xd77485cb25823ac7, -768, false},  // 10^-212
	{0xa086cfcd97bf97f4, -741, false},  // 10^-204
	{0xef340a98172aace5, -715, false},  // 10^-196
	{0xb23867fb2a35b28e, -688, false},  // 10^-188
	{0x84c8d4dfd2c63f3b, -661, false},  // 10^-180
	{0xc5dd44271ad3cdba, -635, false},  // 10^-172
	{0x936b9fcebb25c996, -608, false},  // 10^-164
	{0xdbac6c247d62a584, -582, false},  // 10^-156
	{0xa3ab66580d5fdaf6, -555, false},  // 10^-148
	{0xf3e2f893dec3f126, -529, false},  // 10^-140
	{0xb5b5ada8aaff80b8, -502, false},  // 10^-132
	{0x87625f056c7c4a8b, -475, false},  // 10^-124
	{0xc9bcff6034c13053, -449, false},  // 10^-116
	{0x964e858c91ba2655, -422, false},  // 10^-108
	{0xdff9772470297ebd, -396, false},  // 10^-100
	{0xa6dfbd9fb8e5b88f, -369, false},  // 10^-92
	{0xf8a95fcf88747d94, -343, false},  // 10^-84
	{0xb94470938fa89bcf, -316, false},  // 10^-76
	{0x8a08f0f8bf0f156b, -289, false},  // 10^-68
	{0xcdb02555653131b6, -263, false},  // 10^-60
	{0x993fe2c6d07b7fac, -236, false},  // 10^-52
	{0xe45c10c42a2b3b06, -210, false},  // 10^-44
	{0xaa242499697392d3, -183, false},  // 10^-36
	{0xfd87b5f28300ca0e, -157, false},  // 10^-28
	{0xbce5086492111aeb, -130, false},  // 10^-20
	{0x8cbccc096f5088cc, -103, false},  // 10^-12
	{0xd1b71758e219652c, -77, false},   // 10^-4
	{0x9c40000000000000, -50, false},   // 10^4
	{0xe8d4a51000000000, -24, false},   // 10^12
	{0xad78ebc5ac620000, 3, false},     // 10^20
	{0x813f3978f8940984, 30, false},    // 10^28
	{0xc097ce7bc90715b3, 56, false},    // 10^36
	{0x8f7e32ce7bea5c70, 83, false},    // 10^44
	{0xd5d238a4abe98068, 109, false},   // 10^52
	{0x9f4f2726179a2245, 136, false},   // 10^60
	{0xed63a231d4c4fb27, 162, false},   // 10^68
	{0xb0de65388cc8ada8, 189, false},   // 10^76
	{0x83c7088e1aab65db, 216, false},   // 10^84
	{0xc45d1df942711d9a, 242, false},   // 10^92
	{0x924d692ca61be758, 269, false},   // 10^100
	{0xda01ee641a708dea, 295, false},   // 10^108
	{0xa26da3999aef774a, 322, false},   // 10^116
	{0xf209787bb47d6b85, 348, false},   // 10^124
	{0xb454e4a179dd1877, 375, false},   // 10^132
	{0x865b86925b9bc5c2, 402, false},   // 10^140
	{0xc83553c5c8965d3d, 428, false},   // 10^148
	{0x952ab45cfa97a0b3, 455, false},   // 10^156
	{0xde469fbd99a05fe3, 481, false},   // 10^164
	{0xa59bc234db398c25, 508, false},   // 10^172
	{0xf6c69a72a3989f5c, 534, false},   // 10^180
	{0xb7dcbf5354e9bece, 561, false},   // 10^188
	{0x88fcf317f22241e2, 588, false},   // 10^196
	{0xcc20ce9bd35c78a5, 614, false},   // 10^204
	{0x98165af37b2153df, 641, false},   // 10^212
	{0xe2a0b5dc971f303a, 667, false},   // 10^220
	{0xa8d9d1535ce3b396, 694, false},   // 10^228
	{0xfb9b7cd9a4a7443c, 720, false},   // 10^236
	{0xbb764c4ca7a44410, 747, false},   // 10^244
	{0x8bab8eefb6409c1a, 774, false},   // 10^252
	{0xd01fef10a657842c, 800, false},   // 10^260
	{0x9b10a4e5e9913129, 827, false},   // 10^268
	{0xe7109bfba19c0c9d, 853, false},   // 10^276
	{0xac2820d9623bf429, 880, false},   // 10^284
	{0x80444b5e7aa7cf85, 907, false},   // 10^292
	{0xbf21e44003acdd2d, 933, false},   // 10^300
	{0x8e679c2f5e44ff8f, 960, false},   // 10^308
	{0xd433179d9c8cb841, 986, false},   // 10^316
	{0x9e19db92b4e31ba9, 1013, false},  // 10^324
	{0xeb96bf6ebadf77d9, 1039, false},  // 10^332
	{0xaf87023b9bf0ee6b, 1066, false},  // 10^340
}
128 | 
// floatBits returns the bits of the float64 that best approximates
// the extFloat passed as receiver. Overflow is set to true if
// the resulting float64 is ±Inf.
//
// The field widths and the bias of the result are taken from flt, so the
// same routine serves whichever format flt describes. The receiver is
// modified: it is normalized and, when its exponent is too small, its
// mantissa is shifted right. Note that the named result bits shadows the
// math/bits package inside this function.
func (f *extFloat) floatBits(flt *floatInfo) (bits uint64, overflow bool) {
	f.Normalize()

	// After normalization the highest mantissa bit is bit 63, so exp is
	// the binary exponent of that leading bit.
	exp := f.exp + 63

	// Exponent too small.
	if exp < flt.bias+1 {
		// Shift the mantissa right (discarding low bits) until the
		// exponent reaches flt.bias+1.
		n := flt.bias + 1 - exp
		f.mant >>= uint(n)
		exp += n
	}

	// Extract 1+flt.mantbits bits from the 64-bit mantissa.
	mant := f.mant >> (63 - flt.mantbits)
	if f.mant&(1<<(62-flt.mantbits)) != 0 {
		// Round up.
		// (The highest discarded bit is set.)
		mant++
	}

	// Rounding might have added a bit; shift down.
	if mant == 2<<flt.mantbits {
		mant >>= 1
		exp++
	}

	// Infinities.
	if exp-flt.bias >= 1<<flt.expbits-1 {
		// ±Inf
		mant = 0
		exp = 1<<flt.expbits - 1 + flt.bias
		overflow = true
	} else if mant&(1<<flt.mantbits) == 0 {
		// Denormalized?
		// The leading mantissa bit is clear, so the exponent field
		// assembled below becomes zero.
		exp = flt.bias
	}
	// Assemble bits.
	// Low flt.mantbits bits: mantissa without its leading bit; then the
	// biased exponent; then the sign bit on top.
	bits = mant & (uint64(1)<<flt.mantbits - 1)
	bits |= uint64((exp-flt.bias)&(1<<flt.expbits-1)) << flt.mantbits
	if f.neg {
		bits |= 1 << (flt.mantbits + flt.expbits)
	}
	return
}
175 | 
176 | // AssignComputeBounds sets f to the floating point value
177 | // defined by mant, exp and precision given by flt. It returns
178 | // lower, upper such that any number in the closed interval
179 | // [lower, upper] is converted back to the same floating point number.
180 | func (f *extFloat) AssignComputeBounds(mant uint64, exp int, neg bool, flt *floatInfo) (lower, upper extFloat) {
181 | 	f.mant = mant
182 | 	f.exp = exp - int(flt.mantbits)
183 | 	f.neg = neg
184 | 	if f.exp <= 0 && mant == (mant>>uint(-f.exp))<<uint(-f.exp) {
185 | 		// An exact integer
186 | 		f.mant >>= uint(-f.exp)
187 | 		f.exp = 0
188 | 		return *f, *f
189 | 	}
190 | 	expBiased := exp - flt.bias
191 | 
192 | 	upper = extFloat{mant: 2*f.mant + 1, exp: f.exp - 1, neg: f.neg}
193 | 	if mant != 1<<flt.mantbits || expBiased == 1 {
194 | 		lower = extFloat{mant: 2*f.mant - 1, exp: f.exp - 1, neg: f.neg}
195 | 	} else {
196 | 		lower = extFloat{mant: 4*f.mant - 1, exp: f.exp - 2, neg: f.neg}
197 | 	}
198 | 	return
199 | }
200 | 
201 | // Normalize normalizes f so that the highest bit of the mantissa is
202 | // set, and returns the number by which the mantissa was left-shifted.
203 | func (f *extFloat) Normalize() uint {
204 | 	// bits.LeadingZeros64 would return 64
205 | 	if f.mant == 0 {
206 | 		return 0
207 | 	}
208 | 	shift := bits.LeadingZeros64(f.mant)
209 | 	f.mant <<= uint(shift)
210 | 	f.exp -= shift
211 | 	return uint(shift)
212 | }
213 | 
214 | // Multiply sets f to the product f*g: the result is correctly rounded,
215 | // but not normalized.
216 | func (f *extFloat) Multiply(g extFloat) {
217 | 	fhi, flo := f.mant>>32, uint64(uint32(f.mant))
218 | 	ghi, glo := g.mant>>32, uint64(uint32(g.mant))
219 | 
220 | 	// Cross products.
221 | 	cross1 := fhi * glo
222 | 	cross2 := flo * ghi
223 | 
224 | 	// f.mant*g.mant is fhi*ghi << 64 + (cross1+cross2) << 32 + flo*glo
225 | 	f.mant = fhi*ghi + (cross1 >> 32) + (cross2 >> 32)
226 | 	rem := uint64(uint32(cross1)) + uint64(uint32(cross2)) + ((flo * glo) >> 32)
227 | 	// Round up.
228 | 	rem += (1 << 31)
229 | 
230 | 	f.mant += (rem >> 32)
231 | 	f.exp = f.exp + g.exp + 64
232 | }
233 | 
// uint64pow10[i] is 10^i for i from 0 to 19; 1e19 is the largest power
// of ten that fits in a uint64.
var uint64pow10 = [...]uint64{
	1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
	1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
}
238 | 
// AssignDecimal sets f to an approximate value mantissa*10^exp10, with a
// negative sign if neg is true. It reports whether the value represented
// by f is guaranteed to be the best approximation of that decimal number
// after being rounded to a float64 or float32 depending on flt.
//
// trunc tells that mantissa is itself a truncation of the decimal number;
// it only enlarges the error estimate. The result is false, with f left
// partially assigned, when exp10 lies outside the range covered by the
// powersOfTen table.
func (f *extFloat) AssignDecimal(mantissa uint64, exp10 int, neg bool, trunc bool, flt *floatInfo) (ok bool) {
	const uint64digits = 19
	const errorscale = 8
	errors := 0 // An upper bound for error, computed in errorscale*ulp.
	if trunc {
		// the decimal number was truncated.
		errors += errorscale / 2
	}

	f.mant = mantissa
	f.exp = 0
	f.neg = neg

	// Multiply by powers of ten.
	// 10^exp10 is split into powersOfTen[i] (a multiple of stepPowerOfTen
	// above firstPowerOfTen) times 10^adjExp with 0 <= adjExp < stepPowerOfTen.
	i := (exp10 - firstPowerOfTen) / stepPowerOfTen
	if exp10 < firstPowerOfTen || i >= len(powersOfTen) {
		return false
	}
	adjExp := (exp10 - firstPowerOfTen) % stepPowerOfTen

	// We multiply by exp%step
	if adjExp < uint64digits && mantissa < uint64pow10[uint64digits-adjExp] {
		// We can multiply the mantissa exactly.
		f.mant *= uint64pow10[adjExp]
		f.Normalize()
	} else {
		// The product might not fit in 64 bits: use the rounding
		// multiplication and account for its error.
		f.Normalize()
		f.Multiply(smallPowersOfTen[adjExp])
		errors += errorscale / 2
	}

	// We multiply by 10 to the exp - exp%step.
	f.Multiply(powersOfTen[i])
	if errors > 0 {
		errors++
	}
	errors += errorscale / 2

	// Normalize
	// Shifting the mantissa left scales the error bound by the same factor.
	shift := f.Normalize()
	errors <<= shift

	// Now f is a good approximation of the decimal.
	// Check whether the error is too large: that is, if the mantissa
	// is perturbed by the error, the resulting float64 will change.
	// The 64 bits mantissa is 1 + 52 bits for float64 + 11 extra bits.
	//
	// In many cases the approximation will be good enough.
	denormalExp := flt.bias - 63
	var extrabits uint
	if f.exp <= denormalExp {
		// f.mant * 2^f.exp is smaller than 2^(flt.bias+1).
		extrabits = 63 - flt.mantbits + 1 + uint(denormalExp-f.exp)
	} else {
		extrabits = 63 - flt.mantbits
	}

	// mantExtra holds the low extrabits bits of the mantissa, which will be
	// rounded away; halfway is the value at which rounding changes direction.
	halfway := uint64(1) << (extrabits - 1)
	mantExtra := f.mant & (1<<extrabits - 1)

	// Do a signed comparison here! If the error estimate could make
	// the mantissa round differently for the conversion to double,
	// then we can't give a definite answer.
	if int64(halfway)-int64(errors) < int64(mantExtra) &&
		int64(mantExtra) < int64(halfway)+int64(errors) {
		return false
	}
	return true
}
312 | 
313 | // Frexp10 is an analogue of math.Frexp for decimal powers. It scales
314 | // f by an approximate power of ten 10^-exp, and returns exp10, so
315 | // that f*10^exp10 has the same value as the old f, up to an ulp,
316 | // as well as the index of 10^-exp in the powersOfTen table.
317 | func (f *extFloat) frexp10() (exp10, index int) {
318 | 	// The constants expMin and expMax constrain the final value of the
319 | 	// binary exponent of f. We want a small integral part in the result
320 | 	// because finding digits of an integer requires divisions, whereas
321 | 	// digits of the fractional part can be found by repeatedly multiplying
322 | 	// by 10.
323 | 	const expMin = -60
324 | 	const expMax = -32
325 | 	// Find power of ten such that x * 10^n has a binary exponent
326 | 	// between expMin and expMax.
327 | 	approxExp10 := ((expMin+expMax)/2 - f.exp) * 28 / 93 // log(10)/log(2) is close to 93/28.
328 | 	i := (approxExp10 - firstPowerOfTen) / stepPowerOfTen
329 | Loop:
330 | 	for {
331 | 		exp := f.exp + powersOfTen[i].exp + 64
332 | 		switch {
333 | 		case exp < expMin:
334 | 			i++
335 | 		case exp > expMax:
336 | 			i--
337 | 		default:
338 | 			break Loop
339 | 		}
340 | 	}
341 | 	// Apply the desired decimal shift on f. It will have exponent
342 | 	// in the desired range. This is multiplication by 10^-exp10.
343 | 	f.Multiply(powersOfTen[i])
344 | 
345 | 	return -(firstPowerOfTen + i*stepPowerOfTen), i
346 | }
347 | 
348 | // frexp10Many applies a common shift by a power of ten to a, b, c.
349 | func frexp10Many(a, b, c *extFloat) (exp10 int) {
350 | 	exp10, i := c.frexp10()
351 | 	a.Multiply(powersOfTen[i])
352 | 	b.Multiply(powersOfTen[i])
353 | 	return
354 | }
355 | 
// FixedDecimal stores in d the first n significant digits
// of the decimal representation of f. It returns false
// if it cannot be sure of the answer.
//
// The receiver is modified (it is normalized and scaled by a power of
// ten). n must not be zero unless f is zero; otherwise the function
// panics. d.neg is only written when f is zero.
func (f *extFloat) FixedDecimal(d *decimalSlice, n int) bool {
	if f.mant == 0 {
		d.nd = 0
		d.dp = 0
		d.neg = f.neg
		return true
	}
	if n == 0 {
		panic("strconv: internal error: extFloat.FixedDecimal called with n == 0")
	}
	// Multiply by an appropriate power of ten to have a reasonable
	// number to process.
	f.Normalize()
	exp10, _ := f.frexp10()

	// Split the scaled mantissa at the binary point: the bits above
	// position shift are the integer part, the rest is the fraction.
	shift := uint(-f.exp)
	integer := uint32(f.mant >> shift)
	fraction := f.mant - (uint64(integer) << shift)
	ε := uint64(1) // ε is the uncertainty we have on the mantissa of f.

	// Write exactly n digits to d.
	needed := n        // how many digits are left to write.
	integerDigits := 0 // the number of decimal digits of integer.
	pow10 := uint64(1) // the power of ten by which f was scaled.
	// Count the decimal digits of integer: the first power of ten
	// exceeding it gives the digit count.
	for i, pow := 0, uint64(1); i < 20; i++ {
		if pow > uint64(integer) {
			integerDigits = i
			break
		}
		pow *= 10
	}
	rest := integer
	if integerDigits > needed {
		// the integral part is already large, trim the last digits.
		pow10 = uint64pow10[integerDigits-needed]
		integer /= uint32(pow10)
		rest -= integer * uint32(pow10)
	} else {
		rest = 0
	}

	// Write the digits of integer: the digits of rest are omitted.
	// They are produced least significant first, at the end of buf.
	var buf [32]byte
	pos := len(buf)
	for v := integer; v > 0; {
		v1 := v / 10
		v -= 10 * v1
		pos--
		buf[pos] = byte(v + '0')
		v = v1
	}
	for i := pos; i < len(buf); i++ {
		d.d[i-pos] = buf[i]
	}
	nd := len(buf) - pos
	d.nd = nd
	d.dp = integerDigits + exp10
	needed -= nd

	if needed > 0 {
		if rest != 0 || pow10 != 1 {
			panic("strconv: internal error, rest != 0 but needed > 0")
		}
		// Emit digits for the fractional part. Each time, 10*fraction
		// fits in a uint64 without overflow.
		for needed > 0 {
			fraction *= 10
			ε *= 10 // the uncertainty scales as we multiply by ten.
			if 2*ε > 1<<shift {
				// the error is so large it could modify which digit to write, abort.
				return false
			}
			digit := fraction >> shift
			d.d[nd] = byte(digit + '0')
			fraction -= digit << shift
			nd++
			needed--
		}
		d.nd = nd
	}

	// We have written a truncation of f (a numerator / 10^d.dp). The remaining part
	// can be interpreted as a small number (< 1) to be added to the last digit of the
	// numerator.
	//
	// If rest > 0, the amount is:
	//    (rest<<shift | fraction) / (pow10 << shift)
	//    fraction being known with a ±ε uncertainty.
	//    The fact that n > 0 guarantees that pow10 << shift does not overflow a uint64.
	//
	// If rest = 0, pow10 == 1 and the amount is
	//    fraction / (1 << shift)
	//    fraction being known with a ±ε uncertainty.
	//
	// We pass this information to the rounding routine for adjustment.

	ok := adjustLastDigitFixed(d, uint64(rest)<<shift|fraction, pow10, shift, ε)
	if !ok {
		return false
	}
	// Trim trailing zeros.
	for i := d.nd - 1; i >= 0; i-- {
		if d.d[i] != '0' {
			d.nd = i + 1
			break
		}
	}
	return true
}
468 | 
469 | // adjustLastDigitFixed assumes d contains the representation of the integral part
470 | // of some number, whose fractional part is num / (den << shift). The numerator
471 | // num is only known up to an uncertainty of size ε, assumed to be less than
472 | // (den << shift)/2.
473 | //
474 | // It will increase the last digit by one to account for correct rounding, typically
475 | // when the fractional part is greater than 1/2, and will return false if ε is such
476 | // that no correct answer can be given.
477 | func adjustLastDigitFixed(d *decimalSlice, num, den uint64, shift uint, ε uint64) bool {
478 | 	if num > den<<shift {
479 | 		panic("strconv: num > den<<shift in adjustLastDigitFixed")
480 | 	}
481 | 	if 2*ε > den<<shift {
482 | 		panic("strconv: ε > (den<<shift)/2")
483 | 	}
484 | 	if 2*(num+ε) < den<<shift {
485 | 		return true
486 | 	}
487 | 	if 2*(num-ε) > den<<shift {
488 | 		// increment d by 1.
489 | 		i := d.nd - 1
490 | 		for ; i >= 0; i-- {
491 | 			if d.d[i] == '9' {
492 | 				d.nd--
493 | 			} else {
494 | 				break
495 | 			}
496 | 		}
497 | 		if i < 0 {
498 | 			d.d[0] = '1'
499 | 			d.nd = 1
500 | 			d.dp++
501 | 		} else {
502 | 			d.d[i]++
503 | 		}
504 | 		return true
505 | 	}
506 | 	return false
507 | }
508 | 
// ShortestDecimal stores in d the shortest decimal representation of f
// which belongs to the open interval (lower, upper), where f is supposed
// to lie. It returns false whenever the result is unsure. The implementation
// uses the Grisu3 algorithm.
//
// f, lower and upper are all modified in the process (they are brought
// to a common exponent and scaled by a power of ten).
func (f *extFloat) ShortestDecimal(d *decimalSlice, lower, upper *extFloat) bool {
	if f.mant == 0 {
		d.nd = 0
		d.dp = 0
		d.neg = f.neg
		return true
	}
	if f.exp == 0 && *lower == *f && *lower == *upper {
		// an exact integer.
		// Write its digits from the end of buf backwards, then copy
		// them to d and drop trailing zeros.
		var buf [24]byte
		n := len(buf) - 1
		for v := f.mant; v > 0; {
			v1 := v / 10
			v -= 10 * v1
			buf[n] = byte(v + '0')
			n--
			v = v1
		}
		nd := len(buf) - n - 1
		for i := 0; i < nd; i++ {
			d.d[i] = buf[n+1+i]
		}
		d.nd, d.dp = nd, nd
		for d.nd > 0 && d.d[d.nd-1] == '0' {
			d.nd--
		}
		if d.nd == 0 {
			d.dp = 0
		}
		d.neg = f.neg
		return true
	}
	upper.Normalize()
	// Uniformize exponents.
	if f.exp > upper.exp {
		f.mant <<= uint(f.exp - upper.exp)
		f.exp = upper.exp
	}
	if lower.exp > upper.exp {
		lower.mant <<= uint(lower.exp - upper.exp)
		lower.exp = upper.exp
	}

	exp10 := frexp10Many(lower, f, upper)
	// Take a safety margin due to rounding in frexp10Many, but we lose precision.
	upper.mant++
	lower.mant--

	// The shortest representation of f is either rounded up or down, but
	// in any case, it is a truncation of upper.
	shift := uint(-upper.exp)
	integer := uint32(upper.mant >> shift)
	fraction := upper.mant - (uint64(integer) << shift)

	// How far we can go down from upper until the result is wrong.
	allowance := upper.mant - lower.mant
	// How far we should go to get a very precise result.
	targetDiff := upper.mant - f.mant

	// Count integral digits: there are at most 10.
	var integerDigits int
	for i, pow := 0, uint64(1); i < 20; i++ {
		if pow > uint64(integer) {
			integerDigits = i
			break
		}
		pow *= 10
	}
	// Emit the integral digits, most significant first.
	for i := 0; i < integerDigits; i++ {
		pow := uint64pow10[integerDigits-i-1]
		digit := integer / uint32(pow)
		d.d[i] = byte(digit + '0')
		integer -= digit * uint32(pow)
		// evaluate whether we should stop.
		// currentDiff is what is left of upper after the digits written so far.
		if currentDiff := uint64(integer)<<shift + fraction; currentDiff < allowance {
			d.nd = i + 1
			d.dp = integerDigits + exp10
			d.neg = f.neg
			// Sometimes allowance is so large the last digit might need to be
			// decremented to get closer to f.
			return adjustLastDigit(d, currentDiff, targetDiff, allowance, pow<<shift, 2)
		}
	}
	d.nd = integerDigits
	d.dp = d.nd + exp10
	d.neg = f.neg

	// Compute digits of the fractional part. At each step fraction does not
	// overflow. The choice of minExp implies that fraction is less than 2^60.
	var digit int
	multiplier := uint64(1)
	for {
		// multiplier tracks the power of ten by which fraction has been
		// scaled, so that allowance and targetDiff can be scaled alike.
		fraction *= 10
		multiplier *= 10
		digit = int(fraction >> shift)
		d.d[d.nd] = byte(digit + '0')
		d.nd++
		fraction -= uint64(digit) << shift
		if fraction < allowance*multiplier {
			// We are in the admissible range. Note that if allowance is about to
			// overflow, that is, allowance > 2^64/10, the condition is automatically
			// true due to the limited range of fraction.
			return adjustLastDigit(d,
				fraction, targetDiff*multiplier, allowance*multiplier,
				1<<shift, multiplier*2)
		}
	}
}
621 | 
622 | // adjustLastDigit modifies d = x-currentDiff*ε, to get closest to
623 | // d = x-targetDiff*ε, without becoming smaller than x-maxDiff*ε.
624 | // It assumes that a decimal digit is worth ulpDecimal*ε, and that
625 | // all data is known with an error estimate of ulpBinary*ε.
626 | func adjustLastDigit(d *decimalSlice, currentDiff, targetDiff, maxDiff, ulpDecimal, ulpBinary uint64) bool {
627 | 	if ulpDecimal < 2*ulpBinary {
628 | 		// Approximation is too wide.
629 | 		return false
630 | 	}
631 | 	for currentDiff+ulpDecimal/2+ulpBinary < targetDiff {
632 | 		d.d[d.nd-1]--
633 | 		currentDiff += ulpDecimal
634 | 	}
635 | 	if currentDiff+ulpDecimal <= targetDiff+ulpDecimal/2+ulpBinary {
636 | 		// we have two choices, and don't know what to do.
637 | 		return false
638 | 	}
639 | 	if currentDiff < ulpBinary || currentDiff > maxDiff-ulpBinary {
640 | 		// we went too far
641 | 		return false
642 | 	}
643 | 	if d.nd == 1 && d.d[0] == '0' {
644 | 		// the number has actually reached zero.
645 | 		d.nd = 0
646 | 		d.dp = 0
647 | 	}
648 | 	return true
649 | }
650 | 


--------------------------------------------------------------------------------
/internal/strconv/ftoa.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // Binary to decimal floating point conversion.
  6 | // Algorithm:
  7 | //   1) store mantissa in multiprecision decimal
  8 | //   2) shift decimal by exponent
  9 | //   3) read digits out & format
 10 | 
 11 | package strconv
 12 | 
 13 | import (
 14 | 	"math"
 15 | 
 16 | 	"github.com/mewmew/float/binary16"
 17 | )
 18 | 
 19 | var optimize = true // can change for testing
 20 | 
 21 | // TODO: move elsewhere?
 22 | type floatInfo struct {
 23 | 	mantbits uint
 24 | 	expbits  uint
 25 | 	bias     int
 26 | }
 27 | 
 28 | var float16info = floatInfo{10, 5, -15}
 29 | var float32info = floatInfo{23, 8, -127}
 30 | var float64info = floatInfo{52, 11, -1023}
 31 | 
 32 | // FormatFloat converts the floating-point number f to a string,
 33 | // according to the format fmt and precision prec. It rounds the
 34 | // result assuming that the original was obtained from a floating-point
 35 | // value of bitSize bits (32 for float32, 64 for float64).
 36 | //
 37 | // The format fmt is one of
 38 | // 'b' (-ddddp±ddd, a binary exponent),
 39 | // 'e' (-d.dddde±dd, a decimal exponent),
 40 | // 'E' (-d.ddddE±dd, a decimal exponent),
 41 | // 'f' (-ddd.dddd, no exponent),
 42 | // 'g' ('e' for large exponents, 'f' otherwise), or
 43 | // 'G' ('E' for large exponents, 'f' otherwise).
 44 | //
 45 | // The precision prec controls the number of digits (excluding the exponent)
 46 | // printed by the 'e', 'E', 'f', 'g', and 'G' formats.
 47 | // For 'e', 'E', and 'f' it is the number of digits after the decimal point.
 48 | // For 'g' and 'G' it is the maximum number of significant digits (trailing
 49 | // zeros are removed).
 50 | // The special precision -1 uses the smallest number of digits
 51 | // necessary such that ParseFloat will return f exactly.
 52 | func FormatFloat(f float64, fmt byte, prec, bitSize int) string {
 53 | 	return string(genericFtoa(make([]byte, 0, max(prec+4, 24)), f, fmt, prec, bitSize))
 54 | }
 55 | 
 56 | // AppendFloat appends the string form of the floating-point number f,
 57 | // as generated by FormatFloat, to dst and returns the extended buffer.
 58 | func AppendFloat(dst []byte, f float64, fmt byte, prec, bitSize int) []byte {
 59 | 	return genericFtoa(dst, f, fmt, prec, bitSize)
 60 | }
 61 | 
 62 | func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte {
 63 | 	var bits uint64
 64 | 	var flt *floatInfo
 65 | 	switch bitSize {
 66 | 	case 16:
 67 | 		f, acc := binary16.NewFromFloat64(val)
 68 | 		_ = acc
 69 | 		bits = uint64(f.Bits())
 70 | 		flt = &float16info
 71 | 	case 32:
 72 | 		bits = uint64(math.Float32bits(float32(val)))
 73 | 		flt = &float32info
 74 | 	case 64:
 75 | 		bits = math.Float64bits(val)
 76 | 		flt = &float64info
 77 | 	default:
 78 | 		panic("strconv: illegal AppendFloat/FormatFloat bitSize")
 79 | 	}
 80 | 
 81 | 	neg := bits>>(flt.expbits+flt.mantbits) != 0
 82 | 	exp := int(bits>>flt.mantbits) & (1<<flt.expbits - 1)
 83 | 	mant := bits & (uint64(1)<<flt.mantbits - 1)
 84 | 
 85 | 	switch exp {
 86 | 	case 1<<flt.expbits - 1:
 87 | 		// Inf, NaN
 88 | 		var s string
 89 | 		switch {
 90 | 		case mant != 0:
 91 | 			s = "NaN"
 92 | 		case neg:
 93 | 			s = "-Inf"
 94 | 		default:
 95 | 			s = "+Inf"
 96 | 		}
 97 | 		return append(dst, s...)
 98 | 
 99 | 	case 0:
100 | 		// denormalized
101 | 		exp++
102 | 
103 | 	default:
104 | 		// add implicit top bit
105 | 		mant |= uint64(1) << flt.mantbits
106 | 	}
107 | 	exp += flt.bias
108 | 
109 | 	// Pick off easy binary format.
110 | 	if fmt == 'b' {
111 | 		return fmtB(dst, neg, mant, exp, flt)
112 | 	}
113 | 
114 | 	if !optimize {
115 | 		return bigFtoa(dst, prec, fmt, neg, mant, exp, flt)
116 | 	}
117 | 
118 | 	var digs decimalSlice
119 | 	ok := false
120 | 	// Negative precision means "only as much as needed to be exact."
121 | 	shortest := prec < 0
122 | 	if shortest {
123 | 		// Try Grisu3 algorithm.
124 | 		f := new(extFloat)
125 | 		lower, upper := f.AssignComputeBounds(mant, exp, neg, flt)
126 | 		var buf [32]byte
127 | 		digs.d = buf[:]
128 | 		ok = f.ShortestDecimal(&digs, &lower, &upper)
129 | 		if !ok {
130 | 			return bigFtoa(dst, prec, fmt, neg, mant, exp, flt)
131 | 		}
132 | 		// Precision for shortest representation mode.
133 | 		switch fmt {
134 | 		case 'e', 'E':
135 | 			prec = max(digs.nd-1, 0)
136 | 		case 'f':
137 | 			prec = max(digs.nd-digs.dp, 0)
138 | 		case 'g', 'G':
139 | 			prec = digs.nd
140 | 		}
141 | 	} else if fmt != 'f' {
142 | 		// Fixed number of digits.
143 | 		digits := prec
144 | 		switch fmt {
145 | 		case 'e', 'E':
146 | 			digits++
147 | 		case 'g', 'G':
148 | 			if prec == 0 {
149 | 				prec = 1
150 | 			}
151 | 			digits = prec
152 | 		}
153 | 		if digits <= 15 {
154 | 			// try fast algorithm when the number of digits is reasonable.
155 | 			var buf [24]byte
156 | 			digs.d = buf[:]
157 | 			f := extFloat{mant, exp - int(flt.mantbits), neg}
158 | 			ok = f.FixedDecimal(&digs, digits)
159 | 		}
160 | 	}
161 | 	if !ok {
162 | 		return bigFtoa(dst, prec, fmt, neg, mant, exp, flt)
163 | 	}
164 | 	return formatDigits(dst, shortest, neg, digs, prec, fmt)
165 | }
166 | 
167 | // bigFtoa uses multiprecision computations to format a float.
168 | func bigFtoa(dst []byte, prec int, fmt byte, neg bool, mant uint64, exp int, flt *floatInfo) []byte {
169 | 	d := new(decimal)
170 | 	d.Assign(mant)
171 | 	d.Shift(exp - int(flt.mantbits))
172 | 	var digs decimalSlice
173 | 	shortest := prec < 0
174 | 	if shortest {
175 | 		roundShortest(d, mant, exp, flt)
176 | 		digs = decimalSlice{d: d.d[:], nd: d.nd, dp: d.dp}
177 | 		// Precision for shortest representation mode.
178 | 		switch fmt {
179 | 		case 'e', 'E':
180 | 			prec = digs.nd - 1
181 | 		case 'f':
182 | 			prec = max(digs.nd-digs.dp, 0)
183 | 		case 'g', 'G':
184 | 			prec = digs.nd
185 | 		}
186 | 	} else {
187 | 		// Round appropriately.
188 | 		switch fmt {
189 | 		case 'e', 'E':
190 | 			d.Round(prec + 1)
191 | 		case 'f':
192 | 			d.Round(d.dp + prec)
193 | 		case 'g', 'G':
194 | 			if prec == 0 {
195 | 				prec = 1
196 | 			}
197 | 			d.Round(prec)
198 | 		}
199 | 		digs = decimalSlice{d: d.d[:], nd: d.nd, dp: d.dp}
200 | 	}
201 | 	return formatDigits(dst, shortest, neg, digs, prec, fmt)
202 | }
203 | 
204 | func formatDigits(dst []byte, shortest bool, neg bool, digs decimalSlice, prec int, fmt byte) []byte {
205 | 	switch fmt {
206 | 	case 'e', 'E':
207 | 		return fmtE(dst, neg, digs, prec, fmt)
208 | 	case 'f':
209 | 		return fmtF(dst, neg, digs, prec)
210 | 	case 'g', 'G':
211 | 		// trailing fractional zeros in 'e' form will be trimmed.
212 | 		eprec := prec
213 | 		if eprec > digs.nd && digs.nd >= digs.dp {
214 | 			eprec = digs.nd
215 | 		}
216 | 		// %e is used if the exponent from the conversion
217 | 		// is less than -4 or greater than or equal to the precision.
218 | 		// if precision was the shortest possible, use precision 6 for this decision.
219 | 		if shortest {
220 | 			eprec = 6
221 | 		}
222 | 		exp := digs.dp - 1
223 | 		if exp < -4 || exp >= eprec {
224 | 			if prec > digs.nd {
225 | 				prec = digs.nd
226 | 			}
227 | 			return fmtE(dst, neg, digs, prec-1, fmt+'e'-'g')
228 | 		}
229 | 		if prec > digs.dp {
230 | 			prec = digs.nd
231 | 		}
232 | 		return fmtF(dst, neg, digs, max(prec-digs.dp, 0))
233 | 	}
234 | 
235 | 	// unknown format
236 | 	return append(dst, '%', fmt)
237 | }
238 | 
// roundShortest rounds d (= mant * 2^exp) to the shortest number of digits
// that will let the original floating point value be precisely reconstructed.
//
// d is modified in place. mant and exp are the binary mantissa and exponent
// of the value, and flt supplies the mantissa width and exponent bias of the
// floating-point format. If no digit position qualifies for rounding, d is
// left unchanged.
func roundShortest(d *decimal, mant uint64, exp int, flt *floatInfo) {
	// If mantissa is zero, the number is zero; stop now.
	if mant == 0 {
		d.nd = 0
		return
	}

	// Compute upper and lower such that any decimal number
	// between upper and lower (possibly inclusive)
	// will round to the original floating point number.

	// We may see at once that the number is already shortest.
	//
	// Suppose d is not denormal, so that 2^exp <= d < 10^dp.
	// The closest shorter number is at least 10^(dp-nd) away.
	// The lower/upper bounds computed below are at distance
	// at most 2^(exp-mantbits).
	//
	// So the number is already shortest if 10^(dp-nd) > 2^(exp-mantbits),
	// or equivalently log2(10)*(dp-nd) > exp-mantbits.
	// It is true if 332/100*(dp-nd) >= exp-mantbits (log2(10) > 3.32).
	minexp := flt.bias + 1 // minimum possible exponent
	if exp > minexp && 332*(d.dp-d.nd) >= 100*(exp-int(flt.mantbits)) {
		// The number is already shortest.
		return
	}

	// d = mant << (exp - mantbits)
	// Next highest floating point number is mant+1 << exp-mantbits.
	// Our upper bound is halfway between, mant*2+1 << exp-mantbits-1.
	upper := new(decimal)
	upper.Assign(mant*2 + 1)
	upper.Shift(exp - int(flt.mantbits) - 1)

	// d = mant << (exp - mantbits)
	// Next lowest floating point number is mant-1 << exp-mantbits,
	// unless mant-1 drops the significant bit and exp is not the minimum exp,
	// in which case the next lowest is mant*2-1 << exp-mantbits-1.
	// Either way, call it mantlo << explo-mantbits.
	// Our lower bound is halfway between, mantlo*2+1 << explo-mantbits-1.
	var mantlo uint64
	var explo int
	if mant > 1<<flt.mantbits || exp == minexp {
		mantlo = mant - 1
		explo = exp
	} else {
		mantlo = mant*2 - 1
		explo = exp - 1
	}
	lower := new(decimal)
	lower.Assign(mantlo*2 + 1)
	lower.Shift(explo - int(flt.mantbits) - 1)

	// The upper and lower bounds are possible outputs only if
	// the original mantissa is even, so that IEEE round-to-even
	// would round to the original mantissa and not the neighbors.
	inclusive := mant%2 == 0

	// Now we can figure out the minimum number of digits required.
	// Walk along until d has distinguished itself from upper and lower.
	for i := 0; i < d.nd; i++ {
		// Digits past the end of lower or upper are treated as '0'.
		l := byte('0') // lower digit
		if i < lower.nd {
			l = lower.d[i]
		}
		m := d.d[i]    // middle digit
		u := byte('0') // upper digit
		if i < upper.nd {
			u = upper.d[i]
		}

		// Okay to round down (truncate) if lower has a different digit
		// or if lower is inclusive and is exactly the result of rounding
		// down (i.e., and we have reached the final digit of lower).
		okdown := l != m || inclusive && i+1 == lower.nd

		// Okay to round up if upper has a different digit and either upper
		// is inclusive or upper is bigger than the result of rounding up.
		okup := m != u && (inclusive || m+1 < u || i+1 < upper.nd)

		// If it's okay to do either, then round to the nearest one.
		// If it's okay to do only one, do it.
		// If neither is okay, move on to the next digit.
		switch {
		case okdown && okup:
			d.Round(i + 1)
			return
		case okdown:
			d.RoundDown(i + 1)
			return
		case okup:
			d.RoundUp(i + 1)
			return
		}
	}
}
336 | 
// decimalSlice holds the decimal digits produced by a conversion together
// with the position of the decimal point. The digits represent the value
// 0.d[0]d[1]...d[nd-1] * 10^dp.
type decimalSlice struct {
	d   []byte // ASCII digits; only d[:nd] is meaningful
	nd  int    // number of digits in use
	dp  int    // decimal point position (decimal exponent of the value above)
	neg bool   // sign flag
}
342 | 
343 | // %e: -d.ddddde±dd
344 | func fmtE(dst []byte, neg bool, d decimalSlice, prec int, fmt byte) []byte {
345 | 	// sign
346 | 	if neg {
347 | 		dst = append(dst, '-')
348 | 	}
349 | 
350 | 	// first digit
351 | 	ch := byte('0')
352 | 	if d.nd != 0 {
353 | 		ch = d.d[0]
354 | 	}
355 | 	dst = append(dst, ch)
356 | 
357 | 	// .moredigits
358 | 	if prec > 0 {
359 | 		dst = append(dst, '.')
360 | 		i := 1
361 | 		m := min(d.nd, prec+1)
362 | 		if i < m {
363 | 			dst = append(dst, d.d[i:m]...)
364 | 			i = m
365 | 		}
366 | 		for ; i <= prec; i++ {
367 | 			dst = append(dst, '0')
368 | 		}
369 | 	}
370 | 
371 | 	// e±
372 | 	dst = append(dst, fmt)
373 | 	exp := d.dp - 1
374 | 	if d.nd == 0 { // special case: 0 has exponent 0
375 | 		exp = 0
376 | 	}
377 | 	if exp < 0 {
378 | 		ch = '-'
379 | 		exp = -exp
380 | 	} else {
381 | 		ch = '+'
382 | 	}
383 | 	dst = append(dst, ch)
384 | 
385 | 	// dd or ddd
386 | 	switch {
387 | 	case exp < 10:
388 | 		dst = append(dst, '0', byte(exp)+'0')
389 | 	case exp < 100:
390 | 		dst = append(dst, byte(exp/10)+'0', byte(exp%10)+'0')
391 | 	default:
392 | 		dst = append(dst, byte(exp/100)+'0', byte(exp/10)%10+'0', byte(exp%10)+'0')
393 | 	}
394 | 
395 | 	return dst
396 | }
397 | 
398 | // %f: -ddddddd.ddddd
399 | func fmtF(dst []byte, neg bool, d decimalSlice, prec int) []byte {
400 | 	// sign
401 | 	if neg {
402 | 		dst = append(dst, '-')
403 | 	}
404 | 
405 | 	// integer, padded with zeros as needed.
406 | 	if d.dp > 0 {
407 | 		m := min(d.nd, d.dp)
408 | 		dst = append(dst, d.d[:m]...)
409 | 		for ; m < d.dp; m++ {
410 | 			dst = append(dst, '0')
411 | 		}
412 | 	} else {
413 | 		dst = append(dst, '0')
414 | 	}
415 | 
416 | 	// fraction
417 | 	if prec > 0 {
418 | 		dst = append(dst, '.')
419 | 		for i := 0; i < prec; i++ {
420 | 			ch := byte('0')
421 | 			if j := d.dp + i; 0 <= j && j < d.nd {
422 | 				ch = d.d[j]
423 | 			}
424 | 			dst = append(dst, ch)
425 | 		}
426 | 	}
427 | 
428 | 	return dst
429 | }
430 | 
431 | // %b: -ddddddddp±ddd
432 | func fmtB(dst []byte, neg bool, mant uint64, exp int, flt *floatInfo) []byte {
433 | 	// sign
434 | 	if neg {
435 | 		dst = append(dst, '-')
436 | 	}
437 | 
438 | 	// mantissa
439 | 	dst, _ = formatBits(dst, mant, 10, false, true)
440 | 
441 | 	// p
442 | 	dst = append(dst, 'p')
443 | 
444 | 	// ±exponent
445 | 	exp -= int(flt.mantbits)
446 | 	if exp >= 0 {
447 | 		dst = append(dst, '+')
448 | 	}
449 | 	dst, _ = formatBits(dst, uint64(exp), 10, exp < 0, true)
450 | 
451 | 	return dst
452 | }
453 | 
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
460 | 
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
467 | 


--------------------------------------------------------------------------------
/internal/strconv/itoa.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package strconv
  6 | 
  7 | import "math/bits"
  8 | 
// fastSmalls enables the fast path that serves base-10 values in the range
// 0 <= i < nSmalls from precomputed strings.
const fastSmalls = true
 10 | 
 11 | // FormatUint returns the string representation of i in the given base,
 12 | // for 2 <= base <= 36. The result uses the lower-case letters 'a' to 'z'
 13 | // for digit values >= 10.
 14 | func FormatUint(i uint64, base int) string {
 15 | 	if fastSmalls && i < nSmalls && base == 10 {
 16 | 		return small(int(i))
 17 | 	}
 18 | 	_, s := formatBits(nil, i, base, false, false)
 19 | 	return s
 20 | }
 21 | 
 22 | // FormatInt returns the string representation of i in the given base,
 23 | // for 2 <= base <= 36. The result uses the lower-case letters 'a' to 'z'
 24 | // for digit values >= 10.
 25 | func FormatInt(i int64, base int) string {
 26 | 	if fastSmalls && 0 <= i && i < nSmalls && base == 10 {
 27 | 		return small(int(i))
 28 | 	}
 29 | 	_, s := formatBits(nil, uint64(i), base, i < 0, false)
 30 | 	return s
 31 | }
 32 | 
 33 | // Itoa is shorthand for FormatInt(int64(i), 10).
 34 | func Itoa(i int) string {
 35 | 	return FormatInt(int64(i), 10)
 36 | }
 37 | 
 38 | // AppendInt appends the string form of the integer i,
 39 | // as generated by FormatInt, to dst and returns the extended buffer.
 40 | func AppendInt(dst []byte, i int64, base int) []byte {
 41 | 	if fastSmalls && 0 <= i && i < nSmalls && base == 10 {
 42 | 		return append(dst, small(int(i))...)
 43 | 	}
 44 | 	dst, _ = formatBits(dst, uint64(i), base, i < 0, true)
 45 | 	return dst
 46 | }
 47 | 
 48 | // AppendUint appends the string form of the unsigned integer i,
 49 | // as generated by FormatUint, to dst and returns the extended buffer.
 50 | func AppendUint(dst []byte, i uint64, base int) []byte {
 51 | 	if fastSmalls && i < nSmalls && base == 10 {
 52 | 		return append(dst, small(int(i))...)
 53 | 	}
 54 | 	dst, _ = formatBits(dst, i, base, false, true)
 55 | 	return dst
 56 | }
 57 | 
 58 | // small returns the string for an i with 0 <= i < nSmalls.
 59 | func small(i int) string {
 60 | 	if i < 10 {
 61 | 		return digits[i : i+1]
 62 | 	}
 63 | 	return smallsString[i*2 : i*2+2]
 64 | }
 65 | 
 66 | const nSmalls = 100
 67 | 
 68 | const smallsString = "00010203040506070809" +
 69 | 	"10111213141516171819" +
 70 | 	"20212223242526272829" +
 71 | 	"30313233343536373839" +
 72 | 	"40414243444546474849" +
 73 | 	"50515253545556575859" +
 74 | 	"60616263646566676869" +
 75 | 	"70717273747576777879" +
 76 | 	"80818283848586878889" +
 77 | 	"90919293949596979899"
 78 | 
 79 | const host32bit = ^uint(0)>>32 == 0
 80 | 
 81 | const digits = "0123456789abcdefghijklmnopqrstuvwxyz"
 82 | 
// formatBits computes the string representation of u in the given base.
// If neg is set, u is treated as negative int64 value. If doAppend is
// set, the string is appended to dst and the resulting byte slice is
// returned as the first result value; otherwise the string is returned
// as the second result value.
//
// It panics if base is smaller than 2 or larger than len(digits).
func formatBits(dst []byte, u uint64, base int, neg, doAppend bool) (d []byte, s string) {
	if base < 2 || base > len(digits) {
		panic("strconv: illegal AppendInt/FormatInt base")
	}
	// 2 <= base && base <= len(digits)

	// The digits are written into a from the end towards the front;
	// i is the index of the first byte written so far.
	var a [64 + 1]byte // +1 for sign of 64bit value in base 2
	i := len(a)

	if neg {
		u = -u
	}

	// convert bits
	// We use uint values where we can because those will
	// fit into a single register even on a 32bit machine.
	if base == 10 {
		// common case: use constants for / because
		// the compiler can optimize it into a multiply+shift

		if host32bit {
			// convert the lower digits using 32bit operations
			for u >= 1e9 {
				// Avoid using r = a%b in addition to q = a/b
				// since 64bit division and modulo operations
				// are calculated by runtime functions on 32bit machines.
				q := u / 1e9
				us := uint(u - q*1e9) // u % 1e9 fits into a uint
				// Emit eight of the nine digits, two at a time.
				for j := 4; j > 0; j-- {
					is := us % 100 * 2
					us /= 100
					i -= 2
					a[i+1] = smallsString[is+1]
					a[i+0] = smallsString[is+0]
				}

				// us < 10, since it contains the last digit
				// from the initial 9-digit us.
				i--
				a[i] = smallsString[us*2+1]

				u = q
			}
			// u < 1e9
		}

		// u guaranteed to fit into a uint
		us := uint(u)
		for us >= 100 {
			is := us % 100 * 2
			us /= 100
			i -= 2
			a[i+1] = smallsString[is+1]
			a[i+0] = smallsString[is+0]
		}

		// us < 100
		is := us * 2
		i--
		a[i] = smallsString[is+1]
		if us >= 10 {
			i--
			a[i] = smallsString[is]
		}

	} else if isPowerOfTwo(base) {
		// Use shifts and masks instead of / and %.
		// Base is a power of 2 and 2 <= base <= len(digits) where len(digits) is 36.
		// The largest power of 2 below or equal to 36 is 32, which is 1 << 5;
		// i.e., the largest possible shift count is 5. By &-ing that value with
		// the constant 7 we tell the compiler that the shift count is always
		// less than 8 which is smaller than any register width. This allows
		// the compiler to generate better code for the shift operation.
		shift := uint(bits.TrailingZeros(uint(base))) & 7
		b := uint64(base)
		m := uint(base) - 1 // == 1<<shift - 1
		for u >= b {
			i--
			a[i] = digits[uint(u)&m]
			u >>= shift
		}
		// u < base
		i--
		a[i] = digits[uint(u)]
	} else {
		// general case
		b := uint64(base)
		for u >= b {
			i--
			// Avoid using r = a%b in addition to q = a/b
			// since 64bit division and modulo operations
			// are calculated by runtime functions on 32bit machines.
			q := u / b
			a[i] = digits[uint(u-q*b)]
			u = q
		}
		// u < base
		i--
		a[i] = digits[uint(u)]
	}

	// add sign, if any
	if neg {
		i--
		a[i] = '-'
	}

	// Only one of the two named results is set, depending on doAppend.
	if doAppend {
		d = append(dst, a[i:]...)
		return
	}
	s = string(a[i:])
	return
}
203 | 
// isPowerOfTwo reports whether x is a positive power of two (1, 2, 4, ...).
//
// The bit trick x&(x-1) == 0 clears the lowest set bit and checks that
// nothing remains. On its own it would also accept 0 and the minimum int
// value, so non-positive inputs are rejected explicitly.
func isPowerOfTwo(x int) bool {
	return x > 0 && x&(x-1) == 0
}
207 | 


--------------------------------------------------------------------------------