├── .github
    └── workflows
    │   └── test.yml
├── LICENSE
├── README.md
├── _asm
    ├── go.mod
    ├── go.sum
    └── nat_amd64_asm.go
├── extra_test.go
├── go.mod
├── go.sum
├── nat.go
├── nat_386.s
├── nat_amd64.s
├── nat_arm.s
├── nat_arm64.s
├── nat_asm.go
├── nat_loong64.s
├── nat_noasm.go
├── nat_ppc64x.s
├── nat_riscv64.s
├── nat_s390x.s
├── nat_test.go
├── nat_wasm.go
└── testdata
    └── mod_inv_tests.txt


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Go tests
 2 | on: [push, pull_request]
 3 | jobs:
 4 |   test:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |       - uses: actions/checkout@v2
 8 |       - uses: actions/setup-go@v4
 9 |         with:
10 |           go-version-file: go.mod
11 |           check-latest: true
12 |       - run: go test -short ./...
13 |       - run: go test -short -tags purego ./...
14 |       - run: GOARCH=386 go test -c
15 |       - run: GOARCH=arm go test -c
16 |       - run: GOARCH=arm64 go test -c
17 |       - run: GOARCH=ppc64 go test -c
18 |       - run: GOARCH=ppc64le go test -c
19 |       - run: GOARCH=riscv64 go test -c
20 |       - run: GOARCH=s390x go test -c
21 |       - run: GOARCH=loong64 go test -c
22 |       - run: GOOS=js GOARCH=wasm go test -c
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2009 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Package bigmod implements constant-time big integer arithmetic modulo large
 2 | moduli. Unlike math/big, this package is suitable for implementing
 3 | security-sensitive cryptographic operations. It is a re-exported version of the
 4 | standard library package crypto/internal/fips140/bigmod used to implement
 5 | crypto/rsa amongst others.
 6 | 
 7 | v0.1.0 is up to date with Go 1.24.
 8 | 
 9 | The API is NOT stable.
10 | 


--------------------------------------------------------------------------------
/_asm/go.mod:
--------------------------------------------------------------------------------
 1 | module filippo.io/bigmod/_asm
 2 | 
 3 | go 1.25
 4 | 
 5 | require github.com/mmcloughlin/avo v0.6.0
 6 | 
 7 | require (
 8 | 	golang.org/x/mod v0.14.0 // indirect
 9 | 	golang.org/x/tools v0.16.1 // indirect
10 | )
11 | 


--------------------------------------------------------------------------------
/_asm/go.sum:
--------------------------------------------------------------------------------
1 | github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
2 | github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
3 | golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0=
4 | golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
5 | golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
6 | golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
7 | golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA=
8 | golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0=
9 | 


--------------------------------------------------------------------------------
/_asm/nat_amd64_asm.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2023 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package main
  6 | 
  7 | import (
  8 | 	"strconv"
  9 | 
 10 | 	. "github.com/mmcloughlin/avo/build"
 11 | 	. "github.com/mmcloughlin/avo/operand"
 12 | 	. "github.com/mmcloughlin/avo/reg"
 13 | )
 14 | 
 15 | //go:generate go run . -out ../nat_amd64.s -pkg bigmod
 16 | 
 17 | func main() {
 18 | 	Package("filippo.io/bigmod")
 19 | 	ConstraintExpr("!purego")
 20 | 
 21 | 	addMulVVW(1024)
 22 | 	addMulVVW(1536)
 23 | 	addMulVVW(2048)
 24 | 
 25 | 	Generate()
 26 | }
 27 | 
 28 | // addMulVVW emits the assembly implementation of addMulVVW<bits>, a fully
 29 | // unrolled routine over operands of bits/64 64-bit limbs. Judging by the
 30 | // reference Go code in the ADX comment below, the routine computes z += x * y
 31 | // and returns the final carry.
 32 | //
 33 | // Two code paths are emitted: a MULQ/ADDQ/ADCQ path, and a MULXQ/ADCXQ/ADOXQ
 34 | // path that is taken at run time when the supportADX variable equals 1.
 35 | //
 36 | // bits must be a multiple of 128: the ADX path handles two limbs per loop
 37 | // iteration, so an odd limb count would make it emit accesses to a limb past
 38 | // the end of the operands. addMulVVW panics otherwise.
 39 | func addMulVVW(bits int) {
 40 | 	if bits%128 != 0 {
 41 | 		panic("bit size unsupported")
 42 | 	}
 43 | 
 44 | 	Implement("addMulVVW" + strconv.Itoa(bits))
 45 | 
 46 | 	CMPB(Mem{Symbol: Symbol{Name: "·supportADX"}, Base: StaticBase}, Imm(1))
 47 | 	JEQ(LabelRef("adx"))
 48 | 
 49 | 	z := Mem{Base: Load(Param("z"), GP64())}
 50 | 	x := Mem{Base: Load(Param("x"), GP64())}
 51 | 	y := Load(Param("y"), GP64())
 52 | 
 53 | 	carry := GP64()
 54 | 	XORQ(carry, carry) // zero out carry
 55 | 
 56 | 	for i := 0; i < bits/64; i++ {
 57 | 		Comment("Iteration " + strconv.Itoa(i))
 58 | 		hi, lo := RDX, RAX // implicit MULQ inputs and outputs
 59 | 		MOVQ(x.Offset(i*8), lo)
 60 | 		MULQ(y)
 61 | 		ADDQ(z.Offset(i*8), lo)
 62 | 		ADCQ(Imm(0), hi)
 63 | 		ADDQ(carry, lo)
 64 | 		ADCQ(Imm(0), hi)
 65 | 		MOVQ(hi, carry)
 66 | 		MOVQ(lo, z.Offset(i*8))
 67 | 	}
 68 | 
 69 | 	Store(carry, ReturnIndex(0))
 70 | 	RET()
 71 | 
 72 | 	Label("adx")
 73 | 
 74 | 	// The ADX strategy implements the following function, where c1 and c2 are
 75 | 	// the overflow and the carry flag respectively.
 76 | 	//
 77 | 	//    func addMulVVW(z, x []uint, y uint) (carry uint) {
 78 | 	//        var c1, c2 uint
 79 | 	//        for i := range z {
 80 | 	//            hi, lo := bits.Mul(x[i], y)
 81 | 	//            lo, c1 = bits.Add(lo, z[i], c1)
 82 | 	//            z[i], c2 = bits.Add(lo, carry, c2)
 83 | 	//            carry = hi
 84 | 	//        }
 85 | 	//        return carry + c1 + c2
 86 | 	//    }
 87 | 	//
 88 | 	// The loop is fully unrolled and the hi / carry registers are alternated
 89 | 	// instead of introducing a MOV.
 90 | 
 91 | 	z = Mem{Base: Load(Param("z"), GP64())}
 92 | 	x = Mem{Base: Load(Param("x"), GP64())}
 93 | 	Load(Param("y"), RDX) // implicit source of MULXQ
 94 | 
 95 | 	carry = GP64()
 96 | 	XORQ(carry, carry) // zero out carry
 97 | 	z0 := GP64()
 98 | 	XORQ(z0, z0) // unset flags and zero out z0
 99 | 
100 | 	for i := 0; i < bits/64; i++ {
101 | 		hi, lo := GP64(), GP64()
102 | 
103 | 		Comment("Iteration " + strconv.Itoa(i))
104 | 		MULXQ(x.Offset(i*8), lo, hi)
105 | 		ADCXQ(carry, lo)
106 | 		ADOXQ(z.Offset(i*8), lo)
107 | 		MOVQ(lo, z.Offset(i*8))
108 | 
109 | 		i++ // second limb of the pair; relies on the limb count being even
110 | 
111 | 		Comment("Iteration " + strconv.Itoa(i))
112 | 		MULXQ(x.Offset(i*8), lo, carry)
113 | 		ADCXQ(hi, lo)
114 | 		ADOXQ(z.Offset(i*8), lo)
115 | 		MOVQ(lo, z.Offset(i*8))
116 | 	}
117 | 
118 | 	Comment("Add back carry flags and return")
119 | 	ADCXQ(z0, carry)
120 | 	ADOXQ(z0, carry)
121 | 
122 | 	Store(carry, ReturnIndex(0))
123 | 	RET()
124 | }
125 | 


--------------------------------------------------------------------------------
/extra_test.go:
--------------------------------------------------------------------------------
 1 | package bigmod_test
 2 | 
 3 | import (
 4 | 	"crypto"
 5 | 	"crypto/rand"
 6 | 	"crypto/rsa"
 7 | 	"testing"
 8 | 
 9 | 	"filippo.io/bigmod"
10 | )
11 | 
12 | // TestLinkWithStdlib ensures this package can be linked with the standard
13 | // library package crypto/internal/fips140/bigmod, which might have duplicate
14 | // global symbol names in the assembly files. See Issue 1.
15 | //
16 | // It calls into both this package and crypto/rsa so that both end up in the
17 | // test binary, and fails if the RSA operations report an error.
18 | func TestLinkWithStdlib(t *testing.T) {
19 | 	bigmod.NewNat()
20 | 	// Go 1.24 and later reject RSA keys smaller than 1024 bits by default.
21 | 	k, err := rsa.GenerateKey(rand.Reader, 1024)
22 | 	if err != nil {
23 | 		t.Fatal(err)
24 | 	}
25 | 	if _, err := rsa.SignPSS(rand.Reader, k, crypto.SHA256, make([]byte, 32), nil); err != nil {
26 | 		t.Fatal(err)
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module filippo.io/bigmod
2 | 
3 | go 1.23
4 | 
5 | require golang.org/x/sys v0.11.0
6 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM=
2 | golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
3 | 


--------------------------------------------------------------------------------
/nat.go:
--------------------------------------------------------------------------------
   1 | // Copyright 2021 The Go Authors. All rights reserved.
   2 | // Use of this source code is governed by a BSD-style
   3 | // license that can be found in the LICENSE file.
   4 | 
   5 | // Package bigmod implements constant-time big integer arithmetic modulo large
   6 | // moduli. Unlike math/big, this package is suitable for implementing
   7 | // security-sensitive cryptographic operations. It is a re-exported version of the
   8 | // standard library package crypto/internal/fips140/bigmod used to implement
   9 | // crypto/rsa amongst others.
  10 | //
  11 | // The API is NOT stable. The caller is responsible for ensuring that Nats are
  12 | // reduced modulo the Modulus they are used with.
  13 | package bigmod
  14 | 
  15 | import (
  16 | 	"encoding/binary"
  17 | 	"errors"
  18 | 	"math/bits"
  19 | )
  20 | 
  21 | const (
  22 | 	// _W is the size in bits of our limbs.
  23 | 	_W = bits.UintSize
  24 | 	// _S is the size in bytes of our limbs.
  25 | 	_S = _W / 8
  26 | )
  27 | 
  28 | // Note: These functions make many loops over all the words in a Nat.
  29 | // These loops used to be in assembly, invisible to -race, -asan, and -msan,
  30 | // but now they are in Go and incur significant overhead in those modes.
  31 | // To bring the old performance back, we mark all functions that loop
  32 | // over Nat words with //go:norace. Because //go:norace does not
  33 | // propagate across inlining, we must also mark functions that inline
  34 | // //go:norace functions - specifically, those that inline add, addMulVVW,
  35 | // assign, cmpGeq, rshift1, and sub.
  36 | 
  37 | // choice represents a constant-time boolean. The value of choice is always
  38 | // either 1 or 0. We use a uint instead of a bool in order to make decisions in
  39 | // constant time by turning it into a mask.
  40 | type choice uint
  41 | 
  42 | func not(c choice) choice { return 1 ^ c }
  43 | 
  44 | const yes = choice(1)
  45 | const no = choice(0)
  46 | 
  47 | // ctMask is all 1s if on is yes, and all 0s otherwise.
  48 | func ctMask(on choice) uint { return -uint(on) }
  49 | 
  50 | // ctEq returns 1 if x == y, and 0 otherwise. The execution time of this
  51 | // function does not depend on its inputs.
  52 | func ctEq(x, y uint) choice {
  53 | 	// If x != y, then either x - y or y - x will generate a carry.
  54 | 	_, c1 := bits.Sub(x, y, 0)
  55 | 	_, c2 := bits.Sub(y, x, 0)
  56 | 	return not(choice(c1 | c2))
  57 | }
  58 | 
  59 | // Nat represents an arbitrary natural number.
  60 | //
  61 | // Each Nat has an announced length, which is the number of limbs it has stored.
  62 | // Operations on this number are allowed to leak this length, but will not leak
  63 | // any information about the values contained in those limbs.
  64 | type Nat struct {
  65 | 	// limbs is little-endian in base 2^W with W = bits.UintSize.
  66 | 	limbs []uint
  67 | }
  68 | 
  69 | // preallocTarget is the size in bits of the numbers used to implement the most
  70 | // common and most performant RSA key size. It's also enough to cover some of
  71 | // the operations of key sizes up to 4096.
  72 | const preallocTarget = 2048
  73 | const preallocLimbs = (preallocTarget + _W - 1) / _W
  74 | 
  75 | // NewNat returns a new nat with a size of zero, just like new(Nat), but with
  76 | // the preallocated capacity to hold a number of up to 2048 bits.
  77 | // NewNat inlines, so the allocation can live on the stack.
  78 | func NewNat() *Nat {
  79 | 	limbs := make([]uint, 0, preallocLimbs)
  80 | 	return &Nat{limbs}
  81 | }
  82 | 
  83 | // expand expands x to n limbs, leaving its value unchanged.
  84 | func (x *Nat) expand(n int) *Nat {
  85 | 	if len(x.limbs) > n {
  86 | 		panic("bigmod: internal error: shrinking nat")
  87 | 	}
  88 | 	if cap(x.limbs) < n {
  89 | 		newLimbs := make([]uint, n)
  90 | 		copy(newLimbs, x.limbs)
  91 | 		x.limbs = newLimbs
  92 | 		return x
  93 | 	}
  94 | 	extraLimbs := x.limbs[len(x.limbs):n]
  95 | 	clear(extraLimbs)
  96 | 	x.limbs = x.limbs[:n]
  97 | 	return x
  98 | }
  99 | 
 100 | // reset returns a zero nat of n limbs, reusing x's storage if n <= cap(x.limbs).
 101 | func (x *Nat) reset(n int) *Nat {
 102 | 	if cap(x.limbs) < n {
 103 | 		x.limbs = make([]uint, n)
 104 | 		return x
 105 | 	}
 106 | 	// Clear both the returned limbs and the previously used ones.
 107 | 	clear(x.limbs[:max(n, len(x.limbs))])
 108 | 	x.limbs = x.limbs[:n]
 109 | 	return x
 110 | }
 111 | 
 112 | // resetToBytes assigns x = b, where b is a slice of big-endian bytes, resizing
 113 | // x to the appropriate size.
 114 | //
 115 | // The announced length of x is set based on the actual bit size of the input,
 116 | // ignoring leading zeroes.
 117 | func (x *Nat) resetToBytes(b []byte) *Nat {
 118 | 	x.reset((len(b) + _S - 1) / _S)
 119 | 	if err := x.setBytes(b); err != nil {
 120 | 		panic("bigmod: internal error: bad arithmetic")
 121 | 	}
 122 | 	return x.trim()
 123 | }
 124 | 
 125 | // trim reduces the size of x to match its value.
 126 | func (x *Nat) trim() *Nat {
 127 | 	// Trim most significant (trailing in little-endian) zero limbs.
 128 | 	// We assume comparison with zero (but not the branch) is constant time.
 129 | 	for i := len(x.limbs) - 1; i >= 0; i-- {
 130 | 		if x.limbs[i] != 0 {
 131 | 			break
 132 | 		}
 133 | 		x.limbs = x.limbs[:i]
 134 | 	}
 135 | 	return x
 136 | }
 137 | 
 138 | // set assigns x = y, optionally resizing x to the appropriate size.
 139 | func (x *Nat) set(y *Nat) *Nat {
 140 | 	x.reset(len(y.limbs))
 141 | 	copy(x.limbs, y.limbs)
 142 | 	return x
 143 | }
 144 | 
 145 | // Bits returns x as a little-endian slice of uint. The length of the slice
 146 | // matches the announced length of x. The result and x share the same underlying
 147 | // array.
 148 | func (x *Nat) Bits() []uint {
 149 | 	return x.limbs
 150 | }
 151 | 
 152 | // Bytes returns x as a zero-extended big-endian byte slice. The size of the
 153 | // slice will match the size of m.
 154 | //
 155 | // x must have the same size as m and it must be less than or equal to m.
 156 | func (x *Nat) Bytes(m *Modulus) []byte {
 157 | 	i := m.Size()
 158 | 	bytes := make([]byte, i)
 159 | 	for _, limb := range x.limbs {
 160 | 		for j := 0; j < _S; j++ {
 161 | 			i--
 162 | 			if i < 0 {
 163 | 				if limb == 0 {
 164 | 					break
 165 | 				}
 166 | 				panic("bigmod: modulus is smaller than nat")
 167 | 			}
 168 | 			bytes[i] = byte(limb)
 169 | 			limb >>= 8
 170 | 		}
 171 | 	}
 172 | 	return bytes
 173 | }
 174 | 
 175 | // SetBytes assigns x = b, where b is a slice of big-endian bytes.
 176 | // SetBytes returns an error if b >= m.
 177 | //
 178 | // The output will be resized to the size of m and overwritten.
 179 | //
 180 | //go:norace
 181 | func (x *Nat) SetBytes(b []byte, m *Modulus) (*Nat, error) {
 182 | 	x.resetFor(m)
 183 | 	if err := x.setBytes(b); err != nil {
 184 | 		return nil, err
 185 | 	}
 186 | 	if x.cmpGeq(m.nat) == yes {
 187 | 		return nil, errors.New("input overflows the modulus")
 188 | 	}
 189 | 	return x, nil
 190 | }
 191 | 
 192 | // SetOverflowingBytes assigns x = b, where b is a slice of big-endian bytes.
 193 | // SetOverflowingBytes returns an error if b has a longer bit length than m, but
 194 | // reduces overflowing values up to 2^⌈log2(m)⌉ - 1.
 195 | //
 196 | // The output will be resized to the size of m and overwritten.
 197 | func (x *Nat) SetOverflowingBytes(b []byte, m *Modulus) (*Nat, error) {
 198 | 	x.resetFor(m)
 199 | 	if err := x.setBytes(b); err != nil {
 200 | 		return nil, err
 201 | 	}
 202 | 	// setBytes would have returned an error if the input overflowed the limb
 203 | 	// size of the modulus, so now we only need to check if the most significant
 204 | 	// limb of x has more bits than the most significant limb of the modulus.
 205 | 	if bitLen(x.limbs[len(x.limbs)-1]) > bitLen(m.nat.limbs[len(m.nat.limbs)-1]) {
 206 | 		return nil, errors.New("input overflows the modulus size")
 207 | 	}
 208 | 	x.maybeSubtractModulus(no, m)
 209 | 	return x, nil
 210 | }
 211 | 
 212 | // bigEndianUint returns the contents of buf interpreted as a
 213 | // big-endian encoded uint value.
 214 | func bigEndianUint(buf []byte) uint {
 215 | 	if _W == 64 {
 216 | 		return uint(binary.BigEndian.Uint64(buf))
 217 | 	}
 218 | 	return uint(binary.BigEndian.Uint32(buf))
 219 | }
 220 | 
 221 | func (x *Nat) setBytes(b []byte) error {
 222 | 	i, k := len(b), 0
 223 | 	for k < len(x.limbs) && i >= _S {
 224 | 		x.limbs[k] = bigEndianUint(b[i-_S : i])
 225 | 		i -= _S
 226 | 		k++
 227 | 	}
 228 | 	for s := 0; s < _W && k < len(x.limbs) && i > 0; s += 8 {
 229 | 		x.limbs[k] |= uint(b[i-1]) << s
 230 | 		i--
 231 | 	}
 232 | 	if i > 0 {
 233 | 		return errors.New("input overflows the modulus size")
 234 | 	}
 235 | 	return nil
 236 | }
 237 | 
 238 | // SetUint assigns x = y.
 239 | //
 240 | // The output will be resized to a single limb and overwritten.
 241 | func (x *Nat) SetUint(y uint) *Nat {
 242 | 	x.reset(1)
 243 | 	x.limbs[0] = y
 244 | 	return x
 245 | }
 246 | 
 247 | // Equal returns 1 if x == y, and 0 otherwise.
 248 | //
 249 | // Both operands must have the same announced length.
 250 | //
 251 | //go:norace
 252 | func (x *Nat) Equal(y *Nat) uint {
 253 | 	// Eliminate bounds checks in the loop.
 254 | 	size := len(x.limbs)
 255 | 	xLimbs := x.limbs[:size]
 256 | 	yLimbs := y.limbs[:size]
 257 | 
 258 | 	equal := yes
 259 | 	for i := 0; i < size; i++ {
 260 | 		equal &= ctEq(xLimbs[i], yLimbs[i])
 261 | 	}
 262 | 	return uint(equal)
 263 | }
 264 | 
 265 | // IsZero returns 1 if x == 0, and 0 otherwise.
 266 | //
 267 | //go:norace
 268 | func (x *Nat) IsZero() uint {
 269 | 	// Eliminate bounds checks in the loop.
 270 | 	size := len(x.limbs)
 271 | 	xLimbs := x.limbs[:size]
 272 | 
 273 | 	zero := yes
 274 | 	for i := 0; i < size; i++ {
 275 | 		zero &= ctEq(xLimbs[i], 0)
 276 | 	}
 277 | 	return uint(zero)
 278 | }
 279 | 
 280 | // IsOne returns 1 if x == 1, and 0 otherwise.
 281 | //
 282 | //go:norace
 283 | func (x *Nat) IsOne() uint {
 284 | 	// Eliminate bounds checks in the loop.
 285 | 	size := len(x.limbs)
 286 | 	xLimbs := x.limbs[:size]
 287 | 
 288 | 	if len(xLimbs) == 0 {
 289 | 		return uint(no)
 290 | 	}
 291 | 
 292 | 	one := ctEq(xLimbs[0], 1)
 293 | 	for i := 1; i < size; i++ {
 294 | 		one &= ctEq(xLimbs[i], 0)
 295 | 	}
 296 | 	return uint(one)
 297 | }
 298 | 
 299 | // IsMinusOne returns 1 if x == -1 mod m, and 0 otherwise.
 300 | //
 301 | // The length of x must be the same as the modulus. x must already be reduced
 302 | // modulo m.
 303 | //
 304 | //go:norace
 305 | func (x *Nat) IsMinusOne(m *Modulus) uint {
 306 | 	minusOne := m.Nat()
 307 | 	minusOne.SubOne(m)
 308 | 	return x.Equal(minusOne)
 309 | }
 310 | 
 311 | // IsOdd returns 1 if x is odd, and 0 otherwise.
 312 | func (x *Nat) IsOdd() uint {
 313 | 	if len(x.limbs) == 0 {
 314 | 		return uint(no)
 315 | 	}
 316 | 	return uint(x.limbs[0] & 1)
 317 | }
 318 | 
 319 | // TrailingZeroBitsVarTime returns the number of trailing zero bits in x.
 320 | func (x *Nat) TrailingZeroBitsVarTime() uint {
 321 | 	var t uint
 322 | 	limbs := x.limbs
 323 | 	for _, l := range limbs {
 324 | 		if l == 0 {
 325 | 			t += _W
 326 | 			continue
 327 | 		}
 328 | 		t += uint(bits.TrailingZeros(l))
 329 | 		break
 330 | 	}
 331 | 	return t
 332 | }
 333 | 
 334 | // cmpGeq returns 1 if x >= y, and 0 otherwise.
 335 | //
 336 | // Both operands must have the same announced length.
 337 | //
 338 | //go:norace
 339 | func (x *Nat) cmpGeq(y *Nat) choice {
 340 | 	// Eliminate bounds checks in the loop.
 341 | 	size := len(x.limbs)
 342 | 	xLimbs := x.limbs[:size]
 343 | 	yLimbs := y.limbs[:size]
 344 | 
 345 | 	var c uint
 346 | 	for i := 0; i < size; i++ {
 347 | 		_, c = bits.Sub(xLimbs[i], yLimbs[i], c)
 348 | 	}
 349 | 	// If there was a carry, then subtracting y underflowed, so
 350 | 	// x is not greater than or equal to y.
 351 | 	return not(choice(c))
 352 | }
 353 | 
 354 | // assign sets x <- y if on == 1, and does nothing otherwise.
 355 | //
 356 | // Both operands must have the same announced length.
 357 | //
 358 | //go:norace
 359 | func (x *Nat) assign(on choice, y *Nat) *Nat {
 360 | 	// Eliminate bounds checks in the loop.
 361 | 	size := len(x.limbs)
 362 | 	xLimbs := x.limbs[:size]
 363 | 	yLimbs := y.limbs[:size]
 364 | 
 365 | 	mask := ctMask(on)
 366 | 	for i := 0; i < size; i++ {
 367 | 		xLimbs[i] ^= mask & (xLimbs[i] ^ yLimbs[i])
 368 | 	}
 369 | 	return x
 370 | }
 371 | 
 372 | // add computes x += y and returns the carry.
 373 | //
 374 | // Both operands must have the same announced length.
 375 | //
 376 | //go:norace
 377 | func (x *Nat) add(y *Nat) (c uint) {
 378 | 	// Eliminate bounds checks in the loop.
 379 | 	size := len(x.limbs)
 380 | 	xLimbs := x.limbs[:size]
 381 | 	yLimbs := y.limbs[:size]
 382 | 
 383 | 	for i := 0; i < size; i++ {
 384 | 		xLimbs[i], c = bits.Add(xLimbs[i], yLimbs[i], c)
 385 | 	}
 386 | 	return
 387 | }
 388 | 
 389 | // sub computes x -= y. It returns the borrow of the subtraction.
 390 | //
 391 | // Both operands must have the same announced length.
 392 | //
 393 | //go:norace
 394 | func (x *Nat) sub(y *Nat) (c uint) {
 395 | 	// Eliminate bounds checks in the loop.
 396 | 	size := len(x.limbs)
 397 | 	xLimbs := x.limbs[:size]
 398 | 	yLimbs := y.limbs[:size]
 399 | 
 400 | 	for i := 0; i < size; i++ {
 401 | 		xLimbs[i], c = bits.Sub(xLimbs[i], yLimbs[i], c)
 402 | 	}
 403 | 	return
 404 | }
 405 | 
 406 | // ShiftRightVarTime sets x = x >> n.
 407 | //
 408 | // The announced length of x is unchanged.
 409 | //
 410 | //go:norace
 411 | func (x *Nat) ShiftRightVarTime(n uint) *Nat {
 412 | 	// Eliminate bounds checks in the loop.
 413 | 	size := len(x.limbs)
 414 | 	xLimbs := x.limbs[:size]
 415 | 
 416 | 	shift := int(n % _W)
 417 | 	shiftLimbs := int(n / _W)
 418 | 
 419 | 	var shiftedLimbs []uint
 420 | 	if shiftLimbs < size {
 421 | 		shiftedLimbs = xLimbs[shiftLimbs:]
 422 | 	}
 423 | 
 424 | 	for i := range xLimbs {
 425 | 		if i >= len(shiftedLimbs) {
 426 | 			xLimbs[i] = 0
 427 | 			continue
 428 | 		}
 429 | 
 430 | 		xLimbs[i] = shiftedLimbs[i] >> shift
 431 | 		if i+1 < len(shiftedLimbs) {
 432 | 			xLimbs[i] |= shiftedLimbs[i+1] << (_W - shift)
 433 | 		}
 434 | 	}
 435 | 
 436 | 	return x
 437 | }
 438 | 
 439 | // BitLenVarTime returns the actual size of x in bits.
 440 | //
 441 | // The actual size of x (but nothing more) leaks through timing side-channels.
 442 | // Note that this is ordinarily secret, as opposed to the announced size of x.
 443 | func (x *Nat) BitLenVarTime() int {
 444 | 	// Eliminate bounds checks in the loop.
 445 | 	size := len(x.limbs)
 446 | 	xLimbs := x.limbs[:size]
 447 | 
 448 | 	for i := size - 1; i >= 0; i-- {
 449 | 		if xLimbs[i] != 0 {
 450 | 			return i*_W + bitLen(xLimbs[i])
 451 | 		}
 452 | 	}
 453 | 	return 0
 454 | }
 455 | 
 456 | // bitLen reports the number of bits needed to represent n, like bits.Len, but
 457 | // it leaks only that bit length and not the value of n. bits.Len and
 458 | // bits.LeadingZeros use a lookup table for the low-order bits on some
 459 | // architectures.
 460 | func bitLen(n uint) int {
 461 | 	width := 0
 462 | 	// Comparison to zero is assumed, here and elsewhere, to be constant time
 463 | 	// with respect to different non-zero values.
 464 | 	for ; n != 0; n >>= 1 {
 465 | 		width++
 466 | 	}
 467 | 	return width
 468 | }
 469 | 
 470 | // Modulus is used for modular arithmetic, precomputing relevant constants.
 471 | //
 472 | // A Modulus can leak the exact number of bits needed to store its value
 473 | // and is stored without padding. Its actual value is still kept secret.
 474 | type Modulus struct {
 475 | 	// The underlying natural number for this modulus.
 476 | 	//
 477 | 	// This will be stored without any padding, and shouldn't alias with any
 478 | 	// other natural number being used.
 479 | 	nat *Nat
 480 | 
 481 | 	// If m is even, the following fields are not set.
 482 | 	odd   bool // set by newModulus when the modulus is odd
 483 | 	m0inv uint // -nat.limbs[0]⁻¹ mod 2^_W
 484 | 	rr    *Nat // R*R for montgomeryRepresentation, with R = 2^(_W * len(nat.limbs))
 485 | }
 486 | 
 487 | // rr returns R*R with R = 2^(_W * n) and n = len(m.nat.limbs).
 488 | func rr(m *Modulus) *Nat {
 489 | 	rr := NewNat().ExpandFor(m)
 490 | 	n := uint(len(rr.limbs))
 491 | 	mLen := uint(m.BitLen())
 492 | 	logR := _W * n
 493 | 
 494 | 	// We start by computing R = 2^(_W * n) mod m. We can get pretty close, to
 495 | 	// 2^⌊log₂m⌋, by setting the highest bit we can without having to reduce.
 496 | 	rr.limbs[n-1] = 1 << ((mLen - 1) % _W)
 497 | 	// Then we double until we reach 2^(_W * n).
 498 | 	for i := mLen - 1; i < logR; i++ {
 499 | 		rr.Add(rr, m)
 500 | 	}
 501 | 
 502 | 	// Next we need to get from R to 2^(_W * n) R mod m (aka from one to R in
 503 | 	// the Montgomery domain, meaning we can use Montgomery multiplication now).
 504 | 	// We could do that by doubling _W * n times, or with a square-and-double
 505 | 	// chain log2(_W * n) long. Turns out the fastest thing is to start out with
 506 | 	// doublings, and switch to square-and-double once the exponent is large
 507 | 	// enough to justify the cost of the multiplications.
 508 | 
 509 | 	// The threshold is selected experimentally as a linear function of n.
 510 | 	threshold := n / 4
 511 | 
 512 | 	// We calculate how many of the most-significant bits of the exponent we can
 513 | 	// compute before crossing the threshold, and we do it with doublings.
 514 | 	i := bits.UintSize
 515 | 	for logR>>i <= threshold {
 516 | 		i--
 517 | 	}
 518 | 	for k := uint(0); k < logR>>i; k++ {
 519 | 		rr.Add(rr, m)
 520 | 	}
 521 | 
 522 | 	// Then we process the remaining bits of the exponent with a
 523 | 	// square-and-double chain.
 524 | 	for i > 0 {
 525 | 		rr.montgomeryMul(rr, rr, m)
 526 | 		i--
 527 | 		if logR>>i&1 != 0 {
 528 | 			rr.Add(rr, m)
 529 | 		}
 530 | 	}
 531 | 
 532 | 	return rr
 533 | }
 534 | 
 535 | // minusInverseModW computes -x⁻¹ mod 2^_W for an odd x.
 536 | //
 537 | // The result is a constant that is precomputed for use in Montgomery
 538 | // multiplication.
 539 | func minusInverseModW(x uint) uint {
 540 | 	// Each pass of the loop below doubles the number of correct
 541 | 	// least-significant bits of the inverse held in inv. Three bits are correct
 542 | 	// from the start (1⁻¹ = 1, 3⁻¹ = 3, 5⁻¹ = 5, and 7⁻¹ = 7 mod 8), so five
 543 | 	// passes are enough for 64 bits (wasting only one pass for 32 bits).
 544 | 	//
 545 | 	// See https://crypto.stackexchange.com/a/47496.
 546 | 	inv := x
 547 | 	for pass := 5; pass > 0; pass-- {
 548 | 		inv *= 2 - x*inv
 549 | 	}
 550 | 	return -inv
 551 | }
 552 | 
 553 | // NewModulus creates a new Modulus from a slice of big-endian bytes. The
 554 | // modulus must be greater than one.
 555 | //
 556 | // The number of significant bits and whether the modulus is even is leaked
 557 | // through timing side-channels.
 558 | func NewModulus(b []byte) (*Modulus, error) {
 559 | 	n := NewNat().resetToBytes(b)
 560 | 	return newModulus(n)
 561 | }
 562 | 
 563 | // NewModulusProduct creates a new Modulus from the product of two numbers
 564 | // represented as big-endian byte slices. The result must be greater than one.
 565 | //
 566 | //go:norace
 567 | func NewModulusProduct(a, b []byte) (*Modulus, error) {
 568 | 	x := NewNat().resetToBytes(a)
 569 | 	y := NewNat().resetToBytes(b)
 570 | 	n := NewNat().reset(len(x.limbs) + len(y.limbs))
 571 | 	for i := range y.limbs {
 572 | 		n.limbs[i+len(x.limbs)] = addMulVVW(n.limbs[i:i+len(x.limbs)], x.limbs, y.limbs[i])
 573 | 	}
 574 | 	return newModulus(n.trim())
 575 | }
 576 | 
 577 | func newModulus(n *Nat) (*Modulus, error) {
 578 | 	m := &Modulus{nat: n}
 579 | 	if m.nat.IsZero() == 1 || m.nat.IsOne() == 1 {
 580 | 		return nil, errors.New("modulus must be > 1")
 581 | 	}
 582 | 	if m.nat.IsOdd() == 1 {
 583 | 		m.odd = true
 584 | 		m.m0inv = minusInverseModW(m.nat.limbs[0])
 585 | 		m.rr = rr(m)
 586 | 	}
 587 | 	return m, nil
 588 | }
 589 | 
 590 | // Size returns the size of m in bytes.
 591 | func (m *Modulus) Size() int {
 592 | 	return (m.BitLen() + 7) / 8
 593 | }
 594 | 
 595 | // BitLen returns the size of m in bits.
 596 | func (m *Modulus) BitLen() int {
 597 | 	return m.nat.BitLenVarTime()
 598 | }
 599 | 
 600 | // Nat returns m as a Nat.
 601 | func (m *Modulus) Nat() *Nat {
 602 | 	// Make a copy so that the caller can't modify m.nat or alias it with
 603 | 	// another Nat in a modulus operation.
 604 | 	n := NewNat()
 605 | 	n.set(m.nat)
 606 | 	return n
 607 | }
 608 | 
 609 | // shiftIn calculates x = x << _W + y mod m.
 610 | //
 611 | // This assumes that x is already reduced mod m.
 612 | //
 613 | //go:norace
 614 | func (x *Nat) shiftIn(y uint, m *Modulus) *Nat {
 615 | 	d := NewNat().resetFor(m)
 616 | 
 617 | 	// Eliminate bounds checks in the loop.
 618 | 	size := len(m.nat.limbs)
 619 | 	xLimbs := x.limbs[:size]
 620 | 	dLimbs := d.limbs[:size]
 621 | 	mLimbs := m.nat.limbs[:size]
 622 | 
 623 | 	// Each iteration of this loop computes x = 2x + b mod m, where b is a bit
 624 | 	// from y. Effectively, it left-shifts x and adds y one bit at a time,
 625 | 	// reducing it every time.
 626 | 	//
 627 | 	// To do the reduction, each iteration computes both 2x + b and 2x + b - m.
 628 | 	// The next iteration (and finally the return line) will use either result
 629 | 	// based on whether 2x + b overflows m.
 630 | 	needSubtraction := no
 631 | 	for i := _W - 1; i >= 0; i-- {
 632 | 		carry := (y >> i) & 1
 633 | 		var borrow uint
 634 | 		mask := ctMask(needSubtraction)
 635 | 		for i := 0; i < size; i++ {
 636 | 			l := xLimbs[i] ^ (mask & (xLimbs[i] ^ dLimbs[i]))
 637 | 			xLimbs[i], carry = bits.Add(l, l, carry)
 638 | 			dLimbs[i], borrow = bits.Sub(xLimbs[i], mLimbs[i], borrow)
 639 | 		}
 640 | 		// Like in maybeSubtractModulus, we need the subtraction if either it
 641 | 		// didn't underflow (meaning 2x + b ≥ m) or if computing 2x + b
 642 | 		// overflowed (meaning 2x + b ≥ 2^(_W*n) > m).
 643 | 		needSubtraction = not(choice(borrow)) | choice(carry)
 644 | 	}
 645 | 	return x.assign(needSubtraction, d)
 646 | }
 647 | 
 648 | // Mod calculates out = y mod m.
 649 | //
 650 | // This works regardless how large the value of y is.
 651 | //
 652 | // The output will be resized to the size of m and overwritten.
 653 | //
 654 | //go:norace
 655 | func (x *Nat) Mod(y *Nat, m *Modulus) *Nat {
 656 | 	out, x := x, y
 657 | 	out.resetFor(m)
 658 | 	// Working our way from the most significant to the least significant limb,
 659 | 	// we can insert each limb at the least significant position, shifting all
 660 | 	// previous limbs left by _W. This way each limb will get shifted by the
 661 | 	// correct number of bits. We can insert at least N - 1 limbs without
 662 | 	// overflowing m. After that, we need to reduce every time we shift.
 663 | 	i := len(x.limbs) - 1
 664 | 	// For the first N - 1 limbs we can skip the actual shifting and position
 665 | 	// them at the shifted position, which starts at min(N - 2, i).
 666 | 	start := len(m.nat.limbs) - 2
 667 | 	if i < start {
 668 | 		start = i
 669 | 	}
 670 | 	for j := start; j >= 0; j-- {
 671 | 		out.limbs[j] = x.limbs[i]
 672 | 		i--
 673 | 	}
 674 | 	// We shift in the remaining limbs, reducing modulo m each time.
 675 | 	for i >= 0 {
 676 | 		out.shiftIn(x.limbs[i], m)
 677 | 		i--
 678 | 	}
 679 | 	return out
 680 | }
 681 | 
 682 | // ExpandFor ensures x has the right size to work with operations modulo m.
 683 | //
 684 | // The announced size of x must be smaller than or equal to that of m.
 685 | func (x *Nat) ExpandFor(m *Modulus) *Nat {
 686 | 	return x.expand(len(m.nat.limbs))
 687 | }
 688 | 
 689 | // resetFor ensures x has the right size to work with operations modulo m.
 690 | //
 691 | // x is zeroed and may start at any size.
 692 | func (x *Nat) resetFor(m *Modulus) *Nat {
 693 | 	return x.reset(len(m.nat.limbs))
 694 | }
 695 | 
 696 | // maybeSubtractModulus computes x -= m if and only if x >= m or if "always" is yes.
 697 | //
 698 | // It can be used to reduce modulo m a value up to 2m - 1, which is a common
 699 | // range for results computed by higher level operations.
 700 | //
 701 | // always is usually a carry that indicates that the operation that produced x
 702 | // overflowed its size, meaning abstractly x > 2^_W*n > m even if x < m.
 703 | //
 704 | // x and m operands must have the same announced length.
 705 | //
 706 | //go:norace
 707 | func (x *Nat) maybeSubtractModulus(always choice, m *Modulus) {
 708 | 	t := NewNat().set(x)
 709 | 	underflow := t.sub(m.nat)
 710 | 	// We keep the result if x - m didn't underflow (meaning x >= m)
 711 | 	// or if always was set.
 712 | 	keep := not(choice(underflow)) | choice(always)
 713 | 	x.assign(keep, t)
 714 | }
 715 | 
 716 | // Sub computes x = x - y mod m.
 717 | //
 718 | // The length of both operands must be the same as the modulus. Both operands
 719 | // must already be reduced modulo m.
 720 | //
 721 | //go:norace
 722 | func (x *Nat) Sub(y *Nat, m *Modulus) *Nat {
 723 | 	underflow := x.sub(y)
 724 | 	// If the subtraction underflowed, add m.
 725 | 	t := NewNat().set(x)
 726 | 	t.add(m.nat)
 727 | 	x.assign(choice(underflow), t)
 728 | 	return x
 729 | }
 730 | 
 731 | // SubOne computes x = x - 1 mod m.
 732 | //
 733 | // The length of x must be the same as the modulus.
 734 | func (x *Nat) SubOne(m *Modulus) *Nat {
 735 | 	one := NewNat().ExpandFor(m)
 736 | 	one.limbs[0] = 1
 737 | 	// Sub asks for x to be reduced modulo m, while SubOne doesn't, but when
 738 | 	// y = 1, it works, and this is an internal use.
 739 | 	return x.Sub(one, m)
 740 | }
 741 | 
 742 | // Add computes x = x + y mod m.
 743 | //
 744 | // The length of both operands must be the same as the modulus. Both operands
 745 | // must already be reduced modulo m.
 746 | //
 747 | //go:norace
 748 | func (x *Nat) Add(y *Nat, m *Modulus) *Nat {
 749 | 	overflow := x.add(y)
 750 | 	x.maybeSubtractModulus(choice(overflow), m)
 751 | 	return x
 752 | }
 753 | 
 754 | // montgomeryRepresentation calculates x = x * R mod m, with R = 2^(_W * n) and
 755 | // n = len(m.nat.limbs).
 756 | //
 757 | // Faster Montgomery multiplication replaces standard modular multiplication for
 758 | // numbers in this representation.
 759 | //
 760 | // This assumes that x is already reduced mod m.
 761 | func (x *Nat) montgomeryRepresentation(m *Modulus) *Nat {
 762 | 	// A Montgomery multiplication (which computes a * b / R) by R * R works out
 763 | 	// to a multiplication by R, which takes the value out of the Montgomery domain.
 764 | 	return x.montgomeryMul(x, m.rr, m)
 765 | }
 766 | 
 767 | // montgomeryReduction calculates x = x / R mod m, with R = 2^(_W * n) and
 768 | // n = len(m.nat.limbs).
 769 | //
 770 | // This assumes that x is already reduced mod m.
 771 | func (x *Nat) montgomeryReduction(m *Modulus) *Nat {
 772 | 	// By Montgomery multiplying with 1 not in Montgomery representation, we
 773 | 	// convert out back from Montgomery representation, because it works out to
 774 | 	// dividing by R.
 775 | 	one := NewNat().ExpandFor(m)
 776 | 	one.limbs[0] = 1
 777 | 	return x.montgomeryMul(x, one, m)
 778 | }
 779 | 
// montgomeryMul calculates x = a * b / R mod m, with R = 2^(_W * n) and
// n = len(m.nat.limbs), also known as a Montgomery multiplication.
//
// All inputs should be the same length and already reduced modulo m.
// x will be resized to the size of m and overwritten.
//
// The product is accumulated in a separate scratch slice T and x is only
// written after the main loop has finished reading a and b, which is what lets
// callers in this file pass x itself as a and/or b (e.g.
// out.montgomeryMul(out, out, m)).
//
// The return value is x.
//
//go:norace
func (x *Nat) montgomeryMul(a *Nat, b *Nat, m *Modulus) *Nat {
	n := len(m.nat.limbs)
	// Reslicing to n both enforces the minimum operand length (it panics on
	// shorter operands) and gives the compiler a known length to work with.
	mLimbs := m.nat.limbs[:n]
	aLimbs := a.limbs[:n]
	bLimbs := b.limbs[:n]

	switch n {
	default:
		// Attempt to use a stack-allocated backing array.
		T := make([]uint, 0, preallocLimbs*2)
		if cap(T) < n*2 {
			T = make([]uint, 0, n*2)
		}
		T = T[:n*2]

		// This loop implements Word-by-Word Montgomery Multiplication, as
		// described in Algorithm 4 (Fig. 3) of "Efficient Software
		// Implementations of Modular Exponentiation" by Shay Gueron
		// [https://eprint.iacr.org/2011/239.pdf].
		var c uint
		for i := 0; i < n; i++ {
			_ = T[n+i] // bounds check elimination hint

			// Step 1 (T = a × b) is computed as a large pen-and-paper column
			// multiplication of two numbers with n base-2^_W digits. If we just
			// wanted to produce 2n-wide T, we would do
			//
			//   for i := 0; i < n; i++ {
			//       d := bLimbs[i]
			//       T[n+i] = addMulVVW(T[i:n+i], aLimbs, d)
			//   }
			//
			// where d is a digit of the multiplier, T[i:n+i] is the shifted
			// position of the product of that digit, and T[n+i] is the final carry.
			// Note that T[i] isn't modified after processing the i-th digit.
			//
			// Instead of running two loops, one for Step 1 and one for Steps 2–6,
			// the result of Step 1 is computed during the next loop. This is
			// possible because each iteration only uses T[i] in Step 2 and then
			// discards it in Step 6.
			d := bLimbs[i]
			c1 := addMulVVW(T[i:n+i], aLimbs, d)

			// Step 6 is replaced by shifting the virtual window we operate
			// over: T of the algorithm is T[i:] for us. That means that T1 in
			// Step 2 (T mod 2^_W) is simply T[i]. k0 in Step 3 is our m0inv.
			Y := T[i] * m.m0inv

			// Step 4 and 5 add Y × m to T, which as mentioned above is stored
			// at T[i:]. The two carries (from a × d and Y × m) are added up in
			// the next word T[n+i], and the carry bit from that addition is
			// brought forward to the next iteration.
			c2 := addMulVVW(T[i:n+i], mLimbs, Y)
			T[n+i], c = bits.Add(c1, c2, c)
		}

		// Finally for Step 7 we copy the final T window into x, and subtract m
		// if necessary (which as explained in maybeSubtractModulus can be the
		// case both if x >= m, or if x overflowed).
		//
		// The paper suggests in Section 4 that we can do an "Almost Montgomery
		// Multiplication" by subtracting only in the overflow case, but the
		// cost is very similar since the constant time subtraction tells us if
		// x >= m as a side effect, and taking care of the broken invariant is
		// highly undesirable (see https://go.dev/issue/13907).
		copy(x.reset(n).limbs, T[n:])
		x.maybeSubtractModulus(choice(c), m)

	// The following specialized cases follow the exact same algorithm, but
	// optimized for the sizes most used in RSA. addMulVVW is implemented in
	// assembly with loop unrolling depending on the architecture and bounds
	// checks are removed by the compiler thanks to the constant size.
	//
	// Each case shadows n with a constant of the same value, and hands the
	// fixed-size helpers pointers to the first limb of each operand instead of
	// slices.
	case 1024 / _W:
		const n = 1024 / _W // compiler hint
		T := make([]uint, n*2)
		var c uint
		for i := 0; i < n; i++ {
			d := bLimbs[i]
			c1 := addMulVVW1024(&T[i], &aLimbs[0], d)
			Y := T[i] * m.m0inv
			c2 := addMulVVW1024(&T[i], &mLimbs[0], Y)
			T[n+i], c = bits.Add(c1, c2, c)
		}
		copy(x.reset(n).limbs, T[n:])
		x.maybeSubtractModulus(choice(c), m)

	case 1536 / _W:
		const n = 1536 / _W // compiler hint
		T := make([]uint, n*2)
		var c uint
		for i := 0; i < n; i++ {
			d := bLimbs[i]
			c1 := addMulVVW1536(&T[i], &aLimbs[0], d)
			Y := T[i] * m.m0inv
			c2 := addMulVVW1536(&T[i], &mLimbs[0], Y)
			T[n+i], c = bits.Add(c1, c2, c)
		}
		copy(x.reset(n).limbs, T[n:])
		x.maybeSubtractModulus(choice(c), m)

	case 2048 / _W:
		const n = 2048 / _W // compiler hint
		T := make([]uint, n*2)
		var c uint
		for i := 0; i < n; i++ {
			d := bLimbs[i]
			c1 := addMulVVW2048(&T[i], &aLimbs[0], d)
			Y := T[i] * m.m0inv
			c2 := addMulVVW2048(&T[i], &mLimbs[0], Y)
			T[n+i], c = bits.Add(c1, c2, c)
		}
		copy(x.reset(n).limbs, T[n:])
		x.maybeSubtractModulus(choice(c), m)
	}

	return x
}
 904 | 
 905 | // addMulVVW multiplies the multi-word value x by the single-word value y,
 906 | // adding the result to the multi-word value z and returning the final carry.
 907 | // It can be thought of as one row of a pen-and-paper column multiplication.
 908 | //
 909 | //go:norace
 910 | func addMulVVW(z, x []uint, y uint) (carry uint) {
 911 | 	_ = x[len(z)-1] // bounds check elimination hint
 912 | 	for i := range z {
 913 | 		hi, lo := bits.Mul(x[i], y)
 914 | 		lo, c := bits.Add(lo, z[i], 0)
 915 | 		// We use bits.Add with zero to get an add-with-carry instruction that
 916 | 		// absorbs the carry from the previous bits.Add.
 917 | 		hi, _ = bits.Add(hi, 0, c)
 918 | 		lo, c = bits.Add(lo, carry, 0)
 919 | 		hi, _ = bits.Add(hi, 0, c)
 920 | 		carry = hi
 921 | 		z[i] = lo
 922 | 	}
 923 | 	return carry
 924 | }
 925 | 
 926 | // Mul calculates x = x * y mod m.
 927 | //
 928 | // The length of both operands must be the same as the modulus. Both operands
 929 | // must already be reduced modulo m.
 930 | //
 931 | //go:norace
 932 | func (x *Nat) Mul(y *Nat, m *Modulus) *Nat {
 933 | 	if m.odd {
 934 | 		// A Montgomery multiplication by a value out of the Montgomery domain
 935 | 		// takes the result out of Montgomery representation.
 936 | 		xR := NewNat().set(x).montgomeryRepresentation(m) // xR = x * R mod m
 937 | 		return x.montgomeryMul(xR, y, m)                  // x = xR * y / R mod m
 938 | 	}
 939 | 
 940 | 	n := len(m.nat.limbs)
 941 | 	xLimbs := x.limbs[:n]
 942 | 	yLimbs := y.limbs[:n]
 943 | 
 944 | 	switch n {
 945 | 	default:
 946 | 		// Attempt to use a stack-allocated backing array.
 947 | 		T := make([]uint, 0, preallocLimbs*2)
 948 | 		if cap(T) < n*2 {
 949 | 			T = make([]uint, 0, n*2)
 950 | 		}
 951 | 		T = T[:n*2]
 952 | 
 953 | 		// T = x * y
 954 | 		for i := 0; i < n; i++ {
 955 | 			T[n+i] = addMulVVW(T[i:n+i], xLimbs, yLimbs[i])
 956 | 		}
 957 | 
 958 | 		// x = T mod m
 959 | 		return x.Mod(&Nat{limbs: T}, m)
 960 | 
 961 | 	// The following specialized cases follow the exact same algorithm, but
 962 | 	// optimized for the sizes most used in RSA. See montgomeryMul for details.
 963 | 	case 1024 / _W:
 964 | 		const n = 1024 / _W // compiler hint
 965 | 		T := make([]uint, n*2)
 966 | 		for i := 0; i < n; i++ {
 967 | 			T[n+i] = addMulVVW1024(&T[i], &xLimbs[0], yLimbs[i])
 968 | 		}
 969 | 		return x.Mod(&Nat{limbs: T}, m)
 970 | 	case 1536 / _W:
 971 | 		const n = 1536 / _W // compiler hint
 972 | 		T := make([]uint, n*2)
 973 | 		for i := 0; i < n; i++ {
 974 | 			T[n+i] = addMulVVW1536(&T[i], &xLimbs[0], yLimbs[i])
 975 | 		}
 976 | 		return x.Mod(&Nat{limbs: T}, m)
 977 | 	case 2048 / _W:
 978 | 		const n = 2048 / _W // compiler hint
 979 | 		T := make([]uint, n*2)
 980 | 		for i := 0; i < n; i++ {
 981 | 			T[n+i] = addMulVVW2048(&T[i], &xLimbs[0], yLimbs[i])
 982 | 		}
 983 | 		return x.Mod(&Nat{limbs: T}, m)
 984 | 	}
 985 | }
 986 | 
 987 | // Exp calculates x = y^e mod m.
 988 | //
 989 | // The exponent e is represented in big-endian order. The output will be resized
 990 | // to the size of m and overwritten. y must already be reduced modulo m.
 991 | //
 992 | // m must be odd, or Exp will panic.
 993 | //
 994 | //go:norace
 995 | func (x *Nat) Exp(y *Nat, e []byte, m *Modulus) *Nat {
 996 | 	out, x := x, y
 997 | 
 998 | 	if !m.odd {
 999 | 		panic("bigmod: modulus for Exp must be odd")
1000 | 	}
1001 | 
1002 | 	// We use a 4 bit window. For our RSA workload, 4 bit windows are faster
1003 | 	// than 2 bit windows, but use an extra 12 nats worth of scratch space.
1004 | 	// Using bit sizes that don't divide 8 are more complex to implement, but
1005 | 	// are likely to be more efficient if necessary.
1006 | 
1007 | 	table := [(1 << 4) - 1]*Nat{ // table[i] = x ^ (i+1)
1008 | 		// newNat calls are unrolled so they are allocated on the stack.
1009 | 		NewNat(), NewNat(), NewNat(), NewNat(), NewNat(),
1010 | 		NewNat(), NewNat(), NewNat(), NewNat(), NewNat(),
1011 | 		NewNat(), NewNat(), NewNat(), NewNat(), NewNat(),
1012 | 	}
1013 | 	table[0].set(x).montgomeryRepresentation(m)
1014 | 	for i := 1; i < len(table); i++ {
1015 | 		table[i].montgomeryMul(table[i-1], table[0], m)
1016 | 	}
1017 | 
1018 | 	out.resetFor(m)
1019 | 	out.limbs[0] = 1
1020 | 	out.montgomeryRepresentation(m)
1021 | 	tmp := NewNat().ExpandFor(m)
1022 | 	for _, b := range e {
1023 | 		for _, j := range []int{4, 0} {
1024 | 			// Square four times. Optimization note: this can be implemented
1025 | 			// more efficiently than with generic Montgomery multiplication.
1026 | 			out.montgomeryMul(out, out, m)
1027 | 			out.montgomeryMul(out, out, m)
1028 | 			out.montgomeryMul(out, out, m)
1029 | 			out.montgomeryMul(out, out, m)
1030 | 
1031 | 			// Select x^k in constant time from the table.
1032 | 			k := uint((b >> j) & 0b1111)
1033 | 			for i := range table {
1034 | 				tmp.assign(ctEq(k, uint(i+1)), table[i])
1035 | 			}
1036 | 
1037 | 			// Multiply by x^k, discarding the result if k = 0.
1038 | 			tmp.montgomeryMul(out, tmp, m)
1039 | 			out.assign(not(ctEq(k, 0)), tmp)
1040 | 		}
1041 | 	}
1042 | 
1043 | 	return out.montgomeryReduction(m)
1044 | }
1045 | 
1046 | // ExpShortVarTime calculates out = x^e mod m.
1047 | //
1048 | // The output will be resized to the size of m and overwritten. x must already
1049 | // be reduced modulo m. This leaks the exponent through timing side-channels.
1050 | //
1051 | // m must be odd, or ExpShortVarTime will panic.
1052 | func (x *Nat) ExpShortVarTime(y *Nat, e uint, m *Modulus) *Nat {
1053 | 	out, x := x, y
1054 | 
1055 | 	if !m.odd {
1056 | 		panic("bigmod: modulus for ExpShortVarTime must be odd")
1057 | 	}
1058 | 	// For short exponents, precomputing a table and using a window like in Exp
1059 | 	// doesn't pay off. Instead, we do a simple conditional square-and-multiply
1060 | 	// chain, skipping the initial run of zeroes.
1061 | 	xR := NewNat().set(x).montgomeryRepresentation(m)
1062 | 	out.set(xR)
1063 | 	for i := bits.UintSize - bits.Len(e) + 1; i < bits.UintSize; i++ {
1064 | 		out.montgomeryMul(out, out, m)
1065 | 		if k := (e >> (bits.UintSize - i - 1)) & 1; k != 0 {
1066 | 			out.montgomeryMul(out, xR, m)
1067 | 		}
1068 | 	}
1069 | 	return out.montgomeryReduction(m)
1070 | }
1071 | 
1072 | // InverseVarTime calculates x = a⁻¹ mod m and returns (x, true) if a is
1073 | // invertible. Otherwise, InverseVarTime returns (x, false) and x is not
1074 | // modified.
1075 | //
1076 | // a must be reduced modulo m, but doesn't need to have the same size. The
1077 | // output will be resized to the size of m and overwritten.
1078 | //
1079 | //go:norace
1080 | func (x *Nat) InverseVarTime(a *Nat, m *Modulus) (*Nat, bool) {
1081 | 	u, A, err := extendedGCD(a, m.nat)
1082 | 	if err != nil {
1083 | 		return x, false
1084 | 	}
1085 | 	if u.IsOne() == 0 {
1086 | 		return x, false
1087 | 	}
1088 | 	return x.set(A), true
1089 | }
1090 | 
1091 | // GCDVarTime calculates x = GCD(a, b) where at least one of a or b is odd, and
1092 | // both are non-zero. If GCDVarTime returns an error, x is not modified.
1093 | //
1094 | // The output will be resized to the size of the larger of a and b.
1095 | func (x *Nat) GCDVarTime(a, b *Nat) (*Nat, error) {
1096 | 	u, _, err := extendedGCD(a, b)
1097 | 	if err != nil {
1098 | 		return nil, err
1099 | 	}
1100 | 	return x.set(u), nil
1101 | }
1102 | 
// extendedGCD computes u and A such that u = GCD(a, m) and u = A*a - B*m.
//
// u will have the size of the larger of a and m, and A will have the size of m.
//
// It is an error if either a or m is zero, or if they are both even. On error
// the returned u and A are nil.
//
// The inputs a and m are only read; all work is done on copies.
func extendedGCD(a, m *Nat) (u, A *Nat, err error) {
	// This is the extended binary GCD algorithm described in the Handbook of
	// Applied Cryptography, Algorithm 14.61, adapted by BoringSSL to bound
	// coefficients and avoid negative numbers. For more details and proof of
	// correctness, see https://github.com/mit-plv/fiat-crypto/pull/333/files.
	//
	// Following the proof linked in the PR above, the changes are:
	//
	// 1. Negate [B] and [C] so they are positive. The invariant now involves a
	//    subtraction.
	// 2. If step 2 (both [x] and [y] are even) runs, abort immediately. This
	//    case needs to be handled by the caller.
	// 3. Subtract copies of [x] and [y] as needed in step 6 (both [u] and [v]
	//    are odd) so coefficients stay in bounds.
	// 4. Replace the [u >= v] check with [u > v]. This changes the end
	//    condition to [v = 0] rather than [u = 0]. This saves an extra
	//    subtraction due to which coefficients were negated.
	// 5. Rename x and y to a and n (n is called m in this function), to
	//    capture that one is a modulus.
	// 6. Rearrange steps 4 through 6 slightly. Merge the loops in steps 4 and
	//    5 into the main loop (step 7's goto), and move step 6 to the start of
	//    the loop iteration, ensuring each loop iteration halves at least one
	//    value.
	//
	// Note this algorithm does not handle either input being zero.

	if a.IsZero() == 1 || m.IsZero() == 1 {
		return nil, nil, errors.New("extendedGCD: a or m is zero")
	}
	if a.IsOdd() == 0 && m.IsOdd() == 0 {
		return nil, nil, errors.New("extendedGCD: both a and m are even")
	}

	// u and v are working copies of a and m, padded to a common size so they
	// can be compared and subtracted from each other.
	size := max(len(a.limbs), len(m.limbs))
	u = NewNat().set(a).expand(size)
	v := NewNat().set(m).expand(size)

	// A and C are sized like m, B and D like a, matching the bounds listed
	// below. The starting coefficients A = 1, B = 0, C = 0, D = 1 make the
	// two identities below hold for u = a and v = m.
	A = NewNat().reset(len(m.limbs))
	A.limbs[0] = 1
	B := NewNat().reset(len(a.limbs))
	C := NewNat().reset(len(m.limbs))
	D := NewNat().reset(len(a.limbs))
	D.limbs[0] = 1

	// Before and after each loop iteration, the following hold:
	//
	//   u = A*a - B*m
	//   v = D*m - C*a
	//   0 < u <= a
	//   0 <= v <= m
	//   0 <= A < m
	//   0 <= B <= a
	//   0 <= C < m
	//   0 <= D <= a
	//
	// After each loop iteration, u and v only get smaller, and at least one of
	// them shrinks by at least a factor of two.
	for {
		// If both u and v are odd, subtract the smaller from the larger.
		// If u = v, we need to subtract from v to hit the modified exit condition.
		//
		// The coefficients are updated with modular additions, using a
		// throwaway Modulus that only has its nat field set.
		if u.IsOdd() == 1 && v.IsOdd() == 1 {
			if v.cmpGeq(u) == 0 {
				u.sub(v)
				A.Add(C, &Modulus{nat: m})
				B.Add(D, &Modulus{nat: a})
			} else {
				v.sub(u)
				C.Add(A, &Modulus{nat: m})
				D.Add(B, &Modulus{nat: a})
			}
		}

		// Exactly one of u and v is now even.
		if u.IsOdd() == v.IsOdd() {
			panic("bigmod: internal error: u and v are not in the expected state")
		}

		// Halve the even one and adjust the corresponding coefficient.
		//
		// If either coefficient of the pair is odd, m and a are added to the
		// pair first; the carry out of each addition is passed to rshift1 so
		// that it becomes the top bit of the halved value.
		if u.IsOdd() == 0 {
			rshift1(u, 0)
			if A.IsOdd() == 1 || B.IsOdd() == 1 {
				rshift1(A, A.add(m))
				rshift1(B, B.add(a))
			} else {
				rshift1(A, 0)
				rshift1(B, 0)
			}
		} else { // v.IsOdd() == 0
			rshift1(v, 0)
			if C.IsOdd() == 1 || D.IsOdd() == 1 {
				rshift1(C, C.add(m))
				rshift1(D, D.add(a))
			} else {
				rshift1(C, 0)
				rshift1(D, 0)
			}
		}

		if v.IsZero() == 1 {
			return u, A, nil
		}
	}
}
1210 | 
1211 | //go:norace
1212 | func rshift1(a *Nat, carry uint) {
1213 | 	size := len(a.limbs)
1214 | 	aLimbs := a.limbs[:size]
1215 | 
1216 | 	for i := range size {
1217 | 		aLimbs[i] >>= 1
1218 | 		if i+1 < size {
1219 | 			aLimbs[i] |= aLimbs[i+1] << (_W - 1)
1220 | 		} else {
1221 | 			aLimbs[i] |= carry << (_W - 1)
1222 | 		}
1223 | 	}
1224 | }
1225 | 
1226 | // DivShortVarTime calculates x = x / y and returns the remainder.
1227 | //
1228 | // It panics if y is zero.
1229 | //
1230 | //go:norace
1231 | func (x *Nat) DivShortVarTime(y uint) uint {
1232 | 	if y == 0 {
1233 | 		panic("bigmod: division by zero")
1234 | 	}
1235 | 
1236 | 	var r uint
1237 | 	for i := len(x.limbs) - 1; i >= 0; i-- {
1238 | 		x.limbs[i], r = bits.Div(r, x.limbs[i], y)
1239 | 	}
1240 | 	return r
1241 | }
1242 | 


--------------------------------------------------------------------------------
/nat_386.s:
--------------------------------------------------------------------------------
 1 | // Copyright 2009 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego
 6 | 
 7 | #include "textflag.h"
 8 | 
// func addMulVVW1024(z, x *uint, y uint) (c uint)
//
// Entry point for 1024-bit operands. It loads the limb count into BX
// (32 limbs of 32 bits each) and jumps, rather than calls, into the shared
// loop addMulVVWx, which reads z, x and y from this function's argument frame
// and stores c there.
TEXT ·addMulVVW1024(SB), $0-16
	MOVL	$32, BX
	JMP		addMulVVWx<>(SB)
13 | 
// func addMulVVW1536(z, x *uint, y uint) (c uint)
//
// Entry point for 1536-bit operands. It loads the limb count into BX
// (48 limbs of 32 bits each) and jumps, rather than calls, into the shared
// loop addMulVVWx, which reads z, x and y from this function's argument frame
// and stores c there.
TEXT ·addMulVVW1536(SB), $0-16
	MOVL	$48, BX
	JMP		addMulVVWx<>(SB)
18 | 
// func addMulVVW2048(z, x *uint, y uint) (c uint)
//
// Entry point for 2048-bit operands. It loads the limb count into BX
// (64 limbs of 32 bits each) and jumps, rather than calls, into the shared
// loop addMulVVWx, which reads z, x and y from this function's argument frame
// and stores c there.
TEXT ·addMulVVW2048(SB), $0-16
	MOVL	$64, BX
	JMP		addMulVVWx<>(SB)
23 | 
// addMulVVWx is the loop shared by the fixed-size entry points above. It
// computes z[i] += x[i] * y for i in [0, n) with carry propagation, and stores
// the final carry in the result slot c.
//
// On entry BX holds the limb count n. The arguments z, x and y are read from
// the frame of the entry point that jumped here (this function is NOFRAME).
//
// Register use inside the loop:
//   DI, SI: z and x, advanced to one past their last limb
//   BX:     index i, counting up from -n to 0
//   BP:     y
//   CX:     running carry c
//   DX:AX:  64-bit product of MULL
TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0
	MOVL z+0(FP), DI
	MOVL x+4(FP), SI
	MOVL y+8(FP), BP
	// Point DI and SI just past the n-th limb so that the negative index in
	// BX can count up towards zero.
	LEAL (DI)(BX*4), DI
	LEAL (SI)(BX*4), SI
	NEGL BX			// i = -n
	MOVL $0, CX		// c = 0
	JMP E6

	// DX:AX = x[i] * y + c; then z[i] += AX and the new c is DX plus the
	// carry out of that addition.
L6:	MOVL (SI)(BX*4), AX
	MULL BP
	ADDL CX, AX
	ADCL $0, DX
	ADDL AX, (DI)(BX*4)
	ADCL $0, DX
	MOVL DX, CX
	ADDL $1, BX		// i++

E6:	CMPL BX, $0		// i < 0
	JL L6

	MOVL CX, c+12(FP)
	RET
48 | 


--------------------------------------------------------------------------------
/nat_amd64.s:
--------------------------------------------------------------------------------
   1 | // Code generated by command: go run nat_amd64_asm.go -out ../nat_amd64.s -pkg bigmod. DO NOT EDIT.
   2 | 
   3 | //go:build !purego
   4 | 
// func addMulVVW1024(z *uint, x *uint, y uint) (c uint)
// Requires: ADX, BMI2
//
// Computes z[i] += x[i] * y over 16 64-bit limbs (1024 bits), fully unrolled,
// and returns the final carry in c.
//
// Two implementations follow. The first is taken when supportADX is not 1: per
// limb, MULQ leaves x[i] * y in DX:AX, z[i] and the running carry (DI) are
// added in, and the high word becomes the next carry. The second, at the adx
// label, uses MULXQ with two independent carry chains: ADCXQ adds the previous
// high word using the carry flag, ADOXQ adds z[i] using the overflow flag, and
// the high word alternates between DI and BX from one limb to the next.
//
// NOTE(review): the header of this file says it is generated by
// nat_amd64_asm.go and must not be edited; comments added here would have to
// be made in the generator to survive regeneration.
TEXT ·addMulVVW1024(SB), $0-32
	CMPB ·supportADX+0(SB), $0x01
	JEQ  adx
	MOVQ z+0(FP), CX
	MOVQ x+8(FP), BX
	MOVQ y+16(FP), SI
	XORQ DI, DI

	// Iteration 0
	MOVQ (BX), AX
	MULQ SI
	ADDQ (CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, (CX)

	// Iteration 1
	MOVQ 8(BX), AX
	MULQ SI
	ADDQ 8(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 8(CX)

	// Iteration 2
	MOVQ 16(BX), AX
	MULQ SI
	ADDQ 16(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 16(CX)

	// Iteration 3
	MOVQ 24(BX), AX
	MULQ SI
	ADDQ 24(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 24(CX)

	// Iteration 4
	MOVQ 32(BX), AX
	MULQ SI
	ADDQ 32(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 32(CX)

	// Iteration 5
	MOVQ 40(BX), AX
	MULQ SI
	ADDQ 40(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 40(CX)

	// Iteration 6
	MOVQ 48(BX), AX
	MULQ SI
	ADDQ 48(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 48(CX)

	// Iteration 7
	MOVQ 56(BX), AX
	MULQ SI
	ADDQ 56(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 56(CX)

	// Iteration 8
	MOVQ 64(BX), AX
	MULQ SI
	ADDQ 64(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 64(CX)

	// Iteration 9
	MOVQ 72(BX), AX
	MULQ SI
	ADDQ 72(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 72(CX)

	// Iteration 10
	MOVQ 80(BX), AX
	MULQ SI
	ADDQ 80(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 80(CX)

	// Iteration 11
	MOVQ 88(BX), AX
	MULQ SI
	ADDQ 88(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 88(CX)

	// Iteration 12
	MOVQ 96(BX), AX
	MULQ SI
	ADDQ 96(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 96(CX)

	// Iteration 13
	MOVQ 104(BX), AX
	MULQ SI
	ADDQ 104(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 104(CX)

	// Iteration 14
	MOVQ 112(BX), AX
	MULQ SI
	ADDQ 112(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 112(CX)

	// Iteration 15
	MOVQ 120(BX), AX
	MULQ SI
	ADDQ 120(CX), AX
	ADCQ $0x00, DX
	ADDQ DI, AX
	ADCQ $0x00, DX
	MOVQ DX, DI
	MOVQ AX, 120(CX)
	MOVQ DI, c+24(FP)
	RET

adx:
	MOVQ z+0(FP), AX
	MOVQ x+8(FP), CX
	MOVQ y+16(FP), DX
	XORQ BX, BX
	XORQ SI, SI

	// Iteration 0
	MULXQ (CX), R8, DI
	ADCXQ BX, R8
	ADOXQ (AX), R8
	MOVQ  R8, (AX)

	// Iteration 1
	MULXQ 8(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 8(AX), R8
	MOVQ  R8, 8(AX)

	// Iteration 2
	MULXQ 16(CX), R8, DI
	ADCXQ BX, R8
	ADOXQ 16(AX), R8
	MOVQ  R8, 16(AX)

	// Iteration 3
	MULXQ 24(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 24(AX), R8
	MOVQ  R8, 24(AX)

	// Iteration 4
	MULXQ 32(CX), R8, DI
	ADCXQ BX, R8
	ADOXQ 32(AX), R8
	MOVQ  R8, 32(AX)

	// Iteration 5
	MULXQ 40(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 40(AX), R8
	MOVQ  R8, 40(AX)

	// Iteration 6
	MULXQ 48(CX), R8, DI
	ADCXQ BX, R8
	ADOXQ 48(AX), R8
	MOVQ  R8, 48(AX)

	// Iteration 7
	MULXQ 56(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 56(AX), R8
	MOVQ  R8, 56(AX)

	// Iteration 8
	MULXQ 64(CX), R8, DI
	ADCXQ BX, R8
	ADOXQ 64(AX), R8
	MOVQ  R8, 64(AX)

	// Iteration 9
	MULXQ 72(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 72(AX), R8
	MOVQ  R8, 72(AX)

	// Iteration 10
	MULXQ 80(CX), R8, DI
	ADCXQ BX, R8
	ADOXQ 80(AX), R8
	MOVQ  R8, 80(AX)

	// Iteration 11
	MULXQ 88(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 88(AX), R8
	MOVQ  R8, 88(AX)

	// Iteration 12
	MULXQ 96(CX), R8, DI
	ADCXQ BX, R8
	ADOXQ 96(AX), R8
	MOVQ  R8, 96(AX)

	// Iteration 13
	MULXQ 104(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 104(AX), R8
	MOVQ  R8, 104(AX)

	// Iteration 14
	MULXQ 112(CX), R8, DI
	ADCXQ BX, R8
	ADOXQ 112(AX), R8
	MOVQ  R8, 112(AX)

	// Iteration 15
	MULXQ 120(CX), R8, BX
	ADCXQ DI, R8
	ADOXQ 120(AX), R8
	MOVQ  R8, 120(AX)

	// Add back carry flags and return
	ADCXQ SI, BX
	ADOXQ SI, BX
	MOVQ  BX, c+24(FP)
	RET
 285 | 
 286 | // func addMulVVW1536(z *uint, x *uint, y uint) (c uint)
 287 | // Requires: ADX, BMI2 (only for the adx path; otherwise the MULQ/ADCQ sequence below runs)
 288 | TEXT ·addMulVVW1536(SB), $0-32 // z[i] = low(x[i]*y + z[i] + carry) for the 24 words at byte offsets 0..184; c = last carry word
 289 | 	CMPB ·supportADX+0(SB), $0x01 // is supportADX equal to 1?
 290 | 	JEQ  adx // yes: use the MULXQ/ADCXQ/ADOXQ path
 291 | 	MOVQ z+0(FP), CX // CX = z
 292 | 	MOVQ x+8(FP), BX // BX = x
 293 | 	MOVQ y+16(FP), SI // SI = y
 294 | 	XORQ DI, DI // DI = carry word, initially 0
 295 | 
 296 | 	// Iteration 0 (every following iteration repeats these steps at the next 8-byte offset)
 297 | 	MOVQ (BX), AX // AX = x[0]
 298 | 	MULQ SI // DX:AX = x[0] * y
 299 | 	ADDQ (CX), AX // low word += z[0]
 300 | 	ADCQ $0x00, DX // carry of that addition goes into the high word
 301 | 	ADDQ DI, AX // low word += incoming carry word
 302 | 	ADCQ $0x00, DX // carry of that addition goes into the high word
 303 | 	MOVQ DX, DI // high word becomes the carry word for the next iteration
 304 | 	MOVQ AX, (CX) // z[0] = low word
 305 | 
 306 | 	// Iteration 1
 307 | 	MOVQ 8(BX), AX
 308 | 	MULQ SI
 309 | 	ADDQ 8(CX), AX
 310 | 	ADCQ $0x00, DX
 311 | 	ADDQ DI, AX
 312 | 	ADCQ $0x00, DX
 313 | 	MOVQ DX, DI
 314 | 	MOVQ AX, 8(CX)
 315 | 
 316 | 	// Iteration 2
 317 | 	MOVQ 16(BX), AX
 318 | 	MULQ SI
 319 | 	ADDQ 16(CX), AX
 320 | 	ADCQ $0x00, DX
 321 | 	ADDQ DI, AX
 322 | 	ADCQ $0x00, DX
 323 | 	MOVQ DX, DI
 324 | 	MOVQ AX, 16(CX)
 325 | 
 326 | 	// Iteration 3
 327 | 	MOVQ 24(BX), AX
 328 | 	MULQ SI
 329 | 	ADDQ 24(CX), AX
 330 | 	ADCQ $0x00, DX
 331 | 	ADDQ DI, AX
 332 | 	ADCQ $0x00, DX
 333 | 	MOVQ DX, DI
 334 | 	MOVQ AX, 24(CX)
 335 | 
 336 | 	// Iteration 4
 337 | 	MOVQ 32(BX), AX
 338 | 	MULQ SI
 339 | 	ADDQ 32(CX), AX
 340 | 	ADCQ $0x00, DX
 341 | 	ADDQ DI, AX
 342 | 	ADCQ $0x00, DX
 343 | 	MOVQ DX, DI
 344 | 	MOVQ AX, 32(CX)
 345 | 
 346 | 	// Iteration 5
 347 | 	MOVQ 40(BX), AX
 348 | 	MULQ SI
 349 | 	ADDQ 40(CX), AX
 350 | 	ADCQ $0x00, DX
 351 | 	ADDQ DI, AX
 352 | 	ADCQ $0x00, DX
 353 | 	MOVQ DX, DI
 354 | 	MOVQ AX, 40(CX)
 355 | 
 356 | 	// Iteration 6
 357 | 	MOVQ 48(BX), AX
 358 | 	MULQ SI
 359 | 	ADDQ 48(CX), AX
 360 | 	ADCQ $0x00, DX
 361 | 	ADDQ DI, AX
 362 | 	ADCQ $0x00, DX
 363 | 	MOVQ DX, DI
 364 | 	MOVQ AX, 48(CX)
 365 | 
 366 | 	// Iteration 7
 367 | 	MOVQ 56(BX), AX
 368 | 	MULQ SI
 369 | 	ADDQ 56(CX), AX
 370 | 	ADCQ $0x00, DX
 371 | 	ADDQ DI, AX
 372 | 	ADCQ $0x00, DX
 373 | 	MOVQ DX, DI
 374 | 	MOVQ AX, 56(CX)
 375 | 
 376 | 	// Iteration 8
 377 | 	MOVQ 64(BX), AX
 378 | 	MULQ SI
 379 | 	ADDQ 64(CX), AX
 380 | 	ADCQ $0x00, DX
 381 | 	ADDQ DI, AX
 382 | 	ADCQ $0x00, DX
 383 | 	MOVQ DX, DI
 384 | 	MOVQ AX, 64(CX)
 385 | 
 386 | 	// Iteration 9
 387 | 	MOVQ 72(BX), AX
 388 | 	MULQ SI
 389 | 	ADDQ 72(CX), AX
 390 | 	ADCQ $0x00, DX
 391 | 	ADDQ DI, AX
 392 | 	ADCQ $0x00, DX
 393 | 	MOVQ DX, DI
 394 | 	MOVQ AX, 72(CX)
 395 | 
 396 | 	// Iteration 10
 397 | 	MOVQ 80(BX), AX
 398 | 	MULQ SI
 399 | 	ADDQ 80(CX), AX
 400 | 	ADCQ $0x00, DX
 401 | 	ADDQ DI, AX
 402 | 	ADCQ $0x00, DX
 403 | 	MOVQ DX, DI
 404 | 	MOVQ AX, 80(CX)
 405 | 
 406 | 	// Iteration 11
 407 | 	MOVQ 88(BX), AX
 408 | 	MULQ SI
 409 | 	ADDQ 88(CX), AX
 410 | 	ADCQ $0x00, DX
 411 | 	ADDQ DI, AX
 412 | 	ADCQ $0x00, DX
 413 | 	MOVQ DX, DI
 414 | 	MOVQ AX, 88(CX)
 415 | 
 416 | 	// Iteration 12
 417 | 	MOVQ 96(BX), AX
 418 | 	MULQ SI
 419 | 	ADDQ 96(CX), AX
 420 | 	ADCQ $0x00, DX
 421 | 	ADDQ DI, AX
 422 | 	ADCQ $0x00, DX
 423 | 	MOVQ DX, DI
 424 | 	MOVQ AX, 96(CX)
 425 | 
 426 | 	// Iteration 13
 427 | 	MOVQ 104(BX), AX
 428 | 	MULQ SI
 429 | 	ADDQ 104(CX), AX
 430 | 	ADCQ $0x00, DX
 431 | 	ADDQ DI, AX
 432 | 	ADCQ $0x00, DX
 433 | 	MOVQ DX, DI
 434 | 	MOVQ AX, 104(CX)
 435 | 
 436 | 	// Iteration 14
 437 | 	MOVQ 112(BX), AX
 438 | 	MULQ SI
 439 | 	ADDQ 112(CX), AX
 440 | 	ADCQ $0x00, DX
 441 | 	ADDQ DI, AX
 442 | 	ADCQ $0x00, DX
 443 | 	MOVQ DX, DI
 444 | 	MOVQ AX, 112(CX)
 445 | 
 446 | 	// Iteration 15
 447 | 	MOVQ 120(BX), AX
 448 | 	MULQ SI
 449 | 	ADDQ 120(CX), AX
 450 | 	ADCQ $0x00, DX
 451 | 	ADDQ DI, AX
 452 | 	ADCQ $0x00, DX
 453 | 	MOVQ DX, DI
 454 | 	MOVQ AX, 120(CX)
 455 | 
 456 | 	// Iteration 16
 457 | 	MOVQ 128(BX), AX
 458 | 	MULQ SI
 459 | 	ADDQ 128(CX), AX
 460 | 	ADCQ $0x00, DX
 461 | 	ADDQ DI, AX
 462 | 	ADCQ $0x00, DX
 463 | 	MOVQ DX, DI
 464 | 	MOVQ AX, 128(CX)
 465 | 
 466 | 	// Iteration 17
 467 | 	MOVQ 136(BX), AX
 468 | 	MULQ SI
 469 | 	ADDQ 136(CX), AX
 470 | 	ADCQ $0x00, DX
 471 | 	ADDQ DI, AX
 472 | 	ADCQ $0x00, DX
 473 | 	MOVQ DX, DI
 474 | 	MOVQ AX, 136(CX)
 475 | 
 476 | 	// Iteration 18
 477 | 	MOVQ 144(BX), AX
 478 | 	MULQ SI
 479 | 	ADDQ 144(CX), AX
 480 | 	ADCQ $0x00, DX
 481 | 	ADDQ DI, AX
 482 | 	ADCQ $0x00, DX
 483 | 	MOVQ DX, DI
 484 | 	MOVQ AX, 144(CX)
 485 | 
 486 | 	// Iteration 19
 487 | 	MOVQ 152(BX), AX
 488 | 	MULQ SI
 489 | 	ADDQ 152(CX), AX
 490 | 	ADCQ $0x00, DX
 491 | 	ADDQ DI, AX
 492 | 	ADCQ $0x00, DX
 493 | 	MOVQ DX, DI
 494 | 	MOVQ AX, 152(CX)
 495 | 
 496 | 	// Iteration 20
 497 | 	MOVQ 160(BX), AX
 498 | 	MULQ SI
 499 | 	ADDQ 160(CX), AX
 500 | 	ADCQ $0x00, DX
 501 | 	ADDQ DI, AX
 502 | 	ADCQ $0x00, DX
 503 | 	MOVQ DX, DI
 504 | 	MOVQ AX, 160(CX)
 505 | 
 506 | 	// Iteration 21
 507 | 	MOVQ 168(BX), AX
 508 | 	MULQ SI
 509 | 	ADDQ 168(CX), AX
 510 | 	ADCQ $0x00, DX
 511 | 	ADDQ DI, AX
 512 | 	ADCQ $0x00, DX
 513 | 	MOVQ DX, DI
 514 | 	MOVQ AX, 168(CX)
 515 | 
 516 | 	// Iteration 22
 517 | 	MOVQ 176(BX), AX
 518 | 	MULQ SI
 519 | 	ADDQ 176(CX), AX
 520 | 	ADCQ $0x00, DX
 521 | 	ADDQ DI, AX
 522 | 	ADCQ $0x00, DX
 523 | 	MOVQ DX, DI
 524 | 	MOVQ AX, 176(CX)
 525 | 
 526 | 	// Iteration 23
 527 | 	MOVQ 184(BX), AX
 528 | 	MULQ SI
 529 | 	ADDQ 184(CX), AX
 530 | 	ADCQ $0x00, DX
 531 | 	ADDQ DI, AX
 532 | 	ADCQ $0x00, DX
 533 | 	MOVQ DX, DI
 534 | 	MOVQ AX, 184(CX)
 535 | 	MOVQ DI, c+24(FP) // return the final carry word
 536 | 	RET
 537 | 
 538 | adx:
 539 | 	MOVQ z+0(FP), AX // AX = z
 540 | 	MOVQ x+8(FP), CX // CX = x
 541 | 	MOVQ y+16(FP), DX // DX = y, the implicit multiplicand of MULXQ
 542 | 	XORQ BX, BX // BX = 0 (no incoming high word yet); XOR also clears CF and OF
 543 | 	XORQ SI, SI // SI = 0, the zero addend used to fold CF and OF into BX at the end
 544 | 
 545 | 	// Iteration 0
 546 | 	MULXQ (CX), R8, DI // R8 = low word, DI = high word of x[0] * y (flags untouched)
 547 | 	ADCXQ BX, R8 // CF chain: add the previous iteration's high word
 548 | 	ADOXQ (AX), R8 // OF chain: add z[0]
 549 | 	MOVQ  R8, (AX) // z[0] = result word
 550 | 
 551 | 	// Iteration 1 (DI and BX swap roles each iteration so the previous high word stays live)
 552 | 	MULXQ 8(CX), R8, BX
 553 | 	ADCXQ DI, R8
 554 | 	ADOXQ 8(AX), R8
 555 | 	MOVQ  R8, 8(AX)
 556 | 
 557 | 	// Iteration 2
 558 | 	MULXQ 16(CX), R8, DI
 559 | 	ADCXQ BX, R8
 560 | 	ADOXQ 16(AX), R8
 561 | 	MOVQ  R8, 16(AX)
 562 | 
 563 | 	// Iteration 3
 564 | 	MULXQ 24(CX), R8, BX
 565 | 	ADCXQ DI, R8
 566 | 	ADOXQ 24(AX), R8
 567 | 	MOVQ  R8, 24(AX)
 568 | 
 569 | 	// Iteration 4
 570 | 	MULXQ 32(CX), R8, DI
 571 | 	ADCXQ BX, R8
 572 | 	ADOXQ 32(AX), R8
 573 | 	MOVQ  R8, 32(AX)
 574 | 
 575 | 	// Iteration 5
 576 | 	MULXQ 40(CX), R8, BX
 577 | 	ADCXQ DI, R8
 578 | 	ADOXQ 40(AX), R8
 579 | 	MOVQ  R8, 40(AX)
 580 | 
 581 | 	// Iteration 6
 582 | 	MULXQ 48(CX), R8, DI
 583 | 	ADCXQ BX, R8
 584 | 	ADOXQ 48(AX), R8
 585 | 	MOVQ  R8, 48(AX)
 586 | 
 587 | 	// Iteration 7
 588 | 	MULXQ 56(CX), R8, BX
 589 | 	ADCXQ DI, R8
 590 | 	ADOXQ 56(AX), R8
 591 | 	MOVQ  R8, 56(AX)
 592 | 
 593 | 	// Iteration 8
 594 | 	MULXQ 64(CX), R8, DI
 595 | 	ADCXQ BX, R8
 596 | 	ADOXQ 64(AX), R8
 597 | 	MOVQ  R8, 64(AX)
 598 | 
 599 | 	// Iteration 9
 600 | 	MULXQ 72(CX), R8, BX
 601 | 	ADCXQ DI, R8
 602 | 	ADOXQ 72(AX), R8
 603 | 	MOVQ  R8, 72(AX)
 604 | 
 605 | 	// Iteration 10
 606 | 	MULXQ 80(CX), R8, DI
 607 | 	ADCXQ BX, R8
 608 | 	ADOXQ 80(AX), R8
 609 | 	MOVQ  R8, 80(AX)
 610 | 
 611 | 	// Iteration 11
 612 | 	MULXQ 88(CX), R8, BX
 613 | 	ADCXQ DI, R8
 614 | 	ADOXQ 88(AX), R8
 615 | 	MOVQ  R8, 88(AX)
 616 | 
 617 | 	// Iteration 12
 618 | 	MULXQ 96(CX), R8, DI
 619 | 	ADCXQ BX, R8
 620 | 	ADOXQ 96(AX), R8
 621 | 	MOVQ  R8, 96(AX)
 622 | 
 623 | 	// Iteration 13
 624 | 	MULXQ 104(CX), R8, BX
 625 | 	ADCXQ DI, R8
 626 | 	ADOXQ 104(AX), R8
 627 | 	MOVQ  R8, 104(AX)
 628 | 
 629 | 	// Iteration 14
 630 | 	MULXQ 112(CX), R8, DI
 631 | 	ADCXQ BX, R8
 632 | 	ADOXQ 112(AX), R8
 633 | 	MOVQ  R8, 112(AX)
 634 | 
 635 | 	// Iteration 15
 636 | 	MULXQ 120(CX), R8, BX
 637 | 	ADCXQ DI, R8
 638 | 	ADOXQ 120(AX), R8
 639 | 	MOVQ  R8, 120(AX)
 640 | 
 641 | 	// Iteration 16
 642 | 	MULXQ 128(CX), R8, DI
 643 | 	ADCXQ BX, R8
 644 | 	ADOXQ 128(AX), R8
 645 | 	MOVQ  R8, 128(AX)
 646 | 
 647 | 	// Iteration 17
 648 | 	MULXQ 136(CX), R8, BX
 649 | 	ADCXQ DI, R8
 650 | 	ADOXQ 136(AX), R8
 651 | 	MOVQ  R8, 136(AX)
 652 | 
 653 | 	// Iteration 18
 654 | 	MULXQ 144(CX), R8, DI
 655 | 	ADCXQ BX, R8
 656 | 	ADOXQ 144(AX), R8
 657 | 	MOVQ  R8, 144(AX)
 658 | 
 659 | 	// Iteration 19
 660 | 	MULXQ 152(CX), R8, BX
 661 | 	ADCXQ DI, R8
 662 | 	ADOXQ 152(AX), R8
 663 | 	MOVQ  R8, 152(AX)
 664 | 
 665 | 	// Iteration 20
 666 | 	MULXQ 160(CX), R8, DI
 667 | 	ADCXQ BX, R8
 668 | 	ADOXQ 160(AX), R8
 669 | 	MOVQ  R8, 160(AX)
 670 | 
 671 | 	// Iteration 21
 672 | 	MULXQ 168(CX), R8, BX
 673 | 	ADCXQ DI, R8
 674 | 	ADOXQ 168(AX), R8
 675 | 	MOVQ  R8, 168(AX)
 676 | 
 677 | 	// Iteration 22
 678 | 	MULXQ 176(CX), R8, DI
 679 | 	ADCXQ BX, R8
 680 | 	ADOXQ 176(AX), R8
 681 | 	MOVQ  R8, 176(AX)
 682 | 
 683 | 	// Iteration 23
 684 | 	MULXQ 184(CX), R8, BX
 685 | 	ADCXQ DI, R8
 686 | 	ADOXQ 184(AX), R8
 687 | 	MOVQ  R8, 184(AX)
 688 | 
 689 | 	// Add back carry flags and return
 690 | 	ADCXQ SI, BX // BX = last high word + 0 + CF
 691 | 	ADOXQ SI, BX // BX += 0 + OF
 692 | 	MOVQ  BX, c+24(FP) // return the final carry word
 693 | 	RET
 694 | 
 695 | // func addMulVVW2048(z *uint, x *uint, y uint) (c uint)
 696 | // Requires: ADX, BMI2 (only for the adx path; otherwise the MULQ/ADCQ sequence below runs)
 697 | TEXT ·addMulVVW2048(SB), $0-32 // z[i] = low(x[i]*y + z[i] + carry) for the 32 words at byte offsets 0..248; c = last carry word
 698 | 	CMPB ·supportADX+0(SB), $0x01 // is supportADX equal to 1?
 699 | 	JEQ  adx // yes: use the MULXQ/ADCXQ/ADOXQ path
 700 | 	MOVQ z+0(FP), CX // CX = z
 701 | 	MOVQ x+8(FP), BX // BX = x
 702 | 	MOVQ y+16(FP), SI // SI = y
 703 | 	XORQ DI, DI // DI = carry word, initially 0
 704 | 
 705 | 	// Iteration 0 (every following iteration repeats these steps at the next 8-byte offset)
 706 | 	MOVQ (BX), AX // AX = x[0]
 707 | 	MULQ SI // DX:AX = x[0] * y
 708 | 	ADDQ (CX), AX // low word += z[0]
 709 | 	ADCQ $0x00, DX // carry of that addition goes into the high word
 710 | 	ADDQ DI, AX // low word += incoming carry word
 711 | 	ADCQ $0x00, DX // carry of that addition goes into the high word
 712 | 	MOVQ DX, DI // high word becomes the carry word for the next iteration
 713 | 	MOVQ AX, (CX) // z[0] = low word
 714 | 
 715 | 	// Iteration 1
 716 | 	MOVQ 8(BX), AX
 717 | 	MULQ SI
 718 | 	ADDQ 8(CX), AX
 719 | 	ADCQ $0x00, DX
 720 | 	ADDQ DI, AX
 721 | 	ADCQ $0x00, DX
 722 | 	MOVQ DX, DI
 723 | 	MOVQ AX, 8(CX)
 724 | 
 725 | 	// Iteration 2
 726 | 	MOVQ 16(BX), AX
 727 | 	MULQ SI
 728 | 	ADDQ 16(CX), AX
 729 | 	ADCQ $0x00, DX
 730 | 	ADDQ DI, AX
 731 | 	ADCQ $0x00, DX
 732 | 	MOVQ DX, DI
 733 | 	MOVQ AX, 16(CX)
 734 | 
 735 | 	// Iteration 3
 736 | 	MOVQ 24(BX), AX
 737 | 	MULQ SI
 738 | 	ADDQ 24(CX), AX
 739 | 	ADCQ $0x00, DX
 740 | 	ADDQ DI, AX
 741 | 	ADCQ $0x00, DX
 742 | 	MOVQ DX, DI
 743 | 	MOVQ AX, 24(CX)
 744 | 
 745 | 	// Iteration 4
 746 | 	MOVQ 32(BX), AX
 747 | 	MULQ SI
 748 | 	ADDQ 32(CX), AX
 749 | 	ADCQ $0x00, DX
 750 | 	ADDQ DI, AX
 751 | 	ADCQ $0x00, DX
 752 | 	MOVQ DX, DI
 753 | 	MOVQ AX, 32(CX)
 754 | 
 755 | 	// Iteration 5
 756 | 	MOVQ 40(BX), AX
 757 | 	MULQ SI
 758 | 	ADDQ 40(CX), AX
 759 | 	ADCQ $0x00, DX
 760 | 	ADDQ DI, AX
 761 | 	ADCQ $0x00, DX
 762 | 	MOVQ DX, DI
 763 | 	MOVQ AX, 40(CX)
 764 | 
 765 | 	// Iteration 6
 766 | 	MOVQ 48(BX), AX
 767 | 	MULQ SI
 768 | 	ADDQ 48(CX), AX
 769 | 	ADCQ $0x00, DX
 770 | 	ADDQ DI, AX
 771 | 	ADCQ $0x00, DX
 772 | 	MOVQ DX, DI
 773 | 	MOVQ AX, 48(CX)
 774 | 
 775 | 	// Iteration 7
 776 | 	MOVQ 56(BX), AX
 777 | 	MULQ SI
 778 | 	ADDQ 56(CX), AX
 779 | 	ADCQ $0x00, DX
 780 | 	ADDQ DI, AX
 781 | 	ADCQ $0x00, DX
 782 | 	MOVQ DX, DI
 783 | 	MOVQ AX, 56(CX)
 784 | 
 785 | 	// Iteration 8
 786 | 	MOVQ 64(BX), AX
 787 | 	MULQ SI
 788 | 	ADDQ 64(CX), AX
 789 | 	ADCQ $0x00, DX
 790 | 	ADDQ DI, AX
 791 | 	ADCQ $0x00, DX
 792 | 	MOVQ DX, DI
 793 | 	MOVQ AX, 64(CX)
 794 | 
 795 | 	// Iteration 9
 796 | 	MOVQ 72(BX), AX
 797 | 	MULQ SI
 798 | 	ADDQ 72(CX), AX
 799 | 	ADCQ $0x00, DX
 800 | 	ADDQ DI, AX
 801 | 	ADCQ $0x00, DX
 802 | 	MOVQ DX, DI
 803 | 	MOVQ AX, 72(CX)
 804 | 
 805 | 	// Iteration 10
 806 | 	MOVQ 80(BX), AX
 807 | 	MULQ SI
 808 | 	ADDQ 80(CX), AX
 809 | 	ADCQ $0x00, DX
 810 | 	ADDQ DI, AX
 811 | 	ADCQ $0x00, DX
 812 | 	MOVQ DX, DI
 813 | 	MOVQ AX, 80(CX)
 814 | 
 815 | 	// Iteration 11
 816 | 	MOVQ 88(BX), AX
 817 | 	MULQ SI
 818 | 	ADDQ 88(CX), AX
 819 | 	ADCQ $0x00, DX
 820 | 	ADDQ DI, AX
 821 | 	ADCQ $0x00, DX
 822 | 	MOVQ DX, DI
 823 | 	MOVQ AX, 88(CX)
 824 | 
 825 | 	// Iteration 12
 826 | 	MOVQ 96(BX), AX
 827 | 	MULQ SI
 828 | 	ADDQ 96(CX), AX
 829 | 	ADCQ $0x00, DX
 830 | 	ADDQ DI, AX
 831 | 	ADCQ $0x00, DX
 832 | 	MOVQ DX, DI
 833 | 	MOVQ AX, 96(CX)
 834 | 
 835 | 	// Iteration 13
 836 | 	MOVQ 104(BX), AX
 837 | 	MULQ SI
 838 | 	ADDQ 104(CX), AX
 839 | 	ADCQ $0x00, DX
 840 | 	ADDQ DI, AX
 841 | 	ADCQ $0x00, DX
 842 | 	MOVQ DX, DI
 843 | 	MOVQ AX, 104(CX)
 844 | 
 845 | 	// Iteration 14
 846 | 	MOVQ 112(BX), AX
 847 | 	MULQ SI
 848 | 	ADDQ 112(CX), AX
 849 | 	ADCQ $0x00, DX
 850 | 	ADDQ DI, AX
 851 | 	ADCQ $0x00, DX
 852 | 	MOVQ DX, DI
 853 | 	MOVQ AX, 112(CX)
 854 | 
 855 | 	// Iteration 15
 856 | 	MOVQ 120(BX), AX
 857 | 	MULQ SI
 858 | 	ADDQ 120(CX), AX
 859 | 	ADCQ $0x00, DX
 860 | 	ADDQ DI, AX
 861 | 	ADCQ $0x00, DX
 862 | 	MOVQ DX, DI
 863 | 	MOVQ AX, 120(CX)
 864 | 
 865 | 	// Iteration 16
 866 | 	MOVQ 128(BX), AX
 867 | 	MULQ SI
 868 | 	ADDQ 128(CX), AX
 869 | 	ADCQ $0x00, DX
 870 | 	ADDQ DI, AX
 871 | 	ADCQ $0x00, DX
 872 | 	MOVQ DX, DI
 873 | 	MOVQ AX, 128(CX)
 874 | 
 875 | 	// Iteration 17
 876 | 	MOVQ 136(BX), AX
 877 | 	MULQ SI
 878 | 	ADDQ 136(CX), AX
 879 | 	ADCQ $0x00, DX
 880 | 	ADDQ DI, AX
 881 | 	ADCQ $0x00, DX
 882 | 	MOVQ DX, DI
 883 | 	MOVQ AX, 136(CX)
 884 | 
 885 | 	// Iteration 18
 886 | 	MOVQ 144(BX), AX
 887 | 	MULQ SI
 888 | 	ADDQ 144(CX), AX
 889 | 	ADCQ $0x00, DX
 890 | 	ADDQ DI, AX
 891 | 	ADCQ $0x00, DX
 892 | 	MOVQ DX, DI
 893 | 	MOVQ AX, 144(CX)
 894 | 
 895 | 	// Iteration 19
 896 | 	MOVQ 152(BX), AX
 897 | 	MULQ SI
 898 | 	ADDQ 152(CX), AX
 899 | 	ADCQ $0x00, DX
 900 | 	ADDQ DI, AX
 901 | 	ADCQ $0x00, DX
 902 | 	MOVQ DX, DI
 903 | 	MOVQ AX, 152(CX)
 904 | 
 905 | 	// Iteration 20
 906 | 	MOVQ 160(BX), AX
 907 | 	MULQ SI
 908 | 	ADDQ 160(CX), AX
 909 | 	ADCQ $0x00, DX
 910 | 	ADDQ DI, AX
 911 | 	ADCQ $0x00, DX
 912 | 	MOVQ DX, DI
 913 | 	MOVQ AX, 160(CX)
 914 | 
 915 | 	// Iteration 21
 916 | 	MOVQ 168(BX), AX
 917 | 	MULQ SI
 918 | 	ADDQ 168(CX), AX
 919 | 	ADCQ $0x00, DX
 920 | 	ADDQ DI, AX
 921 | 	ADCQ $0x00, DX
 922 | 	MOVQ DX, DI
 923 | 	MOVQ AX, 168(CX)
 924 | 
 925 | 	// Iteration 22
 926 | 	MOVQ 176(BX), AX
 927 | 	MULQ SI
 928 | 	ADDQ 176(CX), AX
 929 | 	ADCQ $0x00, DX
 930 | 	ADDQ DI, AX
 931 | 	ADCQ $0x00, DX
 932 | 	MOVQ DX, DI
 933 | 	MOVQ AX, 176(CX)
 934 | 
 935 | 	// Iteration 23
 936 | 	MOVQ 184(BX), AX
 937 | 	MULQ SI
 938 | 	ADDQ 184(CX), AX
 939 | 	ADCQ $0x00, DX
 940 | 	ADDQ DI, AX
 941 | 	ADCQ $0x00, DX
 942 | 	MOVQ DX, DI
 943 | 	MOVQ AX, 184(CX)
 944 | 
 945 | 	// Iteration 24
 946 | 	MOVQ 192(BX), AX
 947 | 	MULQ SI
 948 | 	ADDQ 192(CX), AX
 949 | 	ADCQ $0x00, DX
 950 | 	ADDQ DI, AX
 951 | 	ADCQ $0x00, DX
 952 | 	MOVQ DX, DI
 953 | 	MOVQ AX, 192(CX)
 954 | 
 955 | 	// Iteration 25
 956 | 	MOVQ 200(BX), AX
 957 | 	MULQ SI
 958 | 	ADDQ 200(CX), AX
 959 | 	ADCQ $0x00, DX
 960 | 	ADDQ DI, AX
 961 | 	ADCQ $0x00, DX
 962 | 	MOVQ DX, DI
 963 | 	MOVQ AX, 200(CX)
 964 | 
 965 | 	// Iteration 26
 966 | 	MOVQ 208(BX), AX
 967 | 	MULQ SI
 968 | 	ADDQ 208(CX), AX
 969 | 	ADCQ $0x00, DX
 970 | 	ADDQ DI, AX
 971 | 	ADCQ $0x00, DX
 972 | 	MOVQ DX, DI
 973 | 	MOVQ AX, 208(CX)
 974 | 
 975 | 	// Iteration 27
 976 | 	MOVQ 216(BX), AX
 977 | 	MULQ SI
 978 | 	ADDQ 216(CX), AX
 979 | 	ADCQ $0x00, DX
 980 | 	ADDQ DI, AX
 981 | 	ADCQ $0x00, DX
 982 | 	MOVQ DX, DI
 983 | 	MOVQ AX, 216(CX)
 984 | 
 985 | 	// Iteration 28
 986 | 	MOVQ 224(BX), AX
 987 | 	MULQ SI
 988 | 	ADDQ 224(CX), AX
 989 | 	ADCQ $0x00, DX
 990 | 	ADDQ DI, AX
 991 | 	ADCQ $0x00, DX
 992 | 	MOVQ DX, DI
 993 | 	MOVQ AX, 224(CX)
 994 | 
 995 | 	// Iteration 29
 996 | 	MOVQ 232(BX), AX
 997 | 	MULQ SI
 998 | 	ADDQ 232(CX), AX
 999 | 	ADCQ $0x00, DX
1000 | 	ADDQ DI, AX
1001 | 	ADCQ $0x00, DX
1002 | 	MOVQ DX, DI
1003 | 	MOVQ AX, 232(CX)
1004 | 
1005 | 	// Iteration 30
1006 | 	MOVQ 240(BX), AX
1007 | 	MULQ SI
1008 | 	ADDQ 240(CX), AX
1009 | 	ADCQ $0x00, DX
1010 | 	ADDQ DI, AX
1011 | 	ADCQ $0x00, DX
1012 | 	MOVQ DX, DI
1013 | 	MOVQ AX, 240(CX)
1014 | 
1015 | 	// Iteration 31
1016 | 	MOVQ 248(BX), AX
1017 | 	MULQ SI
1018 | 	ADDQ 248(CX), AX
1019 | 	ADCQ $0x00, DX
1020 | 	ADDQ DI, AX
1021 | 	ADCQ $0x00, DX
1022 | 	MOVQ DX, DI
1023 | 	MOVQ AX, 248(CX)
1024 | 	MOVQ DI, c+24(FP) // return the final carry word
1025 | 	RET
1026 | 
1027 | adx:
1028 | 	MOVQ z+0(FP), AX // AX = z
1029 | 	MOVQ x+8(FP), CX // CX = x
1030 | 	MOVQ y+16(FP), DX // DX = y, the implicit multiplicand of MULXQ
1031 | 	XORQ BX, BX // BX = 0 (no incoming high word yet); XOR also clears CF and OF
1032 | 	XORQ SI, SI // SI = 0, the zero addend used to fold CF and OF into BX at the end
1033 | 
1034 | 	// Iteration 0
1035 | 	MULXQ (CX), R8, DI // R8 = low word, DI = high word of x[0] * y (flags untouched)
1036 | 	ADCXQ BX, R8 // CF chain: add the previous iteration's high word
1037 | 	ADOXQ (AX), R8 // OF chain: add z[0]
1038 | 	MOVQ  R8, (AX) // z[0] = result word
1039 | 
1040 | 	// Iteration 1 (DI and BX swap roles each iteration so the previous high word stays live)
1041 | 	MULXQ 8(CX), R8, BX
1042 | 	ADCXQ DI, R8
1043 | 	ADOXQ 8(AX), R8
1044 | 	MOVQ  R8, 8(AX)
1045 | 
1046 | 	// Iteration 2
1047 | 	MULXQ 16(CX), R8, DI
1048 | 	ADCXQ BX, R8
1049 | 	ADOXQ 16(AX), R8
1050 | 	MOVQ  R8, 16(AX)
1051 | 
1052 | 	// Iteration 3
1053 | 	MULXQ 24(CX), R8, BX
1054 | 	ADCXQ DI, R8
1055 | 	ADOXQ 24(AX), R8
1056 | 	MOVQ  R8, 24(AX)
1057 | 
1058 | 	// Iteration 4
1059 | 	MULXQ 32(CX), R8, DI
1060 | 	ADCXQ BX, R8
1061 | 	ADOXQ 32(AX), R8
1062 | 	MOVQ  R8, 32(AX)
1063 | 
1064 | 	// Iteration 5
1065 | 	MULXQ 40(CX), R8, BX
1066 | 	ADCXQ DI, R8
1067 | 	ADOXQ 40(AX), R8
1068 | 	MOVQ  R8, 40(AX)
1069 | 
1070 | 	// Iteration 6
1071 | 	MULXQ 48(CX), R8, DI
1072 | 	ADCXQ BX, R8
1073 | 	ADOXQ 48(AX), R8
1074 | 	MOVQ  R8, 48(AX)
1075 | 
1076 | 	// Iteration 7
1077 | 	MULXQ 56(CX), R8, BX
1078 | 	ADCXQ DI, R8
1079 | 	ADOXQ 56(AX), R8
1080 | 	MOVQ  R8, 56(AX)
1081 | 
1082 | 	// Iteration 8
1083 | 	MULXQ 64(CX), R8, DI
1084 | 	ADCXQ BX, R8
1085 | 	ADOXQ 64(AX), R8
1086 | 	MOVQ  R8, 64(AX)
1087 | 
1088 | 	// Iteration 9
1089 | 	MULXQ 72(CX), R8, BX
1090 | 	ADCXQ DI, R8
1091 | 	ADOXQ 72(AX), R8
1092 | 	MOVQ  R8, 72(AX)
1093 | 
1094 | 	// Iteration 10
1095 | 	MULXQ 80(CX), R8, DI
1096 | 	ADCXQ BX, R8
1097 | 	ADOXQ 80(AX), R8
1098 | 	MOVQ  R8, 80(AX)
1099 | 
1100 | 	// Iteration 11
1101 | 	MULXQ 88(CX), R8, BX
1102 | 	ADCXQ DI, R8
1103 | 	ADOXQ 88(AX), R8
1104 | 	MOVQ  R8, 88(AX)
1105 | 
1106 | 	// Iteration 12
1107 | 	MULXQ 96(CX), R8, DI
1108 | 	ADCXQ BX, R8
1109 | 	ADOXQ 96(AX), R8
1110 | 	MOVQ  R8, 96(AX)
1111 | 
1112 | 	// Iteration 13
1113 | 	MULXQ 104(CX), R8, BX
1114 | 	ADCXQ DI, R8
1115 | 	ADOXQ 104(AX), R8
1116 | 	MOVQ  R8, 104(AX)
1117 | 
1118 | 	// Iteration 14
1119 | 	MULXQ 112(CX), R8, DI
1120 | 	ADCXQ BX, R8
1121 | 	ADOXQ 112(AX), R8
1122 | 	MOVQ  R8, 112(AX)
1123 | 
1124 | 	// Iteration 15
1125 | 	MULXQ 120(CX), R8, BX
1126 | 	ADCXQ DI, R8
1127 | 	ADOXQ 120(AX), R8
1128 | 	MOVQ  R8, 120(AX)
1129 | 
1130 | 	// Iteration 16
1131 | 	MULXQ 128(CX), R8, DI
1132 | 	ADCXQ BX, R8
1133 | 	ADOXQ 128(AX), R8
1134 | 	MOVQ  R8, 128(AX)
1135 | 
1136 | 	// Iteration 17
1137 | 	MULXQ 136(CX), R8, BX
1138 | 	ADCXQ DI, R8
1139 | 	ADOXQ 136(AX), R8
1140 | 	MOVQ  R8, 136(AX)
1141 | 
1142 | 	// Iteration 18
1143 | 	MULXQ 144(CX), R8, DI
1144 | 	ADCXQ BX, R8
1145 | 	ADOXQ 144(AX), R8
1146 | 	MOVQ  R8, 144(AX)
1147 | 
1148 | 	// Iteration 19
1149 | 	MULXQ 152(CX), R8, BX
1150 | 	ADCXQ DI, R8
1151 | 	ADOXQ 152(AX), R8
1152 | 	MOVQ  R8, 152(AX)
1153 | 
1154 | 	// Iteration 20
1155 | 	MULXQ 160(CX), R8, DI
1156 | 	ADCXQ BX, R8
1157 | 	ADOXQ 160(AX), R8
1158 | 	MOVQ  R8, 160(AX)
1159 | 
1160 | 	// Iteration 21
1161 | 	MULXQ 168(CX), R8, BX
1162 | 	ADCXQ DI, R8
1163 | 	ADOXQ 168(AX), R8
1164 | 	MOVQ  R8, 168(AX)
1165 | 
1166 | 	// Iteration 22
1167 | 	MULXQ 176(CX), R8, DI
1168 | 	ADCXQ BX, R8
1169 | 	ADOXQ 176(AX), R8
1170 | 	MOVQ  R8, 176(AX)
1171 | 
1172 | 	// Iteration 23
1173 | 	MULXQ 184(CX), R8, BX
1174 | 	ADCXQ DI, R8
1175 | 	ADOXQ 184(AX), R8
1176 | 	MOVQ  R8, 184(AX)
1177 | 
1178 | 	// Iteration 24
1179 | 	MULXQ 192(CX), R8, DI
1180 | 	ADCXQ BX, R8
1181 | 	ADOXQ 192(AX), R8
1182 | 	MOVQ  R8, 192(AX)
1183 | 
1184 | 	// Iteration 25
1185 | 	MULXQ 200(CX), R8, BX
1186 | 	ADCXQ DI, R8
1187 | 	ADOXQ 200(AX), R8
1188 | 	MOVQ  R8, 200(AX)
1189 | 
1190 | 	// Iteration 26
1191 | 	MULXQ 208(CX), R8, DI
1192 | 	ADCXQ BX, R8
1193 | 	ADOXQ 208(AX), R8
1194 | 	MOVQ  R8, 208(AX)
1195 | 
1196 | 	// Iteration 27
1197 | 	MULXQ 216(CX), R8, BX
1198 | 	ADCXQ DI, R8
1199 | 	ADOXQ 216(AX), R8
1200 | 	MOVQ  R8, 216(AX)
1201 | 
1202 | 	// Iteration 28
1203 | 	MULXQ 224(CX), R8, DI
1204 | 	ADCXQ BX, R8
1205 | 	ADOXQ 224(AX), R8
1206 | 	MOVQ  R8, 224(AX)
1207 | 
1208 | 	// Iteration 29
1209 | 	MULXQ 232(CX), R8, BX
1210 | 	ADCXQ DI, R8
1211 | 	ADOXQ 232(AX), R8
1212 | 	MOVQ  R8, 232(AX)
1213 | 
1214 | 	// Iteration 30
1215 | 	MULXQ 240(CX), R8, DI
1216 | 	ADCXQ BX, R8
1217 | 	ADOXQ 240(AX), R8
1218 | 	MOVQ  R8, 240(AX)
1219 | 
1220 | 	// Iteration 31
1221 | 	MULXQ 248(CX), R8, BX
1222 | 	ADCXQ DI, R8
1223 | 	ADOXQ 248(AX), R8
1224 | 	MOVQ  R8, 248(AX)
1225 | 
1226 | 	// Add back carry flags and return
1227 | 	ADCXQ SI, BX // BX = last high word + 0 + CF
1228 | 	ADOXQ SI, BX // BX += 0 + OF
1229 | 	MOVQ  BX, c+24(FP) // return the final carry word
1230 | 	RET
1231 | 


--------------------------------------------------------------------------------
/nat_arm.s:
--------------------------------------------------------------------------------
 1 | // Copyright 2009 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego
 6 | 
 7 | #include "textflag.h"
 8 | 
 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint)
10 | TEXT ·addMulVVW1024(SB), $0-16 // 1024-bit entry point; the work is done by addMulVVWx<>
11 | 	MOVW	$32, R5 // R5 = 32 words (1024 bits / 32-bit words); addMulVVWx<> turns it into the end-of-z pointer
12 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
13 | 
14 | // func addMulVVW1536(z, x *uint, y uint) (c uint)
15 | TEXT ·addMulVVW1536(SB), $0-16 // 1536-bit entry point; the work is done by addMulVVWx<>
16 | 	MOVW	$48, R5 // R5 = 48 words (1536 bits / 32-bit words); addMulVVWx<> turns it into the end-of-z pointer
17 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
18 | 
19 | // func addMulVVW2048(z, x *uint, y uint) (c uint)
20 | TEXT ·addMulVVW2048(SB), $0-16 // 2048-bit entry point; the work is done by addMulVVWx<>
21 | 	MOVW	$64, R5 // R5 = 64 words (2048 bits / 32-bit words); addMulVVWx<> turns it into the end-of-z pointer
22 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
23 | 
24 | TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0 // shared loop: z[i] = low(x[i]*y + z[i] + carry) for R5 words; c = last carry word
25 | 	MOVW	$0, R0 // R0 = 0, the zero addend for the ADC instructions
26 | 	MOVW	z+0(FP), R1 // R1 = z
27 | 	MOVW	x+4(FP), R2 // R2 = x
28 | 	MOVW	y+8(FP), R3 // R3 = y
29 | 	ADD	R5<<2, R1, R5 // R5 = z + 4*R5, the address just past the last z word
30 | 	MOVW	$0, R4 // R4 = carry word, initially 0
31 | 	B E9 // enter at the loop test
32 | 
33 | L9:	MOVW.P	4(R2), R6 // R6 = next x word; x advances by 4 bytes
34 | 	MULLU	R6, R3, (R7, R6) // R7:R6 = x word * y
35 | 	ADD.S	R4, R6 // low word += incoming carry word (sets the carry flag)
36 | 	ADC	R0, R7 // high word += carry flag
37 | 	MOVW	0(R1), R4 // R4 = current z word
38 | 	ADD.S	R4, R6 // low word += z word (sets the carry flag)
39 | 	ADC	R0, R7 // high word += carry flag
40 | 	MOVW.P	R6, 4(R1) // store the low word into z; z advances by 4 bytes
41 | 	MOVW	R7, R4 // high word becomes the carry word for the next iteration
42 | 
43 | E9:	TEQ	R1, R5 // has z reached the end pointer?
44 | 	BNE	L9 // no: process the next word
45 | 
46 | 	MOVW	R4, c+12(FP) // return the final carry word
47 | 	RET
48 | 


--------------------------------------------------------------------------------
/nat_arm64.s:
--------------------------------------------------------------------------------
 1 | // Copyright 2013 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego
 6 | 
 7 | #include "textflag.h"
 8 | 
 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint)
10 | TEXT ·addMulVVW1024(SB), $0-32 // 1024-bit entry point; the work is done by addMulVVWx<>
11 | 	MOVD	$16, R0 // R0 = 16 words (1024 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
12 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
13 | 
14 | // func addMulVVW1536(z, x *uint, y uint) (c uint)
15 | TEXT ·addMulVVW1536(SB), $0-32 // 1536-bit entry point; the work is done by addMulVVWx<>
16 | 	MOVD	$24, R0 // R0 = 24 words (1536 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
17 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
18 | 
19 | // func addMulVVW2048(z, x *uint, y uint) (c uint)
20 | TEXT ·addMulVVW2048(SB), $0-32 // 2048-bit entry point; the work is done by addMulVVWx<>
21 | 	MOVD	$32, R0 // R0 = 32 words (2048 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
22 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
23 | 
24 | TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0 // shared loop; R0 = remaining 64-bit words, four handled per pass
25 | 	MOVD	z+0(FP), R1 // R1 = z
26 | 	MOVD	x+8(FP), R2 // R2 = x
27 | 	MOVD	y+16(FP), R3 // R3 = y
28 | 	MOVD	$0, R4 // R4 = carry word, initially 0
29 | 
30 | // The main loop of this code operates on a block of 4 words every iteration
31 | // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
32 | // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
33 | // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
34 | loop:
35 | 	CBZ	R0, done // no words left
36 | 
37 | 	LDP.P	16(R2), (R5, R6) // R5, R6 = next two x words; x advances by 16 bytes
38 | 	LDP.P	16(R2), (R7, R8) // R7, R8 = following two x words; x advances by 16 bytes
39 | 
40 | 	LDP	(R1), (R9, R10) // R9, R10 = first two z words of the block
41 | 	ADDS	R4, R9 // first carry chain starts: add the incoming carry word
42 | 	MUL	R6, R3, R14 // R14 = low half of R6*y
43 | 	ADCS	R14, R10
44 | 	MUL	R7, R3, R15 // R15 = low half of R7*y
45 | 	LDP	16(R1), (R11, R12) // R11, R12 = last two z words of the block
46 | 	ADCS	R15, R11
47 | 	MUL	R8, R3, R16 // R16 = low half of R8*y
48 | 	ADCS	R16, R12
49 | 	UMULH	R8, R3, R20 // R20 = high half of R8*y
50 | 	ADC	$0, R20 // R20 += carry out of the first chain
51 | 
52 | 	MUL	R5, R3, R13 // R13 = low half of R5*y
53 | 	ADDS	R13, R9 // second carry chain starts
54 | 	UMULH	R5, R3, R17 // R17 = high half of R5*y
55 | 	ADCS	R17, R10
56 | 	UMULH	R6, R3, R21 // R21 = high half of R6*y
57 | 	STP.P	(R9, R10), 16(R1) // store the first two result words; z advances by 16 bytes
58 | 	ADCS	R21, R11
59 | 	UMULH	R7, R3, R19 // R19 = high half of R7*y
60 | 	ADCS	R19, R12
61 | 	STP.P	(R11, R12), 16(R1) // store the last two result words; z advances by 16 bytes
62 | 	ADC	$0, R20, R4 // R4 = R20 + carry out of the second chain: the next carry word
63 | 
64 | 	SUB	$4, R0 // four words done
65 | 	B	loop
66 | 
67 | done:
68 | 	MOVD	R4, c+24(FP) // return the final carry word
69 | 	RET
70 | 


--------------------------------------------------------------------------------
/nat_asm.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2023 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego && (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || riscv64 || s390x)
 6 | 
 7 | package bigmod
 8 | 
 9 | import "golang.org/x/sys/cpu"
10 | 
11 | // amd64 assembly uses ADCX/ADOX/MULX if ADX is available to run two carry
12 | // chains in the flags in parallel across the whole operation, and aggressively
13 | // unrolls loops. arm64 processes four words at a time.
14 | //
15 | // It's unclear why the assembly for all other architectures, as well as for
16 | // amd64 without ADX, performs better than the compiler output.
17 | // TODO(filippo): file cmd/compile performance issue.
18 | 
19 | var supportADX = cpu.X86.HasADX && cpu.X86.HasBMI2 // true only when both ADX and BMI2 are reported; the amd64 assembly compares it with 1 to pick its ADX path
20 | 
21 | //go:noescape
22 | func addMulVVW1024(z, x *uint, y uint) (c uint) // body is in the per-architecture .s files: adds x*y into the 1024-bit z, returns the carry word
23 | 
24 | //go:noescape
25 | func addMulVVW1536(z, x *uint, y uint) (c uint) // body is in the per-architecture .s files: adds x*y into the 1536-bit z, returns the carry word
26 | 
27 | //go:noescape
28 | func addMulVVW2048(z, x *uint, y uint) (c uint) // body is in the per-architecture .s files: adds x*y into the 2048-bit z, returns the carry word
29 | 


--------------------------------------------------------------------------------
/nat_loong64.s:
--------------------------------------------------------------------------------
 1 | // Copyright 2024 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // derived from crypto/internal/fips140/bigmod/nat_riscv64.s
 6 | 
 7 | //go:build !purego
 8 | 
 9 | #include "textflag.h"
10 | 
11 | // func addMulVVW1024(z, x *uint, y uint) (c uint)
12 | TEXT ·addMulVVW1024(SB),$0-32 // 1024-bit entry point; the work is done by addMulVVWx<>
13 | 	MOVV	$16, R8 // R8 = 16 words (1024 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
14 | 	JMP	addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
15 | 
16 | // func addMulVVW1536(z, x *uint, y uint) (c uint)
17 | TEXT ·addMulVVW1536(SB),$0-32 // 1536-bit entry point; the work is done by addMulVVWx<>
18 | 	MOVV	$24, R8 // R8 = 24 words (1536 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
19 | 	JMP	addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
20 | 
21 | // func addMulVVW2048(z, x *uint, y uint) (c uint)
22 | TEXT ·addMulVVW2048(SB),$0-32 // 2048-bit entry point; the work is done by addMulVVWx<>
23 | 	MOVV	$32, R8 // R8 = 32 words (2048 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
24 | 	JMP	addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
25 | 
26 | TEXT addMulVVWx<>(SB),NOFRAME|NOSPLIT,$0 // shared loop; R8 = remaining 64-bit words, four handled per pass
27 | 	MOVV	z+0(FP), R4 // R4 = z
28 | 	MOVV	x+8(FP), R6 // R6 = x
29 | 	MOVV	y+16(FP), R5 // R5 = y
30 | 	MOVV	$0, R7 // R7 = carry word c, initially 0
31 | 
32 | 	BEQ	R8, R0, done // skip the loop for a zero word count
33 | loop:
34 | 	MOVV	0*8(R4), R9	// z[0]
35 | 	MOVV	1*8(R4), R10	// z[1]
36 | 	MOVV	2*8(R4), R11	// z[2]
37 | 	MOVV	3*8(R4), R12	// z[3]
38 | 
39 | 	MOVV	0*8(R6), R13	// x[0]
40 | 	MOVV	1*8(R6), R14	// x[1]
41 | 	MOVV	2*8(R6), R15	// x[2]
42 | 	MOVV	3*8(R6), R16	// x[3]
43 | 
44 | 	MULHVU	R13, R5, R17	// z_hi[0] = x[0] * y
45 | 	MULV	R13, R5, R13	// z_lo[0] = x[0] * y
46 | 	ADDV	R13, R9, R18	// z_lo[0] = x[0] * y + z[0]
47 | 	SGTU	R13, R18, R19 // R19 = carry out of the addition above
48 | 	ADDV	R17, R19, R17	// z_hi[0] = x[0] * y + z[0]
49 | 	ADDV	R18, R7, R9	// z_lo[0] = x[0] * y + z[0] + c
50 | 	SGTU	R18, R9, R19 // R19 = carry out of the addition above
51 | 	ADDV	R17, R19, R7	// next c
52 | 
53 | 	MULHVU	R14, R5, R24	// z_hi[1] = x[1] * y
54 | 	MULV	R14, R5, R14	// z_lo[1] = x[1] * y
55 | 	ADDV	R14, R10, R18	// z_lo[1] = x[1] * y + z[1]
56 | 	SGTU	R14, R18, R19 // R19 = carry out of the addition above
57 | 	ADDV	R24, R19, R24	// z_hi[1] = x[1] * y + z[1]
58 | 	ADDV	R18, R7, R10	// z_lo[1] = x[1] * y + z[1] + c
59 | 	SGTU	R18, R10, R19 // R19 = carry out of the addition above
60 | 	ADDV	R24, R19, R7	// next c
61 | 
62 | 	MULHVU	R15, R5, R25	// z_hi[2] = x[2] * y
63 | 	MULV	R15, R5, R15	// z_lo[2] = x[2] * y
64 | 	ADDV	R15, R11, R18	// z_lo[2] = x[2] * y + z[2]
65 | 	SGTU	R15, R18, R19 // R19 = carry out of the addition above
66 | 	ADDV	R25, R19, R25	// z_hi[2] = x[2] * y + z[2]
67 | 	ADDV	R18, R7, R11	// z_lo[2] = x[2] * y + z[2] + c
68 | 	SGTU	R18, R11, R19 // R19 = carry out of the addition above
69 | 	ADDV	R25, R19, R7	// next c
70 | 
71 | 	MULHVU	R16, R5, R26	// z_hi[3] = x[3] * y
72 | 	MULV	R16, R5, R16	// z_lo[3] = x[3] * y
73 | 	ADDV	R16, R12, R18	// z_lo[3] = x[3] * y + z[3]
74 | 	SGTU	R16, R18, R19 // R19 = carry out of the addition above
75 | 	ADDV	R26, R19, R26	// z_hi[3] = x[3] * y + z[3]
76 | 	ADDV	R18, R7, R12	// z_lo[3] = x[3] * y + z[3] + c
77 | 	SGTU	R18, R12, R19 // R19 = carry out of the addition above
78 | 	ADDV	R26, R19, R7	// next c
79 | 
80 | 	MOVV	R9, 0*8(R4)	// z[0]
81 | 	MOVV	R10, 1*8(R4)	// z[1]
82 | 	MOVV	R11, 2*8(R4)	// z[2]
83 | 	MOVV	R12, 3*8(R4)	// z[3]
84 | 
85 | 	ADDV	$32, R4 // advance z by four words
86 | 	ADDV	$32, R6 // advance x by four words
87 | 
88 | 	SUBV	$4, R8 // four words done
89 | 	BNE	R8, R0, loop // repeat while words remain
90 | 
91 | done:
92 | 	MOVV	R7, c+24(FP) // return the final carry word
93 | 	RET
94 | 


--------------------------------------------------------------------------------
/nat_noasm.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2023 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build purego || !(386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || riscv64 || s390x || wasm)
 6 | 
 7 | package bigmod
 8 | 
 9 | import "unsafe"
10 | 
11 | // addMulVVW1024 is the portable implementation: it views z and x as slices of
12 | // 1024/_W words and passes them, together with y, to the generic addMulVVW.
13 | func addMulVVW1024(z, x *uint, y uint) (c uint) {
14 | 	zs, xs := unsafe.Slice(z, 1024/_W), unsafe.Slice(x, 1024/_W)
15 | 	c = addMulVVW(zs, xs, y)
16 | 	return c
17 | }
14 | 
15 | // addMulVVW1536 is the portable implementation: it views z and x as slices of
16 | // 1536/_W words and passes them, together with y, to the generic addMulVVW.
17 | func addMulVVW1536(z, x *uint, y uint) (c uint) {
18 | 	zs, xs := unsafe.Slice(z, 1536/_W), unsafe.Slice(x, 1536/_W)
19 | 	c = addMulVVW(zs, xs, y)
20 | 	return c
21 | }
18 | 
19 | // addMulVVW2048 is the portable implementation: it views z and x as slices of
20 | // 2048/_W words and passes them, together with y, to the generic addMulVVW.
21 | func addMulVVW2048(z, x *uint, y uint) (c uint) {
22 | 	zs, xs := unsafe.Slice(z, 2048/_W), unsafe.Slice(x, 2048/_W)
23 | 	c = addMulVVW(zs, xs, y)
24 | 	return c
25 | }
22 | 


--------------------------------------------------------------------------------
/nat_ppc64x.s:
--------------------------------------------------------------------------------
 1 | // Copyright 2013 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego && (ppc64 || ppc64le)
 6 | 
 7 | #include "textflag.h"
 8 | 
 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint)
10 | TEXT ·addMulVVW1024(SB), $0-32 // 1024-bit entry point; the work is done by addMulVVWx<>
11 | 	MOVD	$4, R6 // R6 = z_len/4 = 16 words / 4, the loop count addMulVVWx<> loads into CTR
12 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
13 | 
14 | // func addMulVVW1536(z, x *uint, y uint) (c uint)
15 | TEXT ·addMulVVW1536(SB), $0-32 // 1536-bit entry point; the work is done by addMulVVWx<>
16 | 	MOVD	$6, R6 // R6 = z_len/4 = 24 words / 4, the loop count addMulVVWx<> loads into CTR
17 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
18 | 
19 | // func addMulVVW2048(z, x *uint, y uint) (c uint)
20 | TEXT ·addMulVVW2048(SB), $0-32 // 2048-bit entry point; the work is done by addMulVVWx<>
21 | 	MOVD	$8, R6 // R6 = z_len/4 = 32 words / 4, the loop count addMulVVWx<> loads into CTR
22 | 	JMP		addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
23 | 
24 | // This local function expects to be called only by
25 | // callers above. R6 contains the z length/4
26 | // since 4 values are processed for each
27 | // loop iteration, and is guaranteed to be > 0.
28 | // If other callers are added this function might
29 | // need to change.
30 | TEXT addMulVVWx<>(SB), NOSPLIT, $0 // z[i] = low(x[i]*y + z[i] + c) four words per pass; returns the last carry word
31 | 	MOVD	z+0(FP), R3 // R3 = z
32 | 	MOVD	x+8(FP), R4 // R4 = x
33 | 	MOVD	y+16(FP), R5 // R5 = y
34 | 
35 | 	MOVD	$0, R9		// R9 = c = 0
36 | 	MOVD	R6, CTR		// Initialize loop counter
37 | 	PCALIGN	$16 // align the loop entry to 16 bytes
38 | 
39 | loop:
40 | 	MOVD	0(R4), R14	// x[i]
41 | 	MOVD	8(R4), R16	// x[i+1]
42 | 	MOVD	16(R4), R18	// x[i+2]
43 | 	MOVD	24(R4), R20	// x[i+3]
44 | 	MOVD	0(R3), R15	// z[i]
45 | 	MOVD	8(R3), R17	// z[i+1]
46 | 	MOVD	16(R3), R19	// z[i+2]
47 | 	MOVD	24(R3), R21	// z[i+3]
48 | 	MULLD	R5, R14, R10	// low x[i]*y
49 | 	MULHDU	R5, R14, R11	// high x[i]*y
50 | 	ADDC	R15, R10 // low += z[i], recording the carry
51 | 	ADDZE	R11 // high += that carry
52 | 	ADDC	R9, R10 // low += c, recording the carry
53 | 	ADDZE	R11, R9 // c = high + that carry
54 | 	MULLD	R5, R16, R14	// low x[i+1]*y
55 | 	MULHDU	R5, R16, R15	// high x[i+1]*y
56 | 	ADDC	R17, R14 // low += z[i+1], recording the carry
57 | 	ADDZE	R15 // high += that carry
58 | 	ADDC	R9, R14 // low += c, recording the carry
59 | 	ADDZE	R15, R9 // c = high + that carry
60 | 	MULLD	R5, R18, R16	// low x[i+2]*y
61 | 	MULHDU	R5, R18, R17	// high x[i+2]*y
62 | 	ADDC	R19, R16 // low += z[i+2], recording the carry
63 | 	ADDZE	R17 // high += that carry
64 | 	ADDC	R9, R16 // low += c, recording the carry
65 | 	ADDZE	R17, R9 // c = high + that carry
66 | 	MULLD	R5, R20, R18	// low x[i+3]*y
67 | 	MULHDU	R5, R20, R19	// high x[i+3]*y
68 | 	ADDC	R21, R18 // low += z[i+3], recording the carry
69 | 	ADDZE	R19 // high += that carry
70 | 	ADDC	R9, R18 // low += c, recording the carry
71 | 	ADDZE	R19, R9 // c = high + that carry
72 | 	MOVD	R10, 0(R3)	// z[i]
73 | 	MOVD	R14, 8(R3)	// z[i+1]
74 | 	MOVD	R16, 16(R3)	// z[i+2]
75 | 	MOVD	R18, 24(R3)	// z[i+3]
76 | 	ADD	$32, R3 // advance z by four words
77 | 	ADD	$32, R4 // advance x by four words
78 | 	BDNZ	loop // decrement CTR and repeat while it is non-zero
79 | 
80 | done:
81 | 	MOVD	R9, c+24(FP) // return the final carry word
82 | 	RET
83 | 


--------------------------------------------------------------------------------
/nat_riscv64.s:
--------------------------------------------------------------------------------
 1 | // Copyright 2023 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego
 6 | 
 7 | #include "textflag.h"
 8 | 
 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint)
10 | TEXT ·addMulVVW1024(SB),$0-32 // 1024-bit entry point; the work is done by addMulVVWx<>
11 | 	MOV	$16, X30 // X30 = 16 words (1024 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
12 | 	JMP	addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
13 | 
14 | // func addMulVVW1536(z, x *uint, y uint) (c uint)
15 | TEXT ·addMulVVW1536(SB),$0-32 // 1536-bit entry point; the work is done by addMulVVWx<>
16 | 	MOV	$24, X30 // X30 = 24 words (1536 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
17 | 	JMP	addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
18 | 
19 | // func addMulVVW2048(z, x *uint, y uint) (c uint)
20 | TEXT ·addMulVVW2048(SB),$0-32 // 2048-bit entry point; the work is done by addMulVVWx<>
21 | 	MOV	$32, X30 // X30 = 32 words (2048 bits / 64); addMulVVWx<> counts it down by 4 per loop pass
22 | 	JMP	addMulVVWx<>(SB) // jump, not call: addMulVVWx<> loads z, x, y and stores c
23 | 
24 | TEXT addMulVVWx<>(SB),NOFRAME|NOSPLIT,$0 // shared loop; X30 = remaining 64-bit words, four handled per pass
25 | 	MOV	z+0(FP), X5 // X5 = z
26 | 	MOV	x+8(FP), X7 // X7 = x
27 | 	MOV	y+16(FP), X6 // X6 = y
28 | 	MOV	$0, X29 // X29 = carry word c, initially 0
29 | 
30 | 	BEQZ	X30, done // skip the loop for a zero word count
31 | loop:
32 | 	MOV	0*8(X5), X10	// z[0]
33 | 	MOV	1*8(X5), X13	// z[1]
34 | 	MOV	2*8(X5), X16	// z[2]
35 | 	MOV	3*8(X5), X19	// z[3]
36 | 
37 | 	MOV	0*8(X7), X8	// x[0]
38 | 	MOV	1*8(X7), X11	// x[1]
39 | 	MOV	2*8(X7), X14	// x[2]
40 | 	MOV	3*8(X7), X17	// x[3]
41 | 
42 | 	MULHU	X8, X6, X9	// z_hi[0] = x[0] * y
43 | 	MUL	X8, X6, X8	// z_lo[0] = x[0] * y
44 | 	ADD	X8, X10, X21	// z_lo[0] = x[0] * y + z[0]
45 | 	SLTU	X8, X21, X22 // X22 = carry out of the addition above
46 | 	ADD	X9, X22, X9	// z_hi[0] = x[0] * y + z[0]
47 | 	ADD	X21, X29, X10	// z_lo[0] = x[0] * y + z[0] + c
48 | 	SLTU	X21, X10, X22 // X22 = carry out of the addition above
49 | 	ADD	X9, X22, X29	// next c
50 | 
51 | 	MULHU	X11, X6, X12	// z_hi[1] = x[1] * y
52 | 	MUL	X11, X6, X11	// z_lo[1] = x[1] * y
53 | 	ADD	X11, X13, X21	// z_lo[1] = x[1] * y + z[1]
54 | 	SLTU	X11, X21, X22 // X22 = carry out of the addition above
55 | 	ADD	X12, X22, X12	// z_hi[1] = x[1] * y + z[1]
56 | 	ADD	X21, X29, X13	// z_lo[1] = x[1] * y + z[1] + c
57 | 	SLTU	X21, X13, X22 // X22 = carry out of the addition above
58 | 	ADD	X12, X22, X29	// next c
59 | 
60 | 	MULHU	X14, X6, X15	// z_hi[2] = x[2] * y
61 | 	MUL	X14, X6, X14	// z_lo[2] = x[2] * y
62 | 	ADD	X14, X16, X21	// z_lo[2] = x[2] * y + z[2]
63 | 	SLTU	X14, X21, X22 // X22 = carry out of the addition above
64 | 	ADD	X15, X22, X15	// z_hi[2] = x[2] * y + z[2]
65 | 	ADD	X21, X29, X16	// z_lo[2] = x[2] * y + z[2] + c
66 | 	SLTU	X21, X16, X22 // X22 = carry out of the addition above
67 | 	ADD	X15, X22, X29	// next c
68 | 
69 | 	MULHU	X17, X6, X18	// z_hi[3] = x[3] * y
70 | 	MUL	X17, X6, X17	// z_lo[3] = x[3] * y
71 | 	ADD	X17, X19, X21	// z_lo[3] = x[3] * y + z[3]
72 | 	SLTU	X17, X21, X22 // X22 = carry out of the addition above
73 | 	ADD	X18, X22, X18	// z_hi[3] = x[3] * y + z[3]
74 | 	ADD	X21, X29, X19	// z_lo[3] = x[3] * y + z[3] + c
75 | 	SLTU	X21, X19, X22 // X22 = carry out of the addition above
76 | 	ADD	X18, X22, X29	// next c
77 | 
78 | 	MOV	X10, 0*8(X5)	// z[0]
79 | 	MOV	X13, 1*8(X5)	// z[1]
80 | 	MOV	X16, 2*8(X5)	// z[2]
81 | 	MOV	X19, 3*8(X5)	// z[3]
82 | 
83 | 	ADDI	$32, X5 // advance z by four words
84 | 	ADDI	$32, X7 // advance x by four words
85 | 
86 | 	ADDI	$-4, X30 // four words done
87 | 	BNEZ	X30, loop // repeat while words remain
88 | 
89 | done:
90 | 	MOV	X29, c+24(FP) // return the final carry word
91 | 	RET
92 | 


--------------------------------------------------------------------------------
/nat_s390x.s:
--------------------------------------------------------------------------------
 1 | // Copyright 2016 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego
 6 | 
 7 | #include "textflag.h"
 8 | 
// func addMulVVW1024(z, x *uint, y uint) (c uint)
//
// Entry point for 1024-bit operands: loads the limb count (16 words of
// 8 bytes) into R5 and jumps to the shared loop in addMulVVWx, which reads
// the arguments and writes the result through this function's frame.
TEXT ·addMulVVW1024(SB), $0-32
	MOVD	$16, R5
	JMP		addMulVVWx<>(SB)
13 | 
// func addMulVVW1536(z, x *uint, y uint) (c uint)
//
// Entry point for 1536-bit operands: loads the limb count (24 words of
// 8 bytes) into R5 and jumps to the shared loop in addMulVVWx, which reads
// the arguments and writes the result through this function's frame.
TEXT ·addMulVVW1536(SB), $0-32
	MOVD	$24, R5
	JMP		addMulVVWx<>(SB)
18 | 
// func addMulVVW2048(z, x *uint, y uint) (c uint)
//
// Entry point for 2048-bit operands: loads the limb count (32 words of
// 8 bytes) into R5 and jumps to the shared loop in addMulVVWx, which reads
// the arguments and writes the result through this function's frame.
TEXT ·addMulVVW2048(SB), $0-32
	MOVD	$32, R5
	JMP		addMulVVWx<>(SB)
23 | 
// addMulVVWx computes z += x * y over R5 limbs and stores the final carry in
// c. It is reached by a jump from the sized entry points with the limb count
// already in R5, and it reads z, x, y and writes c through FP.
//
// Register use:
//	R2  = z base pointer
//	R8  = x base pointer
//	R9  = y
//	R1  = byte offset of the current limb (i*8)
//	R7  = limb index i
//	R0  = constant zero, added with ADDE to fold in the carry flag
//	R4  = running carry c
//	R5  = limb count n
//	R12 = n with its low bit cleared (bound for the two-limb loop)
//	R6  = x[i] on input to MULHDU, high product word afterwards
//	R10 = z[i]
//	R11 = low product word
//	      NOTE(review): R11 is never written explicitly in this function;
//	      presumably it is filled by the assembler's expansion of MULHDU.
//	      Confirm against the s390x assembler before touching R10/R11.
TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+8(FP), R8
	MOVD y+16(FP), R9

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12
	AND    $-2, R12     // R12 = n &^ 1
	CMPBGE R5, $2, A6   // at least two limbs: use the unrolled loop
	BR     E6

A6:
	// Unrolled loop: two limbs per iteration.
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11           // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	// TODO: drop unused single-step loop.
	// Only reached when n is odd; the sized entry points pass 16, 24 and 32.
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+24(FP)
	RET
86 | 


--------------------------------------------------------------------------------
/nat_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package bigmod
  6 | 
  7 | import (
  8 | 	"bufio"
  9 | 	"bytes"
 10 | 	cryptorand "crypto/rand"
 11 | 	"encoding/hex"
 12 | 	"fmt"
 13 | 	"math/big"
 14 | 	"math/bits"
 15 | 	"math/rand"
 16 | 	"os"
 17 | 	"reflect"
 18 | 	"slices"
 19 | 	"strings"
 20 | 	"testing"
 21 | 	"testing/quick"
 22 | )
 23 | 
 24 | // setBig assigns x = n, optionally resizing n to the appropriate size.
 25 | //
 26 | // The announced length of x is set based on the actual bit size of the input,
 27 | // ignoring leading zeroes.
 28 | func (x *Nat) setBig(n *big.Int) *Nat {
 29 | 	limbs := n.Bits()
 30 | 	x.reset(len(limbs))
 31 | 	for i := range limbs {
 32 | 		x.limbs[i] = uint(limbs[i])
 33 | 	}
 34 | 	return x
 35 | }
 36 | 
 37 | func (n *Nat) asBig() *big.Int {
 38 | 	bits := make([]big.Word, len(n.limbs))
 39 | 	for i := range n.limbs {
 40 | 		bits[i] = big.Word(n.limbs[i])
 41 | 	}
 42 | 	return new(big.Int).SetBits(bits)
 43 | }
 44 | 
 45 | func (n *Nat) String() string {
 46 | 	var limbs []string
 47 | 	for i := range n.limbs {
 48 | 		limbs = append(limbs, fmt.Sprintf("%016X", n.limbs[len(n.limbs)-1-i]))
 49 | 	}
 50 | 	return "{" + strings.Join(limbs, " ") + "}"
 51 | }
 52 | 
 53 | // Generate generates an even nat. It's used by testing/quick to produce random
 54 | // *nat values for quick.Check invocations.
 55 | func (*Nat) Generate(r *rand.Rand, size int) reflect.Value {
 56 | 	limbs := make([]uint, size)
 57 | 	for i := 0; i < size; i++ {
 58 | 		limbs[i] = uint(r.Uint64()) & ((1 << _W) - 2)
 59 | 	}
 60 | 	return reflect.ValueOf(&Nat{limbs})
 61 | }
 62 | 
 63 | func testModAddCommutative(a *Nat, b *Nat) bool {
 64 | 	m := maxModulus(uint(len(a.limbs)))
 65 | 	aPlusB := new(Nat).set(a)
 66 | 	aPlusB.Add(b, m)
 67 | 	bPlusA := new(Nat).set(b)
 68 | 	bPlusA.Add(a, m)
 69 | 	return aPlusB.Equal(bPlusA) == 1
 70 | }
 71 | 
 72 | func TestModAddCommutative(t *testing.T) {
 73 | 	err := quick.Check(testModAddCommutative, &quick.Config{})
 74 | 	if err != nil {
 75 | 		t.Error(err)
 76 | 	}
 77 | }
 78 | 
 79 | func testModSubThenAddIdentity(a *Nat, b *Nat) bool {
 80 | 	m := maxModulus(uint(len(a.limbs)))
 81 | 	original := new(Nat).set(a)
 82 | 	a.Sub(b, m)
 83 | 	a.Add(b, m)
 84 | 	return a.Equal(original) == 1
 85 | }
 86 | 
 87 | func TestModSubThenAddIdentity(t *testing.T) {
 88 | 	err := quick.Check(testModSubThenAddIdentity, &quick.Config{})
 89 | 	if err != nil {
 90 | 		t.Error(err)
 91 | 	}
 92 | }
 93 | 
 94 | func TestMontgomeryRoundtrip(t *testing.T) {
 95 | 	err := quick.Check(func(a *Nat) bool {
 96 | 		one := &Nat{make([]uint, len(a.limbs))}
 97 | 		one.limbs[0] = 1
 98 | 		aPlusOne := new(big.Int).SetBytes(natBytes(a))
 99 | 		aPlusOne.Add(aPlusOne, big.NewInt(1))
100 | 		m, _ := NewModulus(aPlusOne.Bytes())
101 | 		monty := new(Nat).set(a)
102 | 		monty.montgomeryRepresentation(m)
103 | 		aAgain := new(Nat).set(monty)
104 | 		aAgain.montgomeryMul(monty, one, m)
105 | 		if a.Equal(aAgain) != 1 {
106 | 			t.Errorf("%v != %v", a, aAgain)
107 | 			return false
108 | 		}
109 | 		return true
110 | 	}, &quick.Config{})
111 | 	if err != nil {
112 | 		t.Error(err)
113 | 	}
114 | }
115 | 
116 | func TestShiftIn(t *testing.T) {
117 | 	if bits.UintSize != 64 {
118 | 		t.Skip("examples are only valid in 64 bit")
119 | 	}
120 | 	examples := []struct {
121 | 		m, x, expected []byte
122 | 		y              uint64
123 | 	}{{
124 | 		m:        []byte{13},
125 | 		x:        []byte{0},
126 | 		y:        0xFFFF_FFFF_FFFF_FFFF,
127 | 		expected: []byte{2},
128 | 	}, {
129 | 		m:        []byte{13},
130 | 		x:        []byte{7},
131 | 		y:        0xFFFF_FFFF_FFFF_FFFF,
132 | 		expected: []byte{10},
133 | 	}, {
134 | 		m:        []byte{0x06, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d},
135 | 		x:        make([]byte, 9),
136 | 		y:        0xFFFF_FFFF_FFFF_FFFF,
137 | 		expected: []byte{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
138 | 	}, {
139 | 		m:        []byte{0x06, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d},
140 | 		x:        []byte{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
141 | 		y:        0,
142 | 		expected: []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06},
143 | 	}}
144 | 
145 | 	for i, tt := range examples {
146 | 		m := modulusFromBytes(tt.m)
147 | 		got := natFromBytes(tt.x).ExpandFor(m).shiftIn(uint(tt.y), m)
148 | 		if exp := natFromBytes(tt.expected).ExpandFor(m); got.Equal(exp) != 1 {
149 | 			t.Errorf("%d: got %v, expected %v", i, got, exp)
150 | 		}
151 | 	}
152 | }
153 | 
154 | func TestModulusAndNatSizes(t *testing.T) {
155 | 	// These are 126 bit (2 * _W on 64-bit architectures) values, serialized as
156 | 	// 128 bits worth of bytes. If leading zeroes are stripped, they fit in two
157 | 	// limbs, if they are not, they fit in three. This can be a problem because
158 | 	// modulus strips leading zeroes and nat does not.
159 | 	m := modulusFromBytes([]byte{
160 | 		0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
161 | 		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
162 | 	xb := []byte{0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
163 | 		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}
164 | 	natFromBytes(xb).ExpandFor(m) // must not panic for shrinking
165 | 	NewNat().SetBytes(xb, m)
166 | }
167 | 
168 | func TestSetBytes(t *testing.T) {
169 | 	tests := []struct {
170 | 		m, b []byte
171 | 		fail bool
172 | 	}{{
173 | 		m: []byte{0xff, 0xff},
174 | 		b: []byte{0x00, 0x01},
175 | 	}, {
176 | 		m:    []byte{0xff, 0xff},
177 | 		b:    []byte{0xff, 0xff},
178 | 		fail: true,
179 | 	}, {
180 | 		m: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
181 | 		b: []byte{0x00, 0x01},
182 | 	}, {
183 | 		m: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
184 | 		b: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe},
185 | 	}, {
186 | 		m:    []byte{0xff, 0xff},
187 | 		b:    []byte{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
188 | 		fail: true,
189 | 	}, {
190 | 		m:    []byte{0xff, 0xff},
191 | 		b:    []byte{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
192 | 		fail: true,
193 | 	}, {
194 | 		m: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
195 | 		b: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe},
196 | 	}, {
197 | 		m:    []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
198 | 		b:    []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe},
199 | 		fail: true,
200 | 	}, {
201 | 		m:    []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
202 | 		b:    []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
203 | 		fail: true,
204 | 	}, {
205 | 		m:    []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
206 | 		b:    []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe},
207 | 		fail: true,
208 | 	}, {
209 | 		m:    []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd},
210 | 		b:    []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
211 | 		fail: true,
212 | 	}}
213 | 
214 | 	for i, tt := range tests {
215 | 		m := modulusFromBytes(tt.m)
216 | 		got, err := NewNat().SetBytes(tt.b, m)
217 | 		if err != nil {
218 | 			if !tt.fail {
219 | 				t.Errorf("%d: unexpected error: %v", i, err)
220 | 			}
221 | 			continue
222 | 		}
223 | 		if tt.fail {
224 | 			t.Errorf("%d: unexpected success", i)
225 | 			continue
226 | 		}
227 | 		if expected := natFromBytes(tt.b).ExpandFor(m); choice(got.Equal(expected)) != yes {
228 | 			t.Errorf("%d: got %v, expected %v", i, got, expected)
229 | 		}
230 | 	}
231 | 
232 | 	f := func(xBytes []byte) bool {
233 | 		m := maxModulus(uint(len(xBytes)*8/_W + 1))
234 | 		got, err := NewNat().SetBytes(xBytes, m)
235 | 		if err != nil {
236 | 			return false
237 | 		}
238 | 		return choice(got.Equal(natFromBytes(xBytes).ExpandFor(m))) == yes
239 | 	}
240 | 
241 | 	err := quick.Check(f, &quick.Config{})
242 | 	if err != nil {
243 | 		t.Error(err)
244 | 	}
245 | }
246 | 
247 | func TestExpand(t *testing.T) {
248 | 	sliced := []uint{1, 2, 3, 4}
249 | 	examples := []struct {
250 | 		in  []uint
251 | 		n   int
252 | 		out []uint
253 | 	}{{
254 | 		[]uint{1, 2},
255 | 		4,
256 | 		[]uint{1, 2, 0, 0},
257 | 	}, {
258 | 		sliced[:2],
259 | 		4,
260 | 		[]uint{1, 2, 0, 0},
261 | 	}, {
262 | 		[]uint{1, 2},
263 | 		2,
264 | 		[]uint{1, 2},
265 | 	}}
266 | 
267 | 	for i, tt := range examples {
268 | 		got := (&Nat{tt.in}).expand(tt.n)
269 | 		if len(got.limbs) != len(tt.out) || got.Equal(&Nat{tt.out}) != 1 {
270 | 			t.Errorf("%d: got %v, expected %v", i, got, tt.out)
271 | 		}
272 | 	}
273 | }
274 | 
275 | func TestMod(t *testing.T) {
276 | 	m := modulusFromBytes([]byte{0x06, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d})
277 | 	x := natFromBytes([]byte{0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})
278 | 	out := new(Nat)
279 | 	out.Mod(x, m)
280 | 	expected := natFromBytes([]byte{0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09})
281 | 	if out.Equal(expected) != 1 {
282 | 		t.Errorf("%+v != %+v", out, expected)
283 | 	}
284 | }
285 | 
286 | func TestModSub(t *testing.T) {
287 | 	m := modulusFromBytes([]byte{13})
288 | 	x := &Nat{[]uint{6}}
289 | 	y := &Nat{[]uint{7}}
290 | 	x.Sub(y, m)
291 | 	expected := &Nat{[]uint{12}}
292 | 	if x.Equal(expected) != 1 {
293 | 		t.Errorf("%+v != %+v", x, expected)
294 | 	}
295 | 	x.Sub(y, m)
296 | 	expected = &Nat{[]uint{5}}
297 | 	if x.Equal(expected) != 1 {
298 | 		t.Errorf("%+v != %+v", x, expected)
299 | 	}
300 | }
301 | 
302 | func TestModAdd(t *testing.T) {
303 | 	m := modulusFromBytes([]byte{13})
304 | 	x := &Nat{[]uint{6}}
305 | 	y := &Nat{[]uint{7}}
306 | 	x.Add(y, m)
307 | 	expected := &Nat{[]uint{0}}
308 | 	if x.Equal(expected) != 1 {
309 | 		t.Errorf("%+v != %+v", x, expected)
310 | 	}
311 | 	x.Add(y, m)
312 | 	expected = &Nat{[]uint{7}}
313 | 	if x.Equal(expected) != 1 {
314 | 		t.Errorf("%+v != %+v", x, expected)
315 | 	}
316 | }
317 | 
318 | func TestExp(t *testing.T) {
319 | 	m := modulusFromBytes([]byte{13})
320 | 	x := &Nat{[]uint{3}}
321 | 	out := &Nat{[]uint{0}}
322 | 	out.Exp(x, []byte{12}, m)
323 | 	expected := &Nat{[]uint{1}}
324 | 	if out.Equal(expected) != 1 {
325 | 		t.Errorf("%+v != %+v", out, expected)
326 | 	}
327 | }
328 | 
329 | func TestExpShort(t *testing.T) {
330 | 	m := modulusFromBytes([]byte{13})
331 | 	x := &Nat{[]uint{3}}
332 | 	out := &Nat{[]uint{0}}
333 | 	out.ExpShortVarTime(x, 12, m)
334 | 	expected := &Nat{[]uint{1}}
335 | 	if out.Equal(expected) != 1 {
336 | 		t.Errorf("%+v != %+v", out, expected)
337 | 	}
338 | }
339 | 
340 | // TestMulReductions tests that Mul reduces results equal or slightly greater
341 | // than the modulus. Some Montgomery algorithms don't and need extra care to
342 | // return correct results. See https://go.dev/issue/13907.
343 | func TestMulReductions(t *testing.T) {
344 | 	// Two short but multi-limb primes.
345 | 	a, _ := new(big.Int).SetString("773608962677651230850240281261679752031633236267106044359907", 10)
346 | 	b, _ := new(big.Int).SetString("180692823610368451951102211649591374573781973061758082626801", 10)
347 | 	n := new(big.Int).Mul(a, b)
348 | 
349 | 	N, _ := NewModulus(n.Bytes())
350 | 	A := NewNat().setBig(a).ExpandFor(N)
351 | 	B := NewNat().setBig(b).ExpandFor(N)
352 | 
353 | 	if A.Mul(B, N).IsZero() != 1 {
354 | 		t.Error("a * b mod (a * b) != 0")
355 | 	}
356 | 
357 | 	i := new(big.Int).ModInverse(a, b)
358 | 	N, _ = NewModulus(b.Bytes())
359 | 	A = NewNat().setBig(a).ExpandFor(N)
360 | 	I := NewNat().setBig(i).ExpandFor(N)
361 | 	one := NewNat().setBig(big.NewInt(1)).ExpandFor(N)
362 | 
363 | 	if A.Mul(I, N).Equal(one) != 1 {
364 | 		t.Error("a * inv(a) mod b != 1")
365 | 	}
366 | }
367 | 
368 | func TestMul(t *testing.T) {
369 | 	t.Run("small", func(t *testing.T) { testMul(t, 760/8) })
370 | 	t.Run("1024", func(t *testing.T) { testMul(t, 1024/8) })
371 | 	t.Run("1536", func(t *testing.T) { testMul(t, 1536/8) })
372 | 	t.Run("2048", func(t *testing.T) { testMul(t, 2048/8) })
373 | }
374 | 
375 | func testMul(t *testing.T, n int) {
376 | 	a, b, m := make([]byte, n), make([]byte, n), make([]byte, n)
377 | 	cryptorand.Read(a)
378 | 	cryptorand.Read(b)
379 | 	cryptorand.Read(m)
380 | 
381 | 	// Pick the highest as the modulus.
382 | 	if bytes.Compare(a, m) > 0 {
383 | 		a, m = m, a
384 | 	}
385 | 	if bytes.Compare(b, m) > 0 {
386 | 		b, m = m, b
387 | 	}
388 | 
389 | 	M, err := NewModulus(m)
390 | 	if err != nil {
391 | 		t.Fatal(err)
392 | 	}
393 | 	A, err := NewNat().SetBytes(a, M)
394 | 	if err != nil {
395 | 		t.Fatal(err)
396 | 	}
397 | 	B, err := NewNat().SetBytes(b, M)
398 | 	if err != nil {
399 | 		t.Fatal(err)
400 | 	}
401 | 
402 | 	A.Mul(B, M)
403 | 	ABytes := A.Bytes(M)
404 | 
405 | 	mBig := new(big.Int).SetBytes(m)
406 | 	aBig := new(big.Int).SetBytes(a)
407 | 	bBig := new(big.Int).SetBytes(b)
408 | 	nBig := new(big.Int).Mul(aBig, bBig)
409 | 	nBig.Mod(nBig, mBig)
410 | 	nBigBytes := make([]byte, len(ABytes))
411 | 	nBig.FillBytes(nBigBytes)
412 | 
413 | 	if !bytes.Equal(ABytes, nBigBytes) {
414 | 		t.Errorf("got %x, want %x", ABytes, nBigBytes)
415 | 	}
416 | }
417 | 
418 | func TestIs(t *testing.T) {
419 | 	checkYes := func(c uint, err string) {
420 | 		t.Helper()
421 | 		if choice(c) != yes {
422 | 			t.Error(err)
423 | 		}
424 | 	}
425 | 	checkNot := func(c uint, err string) {
426 | 		t.Helper()
427 | 		if choice(c) != no {
428 | 			t.Error(err)
429 | 		}
430 | 	}
431 | 
432 | 	mFour := modulusFromBytes([]byte{4})
433 | 	n, err := NewNat().SetBytes([]byte{3}, mFour)
434 | 	if err != nil {
435 | 		t.Fatal(err)
436 | 	}
437 | 	checkYes(n.IsMinusOne(mFour), "3 is not -1 mod 4")
438 | 	checkNot(n.IsZero(), "3 is zero")
439 | 	checkNot(n.IsOne(), "3 is one")
440 | 	checkYes(n.IsOdd(), "3 is not odd")
441 | 	n.SubOne(mFour)
442 | 	checkNot(n.IsMinusOne(mFour), "2 is -1 mod 4")
443 | 	checkNot(n.IsZero(), "2 is zero")
444 | 	checkNot(n.IsOne(), "2 is one")
445 | 	checkNot(n.IsOdd(), "2 is odd")
446 | 	n.SubOne(mFour)
447 | 	checkNot(n.IsMinusOne(mFour), "1 is -1 mod 4")
448 | 	checkNot(n.IsZero(), "1 is zero")
449 | 	checkYes(n.IsOne(), "1 is not one")
450 | 	checkYes(n.IsOdd(), "1 is not odd")
451 | 	n.SubOne(mFour)
452 | 	checkNot(n.IsMinusOne(mFour), "0 is -1 mod 4")
453 | 	checkYes(n.IsZero(), "0 is not zero")
454 | 	checkNot(n.IsOne(), "0 is one")
455 | 	checkNot(n.IsOdd(), "0 is odd")
456 | 	n.SubOne(mFour)
457 | 	checkYes(n.IsMinusOne(mFour), "-1 is not -1 mod 4")
458 | 	checkNot(n.IsZero(), "-1 is zero")
459 | 	checkNot(n.IsOne(), "-1 is one")
460 | 	checkYes(n.IsOdd(), "-1 mod 4 is not odd")
461 | 
462 | 	mTwoLimbs := maxModulus(2)
463 | 	n, err = NewNat().SetBytes([]byte{0x01}, mTwoLimbs)
464 | 	if err != nil {
465 | 		t.Fatal(err)
466 | 	}
467 | 	if n.IsOne() != 1 {
468 | 		t.Errorf("1 is not one")
469 | 	}
470 | }
471 | 
472 | func TestTrailingZeroBits(t *testing.T) {
473 | 	nb := new(big.Int).SetBytes([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7e})
474 | 	nb.Lsh(nb, 128)
475 | 	expected := 129
476 | 	for expected >= 0 {
477 | 		n := NewNat().setBig(nb)
478 | 		if n.TrailingZeroBitsVarTime() != uint(expected) {
479 | 			t.Errorf("%d != %d", n.TrailingZeroBitsVarTime(), expected)
480 | 		}
481 | 		nb.Rsh(nb, 1)
482 | 		expected--
483 | 	}
484 | }
485 | 
486 | func TestRightShift(t *testing.T) {
487 | 	nb, err := cryptorand.Int(cryptorand.Reader, new(big.Int).Lsh(big.NewInt(1), 1024))
488 | 	if err != nil {
489 | 		t.Fatal(err)
490 | 	}
491 | 	for _, shift := range []uint{1, 32, 64, 128, 1024 - 128, 1024 - 64, 1024 - 32, 1024 - 1} {
492 | 		testShift := func(t *testing.T, shift uint) {
493 | 			n := NewNat().setBig(nb)
494 | 			oldLen := len(n.limbs)
495 | 			n.ShiftRightVarTime(shift)
496 | 			if len(n.limbs) != oldLen {
497 | 				t.Errorf("len(n.limbs) = %d, want %d", len(n.limbs), oldLen)
498 | 			}
499 | 			exp := new(big.Int).Rsh(nb, shift)
500 | 			if n.asBig().Cmp(exp) != 0 {
501 | 				t.Errorf("%v != %v", n.asBig(), exp)
502 | 			}
503 | 		}
504 | 		t.Run(fmt.Sprint(shift-1), func(t *testing.T) { testShift(t, shift-1) })
505 | 		t.Run(fmt.Sprint(shift), func(t *testing.T) { testShift(t, shift) })
506 | 		t.Run(fmt.Sprint(shift+1), func(t *testing.T) { testShift(t, shift+1) })
507 | 	}
508 | }
509 | 
510 | func natBytes(n *Nat) []byte {
511 | 	return n.Bytes(maxModulus(uint(len(n.limbs))))
512 | }
513 | 
514 | func natFromBytes(b []byte) *Nat {
515 | 	// Must not use Nat.SetBytes as it's used in TestSetBytes.
516 | 	bb := new(big.Int).SetBytes(b)
517 | 	return NewNat().setBig(bb)
518 | }
519 | 
520 | func modulusFromBytes(b []byte) *Modulus {
521 | 	bb := new(big.Int).SetBytes(b)
522 | 	m, _ := NewModulus(bb.Bytes())
523 | 	return m
524 | }
525 | 
526 | // maxModulus returns the biggest modulus that can fit in n limbs.
527 | func maxModulus(n uint) *Modulus {
528 | 	b := big.NewInt(1)
529 | 	b.Lsh(b, n*_W)
530 | 	b.Sub(b, big.NewInt(1))
531 | 	m, _ := NewModulus(b.Bytes())
532 | 	return m
533 | }
534 | 
535 | func makeBenchmarkModulus() *Modulus {
536 | 	return maxModulus(32)
537 | }
538 | 
539 | func makeBenchmarkValue() *Nat {
540 | 	x := make([]uint, 32)
541 | 	for i := 0; i < 32; i++ {
542 | 		x[i]--
543 | 	}
544 | 	return &Nat{limbs: x}
545 | }
546 | 
// makeBenchmarkExponent returns a 256-byte exponent whose first 32 bytes are
// 0xFF and whose remaining bytes are zero.
func makeBenchmarkExponent() []byte {
	exp := make([]byte, 256)
	for i := range exp[:32] {
		exp[i] = 0xFF
	}
	return exp
}
554 | 
555 | func BenchmarkModAdd(b *testing.B) {
556 | 	x := makeBenchmarkValue()
557 | 	y := makeBenchmarkValue()
558 | 	m := makeBenchmarkModulus()
559 | 
560 | 	b.ResetTimer()
561 | 	for i := 0; i < b.N; i++ {
562 | 		x.Add(y, m)
563 | 	}
564 | }
565 | 
566 | func BenchmarkModSub(b *testing.B) {
567 | 	x := makeBenchmarkValue()
568 | 	y := makeBenchmarkValue()
569 | 	m := makeBenchmarkModulus()
570 | 
571 | 	b.ResetTimer()
572 | 	for i := 0; i < b.N; i++ {
573 | 		x.Sub(y, m)
574 | 	}
575 | }
576 | 
577 | func BenchmarkMontgomeryRepr(b *testing.B) {
578 | 	x := makeBenchmarkValue()
579 | 	m := makeBenchmarkModulus()
580 | 
581 | 	b.ResetTimer()
582 | 	for i := 0; i < b.N; i++ {
583 | 		x.montgomeryRepresentation(m)
584 | 	}
585 | }
586 | 
587 | func BenchmarkMontgomeryMul(b *testing.B) {
588 | 	x := makeBenchmarkValue()
589 | 	y := makeBenchmarkValue()
590 | 	out := makeBenchmarkValue()
591 | 	m := makeBenchmarkModulus()
592 | 
593 | 	b.ResetTimer()
594 | 	for i := 0; i < b.N; i++ {
595 | 		out.montgomeryMul(x, y, m)
596 | 	}
597 | }
598 | 
599 | func BenchmarkModMul(b *testing.B) {
600 | 	x := makeBenchmarkValue()
601 | 	y := makeBenchmarkValue()
602 | 	m := makeBenchmarkModulus()
603 | 
604 | 	b.ResetTimer()
605 | 	for i := 0; i < b.N; i++ {
606 | 		x.Mul(y, m)
607 | 	}
608 | }
609 | 
610 | func BenchmarkExpBig(b *testing.B) {
611 | 	out := new(big.Int)
612 | 	exponentBytes := makeBenchmarkExponent()
613 | 	x := new(big.Int).SetBytes(exponentBytes)
614 | 	e := new(big.Int).SetBytes(exponentBytes)
615 | 	n := new(big.Int).SetBytes(exponentBytes)
616 | 	one := new(big.Int).SetUint64(1)
617 | 	n.Add(n, one)
618 | 
619 | 	b.ResetTimer()
620 | 	for i := 0; i < b.N; i++ {
621 | 		out.Exp(x, e, n)
622 | 	}
623 | }
624 | 
625 | func BenchmarkExp(b *testing.B) {
626 | 	x := makeBenchmarkValue()
627 | 	e := makeBenchmarkExponent()
628 | 	out := makeBenchmarkValue()
629 | 	m := makeBenchmarkModulus()
630 | 
631 | 	b.ResetTimer()
632 | 	for i := 0; i < b.N; i++ {
633 | 		out.Exp(x, e, m)
634 | 	}
635 | }
636 | 
637 | func TestNewModulus(t *testing.T) {
638 | 	expected := "modulus must be > 1"
639 | 	_, err := NewModulus([]byte{})
640 | 	if err == nil || err.Error() != expected {
641 | 		t.Errorf("NewModulus(0) got %q, want %q", err, expected)
642 | 	}
643 | 	_, err = NewModulus([]byte{0})
644 | 	if err == nil || err.Error() != expected {
645 | 		t.Errorf("NewModulus(0) got %q, want %q", err, expected)
646 | 	}
647 | 	_, err = NewModulus([]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
648 | 	if err == nil || err.Error() != expected {
649 | 		t.Errorf("NewModulus(0) got %q, want %q", err, expected)
650 | 	}
651 | 	_, err = NewModulus([]byte{1})
652 | 	if err == nil || err.Error() != expected {
653 | 		t.Errorf("NewModulus(1) got %q, want %q", err, expected)
654 | 	}
655 | 	_, err = NewModulus([]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1})
656 | 	if err == nil || err.Error() != expected {
657 | 		t.Errorf("NewModulus(1) got %q, want %q", err, expected)
658 | 	}
659 | }
660 | 
661 | func makeTestValue(nbits int) []uint {
662 | 	n := nbits / _W
663 | 	x := make([]uint, n)
664 | 	for i := range n {
665 | 		x[i]--
666 | 	}
667 | 	return x
668 | }
669 | 
670 | func TestAddMulVVWSized(t *testing.T) {
671 | 	// Sized addMulVVW have architecture-specific implementations on
672 | 	// a number of architectures. Test that they match the generic
673 | 	// implementation.
674 | 	tests := []struct {
675 | 		n int
676 | 		f func(z, x *uint, y uint) uint
677 | 	}{
678 | 		{1024, addMulVVW1024},
679 | 		{1536, addMulVVW1536},
680 | 		{2048, addMulVVW2048},
681 | 	}
682 | 	for _, test := range tests {
683 | 		t.Run(fmt.Sprint(test.n), func(t *testing.T) {
684 | 			x := makeTestValue(test.n)
685 | 			z := makeTestValue(test.n)
686 | 			z2 := slices.Clone(z)
687 | 			var y uint
688 | 			y--
689 | 			c := addMulVVW(z, x, y)
690 | 			c2 := test.f(&z2[0], &x[0], y)
691 | 			if !slices.Equal(z, z2) || c != c2 {
692 | 				t.Errorf("%016X, %016X != %016X, %016X", z, c, z2, c2)
693 | 			}
694 | 		})
695 | 	}
696 | }
697 | 
698 | func TestInverse(t *testing.T) {
699 | 	f, err := os.Open("testdata/mod_inv_tests.txt")
700 | 	if err != nil {
701 | 		t.Fatal(err)
702 | 	}
703 | 
704 | 	var ModInv, A, M string
705 | 	var lineNum int
706 | 	scanner := bufio.NewScanner(f)
707 | 	for scanner.Scan() {
708 | 		lineNum++
709 | 		line := scanner.Text()
710 | 		if len(line) == 0 || line[0] == '#' {
711 | 			continue
712 | 		}
713 | 
714 | 		k, v, _ := strings.Cut(line, " = ")
715 | 		switch k {
716 | 		case "ModInv":
717 | 			ModInv = v
718 | 		case "A":
719 | 			A = v
720 | 		case "M":
721 | 			M = v
722 | 
723 | 			t.Run(fmt.Sprintf("line %d", lineNum), func(t *testing.T) {
724 | 				m, err := NewModulus(decodeHex(t, M))
725 | 				if err != nil {
726 | 					t.Skip("modulus <= 1")
727 | 				}
728 | 				a, err := NewNat().SetBytes(decodeHex(t, A), m)
729 | 				if err != nil {
730 | 					t.Fatal(err)
731 | 				}
732 | 
733 | 				got, ok := NewNat().InverseVarTime(a, m)
734 | 				if !ok {
735 | 					t.Fatal("not invertible")
736 | 				}
737 | 				exp, err := NewNat().SetBytes(decodeHex(t, ModInv), m)
738 | 				if err != nil {
739 | 					t.Fatal(err)
740 | 				}
741 | 				if got.Equal(exp) != 1 {
742 | 					t.Errorf("%v != %v", got, exp)
743 | 				}
744 | 			})
745 | 		default:
746 | 			t.Fatalf("unknown key %q on line %d", k, lineNum)
747 | 		}
748 | 	}
749 | 	if err := scanner.Err(); err != nil {
750 | 		t.Fatal(err)
751 | 	}
752 | }
753 | 
// decodeHex decodes the hexadecimal string s, first prepending a "0" if s has
// an odd number of digits. It fails the test immediately if s is not valid
// hexadecimal.
func decodeHex(t *testing.T, s string) []byte {
	t.Helper()
	if len(s)&1 == 1 {
		s = "0" + s
	}
	decoded, err := hex.DecodeString(s)
	if err != nil {
		t.Fatalf("failed to decode hex %q: %v", s, err)
	}
	return decoded
}
765 | 


--------------------------------------------------------------------------------
/nat_wasm.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2024 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build !purego
 6 | 
 7 | package bigmod
 8 | 
 9 | import "unsafe"
10 | 
11 | // The generic implementation relies on 64x64->128 bit multiplication and
12 | // 64-bit add-with-carry, which are compiler intrinsics on many architectures.
13 | // Wasm doesn't support those. Here we implement it with 32x32->64 bit
14 | // operations, which is more efficient on Wasm.
15 | 
// idx returns a pointer to the i-th uint after x, treating x as the first
// element of an array. No bounds are checked; the caller must ensure that
// element i exists.
func idx(x *uint, i uintptr) *uint {
	// unsafe.Add keeps the arithmetic on an unsafe.Pointer instead of
	// round-tripping through uintptr, and unsafe.Sizeof gives the stride.
	return (*uint)(unsafe.Add(unsafe.Pointer(x), i*unsafe.Sizeof(*x)))
}
19 | 
// addMulVVWWasm computes z += x * y over n limbs, where z and x point at the
// first limb of each operand, and returns the final carry limb.
//
// Every limb is split into 32-bit halves so that each partial product is a
// 32x32->64 bit multiplication. Each of the sums w00, w01 and w10 below adds
// one such product and two 32-bit values, which is at most
// (2^32-1)^2 + 2*(2^32-1) = 2^64-1 and therefore cannot overflow.
//
// NOTE(review): the 32-bit split and the 8-byte stride in idx assume that
// uint is 64 bits wide.
func addMulVVWWasm(z, x *uint, y uint, n uintptr) (carry uint) {
	const mask32 = 1<<32 - 1
	y0 := y & mask32
	y1 := y >> 32
	for i := range n {
		xi := *idx(x, i)
		x0 := xi & mask32
		x1 := xi >> 32
		zi := *idx(z, i)
		z0 := zi & mask32
		z1 := zi >> 32
		c0 := carry & mask32
		c1 := carry >> 32

		// Low halves: l00 is the low 32 bits of the result limb.
		w00 := x0*y0 + z0 + c0
		l00 := w00 & mask32
		h00 := w00 >> 32

		// First cross product, plus the high half of z[i] and the carry
		// out of w00.
		w01 := x0*y1 + z1 + h00
		l01 := w01 & mask32
		h01 := w01 >> 32

		// Second cross product, plus the high half of the incoming carry
		// and the low half of w01. Its low 32 bits are the high 32 bits of
		// the result limb.
		w10 := x1*y0 + c1 + l01
		h10 := w10 >> 32

		// Everything at or above bit 64 is the carry into the next limb.
		carry = x1*y1 + h10 + h01
		// The shift discards the high half of w10, already counted in h10.
		*idx(z, i) = w10<<32 + l00
	}
	return carry
}
50 | 
51 | func addMulVVW1024(z, x *uint, y uint) (c uint) {
52 | 	return addMulVVWWasm(z, x, y, 1024/_W)
53 | }
54 | 
55 | func addMulVVW1536(z, x *uint, y uint) (c uint) {
56 | 	return addMulVVWWasm(z, x, y, 1536/_W)
57 | }
58 | 
59 | func addMulVVW2048(z, x *uint, y uint) (c uint) {
60 | 	return addMulVVWWasm(z, x, y, 2048/_W)
61 | }
62 | 


--------------------------------------------------------------------------------
/testdata/mod_inv_tests.txt:
--------------------------------------------------------------------------------
  1 | # ModInv tests.
  2 | #
  3 | # These test vectors satisfy ModInv * A = 1 (mod M) and 0 <= ModInv < M.
  4 | 
  5 | ModInv = 00
  6 | A = 00
  7 | M = 01
  8 | 
  9 | ModInv = 00
 10 | A = 01
 11 | M = 01
 12 | 
 13 | ModInv = 00
 14 | A = 02
 15 | M = 01
 16 | 
 17 | ModInv = 00
 18 | A = 03
 19 | M = 01
 20 | 
 21 | ModInv = 64
 22 | A = 54
 23 | M = e3
 24 | 
 25 | ModInv = 13
 26 | A = 2b
 27 | M = 30
 28 | 
 29 | ModInv = 2f
 30 | A = 30
 31 | M = 37
 32 | 
 33 | ModInv = 4
 34 | A = 13
 35 | M = 4b
 36 | 
 37 | ModInv = 1c47
 38 | A = cd4
 39 | M = 6a21
 40 | 
 41 | ModInv = 2b97
 42 | A = 8e7
 43 | M = 49c0
 44 | 
 45 | ModInv = 29b9
 46 | A = fcb
 47 | M = 3092
 48 | 
 49 | ModInv = a83
 50 | A = 14bf
 51 | M = 41ae
 52 | 
 53 | ModInv = 18f15fe1
 54 | A = 11b5d53e
 55 | M = 322e92a1
 56 | 
 57 | ModInv = 32f9453b
 58 | A = 8af6df6
 59 | M = 33d45eb7
 60 | 
 61 | ModInv = d696369
 62 | A = c5f89dd5
 63 | M = fc09c17c
 64 | 
 65 | ModInv = 622839d8
 66 | A = 60c2526
 67 | M = 74200493
 68 | 
 69 | ModInv = fb5a8aee7bbc4ef
 70 | A = 24ebd835a70be4e2
 71 | M = 9c7256574e0c5e93
 72 | 
 73 | ModInv = 846bc225402419c
 74 | A = 23026003ab1fbdb
 75 | M = 1683cbe32779c59b
 76 | 
 77 | ModInv = 5ff84f63a78982f9
 78 | A = 4a2420dc733e1a0f
 79 | M = a73c6bfabefa09e6
 80 | 
 81 | ModInv = 133e74d28ef42b43
 82 | A = 2e9511ae29cdd41
 83 | M = 15234df99f19fcda
 84 | 
 85 | ModInv = 46ae1fabe9521e4b99b198fc8439609023aa69be2247c0d1e27c2a0ea332f9c5
 86 | A = 6331fec5f01014046788c919ed50dc86ac7a80c085f1b6f645dd179c0f0dc9cd
 87 | M = 8ef409de82318259a8655a39293b1e762fa2cc7e0aeb4c59713a1e1fff6af640
 88 | 
 89 | ModInv = 444ccea3a7b21677dd294d34de53cc8a5b51e69b37782310a00fc6bcc975709b
 90 | A = 679280bd880994c08322143a4ea8a0825d0466fda1bb6b3eb86fc8e90747512b
 91 | M = e4fecab84b365c63a0dab4244ce3f921a9c87ec64d69a2031939f55782e99a2e
 92 | 
 93 | ModInv = 1ac7d7a03ceec5f690f567c9d61bf3469c078285bcc5cf00ac944596e887ca17
 94 | A = 1593ef32d9c784f5091bdff952f5c5f592a3aed6ba8ea865efa6d7df87be1805
 95 | M = 1e276882f90c95e0c1976eb079f97af075445b1361c02018d6bd7191162e67b2
 96 | 
 97 | ModInv = 639108b90dfe946f498be21303058413bbb0e59d0bd6a6115788705abd0666d6
 98 | A = 9258d6238e4923d120b2d1033573ffcac691526ad0842a3b174dccdbb79887bd
 99 | M = ce62909c39371d463aaba3d4b72ea6da49cb9b529e39e1972ef3ccd9a66fe08f
100 | 
101 | ModInv = aebde7654cb17833a106231c4b9e2f519140e85faee1bfb4192830f03f385e773c0f4767e93e874ffdc3b7a6b7e6a710e5619901c739ee8760a26128e8c91ef8cf761d0e505d8b28ae078d17e6071c372893bb7b72538e518ebc57efa70b7615e406756c49729b7c6e74f84aed7a316b6fa748ff4b9f143129d29dad1bff98bb
102 | A = a29dacaf5487d354280fdd2745b9ace4cd50f2bde41d0ee529bf26a1913244f708085452ff32feab19a7418897990da46a0633f7c8375d583367319091bbbe069b0052c5e48a7daac9fb650db5af768cd2508ec3e2cda7456d4b9ce1c39459627a8b77e038b826cd7e326d0685b0cd0cb50f026f18300dae9f5fd42aa150ee8b
103 | M = d686f9b86697313251685e995c09b9f1e337ddfaa050bd2df15bf4ca1dc46c5565021314765299c434ea1a6ec42bf92a29a7d1ffff599f4e50b79a82243fb24813060580c770d4c1140aeb2ab2685007e948b6f1f62e8001a0545619477d498132c907774479f6d95899e6251e7136f79ab6d3b7c82e4aca421e7d22fe7db19c
104 | 
105 | ModInv = 1ec872f4f20439e203597ca4de9d1296743f95781b2fe85d5def808558bbadef02a46b8955f47c83e1625f8bb40228eab09cad2a35c9ad62ab77a30e3932872959c5898674162da244a0ec1f68c0ed89f4b0f3572bfdc658ad15bf1b1c6e1176b0784c9935bd3ff1f49bb43753eacee1d8ca1c0b652d39ec727da83984fe3a0f
106 | A = 2e527b0a1dc32460b2dd94ec446c692989f7b3c7451a5cbeebf69fc0ea9c4871fbe78682d5dc5b66689f7ed889b52161cd9830b589a93d21ab26dbede6c33959f5a0f0d107169e2daaac78bac8cf2d41a1eb1369cb6dc9e865e73bb2e51b886f4e896082db199175e3dde0c4ed826468f238a77bd894245d0918efc9ca84f945
107 | M = b13133a9ebe0645f987d170c077eea2aa44e85c9ab10386d02867419a590cb182d9826a882306c212dbe75225adde23f80f5b37ca75ed09df20fc277cc7fbbfac8d9ef37a50f6b68ea158f5447283618e64e1426406d26ea85232afb22bf546c75018c1c55cb84c374d58d9d44c0a13ba88ac2e387765cb4c3269e3a983250fa
108 | 
109 | ModInv = 30ffa1876313a69de1e4e6ee132ea1d3a3da32f3b56f5cfb11402b0ad517dce605cf8e91d69fa375dd887fa8507bd8a28b2d5ce745799126e86f416047709f93f07fbd88918a047f13100ea71b1d48f6fc6d12e5c917646df3041b302187af641eaedf4908abc36f12c204e1526a7d80e96e302fb0779c28d7da607243732f26
110 | A = 31157208bde6b85ebecaa63735947b3b36fa351b5c47e9e1c40c947339b78bf96066e5dbe21bb42629e6fcdb81f5f88db590bfdd5f4c0a6a0c3fc6377e5c1fd8235e46e291c688b6d6ecfb36604891c2a7c9cbcc58c26e44b43beecb9c5044b58bb58e35de3cf1128f3c116534fe4e421a33f83603c3df1ae36ec88092f67f2a
111 | M = 53408b23d6cb733e6c9bc3d1e2ea2286a5c83cc4e3e7470f8af3a1d9f28727f5b1f8ae348c1678f5d1105dc3edf2de64e65b9c99545c47e64b770b17c8b4ef5cf194b43a0538053e87a6b95ade1439cebf3d34c6aa72a11c1497f58f76011e16c5be087936d88aba7a740113120e939e27bd3ddcb6580c2841aa406566e33c35
112 | 
113 | ModInv = 87355002f305c81ba0dc97ca2234a2bc02528cefde38b94ac5bd95efc7bf4c140899107fff47f0df9e3c6aa70017ebc90610a750f112cd4f475b9c76b204a953444b4e7196ccf17e93fdaed160b7345ca9b397eddf9446e8ea8ee3676102ce70eaafbe9038a34639789e6f2f1e3f352638f2e8a8f5fc56aaea7ec705ee068dd5
114 | A = 42a25d0bc96f71750f5ac8a51a1605a41b506cca51c9a7ecf80cad713e56f70f1b4b6fa51cbb101f55fd74f318adefb3af04e0c8a7e281055d5a40dd40913c0e1211767c5be915972c73886106dc49325df6c2df49e9eea4536f0343a8e7d332c6159e4f5bdb20d89f90e67597c4a2a632c31b2ef2534080a9ac61f52303990d
115 | M = d3d3f95d50570351528a76ab1e806bae1968bd420899bdb3d87c823fac439a4354c31f6c888c939784f18fe10a95e6d203b1901caa18937ba6f8be033af10c35fc869cf3d16bef479f280f53b3499e645d0387554623207ca4989e5de00bfeaa5e9ab56474fc60dd4967b100e0832eaaf2fcb2ef82a181567057b880b3afef62
116 | 


--------------------------------------------------------------------------------