├── .github └── workflows │ └── test.yml ├── LICENSE ├── README.md ├── _asm ├── go.mod ├── go.sum └── nat_amd64_asm.go ├── extra_test.go ├── go.mod ├── go.sum ├── nat.go ├── nat_386.s ├── nat_amd64.s ├── nat_arm.s ├── nat_arm64.s ├── nat_asm.go ├── nat_loong64.s ├── nat_noasm.go ├── nat_ppc64x.s ├── nat_riscv64.s ├── nat_s390x.s ├── nat_test.go ├── nat_wasm.go └── testdata └── mod_inv_tests.txt /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Go tests 2 | on: [push, pull_request] 3 | jobs: 4 | test: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: actions/setup-go@v4 9 | with: 10 | go-version-file: go.mod 11 | check-latest: true 12 | - run: go test -short ./... 13 | - run: go test -short -tags purego ./... 14 | - run: GOARCH=386 go test -c 15 | - run: GOARCH=arm go test -c 16 | - run: GOARCH=arm64 go test -c 17 | - run: GOARCH=ppc64 go test -c 18 | - run: GOARCH=ppc64le go test -c 19 | - run: GOARCH=riscv64 go test -c 20 | - run: GOARCH=s390x go test -c 21 | - run: GOARCH=loong64 go test -c 22 | - run: GOOS=js GOARCH=wasm go test -c 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. 
nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Package bigmod implements constant-time big integer arithmetic modulo large 2 | moduli. Unlike math/big, this package is suitable for implementing 3 | security-sensitive cryptographic operations. It is a re-exported version of the 4 | standard library package crypto/internal/fips140/bigmod used to implement 5 | crypto/rsa amongst others. 6 | 7 | v0.1.0 is up to date with Go 1.24. 8 | 9 | The API is NOT stable. 
10 | -------------------------------------------------------------------------------- /_asm/go.mod: -------------------------------------------------------------------------------- 1 | module filippo.io/bigmod/_asm 2 | 3 | go 1.25 4 | 5 | require github.com/mmcloughlin/avo v0.6.0 6 | 7 | require ( 8 | golang.org/x/mod v0.14.0 // indirect 9 | golang.org/x/tools v0.16.1 // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /_asm/go.sum: -------------------------------------------------------------------------------- 1 | github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= 2 | github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= 3 | golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0= 4 | golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 5 | golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= 6 | golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 7 | golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA= 8 | golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= 9 | -------------------------------------------------------------------------------- /_asm/nat_amd64_asm.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "strconv" 9 | 10 | . "github.com/mmcloughlin/avo/build" 11 | . "github.com/mmcloughlin/avo/operand" 12 | . "github.com/mmcloughlin/avo/reg" 13 | ) 14 | 15 | //go:generate go run . 
-out ../nat_amd64.s -pkg bigmod 16 | 17 | func main() { 18 | Package("filippo.io/bigmod") 19 | ConstraintExpr("!purego") 20 | 21 | addMulVVW(1024) 22 | addMulVVW(1536) 23 | addMulVVW(2048) 24 | 25 | Generate() 26 | } 27 | 28 | func addMulVVW(bits int) { 29 | if bits%64 != 0 { 30 | panic("bit size unsupported") 31 | } 32 | 33 | Implement("addMulVVW" + strconv.Itoa(bits)) 34 | 35 | CMPB(Mem{Symbol: Symbol{Name: "·supportADX"}, Base: StaticBase}, Imm(1)) 36 | JEQ(LabelRef("adx")) 37 | 38 | z := Mem{Base: Load(Param("z"), GP64())} 39 | x := Mem{Base: Load(Param("x"), GP64())} 40 | y := Load(Param("y"), GP64()) 41 | 42 | carry := GP64() 43 | XORQ(carry, carry) // zero out carry 44 | 45 | for i := 0; i < bits/64; i++ { 46 | Comment("Iteration " + strconv.Itoa(i)) 47 | hi, lo := RDX, RAX // implicit MULQ inputs and outputs 48 | MOVQ(x.Offset(i*8), lo) 49 | MULQ(y) 50 | ADDQ(z.Offset(i*8), lo) 51 | ADCQ(Imm(0), hi) 52 | ADDQ(carry, lo) 53 | ADCQ(Imm(0), hi) 54 | MOVQ(hi, carry) 55 | MOVQ(lo, z.Offset(i*8)) 56 | } 57 | 58 | Store(carry, ReturnIndex(0)) 59 | RET() 60 | 61 | Label("adx") 62 | 63 | // The ADX strategy implements the following function, where c1 and c2 are 64 | // the overflow and the carry flag respectively. 65 | // 66 | // func addMulVVW(z, x []uint, y uint) (carry uint) { 67 | // var c1, c2 uint 68 | // for i := range z { 69 | // hi, lo := bits.Mul(x[i], y) 70 | // lo, c1 = bits.Add(lo, z[i], c1) 71 | // z[i], c2 = bits.Add(lo, carry, c2) 72 | // carry = hi 73 | // } 74 | // return carry + c1 + c2 75 | // } 76 | // 77 | // The loop is fully unrolled and the hi / carry registers are alternated 78 | // instead of introducing a MOV. 
79 | 80 | z = Mem{Base: Load(Param("z"), GP64())} 81 | x = Mem{Base: Load(Param("x"), GP64())} 82 | Load(Param("y"), RDX) // implicit source of MULXQ 83 | 84 | carry = GP64() 85 | XORQ(carry, carry) // zero out carry 86 | z0 := GP64() 87 | XORQ(z0, z0) // unset flags and zero out z0 88 | 89 | for i := 0; i < bits/64; i++ { 90 | hi, lo := GP64(), GP64() 91 | 92 | Comment("Iteration " + strconv.Itoa(i)) 93 | MULXQ(x.Offset(i*8), lo, hi) 94 | ADCXQ(carry, lo) 95 | ADOXQ(z.Offset(i*8), lo) 96 | MOVQ(lo, z.Offset(i*8)) 97 | 98 | i++ 99 | 100 | Comment("Iteration " + strconv.Itoa(i)) 101 | MULXQ(x.Offset(i*8), lo, carry) 102 | ADCXQ(hi, lo) 103 | ADOXQ(z.Offset(i*8), lo) 104 | MOVQ(lo, z.Offset(i*8)) 105 | } 106 | 107 | Comment("Add back carry flags and return") 108 | ADCXQ(z0, carry) 109 | ADOXQ(z0, carry) 110 | 111 | Store(carry, ReturnIndex(0)) 112 | RET() 113 | } 114 | -------------------------------------------------------------------------------- /extra_test.go: -------------------------------------------------------------------------------- 1 | package bigmod_test 2 | 3 | import ( 4 | "crypto" 5 | "crypto/rand" 6 | "crypto/rsa" 7 | "testing" 8 | 9 | "filippo.io/bigmod" 10 | ) 11 | 12 | // TestLinkWithStdlib ensures this package can be linked with the standard 13 | // library package crypto/internal/bigmod, which might have duplicate global 14 | // symbol names in the assembly files. See Issue 1. 
15 | func TestLinkWithStdlib(t *testing.T) { 16 | bigmod.NewNat() 17 | k, _ := rsa.GenerateKey(rand.Reader, 512) 18 | rsa.SignPSS(rand.Reader, k, crypto.SHA256, make([]byte, 32), nil) 19 | } 20 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module filippo.io/bigmod 2 | 3 | go 1.23 4 | 5 | require golang.org/x/sys v0.11.0 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= 2 | golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 3 | -------------------------------------------------------------------------------- /nat.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package bigmod implements constant-time big integer arithmetic modulo large 6 | // moduli. Unlike math/big, this package is suitable for implementing 7 | // security-sensitive cryptographic operations. It is a re-exported version of the 8 | // standard library package crypto/internal/fips140/bigmod used to implement 9 | // crypto/rsa amongst others. 10 | // 11 | // The API is NOT stable. The caller is responsible for ensuring that Nats are 12 | // reduced modulo the Modulus they are used with. 13 | package bigmod 14 | 15 | import ( 16 | "encoding/binary" 17 | "errors" 18 | "math/bits" 19 | ) 20 | 21 | const ( 22 | // _W is the size in bits of our limbs. 23 | _W = bits.UintSize 24 | // _S is the size in bytes of our limbs. 25 | _S = _W / 8 26 | ) 27 | 28 | // Note: These functions make many loops over all the words in a Nat. 
29 | // These loops used to be in assembly, invisible to -race, -asan, and -msan, 30 | // but now they are in Go and incur significant overhead in those modes. 31 | // To bring the old performance back, we mark all functions that loop 32 | // over Nat words with //go:norace. Because //go:norace does not 33 | // propagate across inlining, we must also mark functions that inline 34 | // //go:norace functions - specifically, those that inline add, addMulVVW, 35 | // assign, cmpGeq, rshift1, and sub. 36 | 37 | // choice represents a constant-time boolean. The value of choice is always 38 | // either 1 or 0. We use a uint instead of a bool in order to make decisions in 39 | // constant time by turning it into a mask. 40 | type choice uint 41 | 42 | func not(c choice) choice { return 1 ^ c } 43 | 44 | const yes = choice(1) 45 | const no = choice(0) 46 | 47 | // ctMask is all 1s if on is yes, and all 0s otherwise. 48 | func ctMask(on choice) uint { return -uint(on) } 49 | 50 | // ctEq returns 1 if x == y, and 0 otherwise. The execution time of this 51 | // function does not depend on its inputs. 52 | func ctEq(x, y uint) choice { 53 | // If x != y, then either x - y or y - x will generate a carry. 54 | _, c1 := bits.Sub(x, y, 0) 55 | _, c2 := bits.Sub(y, x, 0) 56 | return not(choice(c1 | c2)) 57 | } 58 | 59 | // Nat represents an arbitrary natural number. 60 | // 61 | // Each Nat has an announced length, which is the number of limbs it has stored. 62 | // Operations on this number are allowed to leak this length, but will not leak 63 | // any information about the values contained in those limbs. 64 | type Nat struct { 65 | // limbs is little-endian in base 2^W with W = bits.UintSize. 66 | limbs []uint 67 | } 68 | 69 | // preallocTarget is the size in bits of the numbers used to implement the most 70 | // common and most performant RSA key size. It's also enough to cover some of 71 | // the operations of key sizes up to 4096. 
72 | const preallocTarget = 2048 73 | const preallocLimbs = (preallocTarget + _W - 1) / _W 74 | 75 | // NewNat returns a new nat with a size of zero, just like new(Nat), but with 76 | // the preallocated capacity to hold a number of up to 2048 bits. 77 | // NewNat inlines, so the allocation can live on the stack. 78 | func NewNat() *Nat { 79 | limbs := make([]uint, 0, preallocLimbs) 80 | return &Nat{limbs} 81 | } 82 | 83 | // expand expands x to n limbs, leaving its value unchanged. 84 | func (x *Nat) expand(n int) *Nat { 85 | if len(x.limbs) > n { 86 | panic("bigmod: internal error: shrinking nat") 87 | } 88 | if cap(x.limbs) < n { 89 | newLimbs := make([]uint, n) 90 | copy(newLimbs, x.limbs) 91 | x.limbs = newLimbs 92 | return x 93 | } 94 | extraLimbs := x.limbs[len(x.limbs):n] 95 | clear(extraLimbs) 96 | x.limbs = x.limbs[:n] 97 | return x 98 | } 99 | 100 | // reset returns a zero nat of n limbs, reusing x's storage if n <= cap(x.limbs). 101 | func (x *Nat) reset(n int) *Nat { 102 | if cap(x.limbs) < n { 103 | x.limbs = make([]uint, n) 104 | return x 105 | } 106 | // Clear both the returned limbs and the previously used ones. 107 | clear(x.limbs[:max(n, len(x.limbs))]) 108 | x.limbs = x.limbs[:n] 109 | return x 110 | } 111 | 112 | // resetToBytes assigns x = b, where b is a slice of big-endian bytes, resizing 113 | // x to the appropriate size. 114 | // 115 | // The announced length of x is set based on the actual bit size of the input, 116 | // ignoring leading zeroes. 117 | func (x *Nat) resetToBytes(b []byte) *Nat { 118 | x.reset((len(b) + _S - 1) / _S) 119 | if err := x.setBytes(b); err != nil { 120 | panic("bigmod: internal error: bad arithmetic") 121 | } 122 | return x.trim() 123 | } 124 | 125 | // trim reduces the size of x to match its value. 126 | func (x *Nat) trim() *Nat { 127 | // Trim most significant (trailing in little-endian) zero limbs. 128 | // We assume comparison with zero (but not the branch) is constant time. 
129 | for i := len(x.limbs) - 1; i >= 0; i-- { 130 | if x.limbs[i] != 0 { 131 | break 132 | } 133 | x.limbs = x.limbs[:i] 134 | } 135 | return x 136 | } 137 | 138 | // set assigns x = y, optionally resizing x to the appropriate size. 139 | func (x *Nat) set(y *Nat) *Nat { 140 | x.reset(len(y.limbs)) 141 | copy(x.limbs, y.limbs) 142 | return x 143 | } 144 | 145 | // Bits returns x as a little-endian slice of uint. The length of the slice 146 | // matches the announced length of x. The result and x share the same underlying 147 | // array. 148 | func (x *Nat) Bits() []uint { 149 | return x.limbs 150 | } 151 | 152 | // Bytes returns x as a zero-extended big-endian byte slice. The size of the 153 | // slice will match the size of m. 154 | // 155 | // x must have the same size as m and it must be less than or equal to m. 156 | func (x *Nat) Bytes(m *Modulus) []byte { 157 | i := m.Size() 158 | bytes := make([]byte, i) 159 | for _, limb := range x.limbs { 160 | for j := 0; j < _S; j++ { 161 | i-- 162 | if i < 0 { 163 | if limb == 0 { 164 | break 165 | } 166 | panic("bigmod: modulus is smaller than nat") 167 | } 168 | bytes[i] = byte(limb) 169 | limb >>= 8 170 | } 171 | } 172 | return bytes 173 | } 174 | 175 | // SetBytes assigns x = b, where b is a slice of big-endian bytes. 176 | // SetBytes returns an error if b >= m. 177 | // 178 | // The output will be resized to the size of m and overwritten. 179 | // 180 | //go:norace 181 | func (x *Nat) SetBytes(b []byte, m *Modulus) (*Nat, error) { 182 | x.resetFor(m) 183 | if err := x.setBytes(b); err != nil { 184 | return nil, err 185 | } 186 | if x.cmpGeq(m.nat) == yes { 187 | return nil, errors.New("input overflows the modulus") 188 | } 189 | return x, nil 190 | } 191 | 192 | // SetOverflowingBytes assigns x = b, where b is a slice of big-endian bytes. 193 | // SetOverflowingBytes returns an error if b has a longer bit length than m, but 194 | // reduces overflowing values up to 2^⌈log2(m)⌉ - 1. 
195 | // 196 | // The output will be resized to the size of m and overwritten. 197 | func (x *Nat) SetOverflowingBytes(b []byte, m *Modulus) (*Nat, error) { 198 | x.resetFor(m) 199 | if err := x.setBytes(b); err != nil { 200 | return nil, err 201 | } 202 | // setBytes would have returned an error if the input overflowed the limb 203 | // size of the modulus, so now we only need to check if the most significant 204 | // limb of x has more bits than the most significant limb of the modulus. 205 | if bitLen(x.limbs[len(x.limbs)-1]) > bitLen(m.nat.limbs[len(m.nat.limbs)-1]) { 206 | return nil, errors.New("input overflows the modulus size") 207 | } 208 | x.maybeSubtractModulus(no, m) 209 | return x, nil 210 | } 211 | 212 | // bigEndianUint returns the contents of buf interpreted as a 213 | // big-endian encoded uint value. 214 | func bigEndianUint(buf []byte) uint { 215 | if _W == 64 { 216 | return uint(binary.BigEndian.Uint64(buf)) 217 | } 218 | return uint(binary.BigEndian.Uint32(buf)) 219 | } 220 | 221 | func (x *Nat) setBytes(b []byte) error { 222 | i, k := len(b), 0 223 | for k < len(x.limbs) && i >= _S { 224 | x.limbs[k] = bigEndianUint(b[i-_S : i]) 225 | i -= _S 226 | k++ 227 | } 228 | for s := 0; s < _W && k < len(x.limbs) && i > 0; s += 8 { 229 | x.limbs[k] |= uint(b[i-1]) << s 230 | i-- 231 | } 232 | if i > 0 { 233 | return errors.New("input overflows the modulus size") 234 | } 235 | return nil 236 | } 237 | 238 | // SetUint assigns x = y. 239 | // 240 | // The output will be resized to a single limb and overwritten. 241 | func (x *Nat) SetUint(y uint) *Nat { 242 | x.reset(1) 243 | x.limbs[0] = y 244 | return x 245 | } 246 | 247 | // Equal returns 1 if x == y, and 0 otherwise. 248 | // 249 | // Both operands must have the same announced length. 250 | // 251 | //go:norace 252 | func (x *Nat) Equal(y *Nat) uint { 253 | // Eliminate bounds checks in the loop. 
254 | size := len(x.limbs) 255 | xLimbs := x.limbs[:size] 256 | yLimbs := y.limbs[:size] 257 | 258 | equal := yes 259 | for i := 0; i < size; i++ { 260 | equal &= ctEq(xLimbs[i], yLimbs[i]) 261 | } 262 | return uint(equal) 263 | } 264 | 265 | // IsZero returns 1 if x == 0, and 0 otherwise. 266 | // 267 | //go:norace 268 | func (x *Nat) IsZero() uint { 269 | // Eliminate bounds checks in the loop. 270 | size := len(x.limbs) 271 | xLimbs := x.limbs[:size] 272 | 273 | zero := yes 274 | for i := 0; i < size; i++ { 275 | zero &= ctEq(xLimbs[i], 0) 276 | } 277 | return uint(zero) 278 | } 279 | 280 | // IsOne returns 1 if x == 1, and 0 otherwise. 281 | // 282 | //go:norace 283 | func (x *Nat) IsOne() uint { 284 | // Eliminate bounds checks in the loop. 285 | size := len(x.limbs) 286 | xLimbs := x.limbs[:size] 287 | 288 | if len(xLimbs) == 0 { 289 | return uint(no) 290 | } 291 | 292 | one := ctEq(xLimbs[0], 1) 293 | for i := 1; i < size; i++ { 294 | one &= ctEq(xLimbs[i], 0) 295 | } 296 | return uint(one) 297 | } 298 | 299 | // IsMinusOne returns 1 if x == -1 mod m, and 0 otherwise. 300 | // 301 | // The length of x must be the same as the modulus. x must already be reduced 302 | // modulo m. 303 | // 304 | //go:norace 305 | func (x *Nat) IsMinusOne(m *Modulus) uint { 306 | minusOne := m.Nat() 307 | minusOne.SubOne(m) 308 | return x.Equal(minusOne) 309 | } 310 | 311 | // IsOdd returns 1 if x is odd, and 0 otherwise. 312 | func (x *Nat) IsOdd() uint { 313 | if len(x.limbs) == 0 { 314 | return uint(no) 315 | } 316 | return uint(x.limbs[0] & 1) 317 | } 318 | 319 | // TrailingZeroBitsVarTime returns the number of trailing zero bits in x. 320 | func (x *Nat) TrailingZeroBitsVarTime() uint { 321 | var t uint 322 | limbs := x.limbs 323 | for _, l := range limbs { 324 | if l == 0 { 325 | t += _W 326 | continue 327 | } 328 | t += uint(bits.TrailingZeros(l)) 329 | break 330 | } 331 | return t 332 | } 333 | 334 | // cmpGeq returns 1 if x >= y, and 0 otherwise. 
335 | // 336 | // Both operands must have the same announced length. 337 | // 338 | //go:norace 339 | func (x *Nat) cmpGeq(y *Nat) choice { 340 | // Eliminate bounds checks in the loop. 341 | size := len(x.limbs) 342 | xLimbs := x.limbs[:size] 343 | yLimbs := y.limbs[:size] 344 | 345 | var c uint 346 | for i := 0; i < size; i++ { 347 | _, c = bits.Sub(xLimbs[i], yLimbs[i], c) 348 | } 349 | // If there was a carry, then subtracting y underflowed, so 350 | // x is not greater than or equal to y. 351 | return not(choice(c)) 352 | } 353 | 354 | // assign sets x <- y if on == 1, and does nothing otherwise. 355 | // 356 | // Both operands must have the same announced length. 357 | // 358 | //go:norace 359 | func (x *Nat) assign(on choice, y *Nat) *Nat { 360 | // Eliminate bounds checks in the loop. 361 | size := len(x.limbs) 362 | xLimbs := x.limbs[:size] 363 | yLimbs := y.limbs[:size] 364 | 365 | mask := ctMask(on) 366 | for i := 0; i < size; i++ { 367 | xLimbs[i] ^= mask & (xLimbs[i] ^ yLimbs[i]) 368 | } 369 | return x 370 | } 371 | 372 | // add computes x += y and returns the carry. 373 | // 374 | // Both operands must have the same announced length. 375 | // 376 | //go:norace 377 | func (x *Nat) add(y *Nat) (c uint) { 378 | // Eliminate bounds checks in the loop. 379 | size := len(x.limbs) 380 | xLimbs := x.limbs[:size] 381 | yLimbs := y.limbs[:size] 382 | 383 | for i := 0; i < size; i++ { 384 | xLimbs[i], c = bits.Add(xLimbs[i], yLimbs[i], c) 385 | } 386 | return 387 | } 388 | 389 | // sub computes x -= y. It returns the borrow of the subtraction. 390 | // 391 | // Both operands must have the same announced length. 392 | // 393 | //go:norace 394 | func (x *Nat) sub(y *Nat) (c uint) { 395 | // Eliminate bounds checks in the loop. 
396 | size := len(x.limbs) 397 | xLimbs := x.limbs[:size] 398 | yLimbs := y.limbs[:size] 399 | 400 | for i := 0; i < size; i++ { 401 | xLimbs[i], c = bits.Sub(xLimbs[i], yLimbs[i], c) 402 | } 403 | return 404 | } 405 | 406 | // ShiftRightVarTime sets x = x >> n. 407 | // 408 | // The announced length of x is unchanged. 409 | // 410 | //go:norace 411 | func (x *Nat) ShiftRightVarTime(n uint) *Nat { 412 | // Eliminate bounds checks in the loop. 413 | size := len(x.limbs) 414 | xLimbs := x.limbs[:size] 415 | 416 | shift := int(n % _W) 417 | shiftLimbs := int(n / _W) 418 | 419 | var shiftedLimbs []uint 420 | if shiftLimbs < size { 421 | shiftedLimbs = xLimbs[shiftLimbs:] 422 | } 423 | 424 | for i := range xLimbs { 425 | if i >= len(shiftedLimbs) { 426 | xLimbs[i] = 0 427 | continue 428 | } 429 | 430 | xLimbs[i] = shiftedLimbs[i] >> shift 431 | if i+1 < len(shiftedLimbs) { 432 | xLimbs[i] |= shiftedLimbs[i+1] << (_W - shift) 433 | } 434 | } 435 | 436 | return x 437 | } 438 | 439 | // BitLenVarTime returns the actual size of x in bits. 440 | // 441 | // The actual size of x (but nothing more) leaks through timing side-channels. 442 | // Note that this is ordinarily secret, as opposed to the announced size of x. 443 | func (x *Nat) BitLenVarTime() int { 444 | // Eliminate bounds checks in the loop. 445 | size := len(x.limbs) 446 | xLimbs := x.limbs[:size] 447 | 448 | for i := size - 1; i >= 0; i-- { 449 | if xLimbs[i] != 0 { 450 | return i*_W + bitLen(xLimbs[i]) 451 | } 452 | } 453 | return 0 454 | } 455 | 456 | // bitLen is a version of bits.Len that only leaks the bit length of n, but not 457 | // its value. bits.Len and bits.LeadingZeros use a lookup table for the 458 | // low-order bits on some architectures. 459 | func bitLen(n uint) int { 460 | len := 0 461 | // We assume, here and elsewhere, that comparison to zero is constant time 462 | // with respect to different non-zero values. 
463 | for n != 0 { 464 | len++ 465 | n >>= 1 466 | } 467 | return len 468 | } 469 | 470 | // Modulus is used for modular arithmetic, precomputing relevant constants. 471 | // 472 | // A Modulus can leak the exact number of bits needed to store its value 473 | // and is stored without padding. Its actual value is still kept secret. 474 | type Modulus struct { 475 | // The underlying natural number for this modulus. 476 | // 477 | // This will be stored without any padding, and shouldn't alias with any 478 | // other natural number being used. 479 | nat *Nat 480 | 481 | // If m is even, the following fields are not set. 482 | odd bool 483 | m0inv uint // -nat.limbs[0]⁻¹ mod _W 484 | rr *Nat // R*R for montgomeryRepresentation 485 | } 486 | 487 | // rr returns R*R with R = 2^(_W * n) and n = len(m.nat.limbs). 488 | func rr(m *Modulus) *Nat { 489 | rr := NewNat().ExpandFor(m) 490 | n := uint(len(rr.limbs)) 491 | mLen := uint(m.BitLen()) 492 | logR := _W * n 493 | 494 | // We start by computing R = 2^(_W * n) mod m. We can get pretty close, to 495 | // 2^⌊log₂m⌋, by setting the highest bit we can without having to reduce. 496 | rr.limbs[n-1] = 1 << ((mLen - 1) % _W) 497 | // Then we double until we reach 2^(_W * n). 498 | for i := mLen - 1; i < logR; i++ { 499 | rr.Add(rr, m) 500 | } 501 | 502 | // Next we need to get from R to 2^(_W * n) R mod m (aka from one to R in 503 | // the Montgomery domain, meaning we can use Montgomery multiplication now). 504 | // We could do that by doubling _W * n times, or with a square-and-double 505 | // chain log2(_W * n) long. Turns out the fastest thing is to start out with 506 | // doublings, and switch to square-and-double once the exponent is large 507 | // enough to justify the cost of the multiplications. 508 | 509 | // The threshold is selected experimentally as a linear function of n. 
510 | threshold := n / 4 511 | 512 | // We calculate how many of the most-significant bits of the exponent we can 513 | // compute before crossing the threshold, and we do it with doublings. 514 | i := bits.UintSize 515 | for logR>>i <= threshold { 516 | i-- 517 | } 518 | for k := uint(0); k < logR>>i; k++ { 519 | rr.Add(rr, m) 520 | } 521 | 522 | // Then we process the remaining bits of the exponent with a 523 | // square-and-double chain. 524 | for i > 0 { 525 | rr.montgomeryMul(rr, rr, m) 526 | i-- 527 | if logR>>i&1 != 0 { 528 | rr.Add(rr, m) 529 | } 530 | } 531 | 532 | return rr 533 | } 534 | 535 | // minusInverseModW computes -x⁻¹ mod _W with x odd. 536 | // 537 | // This operation is used to precompute a constant involved in Montgomery 538 | // multiplication. 539 | func minusInverseModW(x uint) uint { 540 | // Every iteration of this loop doubles the number of correct least-significant 541 | // bits of the inverse in y. The first three bits are already correct (1⁻¹ = 1, 542 | // 3⁻¹ = 3, 5⁻¹ = 5, and 7⁻¹ = 7 mod 8), so doubling five times is enough 543 | // for 64 bits (and wastes only one iteration for 32 bits). 544 | // 545 | // See https://crypto.stackexchange.com/a/47496. 546 | y := x 547 | for i := 0; i < 5; i++ { 548 | y = y * (2 - x*y) 549 | } 550 | return -y 551 | } 552 | 553 | // NewModulus creates a new Modulus from a slice of big-endian bytes. The 554 | // modulus must be greater than one. 555 | // 556 | // The number of significant bits and whether the modulus is even is leaked 557 | // through timing side-channels. 558 | func NewModulus(b []byte) (*Modulus, error) { 559 | n := NewNat().resetToBytes(b) 560 | return newModulus(n) 561 | } 562 | 563 | // NewModulusProduct creates a new Modulus from the product of two numbers 564 | // represented as big-endian byte slices. The result must be greater than one. 
565 | // 566 | //go:norace 567 | func NewModulusProduct(a, b []byte) (*Modulus, error) { 568 | x := NewNat().resetToBytes(a) 569 | y := NewNat().resetToBytes(b) 570 | n := NewNat().reset(len(x.limbs) + len(y.limbs)) 571 | for i := range y.limbs { 572 | n.limbs[i+len(x.limbs)] = addMulVVW(n.limbs[i:i+len(x.limbs)], x.limbs, y.limbs[i]) 573 | } 574 | return newModulus(n.trim()) 575 | } 576 | 577 | func newModulus(n *Nat) (*Modulus, error) { 578 | m := &Modulus{nat: n} 579 | if m.nat.IsZero() == 1 || m.nat.IsOne() == 1 { 580 | return nil, errors.New("modulus must be > 1") 581 | } 582 | if m.nat.IsOdd() == 1 { 583 | m.odd = true 584 | m.m0inv = minusInverseModW(m.nat.limbs[0]) 585 | m.rr = rr(m) 586 | } 587 | return m, nil 588 | } 589 | 590 | // Size returns the size of m in bytes. 591 | func (m *Modulus) Size() int { 592 | return (m.BitLen() + 7) / 8 593 | } 594 | 595 | // BitLen returns the size of m in bits. 596 | func (m *Modulus) BitLen() int { 597 | return m.nat.BitLenVarTime() 598 | } 599 | 600 | // Nat returns m as a Nat. 601 | func (m *Modulus) Nat() *Nat { 602 | // Make a copy so that the caller can't modify m.nat or alias it with 603 | // another Nat in a modulus operation. 604 | n := NewNat() 605 | n.set(m.nat) 606 | return n 607 | } 608 | 609 | // shiftIn calculates x = x << _W + y mod m. 610 | // 611 | // This assumes that x is already reduced mod m. 612 | // 613 | //go:norace 614 | func (x *Nat) shiftIn(y uint, m *Modulus) *Nat { 615 | d := NewNat().resetFor(m) 616 | 617 | // Eliminate bounds checks in the loop. 618 | size := len(m.nat.limbs) 619 | xLimbs := x.limbs[:size] 620 | dLimbs := d.limbs[:size] 621 | mLimbs := m.nat.limbs[:size] 622 | 623 | // Each iteration of this loop computes x = 2x + b mod m, where b is a bit 624 | // from y. Effectively, it left-shifts x and adds y one bit at a time, 625 | // reducing it every time. 626 | // 627 | // To do the reduction, each iteration computes both 2x + b and 2x + b - m. 
628 | // The next iteration (and finally the return line) will use either result 629 | // based on whether 2x + b overflows m. 630 | needSubtraction := no 631 | for i := _W - 1; i >= 0; i-- { 632 | carry := (y >> i) & 1 633 | var borrow uint 634 | mask := ctMask(needSubtraction) 635 | for i := 0; i < size; i++ { 636 | l := xLimbs[i] ^ (mask & (xLimbs[i] ^ dLimbs[i])) 637 | xLimbs[i], carry = bits.Add(l, l, carry) 638 | dLimbs[i], borrow = bits.Sub(xLimbs[i], mLimbs[i], borrow) 639 | } 640 | // Like in maybeSubtractModulus, we need the subtraction if either it 641 | // didn't underflow (meaning 2x + b > m) or if computing 2x + b 642 | // overflowed (meaning 2x + b > 2^_W*n > m). 643 | needSubtraction = not(choice(borrow)) | choice(carry) 644 | } 645 | return x.assign(needSubtraction, d) 646 | } 647 | 648 | // Mod calculates out = y mod m. 649 | // 650 | // This works regardless how large the value of y is. 651 | // 652 | // The output will be resized to the size of m and overwritten. 653 | // 654 | //go:norace 655 | func (x *Nat) Mod(y *Nat, m *Modulus) *Nat { 656 | out, x := x, y 657 | out.resetFor(m) 658 | // Working our way from the most significant to the least significant limb, 659 | // we can insert each limb at the least significant position, shifting all 660 | // previous limbs left by _W. This way each limb will get shifted by the 661 | // correct number of bits. We can insert at least N - 1 limbs without 662 | // overflowing m. After that, we need to reduce every time we shift. 663 | i := len(x.limbs) - 1 664 | // For the first N - 1 limbs we can skip the actual shifting and position 665 | // them at the shifted position, which starts at min(N - 2, i). 666 | start := len(m.nat.limbs) - 2 667 | if i < start { 668 | start = i 669 | } 670 | for j := start; j >= 0; j-- { 671 | out.limbs[j] = x.limbs[i] 672 | i-- 673 | } 674 | // We shift in the remaining limbs, reducing modulo m each time. 
675 | for i >= 0 { 676 | out.shiftIn(x.limbs[i], m) 677 | i-- 678 | } 679 | return out 680 | } 681 | 682 | // ExpandFor ensures x has the right size to work with operations modulo m. 683 | // 684 | // The announced size of x must be smaller than or equal to that of m. 685 | func (x *Nat) ExpandFor(m *Modulus) *Nat { 686 | return x.expand(len(m.nat.limbs)) 687 | } 688 | 689 | // resetFor ensures x has the right size to work with operations modulo m. 690 | // 691 | // x is zeroed and may start at any size. 692 | func (x *Nat) resetFor(m *Modulus) *Nat { 693 | return x.reset(len(m.nat.limbs)) 694 | } 695 | 696 | // maybeSubtractModulus computes x -= m if and only if x >= m or if "always" is yes. 697 | // 698 | // It can be used to reduce modulo m a value up to 2m - 1, which is a common 699 | // range for results computed by higher level operations. 700 | // 701 | // always is usually a carry that indicates that the operation that produced x 702 | // overflowed its size, meaning abstractly x > 2^_W*n > m even if x < m. 703 | // 704 | // x and m operands must have the same announced length. 705 | // 706 | //go:norace 707 | func (x *Nat) maybeSubtractModulus(always choice, m *Modulus) { 708 | t := NewNat().set(x) 709 | underflow := t.sub(m.nat) 710 | // We keep the result if x - m didn't underflow (meaning x >= m) 711 | // or if always was set. 712 | keep := not(choice(underflow)) | choice(always) 713 | x.assign(keep, t) 714 | } 715 | 716 | // Sub computes x = x - y mod m. 717 | // 718 | // The length of both operands must be the same as the modulus. Both operands 719 | // must already be reduced modulo m. 720 | // 721 | //go:norace 722 | func (x *Nat) Sub(y *Nat, m *Modulus) *Nat { 723 | underflow := x.sub(y) 724 | // If the subtraction underflowed, add m. 725 | t := NewNat().set(x) 726 | t.add(m.nat) 727 | x.assign(choice(underflow), t) 728 | return x 729 | } 730 | 731 | // SubOne computes x = x - 1 mod m. 
732 | // 733 | // The length of x must be the same as the modulus. 734 | func (x *Nat) SubOne(m *Modulus) *Nat { 735 | one := NewNat().ExpandFor(m) 736 | one.limbs[0] = 1 737 | // Sub asks for x to be reduced modulo m, while SubOne doesn't, but when 738 | // y = 1, it works, and this is an internal use. 739 | return x.Sub(one, m) 740 | } 741 | 742 | // Add computes x = x + y mod m. 743 | // 744 | // The length of both operands must be the same as the modulus. Both operands 745 | // must already be reduced modulo m. 746 | // 747 | //go:norace 748 | func (x *Nat) Add(y *Nat, m *Modulus) *Nat { 749 | overflow := x.add(y) 750 | x.maybeSubtractModulus(choice(overflow), m) 751 | return x 752 | } 753 | 754 | // montgomeryRepresentation calculates x = x * R mod m, with R = 2^(_W * n) and 755 | // n = len(m.nat.limbs). 756 | // 757 | // Faster Montgomery multiplication replaces standard modular multiplication for 758 | // numbers in this representation. 759 | // 760 | // This assumes that x is already reduced mod m. 761 | func (x *Nat) montgomeryRepresentation(m *Modulus) *Nat { 762 | // A Montgomery multiplication (which computes a * b / R) by R * R works out 763 | // to a multiplication by R, which takes the value out of the Montgomery domain. 764 | return x.montgomeryMul(x, m.rr, m) 765 | } 766 | 767 | // montgomeryReduction calculates x = x / R mod m, with R = 2^(_W * n) and 768 | // n = len(m.nat.limbs). 769 | // 770 | // This assumes that x is already reduced mod m. 771 | func (x *Nat) montgomeryReduction(m *Modulus) *Nat { 772 | // By Montgomery multiplying with 1 not in Montgomery representation, we 773 | // convert out back from Montgomery representation, because it works out to 774 | // dividing by R. 775 | one := NewNat().ExpandFor(m) 776 | one.limbs[0] = 1 777 | return x.montgomeryMul(x, one, m) 778 | } 779 | 780 | // montgomeryMul calculates x = a * b / R mod m, with R = 2^(_W * n) and 781 | // n = len(m.nat.limbs), also known as a Montgomery multiplication. 
782 | // 783 | // All inputs should be the same length and already reduced modulo m. 784 | // x will be resized to the size of m and overwritten. 785 | // 786 | //go:norace 787 | func (x *Nat) montgomeryMul(a *Nat, b *Nat, m *Modulus) *Nat { 788 | n := len(m.nat.limbs) 789 | mLimbs := m.nat.limbs[:n] 790 | aLimbs := a.limbs[:n] 791 | bLimbs := b.limbs[:n] 792 | 793 | switch n { 794 | default: 795 | // Attempt to use a stack-allocated backing array. 796 | T := make([]uint, 0, preallocLimbs*2) 797 | if cap(T) < n*2 { 798 | T = make([]uint, 0, n*2) 799 | } 800 | T = T[:n*2] 801 | 802 | // This loop implements Word-by-Word Montgomery Multiplication, as 803 | // described in Algorithm 4 (Fig. 3) of "Efficient Software 804 | // Implementations of Modular Exponentiation" by Shay Gueron 805 | // [https://eprint.iacr.org/2011/239.pdf]. 806 | var c uint 807 | for i := 0; i < n; i++ { 808 | _ = T[n+i] // bounds check elimination hint 809 | 810 | // Step 1 (T = a × b) is computed as a large pen-and-paper column 811 | // multiplication of two numbers with n base-2^_W digits. If we just 812 | // wanted to produce 2n-wide T, we would do 813 | // 814 | // for i := 0; i < n; i++ { 815 | // d := bLimbs[i] 816 | // T[n+i] = addMulVVW(T[i:n+i], aLimbs, d) 817 | // } 818 | // 819 | // where d is a digit of the multiplier, T[i:n+i] is the shifted 820 | // position of the product of that digit, and T[n+i] is the final carry. 821 | // Note that T[i] isn't modified after processing the i-th digit. 822 | // 823 | // Instead of running two loops, one for Step 1 and one for Steps 2–6, 824 | // the result of Step 1 is computed during the next loop. This is 825 | // possible because each iteration only uses T[i] in Step 2 and then 826 | // discards it in Step 6. 827 | d := bLimbs[i] 828 | c1 := addMulVVW(T[i:n+i], aLimbs, d) 829 | 830 | // Step 6 is replaced by shifting the virtual window we operate 831 | // over: T of the algorithm is T[i:] for us. 
That means that T1 in 832 | // Step 2 (T mod 2^_W) is simply T[i]. k0 in Step 3 is our m0inv. 833 | Y := T[i] * m.m0inv 834 | 835 | // Step 4 and 5 add Y × m to T, which as mentioned above is stored 836 | // at T[i:]. The two carries (from a × d and Y × m) are added up in 837 | // the next word T[n+i], and the carry bit from that addition is 838 | // brought forward to the next iteration. 839 | c2 := addMulVVW(T[i:n+i], mLimbs, Y) 840 | T[n+i], c = bits.Add(c1, c2, c) 841 | } 842 | 843 | // Finally for Step 7 we copy the final T window into x, and subtract m 844 | // if necessary (which as explained in maybeSubtractModulus can be the 845 | // case both if x >= m, or if x overflowed). 846 | // 847 | // The paper suggests in Section 4 that we can do an "Almost Montgomery 848 | // Multiplication" by subtracting only in the overflow case, but the 849 | // cost is very similar since the constant time subtraction tells us if 850 | // x >= m as a side effect, and taking care of the broken invariant is 851 | // highly undesirable (see https://go.dev/issue/13907). 852 | copy(x.reset(n).limbs, T[n:]) 853 | x.maybeSubtractModulus(choice(c), m) 854 | 855 | // The following specialized cases follow the exact same algorithm, but 856 | // optimized for the sizes most used in RSA. addMulVVW is implemented in 857 | // assembly with loop unrolling depending on the architecture and bounds 858 | // checks are removed by the compiler thanks to the constant size. 
859 | case 1024 / _W: 860 | const n = 1024 / _W // compiler hint 861 | T := make([]uint, n*2) 862 | var c uint 863 | for i := 0; i < n; i++ { 864 | d := bLimbs[i] 865 | c1 := addMulVVW1024(&T[i], &aLimbs[0], d) 866 | Y := T[i] * m.m0inv 867 | c2 := addMulVVW1024(&T[i], &mLimbs[0], Y) 868 | T[n+i], c = bits.Add(c1, c2, c) 869 | } 870 | copy(x.reset(n).limbs, T[n:]) 871 | x.maybeSubtractModulus(choice(c), m) 872 | 873 | case 1536 / _W: 874 | const n = 1536 / _W // compiler hint 875 | T := make([]uint, n*2) 876 | var c uint 877 | for i := 0; i < n; i++ { 878 | d := bLimbs[i] 879 | c1 := addMulVVW1536(&T[i], &aLimbs[0], d) 880 | Y := T[i] * m.m0inv 881 | c2 := addMulVVW1536(&T[i], &mLimbs[0], Y) 882 | T[n+i], c = bits.Add(c1, c2, c) 883 | } 884 | copy(x.reset(n).limbs, T[n:]) 885 | x.maybeSubtractModulus(choice(c), m) 886 | 887 | case 2048 / _W: 888 | const n = 2048 / _W // compiler hint 889 | T := make([]uint, n*2) 890 | var c uint 891 | for i := 0; i < n; i++ { 892 | d := bLimbs[i] 893 | c1 := addMulVVW2048(&T[i], &aLimbs[0], d) 894 | Y := T[i] * m.m0inv 895 | c2 := addMulVVW2048(&T[i], &mLimbs[0], Y) 896 | T[n+i], c = bits.Add(c1, c2, c) 897 | } 898 | copy(x.reset(n).limbs, T[n:]) 899 | x.maybeSubtractModulus(choice(c), m) 900 | } 901 | 902 | return x 903 | } 904 | 905 | // addMulVVW multiplies the multi-word value x by the single-word value y, 906 | // adding the result to the multi-word value z and returning the final carry. 907 | // It can be thought of as one row of a pen-and-paper column multiplication. 908 | // 909 | //go:norace 910 | func addMulVVW(z, x []uint, y uint) (carry uint) { 911 | _ = x[len(z)-1] // bounds check elimination hint 912 | for i := range z { 913 | hi, lo := bits.Mul(x[i], y) 914 | lo, c := bits.Add(lo, z[i], 0) 915 | // We use bits.Add with zero to get an add-with-carry instruction that 916 | // absorbs the carry from the previous bits.Add. 
917 | hi, _ = bits.Add(hi, 0, c) 918 | lo, c = bits.Add(lo, carry, 0) 919 | hi, _ = bits.Add(hi, 0, c) 920 | carry = hi 921 | z[i] = lo 922 | } 923 | return carry 924 | } 925 | 926 | // Mul calculates x = x * y mod m. 927 | // 928 | // The length of both operands must be the same as the modulus. Both operands 929 | // must already be reduced modulo m. 930 | // 931 | //go:norace 932 | func (x *Nat) Mul(y *Nat, m *Modulus) *Nat { 933 | if m.odd { 934 | // A Montgomery multiplication by a value out of the Montgomery domain 935 | // takes the result out of Montgomery representation. 936 | xR := NewNat().set(x).montgomeryRepresentation(m) // xR = x * R mod m 937 | return x.montgomeryMul(xR, y, m) // x = xR * y / R mod m 938 | } 939 | 940 | n := len(m.nat.limbs) 941 | xLimbs := x.limbs[:n] 942 | yLimbs := y.limbs[:n] 943 | 944 | switch n { 945 | default: 946 | // Attempt to use a stack-allocated backing array. 947 | T := make([]uint, 0, preallocLimbs*2) 948 | if cap(T) < n*2 { 949 | T = make([]uint, 0, n*2) 950 | } 951 | T = T[:n*2] 952 | 953 | // T = x * y 954 | for i := 0; i < n; i++ { 955 | T[n+i] = addMulVVW(T[i:n+i], xLimbs, yLimbs[i]) 956 | } 957 | 958 | // x = T mod m 959 | return x.Mod(&Nat{limbs: T}, m) 960 | 961 | // The following specialized cases follow the exact same algorithm, but 962 | // optimized for the sizes most used in RSA. See montgomeryMul for details. 
963 | case 1024 / _W: 964 | const n = 1024 / _W // compiler hint 965 | T := make([]uint, n*2) 966 | for i := 0; i < n; i++ { 967 | T[n+i] = addMulVVW1024(&T[i], &xLimbs[0], yLimbs[i]) 968 | } 969 | return x.Mod(&Nat{limbs: T}, m) 970 | case 1536 / _W: 971 | const n = 1536 / _W // compiler hint 972 | T := make([]uint, n*2) 973 | for i := 0; i < n; i++ { 974 | T[n+i] = addMulVVW1536(&T[i], &xLimbs[0], yLimbs[i]) 975 | } 976 | return x.Mod(&Nat{limbs: T}, m) 977 | case 2048 / _W: 978 | const n = 2048 / _W // compiler hint 979 | T := make([]uint, n*2) 980 | for i := 0; i < n; i++ { 981 | T[n+i] = addMulVVW2048(&T[i], &xLimbs[0], yLimbs[i]) 982 | } 983 | return x.Mod(&Nat{limbs: T}, m) 984 | } 985 | } 986 | 987 | // Exp calculates x = y^e mod m. 988 | // 989 | // The exponent e is represented in big-endian order. The output will be resized 990 | // to the size of m and overwritten. y must already be reduced modulo m. 991 | // 992 | // m must be odd, or Exp will panic. 993 | // 994 | //go:norace 995 | func (x *Nat) Exp(y *Nat, e []byte, m *Modulus) *Nat { 996 | out, x := x, y 997 | 998 | if !m.odd { 999 | panic("bigmod: modulus for Exp must be odd") 1000 | } 1001 | 1002 | // We use a 4 bit window. For our RSA workload, 4 bit windows are faster 1003 | // than 2 bit windows, but use an extra 12 nats worth of scratch space. 1004 | // Using bit sizes that don't divide 8 are more complex to implement, but 1005 | // are likely to be more efficient if necessary. 1006 | 1007 | table := [(1 << 4) - 1]*Nat{ // table[i] = x ^ (i+1) 1008 | // newNat calls are unrolled so they are allocated on the stack. 
1009 | NewNat(), NewNat(), NewNat(), NewNat(), NewNat(), 1010 | NewNat(), NewNat(), NewNat(), NewNat(), NewNat(), 1011 | NewNat(), NewNat(), NewNat(), NewNat(), NewNat(), 1012 | } 1013 | table[0].set(x).montgomeryRepresentation(m) 1014 | for i := 1; i < len(table); i++ { 1015 | table[i].montgomeryMul(table[i-1], table[0], m) 1016 | } 1017 | 1018 | out.resetFor(m) 1019 | out.limbs[0] = 1 1020 | out.montgomeryRepresentation(m) 1021 | tmp := NewNat().ExpandFor(m) 1022 | for _, b := range e { 1023 | for _, j := range []int{4, 0} { 1024 | // Square four times. Optimization note: this can be implemented 1025 | // more efficiently than with generic Montgomery multiplication. 1026 | out.montgomeryMul(out, out, m) 1027 | out.montgomeryMul(out, out, m) 1028 | out.montgomeryMul(out, out, m) 1029 | out.montgomeryMul(out, out, m) 1030 | 1031 | // Select x^k in constant time from the table. 1032 | k := uint((b >> j) & 0b1111) 1033 | for i := range table { 1034 | tmp.assign(ctEq(k, uint(i+1)), table[i]) 1035 | } 1036 | 1037 | // Multiply by x^k, discarding the result if k = 0. 1038 | tmp.montgomeryMul(out, tmp, m) 1039 | out.assign(not(ctEq(k, 0)), tmp) 1040 | } 1041 | } 1042 | 1043 | return out.montgomeryReduction(m) 1044 | } 1045 | 1046 | // ExpShortVarTime calculates out = x^e mod m. 1047 | // 1048 | // The output will be resized to the size of m and overwritten. x must already 1049 | // be reduced modulo m. This leaks the exponent through timing side-channels. 1050 | // 1051 | // m must be odd, or ExpShortVarTime will panic. 1052 | func (x *Nat) ExpShortVarTime(y *Nat, e uint, m *Modulus) *Nat { 1053 | out, x := x, y 1054 | 1055 | if !m.odd { 1056 | panic("bigmod: modulus for ExpShortVarTime must be odd") 1057 | } 1058 | // For short exponents, precomputing a table and using a window like in Exp 1059 | // doesn't pay off. Instead, we do a simple conditional square-and-multiply 1060 | // chain, skipping the initial run of zeroes. 
1061 | xR := NewNat().set(x).montgomeryRepresentation(m) 1062 | out.set(xR) 1063 | for i := bits.UintSize - bits.Len(e) + 1; i < bits.UintSize; i++ { 1064 | out.montgomeryMul(out, out, m) 1065 | if k := (e >> (bits.UintSize - i - 1)) & 1; k != 0 { 1066 | out.montgomeryMul(out, xR, m) 1067 | } 1068 | } 1069 | return out.montgomeryReduction(m) 1070 | } 1071 | 1072 | // InverseVarTime calculates x = a⁻¹ mod m and returns (x, true) if a is 1073 | // invertible. Otherwise, InverseVarTime returns (x, false) and x is not 1074 | // modified. 1075 | // 1076 | // a must be reduced modulo m, but doesn't need to have the same size. The 1077 | // output will be resized to the size of m and overwritten. 1078 | // 1079 | //go:norace 1080 | func (x *Nat) InverseVarTime(a *Nat, m *Modulus) (*Nat, bool) { 1081 | u, A, err := extendedGCD(a, m.nat) 1082 | if err != nil { 1083 | return x, false 1084 | } 1085 | if u.IsOne() == 0 { 1086 | return x, false 1087 | } 1088 | return x.set(A), true 1089 | } 1090 | 1091 | // GCDVarTime calculates x = GCD(a, b) where at least one of a or b is odd, and 1092 | // both are non-zero. If GCDVarTime returns an error, x is not modified. 1093 | // 1094 | // The output will be resized to the size of the larger of a and b. 1095 | func (x *Nat) GCDVarTime(a, b *Nat) (*Nat, error) { 1096 | u, _, err := extendedGCD(a, b) 1097 | if err != nil { 1098 | return nil, err 1099 | } 1100 | return x.set(u), nil 1101 | } 1102 | 1103 | // extendedGCD computes u and A such that a = GCD(a, m) and u = A*a - B*m. 1104 | // 1105 | // u will have the size of the larger of a and m, and A will have the size of m. 1106 | // 1107 | // It is an error if either a or m is zero, or if they are both even. 1108 | func extendedGCD(a, m *Nat) (u, A *Nat, err error) { 1109 | // This is the extended binary GCD algorithm described in the Handbook of 1110 | // Applied Cryptography, Algorithm 14.61, adapted by BoringSSL to bound 1111 | // coefficients and avoid negative numbers. 
For more details and proof of 1112 | // correctness, see https://github.com/mit-plv/fiat-crypto/pull/333/files. 1113 | // 1114 | // Following the proof linked in the PR above, the changes are: 1115 | // 1116 | // 1. Negate [B] and [C] so they are positive. The invariant now involves a 1117 | // subtraction. 1118 | // 2. If step 2 (both [x] and [y] are even) runs, abort immediately. This 1119 | // case needs to be handled by the caller. 1120 | // 3. Subtract copies of [x] and [y] as needed in step 6 (both [u] and [v] 1121 | // are odd) so coefficients stay in bounds. 1122 | // 4. Replace the [u >= v] check with [u > v]. This changes the end 1123 | // condition to [v = 0] rather than [u = 0]. This saves an extra 1124 | // subtraction due to which coefficients were negated. 1125 | // 5. Rename x and y to a and n, to capture that one is a modulus. 1126 | // 6. Rearrange steps 4 through 6 slightly. Merge the loops in steps 4 and 1127 | // 5 into the main loop (step 7's goto), and move step 6 to the start of 1128 | // the loop iteration, ensuring each loop iteration halves at least one 1129 | // value. 1130 | // 1131 | // Note this algorithm does not handle either input being zero. 
1132 | 1133 | if a.IsZero() == 1 || m.IsZero() == 1 { 1134 | return nil, nil, errors.New("extendedGCD: a or m is zero") 1135 | } 1136 | if a.IsOdd() == 0 && m.IsOdd() == 0 { 1137 | return nil, nil, errors.New("extendedGCD: both a and m are even") 1138 | } 1139 | 1140 | size := max(len(a.limbs), len(m.limbs)) 1141 | u = NewNat().set(a).expand(size) 1142 | v := NewNat().set(m).expand(size) 1143 | 1144 | A = NewNat().reset(len(m.limbs)) 1145 | A.limbs[0] = 1 1146 | B := NewNat().reset(len(a.limbs)) 1147 | C := NewNat().reset(len(m.limbs)) 1148 | D := NewNat().reset(len(a.limbs)) 1149 | D.limbs[0] = 1 1150 | 1151 | // Before and after each loop iteration, the following hold: 1152 | // 1153 | // u = A*a - B*m 1154 | // v = D*m - C*a 1155 | // 0 < u <= a 1156 | // 0 <= v <= m 1157 | // 0 <= A < m 1158 | // 0 <= B <= a 1159 | // 0 <= C < m 1160 | // 0 <= D <= a 1161 | // 1162 | // After each loop iteration, u and v only get smaller, and at least one of 1163 | // them shrinks by at least a factor of two. 1164 | for { 1165 | // If both u and v are odd, subtract the smaller from the larger. 1166 | // If u = v, we need to subtract from v to hit the modified exit condition. 1167 | if u.IsOdd() == 1 && v.IsOdd() == 1 { 1168 | if v.cmpGeq(u) == 0 { 1169 | u.sub(v) 1170 | A.Add(C, &Modulus{nat: m}) 1171 | B.Add(D, &Modulus{nat: a}) 1172 | } else { 1173 | v.sub(u) 1174 | C.Add(A, &Modulus{nat: m}) 1175 | D.Add(B, &Modulus{nat: a}) 1176 | } 1177 | } 1178 | 1179 | // Exactly one of u and v is now even. 1180 | if u.IsOdd() == v.IsOdd() { 1181 | panic("bigmod: internal error: u and v are not in the expected state") 1182 | } 1183 | 1184 | // Halve the even one and adjust the corresponding coefficient. 
1185 | if u.IsOdd() == 0 { 1186 | rshift1(u, 0) 1187 | if A.IsOdd() == 1 || B.IsOdd() == 1 { 1188 | rshift1(A, A.add(m)) 1189 | rshift1(B, B.add(a)) 1190 | } else { 1191 | rshift1(A, 0) 1192 | rshift1(B, 0) 1193 | } 1194 | } else { // v.IsOdd() == 0 1195 | rshift1(v, 0) 1196 | if C.IsOdd() == 1 || D.IsOdd() == 1 { 1197 | rshift1(C, C.add(m)) 1198 | rshift1(D, D.add(a)) 1199 | } else { 1200 | rshift1(C, 0) 1201 | rshift1(D, 0) 1202 | } 1203 | } 1204 | 1205 | if v.IsZero() == 1 { 1206 | return u, A, nil 1207 | } 1208 | } 1209 | } 1210 | 1211 | //go:norace 1212 | func rshift1(a *Nat, carry uint) { 1213 | size := len(a.limbs) 1214 | aLimbs := a.limbs[:size] 1215 | 1216 | for i := range size { 1217 | aLimbs[i] >>= 1 1218 | if i+1 < size { 1219 | aLimbs[i] |= aLimbs[i+1] << (_W - 1) 1220 | } else { 1221 | aLimbs[i] |= carry << (_W - 1) 1222 | } 1223 | } 1224 | } 1225 | 1226 | // DivShortVarTime calculates x = x / y and returns the remainder. 1227 | // 1228 | // It panics if y is zero. 1229 | // 1230 | //go:norace 1231 | func (x *Nat) DivShortVarTime(y uint) uint { 1232 | if y == 0 { 1233 | panic("bigmod: division by zero") 1234 | } 1235 | 1236 | var r uint 1237 | for i := len(x.limbs) - 1; i >= 0; i-- { 1238 | x.limbs[i], r = bits.Div(r, x.limbs[i], y) 1239 | } 1240 | return r 1241 | } 1242 | -------------------------------------------------------------------------------- /nat_386.s: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
//go:build !purego

#include "textflag.h"

// The three exported entry points below differ only in the limb count they
// load into BX before tail-jumping to the shared loop addMulVVWx<>. With
// 32-bit limbs, 1024/1536/2048 bits correspond to 32/48/64 limbs.

// func addMulVVW1024(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1024(SB), $0-16
	MOVL $32, BX			// n = 1024 / 32 limbs
	JMP addMulVVWx<>(SB)

// func addMulVVW1536(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1536(SB), $0-16
	MOVL $48, BX			// n = 1536 / 32 limbs
	JMP addMulVVWx<>(SB)

// func addMulVVW2048(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW2048(SB), $0-16
	MOVL $64, BX			// n = 2048 / 32 limbs
	JMP addMulVVWx<>(SB)

// addMulVVWx computes z[0:n] += x[0:n] * y over n limbs and stores the final
// carry limb in the result slot c. n is passed in BX by the entry points
// above. The arguments z, x, y and the result c are addressed through FP with
// the same offsets the entry points declare; the routine is reached by JMP
// and is declared NOFRAME.
TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0
	MOVL z+0(FP), DI
	MOVL x+4(FP), SI
	MOVL y+8(FP), BP
	// Point DI and SI one past the last limb and run the index from -n up to
	// zero, so the same register serves as index and loop counter.
	LEAL (DI)(BX*4), DI
	LEAL (SI)(BX*4), SI
	NEGL BX			// i = -n
	MOVL $0, CX		// c = 0
	JMP E6

L6:	MOVL (SI)(BX*4), AX	// AX = x[i]
	MULL BP			// DX:AX = x[i] * y
	ADDL CX, AX		// add the carry limb from the previous iteration
	ADCL $0, DX
	ADDL AX, (DI)(BX*4)	// z[i] += low word
	ADCL $0, DX
	MOVL DX, CX		// high word becomes the next carry limb
	ADDL $1, BX		// i++

E6:	CMPL BX, $0		// i < 0
	JL L6

	MOVL CX, c+12(FP)
	RET
2 | 3 | //go:build !purego 4 | 5 | // func addMulVVW1024(z *uint, x *uint, y uint) (c uint) 6 | // Requires: ADX, BMI2 7 | TEXT ·addMulVVW1024(SB), $0-32 8 | CMPB ·supportADX+0(SB), $0x01 9 | JEQ adx 10 | MOVQ z+0(FP), CX 11 | MOVQ x+8(FP), BX 12 | MOVQ y+16(FP), SI 13 | XORQ DI, DI 14 | 15 | // Iteration 0 16 | MOVQ (BX), AX 17 | MULQ SI 18 | ADDQ (CX), AX 19 | ADCQ $0x00, DX 20 | ADDQ DI, AX 21 | ADCQ $0x00, DX 22 | MOVQ DX, DI 23 | MOVQ AX, (CX) 24 | 25 | // Iteration 1 26 | MOVQ 8(BX), AX 27 | MULQ SI 28 | ADDQ 8(CX), AX 29 | ADCQ $0x00, DX 30 | ADDQ DI, AX 31 | ADCQ $0x00, DX 32 | MOVQ DX, DI 33 | MOVQ AX, 8(CX) 34 | 35 | // Iteration 2 36 | MOVQ 16(BX), AX 37 | MULQ SI 38 | ADDQ 16(CX), AX 39 | ADCQ $0x00, DX 40 | ADDQ DI, AX 41 | ADCQ $0x00, DX 42 | MOVQ DX, DI 43 | MOVQ AX, 16(CX) 44 | 45 | // Iteration 3 46 | MOVQ 24(BX), AX 47 | MULQ SI 48 | ADDQ 24(CX), AX 49 | ADCQ $0x00, DX 50 | ADDQ DI, AX 51 | ADCQ $0x00, DX 52 | MOVQ DX, DI 53 | MOVQ AX, 24(CX) 54 | 55 | // Iteration 4 56 | MOVQ 32(BX), AX 57 | MULQ SI 58 | ADDQ 32(CX), AX 59 | ADCQ $0x00, DX 60 | ADDQ DI, AX 61 | ADCQ $0x00, DX 62 | MOVQ DX, DI 63 | MOVQ AX, 32(CX) 64 | 65 | // Iteration 5 66 | MOVQ 40(BX), AX 67 | MULQ SI 68 | ADDQ 40(CX), AX 69 | ADCQ $0x00, DX 70 | ADDQ DI, AX 71 | ADCQ $0x00, DX 72 | MOVQ DX, DI 73 | MOVQ AX, 40(CX) 74 | 75 | // Iteration 6 76 | MOVQ 48(BX), AX 77 | MULQ SI 78 | ADDQ 48(CX), AX 79 | ADCQ $0x00, DX 80 | ADDQ DI, AX 81 | ADCQ $0x00, DX 82 | MOVQ DX, DI 83 | MOVQ AX, 48(CX) 84 | 85 | // Iteration 7 86 | MOVQ 56(BX), AX 87 | MULQ SI 88 | ADDQ 56(CX), AX 89 | ADCQ $0x00, DX 90 | ADDQ DI, AX 91 | ADCQ $0x00, DX 92 | MOVQ DX, DI 93 | MOVQ AX, 56(CX) 94 | 95 | // Iteration 8 96 | MOVQ 64(BX), AX 97 | MULQ SI 98 | ADDQ 64(CX), AX 99 | ADCQ $0x00, DX 100 | ADDQ DI, AX 101 | ADCQ $0x00, DX 102 | MOVQ DX, DI 103 | MOVQ AX, 64(CX) 104 | 105 | // Iteration 9 106 | MOVQ 72(BX), AX 107 | MULQ SI 108 | ADDQ 72(CX), AX 109 | ADCQ $0x00, DX 110 | ADDQ DI, AX 111 | ADCQ $0x00, DX 
112 | MOVQ DX, DI 113 | MOVQ AX, 72(CX) 114 | 115 | // Iteration 10 116 | MOVQ 80(BX), AX 117 | MULQ SI 118 | ADDQ 80(CX), AX 119 | ADCQ $0x00, DX 120 | ADDQ DI, AX 121 | ADCQ $0x00, DX 122 | MOVQ DX, DI 123 | MOVQ AX, 80(CX) 124 | 125 | // Iteration 11 126 | MOVQ 88(BX), AX 127 | MULQ SI 128 | ADDQ 88(CX), AX 129 | ADCQ $0x00, DX 130 | ADDQ DI, AX 131 | ADCQ $0x00, DX 132 | MOVQ DX, DI 133 | MOVQ AX, 88(CX) 134 | 135 | // Iteration 12 136 | MOVQ 96(BX), AX 137 | MULQ SI 138 | ADDQ 96(CX), AX 139 | ADCQ $0x00, DX 140 | ADDQ DI, AX 141 | ADCQ $0x00, DX 142 | MOVQ DX, DI 143 | MOVQ AX, 96(CX) 144 | 145 | // Iteration 13 146 | MOVQ 104(BX), AX 147 | MULQ SI 148 | ADDQ 104(CX), AX 149 | ADCQ $0x00, DX 150 | ADDQ DI, AX 151 | ADCQ $0x00, DX 152 | MOVQ DX, DI 153 | MOVQ AX, 104(CX) 154 | 155 | // Iteration 14 156 | MOVQ 112(BX), AX 157 | MULQ SI 158 | ADDQ 112(CX), AX 159 | ADCQ $0x00, DX 160 | ADDQ DI, AX 161 | ADCQ $0x00, DX 162 | MOVQ DX, DI 163 | MOVQ AX, 112(CX) 164 | 165 | // Iteration 15 166 | MOVQ 120(BX), AX 167 | MULQ SI 168 | ADDQ 120(CX), AX 169 | ADCQ $0x00, DX 170 | ADDQ DI, AX 171 | ADCQ $0x00, DX 172 | MOVQ DX, DI 173 | MOVQ AX, 120(CX) 174 | MOVQ DI, c+24(FP) 175 | RET 176 | 177 | adx: 178 | MOVQ z+0(FP), AX 179 | MOVQ x+8(FP), CX 180 | MOVQ y+16(FP), DX 181 | XORQ BX, BX 182 | XORQ SI, SI 183 | 184 | // Iteration 0 185 | MULXQ (CX), R8, DI 186 | ADCXQ BX, R8 187 | ADOXQ (AX), R8 188 | MOVQ R8, (AX) 189 | 190 | // Iteration 1 191 | MULXQ 8(CX), R8, BX 192 | ADCXQ DI, R8 193 | ADOXQ 8(AX), R8 194 | MOVQ R8, 8(AX) 195 | 196 | // Iteration 2 197 | MULXQ 16(CX), R8, DI 198 | ADCXQ BX, R8 199 | ADOXQ 16(AX), R8 200 | MOVQ R8, 16(AX) 201 | 202 | // Iteration 3 203 | MULXQ 24(CX), R8, BX 204 | ADCXQ DI, R8 205 | ADOXQ 24(AX), R8 206 | MOVQ R8, 24(AX) 207 | 208 | // Iteration 4 209 | MULXQ 32(CX), R8, DI 210 | ADCXQ BX, R8 211 | ADOXQ 32(AX), R8 212 | MOVQ R8, 32(AX) 213 | 214 | // Iteration 5 215 | MULXQ 40(CX), R8, BX 216 | ADCXQ DI, R8 217 | ADOXQ 40(AX), R8 
218 | MOVQ R8, 40(AX) 219 | 220 | // Iteration 6 221 | MULXQ 48(CX), R8, DI 222 | ADCXQ BX, R8 223 | ADOXQ 48(AX), R8 224 | MOVQ R8, 48(AX) 225 | 226 | // Iteration 7 227 | MULXQ 56(CX), R8, BX 228 | ADCXQ DI, R8 229 | ADOXQ 56(AX), R8 230 | MOVQ R8, 56(AX) 231 | 232 | // Iteration 8 233 | MULXQ 64(CX), R8, DI 234 | ADCXQ BX, R8 235 | ADOXQ 64(AX), R8 236 | MOVQ R8, 64(AX) 237 | 238 | // Iteration 9 239 | MULXQ 72(CX), R8, BX 240 | ADCXQ DI, R8 241 | ADOXQ 72(AX), R8 242 | MOVQ R8, 72(AX) 243 | 244 | // Iteration 10 245 | MULXQ 80(CX), R8, DI 246 | ADCXQ BX, R8 247 | ADOXQ 80(AX), R8 248 | MOVQ R8, 80(AX) 249 | 250 | // Iteration 11 251 | MULXQ 88(CX), R8, BX 252 | ADCXQ DI, R8 253 | ADOXQ 88(AX), R8 254 | MOVQ R8, 88(AX) 255 | 256 | // Iteration 12 257 | MULXQ 96(CX), R8, DI 258 | ADCXQ BX, R8 259 | ADOXQ 96(AX), R8 260 | MOVQ R8, 96(AX) 261 | 262 | // Iteration 13 263 | MULXQ 104(CX), R8, BX 264 | ADCXQ DI, R8 265 | ADOXQ 104(AX), R8 266 | MOVQ R8, 104(AX) 267 | 268 | // Iteration 14 269 | MULXQ 112(CX), R8, DI 270 | ADCXQ BX, R8 271 | ADOXQ 112(AX), R8 272 | MOVQ R8, 112(AX) 273 | 274 | // Iteration 15 275 | MULXQ 120(CX), R8, BX 276 | ADCXQ DI, R8 277 | ADOXQ 120(AX), R8 278 | MOVQ R8, 120(AX) 279 | 280 | // Add back carry flags and return 281 | ADCXQ SI, BX 282 | ADOXQ SI, BX 283 | MOVQ BX, c+24(FP) 284 | RET 285 | 286 | // func addMulVVW1536(z *uint, x *uint, y uint) (c uint) 287 | // Requires: ADX, BMI2 288 | TEXT ·addMulVVW1536(SB), $0-32 289 | CMPB ·supportADX+0(SB), $0x01 290 | JEQ adx 291 | MOVQ z+0(FP), CX 292 | MOVQ x+8(FP), BX 293 | MOVQ y+16(FP), SI 294 | XORQ DI, DI 295 | 296 | // Iteration 0 297 | MOVQ (BX), AX 298 | MULQ SI 299 | ADDQ (CX), AX 300 | ADCQ $0x00, DX 301 | ADDQ DI, AX 302 | ADCQ $0x00, DX 303 | MOVQ DX, DI 304 | MOVQ AX, (CX) 305 | 306 | // Iteration 1 307 | MOVQ 8(BX), AX 308 | MULQ SI 309 | ADDQ 8(CX), AX 310 | ADCQ $0x00, DX 311 | ADDQ DI, AX 312 | ADCQ $0x00, DX 313 | MOVQ DX, DI 314 | MOVQ AX, 8(CX) 315 | 316 | // Iteration 2 
317 | MOVQ 16(BX), AX 318 | MULQ SI 319 | ADDQ 16(CX), AX 320 | ADCQ $0x00, DX 321 | ADDQ DI, AX 322 | ADCQ $0x00, DX 323 | MOVQ DX, DI 324 | MOVQ AX, 16(CX) 325 | 326 | // Iteration 3 327 | MOVQ 24(BX), AX 328 | MULQ SI 329 | ADDQ 24(CX), AX 330 | ADCQ $0x00, DX 331 | ADDQ DI, AX 332 | ADCQ $0x00, DX 333 | MOVQ DX, DI 334 | MOVQ AX, 24(CX) 335 | 336 | // Iteration 4 337 | MOVQ 32(BX), AX 338 | MULQ SI 339 | ADDQ 32(CX), AX 340 | ADCQ $0x00, DX 341 | ADDQ DI, AX 342 | ADCQ $0x00, DX 343 | MOVQ DX, DI 344 | MOVQ AX, 32(CX) 345 | 346 | // Iteration 5 347 | MOVQ 40(BX), AX 348 | MULQ SI 349 | ADDQ 40(CX), AX 350 | ADCQ $0x00, DX 351 | ADDQ DI, AX 352 | ADCQ $0x00, DX 353 | MOVQ DX, DI 354 | MOVQ AX, 40(CX) 355 | 356 | // Iteration 6 357 | MOVQ 48(BX), AX 358 | MULQ SI 359 | ADDQ 48(CX), AX 360 | ADCQ $0x00, DX 361 | ADDQ DI, AX 362 | ADCQ $0x00, DX 363 | MOVQ DX, DI 364 | MOVQ AX, 48(CX) 365 | 366 | // Iteration 7 367 | MOVQ 56(BX), AX 368 | MULQ SI 369 | ADDQ 56(CX), AX 370 | ADCQ $0x00, DX 371 | ADDQ DI, AX 372 | ADCQ $0x00, DX 373 | MOVQ DX, DI 374 | MOVQ AX, 56(CX) 375 | 376 | // Iteration 8 377 | MOVQ 64(BX), AX 378 | MULQ SI 379 | ADDQ 64(CX), AX 380 | ADCQ $0x00, DX 381 | ADDQ DI, AX 382 | ADCQ $0x00, DX 383 | MOVQ DX, DI 384 | MOVQ AX, 64(CX) 385 | 386 | // Iteration 9 387 | MOVQ 72(BX), AX 388 | MULQ SI 389 | ADDQ 72(CX), AX 390 | ADCQ $0x00, DX 391 | ADDQ DI, AX 392 | ADCQ $0x00, DX 393 | MOVQ DX, DI 394 | MOVQ AX, 72(CX) 395 | 396 | // Iteration 10 397 | MOVQ 80(BX), AX 398 | MULQ SI 399 | ADDQ 80(CX), AX 400 | ADCQ $0x00, DX 401 | ADDQ DI, AX 402 | ADCQ $0x00, DX 403 | MOVQ DX, DI 404 | MOVQ AX, 80(CX) 405 | 406 | // Iteration 11 407 | MOVQ 88(BX), AX 408 | MULQ SI 409 | ADDQ 88(CX), AX 410 | ADCQ $0x00, DX 411 | ADDQ DI, AX 412 | ADCQ $0x00, DX 413 | MOVQ DX, DI 414 | MOVQ AX, 88(CX) 415 | 416 | // Iteration 12 417 | MOVQ 96(BX), AX 418 | MULQ SI 419 | ADDQ 96(CX), AX 420 | ADCQ $0x00, DX 421 | ADDQ DI, AX 422 | ADCQ $0x00, DX 423 | MOVQ DX, DI 424 | MOVQ 
AX, 96(CX) 425 | 426 | // Iteration 13 427 | MOVQ 104(BX), AX 428 | MULQ SI 429 | ADDQ 104(CX), AX 430 | ADCQ $0x00, DX 431 | ADDQ DI, AX 432 | ADCQ $0x00, DX 433 | MOVQ DX, DI 434 | MOVQ AX, 104(CX) 435 | 436 | // Iteration 14 437 | MOVQ 112(BX), AX 438 | MULQ SI 439 | ADDQ 112(CX), AX 440 | ADCQ $0x00, DX 441 | ADDQ DI, AX 442 | ADCQ $0x00, DX 443 | MOVQ DX, DI 444 | MOVQ AX, 112(CX) 445 | 446 | // Iteration 15 447 | MOVQ 120(BX), AX 448 | MULQ SI 449 | ADDQ 120(CX), AX 450 | ADCQ $0x00, DX 451 | ADDQ DI, AX 452 | ADCQ $0x00, DX 453 | MOVQ DX, DI 454 | MOVQ AX, 120(CX) 455 | 456 | // Iteration 16 457 | MOVQ 128(BX), AX 458 | MULQ SI 459 | ADDQ 128(CX), AX 460 | ADCQ $0x00, DX 461 | ADDQ DI, AX 462 | ADCQ $0x00, DX 463 | MOVQ DX, DI 464 | MOVQ AX, 128(CX) 465 | 466 | // Iteration 17 467 | MOVQ 136(BX), AX 468 | MULQ SI 469 | ADDQ 136(CX), AX 470 | ADCQ $0x00, DX 471 | ADDQ DI, AX 472 | ADCQ $0x00, DX 473 | MOVQ DX, DI 474 | MOVQ AX, 136(CX) 475 | 476 | // Iteration 18 477 | MOVQ 144(BX), AX 478 | MULQ SI 479 | ADDQ 144(CX), AX 480 | ADCQ $0x00, DX 481 | ADDQ DI, AX 482 | ADCQ $0x00, DX 483 | MOVQ DX, DI 484 | MOVQ AX, 144(CX) 485 | 486 | // Iteration 19 487 | MOVQ 152(BX), AX 488 | MULQ SI 489 | ADDQ 152(CX), AX 490 | ADCQ $0x00, DX 491 | ADDQ DI, AX 492 | ADCQ $0x00, DX 493 | MOVQ DX, DI 494 | MOVQ AX, 152(CX) 495 | 496 | // Iteration 20 497 | MOVQ 160(BX), AX 498 | MULQ SI 499 | ADDQ 160(CX), AX 500 | ADCQ $0x00, DX 501 | ADDQ DI, AX 502 | ADCQ $0x00, DX 503 | MOVQ DX, DI 504 | MOVQ AX, 160(CX) 505 | 506 | // Iteration 21 507 | MOVQ 168(BX), AX 508 | MULQ SI 509 | ADDQ 168(CX), AX 510 | ADCQ $0x00, DX 511 | ADDQ DI, AX 512 | ADCQ $0x00, DX 513 | MOVQ DX, DI 514 | MOVQ AX, 168(CX) 515 | 516 | // Iteration 22 517 | MOVQ 176(BX), AX 518 | MULQ SI 519 | ADDQ 176(CX), AX 520 | ADCQ $0x00, DX 521 | ADDQ DI, AX 522 | ADCQ $0x00, DX 523 | MOVQ DX, DI 524 | MOVQ AX, 176(CX) 525 | 526 | // Iteration 23 527 | MOVQ 184(BX), AX 528 | MULQ SI 529 | ADDQ 184(CX), AX 530 | ADCQ 
$0x00, DX 531 | ADDQ DI, AX 532 | ADCQ $0x00, DX 533 | MOVQ DX, DI 534 | MOVQ AX, 184(CX) 535 | MOVQ DI, c+24(FP) 536 | RET 537 | 538 | adx: 539 | MOVQ z+0(FP), AX 540 | MOVQ x+8(FP), CX 541 | MOVQ y+16(FP), DX 542 | XORQ BX, BX 543 | XORQ SI, SI 544 | 545 | // Iteration 0 546 | MULXQ (CX), R8, DI 547 | ADCXQ BX, R8 548 | ADOXQ (AX), R8 549 | MOVQ R8, (AX) 550 | 551 | // Iteration 1 552 | MULXQ 8(CX), R8, BX 553 | ADCXQ DI, R8 554 | ADOXQ 8(AX), R8 555 | MOVQ R8, 8(AX) 556 | 557 | // Iteration 2 558 | MULXQ 16(CX), R8, DI 559 | ADCXQ BX, R8 560 | ADOXQ 16(AX), R8 561 | MOVQ R8, 16(AX) 562 | 563 | // Iteration 3 564 | MULXQ 24(CX), R8, BX 565 | ADCXQ DI, R8 566 | ADOXQ 24(AX), R8 567 | MOVQ R8, 24(AX) 568 | 569 | // Iteration 4 570 | MULXQ 32(CX), R8, DI 571 | ADCXQ BX, R8 572 | ADOXQ 32(AX), R8 573 | MOVQ R8, 32(AX) 574 | 575 | // Iteration 5 576 | MULXQ 40(CX), R8, BX 577 | ADCXQ DI, R8 578 | ADOXQ 40(AX), R8 579 | MOVQ R8, 40(AX) 580 | 581 | // Iteration 6 582 | MULXQ 48(CX), R8, DI 583 | ADCXQ BX, R8 584 | ADOXQ 48(AX), R8 585 | MOVQ R8, 48(AX) 586 | 587 | // Iteration 7 588 | MULXQ 56(CX), R8, BX 589 | ADCXQ DI, R8 590 | ADOXQ 56(AX), R8 591 | MOVQ R8, 56(AX) 592 | 593 | // Iteration 8 594 | MULXQ 64(CX), R8, DI 595 | ADCXQ BX, R8 596 | ADOXQ 64(AX), R8 597 | MOVQ R8, 64(AX) 598 | 599 | // Iteration 9 600 | MULXQ 72(CX), R8, BX 601 | ADCXQ DI, R8 602 | ADOXQ 72(AX), R8 603 | MOVQ R8, 72(AX) 604 | 605 | // Iteration 10 606 | MULXQ 80(CX), R8, DI 607 | ADCXQ BX, R8 608 | ADOXQ 80(AX), R8 609 | MOVQ R8, 80(AX) 610 | 611 | // Iteration 11 612 | MULXQ 88(CX), R8, BX 613 | ADCXQ DI, R8 614 | ADOXQ 88(AX), R8 615 | MOVQ R8, 88(AX) 616 | 617 | // Iteration 12 618 | MULXQ 96(CX), R8, DI 619 | ADCXQ BX, R8 620 | ADOXQ 96(AX), R8 621 | MOVQ R8, 96(AX) 622 | 623 | // Iteration 13 624 | MULXQ 104(CX), R8, BX 625 | ADCXQ DI, R8 626 | ADOXQ 104(AX), R8 627 | MOVQ R8, 104(AX) 628 | 629 | // Iteration 14 630 | MULXQ 112(CX), R8, DI 631 | ADCXQ BX, R8 632 | ADOXQ 112(AX), R8 633 
| MOVQ R8, 112(AX) 634 | 635 | // Iteration 15 636 | MULXQ 120(CX), R8, BX 637 | ADCXQ DI, R8 638 | ADOXQ 120(AX), R8 639 | MOVQ R8, 120(AX) 640 | 641 | // Iteration 16 642 | MULXQ 128(CX), R8, DI 643 | ADCXQ BX, R8 644 | ADOXQ 128(AX), R8 645 | MOVQ R8, 128(AX) 646 | 647 | // Iteration 17 648 | MULXQ 136(CX), R8, BX 649 | ADCXQ DI, R8 650 | ADOXQ 136(AX), R8 651 | MOVQ R8, 136(AX) 652 | 653 | // Iteration 18 654 | MULXQ 144(CX), R8, DI 655 | ADCXQ BX, R8 656 | ADOXQ 144(AX), R8 657 | MOVQ R8, 144(AX) 658 | 659 | // Iteration 19 660 | MULXQ 152(CX), R8, BX 661 | ADCXQ DI, R8 662 | ADOXQ 152(AX), R8 663 | MOVQ R8, 152(AX) 664 | 665 | // Iteration 20 666 | MULXQ 160(CX), R8, DI 667 | ADCXQ BX, R8 668 | ADOXQ 160(AX), R8 669 | MOVQ R8, 160(AX) 670 | 671 | // Iteration 21 672 | MULXQ 168(CX), R8, BX 673 | ADCXQ DI, R8 674 | ADOXQ 168(AX), R8 675 | MOVQ R8, 168(AX) 676 | 677 | // Iteration 22 678 | MULXQ 176(CX), R8, DI 679 | ADCXQ BX, R8 680 | ADOXQ 176(AX), R8 681 | MOVQ R8, 176(AX) 682 | 683 | // Iteration 23 684 | MULXQ 184(CX), R8, BX 685 | ADCXQ DI, R8 686 | ADOXQ 184(AX), R8 687 | MOVQ R8, 184(AX) 688 | 689 | // Add back carry flags and return 690 | ADCXQ SI, BX 691 | ADOXQ SI, BX 692 | MOVQ BX, c+24(FP) 693 | RET 694 | 695 | // func addMulVVW2048(z *uint, x *uint, y uint) (c uint) 696 | // Requires: ADX, BMI2 697 | TEXT ·addMulVVW2048(SB), $0-32 698 | CMPB ·supportADX+0(SB), $0x01 699 | JEQ adx 700 | MOVQ z+0(FP), CX 701 | MOVQ x+8(FP), BX 702 | MOVQ y+16(FP), SI 703 | XORQ DI, DI 704 | 705 | // Iteration 0 706 | MOVQ (BX), AX 707 | MULQ SI 708 | ADDQ (CX), AX 709 | ADCQ $0x00, DX 710 | ADDQ DI, AX 711 | ADCQ $0x00, DX 712 | MOVQ DX, DI 713 | MOVQ AX, (CX) 714 | 715 | // Iteration 1 716 | MOVQ 8(BX), AX 717 | MULQ SI 718 | ADDQ 8(CX), AX 719 | ADCQ $0x00, DX 720 | ADDQ DI, AX 721 | ADCQ $0x00, DX 722 | MOVQ DX, DI 723 | MOVQ AX, 8(CX) 724 | 725 | // Iteration 2 726 | MOVQ 16(BX), AX 727 | MULQ SI 728 | ADDQ 16(CX), AX 729 | ADCQ $0x00, DX 730 | ADDQ DI, AX 731 | 
ADCQ $0x00, DX 732 | MOVQ DX, DI 733 | MOVQ AX, 16(CX) 734 | 735 | // Iteration 3 736 | MOVQ 24(BX), AX 737 | MULQ SI 738 | ADDQ 24(CX), AX 739 | ADCQ $0x00, DX 740 | ADDQ DI, AX 741 | ADCQ $0x00, DX 742 | MOVQ DX, DI 743 | MOVQ AX, 24(CX) 744 | 745 | // Iteration 4 746 | MOVQ 32(BX), AX 747 | MULQ SI 748 | ADDQ 32(CX), AX 749 | ADCQ $0x00, DX 750 | ADDQ DI, AX 751 | ADCQ $0x00, DX 752 | MOVQ DX, DI 753 | MOVQ AX, 32(CX) 754 | 755 | // Iteration 5 756 | MOVQ 40(BX), AX 757 | MULQ SI 758 | ADDQ 40(CX), AX 759 | ADCQ $0x00, DX 760 | ADDQ DI, AX 761 | ADCQ $0x00, DX 762 | MOVQ DX, DI 763 | MOVQ AX, 40(CX) 764 | 765 | // Iteration 6 766 | MOVQ 48(BX), AX 767 | MULQ SI 768 | ADDQ 48(CX), AX 769 | ADCQ $0x00, DX 770 | ADDQ DI, AX 771 | ADCQ $0x00, DX 772 | MOVQ DX, DI 773 | MOVQ AX, 48(CX) 774 | 775 | // Iteration 7 776 | MOVQ 56(BX), AX 777 | MULQ SI 778 | ADDQ 56(CX), AX 779 | ADCQ $0x00, DX 780 | ADDQ DI, AX 781 | ADCQ $0x00, DX 782 | MOVQ DX, DI 783 | MOVQ AX, 56(CX) 784 | 785 | // Iteration 8 786 | MOVQ 64(BX), AX 787 | MULQ SI 788 | ADDQ 64(CX), AX 789 | ADCQ $0x00, DX 790 | ADDQ DI, AX 791 | ADCQ $0x00, DX 792 | MOVQ DX, DI 793 | MOVQ AX, 64(CX) 794 | 795 | // Iteration 9 796 | MOVQ 72(BX), AX 797 | MULQ SI 798 | ADDQ 72(CX), AX 799 | ADCQ $0x00, DX 800 | ADDQ DI, AX 801 | ADCQ $0x00, DX 802 | MOVQ DX, DI 803 | MOVQ AX, 72(CX) 804 | 805 | // Iteration 10 806 | MOVQ 80(BX), AX 807 | MULQ SI 808 | ADDQ 80(CX), AX 809 | ADCQ $0x00, DX 810 | ADDQ DI, AX 811 | ADCQ $0x00, DX 812 | MOVQ DX, DI 813 | MOVQ AX, 80(CX) 814 | 815 | // Iteration 11 816 | MOVQ 88(BX), AX 817 | MULQ SI 818 | ADDQ 88(CX), AX 819 | ADCQ $0x00, DX 820 | ADDQ DI, AX 821 | ADCQ $0x00, DX 822 | MOVQ DX, DI 823 | MOVQ AX, 88(CX) 824 | 825 | // Iteration 12 826 | MOVQ 96(BX), AX 827 | MULQ SI 828 | ADDQ 96(CX), AX 829 | ADCQ $0x00, DX 830 | ADDQ DI, AX 831 | ADCQ $0x00, DX 832 | MOVQ DX, DI 833 | MOVQ AX, 96(CX) 834 | 835 | // Iteration 13 836 | MOVQ 104(BX), AX 837 | MULQ SI 838 | ADDQ 104(CX), AX 839 
| ADCQ $0x00, DX 840 | ADDQ DI, AX 841 | ADCQ $0x00, DX 842 | MOVQ DX, DI 843 | MOVQ AX, 104(CX) 844 | 845 | // Iteration 14 846 | MOVQ 112(BX), AX 847 | MULQ SI 848 | ADDQ 112(CX), AX 849 | ADCQ $0x00, DX 850 | ADDQ DI, AX 851 | ADCQ $0x00, DX 852 | MOVQ DX, DI 853 | MOVQ AX, 112(CX) 854 | 855 | // Iteration 15 856 | MOVQ 120(BX), AX 857 | MULQ SI 858 | ADDQ 120(CX), AX 859 | ADCQ $0x00, DX 860 | ADDQ DI, AX 861 | ADCQ $0x00, DX 862 | MOVQ DX, DI 863 | MOVQ AX, 120(CX) 864 | 865 | // Iteration 16 866 | MOVQ 128(BX), AX 867 | MULQ SI 868 | ADDQ 128(CX), AX 869 | ADCQ $0x00, DX 870 | ADDQ DI, AX 871 | ADCQ $0x00, DX 872 | MOVQ DX, DI 873 | MOVQ AX, 128(CX) 874 | 875 | // Iteration 17 876 | MOVQ 136(BX), AX 877 | MULQ SI 878 | ADDQ 136(CX), AX 879 | ADCQ $0x00, DX 880 | ADDQ DI, AX 881 | ADCQ $0x00, DX 882 | MOVQ DX, DI 883 | MOVQ AX, 136(CX) 884 | 885 | // Iteration 18 886 | MOVQ 144(BX), AX 887 | MULQ SI 888 | ADDQ 144(CX), AX 889 | ADCQ $0x00, DX 890 | ADDQ DI, AX 891 | ADCQ $0x00, DX 892 | MOVQ DX, DI 893 | MOVQ AX, 144(CX) 894 | 895 | // Iteration 19 896 | MOVQ 152(BX), AX 897 | MULQ SI 898 | ADDQ 152(CX), AX 899 | ADCQ $0x00, DX 900 | ADDQ DI, AX 901 | ADCQ $0x00, DX 902 | MOVQ DX, DI 903 | MOVQ AX, 152(CX) 904 | 905 | // Iteration 20 906 | MOVQ 160(BX), AX 907 | MULQ SI 908 | ADDQ 160(CX), AX 909 | ADCQ $0x00, DX 910 | ADDQ DI, AX 911 | ADCQ $0x00, DX 912 | MOVQ DX, DI 913 | MOVQ AX, 160(CX) 914 | 915 | // Iteration 21 916 | MOVQ 168(BX), AX 917 | MULQ SI 918 | ADDQ 168(CX), AX 919 | ADCQ $0x00, DX 920 | ADDQ DI, AX 921 | ADCQ $0x00, DX 922 | MOVQ DX, DI 923 | MOVQ AX, 168(CX) 924 | 925 | // Iteration 22 926 | MOVQ 176(BX), AX 927 | MULQ SI 928 | ADDQ 176(CX), AX 929 | ADCQ $0x00, DX 930 | ADDQ DI, AX 931 | ADCQ $0x00, DX 932 | MOVQ DX, DI 933 | MOVQ AX, 176(CX) 934 | 935 | // Iteration 23 936 | MOVQ 184(BX), AX 937 | MULQ SI 938 | ADDQ 184(CX), AX 939 | ADCQ $0x00, DX 940 | ADDQ DI, AX 941 | ADCQ $0x00, DX 942 | MOVQ DX, DI 943 | MOVQ AX, 184(CX) 944 | 945 | 
// Iteration 24 946 | MOVQ 192(BX), AX 947 | MULQ SI 948 | ADDQ 192(CX), AX 949 | ADCQ $0x00, DX 950 | ADDQ DI, AX 951 | ADCQ $0x00, DX 952 | MOVQ DX, DI 953 | MOVQ AX, 192(CX) 954 | 955 | // Iteration 25 956 | MOVQ 200(BX), AX 957 | MULQ SI 958 | ADDQ 200(CX), AX 959 | ADCQ $0x00, DX 960 | ADDQ DI, AX 961 | ADCQ $0x00, DX 962 | MOVQ DX, DI 963 | MOVQ AX, 200(CX) 964 | 965 | // Iteration 26 966 | MOVQ 208(BX), AX 967 | MULQ SI 968 | ADDQ 208(CX), AX 969 | ADCQ $0x00, DX 970 | ADDQ DI, AX 971 | ADCQ $0x00, DX 972 | MOVQ DX, DI 973 | MOVQ AX, 208(CX) 974 | 975 | // Iteration 27 976 | MOVQ 216(BX), AX 977 | MULQ SI 978 | ADDQ 216(CX), AX 979 | ADCQ $0x00, DX 980 | ADDQ DI, AX 981 | ADCQ $0x00, DX 982 | MOVQ DX, DI 983 | MOVQ AX, 216(CX) 984 | 985 | // Iteration 28 986 | MOVQ 224(BX), AX 987 | MULQ SI 988 | ADDQ 224(CX), AX 989 | ADCQ $0x00, DX 990 | ADDQ DI, AX 991 | ADCQ $0x00, DX 992 | MOVQ DX, DI 993 | MOVQ AX, 224(CX) 994 | 995 | // Iteration 29 996 | MOVQ 232(BX), AX 997 | MULQ SI 998 | ADDQ 232(CX), AX 999 | ADCQ $0x00, DX 1000 | ADDQ DI, AX 1001 | ADCQ $0x00, DX 1002 | MOVQ DX, DI 1003 | MOVQ AX, 232(CX) 1004 | 1005 | // Iteration 30 1006 | MOVQ 240(BX), AX 1007 | MULQ SI 1008 | ADDQ 240(CX), AX 1009 | ADCQ $0x00, DX 1010 | ADDQ DI, AX 1011 | ADCQ $0x00, DX 1012 | MOVQ DX, DI 1013 | MOVQ AX, 240(CX) 1014 | 1015 | // Iteration 31 1016 | MOVQ 248(BX), AX 1017 | MULQ SI 1018 | ADDQ 248(CX), AX 1019 | ADCQ $0x00, DX 1020 | ADDQ DI, AX 1021 | ADCQ $0x00, DX 1022 | MOVQ DX, DI 1023 | MOVQ AX, 248(CX) 1024 | MOVQ DI, c+24(FP) 1025 | RET 1026 | 1027 | adx: 1028 | MOVQ z+0(FP), AX 1029 | MOVQ x+8(FP), CX 1030 | MOVQ y+16(FP), DX 1031 | XORQ BX, BX 1032 | XORQ SI, SI 1033 | 1034 | // Iteration 0 1035 | MULXQ (CX), R8, DI 1036 | ADCXQ BX, R8 1037 | ADOXQ (AX), R8 1038 | MOVQ R8, (AX) 1039 | 1040 | // Iteration 1 1041 | MULXQ 8(CX), R8, BX 1042 | ADCXQ DI, R8 1043 | ADOXQ 8(AX), R8 1044 | MOVQ R8, 8(AX) 1045 | 1046 | // Iteration 2 1047 | MULXQ 16(CX), R8, DI 1048 | ADCXQ 
BX, R8 1049 | ADOXQ 16(AX), R8 1050 | MOVQ R8, 16(AX) 1051 | 1052 | // Iteration 3 1053 | MULXQ 24(CX), R8, BX 1054 | ADCXQ DI, R8 1055 | ADOXQ 24(AX), R8 1056 | MOVQ R8, 24(AX) 1057 | 1058 | // Iteration 4 1059 | MULXQ 32(CX), R8, DI 1060 | ADCXQ BX, R8 1061 | ADOXQ 32(AX), R8 1062 | MOVQ R8, 32(AX) 1063 | 1064 | // Iteration 5 1065 | MULXQ 40(CX), R8, BX 1066 | ADCXQ DI, R8 1067 | ADOXQ 40(AX), R8 1068 | MOVQ R8, 40(AX) 1069 | 1070 | // Iteration 6 1071 | MULXQ 48(CX), R8, DI 1072 | ADCXQ BX, R8 1073 | ADOXQ 48(AX), R8 1074 | MOVQ R8, 48(AX) 1075 | 1076 | // Iteration 7 1077 | MULXQ 56(CX), R8, BX 1078 | ADCXQ DI, R8 1079 | ADOXQ 56(AX), R8 1080 | MOVQ R8, 56(AX) 1081 | 1082 | // Iteration 8 1083 | MULXQ 64(CX), R8, DI 1084 | ADCXQ BX, R8 1085 | ADOXQ 64(AX), R8 1086 | MOVQ R8, 64(AX) 1087 | 1088 | // Iteration 9 1089 | MULXQ 72(CX), R8, BX 1090 | ADCXQ DI, R8 1091 | ADOXQ 72(AX), R8 1092 | MOVQ R8, 72(AX) 1093 | 1094 | // Iteration 10 1095 | MULXQ 80(CX), R8, DI 1096 | ADCXQ BX, R8 1097 | ADOXQ 80(AX), R8 1098 | MOVQ R8, 80(AX) 1099 | 1100 | // Iteration 11 1101 | MULXQ 88(CX), R8, BX 1102 | ADCXQ DI, R8 1103 | ADOXQ 88(AX), R8 1104 | MOVQ R8, 88(AX) 1105 | 1106 | // Iteration 12 1107 | MULXQ 96(CX), R8, DI 1108 | ADCXQ BX, R8 1109 | ADOXQ 96(AX), R8 1110 | MOVQ R8, 96(AX) 1111 | 1112 | // Iteration 13 1113 | MULXQ 104(CX), R8, BX 1114 | ADCXQ DI, R8 1115 | ADOXQ 104(AX), R8 1116 | MOVQ R8, 104(AX) 1117 | 1118 | // Iteration 14 1119 | MULXQ 112(CX), R8, DI 1120 | ADCXQ BX, R8 1121 | ADOXQ 112(AX), R8 1122 | MOVQ R8, 112(AX) 1123 | 1124 | // Iteration 15 1125 | MULXQ 120(CX), R8, BX 1126 | ADCXQ DI, R8 1127 | ADOXQ 120(AX), R8 1128 | MOVQ R8, 120(AX) 1129 | 1130 | // Iteration 16 1131 | MULXQ 128(CX), R8, DI 1132 | ADCXQ BX, R8 1133 | ADOXQ 128(AX), R8 1134 | MOVQ R8, 128(AX) 1135 | 1136 | // Iteration 17 1137 | MULXQ 136(CX), R8, BX 1138 | ADCXQ DI, R8 1139 | ADOXQ 136(AX), R8 1140 | MOVQ R8, 136(AX) 1141 | 1142 | // Iteration 18 1143 | MULXQ 144(CX), R8, DI 
1144 | ADCXQ BX, R8 1145 | ADOXQ 144(AX), R8 1146 | MOVQ R8, 144(AX) 1147 | 1148 | // Iteration 19 1149 | MULXQ 152(CX), R8, BX 1150 | ADCXQ DI, R8 1151 | ADOXQ 152(AX), R8 1152 | MOVQ R8, 152(AX) 1153 | 1154 | // Iteration 20 1155 | MULXQ 160(CX), R8, DI 1156 | ADCXQ BX, R8 1157 | ADOXQ 160(AX), R8 1158 | MOVQ R8, 160(AX) 1159 | 1160 | // Iteration 21 1161 | MULXQ 168(CX), R8, BX 1162 | ADCXQ DI, R8 1163 | ADOXQ 168(AX), R8 1164 | MOVQ R8, 168(AX) 1165 | 1166 | // Iteration 22 1167 | MULXQ 176(CX), R8, DI 1168 | ADCXQ BX, R8 1169 | ADOXQ 176(AX), R8 1170 | MOVQ R8, 176(AX) 1171 | 1172 | // Iteration 23 1173 | MULXQ 184(CX), R8, BX 1174 | ADCXQ DI, R8 1175 | ADOXQ 184(AX), R8 1176 | MOVQ R8, 184(AX) 1177 | 1178 | // Iteration 24 1179 | MULXQ 192(CX), R8, DI 1180 | ADCXQ BX, R8 1181 | ADOXQ 192(AX), R8 1182 | MOVQ R8, 192(AX) 1183 | 1184 | // Iteration 25 1185 | MULXQ 200(CX), R8, BX 1186 | ADCXQ DI, R8 1187 | ADOXQ 200(AX), R8 1188 | MOVQ R8, 200(AX) 1189 | 1190 | // Iteration 26 1191 | MULXQ 208(CX), R8, DI 1192 | ADCXQ BX, R8 1193 | ADOXQ 208(AX), R8 1194 | MOVQ R8, 208(AX) 1195 | 1196 | // Iteration 27 1197 | MULXQ 216(CX), R8, BX 1198 | ADCXQ DI, R8 1199 | ADOXQ 216(AX), R8 1200 | MOVQ R8, 216(AX) 1201 | 1202 | // Iteration 28 1203 | MULXQ 224(CX), R8, DI 1204 | ADCXQ BX, R8 1205 | ADOXQ 224(AX), R8 1206 | MOVQ R8, 224(AX) 1207 | 1208 | // Iteration 29 1209 | MULXQ 232(CX), R8, BX 1210 | ADCXQ DI, R8 1211 | ADOXQ 232(AX), R8 1212 | MOVQ R8, 232(AX) 1213 | 1214 | // Iteration 30 1215 | MULXQ 240(CX), R8, DI 1216 | ADCXQ BX, R8 1217 | ADOXQ 240(AX), R8 1218 | MOVQ R8, 240(AX) 1219 | 1220 | // Iteration 31 1221 | MULXQ 248(CX), R8, BX 1222 | ADCXQ DI, R8 1223 | ADOXQ 248(AX), R8 1224 | MOVQ R8, 248(AX) 1225 | 1226 | // Add back carry flags and return 1227 | ADCXQ SI, BX 1228 | ADOXQ SI, BX 1229 | MOVQ BX, c+24(FP) 1230 | RET 1231 | -------------------------------------------------------------------------------- /nat_arm.s: 
-------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //go:build !purego 6 | 7 | #include "textflag.h" 8 | 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint) 10 | TEXT ·addMulVVW1024(SB), $0-16 11 | MOVW $32, R5 // R5 = number of 32-bit words for addMulVVWx 12 | JMP addMulVVWx<>(SB) 13 | 14 | // func addMulVVW1536(z, x *uint, y uint) (c uint) 15 | TEXT ·addMulVVW1536(SB), $0-16 16 | MOVW $48, R5 // R5 = number of 32-bit words for addMulVVWx 17 | JMP addMulVVWx<>(SB) 18 | 19 | // func addMulVVW2048(z, x *uint, y uint) (c uint) 20 | TEXT ·addMulVVW2048(SB), $0-16 21 | MOVW $64, R5 // R5 = number of 32-bit words for addMulVVWx 22 | JMP addMulVVWx<>(SB) 23 | // addMulVVWx adds x[i]*y into z[i] for R5 consecutive words and returns the final carry in c. R5 must be set by the caller; the arguments are read from the caller's frame. 24 | TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0 25 | MOVW $0, R0 // R0 = 0, so that ADC R0, Rn only adds the carry flag 26 | MOVW z+0(FP), R1 27 | MOVW x+4(FP), R2 28 | MOVW y+8(FP), R3 29 | ADD R5<<2, R1, R5 // R5 = z + 4*R5: address just past the last word of z 30 | MOVW $0, R4 // R4 = running carry word 31 | B E9 32 | 33 | L9: MOVW.P 4(R2), R6 // load next word of x, then advance x by 4 bytes 34 | MULLU R6, R3, (R7, R6) // R7:R6 = x[i] * y (R7 high word, R6 low word) 35 | ADD.S R4, R6 // low += carry from previous word 36 | ADC R0, R7 // propagate into high 37 | MOVW 0(R1), R4 // R4 = z[i] 38 | ADD.S R4, R6 // low += z[i] 39 | ADC R0, R7 // propagate into high 40 | MOVW.P R6, 4(R1) // store low word to z[i], then advance z by 4 bytes 41 | MOVW R7, R4 // high word becomes the next carry 42 | 43 | E9: TEQ R1, R5 // loop until z reaches the end address 44 | BNE L9 45 | 46 | MOVW R4, c+12(FP) 47 | RET 48 | -------------------------------------------------------------------------------- /nat_arm64.s: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | //go:build !purego 6 | 7 | #include "textflag.h" 8 | 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint) 10 | TEXT ·addMulVVW1024(SB), $0-32 11 | MOVD $16, R0 12 | JMP addMulVVWx<>(SB) 13 | 14 | // func addMulVVW1536(z, x *uint, y uint) (c uint) 15 | TEXT ·addMulVVW1536(SB), $0-32 16 | MOVD $24, R0 17 | JMP addMulVVWx<>(SB) 18 | 19 | // func addMulVVW2048(z, x *uint, y uint) (c uint) 20 | TEXT ·addMulVVW2048(SB), $0-32 21 | MOVD $32, R0 22 | JMP addMulVVWx<>(SB) 23 | 24 | TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0 25 | MOVD z+0(FP), R1 26 | MOVD x+8(FP), R2 27 | MOVD y+16(FP), R3 28 | MOVD $0, R4 29 | 30 | // The main loop of this code operates on a block of 4 words every iteration 31 | // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] 32 | // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next 33 | // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. 34 | loop: 35 | CBZ R0, done 36 | 37 | LDP.P 16(R2), (R5, R6) 38 | LDP.P 16(R2), (R7, R8) 39 | 40 | LDP (R1), (R9, R10) 41 | ADDS R4, R9 42 | MUL R6, R3, R14 43 | ADCS R14, R10 44 | MUL R7, R3, R15 45 | LDP 16(R1), (R11, R12) 46 | ADCS R15, R11 47 | MUL R8, R3, R16 48 | ADCS R16, R12 49 | UMULH R8, R3, R20 50 | ADC $0, R20 51 | 52 | MUL R5, R3, R13 53 | ADDS R13, R9 54 | UMULH R5, R3, R17 55 | ADCS R17, R10 56 | UMULH R6, R3, R21 57 | STP.P (R9, R10), 16(R1) 58 | ADCS R21, R11 59 | UMULH R7, R3, R19 60 | ADCS R19, R12 61 | STP.P (R11, R12), 16(R1) 62 | ADC $0, R20, R4 63 | 64 | SUB $4, R0 65 | B loop 66 | 67 | done: 68 | MOVD R4, c+24(FP) 69 | RET 70 | -------------------------------------------------------------------------------- /nat_asm.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | //go:build !purego && (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || riscv64 || s390x) 6 | 7 | package bigmod 8 | 9 | import "golang.org/x/sys/cpu" 10 | 11 | // amd64 assembly uses ADCX/ADOX/MULX if ADX is available to run two carry 12 | // chains in the flags in parallel across the whole operation, and aggressively 13 | // unrolls loops. arm64 processes four words at a time. 14 | // 15 | // It's unclear why the assembly for all other architectures, as well as for 16 | // amd64 without ADX, performs better than the compiler output. 17 | // TODO(filippo): file cmd/compile performance issue. 18 | // supportADX is true when the CPU reports both ADX and BMI2; the amd64 assembly compares it against 1 to choose its ADCX/ADOX/MULX code path. 19 | var supportADX = cpu.X86.HasADX && cpu.X86.HasBMI2 20 | // The addMulVVW* functions below are implemented in the per-architecture assembly files. Each adds x*y into z over a fixed operand size (1024, 1536 or 2048 bits) and returns the carry word. 21 | //go:noescape 22 | func addMulVVW1024(z, x *uint, y uint) (c uint) 23 | 24 | //go:noescape 25 | func addMulVVW1536(z, x *uint, y uint) (c uint) 26 | 27 | //go:noescape 28 | func addMulVVW2048(z, x *uint, y uint) (c uint) 29 | -------------------------------------------------------------------------------- /nat_loong64.s: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | // derived from crypto/internal/fips140/bigmod/nat_riscv64.s 6 | 7 | //go:build !purego 8 | 9 | #include "textflag.h" 10 | 11 | // func addMulVVW1024(z, x *uint, y uint) (c uint) 12 | TEXT ·addMulVVW1024(SB),$0-32 13 | MOVV $16, R8 14 | JMP addMulVVWx<>(SB) 15 | 16 | // func addMulVVW1536(z, x *uint, y uint) (c uint) 17 | TEXT ·addMulVVW1536(SB),$0-32 18 | MOVV $24, R8 19 | JMP addMulVVWx<>(SB) 20 | 21 | // func addMulVVW2048(z, x *uint, y uint) (c uint) 22 | TEXT ·addMulVVW2048(SB),$0-32 23 | MOVV $32, R8 24 | JMP addMulVVWx<>(SB) 25 | 26 | TEXT addMulVVWx<>(SB),NOFRAME|NOSPLIT,$0 27 | MOVV z+0(FP), R4 28 | MOVV x+8(FP), R6 29 | MOVV y+16(FP), R5 30 | MOVV $0, R7 31 | 32 | BEQ R8, R0, done 33 | loop: 34 | MOVV 0*8(R4), R9 // z[0] 35 | MOVV 1*8(R4), R10 // z[1] 36 | MOVV 2*8(R4), R11 // z[2] 37 | MOVV 3*8(R4), R12 // z[3] 38 | 39 | MOVV 0*8(R6), R13 // x[0] 40 | MOVV 1*8(R6), R14 // x[1] 41 | MOVV 2*8(R6), R15 // x[2] 42 | MOVV 3*8(R6), R16 // x[3] 43 | 44 | MULHVU R13, R5, R17 // z_hi[0] = x[0] * y 45 | MULV R13, R5, R13 // z_lo[0] = x[0] * y 46 | ADDV R13, R9, R18 // z_lo[0] = x[0] * y + z[0] 47 | SGTU R13, R18, R19 48 | ADDV R17, R19, R17 // z_hi[0] = x[0] * y + z[0] 49 | ADDV R18, R7, R9 // z_lo[0] = x[0] * y + z[0] + c 50 | SGTU R18, R9, R19 51 | ADDV R17, R19, R7 // next c 52 | 53 | MULHVU R14, R5, R24 // z_hi[1] = x[1] * y 54 | MULV R14, R5, R14 // z_lo[1] = x[1] * y 55 | ADDV R14, R10, R18 // z_lo[1] = x[1] * y + z[1] 56 | SGTU R14, R18, R19 57 | ADDV R24, R19, R24 // z_hi[1] = x[1] * y + z[1] 58 | ADDV R18, R7, R10 // z_lo[1] = x[1] * y + z[1] + c 59 | SGTU R18, R10, R19 60 | ADDV R24, R19, R7 // next c 61 | 62 | MULHVU R15, R5, R25 // z_hi[2] = x[2] * y 63 | MULV R15, R5, R15 // z_lo[2] = x[2] * y 64 | ADDV R15, R11, R18 // z_lo[2] = x[2] * y + z[2] 65 | SGTU R15, R18, R19 66 | ADDV R25, R19, R25 // z_hi[2] = x[2] * y + z[2] 67 | ADDV R18, R7, R11 // z_lo[2] = x[2] * y + z[2] + c 68 | SGTU R18, R11, R19 69 | ADDV R25, R19, R7 // next c 70 | 71 | 
MULHVU R16, R5, R26 // z_hi[3] = x[3] * y 72 | MULV R16, R5, R16 // z_lo[3] = x[3] * y 73 | ADDV R16, R12, R18 // z_lo[3] = x[3] * y + z[3] 74 | SGTU R16, R18, R19 75 | ADDV R26, R19, R26 // z_hi[3] = x[3] * y + z[3] 76 | ADDV R18, R7, R12 // z_lo[3] = x[3] * y + z[3] + c 77 | SGTU R18, R12, R19 78 | ADDV R26, R19, R7 // next c 79 | 80 | MOVV R9, 0*8(R4) // z[0] 81 | MOVV R10, 1*8(R4) // z[1] 82 | MOVV R11, 2*8(R4) // z[2] 83 | MOVV R12, 3*8(R4) // z[3] 84 | 85 | ADDV $32, R4 86 | ADDV $32, R6 87 | 88 | SUBV $4, R8 89 | BNE R8, R0, loop 90 | 91 | done: 92 | MOVV R7, c+24(FP) 93 | RET 94 | -------------------------------------------------------------------------------- /nat_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //go:build purego || !(386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || riscv64 || s390x || wasm) 6 | 7 | package bigmod 8 | 9 | import "unsafe" 10 | 11 | func addMulVVW1024(z, x *uint, y uint) (c uint) { 12 | return addMulVVW(unsafe.Slice(z, 1024/_W), unsafe.Slice(x, 1024/_W), y) 13 | } 14 | 15 | func addMulVVW1536(z, x *uint, y uint) (c uint) { 16 | return addMulVVW(unsafe.Slice(z, 1536/_W), unsafe.Slice(x, 1536/_W), y) 17 | } 18 | 19 | func addMulVVW2048(z, x *uint, y uint) (c uint) { 20 | return addMulVVW(unsafe.Slice(z, 2048/_W), unsafe.Slice(x, 2048/_W), y) 21 | } 22 | -------------------------------------------------------------------------------- /nat_ppc64x.s: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | //go:build !purego && (ppc64 || ppc64le) 6 | 7 | #include "textflag.h" 8 | 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint) 10 | TEXT ·addMulVVW1024(SB), $0-32 11 | MOVD $4, R6 // R6 = z_len/4 12 | JMP addMulVVWx<>(SB) 13 | 14 | // func addMulVVW1536(z, x *uint, y uint) (c uint) 15 | TEXT ·addMulVVW1536(SB), $0-32 16 | MOVD $6, R6 // R6 = z_len/4 17 | JMP addMulVVWx<>(SB) 18 | 19 | // func addMulVVW2048(z, x *uint, y uint) (c uint) 20 | TEXT ·addMulVVW2048(SB), $0-32 21 | MOVD $8, R6 // R6 = z_len/4 22 | JMP addMulVVWx<>(SB) 23 | 24 | // This local function expects to be called only by 25 | // callers above. R6 contains the z length/4 26 | // since 4 values are processed for each 27 | // loop iteration, and is guaranteed to be > 0. 28 | // If other callers are added this function might 29 | // need to change. 30 | TEXT addMulVVWx<>(SB), NOSPLIT, $0 31 | MOVD z+0(FP), R3 32 | MOVD x+8(FP), R4 33 | MOVD y+16(FP), R5 34 | 35 | MOVD $0, R9 // R9 = c = 0 36 | MOVD R6, CTR // Initialize loop counter 37 | PCALIGN $16 38 | 39 | loop: 40 | MOVD 0(R4), R14 // x[i] 41 | MOVD 8(R4), R16 // x[i+1] 42 | MOVD 16(R4), R18 // x[i+2] 43 | MOVD 24(R4), R20 // x[i+3] 44 | MOVD 0(R3), R15 // z[i] 45 | MOVD 8(R3), R17 // z[i+1] 46 | MOVD 16(R3), R19 // z[i+2] 47 | MOVD 24(R3), R21 // z[i+3] 48 | MULLD R5, R14, R10 // low x[i]*y 49 | MULHDU R5, R14, R11 // high x[i]*y 50 | ADDC R15, R10 51 | ADDZE R11 52 | ADDC R9, R10 53 | ADDZE R11, R9 54 | MULLD R5, R16, R14 // low x[i+1]*y 55 | MULHDU R5, R16, R15 // high x[i+1]*y 56 | ADDC R17, R14 57 | ADDZE R15 58 | ADDC R9, R14 59 | ADDZE R15, R9 60 | MULLD R5, R18, R16 // low x[i+2]*y 61 | MULHDU R5, R18, R17 // high x[i+2]*y 62 | ADDC R19, R16 63 | ADDZE R17 64 | ADDC R9, R16 65 | ADDZE R17, R9 66 | MULLD R5, R20, R18 // low x[i+3]*y 67 | MULHDU R5, R20, R19 // high x[i+3]*y 68 | ADDC R21, R18 69 | ADDZE R19 70 | ADDC R9, R18 71 | ADDZE R19, R9 72 | MOVD R10, 0(R3) // z[i] 73 | MOVD R14, 8(R3) // z[i+1] 74 | MOVD R16, 16(R3) // 
z[i+2] 75 | MOVD R18, 24(R3) // z[i+3] 76 | ADD $32, R3 77 | ADD $32, R4 78 | BDNZ loop 79 | 80 | done: 81 | MOVD R9, c+24(FP) 82 | RET 83 | -------------------------------------------------------------------------------- /nat_riscv64.s: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //go:build !purego 6 | 7 | #include "textflag.h" 8 | 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint) 10 | TEXT ·addMulVVW1024(SB),$0-32 11 | MOV $16, X30 12 | JMP addMulVVWx<>(SB) 13 | 14 | // func addMulVVW1536(z, x *uint, y uint) (c uint) 15 | TEXT ·addMulVVW1536(SB),$0-32 16 | MOV $24, X30 17 | JMP addMulVVWx<>(SB) 18 | 19 | // func addMulVVW2048(z, x *uint, y uint) (c uint) 20 | TEXT ·addMulVVW2048(SB),$0-32 21 | MOV $32, X30 22 | JMP addMulVVWx<>(SB) 23 | 24 | TEXT addMulVVWx<>(SB),NOFRAME|NOSPLIT,$0 25 | MOV z+0(FP), X5 26 | MOV x+8(FP), X7 27 | MOV y+16(FP), X6 28 | MOV $0, X29 29 | 30 | BEQZ X30, done 31 | loop: 32 | MOV 0*8(X5), X10 // z[0] 33 | MOV 1*8(X5), X13 // z[1] 34 | MOV 2*8(X5), X16 // z[2] 35 | MOV 3*8(X5), X19 // z[3] 36 | 37 | MOV 0*8(X7), X8 // x[0] 38 | MOV 1*8(X7), X11 // x[1] 39 | MOV 2*8(X7), X14 // x[2] 40 | MOV 3*8(X7), X17 // x[3] 41 | 42 | MULHU X8, X6, X9 // z_hi[0] = x[0] * y 43 | MUL X8, X6, X8 // z_lo[0] = x[0] * y 44 | ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0] 45 | SLTU X8, X21, X22 46 | ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0] 47 | ADD X21, X29, X10 // z_lo[0] = x[0] * y + z[0] + c 48 | SLTU X21, X10, X22 49 | ADD X9, X22, X29 // next c 50 | 51 | MULHU X11, X6, X12 // z_hi[1] = x[1] * y 52 | MUL X11, X6, X11 // z_lo[1] = x[1] * y 53 | ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1] 54 | SLTU X11, X21, X22 55 | ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1] 56 | ADD X21, X29, X13 // z_lo[1] = x[1] * y + z[1] + c 57 | SLTU X21, 
X13, X22 58 | ADD X12, X22, X29 // next c 59 | 60 | MULHU X14, X6, X15 // z_hi[2] = x[2] * y 61 | MUL X14, X6, X14 // z_lo[2] = x[2] * y 62 | ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2] 63 | SLTU X14, X21, X22 64 | ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2] 65 | ADD X21, X29, X16 // z_lo[2] = x[2] * y + z[2] + c 66 | SLTU X21, X16, X22 67 | ADD X15, X22, X29 // next c 68 | 69 | MULHU X17, X6, X18 // z_hi[3] = x[3] * y 70 | MUL X17, X6, X17 // z_lo[3] = x[3] * y 71 | ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3] 72 | SLTU X17, X21, X22 73 | ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3] 74 | ADD X21, X29, X19 // z_lo[3] = x[3] * y + z[3] + c 75 | SLTU X21, X19, X22 76 | ADD X18, X22, X29 // next c 77 | 78 | MOV X10, 0*8(X5) // z[0] 79 | MOV X13, 1*8(X5) // z[1] 80 | MOV X16, 2*8(X5) // z[2] 81 | MOV X19, 3*8(X5) // z[3] 82 | 83 | ADDI $32, X5 84 | ADDI $32, X7 85 | 86 | ADDI $-4, X30 87 | BNEZ X30, loop 88 | 89 | done: 90 | MOV X29, c+24(FP) 91 | RET 92 | -------------------------------------------------------------------------------- /nat_s390x.s: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | //go:build !purego 6 | 7 | #include "textflag.h" 8 | 9 | // func addMulVVW1024(z, x *uint, y uint) (c uint) 10 | TEXT ·addMulVVW1024(SB), $0-32 11 | MOVD $16, R5 // R5 = n, the number of 64-bit words 12 | JMP addMulVVWx<>(SB) 13 | 14 | // func addMulVVW1536(z, x *uint, y uint) (c uint) 15 | TEXT ·addMulVVW1536(SB), $0-32 16 | MOVD $24, R5 // R5 = n, the number of 64-bit words 17 | JMP addMulVVWx<>(SB) 18 | 19 | // func addMulVVW2048(z, x *uint, y uint) (c uint) 20 | TEXT ·addMulVVW2048(SB), $0-32 21 | MOVD $32, R5 // R5 = n, the number of 64-bit words 22 | JMP addMulVVWx<>(SB) 23 | // addMulVVWx adds x[i]*y into z[i] for i in [0, R5) and returns the final carry in c. R5 must be set by the caller. 24 | TEXT addMulVVWx<>(SB), NOFRAME|NOSPLIT, $0 25 | MOVD z+0(FP), R2 26 | MOVD x+8(FP), R8 27 | MOVD y+16(FP), R9 28 | 29 | MOVD $0, R1 // i*8 = 0 30 | MOVD $0, R7 // i = 0 31 | MOVD $0, R0 // make sure it's zero 32 | MOVD $0, R4 // c = 0 33 | 34 | MOVD R5, R12 35 | AND $-2, R12 // R12 = n rounded down to an even count, the bound for the two-word loop 36 | CMPBGE R5, $2, A6 // enter the two-word loop only when n >= 2 37 | BR E6 38 | 39 | A6: // two words per pass 40 | MOVD (R8)(R1*1), R6 41 | MULHDU R9, R6 // NOTE(review): the code below treats R6 as the high word and R11 as the low word of x[i]*y; confirm against the s390x assembler 42 | MOVD (R2)(R1*1), R10 43 | ADDC R10, R11 // add to low order bits 44 | ADDE R0, R6 45 | ADDC R4, R11 46 | ADDE R0, R6 47 | MOVD R6, R4 48 | MOVD R11, (R2)(R1*1) 49 | 50 | MOVD (8)(R8)(R1*1), R6 51 | MULHDU R9, R6 52 | MOVD (8)(R2)(R1*1), R10 53 | ADDC R10, R11 // add to low order bits 54 | ADDE R0, R6 55 | ADDC R4, R11 56 | ADDE R0, R6 57 | MOVD R6, R4 58 | MOVD R11, (8)(R2)(R1*1) 59 | 60 | ADD $16, R1 // i*8 += 16 (two words consumed) 61 | ADD $2, R7 // i += 2 62 | 63 | CMPBLT R7, R12, A6 64 | BR E6 65 | 66 | L6: 67 | // TODO: drop unused single-step loop. 68 | MOVD (R8)(R1*1), R6 69 | MULHDU R9, R6 70 | MOVD (R2)(R1*1), R10 71 | ADDC R10, R11 // add to low order bits 72 | ADDE R0, R6 73 | ADDC R4, R11 74 | ADDE R0, R6 75 | MOVD R6, R4 76 | MOVD R11, (R2)(R1*1) 77 | 78 | ADD $8, R1 // i*8 += 8 79 | ADD $1, R7 // i++ 80 | 81 | E6: 82 | CMPBLT R7, R5, L6 // i < n 83 | 84 | MOVD R4, c+24(FP) 85 | RET 86 | -------------------------------------------------------------------------------- /nat_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package bigmod 6 | 7 | import ( 8 | "bufio" 9 | "bytes" 10 | cryptorand "crypto/rand" 11 | "encoding/hex" 12 | "fmt" 13 | "math/big" 14 | "math/bits" 15 | "math/rand" 16 | "os" 17 | "reflect" 18 | "slices" 19 | "strings" 20 | "testing" 21 | "testing/quick" 22 | ) 23 | 24 | // setBig assigns x = n, resizing x to hold exactly the limbs of n, and returns x. 25 | // 26 | // The announced length of x is set based on the actual bit size of the input, 27 | // ignoring leading zeroes. 28 | func (x *Nat) setBig(n *big.Int) *Nat { 29 | limbs := n.Bits() 30 | x.reset(len(limbs)) 31 | for i := range limbs { 32 | x.limbs[i] = uint(limbs[i]) 33 | } 34 | return x 35 | } 36 | // asBig returns the value of n as a newly allocated *big.Int, copying the limbs one by one. 37 | func (n *Nat) asBig() *big.Int { 38 | bits := make([]big.Word, len(n.limbs)) 39 | for i := range n.limbs { 40 | bits[i] = big.Word(n.limbs[i]) 41 | } 42 | return new(big.Int).SetBits(bits) 43 | } 44 | // String formats n as its limbs in upper-case hexadecimal, last limb first, separated by spaces and wrapped in braces. 45 | func (n *Nat) String() string { 46 | var limbs []string 47 | for i := range n.limbs { 48 | limbs = append(limbs, fmt.Sprintf("%016X", n.limbs[len(n.limbs)-1-i])) 49 | } 50 | return "{" + strings.Join(limbs, " ") + "}" 51 | } 52 | 53 | // Generate generates an even Nat. It's used by testing/quick to produce random 54 | // *Nat values for quick.Check invocations. 
55 | func (*Nat) Generate(r *rand.Rand, size int) reflect.Value { 56 | limbs := make([]uint, size) 57 | for i := 0; i < size; i++ { 58 | limbs[i] = uint(r.Uint64()) & ((1 << _W) - 2) 59 | } 60 | return reflect.ValueOf(&Nat{limbs}) 61 | } 62 | 63 | func testModAddCommutative(a *Nat, b *Nat) bool { 64 | m := maxModulus(uint(len(a.limbs))) 65 | aPlusB := new(Nat).set(a) 66 | aPlusB.Add(b, m) 67 | bPlusA := new(Nat).set(b) 68 | bPlusA.Add(a, m) 69 | return aPlusB.Equal(bPlusA) == 1 70 | } 71 | 72 | func TestModAddCommutative(t *testing.T) { 73 | err := quick.Check(testModAddCommutative, &quick.Config{}) 74 | if err != nil { 75 | t.Error(err) 76 | } 77 | } 78 | 79 | func testModSubThenAddIdentity(a *Nat, b *Nat) bool { 80 | m := maxModulus(uint(len(a.limbs))) 81 | original := new(Nat).set(a) 82 | a.Sub(b, m) 83 | a.Add(b, m) 84 | return a.Equal(original) == 1 85 | } 86 | 87 | func TestModSubThenAddIdentity(t *testing.T) { 88 | err := quick.Check(testModSubThenAddIdentity, &quick.Config{}) 89 | if err != nil { 90 | t.Error(err) 91 | } 92 | } 93 | 94 | func TestMontgomeryRoundtrip(t *testing.T) { 95 | err := quick.Check(func(a *Nat) bool { 96 | one := &Nat{make([]uint, len(a.limbs))} 97 | one.limbs[0] = 1 98 | aPlusOne := new(big.Int).SetBytes(natBytes(a)) 99 | aPlusOne.Add(aPlusOne, big.NewInt(1)) 100 | m, _ := NewModulus(aPlusOne.Bytes()) 101 | monty := new(Nat).set(a) 102 | monty.montgomeryRepresentation(m) 103 | aAgain := new(Nat).set(monty) 104 | aAgain.montgomeryMul(monty, one, m) 105 | if a.Equal(aAgain) != 1 { 106 | t.Errorf("%v != %v", a, aAgain) 107 | return false 108 | } 109 | return true 110 | }, &quick.Config{}) 111 | if err != nil { 112 | t.Error(err) 113 | } 114 | } 115 | 116 | func TestShiftIn(t *testing.T) { 117 | if bits.UintSize != 64 { 118 | t.Skip("examples are only valid in 64 bit") 119 | } 120 | examples := []struct { 121 | m, x, expected []byte 122 | y uint64 123 | }{{ 124 | m: []byte{13}, 125 | x: []byte{0}, 126 | y: 0xFFFF_FFFF_FFFF_FFFF, 127 | 
expected: []byte{2}, 128 | }, { 129 | m: []byte{13}, 130 | x: []byte{7}, 131 | y: 0xFFFF_FFFF_FFFF_FFFF, 132 | expected: []byte{10}, 133 | }, { 134 | m: []byte{0x06, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d}, 135 | x: make([]byte, 9), 136 | y: 0xFFFF_FFFF_FFFF_FFFF, 137 | expected: []byte{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 138 | }, { 139 | m: []byte{0x06, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d}, 140 | x: []byte{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 141 | y: 0, 142 | expected: []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06}, 143 | }} 144 | 145 | for i, tt := range examples { 146 | m := modulusFromBytes(tt.m) 147 | got := natFromBytes(tt.x).ExpandFor(m).shiftIn(uint(tt.y), m) 148 | if exp := natFromBytes(tt.expected).ExpandFor(m); got.Equal(exp) != 1 { 149 | t.Errorf("%d: got %v, expected %v", i, got, exp) 150 | } 151 | } 152 | } 153 | 154 | func TestModulusAndNatSizes(t *testing.T) { 155 | // These are 126 bit (2 * _W on 64-bit architectures) values, serialized as 156 | // 128 bits worth of bytes. If leading zeroes are stripped, they fit in two 157 | // limbs, if they are not, they fit in three. This can be a problem because 158 | // modulus strips leading zeroes and nat does not. 
159 | m := modulusFromBytes([]byte{ 160 | 0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 161 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) 162 | xb := []byte{0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 163 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe} 164 | natFromBytes(xb).ExpandFor(m) // must not panic for shrinking 165 | NewNat().SetBytes(xb, m) 166 | } 167 | 168 | func TestSetBytes(t *testing.T) { 169 | tests := []struct { 170 | m, b []byte 171 | fail bool 172 | }{{ 173 | m: []byte{0xff, 0xff}, 174 | b: []byte{0x00, 0x01}, 175 | }, { 176 | m: []byte{0xff, 0xff}, 177 | b: []byte{0xff, 0xff}, 178 | fail: true, 179 | }, { 180 | m: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 181 | b: []byte{0x00, 0x01}, 182 | }, { 183 | m: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 184 | b: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}, 185 | }, { 186 | m: []byte{0xff, 0xff}, 187 | b: []byte{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, 188 | fail: true, 189 | }, { 190 | m: []byte{0xff, 0xff}, 191 | b: []byte{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, 192 | fail: true, 193 | }, { 194 | m: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 195 | b: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}, 196 | }, { 197 | m: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 198 | b: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}, 199 | fail: true, 200 | }, { 201 | m: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 202 | b: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 203 | fail: true, 204 | }, { 205 | m: []byte{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 206 | b: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}, 207 | fail: true, 208 | }, { 209 | m: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd}, 210 | b: []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 211 | fail: true, 212 | }} 213 | 214 | for i, tt := range 
tests { 215 | m := modulusFromBytes(tt.m) 216 | got, err := NewNat().SetBytes(tt.b, m) 217 | if err != nil { 218 | if !tt.fail { 219 | t.Errorf("%d: unexpected error: %v", i, err) 220 | } 221 | continue 222 | } 223 | if tt.fail { 224 | t.Errorf("%d: unexpected success", i) 225 | continue 226 | } 227 | if expected := natFromBytes(tt.b).ExpandFor(m); choice(got.Equal(expected)) != yes { 228 | t.Errorf("%d: got %v, expected %v", i, got, expected) 229 | } 230 | } 231 | 232 | f := func(xBytes []byte) bool { 233 | m := maxModulus(uint(len(xBytes)*8/_W + 1)) 234 | got, err := NewNat().SetBytes(xBytes, m) 235 | if err != nil { 236 | return false 237 | } 238 | return choice(got.Equal(natFromBytes(xBytes).ExpandFor(m))) == yes 239 | } 240 | 241 | err := quick.Check(f, &quick.Config{}) 242 | if err != nil { 243 | t.Error(err) 244 | } 245 | } 246 | 247 | func TestExpand(t *testing.T) { 248 | sliced := []uint{1, 2, 3, 4} 249 | examples := []struct { 250 | in []uint 251 | n int 252 | out []uint 253 | }{{ 254 | []uint{1, 2}, 255 | 4, 256 | []uint{1, 2, 0, 0}, 257 | }, { 258 | sliced[:2], 259 | 4, 260 | []uint{1, 2, 0, 0}, 261 | }, { 262 | []uint{1, 2}, 263 | 2, 264 | []uint{1, 2}, 265 | }} 266 | 267 | for i, tt := range examples { 268 | got := (&Nat{tt.in}).expand(tt.n) 269 | if len(got.limbs) != len(tt.out) || got.Equal(&Nat{tt.out}) != 1 { 270 | t.Errorf("%d: got %v, expected %v", i, got, tt.out) 271 | } 272 | } 273 | } 274 | 275 | func TestMod(t *testing.T) { 276 | m := modulusFromBytes([]byte{0x06, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d}) 277 | x := natFromBytes([]byte{0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) 278 | out := new(Nat) 279 | out.Mod(x, m) 280 | expected := natFromBytes([]byte{0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09}) 281 | if out.Equal(expected) != 1 { 282 | t.Errorf("%+v != %+v", out, expected) 283 | } 284 | } 285 | 286 | func TestModSub(t *testing.T) { 287 | m := 
modulusFromBytes([]byte{13}) 288 | x := &Nat{[]uint{6}} 289 | y := &Nat{[]uint{7}} 290 | x.Sub(y, m) 291 | expected := &Nat{[]uint{12}} 292 | if x.Equal(expected) != 1 { 293 | t.Errorf("%+v != %+v", x, expected) 294 | } 295 | x.Sub(y, m) 296 | expected = &Nat{[]uint{5}} 297 | if x.Equal(expected) != 1 { 298 | t.Errorf("%+v != %+v", x, expected) 299 | } 300 | } 301 | 302 | func TestModAdd(t *testing.T) { 303 | m := modulusFromBytes([]byte{13}) 304 | x := &Nat{[]uint{6}} 305 | y := &Nat{[]uint{7}} 306 | x.Add(y, m) 307 | expected := &Nat{[]uint{0}} 308 | if x.Equal(expected) != 1 { 309 | t.Errorf("%+v != %+v", x, expected) 310 | } 311 | x.Add(y, m) 312 | expected = &Nat{[]uint{7}} 313 | if x.Equal(expected) != 1 { 314 | t.Errorf("%+v != %+v", x, expected) 315 | } 316 | } 317 | 318 | func TestExp(t *testing.T) { 319 | m := modulusFromBytes([]byte{13}) 320 | x := &Nat{[]uint{3}} 321 | out := &Nat{[]uint{0}} 322 | out.Exp(x, []byte{12}, m) 323 | expected := &Nat{[]uint{1}} 324 | if out.Equal(expected) != 1 { 325 | t.Errorf("%+v != %+v", out, expected) 326 | } 327 | } 328 | 329 | func TestExpShort(t *testing.T) { 330 | m := modulusFromBytes([]byte{13}) 331 | x := &Nat{[]uint{3}} 332 | out := &Nat{[]uint{0}} 333 | out.ExpShortVarTime(x, 12, m) 334 | expected := &Nat{[]uint{1}} 335 | if out.Equal(expected) != 1 { 336 | t.Errorf("%+v != %+v", out, expected) 337 | } 338 | } 339 | 340 | // TestMulReductions tests that Mul reduces results equal or slightly greater 341 | // than the modulus. Some Montgomery algorithms don't and need extra care to 342 | // return correct results. See https://go.dev/issue/13907. 343 | func TestMulReductions(t *testing.T) { 344 | // Two short but multi-limb primes. 
345 | a, _ := new(big.Int).SetString("773608962677651230850240281261679752031633236267106044359907", 10) 346 | b, _ := new(big.Int).SetString("180692823610368451951102211649591374573781973061758082626801", 10) 347 | n := new(big.Int).Mul(a, b) 348 | 349 | N, _ := NewModulus(n.Bytes()) 350 | A := NewNat().setBig(a).ExpandFor(N) 351 | B := NewNat().setBig(b).ExpandFor(N) 352 | 353 | if A.Mul(B, N).IsZero() != 1 { 354 | t.Error("a * b mod (a * b) != 0") 355 | } 356 | 357 | i := new(big.Int).ModInverse(a, b) 358 | N, _ = NewModulus(b.Bytes()) 359 | A = NewNat().setBig(a).ExpandFor(N) 360 | I := NewNat().setBig(i).ExpandFor(N) 361 | one := NewNat().setBig(big.NewInt(1)).ExpandFor(N) 362 | 363 | if A.Mul(I, N).Equal(one) != 1 { 364 | t.Error("a * inv(a) mod b != 1") 365 | } 366 | } 367 | 368 | func TestMul(t *testing.T) { 369 | t.Run("small", func(t *testing.T) { testMul(t, 760/8) }) 370 | t.Run("1024", func(t *testing.T) { testMul(t, 1024/8) }) 371 | t.Run("1536", func(t *testing.T) { testMul(t, 1536/8) }) 372 | t.Run("2048", func(t *testing.T) { testMul(t, 2048/8) }) 373 | } 374 | 375 | func testMul(t *testing.T, n int) { 376 | a, b, m := make([]byte, n), make([]byte, n), make([]byte, n) 377 | cryptorand.Read(a) 378 | cryptorand.Read(b) 379 | cryptorand.Read(m) 380 | 381 | // Pick the highest as the modulus. 
382 | if bytes.Compare(a, m) > 0 { 383 | a, m = m, a 384 | } 385 | if bytes.Compare(b, m) > 0 { 386 | b, m = m, b 387 | } 388 | 389 | M, err := NewModulus(m) 390 | if err != nil { 391 | t.Fatal(err) 392 | } 393 | A, err := NewNat().SetBytes(a, M) 394 | if err != nil { 395 | t.Fatal(err) 396 | } 397 | B, err := NewNat().SetBytes(b, M) 398 | if err != nil { 399 | t.Fatal(err) 400 | } 401 | 402 | A.Mul(B, M) 403 | ABytes := A.Bytes(M) 404 | 405 | mBig := new(big.Int).SetBytes(m) 406 | aBig := new(big.Int).SetBytes(a) 407 | bBig := new(big.Int).SetBytes(b) 408 | nBig := new(big.Int).Mul(aBig, bBig) 409 | nBig.Mod(nBig, mBig) 410 | nBigBytes := make([]byte, len(ABytes)) 411 | nBig.FillBytes(nBigBytes) 412 | 413 | if !bytes.Equal(ABytes, nBigBytes) { 414 | t.Errorf("got %x, want %x", ABytes, nBigBytes) 415 | } 416 | } 417 | 418 | func TestIs(t *testing.T) { 419 | checkYes := func(c uint, err string) { 420 | t.Helper() 421 | if choice(c) != yes { 422 | t.Error(err) 423 | } 424 | } 425 | checkNot := func(c uint, err string) { 426 | t.Helper() 427 | if choice(c) != no { 428 | t.Error(err) 429 | } 430 | } 431 | 432 | mFour := modulusFromBytes([]byte{4}) 433 | n, err := NewNat().SetBytes([]byte{3}, mFour) 434 | if err != nil { 435 | t.Fatal(err) 436 | } 437 | checkYes(n.IsMinusOne(mFour), "3 is not -1 mod 4") 438 | checkNot(n.IsZero(), "3 is zero") 439 | checkNot(n.IsOne(), "3 is one") 440 | checkYes(n.IsOdd(), "3 is not odd") 441 | n.SubOne(mFour) 442 | checkNot(n.IsMinusOne(mFour), "2 is -1 mod 4") 443 | checkNot(n.IsZero(), "2 is zero") 444 | checkNot(n.IsOne(), "2 is one") 445 | checkNot(n.IsOdd(), "2 is odd") 446 | n.SubOne(mFour) 447 | checkNot(n.IsMinusOne(mFour), "1 is -1 mod 4") 448 | checkNot(n.IsZero(), "1 is zero") 449 | checkYes(n.IsOne(), "1 is not one") 450 | checkYes(n.IsOdd(), "1 is not odd") 451 | n.SubOne(mFour) 452 | checkNot(n.IsMinusOne(mFour), "0 is -1 mod 4") 453 | checkYes(n.IsZero(), "0 is not zero") 454 | checkNot(n.IsOne(), "0 is one") 455 | 
checkNot(n.IsOdd(), "0 is odd") 456 | n.SubOne(mFour) 457 | checkYes(n.IsMinusOne(mFour), "-1 is not -1 mod 4") 458 | checkNot(n.IsZero(), "-1 is zero") 459 | checkNot(n.IsOne(), "-1 is one") 460 | checkYes(n.IsOdd(), "-1 mod 4 is not odd") 461 | 462 | mTwoLimbs := maxModulus(2) 463 | n, err = NewNat().SetBytes([]byte{0x01}, mTwoLimbs) 464 | if err != nil { 465 | t.Fatal(err) 466 | } 467 | if n.IsOne() != 1 { 468 | t.Errorf("1 is not one") 469 | } 470 | } 471 | 472 | func TestTrailingZeroBits(t *testing.T) { 473 | nb := new(big.Int).SetBytes([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7e}) 474 | nb.Lsh(nb, 128) 475 | expected := 129 476 | for expected >= 0 { 477 | n := NewNat().setBig(nb) 478 | if n.TrailingZeroBitsVarTime() != uint(expected) { 479 | t.Errorf("%d != %d", n.TrailingZeroBitsVarTime(), expected) 480 | } 481 | nb.Rsh(nb, 1) 482 | expected-- 483 | } 484 | } 485 | 486 | func TestRightShift(t *testing.T) { 487 | nb, err := cryptorand.Int(cryptorand.Reader, new(big.Int).Lsh(big.NewInt(1), 1024)) 488 | if err != nil { 489 | t.Fatal(err) 490 | } 491 | for _, shift := range []uint{1, 32, 64, 128, 1024 - 128, 1024 - 64, 1024 - 32, 1024 - 1} { 492 | testShift := func(t *testing.T, shift uint) { 493 | n := NewNat().setBig(nb) 494 | oldLen := len(n.limbs) 495 | n.ShiftRightVarTime(shift) 496 | if len(n.limbs) != oldLen { 497 | t.Errorf("len(n.limbs) = %d, want %d", len(n.limbs), oldLen) 498 | } 499 | exp := new(big.Int).Rsh(nb, shift) 500 | if n.asBig().Cmp(exp) != 0 { 501 | t.Errorf("%v != %v", n.asBig(), exp) 502 | } 503 | } 504 | t.Run(fmt.Sprint(shift-1), func(t *testing.T) { testShift(t, shift-1) }) 505 | t.Run(fmt.Sprint(shift), func(t *testing.T) { testShift(t, shift) }) 506 | t.Run(fmt.Sprint(shift+1), func(t *testing.T) { testShift(t, shift+1) }) 507 | } 508 | } 509 | 510 | func natBytes(n *Nat) []byte { 511 | return n.Bytes(maxModulus(uint(len(n.limbs)))) 512 | } 513 | 514 | func natFromBytes(b []byte) *Nat { 515 | // Must not use 
Nat.SetBytes as it's used in TestSetBytes. 516 | bb := new(big.Int).SetBytes(b) 517 | return NewNat().setBig(bb) 518 | } 519 | 520 | func modulusFromBytes(b []byte) *Modulus { 521 | bb := new(big.Int).SetBytes(b) 522 | m, _ := NewModulus(bb.Bytes()) 523 | return m 524 | } 525 | 526 | // maxModulus returns the biggest modulus that can fit in n limbs. 527 | func maxModulus(n uint) *Modulus { 528 | b := big.NewInt(1) 529 | b.Lsh(b, n*_W) 530 | b.Sub(b, big.NewInt(1)) 531 | m, _ := NewModulus(b.Bytes()) 532 | return m 533 | } 534 | 535 | func makeBenchmarkModulus() *Modulus { 536 | return maxModulus(32) 537 | } 538 | 539 | func makeBenchmarkValue() *Nat { 540 | x := make([]uint, 32) 541 | for i := 0; i < 32; i++ { 542 | x[i]-- 543 | } 544 | return &Nat{limbs: x} 545 | } 546 | 547 | func makeBenchmarkExponent() []byte { 548 | e := make([]byte, 256) 549 | for i := 0; i < 32; i++ { 550 | e[i] = 0xFF 551 | } 552 | return e 553 | } 554 | 555 | func BenchmarkModAdd(b *testing.B) { 556 | x := makeBenchmarkValue() 557 | y := makeBenchmarkValue() 558 | m := makeBenchmarkModulus() 559 | 560 | b.ResetTimer() 561 | for i := 0; i < b.N; i++ { 562 | x.Add(y, m) 563 | } 564 | } 565 | 566 | func BenchmarkModSub(b *testing.B) { 567 | x := makeBenchmarkValue() 568 | y := makeBenchmarkValue() 569 | m := makeBenchmarkModulus() 570 | 571 | b.ResetTimer() 572 | for i := 0; i < b.N; i++ { 573 | x.Sub(y, m) 574 | } 575 | } 576 | 577 | func BenchmarkMontgomeryRepr(b *testing.B) { 578 | x := makeBenchmarkValue() 579 | m := makeBenchmarkModulus() 580 | 581 | b.ResetTimer() 582 | for i := 0; i < b.N; i++ { 583 | x.montgomeryRepresentation(m) 584 | } 585 | } 586 | 587 | func BenchmarkMontgomeryMul(b *testing.B) { 588 | x := makeBenchmarkValue() 589 | y := makeBenchmarkValue() 590 | out := makeBenchmarkValue() 591 | m := makeBenchmarkModulus() 592 | 593 | b.ResetTimer() 594 | for i := 0; i < b.N; i++ { 595 | out.montgomeryMul(x, y, m) 596 | } 597 | } 598 | 599 | func BenchmarkModMul(b *testing.B) { 
600 | x := makeBenchmarkValue() 601 | y := makeBenchmarkValue() 602 | m := makeBenchmarkModulus() 603 | 604 | b.ResetTimer() 605 | for i := 0; i < b.N; i++ { 606 | x.Mul(y, m) 607 | } 608 | } 609 | 610 | func BenchmarkExpBig(b *testing.B) { 611 | out := new(big.Int) 612 | exponentBytes := makeBenchmarkExponent() 613 | x := new(big.Int).SetBytes(exponentBytes) 614 | e := new(big.Int).SetBytes(exponentBytes) 615 | n := new(big.Int).SetBytes(exponentBytes) 616 | one := new(big.Int).SetUint64(1) 617 | n.Add(n, one) 618 | 619 | b.ResetTimer() 620 | for i := 0; i < b.N; i++ { 621 | out.Exp(x, e, n) 622 | } 623 | } 624 | 625 | func BenchmarkExp(b *testing.B) { 626 | x := makeBenchmarkValue() 627 | e := makeBenchmarkExponent() 628 | out := makeBenchmarkValue() 629 | m := makeBenchmarkModulus() 630 | 631 | b.ResetTimer() 632 | for i := 0; i < b.N; i++ { 633 | out.Exp(x, e, m) 634 | } 635 | } 636 | 637 | func TestNewModulus(t *testing.T) { 638 | expected := "modulus must be > 1" 639 | _, err := NewModulus([]byte{}) 640 | if err == nil || err.Error() != expected { 641 | t.Errorf("NewModulus(0) got %q, want %q", err, expected) 642 | } 643 | _, err = NewModulus([]byte{0}) 644 | if err == nil || err.Error() != expected { 645 | t.Errorf("NewModulus(0) got %q, want %q", err, expected) 646 | } 647 | _, err = NewModulus([]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) 648 | if err == nil || err.Error() != expected { 649 | t.Errorf("NewModulus(0) got %q, want %q", err, expected) 650 | } 651 | _, err = NewModulus([]byte{1}) 652 | if err == nil || err.Error() != expected { 653 | t.Errorf("NewModulus(1) got %q, want %q", err, expected) 654 | } 655 | _, err = NewModulus([]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}) 656 | if err == nil || err.Error() != expected { 657 | t.Errorf("NewModulus(1) got %q, want %q", err, expected) 658 | } 659 | } 660 | 661 | func makeTestValue(nbits int) []uint { 662 | n := nbits / _W 663 | x := 
make([]uint, n) 664 | for i := range n { 665 | x[i]-- 666 | } 667 | return x 668 | } 669 | 670 | func TestAddMulVVWSized(t *testing.T) { 671 | // Sized addMulVVW have architecture-specific implementations on 672 | // a number of architectures. Test that they match the generic 673 | // implementation. 674 | tests := []struct { 675 | n int 676 | f func(z, x *uint, y uint) uint 677 | }{ 678 | {1024, addMulVVW1024}, 679 | {1536, addMulVVW1536}, 680 | {2048, addMulVVW2048}, 681 | } 682 | for _, test := range tests { 683 | t.Run(fmt.Sprint(test.n), func(t *testing.T) { 684 | x := makeTestValue(test.n) 685 | z := makeTestValue(test.n) 686 | z2 := slices.Clone(z) 687 | var y uint 688 | y-- 689 | c := addMulVVW(z, x, y) 690 | c2 := test.f(&z2[0], &x[0], y) 691 | if !slices.Equal(z, z2) || c != c2 { 692 | t.Errorf("%016X, %016X != %016X, %016X", z, c, z2, c2) 693 | } 694 | }) 695 | } 696 | } 697 | 698 | func TestInverse(t *testing.T) { 699 | f, err := os.Open("testdata/mod_inv_tests.txt") 700 | if err != nil { 701 | t.Fatal(err) 702 | } 703 | 704 | var ModInv, A, M string 705 | var lineNum int 706 | scanner := bufio.NewScanner(f) 707 | for scanner.Scan() { 708 | lineNum++ 709 | line := scanner.Text() 710 | if len(line) == 0 || line[0] == '#' { 711 | continue 712 | } 713 | 714 | k, v, _ := strings.Cut(line, " = ") 715 | switch k { 716 | case "ModInv": 717 | ModInv = v 718 | case "A": 719 | A = v 720 | case "M": 721 | M = v 722 | 723 | t.Run(fmt.Sprintf("line %d", lineNum), func(t *testing.T) { 724 | m, err := NewModulus(decodeHex(t, M)) 725 | if err != nil { 726 | t.Skip("modulus <= 1") 727 | } 728 | a, err := NewNat().SetBytes(decodeHex(t, A), m) 729 | if err != nil { 730 | t.Fatal(err) 731 | } 732 | 733 | got, ok := NewNat().InverseVarTime(a, m) 734 | if !ok { 735 | t.Fatal("not invertible") 736 | } 737 | exp, err := NewNat().SetBytes(decodeHex(t, ModInv), m) 738 | if err != nil { 739 | t.Fatal(err) 740 | } 741 | if got.Equal(exp) != 1 { 742 | t.Errorf("%v != %v", got, exp) 
743 | } 744 | }) 745 | default: 746 | t.Fatalf("unknown key %q on line %d", k, lineNum) 747 | } 748 | } 749 | if err := scanner.Err(); err != nil { 750 | t.Fatal(err) 751 | } 752 | } 753 | 754 | func decodeHex(t *testing.T, s string) []byte { 755 | t.Helper() 756 | if len(s)%2 != 0 { 757 | s = "0" + s 758 | } 759 | b, err := hex.DecodeString(s) 760 | if err != nil { 761 | t.Fatalf("failed to decode hex %q: %v", s, err) 762 | } 763 | return b 764 | } 765 | -------------------------------------------------------------------------------- /nat_wasm.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //go:build !purego 6 | 7 | package bigmod 8 | 9 | import "unsafe" 10 | 11 | // The generic implementation relies on 64x64->128 bit multiplication and 12 | // 64-bit add-with-carry, which are compiler intrinsics on many architectures. 13 | // Wasm doesn't support those. Here we implement it with 32x32->64 bit 14 | // operations, which is more efficient on Wasm. 
15 | 16 | func idx(x *uint, i uintptr) *uint { 17 | return (*uint)(unsafe.Pointer(uintptr(unsafe.Pointer(x)) + i*8)) 18 | } 19 | 20 | func addMulVVWWasm(z, x *uint, y uint, n uintptr) (carry uint) { 21 | const mask32 = 1<<32 - 1 22 | y0 := y & mask32 23 | y1 := y >> 32 24 | for i := range n { 25 | xi := *idx(x, i) 26 | x0 := xi & mask32 27 | x1 := xi >> 32 28 | zi := *idx(z, i) 29 | z0 := zi & mask32 30 | z1 := zi >> 32 31 | c0 := carry & mask32 32 | c1 := carry >> 32 33 | 34 | w00 := x0*y0 + z0 + c0 35 | l00 := w00 & mask32 36 | h00 := w00 >> 32 37 | 38 | w01 := x0*y1 + z1 + h00 39 | l01 := w01 & mask32 40 | h01 := w01 >> 32 41 | 42 | w10 := x1*y0 + c1 + l01 43 | h10 := w10 >> 32 44 | 45 | carry = x1*y1 + h10 + h01 46 | *idx(z, i) = w10<<32 + l00 47 | } 48 | return carry 49 | } 50 | 51 | func addMulVVW1024(z, x *uint, y uint) (c uint) { 52 | return addMulVVWWasm(z, x, y, 1024/_W) 53 | } 54 | 55 | func addMulVVW1536(z, x *uint, y uint) (c uint) { 56 | return addMulVVWWasm(z, x, y, 1536/_W) 57 | } 58 | 59 | func addMulVVW2048(z, x *uint, y uint) (c uint) { 60 | return addMulVVWWasm(z, x, y, 2048/_W) 61 | } 62 | -------------------------------------------------------------------------------- /testdata/mod_inv_tests.txt: -------------------------------------------------------------------------------- 1 | # ModInv tests. 2 | # 3 | # These test vectors satisfy ModInv * A = 1 (mod M) and 0 <= ModInv < M. 
4 | 5 | ModInv = 00 6 | A = 00 7 | M = 01 8 | 9 | ModInv = 00 10 | A = 01 11 | M = 01 12 | 13 | ModInv = 00 14 | A = 02 15 | M = 01 16 | 17 | ModInv = 00 18 | A = 03 19 | M = 01 20 | 21 | ModInv = 64 22 | A = 54 23 | M = e3 24 | 25 | ModInv = 13 26 | A = 2b 27 | M = 30 28 | 29 | ModInv = 2f 30 | A = 30 31 | M = 37 32 | 33 | ModInv = 4 34 | A = 13 35 | M = 4b 36 | 37 | ModInv = 1c47 38 | A = cd4 39 | M = 6a21 40 | 41 | ModInv = 2b97 42 | A = 8e7 43 | M = 49c0 44 | 45 | ModInv = 29b9 46 | A = fcb 47 | M = 3092 48 | 49 | ModInv = a83 50 | A = 14bf 51 | M = 41ae 52 | 53 | ModInv = 18f15fe1 54 | A = 11b5d53e 55 | M = 322e92a1 56 | 57 | ModInv = 32f9453b 58 | A = 8af6df6 59 | M = 33d45eb7 60 | 61 | ModInv = d696369 62 | A = c5f89dd5 63 | M = fc09c17c 64 | 65 | ModInv = 622839d8 66 | A = 60c2526 67 | M = 74200493 68 | 69 | ModInv = fb5a8aee7bbc4ef 70 | A = 24ebd835a70be4e2 71 | M = 9c7256574e0c5e93 72 | 73 | ModInv = 846bc225402419c 74 | A = 23026003ab1fbdb 75 | M = 1683cbe32779c59b 76 | 77 | ModInv = 5ff84f63a78982f9 78 | A = 4a2420dc733e1a0f 79 | M = a73c6bfabefa09e6 80 | 81 | ModInv = 133e74d28ef42b43 82 | A = 2e9511ae29cdd41 83 | M = 15234df99f19fcda 84 | 85 | ModInv = 46ae1fabe9521e4b99b198fc8439609023aa69be2247c0d1e27c2a0ea332f9c5 86 | A = 6331fec5f01014046788c919ed50dc86ac7a80c085f1b6f645dd179c0f0dc9cd 87 | M = 8ef409de82318259a8655a39293b1e762fa2cc7e0aeb4c59713a1e1fff6af640 88 | 89 | ModInv = 444ccea3a7b21677dd294d34de53cc8a5b51e69b37782310a00fc6bcc975709b 90 | A = 679280bd880994c08322143a4ea8a0825d0466fda1bb6b3eb86fc8e90747512b 91 | M = e4fecab84b365c63a0dab4244ce3f921a9c87ec64d69a2031939f55782e99a2e 92 | 93 | ModInv = 1ac7d7a03ceec5f690f567c9d61bf3469c078285bcc5cf00ac944596e887ca17 94 | A = 1593ef32d9c784f5091bdff952f5c5f592a3aed6ba8ea865efa6d7df87be1805 95 | M = 1e276882f90c95e0c1976eb079f97af075445b1361c02018d6bd7191162e67b2 96 | 97 | ModInv = 639108b90dfe946f498be21303058413bbb0e59d0bd6a6115788705abd0666d6 98 | A = 
9258d6238e4923d120b2d1033573ffcac691526ad0842a3b174dccdbb79887bd 99 | M = ce62909c39371d463aaba3d4b72ea6da49cb9b529e39e1972ef3ccd9a66fe08f 100 | 101 | ModInv = aebde7654cb17833a106231c4b9e2f519140e85faee1bfb4192830f03f385e773c0f4767e93e874ffdc3b7a6b7e6a710e5619901c739ee8760a26128e8c91ef8cf761d0e505d8b28ae078d17e6071c372893bb7b72538e518ebc57efa70b7615e406756c49729b7c6e74f84aed7a316b6fa748ff4b9f143129d29dad1bff98bb 102 | A = a29dacaf5487d354280fdd2745b9ace4cd50f2bde41d0ee529bf26a1913244f708085452ff32feab19a7418897990da46a0633f7c8375d583367319091bbbe069b0052c5e48a7daac9fb650db5af768cd2508ec3e2cda7456d4b9ce1c39459627a8b77e038b826cd7e326d0685b0cd0cb50f026f18300dae9f5fd42aa150ee8b 103 | M = d686f9b86697313251685e995c09b9f1e337ddfaa050bd2df15bf4ca1dc46c5565021314765299c434ea1a6ec42bf92a29a7d1ffff599f4e50b79a82243fb24813060580c770d4c1140aeb2ab2685007e948b6f1f62e8001a0545619477d498132c907774479f6d95899e6251e7136f79ab6d3b7c82e4aca421e7d22fe7db19c 104 | 105 | ModInv = 1ec872f4f20439e203597ca4de9d1296743f95781b2fe85d5def808558bbadef02a46b8955f47c83e1625f8bb40228eab09cad2a35c9ad62ab77a30e3932872959c5898674162da244a0ec1f68c0ed89f4b0f3572bfdc658ad15bf1b1c6e1176b0784c9935bd3ff1f49bb43753eacee1d8ca1c0b652d39ec727da83984fe3a0f 106 | A = 2e527b0a1dc32460b2dd94ec446c692989f7b3c7451a5cbeebf69fc0ea9c4871fbe78682d5dc5b66689f7ed889b52161cd9830b589a93d21ab26dbede6c33959f5a0f0d107169e2daaac78bac8cf2d41a1eb1369cb6dc9e865e73bb2e51b886f4e896082db199175e3dde0c4ed826468f238a77bd894245d0918efc9ca84f945 107 | M = b13133a9ebe0645f987d170c077eea2aa44e85c9ab10386d02867419a590cb182d9826a882306c212dbe75225adde23f80f5b37ca75ed09df20fc277cc7fbbfac8d9ef37a50f6b68ea158f5447283618e64e1426406d26ea85232afb22bf546c75018c1c55cb84c374d58d9d44c0a13ba88ac2e387765cb4c3269e3a983250fa 108 | 109 | ModInv = 
30ffa1876313a69de1e4e6ee132ea1d3a3da32f3b56f5cfb11402b0ad517dce605cf8e91d69fa375dd887fa8507bd8a28b2d5ce745799126e86f416047709f93f07fbd88918a047f13100ea71b1d48f6fc6d12e5c917646df3041b302187af641eaedf4908abc36f12c204e1526a7d80e96e302fb0779c28d7da607243732f26 110 | A = 31157208bde6b85ebecaa63735947b3b36fa351b5c47e9e1c40c947339b78bf96066e5dbe21bb42629e6fcdb81f5f88db590bfdd5f4c0a6a0c3fc6377e5c1fd8235e46e291c688b6d6ecfb36604891c2a7c9cbcc58c26e44b43beecb9c5044b58bb58e35de3cf1128f3c116534fe4e421a33f83603c3df1ae36ec88092f67f2a 111 | M = 53408b23d6cb733e6c9bc3d1e2ea2286a5c83cc4e3e7470f8af3a1d9f28727f5b1f8ae348c1678f5d1105dc3edf2de64e65b9c99545c47e64b770b17c8b4ef5cf194b43a0538053e87a6b95ade1439cebf3d34c6aa72a11c1497f58f76011e16c5be087936d88aba7a740113120e939e27bd3ddcb6580c2841aa406566e33c35 112 | 113 | ModInv = 87355002f305c81ba0dc97ca2234a2bc02528cefde38b94ac5bd95efc7bf4c140899107fff47f0df9e3c6aa70017ebc90610a750f112cd4f475b9c76b204a953444b4e7196ccf17e93fdaed160b7345ca9b397eddf9446e8ea8ee3676102ce70eaafbe9038a34639789e6f2f1e3f352638f2e8a8f5fc56aaea7ec705ee068dd5 114 | A = 42a25d0bc96f71750f5ac8a51a1605a41b506cca51c9a7ecf80cad713e56f70f1b4b6fa51cbb101f55fd74f318adefb3af04e0c8a7e281055d5a40dd40913c0e1211767c5be915972c73886106dc49325df6c2df49e9eea4536f0343a8e7d332c6159e4f5bdb20d89f90e67597c4a2a632c31b2ef2534080a9ac61f52303990d 115 | M = d3d3f95d50570351528a76ab1e806bae1968bd420899bdb3d87c823fac439a4354c31f6c888c939784f18fe10a95e6d203b1901caa18937ba6f8be033af10c35fc869cf3d16bef479f280f53b3499e645d0387554623207ca4989e5de00bfeaa5e9ab56474fc60dd4967b100e0832eaaf2fcb2ef82a181567057b880b3afef62 116 | --------------------------------------------------------------------------------